Source code for chelo.datasets.ames_mutagenicity

from typing import Sequence, Dict, Optional, Union
from ..base import CheLoDataset
from ..registry import register_dataset
from ..utils.downloader import DatasetDownloader
import pandas as pd


@register_dataset
class AmesMutagenicityDataset(CheLoDataset):
    """
    Ames mutagenicity dataset exposed through the ``CheLoDataset`` interface.

    The data is read from the CSV file named by ``_FILE_NAME``, obtained from
    ``_URL`` through ``DatasetDownloader``. Every column that is not an
    identifier, status or label column is kept as a feature, and the
    ``"Experimental value"`` column is exposed as the single
    ``"mutagenicity"`` target.
    """

    # Location, local file name and checksum handed to the downloader.
    _URL: str = "https://raw.githubusercontent.com/edgarsmdn/MLCE_book/main/references/mutagenicity_kNN.csv"
    _FILE_NAME: str = "mutagenicity_kNN.csv"
    _CHECKSUM: str = "0f379d83a7eb246a5ef5e139ab355d64"

    def __init__(
        self,
        selected_features: Optional[Sequence[str]] = None,
        selected_targets: Optional[Sequence[str]] = None,
    ) -> None:
        """
        Set up the dataset and record its display name and reference URL.

        Both selections are forwarded unchanged to the base class.

        :param selected_features: Names of the features to keep
            (default: all features).
        :param selected_targets: Names of the targets to keep
            (default: all targets).
        """
        super().__init__(selected_features, selected_targets)
        self.dataset_name: str = "Ames Q(SAR) Mutagenicity Dataset"
        self.dataset_url: str = "https://edgarsmdn.github.io/MLCE_book/02_kNN_QSPR.html"

    def load_data(self) -> None:
        """
        Fetch the CSV file and populate ``raw_features`` and ``raw_targets``.

        The path returned by ``DatasetDownloader.download`` is parsed with
        pandas. Features are stored as a ``column name -> list of values``
        mapping; the target mapping holds one entry, ``"mutagenicity"``.
        """
        fetcher: DatasetDownloader = DatasetDownloader()
        csv_path: str = fetcher.download(
            self._URL,
            dataset_name="ames_mutagenicity",
            filename=self._FILE_NAME,
            checksum=self._CHECKSUM,
        )
        frame: pd.DataFrame = pd.read_csv(csv_path, sep=",")

        # Index, identifier, status and label columns are not features.
        non_feature_columns = [
            "Unnamed: 0",
            "Id",
            "CAS",
            "SMILES",
            "Status",
            "Experimental value",
            "Predicted value",
        ]
        feature_frame = frame.drop(columns=non_feature_columns)
        self.raw_features: Dict[str, Sequence[Union[int, float]]] = (
            feature_frame.to_dict(orient="list")
        )

        # The experimental label is the only target; "Predicted value" is
        # dropped above and never used.
        experimental_labels = frame["Experimental value"].tolist()
        self.raw_targets: Dict[str, Sequence[int]] = {
            "mutagenicity": experimental_labels
        }

        # Inherited hook; presumably applies the feature/target selections
        # passed to the constructor -- confirm against CheLoDataset.
        self._apply_initial_selections()

    def get_dataset_info(self) -> Dict[str, Union[str, Sequence[str]]]:
        """
        Describe the dataset.

        :return: A dictionary with the keys ``"name"``, ``"description"``,
            ``"features"``, ``"targets"`` and ``"url"``.
        """
        summary = (
            "Dataset containing mutagenicity of various chemicals "
            "on Salmonella typhimurium (Ames test)."
        )
        info: Dict[str, Union[str, Sequence[str]]] = {}
        info["name"] = self.dataset_name
        info["description"] = summary
        info["features"] = self.list_features()
        info["targets"] = self.list_targets()
        info["url"] = self.dataset_url
        return info