Source code for chelo.datasets.cstr_dataset

from typing import Sequence, Dict, Optional, Union, List
from ..base import CheLoDataset
from ..registry import register_dataset
from ..utils.downloader import DatasetDownloader
import pandas as pd
import numpy as np


@register_dataset
class CSTRDataset(CheLoDataset):
    """
    Time-series dataset of species concentrations in a continuous
    stirred-tank reactor (CSTR).

    Every column of the source file is exposed both as a feature (a sliding
    window of ``window_size`` consecutive samples) and as a target (the sample
    that immediately follows that window).
    """

    _URL: str = "https://raw.githubusercontent.com/edgarsmdn/MLCE_book/main/references/CSTR_ODE_data.txt"
    _FILE_NAME: str = "CSTR_ODE_data.txt"
    _CHECKSUM: str = "757f3928146122c37efe3fa1bd67a5db"

    def __init__(
        self,
        selected_features: Optional[Sequence[str]] = None,
        selected_targets: Optional[Sequence[str]] = None,
        window: Optional[int] = None,
    ) -> None:
        """
        Initialize the CSTR Dataset.

        The dataset contains the concentrations of three species (A, B, and X)
        over time. The inlet concentrations are fixed.

        :param selected_features: Features to select (default: all features).
        :param selected_targets: Targets to select (default: all targets).
        :param window: Number of previous time-steps to include in each
            feature (default: 1). Must be at least 1.
        :raises ValueError: If ``window`` is given and is smaller than 1.
        """
        window_size = 1 if window is None else window
        # A non-positive window would silently yield empty or ragged feature
        # windows and misaligned targets in load_data(), so reject it here.
        if window_size < 1:
            raise ValueError(
                f"window must be a positive integer, got {window!r}"
            )

        super().__init__(selected_features, selected_targets)
        self.dataset_name: str = "CSTR Dataset"
        self.dataset_url: str = "https://edgarsmdn.github.io/MLCE_book/05_Hybrid_CSTR.html"
        self.window_size: int = window_size
        self._data_type: str = "timeseries"

    @staticmethod
    def _sliding_windows(values: np.ndarray, window: int) -> np.ndarray:
        """
        Stack every length-``window`` slice of ``values`` that is followed by
        at least one more sample.

        :param values: One-dimensional array of samples.
        :param window: Number of consecutive samples per slice.
        :return: Array whose row ``i`` is ``values[i:i + window]``; there are
            ``len(values) - window`` rows.
        """
        return np.array(
            [values[start:start + window] for start in range(len(values) - window)]
        )

    def load_data(self) -> None:
        """
        Load the CSTRDataset dataset.

        Obtains the data file through ``DatasetDownloader``, parses it as
        semicolon-separated values, drops rows containing missing values and
        fills ``raw_features`` / ``raw_targets``.

        For every column, feature row ``i`` holds samples ``i`` to
        ``i + window_size - 1`` and target row ``i`` holds sample
        ``i + window_size``, so features and targets have the same length.
        """
        downloader: DatasetDownloader = DatasetDownloader()
        file_path: str = downloader.download(
            self._URL,
            dataset_name="cstr",
            filename=self._FILE_NAME,
            checksum=self._CHECKSUM,
        )

        data: pd.DataFrame = pd.read_csv(file_path, sep=";")
        data = data.dropna()

        # Targets start after the first full window so that each target is the
        # sample immediately following its feature window.
        self.raw_targets: Dict[str, List[Union[int, float]]] = data.iloc[
            self.window_size:, :
        ].to_dict(orient="list")

        # Features are stored as 2-D arrays (one window per row), not lists.
        self.raw_features: Dict[str, np.ndarray] = {
            column: self._sliding_windows(np.asarray(data[column]), self.window_size)
            for column in data.columns
        }

        # Inherited hook; not defined in this class.
        self._apply_initial_selections()

    def get_dataset_info(self) -> Dict[str, Union[str, Sequence[str]]]:
        """
        Retrieve metadata about the dataset.

        :return: A dictionary containing dataset metadata with the keys
            ``name``, ``description``, ``features``, ``targets`` and ``url``.
        """
        return {
            "name": self.dataset_name,
            "description": (
                "Dataset containing concentrations of three species (A, B, and X) "
                "in a continuous stirred-tank reactor (CSTR) over time."
            ),
            "features": self.list_features(),
            "targets": self.list_targets(),
            "url": self.dataset_url,
        }