Source code for chelo.datasets.coal_fired_plant

from typing import List, Dict, Optional, Union
from ..base import CheLoDataset
from ..registry import register_dataset
from ..utils.kaggle_downloader import KaggleDatasetDownloader
import pandas as pd


@register_dataset
class CoalFiredPlantDataset(CheLoDataset):
    """
    Dataset class for Coal Fired Power Plant Thermal Performance.

    Provides utilities to load, process, and interact with the dataset.
    The data file is fetched through :class:`KaggleDatasetDownloader` and
    read with :func:`pandas.read_excel`.
    """

    # Kaggle identifier of the dataset ("<owner>/<dataset-name>").
    _DATASET_SLUG: str = "ainalirham/coal-fired-power-plant-thermal-performance-dataset"
    # Files to fetch from the dataset; only the first one is parsed by load_data().
    _FILES: List[str] = ['dataset_combined_final.xlsm']
    # Checksums passed to the downloader, paired positionally with _FILES.
    _CHECKSUMS: List[str] = ["a275decc678749e39c08a9a44a48fc52"]
    # Columns excluded from both features and targets.
    # NOTE(review): presumably a date column ("Tanggal") and a serialized
    # index column ("Unnamed: 0") -- confirm against the spreadsheet.
    _NON_DATA_COLUMNS: List[str] = ["Tanggal", "Unnamed: 0"]

    def __init__(
        self,
        selected_features: Optional[List[str]] = None,
        selected_targets: Optional[List[str]] = None,
    ) -> None:
        """
        Initialize the Coal Fired Power Plant Thermal Performance Dataset.

        :param selected_features: List of features to select (default: all
            columns that are not selected as targets).
        :param selected_targets: List of targets to select
            (default: ``["Boiler Eff (%)"]``).
        """
        super().__init__(selected_features, selected_targets)
        self.dataset_name: str = "Coal Fired Power Plant Thermal Performance Dataset"
        self.dataset_url: str = ("https://www.kaggle.com/datasets/ainalirham/"
                                 "coal-fired-power-plant-thermal-performance-dataset")

    def load_data(self) -> None:
        """
        Load the dataset through the Kaggle downloader and preprocess it.

        Requests every file in ``_FILES`` from the downloader (with its
        checksum), reads the first file as an Excel workbook, drops rows that
        contain missing values, and initializes the raw feature/target
        dictionaries and the default selections.

        The list passed as ``selected_features`` to the constructor is never
        modified in place; a filtered copy is stored instead.
        """
        downloader: KaggleDatasetDownloader = KaggleDatasetDownloader()

        # Download and validate dataset files
        for file_name, checksum in zip(self._FILES, self._CHECKSUMS):
            downloader.download_dataset(self._DATASET_SLUG, file_name, checksum)

        # Load dataset from the downloaded file.
        # NOTE(review): this relies on a private downloader method; switch to
        # a public accessor if KaggleDatasetDownloader offers one.
        file_path: str = downloader._get_file_path(self._DATASET_SLUG, self._FILES[0])
        data: pd.DataFrame = pd.read_excel(file_path)

        # Rows are dropped *before* the non-data columns are removed, so a
        # missing value in any column (including those) discards the row.
        data = data.dropna()

        # Every remaining column is available both as a feature and as a
        # target. The frame is trimmed once; two separate to_dict() calls keep
        # the two dictionaries independent of each other.
        columns: pd.DataFrame = data.drop(columns=self._NON_DATA_COLUMNS)
        self.raw_features: Dict[str, List[Union[int, float, str]]] = columns.to_dict(
            orient="list"
        )
        self.raw_targets: Dict[str, List[Union[int, float, str]]] = columns.to_dict(
            orient="list"
        )

        # Set default features and targets if none are provided
        if self._selected_targets is None:
            self._selected_targets = ["Boiler Eff (%)"]
        if self._selected_features is None:
            self._selected_features = list(self.raw_features.keys())

        # A column must not be both a feature and a target. Build a new list
        # rather than calling .remove() so that a list supplied by the caller
        # is left untouched and every occurrence of a target name is excluded.
        target_names = set(self._selected_targets)
        self._selected_features = [
            feature
            for feature in self._selected_features
            if feature not in target_names
        ]

        self._apply_initial_selections()

    def get_dataset_info(self) -> Dict[str, Union[str, List[str]]]:
        """
        Get metadata about the dataset.

        :return: A dictionary with the keys ``"name"``, ``"description"``,
            ``"features"``, ``"targets"`` and ``"url"``.
        """
        return {
            "name": self.dataset_name,
            "description": (
                "Dataset containing thermal performance attributes of coal-fired "
                "power plants, including features like boiler efficiency."
            ),
            "features": self.list_features(),
            "targets": self.list_targets(),
            "url": self.dataset_url
        }