# Source code for chelo.datasets.coal_fired_plant

from typing import List, Dict, Optional, Union
from ..base import CheLoDataset
from ..registry import register_dataset
from ..utils.kaggle_downloader import KaggleDatasetDownloader
import pandas as pd



@register_dataset
class CoalFiredPlantDataset(CheLoDataset):
    """
    Dataset class for Coal Fired Power Plant Thermal Performance.

    Fetches the workbook listed in ``_FILES`` through
    ``KaggleDatasetDownloader``, reads it with pandas, and exposes every
    remaining column both as a candidate feature and as a candidate target.
    Which columns actually play which role is decided by the selections
    passed to ``__init__`` (or by the defaults chosen in ``load_data``).
    """

    # Kaggle dataset identifier handed to the downloader.
    _DATASET_SLUG: str = "ainalirham/coal-fired-power-plant-thermal-performance-dataset"
    # Files requested from the dataset; only the first entry is read by load_data.
    _FILES: List[str] = ['dataset_combined_final.xlsm']
    # Expected checksum for each entry of _FILES, paired by position.
    _CHECKSUMS: List[str] = ["a275decc678749e39c08a9a44a48fc52"]

    def __init__(
        self,
        selected_features: Optional[List[str]] = None,
        selected_targets: Optional[List[str]] = None,
    ) -> None:
        """
        Initialize the Coal Fired Power Plant Thermal Performance Dataset.

        :param selected_features: List of features to select. When ``None``,
            ``load_data`` selects every loaded column that is not a target.
        :param selected_targets: List of targets to select. When ``None``,
            ``load_data`` selects ``"Boiler Eff (%)"``.
        """
        super().__init__(selected_features, selected_targets)
        self.dataset_name: str = "Coal Fired Power Plant Thermal Performance Dataset"
        self.dataset_url: str = ("https://www.kaggle.com/datasets/ainalirham/"
                                 "coal-fired-power-plant-thermal-performance-dataset")

    def load_data(self) -> None:
        """
        Fetch the dataset files, read the workbook, and preprocess it.

        Every file in ``_FILES`` is requested from the downloader together
        with its checksum. The first file is then read with
        ``pandas.read_excel``, rows containing missing values are dropped,
        and the ``"Tanggal"`` and ``"Unnamed: 0"`` columns are removed. The
        remaining columns populate both ``raw_features`` and ``raw_targets``
        before the initial feature/target selections are applied.

        :raises KeyError: If the workbook lacks the ``"Tanggal"`` or
            ``"Unnamed: 0"`` column (raised by ``DataFrame.drop``).
        """
        downloader: KaggleDatasetDownloader = KaggleDatasetDownloader()

        # Download and validate dataset files
        for file_name, checksum in zip(self._FILES, self._CHECKSUMS):
            downloader.download_dataset(self._DATASET_SLUG, file_name, checksum)

        # NOTE(review): relies on a private downloader method; prefer a public
        # accessor if KaggleDatasetDownloader offers one.
        file_path: str = downloader._get_file_path(self._DATASET_SLUG, self._FILES[0])
        data: pd.DataFrame = pd.read_excel(file_path).dropna()

        # Strip the non-measurement columns once; features and targets are
        # both drawn from the same remaining columns.
        measurements: pd.DataFrame = data.drop(columns=["Tanggal", "Unnamed: 0"])

        # Two separate conversions so the feature and target mappings never
        # share dict or list objects.
        self.raw_features: Dict[str, List[Union[int, float, str]]] = (
            measurements.to_dict(orient="list")
        )
        self.raw_targets: Dict[str, List[Union[int, float, str]]] = (
            measurements.to_dict(orient="list")
        )

        # Set default features and targets if none are provided
        if self._selected_targets is None:
            self._selected_targets = ["Boiler Eff (%)"]

        if self._selected_features is None:
            # Every loaded column that is not a target, in file order.
            self._selected_features = [
                column
                for column in self.raw_features
                if column not in self._selected_targets
            ]

        self._apply_initial_selections()

    def get_dataset_info(self) -> Dict[str, Union[str, List[str]]]:
        """
        Get metadata about the dataset.

        :return: A dictionary with the keys ``"name"``, ``"description"``,
                 ``"features"``, ``"targets"`` and ``"url"``.
        """
        return {
            "name": self.dataset_name,
            "description": (
                "Dataset containing thermal performance attributes of coal-fired "
                "power plants, including features like boiler efficiency."
            ),
            "features": self.list_features(),
            "targets": self.list_targets(),
            "url": self.dataset_url,
        }