Source code for chelo.datasets.wine_quality

from typing import List, Dict, Optional, Union
from ..base import CheLoDataset
from ..registry import register_dataset
from ..utils.downloader import DatasetDownloader
import pandas as pd


@register_dataset
class WineQualityDataset(CheLoDataset):
    """
    Wine Quality dataset (red or white variant) wrapped as a CheLo dataset.

    Every column of the downloaded CSV except ``quality`` is exposed as a
    feature; ``quality`` is the single target.
    """

    # Directory on the UCI archive that hosts both CSV files.
    _BASE_URL: str = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/"
    # Wine type -> CSV file name below ``_BASE_URL``.
    _FILES: Dict[str, str] = {
        "red": "winequality-red.csv",
        "white": "winequality-white.csv",
    }
    # Wine type -> checksum handed to the downloader.
    # NOTE(review): 32 hex digits, presumably MD5 -- confirm against DatasetDownloader.
    _CHECKSUMS: Dict[str, str] = {
        "red": "2daeecee174368f8a33b82c8cccae3a5",
        "white": "5d9ff0f7f716dace19e3ab4578775fd7",
    }

    def __init__(
        self,
        wine_type: str = "red",
        selected_features: Optional[List[str]] = None,
        selected_targets: Optional[List[str]] = None,
    ) -> None:
        """
        Set up the dataset for one wine variant.

        No data is downloaded here; that happens in :meth:`load_data`.

        :param wine_type: Which variant to use; must be a key of ``_FILES``
            ('red' or 'white').
        :param selected_features: Features to select (default: all).
        :param selected_targets: Targets to select (default: all).
        :raises ValueError: If ``wine_type`` is not a known variant.
        """
        super().__init__(selected_features, selected_targets)

        is_known_type = wine_type in self._FILES
        if not is_known_type:
            raise ValueError(f"Invalid wine_type '{wine_type}'. Must be 'red' or 'white'.")

        self.wine_type: str = wine_type
        self.dataset_name: str = f"Wine Quality ({wine_type.capitalize()})"
        self.dataset_url: str = "https://archive.ics.uci.edu/dataset/186/wine+quality"

    def load_data(self) -> None:
        """
        Fetch the CSV for the configured wine type and populate the raw data.

        The file is obtained through ``DatasetDownloader.download`` (which is
        expected to take care of caching and checksum verification), parsed as
        a semicolon-separated CSV, and split into ``raw_features`` (all columns
        but ``quality``) and ``raw_targets`` (the ``quality`` column).
        """
        fetcher = DatasetDownloader()
        csv_name = self._FILES[self.wine_type]
        local_csv = fetcher.download(
            self._BASE_URL + csv_name,
            dataset_name="wine_quality",
            filename=csv_name,
            checksum=self._CHECKSUMS[self.wine_type],
        )

        # The UCI files use ';' rather than ',' as the field delimiter.
        frame = pd.read_csv(local_csv, sep=";")

        target_column = "quality"
        feature_frame = frame.drop(columns=[target_column])
        self.raw_features: Dict[str, List[Union[int, float]]] = feature_frame.to_dict(
            orient="list"
        )
        self.raw_targets: Dict[str, List[int]] = {
            target_column: frame[target_column].tolist()
        }

        # Presumably narrows the raw data to the selections given at
        # construction time; implemented outside this class.
        self._apply_initial_selections()

    def get_dataset_info(self) -> Dict[str, Union[str, List[str]]]:
        """
        Describe the dataset.

        :return: A dictionary with the dataset name, a short description, the
            wine type, the current feature and target names, and the dataset
            home page URL.
        """
        info: Dict[str, Union[str, List[str]]] = {
            "name": self.dataset_name,
            "description": "Dataset containing physicochemical attributes and quality ratings of wines.",
            "wine_type": self.wine_type,
            "features": self.list_features(),
            "targets": self.list_targets(),
            "url": self.dataset_url,
        }
        return info