Source code for chelo.base

from abc import ABC, abstractmethod
from typing import List, Dict, Any, Optional, Tuple
import numpy as np
import pandas as pd


[docs] class CheLoDataset(ABC): """ Abstract Base Class for datasets. """
[docs] def __init__( self, selected_features: Optional[List[str]] = None, selected_targets: Optional[List[str]] = None ) -> None: """ Initialize the dataset with optional selected features and targets. :param selected_features: List of features to select (default: all). :param selected_targets: List of targets to select (default: all). """ self.raw_features: Optional[Dict[str, List[Any]]] = None # Immutable raw feature data self.raw_targets: Optional[Dict[str, List[Any]]] = None # Immutable raw target data self.features: Optional[Dict[str, List[Any]]] = None # Subset of features to use self.targets: Optional[Dict[str, List[Any]]] = None # Subset of targets to use self.dataset_name: Optional[str] = None # Name of the dataset self._selected_features: Optional[List[str]] = selected_features self._selected_targets: Optional[List[str]] = selected_targets self._features_shape = None self._targets_shape = None # This variable is used when special pre-processing is needed # For example, when set to time-series, it transports the data to ensure consistency self._data_type = None
[docs] @abstractmethod def load_data(self) -> None: """ Load the dataset and populate self.raw_features and self.raw_targets. """ pass
[docs] @abstractmethod def get_dataset_info(self) -> Dict[str, Any]: """ Provide metadata about the dataset (e.g., source, size, description). """ pass
[docs] def get_features_shape(self): """ Returns the shape of the dataset's feature data. :return: Tuple representing the feature data shape. """ assert self._features_shape is not None, "Dataset shape is populated upon getting the dataset" return self._features_shape
[docs] def get_target_shape(self): """ Returns the shape of the dataset's target data. :return: Tuple representing the target data shape. """ assert self._targets_shape is not None, "Dataset shape is populated upon getting the dataset" return self._targets_shape
def __len__(self): """ Returns the number of samples in the dataset. :return: Integer representing the number of samples. """ assert self._features_shape is not None, "Dataset size is populated upon getting the dataset" return self._features_shape[0]
[docs] def select_features(self, feature_names: List[str]) -> None: """ Dynamically select features from the dataset. :param feature_names: List of feature names to select. """ if not self.raw_features: raise ValueError(f"Dataset {self.dataset_name} not loaded yet!") self.features = {name: self.raw_features[name] for name in feature_names} X = np.array(list(self.features.values())).T if self._data_type == 'timeseries': X = X.transpose(1, 0, 2) self._features_shape = X.shape
[docs] def select_targets(self, target_names: List[str]) -> None: """ Dynamically select targets from the dataset. :param target_names: List of target names to select. """ if not self.raw_targets: raise ValueError(f"Dataset {self.dataset_name} not loaded yet!") self.targets = {name: self.raw_targets[name] for name in target_names} # Set the dataset size y = np.array(list(self.targets.values())).T if self._data_type == 'timeseries': # Not sure if this is generally ok... if len(y.shape) == 3: y = y.transpose((1, 0, 2)) self._targets_shape = y.shape
[docs] def selected_features(self): """ Returns a list of feature names selected. :return: List of feature names. """ return self._selected_features
[docs] def selected_targets(self): """ Returns a list of target names selected. :return: List of target names. """ return self._selected_targets
def _apply_initial_selections(self) -> None: """ Apply initial selections if specified during initialization. """ if self._selected_features: self.select_features(self._selected_features) else: self.features = self.raw_features if self._selected_targets: self.select_targets(self._selected_targets) else: self.targets = self.raw_targets
[docs] def size(self) -> int: """ Get the size of the dataset (number of samples). """ if not self.features: raise ValueError("Features are not loaded.") return len(next(iter(self.features.values())))
[docs] def statistics(self) -> Dict[str, Dict[str, float]]: """ Compute basic statistics for the features and targets. :return: A dictionary of statistics (mean, std, min, max) for each feature and target. """ stats = {} if not self.features or not self.targets: raise ValueError("Dataset is not loaded or selections are not applied.") for key, values in {**self.features, **self.targets}.items(): stats[key] = { "mean": np.mean(values), "std": np.std(values), "min": np.min(values), "max": np.max(values), } return stats
[docs] def to_numpy(self) -> Tuple[np.ndarray, np.ndarray]: """ Convert the dataset to numpy arrays. :return: Tuple of (features, targets) in numpy format. """ if not self.features or not self.targets: raise ValueError("Dataset is not loaded or selections are not applied.") X = np.array(list(self.features.values())).T y = np.array(list(self.targets.values())).T if self._data_type == 'timeseries': X = X.transpose(1, 0, 2) # Not sure if this is generally ok... if len(y.shape) == 3: y = y.transpose((1, 0, 2)) return (X, y)
[docs] def to_pandas(self) -> pd.DataFrame: """ Convert the dataset to a single pandas DataFrame with both features and targets. :return: A pandas DataFrame containing both features and targets. """ if not self.features or not self.targets: raise ValueError("Dataset is not loaded or selections are not applied.") # Convert features and targets to DataFrames features_df = pd.DataFrame(self.features, columns=self._selected_features) targets_df = pd.DataFrame(self.targets, columns=self._selected_targets) # Combine features and targets into a single DataFrame combined_df = pd.concat([features_df, targets_df], axis=1) return combined_df
[docs] def to_pytorch(self): """ Provide a PyTorch Dataset object. :return: A PyTorch Dataset containing features and targets. """ from torch.utils.data import Dataset import torch class PyTorchDataset(Dataset): def __init__(self, features: np.ndarray, targets: np.ndarray) -> None: self.features = torch.tensor(features, dtype=torch.float32) if issubclass(targets.dtype.type, np.floating): self.targets = torch.tensor(targets, dtype=torch.float32) elif issubclass(targets.dtype.type, np.integer): self.targets = torch.tensor(targets, dtype=torch.int64) else: assert False, "targets are neither float nor int" def __len__(self) -> int: return len(self.features) def __getitem__(self, idx: int) -> Tuple[np.ndarray, np.ndarray]: return self.features[idx], self.targets[idx] features, targets = self.to_numpy() return PyTorchDataset(features, targets)
[docs] def preview(self, n: int = 5) -> Dict[str, Dict[str, List[Any]]]: """ Preview the first n rows of the dataset. :return: A dictionary with the first few rows. """ if not self.features or not self.targets: raise ValueError("Dataset is not loaded or selections are not applied.") preview_data = { "features": {key: values[:n] for key, values in self.features.items()}, "targets": {key: values[:n] for key, values in self.targets.items()}, } print(preview_data) return preview_data
[docs] def list_features(self) -> List[str]: """Return the list of available features.""" return list(self.raw_features.keys())
[docs] def list_targets(self) -> List[str]: """Return the list of available targets.""" return list(self.raw_targets.keys())