Source code for sparkle.structures.feature_dataframe

"""Module to manage feature data files and common operations on them."""
from __future__ import annotations
import pandas as pd
import math
from pathlib import Path


[docs] class FeatureDataFrame: """Class to manage feature data CSV files and common operations on them.""" missing_value = math.nan multi_dim_names = ["FeatureGroup", "FeatureName", "Extractor"] def __init__(self: FeatureDataFrame, csv_filepath: Path, instances: list[str] = [], extractor_data: dict[str, list[tuple[str, str]]] = {} ) -> None: """Initialise a FeatureDataFrame object. Arguments: csv_filepath: The Path for the CSV storage. If it does not exist, a new DataFrame will be initialised and stored here. instances: The list of instances (Columns) to be added to the DataFrame. extractor_data: A dictionary with extractor names as key, and a list of tuples ordered as [(feature_group, feature_name), ...] as value. """ self.csv_filepath = csv_filepath if self.csv_filepath.exists(): # Read from file self.dataframe = pd.read_csv(self.csv_filepath, index_col=FeatureDataFrame.multi_dim_names) return # Unfold the extractor_data into lists multi_index_lists = [[], [], []] for extractor in extractor_data: for group, feature_name in extractor_data[extractor]: multi_index_lists[0].append(group) multi_index_lists[1].append(feature_name) multi_index_lists[2].append(extractor) # Initialise new dataframe self.dataframe = pd.DataFrame(FeatureDataFrame.missing_value, index=multi_index_lists, columns=instances) self.dataframe.index.names = FeatureDataFrame.multi_dim_names self.save_csv()
[docs] def add_extractor(self: FeatureDataFrame, extractor: str, extractor_features: list[tuple[str, str]], values: list[list[float]] = None) -> None: """Add an extractor and its feature names to the dataframe. Arguments: extractor: Name of the extractor extractor_features: Tuples of [FeatureGroup, FeatureName] values: Initial values of the Extractor per instance in the dataframe. Defaults to FeatureDataFrame.missing_value. """ if values is None: values = [FeatureDataFrame.missing_value for _ in range(len(extractor_features))] # Unfold to indices to lists for index, pair in enumerate(extractor_features): feature_group, feature = pair self.dataframe.loc[(feature_group, feature, extractor), :] = values[index]
[docs] def add_instances(self: FeatureDataFrame, instance: str | list[str], values: list[float] = None) -> None: """Add one or more instances to the dataframe.""" if values is None: values = FeatureDataFrame.missing_value self.dataframe[instance] = values
[docs] def remove_extractor(self: FeatureDataFrame, extractor: str) -> None: """Remove an extractor from the dataframe.""" self.dataframe.drop(extractor, axis=0, level="Extractor", inplace=True)
[docs] def remove_instances(self: FeatureDataFrame, instances: str | list[str]) -> None: """Remove an instance from the dataframe.""" self.dataframe.drop(instances, axis=1, inplace=True)
[docs] def get_feature_groups(self: FeatureDataFrame, extractor: str | list[str] = None) -> list[str]: """Retrieve the feature groups in the dataframe. Args: extractor: Optional. If extractor(s) are given, yields only feature groups of that extractor. Returns: A list of feature groups. """ indices = self.dataframe.index if extractor is not None: if isinstance(extractor, str): extractor = [extractor] indices = indices[indices.isin(extractor, level=2)] return indices.get_level_values(level=0).unique().to_list()
[docs] def get_value(self: FeatureDataFrame, instance: str, extractor: str, feature_group: str, feature_name: str) -> None: """Return a value in the dataframe.""" return self.dataframe.loc[(feature_group, feature_name, extractor), instance]
[docs] def set_value(self: FeatureDataFrame, instance: str, extractor: str, feature_group: str, feature_name: str, value: float) -> None: """Set a value in the dataframe.""" self.dataframe.loc[(feature_group, feature_name, extractor), instance] = value
[docs] def has_missing_vectors(self: FeatureDataFrame) -> bool: """Returns True if there are any Extractors still to be run on any instance.""" for instance in self.dataframe.columns: for extractor in self.extractors: extractor_features = self.dataframe.xs(extractor, level=2, drop_level=False) if extractor_features.loc[:, instance].isnull().all(): return True return False
[docs] def remaining_jobs(self: FeatureDataFrame) -> list[tuple[str, str, str]]: """Determines needed feature computations per instance/extractor/group. Returns: list: A list of tuples representing (Extractor, Instance, Feature Group). that needs to be computed. """ remaining_jobs = [] for extractor in self.extractors: for group in self.get_feature_groups(extractor): subset = self.dataframe.xs((group, extractor), level=(0, 2)) for instance in self.dataframe.columns: if subset.loc[:, instance].isnull().all(): remaining_jobs.append((instance, extractor, group)) return remaining_jobs
[docs] def get_instance(self: FeatureDataFrame, instance: str) -> list[float]: """Return the feature vector of an instance.""" return self.dataframe[instance].tolist()
[docs] def impute_missing_values(self: FeatureDataFrame) -> None: """Imputes all NaN values by taking the average feature value.""" self.dataframe = self.dataframe.T.fillna(self.dataframe.mean(axis=1)).T
[docs] def has_missing_value(self: FeatureDataFrame) -> bool: """Return whether there are missing values in the feature data.""" return self.dataframe.isnull().any().any()
[docs] def reset_dataframe(self: FeatureDataFrame) -> bool: """Resets all values to FeatureDataFrame.missing_value.""" self.dataframe.loc[:, :] = FeatureDataFrame.missing_value
[docs] def sort(self: FeatureDataFrame) -> None: """Sorts the DataFrame by Multi-Index for readability.""" self.dataframe.sort_index(level=FeatureDataFrame.multi_dim_names)
@property def instances(self: FeatureDataFrame) -> list[str]: """Return the instances in the dataframe.""" return self.dataframe.columns @property def extractors(self: FeatureDataFrame) -> list[str]: """Returns all unique extractors in the DataFrame.""" return self.dataframe.index.get_level_values("Extractor").unique().to_list() @property def num_features(self: FeatureDataFrame) -> int: """Return the number of features in the dataframe.""" return self.dataframe.shape[0]
[docs] def save_csv(self: FeatureDataFrame, csv_filepath: Path = None) -> None: """Write a CSV to the given path. Args: csv_filepath: String path to the csv file. Defaults to self.csv_filepath. """ csv_filepath = self.csv_filepath if csv_filepath is None else csv_filepath self.dataframe.to_csv(csv_filepath)
[docs] def to_autofolio(self: FeatureDataFrame, target: Path = None) -> Path: """Port the data to a format acceptable for AutoFolio.""" autofolio_df = self.dataframe.copy() autofolio_df.index = autofolio_df.index.map("_".join) # Reduce Multi-Index autofolio_df = autofolio_df.T # Autofolio has feature columns and instance rows if target is None: path = self.csv_filepath.parent / f"autofolio_{self.csv_filepath.name}" else: path = target / f"autofolio_{self.csv_filepath.name}" autofolio_df.to_csv(path) return path