Coverage for sparkle/structures/feature_dataframe.py: 98%
91 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-09-29 10:17 +0000
« prev ^ index » next coverage.py v7.10.7, created at 2025-09-29 10:17 +0000
1"""Module to manage feature data files and common operations on them."""
3from __future__ import annotations
4import pandas as pd
5import math
6from pathlib import Path
class FeatureDataFrame(pd.DataFrame):
    """Class to manage feature data CSV files and common operations on them.

    Rows are indexed by a three-level MultiIndex whose level names are given by
    ``multi_dim_names`` (FeatureGroup, FeatureName, Extractor). Every column
    holds the feature values of one instance.
    """

    # Placeholder stored in every cell that has not been computed yet.
    missing_value = math.nan
    # Names (and order) of the row MultiIndex levels.
    multi_dim_names = ["FeatureGroup", "FeatureName", "Extractor"]

    def __init__(
        self: FeatureDataFrame,
        csv_filepath: Path,
        instances: list[str] | None = None,
        extractor_data: dict[str, list[tuple[str, str]]] | None = None,
    ) -> None:
        """Initialise a FeatureDataFrame object.

        Arguments:
            csv_filepath: The Path for the CSV storage. If it exists, the data is
                loaded from it and the other arguments are ignored. If it does
                not exist, a new DataFrame is initialised and stored here.
            instances: The list of instances (Columns) to be added to the
                DataFrame. Defaults to no instances.
            extractor_data: A dictionary with extractor names as key, and a list
                of tuples ordered as [(feature_group, feature_name), ...] as
                value. Defaults to no extractors.
        """
        if csv_filepath.exists():
            # Initialise the dataframe from the existing file
            temp_df = pd.read_csv(
                csv_filepath, index_col=FeatureDataFrame.multi_dim_names
            )
            super().__init__(temp_df)
            self.csv_filepath = csv_filepath
        else:
            # None defaults avoid sharing one mutable default between calls
            instances = [] if instances is None else instances
            extractor_data = {} if extractor_data is None else extractor_data
            # Unfold the extractor_data into one list per index level
            multi_index_lists = [[], [], []]
            for extractor, extractor_features in extractor_data.items():
                for group, feature_name in extractor_features:
                    multi_index_lists[0].append(group)
                    multi_index_lists[1].append(feature_name)
                    multi_index_lists[2].append(extractor)
            # Initialise a new dataframe filled with the missing value
            multi_index = pd.MultiIndex.from_arrays(
                multi_index_lists, names=self.multi_dim_names
            )
            super().__init__(
                data=self.missing_value, index=multi_index, columns=instances
            )
            self.csv_filepath = csv_filepath
            # Persist the freshly created frame right away
            self.save_csv()

    def add_extractor(
        self: FeatureDataFrame,
        extractor: str,
        extractor_features: list[tuple[str, str]],
        values: list[list[float]] | None = None,
    ) -> None:
        """Add an extractor and its feature names to the dataframe.

        Arguments:
            extractor: Name of the extractor
            extractor_features: Tuples of [FeatureGroup, FeatureName]
            values: Initial values of the Extractor per instance in the dataframe,
                one entry per element of extractor_features.
                Defaults to FeatureDataFrame.missing_value.
        """
        if values is None:
            values = [self.missing_value] * len(extractor_features)
        # Assign one row per (group, feature) pair of this extractor
        for index, (feature_group, feature) in enumerate(extractor_features):
            self.loc[(feature_group, feature, extractor), :] = values[index]

    def add_instances(
        self: FeatureDataFrame,
        instance: str | list[str],
        values: list[float] | None = None,
    ) -> None:
        """Add one or more instances to the dataframe.

        Arguments:
            instance: Name, or list of names, of the instance column(s) to add.
            values: Values to assign to the new column(s).
                Defaults to FeatureDataFrame.missing_value.
        """
        if values is None:
            values = FeatureDataFrame.missing_value
        self[instance] = values

    def remove_extractor(self: FeatureDataFrame, extractor: str) -> None:
        """Remove an extractor (all of its feature rows) from the dataframe."""
        self.drop(extractor, axis=0, level="Extractor", inplace=True)

    def remove_instances(self: FeatureDataFrame, instances: str | list[str]) -> None:
        """Remove one or more instances (columns) from the dataframe."""
        self.drop(instances, axis=1, inplace=True)

    def get_feature_groups(
        self: FeatureDataFrame, extractor: str | list[str] | None = None
    ) -> list[str]:
        """Retrieve the feature groups in the dataframe.

        Args:
            extractor: Optional. If extractor(s) are given,
                yields only feature groups of that extractor.

        Returns:
            A list of the unique feature groups.
        """
        indices = self.index
        if extractor is not None:
            if isinstance(extractor, str):
                extractor = [extractor]
            # Level 2 of the MultiIndex is the Extractor level
            indices = indices[indices.isin(extractor, level=2)]
        return indices.get_level_values(level=0).unique().to_list()

    def get_value(
        self: FeatureDataFrame,
        instance: str,
        extractor: str,
        feature_group: str,
        feature_name: str,
    ) -> float:
        """Return the value of one feature of one extractor for an instance."""
        return self.loc[(feature_group, feature_name, extractor), instance]

    def set_value(
        self: FeatureDataFrame,
        instance: str,
        extractor: str,
        feature_group: str,
        feature_name: str,
        value: float,
    ) -> None:
        """Set the value of one feature of one extractor for an instance."""
        self.loc[(feature_group, feature_name, extractor), instance] = value

    def has_missing_vectors(self: FeatureDataFrame) -> bool:
        """Returns True if there are any Extractors still to be run on any instance.

        An extractor counts as "still to be run" on an instance when all of its
        feature values for that instance are missing.
        """
        for extractor in self.extractors:
            # The extractor's rows do not depend on the instance: select them once
            extractor_features = self.xs(extractor, level=2, drop_level=False)
            if extractor_features.isnull().all(axis=0).any():
                return True
        return False

    def remaining_jobs(self: FeatureDataFrame) -> list[tuple[str, str, str]]:
        """Determines needed feature computations per instance/extractor/group.

        Returns:
            list: A list of tuples representing (Instance, Extractor, Feature Group)
                that need to be computed, i.e. for which every feature value of
                that group/extractor is missing for the instance.
        """
        remaining_jobs = []
        for extractor in self.extractors:
            for group in self.get_feature_groups(extractor):
                subset = self.xs((group, extractor), level=(0, 2))
                # Per instance: are all features of this group still missing?
                all_missing = subset.isnull().all(axis=0)
                remaining_jobs.extend(
                    (instance, extractor, group)
                    for instance, missing in all_missing.items()
                    if missing
                )
        return remaining_jobs

    def get_instance(self: FeatureDataFrame, instance: str) -> list[float]:
        """Return the feature vector of an instance."""
        return self[instance].tolist()

    def impute_missing_values(self: FeatureDataFrame) -> None:
        """Imputes all NaN values by taking the average feature value.

        The average is the row mean, i.e. the mean of a feature over instances.
        """
        imputed_df = self.T.fillna(self.mean(axis=1)).T
        self[:] = imputed_df.values

    def has_missing_value(self: FeatureDataFrame) -> bool:
        """Return whether there are missing values in the feature data."""
        return bool(self.isnull().any().any())

    def reset_dataframe(self: FeatureDataFrame) -> None:
        """Resets all values to FeatureDataFrame.missing_value."""
        self.loc[:, :] = FeatureDataFrame.missing_value

    def sort(self: FeatureDataFrame) -> None:
        """Sorts the DataFrame by Multi-Index for readability."""
        self.sort_index(level=FeatureDataFrame.multi_dim_names, inplace=True)

    @property
    def instances(self: FeatureDataFrame) -> pd.Index:
        """Return the instances (the column labels) in the dataframe."""
        return self.columns

    @property
    def extractors(self: FeatureDataFrame) -> list[str]:
        """Returns all unique extractors in the DataFrame."""
        return self.index.get_level_values("Extractor").unique().to_list()

    @property
    def num_features(self: FeatureDataFrame) -> int:
        """Return the number of features (rows) in the dataframe."""
        return self.shape[0]

    @property
    def features(self: FeatureDataFrame) -> list[str]:
        """Return the unique feature names in the dataframe."""
        return self.index.get_level_values("FeatureName").unique().to_list()

    def save_csv(self: FeatureDataFrame, csv_filepath: Path | None = None) -> None:
        """Write a CSV to the given path.

        Args:
            csv_filepath: Path to the csv file. Defaults to self.csv_filepath.

        Raises:
            ValueError: If neither the argument nor self.csv_filepath is set.
        """
        csv_filepath = self.csv_filepath if csv_filepath is None else csv_filepath
        if csv_filepath is None:
            raise ValueError("Cannot save DataFrame: no `csv_filepath` was provided.")
        self.to_csv(csv_filepath)