Coverage for sparkle/structures/feature_dataframe.py: 98%

91 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-09-29 10:17 +0000

1"""Module to manage feature data files and common operations on them.""" 

2 

3from __future__ import annotations 

4import pandas as pd 

5import math 

6from pathlib import Path 

7 

8 

class FeatureDataFrame(pd.DataFrame):
    """Class to manage feature data CSV files and common operations on them.

    Rows are indexed by a three-level MultiIndex named after
    ``multi_dim_names`` (FeatureGroup, FeatureName, Extractor); every column
    holds the feature values of one instance.
    """

    # Placeholder written to every cell that has not been computed yet.
    missing_value = math.nan
    # Names (and order) of the MultiIndex levels used for the rows.
    multi_dim_names = ["FeatureGroup", "FeatureName", "Extractor"]

    def __init__(
        self: FeatureDataFrame,
        csv_filepath: Path,
        instances: list[str] | None = None,
        extractor_data: dict[str, list[tuple[str, str]]] | None = None,
    ) -> None:
        """Initialise a FeatureDataFrame object.

        Arguments:
            csv_filepath: The Path for the CSV storage. If it does not exist,
                a new DataFrame will be initialised and stored here.
            instances: The list of instances (Columns) to be added to the DataFrame.
                Only used when a new DataFrame is created. Defaults to no instances.
            extractor_data: A dictionary with extractor names as key, and a list of
                tuples ordered as [(feature_group, feature_name), ...] as value.
                Only used when a new DataFrame is created. Defaults to no extractors.
        """
        # None sentinels instead of mutable default arguments, which would be
        # shared between all calls.
        instances = [] if instances is None else instances
        extractor_data = {} if extractor_data is None else extractor_data

        # Initialize a dataframe from an existing file
        if csv_filepath.exists():
            temp_df = pd.read_csv(
                csv_filepath, index_col=FeatureDataFrame.multi_dim_names
            )
            super().__init__(temp_df)
            # Attribute is set after the pandas initialiser has run.
            self.csv_filepath = csv_filepath
        # Create a new dataframe
        else:
            # Unfold the extractor_data into one list per index level
            groups, feature_names, extractors = [], [], []
            for extractor, features in extractor_data.items():
                for group, feature_name in features:
                    groups.append(group)
                    feature_names.append(feature_name)
                    extractors.append(extractor)
            multi_index = pd.MultiIndex.from_arrays(
                [groups, feature_names, extractors], names=self.multi_dim_names
            )
            # Every cell starts out as missing_value
            super().__init__(
                data=self.missing_value, index=multi_index, columns=instances
            )
            self.csv_filepath = csv_filepath
            # Persist the freshly created frame right away
            self.save_csv()

    def add_extractor(
        self: FeatureDataFrame,
        extractor: str,
        extractor_features: list[tuple[str, str]],
        values: list[list[float]] | None = None,
    ) -> None:
        """Add an extractor and its feature names to the dataframe.

        Arguments:
            extractor: Name of the extractor
            extractor_features: Tuples of [FeatureGroup, FeatureName]
            values: Initial values of the Extractor per instance in the dataframe,
                one entry per element of extractor_features.
                Defaults to FeatureDataFrame.missing_value.
        """
        if values is None:
            values = [self.missing_value] * len(extractor_features)
        # One row per (feature group, feature name) pair of this extractor
        for index, (feature_group, feature) in enumerate(extractor_features):
            self.loc[(feature_group, feature, extractor), :] = values[index]

    def add_instances(
        self: FeatureDataFrame,
        instance: str | list[str],
        values: list[float] | None = None,
    ) -> None:
        """Add one or more instances to the dataframe.

        Arguments:
            instance: Name or list of names of the instance column(s) to assign.
            values: Values to assign to the column(s).
                Defaults to FeatureDataFrame.missing_value.
        """
        if values is None:
            values = FeatureDataFrame.missing_value
        self[instance] = values

    def remove_extractor(self: FeatureDataFrame, extractor: str) -> None:
        """Remove an extractor (all rows with this Extractor level value), in place."""
        self.drop(extractor, axis=0, level="Extractor", inplace=True)

    def remove_instances(self: FeatureDataFrame, instances: str | list[str]) -> None:
        """Remove one or more instances (columns) from the dataframe, in place."""
        self.drop(instances, axis=1, inplace=True)

    def get_feature_groups(
        self: FeatureDataFrame, extractor: str | list[str] | None = None
    ) -> list[str]:
        """Retrieve the feature groups in the dataframe.

        Args:
            extractor: Optional. If extractor(s) are given,
                yields only feature groups of that extractor.

        Returns:
            A list of unique feature groups.
        """
        indices = self.index
        if extractor is not None:
            if isinstance(extractor, str):
                extractor = [extractor]
            # Level 2 is the Extractor level of the row MultiIndex
            indices = indices[indices.isin(extractor, level=2)]
        # Level 0 is the FeatureGroup level of the row MultiIndex
        return indices.get_level_values(level=0).unique().to_list()

    def get_value(
        self: FeatureDataFrame,
        instance: str,
        extractor: str,
        feature_group: str,
        feature_name: str,
    ) -> float:
        """Return the value stored for a feature of an extractor on an instance."""
        return self.loc[(feature_group, feature_name, extractor), instance]

    def set_value(
        self: FeatureDataFrame,
        instance: str,
        extractor: str,
        feature_group: str,
        feature_name: str,
        value: float,
    ) -> None:
        """Set the value for a feature of an extractor on an instance."""
        self.loc[(feature_group, feature_name, extractor), instance] = value

    def has_missing_vectors(self: FeatureDataFrame) -> bool:
        """Returns True if there are any Extractors still to be run on any instance.

        An extractor counts as "still to be run" on an instance when all of its
        feature values for that instance are null.
        """
        for extractor in self.extractors:
            # The slice only depends on the extractor, so compute it once
            # instead of once per instance.
            extractor_features = self.xs(extractor, level=2, drop_level=False)
            if extractor_features.isnull().all(axis=0).any():
                return True
        return False

    def remaining_jobs(self: FeatureDataFrame) -> list[tuple[str, str, str]]:
        """Determines needed feature computations per instance/extractor/group.

        A job is needed when all values of a feature group of an extractor are
        null for an instance.

        Returns:
            list: A list of tuples representing (Instance, Extractor, Feature Group)
                that need to be computed.
        """
        remaining_jobs = []
        for extractor in self.extractors:
            for group in self.get_feature_groups(extractor):
                # Rows of this (FeatureGroup, Extractor) combination
                subset = self.xs((group, extractor), level=(0, 2))
                # Per instance column: are all values of the group missing?
                all_missing = subset.isnull().all(axis=0)
                remaining_jobs.extend(
                    (instance, extractor, group)
                    for instance, missing in all_missing.items()
                    if missing
                )
        return remaining_jobs

    def get_instance(self: FeatureDataFrame, instance: str) -> list[float]:
        """Return the feature vector (column values) of an instance."""
        return self[instance].tolist()

    def impute_missing_values(self: FeatureDataFrame) -> None:
        """Imputes all NaN values by taking the average feature value.

        The average is taken per feature (row) over all instances.
        """
        # fillna matches the Series on column labels, hence the transposes.
        imputed_df = self.T.fillna(self.mean(axis=1)).T
        self[:] = imputed_df.values

    def has_missing_value(self: FeatureDataFrame) -> bool:
        """Return whether there are missing values in the feature data."""
        # Coerce to a plain bool to match the annotated return type.
        return bool(self.isnull().any().any())

    def reset_dataframe(self: FeatureDataFrame) -> None:
        """Resets all values to FeatureDataFrame.missing_value."""
        self.loc[:, :] = FeatureDataFrame.missing_value

    def sort(self: FeatureDataFrame) -> None:
        """Sorts the DataFrame by Multi-Index for readability."""
        self.sort_index(level=FeatureDataFrame.multi_dim_names, inplace=True)

    @property
    def instances(self: FeatureDataFrame) -> pd.Index:
        """Return the instances (column labels) in the dataframe."""
        return self.columns

    @property
    def extractors(self: FeatureDataFrame) -> list[str]:
        """Returns all unique extractors in the DataFrame."""
        return self.index.get_level_values("Extractor").unique().to_list()

    @property
    def num_features(self: FeatureDataFrame) -> int:
        """Return the number of features (rows) in the dataframe."""
        return self.shape[0]

    @property
    def features(self: FeatureDataFrame) -> list[str]:
        """Return the unique feature names in the dataframe."""
        return self.index.get_level_values("FeatureName").unique().to_list()

    def save_csv(self: FeatureDataFrame, csv_filepath: Path | None = None) -> None:
        """Write a CSV to the given path.

        Args:
            csv_filepath: Path to the csv file. Defaults to self.csv_filepath.

        Raises:
            ValueError: If no path is given and none is stored on the object.
        """
        if csv_filepath is None:
            # getattr with a default so that a missing attribute reaches the
            # explicit ValueError below instead of an AttributeError.
            csv_filepath = getattr(self, "csv_filepath", None)
        if csv_filepath is None:
            raise ValueError("Cannot save DataFrame: no `csv_filepath` was provided.")
        self.to_csv(csv_filepath)