Coverage for sparkle/structures/performance_dataframe.py: 89%

418 statements  

« prev     ^ index     » next       coverage.py v7.9.1, created at 2025-07-01 13:21 +0000

1"""Module to manage performance data files and common operations on them.""" 

2from __future__ import annotations 

3import ast 

4import copy 

5from typing import Any 

6import itertools 

7from pathlib import Path 

8import math 

9import numpy as np 

10import pandas as pd 

11 

12from sparkle.types import SparkleObjective, resolve_objective 

13 

14 

class PerformanceDataFrame(pd.DataFrame):
    """Class to manage performance data and common operations on them."""
    # Sentinel stored in cells (and placeholder index levels) with no data yet
    missing_value = math.nan

    # Placeholder objective name used when no objectives were supplied
    missing_objective = "UNKNOWN"
    # Configuration id assigned when a solver has no explicit configurations
    default_configuration = "Default"

    # Row multi-index level names, outermost first
    index_objective = "Objective"
    index_instance = "Instance"
    index_run = "Run"
    multi_index_names = [index_objective, index_instance, index_run]

    # Column multi-index level names, outermost first
    column_solver = "Solver"
    column_configuration = "Configuration"
    column_meta = "Meta"
    column_value = "Value"
    column_seed = "Seed"
    multi_column_names = [column_solver, column_configuration, column_meta]
    # Each configuration owns two columns at the innermost (Meta) level
    multi_column_value = [column_value, column_seed]
    multi_column_dtypes = [str, int]

    def __init__(self: PerformanceDataFrame,
                 csv_filepath: Path,
                 solvers: list[str] = None,
                 configurations: dict[str, dict[str, dict]] = None,
                 objectives: list[str | SparkleObjective] = None,
                 instances: list[str] = None,
                 n_runs: int = 1) -> None:
        """Initialise a PerformanceDataFrame.

        Consists of:
            - Columns representing the Solvers
            - Rows representing the result by multi-index in order of:
                * Objective (Static, given in constructor or read from file)
                * Instance
                * Runs (Static, given in constructor or read from file)

        Args:
            csv_filepath: If path exists, load from Path.
                Otherwise create new and save to this path.
            solvers: List of solver names to be added into the Dataframe
            configurations: The configuration keys per solver to add, structured as
                configurations[solver][config_key] = {"parameter": "value", ..}
            objectives: List of SparkleObjectives or objective names. By default None,
                then the objectives will be derived from Sparkle Settings if possible.
            instances: List of instance names to be added into the Dataframe
            n_runs: The number of runs to consider per Solver/Objective/Instance comb.
        """
        if csv_filepath and csv_filepath.exists():  # Read from file
            # Values are read as str so categorical objectives survive the round trip
            df = pd.read_csv(csv_filepath,
                             header=[0, 1, 2], index_col=[0, 1, 2],
                             dtype={"Value": str, "Seed": int},
                             on_bad_lines="skip",
                             comment="$")  # $ For extra data lines
            super().__init__(df)
            self.csv_filepath = csv_filepath
            # Load configuration mapping: lines starting with "$" below the table
            # hold "solver,config_key,config_dict" records (see save path)
            with self.csv_filepath.open() as f:
                configuration_lines = [line.strip().strip("$").split(",", maxsplit=2)
                                       for line in f.readlines()
                                       if line.startswith("$")]
            configurations = {s: {} for s in self.solvers}
            for solver, config_key, config in configuration_lines[1:]:  # Skip header
                configurations[solver][config_key] = ast.literal_eval(config.strip('"'))
        else:  # New PerformanceDataFrame
            # Initialize empty DataFrame
            run_ids = list(range(1, n_runs + 1))  # We count runs from 1
            # We always need objectives to maintain the dimensions
            if objectives is None:
                objectives = [PerformanceDataFrame.missing_objective]
            else:
                objectives = [str(o) for o in objectives]
            # We always need an instance to maintain the dimensions
            if instances is None:
                instances = [PerformanceDataFrame.missing_value]
            # We always need a solver to maintain the dimensions
            if solvers is None:
                solvers = [PerformanceDataFrame.missing_value]
            midx = pd.MultiIndex.from_product(
                [objectives, instances, run_ids],
                names=PerformanceDataFrame.multi_index_names)
            # Create the multi index tuples
            if configurations is None:
                configurations = \
                    {solver: {PerformanceDataFrame.default_configuration: {}}
                     for solver in solvers}
            column_tuples = []
            # We cannot do .from_product here as config ids are per solver
            for solver in configurations.keys():
                for config_id in configurations[solver].keys():
                    column_tuples.extend([
                        (solver, config_id, PerformanceDataFrame.column_seed),
                        (solver, config_id, PerformanceDataFrame.column_value)])
            mcolumns = pd.MultiIndex.from_tuples(
                column_tuples,
                names=[PerformanceDataFrame.column_solver,
                       PerformanceDataFrame.column_configuration,
                       PerformanceDataFrame.column_meta])
            # Set dtype object to avoid inferring float for categorical objectives
            super().__init__(PerformanceDataFrame.missing_value,
                             index=midx, columns=mcolumns, dtype="object")
            self.csv_filepath = csv_filepath

        # Store configuration in global attributes dictionary, see Pandas Docs
        self.attrs = configurations

        # Duplicate rows can occur from concurrent appends (see set_value's
        # append_write_csv); merge them here
        if self.index.duplicated().any():  # Combine duplicate indices
            combined = self.groupby(level=[0, 1, 2]).first()
            # We keep the last to allow overwriting existing values
            duplicates = self.index[self.index.duplicated(keep="last")]
            # Remove all duplicate entries from self
            self.drop(duplicates, inplace=True)
            for d in duplicates:  # Place combined duplicates in self
                self.loc[d, :] = combined.loc[d, :]

        # Sort the index to optimize lookup speed
        self.sort_index(axis=0, inplace=True)
        self.sort_index(axis=1, inplace=True)

        if csv_filepath and not self.csv_filepath.exists():  # New Performance DataFrame
            self.save_csv()

136 

137 # Properties 

138 

139 @property 

140 def num_objectives(self: PerformanceDataFrame) -> int: 

141 """Retrieve the number of objectives in the DataFrame.""" 

142 return self.index.get_level_values(0).unique().size 

143 

144 @property 

145 def num_instances(self: PerformanceDataFrame) -> int: 

146 """Return the number of instances.""" 

147 return self.index.get_level_values(1).unique().size 

148 

149 @property 

150 def num_runs(self: PerformanceDataFrame) -> int: 

151 """Return the maximum number of runs of each instance.""" 

152 return self.index.get_level_values(2).unique().size 

153 

154 @property 

155 def num_solvers(self: PerformanceDataFrame) -> int: 

156 """Return the number of solvers.""" 

157 return self.columns.get_level_values(0).unique().size 

158 

159 @property 

160 def num_solver_configurations(self: PerformanceDataFrame) -> int: 

161 """Return the number of solver configurations.""" 

162 return int(self.columns.get_level_values( # Config has a seed & value 

163 PerformanceDataFrame.column_configuration).size / 2) 

164 

165 @property 

166 def multi_objective(self: PerformanceDataFrame) -> bool: 

167 """Return whether the dataframe represent MO or not.""" 

168 return self.num_objectives > 1 

169 

170 @property 

171 def solvers(self: PerformanceDataFrame) -> list[str]: 

172 """Return the solver present as a list of strings.""" 

173 # Do not return the nan solver as its not an actual solver 

174 return self.columns.get_level_values( 

175 PerformanceDataFrame.column_solver).dropna().unique().to_list() 

176 

177 @property 

178 def configuration_ids(self: PerformanceDataFrame) -> list[str]: 

179 """Return the list of configuration keys.""" 

180 return self.columns.get_level_values( 

181 PerformanceDataFrame.column_configuration).unique().to_list() 

182 

183 @property 

184 def configurations(self: PerformanceDataFrame) -> dict[str, dict[str, dict]]: 

185 """Return a dictionary (copy) containing the configurations for each solver.""" 

186 return copy.deepcopy(self.attrs) # Deepcopy to avoid mutation of attribute 

187 

188 @property 

189 def objective_names(self: PerformanceDataFrame) -> list[str]: 

190 """Return the objective names as a list of strings.""" 

191 return self.index.get_level_values(0).unique().to_list() 

192 

193 @property 

194 def objectives(self: PerformanceDataFrame) -> list[SparkleObjective]: 

195 """Return the objectives as a list of SparkleObjectives.""" 

196 return [resolve_objective(o) for o in self.objective_names] 

197 

198 @property 

199 def instances(self: PerformanceDataFrame) -> list[str]: 

200 """Return the instances as a Pandas Index object.""" 

201 return self.index.get_level_values(1).unique().to_list() 

202 

203 @property 

204 def run_ids(self: PerformanceDataFrame) -> list[int]: 

205 """Return the run ids as a list of integers.""" 

206 return self.index.get_level_values(2).unique().to_list() 

207 

208 @property 

209 def has_missing_values(self: PerformanceDataFrame) -> bool: 

210 """Returns True if there are any missing values in the dataframe.""" 

211 return self.drop(PerformanceDataFrame.column_seed, 

212 level=PerformanceDataFrame.column_meta, 

213 axis=1).isnull().any().any() 

214 

215 def is_missing(self: PerformanceDataFrame, 

216 solver: str, 

217 instance: str,) -> int: 

218 """Checks if a solver/instance is missing values.""" 

219 return self.xs(solver, axis=1).xs( 

220 instance, axis=0, 

221 level=PerformanceDataFrame.index_instance).drop( 

222 PerformanceDataFrame.column_seed, 

223 level=PerformanceDataFrame.column_meta, 

224 axis=1).isnull().any().any() 

225 

226 def verify_objective(self: PerformanceDataFrame, 

227 objective: str) -> str: 

228 """Method to check whether the specified objective is valid. 

229 

230 Users are allowed to index the dataframe without specifying all dimensions. 

231 However, when dealing with multiple objectives this is not allowed and this 

232 is verified here. If we have only one objective this is returned. Otherwise, 

233 if an objective is specified by the user this is returned. 

234 

235 Args: 

236 objective: The objective given by the user 

237 """ 

238 if objective is None: 

239 if self.multi_objective: 

240 raise ValueError("Error: MO Data, but objective not specified.") 

241 elif self.num_objectives == 1: 

242 return self.objective_names[0] 

243 else: 

244 return PerformanceDataFrame.missing_objective 

245 return objective 

246 

247 def verify_run_id(self: PerformanceDataFrame, 

248 run_id: int) -> int: 

249 """Method to check whether run id is valid. 

250 

251 Similar to verify_objective but here we check the dimensionality of runs. 

252 

253 Args: 

254 run_id: the run as specified by the user. 

255 """ 

256 if run_id is None: 

257 if self.num_runs > 1: 

258 raise ValueError("Error: Multiple run performance data, " 

259 "but run not specified") 

260 else: 

261 run_id = self.run_ids[0] 

262 return run_id 

263 

264 def verify_indexing(self: PerformanceDataFrame, 

265 objective: str, 

266 run_id: int) -> tuple[str, int]: 

267 """Method to check whether data indexing is correct. 

268 

269 Users are allowed to use the Performance Dataframe without the second and 

270 fourth dimension (Objective and Run respectively) in the case they only 

271 have one objective or only do one run. This method adjusts the indexing for 

272 those cases accordingly. 

273 

274 Args: 

275 objective: The given objective name 

276 run_id: The given run index 

277 

278 Returns: 

279 A tuple representing the (possibly adjusted) Objective and Run index. 

280 """ 

281 objective = self.verify_objective(objective) 

282 run_id = self.verify_run_id(run_id) 

283 return objective, run_id 

284 

285 # Getters and Setters 

286 

    def add_solver(self: PerformanceDataFrame,
                   solver_name: str,
                   configurations: list[(str, dict)] = None,
                   initial_value: float | list[str | float] = None) -> None:
        """Add a new solver to the dataframe. Initializes value to None by default.

        Args:
            solver_name: The name of the solver to be added.
            configurations: A list of (configuration id, configuration dict)
                pairs for the solver. Defaults to one default configuration.
            initial_value: The value assigned for each index of the new solver.
                If not None, must match the index dimension (n_obj * n_inst * n_runs).
        """
        if solver_name in self.solvers:
            print(f"WARNING: Tried adding already existing solver {solver_name} to "
                  f"Performance DataFrame: {self.csv_filepath}")
            return
        if not isinstance(initial_value, list):  # Single value
            # Expand to one [value, seed] pair, reused for every configuration
            initial_value = [[initial_value, initial_value]]
        if configurations is None:
            configurations = [(PerformanceDataFrame.default_configuration, {})]
        self.attrs[solver_name] = {}
        # Create Seed/Value columns for every configuration and register its
        # parameter mapping in attrs
        for (config_key, config), (value, seed) in itertools.product(configurations,
                                                                     initial_value):
            self[(solver_name, config_key, PerformanceDataFrame.column_seed)] = seed
            self[(solver_name, config_key, PerformanceDataFrame.column_value)] = value
            self.attrs[solver_name][config_key] = config
        if self.num_solvers == 2:  # Remove nan solver
            # The nan solver only exists to preserve the frame's dimensions;
            # drop it now that a real solver is present
            for solver in self.solvers:
                if str(solver) == str(PerformanceDataFrame.missing_value):
                    self.remove_solver(solver)
                    break

318 

319 def add_configuration( 

320 self: PerformanceDataFrame, 

321 solver: str, 

322 configuration_id: str | list[str], 

323 configuration: dict[str, Any] | list[dict[str, Any]] = None) -> None: 

324 """Add new configurations for a solver to the dataframe. 

325 

326 If the key already exists, update the value. 

327 

328 Args: 

329 solver: The name of the solver to be added. 

330 configuration_id: The name of the configuration to be added. 

331 configuration: The configuration to be added. 

332 """ 

333 if not isinstance(configuration_id, list): 

334 configuration_id = [configuration_id] 

335 if not isinstance(configuration, list): 

336 configuration = [configuration] 

337 for config_id, config in zip(configuration_id, configuration): 

338 if config_id not in self.get_configurations(solver): 

339 self[(solver, config_id, PerformanceDataFrame.column_value)] = None 

340 self[(solver, config_id, PerformanceDataFrame.column_seed)] = None 

341 self.attrs[solver][config_id] = config 

342 # Sort the index to optimize lookup speed 

343 self.sort_index(axis=1, inplace=True) 

344 

345 def add_objective(self: PerformanceDataFrame, 

346 objective_name: str, 

347 initial_value: float = None) -> None: 

348 """Add an objective to the DataFrame.""" 

349 initial_value = initial_value or self.missing_value 

350 if objective_name in self.objective_names: 

351 print(f"WARNING: Tried adding already existing objective {objective_name} " 

352 f"to Performance DataFrame: {self.csv_filepath}") 

353 return 

354 for instance, run in itertools.product(self.instances, self.run_ids): 

355 self.loc[(objective_name, instance, run)] = initial_value 

356 self.sort_index(axis=0, inplace=True) 

357 

358 def add_instance(self: PerformanceDataFrame, 

359 instance_name: str, 

360 initial_values: Any | list[Any] = None) -> None: 

361 """Add and instance to the DataFrame. 

362 

363 Args: 

364 instance_name: The name of the instance to be added. 

365 initial_values: The values assigned for each index of the new instance. 

366 If list, must match the column dimension (Value, Seed, Configuration). 

367 """ 

368 initial_values = initial_values or self.missing_value 

369 if not isinstance(initial_values, list): 

370 initial_values = ([initial_values] 

371 * 2 # Value and Seed per target column 

372 * self.num_solver_configurations) 

373 elif len(initial_values) == len(PerformanceDataFrame.multi_column_names): 

374 initial_values = initial_values * self.num_solvers 

375 

376 if instance_name in self.instances: 

377 print(f"WARNING: Tried adding already existing instance {instance_name} " 

378 f"to Performance DataFrame: {self.csv_filepath}") 

379 return 

380 # Add rows for all combinations 

381 for objective, run in itertools.product(self.objective_names, self.run_ids): 

382 self.loc[(objective, instance_name, run)] = initial_values 

383 if self.num_instances == 2: # Remove nan instance 

384 for instance in self.instances: 

385 if not isinstance(instance, str) and math.isnan(instance): 

386 self.remove_instances(instance) 

387 break 

388 # Sort the index to optimize lookup speed 

389 self.sort_index(axis=0, inplace=True) 

390 

391 def add_runs(self: PerformanceDataFrame, 

392 num_extra_runs: int, 

393 instance_names: list[str] = None, 

394 initial_values: Any | list[Any] = None) -> None: 

395 """Add runs to the DataFrame. 

396 

397 Args: 

398 num_extra_runs: The number of runs to be added. 

399 instance_names: The instances for which runs are to be added. 

400 By default None, which means runs are added to all instances. 

401 initial_values: The initial value for each objective of each new run. 

402 If a list, needs to have a value for Value, Seed and Configuration. 

403 """ 

404 initial_values = initial_values or self.missing_value 

405 if not isinstance(initial_values, list): 

406 initial_values =\ 

407 [initial_values] * self.num_solvers * 2 # Value and Seed 

408 elif len(initial_values) == 2: # Value and seed provided 

409 initial_values = initial_values * self.num_solvers 

410 instance_names = self.instances if instance_names is None else instance_names 

411 for objective, instance in itertools.product(self.objective_names, 

412 instance_names): 

413 index_runs_start = len(self.loc[(objective, instance)]) + 1 

414 for run in range(index_runs_start, index_runs_start + num_extra_runs): 

415 self.loc[(objective, instance, run)] = initial_values 

416 # Sort the index to optimize lookup speed 

417 # NOTE: It would be better to do this at the end, but that results in 

418 # PerformanceWarning: indexing past lexsort depth may impact performance. 

419 self.sort_index(axis=0, inplace=True) 

420 

421 def get_configurations(self: PerformanceDataFrame, 

422 solver_name: str) -> list[str]: 

423 """Return the list of configuration keys for a solver.""" 

424 return list(self[solver_name].columns.get_level_values( 

425 PerformanceDataFrame.column_configuration).unique()) 

426 

427 def get_full_configuration(self: PerformanceDataFrame, 

428 solver: str, 

429 configuration_id: str | list[str]) -> dict | list[dict]: 

430 """Return the actual configuration associated with the configuration key.""" 

431 if isinstance(configuration_id, str): 

432 return self.attrs[solver][configuration_id] 

433 return [self.attrs[solver][cid] for cid in configuration_id] 

434 

    def remove_solver(self: PerformanceDataFrame, solvers: str | list[str]) -> None:
        """Drop one or more solvers from the Dataframe.

        Args:
            solvers: Solver name or list of solver names to drop.
        """
        if not solvers:  # Bugfix for when an empty list is passed to avoid nan adding
            return
        # To make sure objectives / runs are saved when no solvers are present
        solvers = [solvers] if isinstance(solvers, str) else solvers
        if self.num_solvers == 1:  # This would preferably be done after removing
            # Insert the placeholder (nan) solver so the row index survives
            for field in PerformanceDataFrame.multi_column_value:
                self[PerformanceDataFrame.missing_value,
                     PerformanceDataFrame.missing_value, field] =\
                    PerformanceDataFrame.missing_value
        self.drop(columns=solvers, level=0, axis=1, inplace=True)
        # Also remove the solvers' configuration mappings
        for solver in solvers:
            del self.attrs[solver]

449 

450 def remove_configuration(self: PerformanceDataFrame, 

451 solver: str, configuration: str | list[str]) -> None: 

452 """Drop one or more configurations from the Dataframe.""" 

453 if isinstance(configuration, str): 

454 configuration = [configuration] 

455 for config in configuration: 

456 self.drop((solver, config), axis=1, inplace=True) 

457 del self.attrs[solver][config] 

458 # Sort the index to optimize lookup speed 

459 self.sort_index(axis=1, inplace=True) 

460 

461 def remove_objective(self: PerformanceDataFrame, 

462 objectives: str | list[str]) -> None: 

463 """Remove objective from the Dataframe.""" 

464 if len(self.objectives) < 2: 

465 raise Exception("Cannot remove last objective from PerformanceDataFrame") 

466 self.drop(objectives, 

467 axis=0, level=PerformanceDataFrame.index_objective, inplace=True) 

468 

    def remove_instances(self: PerformanceDataFrame, instances: str | list[str]) -> None:
        """Drop instances from the Dataframe.

        Args:
            instances: Instance name or list of instance names to drop.
        """
        # To make sure objectives / runs are saved when no instances are present
        num_instances = len(instances) if isinstance(instances, list) else 1
        if self.num_instances - num_instances == 0:
            # Re-insert the placeholder (nan) instance to preserve the dimensions
            for objective, run in itertools.product(self.objective_names, self.run_ids):
                self.loc[(objective, PerformanceDataFrame.missing_value, run)] =\
                    PerformanceDataFrame.missing_value
        self.drop(instances,
                  axis=0,
                  level=PerformanceDataFrame.index_instance, inplace=True)
        # Sort the index to optimize lookup speed
        self.sort_index(axis=0, inplace=True)

482 

483 def remove_runs(self: PerformanceDataFrame, 

484 runs: int | list[int], 

485 instance_names: list[str] = None) -> None: 

486 """Drop one or more runs from the Dataframe. 

487 

488 Args: 

489 runs: The run indices to be removed. If its an int, 

490 the last n runs are removed. NOTE: If each instance has a different 

491 number of runs, the amount of removed runs is not uniform. 

492 instance_names: The instances for which runs are to be removed. 

493 By default None, which means runs are removed from all instances. 

494 """ 

495 instance_names = self.instances if instance_names is None else instance_names 

496 runs = list(range((self.num_runs + 1) - runs, (self.num_runs + 1)))\ 

497 if isinstance(runs, int) else runs 

498 self.drop(runs, 

499 axis=0, 

500 level=PerformanceDataFrame.index_run, 

501 inplace=True) 

502 # Sort the index to optimize lookup speed 

503 self.sort_index(axis=0, inplace=True) 

504 

505 def remove_empty_runs(self: PerformanceDataFrame) -> None: 

506 """Remove runs that contain no data, except for the first.""" 

507 for row_index in self.index: 

508 if row_index[2] == 1: # First run, never delete 

509 continue 

510 if self.loc[row_index].isna().all(): 

511 self.drop(row_index, inplace=True) 

512 

513 def filter_objective(self: PerformanceDataFrame, 

514 objective: str | list[str]) -> None: 

515 """Filter the Dataframe to a subset of objectives.""" 

516 if isinstance(objective, str): 

517 objective = [objective] 

518 self.drop(list(set(self.objective_names) - set(objective)), 

519 axis=0, level=PerformanceDataFrame.index_objective, inplace=True) 

520 

521 def reset_value(self: PerformanceDataFrame, 

522 solver: str, 

523 instance: str, 

524 objective: str = None, 

525 run: int = None) -> None: 

526 """Reset a value in the dataframe.""" 

527 self.set_value(PerformanceDataFrame.missing_value, 

528 solver, instance, objective, run) 

529 

    def set_value(self: PerformanceDataFrame,
                  value: float | str | list[float | str] | list[list[float | str]],
                  solver: str | list[str],
                  instance: str | list[str],
                  configuration: str = None,
                  objective: str | list[str] = None,
                  run: int | list[int] = None,
                  solver_fields: list[str] = ["Value"],
                  append_write_csv: bool = False) -> None:
        """Setter method to assign a value to the Dataframe.

        Allows for setting the same value to multiple indices.

        Args:
            value: Value(s) to be assigned. If value is a list, first dimension is
                the solver field, second dimension is if multiple different values are
                to be assigned. Must be the same shape as target.
            solver: The solver(s) for which the value should be set.
                If solver is a list, multiple solvers are set. If None, all
                solvers are set.
            instance: The instance(s) for which the value should be set.
                If instance is a list, multiple instances are set. If None, all
                instances are set.
            configuration: The configuration(s) for which the value should be set.
                When left None, set for all configurations
            objective: The objectives for which the value should be set.
                When left None, set for all objectives
            run: The run index for which the value should be set.
                If left None, set for all runs.
            solver_fields: The level to which each value should be assigned.
                Defaults to ["Value"].
            append_write_csv: For concurrent writing to the PerformanceDataFrame.
                If True, the value is directly appended to the CSV file.
                This will create duplicate entries in the file, but these are combined
                when loading the file.
        """
        # Convert indices to slices for None values; slice(None) selects everything
        solver = slice(solver) if solver is None else solver
        configuration = slice(configuration) if configuration is None else configuration
        instance = slice(instance) if instance is None else instance
        objective = slice(objective) if objective is None else objective
        run = slice(run) if run is None else run
        # Convert column indices to slices for setting multiple columns
        value = [value] if not isinstance(value, list) else value
        # NOTE: We currently forloop levels here, as it allows us to set the same
        # sequence of values to the indices
        for item, level in zip(value, solver_fields):
            self.loc[(objective, instance, run), (solver, configuration, level)] = item

        if append_write_csv:
            writeable = self.loc[(objective, instance, run), :]
            if isinstance(writeable, pd.Series):  # Single row, convert to pd.DataFrame
                writeable = self.loc[[(objective, instance, run)], :]
            # Append the new rows to the dataframe csv file; the resulting
            # duplicate rows are merged again by __init__ on the next load
            writeable.to_csv(self.csv_filepath, mode="a", header=False)

585 

    def get_value(self: PerformanceDataFrame,
                  solver: str | list[str] = None,
                  instance: str | list[str] = None,
                  configuration: str = None,
                  objective: str = None,
                  run: int = None,
                  solver_fields: list[str] = ["Value"]
                  ) -> float | str | list[Any]:
        """Index a value of the DataFrame and return it.

        Args:
            solver: Solver(s) to select. None selects all solvers.
            instance: Instance(s) to select. None selects all instances.
            configuration: Configuration to select. None selects all.
            objective: Objective to select. None selects all objectives.
            run: Run to select. None selects all runs.
            solver_fields: The meta columns to read. None selects all.

        Returns:
            The single value when exactly one cell is selected, otherwise a
            (possibly nested) list of the selected values.
        """
        # Convert indices to slices for None values; slice(None) selects everything
        solver = slice(solver) if solver is None else solver
        configuration = slice(configuration) if configuration is None else configuration
        instance = slice(instance) if instance is None else instance
        objective = slice(objective) if objective is None else objective
        solver_fields = slice(solver_fields) if solver_fields is None else solver_fields
        run = slice(run) if run is None else run
        target = self.loc[(objective, instance, run),
                          (solver, configuration, solver_fields)].values
        # Reduce dimensions when relevant: flatten single-column 2D results
        if len(target) > 0 and isinstance(target[0], np.ndarray) and len(target[0]) == 1:
            target = target.flatten()
        target = target.tolist()
        if len(target) == 1:
            return target[0]
        return target

611 

612 def get_instance_num_runs(self: PerformanceDataFrame, 

613 instance: str) -> int: 

614 """Return the number of runs for an instance.""" 

615 # We assume each objective has the same index for Instance/Runs 

616 return len(self.loc[(self.objective_names[0], instance)].index) 

617 

618 # Calculables 

619 

620 def mean(self: PerformanceDataFrame, 

621 objective: str = None, 

622 solver: str = None, 

623 instance: str = None) -> float: 

624 """Return the mean value of a slice of the dataframe.""" 

625 objective = self.verify_objective(objective) 

626 subset = self.xs(objective, level=0) 

627 if solver is not None: 

628 subset = subset.xs(solver, axis=1, drop_level=False) 

629 if instance is not None: 

630 subset = subset.xs(instance, axis=0, drop_level=False) 

631 value = subset.astype(float).mean() 

632 if isinstance(value, pd.Series): 

633 return value.mean() 

634 return value 

635 

    def get_job_list(self: PerformanceDataFrame, rerun: bool = False) \
            -> list[tuple[str, str]]:
        """Return a list of performance computation jobs there are to be done.

        If rerun is False (default), get only the combinations that don't have a
        value yet, else (True) get all the combinations.

        Args:
            rerun: Boolean indicating if we want to rerun all jobs

        Returns:
            A list of (solver, config, instance, run) combinations
        """
        # Drop the seed as we are looking for nan values, not seeds
        df = self.drop(PerformanceDataFrame.column_seed, axis=1,
                       level=PerformanceDataFrame.column_meta)
        df = df.droplevel(PerformanceDataFrame.column_meta, axis=1)
        if rerun:  # Return all combinations
            # Drop objective, not needed
            df = df.droplevel(PerformanceDataFrame.index_objective, axis=0)
            result = [tuple(column) + tuple(index)
                      for column, index in itertools.product(df.columns, df.index)]
        else:
            result = []
            for (solver, config), (objective, instance, run) in itertools.product(
                    df.columns, df.index):
                value = df.loc[(objective, instance, run), (solver, config)]
                # A None/NaN cell marks a job that still needs to run
                if value is None or (
                        isinstance(value, (int, float)) and math.isnan(value)):
                    result.append(tuple([solver, config, instance, run]))
        # Filter duplicates (e.g. the same combination under several objectives)
        result = list(set(result))
        return result

670 

    def configuration_performance(
            self: PerformanceDataFrame,
            solver: str,
            configuration: str | list[str] = None,
            objective: str | SparkleObjective = None,
            instances: list[str] = None,
            per_instance: bool = False) -> tuple[str, float]:
        """Return the (best) configuration performance for objective over the instances.

        Args:
            solver: The solver for which we evaluate the configuration(s)
            configuration: The configuration (id) to evaluate.
                None evaluates all configurations of the solver.
            objective: The objective for which we find the best value
            instances: The instances which should be selected for the evaluation
            per_instance: Whether to return the performance per instance,
                or aggregated.

        Returns:
            The (best) configuration id and its aggregated performance
            (or per-instance list of performances when per_instance is True).
        """
        objective = self.verify_objective(objective)
        if isinstance(objective, str):
            objective = resolve_objective(objective)
        # Filter objective
        subdf = self.xs(objective.name, level=0, drop_level=True)
        # Filter solver
        subdf = subdf.xs(solver, axis=1, drop_level=True)
        # Drop the seed, then drop meta level as it is no longer needed
        subdf = subdf.drop(PerformanceDataFrame.column_seed, axis=1,
                           level=PerformanceDataFrame.column_meta)
        subdf = subdf.droplevel(PerformanceDataFrame.column_meta, axis=1)
        # Ensure the objective is numeric
        subdf = subdf.astype(float)

        if instances:  # Filter instances
            subdf = subdf.loc[instances, :]
        if configuration:  # Filter configuration
            if not isinstance(configuration, list):
                configuration = [configuration]
            subdf = subdf.filter(configuration, axis=1)
        # Aggregate the runs per instance using the objective's run aggregator
        subdf = subdf.groupby(PerformanceDataFrame.index_instance).agg(
            func=objective.run_aggregator.__name__)
        # Aggregate the instances using the objective's instance aggregator
        sub_series = subdf.agg(func=objective.instance_aggregator.__name__)
        # Select the best configuration according to the optimisation direction
        best_conf = sub_series.idxmin() if objective.minimise else sub_series.idxmax()
        if per_instance:  # Return a list of instance results
            return best_conf, subdf[best_conf].to_list()
        return best_conf, sub_series[best_conf]

721 

def best_configuration(self: PerformanceDataFrame,
                       solver: str,
                       objective: SparkleObjective = None,
                       instances: list[str] = None) -> tuple[str, float]:
    """Determine the best configuration of a solver over a set of instances.

    Convenience wrapper around ``configuration_performance`` that considers
    every configuration of the solver (no configuration filter).

    Args:
        solver: The solver for which we determine the best configuration
        objective: The objective for which we calculate the best configuration
        instances: The instances which should be selected for the evaluation

    Returns:
        The best configuration id and its aggregated performance.
    """
    return self.configuration_performance(solver,
                                          configuration=None,
                                          objective=objective,
                                          instances=instances)

737 

def best_instance_performance(
        self: PerformanceDataFrame,
        objective: str | SparkleObjective = None,
        instances: list[str] = None,
        run_id: int = None,
        exclude_solvers: list[(str, str)] = None) -> pd.Series:
    """Determine, per instance, the best value any solver/configuration achieved.

    Args:
        objective: The objective for which we calculate the best performance
        instances: The instances which should be selected for the evaluation
        run_id: The run for which we calculate the best performance. If None,
            we consider all runs.
        exclude_solvers: List of (solver, config_id) to exclude in the calculation.

    Returns:
        The best performance for each instance in the portfolio.
    """
    objective = self.verify_objective(objective)
    if isinstance(objective, str):
        objective = resolve_objective(objective)
    # Seeds are irrelevant when comparing performance values; remove them first
    data = self.drop([PerformanceDataFrame.column_seed],
                     axis=1, level=PerformanceDataFrame.column_meta)
    # Restrict to the requested objective (removes the Objective index level)
    data = data.xs(objective.name, level=0)
    if exclude_solvers is not None:
        data = data.drop(exclude_solvers, axis=1)
    if instances is not None:
        data = data.loc[instances, :]
    if run_id is None:
        # No specific run requested: collapse the Run level entirely
        data = data.droplevel(level=1)
    else:
        run_id = self.verify_run_id(run_id)
        data = data.xs(run_id, level=1)
    # Values must be numeric before min/max comparison
    data = data.astype(float)
    if objective.minimise:
        best = data.min(axis=1)
    else:
        best = data.max(axis=1)
    # With multiple runs per instance the index repeats; after sorting so the
    # best value comes first, keep exactly one (the best) entry per instance.
    best = best.sort_values(ascending=objective.minimise)
    return best.groupby(best.index).first().astype(float)

779 

def best_performance(
        self: PerformanceDataFrame,
        exclude_solvers: list[tuple[str, str]] = None,
        instances: list[str] = None,
        objective: str | SparkleObjective = None) -> float:
    """Return the overall best performance of the portfolio.

    Args:
        exclude_solvers: List of (solver, config_id) to exclude in the calculation.
            Defaults to None, meaning no exclusions. (A ``None`` default replaces
            the previous mutable ``[]`` default, which is shared between calls;
            ``best_instance_performance`` treats both identically.)
        instances: The instances which should be selected for the evaluation
            If None, use all instances.
        objective: The objective for which we calculate the best performance

    Returns:
        The aggregated best performance of the portfolio over all instances.
    """
    objective = self.verify_objective(objective)
    if isinstance(objective, str):
        objective = resolve_objective(objective)
    # Best value per instance, then fold the instances into a single score
    instance_best = self.best_instance_performance(
        objective, instances=instances,
        exclude_solvers=exclude_solvers).to_numpy(dtype=float)
    return objective.instance_aggregator(instance_best)

804 

def schedule_performance(
        self: PerformanceDataFrame,
        schedule: dict[str, list[tuple[str, str, int]]],
        target_solver: str | tuple[str, str] = None,
        objective: str | SparkleObjective = None) -> list[float]:
    """Return the performance of a selection schedule on the portfolio.

    Args:
        schedule: Compute the best performance according to a selection schedule.
            A schedule is a dictionary of instances, with a schedule per instance,
            consisting of a triple of solver, config_id and maximum runtime.
        target_solver: If not None, store the found values in this solver of the DF.
            May also be a (solver, config_id) tuple; when only a solver name is
            given, values are stored under the default configuration id.
        objective: The objective for which we calculate the best performance

    Returns:
        The performance of the schedule over the instances in the dictionary.
    """
    objective = self.verify_objective(objective)
    if isinstance(objective, str):
        objective = resolve_objective(objective)
    # For quality objectives we keep the best (min or max) value seen so far
    select = min if objective.minimise else max
    performances = [0.0] * len(schedule.keys())
    if not isinstance(target_solver, tuple):
        # Plain solver name (or None): store under the default configuration
        target_conf = PerformanceDataFrame.default_configuration
    else:
        target_solver, target_conf = target_solver
    if target_solver and target_solver not in self.solvers:
        self.add_solver(target_solver)
    for ix, instance in enumerate(schedule.keys()):
        for iy, (solver, config, max_runtime) in enumerate(schedule[instance]):
            performance = float(self.get_value(
                solver, instance, config, objective.name))
            if max_runtime is not None:  # We are dealing with runtime
                # Runtimes accumulate across the schedule until a solver succeeds
                performances[ix] += performance
                if performance < max_runtime:
                    break  # Solver finished in time
            else:  # Quality, we take the best found performance
                if iy == 0:  # First solver, set initial value
                    performances[ix] = performance
                    continue
                performances[ix] = select(performances[ix], performance)
        if target_solver is not None:
            # Persist the schedule's result for this instance in the DataFrame
            self.set_value(performances[ix], target_solver,
                           instance, target_conf, objective.name)
    return performances

850 

def marginal_contribution(
        self: PerformanceDataFrame,
        objective: str | SparkleObjective = None,
        instances: list[str] = None,
        sort: bool = False) -> list[float]:
    """Compute each solver configuration's marginal contribution to the portfolio.

    Args:
        objective: The objective for which we calculate the marginal contribution.
        instances: The instances which should be selected for the evaluation
        sort: Whether to sort the results afterwards
    Returns:
        The marginal contribution of each solver.
    """
    objective = self.verify_objective(objective)
    if isinstance(objective, str):
        objective = resolve_objective(objective)
    portfolio_best = self.best_performance(objective=objective,
                                           instances=instances)
    contributions = []
    for solver in self.solvers:
        for config_id in self.get_configurations(solver):
            # Recompute the portfolio's best performance with this particular
            # (solver, configuration) removed: the difference quantifies how
            # much the portfolio relies on it.
            reduced_best = self.best_performance(
                exclude_solvers=[(solver, config_id)],
                instances=instances,
                objective=objective)
            if reduced_best == portfolio_best:
                contribution = 0.0  # Removing it changes nothing
            else:
                contribution = reduced_best / portfolio_best
            contributions.append((solver, config_id,
                                  contribution, reduced_best))
    if sort:
        contributions.sort(key=lambda entry: entry[2],
                           reverse=objective.minimise)
    return contributions

890 

def get_solver_ranking(self: PerformanceDataFrame,
                       objective: str | SparkleObjective = None,
                       instances: list[str] = None,
                       ) -> list[tuple[str, dict, float]]:
    """Return a list with solvers ranked by average performance.

    Args:
        objective: The objective to rank by. When None, resolved by
            verify_objective (presumably the default objective — confirm).
        instances: If given, restrict the ranking to these instances.

    Returns:
        List of (solver, configuration, aggregated performance) tuples,
        ordered best-first for the objective.
    """
    objective = self.verify_objective(objective)
    if isinstance(objective, str):
        objective = resolve_objective(objective)
    # Drop Seed: only performance values matter for ranking
    sub_df = self.drop(
        [PerformanceDataFrame.column_seed],
        axis=1, level=PerformanceDataFrame.column_meta)
    # Reduce objective: keep only the rows for the requested objective
    sub_df: pd.DataFrame = sub_df.loc(axis=0)[objective.name, :, :]
    # Drop Objective, Meta multi index (both now single-valued)
    sub_df = sub_df.droplevel(PerformanceDataFrame.index_objective).droplevel(
        PerformanceDataFrame.column_meta, axis=1)
    if instances is not None:  # Select instances
        sub_df = sub_df.loc(axis=0)[instances, ]
    # Ensure data is numeric before aggregation
    sub_df = sub_df.astype(float)
    # Aggregate runs per instance with the objective's run aggregator
    sub_df = sub_df.groupby(PerformanceDataFrame.index_instance).agg(
        func=objective.run_aggregator.__name__)
    # Aggregate instances into one score per (Solver, Configuration) column
    sub_series = sub_df.aggregate(func=objective.instance_aggregator.__name__)
    # Sort by objective: ascending for minimisation, descending otherwise
    sub_series.sort_values(ascending=objective.minimise, inplace=True)
    # Each remaining column index is a (Solver, Configuration) pair
    return [(index[0], index[1], sub_series[index]) for index in sub_series.index]

920 

def save_csv(self: PerformanceDataFrame, csv_filepath: Path = None) -> None:
    """Write a CSV to the given path.

    Args:
        csv_filepath: String path to the csv file. Defaults to self.csv_filepath.
    """
    if csv_filepath is None:
        csv_filepath = self.csv_filepath
    self.to_csv(csv_filepath)
    # The configurations live outside the tabular data (in .attrs), so they
    # are appended after the table in a '$'-prefixed section.
    with csv_filepath.open("a") as fout:
        fout.write("\n$Solver,configuration_id,Configuration\n")
        for solver in self.solvers:
            for config_id, configuration in self.attrs[solver].items():
                fout.write(f"${solver},{config_id},{str(configuration)}\n")

936 

def clone(self: PerformanceDataFrame,
          csv_filepath: Path = None) -> PerformanceDataFrame:
    """Create a copy of this object.

    Args:
        csv_filepath: The new filepath to use for saving the object to.
            If None, will not be saved.
            Warning: If the original path is used, it could lead to dataloss!
    """
    # Build an empty frame with the same structure, then transfer the values
    duplicate = PerformanceDataFrame(
        csv_filepath=csv_filepath,
        solvers=self.solvers,
        configurations=self.configurations,
        objectives=self.objectives,
        instances=self.instances,
        n_runs=self.num_runs)
    # Copy cell by cell (column-major, matching the frame's column order)
    for column, row in itertools.product(self.columns, self.index):
        duplicate.at[row, column] = self.loc[row, column]
    # Ensure everything is sorted?
    return duplicate

959 

def clean_csv(self: PerformanceDataFrame) -> None:
    """Set all values in Performance Data to None."""
    # Blank out every cell with the class sentinel (NaN), then persist the
    # emptied frame back to its csv_filepath via save_csv().
    self[:] = PerformanceDataFrame.missing_value
    self.save_csv()