Coverage for sparkle/structures/performance_dataframe.py: 88%

371 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2025-01-07 15:22 +0000

1"""Module to manage performance data files and common operations on them.""" 

2from __future__ import annotations 

3import ast 

4from typing import Any 

5import itertools 

6from pathlib import Path 

7import math 

8import numpy as np 

9import pandas as pd 

10 

11from sparkle.types import SparkleObjective, resolve_objective 

12 

13 

class PerformanceDataFrame(pd.DataFrame):
    """Class to manage performance data and common operations on them."""
    # Sentinel for cells (and placeholder index/column labels) without data.
    missing_value = math.nan

    # Objective name used when no objective was given and none can be derived.
    missing_objective = "UNKNOWN"

    # Row MultiIndex level names: (Objective, Instance, Run).
    index_objective = "Objective"
    index_instance = "Instance"
    index_run = "Run"
    multi_index_names = [index_objective, index_instance, index_run]

    # Column MultiIndex: per solver, a (Value, Seed, Configuration) triple,
    # with the dtypes used when reading the CSV back in.
    column_value = "Value"
    column_seed = "Seed"
    column_configuration = "Configuration"
    multi_column_names = [column_value, column_seed, column_configuration]
    multi_column_dtypes = [float, int, str]

    def __init__(self: PerformanceDataFrame,
                 csv_filepath: Path,
                 solvers: list[str] = None,
                 objectives: list[str | SparkleObjective] = None,
                 instances: list[str] = None,
                 n_runs: int = 1,
                 ) -> None:
        """Initialise a PerformanceDataFrame.

        Consists of:
            - Columns representing the Solvers
            - Rows representing the result by multi-index in order of:
                * Objective (Static, given in constructor or read from file)
                * Instance
                * Runs (Static, given in constructor or read from file)

        Args:
            csv_filepath: If path exists, load from Path.
                Otherwise create new and save to this path.
            solvers: List of solver names to be added into the Dataframe
            objectives: List of SparkleObjectives or objective names. By default None,
                then the objectives will be derived from Sparkle Settings if possible.
            instances: List of instance names to be added into the Dataframe
            n_runs: The number of runs to consider per Solver/Objective/Instance comb.
        """
        if csv_filepath.exists():
            # Load an existing frame, forcing the dtype of each meta column
            # (Value=float, Seed=int, Configuration=str) while reading.
            dtypes = {key: value for key, value in zip(
                PerformanceDataFrame.multi_column_names,
                PerformanceDataFrame.multi_column_dtypes)}
            df = pd.read_csv(csv_filepath,
                             header=[0, 1], index_col=[0, 1, 2],
                             dtype=dtypes,
                             on_bad_lines="skip")
            super().__init__(df)
            self.csv_filepath = csv_filepath
        else:
            # Initialize empty DataFrame
            run_ids = list(range(1, n_runs + 1))  # We count runs from 1
            # We always need objectives to maintain the dimensions
            if objectives is None:
                objectives = [PerformanceDataFrame.missing_objective]
            else:
                objectives = [str(o) for o in objectives]
            # We always need an instance to maintain the dimensions
            if instances is None:
                instances = [PerformanceDataFrame.missing_value]
            # We always need a solver to maintain the dimensions
            if solvers is None:
                solvers = [PerformanceDataFrame.missing_value]
            midx = pd.MultiIndex.from_product(
                [objectives, instances, run_ids],
                names=PerformanceDataFrame.multi_index_names)
            mcolumns = pd.MultiIndex.from_product(
                [solvers, PerformanceDataFrame.multi_column_names],
                names=["Solver", "Meta"])
            super().__init__(PerformanceDataFrame.missing_value,
                             index=midx, columns=mcolumns)
            self.csv_filepath = csv_filepath
            self.save_csv()

        if self.index.duplicated().any():  # Combine duplicate indices
            # Duplicate rows can occur through concurrent append writes
            # (see set_value's append_write_csv); merge them by keeping the
            # first non-null value per (Objective, Instance, Run) group.
            combined = self.groupby(level=[0, 1, 2]).first()
            duplicates = self.index[self.index.duplicated(keep="first")]
            # Remove all duplicate entries from self
            self.drop(duplicates, inplace=True)
            for d in duplicates:  # Place combined duplicates in self
                self.loc[d, :] = combined.loc[d, :]

        # Sort the index to optimize lookup speed
        self.sort_index(axis=0, inplace=True)

101 

102 # Properties 

103 

104 @property 

105 def num_objectives(self: PerformanceDataFrame) -> int: 

106 """Retrieve the number of objectives in the DataFrame.""" 

107 return self.index.get_level_values(0).unique().size 

108 

109 @property 

110 def num_instances(self: PerformanceDataFrame) -> int: 

111 """Return the number of instances.""" 

112 return self.index.get_level_values(1).unique().size 

113 

114 @property 

115 def num_runs(self: PerformanceDataFrame) -> int: 

116 """Return the maximum number of runs of each instance.""" 

117 return self.index.get_level_values(2).unique().size 

118 

119 @property 

120 def num_solvers(self: PerformanceDataFrame) -> int: 

121 """Return the number of solvers.""" 

122 return self.columns.get_level_values(0).unique().size 

123 

124 @property 

125 def multi_objective(self: PerformanceDataFrame) -> bool: 

126 """Return whether the dataframe represent MO or not.""" 

127 return self.num_objectives > 1 

128 

129 @property 

130 def solvers(self: PerformanceDataFrame) -> list[str]: 

131 """Return the solver present as a list of strings.""" 

132 return self.columns.get_level_values(0).unique().to_list() 

133 

134 @property 

135 def objective_names(self: PerformanceDataFrame) -> list[str]: 

136 """Return the objective names as a list of strings.""" 

137 return self.index.get_level_values(0).unique().to_list() 

138 

139 @property 

140 def objectives(self: PerformanceDataFrame) -> list[SparkleObjective]: 

141 """Return the objectives as a list of SparkleObjectives.""" 

142 return [resolve_objective(o) for o in self.objective_names] 

143 

144 @property 

145 def instances(self: PerformanceDataFrame) -> list[str]: 

146 """Return the instances as a Pandas Index object.""" 

147 return self.index.get_level_values(1).unique().to_list() 

148 

149 @property 

150 def run_ids(self: PerformanceDataFrame) -> list[int]: 

151 """Return the run ids as a list of integers.""" 

152 return self.index.get_level_values(2).unique().to_list() 

153 

154 @property 

155 def has_missing_values(self: PerformanceDataFrame) -> bool: 

156 """Returns True if there are any missing values in the dataframe.""" 

157 return self.isnull().any().drop([PerformanceDataFrame.column_seed, 

158 PerformanceDataFrame.column_configuration], 

159 level=1).any() 

160 

161 def verify_objective(self: PerformanceDataFrame, 

162 objective: str) -> str: 

163 """Method to check whether the specified objective is valid. 

164 

165 Users are allowed to index the dataframe without specifying all dimensions. 

166 However, when dealing with multiple objectives this is not allowed and this 

167 is verified here. If we have only one objective this is returned. Otherwise, 

168 if an objective is specified by the user this is returned. 

169 

170 Args: 

171 objective: The objective given by the user 

172 """ 

173 if objective is None: 

174 if self.multi_objective: 

175 raise ValueError("Error: MO Data, but objective not specified.") 

176 elif self.num_objectives == 1: 

177 return self.objective_names[0] 

178 else: 

179 return PerformanceDataFrame.missing_objective 

180 return objective 

181 

182 def verify_run_id(self: PerformanceDataFrame, 

183 run_id: int) -> int: 

184 """Method to check whether run id is valid. 

185 

186 Similar to verify_objective but here we check the dimensionality of runs. 

187 

188 Args: 

189 run_id: the run as specified by the user. 

190 """ 

191 if run_id is None: 

192 if self.num_runs > 1: 

193 raise ValueError("Error: Multiple run performance data, " 

194 "but run not specified") 

195 else: 

196 run_id = self.run_ids[0] 

197 return run_id 

198 

199 def verify_indexing(self: PerformanceDataFrame, 

200 objective: str, 

201 run_id: int) -> tuple[str, int]: 

202 """Method to check whether data indexing is correct. 

203 

204 Users are allowed to use the Performance Dataframe without the second and 

205 fourth dimension (Objective and Run respectively) in the case they only 

206 have one objective or only do one run. This method adjusts the indexing for 

207 those cases accordingly. 

208 

209 Args: 

210 objective: The given objective name 

211 run_id: The given run index 

212 

213 Returns: 

214 A tuple representing the (possibly adjusted) Objective and Run index. 

215 """ 

216 objective = self.verify_objective(objective) 

217 run_id = self.verify_run_id(run_id) 

218 return objective, run_id 

219 

220 # Getters and Setters 

221 

222 def add_solver(self: PerformanceDataFrame, 

223 solver_name: str, 

224 initial_value: float | list[str | float] = None) -> None: 

225 """Add a new solver to the dataframe. Initializes value to None by default. 

226 

227 Args: 

228 solver_name: The name of the solver to be added. 

229 initial_value: The value assigned for each index of the new solver. 

230 If not None, must match the index dimension (n_obj * n_inst * n_runs). 

231 """ 

232 if solver_name in self.solvers: 

233 print(f"WARNING: Tried adding already existing solver {solver_name} to " 

234 f"Performance DataFrame: {self.csv_filepath}") 

235 return 

236 initial_value =\ 

237 [initial_value] if not isinstance(initial_value, list) else initial_value 

238 column_dim_size = len(PerformanceDataFrame.multi_column_names) 

239 if len(initial_value) < column_dim_size: 

240 initial_value.extend([None] * (column_dim_size - len(initial_value))) 

241 for field, value in zip(PerformanceDataFrame.multi_column_names, initial_value): 

242 self[solver_name, field] = value 

243 if self.num_solvers == 2: # Remove nan solver 

244 for solver in self.solvers: 

245 if str(solver) == str(PerformanceDataFrame.missing_value): 

246 self.remove_solver(solver) 

247 break 

248 

249 def add_objective(self: PerformanceDataFrame, 

250 objective_name: str, 

251 initial_value: float = None) -> None: 

252 """Add an objective to the DataFrame.""" 

253 initial_value = initial_value or self.missing_value 

254 if objective_name in self.objective_names: 

255 print(f"WARNING: Tried adding already existing objective {objective_name} " 

256 f"to Performance DataFrame: {self.csv_filepath}") 

257 return 

258 for instance, run in itertools.product(self.instances, self.run_ids): 

259 self.loc[(objective_name, instance, run)] = initial_value 

260 self.sort_index(axis=0, inplace=True) 

261 

262 def add_instance(self: PerformanceDataFrame, 

263 instance_name: str, 

264 initial_value: float = None) -> None: 

265 """Add and instance to the DataFrame.""" 

266 initial_value = initial_value or self.missing_value 

267 

268 if instance_name in self.instances: 

269 print(f"WARNING: Tried adding already existing instance {instance_name} " 

270 f"to Performance DataFrame: {self.csv_filepath}") 

271 return 

272 # Add rows for all combinations 

273 for objective, run in itertools.product(self.objective_names, self.run_ids): 

274 self.loc[(objective, instance_name, run)] = initial_value 

275 if self.num_instances == 2: # Remove nan instance 

276 for instance in self.instances: 

277 if not isinstance(instance, str) and math.isnan(instance): 

278 self.remove_instance(instance) 

279 break 

280 # Sort the index to optimize lookup speed 

281 self.sort_index(axis=0, inplace=True) 

282 

    def add_runs(self: PerformanceDataFrame,
                 num_extra_runs: int,
                 instance_names: list[str] = None) -> None:
        """Add runs to the DataFrame.

        Args:
            num_extra_runs: The number of runs to be added.
            instance_names: The instances for which runs are to be added.
                By default None, which means runs are added to all instances.
        """
        instance_names = self.instances if instance_names is None else instance_names
        for instance in instance_names:
            for objective in self.objective_names:
                # New run ids continue after the instance's current run count;
                # assumes existing runs are numbered 1..n without gaps.
                index_runs_start = len(self.loc[(objective, instance)]) + 1
                for run in range(index_runs_start, index_runs_start + num_extra_runs):
                    self.loc[(objective, instance, run)] = self.missing_value
                    # Sort the index to optimize lookup speed
                    # NOTE: It would be better to do this at the end, but that results in
                    # PerformanceWarning: indexing past lexsort depth may impact performance.
                    self.sort_index(axis=0, inplace=True)

303 

304 def remove_solver(self: PerformanceDataFrame, solver_name: str | list[str]) -> None: 

305 """Drop one or more solvers from the Dataframe.""" 

306 # To make sure objectives / runs are saved when no solvers are present 

307 if self.num_solvers == 1: 

308 for field in PerformanceDataFrame.multi_column_names: 

309 self[PerformanceDataFrame.missing_value, field] =\ 

310 PerformanceDataFrame.missing_value 

311 self.drop(columns=solver_name, level=0, axis=1, inplace=True) 

312 

313 def remove_instance(self: PerformanceDataFrame, instance_name: str) -> None: 

314 """Drop an instance from the Dataframe.""" 

315 # To make sure objectives / runs are saved when no instances are present 

316 if self.num_instances == 1: 

317 for objective, run in itertools.product(self.objective_names, self.run_ids): 

318 self.loc[(objective, PerformanceDataFrame.missing_value, run)] =\ 

319 PerformanceDataFrame.missing_value 

320 self.drop(instance_name, 

321 axis=0, 

322 level=PerformanceDataFrame.index_instance, inplace=True) 

323 # Sort the index to optimize lookup speed 

324 self.sort_index(axis=0, inplace=True) 

325 

326 def remove_runs(self: PerformanceDataFrame, 

327 runs: int | list[int], 

328 instance_names: list[str] = None) -> None: 

329 """Drop one or more runs from the Dataframe. 

330 

331 Args: 

332 runs: The run indices to be removed. If its an int, 

333 the last n runs are removed. NOTE: If each instance has a different 

334 number of runs, the amount of removed runs is not uniform. 

335 instance_names: The instances for which runs are to be removed. 

336 By default None, which means runs are removed from all instances. 

337 """ 

338 instance_names = self.instances if instance_names is None else instance_names 

339 runs = list(range((self.num_runs + 1) - runs, (self.num_runs + 1)))\ 

340 if isinstance(runs, int) else runs 

341 self.drop(runs, 

342 axis=0, 

343 level=PerformanceDataFrame.index_run, 

344 inplace=True) 

345 # Sort the index to optimize lookup speed 

346 self.sort_index(axis=0, inplace=True) 

347 

348 def remove_empty_runs(self: PerformanceDataFrame) -> None: 

349 """Remove runs that contain no data, except for the first.""" 

350 for row_index in self.index: 

351 if row_index[2] == 1: # First run, never delete 

352 continue 

353 if self.loc[row_index].isna().all(): 

354 self.drop(row_index, inplace=True) 

355 

356 def reset_value(self: PerformanceDataFrame, 

357 solver: str, 

358 instance: str, 

359 objective: str = None, 

360 run: int = None) -> None: 

361 """Reset a value in the dataframe.""" 

362 self.set_value(PerformanceDataFrame.missing_value, 

363 solver, instance, objective, run) 

364 

365 def set_value(self: PerformanceDataFrame, 

366 value: float | str | list[float | str] | list[list[float | str]], 

367 solver: str | list[str], 

368 instance: str | list[str], 

369 objective: str | list[str] = None, 

370 run: int | list[int] = None, 

371 solver_fields: list[str] = ["Value"], 

372 append_write_csv: bool = False) -> None: 

373 """Setter method to assign a value to the Dataframe. 

374 

375 Allows for setting the same value to multiple indices. 

376 

377 Args: 

378 value: Value(s) to be assigned. If value is a list, first dimension is 

379 the solver field, second dimension is if multiple different values are 

380 to be assigned. Must be the same shape as target. 

381 solver: The solver(s) for which the value should be set. 

382 If solver is a list, multiple solvers are set. If None, all 

383 solvers are set. 

384 instance: The instance(s) for which the value should be set. 

385 If instance is a list, multiple instances are set. If None, all 

386 instances are set. 

387 objective: The objectives for which the value should be set. 

388 When left None, set for all objectives 

389 run: The run index for which the value should be set. 

390 If left None, set for all runs. 

391 solver_fields: The level to which each value should be assinged. 

392 Defaults to ["Value"]. 

393 append_write_csv: For concurrent writing to the PerformanceDataFrame. 

394 If True, the value is directly appended to the CSV file. 

395 This will create duplicate entries in the file, but these are combined 

396 when loading the file. 

397 """ 

398 # Convert indices to slices for None values 

399 solver = slice(solver) if solver is None else solver 

400 instance = slice(instance) if instance is None else instance 

401 objective = slice(objective) if objective is None else objective 

402 run = slice(run) if run is None else run 

403 # Convert column indices to slices for setting multiple columns 

404 value = [value] if not isinstance(value, list) else value 

405 # NOTE: We currently forloop levels here, as it allows us to set the same 

406 # sequence of values to the indices 

407 for item, level in zip(value, solver_fields): 

408 self.loc[(objective, instance, run), (solver, level)] = item 

409 

410 if append_write_csv: 

411 writeable = self.loc[(objective, instance, run), :] 

412 if isinstance(writeable, pd.Series): # Single row, convert to pd.DataFrame 

413 writeable = self.loc[[(objective, instance, run)], :] 

414 # Append the new rows to the dataframe csv file 

415 writeable.to_csv(self.csv_filepath, mode="a", header=False) 

416 

417 def get_value(self: PerformanceDataFrame, 

418 solver: str | list[str], 

419 instance: str | list[str], 

420 objective: str = None, 

421 run: int = None, 

422 solver_fields: list[str] = ["Value"] 

423 ) -> float | str | list[Any]: 

424 """Index a value of the DataFrame and return it.""" 

425 # Convert indices to slices for None values 

426 solver = slice(solver) if solver is None else solver 

427 instance = slice(instance) if instance is None else instance 

428 objective = slice(objective) if objective is None else objective 

429 run = slice(run) if run is None else run 

430 target = self.loc[(objective, instance, run), (solver, solver_fields)].values 

431 

432 # Reduce dimensions when relevant 

433 if isinstance(target[0], np.ndarray) and len(target[0]) == 1: 

434 target = target.flatten() 

435 target = target.tolist() 

436 if len(target) == 1: 

437 return target[0] 

438 return target 

439 

440 # This method can be removed now that above method does its job 

441 def get_values(self: PerformanceDataFrame, 

442 solver: str, 

443 instance: str = None, 

444 objective: str = None, 

445 run: int = None, 

446 solver_fields: list[str] = ["Value"] 

447 ) -> list[float | str] | list[list[float | str]]: 

448 """Return a list of solver values.""" 

449 subdf = self[solver][solver_fields] 

450 if objective is not None: 

451 objective = self.verify_objective(objective) 

452 subdf = subdf.xs(objective, level=0, drop_level=False) 

453 if instance is not None: 

454 subdf = subdf.xs(instance, level=1, drop_level=False) 

455 if run is not None: 

456 run = self.verify_run_id(run) 

457 subdf = subdf.xs(run, level=2, drop_level=False) 

458 # Convert dict to list 

459 result = [subdf[field].to_list() for field in solver_fields] 

460 if len(result) == 1: 

461 return result[0] 

462 return result 

463 

464 def get_instance_num_runs(self: PerformanceDataFrame, 

465 instance: str) -> int: 

466 """Return the number of runs for an instance.""" 

467 # We assume each objective has the same index for Instance/Runs 

468 return len(self.loc[(self.objective_names[0], instance)].index) 

469 

470 # Calculables 

471 

472 def mean(self: PerformanceDataFrame, 

473 objective: str = None, 

474 solver: str = None, 

475 instance: str = None) -> float: 

476 """Return the mean value of a slice of the dataframe.""" 

477 objective = self.verify_objective(objective) 

478 subset = self.xs(objective, level=0) 

479 if solver is not None: 

480 subset = subset.xs(solver, axis=1, drop_level=False) 

481 if instance is not None: 

482 subset = subset.xs(instance, axis=0, drop_level=False) 

483 value = subset.astype(float).mean() 

484 if isinstance(value, pd.Series): 

485 return value.mean() 

486 return value 

487 

    # TODO: This method should be refactored or not exist
    def get_job_list(self: PerformanceDataFrame, rerun: bool = False) \
            -> list[tuple[str, str]]:
        """Return a list of performance computation jobs there are to be done.

        Get a list of tuple[instance, solver] to run from the performance data.
        If rerun is False (default), get only the tuples that don't have a
        value, else (True) get all the tuples.

        Args:
            rerun: Boolean indicating if we want to rerun all jobs

        Returns:
            A list of [instance, solver] combinations
        """
        # Format the dataframe such that only the values remain:
        # stacking moves the Meta column level into the row index, after which
        # the Seed/Configuration rows are dropped.
        df = self.stack(future_stack=True)
        df.drop([PerformanceDataFrame.column_seed,
                 PerformanceDataFrame.column_configuration], level=-1, inplace=True)
        # NOTE(review): the next call discards its result (droplevel returns a
        # new index); presumably dead code — confirm before removing.
        df.index.droplevel()
        if not rerun:  # Filter the nan values
            df = df.isnull()

        # Count the number of missing objective values for each Instance/Run/Algorithm
        df.index = df.index.droplevel(PerformanceDataFrame.index_objective)
        df.index = df.index.droplevel(-1)
        index_names = df.index.names
        df = df.groupby(df.index).agg({cname: "sum" for cname in df.columns})
        df.index = pd.MultiIndex.from_tuples(df.index, names=index_names)

        # Return the Instance, Run, Solver combinations
        return [index + (column, )
                for index, column in itertools.product(df.index, df.columns)
                if rerun or df[column][index] > 0]

522 

523 # TODO: This method should be refactored or not exist 

524 def remaining_jobs(self: PerformanceDataFrame) -> dict[str, list[str]]: 

525 """Return a dictionary for empty values as instance key and solver values.""" 

526 remaining_jobs = {} 

527 jobs = self.get_job_list(rerun=False) 

528 for instance, _, solver in jobs: 

529 if instance not in remaining_jobs: 

530 remaining_jobs[instance] = [solver] 

531 else: 

532 remaining_jobs[instance].append(solver) 

533 return remaining_jobs 

534 

    def configuration_performance(
            self: PerformanceDataFrame,
            solver: str,
            configuration: dict,
            objective: str | SparkleObjective = None,
            instances: list[str] = None,
            per_instance: bool = False) -> tuple[dict, float]:
        """Return the configuration performance for objective over the instances.

        Args:
            solver: The solver for which we determine evaluate the configuration
            configuration: The configuration to evaluate
            objective: The objective for which we calculate find the best value
            instances: The instances which should be selected for the evaluation
            per_instance: Whether to return the performance per instance,
                or aggregated.

        Returns:
            The best configuration and its aggregated performance.
        """
        objective = self.verify_objective(objective)
        instances = instances or slice(instances)  # Convert None to slice
        if isinstance(objective, str):
            objective = resolve_objective(objective)
        # Filter objective
        subdf = self.xs(objective.name, level=0, drop_level=True)

        if configuration:  # Filter configuration
            if not isinstance(configuration, dict):  # Get empty configuration
                # Non-dict truthy configuration: select rows without a
                # stored configuration.
                subdf = subdf[subdf[solver][
                    PerformanceDataFrame.column_configuration].isna()]
            else:
                # Configurations are stored as their str(dict) representation.
                subdf = subdf[subdf[solver][
                    PerformanceDataFrame.column_configuration] == str(configuration)]
        # Filter solver
        subdf = subdf.xs(solver, axis=1, drop_level=True)

        # Drop the seed, filter instances
        subdf = subdf.drop(PerformanceDataFrame.column_seed, axis=1).loc[instances, :]
        # Aggregate the runs per instance/configuration
        try:  # Can only aggregate numerical values
            subdf[PerformanceDataFrame.column_value] =\
                pd.to_numeric(subdf[PerformanceDataFrame.column_value])  # Ensure type
            subdf = subdf.groupby([PerformanceDataFrame.index_instance,
                                   PerformanceDataFrame.column_configuration],
                                  dropna=False).agg(objective.run_aggregator.__name__)
        except ValueError:
            # Non-numeric values: return them unaggregated.
            subdf.drop(PerformanceDataFrame.column_configuration, axis=1, inplace=True)
            return configuration, subdf.values.flatten().tolist()
        if per_instance:  # No instance aggregation
            # NOTE: How do we select the best configuration now if conf == None?
            return configuration, subdf.values.flatten().tolist()

        # Aggregate the instances per configuration
        subdf = subdf.droplevel(level=0).reset_index()  # Drop instance column
        subdf = subdf.groupby(PerformanceDataFrame.column_configuration,
                              dropna=False).agg(
            func=objective.instance_aggregator.__name__)

        if configuration:
            return configuration, subdf.values[0][0]
        # In case of no configuration given, select the one with best objective value
        best_index = subdf.idxmin() if objective.minimise else subdf.idxmax()
        try:
            # Parse the stored str(dict) back into a dictionary.
            best_configuration = ast.literal_eval(best_index.values[0])
        except Exception:  # Configuration is not a dictionary
            best_value = subdf.min() if objective.minimise else subdf.max()
            return {}, best_value.values[0]
        return (best_configuration,
                subdf.loc[best_index, PerformanceDataFrame.column_value].values[0])

605 

606 def best_configuration(self: PerformanceDataFrame, 

607 solver: str, 

608 objective: SparkleObjective = None, 

609 instances: list[str] = None) -> tuple[dict, float]: 

610 """Return the best configuration for the given objective over the instances. 

611 

612 Args: 

613 solver: The solver for which we determine the best configuration 

614 objective: The objective for which we calculate the best configuration 

615 instances: The instances which should be selected for the evaluation 

616 

617 Returns: 

618 The best configuration and its aggregated performance. 

619 """ 

620 return self.configuration_performance(solver, None, objective, instances) 

621 

    def best_instance_performance(
            self: PerformanceDataFrame,
            objective: str | SparkleObjective = None,
            run_id: int = None,
            exclude_solvers: list[str] = None) -> pd.Series:
        """Return the best performance for each instance in the portfolio.

        Args:
            objective: The objective for which we calculate the best performance
            run_id: The run for which we calculate the best performance. If None,
                we consider all runs.
            exclude_solvers: List of solvers to exclude in the calculation.

        Returns:
            The best performance for each instance in the portfolio.
        """
        objective = self.verify_objective(objective)
        if isinstance(objective, str):
            objective = resolve_objective(objective)
        # Drop Seed/Configuration
        subdf = self.drop(
            [PerformanceDataFrame.column_seed,
             PerformanceDataFrame.column_configuration],
            axis=1, level=1)
        subdf = subdf.xs(objective.name, level=0)
        if exclude_solvers is not None:
            subdf = subdf.drop(exclude_solvers, axis=1, level=0)
        if run_id is not None:
            run_id = self.verify_run_id(run_id)
            subdf = subdf.xs(run_id, level=1)
        else:
            # Drop the run level; this leaves one row per run under each
            # instance label.
            subdf = subdf.droplevel(level=1)
        # Best value across solvers, per row.
        if objective.minimise:
            series = subdf.min(axis=1)
        else:
            series = subdf.max(axis=1)
        # Ensure we always return the best for each run: after sorting by
        # quality, taking the first entry per instance label yields the best
        # value over all of its runs.
        series = series.sort_values(ascending=objective.minimise)
        return series.groupby(series.index).first().astype(float)

662 

663 def best_performance( 

664 self: PerformanceDataFrame, 

665 exclude_solvers: list[str] = [], 

666 objective: str | SparkleObjective = None) -> float: 

667 """Return the overall best performance of the portfolio. 

668 

669 Args: 

670 exclude_solvers: List of solvers to exclude in the calculation. 

671 Defaults to none. 

672 objective: The objective for which we calculate the best performance 

673 

674 Returns: 

675 The aggregated best performance of the portfolio over all instances. 

676 """ 

677 objective = self.verify_objective(objective) 

678 if isinstance(objective, str): 

679 objective = resolve_objective(objective) 

680 instance_best = self.best_instance_performance( 

681 objective, exclude_solvers=exclude_solvers).to_numpy(dtype=float) 

682 return objective.instance_aggregator(instance_best) 

683 

    def schedule_performance(
            self: PerformanceDataFrame,
            schedule: dict[str: list[tuple[str, float | None]]],
            target_solver: str = None,
            objective: str | SparkleObjective = None) -> float:
        """Return the performance of a selection schedule on the portfolio.

        Args:
            schedule: Compute the best performance according to a selection schedule.
                A dictionary with instances as keys and a list of tuple consisting of
                (solver, max_runtime) or solvers if no runtime prediction should be used.
            target_solver: If not None, store the values in this solver of the DF.
            objective: The objective for which we calculate the best performance

        Returns:
            The performance of the schedule over the instances in the dictionary.
        """
        objective = self.verify_objective(objective)
        if isinstance(objective, str):
            objective = resolve_objective(objective)
        select = min if objective.minimise else max
        performances = [0.0] * len(schedule.keys())
        for ix, instance in enumerate(schedule.keys()):
            for iy, (solver, max_runtime) in enumerate(schedule[instance]):
                performance = float(self.get_value(solver, instance, objective.name))
                if max_runtime is not None:  # We are dealing with runtime
                    # Runtimes accumulate over the schedule until a solver
                    # finishes within its budget.
                    performances[ix] += performance
                    if performance < max_runtime:
                        break  # Solver finished in time
                else:  # Quality, we take the best found performance
                    if iy == 0:  # First solver, set initial value
                        performances[ix] = performance
                        continue
                    performances[ix] = select(performances[ix], performance)
        if target_solver is not None:
            self.set_value(performances[ix], target_solver, instance, objective.name)
        return performances

721 

722 def marginal_contribution( 

723 self: PerformanceDataFrame, 

724 objective: str | SparkleObjective = None, 

725 sort: bool = False) -> list[float]: 

726 """Return the marginal contribution of the solvers on the instances. 

727 

728 Args: 

729 objective: The objective for which we calculate the marginal contribution. 

730 sort: Whether to sort the results afterwards 

731 Returns: 

732 The marginal contribution of each solver. 

733 """ 

734 output = [] 

735 objective = self.verify_objective(objective) 

736 if isinstance(objective, str): 

737 objective = resolve_objective(objective) 

738 best_performance = self.best_performance(objective=objective) 

739 for solver in self.solvers: 

740 # By calculating the best performance excluding this Solver, 

741 # we can determine its relative impact on the portfolio. 

742 missing_solver_best = self.best_performance( 

743 exclude_solvers=[solver], 

744 objective=objective) 

745 # Now we need to see how much the portfolio's best performance 

746 # decreases without this solver. 

747 marginal_contribution = missing_solver_best / best_performance 

748 if missing_solver_best == best_performance: 

749 # No change, no contribution 

750 marginal_contribution = 0.0 

751 output.append((solver, marginal_contribution, missing_solver_best)) 

752 if sort: 

753 output.sort(key=lambda x: x[1], reverse=objective.minimise) 

754 return output 

755 

    def get_solver_ranking(self: PerformanceDataFrame,
                           objective: str | SparkleObjective = None
                           ) -> list[tuple[str, float]]:
        """Return a list with solvers ranked by average performance.

        Args:
            objective: The objective to rank by. May be left None for
                single-objective data.

        Returns:
            (solver, aggregated performance) tuples, best first.
        """
        objective = self.verify_objective(objective)
        if isinstance(objective, str):
            objective = resolve_objective(objective)
        # Drop Seed/Configuration
        subdf = self.drop(
            [PerformanceDataFrame.column_seed,
             PerformanceDataFrame.column_configuration],
            axis=1, level=1)
        sub_df = subdf.loc(axis=0)[objective.name, :, :]
        # Reduce Runs Dimension
        sub_df = sub_df.droplevel("Run").astype(float)
        # By using .__name__, pandas converts it to a Pandas Aggregator function
        sub_df = sub_df.groupby(sub_df.index).agg(func=objective.run_aggregator.__name__)
        solver_ranking = [(solver, objective.instance_aggregator(
            sub_df[solver].astype(float))) for solver in self.solvers]
        # Sort the list by second value (the performance)
        solver_ranking.sort(key=lambda performance: performance[1],
                            reverse=(not objective.minimise))
        return solver_ranking

779 

780 def save_csv(self: PerformanceDataFrame, csv_filepath: Path = None) -> None: 

781 """Write a CSV to the given path. 

782 

783 Args: 

784 csv_filepath: String path to the csv file. Defaults to self.csv_filepath. 

785 """ 

786 csv_filepath = self.csv_filepath if csv_filepath is None else csv_filepath 

787 self.to_csv(csv_filepath) 

788 

    def clone(self: PerformanceDataFrame,
              csv_filepath: Path = None) -> PerformanceDataFrame:
        """Create a copy of this object.

        Args:
            csv_filepath: The new filepath to use for saving the object to.
                Warning: If the original path is used, it could lead to dataloss!
        """
        csv_filepath = csv_filepath or self.csv_filepath
        if self.csv_filepath.exists():
            # NOTE(review): this loads from the *target* path, not from
            # self.csv_filepath — correct when cloning in place (default), but
            # looks wrong when cloning to a fresh path; confirm intent.
            pd_copy = PerformanceDataFrame(csv_filepath)
        else:
            # No file to load from: build an empty frame with the same
            # dimensions, then copy every cell over element-wise.
            pd_copy = PerformanceDataFrame(
                csv_filepath=csv_filepath,
                solvers=self.solvers,
                objectives=self.objectives,
                instances=self.instances,
                n_runs=self.num_runs)
            for solver in self.solvers:
                for index in self.index:
                    for field in PerformanceDataFrame.multi_column_names:
                        pd_copy.at[index, (solver, field)] =\
                            self.loc[index, solver][field]
        return pd_copy

813 

814 def clean_csv(self: PerformanceDataFrame) -> None: 

815 """Set all values in Performance Data to None.""" 

816 self[:] = PerformanceDataFrame.missing_value 

817 self.save_csv() 

818 

    def to_autofolio(self: PerformanceDataFrame,
                     objective: SparkleObjective = None,
                     target: Path = None) -> Path:
        """Port the data to a format acceptable for AutoFolio.

        Args:
            objective: The single objective to export. Required for
                multi-objective data.
            target: Directory to write to. Defaults to the directory of
                self.csv_filepath.

        Returns:
            The path of the written CSV, or None when porting is unsupported
            (multi-objective without a given objective, or multiple runs).
        """
        if (objective is None and self.multi_objective or self.num_runs > 1):
            print(f"ERROR: Currently no porting available for {self.csv_filepath} "
                  "to Autofolio due to multi objective or number of runs.")
            return
        autofolio_df = super().copy()
        # Drop Seed/Configuration, then drop the level so only one column
        # per solver remains.
        autofolio_df = autofolio_df.drop([PerformanceDataFrame.column_seed,
                                          PerformanceDataFrame.column_configuration],
                                         axis=1, level=1).droplevel(level=1, axis=1)
        # Reduce the row index to plain instance labels.
        if objective is not None:
            autofolio_df = autofolio_df.loc[objective.name]
            autofolio_df.index = autofolio_df.index.droplevel("Run")
        else:
            autofolio_df.index = autofolio_df.index.droplevel(["Objective", "Run"])
        if target is None:
            path = self.csv_filepath.parent / f"autofolio_{self.csv_filepath.name}"
        else:
            path = target / f"autofolio_{self.csv_filepath.name}"
        autofolio_df.to_csv(path)
        return path