Coverage for sparkle/structures/performance_dataframe.py: 88%

365 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-04-03 10:42 +0000

1"""Module to manage performance data files and common operations on them.""" 

2from __future__ import annotations 

3import ast 

4from typing import Any 

5import itertools 

6from pathlib import Path 

7import math 

8import numpy as np 

9import pandas as pd 

10 

11from sparkle.types import SparkleObjective, resolve_objective 

12 

13 

class PerformanceDataFrame(pd.DataFrame):
    """Class to manage performance data and common operations on them."""

    # Sentinel for absent measurements and for placeholder solvers/instances.
    missing_value = math.nan
    # Placeholder objective name used when no objective was supplied.
    missing_objective = "UNKNOWN"

    # Row multi-index levels, in order.
    index_objective = "Objective"
    index_instance = "Instance"
    index_run = "Run"
    multi_index_names = [index_objective, index_instance, index_run]

    # Per-solver column multi-index levels, in order, with their dtypes.
    column_value = "Value"
    column_seed = "Seed"
    column_configuration = "Configuration"
    multi_column_names = [column_value, column_seed, column_configuration]
    multi_column_dtypes = [float, int, str]

30 

31 def __init__(self: PerformanceDataFrame, 

32 csv_filepath: Path, 

33 solvers: list[str] = None, 

34 objectives: list[str | SparkleObjective] = None, 

35 instances: list[str] = None, 

36 n_runs: int = 1, 

37 ) -> None: 

38 """Initialise a PerformanceDataFrame. 

39 

40 Consists of: 

41 - Columns representing the Solvers 

42 - Rows representing the result by multi-index in order of: 

43 * Objective (Static, given in constructor or read from file) 

44 * Instance 

45 * Runs (Static, given in constructor or read from file) 

46 

47 Args: 

48 csv_filepath: If path exists, load from Path. 

49 Otherwise create new and save to this path. 

50 solvers: List of solver names to be added into the Dataframe 

51 objectives: List of SparkleObjectives or objective names. By default None, 

52 then the objectives will be derived from Sparkle Settings if possible. 

53 instances: List of instance names to be added into the Dataframe 

54 n_runs: The number of runs to consider per Solver/Objective/Instance comb. 

55 """ 

56 if csv_filepath.exists(): 

57 dtypes = {key: value for key, value in zip( 

58 PerformanceDataFrame.multi_column_names, 

59 PerformanceDataFrame.multi_column_dtypes)} 

60 df = pd.read_csv(csv_filepath, 

61 header=[0, 1], index_col=[0, 1, 2], 

62 dtype=dtypes, 

63 on_bad_lines="skip") 

64 super().__init__(df) 

65 self.csv_filepath = csv_filepath 

66 else: 

67 # Initialize empty DataFrame 

68 run_ids = list(range(1, n_runs + 1)) # We count runs from 1 

69 # We always need objectives to maintain the dimensions 

70 if objectives is None: 

71 objectives = [PerformanceDataFrame.missing_objective] 

72 else: 

73 objectives = [str(o) for o in objectives] 

74 # We always need an instance to maintain the dimensions 

75 if instances is None: 

76 instances = [PerformanceDataFrame.missing_value] 

77 # We always need a solver to maintain the dimensions 

78 if solvers is None: 

79 solvers = [PerformanceDataFrame.missing_value] 

80 midx = pd.MultiIndex.from_product( 

81 [objectives, instances, run_ids], 

82 names=PerformanceDataFrame.multi_index_names) 

83 mcolumns = pd.MultiIndex.from_product( 

84 [solvers, PerformanceDataFrame.multi_column_names], 

85 names=["Solver", "Meta"]) 

86 super().__init__(PerformanceDataFrame.missing_value, 

87 index=midx, columns=mcolumns) 

88 self.csv_filepath = csv_filepath 

89 self.save_csv() 

90 

91 if self.index.duplicated().any(): # Combine duplicate indices 

92 combined = self.groupby(level=[0, 1, 2]).first() 

93 duplicates = self.index[self.index.duplicated(keep="first")] 

94 # Remove all duplicate entries from self 

95 self.drop(duplicates, inplace=True) 

96 for d in duplicates: # Place combined duplicates in self 

97 self.loc[d, :] = combined.loc[d, :] 

98 

99 # Sort the index to optimize lookup speed 

100 self.sort_index(axis=0, inplace=True) 

101 

102 # Properties 

103 

104 @property 

105 def num_objectives(self: PerformanceDataFrame) -> int: 

106 """Retrieve the number of objectives in the DataFrame.""" 

107 return self.index.get_level_values(0).unique().size 

108 

109 @property 

110 def num_instances(self: PerformanceDataFrame) -> int: 

111 """Return the number of instances.""" 

112 return self.index.get_level_values(1).unique().size 

113 

114 @property 

115 def num_runs(self: PerformanceDataFrame) -> int: 

116 """Return the maximum number of runs of each instance.""" 

117 return self.index.get_level_values(2).unique().size 

118 

119 @property 

120 def num_solvers(self: PerformanceDataFrame) -> int: 

121 """Return the number of solvers.""" 

122 return self.columns.get_level_values(0).unique().size 

123 

124 @property 

125 def multi_objective(self: PerformanceDataFrame) -> bool: 

126 """Return whether the dataframe represent MO or not.""" 

127 return self.num_objectives > 1 

128 

129 @property 

130 def solvers(self: PerformanceDataFrame) -> list[str]: 

131 """Return the solver present as a list of strings.""" 

132 return self.columns.get_level_values(0).unique().to_list() 

133 

134 @property 

135 def objective_names(self: PerformanceDataFrame) -> list[str]: 

136 """Return the objective names as a list of strings.""" 

137 return self.index.get_level_values(0).unique().to_list() 

138 

139 @property 

140 def objectives(self: PerformanceDataFrame) -> list[SparkleObjective]: 

141 """Return the objectives as a list of SparkleObjectives.""" 

142 return [resolve_objective(o) for o in self.objective_names] 

143 

144 @property 

145 def instances(self: PerformanceDataFrame) -> list[str]: 

146 """Return the instances as a Pandas Index object.""" 

147 return self.index.get_level_values(1).unique().to_list() 

148 

149 @property 

150 def run_ids(self: PerformanceDataFrame) -> list[int]: 

151 """Return the run ids as a list of integers.""" 

152 return self.index.get_level_values(2).unique().to_list() 

153 

154 @property 

155 def has_missing_values(self: PerformanceDataFrame) -> bool: 

156 """Returns True if there are any missing values in the dataframe.""" 

157 return self.isnull().any().drop([PerformanceDataFrame.column_seed, 

158 PerformanceDataFrame.column_configuration], 

159 level=1).any() 

160 

161 def verify_objective(self: PerformanceDataFrame, 

162 objective: str) -> str: 

163 """Method to check whether the specified objective is valid. 

164 

165 Users are allowed to index the dataframe without specifying all dimensions. 

166 However, when dealing with multiple objectives this is not allowed and this 

167 is verified here. If we have only one objective this is returned. Otherwise, 

168 if an objective is specified by the user this is returned. 

169 

170 Args: 

171 objective: The objective given by the user 

172 """ 

173 if objective is None: 

174 if self.multi_objective: 

175 raise ValueError("Error: MO Data, but objective not specified.") 

176 elif self.num_objectives == 1: 

177 return self.objective_names[0] 

178 else: 

179 return PerformanceDataFrame.missing_objective 

180 return objective 

181 

182 def verify_run_id(self: PerformanceDataFrame, 

183 run_id: int) -> int: 

184 """Method to check whether run id is valid. 

185 

186 Similar to verify_objective but here we check the dimensionality of runs. 

187 

188 Args: 

189 run_id: the run as specified by the user. 

190 """ 

191 if run_id is None: 

192 if self.num_runs > 1: 

193 raise ValueError("Error: Multiple run performance data, " 

194 "but run not specified") 

195 else: 

196 run_id = self.run_ids[0] 

197 return run_id 

198 

199 def verify_indexing(self: PerformanceDataFrame, 

200 objective: str, 

201 run_id: int) -> tuple[str, int]: 

202 """Method to check whether data indexing is correct. 

203 

204 Users are allowed to use the Performance Dataframe without the second and 

205 fourth dimension (Objective and Run respectively) in the case they only 

206 have one objective or only do one run. This method adjusts the indexing for 

207 those cases accordingly. 

208 

209 Args: 

210 objective: The given objective name 

211 run_id: The given run index 

212 

213 Returns: 

214 A tuple representing the (possibly adjusted) Objective and Run index. 

215 """ 

216 objective = self.verify_objective(objective) 

217 run_id = self.verify_run_id(run_id) 

218 return objective, run_id 

219 

220 # Getters and Setters 

221 

222 def add_solver(self: PerformanceDataFrame, 

223 solver_name: str, 

224 initial_value: float | list[str | float] = None) -> None: 

225 """Add a new solver to the dataframe. Initializes value to None by default. 

226 

227 Args: 

228 solver_name: The name of the solver to be added. 

229 initial_value: The value assigned for each index of the new solver. 

230 If not None, must match the index dimension (n_obj * n_inst * n_runs). 

231 """ 

232 if solver_name in self.solvers: 

233 print(f"WARNING: Tried adding already existing solver {solver_name} to " 

234 f"Performance DataFrame: {self.csv_filepath}") 

235 return 

236 initial_value =\ 

237 [initial_value] if not isinstance(initial_value, list) else initial_value 

238 column_dim_size = len(PerformanceDataFrame.multi_column_names) 

239 if len(initial_value) < column_dim_size: 

240 initial_value.extend([None] * (column_dim_size - len(initial_value))) 

241 for field, value in zip(PerformanceDataFrame.multi_column_names, initial_value): 

242 self[solver_name, field] = value 

243 if self.num_solvers == 2: # Remove nan solver 

244 for solver in self.solvers: 

245 if str(solver) == str(PerformanceDataFrame.missing_value): 

246 self.remove_solver(solver) 

247 break 

248 

249 def add_objective(self: PerformanceDataFrame, 

250 objective_name: str, 

251 initial_value: float = None) -> None: 

252 """Add an objective to the DataFrame.""" 

253 initial_value = initial_value or self.missing_value 

254 if objective_name in self.objective_names: 

255 print(f"WARNING: Tried adding already existing objective {objective_name} " 

256 f"to Performance DataFrame: {self.csv_filepath}") 

257 return 

258 for instance, run in itertools.product(self.instances, self.run_ids): 

259 self.loc[(objective_name, instance, run)] = initial_value 

260 self.sort_index(axis=0, inplace=True) 

261 

262 def add_instance(self: PerformanceDataFrame, 

263 instance_name: str, 

264 initial_values: Any | list[Any] = None) -> None: 

265 """Add and instance to the DataFrame. 

266 

267 Args: 

268 instance_name: The name of the instance to be added. 

269 initial_values: The values assigned for each index of the new instance. 

270 If list, must match the column dimension (Value, Seed, Configuration). 

271 """ 

272 initial_values = initial_values or self.missing_value 

273 if not isinstance(initial_values, list): 

274 initial_values = ([initial_values] 

275 * len(PerformanceDataFrame.multi_column_names) 

276 * self.num_solvers) 

277 elif len(initial_values) == len(PerformanceDataFrame.multi_column_names): 

278 initial_values = initial_values * self.num_solvers 

279 

280 if instance_name in self.instances: 

281 print(f"WARNING: Tried adding already existing instance {instance_name} " 

282 f"to Performance DataFrame: {self.csv_filepath}") 

283 return 

284 # Add rows for all combinations 

285 for objective, run in itertools.product(self.objective_names, self.run_ids): 

286 self.loc[(objective, instance_name, run)] = initial_values 

287 if self.num_instances == 2: # Remove nan instance 

288 for instance in self.instances: 

289 if not isinstance(instance, str) and math.isnan(instance): 

290 self.remove_instance(instance) 

291 break 

292 # Sort the index to optimize lookup speed 

293 self.sort_index(axis=0, inplace=True) 

294 

295 def add_runs(self: PerformanceDataFrame, 

296 num_extra_runs: int, 

297 instance_names: list[str] = None, 

298 initial_values: Any | list[Any] = None) -> None: 

299 """Add runs to the DataFrame. 

300 

301 Args: 

302 num_extra_runs: The number of runs to be added. 

303 instance_names: The instances for which runs are to be added. 

304 By default None, which means runs are added to all instances. 

305 initial_values: The initial value for each objective of each new run. 

306 If a list, needs to have a value for Value, Seed and Configuration. 

307 """ 

308 initial_values = initial_values or self.missing_value 

309 if not isinstance(initial_values, list): 

310 initial_values =\ 

311 [initial_values] * len(self.multi_column_names) * self.num_solvers 

312 elif len(initial_values) == len(self.multi_column_names): 

313 initial_values = initial_values * self.num_solvers 

314 instance_names = self.instances if instance_names is None else instance_names 

315 for instance in instance_names: 

316 for objective in self.objective_names: 

317 index_runs_start = len(self.loc[(objective, instance)]) + 1 

318 for run in range(index_runs_start, index_runs_start + num_extra_runs): 

319 self.loc[(objective, instance, run)] = initial_values 

320 # Sort the index to optimize lookup speed 

321 # NOTE: It would be better to do this at the end, but that results in 

322 # PerformanceWarning: indexing past lexsort depth may impact performance. 

323 self.sort_index(axis=0, inplace=True) 

324 

325 def remove_solver(self: PerformanceDataFrame, solver_name: str | list[str]) -> None: 

326 """Drop one or more solvers from the Dataframe.""" 

327 # To make sure objectives / runs are saved when no solvers are present 

328 if self.num_solvers == 1: 

329 for field in PerformanceDataFrame.multi_column_names: 

330 self[PerformanceDataFrame.missing_value, field] =\ 

331 PerformanceDataFrame.missing_value 

332 self.drop(columns=solver_name, level=0, axis=1, inplace=True) 

333 

334 def remove_instance(self: PerformanceDataFrame, instance_name: str) -> None: 

335 """Drop an instance from the Dataframe.""" 

336 # To make sure objectives / runs are saved when no instances are present 

337 if self.num_instances == 1: 

338 for objective, run in itertools.product(self.objective_names, self.run_ids): 

339 self.loc[(objective, PerformanceDataFrame.missing_value, run)] =\ 

340 PerformanceDataFrame.missing_value 

341 self.drop(instance_name, 

342 axis=0, 

343 level=PerformanceDataFrame.index_instance, inplace=True) 

344 # Sort the index to optimize lookup speed 

345 self.sort_index(axis=0, inplace=True) 

346 

347 def remove_runs(self: PerformanceDataFrame, 

348 runs: int | list[int], 

349 instance_names: list[str] = None) -> None: 

350 """Drop one or more runs from the Dataframe. 

351 

352 Args: 

353 runs: The run indices to be removed. If its an int, 

354 the last n runs are removed. NOTE: If each instance has a different 

355 number of runs, the amount of removed runs is not uniform. 

356 instance_names: The instances for which runs are to be removed. 

357 By default None, which means runs are removed from all instances. 

358 """ 

359 instance_names = self.instances if instance_names is None else instance_names 

360 runs = list(range((self.num_runs + 1) - runs, (self.num_runs + 1)))\ 

361 if isinstance(runs, int) else runs 

362 self.drop(runs, 

363 axis=0, 

364 level=PerformanceDataFrame.index_run, 

365 inplace=True) 

366 # Sort the index to optimize lookup speed 

367 self.sort_index(axis=0, inplace=True) 

368 

369 def remove_empty_runs(self: PerformanceDataFrame) -> None: 

370 """Remove runs that contain no data, except for the first.""" 

371 for row_index in self.index: 

372 if row_index[2] == 1: # First run, never delete 

373 continue 

374 if self.loc[row_index].isna().all(): 

375 self.drop(row_index, inplace=True) 

376 

377 def reset_value(self: PerformanceDataFrame, 

378 solver: str, 

379 instance: str, 

380 objective: str = None, 

381 run: int = None) -> None: 

382 """Reset a value in the dataframe.""" 

383 self.set_value(PerformanceDataFrame.missing_value, 

384 solver, instance, objective, run) 

385 

386 def set_value(self: PerformanceDataFrame, 

387 value: float | str | list[float | str] | list[list[float | str]], 

388 solver: str | list[str], 

389 instance: str | list[str], 

390 objective: str | list[str] = None, 

391 run: int | list[int] = None, 

392 solver_fields: list[str] = ["Value"], 

393 append_write_csv: bool = False) -> None: 

394 """Setter method to assign a value to the Dataframe. 

395 

396 Allows for setting the same value to multiple indices. 

397 

398 Args: 

399 value: Value(s) to be assigned. If value is a list, first dimension is 

400 the solver field, second dimension is if multiple different values are 

401 to be assigned. Must be the same shape as target. 

402 solver: The solver(s) for which the value should be set. 

403 If solver is a list, multiple solvers are set. If None, all 

404 solvers are set. 

405 instance: The instance(s) for which the value should be set. 

406 If instance is a list, multiple instances are set. If None, all 

407 instances are set. 

408 objective: The objectives for which the value should be set. 

409 When left None, set for all objectives 

410 run: The run index for which the value should be set. 

411 If left None, set for all runs. 

412 solver_fields: The level to which each value should be assinged. 

413 Defaults to ["Value"]. 

414 append_write_csv: For concurrent writing to the PerformanceDataFrame. 

415 If True, the value is directly appended to the CSV file. 

416 This will create duplicate entries in the file, but these are combined 

417 when loading the file. 

418 """ 

419 # Convert indices to slices for None values 

420 solver = slice(solver) if solver is None else solver 

421 instance = slice(instance) if instance is None else instance 

422 objective = slice(objective) if objective is None else objective 

423 run = slice(run) if run is None else run 

424 # Convert column indices to slices for setting multiple columns 

425 value = [value] if not isinstance(value, list) else value 

426 # NOTE: We currently forloop levels here, as it allows us to set the same 

427 # sequence of values to the indices 

428 for item, level in zip(value, solver_fields): 

429 self.loc[(objective, instance, run), (solver, level)] = item 

430 

431 if append_write_csv: 

432 writeable = self.loc[(objective, instance, run), :] 

433 if isinstance(writeable, pd.Series): # Single row, convert to pd.DataFrame 

434 writeable = self.loc[[(objective, instance, run)], :] 

435 # Append the new rows to the dataframe csv file 

436 writeable.to_csv(self.csv_filepath, mode="a", header=False) 

437 

438 def get_value(self: PerformanceDataFrame, 

439 solver: str | list[str], 

440 instance: str | list[str], 

441 objective: str = None, 

442 run: int = None, 

443 solver_fields: list[str] = ["Value"] 

444 ) -> float | str | list[Any]: 

445 """Index a value of the DataFrame and return it.""" 

446 # Convert indices to slices for None values 

447 solver = slice(solver) if solver is None else solver 

448 instance = slice(instance) if instance is None else instance 

449 objective = slice(objective) if objective is None else objective 

450 run = slice(run) if run is None else run 

451 target = self.loc[(objective, instance, run), (solver, solver_fields)].values 

452 

453 # Reduce dimensions when relevant 

454 if isinstance(target[0], np.ndarray) and len(target[0]) == 1: 

455 target = target.flatten() 

456 target = target.tolist() 

457 if len(target) == 1: 

458 return target[0] 

459 return target 

460 

461 # This method can be removed now that above method does its job 

462 def get_values(self: PerformanceDataFrame, 

463 solver: str, 

464 instance: str = None, 

465 objective: str = None, 

466 run: int = None, 

467 solver_fields: list[str] = ["Value"] 

468 ) -> list[float | str] | list[list[float | str]]: 

469 """Return a list of solver values.""" 

470 subdf = self[solver][solver_fields] 

471 if objective is not None: 

472 objective = self.verify_objective(objective) 

473 subdf = subdf.xs(objective, level=0, drop_level=False) 

474 if instance is not None: 

475 subdf = subdf.xs(instance, level=1, drop_level=False) 

476 if run is not None: 

477 run = self.verify_run_id(run) 

478 subdf = subdf.xs(run, level=2, drop_level=False) 

479 # Convert dict to list 

480 result = [subdf[field].to_list() for field in solver_fields] 

481 if len(result) == 1: 

482 return result[0] 

483 return result 

484 

485 def get_instance_num_runs(self: PerformanceDataFrame, 

486 instance: str) -> int: 

487 """Return the number of runs for an instance.""" 

488 # We assume each objective has the same index for Instance/Runs 

489 return len(self.loc[(self.objective_names[0], instance)].index) 

490 

491 # Calculables 

492 

493 def mean(self: PerformanceDataFrame, 

494 objective: str = None, 

495 solver: str = None, 

496 instance: str = None) -> float: 

497 """Return the mean value of a slice of the dataframe.""" 

498 objective = self.verify_objective(objective) 

499 subset = self.xs(objective, level=0) 

500 if solver is not None: 

501 subset = subset.xs(solver, axis=1, drop_level=False) 

502 if instance is not None: 

503 subset = subset.xs(instance, axis=0, drop_level=False) 

504 value = subset.astype(float).mean() 

505 if isinstance(value, pd.Series): 

506 return value.mean() 

507 return value 

508 

509 # TODO: This method should be refactored or not exist 

510 def get_job_list(self: PerformanceDataFrame, rerun: bool = False) \ 

511 -> list[tuple[str, str]]: 

512 """Return a list of performance computation jobs there are to be done. 

513 

514 Get a list of tuple[instance, solver] to run from the performance data. 

515 If rerun is False (default), get only the tuples that don't have a 

516 value, else (True) get all the tuples. 

517 

518 Args: 

519 rerun: Boolean indicating if we want to rerun all jobs 

520 

521 Returns: 

522 A list of [instance, solver] combinations 

523 """ 

524 # Format the dataframe such that only the values remain 

525 df = self.stack(future_stack=True) 

526 df.drop([PerformanceDataFrame.column_seed, 

527 PerformanceDataFrame.column_configuration], level=-1, inplace=True) 

528 df.index.droplevel() 

529 if not rerun: # Filter the nan values 

530 df = df.isnull() 

531 

532 # Count the number of missing objective values for each Instance/Run/Algorithm 

533 df.index = df.index.droplevel(PerformanceDataFrame.index_objective) 

534 df.index = df.index.droplevel(-1) 

535 index_names = df.index.names 

536 df = df.groupby(df.index).agg({cname: "sum" for cname in df.columns}) 

537 df.index = pd.MultiIndex.from_tuples(df.index, names=index_names) 

538 

539 # Return the Instance, Run, Solver combinations 

540 return [index + (column, ) 

541 for index, column in itertools.product(df.index, df.columns) 

542 if rerun or df[column][index] > 0] 

543 

544 # TODO: This method should be refactored or not exist 

545 def remaining_jobs(self: PerformanceDataFrame) -> dict[str, list[str]]: 

546 """Return a dictionary for empty values as instance key and solver values.""" 

547 remaining_jobs = {} 

548 jobs = self.get_job_list(rerun=False) 

549 for instance, _, solver in jobs: 

550 if instance not in remaining_jobs: 

551 remaining_jobs[instance] = [solver] 

552 else: 

553 remaining_jobs[instance].append(solver) 

554 return remaining_jobs 

555 

556 def configuration_performance( 

557 self: PerformanceDataFrame, 

558 solver: str, 

559 configuration: dict, 

560 objective: str | SparkleObjective = None, 

561 instances: list[str] = None, 

562 per_instance: bool = False) -> tuple[dict, float]: 

563 """Return the configuration performance for objective over the instances. 

564 

565 Args: 

566 solver: The solver for which we determine evaluate the configuration 

567 configuration: The configuration to evaluate 

568 objective: The objective for which we calculate find the best value 

569 instances: The instances which should be selected for the evaluation 

570 per_instance: Whether to return the performance per instance, 

571 or aggregated. 

572 

573 Returns: 

574 The best configuration and its aggregated performance. 

575 """ 

576 objective = self.verify_objective(objective) 

577 instances = instances or slice(instances) # Convert None to slice 

578 if isinstance(objective, str): 

579 objective = resolve_objective(objective) 

580 # Filter objective 

581 subdf = self.xs(objective.name, level=0, drop_level=True) 

582 

583 if configuration: # Filter configuration 

584 if not isinstance(configuration, dict): # Get empty configuration 

585 subdf = subdf[subdf[solver][ 

586 PerformanceDataFrame.column_configuration].isna()] 

587 else: 

588 subdf = subdf[subdf[solver][ 

589 PerformanceDataFrame.column_configuration] == str(configuration)] 

590 # Filter solver 

591 subdf = subdf.xs(solver, axis=1, drop_level=True) 

592 

593 # Drop the seed, filter instances 

594 subdf = subdf.drop(PerformanceDataFrame.column_seed, axis=1).loc[instances, :] 

595 # Aggregate the runs per instance/configuration 

596 try: # Can only aggregate numerical values 

597 subdf[PerformanceDataFrame.column_value] =\ 

598 pd.to_numeric(subdf[PerformanceDataFrame.column_value]) # Ensure type 

599 subdf = subdf.groupby([PerformanceDataFrame.index_instance, 

600 PerformanceDataFrame.column_configuration], 

601 dropna=False).agg(objective.run_aggregator.__name__) 

602 except ValueError: 

603 subdf.drop(PerformanceDataFrame.column_configuration, axis=1, inplace=True) 

604 return configuration, subdf.values.flatten().tolist() 

605 if per_instance: # No instance aggregation 

606 # NOTE: How do we select the best configuration now if conf == None? 

607 return configuration, subdf.values.flatten().tolist() 

608 

609 # Aggregate the instances per configuration 

610 subdf = subdf.droplevel(level=0).reset_index() # Drop instance column 

611 subdf = subdf.groupby(PerformanceDataFrame.column_configuration, 

612 dropna=False).agg( 

613 func=objective.instance_aggregator.__name__) 

614 

615 if configuration: 

616 return configuration, subdf.values[0][0] 

617 # In case of no configuration given, select the one with best objective value 

618 best_index = subdf.idxmin() if objective.minimise else subdf.idxmax() 

619 try: 

620 best_configuration = ast.literal_eval(best_index.values[0]) 

621 except Exception: # Configuration is not a dictionary 

622 best_value = subdf.min() if objective.minimise else subdf.max() 

623 return {}, best_value.values[0] 

624 return (best_configuration, 

625 subdf.loc[best_index, PerformanceDataFrame.column_value].values[0]) 

626 

627 def best_configuration(self: PerformanceDataFrame, 

628 solver: str, 

629 objective: SparkleObjective = None, 

630 instances: list[str] = None) -> tuple[dict, float]: 

631 """Return the best configuration for the given objective over the instances. 

632 

633 Args: 

634 solver: The solver for which we determine the best configuration 

635 objective: The objective for which we calculate the best configuration 

636 instances: The instances which should be selected for the evaluation 

637 

638 Returns: 

639 The best configuration and its aggregated performance. 

640 """ 

641 return self.configuration_performance(solver, None, objective, instances) 

642 

643 def best_instance_performance( 

644 self: PerformanceDataFrame, 

645 objective: str | SparkleObjective = None, 

646 run_id: int = None, 

647 exclude_solvers: list[str] = None) -> pd.Series: 

648 """Return the best performance for each instance in the portfolio. 

649 

650 Args: 

651 objective: The objective for which we calculate the best performance 

652 run_id: The run for which we calculate the best performance. If None, 

653 we consider all runs. 

654 exclude_solvers: List of solvers to exclude in the calculation. 

655 

656 Returns: 

657 The best performance for each instance in the portfolio. 

658 """ 

659 objective = self.verify_objective(objective) 

660 if isinstance(objective, str): 

661 objective = resolve_objective(objective) 

662 # Drop Seed/Configuration 

663 subdf = self.drop( 

664 [PerformanceDataFrame.column_seed, 

665 PerformanceDataFrame.column_configuration], 

666 axis=1, level=1) 

667 subdf = subdf.xs(objective.name, level=0) 

668 if exclude_solvers is not None: 

669 subdf = subdf.drop(exclude_solvers, axis=1, level=0) 

670 if run_id is not None: 

671 run_id = self.verify_run_id(run_id) 

672 subdf = subdf.xs(run_id, level=1) 

673 else: 

674 # Drop the run level 

675 subdf = subdf.droplevel(level=1) 

676 if objective.minimise: 

677 series = subdf.min(axis=1) 

678 else: 

679 series = subdf.max(axis=1) 

680 # Ensure we always return the best for each run 

681 series = series.sort_values(ascending=objective.minimise) 

682 return series.groupby(series.index).first().astype(float) 

683 

684 def best_performance( 

685 self: PerformanceDataFrame, 

686 exclude_solvers: list[str] = [], 

687 objective: str | SparkleObjective = None) -> float: 

688 """Return the overall best performance of the portfolio. 

689 

690 Args: 

691 exclude_solvers: List of solvers to exclude in the calculation. 

692 Defaults to none. 

693 objective: The objective for which we calculate the best performance 

694 

695 Returns: 

696 The aggregated best performance of the portfolio over all instances. 

697 """ 

698 objective = self.verify_objective(objective) 

699 if isinstance(objective, str): 

700 objective = resolve_objective(objective) 

701 instance_best = self.best_instance_performance( 

702 objective, exclude_solvers=exclude_solvers).to_numpy(dtype=float) 

703 return objective.instance_aggregator(instance_best) 

704 

705 def schedule_performance( 

706 self: PerformanceDataFrame, 

707 schedule: dict[str: dict[str: (str, int)]], 

708 target_solver: str = None, 

709 objective: str | SparkleObjective = None) -> float: 

710 """Return the performance of a selection schedule on the portfolio. 

711 

712 Args: 

713 schedule: Compute the best performance according to a selection schedule. 

714 A schedule is a dictionary of instances, with a schedule per instance, 

715 consisting of a pair of solver and maximum runtime. 

716 target_solver: If not None, store the values in this solver of the DF. 

717 objective: The objective for which we calculate the best performance 

718 

719 Returns: 

720 The performance of the schedule over the instances in the dictionary. 

721 """ 

722 objective = self.verify_objective(objective) 

723 if isinstance(objective, str): 

724 objective = resolve_objective(objective) 

725 select = min if objective.minimise else max 

726 performances = [0.0] * len(schedule.keys()) 

727 for ix, instance in enumerate(schedule.keys()): 

728 for iy, (solver, max_runtime) in enumerate(schedule[instance]): 

729 performance = float(self.get_value(solver, instance, objective.name)) 

730 if max_runtime is not None: # We are dealing with runtime 

731 performances[ix] += performance 

732 if performance < max_runtime: 

733 break # Solver finished in time 

734 else: # Quality, we take the best found performance 

735 if iy == 0: # First solver, set initial value 

736 performances[ix] = performance 

737 continue 

738 performances[ix] = select(performances[ix], performance) 

739 if target_solver is not None: 

740 self.set_value(performances[ix], target_solver, instance, objective.name) 

741 return performances 

742 

743 def marginal_contribution( 

744 self: PerformanceDataFrame, 

745 objective: str | SparkleObjective = None, 

746 sort: bool = False) -> list[float]: 

747 """Return the marginal contribution of the solvers on the instances. 

748 

749 Args: 

750 objective: The objective for which we calculate the marginal contribution. 

751 sort: Whether to sort the results afterwards 

752 Returns: 

753 The marginal contribution of each solver. 

754 """ 

755 output = [] 

756 objective = self.verify_objective(objective) 

757 if isinstance(objective, str): 

758 objective = resolve_objective(objective) 

759 best_performance = self.best_performance(objective=objective) 

760 for solver in self.solvers: 

761 # By calculating the best performance excluding this Solver, 

762 # we can determine its relative impact on the portfolio. 

763 missing_solver_best = self.best_performance( 

764 exclude_solvers=[solver], 

765 objective=objective) 

766 # Now we need to see how much the portfolio's best performance 

767 # decreases without this solver. 

768 marginal_contribution = missing_solver_best / best_performance 

769 if missing_solver_best == best_performance: 

770 # No change, no contribution 

771 marginal_contribution = 0.0 

772 output.append((solver, marginal_contribution, missing_solver_best)) 

773 if sort: 

774 output.sort(key=lambda x: x[1], reverse=objective.minimise) 

775 return output 

776 

777 def get_solver_ranking(self: PerformanceDataFrame, 

778 objective: str | SparkleObjective = None 

779 ) -> list[tuple[str, float]]: 

780 """Return a list with solvers ranked by average performance.""" 

781 objective = self.verify_objective(objective) 

782 if isinstance(objective, str): 

783 objective = resolve_objective(objective) 

784 # Drop Seed/Configuration 

785 subdf = self.drop( 

786 [PerformanceDataFrame.column_seed, 

787 PerformanceDataFrame.column_configuration], 

788 axis=1, level=1) 

789 sub_df = subdf.loc(axis=0)[objective.name, :, :] 

790 # Reduce Runs Dimension 

791 sub_df = sub_df.droplevel("Run").astype(float) 

792 # By using .__name__, pandas converts it to a Pandas Aggregator function 

793 sub_df = sub_df.groupby(sub_df.index).agg(func=objective.run_aggregator.__name__) 

794 solver_ranking = [(solver, objective.instance_aggregator( 

795 sub_df[solver].astype(float))) for solver in self.solvers] 

796 # Sort the list by second value (the performance) 

797 solver_ranking.sort(key=lambda performance: performance[1], 

798 reverse=(not objective.minimise)) 

799 return solver_ranking 

800 

801 def save_csv(self: PerformanceDataFrame, csv_filepath: Path = None) -> None: 

802 """Write a CSV to the given path. 

803 

804 Args: 

805 csv_filepath: String path to the csv file. Defaults to self.csv_filepath. 

806 """ 

807 csv_filepath = self.csv_filepath if csv_filepath is None else csv_filepath 

808 self.to_csv(csv_filepath) 

809 

810 def clone(self: PerformanceDataFrame, 

811 csv_filepath: Path = None) -> PerformanceDataFrame: 

812 """Create a copy of this object. 

813 

814 Args: 

815 csv_filepath: The new filepath to use for saving the object to. 

816 Warning: If the original path is used, it could lead to dataloss! 

817 """ 

818 csv_filepath = csv_filepath or self.csv_filepath 

819 if self.csv_filepath.exists(): 

820 pd_copy = PerformanceDataFrame(csv_filepath) 

821 else: 

822 pd_copy = PerformanceDataFrame( 

823 csv_filepath=csv_filepath, 

824 solvers=self.solvers, 

825 objectives=self.objectives, 

826 instances=self.instances, 

827 n_runs=self.num_runs) 

828 for solver in self.solvers: 

829 for index in self.index: 

830 for field in PerformanceDataFrame.multi_column_names: 

831 pd_copy.at[index, (solver, field)] =\ 

832 self.loc[index, solver][field] 

833 return pd_copy 

834 

835 def clean_csv(self: PerformanceDataFrame) -> None: 

836 """Set all values in Performance Data to None.""" 

837 self[:] = PerformanceDataFrame.missing_value 

838 self.save_csv()