Coverage for sparkle/structures/performance_dataframe.py: 88%
365 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-04-03 10:42 +0000
1"""Module to manage performance data files and common operations on them."""
2from __future__ import annotations
3import ast
4from typing import Any
5import itertools
6from pathlib import Path
7import math
8import numpy as np
9import pandas as pd
11from sparkle.types import SparkleObjective, resolve_objective
class PerformanceDataFrame(pd.DataFrame):
    """Class to manage performance data and common operations on them."""
    # Sentinel used for empty cells and as placeholder solver/instance label
    missing_value = math.nan
    # Placeholder objective name used when no objective is specified
    missing_objective = "UNKNOWN"
    # Row multi-index level names, in order: (Objective, Instance, Run)
    index_objective = "Objective"
    index_instance = "Instance"
    index_run = "Run"
    multi_index_names = [index_objective, index_instance, index_run]
    # Column multi-index: each solver has a (Value, Seed, Configuration) triple
    column_value = "Value"
    column_seed = "Seed"
    column_configuration = "Configuration"
    multi_column_names = [column_value, column_seed, column_configuration]
    # dtype per column field, same order as multi_column_names
    multi_column_dtypes = [float, int, str]
    def __init__(self: PerformanceDataFrame,
                 csv_filepath: Path,
                 solvers: list[str] = None,
                 objectives: list[str | SparkleObjective] = None,
                 instances: list[str] = None,
                 n_runs: int = 1,
                 ) -> None:
        """Initialise a PerformanceDataFrame.

        Consists of:
            - Columns representing the Solvers
            - Rows representing the result by multi-index in order of:
                * Objective (Static, given in constructor or read from file)
                * Instance
                * Runs (Static, given in constructor or read from file)

        Args:
            csv_filepath: If path exists, load from Path.
                Otherwise create new and save to this path.
            solvers: List of solver names to be added into the Dataframe
            objectives: List of SparkleObjectives or objective names. By default None,
                then the objectives will be derived from Sparkle Settings if possible.
            instances: List of instance names to be added into the Dataframe
            n_runs: The number of runs to consider per Solver/Objective/Instance comb.
        """
        if csv_filepath.exists():
            # Load existing data, enforcing per-field dtypes
            # (Value=float, Seed=int, Configuration=str)
            dtypes = {key: value for key, value in zip(
                PerformanceDataFrame.multi_column_names,
                PerformanceDataFrame.multi_column_dtypes)}
            df = pd.read_csv(csv_filepath,
                             header=[0, 1], index_col=[0, 1, 2],
                             dtype=dtypes,
                             on_bad_lines="skip")
            super().__init__(df)
            self.csv_filepath = csv_filepath
        else:
            # Initialize empty DataFrame
            run_ids = list(range(1, n_runs + 1))  # We count runs from 1
            # We always need objectives to maintain the dimensions
            if objectives is None:
                objectives = [PerformanceDataFrame.missing_objective]
            else:
                objectives = [str(o) for o in objectives]
            # We always need an instance to maintain the dimensions
            if instances is None:
                instances = [PerformanceDataFrame.missing_value]
            # We always need a solver to maintain the dimensions
            if solvers is None:
                solvers = [PerformanceDataFrame.missing_value]
            midx = pd.MultiIndex.from_product(
                [objectives, instances, run_ids],
                names=PerformanceDataFrame.multi_index_names)
            mcolumns = pd.MultiIndex.from_product(
                [solvers, PerformanceDataFrame.multi_column_names],
                names=["Solver", "Meta"])
            super().__init__(PerformanceDataFrame.missing_value,
                             index=midx, columns=mcolumns)
            self.csv_filepath = csv_filepath
            self.save_csv()

        if self.index.duplicated().any():  # Combine duplicate indices
            # Duplicates can occur from concurrent append writes
            # (see set_value with append_write_csv=True); keep the first
            # non-null value per duplicated index.
            combined = self.groupby(level=[0, 1, 2]).first()
            duplicates = self.index[self.index.duplicated(keep="first")]
            # Remove all duplicate entries from self
            self.drop(duplicates, inplace=True)
            for d in duplicates:  # Place combined duplicates in self
                self.loc[d, :] = combined.loc[d, :]

        # Sort the index to optimize lookup speed
        self.sort_index(axis=0, inplace=True)
102 # Properties
    @property
    def num_objectives(self: PerformanceDataFrame) -> int:
        """Retrieve the number of objectives in the DataFrame."""
        return self.index.get_level_values(0).unique().size

    @property
    def num_instances(self: PerformanceDataFrame) -> int:
        """Return the number of instances."""
        return self.index.get_level_values(1).unique().size

    @property
    def num_runs(self: PerformanceDataFrame) -> int:
        """Return the maximum number of runs of each instance."""
        return self.index.get_level_values(2).unique().size

    @property
    def num_solvers(self: PerformanceDataFrame) -> int:
        """Return the number of solvers."""
        return self.columns.get_level_values(0).unique().size

    @property
    def multi_objective(self: PerformanceDataFrame) -> bool:
        """Return whether the dataframe represent MO or not."""
        return self.num_objectives > 1

    @property
    def solvers(self: PerformanceDataFrame) -> list[str]:
        """Return the solver present as a list of strings."""
        return self.columns.get_level_values(0).unique().to_list()

    @property
    def objective_names(self: PerformanceDataFrame) -> list[str]:
        """Return the objective names as a list of strings."""
        return self.index.get_level_values(0).unique().to_list()

    @property
    def objectives(self: PerformanceDataFrame) -> list[SparkleObjective]:
        """Return the objectives as a list of SparkleObjectives."""
        return [resolve_objective(o) for o in self.objective_names]

    @property
    def instances(self: PerformanceDataFrame) -> list[str]:
        """Return the instances as a Pandas Index object."""
        return self.index.get_level_values(1).unique().to_list()

    @property
    def run_ids(self: PerformanceDataFrame) -> list[int]:
        """Return the run ids as a list of integers."""
        return self.index.get_level_values(2).unique().to_list()

    @property
    def has_missing_values(self: PerformanceDataFrame) -> bool:
        """Returns True if there are any missing values in the dataframe."""
        # Only the Value field counts as "missing": Seed and Configuration
        # columns may legitimately be empty, so they are dropped first.
        return self.isnull().any().drop([PerformanceDataFrame.column_seed,
                                         PerformanceDataFrame.column_configuration],
                                        level=1).any()
161 def verify_objective(self: PerformanceDataFrame,
162 objective: str) -> str:
163 """Method to check whether the specified objective is valid.
165 Users are allowed to index the dataframe without specifying all dimensions.
166 However, when dealing with multiple objectives this is not allowed and this
167 is verified here. If we have only one objective this is returned. Otherwise,
168 if an objective is specified by the user this is returned.
170 Args:
171 objective: The objective given by the user
172 """
173 if objective is None:
174 if self.multi_objective:
175 raise ValueError("Error: MO Data, but objective not specified.")
176 elif self.num_objectives == 1:
177 return self.objective_names[0]
178 else:
179 return PerformanceDataFrame.missing_objective
180 return objective
182 def verify_run_id(self: PerformanceDataFrame,
183 run_id: int) -> int:
184 """Method to check whether run id is valid.
186 Similar to verify_objective but here we check the dimensionality of runs.
188 Args:
189 run_id: the run as specified by the user.
190 """
191 if run_id is None:
192 if self.num_runs > 1:
193 raise ValueError("Error: Multiple run performance data, "
194 "but run not specified")
195 else:
196 run_id = self.run_ids[0]
197 return run_id
199 def verify_indexing(self: PerformanceDataFrame,
200 objective: str,
201 run_id: int) -> tuple[str, int]:
202 """Method to check whether data indexing is correct.
204 Users are allowed to use the Performance Dataframe without the second and
205 fourth dimension (Objective and Run respectively) in the case they only
206 have one objective or only do one run. This method adjusts the indexing for
207 those cases accordingly.
209 Args:
210 objective: The given objective name
211 run_id: The given run index
213 Returns:
214 A tuple representing the (possibly adjusted) Objective and Run index.
215 """
216 objective = self.verify_objective(objective)
217 run_id = self.verify_run_id(run_id)
218 return objective, run_id
220 # Getters and Setters
222 def add_solver(self: PerformanceDataFrame,
223 solver_name: str,
224 initial_value: float | list[str | float] = None) -> None:
225 """Add a new solver to the dataframe. Initializes value to None by default.
227 Args:
228 solver_name: The name of the solver to be added.
229 initial_value: The value assigned for each index of the new solver.
230 If not None, must match the index dimension (n_obj * n_inst * n_runs).
231 """
232 if solver_name in self.solvers:
233 print(f"WARNING: Tried adding already existing solver {solver_name} to "
234 f"Performance DataFrame: {self.csv_filepath}")
235 return
236 initial_value =\
237 [initial_value] if not isinstance(initial_value, list) else initial_value
238 column_dim_size = len(PerformanceDataFrame.multi_column_names)
239 if len(initial_value) < column_dim_size:
240 initial_value.extend([None] * (column_dim_size - len(initial_value)))
241 for field, value in zip(PerformanceDataFrame.multi_column_names, initial_value):
242 self[solver_name, field] = value
243 if self.num_solvers == 2: # Remove nan solver
244 for solver in self.solvers:
245 if str(solver) == str(PerformanceDataFrame.missing_value):
246 self.remove_solver(solver)
247 break
249 def add_objective(self: PerformanceDataFrame,
250 objective_name: str,
251 initial_value: float = None) -> None:
252 """Add an objective to the DataFrame."""
253 initial_value = initial_value or self.missing_value
254 if objective_name in self.objective_names:
255 print(f"WARNING: Tried adding already existing objective {objective_name} "
256 f"to Performance DataFrame: {self.csv_filepath}")
257 return
258 for instance, run in itertools.product(self.instances, self.run_ids):
259 self.loc[(objective_name, instance, run)] = initial_value
260 self.sort_index(axis=0, inplace=True)
262 def add_instance(self: PerformanceDataFrame,
263 instance_name: str,
264 initial_values: Any | list[Any] = None) -> None:
265 """Add and instance to the DataFrame.
267 Args:
268 instance_name: The name of the instance to be added.
269 initial_values: The values assigned for each index of the new instance.
270 If list, must match the column dimension (Value, Seed, Configuration).
271 """
272 initial_values = initial_values or self.missing_value
273 if not isinstance(initial_values, list):
274 initial_values = ([initial_values]
275 * len(PerformanceDataFrame.multi_column_names)
276 * self.num_solvers)
277 elif len(initial_values) == len(PerformanceDataFrame.multi_column_names):
278 initial_values = initial_values * self.num_solvers
280 if instance_name in self.instances:
281 print(f"WARNING: Tried adding already existing instance {instance_name} "
282 f"to Performance DataFrame: {self.csv_filepath}")
283 return
284 # Add rows for all combinations
285 for objective, run in itertools.product(self.objective_names, self.run_ids):
286 self.loc[(objective, instance_name, run)] = initial_values
287 if self.num_instances == 2: # Remove nan instance
288 for instance in self.instances:
289 if not isinstance(instance, str) and math.isnan(instance):
290 self.remove_instance(instance)
291 break
292 # Sort the index to optimize lookup speed
293 self.sort_index(axis=0, inplace=True)
295 def add_runs(self: PerformanceDataFrame,
296 num_extra_runs: int,
297 instance_names: list[str] = None,
298 initial_values: Any | list[Any] = None) -> None:
299 """Add runs to the DataFrame.
301 Args:
302 num_extra_runs: The number of runs to be added.
303 instance_names: The instances for which runs are to be added.
304 By default None, which means runs are added to all instances.
305 initial_values: The initial value for each objective of each new run.
306 If a list, needs to have a value for Value, Seed and Configuration.
307 """
308 initial_values = initial_values or self.missing_value
309 if not isinstance(initial_values, list):
310 initial_values =\
311 [initial_values] * len(self.multi_column_names) * self.num_solvers
312 elif len(initial_values) == len(self.multi_column_names):
313 initial_values = initial_values * self.num_solvers
314 instance_names = self.instances if instance_names is None else instance_names
315 for instance in instance_names:
316 for objective in self.objective_names:
317 index_runs_start = len(self.loc[(objective, instance)]) + 1
318 for run in range(index_runs_start, index_runs_start + num_extra_runs):
319 self.loc[(objective, instance, run)] = initial_values
320 # Sort the index to optimize lookup speed
321 # NOTE: It would be better to do this at the end, but that results in
322 # PerformanceWarning: indexing past lexsort depth may impact performance.
323 self.sort_index(axis=0, inplace=True)
    def remove_solver(self: PerformanceDataFrame, solver_name: str | list[str]) -> None:
        """Drop one or more solvers from the Dataframe.

        Args:
            solver_name: Solver column name(s) to remove.
        """
        # To make sure objectives / runs are saved when no solvers are present:
        # re-insert the NaN placeholder solver before dropping the last real one,
        # so the row index structure survives with zero solvers.
        if self.num_solvers == 1:
            for field in PerformanceDataFrame.multi_column_names:
                self[PerformanceDataFrame.missing_value, field] =\
                    PerformanceDataFrame.missing_value
        self.drop(columns=solver_name, level=0, axis=1, inplace=True)
    def remove_instance(self: PerformanceDataFrame, instance_name: str) -> None:
        """Drop an instance from the Dataframe.

        Args:
            instance_name: The name of the instance to remove.
        """
        # To make sure objectives / runs are saved when no instances are present:
        # re-insert the NaN placeholder instance before dropping the last one,
        # so the objective/run index structure survives with zero instances.
        if self.num_instances == 1:
            for objective, run in itertools.product(self.objective_names, self.run_ids):
                self.loc[(objective, PerformanceDataFrame.missing_value, run)] =\
                    PerformanceDataFrame.missing_value
        self.drop(instance_name,
                  axis=0,
                  level=PerformanceDataFrame.index_instance, inplace=True)
        # Sort the index to optimize lookup speed
        self.sort_index(axis=0, inplace=True)
347 def remove_runs(self: PerformanceDataFrame,
348 runs: int | list[int],
349 instance_names: list[str] = None) -> None:
350 """Drop one or more runs from the Dataframe.
352 Args:
353 runs: The run indices to be removed. If its an int,
354 the last n runs are removed. NOTE: If each instance has a different
355 number of runs, the amount of removed runs is not uniform.
356 instance_names: The instances for which runs are to be removed.
357 By default None, which means runs are removed from all instances.
358 """
359 instance_names = self.instances if instance_names is None else instance_names
360 runs = list(range((self.num_runs + 1) - runs, (self.num_runs + 1)))\
361 if isinstance(runs, int) else runs
362 self.drop(runs,
363 axis=0,
364 level=PerformanceDataFrame.index_run,
365 inplace=True)
366 # Sort the index to optimize lookup speed
367 self.sort_index(axis=0, inplace=True)
369 def remove_empty_runs(self: PerformanceDataFrame) -> None:
370 """Remove runs that contain no data, except for the first."""
371 for row_index in self.index:
372 if row_index[2] == 1: # First run, never delete
373 continue
374 if self.loc[row_index].isna().all():
375 self.drop(row_index, inplace=True)
377 def reset_value(self: PerformanceDataFrame,
378 solver: str,
379 instance: str,
380 objective: str = None,
381 run: int = None) -> None:
382 """Reset a value in the dataframe."""
383 self.set_value(PerformanceDataFrame.missing_value,
384 solver, instance, objective, run)
    def set_value(self: PerformanceDataFrame,
                  value: float | str | list[float | str] | list[list[float | str]],
                  solver: str | list[str],
                  instance: str | list[str],
                  objective: str | list[str] = None,
                  run: int | list[int] = None,
                  solver_fields: list[str] = ["Value"],
                  append_write_csv: bool = False) -> None:
        """Setter method to assign a value to the Dataframe.

        Allows for setting the same value to multiple indices.

        Args:
            value: Value(s) to be assigned. If value is a list, first dimension is
                the solver field, second dimension is if multiple different values are
                to be assigned. Must be the same shape as target.
            solver: The solver(s) for which the value should be set.
                If solver is a list, multiple solvers are set. If None, all
                solvers are set.
            instance: The instance(s) for which the value should be set.
                If instance is a list, multiple instances are set. If None, all
                instances are set.
            objective: The objectives for which the value should be set.
                When left None, set for all objectives
            run: The run index for which the value should be set.
                If left None, set for all runs.
            solver_fields: The level to which each value should be assigned.
                Defaults to ["Value"].
            append_write_csv: For concurrent writing to the PerformanceDataFrame.
                If True, the value is directly appended to the CSV file.
                This will create duplicate entries in the file, but these are combined
                when loading the file.
        """
        # Convert indices to slices for None values
        # (slice(None) selects everything along that index level)
        solver = slice(solver) if solver is None else solver
        instance = slice(instance) if instance is None else instance
        objective = slice(objective) if objective is None else objective
        run = slice(run) if run is None else run
        # Convert column indices to slices for setting multiple columns
        value = [value] if not isinstance(value, list) else value
        # NOTE: We currently forloop levels here, as it allows us to set the same
        # sequence of values to the indices
        for item, level in zip(value, solver_fields):
            self.loc[(objective, instance, run), (solver, level)] = item

        if append_write_csv:
            writeable = self.loc[(objective, instance, run), :]
            if isinstance(writeable, pd.Series):  # Single row, convert to pd.DataFrame
                writeable = self.loc[[(objective, instance, run)], :]
            # Append the new rows to the dataframe csv file;
            # the duplicate rows this creates are merged on the next load (__init__)
            writeable.to_csv(self.csv_filepath, mode="a", header=False)
    def get_value(self: PerformanceDataFrame,
                  solver: str | list[str],
                  instance: str | list[str],
                  objective: str = None,
                  run: int = None,
                  solver_fields: list[str] = ["Value"]
                  ) -> float | str | list[Any]:
        """Index a value of the DataFrame and return it.

        Args:
            solver: Solver name(s); None selects all solvers.
            instance: Instance name(s); None selects all instances.
            objective: Objective name; None selects all objectives.
            run: Run id; None selects all runs.
            solver_fields: Which solver fields to return. Defaults to ["Value"].

        Returns:
            A single scalar when the selection is unique, otherwise a
            (possibly nested) list of the selected values.
        """
        # Convert indices to slices for None values
        # (slice(None) selects everything along that index level)
        solver = slice(solver) if solver is None else solver
        instance = slice(instance) if instance is None else instance
        objective = slice(objective) if objective is None else objective
        run = slice(run) if run is None else run
        target = self.loc[(objective, instance, run), (solver, solver_fields)].values

        # Reduce dimensions when relevant
        if isinstance(target[0], np.ndarray) and len(target[0]) == 1:
            target = target.flatten()
        target = target.tolist()
        if len(target) == 1:
            return target[0]
        return target
461 # This method can be removed now that above method does its job
    def get_values(self: PerformanceDataFrame,
                   solver: str,
                   instance: str = None,
                   objective: str = None,
                   run: int = None,
                   solver_fields: list[str] = ["Value"]
                   ) -> list[float | str] | list[list[float | str]]:
        """Return a list of solver values.

        NOTE: Kept for compatibility; get_value covers the same functionality.

        Args:
            solver: Solver column to select.
            instance: Optional instance filter.
            objective: Optional objective filter (validated via verify_objective).
            run: Optional run filter (validated via verify_run_id).
            solver_fields: Which solver fields to return. Defaults to ["Value"].

        Returns:
            A flat list when one field is requested, otherwise one list per field.
        """
        subdf = self[solver][solver_fields]
        if objective is not None:
            objective = self.verify_objective(objective)
            subdf = subdf.xs(objective, level=0, drop_level=False)
        if instance is not None:
            subdf = subdf.xs(instance, level=1, drop_level=False)
        if run is not None:
            run = self.verify_run_id(run)
            subdf = subdf.xs(run, level=2, drop_level=False)
        # Convert dict to list
        result = [subdf[field].to_list() for field in solver_fields]
        if len(result) == 1:
            return result[0]
        return result
485 def get_instance_num_runs(self: PerformanceDataFrame,
486 instance: str) -> int:
487 """Return the number of runs for an instance."""
488 # We assume each objective has the same index for Instance/Runs
489 return len(self.loc[(self.objective_names[0], instance)].index)
491 # Calculables
    def mean(self: PerformanceDataFrame,
             objective: str = None,
             solver: str = None,
             instance: str = None) -> float:
        """Return the mean value of a slice of the dataframe.

        Args:
            objective: Objective to slice on (validated via verify_objective).
            solver: Optional solver filter.
            instance: Optional instance filter.

        Returns:
            The mean over the selected slice.
        """
        objective = self.verify_objective(objective)
        subset = self.xs(objective, level=0)
        if solver is not None:
            subset = subset.xs(solver, axis=1, drop_level=False)
        if instance is not None:
            subset = subset.xs(instance, axis=0, drop_level=False)
        # NOTE(review): astype(float) assumes the selected columns are numeric or
        # NaN — a populated Configuration string column would raise here; confirm
        value = subset.astype(float).mean()
        if isinstance(value, pd.Series):  # Per-column means: aggregate once more
            return value.mean()
        return value
509 # TODO: This method should be refactored or not exist
    def get_job_list(self: PerformanceDataFrame, rerun: bool = False) \
            -> list[tuple[str, str]]:
        """Return a list of performance computation jobs there are to be done.

        Get a list of tuple[instance, solver] to run from the performance data.
        If rerun is False (default), get only the tuples that don't have a
        value, else (True) get all the tuples.

        Args:
            rerun: Boolean indicating if we want to rerun all jobs

        Returns:
            A list of [instance, solver] combinations
        """
        # Format the dataframe such that only the values remain:
        # stack moves the (Value/Seed/Configuration) column level into the index
        df = self.stack(future_stack=True)
        df.drop([PerformanceDataFrame.column_seed,
                 PerformanceDataFrame.column_configuration], level=-1, inplace=True)
        df.index.droplevel()  # NOTE(review): no-op — the result is discarded
        if not rerun:  # Filter the nan values
            df = df.isnull()

        # Count the number of missing objective values for each Instance/Run/Algorithm
        df.index = df.index.droplevel(PerformanceDataFrame.index_objective)
        df.index = df.index.droplevel(-1)
        index_names = df.index.names
        df = df.groupby(df.index).agg({cname: "sum" for cname in df.columns})
        df.index = pd.MultiIndex.from_tuples(df.index, names=index_names)

        # Return the Instance, Run, Solver combinations
        return [index + (column, )
                for index, column in itertools.product(df.index, df.columns)
                if rerun or df[column][index] > 0]
544 # TODO: This method should be refactored or not exist
545 def remaining_jobs(self: PerformanceDataFrame) -> dict[str, list[str]]:
546 """Return a dictionary for empty values as instance key and solver values."""
547 remaining_jobs = {}
548 jobs = self.get_job_list(rerun=False)
549 for instance, _, solver in jobs:
550 if instance not in remaining_jobs:
551 remaining_jobs[instance] = [solver]
552 else:
553 remaining_jobs[instance].append(solver)
554 return remaining_jobs
    def configuration_performance(
            self: PerformanceDataFrame,
            solver: str,
            configuration: dict,
            objective: str | SparkleObjective = None,
            instances: list[str] = None,
            per_instance: bool = False) -> tuple[dict, float]:
        """Return the configuration performance for objective over the instances.

        Args:
            solver: The solver for which we evaluate the configuration
            configuration: The configuration to evaluate
            objective: The objective for which we find the best value
            instances: The instances which should be selected for the evaluation
            per_instance: Whether to return the performance per instance,
                or aggregated.

        Returns:
            The best configuration and its aggregated performance.
        """
        objective = self.verify_objective(objective)
        instances = instances or slice(instances)  # Convert None to slice(None)
        if isinstance(objective, str):
            objective = resolve_objective(objective)
        # Filter objective
        subdf = self.xs(objective.name, level=0, drop_level=True)

        if configuration:  # Filter configuration
            if not isinstance(configuration, dict):  # Get empty configuration
                subdf = subdf[subdf[solver][
                    PerformanceDataFrame.column_configuration].isna()]
            else:
                # Configurations are stored as their str(dict) representation
                subdf = subdf[subdf[solver][
                    PerformanceDataFrame.column_configuration] == str(configuration)]
        # Filter solver
        subdf = subdf.xs(solver, axis=1, drop_level=True)

        # Drop the seed, filter instances
        subdf = subdf.drop(PerformanceDataFrame.column_seed, axis=1).loc[instances, :]
        # Aggregate the runs per instance/configuration
        try:  # Can only aggregate numerical values
            subdf[PerformanceDataFrame.column_value] =\
                pd.to_numeric(subdf[PerformanceDataFrame.column_value])  # Ensure type
            # .__name__ lets pandas resolve the aggregator by name
            subdf = subdf.groupby([PerformanceDataFrame.index_instance,
                                   PerformanceDataFrame.column_configuration],
                                  dropna=False).agg(objective.run_aggregator.__name__)
        except ValueError:
            # Non-numeric values: return them unaggregated
            subdf.drop(PerformanceDataFrame.column_configuration, axis=1, inplace=True)
            return configuration, subdf.values.flatten().tolist()
        if per_instance:  # No instance aggregation
            # NOTE: How do we select the best configuration now if conf == None?
            return configuration, subdf.values.flatten().tolist()

        # Aggregate the instances per configuration
        subdf = subdf.droplevel(level=0).reset_index()  # Drop instance column
        subdf = subdf.groupby(PerformanceDataFrame.column_configuration,
                              dropna=False).agg(
            func=objective.instance_aggregator.__name__)

        if configuration:
            return configuration, subdf.values[0][0]
        # In case of no configuration given, select the one with best objective value
        best_index = subdf.idxmin() if objective.minimise else subdf.idxmax()
        try:
            # Configurations are stored as str(dict); parse back to a dict
            best_configuration = ast.literal_eval(best_index.values[0])
        except Exception:  # Configuration is not a dictionary
            best_value = subdf.min() if objective.minimise else subdf.max()
            return {}, best_value.values[0]
        return (best_configuration,
                subdf.loc[best_index, PerformanceDataFrame.column_value].values[0])
627 def best_configuration(self: PerformanceDataFrame,
628 solver: str,
629 objective: SparkleObjective = None,
630 instances: list[str] = None) -> tuple[dict, float]:
631 """Return the best configuration for the given objective over the instances.
633 Args:
634 solver: The solver for which we determine the best configuration
635 objective: The objective for which we calculate the best configuration
636 instances: The instances which should be selected for the evaluation
638 Returns:
639 The best configuration and its aggregated performance.
640 """
641 return self.configuration_performance(solver, None, objective, instances)
    def best_instance_performance(
            self: PerformanceDataFrame,
            objective: str | SparkleObjective = None,
            run_id: int = None,
            exclude_solvers: list[str] = None) -> pd.Series:
        """Return the best performance for each instance in the portfolio.

        Args:
            objective: The objective for which we calculate the best performance
            run_id: The run for which we calculate the best performance. If None,
                we consider all runs.
            exclude_solvers: List of solvers to exclude in the calculation.

        Returns:
            The best performance for each instance in the portfolio.
        """
        objective = self.verify_objective(objective)
        if isinstance(objective, str):
            objective = resolve_objective(objective)
        # Drop Seed/Configuration: only the Value field is compared
        subdf = self.drop(
            [PerformanceDataFrame.column_seed,
             PerformanceDataFrame.column_configuration],
            axis=1, level=1)
        subdf = subdf.xs(objective.name, level=0)
        if exclude_solvers is not None:
            subdf = subdf.drop(exclude_solvers, axis=1, level=0)
        if run_id is not None:
            run_id = self.verify_run_id(run_id)
            subdf = subdf.xs(run_id, level=1)
        else:
            # Drop the run level
            subdf = subdf.droplevel(level=1)
        # Row-wise best over the remaining solver columns
        if objective.minimise:
            series = subdf.min(axis=1)
        else:
            series = subdf.max(axis=1)
        # Ensure we always return the best for each run:
        # sort best-first, then keep the first entry per instance
        series = series.sort_values(ascending=objective.minimise)
        return series.groupby(series.index).first().astype(float)
684 def best_performance(
685 self: PerformanceDataFrame,
686 exclude_solvers: list[str] = [],
687 objective: str | SparkleObjective = None) -> float:
688 """Return the overall best performance of the portfolio.
690 Args:
691 exclude_solvers: List of solvers to exclude in the calculation.
692 Defaults to none.
693 objective: The objective for which we calculate the best performance
695 Returns:
696 The aggregated best performance of the portfolio over all instances.
697 """
698 objective = self.verify_objective(objective)
699 if isinstance(objective, str):
700 objective = resolve_objective(objective)
701 instance_best = self.best_instance_performance(
702 objective, exclude_solvers=exclude_solvers).to_numpy(dtype=float)
703 return objective.instance_aggregator(instance_best)
    def schedule_performance(
            self: PerformanceDataFrame,
            schedule: dict[str, list[tuple[str, float]]],
            target_solver: str = None,
            objective: str | SparkleObjective = None) -> list[float]:
        """Return the performance of a selection schedule on the portfolio.

        Args:
            schedule: Compute the best performance according to a selection schedule.
                A schedule is a dictionary of instances, with a schedule per instance,
                consisting of a pair of solver and maximum runtime.
            target_solver: If not None, store the values in this solver of the DF.
            objective: The objective for which we calculate the best performance

        Returns:
            The performance of the schedule over the instances in the dictionary.
        """
        objective = self.verify_objective(objective)
        if isinstance(objective, str):
            objective = resolve_objective(objective)
        select = min if objective.minimise else max
        performances = [0.0] * len(schedule.keys())
        for ix, instance in enumerate(schedule.keys()):
            for iy, (solver, max_runtime) in enumerate(schedule[instance]):
                performance = float(self.get_value(solver, instance, objective.name))
                if max_runtime is not None:  # We are dealing with runtime
                    # Runtimes accumulate over the solvers tried on this instance
                    performances[ix] += performance
                    if performance < max_runtime:
                        break  # Solver finished in time
                else:  # Quality, we take the best found performance
                    if iy == 0:  # First solver, set initial value
                        performances[ix] = performance
                        continue
                    performances[ix] = select(performances[ix], performance)
            if target_solver is not None:
                self.set_value(performances[ix], target_solver, instance, objective.name)
        return performances
743 def marginal_contribution(
744 self: PerformanceDataFrame,
745 objective: str | SparkleObjective = None,
746 sort: bool = False) -> list[float]:
747 """Return the marginal contribution of the solvers on the instances.
749 Args:
750 objective: The objective for which we calculate the marginal contribution.
751 sort: Whether to sort the results afterwards
752 Returns:
753 The marginal contribution of each solver.
754 """
755 output = []
756 objective = self.verify_objective(objective)
757 if isinstance(objective, str):
758 objective = resolve_objective(objective)
759 best_performance = self.best_performance(objective=objective)
760 for solver in self.solvers:
761 # By calculating the best performance excluding this Solver,
762 # we can determine its relative impact on the portfolio.
763 missing_solver_best = self.best_performance(
764 exclude_solvers=[solver],
765 objective=objective)
766 # Now we need to see how much the portfolio's best performance
767 # decreases without this solver.
768 marginal_contribution = missing_solver_best / best_performance
769 if missing_solver_best == best_performance:
770 # No change, no contribution
771 marginal_contribution = 0.0
772 output.append((solver, marginal_contribution, missing_solver_best))
773 if sort:
774 output.sort(key=lambda x: x[1], reverse=objective.minimise)
775 return output
    def get_solver_ranking(self: PerformanceDataFrame,
                           objective: str | SparkleObjective = None
                           ) -> list[tuple[str, float]]:
        """Return a list with solvers ranked by average performance.

        Args:
            objective: The objective to rank on; resolved via verify_objective.

        Returns:
            (solver, aggregated performance) tuples, best solver first.
        """
        objective = self.verify_objective(objective)
        if isinstance(objective, str):
            objective = resolve_objective(objective)
        # Drop Seed/Configuration: ranking is based on the Value field only
        subdf = self.drop(
            [PerformanceDataFrame.column_seed,
             PerformanceDataFrame.column_configuration],
            axis=1, level=1)
        sub_df = subdf.loc(axis=0)[objective.name, :, :]
        # Reduce Runs Dimension
        sub_df = sub_df.droplevel("Run").astype(float)
        # By using .__name__, pandas converts it to a Pandas Aggregator function
        sub_df = sub_df.groupby(sub_df.index).agg(func=objective.run_aggregator.__name__)
        solver_ranking = [(solver, objective.instance_aggregator(
            sub_df[solver].astype(float))) for solver in self.solvers]
        # Sort the list by second value (the performance)
        solver_ranking.sort(key=lambda performance: performance[1],
                            reverse=(not objective.minimise))
        return solver_ranking
801 def save_csv(self: PerformanceDataFrame, csv_filepath: Path = None) -> None:
802 """Write a CSV to the given path.
804 Args:
805 csv_filepath: String path to the csv file. Defaults to self.csv_filepath.
806 """
807 csv_filepath = self.csv_filepath if csv_filepath is None else csv_filepath
808 self.to_csv(csv_filepath)
    def clone(self: PerformanceDataFrame,
              csv_filepath: Path = None) -> PerformanceDataFrame:
        """Create a copy of this object.

        Args:
            csv_filepath: The new filepath to use for saving the object to.
                Warning: If the original path is used, it could lead to dataloss!
        """
        csv_filepath = csv_filepath or self.csv_filepath
        if self.csv_filepath.exists():
            # Data is on disk: construct the copy by reloading it.
            # NOTE(review): this loads from the *target* path, not from
            # self.csv_filepath — when a fresh, non-existing target path is
            # given an empty frame is created instead; confirm intent.
            pd_copy = PerformanceDataFrame(csv_filepath)
        else:
            # No file yet: build an empty frame with the same dimensions and
            # copy every in-memory value over cell by cell
            pd_copy = PerformanceDataFrame(
                csv_filepath=csv_filepath,
                solvers=self.solvers,
                objectives=self.objectives,
                instances=self.instances,
                n_runs=self.num_runs)
            for solver in self.solvers:
                for index in self.index:
                    for field in PerformanceDataFrame.multi_column_names:
                        pd_copy.at[index, (solver, field)] =\
                            self.loc[index, solver][field]
        return pd_copy
    def clean_csv(self: PerformanceDataFrame) -> None:
        """Set all values in Performance Data to None."""
        # "None" here means the NaN missing-value marker; the cleared
        # frame is immediately persisted to self.csv_filepath
        self[:] = PerformanceDataFrame.missing_value
        self.save_csv()