Coverage for sparkle/structures/performance_dataframe.py: 88%
371 statements
« prev ^ index » next coverage.py v7.6.10, created at 2025-01-07 15:22 +0000
1"""Module to manage performance data files and common operations on them."""
2from __future__ import annotations
3import ast
4from typing import Any
5import itertools
6from pathlib import Path
7import math
8import numpy as np
9import pandas as pd
11from sparkle.types import SparkleObjective, resolve_objective
class PerformanceDataFrame(pd.DataFrame):
    """Class to manage performance data and common operations on them."""
    # Sentinel written into any cell (or placeholder index label) without data.
    missing_value = math.nan

    # Placeholder objective name used when no objective was specified.
    missing_objective = "UNKNOWN"

    # Names of the three row (multi-)index levels, in order.
    index_objective = "Objective"
    index_instance = "Instance"
    index_run = "Run"
    multi_index_names = [index_objective, index_instance, index_run]

    # Per-solver column (multi-)index level names and their dtypes,
    # kept in matching order: Value -> float, Seed -> int, Configuration -> str.
    column_value = "Value"
    column_seed = "Seed"
    column_configuration = "Configuration"
    multi_column_names = [column_value, column_seed, column_configuration]
    multi_column_dtypes = [float, int, str]
    def __init__(self: PerformanceDataFrame,
                 csv_filepath: Path,
                 solvers: list[str] = None,
                 objectives: list[str | SparkleObjective] = None,
                 instances: list[str] = None,
                 n_runs: int = 1,
                 ) -> None:
        """Initialise a PerformanceDataFrame.

        Consists of:
            - Columns representing the Solvers
            - Rows representing the result by multi-index in order of:
                * Objective (Static, given in constructor or read from file)
                * Instance
                * Runs (Static, given in constructor or read from file)

        Args:
            csv_filepath: If path exists, load from Path.
                Otherwise create new and save to this path.
            solvers: List of solver names to be added into the Dataframe
            objectives: List of SparkleObjectives or objective names. By default None,
                then the objectives will be derived from Sparkle Settings if possible.
            instances: List of instance names to be added into the Dataframe
            n_runs: The number of runs to consider per Solver/Objective/Instance comb.
        """
        if csv_filepath.exists():
            # Load from disk, requesting the per-field dtypes declared on the
            # class (Value: float, Seed: int, Configuration: str).
            dtypes = {key: value for key, value in zip(
                PerformanceDataFrame.multi_column_names,
                PerformanceDataFrame.multi_column_dtypes)}
            df = pd.read_csv(csv_filepath,
                             header=[0, 1], index_col=[0, 1, 2],
                             dtype=dtypes,
                             on_bad_lines="skip")
            super().__init__(df)
            self.csv_filepath = csv_filepath
        else:
            # Initialize empty DataFrame
            run_ids = list(range(1, n_runs + 1))  # We count runs from 1
            # We always need objectives to maintain the dimensions
            if objectives is None:
                objectives = [PerformanceDataFrame.missing_objective]
            else:
                objectives = [str(o) for o in objectives]
            # We always need an instance to maintain the dimensions
            if instances is None:
                instances = [PerformanceDataFrame.missing_value]
            # We always need a solver to maintain the dimensions
            if solvers is None:
                solvers = [PerformanceDataFrame.missing_value]
            midx = pd.MultiIndex.from_product(
                [objectives, instances, run_ids],
                names=PerformanceDataFrame.multi_index_names)
            mcolumns = pd.MultiIndex.from_product(
                [solvers, PerformanceDataFrame.multi_column_names],
                names=["Solver", "Meta"])
            super().__init__(PerformanceDataFrame.missing_value,
                             index=midx, columns=mcolumns)
            self.csv_filepath = csv_filepath
            self.save_csv()

        if self.index.duplicated().any():  # Combine duplicate indices
            # Duplicate rows can appear through concurrent appends to the CSV
            # (see set_value's append_write_csv); merge them group-wise.
            combined = self.groupby(level=[0, 1, 2]).first()
            duplicates = self.index[self.index.duplicated(keep="first")]
            # Remove all duplicate entries from self
            self.drop(duplicates, inplace=True)
            for d in duplicates:  # Place combined duplicates in self
                self.loc[d, :] = combined.loc[d, :]

        # Sort the index to optimize lookup speed
        self.sort_index(axis=0, inplace=True)
102 # Properties
104 @property
105 def num_objectives(self: PerformanceDataFrame) -> int:
106 """Retrieve the number of objectives in the DataFrame."""
107 return self.index.get_level_values(0).unique().size
109 @property
110 def num_instances(self: PerformanceDataFrame) -> int:
111 """Return the number of instances."""
112 return self.index.get_level_values(1).unique().size
114 @property
115 def num_runs(self: PerformanceDataFrame) -> int:
116 """Return the maximum number of runs of each instance."""
117 return self.index.get_level_values(2).unique().size
119 @property
120 def num_solvers(self: PerformanceDataFrame) -> int:
121 """Return the number of solvers."""
122 return self.columns.get_level_values(0).unique().size
124 @property
125 def multi_objective(self: PerformanceDataFrame) -> bool:
126 """Return whether the dataframe represent MO or not."""
127 return self.num_objectives > 1
129 @property
130 def solvers(self: PerformanceDataFrame) -> list[str]:
131 """Return the solver present as a list of strings."""
132 return self.columns.get_level_values(0).unique().to_list()
134 @property
135 def objective_names(self: PerformanceDataFrame) -> list[str]:
136 """Return the objective names as a list of strings."""
137 return self.index.get_level_values(0).unique().to_list()
139 @property
140 def objectives(self: PerformanceDataFrame) -> list[SparkleObjective]:
141 """Return the objectives as a list of SparkleObjectives."""
142 return [resolve_objective(o) for o in self.objective_names]
144 @property
145 def instances(self: PerformanceDataFrame) -> list[str]:
146 """Return the instances as a Pandas Index object."""
147 return self.index.get_level_values(1).unique().to_list()
149 @property
150 def run_ids(self: PerformanceDataFrame) -> list[int]:
151 """Return the run ids as a list of integers."""
152 return self.index.get_level_values(2).unique().to_list()
154 @property
155 def has_missing_values(self: PerformanceDataFrame) -> bool:
156 """Returns True if there are any missing values in the dataframe."""
157 return self.isnull().any().drop([PerformanceDataFrame.column_seed,
158 PerformanceDataFrame.column_configuration],
159 level=1).any()
161 def verify_objective(self: PerformanceDataFrame,
162 objective: str) -> str:
163 """Method to check whether the specified objective is valid.
165 Users are allowed to index the dataframe without specifying all dimensions.
166 However, when dealing with multiple objectives this is not allowed and this
167 is verified here. If we have only one objective this is returned. Otherwise,
168 if an objective is specified by the user this is returned.
170 Args:
171 objective: The objective given by the user
172 """
173 if objective is None:
174 if self.multi_objective:
175 raise ValueError("Error: MO Data, but objective not specified.")
176 elif self.num_objectives == 1:
177 return self.objective_names[0]
178 else:
179 return PerformanceDataFrame.missing_objective
180 return objective
182 def verify_run_id(self: PerformanceDataFrame,
183 run_id: int) -> int:
184 """Method to check whether run id is valid.
186 Similar to verify_objective but here we check the dimensionality of runs.
188 Args:
189 run_id: the run as specified by the user.
190 """
191 if run_id is None:
192 if self.num_runs > 1:
193 raise ValueError("Error: Multiple run performance data, "
194 "but run not specified")
195 else:
196 run_id = self.run_ids[0]
197 return run_id
199 def verify_indexing(self: PerformanceDataFrame,
200 objective: str,
201 run_id: int) -> tuple[str, int]:
202 """Method to check whether data indexing is correct.
204 Users are allowed to use the Performance Dataframe without the second and
205 fourth dimension (Objective and Run respectively) in the case they only
206 have one objective or only do one run. This method adjusts the indexing for
207 those cases accordingly.
209 Args:
210 objective: The given objective name
211 run_id: The given run index
213 Returns:
214 A tuple representing the (possibly adjusted) Objective and Run index.
215 """
216 objective = self.verify_objective(objective)
217 run_id = self.verify_run_id(run_id)
218 return objective, run_id
220 # Getters and Setters
222 def add_solver(self: PerformanceDataFrame,
223 solver_name: str,
224 initial_value: float | list[str | float] = None) -> None:
225 """Add a new solver to the dataframe. Initializes value to None by default.
227 Args:
228 solver_name: The name of the solver to be added.
229 initial_value: The value assigned for each index of the new solver.
230 If not None, must match the index dimension (n_obj * n_inst * n_runs).
231 """
232 if solver_name in self.solvers:
233 print(f"WARNING: Tried adding already existing solver {solver_name} to "
234 f"Performance DataFrame: {self.csv_filepath}")
235 return
236 initial_value =\
237 [initial_value] if not isinstance(initial_value, list) else initial_value
238 column_dim_size = len(PerformanceDataFrame.multi_column_names)
239 if len(initial_value) < column_dim_size:
240 initial_value.extend([None] * (column_dim_size - len(initial_value)))
241 for field, value in zip(PerformanceDataFrame.multi_column_names, initial_value):
242 self[solver_name, field] = value
243 if self.num_solvers == 2: # Remove nan solver
244 for solver in self.solvers:
245 if str(solver) == str(PerformanceDataFrame.missing_value):
246 self.remove_solver(solver)
247 break
249 def add_objective(self: PerformanceDataFrame,
250 objective_name: str,
251 initial_value: float = None) -> None:
252 """Add an objective to the DataFrame."""
253 initial_value = initial_value or self.missing_value
254 if objective_name in self.objective_names:
255 print(f"WARNING: Tried adding already existing objective {objective_name} "
256 f"to Performance DataFrame: {self.csv_filepath}")
257 return
258 for instance, run in itertools.product(self.instances, self.run_ids):
259 self.loc[(objective_name, instance, run)] = initial_value
260 self.sort_index(axis=0, inplace=True)
262 def add_instance(self: PerformanceDataFrame,
263 instance_name: str,
264 initial_value: float = None) -> None:
265 """Add and instance to the DataFrame."""
266 initial_value = initial_value or self.missing_value
268 if instance_name in self.instances:
269 print(f"WARNING: Tried adding already existing instance {instance_name} "
270 f"to Performance DataFrame: {self.csv_filepath}")
271 return
272 # Add rows for all combinations
273 for objective, run in itertools.product(self.objective_names, self.run_ids):
274 self.loc[(objective, instance_name, run)] = initial_value
275 if self.num_instances == 2: # Remove nan instance
276 for instance in self.instances:
277 if not isinstance(instance, str) and math.isnan(instance):
278 self.remove_instance(instance)
279 break
280 # Sort the index to optimize lookup speed
281 self.sort_index(axis=0, inplace=True)
    def add_runs(self: PerformanceDataFrame,
                 num_extra_runs: int,
                 instance_names: list[str] = None) -> None:
        """Add runs to the DataFrame.

        Args:
            num_extra_runs: The number of runs to be added.
            instance_names: The instances for which runs are to be added.
                By default None, which means runs are added to all instances.
        """
        instance_names = self.instances if instance_names is None else instance_names
        for instance in instance_names:
            for objective in self.objective_names:
                # Continue numbering after the existing runs of this
                # objective/instance pair (runs are counted from 1).
                index_runs_start = len(self.loc[(objective, instance)]) + 1
                for run in range(index_runs_start, index_runs_start + num_extra_runs):
                    self.loc[(objective, instance, run)] = self.missing_value
                # Sort the index to optimize lookup speed
                # NOTE: It would be better to do this at the end, but that results in
                # PerformanceWarning: indexing past lexsort depth may impact performance.
                self.sort_index(axis=0, inplace=True)
304 def remove_solver(self: PerformanceDataFrame, solver_name: str | list[str]) -> None:
305 """Drop one or more solvers from the Dataframe."""
306 # To make sure objectives / runs are saved when no solvers are present
307 if self.num_solvers == 1:
308 for field in PerformanceDataFrame.multi_column_names:
309 self[PerformanceDataFrame.missing_value, field] =\
310 PerformanceDataFrame.missing_value
311 self.drop(columns=solver_name, level=0, axis=1, inplace=True)
313 def remove_instance(self: PerformanceDataFrame, instance_name: str) -> None:
314 """Drop an instance from the Dataframe."""
315 # To make sure objectives / runs are saved when no instances are present
316 if self.num_instances == 1:
317 for objective, run in itertools.product(self.objective_names, self.run_ids):
318 self.loc[(objective, PerformanceDataFrame.missing_value, run)] =\
319 PerformanceDataFrame.missing_value
320 self.drop(instance_name,
321 axis=0,
322 level=PerformanceDataFrame.index_instance, inplace=True)
323 # Sort the index to optimize lookup speed
324 self.sort_index(axis=0, inplace=True)
326 def remove_runs(self: PerformanceDataFrame,
327 runs: int | list[int],
328 instance_names: list[str] = None) -> None:
329 """Drop one or more runs from the Dataframe.
331 Args:
332 runs: The run indices to be removed. If its an int,
333 the last n runs are removed. NOTE: If each instance has a different
334 number of runs, the amount of removed runs is not uniform.
335 instance_names: The instances for which runs are to be removed.
336 By default None, which means runs are removed from all instances.
337 """
338 instance_names = self.instances if instance_names is None else instance_names
339 runs = list(range((self.num_runs + 1) - runs, (self.num_runs + 1)))\
340 if isinstance(runs, int) else runs
341 self.drop(runs,
342 axis=0,
343 level=PerformanceDataFrame.index_run,
344 inplace=True)
345 # Sort the index to optimize lookup speed
346 self.sort_index(axis=0, inplace=True)
348 def remove_empty_runs(self: PerformanceDataFrame) -> None:
349 """Remove runs that contain no data, except for the first."""
350 for row_index in self.index:
351 if row_index[2] == 1: # First run, never delete
352 continue
353 if self.loc[row_index].isna().all():
354 self.drop(row_index, inplace=True)
356 def reset_value(self: PerformanceDataFrame,
357 solver: str,
358 instance: str,
359 objective: str = None,
360 run: int = None) -> None:
361 """Reset a value in the dataframe."""
362 self.set_value(PerformanceDataFrame.missing_value,
363 solver, instance, objective, run)
    def set_value(self: PerformanceDataFrame,
                  value: float | str | list[float | str] | list[list[float | str]],
                  solver: str | list[str],
                  instance: str | list[str],
                  objective: str | list[str] = None,
                  run: int | list[int] = None,
                  solver_fields: list[str] = ["Value"],
                  append_write_csv: bool = False) -> None:
        """Setter method to assign a value to the Dataframe.

        Allows for setting the same value to multiple indices.

        Args:
            value: Value(s) to be assigned. If value is a list, first dimension is
                the solver field, second dimension is if multiple different values are
                to be assigned. Must be the same shape as target.
            solver: The solver(s) for which the value should be set.
                If solver is a list, multiple solvers are set. If None, all
                solvers are set.
            instance: The instance(s) for which the value should be set.
                If instance is a list, multiple instances are set. If None, all
                instances are set.
            objective: The objectives for which the value should be set.
                When left None, set for all objectives
            run: The run index for which the value should be set.
                If left None, set for all runs.
            solver_fields: The level to which each value should be assigned.
                Defaults to ["Value"].
                NOTE(review): mutable default; never mutated here, but confirm
                no caller relies on mutating it before changing.
            append_write_csv: For concurrent writing to the PerformanceDataFrame.
                If True, the value is directly appended to the CSV file.
                This will create duplicate entries in the file, but these are combined
                when loading the file.
        """
        # Convert indices to slices for None values
        # (slice(None) selects every label at that index level)
        solver = slice(solver) if solver is None else solver
        instance = slice(instance) if instance is None else instance
        objective = slice(objective) if objective is None else objective
        run = slice(run) if run is None else run
        # Convert column indices to slices for setting multiple columns
        value = [value] if not isinstance(value, list) else value
        # NOTE: We currently forloop levels here, as it allows us to set the same
        # sequence of values to the indices
        for item, level in zip(value, solver_fields):
            self.loc[(objective, instance, run), (solver, level)] = item

        if append_write_csv:
            # Write only the touched rows; duplicates in the file are merged
            # again by __init__ on the next load.
            writeable = self.loc[(objective, instance, run), :]
            if isinstance(writeable, pd.Series):  # Single row, convert to pd.DataFrame
                writeable = self.loc[[(objective, instance, run)], :]
            # Append the new rows to the dataframe csv file
            writeable.to_csv(self.csv_filepath, mode="a", header=False)
417 def get_value(self: PerformanceDataFrame,
418 solver: str | list[str],
419 instance: str | list[str],
420 objective: str = None,
421 run: int = None,
422 solver_fields: list[str] = ["Value"]
423 ) -> float | str | list[Any]:
424 """Index a value of the DataFrame and return it."""
425 # Convert indices to slices for None values
426 solver = slice(solver) if solver is None else solver
427 instance = slice(instance) if instance is None else instance
428 objective = slice(objective) if objective is None else objective
429 run = slice(run) if run is None else run
430 target = self.loc[(objective, instance, run), (solver, solver_fields)].values
432 # Reduce dimensions when relevant
433 if isinstance(target[0], np.ndarray) and len(target[0]) == 1:
434 target = target.flatten()
435 target = target.tolist()
436 if len(target) == 1:
437 return target[0]
438 return target
440 # This method can be removed now that above method does its job
441 def get_values(self: PerformanceDataFrame,
442 solver: str,
443 instance: str = None,
444 objective: str = None,
445 run: int = None,
446 solver_fields: list[str] = ["Value"]
447 ) -> list[float | str] | list[list[float | str]]:
448 """Return a list of solver values."""
449 subdf = self[solver][solver_fields]
450 if objective is not None:
451 objective = self.verify_objective(objective)
452 subdf = subdf.xs(objective, level=0, drop_level=False)
453 if instance is not None:
454 subdf = subdf.xs(instance, level=1, drop_level=False)
455 if run is not None:
456 run = self.verify_run_id(run)
457 subdf = subdf.xs(run, level=2, drop_level=False)
458 # Convert dict to list
459 result = [subdf[field].to_list() for field in solver_fields]
460 if len(result) == 1:
461 return result[0]
462 return result
464 def get_instance_num_runs(self: PerformanceDataFrame,
465 instance: str) -> int:
466 """Return the number of runs for an instance."""
467 # We assume each objective has the same index for Instance/Runs
468 return len(self.loc[(self.objective_names[0], instance)].index)
470 # Calculables
472 def mean(self: PerformanceDataFrame,
473 objective: str = None,
474 solver: str = None,
475 instance: str = None) -> float:
476 """Return the mean value of a slice of the dataframe."""
477 objective = self.verify_objective(objective)
478 subset = self.xs(objective, level=0)
479 if solver is not None:
480 subset = subset.xs(solver, axis=1, drop_level=False)
481 if instance is not None:
482 subset = subset.xs(instance, axis=0, drop_level=False)
483 value = subset.astype(float).mean()
484 if isinstance(value, pd.Series):
485 return value.mean()
486 return value
    # TODO: This method should be refactored or not exist
    def get_job_list(self: PerformanceDataFrame, rerun: bool = False) \
            -> list[tuple[str, str]]:
        """Return a list of performance computation jobs there are to be done.

        Get a list of tuples to run from the performance data.
        If rerun is False (default), get only the tuples that don't have a
        value, else (True) get all the tuples.

        Args:
            rerun: Boolean indicating if we want to rerun all jobs

        Returns:
            A list of (instance, run, solver) tuples.
        """
        # Format the dataframe such that only the values remain
        df = self.stack(future_stack=True)
        df.drop([PerformanceDataFrame.column_seed,
                 PerformanceDataFrame.column_configuration], level=-1, inplace=True)
        # NOTE(review): the next statement has no effect — droplevel returns a
        # new index which is immediately discarded.
        df.index.droplevel()
        if not rerun:  # Filter the nan values
            df = df.isnull()

        # Count the number of missing objective values for each Instance/Run/Algorithm
        df.index = df.index.droplevel(PerformanceDataFrame.index_objective)
        df.index = df.index.droplevel(-1)
        index_names = df.index.names
        df = df.groupby(df.index).agg({cname: "sum" for cname in df.columns})
        df.index = pd.MultiIndex.from_tuples(df.index, names=index_names)

        # Return the Instance, Run, Solver combinations
        return [index + (column, )
                for index, column in itertools.product(df.index, df.columns)
                if rerun or df[column][index] > 0]
523 # TODO: This method should be refactored or not exist
524 def remaining_jobs(self: PerformanceDataFrame) -> dict[str, list[str]]:
525 """Return a dictionary for empty values as instance key and solver values."""
526 remaining_jobs = {}
527 jobs = self.get_job_list(rerun=False)
528 for instance, _, solver in jobs:
529 if instance not in remaining_jobs:
530 remaining_jobs[instance] = [solver]
531 else:
532 remaining_jobs[instance].append(solver)
533 return remaining_jobs
    def configuration_performance(
            self: PerformanceDataFrame,
            solver: str,
            configuration: dict,
            objective: str | SparkleObjective = None,
            instances: list[str] = None,
            per_instance: bool = False) -> tuple[dict, float]:
        """Return the configuration performance for objective over the instances.

        Args:
            solver: The solver for which we evaluate the configuration
            configuration: The configuration to evaluate
            objective: The objective for which we find the best value
            instances: The instances which should be selected for the evaluation
            per_instance: Whether to return the performance per instance,
                or aggregated.

        Returns:
            The best configuration and its aggregated performance.
        """
        objective = self.verify_objective(objective)
        instances = instances or slice(instances)  # Convert None to slice
        if isinstance(objective, str):
            objective = resolve_objective(objective)
        # Filter objective
        subdf = self.xs(objective.name, level=0, drop_level=True)

        if configuration:  # Filter configuration
            if not isinstance(configuration, dict):  # Get empty configuration
                # NOTE(review): truthy non-dict values select rows whose
                # Configuration is NaN — confirm intended semantics.
                subdf = subdf[subdf[solver][
                    PerformanceDataFrame.column_configuration].isna()]
            else:
                subdf = subdf[subdf[solver][
                    PerformanceDataFrame.column_configuration] == str(configuration)]
        # Filter solver
        subdf = subdf.xs(solver, axis=1, drop_level=True)

        # Drop the seed, filter instances
        subdf = subdf.drop(PerformanceDataFrame.column_seed, axis=1).loc[instances, :]
        # Aggregate the runs per instance/configuration
        try:  # Can only aggregate numerical values
            subdf[PerformanceDataFrame.column_value] =\
                pd.to_numeric(subdf[PerformanceDataFrame.column_value])  # Ensure type
            subdf = subdf.groupby([PerformanceDataFrame.index_instance,
                                   PerformanceDataFrame.column_configuration],
                                  dropna=False).agg(objective.run_aggregator.__name__)
        except ValueError:
            # Non-numeric values (e.g. status strings): return the raw values
            subdf.drop(PerformanceDataFrame.column_configuration, axis=1, inplace=True)
            return configuration, subdf.values.flatten().tolist()
        if per_instance:  # No instance aggregation
            # NOTE: How do we select the best configuration now if conf == None?
            return configuration, subdf.values.flatten().tolist()

        # Aggregate the instances per configuration
        subdf = subdf.droplevel(level=0).reset_index()  # Drop instance column
        subdf = subdf.groupby(PerformanceDataFrame.column_configuration,
                              dropna=False).agg(
            func=objective.instance_aggregator.__name__)

        if configuration:
            return configuration, subdf.values[0][0]
        # In case of no configuration given, select the one with best objective value
        best_index = subdf.idxmin() if objective.minimise else subdf.idxmax()
        try:
            best_configuration = ast.literal_eval(best_index.values[0])
        except Exception:  # Configuration is not a dictionary
            best_value = subdf.min() if objective.minimise else subdf.max()
            return {}, best_value.values[0]
        return (best_configuration,
                subdf.loc[best_index, PerformanceDataFrame.column_value].values[0])
606 def best_configuration(self: PerformanceDataFrame,
607 solver: str,
608 objective: SparkleObjective = None,
609 instances: list[str] = None) -> tuple[dict, float]:
610 """Return the best configuration for the given objective over the instances.
612 Args:
613 solver: The solver for which we determine the best configuration
614 objective: The objective for which we calculate the best configuration
615 instances: The instances which should be selected for the evaluation
617 Returns:
618 The best configuration and its aggregated performance.
619 """
620 return self.configuration_performance(solver, None, objective, instances)
    def best_instance_performance(
            self: PerformanceDataFrame,
            objective: str | SparkleObjective = None,
            run_id: int = None,
            exclude_solvers: list[str] = None) -> pd.Series:
        """Return the best performance for each instance in the portfolio.

        Args:
            objective: The objective for which we calculate the best performance
            run_id: The run for which we calculate the best performance. If None,
                we consider all runs.
            exclude_solvers: List of solvers to exclude in the calculation.

        Returns:
            The best performance for each instance in the portfolio.
        """
        objective = self.verify_objective(objective)
        if isinstance(objective, str):
            objective = resolve_objective(objective)
        # Drop Seed/Configuration; only the Value columns are compared
        subdf = self.drop(
            [PerformanceDataFrame.column_seed,
             PerformanceDataFrame.column_configuration],
            axis=1, level=1)
        subdf = subdf.xs(objective.name, level=0)
        if exclude_solvers is not None:
            subdf = subdf.drop(exclude_solvers, axis=1, level=0)
        if run_id is not None:
            run_id = self.verify_run_id(run_id)
            subdf = subdf.xs(run_id, level=1)
        else:
            # Drop the run level
            subdf = subdf.droplevel(level=1)
        # Row-wise best over the remaining solver columns
        if objective.minimise:
            series = subdf.min(axis=1)
        else:
            series = subdf.max(axis=1)
        # Ensure we always return the best for each run:
        # sort best-first, then keep the first entry per instance label
        series = series.sort_values(ascending=objective.minimise)
        return series.groupby(series.index).first().astype(float)
663 def best_performance(
664 self: PerformanceDataFrame,
665 exclude_solvers: list[str] = [],
666 objective: str | SparkleObjective = None) -> float:
667 """Return the overall best performance of the portfolio.
669 Args:
670 exclude_solvers: List of solvers to exclude in the calculation.
671 Defaults to none.
672 objective: The objective for which we calculate the best performance
674 Returns:
675 The aggregated best performance of the portfolio over all instances.
676 """
677 objective = self.verify_objective(objective)
678 if isinstance(objective, str):
679 objective = resolve_objective(objective)
680 instance_best = self.best_instance_performance(
681 objective, exclude_solvers=exclude_solvers).to_numpy(dtype=float)
682 return objective.instance_aggregator(instance_best)
684 def schedule_performance(
685 self: PerformanceDataFrame,
686 schedule: dict[str: list[tuple[str, float | None]]],
687 target_solver: str = None,
688 objective: str | SparkleObjective = None) -> float:
689 """Return the performance of a selection schedule on the portfolio.
691 Args:
692 schedule: Compute the best performance according to a selection schedule.
693 A dictionary with instances as keys and a list of tuple consisting of
694 (solver, max_runtime) or solvers if no runtime prediction should be used.
695 target_solver: If not None, store the values in this solver of the DF.
696 objective: The objective for which we calculate the best performance
698 Returns:
699 The performance of the schedule over the instances in the dictionary.
700 """
701 objective = self.verify_objective(objective)
702 if isinstance(objective, str):
703 objective = resolve_objective(objective)
704 select = min if objective.minimise else max
705 performances = [0.0] * len(schedule.keys())
706 for ix, instance in enumerate(schedule.keys()):
707 for iy, (solver, max_runtime) in enumerate(schedule[instance]):
708 performance = float(self.get_value(solver, instance, objective.name))
709 if max_runtime is not None: # We are dealing with runtime
710 performances[ix] += performance
711 if performance < max_runtime:
712 break # Solver finished in time
713 else: # Quality, we take the best found performance
714 if iy == 0: # First solver, set initial value
715 performances[ix] = performance
716 continue
717 performances[ix] = select(performances[ix], performance)
718 if target_solver is not None:
719 self.set_value(performances[ix], target_solver, instance, objective.name)
720 return performances
722 def marginal_contribution(
723 self: PerformanceDataFrame,
724 objective: str | SparkleObjective = None,
725 sort: bool = False) -> list[float]:
726 """Return the marginal contribution of the solvers on the instances.
728 Args:
729 objective: The objective for which we calculate the marginal contribution.
730 sort: Whether to sort the results afterwards
731 Returns:
732 The marginal contribution of each solver.
733 """
734 output = []
735 objective = self.verify_objective(objective)
736 if isinstance(objective, str):
737 objective = resolve_objective(objective)
738 best_performance = self.best_performance(objective=objective)
739 for solver in self.solvers:
740 # By calculating the best performance excluding this Solver,
741 # we can determine its relative impact on the portfolio.
742 missing_solver_best = self.best_performance(
743 exclude_solvers=[solver],
744 objective=objective)
745 # Now we need to see how much the portfolio's best performance
746 # decreases without this solver.
747 marginal_contribution = missing_solver_best / best_performance
748 if missing_solver_best == best_performance:
749 # No change, no contribution
750 marginal_contribution = 0.0
751 output.append((solver, marginal_contribution, missing_solver_best))
752 if sort:
753 output.sort(key=lambda x: x[1], reverse=objective.minimise)
754 return output
    def get_solver_ranking(self: PerformanceDataFrame,
                           objective: str | SparkleObjective = None
                           ) -> list[tuple[str, float]]:
        """Return a list with solvers ranked by average performance."""
        objective = self.verify_objective(objective)
        if isinstance(objective, str):
            objective = resolve_objective(objective)
        # Drop Seed/Configuration
        subdf = self.drop(
            [PerformanceDataFrame.column_seed,
             PerformanceDataFrame.column_configuration],
            axis=1, level=1)
        sub_df = subdf.loc(axis=0)[objective.name, :, :]
        # Reduce Runs Dimension
        sub_df = sub_df.droplevel("Run").astype(float)
        # By using .__name__, pandas converts it to a Pandas Aggregator function
        sub_df = sub_df.groupby(sub_df.index).agg(func=objective.run_aggregator.__name__)
        # NOTE(review): sub_df[solver] is a one-column frame (Value), so
        # instance_aggregator must accept a DataFrame — confirm with callers.
        solver_ranking = [(solver, objective.instance_aggregator(
            sub_df[solver].astype(float))) for solver in self.solvers]
        # Sort the list by second value (the performance)
        solver_ranking.sort(key=lambda performance: performance[1],
                            reverse=(not objective.minimise))
        return solver_ranking
780 def save_csv(self: PerformanceDataFrame, csv_filepath: Path = None) -> None:
781 """Write a CSV to the given path.
783 Args:
784 csv_filepath: String path to the csv file. Defaults to self.csv_filepath.
785 """
786 csv_filepath = self.csv_filepath if csv_filepath is None else csv_filepath
787 self.to_csv(csv_filepath)
789 def clone(self: PerformanceDataFrame,
790 csv_filepath: Path = None) -> PerformanceDataFrame:
791 """Create a copy of this object.
793 Args:
794 csv_filepath: The new filepath to use for saving the object to.
795 Warning: If the original path is used, it could lead to dataloss!
796 """
797 csv_filepath = csv_filepath or self.csv_filepath
798 if self.csv_filepath.exists():
799 pd_copy = PerformanceDataFrame(csv_filepath)
800 else:
801 pd_copy = PerformanceDataFrame(
802 csv_filepath=csv_filepath,
803 solvers=self.solvers,
804 objectives=self.objectives,
805 instances=self.instances,
806 n_runs=self.num_runs)
807 for solver in self.solvers:
808 for index in self.index:
809 for field in PerformanceDataFrame.multi_column_names:
810 pd_copy.at[index, (solver, field)] =\
811 self.loc[index, solver][field]
812 return pd_copy
814 def clean_csv(self: PerformanceDataFrame) -> None:
815 """Set all values in Performance Data to None."""
816 self[:] = PerformanceDataFrame.missing_value
817 self.save_csv()
    def to_autofolio(self: PerformanceDataFrame,
                     objective: SparkleObjective = None,
                     target: Path = None) -> Path:
        """Port the data to a format acceptable for AutoFolio.

        Args:
            objective: Objective to extract; required when multi-objective.
            target: Directory to write to. Defaults to the parent directory
                of ``self.csv_filepath``.

        Returns:
            Path to the written CSV, or None when porting is not possible.
        """
        # AutoFolio cannot represent multiple objectives or multiple runs
        if (objective is None and self.multi_objective or self.num_runs > 1):
            print(f"ERROR: Currently no porting available for {self.csv_filepath} "
                  "to Autofolio due to multi objective or number of runs.")
            return
        autofolio_df = super().copy()
        # Drop Seed/Configuration, then drop the level
        autofolio_df = autofolio_df.drop([PerformanceDataFrame.column_seed,
                                          PerformanceDataFrame.column_configuration],
                                         axis=1, level=1).droplevel(level=1, axis=1)
        if objective is not None:
            autofolio_df = autofolio_df.loc[objective.name]
            autofolio_df.index = autofolio_df.index.droplevel("Run")
        else:
            autofolio_df.index = autofolio_df.index.droplevel(["Objective", "Run"])
        if target is None:
            path = self.csv_filepath.parent / f"autofolio_{self.csv_filepath.name}"
        else:
            path = target / f"autofolio_{self.csv_filepath.name}"
        autofolio_df.to_csv(path)
        return path