"""
GreenScore calculation and metric normalization.
This module provides functions to calculate GreenScore, a composite metric
combining energy consumption, execution time, and carbon footprint with
configurable weights.
"""
import pandas as pd
from pathlib import Path
from typing import Union, Optional, Dict
# Methodology tag for "measured" (hardware); all others count as "estimated"
METHODOLOGY_MEASURED = "hardware_rapl_linux"
[docs]
def normalize_metrics(df: pd.DataFrame, output_path: Optional[Union[str, Path]] = None) -> pd.DataFrame:
"""
Normalize metrics across methods (row-wise) per algorithm.
Applies min-max normalization to each row, normalizing values between 0 and 1.
This allows fair comparison across different algorithms with different scales.
Args:
df: DataFrame with one row per algorithm, 'algorithm' column, and metric columns.
output_path: Optional path to save the normalized DataFrame as CSV.
Returns:
DataFrame with 'algorithm' and normalized method columns.
Examples:
>>> df = pd.DataFrame({
... 'algorithm': ['algo1', 'algo2'],
... 'method1': [100, 200],
... 'method2': [50, 150]
... })
>>> normalized = normalize_metrics(df)
"""
# Copy to avoid modifying original
df = df.copy()
# Extract algorithm names
algorithm_names = df['algorithm']
# Select only numeric method columns
method_cols = df.columns.drop('algorithm')
numeric_df = df[method_cols]
# Apply row-wise normalization (min-max per algorithm)
normalized_df = numeric_df.apply(
lambda row: (row - row.min()) / (row.max() - row.min())
if row.max() != row.min() else row * 0, # Handle constant rows
axis=1
)
# Add back the algorithm column
normalized_df.insert(0, 'algorithm', algorithm_names)
# Save to file if output path provided
if output_path is not None:
output = Path(output_path) if isinstance(output_path, str) else output_path
output.parent.mkdir(parents=True, exist_ok=True)
normalized_df.to_csv(output, index=False)
return normalized_df
def _methodology_counts_from_aggregated(aggregated_energy_paths: Dict[str, Path]) -> Dict[str, Dict[str, int]]:
"""For each method, count points_measured (hardware_rapl_linux) and points_estimated (else)."""
out = {}
for method, path in aggregated_energy_paths.items():
path = Path(path)
if not path.exists():
out[method] = {"points_measured": 0, "points_estimated": 0}
continue
df = pd.read_csv(path)
if "methodology" not in df.columns:
out[method] = {"points_measured": 0, "points_estimated": len(df)}
continue
measured = (df["methodology"] == METHODOLOGY_MEASURED).sum()
out[method] = {"points_measured": int(measured), "points_estimated": int(len(df) - measured)}
return out
[docs]
def calculate_greenscore(
energy_df: pd.DataFrame,
time_df: pd.DataFrame,
carbon_df: pd.DataFrame,
alpha: float = 0.4,
beta: float = 0.4,
gamma: float = 0.2,
output_path: Optional[Union[str, Path]] = None,
aggregated_energy_paths: Optional[Dict[str, Union[str, Path]]] = None,
) -> pd.DataFrame:
"""
Compute the GreenScore for each method by combining normalized
energy, time, and carbon scores with weighted averaging.
GreenScore = α·energy + β·carbon + γ·time
Lower scores indicate better sustainability (lower energy, time, and carbon).
Args:
energy_df: Raw energy DataFrame (with 'algorithm' column).
time_df: Raw time DataFrame (with 'algorithm' column).
carbon_df: Raw carbon DataFrame (with 'algorithm' column).
alpha: Weight for energy component (default: 0.4).
beta: Weight for carbon component (default: 0.4).
gamma: Weight for time component (default: 0.2).
output_path: Optional path to save the final ranking CSV.
aggregated_energy_paths: Optional dict method -> path to method's energy_aggregated.csv
used to add points_measured / points_estimated to the output.
Returns:
DataFrame sorted by green score (ascending, lower is better):
- 'method': Method name
- 'energy_mean': Mean normalized energy
- 'time_mean': Mean normalized time
- 'carbon_mean': Mean normalized carbon
- 'green_score': Composite GreenScore
- 'points_measured': (if aggregated_energy_paths given) Count of hardware-measured points
- 'points_estimated': (if aggregated_energy_paths given) Count of estimated points
- 'data_source_consistency': "Consistent" or "Inconsistent Data Source" (when method has both hardware and estimation)
Examples:
>>> energy_df = pd.read_csv('energy.csv')
>>> time_df = pd.read_csv('time.csv')
>>> carbon_df = pd.read_csv('carbon.csv')
>>> ranking = calculate_greenscore(energy_df, time_df, carbon_df)
"""
# Validate weights sum to 1.0
if abs(alpha + beta + gamma - 1.0) > 1e-6:
raise ValueError(f"Weights must sum to 1.0, got: alpha={alpha}, beta={beta}, gamma={gamma}")
# Step 1: Normalize each DataFrame
energy_norm = normalize_metrics(energy_df)
time_norm = normalize_metrics(time_df)
carbon_norm = normalize_metrics(carbon_df)
# Step 2: Drop algorithm column and compute column-wise means
energy_mean = energy_norm.drop(columns=['algorithm']).mean()
time_mean = time_norm.drop(columns=['algorithm']).mean()
carbon_mean = carbon_norm.drop(columns=['algorithm']).mean()
# Step 3: Method names from energy columns (already plain: cpython, py_compile, etc.)
# Carbon columns have '_CO2e_g' suffix but we use energy index so names stay correct
method_names = energy_mean.index
# Step 4: Combine into a single DataFrame
mean_df = pd.DataFrame({
'method': method_names,
'energy_mean': energy_mean.values,
'time_mean': time_mean.values,
'carbon_mean': carbon_mean.values
})
# Step 4b: Add methodology summary (points_measured vs points_estimated)
if aggregated_energy_paths:
paths_as_path = {k: Path(v) for k, v in aggregated_energy_paths.items()}
counts = _methodology_counts_from_aggregated(paths_as_path)
mean_df["points_measured"] = mean_df["method"].map(lambda m: counts.get(m, {}).get("points_measured", 0))
mean_df["points_estimated"] = mean_df["method"].map(lambda m: counts.get(m, {}).get("points_estimated", 0))
else:
mean_df["points_measured"] = 0
mean_df["points_estimated"] = 0
# Step 4c: Methodology consistency — flag "Inconsistent Data Source" when a method has both hardware and estimation
mean_df["data_source_consistency"] = mean_df.apply(
lambda r: "Inconsistent Data Source" if (r["points_measured"] > 0 and r["points_estimated"] > 0) else "Consistent",
axis=1,
)
# Step 5: Compute GreenScore = α·energy + β·carbon + γ·time
mean_df['green_score'] = (
alpha * mean_df['energy_mean'] +
beta * mean_df['carbon_mean'] +
gamma * mean_df['time_mean']
)
# Step 6: Sort methods by green score (lower is better)
green_score_df = mean_df.sort_values(by='green_score').reset_index(drop=True)
# Step 7: Save to file if output path provided
if output_path is not None:
output = Path(output_path) if isinstance(output_path, str) else output_path
output.parent.mkdir(parents=True, exist_ok=True)
green_score_df.to_csv(output, index=False)
return green_score_df