Improve Summary analysis by relativizing the metric results when there is a status quo to relativize against (#4342)

shrutipatel31 · facebook-github-bot · commit af0a0722d6c0 · 2025-10-20T17:06:57.000-07:00
Summary: Pull Request resolved: #4342 Differential Revision: D82658357
diff --git a/ax/analysis/summary.py b/ax/analysis/summary.py
@@ -12,6 +12,7 @@
 
 from ax.analysis.analysis import Analysis
 from ax.analysis.analysis_card import AnalysisCard
+from ax.analysis.utils import filter_trials_by_indices_and_statuses
 from ax.core.experiment import Experiment
 from ax.core.trial_status import NON_STALE_STATUSES, TrialStatus
 from ax.exceptions.core import UserInputError
@@ -63,15 +64,46 @@ def compute(
         if experiment is None:
             raise UserInputError("`Summary` analysis requires an `Experiment` input")
 
+        # Get the trials that will be included in the summary
+        trials = filter_trials_by_indices_and_statuses(
+            experiment=experiment,
+            trial_indices=self.trial_indices,
+            trial_statuses=self.trial_statuses,
+        )
+
+        # Check if all trials have status quo in their arms
+        all_trials_have_status_quo = all(
+            experiment.status_quo is not None
+            and experiment.status_quo.name in trial.arms_by_name
+            for trial in trials
+        )
+
+        # Determine whether we should relativize. All of these must hold:
+        # (1) experiment has metrics, (2) experiment has a status quo, and
+        # (3) every trial being summarized includes the status quo arm
+        should_relativize = (
+            len(experiment.metrics) > 0
+            and experiment.status_quo is not None
+            and all_trials_have_status_quo
+        )
+
         return self._create_analysis_card(
             title=(
                 "Summary for "
                 f"{experiment.name if experiment.has_name else 'Experiment'}"
             ),
-            subtitle="High-level summary of the `Trial`-s in this `Experiment`",
+            subtitle=(
+                "High-level summary of the `Trial`-s in this `Experiment`"
+                if not should_relativize
+                else (
+                    "High-level summary of the `Trial`-s in this `Experiment` "
+                    "Metric results are relativized against status quo."
+                )
+            ),
             df=experiment.to_df(
                 trial_indices=self.trial_indices,
                 omit_empty_columns=self.omit_empty_columns,
                 trial_statuses=self.trial_statuses,
+                relativize=should_relativize,
             ),
         )
diff --git a/ax/analysis/tests/test_summary.py b/ax/analysis/tests/test_summary.py
@@ -7,21 +7,29 @@
 
 import numpy as np
 import pandas as pd
+
+from ax.adapter.factory import get_sobol
 from ax.analysis.summary import Summary
 from ax.api.client import Client
 from ax.api.configs import RangeParameterConfig
 from ax.core.base_trial import TrialStatus
 from ax.core.trial import Trial
 from ax.exceptions.core import UserInputError
 from ax.utils.common.testutils import TestCase
-from ax.utils.testing.core_stubs import get_offline_experiments, get_online_experiments
+from ax.utils.testing.core_stubs import (
+    get_branin_data_batch,
+    get_branin_experiment,
+    get_offline_experiments,
+    get_online_experiments,
+)
 from pyre_extensions import assert_is_instance, none_throws
 
 
 class TestSummary(TestCase):
-    def test_compute(self) -> None:
-        client = Client()
-        client.configure_experiment(
+    def setUp(self) -> None:
+        super().setUp()
+        self.client = Client()
+        self.client.configure_experiment(
             name="test_experiment",
             parameters=[
                 RangeParameterConfig(
@@ -36,7 +44,10 @@ def test_compute(self) -> None:
                 ),
             ],
         )
-        client.configure_optimization(objective="foo, bar")
+        self.client.configure_optimization(objective="foo, bar")
+
+    def test_compute(self) -> None:
+        client = self.client
 
         # Get two trials and fail one, giving us a ragged structure
         client.get_next_trials(max_trials=2)
@@ -142,23 +153,7 @@ def test_offline(self) -> None:
 
     def test_trial_indices_filter(self) -> None:
         """Test that Client.summarize correctly uses Summary."""
-        client = Client()
-        client.configure_experiment(
-            name="test_experiment",
-            parameters=[
-                RangeParameterConfig(
-                    name="x1",
-                    parameter_type="float",
-                    bounds=(0, 1),
-                ),
-                RangeParameterConfig(
-                    name="x2",
-                    parameter_type="float",
-                    bounds=(0, 1),
-                ),
-            ],
-        )
-        client.configure_optimization(objective="foo")
+        client = self.client
 
         # Get a trial
         client.get_next_trials(max_trials=1)
@@ -228,19 +223,7 @@ def test_trial_status_filter(self) -> None:
 
     def test_default_excludes_stale_trials(self) -> None:
         """Test that Summary defaults to excluding STALE trials."""
-        # Set up experiment with basic configuration
-        client = Client()
-        client.configure_experiment(
-            name="test_experiment",
-            parameters=[
-                RangeParameterConfig(
-                    name="x1",
-                    parameter_type="float",
-                    bounds=(0, 1),
-                ),
-            ],
-        )
-        client.configure_optimization(objective="foo")
+        client = self.client
 
         # Create 3 trials with different statuses to test default filtering behavior
         client.get_next_trials(max_trials=3)
@@ -275,3 +258,54 @@ def test_default_excludes_stale_trials(self) -> None:
         # Verify that no trials in the output have STALE status
         stale_statuses = card.df[card.df["trial_status"] == "STALE"]
         self.assertEqual(len(stale_statuses), 0)
+
+    def test_metrics_relativized_with_status_quo(self) -> None:
+        """Test that Summary relativizes metrics by default when status
+        quos are present."""
+        experiment = get_branin_experiment(with_status_quo=True, named=True)
+        experiment.name = "test_experiment_relativize"
+
+        # Create batch trials with status quo
+        for _ in range(2):
+            sobol_generator = get_sobol(search_space=experiment.search_space)
+            trial = experiment.new_batch_trial(should_add_status_quo_arm=True)
+            trial.add_generator_run(sobol_generator.gen(n=1))
+            trial.mark_running(no_runner_required=True)
+            experiment.attach_data(
+                get_branin_data_batch(batch=trial, metrics=[*experiment.metrics.keys()])
+            )
+            trial.mark_completed()
+
+        analysis = Summary()
+        card = analysis.compute(experiment=experiment)
+
+        with self.subTest("subtitle_indicates_relativization"):
+            self.assertIn("relativized", card.subtitle.lower())
+
+        with self.subTest("metric_values_formatted_as_percentages"):
+            metric_values = card.df["branin"].dropna()
+            self.assertGreater(len(metric_values), 0)
+            for val in metric_values:
+                self.assertIsInstance(val, str)
+                self.assertTrue(val.endswith("%"))
+
+        with self.subTest("relativization_calculation_correct"):
+            raw_data = experiment.lookup_data().df
+            sq_name = none_throws(experiment.status_quo).name
+            trial_0_data = raw_data[raw_data["trial_index"] == 0]
+            treatment_arm = [a for a in experiment.trials[0].arms if a.name != sq_name][
+                0
+            ]
+
+            sq_val = trial_0_data[trial_0_data["arm_name"] == sq_name]["mean"].values[0]
+            arm_val = trial_0_data[trial_0_data["arm_name"] == treatment_arm.name][
+                "mean"
+            ].values[0]
+            expected = ((arm_val - sq_val) / sq_val) * 100
+
+            actual = float(
+                card.df[card.df["arm_name"] == treatment_arm.name]["branin"]
+                .values[0]
+                .rstrip("%")
+            )
+            self.assertAlmostEqual(actual, expected, places=1)
diff --git a/ax/core/experiment.py b/ax/core/experiment.py
@@ -2039,6 +2039,7 @@ def to_df(
         trial_indices: Iterable[int] | None = None,
         trial_statuses: Sequence[TrialStatus] | None = None,
         omit_empty_columns: bool = True,
+        relativize: bool = False,
     ) -> pd.DataFrame:
         """
         High-level summary of the Experiment with one row per arm. Any values missing at
@@ -2060,10 +2061,32 @@ def to_df(
             trial_indices: If specified, only include these trial indices.
             omit_empty_columns: If True, omit columns where every value is None.
             trial_status: If specified, only include trials with this status.
+            relativize: If True, and the experiment has either
+                * a status quo on all of its ``BatchTrial``-s, OR
+                * a status quo trial among its ``Trial``-s,
+                relativize metrics against the status quo.
         """
 
         records = []
-        data_df = self.lookup_data(trial_indices=trial_indices).df
+        data = self.lookup_data(trial_indices=trial_indices)
+
+        # Relativize metrics if requested
+        if relativize:
+            if self.status_quo is None:
+                raise UserInputError(
+                    "Attempting to relativize the experiment data, however, "
+                    "the experiment status quo is None. Please set the experiment "
+                    "status quo, or set `relativize` = False"
+                )
+
+            data_df = data.relativize(
+                status_quo_name=self.status_quo.name,
+                as_percent=True,
+                include_sq=True,
+            ).df
+        else:
+            data_df = data.df
+
         trials = (
             self.get_trials_by_indices(trial_indices=trial_indices)
             if trial_indices
@@ -2125,6 +2148,20 @@ def to_df(
         df = pd.DataFrame(records)
         if omit_empty_columns:
             df = df.loc[:, df.notnull().any()]
+
+        # Format metric columns as percentages with 4 significant figures when
+        # relativized
+        if relativize:
+            for metric_name in self.metrics.keys():
+                if metric_name in df.columns:
+                    df[metric_name] = df[metric_name].apply(
+                        lambda x: (
+                            f"{x:.4g}%"
+                            if pd.notna(x) and x != 0.0
+                            else ("0%" if pd.notna(x) else None)
+                        )
+                    )
+
         return df
 
     def add_auxiliary_experiment(
diff --git a/ax/core/tests/test_experiment.py b/ax/core/tests/test_experiment.py
@@ -1641,6 +1641,89 @@ def test_to_df(self) -> None:
         )
         self.assertTrue(df_completed.equals(expected_completed_df))
 
+    def test_to_df_with_relativize(self) -> None:
+        """Test the relativize flag in to_df method with status quo."""
+        # Create an experiment with status quo and completed trials
+        experiment = get_branin_experiment(
+            with_status_quo=True, with_completed_batch=True
+        )
+
+        with self.subTest("without relativization"):
+            df_no_rel = experiment.to_df(relativize=False)
+
+            # Verify dataframe has expected structure
+            self.assertGreater(len(df_no_rel), 0)
+            self.assertIn("trial_index", df_no_rel.columns)
+            self.assertIn("arm_name", df_no_rel.columns)
+
+            # Find metric columns
+            metric_cols = [
+                col
+                for col in df_no_rel.columns
+                if col
+                not in ["trial_index", "arm_name", "trial_status", "name", "x1", "x2"]
+            ]
+            self.assertGreater(len(metric_cols), 0, "Should have at least one metric")
+
+            # Verify metric values are numeric, not percentage strings
+            for metric_name in metric_cols:
+                values = df_no_rel[metric_name].dropna()
+                for val in values:
+                    self.assertNotIsInstance(
+                        val, str, "Non-relativized values should not be strings"
+                    )
+
+        with self.subTest("with relativization"):
+            df_with_rel = experiment.to_df(relativize=True)
+            df_no_rel = experiment.to_df(relativize=False)
+
+            # Verify structure is preserved
+            self.assertEqual(len(df_with_rel), len(df_no_rel))
+            self.assertEqual(set(df_with_rel.columns), set(df_no_rel.columns))
+
+            # Find metric columns
+            metric_cols = [
+                col
+                for col in df_no_rel.columns
+                if col
+                not in ["trial_index", "arm_name", "trial_status", "name", "x1", "x2"]
+            ]
+
+            # Verify relativization for each metric
+            self.assertIsNotNone(experiment.status_quo)
+            status_quo_name = experiment.status_quo.name
+            for metric_name in metric_cols:
+                # Status quo should be 0% after relativization (using .4g format)
+                sq_rel_values = df_with_rel[df_with_rel["arm_name"] == status_quo_name][
+                    metric_name
+                ]
+                for val in sq_rel_values.dropna():
+                    self.assertEqual(
+                        val, "0%", "Status quo should be relativized to 0%"
+                    )
+
+                # Non-status-quo arms should have percentage strings
+                non_sq_rel_values = df_with_rel[
+                    df_with_rel["arm_name"] != status_quo_name
+                ][metric_name].dropna()
+                for val in non_sq_rel_values:
+                    self.assertIsInstance(
+                        val, str, "Relativized values should be strings"
+                    )
+                    self.assertTrue(
+                        val.endswith("%"), "Relativized values should end with %"
+                    )
+
+                # Verify at least one non-status-quo value is non-zero
+                has_nonzero = any(
+                    float(v.rstrip("%")) != 0.0 for v in non_sq_rel_values
+                )
+                self.assertTrue(
+                    has_nonzero,
+                    "At least one non-status-quo arm should have non-zero "
+                    "relativized value",
+                )
+
 
 class ExperimentWithMapDataTest(TestCase):
     def setUp(self) -> None: