Skip to content

Commit fcb784d

Browse files
authored
Pass At K Math (#647)
* test 1 * change task * change * fix names
1 parent bfb1099 commit fcb784d

File tree

3 files changed

+184
-4
lines changed

3 files changed

+184
-4
lines changed

src/lighteval/metrics/metrics.py

Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@
2828
ExprExtractionConfig,
2929
IndicesExtractionConfig,
3030
LatexExtractionConfig,
31+
compare_gold_target,
32+
extract_target_from_pred,
33+
get_extraction_regexes,
3134
multilingual_extractive_match_metric,
3235
)
3336
from lighteval.metrics.harness_compatibility.drop import drop_metrics
@@ -366,6 +369,167 @@ class Metrics(Enum):
366369
corpus_level_fn=np.mean,
367370
higher_is_better=True,
368371
)
372+
math_pass_at_1_4n = SampleLevelMetric(
373+
metric_name="math_pass@1:4_samples",
374+
sample_level_fn=PassAtK(
375+
k=1,
376+
n=4,
377+
strip_strings=True,
378+
# Extracting mathematical expressions and latex expressions
379+
normalize_gold=lambda k: extract_target_from_pred(
380+
k,
381+
get_extraction_regexes(
382+
formatted_doc=None,
383+
target_types=[ExprExtractionConfig(), LatexExtractionConfig()],
384+
language=Language.ENGLISH,
385+
),
386+
),
387+
# Extracting mathematical expressions and latex expressions
388+
normalize_pred=lambda k: extract_target_from_pred(
389+
k,
390+
get_extraction_regexes(
391+
formatted_doc=None,
392+
target_types=[ExprExtractionConfig(), LatexExtractionConfig()],
393+
language=Language.ENGLISH,
394+
),
395+
),
396+
# Uses sympy for comparison
397+
sample_scoring_function=compare_gold_target,
398+
).compute,
399+
category=MetricCategory.GENERATIVE_SAMPLING,
400+
use_case=MetricUseCase.REASONING,
401+
corpus_level_fn=np.mean,
402+
higher_is_better=True,
403+
)
404+
math_pass_at_1_8n = SampleLevelMetric(
405+
metric_name="math_pass@1:8_samples",
406+
sample_level_fn=PassAtK(
407+
k=1,
408+
n=8,
409+
strip_strings=True,
410+
# Extracting mathematical expressions and latex expressions
411+
normalize_gold=lambda k: extract_target_from_pred(
412+
k,
413+
get_extraction_regexes(
414+
formatted_doc=None,
415+
target_types=[ExprExtractionConfig(), LatexExtractionConfig()],
416+
language=Language.ENGLISH,
417+
),
418+
),
419+
# Extracting mathematical expressions and latex expressions
420+
normalize_pred=lambda k: extract_target_from_pred(
421+
k,
422+
get_extraction_regexes(
423+
formatted_doc=None,
424+
target_types=[ExprExtractionConfig(), LatexExtractionConfig()],
425+
language=Language.ENGLISH,
426+
),
427+
),
428+
# Uses sympy for comparison
429+
sample_scoring_function=compare_gold_target,
430+
).compute,
431+
category=MetricCategory.GENERATIVE_SAMPLING,
432+
use_case=MetricUseCase.REASONING,
433+
corpus_level_fn=np.mean,
434+
higher_is_better=True,
435+
)
436+
math_pass_at_1_16n = SampleLevelMetric(
437+
metric_name="math_pass@1:16_samples",
438+
sample_level_fn=PassAtK(
439+
k=1,
440+
n=16,
441+
strip_strings=True,
442+
# Extracting mathematical expressions and latex expressions
443+
normalize_gold=lambda k: extract_target_from_pred(
444+
k,
445+
get_extraction_regexes(
446+
formatted_doc=None,
447+
target_types=[ExprExtractionConfig(), LatexExtractionConfig()],
448+
language=Language.ENGLISH,
449+
),
450+
),
451+
# Extracting mathematical expressions and latex expressions
452+
normalize_pred=lambda k: extract_target_from_pred(
453+
k,
454+
get_extraction_regexes(
455+
formatted_doc=None,
456+
target_types=[ExprExtractionConfig(), LatexExtractionConfig()],
457+
language=Language.ENGLISH,
458+
),
459+
),
460+
# Uses sympy for comparison
461+
sample_scoring_function=compare_gold_target,
462+
).compute,
463+
category=MetricCategory.GENERATIVE_SAMPLING,
464+
use_case=MetricUseCase.REASONING,
465+
corpus_level_fn=np.mean,
466+
higher_is_better=True,
467+
)
468+
math_pass_at_1_32n = SampleLevelMetric(
469+
metric_name="math_pass@1:32_samples",
470+
sample_level_fn=PassAtK(
471+
k=1,
472+
n=32,
473+
strip_strings=True,
474+
# Extracting mathematical expressions and latex expressions
475+
normalize_gold=lambda k: extract_target_from_pred(
476+
k,
477+
get_extraction_regexes(
478+
formatted_doc=None,
479+
target_types=[ExprExtractionConfig(), LatexExtractionConfig()],
480+
language=Language.ENGLISH,
481+
),
482+
),
483+
# Extracting mathematical expressions and latex expressions
484+
normalize_pred=lambda k: extract_target_from_pred(
485+
k,
486+
get_extraction_regexes(
487+
formatted_doc=None,
488+
target_types=[ExprExtractionConfig(), LatexExtractionConfig()],
489+
language=Language.ENGLISH,
490+
),
491+
),
492+
# Uses sympy for comparison
493+
sample_scoring_function=compare_gold_target,
494+
).compute,
495+
category=MetricCategory.GENERATIVE_SAMPLING,
496+
use_case=MetricUseCase.REASONING,
497+
corpus_level_fn=np.mean,
498+
higher_is_better=True,
499+
)
500+
math_pass_at_1_64n = SampleLevelMetric(
501+
metric_name="math_pass@1:64_samples",
502+
sample_level_fn=PassAtK(
503+
k=1,
504+
n=64,
505+
strip_strings=True,
506+
# Extracting mathematical expressions and latex expressions
507+
normalize_gold=lambda k: extract_target_from_pred(
508+
k,
509+
get_extraction_regexes(
510+
formatted_doc=None,
511+
target_types=[ExprExtractionConfig(), LatexExtractionConfig()],
512+
language=Language.ENGLISH,
513+
),
514+
),
515+
# Extracting mathematical expressions and latex expressions
516+
normalize_pred=lambda k: extract_target_from_pred(
517+
k,
518+
get_extraction_regexes(
519+
formatted_doc=None,
520+
target_types=[ExprExtractionConfig(), LatexExtractionConfig()],
521+
language=Language.ENGLISH,
522+
),
523+
),
524+
# Uses sympy for comparison
525+
sample_scoring_function=compare_gold_target,
526+
).compute,
527+
category=MetricCategory.GENERATIVE_SAMPLING,
528+
use_case=MetricUseCase.REASONING,
529+
corpus_level_fn=np.mean,
530+
higher_is_better=True,
531+
)
532+
369533
mrr = SampleLevelMetric(
370534
metric_name="mrr",
371535
sample_level_fn=MRR().compute,

src/lighteval/models/endpoints/endpoint_model.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -409,8 +409,9 @@ def _async_process_request(
409409
decoder_input_details=True,
410410
grammar=grammar,
411411
)
412+
generation_config_dict = {k: v for k, v in generation_config.__dict__.items() if v is not None}
412413

413-
generated_text = self.async_client.text_generation(prompt=context, generation_config=generation_config)
414+
generated_text = self.async_client.text_generation(prompt=context, **generation_config_dict)
414415

415416
return generated_text
416417

@@ -431,10 +432,11 @@ def _process_request(
431432
decoder_input_details=True,
432433
grammar=grammar,
433434
)
435+
generation_config_dict = {k: v for k, v in generation_config.__dict__.items() if v is not None}
434436

435437
generated_text = self.client.text_generation(
436438
prompt=context,
437-
generation_config=generation_config,
439+
**generation_config_dict,
438440
)
439441

440442
return generated_text

src/lighteval/tasks/default_tasks.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -323,7 +323,14 @@
323323
few_shots_split=None,
324324
few_shots_select=None,
325325
generation_size=32768,
326-
metric=[Metrics.expr_gold_metric],
326+
metric=[
327+
Metrics.expr_gold_metric,
328+
Metrics.math_pass_at_1_4n,
329+
Metrics.math_pass_at_1_8n,
330+
Metrics.math_pass_at_1_16n,
331+
Metrics.math_pass_at_1_32n,
332+
Metrics.math_pass_at_1_64n,
333+
],
327334
version=1,
328335
)
329336
aime25 = LightevalTaskConfig(
@@ -337,7 +344,14 @@
337344
few_shots_split=None,
338345
few_shots_select=None,
339346
generation_size=10000,
340-
metric=[Metrics.expr_gold_metric],
347+
metric=[
348+
Metrics.expr_gold_metric,
349+
Metrics.math_pass_at_1_4n,
350+
Metrics.math_pass_at_1_8n,
351+
Metrics.math_pass_at_1_16n,
352+
Metrics.math_pass_at_1_32n,
353+
Metrics.math_pass_at_1_64n,
354+
],
341355
version=1,
342356
)
343357
anachronisms_bigbench = LightevalTaskConfig(

0 commit comments

Comments
 (0)