100 changes: 100 additions & 0 deletions src/lighteval/metrics/metrics.py
@@ -370,6 +370,38 @@ class Metrics(Enum):
corpus_level_fn=np.mean,
higher_is_better=True,
)
math_pass_at_1_1n = SampleLevelMetric(
metric_name="math_pass@1:1_samples",
sample_level_fn=PassAtK(
k=1,
n=1,
strip_strings=True,
# Extracting mathematical expressions and latex expressions
normalize_gold=lambda k: extract_target_from_pred(
k,
get_extraction_regexes(
formatted_doc=None,
target_types=[ExprExtractionConfig(), LatexExtractionConfig()],
language=Language.ENGLISH,
),
),
# Extracting mathematical expressions and latex expressions
normalize_pred=lambda k: extract_target_from_pred(
k,
get_extraction_regexes(
formatted_doc=None,
target_types=[ExprExtractionConfig(), LatexExtractionConfig()],
language=Language.ENGLISH,
),
),
# Uses sympy for comparison
sample_scoring_function=compare_gold_target,
).compute,
category=MetricCategory.GENERATIVE_SAMPLING,
use_case=MetricUseCase.REASONING,
corpus_level_fn=np.mean,
higher_is_better=True,
)
math_pass_at_1_4n = SampleLevelMetric(
metric_name="math_pass@1:4_samples",
sample_level_fn=PassAtK(
@@ -838,6 +870,74 @@ class Metrics(Enum):
pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
precision=6,
)
gpqa_instruct_pass_at_1_1n = SampleLevelMetric(
metric_name="gpqa_pass@1:1_samples",
sample_level_fn=PassAtK(
k=1,
n=1,
sample_scoring_function=lambda pred, ref, doc: multilingual_extractive_match_metric(
language=Language.ENGLISH,
gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
precision=6,
).sample_level_fn([ref], [pred], doc),
).compute,
category=MetricCategory.GENERATIVE_SAMPLING,
use_case=MetricUseCase.REASONING,
corpus_level_fn=np.mean,
higher_is_better=True,
)
gpqa_instruct_pass_at_1_4n = SampleLevelMetric(
metric_name="gpqa_pass@1:4_samples",
sample_level_fn=PassAtK(
k=1,
n=4,
sample_scoring_function=lambda pred, ref, doc: multilingual_extractive_match_metric(
language=Language.ENGLISH,
gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
precision=6,
).sample_level_fn([ref], [pred], doc),
).compute,
category=MetricCategory.GENERATIVE_SAMPLING,
use_case=MetricUseCase.REASONING,
corpus_level_fn=np.mean,
higher_is_better=True,
)
gpqa_instruct_pass_at_1_8n = SampleLevelMetric(
metric_name="gpqa_pass@1:8_samples",
sample_level_fn=PassAtK(
k=1,
n=8,
sample_scoring_function=lambda pred, ref, doc: multilingual_extractive_match_metric(
language=Language.ENGLISH,
gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
precision=6,
).sample_level_fn([ref], [pred], doc),
).compute,
category=MetricCategory.GENERATIVE_SAMPLING,
use_case=MetricUseCase.REASONING,
corpus_level_fn=np.mean,
higher_is_better=True,
)
gpqa_instruct_pass_at_1_64n = SampleLevelMetric(
metric_name="gpqa_pass@1:64_samples",
sample_level_fn=PassAtK(
k=1,
n=64,
sample_scoring_function=lambda pred, ref, doc: multilingual_extractive_match_metric(
language=Language.ENGLISH,
gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
precision=6,
).sample_level_fn([ref], [pred], doc),
).compute,
category=MetricCategory.GENERATIVE_SAMPLING,
use_case=MetricUseCase.REASONING,
corpus_level_fn=np.mean,
higher_is_better=True,
)

def __str__(self):
return self.name.replace("_at_", "@")
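Note on the new pass@1 metrics: all of the math_pass_at_1_*n and gpqa_instruct_pass_at_1_*n entries report pass@1, but estimate it from a different number of generations n. For context, below is a minimal sketch of the standard unbiased pass@k estimator (Chen et al., 2021); whether PassAtK in metrics_sample.py uses exactly this closed form is an assumption, but for k=1 any such estimator reduces to c/n, the fraction of correct samples.

from math import comb

def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k: chance that at least one of k samples drawn
    without replacement from n generations (c of them correct) passes."""
    if n - c < k:
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)

# For k=1 this is simply c / n, i.e. the mean per-sample correctness,
# so a pass@1 metric evaluated on a single generation carries the same
# information as a plain exact-match score on that generation.
print(pass_at_k(4, 1, 1))  # 0.25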
6 changes: 4 additions & 2 deletions src/lighteval/metrics/metrics_sample.py
@@ -1163,7 +1163,9 @@ def __init__(
self.type_exact_match = "full"
self.score_sample = self.default_sample_scoring

def compute(self, golds: list[str], predictions: list[str], **kwargs) -> dict[str, float]:
def compute(
self, golds: list[str], predictions: list[str], formatted_doc: Doc = None, **kwargs
Author comment: I had to pass formatted_doc in order to enable pass@1 with GPQA

) -> dict[str, float]:
"""Computes the metric over a list of golds and predictions for one single item with possibly many samples.
It applies normalisation (if needed) to model prediction and gold, computes their per prediction score,
then aggregates the scores over the samples using a pass@k.
@@ -1189,7 +1191,7 @@
all_scores = []
for pred in predictions[: self.n]:
cur_pred = self.get_processed_pred(pred=pred)
all_scores.append(self.score_sample(cur_pred, gold))
all_scores.append(self.score_sample(cur_pred, gold, formatted_doc))

return self.pass_at_k(all_scores)

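The change above threads formatted_doc through PassAtK.compute into score_sample, so a custom sample_scoring_function can now take the document as a third argument; this is what the gpqa_instruct_pass_at_1_*n lambdas in metrics.py rely on. A minimal sketch, with a hypothetical letter-matching scorer standing in for multilingual_extractive_match_metric:

# Hypothetical scorer (illustration only); the real GPQA metrics delegate to
# multilingual_extractive_match_metric rather than matching the last letter.
def letter_match(pred: str, gold: str, doc) -> bool:
    # doc gives access to the formatted task instance if the scorer needs
    # choices, instructions, etc.
    return pred.strip()[-1:].upper() == gold.strip().upper()

# Assumed usage, mirroring the metric definitions in metrics.py above:
# PassAtK(k=1, n=4, sample_scoring_function=letter_match).compute(
#     golds=["A"], predictions=four_model_samples, formatted_doc=doc
# )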
4 changes: 2 additions & 2 deletions src/lighteval/models/vllm/vllm_model.py
@@ -27,7 +27,7 @@
from typing import Optional

import torch
from pydantic import NonNegativeFloat, PositiveInt
from pydantic import NonNegativeFloat, NonNegativeInt, PositiveInt
from tqdm import tqdm

from lighteval.data import GenerativeTaskDataset, LoglikelihoodDataset
@@ -82,7 +82,7 @@ class VLLMModelConfig(ModelConfig):
gpu_memory_utilization: NonNegativeFloat = 0.9 # lower this if you are running out of memory
max_model_length: PositiveInt | None = None # maximum length of the model, usually inferred automatically. reduce this if you encounter OOM issues, 4096 is usually enough
swap_space: PositiveInt = 4 # CPU swap space size (GiB) per GPU.
seed: PositiveInt = 1234
seed: NonNegativeInt = 1234
Author comment: Needed to allow seed=0 in the model args

trust_remote_code: bool = False
use_chat_template: bool = False
add_special_tokens: bool = True
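The PositiveInt to NonNegativeInt change is what lets seed=0 pass validation. A quick standalone illustration of the difference (plain pydantic, not lighteval code):

from pydantic import BaseModel, NonNegativeInt, PositiveInt, ValidationError

class OldConfig(BaseModel):
    seed: PositiveInt = 1234      # rejects 0

class NewConfig(BaseModel):
    seed: NonNegativeInt = 1234   # accepts 0

try:
    OldConfig(seed=0)
except ValidationError:
    print("PositiveInt rejects seed=0")

print(NewConfig(seed=0).seed)  # 0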
32 changes: 24 additions & 8 deletions src/lighteval/tasks/default_tasks.py
@@ -324,10 +324,14 @@
few_shots_select=None,
generation_size=32768,
metric=[
Metrics.expr_gold_metric,
Author comment: This was causing redundant computation since we can get the same result from pass@1 (n=1)

Metrics.math_pass_at_1_1n,
Metrics.math_pass_at_1_4n,
Metrics.math_pass_at_1_8n,
Metrics.math_pass_at_1_16n,
Metrics.math_pass_at_1_32n,
Metrics.math_pass_at_1_64n,
],
version=1,
version=2,
)
aime24_gpassk = LightevalTaskConfig(
name="aime24_gpassk",
@@ -355,10 +359,14 @@
few_shots_select=None,
generation_size=10000,
metric=[
Metrics.expr_gold_metric,
Metrics.math_pass_at_1_1n,
Metrics.math_pass_at_1_4n,
Metrics.math_pass_at_1_8n,
Metrics.math_pass_at_1_16n,
Metrics.math_pass_at_1_32n,
Metrics.math_pass_at_1_64n,
],
version=1,
version=2,
)
aime25_gpassk = LightevalTaskConfig(
name="aime25_gpassk",
@@ -7809,10 +7817,15 @@
few_shots_split=None,
few_shots_select=None,
generation_size=32768, # needed for reasoning models like R1
metric=[Metrics.gpqa_instruct_metric],
metric=[
Metrics.gpqa_instruct_pass_at_1_1n,
Metrics.gpqa_instruct_pass_at_1_4n,
Metrics.gpqa_instruct_pass_at_1_8n,
Metrics.gpqa_instruct_pass_at_1_64n,
],
stop_sequence=[], # no stop sequence, will use eos token
trust_dataset=True,
version=0,
version=1,
)
gpqa_extended_instruct_lighteval = LightevalTaskConfig(
name="gpqa:extended",
Expand Down Expand Up @@ -9688,8 +9701,11 @@
few_shots_split=None,
few_shots_select=None,
generation_size=32768,
metric=[Metrics.latex_gold_metric],
version=1,
metric=[
Metrics.math_pass_at_1_1n,
Metrics.math_pass_at_1_4n,
],
version=2,
)
math_500_gpassk = LightevalTaskConfig(
name="math_500_gpassk",
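Why list pass@1 at several sample counts (n=1, 4, 8, 64 for GPQA; up to n=64 for AIME)? Each one estimates the same quantity, but averaging over more generations shrinks the variance of the estimate. A back-of-the-envelope check, assuming i.i.d. generations with a hypothetical per-sample accuracy of 0.4 (not a number from the PR):

# Standard error of a pass@1 estimate from n samples, assuming i.i.d.
# Bernoulli(p) generations. Illustrative only.
p = 0.4  # hypothetical per-sample accuracy
for n in (1, 4, 8, 64):
    se = (p * (1 - p) / n) ** 0.5
    print(f"n={n:>2}  std. error ~ {se:.3f}")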