
Commit 6af5280

JoelNiklaus authored and clefourrier committed

Fixes a TypeError in Sacrebleu. (#387)

Co-authored-by: Clémentine Fourrier <[email protected]>

1 parent dcdf53a · commit 6af5280

File tree

6 files changed: +16 −9 lines

.github/ISSUE_TEMPLATE/evaluation-task-request.md

Lines changed: 2 additions & 2 deletions

```diff
@@ -13,6 +13,6 @@ assignees: ''
 
 ## Evaluation metadata
 Provide all available
-- Paper url:
-- Github url:
+- Paper url:
+- Github url:
 - Dataset url:
```

.github/ISSUE_TEMPLATE/feature-request.md

Lines changed: 0 additions & 1 deletion

```diff
@@ -15,4 +15,3 @@ A clear and concise description of what you want to happen.
 
 ## Possible alternatives
 A clear and concise description of any alternative solutions or features you've considered.
-
```

README.md

Lines changed: 1 addition & 1 deletion

```diff
@@ -104,7 +104,7 @@ Harness and HELM teams for their pioneering work on LLM evaluations.
 Got ideas? Found a bug? Want to add a
 [task](https://github.com/huggingface/lighteval/wiki/Adding-a-Custom-Task) or
 [metric](https://github.com/huggingface/lighteval/wiki/Adding-a-New-Metric)?
-Contributions are warmly welcomed!
+Contributions are warmly welcomed!
 
 If you're adding a new feature, please open an issue first.
```

examples/model_configs/peft_model.yaml

Lines changed: 2 additions & 2 deletions

```diff
@@ -1,8 +1,8 @@
 model:
-  type: "base"
+  type: "base"
   base_params:
     model_args: "pretrained=predibase/customer_support,revision=main" # pretrained=model_name,trust_remote_code=boolean,revision=revision_to_use,model_parallel=True ... For a PEFT model, the pretrained model should be the one trained with PEFT, and the base model below will contain the original model on which the adapters will be applied.
-    dtype: "4bit" # Specifying the model to be loaded in 4 bit uses BitsAndBytesConfig. The other option is to use "8bit" quantization.
+    dtype: "4bit" # Specifying the model to be loaded in 4 bit uses BitsAndBytesConfig. The other option is to use "8bit" quantization.
     compile: true
   merged_weights: # Ignore this section if you are not using PEFT models
     delta_weights: false # set to True if your model should be merged with a base model; you also need to provide the base model name
```
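For readers unfamiliar with the PEFT flow the comments describe, here is a minimal sketch of applying trained adapters onto a base model with the `peft` and `transformers` libraries. It is illustrative only, not lighteval's internal loading code, and the base model name is a made-up placeholder, since this config excerpt only names the adapter checkpoint.

```python
from peft import PeftModel
from transformers import AutoModelForCausalLM

# Hypothetical base model name; the YAML's `pretrained=predibase/customer_support`
# points at the PEFT-trained checkpoint, and the base model is configured separately.
base = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")

# Apply the trained adapters on top of the base weights.
model = PeftModel.from_pretrained(base, "predibase/customer_support")

# Fold the adapters into the base weights to get a plain model,
# roughly what the merged_weights section above controls.
model = model.merge_and_unload()
```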

examples/model_configs/quantized_model.yaml

Lines changed: 2 additions & 2 deletions

```diff
@@ -1,8 +1,8 @@
 model:
-  type: "base"
+  type: "base"
   base_params:
     model_args: "pretrained=HuggingFaceH4/zephyr-7b-beta,revision=main" # pretrained=model_name,trust_remote_code=boolean,revision=revision_to_use,model_parallel=True ...
-    dtype: "4bit" # Specifying the model to be loaded in 4 bit uses BitsAndBytesConfig. The other option is to use "8bit" quantization.
+    dtype: "4bit" # Specifying the model to be loaded in 4 bit uses BitsAndBytesConfig. The other option is to use "8bit" quantization.
     compile: true
   merged_weights: # Ignore this section if you are not using PEFT models
     delta_weights: false # set to True if your model should be merged with a base model; you also need to provide the base model name
```
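The `dtype: "4bit"` comment above refers to `transformers`' `BitsAndBytesConfig`. A minimal sketch of what that loading path looks like (illustrative only; the exact wiring inside lighteval is not part of this diff):

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# dtype: "4bit" -> 4-bit quantization via bitsandbytes;
# dtype: "8bit" would instead be BitsAndBytesConfig(load_in_8bit=True).
quant_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)

model = AutoModelForCausalLM.from_pretrained(
    "HuggingFaceH4/zephyr-7b-beta",  # pretrained=... from model_args above
    revision="main",                 # revision=... from model_args above
    quantization_config=quant_config,
)
```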

src/lighteval/metrics/metrics_corpus.py

Lines changed: 9 additions & 1 deletion

```diff
@@ -30,6 +30,7 @@
 import sacrebleu
 import sklearn.metrics
 
+from lighteval.logging.hierarchical_logger import hlog_warn
 from lighteval.metrics.sample_preparator import (
     GenerativeCorpusMetricInput,
     LogprobCorpusMetricInput,
@@ -103,7 +104,14 @@ def __init__(self, metric_type: str):
     def compute(self, items: list[GenerativeCorpusMetricInput]) -> float:
         """Computes the metric score over all the corpus generated items, by using the sacrebleu implementation."""
         golds = [i.golds for i in items]
-        preds = [as_list(i.preds) for i in items]
+        preds = []
+        for i in items:
+            pred = as_list(i.preds)
+            if len(pred) > 1:
+                hlog_warn(
+                    f"Multiple predictions present, keeping only the first prediction (when computing sacrebleu.{self.metric.__name__})."
+                )
+            preds.append(pred[0])
         return float(self.metric(hypotheses=preds, references=golds).score)
```
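The change flattens `preds` from a list of lists of strings into a flat list of strings: sacrebleu's corpus-level metrics expect one hypothesis string per segment, and handing them a nested list is what triggered the TypeError this commit fixes. A minimal standalone illustration of the failure mode (assuming sacrebleu 2.x; the example sentences are made up):

```python
import sacrebleu

hyps = ["the cat sat on the mat"]   # one hypothesis string per segment
refs = [["the cat is on the mat"]]  # one reference stream, aligned with hyps

print(sacrebleu.corpus_bleu(hyps, refs).score)  # works: flat list of strings

# A nested hypothesis list -- what `as_list(i.preds)` could produce per item --
# fails sacrebleu's type check, since each hypothesis must be a plain string:
sacrebleu.corpus_bleu([["the cat sat on the mat"]], refs)  # raises a TypeError
```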
