diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index 05e9fd442..4fdc4293d 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -926,7 +926,7 @@ def compute(self, sample_ids: list[str], responses: list, formatted_docs: list[D """ questions = [formatted_doc.specific["question"] for formatted_doc in formatted_docs] options = [formatted_doc.choices for formatted_doc in formatted_docs] - golds = [formatted_doc.choices[formatted_doc.gold_index[0]] for formatted_doc in formatted_docs] + golds = [formatted_doc.get_golds()[0] for formatted_doc in formatted_docs] predictions = [response[0].result[0] for response in responses] scores, messages, judgements = self.judge.evaluate_answer_batch(questions, predictions, options, golds)