Implement BLEU score evaluation for NLP tests #6537
@@ -0,0 +1,17 @@

```csharp
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

namespace System.Runtime.CompilerServices;

[AttributeUsage(AttributeTargets.Class | AttributeTargets.Struct | AttributeTargets.Interface)]
internal sealed class CollectionBuilderAttribute : Attribute
{
    public CollectionBuilderAttribute(Type builderType, string methodName)
    {
        BuilderType = builderType;
        MethodName = methodName;
    }

    public Type BuilderType { get; }
    public string MethodName { get; }
}
```
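For context (background on the C# feature, not code from this PR): `CollectionBuilderAttribute` is what the compiler looks for when a collection expression targets a custom collection type, and this internal copy allows the attribute to be injected into target frameworks that don't ship it. Below is a minimal sketch of how a consuming type might opt in; the `LineBuffer` and `LineBufferBuilder` names are illustrative only.

```csharp
using System;
using System.Collections;
using System.Collections.Generic;
using System.Runtime.CompilerServices;

// The attribute tells the compiler to call LineBufferBuilder.Create(ReadOnlySpan<string>)
// when a collection expression such as `LineBuffer lines = ["a", "b"];` targets LineBuffer.
[CollectionBuilder(typeof(LineBufferBuilder), nameof(LineBufferBuilder.Create))]
public sealed class LineBuffer(string[] lines) : IEnumerable<string>
{
    public IEnumerator<string> GetEnumerator() => ((IEnumerable<string>)lines).GetEnumerator();

    IEnumerator IEnumerable.GetEnumerator() => GetEnumerator();
}

public static class LineBufferBuilder
{
    // The builder method must be static, accept a ReadOnlySpan<T>, and return the collection type.
    public static LineBuffer Create(ReadOnlySpan<string> items) => new(items.ToArray());
}
```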
@@ -0,0 +1,7 @@

To use this source in your project, add the following to your `.csproj` file:

```xml
<PropertyGroup>
  <InjectCollectionBuilderAttributesOnLegacy>true</InjectCollectionBuilderAttributesOnLegacy>
</PropertyGroup>
```
@@ -0,0 +1,96 @@

```csharp
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.Collections.Generic;
using System.Globalization;
using System.Linq;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Extensions.AI.Evaluation.NLP.Common;
using Microsoft.Extensions.AI.Evaluation.Utilities;
using Microsoft.Shared.Diagnostics;

namespace Microsoft.Extensions.AI.Evaluation.NLP;

/// <summary>
/// An <see cref="IEvaluator"/> that evaluates the quality of a response produced by an AI model by comparing
/// it to a reference response using the BLEU (Bilingual Evaluation Understudy) algorithm. It is often used
/// to evaluate the quality of machine translation or text generation tasks.
/// </summary>
/// <remarks>
/// <para>
/// The <see cref="BLEUEvaluator"/> computes the BLEU score of a response ("hypothesis") compared to a reference
/// supplied via <see cref="BLEUEvaluatorContext.References"/>. The score is returned in a <see cref="NumericMetric"/>
/// with a value between 0.0 and 1.0 where 0.0 represents no match at all and 1.0 indicates a perfect match.
/// By default, the score is interpreted with a pass/fail cutoff of 0.5. So a score of 0.5 or higher is
/// passing and a score below 0.5 is failing.
/// </para>
/// </remarks>
public sealed class BLEUEvaluator : IEvaluator
{
    /// <summary>
    /// Gets the <see cref="EvaluationMetric.Name"/> of the <see cref="NumericMetric"/> returned by
    /// <see cref="BLEUEvaluator"/>.
    /// </summary>
    public static string BLEUMetricName => "BLEU";

    /// <inheritdoc/>
    public IReadOnlyCollection<string> EvaluationMetricNames { get; } = [BLEUMetricName];

    /// <inheritdoc/>
    public ValueTask<EvaluationResult> EvaluateAsync(
        IEnumerable<ChatMessage> messages,
        ChatResponse modelResponse,
        ChatConfiguration? chatConfiguration = null,
        IEnumerable<EvaluationContext>? additionalContext = null,
        CancellationToken cancellationToken = default)
    {
        _ = Throw.IfNull(modelResponse);

        var metric = new NumericMetric(BLEUMetricName);
        var result = new EvaluationResult(metric);

        if (string.IsNullOrWhiteSpace(modelResponse.Text))
        {
            metric.AddDiagnostics(
                EvaluationDiagnostic.Error($"The {nameof(modelResponse)} supplied for evaluation was null or empty."));

            return new ValueTask<EvaluationResult>(result);
        }

        if (additionalContext?.OfType<BLEUEvaluatorContext>().FirstOrDefault()
                is not BLEUEvaluatorContext context)
        {
            metric.AddDiagnostics(
                EvaluationDiagnostic.Error(
                    $"A value of type '{nameof(BLEUEvaluatorContext)}' was not found in the '{nameof(additionalContext)}' collection."));

            return new ValueTask<EvaluationResult>(result);
        }

        if (context.References.Count is 0)
        {
            metric.AddDiagnostics(
                EvaluationDiagnostic.Error(
                    $"Supplied '{nameof(BLEUEvaluatorContext)}' did not contain any '{nameof(BLEUEvaluatorContext.References)}'."));

            return new ValueTask<EvaluationResult>(result);
        }

        var (score, duration) = TimingHelper.ExecuteWithTiming(() =>
        {
            var references = context.References.Select(reference => SimpleWordTokenizer.WordTokenize(reference));
            var hypothesis = SimpleWordTokenizer.WordTokenize(modelResponse.Text);
            return BLEUAlgorithm.SentenceBLEU(references, hypothesis, BLEUAlgorithm.DefaultBLEUWeights, SmoothingFunction.Method4);
        });

        metric.Value = score;
        string durationText = $"{duration.TotalSeconds.ToString("F2", CultureInfo.InvariantCulture)} s";
        metric.AddOrUpdateMetadata(name: "evaluation-duration", value: durationText);
        metric.AddOrUpdateContext(context);
        metric.Interpretation = NLPScoreInterpretation.Interpret(metric);

        return new ValueTask<EvaluationResult>(result);
    }
}
```
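For reference, BLEU combines modified n-gram precisions (standard BLEU uses up to 4-grams) with a brevity penalty that discounts hypotheses shorter than the references. The sketch below shows how the new evaluator might be invoked; it is not part of the PR, the sample strings are invented, and the result-retrieval call (`Get<NumericMetric>`) reflects my reading of the `EvaluationResult` API rather than anything shown in this diff.

```csharp
using System;
using System.Threading.Tasks;
using Microsoft.Extensions.AI;
using Microsoft.Extensions.AI.Evaluation;
using Microsoft.Extensions.AI.Evaluation.NLP;

public static class BLEUEvaluatorSample
{
    public static async Task RunAsync()
    {
        // The hypothesis to score, e.g. text produced by the model under test (invented sample).
        var modelResponse = new ChatResponse(new ChatMessage(ChatRole.Assistant, "The cat sat on the mat."));

        // One or more reference responses to compare against (invented samples).
        var context = new BLEUEvaluatorContext(
            "The cat is sitting on the mat.",
            "A cat sat on the mat.");

        IEvaluator evaluator = new BLEUEvaluator();
        EvaluationResult result = await evaluator.EvaluateAsync(
            messages: [],                  // no prior conversation is needed for BLEU
            modelResponse: modelResponse,
            additionalContext: [context]); // the context carries the references

        // Assumes EvaluationResult exposes Get<T>(metricName). The score is in [0.0, 1.0];
        // with the default interpretation, a value of 0.5 or higher is considered passing.
        NumericMetric bleu = result.Get<NumericMetric>(BLEUEvaluator.BLEUMetricName);
        Console.WriteLine($"BLEU: {bleu.Value}");
    }
}
```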
@@ -0,0 +1,62 @@

```csharp
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

#pragma warning disable S3604
// S3604: Member initializer values should not be redundant.
// We disable this warning because it is a false positive arising from the analyzer's lack of support for C#'s primary
// constructor syntax.

using System.Collections.Generic;
using System.Linq;

namespace Microsoft.Extensions.AI.Evaluation.NLP;

/// <summary>
/// Contextual information that the <see cref="BLEUEvaluator"/> uses to compute the BLEU score for a response.
/// </summary>
/// <remarks>
/// <see cref="BLEUEvaluator"/> measures the BLEU score of a response compared to a reference. BLEU (Bilingual Evaluation Understudy)
/// is a metric used to evaluate the quality of machine-generated text.
/// </remarks>
public sealed class BLEUEvaluatorContext : EvaluationContext
{
    /// <summary>
    /// Gets the unique <see cref="EvaluationContext.Name"/> that is used for
    /// <see cref="BLEUEvaluatorContext"/>.
    /// </summary>
    public static string BLEUContextName => "BLEU Context";

    /// <summary>
    /// Gets the reference responses against which the provided model response will be scored.
    /// </summary>
    /// <remarks>
    /// The <see cref="BLEUEvaluator"/> measures the degree to which the response being evaluated is similar to
    /// the response supplied via <see cref="References"/>. The metric will be reported as a BLEU score.
    /// </remarks>
    public IReadOnlyList<string> References { get; }

    /// <summary>
    /// Initializes a new instance of the <see cref="BLEUEvaluatorContext"/> class.
    /// </summary>
    /// <param name="references">
    /// The reference responses against which the response that is being evaluated is compared.
    /// </param>
    public BLEUEvaluatorContext(params string[] references)
        : this(references as IEnumerable<string>)
    {
    }

    /// <summary>
    /// Initializes a new instance of the <see cref="BLEUEvaluatorContext"/> class.
    /// </summary>
    /// <param name="references">
    /// The reference responses against which the response that is being evaluated is compared.
    /// </param>
    public BLEUEvaluatorContext(IEnumerable<string> references)
        : base(
            name: BLEUContextName,
            contents: [.. references.Select(c => new TextContent(c))])
    {
        References = [.. references];
    }
}
```

Review comment on the `IEnumerable<string>` constructor:

> This is going to enumerate references twice, once for populating References and once for populating the contents arg passed to the base. It'd be nice to avoid that, in case references isn't a concrete collection type but instead something lazy and more expensive. (It can be addressed in a follow-up.)
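To illustrate the reviewer's point, one possible follow-up (a sketch, not code from this PR) is to flip the constructor delegation so the incoming sequence is materialized exactly once and the resulting array is reused for both the base-class contents and `References`:

```csharp
using System.Collections.Generic;
using System.Linq;

namespace Microsoft.Extensions.AI.Evaluation.NLP;

public sealed class BLEUEvaluatorContext : EvaluationContext
{
    public static string BLEUContextName => "BLEU Context";

    public IReadOnlyList<string> References { get; }

    // Materialize the (possibly lazy) sequence exactly once, then delegate to the
    // array-based constructor so the same concrete array backs both uses below.
    public BLEUEvaluatorContext(IEnumerable<string> references)
        : this(references.ToArray())
    {
    }

    public BLEUEvaluatorContext(params string[] references)
        : base(
            name: BLEUContextName,
            contents: [.. references.Select(c => new TextContent(c))])
    {
        // The array is already concrete, so the Select above does not re-run an expensive source.
        // Note this stores the caller-supplied array directly; a defensive copy could be added
        // if callers mutating the array is a concern.
        References = references;
    }
}
```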
Review comment:

> I realize this is just copying others, but, @joperezr, why are we still referencing netcoreapp3.1 anywhere in this repo?

Reply:

> This could be a left-over artifact from the R9 repo.