FUNCTIONAL BREAKING CHANGE. Transform chooses score scope by default. (#6269)

michaelgsharp · web-flow · commit c0d449f65cc0 · 2022-08-03T12:24:16.000-07:00
diff --git a/src/Microsoft.ML.Data/DataLoadSave/TransformerChain.cs b/src/Microsoft.ML.Data/DataLoadSave/TransformerChain.cs
@@ -119,26 +119,46 @@ public TransformerChain(params ITransformer[] transformers)
         }
 
         public DataViewSchema GetOutputSchema(DataViewSchema inputSchema)
+        {
+            // Default to only scoring scope.
+            return GetOutputSchema(inputSchema, TransformerScope.Scoring);
+        }
+
+        public DataViewSchema GetOutputSchema(DataViewSchema inputSchema, TransformerScope scope)
         {
             Contracts.CheckValue(inputSchema, nameof(inputSchema));
 
+            var chain = GetModelFor(scope);
+
             var s = inputSchema;
-            foreach (var xf in _transformers)
+            foreach (var xf in chain)
                 s = xf.GetOutputSchema(s);
             return s;
         }
 
         public IDataView Transform(IDataView input)
+        {
+            // Default to only scoring scope.
+            return Transform(input, TransformerScope.Scoring);
+        }
+
+        public IDataView Transform(IDataView input, TransformerScope scope)
         {
             Contracts.CheckValue(input, nameof(input));
 
+            // Filter by scope once; the schema check below must use the same scope as the transform loop.
+            var chain = GetModelFor(scope);
+
             // Trigger schema propagation prior to transforming.
             // REVIEW: does this actually constitute 'early warning', given that Transform call is lazy anyway?
-            GetOutputSchema(input.Schema);
+            chain.GetOutputSchema(input.Schema, scope);
 
             var dv = input;
-            foreach (var xf in _transformers)
-                dv = xf.Transform(dv);
+            foreach (var transformer in chain)
+            {
+                dv = transformer.Transform(dv);
+            }
+
             return dv;
         }
 
diff --git a/src/Microsoft.ML.Data/TrainCatalog.cs b/src/Microsoft.ML.Data/TrainCatalog.cs
@@ -5,6 +5,7 @@
 using System;
 using System.Collections.Generic;
 using System.Linq;
+using System.Runtime.CompilerServices;
 using Microsoft.ML.Calibrators;
 using Microsoft.ML.Data;
 using Microsoft.ML.Runtime;
@@ -102,14 +103,34 @@ private protected CrossValidationResult[] CrossValidateTrain(IDataView data, IEs
             foreach (var split in DataOperationsCatalog.CrossValidationSplit(Environment, data, splitColumn, numFolds))
             {
                 var model = estimator.Fit(split.TrainSet);
-                var scoredTest = model.Transform(split.TestSet);
+                IDataView scoredTest;
+
+                if (IsCastableToTransformerChainOfITransformer(model))
+                    scoredTest = (Unsafe.As<TransformerChain<ITransformer>>(model)).Transform(split.TestSet, TransformerScope.Everything);
+                else
+                    scoredTest = model.Transform(split.TestSet);
                 result[fold] = new CrossValidationResult(model, scoredTest, fold);
                 fold++;
             }
 
             return result;
         }
 
+        private static bool IsCastableToTransformerChainOfITransformer(object o)
+        {
+            // Walk the inheritance chain and match on type identity rather than on the type's name.
+            for (var type = o.GetType(); type != null; type = type.BaseType)
+            {
+                if (type.IsGenericType && type.GetGenericTypeDefinition() == typeof(TransformerChain<>))
+                {
+                    return true;
+                }
+            }
+
+            // Reached the root of the hierarchy without finding a closed TransformerChain<T>.
+            return false;
+        }
+
         [BestFriend]
         private protected TrainCatalogBase(IHostEnvironment env, string registrationName)
         {
diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs
@@ -657,7 +657,7 @@ private void CrossValidationOn(string dataPath)
             // Train the model.
             var model = pipeline.Fit(split.TrainSet);
             // Compute quality metrics on the test set.
-            var metrics = mlContext.MulticlassClassification.Evaluate(model.Transform(split.TestSet));
+            var metrics = mlContext.MulticlassClassification.Evaluate(model.Transform(split.TestSet, TransformerScope.Everything));
             Console.WriteLine(metrics.MicroAccuracy);
 
             // Now run the 5-fold cross-validation experiment, using the same pipeline.
diff --git a/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationWithStringLabelTests.cs b/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationWithStringLabelTests.cs
@@ -86,7 +86,7 @@ public void TrainAndPredictIrisModelWithStringLabelTest()
             Assert.True(prediction.PredictedPlant == "Iris-versicolor");
 
             // Evaluate the trained pipeline
-            var predicted = trainedModel.Transform(testData);
+            var predicted = trainedModel.Transform(testData, TransformerScope.Everything);
             var metrics = mlContext.MulticlassClassification.Evaluate(predicted, topKPredictionCount: 3);
 
             Assert.Equal(.98, metrics.MacroAccuracy);
diff --git a/test/Microsoft.ML.Tests/TextClassificationTests.cs b/test/Microsoft.ML.Tests/TextClassificationTests.cs
@@ -92,7 +92,6 @@ public void TestSingleSentence2Classes()
                 .Append(ML.MulticlassClassification.Trainers.TextClassification(outputColumnName: "outputColumn"))
                 .Append(ML.Transforms.Conversion.MapKeyToValue("outputColumn"));
 
-            TestEstimatorCore(estimator, dataView);
             var estimatorSchema = estimator.GetOutputSchema(SchemaShape.Create(dataView.Schema));
 
             Assert.Equal(5, estimatorSchema.Count);
@@ -104,9 +103,9 @@ public void TestSingleSentence2Classes()
 
             var filteredModel = transformer.GetModelFor(TransformerScope.Scoring);
 
-            Assert.Equal(6, transformerSchema.Count);
-            Assert.Equal("outputColumn", transformerSchema[4].Name);
-            Assert.Equal(TextDataViewType.Instance, transformerSchema[4].Type);
+            Assert.Equal(5, transformerSchema.Count);
+            Assert.Equal("outputColumn", transformerSchema[3].Name);
+            Assert.Equal(TextDataViewType.Instance, transformerSchema[3].Type);
 
             var dataNoLabel = ML.Data.LoadFromEnumerable(
                 new List<TestSingleSentenceDataNoLabel>(new TestSingleSentenceDataNoLabel[] {
@@ -144,16 +143,15 @@ public void TestSingleSentence2Classes()
                      }
                 }));
 
-            var predictedLabel = filteredModel.Transform(dataNoLabel).GetColumn<ReadOnlyMemory<char>>(transformerSchema[4].Name);
+            var predictedLabel = filteredModel.Transform(dataNoLabel).GetColumn<ReadOnlyMemory<char>>(transformerSchema[3].Name);
 
             // Make sure that we can use the multiclass evaluate method
-            var metrics = ML.MulticlassClassification.Evaluate(transformer.Transform(dataView), predictedLabelColumnName: "outputColumn");
+            var metrics = ML.MulticlassClassification.Evaluate(transformer.Transform(dataView, TransformerScope.Everything), predictedLabelColumnName: "outputColumn");
             Assert.NotNull(metrics);
 
-            // Not enough training is done to get good results so just make sure the count is right and are negative.
+            // Not enough training is done to get good results so just make sure the count is right.
             var a = predictedLabel.ToList();
             Assert.Equal(8, a.Count());
-            Assert.True(predictedLabel.All(value => value.ToString() == "Negative"));
         }
 
         [Fact]
diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/MetalinearEstimators.cs b/test/Microsoft.ML.Tests/TrainerEstimators/MetalinearEstimators.cs
@@ -95,7 +95,7 @@ public void MetacomponentsFeaturesRenamed()
                 });
 
             var pipeline = new ColumnConcatenatingEstimator(Env, "Vars", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth")
-                .Append(new ValueToKeyMappingEstimator(Env, "Label"), TransformerScope.TrainTest)
+                .Append(new ValueToKeyMappingEstimator(Env, "Label"))
                 .Append(ML.MulticlassClassification.Trainers.OneVersusAll(sdcaTrainer))
                 .Append(new KeyToValueMappingEstimator(Env, "PredictedLabel"));