4 changes: 2 additions & 2 deletions src/Microsoft.ML.Data/Data/Conversion.cs
@@ -1170,7 +1170,7 @@ private bool IsStdMissing(ref ReadOnlySpan<char> span)
public bool TryParseKey(ref TX src, U8 min, U8 max, out U8 dst)
{
var span = src.Span;
- Contracts.Check(!IsStdMissing(ref span), "Missing text value cannot be converted to unsigned integer type.");
+ Contracts.Check(span.IsEmpty || !IsStdMissing(ref span), "Missing text value cannot be converted to unsigned integer type.");
@TomFinley (Contributor) commented on Oct 12, 2018, referring to "span.IsEmpty":

Heh heh. Whoops! #Resolved
Contracts.Assert(min <= max);

// This simply ensures we don't have min == 0 and max == U8.MaxValue. This is illegal since
@@ -1530,7 +1530,7 @@ public bool TryParse(ref TX src, out BL dst)
{
var span = src.Span;

- Contracts.Check(!IsStdMissing(ref span), "Missing text values cannot be converted to bool value.");
+ Contracts.Check(span.IsEmpty || !IsStdMissing(ref span), "Missing text value cannot be converted to bool type.");

char ch;
switch (src.Length)
2 changes: 1 addition & 1 deletion src/Microsoft.ML.Data/Transforms/ConcatTransform.cs
@@ -384,7 +384,7 @@ public static IDataTransform Create(IHostEnvironment env, TaggedArguments args,
env.CheckUserArg(Utils.Size(args.Column[i].Source) > 0, nameof(args.Column));

var cols = args.Column
- .Select(c => new ColumnInfo(c.Name, c.Source.Select(kvp => (kvp.Value, kvp.Key))))
+ .Select(c => new ColumnInfo(c.Name, c.Source.Select(kvp => (kvp.Value, kvp.Key != "" ? kvp.Key : null))))
.ToArray();
var transformer = new ConcatTransform(env, cols);
return transformer.MakeDataTransform(input);
14 changes: 11 additions & 3 deletions src/Microsoft.ML.Data/Transforms/TermEstimator.cs
@@ -21,6 +21,9 @@ public static class Defaults

private readonly IHost _host;
private readonly TermTransform.ColumnInfo[] _columns;
+ private readonly string _file;
+ private readonly string _termsColumn;
+ private readonly IComponentFactory<IMultiStreamSource, IDataLoader> _loaderFactory;

/// <summary>
/// Convenience constructor for public facing API.
@@ -32,18 +35,23 @@ public static class Defaults
/// <param name="sort">How items should be ordered when vectorized. By default, they will be in the order encountered.
/// If by value items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a').</param>
public TermEstimator(IHostEnvironment env, string inputColumn, string outputColumn = null, int maxNumTerms = Defaults.MaxNumTerms, TermTransform.SortOrder sort = Defaults.Sort) :
- this(env, new TermTransform.ColumnInfo(inputColumn, outputColumn ?? inputColumn, maxNumTerms, sort))
+ this(env, new[] { new TermTransform.ColumnInfo(inputColumn, outputColumn ?? inputColumn, maxNumTerms, sort) })
{
}

- public TermEstimator(IHostEnvironment env, params TermTransform.ColumnInfo[] columns)
+ public TermEstimator(IHostEnvironment env, TermTransform.ColumnInfo[] columns,
+     string file = null, string termsColumn = null,
+     IComponentFactory<IMultiStreamSource, IDataLoader> loaderFactory = null)
{
Contracts.CheckValue(env, nameof(env));
_host = env.Register(nameof(TermEstimator));
_columns = columns;
+ _file = file;
+ _termsColumn = termsColumn;
+ _loaderFactory = loaderFactory;
}

- public TermTransform Fit(IDataView input) => new TermTransform(_host, input, _columns);
+ public TermTransform Fit(IDataView input) => new TermTransform(_host, input, _columns, _file, _termsColumn, _loaderFactory);

public SchemaShape GetOutputSchema(SchemaShape inputSchema)
{
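Aside on the TermEstimator change above: a minimal, hypothetical usage sketch of the widened constructor. It relies only on the signatures visible in this hunk; env, data, the "terms.txt" path, the "Term" column name, and the assumption that the remaining TermTransform.ColumnInfo parameters have defaults are illustrative, not part of the change. Note that params was dropped, so columns must now be passed as an explicit array.

// Sketch: build a TermEstimator whose term dictionary is read from a file
// instead of being learned from the training data. 'env' (IHostEnvironment)
// and 'data' (IDataView) are assumed to be in scope.
var estimator = new TermEstimator(env,
    new[] { new TermTransform.ColumnInfo("Label", "Label") },  // remaining ColumnInfo parameters assumed to default
    file: "terms.txt",        // illustrative path to a text file of terms
    termsColumn: "Term");     // illustrative name of the terms column in that file
TermTransform termTransform = estimator.Fit(data);  // Fit returns TermTransform, per the hunk above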
6 changes: 3 additions & 3 deletions src/Microsoft.ML.Data/Transforms/TermTransform.cs
@@ -268,7 +268,7 @@ public TermTransform(IHostEnvironment env, IDataView input,
this(env, input, columns, null, null, null)
{ }

- private TermTransform(IHostEnvironment env, IDataView input,
+ internal TermTransform(IHostEnvironment env, IDataView input,
ColumnInfo[] columns,
string file = null, string termsColumn = null,
IComponentFactory<IMultiStreamSource, IDataLoader> loaderFactory = null)
@@ -314,13 +314,13 @@ public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataV
if (!Enum.IsDefined(typeof(SortOrder), sortOrder))
throw env.ExceptUserArg(nameof(args.Sort), "Undefined sorting criteria '{0}' detected for column '{1}'", sortOrder, item.Name);

- cols[i] = new ColumnInfo(item.Source,
+ cols[i] = new ColumnInfo(item.Source ?? item.Name,
item.Name,
item.MaxNumTerms ?? args.MaxNumTerms,
sortOrder,
item.Term,
item.TextKeyValues ?? args.TextKeyValues);
- cols[i].Terms = item.Terms;
+ cols[i].Terms = item.Terms ?? args.Terms;
};
}
return new TermTransform(env, input, cols, args.DataFile, args.TermsColumn, args.Loader).MakeDataTransform(input);
9 changes: 6 additions & 3 deletions src/Microsoft.ML.ResultProcessor/ResultProcessor.cs
@@ -1063,10 +1063,10 @@ private static Experiment CreateVisualizationExperiment(ExperimentItemResult res
var experiment = new ML.Runtime.ExperimentVisualization.Experiment
{
Key = index.ToString(),
- CompareGroup = string.IsNullOrEmpty(result.CustomizedTag) ? result.Trainer.Kind : result.CustomizedTag,
+ CompareGroup = string.IsNullOrEmpty(result.CustomizedTag) ? result.TrainerKind : result.CustomizedTag,
Trainer = new ML.Runtime.ExperimentVisualization.Trainer
{
- Name = result.Trainer.Kind,
+ Name = result.TrainerKind,
ParameterSets = new List<ML.Runtime.ExperimentVisualization.Item>()
},
DataSet = new ML.Runtime.ExperimentVisualization.DataSet { File = result.Datafile },
@@ -1152,7 +1152,10 @@ private static object Load(Stream stream)

public static int Main(string[] args)
{
- return Main(new ConsoleEnvironment(42), args);
+ string currentDirectory = Path.GetDirectoryName(typeof(ResultProcessor).Module.FullyQualifiedName);
+ using (var env = new ConsoleEnvironment(42))
+ using (AssemblyLoadingUtils.CreateAssemblyRegistrar(env, currentDirectory))
+     return Main(env, args);
}

public static int Main(IHostEnvironment env, string[] args)
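The same registration pattern recurs in ConfigRunner.Finish later in this diff. A condensed sketch of the shape both call sites share, using only APIs that appear in this change (the wrapper method name is illustrative):

// Resolve the directory that holds the executing module, then keep an assembly
// registrar alive (hence the stacked 'using' blocks) for as long as the
// environment is used, so components in that directory can be discovered.
private static int RunWithRegisteredAssemblies(string[] args)
{
    string dir = Path.GetDirectoryName(typeof(ResultProcessor).Module.FullyQualifiedName);
    using (var env = new ConsoleEnvironment(42))
    using (AssemblyLoadingUtils.CreateAssemblyRegistrar(env, dir))
        return Main(env, args);
}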
@@ -373,7 +373,7 @@ protected override ParameterMixingCalibratedPredictor CreatePredictor()
CurrentWeights.GetItemOrDefault(0, ref bias);
CurrentWeights.CopyTo(ref weights, 1, CurrentWeights.Length - 1);
return new ParameterMixingCalibratedPredictor(Host,
- new LinearBinaryPredictor(Host, ref weights, bias),
+ new LinearBinaryPredictor(Host, ref weights, bias, _stats),
new PlattCalibrator(Host, -1, 0));
}

2 changes: 1 addition & 1 deletion src/Microsoft.ML.Sweeper/Algorithms/NelderMead.cs
@@ -26,7 +26,7 @@ public sealed class Arguments
public IComponentFactory<IValueGenerator>[] SweptParameters;

[Argument(ArgumentType.LastOccurenceWins, HelpText = "The sweeper used to get the initial results.", ShortName = "init", SignatureType = typeof(SignatureSweeperFromParameterList))]
- public IComponentFactory<IValueGenerator[], ISweeper> FirstBatchSweeper;
+ public IComponentFactory<IValueGenerator[], ISweeper> FirstBatchSweeper = ComponentFactoryUtils.CreateFromFunction<IValueGenerator[], ISweeper>((host, array) => new UniformRandomSweeper(host, new SweeperBase.ArgumentsBase(), array));

[Argument(ArgumentType.AtMostOnce, HelpText = "Seed for the random number generator for the first batch sweeper", ShortName = "seed")]
public int RandomSeed;
5 changes: 4 additions & 1 deletion src/Microsoft.ML.Sweeper/ConfigRunner.cs
@@ -107,7 +107,10 @@ public virtual void Finish()
if (Exe == null || Exe.EndsWith("maml", StringComparison.OrdinalIgnoreCase) ||
Exe.EndsWith("maml.exe", StringComparison.OrdinalIgnoreCase))
{
+ string currentDirectory = Path.GetDirectoryName(typeof(ExeConfigRunnerBase).Module.FullyQualifiedName);

using (var ch = Host.Start("Finish"))
+ using (AssemblyLoadingUtils.CreateAssemblyRegistrar(Host, currentDirectory))
{
var runs = RunNums.ToArray();
var args = Utils.BuildArray(RunNums.Count + 2,
@@ -120,7 +123,7 @@ public virtual void Finish()
return string.Format("{{{0}}}", GetFilePath(runs[i], "out"));
});

- ResultProcessorInternal.ResultProcessor.Main (args);
+ ResultProcessorInternal.ResultProcessor.Main(args);

ch.Info(@"The summary of the run results has been saved to the file {0}\{1}.summary.txt", OutputFolder, Prefix);
}
4 changes: 4 additions & 0 deletions src/Microsoft.ML.Sweeper/Microsoft.ML.Sweeper.csproj
@@ -16,4 +16,8 @@

</ItemGroup>

+ <ItemGroup>
+   <Compile Include="..\Common\AssemblyLoadingUtils.cs" Link="Common\AssemblyLoadingUtils.cs" />
+ </ItemGroup>

</Project>
18 changes: 11 additions & 7 deletions src/Microsoft.ML.Transforms/CategoricalTransform.cs
@@ -149,18 +149,20 @@ public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataV
column.MaxNumTerms ?? args.MaxNumTerms,
column.Sort ?? args.Sort,
column.Term ?? args.Term);
- col.SetTerms(column.Terms);
+ col.SetTerms(column.Terms ?? args.Terms);
columns.Add(col);
}
- return new CategoricalEstimator(env, columns.ToArray()).Fit(input).Transform(input) as IDataTransform;
+ return new CategoricalEstimator(env, columns.ToArray(), args.DataFile, args.TermsColumn, args.Loader).Fit(input).Transform(input) as IDataTransform;
}

private readonly TransformerChain<ITransformer> _transformer;

public CategoricalTransform(TermEstimator term, IEstimator<ITransformer> toVector, IDataView input)
{
- var chain = term.Append(toVector);
- _transformer = chain.Fit(input);
+ if (toVector != null)
+     _transformer = term.Append(toVector).Fit(input);
+ else
+     _transformer = new TransformerChain<ITransformer>(term.Fit(input));
}

public Schema GetOutputSchema(Schema inputSchema) => _transformer.GetOutputSchema(inputSchema);
@@ -212,15 +214,17 @@ internal void SetTerms(string terms)
/// <param name="outputKind">The type of output expected.</param>
public CategoricalEstimator(IHostEnvironment env, string name,
string source = null, CategoricalTransform.OutputKind outputKind = Defaults.OutKind)
- : this(env, new ColumnInfo(source ?? name, name, outputKind))
+ : this(env, new[] { new ColumnInfo(source ?? name, name, outputKind) })
{
}

- public CategoricalEstimator(IHostEnvironment env, params ColumnInfo[] columns)
+ public CategoricalEstimator(IHostEnvironment env, ColumnInfo[] columns,
A contributor commented, referring to "public":

out of curiosity can you make this constructor internal? I would prefer to not pollute our public API with these things.
Same for TermEstimator.
+     string file = null, string termsColumn = null,
+     IComponentFactory<IMultiStreamSource, IDataLoader> loaderFactory = null)
{
Contracts.CheckValue(env, nameof(env));
_host = env.Register(nameof(TermEstimator));
- _term = new TermEstimator(_host, columns);
+ _term = new TermEstimator(_host, columns, file, termsColumn, loaderFactory);
var binaryCols = new List<(string input, string output)>();
var cols = new List<(string input, string output, bool bag)>();
for (int i = 0; i < columns.Length; i++)
@@ -0,0 +1 @@
Saving predictor summary
@@ -0,0 +1,15 @@
maml.exe Train tr=MultiClassLogisticRegression{maxiter=100 t=- stat=+} loader=TextLoader{col=Label:TX:4 col=Features:R4:0-3 sep=,} data=%Data% out=%Output% seed=1 xf=Term{col=Label}
Automatically adding a MinMax normalization transform, use 'norm=Warn' or 'norm=No' to turn this behavior off.
Beginning optimization
num vars: 15
improvement criterion: Mean Improvement
L1 regularization selected 11 of 15 weights.
Model trained with 150 training examples.
Residual Deviance: 132.0122
Null Deviance: 329.5837
AIC: 154.0122
Not training a calibrator because it is not needed.
Physical memory usage(MB): %Number%
Virtual memory usage(MB): %Number%
%DateTime% Time elapsed(s): %Number%

@@ -0,0 +1,18 @@
MulticlassLogisticRegression bias and non-zero weights
Iris-setosa+(Bias) 2.265129
Iris-versicolor+(Bias) 0.7695086
Iris-virginica+(Bias) -3.034663
Iris-setosa+f3 -3.180634
Iris-setosa+f2 -2.88663
Iris-setosa+f1 0.5392878
Iris-setosa+f0 -0.03958065
Iris-versicolor+f1 -0.7073272
Iris-virginica+f3 3.158146
Iris-virginica+f2 1.907791
Iris-virginica+f0 0.01793481

*** MODEL STATISTICS SUMMARY ***
Count of training examples: 150
Residual Deviance: 132.0122
Null Deviance: 329.5837
AIC: 154.0122
@@ -0,0 +1 @@
Saving predictor summary
@@ -0,0 +1,15 @@
maml.exe Train feat=Num lab=Lab tr=lr{t=- stat=+} loader=text{header+ sep=comma col=Lab:14 col=Num:0,2,4,10-12} data=%Data% out=%Output%
Automatically adding a MinMax normalization transform, use 'norm=Warn' or 'norm=No' to turn this behavior off.
Beginning optimization
num vars: 7
improvement criterion: Mean Improvement
L1 regularization selected 7 of 7 weights.
Model trained with 32561 training examples.
Residual Deviance: 26705.74 (on 32554 degrees of freedom)
Null Deviance: 35948.08 (on 32560 degrees of freedom)
AIC: 26719.74
Not training a calibrator because it is not needed.
Physical memory usage(MB): %Number%
Virtual memory usage(MB): %Number%
%DateTime% Time elapsed(s): %Number%

@@ -0,0 +1,15 @@
Linear Binary Classification Predictor non-zero weights

(Bias) -8.228298
capital-gain 18.58347
education-num 5.066041
hours-per-week 3.946534
age 3.86064
capital-loss 2.81616
fnlwgt 0.7489593

*** MODEL STATISTICS SUMMARY ***
Count of training examples: 32561
Residual Deviance: 26705.74
Null Deviance: 35948.08
AIC: 26719.74
@@ -0,0 +1,8 @@
maml.exe CV tr=FastRank{nl=5 mil=5 lr=0.25 iter=20} threads=- dout=%Output% loader=Text{col=Name:TX:0 col=Label:Num:1 col=Features:Num:~} data=%Data% seed=1 xf=Expr{col=Name expr={x=>right(x, 1)}}
Physical memory usage(MB): %Number%
Virtual memory usage(MB): %Number%
%DateTime% Time elapsed(s): %Number%

Could not find file '%Data%
Error log has been saved to '%Temp%\%ErrorLog%'. Please refer to https://aka.ms/MLNetIssue if you need assistance.
--- Progress log ---
Empty file.
@@ -0,0 +1,83 @@
maml.exe CV tr=FastRank{nl=5 mil=5 lr=0.25 iter=20} threads=- dout=%Output% data=%Data% seed=1
Not adding a normalizer.
Making per-feature arrays
Changing data from row-wise to column-wise
Warning: Skipped 8 instances with missing features during training
Processed 329 instances
Binning and forming Feature objects
Reserved memory for tree learner: 3852 bytes
Starting to train ...
Not training a calibrator because it is not needed.
Not adding a normalizer.
Making per-feature arrays
Changing data from row-wise to column-wise
Warning: Skipped 8 instances with missing features during training
Processed 354 instances
Binning and forming Feature objects
Reserved memory for tree learner: 3816 bytes
Starting to train ...
Not training a calibrator because it is not needed.
TEST POSITIVE RATIO: 0.3702 (134.0/(134.0+228.0))
Confusion table
||======================
PREDICTED || positive | negative | Recall
TRUTH ||======================
positive || 131 | 3 | 0.9776
negative || 10 | 218 | 0.9561
||======================
Precision || 0.9291 | 0.9864 |
OVERALL 0/1 ACCURACY: 0.964088
LOG LOSS/instance: 0.211336
Test-set entropy (prior Log-Loss/instance): 0.950799
LOG-LOSS REDUCTION (RIG): 77.772765
AUC: 0.983225
TEST POSITIVE RATIO: 0.3175 (107.0/(107.0+230.0))
Confusion table
||======================
PREDICTED || positive | negative | Recall
TRUTH ||======================
positive || 98 | 9 | 0.9159
negative || 5 | 225 | 0.9783
||======================
Precision || 0.9515 | 0.9615 |
OVERALL 0/1 ACCURACY: 0.958457
LOG LOSS/instance: 0.137700
Test-set entropy (prior Log-Loss/instance): 0.901650
LOG-LOSS REDUCTION (RIG): 84.727964
AUC: 0.993681

OVERALL RESULTS
---------------------------------------
AUC: 0.988453 (0.0052)
Accuracy: 0.961273 (0.0028)
Positive precision: 0.940267 (0.0112)
Positive recall: 0.946750 (0.0309)
Negative precision: 0.973982 (0.0124)
Negative recall: 0.967201 (0.0111)
Log-loss: 0.174518 (0.0368)
Log-loss reduction: 81.250364 (3.4776)
F1 Score: 0.943030 (0.0097)
AUPRC: 0.962986 (0.0211)

---------------------------------------
Physical memory usage(MB): %Number%
Virtual memory usage(MB): %Number%
%DateTime% Time elapsed(s): %Number%

--- Progress log ---
[1] 'FastTree data preparation' started.
[1] 'FastTree data preparation' finished in %Time%.
[2] 'FastTree in-memory bins initialization' started.
[2] 'FastTree in-memory bins initialization' finished in %Time%.
[3] 'FastTree feature conversion' started.
[3] 'FastTree feature conversion' finished in %Time%.
[4] 'FastTree training' started.
[4] 'FastTree training' finished in %Time%.
[5] 'FastTree data preparation #2' started.
[5] 'FastTree data preparation #2' finished in %Time%.
[6] 'FastTree in-memory bins initialization #2' started.
[6] 'FastTree in-memory bins initialization #2' finished in %Time%.
[7] 'FastTree feature conversion #2' started.
[7] 'FastTree feature conversion #2' finished in %Time%.
[8] 'FastTree training #2' started.
[8] 'FastTree training #2' finished in %Time%.
@@ -0,0 +1,4 @@
FastRank
AUC Accuracy Positive precision Positive recall Negative precision Negative recall Log-loss Log-loss reduction F1 Score AUPRC /lr /nl /mil /iter Learner Name Train Dataset Test Dataset Results File Run Time Physical Memory Virtual Memory Command Line Settings
0.988453 0.961273 0.940267 0.94675 0.973982 0.967201 0.174518 81.25037 0.94303 0.962986 0.25 5 5 20 FastRank %Data% %Output% 99 0 0 maml.exe CV tr=FastRank{nl=5 mil=5 lr=0.25 iter=20} threads=- dout=%Output% data=%Data% seed=1 /lr:0.25;/nl:5;/mil:5;/iter:20
