From 9376fedd7708ff107d4e1516d674e4591b9726a5 Mon Sep 17 00:00:00 2001
From: Stephen Toub <stoub@microsoft.com>
Date: Wed, 4 Oct 2023 16:53:26 -0400
Subject: [PATCH 1/3] Vectorize TensorPrimitives.Sigmoid and
 TensorPrimitives.SoftMax

- Adds a SigmoidOperator that just wraps the ExpOperator.
- Vectorizes both passes of SoftMax on top of ExpOperator. The simplest way to do this was to augment the existing InvokeSpanScalarIntoSpan to take a transform operator.
- While doing so, I found some naming inconsistencies I'd previously introduced, so I did some automatic renaming to make things more consistent.
- Adds XML comments to all the internal/private surface area.
- Fleshes out some tests (and test values).
---
 .../Numerics/Tensors/TensorPrimitives.cs      |  24 +-
 .../Tensors/TensorPrimitives.netcore.cs       | 368 ++++++++++++------
 .../Tensors/TensorPrimitives.netstandard.cs   | 231 ++++++++---
 .../tests/TensorPrimitivesTests.cs            | 146 +++++--
 4 files changed, 535 insertions(+), 234 deletions(-)
diff --git a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/TensorPrimitives.cs b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/TensorPrimitives.cs
index acc311df6e4cb0..5ddec7b23519ac 100644
--- a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/TensorPrimitives.cs
+++ b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/TensorPrimitives.cs
@@ -988,17 +988,7 @@ public static void Sigmoid(ReadOnlySpan<float> x, Span<float> destination)
                 ThrowHelper.ThrowArgument_SpansMustBeNonEmpty();
             }
 
-            if (x.Length > destination.Length)
-            {
-                ThrowHelper.ThrowArgument_DestinationTooShort();
-            }
-
-            ValidateInputOutputSpanNonOverlapping(x, destination);
-
-            for (int i = 0; i < x.Length; i++)
-            {
-                destination[i] = 1f / (1f + MathF.Exp(-x[i]));
-            }
+            InvokeSpanIntoSpan<SigmoidOperator>(x, destination);
         }
 
         /// <summary>Computes the element-wise hyperbolic sine of each single-precision floating-point radian angle in the specified tensor.</summary>
@@ -1067,17 +1057,9 @@ public static void SoftMax(ReadOnlySpan<float> x, Span<float> destination)
 
             ValidateInputOutputSpanNonOverlapping(x, destination);
 
-            float expSum = 0f;
-
-            for (int i = 0; i < x.Length; i++)
-            {
-                expSum += MathF.Exp(x[i]);
-            }
+            float expSum = Aggregate<ExpOperator, AddOperator>(x);
 
-            for (int i = 0; i < x.Length; i++)
-            {
-                destination[i] = MathF.Exp(x[i]) / expSum;
-            }
+            InvokeSpanScalarIntoSpan<ExpOperator, DivideOperator>(x, expSum, destination);
         }
 
         /// <summary>Computes the element-wise difference between single-precision floating-point numbers in the specified tensors.</summary>
diff --git a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/TensorPrimitives.netcore.cs b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/TensorPrimitives.netcore.cs
index aa306db678d77f..c8a980d70107aa 100644
--- a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/TensorPrimitives.netcore.cs
+++ b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/TensorPrimitives.netcore.cs
@@ -605,6 +605,8 @@ static Vector512<float> HalfAsWidenedUInt32ToSingle_Vector512(Vector512<uint> va
 #endif
         }
 
+        /// <summary>Computes the cosine similarity between the two specified non-empty, equal-length tensors of single-precision floating-point numbers.</summary>
+        /// <remarks>Assumes arguments have already been validated to be non-empty and equal length.</remarks>
         private static float CosineSimilarityCore(ReadOnlySpan<float> x, ReadOnlySpan<float> y)
         {
             // Compute the same as:
@@ -643,7 +645,7 @@ private static float CosineSimilarityCore(ReadOnlySpan<float> x, ReadOnlySpan<fl
                     Vector512<float> xVec = Vector512.LoadUnsafe(ref xRef, (uint)(x.Length - Vector512<float>.Count));
                     Vector512<float> yVec = Vector512.LoadUnsafe(ref yRef, (uint)(x.Length - Vector512<float>.Count));
 
-                    Vector512<float> remainderMask = LoadRemainderMaskSingleVector512(x.Length - i);
+                    Vector512<float> remainderMask = CreateRemainderMaskSingleVector512(x.Length - i);
                     xVec &= remainderMask;
                     yVec &= remainderMask;
 
@@ -690,7 +692,7 @@ private static float CosineSimilarityCore(ReadOnlySpan<float> x, ReadOnlySpan<fl
                     Vector256<float> xVec = Vector256.LoadUnsafe(ref xRef, (uint)(x.Length - Vector256<float>.Count));
                     Vector256<float> yVec = Vector256.LoadUnsafe(ref yRef, (uint)(x.Length - Vector256<float>.Count));
 
-                    Vector256<float> remainderMask = LoadRemainderMaskSingleVector256(x.Length - i);
+                    Vector256<float> remainderMask = CreateRemainderMaskSingleVector256(x.Length - i);
                     xVec &= remainderMask;
                     yVec &= remainderMask;
 
@@ -736,7 +738,7 @@ private static float CosineSimilarityCore(ReadOnlySpan<float> x, ReadOnlySpan<fl
                     Vector128<float> xVec = Vector128.LoadUnsafe(ref xRef, (uint)(x.Length - Vector128<float>.Count));
                     Vector128<float> yVec = Vector128.LoadUnsafe(ref yRef, (uint)(x.Length - Vector128<float>.Count));
 
-                    Vector128<float> remainderMask = LoadRemainderMaskSingleVector128(x.Length - i);
+                    Vector128<float> remainderMask = CreateRemainderMaskSingleVector128(x.Length - i);
                     xVec &= remainderMask;
                     yVec &= remainderMask;
 
@@ -767,10 +769,16 @@ private static float CosineSimilarityCore(ReadOnlySpan<float> x, ReadOnlySpan<fl
                 (MathF.Sqrt(xSumOfSquares) * MathF.Sqrt(ySumOfSquares));
         }
 
-        private static float Aggregate<TLoad, TAggregate>(
+        /// <summary>Performs an aggregation over all elements in <paramref name="x"/> to produce a single-precision floating-point value.</summary>
+        /// <typeparam name="TTransformOperator">Specifies the transform operation that should be applied to each element loaded from <paramref name="x"/>.</typeparam>
+        /// <typeparam name="TAggregationOperator">
+        /// Specifies the aggregation binary operation that should be applied to multiple values to aggregate them into a single value.
+        /// The aggregation is applied after the transform is applied to each element.
+        /// </typeparam>
+        private static float Aggregate<TTransformOperator, TAggregationOperator>(
             ReadOnlySpan<float> x)
-            where TLoad : struct, IUnaryOperator
-            where TAggregate : struct, IAggregationOperator
+            where TTransformOperator : struct, IUnaryOperator
+            where TAggregationOperator : struct, IAggregationOperator
         {
             if (x.Length == 0)
             {
@@ -783,7 +791,7 @@ private static float Aggregate<TLoad, TAggregate>(
             if (Vector512.IsHardwareAccelerated && x.Length >= Vector512<float>.Count)
             {
                 // Load the first vector as the initial set of results
-                Vector512<float> result = TLoad.Invoke(Vector512.LoadUnsafe(ref xRef, 0));
+                Vector512<float> result = TTransformOperator.Invoke(Vector512.LoadUnsafe(ref xRef, 0));
                 int oneVectorFromEnd = x.Length - Vector512<float>.Count;
                 int i = Vector512<float>.Count;
 
@@ -791,29 +799,29 @@ private static float Aggregate<TLoad, TAggregate>(
                 // least one full vector left to process.
                 while (i <= oneVectorFromEnd)
                 {
-                    result = TAggregate.Invoke(result, TLoad.Invoke(Vector512.LoadUnsafe(ref xRef, (uint)i)));
+                    result = TAggregationOperator.Invoke(result, TTransformOperator.Invoke(Vector512.LoadUnsafe(ref xRef, (uint)i)));
                     i += Vector512<float>.Count;
                 }
 
                 // Process the last vector in the span, masking off elements already processed.
                 if (i != x.Length)
                 {
-                    result = TAggregate.Invoke(result,
+                    result = TAggregationOperator.Invoke(result,
                         Vector512.ConditionalSelect(
-                            Vector512.Equals(LoadRemainderMaskSingleVector512(x.Length - i), Vector512<float>.Zero),
-                            Vector512.Create(TAggregate.IdentityValue),
-                            TLoad.Invoke(Vector512.LoadUnsafe(ref xRef, (uint)(x.Length - Vector512<float>.Count)))));
+                            Vector512.Equals(CreateRemainderMaskSingleVector512(x.Length - i), Vector512<float>.Zero),
+                            Vector512.Create(TAggregationOperator.IdentityValue),
+                            TTransformOperator.Invoke(Vector512.LoadUnsafe(ref xRef, (uint)(x.Length - Vector512<float>.Count)))));
                 }
 
                 // Aggregate the lanes in the vector back into the scalar result
-                return TAggregate.Invoke(result);
+                return TAggregationOperator.Invoke(result);
             }
 #endif
 
             if (Vector256.IsHardwareAccelerated && x.Length >= Vector256<float>.Count)
             {
                 // Load the first vector as the initial set of results
-                Vector256<float> result = TLoad.Invoke(Vector256.LoadUnsafe(ref xRef, 0));
+                Vector256<float> result = TTransformOperator.Invoke(Vector256.LoadUnsafe(ref xRef, 0));
                 int oneVectorFromEnd = x.Length - Vector256<float>.Count;
                 int i = Vector256<float>.Count;
 
@@ -821,28 +829,28 @@ private static float Aggregate<TLoad, TAggregate>(
                 // least one full vector left to process.
                 while (i <= oneVectorFromEnd)
                 {
-                    result = TAggregate.Invoke(result, TLoad.Invoke(Vector256.LoadUnsafe(ref xRef, (uint)i)));
+                    result = TAggregationOperator.Invoke(result, TTransformOperator.Invoke(Vector256.LoadUnsafe(ref xRef, (uint)i)));
                     i += Vector256<float>.Count;
                 }
 
                 // Process the last vector in the span, masking off elements already processed.
                 if (i != x.Length)
                 {
-                    result = TAggregate.Invoke(result,
+                    result = TAggregationOperator.Invoke(result,
                         Vector256.ConditionalSelect(
-                            Vector256.Equals(LoadRemainderMaskSingleVector256(x.Length - i), Vector256<float>.Zero),
-                            Vector256.Create(TAggregate.IdentityValue),
-                            TLoad.Invoke(Vector256.LoadUnsafe(ref xRef, (uint)(x.Length - Vector256<float>.Count)))));
+                            Vector256.Equals(CreateRemainderMaskSingleVector256(x.Length - i), Vector256<float>.Zero),
+                            Vector256.Create(TAggregationOperator.IdentityValue),
+                            TTransformOperator.Invoke(Vector256.LoadUnsafe(ref xRef, (uint)(x.Length - Vector256<float>.Count)))));
                 }
 
                 // Aggregate the lanes in the vector back into the scalar result
-                return TAggregate.Invoke(result);
+                return TAggregationOperator.Invoke(result);
             }
 
             if (Vector128.IsHardwareAccelerated && x.Length >= Vector128<float>.Count)
             {
                 // Load the first vector as the initial set of results
-                Vector128<float> result = TLoad.Invoke(Vector128.LoadUnsafe(ref xRef, 0));
+                Vector128<float> result = TTransformOperator.Invoke(Vector128.LoadUnsafe(ref xRef, 0));
                 int oneVectorFromEnd = x.Length - Vector128<float>.Count;
                 int i = Vector128<float>.Count;
 
@@ -850,41 +858,47 @@ private static float Aggregate<TLoad, TAggregate>(
                 // least one full vector left to process.
                 while (i <= oneVectorFromEnd)
                 {
-                    result = TAggregate.Invoke(result, TLoad.Invoke(Vector128.LoadUnsafe(ref xRef, (uint)i)));
+                    result = TAggregationOperator.Invoke(result, TTransformOperator.Invoke(Vector128.LoadUnsafe(ref xRef, (uint)i)));
                     i += Vector128<float>.Count;
                 }
 
                 // Process the last vector in the span, masking off elements already processed.
                 if (i != x.Length)
                 {
-                    result = TAggregate.Invoke(result,
+                    result = TAggregationOperator.Invoke(result,
                         Vector128.ConditionalSelect(
-                            Vector128.Equals(LoadRemainderMaskSingleVector128(x.Length - i), Vector128<float>.Zero),
-                            Vector128.Create(TAggregate.IdentityValue),
-                            TLoad.Invoke(Vector128.LoadUnsafe(ref xRef, (uint)(x.Length - Vector128<float>.Count)))));
+                            Vector128.Equals(CreateRemainderMaskSingleVector128(x.Length - i), Vector128<float>.Zero),
+                            Vector128.Create(TAggregationOperator.IdentityValue),
+                            TTransformOperator.Invoke(Vector128.LoadUnsafe(ref xRef, (uint)(x.Length - Vector128<float>.Count)))));
                 }
 
                 // Aggregate the lanes in the vector back into the scalar result
-                return TAggregate.Invoke(result);
+                return TAggregationOperator.Invoke(result);
             }
 
             // Vectorization isn't supported or there are too few elements to vectorize.
             // Use a scalar implementation.
             {
-                float result = TLoad.Invoke(x[0]);
+                float result = TTransformOperator.Invoke(x[0]);
                 for (int i = 1; i < x.Length; i++)
                 {
-                    result = TAggregate.Invoke(result, TLoad.Invoke(x[i]));
+                    result = TAggregationOperator.Invoke(result, TTransformOperator.Invoke(x[i]));
                 }
 
                 return result;
             }
         }
 
-        private static float Aggregate<TBinary, TAggregate>(
+        /// <summary>Performs an aggregation over all pair-wise elements in <paramref name="x"/> and <paramref name="y"/> to produce a single-precision floating-point value.</summary>
+        /// <typeparam name="TBinaryOperator">Specifies the binary operation that should be applied to the pair-wise elements loaded from <paramref name="x"/> and <paramref name="y"/>.</typeparam>
+        /// <typeparam name="TAggregationOperator">
+        /// Specifies the aggregation binary operation that should be applied to multiple values to aggregate them into a single value.
+        /// The aggregation is applied to the results of the binary operations on the pair-wise values.
+        /// </typeparam>
+        private static float Aggregate<TBinaryOperator, TAggregationOperator>(
             ReadOnlySpan<float> x, ReadOnlySpan<float> y)
-            where TBinary : struct, IBinaryOperator
-            where TAggregate : struct, IAggregationOperator
+            where TBinaryOperator : struct, IBinaryOperator
+            where TAggregationOperator : struct, IAggregationOperator
         {
             Debug.Assert(x.Length == y.Length);
 
@@ -900,7 +914,7 @@ private static float Aggregate<TBinary, TAggregate>(
             if (Vector512.IsHardwareAccelerated && x.Length >= Vector512<float>.Count)
             {
                 // Load the first vector as the initial set of results
-                Vector512<float> result = TBinary.Invoke(Vector512.LoadUnsafe(ref xRef, 0), Vector512.LoadUnsafe(ref yRef, 0));
+                Vector512<float> result = TBinaryOperator.Invoke(Vector512.LoadUnsafe(ref xRef, 0), Vector512.LoadUnsafe(ref yRef, 0));
                 int oneVectorFromEnd = x.Length - Vector512<float>.Count;
                 int i = Vector512<float>.Count;
 
@@ -908,31 +922,31 @@ private static float Aggregate<TBinary, TAggregate>(
                 // least one full vector left to process.
                 while (i <= oneVectorFromEnd)
                 {
-                    result = TAggregate.Invoke(result, TBinary.Invoke(Vector512.LoadUnsafe(ref xRef, (uint)i), Vector512.LoadUnsafe(ref yRef, (uint)i)));
+                    result = TAggregationOperator.Invoke(result, TBinaryOperator.Invoke(Vector512.LoadUnsafe(ref xRef, (uint)i), Vector512.LoadUnsafe(ref yRef, (uint)i)));
                     i += Vector512<float>.Count;
                 }
 
                 // Process the last vector in the spans, masking off elements already processed.
                 if (i != x.Length)
                 {
-                    result = TAggregate.Invoke(result,
+                    result = TAggregationOperator.Invoke(result,
                         Vector512.ConditionalSelect(
-                            Vector512.Equals(LoadRemainderMaskSingleVector512(x.Length - i), Vector512<float>.Zero),
-                            Vector512.Create(TAggregate.IdentityValue),
-                            TBinary.Invoke(
+                            Vector512.Equals(CreateRemainderMaskSingleVector512(x.Length - i), Vector512<float>.Zero),
+                            Vector512.Create(TAggregationOperator.IdentityValue),
+                            TBinaryOperator.Invoke(
                                 Vector512.LoadUnsafe(ref xRef, (uint)(x.Length - Vector512<float>.Count)),
                                 Vector512.LoadUnsafe(ref yRef, (uint)(x.Length - Vector512<float>.Count)))));
                 }
 
                 // Aggregate the lanes in the vector back into the scalar result
-                return TAggregate.Invoke(result);
+                return TAggregationOperator.Invoke(result);
             }
 #endif
 
             if (Vector256.IsHardwareAccelerated && x.Length >= Vector256<float>.Count)
             {
                 // Load the first vector as the initial set of results
-                Vector256<float> result = TBinary.Invoke(Vector256.LoadUnsafe(ref xRef, 0), Vector256.LoadUnsafe(ref yRef, 0));
+                Vector256<float> result = TBinaryOperator.Invoke(Vector256.LoadUnsafe(ref xRef, 0), Vector256.LoadUnsafe(ref yRef, 0));
                 int oneVectorFromEnd = x.Length - Vector256<float>.Count;
                 int i = Vector256<float>.Count;
 
@@ -940,30 +954,30 @@ private static float Aggregate<TBinary, TAggregate>(
                 // least one full vector left to process.
                 while (i <= oneVectorFromEnd)
                 {
-                    result = TAggregate.Invoke(result, TBinary.Invoke(Vector256.LoadUnsafe(ref xRef, (uint)i), Vector256.LoadUnsafe(ref yRef, (uint)i)));
+                    result = TAggregationOperator.Invoke(result, TBinaryOperator.Invoke(Vector256.LoadUnsafe(ref xRef, (uint)i), Vector256.LoadUnsafe(ref yRef, (uint)i)));
                     i += Vector256<float>.Count;
                 }
 
                 // Process the last vector in the spans, masking off elements already processed.
                 if (i != x.Length)
                 {
-                    result = TAggregate.Invoke(result,
+                    result = TAggregationOperator.Invoke(result,
                         Vector256.ConditionalSelect(
-                            Vector256.Equals(LoadRemainderMaskSingleVector256(x.Length - i), Vector256<float>.Zero),
-                            Vector256.Create(TAggregate.IdentityValue),
-                            TBinary.Invoke(
+                            Vector256.Equals(CreateRemainderMaskSingleVector256(x.Length - i), Vector256<float>.Zero),
+                            Vector256.Create(TAggregationOperator.IdentityValue),
+                            TBinaryOperator.Invoke(
                                 Vector256.LoadUnsafe(ref xRef, (uint)(x.Length - Vector256<float>.Count)),
                                 Vector256.LoadUnsafe(ref yRef, (uint)(x.Length - Vector256<float>.Count)))));
                 }
 
                 // Aggregate the lanes in the vector back into the scalar result
-                return TAggregate.Invoke(result);
+                return TAggregationOperator.Invoke(result);
             }
 
             if (Vector128.IsHardwareAccelerated && x.Length >= Vector128<float>.Count)
             {
                 // Load the first vector as the initial set of results
-                Vector128<float> result = TBinary.Invoke(Vector128.LoadUnsafe(ref xRef, 0), Vector128.LoadUnsafe(ref yRef, 0));
+                Vector128<float> result = TBinaryOperator.Invoke(Vector128.LoadUnsafe(ref xRef, 0), Vector128.LoadUnsafe(ref yRef, 0));
                 int oneVectorFromEnd = x.Length - Vector128<float>.Count;
                 int i = Vector128<float>.Count;
 
@@ -971,34 +985,34 @@ private static float Aggregate<TBinary, TAggregate>(
                 // least one full vector left to process.
                 while (i <= oneVectorFromEnd)
                 {
-                    result = TAggregate.Invoke(result, TBinary.Invoke(Vector128.LoadUnsafe(ref xRef, (uint)i), Vector128.LoadUnsafe(ref yRef, (uint)i)));
+                    result = TAggregationOperator.Invoke(result, TBinaryOperator.Invoke(Vector128.LoadUnsafe(ref xRef, (uint)i), Vector128.LoadUnsafe(ref yRef, (uint)i)));
                     i += Vector128<float>.Count;
                 }
 
                 // Process the last vector in the spans, masking off elements already processed.
                 if (i != x.Length)
                 {
-                    result = TAggregate.Invoke(result,
+                    result = TAggregationOperator.Invoke(result,
                         Vector128.ConditionalSelect(
-                            Vector128.Equals(LoadRemainderMaskSingleVector128(x.Length - i), Vector128<float>.Zero),
-                            Vector128.Create(TAggregate.IdentityValue),
-                            TBinary.Invoke(
+                            Vector128.Equals(CreateRemainderMaskSingleVector128(x.Length - i), Vector128<float>.Zero),
+                            Vector128.Create(TAggregationOperator.IdentityValue),
+                            TBinaryOperator.Invoke(
                                 Vector128.LoadUnsafe(ref xRef, (uint)(x.Length - Vector128<float>.Count)),
                                 Vector128.LoadUnsafe(ref yRef, (uint)(x.Length - Vector128<float>.Count)))));
                 }
 
                 // Aggregate the lanes in the vector back into the scalar result
-                return TAggregate.Invoke(result);
+                return TAggregationOperator.Invoke(result);
             }
 
             // Vectorization isn't supported or there are too few elements to vectorize.
             // Use a scalar implementation.
             {
-                float result = TBinary.Invoke(xRef, yRef);
+                float result = TBinaryOperator.Invoke(xRef, yRef);
                 for (int i = 1; i < x.Length; i++)
                 {
-                    result = TAggregate.Invoke(result,
-                        TBinary.Invoke(
+                    result = TAggregationOperator.Invoke(result,
+                        TBinaryOperator.Invoke(
                             Unsafe.Add(ref xRef, i),
                             Unsafe.Add(ref yRef, i)));
                 }
@@ -1008,10 +1022,11 @@ private static float Aggregate<TBinary, TAggregate>(
         }
 
         /// <remarks>
-        /// This is the same as <see cref="Aggregate{TLoad, TAggregate}(ReadOnlySpan{float})"/>,
-        /// except it early exits on NaN.
+        /// This is the same as <see cref="Aggregate{TTransformOperator, TAggregationOperator}(ReadOnlySpan{float})"/>
+        /// with an identity transform, except it early exits on NaN.
         /// </remarks>
-        private static float MinMaxCore<TMinMax>(ReadOnlySpan<float> x) where TMinMax : struct, IAggregationOperator
+        private static float MinMaxCore<TMinMaxOperator>(ReadOnlySpan<float> x)
+            where TMinMaxOperator : struct, IAggregationOperator
         {
             if (x.IsEmpty)
             {
@@ -1049,7 +1064,7 @@ private static float MinMaxCore<TMinMax>(ReadOnlySpan<float> x) where TMinMax :
                         return GetFirstNaN(current);
                     }
 
-                    result = TMinMax.Invoke(result, current);
+                    result = TMinMaxOperator.Invoke(result, current);
                     i += Vector512<float>.Count;
                 }
 
@@ -1063,13 +1078,13 @@ private static float MinMaxCore<TMinMax>(ReadOnlySpan<float> x) where TMinMax :
                     }
 
                     result = Vector512.ConditionalSelect(
-                        Vector512.Equals(LoadRemainderMaskSingleVector512(x.Length - i), Vector512<float>.Zero),
+                        Vector512.Equals(CreateRemainderMaskSingleVector512(x.Length - i), Vector512<float>.Zero),
                         result,
-                        TMinMax.Invoke(result, current));
+                        TMinMaxOperator.Invoke(result, current));
                 }
 
                 // Aggregate the lanes in the vector to create the final scalar result.
-                return TMinMax.Invoke(result);
+                return TMinMaxOperator.Invoke(result);
             }
 #endif
 
@@ -1098,7 +1113,7 @@ private static float MinMaxCore<TMinMax>(ReadOnlySpan<float> x) where TMinMax :
                         return GetFirstNaN(current);
                     }
 
-                    result = TMinMax.Invoke(result, current);
+                    result = TMinMaxOperator.Invoke(result, current);
                     i += Vector256<float>.Count;
                 }
 
@@ -1112,13 +1127,13 @@ private static float MinMaxCore<TMinMax>(ReadOnlySpan<float> x) where TMinMax :
                     }
 
                     result = Vector256.ConditionalSelect(
-                        Vector256.Equals(LoadRemainderMaskSingleVector256(x.Length - i), Vector256<float>.Zero),
+                        Vector256.Equals(CreateRemainderMaskSingleVector256(x.Length - i), Vector256<float>.Zero),
                         result,
-                        TMinMax.Invoke(result, current));
+                        TMinMaxOperator.Invoke(result, current));
                 }
 
                 // Aggregate the lanes in the vector to create the final scalar result.
-                return TMinMax.Invoke(result);
+                return TMinMaxOperator.Invoke(result);
             }
 
             if (Vector128.IsHardwareAccelerated && x.Length >= Vector128<float>.Count)
@@ -1146,7 +1161,7 @@ private static float MinMaxCore<TMinMax>(ReadOnlySpan<float> x) where TMinMax :
                         return GetFirstNaN(current);
                     }
 
-                    result = TMinMax.Invoke(result, current);
+                    result = TMinMaxOperator.Invoke(result, current);
                     i += Vector128<float>.Count;
                 }
 
@@ -1160,13 +1175,13 @@ private static float MinMaxCore<TMinMax>(ReadOnlySpan<float> x) where TMinMax :
                     }
 
                     result = Vector128.ConditionalSelect(
-                        Vector128.Equals(LoadRemainderMaskSingleVector128(x.Length - i), Vector128<float>.Zero),
+                        Vector128.Equals(CreateRemainderMaskSingleVector128(x.Length - i), Vector128<float>.Zero),
                         result,
-                        TMinMax.Invoke(result, current));
+                        TMinMaxOperator.Invoke(result, current));
                 }
 
                 // Aggregate the lanes in the vector to create the final scalar result.
-                return TMinMax.Invoke(result);
+                return TMinMaxOperator.Invoke(result);
             }
 
             // Scalar path used when either vectorization is not supported or the input is too small to vectorize.
@@ -1185,13 +1200,15 @@ private static float MinMaxCore<TMinMax>(ReadOnlySpan<float> x) where TMinMax :
                         return current;
                     }
 
-                    result = TMinMax.Invoke(result, current);
+                    result = TMinMaxOperator.Invoke(result, current);
                 }
 
                 return result;
             }
         }
 
+        /// <summary>Performs an element-wise operation on <paramref name="x"/> and writes the results to <paramref name="destination"/>.</summary>
+        /// <typeparam name="TUnaryOperator">Specifies the operation to perform on each element loaded from <paramref name="x"/>.</typeparam>
         private static unsafe void InvokeSpanIntoSpan<TUnaryOperator>(
             ReadOnlySpan<float> x, Span<float> destination)
             where TUnaryOperator : struct, IUnaryOperator
@@ -1227,7 +1244,7 @@ private static unsafe void InvokeSpanIntoSpan<TUnaryOperator>(
                     {
                         uint lastVectorIndex = (uint)(x.Length - Vector512<float>.Count);
                         Vector512.ConditionalSelect(
-                            Vector512.Equals(LoadRemainderMaskSingleVector512(x.Length - i), Vector512<float>.Zero),
+                            Vector512.Equals(CreateRemainderMaskSingleVector512(x.Length - i), Vector512<float>.Zero),
                             Vector512.LoadUnsafe(ref dRef, lastVectorIndex),
                             TUnaryOperator.Invoke(Vector512.LoadUnsafe(ref xRef, lastVectorIndex))).StoreUnsafe(ref dRef, lastVectorIndex);
                     }
@@ -1256,7 +1273,7 @@ private static unsafe void InvokeSpanIntoSpan<TUnaryOperator>(
                     {
                         uint lastVectorIndex = (uint)(x.Length - Vector256<float>.Count);
                         Vector256.ConditionalSelect(
-                            Vector256.Equals(LoadRemainderMaskSingleVector256(x.Length - i), Vector256<float>.Zero),
+                            Vector256.Equals(CreateRemainderMaskSingleVector256(x.Length - i), Vector256<float>.Zero),
                             Vector256.LoadUnsafe(ref dRef, lastVectorIndex),
                             TUnaryOperator.Invoke(Vector256.LoadUnsafe(ref xRef, lastVectorIndex))).StoreUnsafe(ref dRef, lastVectorIndex);
                     }
@@ -1284,7 +1301,7 @@ private static unsafe void InvokeSpanIntoSpan<TUnaryOperator>(
                     {
                         uint lastVectorIndex = (uint)(x.Length - Vector128<float>.Count);
                         Vector128.ConditionalSelect(
-                            Vector128.Equals(LoadRemainderMaskSingleVector128(x.Length - i), Vector128<float>.Zero),
+                            Vector128.Equals(CreateRemainderMaskSingleVector128(x.Length - i), Vector128<float>.Zero),
                             Vector128.LoadUnsafe(ref dRef, lastVectorIndex),
                             TUnaryOperator.Invoke(Vector128.LoadUnsafe(ref xRef, lastVectorIndex))).StoreUnsafe(ref dRef, lastVectorIndex);
                     }
@@ -1301,6 +1318,13 @@ private static unsafe void InvokeSpanIntoSpan<TUnaryOperator>(
             }
         }
 
+        /// <summary>
+        /// Performs an element-wise operation on <paramref name="x"/> and <paramref name="y"/>,
+        /// and writes the results to <paramref name="destination"/>.
+        /// </summary>
+        /// <typeparam name="TBinaryOperator">
+        /// Specifies the operation to perform on the pair-wise elements loaded from <paramref name="x"/> and <paramref name="y"/>.
+        /// </typeparam>
         private static unsafe void InvokeSpanSpanIntoSpan<TBinaryOperator>(
             ReadOnlySpan<float> x, ReadOnlySpan<float> y, Span<float> destination)
             where TBinaryOperator : struct, IBinaryOperator
@@ -1344,7 +1368,7 @@ private static unsafe void InvokeSpanSpanIntoSpan<TBinaryOperator>(
                     {
                         uint lastVectorIndex = (uint)(x.Length - Vector512<float>.Count);
                         Vector512.ConditionalSelect(
-                            Vector512.Equals(LoadRemainderMaskSingleVector512(x.Length - i), Vector512<float>.Zero),
+                            Vector512.Equals(CreateRemainderMaskSingleVector512(x.Length - i), Vector512<float>.Zero),
                             Vector512.LoadUnsafe(ref dRef, lastVectorIndex),
                             TBinaryOperator.Invoke(Vector512.LoadUnsafe(ref xRef, lastVectorIndex),
                                                    Vector512.LoadUnsafe(ref yRef, lastVectorIndex))).StoreUnsafe(ref dRef, lastVectorIndex);
@@ -1375,7 +1399,7 @@ private static unsafe void InvokeSpanSpanIntoSpan<TBinaryOperator>(
                     {
                         uint lastVectorIndex = (uint)(x.Length - Vector256<float>.Count);
                         Vector256.ConditionalSelect(
-                            Vector256.Equals(LoadRemainderMaskSingleVector256(x.Length - i), Vector256<float>.Zero),
+                            Vector256.Equals(CreateRemainderMaskSingleVector256(x.Length - i), Vector256<float>.Zero),
                             Vector256.LoadUnsafe(ref dRef, lastVectorIndex),
                             TBinaryOperator.Invoke(Vector256.LoadUnsafe(ref xRef, lastVectorIndex),
                                                    Vector256.LoadUnsafe(ref yRef, lastVectorIndex))).StoreUnsafe(ref dRef, lastVectorIndex);
@@ -1405,7 +1429,7 @@ private static unsafe void InvokeSpanSpanIntoSpan<TBinaryOperator>(
                     {
                         uint lastVectorIndex = (uint)(x.Length - Vector128<float>.Count);
                         Vector128.ConditionalSelect(
-                            Vector128.Equals(LoadRemainderMaskSingleVector128(x.Length - i), Vector128<float>.Zero),
+                            Vector128.Equals(CreateRemainderMaskSingleVector128(x.Length - i), Vector128<float>.Zero),
                             Vector128.LoadUnsafe(ref dRef, lastVectorIndex),
                             TBinaryOperator.Invoke(Vector128.LoadUnsafe(ref xRef, lastVectorIndex),
                                                    Vector128.LoadUnsafe(ref yRef, lastVectorIndex))).StoreUnsafe(ref dRef, lastVectorIndex);
@@ -1424,8 +1448,32 @@ private static unsafe void InvokeSpanSpanIntoSpan<TBinaryOperator>(
             }
         }
 
+        /// <summary>
+        /// Performs an element-wise operation on <paramref name="x"/> and <paramref name="y"/>,
+        /// and writes the results to <paramref name="destination"/>.
+        /// </summary>
+        /// <typeparam name="TBinaryOperator">
+        /// Specifies the operation to perform on each element loaded from <paramref name="x"/> with <paramref name="y"/>.
+        /// </typeparam>
         private static unsafe void InvokeSpanScalarIntoSpan<TBinaryOperator>(
             ReadOnlySpan<float> x, float y, Span<float> destination)
+            where TBinaryOperator : struct, IBinaryOperator =>
+            InvokeSpanScalarIntoSpan<IdentityOperator, TBinaryOperator>(x, y, destination);
+
+        /// <summary>
+        /// Transforms each element of <paramref name="x"/>, performs an element-wise operation on the transformed value
+        /// and <paramref name="y"/>, and writes the results to <paramref name="destination"/>.
+        /// </summary>
+        /// <typeparam name="TTransformOperator">
+        /// Specifies the operation to perform on each element loaded from <paramref name="x"/>.
+        /// It is not used with <paramref name="y"/>.
+        /// </typeparam>
+        /// <typeparam name="TBinaryOperator">
+        /// Specifies the operation to perform on the transformed value from <paramref name="x"/> with <paramref name="y"/>.
+        /// </typeparam>
+        private static unsafe void InvokeSpanScalarIntoSpan<TTransformOperator, TBinaryOperator>(
+            ReadOnlySpan<float> x, float y, Span<float> destination)
+            where TTransformOperator : struct, IUnaryOperator
             where TBinaryOperator : struct, IBinaryOperator
         {
             if (x.Length > destination.Length)
@@ -1450,7 +1498,7 @@ private static unsafe void InvokeSpanScalarIntoSpan<TBinaryOperator>(
                     // Loop handling one vector at a time.
                     do
                     {
-                        TBinaryOperator.Invoke(Vector512.LoadUnsafe(ref xRef, (uint)i),
+                        TBinaryOperator.Invoke(TTransformOperator.Invoke(Vector512.LoadUnsafe(ref xRef, (uint)i)),
                                                yVec).StoreUnsafe(ref dRef, (uint)i);
 
                         i += Vector512<float>.Count;
@@ -1462,9 +1510,9 @@ private static unsafe void InvokeSpanScalarIntoSpan<TBinaryOperator>(
                     {
                         uint lastVectorIndex = (uint)(x.Length - Vector512<float>.Count);
                         Vector512.ConditionalSelect(
-                            Vector512.Equals(LoadRemainderMaskSingleVector512(x.Length - i), Vector512<float>.Zero),
+                            Vector512.Equals(CreateRemainderMaskSingleVector512(x.Length - i), Vector512<float>.Zero),
                             Vector512.LoadUnsafe(ref dRef, lastVectorIndex),
-                            TBinaryOperator.Invoke(Vector512.LoadUnsafe(ref xRef, lastVectorIndex),
+                            TBinaryOperator.Invoke(TTransformOperator.Invoke(Vector512.LoadUnsafe(ref xRef, lastVectorIndex)),
                                                    yVec)).StoreUnsafe(ref dRef, lastVectorIndex);
                     }
 
@@ -1483,7 +1531,7 @@ private static unsafe void InvokeSpanScalarIntoSpan<TBinaryOperator>(
                     // Loop handling one vector at a time.
                     do
                     {
-                        TBinaryOperator.Invoke(Vector256.LoadUnsafe(ref xRef, (uint)i),
+                        TBinaryOperator.Invoke(TTransformOperator.Invoke(Vector256.LoadUnsafe(ref xRef, (uint)i)),
                                                yVec).StoreUnsafe(ref dRef, (uint)i);
 
                         i += Vector256<float>.Count;
@@ -1495,9 +1543,9 @@ private static unsafe void InvokeSpanScalarIntoSpan<TBinaryOperator>(
                     {
                         uint lastVectorIndex = (uint)(x.Length - Vector256<float>.Count);
                         Vector256.ConditionalSelect(
-                            Vector256.Equals(LoadRemainderMaskSingleVector256(x.Length - i), Vector256<float>.Zero),
+                            Vector256.Equals(CreateRemainderMaskSingleVector256(x.Length - i), Vector256<float>.Zero),
                             Vector256.LoadUnsafe(ref dRef, lastVectorIndex),
-                            TBinaryOperator.Invoke(Vector256.LoadUnsafe(ref xRef, lastVectorIndex),
+                            TBinaryOperator.Invoke(TTransformOperator.Invoke(Vector256.LoadUnsafe(ref xRef, lastVectorIndex)),
                                                    yVec)).StoreUnsafe(ref dRef, lastVectorIndex);
                     }
 
@@ -1515,7 +1563,7 @@ private static unsafe void InvokeSpanScalarIntoSpan<TBinaryOperator>(
                     // Loop handling one vector at a time.
                     do
                     {
-                        TBinaryOperator.Invoke(Vector128.LoadUnsafe(ref xRef, (uint)i),
+                        TBinaryOperator.Invoke(TTransformOperator.Invoke(Vector128.LoadUnsafe(ref xRef, (uint)i)),
                                                yVec).StoreUnsafe(ref dRef, (uint)i);
 
                         i += Vector128<float>.Count;
@@ -1527,9 +1575,9 @@ private static unsafe void InvokeSpanScalarIntoSpan<TBinaryOperator>(
                     {
                         uint lastVectorIndex = (uint)(x.Length - Vector128<float>.Count);
                         Vector128.ConditionalSelect(
-                            Vector128.Equals(LoadRemainderMaskSingleVector128(x.Length - i), Vector128<float>.Zero),
+                            Vector128.Equals(CreateRemainderMaskSingleVector128(x.Length - i), Vector128<float>.Zero),
                             Vector128.LoadUnsafe(ref dRef, lastVectorIndex),
-                            TBinaryOperator.Invoke(Vector128.LoadUnsafe(ref xRef, lastVectorIndex),
+                            TBinaryOperator.Invoke(TTransformOperator.Invoke(Vector128.LoadUnsafe(ref xRef, lastVectorIndex)),
                                                    yVec)).StoreUnsafe(ref dRef, lastVectorIndex);
                     }
 
@@ -1539,13 +1587,21 @@ private static unsafe void InvokeSpanScalarIntoSpan<TBinaryOperator>(
 
             while (i < x.Length)
             {
-                Unsafe.Add(ref dRef, i) = TBinaryOperator.Invoke(Unsafe.Add(ref xRef, i),
+                Unsafe.Add(ref dRef, i) = TBinaryOperator.Invoke(TTransformOperator.Invoke(Unsafe.Add(ref xRef, i)),
                                                                  y);
 
                 i++;
             }
         }
 
+        /// <summary>
+        /// Performs an element-wise operation on <paramref name="x"/>, <paramref name="y"/>, and <paramref name="z"/>,
+        /// and writes the results to <paramref name="destination"/>.
+        /// </summary>
+        /// <typeparam name="TTernaryOperator">
+        /// Specifies the operation to perform on the corresponding elements loaded from <paramref name="x"/>, <paramref name="y"/>,
+        /// and <paramref name="z"/>.
+        /// </typeparam>
         private static unsafe void InvokeSpanSpanSpanIntoSpan<TTernaryOperator>(
             ReadOnlySpan<float> x, ReadOnlySpan<float> y, ReadOnlySpan<float> z, Span<float> destination)
             where TTernaryOperator : struct, ITernaryOperator
@@ -1592,7 +1648,7 @@ private static unsafe void InvokeSpanSpanSpanIntoSpan<TTernaryOperator>(
                     {
                         uint lastVectorIndex = (uint)(x.Length - Vector512<float>.Count);
                         Vector512.ConditionalSelect(
-                            Vector512.Equals(LoadRemainderMaskSingleVector512(x.Length - i), Vector512<float>.Zero),
+                            Vector512.Equals(CreateRemainderMaskSingleVector512(x.Length - i), Vector512<float>.Zero),
                             Vector512.LoadUnsafe(ref dRef, lastVectorIndex),
                             TTernaryOperator.Invoke(Vector512.LoadUnsafe(ref xRef, lastVectorIndex),
                                                     Vector512.LoadUnsafe(ref yRef, lastVectorIndex),
@@ -1625,7 +1681,7 @@ private static unsafe void InvokeSpanSpanSpanIntoSpan<TTernaryOperator>(
                     {
                         uint lastVectorIndex = (uint)(x.Length - Vector256<float>.Count);
                         Vector256.ConditionalSelect(
-                            Vector256.Equals(LoadRemainderMaskSingleVector256(x.Length - i), Vector256<float>.Zero),
+                            Vector256.Equals(CreateRemainderMaskSingleVector256(x.Length - i), Vector256<float>.Zero),
                             Vector256.LoadUnsafe(ref dRef, lastVectorIndex),
                             TTernaryOperator.Invoke(Vector256.LoadUnsafe(ref xRef, lastVectorIndex),
                                                     Vector256.LoadUnsafe(ref yRef, lastVectorIndex),
@@ -1657,7 +1713,7 @@ private static unsafe void InvokeSpanSpanSpanIntoSpan<TTernaryOperator>(
                     {
                         uint lastVectorIndex = (uint)(x.Length - Vector128<float>.Count);
                         Vector128.ConditionalSelect(
-                            Vector128.Equals(LoadRemainderMaskSingleVector128(x.Length - i), Vector128<float>.Zero),
+                            Vector128.Equals(CreateRemainderMaskSingleVector128(x.Length - i), Vector128<float>.Zero),
                             Vector128.LoadUnsafe(ref dRef, lastVectorIndex),
                             TTernaryOperator.Invoke(Vector128.LoadUnsafe(ref xRef, lastVectorIndex),
                                                     Vector128.LoadUnsafe(ref yRef, lastVectorIndex),
@@ -1678,6 +1734,14 @@ private static unsafe void InvokeSpanSpanSpanIntoSpan<TTernaryOperator>(
             }
         }
 
+        /// <summary>
+        /// Performs an element-wise operation on <paramref name="x"/>, <paramref name="y"/>, and <paramref name="z"/>,
+        /// and writes the results to <paramref name="destination"/>.
+        /// </summary>
+        /// <typeparam name="TTernaryOperator">
+        /// Specifies the operation to perform on the pair-wise elements loaded from <paramref name="x"/> and <paramref name="y"/>
+        /// with <paramref name="z"/>.
+        /// </typeparam>
         private static unsafe void InvokeSpanSpanScalarIntoSpan<TTernaryOperator>(
             ReadOnlySpan<float> x, ReadOnlySpan<float> y, float z, Span<float> destination)
             where TTernaryOperator : struct, ITernaryOperator
@@ -1724,7 +1788,7 @@ private static unsafe void InvokeSpanSpanScalarIntoSpan<TTernaryOperator>(
                     {
                         uint lastVectorIndex = (uint)(x.Length - Vector512<float>.Count);
                         Vector512.ConditionalSelect(
-                            Vector512.Equals(LoadRemainderMaskSingleVector512(x.Length - i), Vector512<float>.Zero),
+                            Vector512.Equals(CreateRemainderMaskSingleVector512(x.Length - i), Vector512<float>.Zero),
                             Vector512.LoadUnsafe(ref dRef, lastVectorIndex),
                             TTernaryOperator.Invoke(Vector512.LoadUnsafe(ref xRef, lastVectorIndex),
                                                     Vector512.LoadUnsafe(ref yRef, lastVectorIndex),
@@ -1759,7 +1823,7 @@ private static unsafe void InvokeSpanSpanScalarIntoSpan<TTernaryOperator>(
                     {
                         uint lastVectorIndex = (uint)(x.Length - Vector256<float>.Count);
                         Vector256.ConditionalSelect(
-                            Vector256.Equals(LoadRemainderMaskSingleVector256(x.Length - i), Vector256<float>.Zero),
+                            Vector256.Equals(CreateRemainderMaskSingleVector256(x.Length - i), Vector256<float>.Zero),
                             Vector256.LoadUnsafe(ref dRef, lastVectorIndex),
                             TTernaryOperator.Invoke(Vector256.LoadUnsafe(ref xRef, lastVectorIndex),
                                                     Vector256.LoadUnsafe(ref yRef, lastVectorIndex),
@@ -1793,7 +1857,7 @@ private static unsafe void InvokeSpanSpanScalarIntoSpan<TTernaryOperator>(
                     {
                         uint lastVectorIndex = (uint)(x.Length - Vector128<float>.Count);
                         Vector128.ConditionalSelect(
-                            Vector128.Equals(LoadRemainderMaskSingleVector128(x.Length - i), Vector128<float>.Zero),
+                            Vector128.Equals(CreateRemainderMaskSingleVector128(x.Length - i), Vector128<float>.Zero),
                             Vector128.LoadUnsafe(ref dRef, lastVectorIndex),
                             TTernaryOperator.Invoke(Vector128.LoadUnsafe(ref xRef, lastVectorIndex),
                                                     Vector128.LoadUnsafe(ref yRef, lastVectorIndex),
@@ -1814,6 +1878,14 @@ private static unsafe void InvokeSpanSpanScalarIntoSpan<TTernaryOperator>(
             }
         }
 
+        /// <summary>
+        /// Performs an element-wise operation on <paramref name="x"/>, <paramref name="y"/>, and <paramref name="z"/>,
+        /// and writes the results to <paramref name="destination"/>.
+        /// </summary>
+        /// <typeparam name="TTernaryOperator">
+        /// Specifies the operation to perform on each element loaded from <paramref name="x"/>, with the scalar <paramref name="y"/>,
+        /// and the corresponding element loaded from <paramref name="z"/>.
+        /// </typeparam>
         private static unsafe void InvokeSpanScalarSpanIntoSpan<TTernaryOperator>(
             ReadOnlySpan<float> x, float y, ReadOnlySpan<float> z, Span<float> destination)
             where TTernaryOperator : struct, ITernaryOperator
@@ -1860,7 +1932,7 @@ private static unsafe void InvokeSpanScalarSpanIntoSpan<TTernaryOperator>(
                     {
                         uint lastVectorIndex = (uint)(x.Length - Vector512<float>.Count);
                         Vector512.ConditionalSelect(
-                            Vector512.Equals(LoadRemainderMaskSingleVector512(x.Length - i), Vector512<float>.Zero),
+                            Vector512.Equals(CreateRemainderMaskSingleVector512(x.Length - i), Vector512<float>.Zero),
                             Vector512.LoadUnsafe(ref dRef, lastVectorIndex),
                             TTernaryOperator.Invoke(Vector512.LoadUnsafe(ref xRef, lastVectorIndex),
                                                     yVec,
@@ -1895,7 +1967,7 @@ private static unsafe void InvokeSpanScalarSpanIntoSpan<TTernaryOperator>(
                     {
                         uint lastVectorIndex = (uint)(x.Length - Vector256<float>.Count);
                         Vector256.ConditionalSelect(
-                            Vector256.Equals(LoadRemainderMaskSingleVector256(x.Length - i), Vector256<float>.Zero),
+                            Vector256.Equals(CreateRemainderMaskSingleVector256(x.Length - i), Vector256<float>.Zero),
                             Vector256.LoadUnsafe(ref dRef, lastVectorIndex),
                             TTernaryOperator.Invoke(Vector256.LoadUnsafe(ref xRef, lastVectorIndex),
                                                     yVec,
@@ -1929,7 +2001,7 @@ private static unsafe void InvokeSpanScalarSpanIntoSpan<TTernaryOperator>(
                     {
                         uint lastVectorIndex = (uint)(x.Length - Vector128<float>.Count);
                         Vector128.ConditionalSelect(
-                            Vector128.Equals(LoadRemainderMaskSingleVector128(x.Length - i), Vector128<float>.Zero),
+                            Vector128.Equals(CreateRemainderMaskSingleVector128(x.Length - i), Vector128<float>.Zero),
                             Vector128.LoadUnsafe(ref dRef, lastVectorIndex),
                             TTernaryOperator.Invoke(Vector128.LoadUnsafe(ref xRef, lastVectorIndex),
                                                     yVec,
@@ -1950,6 +2022,7 @@ private static unsafe void InvokeSpanScalarSpanIntoSpan<TTernaryOperator>(
             }
         }
 
+        /// <summary>Performs (x * y) + addend. It will be rounded as one ternary operation if such an operation is accelerated on the current hardware.</summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static Vector128<float> FusedMultiplyAdd(Vector128<float> x, Vector128<float> y, Vector128<float> addend)
         {
@@ -1966,6 +2039,7 @@ private static Vector128<float> FusedMultiplyAdd(Vector128<float> x, Vector128<f
             return (x * y) + addend;
         }
 
+        /// <summary>Performs (x * y) + addend. It will be rounded as one ternary operation if such an operation is accelerated on the current hardware.</summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static Vector256<float> FusedMultiplyAdd(Vector256<float> x, Vector256<float> y, Vector256<float> addend)
         {
@@ -1978,6 +2052,7 @@ private static Vector256<float> FusedMultiplyAdd(Vector256<float> x, Vector256<f
         }
 
 #if NET8_0_OR_GREATER
+        /// <summary>Performs (x * y) + addend. It will be rounded as one ternary operation if such an operation is accelerated on the current hardware.</summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static Vector512<float> FusedMultiplyAdd(Vector512<float> x, Vector512<float> y, Vector512<float> addend)
         {
@@ -1990,71 +2065,110 @@ private static Vector512<float> FusedMultiplyAdd(Vector512<float> x, Vector512<f
         }
 #endif
 
+        /// <summary>Aggregates all of the elements in <paramref name="x"/> into a single value.</summary>
+        /// <typeparam name="TAggregate">Specifies the operation to be performed on each pair of values.</typeparam>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static float HorizontalAggregate<TAggregate>(Vector128<float> x) where TAggregate : struct, IBinaryOperator =>
             TAggregate.Invoke(
                 TAggregate.Invoke(x[0], x[1]),
                 TAggregate.Invoke(x[2], x[3]));
 
+        /// <summary>Aggregates all of the elements in <paramref name="x"/> into a single value.</summary>
+        /// <typeparam name="TAggregate">Specifies the operation to be performed on each pair of values.</typeparam>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static float HorizontalAggregate<TAggregate>(Vector256<float> x) where TAggregate : struct, IBinaryOperator =>
             HorizontalAggregate<TAggregate>(TAggregate.Invoke(x.GetLower(), x.GetUpper()));
 
 #if NET8_0_OR_GREATER
+        /// <summary>Aggregates all of the elements in <paramref name="x"/> into a single value.</summary>
+        /// <typeparam name="TAggregate">Specifies the operation to be performed on each pair of values.</typeparam>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static float HorizontalAggregate<TAggregate>(Vector512<float> x) where TAggregate : struct, IBinaryOperator =>
             HorizontalAggregate<TAggregate>(TAggregate.Invoke(x.GetLower(), x.GetUpper()));
 #endif
 
+        /// <summary>Gets whether the specified <see cref="float"/> is negative.</summary>
         private static bool IsNegative(float f) => float.IsNegative(f);
 
+        /// <summary>Gets whether each specified <see cref="float"/> is negative.</summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static Vector128<float> IsNegative(Vector128<float> vector) =>
             Vector128.LessThan(vector.AsInt32(), Vector128<int>.Zero).AsSingle();
 
+        /// <summary>Gets whether each specified <see cref="float"/> is negative.</summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static Vector256<float> IsNegative(Vector256<float> vector) =>
             Vector256.LessThan(vector.AsInt32(), Vector256<int>.Zero).AsSingle();
 
 #if NET8_0_OR_GREATER
+        /// <summary>Gets whether each specified <see cref="float"/> is negative.</summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static Vector512<float> IsNegative(Vector512<float> vector) =>
             Vector512.LessThan(vector.AsInt32(), Vector512<int>.Zero).AsSingle();
 #endif
 
-        private static float GetFirstNaN(Vector128<float> vector) =>
-            vector[BitOperations.TrailingZeroCount((~Vector128.Equals(vector, vector)).ExtractMostSignificantBits())];
+        /// <summary>Finds and returns the first NaN value in <paramref name="vector"/>.</summary>
+        /// <remarks>The vector must have already been validated to contain a NaN.</remarks>
+        private static float GetFirstNaN(Vector128<float> vector)
+        {
+            Debug.Assert(!Vector128.EqualsAll(vector, vector), "Expected vector to contain a NaN");
+            return vector[BitOperations.TrailingZeroCount((~Vector128.Equals(vector, vector)).ExtractMostSignificantBits())];
+        }
 
-        private static float GetFirstNaN(Vector256<float> vector) =>
-            vector[BitOperations.TrailingZeroCount((~Vector256.Equals(vector, vector)).ExtractMostSignificantBits())];
+        /// <summary>Finds and returns the first NaN value in <paramref name="vector"/>.</summary>
+        /// <remarks>The vector must have already been validated to contain a NaN.</remarks>
+        private static float GetFirstNaN(Vector256<float> vector)
+        {
+            Debug.Assert(!Vector256.EqualsAll(vector, vector), "Expected vector to contain a NaN");
+            return vector[BitOperations.TrailingZeroCount((~Vector256.Equals(vector, vector)).ExtractMostSignificantBits())];
+        }
 
 #if NET8_0_OR_GREATER
-        private static float GetFirstNaN(Vector512<float> vector) =>
-            vector[BitOperations.TrailingZeroCount((~Vector512.Equals(vector, vector)).ExtractMostSignificantBits())];
+        /// <summary>Finds and returns the first NaN value in <paramref name="vector"/>.</summary>
+        /// <remarks>The vector must have already been validated to contain a NaN.</remarks>
+        private static float GetFirstNaN(Vector512<float> vector)
+        {
+            Debug.Assert(!Vector512.EqualsAll(vector, vector), "Expected vector to contain a NaN");
+            return vector[BitOperations.TrailingZeroCount((~Vector512.Equals(vector, vector)).ExtractMostSignificantBits())];
+        }
 #endif
 
+        /// <summary>Gets the base 2 logarithm of <paramref name="x"/>.</summary>
         private static float Log2(float x) => MathF.Log2(x);
 
+        /// <summary>
+        /// Gets a vector mask that will be all-ones-set for the last <paramref name="count"/> elements
+        /// and zero for all other elements.
+        /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static unsafe Vector128<float> LoadRemainderMaskSingleVector128(int validItems) =>
+        private static unsafe Vector128<float> CreateRemainderMaskSingleVector128(int count) =>
             Vector128.LoadUnsafe(
                 ref Unsafe.As<uint, float>(ref MemoryMarshal.GetReference(RemainderUInt32Mask_16x16)),
-                (uint)((validItems * 16) + 12)); // last four floats in the row
+                (uint)((count * 16) + 12)); // last four floats in the row
 
+        /// <summary>
+        /// Gets a vector mask that will be all-ones-set for the last <paramref name="count"/> elements
+        /// and zero for all other elements.
+        /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static unsafe Vector256<float> LoadRemainderMaskSingleVector256(int validItems) =>
+        private static unsafe Vector256<float> CreateRemainderMaskSingleVector256(int count) =>
             Vector256.LoadUnsafe(
                 ref Unsafe.As<uint, float>(ref MemoryMarshal.GetReference(RemainderUInt32Mask_16x16)),
-                (uint)((validItems * 16) + 8)); // last eight floats in the row
+                (uint)((count * 16) + 8)); // last eight floats in the row
 
 #if NET8_0_OR_GREATER
+        /// <summary>
+        /// Gets a vector mask that will be all-ones-set for the last <paramref name="count"/> elements
+        /// and zero for all other elements.
+        /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private static unsafe Vector512<float> LoadRemainderMaskSingleVector512(int validItems) =>
+        private static unsafe Vector512<float> CreateRemainderMaskSingleVector512(int count) =>
             Vector512.LoadUnsafe(
                 ref Unsafe.As<uint, float>(ref MemoryMarshal.GetReference(RemainderUInt32Mask_16x16)),
-                (uint)(validItems * 16)); // all sixteen floats in the row
+                (uint)(count * 16)); // all sixteen floats in the row
 #endif
 
+        /// <summary>x + y</summary>
         private readonly struct AddOperator : IAggregationOperator
         {
             public static float Invoke(float x, float y) => x + y;
@@ -2073,6 +2187,7 @@ ref Unsafe.As<uint, float>(ref MemoryMarshal.GetReference(RemainderUInt32Mask_16
             public static float IdentityValue => 0;
         }
 
+        /// <summary>x - y</summary>
         private readonly struct SubtractOperator : IBinaryOperator
         {
             public static float Invoke(float x, float y) => x - y;
@@ -2083,6 +2198,7 @@ ref Unsafe.As<uint, float>(ref MemoryMarshal.GetReference(RemainderUInt32Mask_16
 #endif
         }
 
+        /// <summary>(x - y) * (x - y)</summary>
         private readonly struct SubtractSquaredOperator : IBinaryOperator
         {
             public static float Invoke(float x, float y)
@@ -2112,6 +2228,7 @@ public static Vector512<float> Invoke(Vector512<float> x, Vector512<float> y)
 #endif
         }
 
+        /// <summary>x * y</summary>
         private readonly struct MultiplyOperator : IAggregationOperator
         {
             public static float Invoke(float x, float y) => x * y;
@@ -2130,6 +2247,7 @@ public static Vector512<float> Invoke(Vector512<float> x, Vector512<float> y)
             public static float IdentityValue => 1;
         }
 
+        /// <summary>x / y</summary>
         private readonly struct DivideOperator : IBinaryOperator
         {
             public static float Invoke(float x, float y) => x / y;
@@ -2140,6 +2258,7 @@ public static Vector512<float> Invoke(Vector512<float> x, Vector512<float> y)
 #endif
         }
 
+        /// <summary>MathF.Max(x, y) (but NaNs may not be propagated)</summary>
         private readonly struct MaxOperator : IAggregationOperator
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -2183,6 +2302,7 @@ public static Vector512<float> Invoke(Vector512<float> x, Vector512<float> y) =>
 #endif
         }
 
+        /// <summary>MathF.Max(x, y)</summary>
         private readonly struct MaxPropagateNaNOperator : IBinaryOperator
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -2229,6 +2349,7 @@ public static Vector512<float> Invoke(Vector512<float> x, Vector512<float> y) =>
 #endif
         }
 
+        /// <summary>Operator to get x or y based on which has the larger MathF.Abs (but NaNs may not be propagated)</summary>
         private readonly struct MaxMagnitudeOperator : IAggregationOperator
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -2280,6 +2401,7 @@ public static Vector512<float> Invoke(Vector512<float> x, Vector512<float> y)
 #endif
         }
 
+        /// <summary>Operator to get x or y based on which has the larger MathF.Abs</summary>
         private readonly struct MaxMagnitudePropagateNaNOperator : IBinaryOperator
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -2330,6 +2452,7 @@ public static Vector512<float> Invoke(Vector512<float> x, Vector512<float> y)
 #endif
         }
 
+        /// <summary>MathF.Min(x, y) (but NaNs may not be propagated)</summary>
         private readonly struct MinOperator : IAggregationOperator
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -2373,6 +2496,7 @@ public static Vector512<float> Invoke(Vector512<float> x, Vector512<float> y) =>
 #endif
         }
 
+        /// <summary>MathF.Min(x, y)</summary>
         private readonly struct MinPropagateNaNOperator : IBinaryOperator
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -2419,6 +2543,7 @@ public static Vector512<float> Invoke(Vector512<float> x, Vector512<float> y) =>
 #endif
         }
 
+        /// <summary>Operator to get x or y based on which has the smaller MathF.Abs (but NaNs may not be propagated)</summary>
         private readonly struct MinMagnitudeOperator : IAggregationOperator
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -2469,6 +2594,7 @@ public static Vector512<float> Invoke(Vector512<float> x, Vector512<float> y)
 #endif
         }
 
+        /// <summary>Operator to get x or y based on which has the smaller MathF.Abs</summary>
         private readonly struct MinMagnitudePropagateNaNOperator : IBinaryOperator
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -2519,6 +2645,7 @@ public static Vector512<float> Invoke(Vector512<float> x, Vector512<float> y)
 #endif
         }
 
+        /// <summary>-x</summary>
         private readonly struct NegateOperator : IUnaryOperator
         {
             public static float Invoke(float x) => -x;
@@ -2529,6 +2656,7 @@ public static Vector512<float> Invoke(Vector512<float> x, Vector512<float> y)
 #endif
         }
 
+        /// <summary>(x + y) * z</summary>
         private readonly struct AddMultiplyOperator : ITernaryOperator
         {
             public static float Invoke(float x, float y, float z) => (x + y) * z;
@@ -2539,6 +2667,7 @@ public static Vector512<float> Invoke(Vector512<float> x, Vector512<float> y)
 #endif
         }
 
+        /// <summary>(x * y) + z</summary>
         private readonly struct MultiplyAddOperator : ITernaryOperator
         {
             public static float Invoke(float x, float y, float z) => (x * y) + z;
@@ -2549,6 +2678,7 @@ public static Vector512<float> Invoke(Vector512<float> x, Vector512<float> y)
 #endif
         }
 
+        /// <summary>x</summary>
         private readonly struct IdentityOperator : IUnaryOperator
         {
             public static float Invoke(float x) => x;
@@ -2559,6 +2689,7 @@ public static Vector512<float> Invoke(Vector512<float> x, Vector512<float> y)
 #endif
         }
 
+        /// <summary>x * x</summary>
         private readonly struct SquaredOperator : IUnaryOperator
         {
             public static float Invoke(float x) => x * x;
@@ -2569,6 +2700,7 @@ public static Vector512<float> Invoke(Vector512<float> x, Vector512<float> y)
 #endif
         }
 
+        /// <summary>MathF.Abs(x)</summary>
         private readonly struct AbsoluteOperator : IUnaryOperator
         {
             public static float Invoke(float x) => MathF.Abs(x);
@@ -2579,6 +2711,7 @@ public static Vector512<float> Invoke(Vector512<float> x, Vector512<float> y)
 #endif
         }
 
+        /// <summary>MathF.Exp(x)</summary>
         private readonly struct ExpOperator : IUnaryOperator
         {
             // This code is based on `vrs4_expf` from amd/aocl-libm-ose
@@ -2859,6 +2992,7 @@ public static Vector512<float> Invoke(Vector512<float> x)
 #endif
         }
 
+        /// <summary>MathF.Log(x)</summary>
         private readonly struct LogOperator : IUnaryOperator
         {
             // This code is based on `vrs4_logf` from amd/aocl-libm-ose
@@ -3144,6 +3278,7 @@ public static Vector512<float> Invoke(Vector512<float> x)
 #endif
         }
 
+        /// <summary>MathF.Log2(x)</summary>
         private readonly struct Log2Operator : IUnaryOperator
         {
             // This code is based on `vrs4_log2f` from amd/aocl-libm-ose
@@ -3424,6 +3559,18 @@ public static Vector512<float> Invoke(Vector512<float> x)
 #endif
         }
 
+        /// <summary>Logistic sigmoid, 1f / (1f + MathF.Exp(-x)); the vector overloads obtain the exponential from <see cref="ExpOperator"/>.</summary>
+        private readonly struct SigmoidOperator : IUnaryOperator
+        {
+            public static float Invoke(float x) => 1f / (MathF.Exp(-x) + 1f);
+            public static Vector128<float> Invoke(Vector128<float> x) => Vector128<float>.One / (ExpOperator.Invoke(-x) + Vector128<float>.One);
+            public static Vector256<float> Invoke(Vector256<float> x) => Vector256<float>.One / (ExpOperator.Invoke(-x) + Vector256<float>.One);
+#if NET8_0_OR_GREATER
+            public static Vector512<float> Invoke(Vector512<float> x) => Vector512<float>.One / (ExpOperator.Invoke(-x) + Vector512<float>.One);
+#endif
+        }
+
+        /// <summary>Operator that takes one input value and returns a single value.</summary>
         private interface IUnaryOperator
         {
             static abstract float Invoke(float x);
@@ -3434,6 +3581,7 @@ private interface IUnaryOperator
 #endif
         }
 
+        /// <summary>Operator that takes two input values and returns a single value.</summary>
         private interface IBinaryOperator
         {
             static abstract float Invoke(float x, float y);
@@ -3444,6 +3592,7 @@ private interface IBinaryOperator
 #endif
         }
 
+        /// <summary><see cref="IBinaryOperator"/> that specializes horizontal aggregation of all elements in a vector.</summary>
         private interface IAggregationOperator : IBinaryOperator
         {
             static abstract float Invoke(Vector128<float> x);
@@ -3455,6 +3604,7 @@ private interface IAggregationOperator : IBinaryOperator
             static virtual float IdentityValue => throw new NotSupportedException();
         }
 
+        /// <summary>Operator that takes three input values and returns a single value.</summary>
         private interface ITernaryOperator
         {
             static abstract float Invoke(float x, float y, float z);
diff --git a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/TensorPrimitives.netstandard.cs b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/TensorPrimitives.netstandard.cs
index ae72988bbe00eb..28c5e2ac5d8bd5 100644
--- a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/TensorPrimitives.netstandard.cs
+++ b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/TensorPrimitives.netstandard.cs
@@ -9,6 +9,8 @@ namespace System.Numerics.Tensors
 {
     public static partial class TensorPrimitives
     {
+        /// <summary>Computes the cosine similarity between the two specified non-empty, equal-length tensors of single-precision floating-point numbers.</summary>
+        /// <remarks>Assumes arguments have already been validated to be non-empty and equal length.</remarks>
         private static float CosineSimilarityCore(ReadOnlySpan<float> x, ReadOnlySpan<float> y)
         {
             // Compute the same as:
@@ -52,7 +54,7 @@ private static float CosineSimilarityCore(ReadOnlySpan<float> x, ReadOnlySpan<fl
                     Vector<float> xVec = AsVector(ref xRef, x.Length - Vector<float>.Count);
                     Vector<float> yVec = AsVector(ref yRef, x.Length - Vector<float>.Count);
 
-                    Vector<float> remainderMask = LoadRemainderMaskSingleVector(x.Length - i);
+                    Vector<float> remainderMask = CreateRemainderMaskSingleVector(x.Length - i);
                     xVec &= remainderMask;
                     yVec &= remainderMask;
 
@@ -85,10 +87,16 @@ private static float CosineSimilarityCore(ReadOnlySpan<float> x, ReadOnlySpan<fl
             return dotProduct / (MathF.Sqrt(xSumOfSquares) * MathF.Sqrt(ySumOfSquares));
         }
 
-        private static float Aggregate<TLoad, TAggregate>(
-            ReadOnlySpan<float> x, TLoad load = default, TAggregate aggregate = default)
-            where TLoad : struct, IUnaryOperator
-            where TAggregate : struct, IAggregationOperator
+        /// <summary>Performs an aggregation over all elements in <paramref name="x"/> to produce a single-precision floating-point value.</summary>
+        /// <typeparam name="TTransformOperator">Specifies the transform operation that should be applied to each element loaded from <paramref name="x"/>.</typeparam>
+        /// <typeparam name="TAggregationOperator">
+        /// Specifies the aggregation binary operation that should be applied to multiple values to aggregate them into a single value.
+        /// The aggregation is applied after the transform is applied to each element.
+        /// </typeparam>
+        private static float Aggregate<TTransformOperator, TAggregationOperator>(
+            ReadOnlySpan<float> x, TTransformOperator transformOp = default, TAggregationOperator aggregationOp = default)
+            where TTransformOperator : struct, IUnaryOperator
+            where TAggregationOperator : struct, IAggregationOperator
         {
             if (x.Length == 0)
             {
@@ -97,12 +105,12 @@ private static float Aggregate<TLoad, TAggregate>(
 
             float result;
 
-            if (Vector.IsHardwareAccelerated && load.CanVectorize && x.Length >= Vector<float>.Count)
+            if (Vector.IsHardwareAccelerated && transformOp.CanVectorize && x.Length >= Vector<float>.Count)
             {
                 ref float xRef = ref MemoryMarshal.GetReference(x);
 
                 // Load the first vector as the initial set of results
-                Vector<float> resultVector = load.Invoke(AsVector(ref xRef, 0));
+                Vector<float> resultVector = transformOp.Invoke(AsVector(ref xRef, 0));
                 int oneVectorFromEnd = x.Length - Vector<float>.Count;
                 int i = Vector<float>.Count;
 
@@ -110,44 +118,50 @@ private static float Aggregate<TLoad, TAggregate>(
                 // least one full vector left to process.
                 while (i <= oneVectorFromEnd)
                 {
-                    resultVector = aggregate.Invoke(resultVector, load.Invoke(AsVector(ref xRef, i)));
+                    resultVector = aggregationOp.Invoke(resultVector, transformOp.Invoke(AsVector(ref xRef, i)));
                     i += Vector<float>.Count;
                 }
 
                 // Process the last vector in the span, masking off elements already processed.
                 if (i != x.Length)
                 {
-                    resultVector = aggregate.Invoke(resultVector,
+                    resultVector = aggregationOp.Invoke(resultVector,
                         Vector.ConditionalSelect(
-                            Vector.Equals(LoadRemainderMaskSingleVector(x.Length - i), Vector<float>.Zero),
-                            new Vector<float>(aggregate.IdentityValue),
-                            load.Invoke(AsVector(ref xRef, x.Length - Vector<float>.Count))));
+                            Vector.Equals(CreateRemainderMaskSingleVector(x.Length - i), Vector<float>.Zero),
+                            new Vector<float>(aggregationOp.IdentityValue),
+                            transformOp.Invoke(AsVector(ref xRef, x.Length - Vector<float>.Count))));
                 }
 
                 // Aggregate the lanes in the vector back into the scalar result
                 result = resultVector[0];
                 for (int f = 1; f < Vector<float>.Count; f++)
                 {
-                    result = aggregate.Invoke(result, resultVector[f]);
+                    result = aggregationOp.Invoke(result, resultVector[f]);
                 }
 
                 return result;
             }
 
             // Aggregate the remaining items in the input span.
-            result = load.Invoke(x[0]);
+            result = transformOp.Invoke(x[0]);
             for (int i = 1; i < x.Length; i++)
             {
-                result = aggregate.Invoke(result, load.Invoke(x[i]));
+                result = aggregationOp.Invoke(result, transformOp.Invoke(x[i]));
             }
 
             return result;
         }
 
-        private static float Aggregate<TBinary, TAggregate>(
-            ReadOnlySpan<float> x, ReadOnlySpan<float> y, TBinary binary = default, TAggregate aggregate = default)
-            where TBinary : struct, IBinaryOperator
-            where TAggregate : struct, IAggregationOperator
+        /// <summary>Performs an aggregation over all pair-wise elements in <paramref name="x"/> and <paramref name="y"/> to produce a single-precision floating-point value.</summary>
+        /// <typeparam name="TBinaryOperator">Specifies the binary operation that should be applied to the pair-wise elements loaded from <paramref name="x"/> and <paramref name="y"/>.</typeparam>
+        /// <typeparam name="TAggregationOperator">
+        /// Specifies the aggregation binary operation that should be applied to multiple values to aggregate them into a single value.
+        /// The aggregation is applied to the results of the binary operations on the pair-wise values.
+        /// </typeparam>
+        private static float Aggregate<TBinaryOperator, TAggregationOperator>(
+            ReadOnlySpan<float> x, ReadOnlySpan<float> y, TBinaryOperator binaryOp = default, TAggregationOperator aggregationOp = default)
+            where TBinaryOperator : struct, IBinaryOperator
+            where TAggregationOperator : struct, IAggregationOperator
         {
             Debug.Assert(x.Length == y.Length);
 
@@ -164,7 +178,7 @@ private static float Aggregate<TBinary, TAggregate>(
             if (Vector.IsHardwareAccelerated && x.Length >= Vector<float>.Count)
             {
                 // Load the first vector as the initial set of results
-                Vector<float> resultVector = binary.Invoke(AsVector(ref xRef, 0), AsVector(ref yRef, 0));
+                Vector<float> resultVector = binaryOp.Invoke(AsVector(ref xRef, 0), AsVector(ref yRef, 0));
                 int oneVectorFromEnd = x.Length - Vector<float>.Count;
                 int i = Vector<float>.Count;
 
@@ -172,18 +186,18 @@ private static float Aggregate<TBinary, TAggregate>(
                 // least one full vector left to process.
                 while (i <= oneVectorFromEnd)
                 {
-                    resultVector = aggregate.Invoke(resultVector, binary.Invoke(AsVector(ref xRef, i), AsVector(ref yRef, i)));
+                    resultVector = aggregationOp.Invoke(resultVector, binaryOp.Invoke(AsVector(ref xRef, i), AsVector(ref yRef, i)));
                     i += Vector<float>.Count;
                 }
 
                 // Process the last vector in the spans, masking off elements already processed.
                 if (i != x.Length)
                 {
-                    resultVector = aggregate.Invoke(resultVector,
+                    resultVector = aggregationOp.Invoke(resultVector,
                         Vector.ConditionalSelect(
-                            Vector.Equals(LoadRemainderMaskSingleVector(x.Length - i), Vector<float>.Zero),
-                            new Vector<float>(aggregate.IdentityValue),
-                            binary.Invoke(
+                            Vector.Equals(CreateRemainderMaskSingleVector(x.Length - i), Vector<float>.Zero),
+                            new Vector<float>(aggregationOp.IdentityValue),
+                            binaryOp.Invoke(
                                 AsVector(ref xRef, x.Length - Vector<float>.Count),
                                 AsVector(ref yRef, x.Length - Vector<float>.Count))));
                 }
@@ -192,23 +206,28 @@ private static float Aggregate<TBinary, TAggregate>(
                 result = resultVector[0];
                 for (int f = 1; f < Vector<float>.Count; f++)
                 {
-                    result = aggregate.Invoke(result, resultVector[f]);
+                    result = aggregationOp.Invoke(result, resultVector[f]);
                 }
 
                 return result;
             }
 
             // Aggregate the remaining items in the input span.
-            result = binary.Invoke(x[0], y[0]);
+            result = binaryOp.Invoke(x[0], y[0]);
             for (int i = 1; i < x.Length; i++)
             {
-                result = aggregate.Invoke(result, binary.Invoke(x[i], y[i]));
+                result = aggregationOp.Invoke(result, binaryOp.Invoke(x[i], y[i]));
             }
 
             return result;
         }
 
-        private static float MinMaxCore<TMinMax>(ReadOnlySpan<float> x, TMinMax minMax = default) where TMinMax : struct, IBinaryOperator
+        /// <remarks>
+        /// This is the same as <see cref="Aggregate{TTransformOperator, TAggregationOperator}(ReadOnlySpan{float}, TTransformOperator, TAggregationOperator)"/>
+        /// with an identity transform, except it early exits on NaN.
+        /// </remarks>
+        private static float MinMaxCore<TMinMaxOperator>(ReadOnlySpan<float> x, TMinMaxOperator op = default)
+            where TMinMaxOperator : struct, IBinaryOperator
         {
             if (x.IsEmpty)
             {
@@ -245,7 +264,7 @@ private static float MinMaxCore<TMinMax>(ReadOnlySpan<float> x, TMinMax minMax =
                             goto Scalar;
                         }
 
-                        resultVector = minMax.Invoke(resultVector, current);
+                        resultVector = op.Invoke(resultVector, current);
                         i += Vector<float>.Count;
                     }
 
@@ -258,13 +277,13 @@ private static float MinMaxCore<TMinMax>(ReadOnlySpan<float> x, TMinMax minMax =
                             goto Scalar;
                         }
 
-                        resultVector = minMax.Invoke(resultVector, current);
+                        resultVector = op.Invoke(resultVector, current);
                     }
 
                     // Aggregate the lanes in the vector to create the final scalar result.
                     for (int f = 0; f < Vector<float>.Count; f++)
                     {
-                        result = minMax.Invoke(result, resultVector[f]);
+                        result = op.Invoke(result, resultVector[f]);
                     }
 
                     return result;
@@ -283,12 +302,14 @@ private static float MinMaxCore<TMinMax>(ReadOnlySpan<float> x, TMinMax minMax =
                     return current;
                 }
 
-                result = minMax.Invoke(result, current);
+                result = op.Invoke(result, current);
             }
 
             return result;
         }
 
+        /// <summary>Performs an element-wise operation on <paramref name="x"/> and writes the results to <paramref name="destination"/>.</summary>
+        /// <typeparam name="TUnaryOperator">Specifies the operation to perform on each element loaded from <paramref name="x"/>.</typeparam>
         private static void InvokeSpanIntoSpan<TUnaryOperator>(
             ReadOnlySpan<float> x, Span<float> destination, TUnaryOperator op = default)
             where TUnaryOperator : struct, IUnaryOperator
@@ -324,7 +345,7 @@ private static void InvokeSpanIntoSpan<TUnaryOperator>(
                         int lastVectorIndex = x.Length - Vector<float>.Count;
                         ref Vector<float> dest = ref AsVector(ref dRef, lastVectorIndex);
                         dest = Vector.ConditionalSelect(
-                            Vector.Equals(LoadRemainderMaskSingleVector(x.Length - i), Vector<float>.Zero),
+                            Vector.Equals(CreateRemainderMaskSingleVector(x.Length - i), Vector<float>.Zero),
                             dest,
                             op.Invoke(AsVector(ref xRef, lastVectorIndex)));
                     }
@@ -342,6 +363,13 @@ private static void InvokeSpanIntoSpan<TUnaryOperator>(
             }
         }
 
+        /// <summary>
+        /// Performs an element-wise operation on <paramref name="x"/> and <paramref name="y"/>,
+        /// and writes the results to <paramref name="destination"/>.
+        /// </summary>
+        /// <typeparam name="TBinaryOperator">
+        /// Specifies the operation to perform on the pair-wise elements loaded from <paramref name="x"/> and <paramref name="y"/>.
+        /// </typeparam>
         private static void InvokeSpanSpanIntoSpan<TBinaryOperator>(
             ReadOnlySpan<float> x, ReadOnlySpan<float> y, Span<float> destination, TBinaryOperator op = default)
             where TBinaryOperator : struct, IBinaryOperator
@@ -385,7 +413,7 @@ private static void InvokeSpanSpanIntoSpan<TBinaryOperator>(
                         int lastVectorIndex = x.Length - Vector<float>.Count;
                         ref Vector<float> dest = ref AsVector(ref dRef, lastVectorIndex);
                         dest = Vector.ConditionalSelect(
-                            Vector.Equals(LoadRemainderMaskSingleVector(x.Length - i), Vector<float>.Zero),
+                            Vector.Equals(CreateRemainderMaskSingleVector(x.Length - i), Vector<float>.Zero),
                             dest,
                             op.Invoke(AsVector(ref xRef, lastVectorIndex),
                                       AsVector(ref yRef, lastVectorIndex)));
@@ -404,8 +432,32 @@ private static void InvokeSpanSpanIntoSpan<TBinaryOperator>(
             }
         }
 
+        /// <summary>
+        /// Performs an element-wise operation on each element of <paramref name="x"/> with the scalar <paramref name="y"/>, and writes
+        /// the results to <paramref name="destination"/>. Forwards to the transform-taking overload with <see cref="IdentityOperator"/>.
+        /// </summary>
+        /// <typeparam name="TBinaryOperator">
+        /// Specifies the operation to perform on each element loaded from <paramref name="x"/> with <paramref name="y"/>.
+        /// </typeparam>
         private static void InvokeSpanScalarIntoSpan<TBinaryOperator>(
             ReadOnlySpan<float> x, float y, Span<float> destination, TBinaryOperator op = default)
+            where TBinaryOperator : struct, IBinaryOperator =>
+            InvokeSpanScalarIntoSpan<IdentityOperator, TBinaryOperator>(x, y, destination, default, op);
+
+        /// <summary>
+        /// Applies a transform to each element of <paramref name="x"/>, performs an element-wise operation on the transformed
+        /// value and <paramref name="y"/>, and writes the results to <paramref name="destination"/>.
+        /// </summary>
+        /// <typeparam name="TTransformOperator">
+        /// Specifies the operation to perform on each element loaded from <paramref name="x"/>.
+        /// It is not used with <paramref name="y"/>.
+        /// </typeparam>
+        /// <typeparam name="TBinaryOperator">
+        /// Specifies the operation to perform on the transformed value from <paramref name="x"/> with <paramref name="y"/>.
+        /// </typeparam>
+        private static void InvokeSpanScalarIntoSpan<TTransformOperator, TBinaryOperator>(
+            ReadOnlySpan<float> x, float y, Span<float> destination, TTransformOperator xTransformOp = default, TBinaryOperator binaryOp = default)
+            where TTransformOperator : struct, IUnaryOperator
             where TBinaryOperator : struct, IBinaryOperator
         {
             if (x.Length > destination.Length)
@@ -419,7 +471,7 @@ private static void InvokeSpanScalarIntoSpan<TBinaryOperator>(
             ref float dRef = ref MemoryMarshal.GetReference(destination);
             int i = 0, oneVectorFromEnd;
 
-            if (Vector.IsHardwareAccelerated)
+            if (Vector.IsHardwareAccelerated && xTransformOp.CanVectorize)
             {
                 oneVectorFromEnd = x.Length - Vector<float>.Count;
                 if (oneVectorFromEnd >= 0)
@@ -428,7 +480,7 @@ private static void InvokeSpanScalarIntoSpan<TBinaryOperator>(
                     Vector<float> yVec = new(y);
                     do
                     {
-                        AsVector(ref dRef, i) = op.Invoke(AsVector(ref xRef, i),
+                        AsVector(ref dRef, i) = binaryOp.Invoke(xTransformOp.Invoke(AsVector(ref xRef, i)),
                                                           yVec);
 
                         i += Vector<float>.Count;
@@ -441,9 +493,9 @@ private static void InvokeSpanScalarIntoSpan<TBinaryOperator>(
                         int lastVectorIndex = x.Length - Vector<float>.Count;
                         ref Vector<float> dest = ref AsVector(ref dRef, lastVectorIndex);
                         dest = Vector.ConditionalSelect(
-                            Vector.Equals(LoadRemainderMaskSingleVector(x.Length - i), Vector<float>.Zero),
+                            Vector.Equals(CreateRemainderMaskSingleVector(x.Length - i), Vector<float>.Zero),
                             dest,
-                            op.Invoke(AsVector(ref xRef, lastVectorIndex), yVec));
+                            binaryOp.Invoke(xTransformOp.Invoke(AsVector(ref xRef, lastVectorIndex)), yVec));
                     }
 
                     return;
@@ -453,13 +505,21 @@ private static void InvokeSpanScalarIntoSpan<TBinaryOperator>(
             // Loop handling one element at a time.
             while (i < x.Length)
             {
-                Unsafe.Add(ref dRef, i) = op.Invoke(Unsafe.Add(ref xRef, i),
+                Unsafe.Add(ref dRef, i) = binaryOp.Invoke(xTransformOp.Invoke(Unsafe.Add(ref xRef, i)),
                                                     y);
 
                 i++;
             }
         }
 
+        /// <summary>
+        /// Performs an element-wise operation on <paramref name="x"/>, <paramref name="y"/>, and <paramref name="z"/>,
+        /// and writes the results to <paramref name="destination"/>.
+        /// </summary>
+        /// <typeparam name="TTernaryOperator">
+        /// Specifies the operation to perform on the pair-wise elements loaded from <paramref name="x"/>, <paramref name="y"/>,
+        /// and <paramref name="z"/>.
+        /// </typeparam>
         private static void InvokeSpanSpanSpanIntoSpan<TTernaryOperator>(
             ReadOnlySpan<float> x, ReadOnlySpan<float> y, ReadOnlySpan<float> z, Span<float> destination, TTernaryOperator op = default)
             where TTernaryOperator : struct, ITernaryOperator
@@ -506,7 +566,7 @@ private static void InvokeSpanSpanSpanIntoSpan<TTernaryOperator>(
                         int lastVectorIndex = x.Length - Vector<float>.Count;
                         ref Vector<float> dest = ref AsVector(ref dRef, lastVectorIndex);
                         dest = Vector.ConditionalSelect(
-                            Vector.Equals(LoadRemainderMaskSingleVector(x.Length - i), Vector<float>.Zero),
+                            Vector.Equals(CreateRemainderMaskSingleVector(x.Length - i), Vector<float>.Zero),
                             dest,
                             op.Invoke(AsVector(ref xRef, lastVectorIndex),
                                       AsVector(ref yRef, lastVectorIndex),
@@ -528,6 +588,14 @@ private static void InvokeSpanSpanSpanIntoSpan<TTernaryOperator>(
             }
         }
 
+        /// <summary>
+        /// Performs an element-wise operation on <paramref name="x"/>, <paramref name="y"/>, and <paramref name="z"/>,
+        /// and writes the results to <paramref name="destination"/>.
+        /// </summary>
+        /// <typeparam name="TTernaryOperator">
+        /// Specifies the operation to perform on the pair-wise elements loaded from <paramref name="x"/> and <paramref name="y"/>
+        /// with <paramref name="z"/>.
+        /// </typeparam>
         private static void InvokeSpanSpanScalarIntoSpan<TTernaryOperator>(
             ReadOnlySpan<float> x, ReadOnlySpan<float> y, float z, Span<float> destination, TTernaryOperator op = default)
             where TTernaryOperator : struct, ITernaryOperator
@@ -574,7 +642,7 @@ private static void InvokeSpanSpanScalarIntoSpan<TTernaryOperator>(
                         int lastVectorIndex = x.Length - Vector<float>.Count;
                         ref Vector<float> dest = ref AsVector(ref dRef, lastVectorIndex);
                         dest = Vector.ConditionalSelect(
-                            Vector.Equals(LoadRemainderMaskSingleVector(x.Length - i), Vector<float>.Zero),
+                            Vector.Equals(CreateRemainderMaskSingleVector(x.Length - i), Vector<float>.Zero),
                             dest,
                             op.Invoke(AsVector(ref xRef, lastVectorIndex),
                                       AsVector(ref yRef, lastVectorIndex),
@@ -596,6 +664,14 @@ private static void InvokeSpanSpanScalarIntoSpan<TTernaryOperator>(
             }
         }
 
+        /// <summary>
+        /// Performs an element-wise operation on <paramref name="x"/>, <paramref name="y"/>, and <paramref name="z"/>,
+        /// and writes the results to <paramref name="destination"/>.
+        /// </summary>
+        /// <typeparam name="TTernaryOperator">
+        /// Specifies the operation to perform on the pair-wise elements loaded from <paramref name="x"/> and <paramref name="z"/>,
+        /// with <paramref name="y"/>.
+        /// </typeparam>
         private static void InvokeSpanScalarSpanIntoSpan<TTernaryOperator>(
             ReadOnlySpan<float> x, float y, ReadOnlySpan<float> z, Span<float> destination, TTernaryOperator op = default)
             where TTernaryOperator : struct, ITernaryOperator
@@ -642,7 +718,7 @@ private static void InvokeSpanScalarSpanIntoSpan<TTernaryOperator>(
                         int lastVectorIndex = x.Length - Vector<float>.Count;
                         ref Vector<float> dest = ref AsVector(ref dRef, lastVectorIndex);
                         dest = Vector.ConditionalSelect(
-                            Vector.Equals(LoadRemainderMaskSingleVector(x.Length - i), Vector<float>.Zero),
+                            Vector.Equals(CreateRemainderMaskSingleVector(x.Length - i), Vector<float>.Zero),
                             dest,
                             op.Invoke(AsVector(ref xRef, lastVectorIndex),
                                       yVec,
@@ -664,27 +740,36 @@ private static void InvokeSpanScalarSpanIntoSpan<TTernaryOperator>(
             }
         }
 
+        /// <summary>Gets a reference to a <see cref="Vector{Single}"/> that begins at the specified element <paramref name="offset"/> from <paramref name="start"/>.</summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static ref Vector<float> AsVector(ref float start, int offset) =>
             ref Unsafe.As<float, Vector<float>>(
                 ref Unsafe.Add(ref start, offset));
 
+        /// <summary>Gets whether the specified <see cref="float"/> is negative, determined by whether its sign bit is set.</summary>
         private static unsafe bool IsNegative(float f) => *(int*)&f < 0;
 
+        /// <summary>Gets whether each specified <see cref="float"/> is negative, by comparing each element's bits, reinterpreted as an <see cref="int"/>, against zero; the comparison result is returned reinterpreted as floats.</summary>
         private static unsafe Vector<float> IsNegative(Vector<float> f) =>
             (Vector<float>)Vector.LessThan((Vector<int>)f, Vector<int>.Zero);
 
+        /// <summary>Gets the base 2 logarithm of <paramref name="x"/>.</summary>
         private static float Log2(float x) => MathF.Log(x, 2);
 
-        private static unsafe Vector<float> LoadRemainderMaskSingleVector(int validItems)
+        /// <summary>
+        /// Gets a vector mask, read from <c>RemainderUInt32Mask_16x16</c>, that will be all-ones-set for the
+        /// last <paramref name="count"/> elements and zero for all other elements.
+        /// </summary>
+        private static unsafe Vector<float> CreateRemainderMaskSingleVector(int count)
         {
             Debug.Assert(Vector<float>.Count is 4 or 8 or 16);
 
             return AsVector(
                 ref Unsafe.As<uint, float>(ref MemoryMarshal.GetReference(RemainderUInt32Mask_16x16)),
-                (validItems * 16) + (16 - Vector<float>.Count));
+                (count * 16) + (16 - Vector<float>.Count));
         }
 
+        /// <summary>x + y</summary>
         private readonly struct AddOperator : IAggregationOperator
         {
             public float Invoke(float x, float y) => x + y;
@@ -692,12 +777,14 @@ ref Unsafe.As<uint, float>(ref MemoryMarshal.GetReference(RemainderUInt32Mask_16
             public float IdentityValue => 0;
         }
 
+        /// <summary>x - y</summary>
         private readonly struct SubtractOperator : IBinaryOperator
         {
             public float Invoke(float x, float y) => x - y;
             public Vector<float> Invoke(Vector<float> x, Vector<float> y) => x - y;
         }
 
+        /// <summary>(x - y) * (x - y)</summary>
         private readonly struct SubtractSquaredOperator : IBinaryOperator
         {
             public float Invoke(float x, float y)
@@ -713,6 +800,7 @@ public Vector<float> Invoke(Vector<float> x, Vector<float> y)
             }
         }
 
+        /// <summary>x * y</summary>
         private readonly struct MultiplyOperator : IAggregationOperator
         {
             public float Invoke(float x, float y) => x * y;
@@ -720,12 +808,14 @@ public Vector<float> Invoke(Vector<float> x, Vector<float> y)
             public float IdentityValue => 1;
         }
 
+        /// <summary>x / y</summary>
         private readonly struct DivideOperator : IBinaryOperator
         {
             public float Invoke(float x, float y) => x / y;
             public Vector<float> Invoke(Vector<float> x, Vector<float> y) => x / y;
         }
 
+        /// <summary>MathF.Max(x, y) (but NaNs may not be propagated)</summary>
         private readonly struct MaxOperator : IBinaryOperator
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -741,6 +831,7 @@ public Vector<float> Invoke(Vector<float> x, Vector<float> y) =>
                     Vector.Max(x, y));
         }
 
+        /// <summary>MathF.Max(x, y)</summary>
         private readonly struct MaxPropagateNaNOperator : IBinaryOperator
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -757,6 +848,7 @@ public Vector<float> Invoke(Vector<float> x, Vector<float> y) =>
                     x);
         }
 
+        /// <summary>Operator to get x or y based on which has the larger MathF.Abs (but NaNs may not be propagated)</summary>
         private readonly struct MaxMagnitudeOperator : IBinaryOperator
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -780,6 +872,7 @@ public Vector<float> Invoke(Vector<float> x, Vector<float> y)
             }
         }
 
+        /// <summary>Operator to get x or y based on which has the larger MathF.Abs</summary>
         private readonly struct MaxMagnitudePropagateNaNOperator : IBinaryOperator
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -804,6 +897,7 @@ public Vector<float> Invoke(Vector<float> x, Vector<float> y)
             }
         }
 
+        /// <summary>MathF.Min(x, y) (but NaNs may not be propagated)</summary>
         private readonly struct MinOperator : IBinaryOperator
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -819,6 +913,7 @@ public Vector<float> Invoke(Vector<float> x, Vector<float> y) =>
                     Vector.Min(x, y));
         }
 
+        /// <summary>MathF.Min(x, y)</summary>
         private readonly struct MinPropagateNaNOperator : IBinaryOperator
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -835,6 +930,7 @@ public Vector<float> Invoke(Vector<float> x, Vector<float> y) =>
                     x);
         }
 
+        /// <summary>Operator to get x or y based on which has the smaller MathF.Abs (but NaNs may not be propagated)</summary>
         private readonly struct MinMagnitudeOperator : IBinaryOperator
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -858,6 +954,7 @@ public Vector<float> Invoke(Vector<float> x, Vector<float> y)
             }
         }
 
+        /// <summary>Operator to get x or y based on which has the smaller MathF.Abs</summary>
         private readonly struct MinMagnitudePropagateNaNOperator : IBinaryOperator
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -883,6 +980,7 @@ public Vector<float> Invoke(Vector<float> x, Vector<float> y)
             }
         }
 
+        /// <summary>-x</summary>
         private readonly struct NegateOperator : IUnaryOperator
         {
             public bool CanVectorize => true;
@@ -890,18 +988,21 @@ public Vector<float> Invoke(Vector<float> x, Vector<float> y)
             public Vector<float> Invoke(Vector<float> x) => -x;
         }
 
+        /// <summary>(x + y) * z</summary>
         private readonly struct AddMultiplyOperator : ITernaryOperator
         {
             public float Invoke(float x, float y, float z) => (x + y) * z;
             public Vector<float> Invoke(Vector<float> x, Vector<float> y, Vector<float> z) => (x + y) * z;
         }
 
+        /// <summary>(x * y) + z</summary>
         private readonly struct MultiplyAddOperator : ITernaryOperator
         {
             public float Invoke(float x, float y, float z) => (x * y) + z;
             public Vector<float> Invoke(Vector<float> x, Vector<float> y, Vector<float> z) => (x * y) + z;
         }
 
+        /// <summary>x</summary>
         private readonly struct IdentityOperator : IUnaryOperator
         {
             public bool CanVectorize => true;
@@ -909,6 +1010,7 @@ public Vector<float> Invoke(Vector<float> x, Vector<float> y)
             public Vector<float> Invoke(Vector<float> x) => x;
         }
 
+        /// <summary>x * x</summary>
         private readonly struct SquaredOperator : IUnaryOperator
         {
             public bool CanVectorize => true;
@@ -916,6 +1018,7 @@ public Vector<float> Invoke(Vector<float> x, Vector<float> y)
             public Vector<float> Invoke(Vector<float> x) => x * x;
         }
 
+        /// <summary>MathF.Abs(x)</summary>
         private readonly struct AbsoluteOperator : IUnaryOperator
         {
             public bool CanVectorize => true;
@@ -926,42 +1029,43 @@ public Vector<float> Invoke(Vector<float> x, Vector<float> y)
         private readonly struct ExpOperator : IUnaryOperator
         {
             public bool CanVectorize => false;
-
             public float Invoke(float x) => MathF.Exp(x);
-
-            public Vector<float> Invoke(Vector<float> x)
-            {
-                // Vectorizing requires shift left support, which is .NET 7 or later
+            public Vector<float> Invoke(Vector<float> x) =>
+                // requires ShiftLeft (.NET 7+)
                 throw new NotImplementedException();
-            }
         }
 
+        /// <summary>MathF.Log(x)</summary>
         private readonly struct LogOperator : IUnaryOperator
         {
             public bool CanVectorize => false;
-
             public float Invoke(float x) => MathF.Log(x);
-
-            public Vector<float> Invoke(Vector<float> x)
-            {
-                // Vectorizing requires shift right support, which is .NET 7 or later
+            public Vector<float> Invoke(Vector<float> x) =>
+                // requires ShiftRightArithmetic (.NET 7+)
                 throw new NotImplementedException();
-            }
         }
 
+        /// <summary>MathF.Log2(x)</summary>
         private readonly struct Log2Operator : IUnaryOperator
         {
             public bool CanVectorize => false;
-
             public float Invoke(float x) => Log2(x);
+            public Vector<float> Invoke(Vector<float> x) =>
+                // requires ShiftRightArithmetic (.NET 7+)
+                throw new NotImplementedException();
+        }
 
-            public Vector<float> Invoke(Vector<float> x)
-            {
-                // Vectorizing requires shift right support, which is .NET 7 or later
+        /// <summary>1f / (1f + MathF.Exp(-x))</summary>
+        private readonly struct SigmoidOperator : IUnaryOperator
+        {
+            public bool CanVectorize => false;
+            public float Invoke(float x) => 1.0f / (1.0f + MathF.Exp(-x));
+            public Vector<float> Invoke(Vector<float> x) =>
+                // requires ShiftLeft (.NET 7+), as Sigmoid is built on top of Exp
                 throw new NotImplementedException();
-            }
         }
 
+        /// <summary>Operator that takes one input value and returns a single value.</summary>
         private interface IUnaryOperator
         {
             bool CanVectorize { get; }
@@ -969,17 +1073,20 @@ private interface IUnaryOperator
             Vector<float> Invoke(Vector<float> x);
         }
 
+        /// <summary>Operator that takes two input values and returns a single value.</summary>
         private interface IBinaryOperator
         {
             float Invoke(float x, float y);
             Vector<float> Invoke(Vector<float> x, Vector<float> y);
         }
 
+        /// <summary><see cref="IBinaryOperator"/> that specializes horizontal aggregation of all elements in a vector.</summary>
         private interface IAggregationOperator : IBinaryOperator
         {
             float IdentityValue { get; }
         }
 
+        /// <summary>Operator that takes three input values and returns a single value.</summary>
         private interface ITernaryOperator
         {
             float Invoke(float x, float y, float z);
diff --git a/src/libraries/System.Numerics.Tensors/tests/TensorPrimitivesTests.cs b/src/libraries/System.Numerics.Tensors/tests/TensorPrimitivesTests.cs
index 652fb07d3fa385..23f39f1bf6b0b7 100644
--- a/src/libraries/System.Numerics.Tensors/tests/TensorPrimitivesTests.cs
+++ b/src/libraries/System.Numerics.Tensors/tests/TensorPrimitivesTests.cs
@@ -58,30 +58,56 @@ private static unsafe float MathFMinMagnitude(float x, float y)
             return (ax < ay) || float.IsNaN(ax) || (ax == ay && *(int*)&x < 0) ? x : y;
         }
 
-        private static unsafe int SingleToInt32(float f) => *(int*)&f;
+        private static unsafe float UInt32ToSingle(uint i) => *(float*)&i;
 
-        private static unsafe float Int32ToSingle(int i) => *(float*)&i;
-
-        private static float AnotherSingleNaN = Int32ToSingle(-8388607);
-
-        /// <summary>Loads a variety of special values (e.g. NaN) into random positions in <paramref name="x"/>.</summary>
-        private static void SetSpecialValues(Span<float> x)
+        /// <summary>Gets a variety of special values (e.g. NaN).</summary>
+        private static IEnumerable<float> GetSpecialValues()
         {
             // NaN
-            x[s_random.Next(x.Length)] = float.NaN;
-            x[s_random.Next(x.Length)] = AnotherSingleNaN;
+            yield return UInt32ToSingle(0xFFC0_0000); // -qNaN / float.NaN
+            yield return UInt32ToSingle(0xFFFF_FFFF); // -qNaN / all-bits-set
+            yield return UInt32ToSingle(0x7FC0_0000); // +qNaN
+            yield return UInt32ToSingle(0xFFA0_0000); // -sNaN
+            yield return UInt32ToSingle(0x7FA0_0000); // +sNaN
 
             // +Infinity, -Infinity
-            x[s_random.Next(x.Length)] = float.PositiveInfinity;
-            x[s_random.Next(x.Length)] = float.NegativeInfinity;
+            yield return float.PositiveInfinity;
+            yield return float.NegativeInfinity;
 
             // +Zero, -Zero
-            x[s_random.Next(x.Length)] = +0.0f;
-            x[s_random.Next(x.Length)] = -0.0f;
+            yield return +0.0f;
+            yield return -0.0f;
 
-            // +Epsilon, -Epsilon
-            x[s_random.Next(x.Length)] = +float.Epsilon;
-            x[s_random.Next(x.Length)] = -float.Epsilon;
+            // Subnormals
+            yield return +float.Epsilon;
+            yield return -float.Epsilon;
+            yield return UInt32ToSingle(0x007F_FFFF);
+            yield return UInt32ToSingle(0x807F_FFFF);
+
+            // Normals
+            yield return UInt32ToSingle(0x0080_0000);
+            yield return UInt32ToSingle(0x8080_0000);
+            yield return UInt32ToSingle(0x7F7F_FFFF);
+            yield return UInt32ToSingle(0x8F7F_FFFF);
+        }
+
+        /// <summary>
+        /// Runs the specified action for each special value. Before the action is invoked,
+        /// the value is stored into a random position in <paramref name="x"/>, and the original
+        /// value is subsequently restored.
+        /// </summary>
+        private static void RunForEachSpecialValue(Action action, BoundedMemory<float> x)
+        {
+            foreach (float value in GetSpecialValues())
+            {
+                int pos = s_random.Next(x.Length);
+                float orig = x[pos];
+                x[pos] = value;
+
+                action();
+
+                x[pos] = orig;
+            }
         }
 
         /// <summary>
@@ -95,7 +121,7 @@ private static void SetSpecialValues(Span<float> x, Span<float> y)
             // NaNs
             pos = s_random.Next(x.Length);
             x[pos] = float.NaN;
-            y[pos] = AnotherSingleNaN;
+            y[pos] = UInt32ToSingle(0x7FC0_0000);
 
             // +Infinity, -Infinity
             pos = s_random.Next(x.Length);
@@ -835,6 +861,23 @@ public static void Exp_InPlace(int tensorLength)
             }
         }
 
+        [Theory]
+        [MemberData(nameof(TensorLengths))]
+        public static void Exp_SpecialValues(int tensorLength)
+        {
+            using BoundedMemory<float> x = CreateAndFillTensor(tensorLength);
+            using BoundedMemory<float> destination = CreateTensor(tensorLength);
+
+            RunForEachSpecialValue(() =>
+            {
+                TensorPrimitives.Exp(x, destination);
+                for (int i = 0; i < tensorLength; i++)
+                {
+                    Assert.Equal(MathF.Exp(x[i]), destination[i], Tolerance);
+                }
+            }, x);
+        }
+
         [Theory]
         [MemberData(nameof(TensorLengths))]
         public static void Exp_ThrowsForTooShortDestination(int tensorLength)
@@ -1073,13 +1116,14 @@ public static void Log_SpecialValues(int tensorLength)
             using BoundedMemory<float> x = CreateAndFillTensor(tensorLength);
             using BoundedMemory<float> destination = CreateTensor(tensorLength);
 
-            SetSpecialValues(x);
-
-            TensorPrimitives.Log(x, destination);
-            for (int i = 0; i < tensorLength; i++)
+            RunForEachSpecialValue(() =>
             {
-                Assert.Equal(MathF.Log(x[i]), destination[i], Tolerance);
-            }
+                TensorPrimitives.Log(x, destination);
+                for (int i = 0; i < tensorLength; i++)
+                {
+                    Assert.Equal(MathF.Log(x[i]), destination[i], Tolerance);
+                }
+            }, x);
         }
 
         [Theory]
@@ -1139,13 +1183,14 @@ public static void Log2_SpecialValues(int tensorLength)
             using BoundedMemory<float> x = CreateAndFillTensor(tensorLength);
             using BoundedMemory<float> destination = CreateTensor(tensorLength);
 
-            SetSpecialValues(x);
-
-            TensorPrimitives.Log2(x, destination);
-            for (int i = 0; i < tensorLength; i++)
+            RunForEachSpecialValue(() =>
             {
-                Assert.Equal(MathF.Log(x[i], 2), destination[i], Tolerance);
-            }
+                TensorPrimitives.Log2(x, destination);
+                for (int i = 0; i < tensorLength; i++)
+                {
+                    Assert.Equal(MathF.Log(x[i], 2), destination[i], Tolerance);
+                }
+            }, x);
         }
 
         [Theory]
@@ -2271,12 +2316,19 @@ public static void Sigmoid_InPlace(int tensorLength)
 
         [Theory]
         [MemberData(nameof(TensorLengths))]
-        public static void Sigmoid_ThrowsForTooShortDestination(int tensorLength)
+        public static void Sigmoid_SpecialValues(int tensorLength)
         {
             using BoundedMemory<float> x = CreateAndFillTensor(tensorLength);
-            using BoundedMemory<float> destination = CreateTensor(tensorLength - 1);
+            using BoundedMemory<float> destination = CreateTensor(tensorLength);
 
-            AssertExtensions.Throws<ArgumentException>("destination", () => TensorPrimitives.Sigmoid(x, destination));
+            RunForEachSpecialValue(() =>
+            {
+                TensorPrimitives.Sigmoid(x, destination);
+                for (int i = 0; i < tensorLength; i++)
+                {
+                    Assert.Equal(1f / (1f + MathF.Exp(-x[i])), destination[i], Tolerance);
+                }
+            }, x);
         }
 
         [Theory]
@@ -2311,6 +2363,16 @@ public static void Sigmoid_DestinationLongerThanSource()
             Assert.Equal(originalLast, dest[dest.Length - 1]);
         }
 
+        [Theory]
+        [MemberData(nameof(TensorLengths))]
+        public static void Sigmoid_ThrowsForTooShortDestination(int tensorLength)
+        {
+            using BoundedMemory<float> x = CreateAndFillTensor(tensorLength);
+            using BoundedMemory<float> destination = CreateTensor(tensorLength - 1);
+
+            AssertExtensions.Throws<ArgumentException>("destination", () => TensorPrimitives.Sigmoid(x, destination));
+        }
+
         [Fact]
         public static void Sigmoid_ThrowsForEmptyInput()
         {
@@ -2409,16 +2471,6 @@ public static void SoftMax_InPlace(int tensorLength)
             }
         }
 
-        [Theory]
-        [MemberData(nameof(TensorLengths))]
-        public static void SoftMax_ThrowsForTooShortDestination(int tensorLength)
-        {
-            using BoundedMemory<float> x = CreateAndFillTensor(tensorLength);
-            using BoundedMemory<float> destination = CreateTensor(tensorLength - 1);
-
-            AssertExtensions.Throws<ArgumentException>("destination", () => TensorPrimitives.SoftMax(x, destination));
-        }
-
         [Theory]
         [InlineData(new float[] { 3, 1, .2f }, new float[] { 0.8360188f, 0.11314284f, 0.05083836f })]
         [InlineData(new float[] { 3, 4, 1 }, new float[] { 0.2594f, 0.705384f, 0.0351f })]
@@ -2449,6 +2501,16 @@ public static void SoftMax_DestinationLongerThanSource()
             }
         }
 
+        [Theory]
+        [MemberData(nameof(TensorLengths))]
+        public static void SoftMax_ThrowsForTooShortDestination(int tensorLength)
+        {
+            using BoundedMemory<float> x = CreateAndFillTensor(tensorLength);
+            using BoundedMemory<float> destination = CreateTensor(tensorLength - 1);
+
+            AssertExtensions.Throws<ArgumentException>("destination", () => TensorPrimitives.SoftMax(x, destination));
+        }
+
         [Fact]
         public static void SoftMax_ThrowsForEmptyInput()
         {

From ae8836854baf5d55e4f1e113b41a650e55cab356 Mon Sep 17 00:00:00 2001
From: Stephen Toub <stoub@microsoft.com>
Date: Thu, 5 Oct 2023 08:55:59 -0400
Subject: [PATCH 2/3] Disable tests on mono

---
 .../System.Numerics.Tensors/tests/TensorPrimitivesTests.cs      | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/libraries/System.Numerics.Tensors/tests/TensorPrimitivesTests.cs b/src/libraries/System.Numerics.Tensors/tests/TensorPrimitivesTests.cs
index 23f39f1bf6b0b7..c40715840843b2 100644
--- a/src/libraries/System.Numerics.Tensors/tests/TensorPrimitivesTests.cs
+++ b/src/libraries/System.Numerics.Tensors/tests/TensorPrimitivesTests.cs
@@ -863,6 +863,7 @@ public static void Exp_InPlace(int tensorLength)
 
         [Theory]
         [MemberData(nameof(TensorLengths))]
+        [ActiveIssue("https://github.com/dotnet/runtime/issues/92885", TestRuntimes.Mono)]
         public static void Exp_SpecialValues(int tensorLength)
         {
             using BoundedMemory<float> x = CreateAndFillTensor(tensorLength);
@@ -2316,6 +2317,7 @@ public static void Sigmoid_InPlace(int tensorLength)
 
         [Theory]
         [MemberData(nameof(TensorLengths))]
+        [ActiveIssue("https://github.com/dotnet/runtime/issues/92885", TestRuntimes.Mono)]
         public static void Sigmoid_SpecialValues(int tensorLength)
         {
             using BoundedMemory<float> x = CreateAndFillTensor(tensorLength);

From c0bf298e20681d597d762b51b2356e93c71bd23e Mon Sep 17 00:00:00 2001
From: Stephen Toub <stoub@microsoft.com>
Date: Thu, 5 Oct 2023 18:05:24 -0400
Subject: [PATCH 3/3] Address PR feedback

---
 .../src/System/Numerics/Tensors/TensorPrimitives.netcore.cs | 6 +++---
 .../System.Numerics.Tensors/tests/TensorPrimitivesTests.cs  | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/TensorPrimitives.netcore.cs b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/TensorPrimitives.netcore.cs
index c8a980d70107aa..75515ed9187c80 100644
--- a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/TensorPrimitives.netcore.cs
+++ b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/TensorPrimitives.netcore.cs
@@ -2112,7 +2112,7 @@ private static Vector512<float> IsNegative(Vector512<float> vector) =>
         private static float GetFirstNaN(Vector128<float> vector)
         {
             Debug.Assert(!Vector128.EqualsAll(vector, vector), "Expected vector to contain a NaN");
-            return vector[BitOperations.TrailingZeroCount((~Vector128.Equals(vector, vector)).ExtractMostSignificantBits())];
+            return vector.GetElement(BitOperations.TrailingZeroCount((~Vector128.Equals(vector, vector)).ExtractMostSignificantBits()));
         }
 
         /// <summary>Finds and returns the first NaN value in <paramref name="vector"/>.</summary>
@@ -2120,7 +2120,7 @@ private static float GetFirstNaN(Vector128<float> vector)
         private static float GetFirstNaN(Vector256<float> vector)
         {
             Debug.Assert(!Vector256.EqualsAll(vector, vector), "Expected vector to contain a NaN");
-            return vector[BitOperations.TrailingZeroCount((~Vector256.Equals(vector, vector)).ExtractMostSignificantBits())];
+            return vector.GetElement(BitOperations.TrailingZeroCount((~Vector256.Equals(vector, vector)).ExtractMostSignificantBits()));
         }
 
 #if NET8_0_OR_GREATER
@@ -2129,7 +2129,7 @@ private static float GetFirstNaN(Vector256<float> vector)
         private static float GetFirstNaN(Vector512<float> vector)
         {
             Debug.Assert(!Vector512.EqualsAll(vector, vector), "Expected vector to contain a NaN");
-            return vector[BitOperations.TrailingZeroCount((~Vector512.Equals(vector, vector)).ExtractMostSignificantBits())];
+            return vector.GetElement(BitOperations.TrailingZeroCount((~Vector512.Equals(vector, vector)).ExtractMostSignificantBits()));
         }
 #endif
 
diff --git a/src/libraries/System.Numerics.Tensors/tests/TensorPrimitivesTests.cs b/src/libraries/System.Numerics.Tensors/tests/TensorPrimitivesTests.cs
index c40715840843b2..1bb23713357364 100644
--- a/src/libraries/System.Numerics.Tensors/tests/TensorPrimitivesTests.cs
+++ b/src/libraries/System.Numerics.Tensors/tests/TensorPrimitivesTests.cs
@@ -87,8 +87,8 @@ private static IEnumerable<float> GetSpecialValues()
             // Normals
             yield return UInt32ToSingle(0x0080_0000);
             yield return UInt32ToSingle(0x8080_0000);
-            yield return UInt32ToSingle(0x7F7F_FFFF);
-            yield return UInt32ToSingle(0x8F7F_FFFF);
+            yield return UInt32ToSingle(0x7F7F_FFFF); // MaxValue
+            yield return UInt32ToSingle(0xFF7F_FFFF); // MinValue
         }
 
         /// <summary>