Use FMA in TensorPrimitives (dotnet#92205)

stephentoub · michaelgsharp · commit bd57689c30bb · 2023-10-20T01:07:02.000-06:00
diff --git a/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/TensorPrimitives.netcore.cs b/src/libraries/System.Numerics.Tensors/src/System/Numerics/Tensors/TensorPrimitives.netcore.cs
@@ -1,9 +1,12 @@
 ﻿// Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 
+using System.Diagnostics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
+using System.Runtime.Intrinsics.X86;
 
 namespace System.Numerics.Tensors
 {
@@ -86,9 +89,9 @@ private static float CosineSimilarityCore(ReadOnlySpan<float> x, ReadOnlySpan<fl
                     Vector512<float> xVec = Vector512.LoadUnsafe(ref xRef, (uint)i);
                     Vector512<float> yVec = Vector512.LoadUnsafe(ref yRef, (uint)i);
 
-                    dotProductVector += xVec * yVec;
-                    xSumOfSquaresVector += xVec * xVec;
-                    ySumOfSquaresVector += yVec * yVec;
+                    dotProductVector = FusedMultiplyAdd(xVec, yVec, dotProductVector);
+                    xSumOfSquaresVector = FusedMultiplyAdd(xVec, xVec, xSumOfSquaresVector);
+                    ySumOfSquaresVector = FusedMultiplyAdd(yVec, yVec, ySumOfSquaresVector);
 
                     i += Vector512<float>.Count;
                 }
@@ -117,9 +120,9 @@ private static float CosineSimilarityCore(ReadOnlySpan<float> x, ReadOnlySpan<fl
                     Vector256<float> xVec = Vector256.LoadUnsafe(ref xRef, (uint)i);
                     Vector256<float> yVec = Vector256.LoadUnsafe(ref yRef, (uint)i);
 
-                    dotProductVector += xVec * yVec;
-                    xSumOfSquaresVector += xVec * xVec;
-                    ySumOfSquaresVector += yVec * yVec;
+                    dotProductVector = FusedMultiplyAdd(xVec, yVec, dotProductVector);
+                    xSumOfSquaresVector = FusedMultiplyAdd(xVec, xVec, xSumOfSquaresVector);
+                    ySumOfSquaresVector = FusedMultiplyAdd(yVec, yVec, ySumOfSquaresVector);
 
                     i += Vector256<float>.Count;
                 }
@@ -146,9 +149,9 @@ private static float CosineSimilarityCore(ReadOnlySpan<float> x, ReadOnlySpan<fl
                     Vector128<float> xVec = Vector128.LoadUnsafe(ref xRef, (uint)i);
                     Vector128<float> yVec = Vector128.LoadUnsafe(ref yRef, (uint)i);
 
-                    dotProductVector += xVec * yVec;
-                    xSumOfSquaresVector += xVec * xVec;
-                    ySumOfSquaresVector += yVec * yVec;
+                    dotProductVector = FusedMultiplyAdd(xVec, yVec, dotProductVector);
+                    xSumOfSquaresVector = FusedMultiplyAdd(xVec, xVec, xSumOfSquaresVector);
+                    ySumOfSquaresVector = FusedMultiplyAdd(yVec, yVec, ySumOfSquaresVector);
 
                     i += Vector128<float>.Count;
                 }
@@ -163,9 +166,9 @@ private static float CosineSimilarityCore(ReadOnlySpan<float> x, ReadOnlySpan<fl
             // Process any remaining elements past the last vector.
             for (; (uint)i < (uint)x.Length; i++)
             {
-                dotProduct += x[i] * y[i];
-                xSumOfSquares += x[i] * x[i];
-                ySumOfSquares += y[i] * y[i];
+                dotProduct = MathF.FusedMultiplyAdd(x[i], y[i], dotProduct);
+                xSumOfSquares = MathF.FusedMultiplyAdd(x[i], x[i], xSumOfSquares);
+                ySumOfSquares = MathF.FusedMultiplyAdd(y[i], y[i], ySumOfSquares);
             }
 
             // Sum(X * Y) / (|X| * |Y|)
@@ -1032,6 +1035,46 @@ private static unsafe void InvokeSpanScalarSpanIntoSpan<TTernaryOperator>(
             }
         }
 
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static Vector128<float> FusedMultiplyAdd(Vector128<float> x, Vector128<float> y, Vector128<float> addend) // returns (x * y) + addend, element-wise
+        {
+            if (Fma.IsSupported) // System.Runtime.Intrinsics.X86 path
+            {
+                return Fma.MultiplyAdd(x, y, addend);
+            }
+
+            if (AdvSimd.IsSupported) // System.Runtime.Intrinsics.Arm path
+            {
+                return AdvSimd.FusedMultiplyAdd(addend, x, y); // argument order differs from Fma: the addend is passed first here
+            }
+
+            return (x * y) + addend; // neither Fma nor AdvSimd is supported: separate multiply, then add
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static Vector256<float> FusedMultiplyAdd(Vector256<float> x, Vector256<float> y, Vector256<float> addend) // returns (x * y) + addend, element-wise
+        {
+            if (Fma.IsSupported) // System.Runtime.Intrinsics.X86 path; unlike the Vector128 overload there is no AdvSimd branch
+            {
+                return Fma.MultiplyAdd(x, y, addend);
+            }
+
+            return (x * y) + addend; // Fma is not supported: separate multiply, then add
+        }
+
+#if NET8_0_OR_GREATER
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static Vector512<float> FusedMultiplyAdd(Vector512<float> x, Vector512<float> y, Vector512<float> addend) // returns (x * y) + addend, element-wise
+        {
+            if (Avx512F.IsSupported) // the 512-bit overload is gated on Avx512F rather than Fma
+            {
+                return Avx512F.FusedMultiplyAdd(x, y, addend);
+            }
+
+            return (x * y) + addend; // Avx512F is not supported: separate multiply, then add
+        }
+#endif
+
         private readonly struct AddOperator : IBinaryOperator
         {
             public static float Invoke(float x, float y) => x + y;
@@ -1182,11 +1225,11 @@ public static float Invoke(Vector512<float> x)
 
         private readonly struct MultiplyAddOperator : ITernaryOperator
         {
-            public static float Invoke(float x, float y, float z) => (x * y) + z;
-            public static Vector128<float> Invoke(Vector128<float> x, Vector128<float> y, Vector128<float> z) => (x * y) + z;
-            public static Vector256<float> Invoke(Vector256<float> x, Vector256<float> y, Vector256<float> z) => (x * y) + z;
+            public static float Invoke(float x, float y, float z) => MathF.FusedMultiplyAdd(x, y, z); // scalar path: (x * y) + z via MathF
+            public static Vector128<float> Invoke(Vector128<float> x, Vector128<float> y, Vector128<float> z) => FusedMultiplyAdd(x, y, z); // vector paths call the private FusedMultiplyAdd helper overload for their width
+            public static Vector256<float> Invoke(Vector256<float> x, Vector256<float> y, Vector256<float> z) => FusedMultiplyAdd(x, y, z);
 #if NET8_0_OR_GREATER
-            public static Vector512<float> Invoke(Vector512<float> x, Vector512<float> y, Vector512<float> z) => (x * y) + z;
+            public static Vector512<float> Invoke(Vector512<float> x, Vector512<float> y, Vector512<float> z) => FusedMultiplyAdd(x, y, z);
 #endif
         }
 
diff --git a/src/libraries/System.Numerics.Tensors/tests/TensorPrimitivesTests.cs b/src/libraries/System.Numerics.Tensors/tests/TensorPrimitivesTests.cs
@@ -59,7 +59,7 @@ public static void AddTwoTensors(int tensorLength)
 
             for (int i = 0; i < tensorLength; i++)
             {
-                Assert.Equal(x[i] + y[i], destination[i]);
+                Assert.Equal(x[i] + y[i], destination[i], Tolerance);
             }
         }
 
@@ -97,7 +97,7 @@ public static void AddTensorAndScalar(int tensorLength)
 
             for (int i = 0; i < tensorLength; i++)
             {
-                Assert.Equal(x[i] + y, destination[i]);
+                Assert.Equal(x[i] + y, destination[i], Tolerance);
             }
         }
 
@@ -124,7 +124,7 @@ public static void SubtractTwoTensors(int tensorLength)
 
             for (int i = 0; i < tensorLength; i++)
             {
-                Assert.Equal(x[i] - y[i], destination[i]);
+                Assert.Equal(x[i] - y[i], destination[i], Tolerance);
             }
         }
 
@@ -162,7 +162,7 @@ public static void SubtractTensorAndScalar(int tensorLength)
 
             for (int i = 0; i < tensorLength; i++)
             {
-                Assert.Equal(x[i] - y, destination[i]);
+                Assert.Equal(x[i] - y, destination[i], Tolerance);
             }
         }
 
@@ -189,7 +189,7 @@ public static void MultiplyTwoTensors(int tensorLength)
 
             for (int i = 0; i < tensorLength; i++)
             {
-                Assert.Equal(x[i] * y[i], destination[i]);
+                Assert.Equal(x[i] * y[i], destination[i], Tolerance);
             }
         }
 
@@ -227,7 +227,7 @@ public static void MultiplyTensorAndScalar(int tensorLength)
 
             for (int i = 0; i < tensorLength; i++)
             {
-                Assert.Equal(x[i] * y, destination[i]);
+                Assert.Equal(x[i] * y, destination[i], Tolerance);
             }
         }
 
@@ -254,7 +254,7 @@ public static void DivideTwoTensors(int tensorLength)
 
             for (int i = 0; i < tensorLength; i++)
             {
-                Assert.Equal(x[i] / y[i], destination[i]);
+                Assert.Equal(x[i] / y[i], destination[i], Tolerance);
             }
         }
 
@@ -292,7 +292,7 @@ public static void DivideTensorAndScalar(int tensorLength)
 
             for (int i = 0; i < tensorLength; i++)
             {
-                Assert.Equal(x[i] / y, destination[i]);
+                Assert.Equal(x[i] / y, destination[i], Tolerance);
             }
         }
 
@@ -318,7 +318,7 @@ public static void NegateTensor(int tensorLength)
 
             for (int i = 0; i < tensorLength; i++)
             {
-                Assert.Equal(-x[i], destination[i]);
+                Assert.Equal(-x[i], destination[i], Tolerance);
             }
         }
 
@@ -345,7 +345,7 @@ public static void AddTwoTensorsAndMultiplyWithThirdTensor(int tensorLength)
 
             for (int i = 0; i < tensorLength; i++)
             {
-                Assert.Equal((x[i] + y[i]) * multiplier[i], destination[i]);
+                Assert.Equal((x[i] + y[i]) * multiplier[i], destination[i], Tolerance);
             }
         }
 
@@ -398,7 +398,7 @@ public static void AddTwoTensorsAndMultiplyWithScalar(int tensorLength)
 
             for (int i = 0; i < tensorLength; i++)
             {
-                Assert.Equal((x[i] + y[i]) * multiplier, destination[i]);
+                Assert.Equal((x[i] + y[i]) * multiplier, destination[i], Tolerance);
             }
         }
 
@@ -439,7 +439,7 @@ public static void AddTensorAndScalarAndMultiplyWithTensor(int tensorLength)
 
             for (int i = 0; i < tensorLength; i++)
             {
-                Assert.Equal((x[i] + y) * multiplier[i], destination[i]);
+                Assert.Equal((x[i] + y) * multiplier[i], destination[i], Tolerance);
             }
         }
 
@@ -480,7 +480,7 @@ public static void MultiplyTwoTensorsAndAddWithThirdTensor(int tensorLength)
 
             for (int i = 0; i < tensorLength; i++)
             {
-                Assert.Equal((x[i] * y[i]) + addend[i], destination[i]);
+                Assert.Equal((x[i] * y[i]) + addend[i], destination[i], Tolerance);
             }
         }
 
@@ -533,7 +533,7 @@ public static void MultiplyTwoTensorsAndAddWithScalar(int tensorLength)
 
             for (int i = 0; i < tensorLength; i++)
             {
-                Assert.Equal((x[i] * y[i]) + addend, destination[i]);
+                Assert.Equal((x[i] * y[i]) + addend, destination[i], Tolerance);
             }
         }
 
@@ -562,7 +562,7 @@ public static void MultiplyTensorAndScalarAndAddWithTensor(int tensorLength)
 
             for (int i = 0; i < tensorLength; i++)
             {
-                Assert.Equal((x[i] * y) + addend[i], destination[i]);
+                Assert.Equal((x[i] * y) + addend[i], destination[i], Tolerance);
             }
         }
 
@@ -589,7 +589,7 @@ public static void ExpTensor(int tensorLength)
 
             for (int i = 0; i < tensorLength; i++)
             {
-                Assert.Equal(MathF.Exp(x[i]), destination[i]);
+                Assert.Equal(MathF.Exp(x[i]), destination[i], Tolerance);
             }
         }
 
@@ -614,7 +614,7 @@ public static void LogTensor(int tensorLength)
 
             for (int i = 0; i < tensorLength; i++)
             {
-                Assert.Equal(MathF.Log(x[i]), destination[i]);
+                Assert.Equal(MathF.Log(x[i]), destination[i], Tolerance);
             }
         }
 
@@ -664,7 +664,7 @@ public static void CoshTensor(int tensorLength)
 
             for (int i = 0; i < tensorLength; i++)
             {
-                Assert.Equal(MathF.Cosh(x[i]), destination[i]);
+                Assert.Equal(MathF.Cosh(x[i]), destination[i], Tolerance);
             }
         }
 
@@ -689,7 +689,7 @@ public static void SinhTensor(int tensorLength)
 
             for (int i = 0; i < tensorLength; i++)
             {
-                Assert.Equal(MathF.Sinh(x[i]), destination[i]);
+                Assert.Equal(MathF.Sinh(x[i]), destination[i], Tolerance);
             }
         }
 
@@ -714,7 +714,7 @@ public static void TanhTensor(int tensorLength)
 
             for (int i = 0; i < tensorLength; i++)
             {
-                Assert.Equal(MathF.Tanh(x[i]), destination[i]);
+                Assert.Equal(MathF.Tanh(x[i]), destination[i], Tolerance);
             }
         }
 

Original file line number	Diff line number	Diff line change
`@@ -59,7 +59,7 @@ public static void AddTwoTensors(int tensorLength)`
`59`	`59`
`60`	`60`	`for (int i = 0; i < tensorLength; i++)`
`61`	`61`	`{`
`62`		`- Assert.Equal(x[i] + y[i], destination[i]);`
	`62`	`+ Assert.Equal(x[i] + y[i], destination[i], Tolerance);`
`63`	`63`	`}`
`64`	`64`	`}`
`65`	`65`
`@@ -97,7 +97,7 @@ public static void AddTensorAndScalar(int tensorLength)`
`97`	`97`
`98`	`98`	`for (int i = 0; i < tensorLength; i++)`
`99`	`99`	`{`
`100`		`- Assert.Equal(x[i] + y, destination[i]);`
	`100`	`+ Assert.Equal(x[i] + y, destination[i], Tolerance);`
`101`	`101`	`}`
`102`	`102`	`}`
`103`	`103`
`@@ -124,7 +124,7 @@ public static void SubtractTwoTensors(int tensorLength)`
`124`	`124`
`125`	`125`	`for (int i = 0; i < tensorLength; i++)`
`126`	`126`	`{`
`127`		`- Assert.Equal(x[i] - y[i], destination[i]);`
	`127`	`+ Assert.Equal(x[i] - y[i], destination[i], Tolerance);`
`128`	`128`	`}`
`129`	`129`	`}`
`130`	`130`
`@@ -162,7 +162,7 @@ public static void SubtractTensorAndScalar(int tensorLength)`
`162`	`162`
`163`	`163`	`for (int i = 0; i < tensorLength; i++)`
`164`	`164`	`{`
`165`		`- Assert.Equal(x[i] - y, destination[i]);`
	`165`	`+ Assert.Equal(x[i] - y, destination[i], Tolerance);`
`166`	`166`	`}`
`167`	`167`	`}`
`168`	`168`
`@@ -189,7 +189,7 @@ public static void MultiplyTwoTensors(int tensorLength)`
`189`	`189`
`190`	`190`	`for (int i = 0; i < tensorLength; i++)`
`191`	`191`	`{`
`192`		`- Assert.Equal(x[i] * y[i], destination[i]);`
	`192`	`+ Assert.Equal(x[i] * y[i], destination[i], Tolerance);`
`193`	`193`	`}`
`194`	`194`	`}`
`195`	`195`
`@@ -227,7 +227,7 @@ public static void MultiplyTensorAndScalar(int tensorLength)`
`227`	`227`
`228`	`228`	`for (int i = 0; i < tensorLength; i++)`
`229`	`229`	`{`
`230`		`- Assert.Equal(x[i] * y, destination[i]);`
	`230`	`+ Assert.Equal(x[i] * y, destination[i], Tolerance);`
`231`	`231`	`}`
`232`	`232`	`}`
`233`	`233`
`@@ -254,7 +254,7 @@ public static void DivideTwoTensors(int tensorLength)`
`254`	`254`
`255`	`255`	`for (int i = 0; i < tensorLength; i++)`
`256`	`256`	`{`
`257`		`- Assert.Equal(x[i] / y[i], destination[i]);`
	`257`	`+ Assert.Equal(x[i] / y[i], destination[i], Tolerance);`
`258`	`258`	`}`
`259`	`259`	`}`
`260`	`260`
`@@ -292,7 +292,7 @@ public static void DivideTensorAndScalar(int tensorLength)`
`292`	`292`
`293`	`293`	`for (int i = 0; i < tensorLength; i++)`
`294`	`294`	`{`
`295`		`- Assert.Equal(x[i] / y, destination[i]);`
	`295`	`+ Assert.Equal(x[i] / y, destination[i], Tolerance);`
`296`	`296`	`}`
`297`	`297`	`}`
`298`	`298`
`@@ -318,7 +318,7 @@ public static void NegateTensor(int tensorLength)`
`318`	`318`
`319`	`319`	`for (int i = 0; i < tensorLength; i++)`
`320`	`320`	`{`
`321`		`- Assert.Equal(-x[i], destination[i]);`
	`321`	`+ Assert.Equal(-x[i], destination[i], Tolerance);`
`322`	`322`	`}`
`323`	`323`	`}`
`324`	`324`
`@@ -345,7 +345,7 @@ public static void AddTwoTensorsAndMultiplyWithThirdTensor(int tensorLength)`
`345`	`345`
`346`	`346`	`for (int i = 0; i < tensorLength; i++)`
`347`	`347`	`{`
`348`		`- Assert.Equal((x[i] + y[i]) * multiplier[i], destination[i]);`
	`348`	`+ Assert.Equal((x[i] + y[i]) * multiplier[i], destination[i], Tolerance);`
`349`	`349`	`}`
`350`	`350`	`}`
`351`	`351`
`@@ -398,7 +398,7 @@ public static void AddTwoTensorsAndMultiplyWithScalar(int tensorLength)`
`398`	`398`
`399`	`399`	`for (int i = 0; i < tensorLength; i++)`
`400`	`400`	`{`
`401`		`- Assert.Equal((x[i] + y[i]) * multiplier, destination[i]);`
	`401`	`+ Assert.Equal((x[i] + y[i]) * multiplier, destination[i], Tolerance);`
`402`	`402`	`}`
`403`	`403`	`}`
`404`	`404`
`@@ -439,7 +439,7 @@ public static void AddTensorAndScalarAndMultiplyWithTensor(int tensorLength)`
`439`	`439`
`440`	`440`	`for (int i = 0; i < tensorLength; i++)`
`441`	`441`	`{`
`442`		`- Assert.Equal((x[i] + y) * multiplier[i], destination[i]);`
	`442`	`+ Assert.Equal((x[i] + y) * multiplier[i], destination[i], Tolerance);`
`443`	`443`	`}`
`444`	`444`	`}`
`445`	`445`
`@@ -480,7 +480,7 @@ public static void MultiplyTwoTensorsAndAddWithThirdTensor(int tensorLength)`
`480`	`480`
`481`	`481`	`for (int i = 0; i < tensorLength; i++)`
`482`	`482`	`{`
`483`		`- Assert.Equal((x[i] * y[i]) + addend[i], destination[i]);`
	`483`	`+ Assert.Equal((x[i] * y[i]) + addend[i], destination[i], Tolerance);`
`484`	`484`	`}`
`485`	`485`	`}`
`486`	`486`
`@@ -533,7 +533,7 @@ public static void MultiplyTwoTensorsAndAddWithScalar(int tensorLength)`
`533`	`533`
`534`	`534`	`for (int i = 0; i < tensorLength; i++)`
`535`	`535`	`{`
`536`		`- Assert.Equal((x[i] * y[i]) + addend, destination[i]);`
	`536`	`+ Assert.Equal((x[i] * y[i]) + addend, destination[i], Tolerance);`
`537`	`537`	`}`
`538`	`538`	`}`
`539`	`539`
`@@ -562,7 +562,7 @@ public static void MultiplyTensorAndScalarAndAddWithTensor(int tensorLength)`
`562`	`562`
`563`	`563`	`for (int i = 0; i < tensorLength; i++)`
`564`	`564`	`{`
`565`		`- Assert.Equal((x[i] * y) + addend[i], destination[i]);`
	`565`	`+ Assert.Equal((x[i] * y) + addend[i], destination[i], Tolerance);`
`566`	`566`	`}`
`567`	`567`	`}`
`568`	`568`
`@@ -589,7 +589,7 @@ public static void ExpTensor(int tensorLength)`
`589`	`589`
`590`	`590`	`for (int i = 0; i < tensorLength; i++)`
`591`	`591`	`{`
`592`		`- Assert.Equal(MathF.Exp(x[i]), destination[i]);`
	`592`	`+ Assert.Equal(MathF.Exp(x[i]), destination[i], Tolerance);`
`593`	`593`	`}`
`594`	`594`	`}`
`595`	`595`
`@@ -614,7 +614,7 @@ public static void LogTensor(int tensorLength)`
`614`	`614`
`615`	`615`	`for (int i = 0; i < tensorLength; i++)`
`616`	`616`	`{`
`617`		`- Assert.Equal(MathF.Log(x[i]), destination[i]);`
	`617`	`+ Assert.Equal(MathF.Log(x[i]), destination[i], Tolerance);`
`618`	`618`	`}`
`619`	`619`	`}`
`620`	`620`
`@@ -664,7 +664,7 @@ public static void CoshTensor(int tensorLength)`
`664`	`664`
`665`	`665`	`for (int i = 0; i < tensorLength; i++)`
`666`	`666`	`{`
`667`		`- Assert.Equal(MathF.Cosh(x[i]), destination[i]);`
	`667`	`+ Assert.Equal(MathF.Cosh(x[i]), destination[i], Tolerance);`
`668`	`668`	`}`
`669`	`669`	`}`
`670`	`670`
`@@ -689,7 +689,7 @@ public static void SinhTensor(int tensorLength)`
`689`	`689`
`690`	`690`	`for (int i = 0; i < tensorLength; i++)`
`691`	`691`	`{`
`692`		`- Assert.Equal(MathF.Sinh(x[i]), destination[i]);`
	`692`	`+ Assert.Equal(MathF.Sinh(x[i]), destination[i], Tolerance);`
`693`	`693`	`}`
`694`	`694`	`}`
`695`	`695`
`@@ -714,7 +714,7 @@ public static void TanhTensor(int tensorLength)`
`714`	`714`
`715`	`715`	`for (int i = 0; i < tensorLength; i++)`
`716`	`716`	`{`
`717`		`- Assert.Equal(MathF.Tanh(x[i]), destination[i]);`
	`717`	`+ Assert.Equal(MathF.Tanh(x[i]), destination[i], Tolerance);`
`718`	`718`	`}`
`719`	`719`	`}`
`720`	`720`