dotnet · tannergooding · Aug 13, 2024 · Aug 8, 2024 · Aug 9, 2024 · Aug 9, 2024
diff --git a/...nsors/src/System/Numerics/Tensors/netcore/Common/TensorPrimitives.IAggregationOperator.cs b/...nsors/src/System/Numerics/Tensors/netcore/Common/TensorPrimitives.IAggregationOperator.cs
@@ -1227,9 +1227,12 @@ static T Vectorized128(ref T xRef, ref T yRef, nuint remainder)
 
                         // We need to the ensure the underlying data can be aligned and only align
                         // it if it can. It is possible we have an unaligned ref, in which case we
-                        // can never achieve the required SIMD alignment.
+                        // can never achieve the required SIMD alignment. We also never align for
+                        // float or double since aligning changes how results compound together.
 
-                        bool canAlign = ((nuint)xPtr % (nuint)sizeof(T)) == 0;
+                        bool canAlign = (typeof(T) != typeof(float)) &&
+                                        (typeof(T) != typeof(double)) &&
+                                        ((nuint)xPtr % (nuint)sizeof(T)) == 0;
 
                         if (canAlign)
                         {
@@ -1418,9 +1421,12 @@ static T Vectorized256(ref T xRef, ref T yRef, nuint remainder)
 
                         // We need to the ensure the underlying data can be aligned and only align
                         // it if it can. It is possible we have an unaligned ref, in which case we
-                        // can never achieve the required SIMD alignment.
+                        // can never achieve the required SIMD alignment. We also never align for
+                        // float or double since aligning changes how results compound together.
 
-                        bool canAlign = ((nuint)xPtr % (nuint)sizeof(T)) == 0;
+                        bool canAlign = (typeof(T) != typeof(float)) &&
+                                        (typeof(T) != typeof(double)) &&
+                                        ((nuint)xPtr % (nuint)sizeof(T)) == 0;
 
                         if (canAlign)
                         {
@@ -1609,9 +1615,12 @@ static T Vectorized512(ref T xRef, ref T yRef, nuint remainder)
 
                         // We need to the ensure the underlying data can be aligned and only align
                         // it if it can. It is possible we have an unaligned ref, in which case we
-                        // can never achieve the required SIMD alignment.
+                        // can never achieve the required SIMD alignment. We also never align for
+                        // float or double since aligning changes how results compound together.
 
-                        bool canAlign = ((nuint)xPtr % (nuint)sizeof(T)) == 0;
+                        bool canAlign = (typeof(T) != typeof(float)) &&
+                                        (typeof(T) != typeof(double)) &&
+                                        ((nuint)xPtr % (nuint)sizeof(T)) == 0;
 
                         if (canAlign)
                         {

diff --git a/...cs.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs b/...cs.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs
@@ -467,8 +467,6 @@ static float Vectorized(ref float xRef, ref float yRef, nuint remainder, TBinary
                 Vector<float> end = binaryOp.Invoke(AsVector(ref xRef, remainder - (uint)(Vector<float>.Count)),
                                                     AsVector(ref yRef, remainder - (uint)(Vector<float>.Count)));
 
-                nuint misalignment = 0;
-
                 if (remainder > (uint)(Vector<float>.Count * 8))
                 {
                     // Pinning is cheap and will be short lived for small inputs and unlikely to be impactful
@@ -480,29 +478,9 @@ static float Vectorized(ref float xRef, ref float yRef, nuint remainder, TBinary
                         float* xPtr = px;
                         float* yPtr = py;
 
-                        // We need to the ensure the underlying data can be aligned and only align
-                        // it if it can. It is possible we have an unaligned ref, in which case we
-                        // can never achieve the required SIMD alignment.
-
-                        bool canAlign = ((nuint)(xPtr) % sizeof(float)) == 0;
-
-                        if (canAlign)
-                        {
-                            // Compute by how many elements we're misaligned and adjust the pointers accordingly
-                            //
-                            // Noting that we are only actually aligning dPtr. This is because unaligned stores
-                            // are more expensive than unaligned loads and aligning both is significantly more
-                            // complex.
-
-                            misalignment = ((uint)(sizeof(Vector<float>)) - ((nuint)(xPtr) % (uint)(sizeof(Vector<float>)))) / sizeof(float);
-
-                            xPtr += misalignment;
-                            yPtr += misalignment;
-
-                            Debug.Assert(((nuint)(xPtr) % (uint)(sizeof(Vector<float>))) == 0);
-
-                            remainder -= misalignment;
-                        }
+                        // Unlike many other vectorization algorithms, we cannot align for aggregation
+                        // because that changes how results compound together and can cause a significant
+                        // difference in the output.
 
                         Vector<float> vector1;
                         Vector<float> vector2;
@@ -564,7 +542,6 @@ static float Vectorized(ref float xRef, ref float yRef, nuint remainder, TBinary
                 // Store the first block. Handling this separately simplifies the latter code as we know
                 // they come after and so we can relegate it to full blocks or the trailing elements
 
-                beg = Vector.ConditionalSelect(CreateAlignmentMaskSingleVector((int)(misalignment)), beg, new Vector<float>(aggregationOp.IdentityValue));
                 vresult = aggregationOp.Invoke(vresult, beg);
 
                 // Process the remaining [0, Count * 7] elements via a jump table
@@ -575,7 +552,7 @@ static float Vectorized(ref float xRef, ref float yRef, nuint remainder, TBinary
 
                 nuint blocks = remainder / (nuint)(Vector<float>.Count);
                 nuint trailing = remainder - (blocks * (nuint)(Vector<float>.Count));
-                blocks -= (misalignment == 0) ? 1u : 0u;
+                blocks -= 1u;
                 remainder -= trailing;
 
                 switch (blocks)