Change SIMD Loop from Fast to only reassoc/contract (#49405)

vchuravy · giordano · web-flow · commit 663c58d00dc2 · 2023-06-28T18:12:17.000-04:00
Addresses #49387

Co-authored-by: Mosè Giordano <mose@gnu.org>
diff --git a/NEWS.md b/NEWS.md
@@ -21,6 +21,10 @@ Language changes
   that significantly improves load and inference times for heavily overloaded methods that
   dispatch on Types (such as traits and constructors).
 * The "h bar" `ℏ` (`\hslash` U+210F) character is now treated as equivalent to `ħ` (`\hbar` U+0127).
+* The `@simd` macro now has more limited and clearer semantics: it only enables reordering and contraction
+  of floating-point operations, instead of turning on all "fastmath" optimizations.
+  If you observe performance regressions due to this change, you can recover the previous behavior with `@fastmath @simd`,
+  provided you are OK with all the optimizations enabled by the `@fastmath` macro. ([#49405])
 * When a method with keyword arguments is displayed in the stack trace view, the textual
   representation of the keyword arguments' types is simplified using the new
   `@Kwargs{key1::Type1, ...}` macro syntax ([#49959]).
diff --git a/base/simdloop.jl b/base/simdloop.jl
@@ -100,7 +100,7 @@ The object iterated over in a `@simd for` loop should be a one-dimensional range
 By using `@simd`, you are asserting several properties of the loop:
 
 * It is safe to execute iterations in arbitrary or overlapping order, with special consideration for reduction variables.
-* Floating-point operations on reduction variables can be reordered, possibly causing different results than without `@simd`.
+* Floating-point operations on reduction variables can be reordered or contracted, possibly causing different results than without `@simd`.
 
 In many cases, Julia is able to automatically vectorize inner for loops without the use of `@simd`.
 Using `@simd` gives the compiler a little extra leeway to make it possible in more situations. In
diff --git a/src/llvm-muladd.cpp b/src/llvm-muladd.cpp
@@ -40,10 +40,10 @@ STATISTIC(TotalContracted, "Total number of multiplies marked for FMA");
  * Combine
  * ```
  * %v0 = fmul ... %a, %b
- * %v = fadd fast ... %v0, %c
+ * %v = fadd contract ... %v0, %c
  * ```
  * to
- * `%v = call fast @llvm.fmuladd.<...>(... %a, ... %b, ... %c)`
+ * `%v = call contract @llvm.fmuladd.<...>(... %a, ... %b, ... %c)`
  * when `%v0` has no other use
  */
 
@@ -87,13 +87,13 @@ static bool combineMulAdd(Function &F) JL_NOTSAFEPOINT
             it++;
             switch (I.getOpcode()) {
             case Instruction::FAdd: {
-                if (!I.isFast())
+                if (!I.hasAllowContract())
                     continue;
                 modified |= checkCombine(I.getOperand(0), ORE) || checkCombine(I.getOperand(1), ORE);
                 break;
             }
             case Instruction::FSub: {
-                if (!I.isFast())
+                if (!I.hasAllowContract())
                     continue;
                 modified |= checkCombine(I.getOperand(0), ORE) || checkCombine(I.getOperand(1), ORE);
                 break;
diff --git a/src/llvm-simdloop.cpp b/src/llvm-simdloop.cpp
@@ -149,7 +149,8 @@ static void enableUnsafeAlgebraIfReduction(PHINode *Phi, Loop *L, OptimizationRe
             return OptimizationRemark(DEBUG_TYPE, "MarkedUnsafeAlgebra", *K)
                    << "marked unsafe algebra on " << ore::NV("Instruction", *K);
         });
-        (*K)->setFast(true);
+        (*K)->setHasAllowReassoc(true);
+        (*K)->setHasAllowContract(true);
         ++length;
     }
     ReductionChainLength += length;
diff --git a/test/llvmpasses/loopinfo.jl b/test/llvmpasses/loopinfo.jl
@@ -29,10 +29,10 @@ function simdf(X)
         acc += x
 # CHECK: call void @julia.loopinfo_marker(), {{.*}}, !julia.loopinfo [[LOOPINFO:![0-9]+]]
 # LOWER-NOT: llvm.mem.parallel_loop_access
-# LOWER: fadd fast double
+# LOWER: fadd reassoc contract double
 # LOWER-NOT: call void @julia.loopinfo_marker()
 # LOWER: br {{.*}}, !llvm.loop [[LOOPID:![0-9]+]]
-# FINAL: fadd fast <{{(vscale x )?}}{{[0-9]+}} x double>
+# FINAL: fadd reassoc contract <{{(vscale x )?}}{{[0-9]+}} x double>
     end
     acc
 end
@@ -46,7 +46,7 @@ function simdf2(X)
 # CHECK: call void @julia.loopinfo_marker(), {{.*}}, !julia.loopinfo [[LOOPINFO2:![0-9]+]]
 # LOWER: llvm.mem.parallel_loop_access
 # LOWER-NOT: call void @julia.loopinfo_marker()
-# LOWER: fadd fast double
+# LOWER: fadd reassoc contract double
 # LOWER: br {{.*}}, !llvm.loop [[LOOPID2:![0-9]+]]
     end
     acc
diff --git a/test/llvmpasses/simdloop.ll b/test/llvmpasses/simdloop.ll
@@ -40,7 +40,7 @@ loop:
 ; CHECK: llvm.mem.parallel_loop_access
   %aval = load double, double *%aptr
   %nextv = fsub double %v, %aval
-; CHECK: fsub fast double %v, %aval
+; CHECK: fsub reassoc contract double %v, %aval
   %nexti = add i64 %i, 1
   call void @julia.loopinfo_marker(), !julia.loopinfo !3
   %done = icmp sgt i64 %nexti, 500
@@ -59,7 +59,7 @@ loop:
   %aptr = getelementptr double, double *%a, i64 %i
   %aval = load double, double *%aptr
   %nextv = fsub double %v, %aval
-; CHECK: fsub fast double %v, %aval
+; CHECK: fsub reassoc contract double %v, %aval
   %nexti = add i64 %i, 1
   call void @julia.loopinfo_marker(), !julia.loopinfo !2
   %done = icmp sgt i64 %nexti, 500