From 4c0093fcd3ea3e2ea6a760c0bc97976f57505e49 Mon Sep 17 00:00:00 2001
From: Radek Doulik <radek.doulik@gmail.com>
Date: Wed, 3 May 2023 14:21:56 +0200
Subject: [PATCH 1/2] [wasm] PackedSimd, add floating point methods

---
 .../Wasm/PackedSimd.PlatformNotSupported.cs   |  51 ++++++
 .../Runtime/Intrinsics/Wasm/PackedSimd.cs     | 171 ++++++++++++++++++
 .../ref/System.Runtime.Intrinsics.cs          |  30 +++
 src/mono/mono/mini/llvm-intrinsics.h          |  18 +-
 src/mono/mono/mini/mini-llvm.c                |   4 +-
 src/mono/mono/mini/mini-ops.h                 |   4 +-
 src/mono/mono/mini/simd-arm64.h               |   4 +-
 src/mono/mono/mini/simd-intrinsics.c          |  85 ++++++---
 src/mono/mono/mini/simd-methods.h             |   3 +
 9 files changed, 329 insertions(+), 41 deletions(-)
diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Wasm/PackedSimd.PlatformNotSupported.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Wasm/PackedSimd.PlatformNotSupported.cs
index bd1192b7f1af50..d2103ae038c811 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Wasm/PackedSimd.PlatformNotSupported.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Wasm/PackedSimd.PlatformNotSupported.cs
@@ -392,6 +392,57 @@ public abstract class PackedSimd
         public static Vector128<nint>   CompareGreaterThanOrEqual(Vector128<nint>   left, Vector128<nint>   right) { throw new PlatformNotSupportedException(); }
         public static Vector128<nuint>  CompareGreaterThanOrEqual(Vector128<nuint>  left, Vector128<nuint>  right) { throw new PlatformNotSupportedException(); }
 
+        // Floating-point sign bit operations
+
+        public static Vector128<float>  Negate(Vector128<float>  value) { throw new PlatformNotSupportedException(); }
+        public static Vector128<double> Negate(Vector128<double> value) { throw new PlatformNotSupportedException(); }
+
+        public static Vector128<float>  Abs(Vector128<float>  value) { throw new PlatformNotSupportedException(); }
+        public static Vector128<double> Abs(Vector128<double> value) { throw new PlatformNotSupportedException(); }
+
+        // Floating-point min and max
+
+        public static Vector128<float>  Min(Vector128<float>  left, Vector128<float>  right) { throw new PlatformNotSupportedException(); }
+        public static Vector128<double> Min(Vector128<double> left, Vector128<double> right) { throw new PlatformNotSupportedException(); }
+
+        public static Vector128<float>  Max(Vector128<float>  left, Vector128<float>  right) { throw new PlatformNotSupportedException(); }
+        public static Vector128<double> Max(Vector128<double> left, Vector128<double> right) { throw new PlatformNotSupportedException(); }
+
+        public static Vector128<float>  PseudoMin(Vector128<float>  left, Vector128<float>  right) { throw new PlatformNotSupportedException(); }
+        public static Vector128<double> PseudoMin(Vector128<double> left, Vector128<double> right) { throw new PlatformNotSupportedException(); }
+
+        public static Vector128<float>  PseudoMax(Vector128<float>  left, Vector128<float>  right) { throw new PlatformNotSupportedException(); }
+        public static Vector128<double> PseudoMax(Vector128<double> left, Vector128<double> right) { throw new PlatformNotSupportedException(); }
+
+        // Floating-point arithmetic
+
+        public static Vector128<float>  Add(Vector128<float>  left, Vector128<float>  right) { throw new PlatformNotSupportedException(); }
+        public static Vector128<double> Add(Vector128<double> left, Vector128<double> right) { throw new PlatformNotSupportedException(); }
+
+        public static Vector128<float>  Subtract(Vector128<float>  left, Vector128<float>  right) { throw new PlatformNotSupportedException(); }
+        public static Vector128<double> Subtract(Vector128<double> left, Vector128<double> right) { throw new PlatformNotSupportedException(); }
+
+        public static Vector128<float>  Divide(Vector128<float>  left, Vector128<float>  right) { throw new PlatformNotSupportedException(); }
+        public static Vector128<double> Divide(Vector128<double> left, Vector128<double> right) { throw new PlatformNotSupportedException(); }
+
+        public static Vector128<float>  Multiply(Vector128<float>  left, Vector128<float>  right) { throw new PlatformNotSupportedException(); }
+        public static Vector128<double> Multiply(Vector128<double> left, Vector128<double> right) { throw new PlatformNotSupportedException(); }
+
+        public static Vector128<float>  Sqrt(Vector128<float>  value) { throw new PlatformNotSupportedException(); }
+        public static Vector128<double> Sqrt(Vector128<double> value) { throw new PlatformNotSupportedException(); }
+
+        public static Vector128<float>  Ceiling(Vector128<float>  value) { throw new PlatformNotSupportedException(); }
+        public static Vector128<double> Ceiling(Vector128<double> value) { throw new PlatformNotSupportedException(); }
+
+        public static Vector128<float>  Floor(Vector128<float>  value) { throw new PlatformNotSupportedException(); }
+        public static Vector128<double> Floor(Vector128<double> value) { throw new PlatformNotSupportedException(); }
+
+        public static Vector128<float>  Truncate(Vector128<float>  value) { throw new PlatformNotSupportedException(); }
+        public static Vector128<double> Truncate(Vector128<double> value) { throw new PlatformNotSupportedException(); }
+
+        public static Vector128<float>  RoundToNearest(Vector128<float>  value) { throw new PlatformNotSupportedException(); }
+        public static Vector128<double> RoundToNearest(Vector128<double> value) { throw new PlatformNotSupportedException(); }
+
         // Conversions
 
         internal static Vector128<sbyte>  ConvertNarrowingSignedSaturate(Vector128<short>   lower, Vector128<short> upper) { throw new PlatformNotSupportedException(); }
diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Wasm/PackedSimd.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Wasm/PackedSimd.cs
index 2a9ac47545ad28..680b8af89492bc 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Wasm/PackedSimd.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Wasm/PackedSimd.cs
@@ -1699,6 +1699,177 @@ public abstract class PackedSimd
         [Intrinsic]
         public static Vector128<nuint>  CompareGreaterThanOrEqual(Vector128<nuint>  left, Vector128<nuint>  right) => CompareGreaterThanOrEqual(left, right);
 
+        // Floating-point sign bit operations
+
+        /// <summary>
+        ///   f32x4.neg
+        /// </summary>
+        [Intrinsic]
+        public static Vector128<float>  Negate(Vector128<float>  value) => Negate(value);
+        /// <summary>
+        ///   f64x2.neg
+        /// </summary>
+        [Intrinsic]
+        public static Vector128<double> Negate(Vector128<double> value) => Negate(value);
+
+        /// <summary>
+        ///   f32x4.abs
+        /// </summary>
+        [Intrinsic]
+        public static Vector128<float>  Abs(Vector128<float>  value) => Abs(value);
+        /// <summary>
+        ///   f64x2.abs
+        /// </summary>
+        [Intrinsic]
+        public static Vector128<double> Abs(Vector128<double> value) => Abs(value);
+
+        // Floating-point min and max
+
+        /// <summary>
+        ///   f32x4.min
+        /// </summary>
+        [Intrinsic]
+        public static Vector128<float>  Min(Vector128<float>  left, Vector128<float>  right) => Min(left, right);
+        /// <summary>
+        ///   f64x2.min
+        /// </summary>
+        [Intrinsic]
+        public static Vector128<double> Min(Vector128<double> left, Vector128<double> right) => Min(left, right);
+
+        /// <summary>
+        ///   f32x4.max
+        /// </summary>
+        [Intrinsic]
+        public static Vector128<float>  Max(Vector128<float>  left, Vector128<float>  right) => Max(left, right);
+        /// <summary>
+        ///   f64x2.max
+        /// </summary>
+        [Intrinsic]
+        public static Vector128<double> Max(Vector128<double> left, Vector128<double> right) => Max(left, right);
+
+        /// <summary>
+        ///   f32x4.pmin
+        /// </summary>
+        [Intrinsic]
+        public static Vector128<float>  PseudoMin(Vector128<float>  left, Vector128<float>  right) => PseudoMin(left, right);
+        /// <summary>
+        ///   f64x2.pmin
+        /// </summary>
+        [Intrinsic]
+        public static Vector128<double> PseudoMin(Vector128<double> left, Vector128<double> right) => PseudoMin(left, right);
+
+        /// <summary>
+        ///   f32x4.pmax
+        /// </summary>
+        [Intrinsic]
+        public static Vector128<float>  PseudoMax(Vector128<float>  left, Vector128<float>  right) => PseudoMax(left, right);
+        /// <summary>
+        ///   f64x2.pmax
+        /// </summary>
+        [Intrinsic]
+        public static Vector128<double> PseudoMax(Vector128<double> left, Vector128<double> right) => PseudoMax(left, right);
+
+        // Floating-point arithmetic
+
+        /// <summary>
+        ///   f32x4.add
+        /// </summary>
+        [Intrinsic]
+        public static Vector128<float>  Add(Vector128<float>  left, Vector128<float>  right) => Add(left, right);
+        /// <summary>
+        ///   f64x2.add
+        /// </summary>
+        [Intrinsic]
+        public static Vector128<double> Add(Vector128<double> left, Vector128<double> right) => Add(left, right);
+
+        /// <summary>
+        ///   f32x4.sub
+        /// </summary>
+        [Intrinsic]
+        public static Vector128<float>  Subtract(Vector128<float>  left, Vector128<float>  right) => Subtract(left, right);
+        /// <summary>
+        ///   f64x2.sub
+        /// </summary>
+        [Intrinsic]
+        public static Vector128<double> Subtract(Vector128<double> left, Vector128<double> right) => Subtract(left, right);
+
+        /// <summary>
+        ///   f32x4.div
+        /// </summary>
+        [Intrinsic]
+        public static Vector128<float>  Divide(Vector128<float>  left, Vector128<float>  right) => Divide(left, right);
+        /// <summary>
+        ///   f64x2.div
+        /// </summary>
+        [Intrinsic]
+        public static Vector128<double> Divide(Vector128<double> left, Vector128<double> right) => Divide(left, right);
+
+        /// <summary>
+        ///   f32x4.mul
+        /// </summary>
+        [Intrinsic]
+        public static Vector128<float>  Multiply(Vector128<float>  left, Vector128<float>  right) => Multiply(left, right);
+        /// <summary>
+        ///   f64x2.mul
+        /// </summary>
+        [Intrinsic]
+        public static Vector128<double> Multiply(Vector128<double> left, Vector128<double> right) => Multiply(left, right);
+
+        /// <summary>
+        ///   f32x4.sqrt
+        /// </summary>
+        [Intrinsic]
+        public static Vector128<float>  Sqrt(Vector128<float>  value) => Sqrt(value);
+        /// <summary>
+        ///   f64x2.sqrt
+        /// </summary>
+        [Intrinsic]
+        public static Vector128<double> Sqrt(Vector128<double> value) => Sqrt(value);
+
+        /// <summary>
+        ///   f32x4.ceil
+        /// </summary>
+        [Intrinsic]
+        public static Vector128<float>  Ceiling(Vector128<float>  value) => Ceiling(value);
+        /// <summary>
+        ///   f64x2.ceil
+        /// </summary>
+        [Intrinsic]
+        public static Vector128<double> Ceiling(Vector128<double> value) => Ceiling(value);
+
+        /// <summary>
+        ///   f32x4.floor
+        /// </summary>
+        [Intrinsic]
+        public static Vector128<float>  Floor(Vector128<float>  value) => Floor(value);
+        /// <summary>
+        ///   f64x2.floor
+        /// </summary>
+        [Intrinsic]
+        public static Vector128<double> Floor(Vector128<double> value) => Floor(value);
+
+        /// <summary>
+        ///   f32x4.trunc
+        /// </summary>
+        [Intrinsic]
+        public static Vector128<float>  Truncate(Vector128<float>  value) => Truncate(value);
+        /// <summary>
+        ///   f64x2.trunc
+        /// </summary>
+        [Intrinsic]
+        public static Vector128<double> Truncate(Vector128<double> value) => Truncate(value);
+
+        /// <summary>
+        ///   f32x4.nearest
+        /// </summary>
+        [Intrinsic]
+        public static Vector128<float>  RoundToNearest(Vector128<float>  value) => RoundToNearest(value);
+        /// <summary>
+        ///   f64x2.nearest
+        /// </summary>
+        [Intrinsic]
+        public static Vector128<double> RoundToNearest(Vector128<double> value) => RoundToNearest(value);
+
         // Conversions
 
         /// <summary>
diff --git a/src/libraries/System.Runtime.Intrinsics/ref/System.Runtime.Intrinsics.cs b/src/libraries/System.Runtime.Intrinsics/ref/System.Runtime.Intrinsics.cs
index d8c842b17ba835..d48000bba4b348 100644
--- a/src/libraries/System.Runtime.Intrinsics/ref/System.Runtime.Intrinsics.cs
+++ b/src/libraries/System.Runtime.Intrinsics/ref/System.Runtime.Intrinsics.cs
@@ -6319,5 +6319,35 @@ public abstract partial class PackedSimd
         public static Vector128<double> CompareGreaterThanOrEqual(Vector128<double> left, Vector128<double> right) { throw null; }
         public static Vector128<nint>   CompareGreaterThanOrEqual(Vector128<nint>   left, Vector128<nint>   right) { throw null; }
         public static Vector128<nuint>  CompareGreaterThanOrEqual(Vector128<nuint>  left, Vector128<nuint>  right) { throw null; }
+        public static Vector128<float>  Negate(Vector128<float>  value) { throw null; }
+        public static Vector128<double> Negate(Vector128<double> value) { throw null; }
+        public static Vector128<float>  Abs(Vector128<float>  value) { throw null; }
+        public static Vector128<double> Abs(Vector128<double> value) { throw null; }
+        public static Vector128<float>  Min(Vector128<float>  left, Vector128<float>  right) { throw null; }
+        public static Vector128<double> Min(Vector128<double> left, Vector128<double> right) { throw null; }
+        public static Vector128<float>  Max(Vector128<float>  left, Vector128<float>  right) { throw null; }
+        public static Vector128<double> Max(Vector128<double> left, Vector128<double> right) { throw null; }
+        public static Vector128<float>  PseudoMin(Vector128<float>  left, Vector128<float>  right) { throw null; }
+        public static Vector128<double> PseudoMin(Vector128<double> left, Vector128<double> right) { throw null; }
+        public static Vector128<float>  PseudoMax(Vector128<float>  left, Vector128<float>  right) { throw null; }
+        public static Vector128<double> PseudoMax(Vector128<double> left, Vector128<double> right) { throw null; }
+        public static Vector128<float>  Add(Vector128<float>  left, Vector128<float>  right) { throw null; }
+        public static Vector128<double> Add(Vector128<double> left, Vector128<double> right) { throw null; }
+        public static Vector128<float>  Subtract(Vector128<float>  left, Vector128<float>  right) { throw null; }
+        public static Vector128<double> Subtract(Vector128<double> left, Vector128<double> right) { throw null; }
+        public static Vector128<float>  Divide(Vector128<float>  left, Vector128<float>  right) { throw null; }
+        public static Vector128<double> Divide(Vector128<double> left, Vector128<double> right) { throw null; }
+        public static Vector128<float>  Multiply(Vector128<float>  left, Vector128<float>  right) { throw null; }
+        public static Vector128<double> Multiply(Vector128<double> left, Vector128<double> right) { throw null; }
+        public static Vector128<float>  Sqrt(Vector128<float>  value) { throw null; }
+        public static Vector128<double> Sqrt(Vector128<double> value) { throw null; }
+        public static Vector128<float>  Ceiling(Vector128<float>  value) { throw null; }
+        public static Vector128<double> Ceiling(Vector128<double> value) { throw null; }
+        public static Vector128<float>  Floor(Vector128<float>  value) { throw null; }
+        public static Vector128<double> Floor(Vector128<double> value) { throw null; }
+        public static Vector128<float>  Truncate(Vector128<float>  value) { throw null; }
+        public static Vector128<double> Truncate(Vector128<double> value) { throw null; }
+        public static Vector128<float>  RoundToNearest(Vector128<float>  value) { throw null; }
+        public static Vector128<double> RoundToNearest(Vector128<double> value) { throw null; }
     }
 }
diff --git a/src/mono/mono/mini/llvm-intrinsics.h b/src/mono/mono/mini/llvm-intrinsics.h
index 10932f78bc4ccf..953879caeb0421 100644
--- a/src/mono/mono/mini/llvm-intrinsics.h
+++ b/src/mono/mono/mini/llvm-intrinsics.h
@@ -100,6 +100,14 @@ INTRINS(PEXT_I64, x86_bmi_pext_64, X86)
 INTRINS(PDEP_I32, x86_bmi_pdep_32, X86)
 INTRINS(PDEP_I64, x86_bmi_pdep_64, X86)
 
+INTRINS_OVR(SIMD_SQRT_R8, sqrt, Generic, sse_r8_t)
+INTRINS_OVR(SIMD_SQRT_R4, sqrt, Generic, sse_r4_t)
+INTRINS_OVR_TAG(SIMD_FLOOR, floor, Generic, Scalar | V64 | V128 | R4 | R8)
+INTRINS_OVR_TAG(SIMD_CEIL, ceil, Generic, Scalar | V64 | V128 | R4 | R8)
+INTRINS_OVR_TAG(SIMD_TRUNC, trunc, Generic, Scalar | V64 | V128 | R4 | R8)
+INTRINS_OVR_TAG(SIMD_ROUND, round, Generic, Scalar | V64 | V128 | R4 | R8)
+INTRINS_OVR_TAG(SIMD_NEAREST, nearbyint, Generic, V64 | V128 | R4 | R8)
+
 #if LLVM_API_VERSION >= 1400
 INTRINS_OVR_TAG(ROUNDEVEN, roundeven, Generic, Scalar | V64 | V128 | R4 | R8)
 #endif
@@ -124,8 +132,6 @@ INTRINS(SSE_PSRL_Q, x86_sse2_psrl_q, X86)
 INTRINS(SSE_PSLL_W, x86_sse2_psll_w, X86)
 INTRINS(SSE_PSLL_D, x86_sse2_psll_d, X86)
 INTRINS(SSE_PSLL_Q, x86_sse2_psll_q, X86)
-INTRINS_OVR(SSE_SQRT_PD, sqrt, Generic, sse_r8_t)
-INTRINS_OVR(SSE_SQRT_PS, sqrt, Generic, sse_r4_t)
 INTRINS_OVR(SSE_SQRT_SD, sqrt, Generic, LLVMDoubleType ())
 INTRINS_OVR(SSE_SQRT_SS, sqrt, Generic, LLVMFloatType ())
 INTRINS(SSE_RCP_PS, x86_sse_rcp_ps, X86)
@@ -283,6 +289,10 @@ INTRINS_OVR_2_ARG(WASM_NARROW_SIGNED_V16, wasm_narrow_signed, Wasm, sse_i1_t, ss
 INTRINS_OVR_2_ARG(WASM_NARROW_SIGNED_V8, wasm_narrow_signed, Wasm, sse_i2_t, sse_i4_t)
 INTRINS_OVR_2_ARG(WASM_NARROW_UNSIGNED_V16, wasm_narrow_unsigned, Wasm, sse_i1_t, sse_i2_t)
 INTRINS_OVR_2_ARG(WASM_NARROW_UNSIGNED_V8, wasm_narrow_unsigned, Wasm, sse_i2_t, sse_i4_t)
+INTRINS_OVR_TAG(WASM_PMAX, wasm_pmax, Wasm, V128 | R4 | R8) /* PackedSimd.PseudoMax: f32x4.pmax / f64x2.pmax */
+INTRINS_OVR_TAG(WASM_PMIN, wasm_pmin, Wasm, V128 | R4 | R8) /* PackedSimd.PseudoMin: f32x4.pmin / f64x2.pmin */
+INTRINS_OVR(WASM_FABS_V4, fabs, Generic, sse_r4_t) /* PackedSimd.Abs on float lanes; referenced as INTRINS_WASM_FABS_V4 */
+INTRINS_OVR(WASM_FABS_V2, fabs, Generic, sse_r8_t) /* PackedSimd.Abs on double lanes; referenced as INTRINS_WASM_FABS_V2 */
 INTRINS(WASM_Q15MULR_SAT_SIGNED, wasm_q15mulr_sat_signed, Wasm)
 INTRINS(WASM_SHUFFLE, wasm_shuffle, Wasm)
 INTRINS_OVR(WASM_SUB_SAT_SIGNED_V16, wasm_sub_sat_signed, Wasm, sse_i1_t)
@@ -436,13 +446,9 @@ INTRINS_OVR_TAG(AARCH64_ADV_SIMD_FRECPS, aarch64_neon_frecps, Arm64, Scalar | V6
 INTRINS_OVR_TAG(AARCH64_ADV_SIMD_RBIT, aarch64_neon_rbit, Arm64, V64 | V128 | I1)
 #endif
 
-INTRINS_OVR_TAG(AARCH64_ADV_SIMD_FRINTA, round, Generic, Scalar | V64 | V128 | R4 | R8)
 #if LLVM_API_VERSION < 1400
 INTRINS_OVR_TAG(AARCH64_ADV_SIMD_FRINTN, aarch64_neon_frintn, Arm64, Scalar | V64 | V128 | R4 | R8)
 #endif
-INTRINS_OVR_TAG(AARCH64_ADV_SIMD_FRINTM, floor, Generic, Scalar | V64 | V128 | R4 | R8)
-INTRINS_OVR_TAG(AARCH64_ADV_SIMD_FRINTP, ceil, Generic, Scalar | V64 | V128 | R4 | R8)
-INTRINS_OVR_TAG(AARCH64_ADV_SIMD_FRINTZ, trunc, Generic, Scalar | V64 | V128 | R4 | R8)
 
 INTRINS_OVR_TAG(AARCH64_ADV_SIMD_SUQADD, aarch64_neon_suqadd, Arm64, Scalar | V64 | V128 | I1 | I2 | I4 | I8)
 INTRINS_OVR_TAG(AARCH64_ADV_SIMD_USQADD, aarch64_neon_usqadd, Arm64, Scalar | V64 | V128 | I1 | I2 | I4 | I8)
diff --git a/src/mono/mono/mini/mini-llvm.c b/src/mono/mono/mini/mini-llvm.c
index e2e9e2dd78d540..e6685c42810e09 100644
--- a/src/mono/mono/mini/mini-llvm.c
+++ b/src/mono/mono/mini/mini-llvm.c
@@ -11546,8 +11546,6 @@ MONO_RESTORE_WARNING
 			values [ins->dreg] = result;
 			break;
 		}
-#endif
-#if defined(TARGET_ARM64) || defined(TARGET_AMD64)
 		case OP_NEGATION:
 		case OP_NEGATION_SCALAR: {
 			gboolean scalar = ins->opcode == OP_NEGATION_SCALAR;
@@ -11565,6 +11563,8 @@ MONO_RESTORE_WARNING
 			values [ins->dreg] = result;
 			break;
 		}
+#endif
+#if defined(TARGET_ARM64) || defined(TARGET_AMD64)
 		case OP_ONES_COMPLEMENT: {
 			LLVMTypeRef ret_t = LLVMTypeOf (lhs);
 			LLVMValueRef result = bitcast_to_integral (ctx, lhs);
diff --git a/src/mono/mono/mini/mini-ops.h b/src/mono/mono/mini/mini-ops.h
index 00f382b4148d94..918a655ccbdd97 100644
--- a/src/mono/mono/mini/mini-ops.h
+++ b/src/mono/mono/mini/mini-ops.h
@@ -1803,8 +1803,6 @@ MINI_OP(OP_WASM_ONESCOMPLEMENT, "wasm_onescomplement", XREG, XREG, NONE)
 #endif
 
 #if defined(TARGET_ARM64) || defined(TARGET_AMD64)
-MINI_OP(OP_NEGATION,        "negate", XREG, XREG, NONE)
-MINI_OP(OP_NEGATION_SCALAR, "negate_scalar", XREG, XREG, NONE)
 MINI_OP(OP_ONES_COMPLEMENT, "ones_complement", XREG, XREG, NONE)
 
 MINI_OP(OP_CVT_FP_UI,        "convert_fp_to_ui", XREG, XREG, NONE)
@@ -1819,6 +1817,8 @@ MINI_OP(OP_CVT_SI_FP_SCALAR, "convert_si_to_fp_scalar", XREG, XREG, NONE)
 #endif // TARGET_ARM64 || TARGET_AMD64
 
 #if defined(TARGET_ARM64) || defined(TARGET_AMD64) || defined(TARGET_WASM)
+MINI_OP(OP_NEGATION,        "negate", XREG, XREG, NONE)
+MINI_OP(OP_NEGATION_SCALAR, "negate_scalar", XREG, XREG, NONE)
 MINI_OP3(OP_BSL,            "bitwise_select", XREG, XREG, XREG, XREG)
 #endif // TARGET_ARM64 || TARGET_AMD64 || TARGET_WASM
 
diff --git a/src/mono/mono/mini/simd-arm64.h b/src/mono/mono/mini/simd-arm64.h
index 9bd2c31cb7dd79..c7d60f4f5f97f1 100644
--- a/src/mono/mono/mini/simd-arm64.h
+++ b/src/mono/mono/mini/simd-arm64.h
@@ -78,8 +78,8 @@ SIMD_OP  (128, OP_XBINOP_FORCEINT,    XBINOP_FORCEINT_XOR,    WDSS,      arm_neo
 SIMD_OP  (128, OP_ARM64_XADDV, INTRINS_AARCH64_ADV_SIMD_UADDV, WTDS,     arm_neon_addv,    arm_neon_addv,    arm_neon_addv,   _SKIP,             _UNDEF,           _UNDEF)
 SIMD_OP  (128, OP_ARM64_XADDV, INTRINS_AARCH64_ADV_SIMD_SADDV, WTDS,     arm_neon_addv,    arm_neon_addv,    arm_neon_addv,   _SKIP,             _UNDEF,           _UNDEF)
 SIMD_OP  (128, OP_ARM64_XADDV, INTRINS_AARCH64_ADV_SIMD_FADDV, WTDS,     _UNDEF,           _UNDEF,           _UNDEF,          _UNDEF,            _SKIP,            _SKIP)
-SIMD_OP  (128, OP_XOP_OVR_X_X, INTRINS_AARCH64_ADV_SIMD_FRINTP, WTDS,    _UNDEF,           _UNDEF,           _UNDEF,          _UNDEF,            arm_neon_frintp,  arm_neon_frintp)
-SIMD_OP  (128, OP_XOP_OVR_X_X, INTRINS_AARCH64_ADV_SIMD_FRINTM, WTDS,    _UNDEF,           _UNDEF,           _UNDEF,          _UNDEF,            arm_neon_frintm,  arm_neon_frintm)
+SIMD_OP  (128, OP_XOP_OVR_X_X, INTRINS_SIMD_CEIL, WTDS,    _UNDEF,           _UNDEF,           _UNDEF,          _UNDEF,            arm_neon_frintp,  arm_neon_frintp)
+SIMD_OP  (128, OP_XOP_OVR_X_X, INTRINS_SIMD_FLOOR, WTDS,    _UNDEF,           _UNDEF,           _UNDEF,          _UNDEF,            arm_neon_frintm,  arm_neon_frintm)
 SIMD_OP  (128, OP_XOP_OVR_X_X, INTRINS_AARCH64_ADV_SIMD_FSQRT,  WTDS,    _UNDEF,           _UNDEF,           _UNDEF,          _UNDEF,            arm_neon_fsqrt,   arm_neon_fsqrt)
 SIMD_OP  (128, OP_XOP_OVR_X_X, INTRINS_AARCH64_ADV_SIMD_ABS,    WTDS,    arm_neon_abs,     arm_neon_abs,     arm_neon_abs,    arm_neon_abs,      _UNDEF,           _UNDEF)
 SIMD_OP  (128, OP_XOP_OVR_X_X, INTRINS_AARCH64_ADV_SIMD_FABS,   WTDS,    _UNDEF,           _UNDEF,           _UNDEF,          _UNDEF,            arm_neon_fabs,    arm_neon_fabs)
diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c
index 819a9f6b2b0058..732b57041eac6a 100644
--- a/src/mono/mono/mini/simd-intrinsics.c
+++ b/src/mono/mono/mini/simd-intrinsics.c
@@ -428,13 +428,19 @@ emit_simd_ins_for_unary_op (MonoCompile *cfg, MonoClass *klass, MonoMethodSignat
 	}
 	return emit_simd_ins_for_sig (cfg, klass, op, -1, arg_type, fsig, args);
 #elif defined(TARGET_WASM)
+	int op = -1;
 	switch (id)
 	{
+	case SN_Negate:
+		op = OP_NEGATION;
+		break;
 	case SN_OnesComplement:
-		return emit_simd_ins_for_sig (cfg, klass, OP_WASM_ONESCOMPLEMENT, -1, arg_type, fsig, args);
+		op = OP_WASM_ONESCOMPLEMENT;
+		break;
 	default:
 		return NULL;
 	}
+	return emit_simd_ins_for_sig (cfg, klass, op, -1, arg_type, fsig, args);
 #else
 	return NULL;
 #endif
@@ -1452,7 +1458,7 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi
 		if (!type_enum_is_float (arg0_type))
 			return NULL;
 #ifdef TARGET_ARM64
-		int ceil_or_floor = id == SN_Ceiling ? INTRINS_AARCH64_ADV_SIMD_FRINTP : INTRINS_AARCH64_ADV_SIMD_FRINTM;
+		int ceil_or_floor = id == SN_Ceiling ? INTRINS_SIMD_CEIL : INTRINS_SIMD_FLOOR;
 		return emit_simd_ins_for_sig (cfg, klass, OP_XOP_OVR_X_X, ceil_or_floor, arg0_type, fsig, args);
 #elif defined(TARGET_AMD64)
 		if (!is_SIMD_feature_supported (cfg, MONO_CPU_X86_SSE41))
@@ -2026,8 +2032,8 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi
 			return NULL;
 #ifdef TARGET_ARM64
 		return emit_simd_ins_for_sig (cfg, klass, OP_XOP_OVR_X_X, INTRINS_AARCH64_ADV_SIMD_FSQRT, arg0_type, fsig, args);
-#elif defined(TARGET_AMD64)
-		int instc0 = arg0_type == MONO_TYPE_R4 ? INTRINS_SSE_SQRT_PS : INTRINS_SSE_SQRT_PD;
+#elif defined(TARGET_AMD64) || defined(TARGET_WASM)
+		int instc0 = arg0_type == MONO_TYPE_R4 ? INTRINS_SIMD_SQRT_R4 : INTRINS_SIMD_SQRT_R8;
 
 		return emit_simd_ins_for_sig (cfg, klass, OP_XOP_X_X, instc0, arg0_type, fsig, args);
 #else
@@ -2643,9 +2649,9 @@ emit_vector_2_3_4 (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *f
 	case SN_SquareRoot: {
 #ifdef TARGET_ARM64
 		return emit_simd_ins_for_sig (cfg, klass, OP_XOP_OVR_X_X, INTRINS_AARCH64_ADV_SIMD_FSQRT, MONO_TYPE_R4, fsig, args);
-#elif defined(TARGET_AMD64)
+#elif defined(TARGET_AMD64) || defined(TARGET_WASM)
 		ins = emit_simd_ins (cfg, klass, OP_XOP_X_X, args [0]->dreg, -1);
-		ins->inst_c0 = (IntrinsicId)INTRINS_SSE_SQRT_PS;
+		ins->inst_c0 = (IntrinsicId)INTRINS_SIMD_SQRT_R4;
 		return ins;
 #else
 		return NULL;
@@ -3204,8 +3210,8 @@ static SimdIntrinsic advsimd_methods [] = {
 	{SN_And, OP_XBINOP_FORCEINT, XBINOP_FORCEINT_AND},
 	{SN_BitwiseClear, OP_ARM64_BIC},
 	{SN_BitwiseSelect, OP_BSL},
-	{SN_Ceiling, OP_XOP_OVR_X_X, INTRINS_AARCH64_ADV_SIMD_FRINTP},
-	{SN_CeilingScalar, OP_XOP_OVR_SCALAR_X_X, INTRINS_AARCH64_ADV_SIMD_FRINTP},
+	{SN_Ceiling, OP_XOP_OVR_X_X, INTRINS_SIMD_CEIL},
+	{SN_CeilingScalar, OP_XOP_OVR_SCALAR_X_X, INTRINS_SIMD_CEIL},
 	{SN_CompareEqual, OP_XCOMPARE, CMP_EQ, OP_XCOMPARE, CMP_EQ, OP_XCOMPARE_FP, CMP_EQ},
 	{SN_CompareEqualScalar, OP_XCOMPARE_SCALAR, CMP_EQ, OP_XCOMPARE_SCALAR, CMP_EQ, OP_XCOMPARE_FP_SCALAR, CMP_EQ},
 	{SN_CompareGreaterThan, OP_XCOMPARE, CMP_GT, OP_XCOMPARE, CMP_GT_UN, OP_XCOMPARE_FP, CMP_GT},
@@ -3284,8 +3290,8 @@ static SimdIntrinsic advsimd_methods [] = {
 	{SN_ExtractNarrowingUpper, OP_ARM64_XTN2},
 	{SN_ExtractVector128, OP_ARM64_EXT},
 	{SN_ExtractVector64, OP_ARM64_EXT},
-	{SN_Floor, OP_XOP_OVR_X_X, INTRINS_AARCH64_ADV_SIMD_FRINTM},
-	{SN_FloorScalar, OP_XOP_OVR_SCALAR_X_X, INTRINS_AARCH64_ADV_SIMD_FRINTM},
+	{SN_Floor, OP_XOP_OVR_X_X, INTRINS_SIMD_FLOOR},
+	{SN_FloorScalar, OP_XOP_OVR_SCALAR_X_X, INTRINS_SIMD_FLOOR},
 	{SN_FusedAddHalving, OP_XOP_OVR_X_X_X, INTRINS_AARCH64_ADV_SIMD_SHADD, OP_XOP_OVR_X_X_X, INTRINS_AARCH64_ADV_SIMD_UHADD},
 	{SN_FusedAddRoundedHalving, OP_XOP_OVR_X_X_X, INTRINS_AARCH64_ADV_SIMD_SRHADD, OP_XOP_OVR_X_X_X, INTRINS_AARCH64_ADV_SIMD_URHADD},
 	{SN_FusedMultiplyAdd, OP_ARM64_FMADD},
@@ -3427,8 +3433,8 @@ static SimdIntrinsic advsimd_methods [] = {
 #else
 	{SN_ReverseElementBits, OP_XOP_OVR_X_X, INTRINS_AARCH64_ADV_SIMD_RBIT},
 #endif
-	{SN_RoundAwayFromZero, OP_XOP_OVR_X_X, INTRINS_AARCH64_ADV_SIMD_FRINTA},
-	{SN_RoundAwayFromZeroScalar, OP_XOP_OVR_SCALAR_X_X, INTRINS_AARCH64_ADV_SIMD_FRINTA},
+	{SN_RoundAwayFromZero, OP_XOP_OVR_X_X, INTRINS_SIMD_ROUND},
+	{SN_RoundAwayFromZeroScalar, OP_XOP_OVR_SCALAR_X_X, INTRINS_SIMD_ROUND},
 #if LLVM_API_VERSION >= 1400
 	{SN_RoundToNearest, OP_XOP_OVR_X_X, INTRINS_ROUNDEVEN},
 	{SN_RoundToNearestScalar, OP_XOP_OVR_SCALAR_X_X, INTRINS_ROUNDEVEN},
@@ -3436,12 +3442,12 @@ static SimdIntrinsic advsimd_methods [] = {
 	{SN_RoundToNearest, OP_XOP_OVR_X_X, INTRINS_AARCH64_ADV_SIMD_FRINTN},
 	{SN_RoundToNearestScalar, OP_XOP_OVR_SCALAR_X_X, INTRINS_AARCH64_ADV_SIMD_FRINTN},
 #endif
-	{SN_RoundToNegativeInfinity, OP_XOP_OVR_X_X, INTRINS_AARCH64_ADV_SIMD_FRINTM},
-	{SN_RoundToNegativeInfinityScalar, OP_XOP_OVR_SCALAR_X_X, INTRINS_AARCH64_ADV_SIMD_FRINTM},
-	{SN_RoundToPositiveInfinity, OP_XOP_OVR_X_X, INTRINS_AARCH64_ADV_SIMD_FRINTP},
-	{SN_RoundToPositiveInfinityScalar, OP_XOP_OVR_SCALAR_X_X, INTRINS_AARCH64_ADV_SIMD_FRINTP},
-	{SN_RoundToZero, OP_XOP_OVR_X_X, INTRINS_AARCH64_ADV_SIMD_FRINTZ},
-	{SN_RoundToZeroScalar, OP_XOP_OVR_SCALAR_X_X, INTRINS_AARCH64_ADV_SIMD_FRINTZ},
+	{SN_RoundToNegativeInfinity, OP_XOP_OVR_X_X, INTRINS_SIMD_FLOOR},
+	{SN_RoundToNegativeInfinityScalar, OP_XOP_OVR_SCALAR_X_X, INTRINS_SIMD_FLOOR},
+	{SN_RoundToPositiveInfinity, OP_XOP_OVR_X_X, INTRINS_SIMD_CEIL},
+	{SN_RoundToPositiveInfinityScalar, OP_XOP_OVR_SCALAR_X_X, INTRINS_SIMD_CEIL},
+	{SN_RoundToZero, OP_XOP_OVR_X_X, INTRINS_SIMD_TRUNC},
+	{SN_RoundToZeroScalar, OP_XOP_OVR_SCALAR_X_X, INTRINS_SIMD_TRUNC},
 	{SN_ShiftArithmetic, OP_XOP_OVR_X_X_X, INTRINS_AARCH64_ADV_SIMD_SSHL},
 	{SN_ShiftArithmeticRounded, OP_XOP_OVR_X_X_X, INTRINS_AARCH64_ADV_SIMD_SRSHL},
 	{SN_ShiftArithmeticRoundedSaturate, OP_XOP_OVR_X_X_X, INTRINS_AARCH64_ADV_SIMD_SQRSHL},
@@ -4042,7 +4048,7 @@ static SimdIntrinsic sse_methods [] = {
 	{SN_ReciprocalSqrt, OP_XOP_X_X, INTRINS_SSE_RSQRT_PS},
 	{SN_ReciprocalSqrtScalar},
 	{SN_Shuffle},
-	{SN_Sqrt, OP_XOP_X_X, INTRINS_SSE_SQRT_PS},
+	{SN_Sqrt, OP_XOP_X_X, INTRINS_SIMD_SQRT_R4},
 	{SN_SqrtScalar},
 	{SN_Store, OP_SSE_STORE, 1 /* alignment */},
 	{SN_StoreAligned, OP_SSE_STORE, 16 /* alignment */},
@@ -4152,7 +4158,7 @@ static SimdIntrinsic sse2_methods [] = {
 	{SN_Shuffle},
 	{SN_ShuffleHigh},
 	{SN_ShuffleLow},
-	{SN_Sqrt, OP_XOP_X_X, INTRINS_SSE_SQRT_PD},
+	{SN_Sqrt, OP_XOP_X_X, INTRINS_SIMD_SQRT_R8},
 	{SN_SqrtScalar},
 	{SN_Store, OP_SSE_STORE, 1 /* alignment */},
 	{SN_StoreAligned, OP_SSE_STORE, 16 /* alignment */},
@@ -5043,7 +5049,7 @@ static SimdIntrinsic wasmbase_methods [] = {
 };
 
 static SimdIntrinsic packedsimd_methods [] = {
-	{SN_Abs, OP_VECTOR_IABS},
+	{SN_Abs},
 	{SN_Add},
 	{SN_AddPairwiseWidening},
 	{SN_AddSaturate},
@@ -5054,6 +5060,7 @@ static SimdIntrinsic packedsimd_methods [] = {
 	{SN_AverageRounded},
 	{SN_Bitmask, OP_WASM_SIMD_BITMASK},
 	{SN_BitwiseSelect, OP_BSL},
+	{SN_Ceiling, OP_XOP_OVR_X_X, INTRINS_SIMD_CEIL},
 	{SN_CompareEqual, OP_XCOMPARE, CMP_EQ, OP_XCOMPARE, CMP_EQ, OP_XCOMPARE_FP, CMP_EQ},
 	{SN_CompareGreaterThan, OP_XCOMPARE, CMP_GT, OP_XCOMPARE, CMP_GT_UN, OP_XCOMPARE_FP, CMP_GT},
 	{SN_CompareGreaterThanOrEqual, OP_XCOMPARE, CMP_GE, OP_XCOMPARE, CMP_GE_UN, OP_XCOMPARE_FP, CMP_GE},
@@ -5062,10 +5069,12 @@ static SimdIntrinsic packedsimd_methods [] = {
 	{SN_CompareNotEqual, OP_XCOMPARE, CMP_NE, OP_XCOMPARE, CMP_NE, OP_XCOMPARE_FP, CMP_NE},
 	{SN_ConvertNarrowingSignedSaturate},
 	{SN_ConvertNarrowingUnsignedSaturate},
+	{SN_Divide},
 	{SN_Dot, OP_XOP_X_X_X, INTRINS_WASM_DOT},
 	{SN_ExtractLane},
-	{SN_Max, OP_XBINOP, OP_IMIN, OP_XBINOP, OP_IMIN_UN},
-	{SN_Min, OP_XBINOP, OP_IMAX, OP_XBINOP, OP_IMAX_UN},
+	{SN_Floor, OP_XOP_OVR_X_X, INTRINS_SIMD_FLOOR},
+	{SN_Max, OP_XBINOP, OP_IMAX, OP_XBINOP, OP_IMAX_UN, OP_XBINOP, OP_FMAX}, /* Max must select the *MAX opcodes (was swapped with Min) */
+	{SN_Min, OP_XBINOP, OP_IMIN, OP_XBINOP, OP_IMIN_UN, OP_XBINOP, OP_FMIN}, /* Min must select the *MIN opcodes (was swapped with Max) */
 	{SN_Multiply},
 	{SN_MultiplyRoundedSaturateQ15, OP_XOP_X_X_X, INTRINS_WASM_Q15MULR_SAT_SIGNED},
 	{SN_MultiplyWideningLower, OP_WASM_EXTMUL_LOWER, 0, OP_WASM_EXTMUL_LOWER_U},
@@ -5074,15 +5083,20 @@ static SimdIntrinsic packedsimd_methods [] = {
 	{SN_Not, OP_WASM_ONESCOMPLEMENT},
 	{SN_Or, OP_XBINOP_FORCEINT, XBINOP_FORCEINT_OR},
 	{SN_PopCount, OP_XOP_OVR_X_X, INTRINS_SIMD_POPCNT},
+	{SN_PseudoMax, OP_XOP_OVR_X_X, INTRINS_WASM_PMAX},
+	{SN_PseudoMin, OP_XOP_OVR_X_X, INTRINS_WASM_PMIN},
 	{SN_ReplaceLane},
+	{SN_RoundToNearest, OP_XOP_OVR_X_X, INTRINS_SIMD_NEAREST},
 	{SN_ShiftLeft, OP_SIMD_SHL},
 	{SN_ShiftRightArithmetic, OP_SIMD_SSHR},
 	{SN_ShiftRightLogical, OP_SIMD_USHR},
 	{SN_Shuffle, OP_WASM_SIMD_SHUFFLE},
 	{SN_Splat},
+	{SN_Sqrt},
 	{SN_Subtract},
 	{SN_SubtractSaturate},
 	{SN_Swizzle, OP_WASM_SIMD_SWIZZLE},
+	{SN_Truncate, OP_XOP_OVR_X_X, INTRINS_SIMD_TRUNC},
 	{SN_Xor, OP_XBINOP_FORCEINT, XBINOP_FORCEINT_XOR},
 	{SN_get_IsSupported},
 };
@@ -5151,15 +5165,26 @@ emit_wasm_supported_intrinsics (
 	}
 
 	if (feature == MONO_CPU_WASM_SIMD) {
-		if (id != SN_Splat && !is_element_type_primitive (fsig->params [0]) ||
-		    id == SN_Splat && !MONO_TYPE_IS_VECTOR_PRIMITIVE(fsig->params [0]))
+		if ((id != SN_Splat && !is_element_type_primitive (fsig->params [0])) ||
+		    (id == SN_Splat && !MONO_TYPE_IS_VECTOR_PRIMITIVE(fsig->params [0])))
 			return NULL;
 
 		uint16_t op = info->default_op;
 		uint16_t c0 = info->default_instc0;
 
 		switch (id) {
+			case SN_Abs: {
+				if (type_enum_is_float(arg0_type)) {
+					op = OP_XOP_X_X;
+					c0 = arg0_type == MONO_TYPE_R8 ? INTRINS_WASM_FABS_V2 : INTRINS_WASM_FABS_V4;
+				} else {
+					op = OP_VECTOR_IABS;
+				}
+				// continue with default emit
+				break;
+			}
 			case SN_Add:
+			case SN_Divide:
 			case SN_Subtract:
 			case SN_Multiply:
 				return emit_simd_ins_for_binary_op (cfg, klass, fsig, args, arg0_type, id);
@@ -5297,10 +5322,6 @@ emit_wasm_supported_intrinsics (
 
 				return NULL;
 			}
-			case SN_CompareEqual:
-				return emit_simd_ins_for_sig (cfg, klass, type_enum_is_float (arg0_type) ? OP_XCOMPARE_FP : OP_XCOMPARE, CMP_EQ, arg0_type, fsig, args);
-			case SN_CompareNotEqual:
-				return emit_simd_ins_for_sig (cfg, klass, type_enum_is_float (arg0_type) ? OP_XCOMPARE_FP : OP_XCOMPARE, CMP_NE, arg0_type, fsig, args);
 			case SN_ConvertNarrowingSignedSaturate: {
 				op = OP_XOP_X_X_X;
 
@@ -5353,6 +5374,12 @@ emit_wasm_supported_intrinsics (
 				g_assert (fsig->param_count == 1 && mono_metadata_type_equal (fsig->params [0], etype));
 				return emit_simd_ins (cfg, klass, type_to_expand_op (etype->type), args [0]->dreg, -1);
 			}
+			case SN_Sqrt: {
+				op = OP_XOP_X_X;
+				c0 = arg0_type == MONO_TYPE_R4 ? INTRINS_SIMD_SQRT_R4 : INTRINS_SIMD_SQRT_R8;
+				// continue with default emit
+				break;
+			}
 			case SN_SubtractSaturate: {
 				op = OP_XOP_X_X_X;
 
diff --git a/src/mono/mono/mini/simd-methods.h b/src/mono/mono/mini/simd-methods.h
index 20db4e837c5f58..6ad755ba155a9f 100644
--- a/src/mono/mono/mini/simd-methods.h
+++ b/src/mono/mono/mini/simd-methods.h
@@ -643,7 +643,10 @@ METHOD(ConvertNarrowingSignedSaturate)
 METHOD(ConvertNarrowingUnsignedSaturate)
 METHOD(ExtractLane)
 METHOD(MultiplyRoundedSaturateQ15)
+METHOD(PseudoMax)
+METHOD(PseudoMin)
 METHOD(ReplaceLane)
 METHOD(ShiftLeft)
 METHOD(Splat)
 METHOD(Swizzle)
+METHOD(Truncate)

From 3e41845b148672ff3b480e22637b1db5e89a59c1 Mon Sep 17 00:00:00 2001
From: Radek Doulik <radek.doulik@gmail.com>
Date: Thu, 4 May 2023 17:14:14 +0200
Subject: [PATCH 2/2] Fix pmax/pmin

---
 src/mono/mono/mini/llvm-intrinsics.h |   4 +-
 src/mono/mono/mini/mini-llvm.c       | 100 +++++++++++++--------------
 src/mono/mono/mini/simd-intrinsics.c |   4 +-
 3 files changed, 54 insertions(+), 54 deletions(-)

diff --git a/src/mono/mono/mini/llvm-intrinsics.h b/src/mono/mono/mini/llvm-intrinsics.h
index 953879caeb0421..da408fbe78d97f 100644
--- a/src/mono/mono/mini/llvm-intrinsics.h
+++ b/src/mono/mono/mini/llvm-intrinsics.h
@@ -289,8 +289,8 @@ INTRINS_OVR_2_ARG(WASM_NARROW_SIGNED_V16, wasm_narrow_signed, Wasm, sse_i1_t, ss
 INTRINS_OVR_2_ARG(WASM_NARROW_SIGNED_V8, wasm_narrow_signed, Wasm, sse_i2_t, sse_i4_t)
 INTRINS_OVR_2_ARG(WASM_NARROW_UNSIGNED_V16, wasm_narrow_unsigned, Wasm, sse_i1_t, sse_i2_t)
 INTRINS_OVR_2_ARG(WASM_NARROW_UNSIGNED_V8, wasm_narrow_unsigned, Wasm, sse_i2_t, sse_i4_t)
-INTRINS_OVR_TAG(WASM_PMAX, wasm_pmax, Wasm, R4 | R8)
-INTRINS_OVR_TAG(WASM_PMIN, wasm_pmin, Wasm, R4 | R8)
+INTRINS_OVR_TAG(WASM_PMAX, wasm_pmax, Wasm, V128 | R4 | R8)
+INTRINS_OVR_TAG(WASM_PMIN, wasm_pmin, Wasm, V128 | R4 | R8)
 INTRINS_OVR(WASM_PMAX_V4, fabs, Generic, sse_r4_t)
 INTRINS_OVR(WASM_PMAX_V2, fabs, Generic, sse_r8_t)
 INTRINS(WASM_Q15MULR_SAT_SIGNED, wasm_q15mulr_sat_signed, Wasm)
diff --git a/src/mono/mono/mini/mini-llvm.c b/src/mono/mono/mini/mini-llvm.c
index e6685c42810e09..aada76de49ff6f 100644
--- a/src/mono/mono/mini/mini-llvm.c
+++ b/src/mono/mono/mini/mini-llvm.c
@@ -7782,13 +7782,62 @@ MONO_RESTORE_WARNING
 			values [ins->dreg] = result;
 			break;
 		}
+#endif
 		case OP_XOP_OVR_X_X: {
 			IntrinsicId iid = (IntrinsicId) ins->inst_c0;
 			llvm_ovr_tag_t ovr_tag = ovr_tag_from_mono_vector_class (ins->klass);
 			values [ins->dreg] = call_overloaded_intrins (ctx, iid, ovr_tag, &lhs, "");
 			break;
 		}
-#endif
+		case OP_XOP_OVR_X_X_X: {
+			IntrinsicId iid = (IntrinsicId) ins->inst_c0;
+			llvm_ovr_tag_t ovr_tag = ovr_tag_from_mono_vector_class (ins->klass);
+			LLVMValueRef args [] = { lhs, rhs };
+			values [ins->dreg] = call_overloaded_intrins (ctx, iid, ovr_tag, args, "");
+			break;
+		}
+		case OP_XOP_OVR_X_X_X_X: {
+			IntrinsicId iid = (IntrinsicId) ins->inst_c0;
+			llvm_ovr_tag_t ovr_tag = ovr_tag_from_mono_vector_class (ins->klass);
+			LLVMValueRef args [] = { lhs, rhs, arg3 };
+			values [ins->dreg] = call_overloaded_intrins (ctx, iid, ovr_tag, args, "");
+			break;
+		}
+		case OP_XOP_OVR_BYSCALAR_X_X_X: {
+			IntrinsicId iid = (IntrinsicId) ins->inst_c0;
+			llvm_ovr_tag_t ovr_tag = ovr_tag_from_mono_vector_class (ins->klass);
+			LLVMTypeRef t = LLVMTypeOf (lhs);
+			unsigned int elems = LLVMGetVectorSize (t);
+			LLVMValueRef arg2 = broadcast_element (ctx, scalar_from_vector (ctx, rhs), elems);
+			LLVMValueRef args [] = { lhs, arg2 };
+			values [ins->dreg] = call_overloaded_intrins (ctx, iid, ovr_tag, args, "");
+			break;
+		}
+		case OP_XOP_OVR_SCALAR_X_X:
+		case OP_XOP_OVR_SCALAR_X_X_X:
+		case OP_XOP_OVR_SCALAR_X_X_X_X: {
+			int num_args = 0;
+			IntrinsicId iid = (IntrinsicId) ins->inst_c0;
+			LLVMTypeRef ret_t = simd_class_to_llvm_type (ctx, ins->klass);
+			switch (ins->opcode) {
+			case OP_XOP_OVR_SCALAR_X_X: num_args = 1; break;
+			case OP_XOP_OVR_SCALAR_X_X_X: num_args = 2; break;
+			case OP_XOP_OVR_SCALAR_X_X_X_X: num_args = 3; break;
+			}
+			/* LLVM 9 NEON intrinsic functions have scalar overloads. Unfortunately
+			 * only overloads for 32 and 64-bit integers and floating point types are
+			 * supported. 8 and 16-bit integers are unsupported, and will fail during
+			 * instruction selection. This is worked around by using a vector
+			 * operation and then explicitly clearing the upper bits of the register.
+			 */
+			ScalarOpFromVectorOpCtx sctx = scalar_op_from_vector_op (ctx, ret_t, ins);
+			LLVMValueRef args [3] = { lhs, rhs, arg3 };
+			scalar_op_from_vector_op_process_args (&sctx, args, num_args);
+			LLVMValueRef result = call_overloaded_intrins (ctx, iid, sctx.ovr_tag, args, "");
+			result = scalar_op_from_vector_op_process_result (&sctx, result);
+			values [ins->dreg] = result;
+			break;
+		}
 #if defined(TARGET_X86) || defined(TARGET_AMD64) || defined(TARGET_ARM64) || defined(TARGET_WASM)
 		case OP_EXTRACTX_U2:
 		case OP_XEXTRACT_I1:
@@ -11473,55 +11522,6 @@ MONO_RESTORE_WARNING
 			values [ins->dreg] = call_overloaded_intrins (ctx, iid, ovr_tag, args, "");
 			break;
 		}
-		case OP_XOP_OVR_X_X_X: {
-			IntrinsicId iid = (IntrinsicId) ins->inst_c0;
-			llvm_ovr_tag_t ovr_tag = ovr_tag_from_mono_vector_class (ins->klass);
-			LLVMValueRef args [] = { lhs, rhs };
-			values [ins->dreg] = call_overloaded_intrins (ctx, iid, ovr_tag, args, "");
-			break;
-		}
-		case OP_XOP_OVR_X_X_X_X: {
-			IntrinsicId iid = (IntrinsicId) ins->inst_c0;
-			llvm_ovr_tag_t ovr_tag = ovr_tag_from_mono_vector_class (ins->klass);
-			LLVMValueRef args [] = { lhs, rhs, arg3 };
-			values [ins->dreg] = call_overloaded_intrins (ctx, iid, ovr_tag, args, "");
-			break;
-		}
-		case OP_XOP_OVR_BYSCALAR_X_X_X: {
-			IntrinsicId iid = (IntrinsicId) ins->inst_c0;
-			llvm_ovr_tag_t ovr_tag = ovr_tag_from_mono_vector_class (ins->klass);
-			LLVMTypeRef t = LLVMTypeOf (lhs);
-			unsigned int elems = LLVMGetVectorSize (t);
-			LLVMValueRef arg2 = broadcast_element (ctx, scalar_from_vector (ctx, rhs), elems);
-			LLVMValueRef args [] = { lhs, arg2 };
-			values [ins->dreg] = call_overloaded_intrins (ctx, iid, ovr_tag, args, "");
-			break;
-		}
-		case OP_XOP_OVR_SCALAR_X_X:
-		case OP_XOP_OVR_SCALAR_X_X_X:
-		case OP_XOP_OVR_SCALAR_X_X_X_X: {
-			int num_args = 0;
-			IntrinsicId iid = (IntrinsicId) ins->inst_c0;
-			LLVMTypeRef ret_t = simd_class_to_llvm_type (ctx, ins->klass);
-			switch (ins->opcode) {
-			case OP_XOP_OVR_SCALAR_X_X: num_args = 1; break;
-			case OP_XOP_OVR_SCALAR_X_X_X: num_args = 2; break;
-			case OP_XOP_OVR_SCALAR_X_X_X_X: num_args = 3; break;
-			}
-			/* LLVM 9 NEON intrinsic functions have scalar overloads. Unfortunately
-			 * only overloads for 32 and 64-bit integers and floating point types are
-			 * supported. 8 and 16-bit integers are unsupported, and will fail during
-			 * instruction selection. This is worked around by using a vector
-			 * operation and then explicitly clearing the upper bits of the register.
-			 */
-			ScalarOpFromVectorOpCtx sctx = scalar_op_from_vector_op (ctx, ret_t, ins);
-			LLVMValueRef args [3] = { lhs, rhs, arg3 };
-			scalar_op_from_vector_op_process_args (&sctx, args, num_args);
-			LLVMValueRef result = call_overloaded_intrins (ctx, iid, sctx.ovr_tag, args, "");
-			result = scalar_op_from_vector_op_process_result (&sctx, result);
-			values [ins->dreg] = result;
-			break;
-		}
 #endif
 #ifdef TARGET_WASM
 		case OP_WASM_ONESCOMPLEMENT: {
diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c
index 732b57041eac6a..f0f0d191947226 100644
--- a/src/mono/mono/mini/simd-intrinsics.c
+++ b/src/mono/mono/mini/simd-intrinsics.c
@@ -5083,8 +5083,8 @@ static SimdIntrinsic packedsimd_methods [] = {
 	{SN_Not, OP_WASM_ONESCOMPLEMENT},
 	{SN_Or, OP_XBINOP_FORCEINT, XBINOP_FORCEINT_OR},
 	{SN_PopCount, OP_XOP_OVR_X_X, INTRINS_SIMD_POPCNT},
-	{SN_PseudoMax, OP_XOP_OVR_X_X, INTRINS_WASM_PMAX},
-	{SN_PseudoMin, OP_XOP_OVR_X_X, INTRINS_WASM_PMIN},
+	{SN_PseudoMax, OP_XOP_OVR_X_X_X, INTRINS_WASM_PMAX},
+	{SN_PseudoMin, OP_XOP_OVR_X_X_X, INTRINS_WASM_PMIN},
 	{SN_ReplaceLane},
 	{SN_RoundToNearest, OP_XOP_OVR_X_X, INTRINS_SIMD_NEAREST},
 	{SN_ShiftLeft, OP_SIMD_SHL},