Skip to content

Minor cleanup of approved Avx512 API surface #86168

@tannergooding

Description

@tannergooding

Summary

While implementing the API surface for:

There were some minor differences that came up in what was reviewed vs the correct implementation and which need official sign-off.

API Proposal

Only the APIs that differ from the original proposal or were found to be missing are covered below.

Most of these differences or missing APIs are "innocuous" and are simply cases where the unsigned variant was missing or where the proposal said the API took Vector256 but it should've been Vector512.

There are however a couple places where the actual name of the API had to change and needs a bit more scrutiny.

I did not call out the APIs which are taking a byte parameter and which were annotated with [ConstantExpectedAttribute]. I considered this part of the same mass approval for the attribute and hwintrinsics in general.

 namespace System.Runtime.Intrinsics.X86;
 
 public abstract partial class Avx512BW
 {
     // Missing 
+    public static Vector256<byte>   ConvertToVector256Byte  (Vector512<short>  value);
+    public static Vector256<byte>   ConvertToVector256Byte  (Vector512<ushort> value);
+    public static Vector256<sbyte>  ConvertToVector256SByte (Vector512<short>  value);
+    public static Vector256<sbyte>  ConvertToVector256SByte (Vector512<ushort> value);
+    public static Vector512<ushort> ConvertToVector512UInt16(Vector256<sbyte>  value);
+    public static Vector512<ushort> ConvertToVector512UInt16(Vector256<byte>   value);

     // movdqu8/movdqu16 are Avx512BW only, the versions exposed on Avx512F emit movdqu32
+    public new unsafe static Vector512<byte>   LoadVector512(byte*   address);
+    public new unsafe static Vector512<short>  LoadVector512(short*  address);
+    public new unsafe static Vector512<sbyte>  LoadVector512(sbyte*  address);
+    public new unsafe static Vector512<ushort> LoadVector512(ushort* address);

     // The ones that take two operands are called `permutex2var` in native, have a different parameter order
     // and do a slightly different operation from the PermuteVar32x16 that takes one parameter
-    public static Vector512<short>  PermuteVar32x16  (Vector512<short>  left,  Vector512<short>  right,   Vector512<short>  control);
-    public static Vector512<ushort> PermuteVar32x16  (Vector512<ushort> left,  Vector512<ushort> right,   Vector512<ushort> control);
+    public static Vector512<short>  PermuteVar32x16x2(Vector512<short>  lower, Vector512<short>  indices, Vector512<short>  upper);
+    public static Vector512<ushort> PermuteVar32x16x2(Vector512<ushort> lower, Vector512<ushort> indices, Vector512<ushort> upper);

     // Underlying instruction only operates on bytes. We expose the other overloads for V128/V256, but they cause confusion
-    public static Vector512<short>  ShiftLeftLogical128BitLane(Vector512<short>  value, byte numBytes);
-    public static Vector512<int>    ShiftLeftLogical128BitLane(Vector512<int>    value, byte numBytes);
-    public static Vector512<long>   ShiftLeftLogical128BitLane(Vector512<long>   value, byte numBytes);
-    public static Vector512<ushort> ShiftLeftLogical128BitLane(Vector512<ushort> value, byte numBytes);
-    public static Vector512<uint>   ShiftLeftLogical128BitLane(Vector512<uint>   value, byte numBytes);
-    public static Vector512<ulong>  ShiftLeftLogical128BitLane(Vector512<ulong>  value, byte numBytes);

     // Underlying instruction only operates on bytes. We expose the other overloads for V128/V256, but they cause confusion
-    public static Vector512<short>  ShiftRightLogical128BitLane(Vector512<short>  value, byte numBytes);
-    public static Vector512<int>    ShiftRightLogical128BitLane(Vector512<int>    value, byte numBytes);
-    public static Vector512<long>   ShiftRightLogical128BitLane(Vector512<long>   value, byte numBytes);
-    public static Vector512<ushort> ShiftRightLogical128BitLane(Vector512<ushort> value, byte numBytes);
-    public static Vector512<uint>   ShiftRightLogical128BitLane(Vector512<uint>   value, byte numBytes);
-    public static Vector512<ulong>  ShiftRightLogical128BitLane(Vector512<ulong>  value, byte numBytes);

     // movdqu8/movdqu16 are Avx512BW only, the versions exposed on Avx512F emit movdqu32
+    public new unsafe static void Store(byte*   address, Vector512<byte>   source) { }
+    public new unsafe static void Store(short*  address, Vector512<short>  source) { }
+    public new unsafe static void Store(sbyte*  address, Vector512<sbyte>  source) { }
+    public new unsafe static void Store(ushort* address, Vector512<ushort> source) { }

     // Operation is different from the regular SumAbsoluteDifferences
-    public static Vector512<byte>   SumAbsoluteDifferencesWidening (Vector512<byte> left, Vector512<byte> right, byte control);
+    public static Vector512<ushort> SumAbsoluteDifferencesInBlock32(Vector512<byte> left, Vector512<byte> right, byte control);

     public new abstract partial class VL
     {
         // Avx512BW.VL APIs were not covered

+        public static Vector128<byte> ConvertToVector128Byte(Vector128<short>  value);
+        public static Vector128<byte> ConvertToVector128Byte(Vector128<ushort> value);
+        public static Vector128<byte> ConvertToVector128Byte(Vector256<short>  value);
+        public static Vector128<byte> ConvertToVector128Byte(Vector256<ushort> value);

+        public static Vector128<byte> ConvertToVector128ByteWithSaturation(Vector128<ushort> value);
+        public static Vector128<byte> ConvertToVector128ByteWithSaturation(Vector256<ushort> value);

+        public static Vector128<sbyte> ConvertToVector128SByte(Vector128<short>  value);
+        public static Vector128<sbyte> ConvertToVector128SByte(Vector128<ushort> value);
+        public static Vector128<sbyte> ConvertToVector128SByte(Vector256<short>  value);
+        public static Vector128<sbyte> ConvertToVector128SByte(Vector256<ushort> value);

+        public static Vector128<sbyte> ConvertToVector128SByteWithSaturation(Vector128<short> value);
+        public static Vector128<sbyte> ConvertToVector128SByteWithSaturation(Vector256<short> value);

+        public static Vector128<short>  PermuteVar8x16(Vector128<short>  left, Vector128<short>  control);
+        public static Vector128<ushort> PermuteVar8x16(Vector128<ushort> left, Vector128<ushort> control);

+        public static Vector128<short>  PermuteVar8x16x2(Vector128<short>  lower, Vector128<short>  indices, Vector128<short>  upper);
+        public static Vector128<ushort> PermuteVar8x16x2(Vector128<ushort> lower, Vector128<ushort> indices, Vector128<ushort> upper);

+        public static Vector256<short>  PermuteVar16x16(Vector256<short>  left, Vector256<short>  control);
+        public static Vector256<ushort> PermuteVar16x16(Vector256<ushort> left, Vector256<ushort> control);

+        public static Vector256<short>  PermuteVar16x16x2(Vector256<short>  lower, Vector256<short>  indices, Vector256<short>  upper);
+        public static Vector256<ushort> PermuteVar16x16x2(Vector256<ushort> lower, Vector256<ushort> indices, Vector256<ushort> upper);

+        public static Vector128<short>  ShiftLeftLogicalVariable(Vector128<short>  value, Vector128<ushort> count);
+        public static Vector128<ushort> ShiftLeftLogicalVariable(Vector128<ushort> value, Vector128<ushort> count);
+        public static Vector256<short>  ShiftLeftLogicalVariable(Vector256<short>  value, Vector256<ushort> count);
+        public static Vector256<ushort> ShiftLeftLogicalVariable(Vector256<ushort> value, Vector256<ushort> count);

+        public static Vector128<short> ShiftRightArithmeticVariable(Vector128<short> value, Vector128<ushort> count);
+        public static Vector256<short> ShiftRightArithmeticVariable(Vector256<short> value, Vector256<ushort> count);

+        public static Vector128<short>  ShiftRightLogicalVariable(Vector128<short>  value, Vector128<ushort> count);
+        public static Vector128<ushort> ShiftRightLogicalVariable(Vector128<ushort> value, Vector128<ushort> count);

+        public static Vector256<short>  ShiftRightLogicalVariable(Vector256<short>  value, Vector256<ushort> count);
+        public static Vector256<ushort> ShiftRightLogicalVariable(Vector256<ushort> value, Vector256<ushort> count);

+        public static Vector128<ushort> SumAbsoluteDifferencesInBlock32(Vector128<byte> left, Vector128<byte> right, byte control);
+        public static Vector256<ushort> SumAbsoluteDifferencesInBlock32(Vector256<byte> left, Vector256<byte> right, byte control);
     }
 }

 public abstract partial class Avx512CD
 {
     public new abstract partial class VL
     {
         // Avx512CD.VL APIs were not covered

+        public static Vector128<int>   DetectConflicts(Vector128<int>   value);
+        public static Vector128<long>  DetectConflicts(Vector128<long>  value);
+        public static Vector128<uint>  DetectConflicts(Vector128<uint>  value);
+        public static Vector128<ulong> DetectConflicts(Vector128<ulong> value);
+
+        public static Vector256<int>   DetectConflicts(Vector256<int>   value);
+        public static Vector256<long>  DetectConflicts(Vector256<long>  value);
+        public static Vector256<uint>  DetectConflicts(Vector256<uint>  value);
+        public static Vector256<ulong> DetectConflicts(Vector256<ulong> value);
+
+        public static Vector128<int>   LeadingZeroCount(Vector128<int>   value);
+        public static Vector128<long>  LeadingZeroCount(Vector128<long>  value);
+        public static Vector128<uint>  LeadingZeroCount(Vector128<uint>  value);
+        public static Vector128<ulong> LeadingZeroCount(Vector128<ulong> value);

+        public static Vector256<int>   LeadingZeroCount(Vector256<int>   value);
+        public static Vector256<long>  LeadingZeroCount(Vector256<long>  value);
+        public static Vector256<uint>  LeadingZeroCount(Vector256<uint>  value);
+        public static Vector256<ulong> LeadingZeroCount(Vector256<ulong> value);
     }
 }

 public abstract partial class Avx512DQ
 {
     // Instructions require a memory address
-    public static Vector512<double>        BroadcastToVector512         (Vector128<double> value);
-    public static Vector512<int>           BroadcastToVector512         (Vector256<int>    value);
-    public static Vector512<long>          BroadcastToVector512         (Vector128<long>   value);
-    public static Vector512<float>         BroadcastToVector512         (Vector256<float>  value);
-    public static Vector512<uint>          BroadcastToVector512         (Vector256<uint>   value);
-    public static Vector512<ulong>         BroadcastToVector512         (Vector128<ulong>  value);
+    public static unsafe Vector512<double> BroadcastVector128ToVector512(double*           address);
+    public static unsafe Vector512<long>   BroadcastVector128ToVector512(long*             address);
+    public static unsafe Vector512<ulong>  BroadcastVector128ToVector512(ulong*            address);
+    public static unsafe Vector512<int>    BroadcastVector256ToVector512(int*              address);
+    public static unsafe Vector512<float>  BroadcastVector256ToVector512(float*            address);
+    public static unsafe Vector512<uint>   BroadcastVector256ToVector512(uint*             address);

     // Signed versions were exposed, unsigned versions were missing
+    public static Vector512<ulong> ConvertToVector512UInt64WithTruncation(Vector256<float>  value);
+    public static Vector512<ulong> ConvertToVector512UInt64WithTruncation(Vector256<float>  value, FloatRoundingMode mode);
+    public static Vector512<ulong> ConvertToVector512UInt64WithTruncation(Vector512<double> value);
+    public static Vector512<ulong> ConvertToVector512UInt64WithTruncation(Vector512<double> value, FloatRoundingMode mode);

     // Native uses the name "range"
-    public static Vector512<double> RangeRestrict(Vector512<double> left, Vector512<double> right, byte control);
-    public static Vector512<float>  RangeRestrict(Vector512<float>  left, Vector512<float>  right, byte control);
+    public static Vector512<double> Range        (Vector512<double> left, Vector512<double> right, byte control);
+    public static Vector512<float>  Range        (Vector512<float>  left, Vector512<float>  right, byte control);

     // Native uses the name "range"
-    public static Vector128<double> RangeRestrictScalar(Vector128<double> left, Vector128<double> right, byte control);
-    public static Vector128<float>  RangeRestrictScalar(Vector128<float>  left, Vector128<float>  right, byte control);
+    public static Vector128<double> RangeScalar        (Vector128<double> left, Vector128<double> right, byte control);
+    public static Vector128<float>  RangeScalar        (Vector128<float>  left, Vector128<float>  right, byte control);

     // Upper bits come from left as it's a binary operation
-    public static Vector128<double> RangeRestrictScalar(Vector128<double> upper, Vector128<double> left, Vector128<double> right, byte control);
-    public static Vector128<float>  RangeRestrictScalar(Vector128<float>  upper, Vector128<float>  left, Vector128<float>  right, byte control);

     // Native uses the name "reduce"
-    public static Vector512<double> ReductionTransform(Vector512<double> value, byte control);
-    public static Vector512<float>  ReductionTransform(Vector512<float>  value, byte control);
+    public static Vector512<double> Reduce            (Vector512<double> value, byte control);
+    public static Vector512<float>  Reduce            (Vector512<float>  value, byte control);

     // Scalar APIs consistently have variants that do and do not take upper, to help with codegen
+    public static Vector128<double> ReduceScalar(Vector128<double> value, byte control);
+    public static Vector128<float>  ReduceScalar(Vector128<float>  value, byte control);

     // Native uses the name "reduce"
-    public static Vector128<double> ReductionTransformScalar(Vector128<double> upper, Vector128<double> value, byte control);
-    public static Vector128<float>  ReductionTransformScalar(Vector128<float>  upper, Vector128<float>  value, byte control);
+    public static Vector128<double> ReduceScalar            (Vector128<double> upper, Vector128<double> value, byte control);
+    public static Vector128<float>  ReduceScalar            (Vector128<float>  upper, Vector128<float>  value, byte control);

     public new abstract partial class VL
     {
         // Avx512DQ.VL APIs were not covered

+        public static Vector128<int>   BroadcastPairScalarToVector128(Vector128<int>  value);
+        public static Vector128<uint>  BroadcastPairScalarToVector128(Vector128<uint> value);

+        public static Vector256<int>   BroadcastPairScalarToVector256(Vector128<int>   value);
+        public static Vector256<float> BroadcastPairScalarToVector256(Vector128<float> value);
+        public static Vector256<uint>  BroadcastPairScalarToVector256(Vector128<uint>  value);

+        public static Vector128<double> ConvertToVector128Double(Vector128<long>  value);
+        public static Vector128<double> ConvertToVector128Double(Vector128<ulong> value);

+        public static Vector128<long> ConvertToVector128Int64(Vector128<double> value);
+        public static Vector128<long> ConvertToVector128Int64(Vector128<float>  value);

+        public static Vector128<long> ConvertToVector128Int64WithTruncation(Vector128<double> value);
+        public static Vector128<long> ConvertToVector128Int64WithTruncation(Vector128<float>  value);

+        public static Vector128<float> ConvertToVector128Single(Vector128<long>  value);
+        public static Vector128<float> ConvertToVector128Single(Vector256<long>  value);
+        public static Vector128<float> ConvertToVector128Single(Vector128<ulong> value);
+        public static Vector128<float> ConvertToVector128Single(Vector256<ulong> value);

+        public static Vector128<ulong> ConvertToVector128UInt64(Vector128<double> value);
+        public static Vector128<ulong> ConvertToVector128UInt64(Vector128<float>  value);

+        public static Vector128<ulong> ConvertToVector128UInt64WithTruncation(Vector128<double> value);
+        public static Vector128<ulong> ConvertToVector128UInt64WithTruncation(Vector128<float>  value);

+        public static Vector256<double> ConvertToVector256Double(Vector256<long>  value);
+        public static Vector256<double> ConvertToVector256Double(Vector256<ulong> value);

+        public static Vector256<long> ConvertToVector256Int64(Vector256<double> value);
+        public static Vector256<long> ConvertToVector256Int64(Vector128<float>  value);

+        public static Vector256<long> ConvertToVector256Int64WithTruncation(Vector256<double> value);
+        public static Vector256<long> ConvertToVector256Int64WithTruncation(Vector128<float>  value);

+        public static Vector256<ulong> ConvertToVector256UInt64(Vector256<double> value);
+        public static Vector256<ulong> ConvertToVector256UInt64(Vector128<float>  value);

+        public static Vector256<ulong> ConvertToVector256UInt64WithTruncation(Vector256<double> value);
+        public static Vector256<ulong> ConvertToVector256UInt64WithTruncation(Vector128<float>  value);

+        public static Vector128<long>  MultiplyLow(Vector128<long> left, Vector128<long> right);
+        public static Vector128<ulong> MultiplyLow(Vector128<ulong> left, Vector128<ulong> right);

+        public static Vector256<long>  MultiplyLow(Vector256<long> left, Vector256<long> right);
+        public static Vector256<ulong> MultiplyLow(Vector256<ulong> left, Vector256<ulong> right);

+        public static Vector128<double> Range(Vector128<double> left, Vector128<double> right, byte control);
+        public static Vector128<float>  Range(Vector128<float>  left, Vector128<float>  right, byte control);

+        public static Vector256<double> Range(Vector256<double> left, Vector256<double> right, byte control);
+        public static Vector256<float>  Range(Vector256<float>  left, Vector256<float>  right, byte control);

+        public static Vector128<double> Reduce(Vector128<double> value, byte control);
+        public static Vector128<float>  Reduce(Vector128<float>  value, byte control);

+        public static Vector256<double> Reduce(Vector256<double> value, byte control);
+        public static Vector256<float>  Reduce(Vector256<float>  value, byte control);
     }
 }

 public abstract partial class Avx512F
 {
     // Missing unsigned versions
+    public static Vector512<uint>  Add(Vector512<uint>  left, Vector512<uint>  right);
+    public static Vector512<ulong> Add(Vector512<ulong> left, Vector512<ulong> right);

     // Operates differently from existing AlignRight
-    public static Vector512<int>   AlignRight  (Vector512<int>   left, Vector512<int>   right, byte mask);
-    public static Vector512<long>  AlignRight  (Vector512<long>  left, Vector512<long>  right, byte mask);
+    public static Vector512<int>   AlignRight32(Vector512<int>   left, Vector512<int>   right, byte mask);
+    public static Vector512<uint>  AlignRight32(Vector512<uint>  left, Vector512<uint>  right, byte mask);
+    public static Vector512<long>  AlignRight64(Vector512<long>  left, Vector512<long>  right, byte mask);
+    public static Vector512<ulong> AlignRight64(Vector512<ulong> left, Vector512<ulong> right, byte mask);

     // Missing unsigned and small types
+    public static Vector512<byte> And(Vector512<byte> left, Vector512<byte> right);
+    public static Vector512<short> And(Vector512<short> left, Vector512<short> right);
+    public static Vector512<sbyte> And(Vector512<sbyte> left, Vector512<sbyte> right);
+    public static Vector512<ushort> And(Vector512<ushort> left, Vector512<ushort> right);
+    public static Vector512<uint> And(Vector512<uint> left, Vector512<uint> right);
+    public static Vector512<ulong> And(Vector512<ulong> left, Vector512<ulong> right);

     // Missing unsigned and small types
+    public static Vector512<byte> AndNot(Vector512<byte> left, Vector512<byte> right);
+    public static Vector512<short> AndNot(Vector512<short> left, Vector512<short> right);
+    public static Vector512<sbyte> AndNot(Vector512<sbyte> left, Vector512<sbyte> right);
+    public static Vector512<ushort> AndNot(Vector512<ushort> left, Vector512<ushort> right);
+    public static Vector512<uint> AndNot(Vector512<uint> left, Vector512<uint> right);
+    public static Vector512<ulong> AndNot(Vector512<ulong> left, Vector512<ulong> right);

     // Missing unsigned types
+    public static Vector512<uint> BroadcastScalarToVector512(Vector128<uint> value);
+    public static Vector512<ulong> BroadcastScalarToVector512(Vector128<ulong> value);

     // Instructions require an address
-    public static Vector512<double>        BroadcastToVector512         (Vector256<double> value);
-    public static Vector512<int>           BroadcastToVector512         (Vector128<int>    value);
-    public static Vector512<long>          BroadcastToVector512         (Vector256<long>   value);
-    public static Vector512<float>         BroadcastToVector512         (Vector128<float>  value);
+    public static unsafe Vector512<int>    BroadcastVector128ToVector512(int*              address);
+    public static unsafe Vector512<float>  BroadcastVector128ToVector512(float*            address);
+    public static unsafe Vector512<uint>   BroadcastVector128ToVector512(uint*             address);
+    public static unsafe Vector512<double> BroadcastVector256ToVector512(double*           address);
+    public static unsafe Vector512<long>   BroadcastVector256ToVector512(long*             address);
+    public static unsafe Vector512<ulong>  BroadcastVector256ToVector512(ulong*            address);

     // Missing types
+    public static Vector128<byte> ConvertToVector128Byte(Vector512<int>   value);
+    public static Vector128<byte> ConvertToVector128Byte(Vector512<long>  value);
+    public static Vector128<byte> ConvertToVector128Byte(Vector512<uint>  value);
+    public static Vector128<byte> ConvertToVector128Byte(Vector512<ulong> value);

     // Missing types
+    public static Vector128<short> ConvertToVector128Int16(Vector512<ulong> value);

     // Missing types
+    public static Vector128<sbyte> ConvertToVector128SByte(Vector512<uint>  value);
+    public static Vector128<sbyte> ConvertToVector128SByte(Vector512<ulong> value);

     // Missing types
+    public static Vector128<ushort> ConvertToVector128UInt16(Vector512<long>  value);
+    public static Vector128<ushort> ConvertToVector128UInt16(Vector512<ulong> value);

     // Missing types
+    public static Vector256<short> ConvertToVector256Int16(Vector512<uint> value);

     // Missing types
+    public static Vector256<int> ConvertToVector256Int32(Vector512<ulong> value);

     // Missing types
+    public static Vector256<ushort> ConvertToVector256UInt16(Vector512<int>  value);
+    public static Vector256<ushort> ConvertToVector256UInt16(Vector512<uint> value);

     // Missing types
+    public static Vector256<uint> ConvertToVector256UInt32(Vector512<ulong> value);

     // Incorrect name
-    public static Vector512<double> ConvertToVector256Double(Vector256<int> value);
+    public static Vector512<double> ConvertToVector512Double(Vector256<int> value);

     // Incorrect input types
-    public static Vector512<int> ConvertToVector512Int32(Vector512<short>  value);
-    public static Vector512<int> ConvertToVector512Int32(Vector512<sbyte>  value);
-    public static Vector512<int> ConvertToVector512Int32(Vector128<ushort> value);
+    public static Vector512<int> ConvertToVector512Int32(Vector256<short>  value);
+    public static Vector512<int> ConvertToVector512Int32(Vector128<sbyte>  value);
+    public static Vector512<int> ConvertToVector512Int32(Vector256<ushort> value);

     // Incorrect input types
-    public static Vector512<long> ConvertToVector512Int64(Vector512<short>  value);
-    public static Vector512<long> ConvertToVector512Int64(Vector512<int>    value);
-    public static Vector512<long> ConvertToVector512Int64(Vector512<sbyte>  value);
-    public static Vector512<long> ConvertToVector512Int64(Vector256<ushort> value);
+    public static Vector512<long> ConvertToVector512Int64(Vector128<short>  value);
+    public static Vector512<long> ConvertToVector512Int64(Vector256<int>    value);
+    public static Vector512<long> ConvertToVector512Int64(Vector128<sbyte>  value);
+    public static Vector512<long> ConvertToVector512Int64(Vector128<ushort> value);

     // Incorrect name
-    public static Vector512<float> ConvertToVector256Single(Vector512<int> value);
+    public static Vector512<float> ConvertToVector512Single(Vector512<int> value);

     // Missing types
+    public static Vector512<uint> ConvertToVector512UInt32(Vector128<byte>   value);
+    public static Vector512<uint> ConvertToVector512UInt32(Vector256<short>  value);
+    public static Vector512<uint> ConvertToVector512UInt32(Vector128<sbyte>  value);
+    public static Vector512<uint> ConvertToVector512UInt32(Vector256<ushort> value);

     // Missing types
+    public static Vector512<ulong> ConvertToVector512UInt64(Vector128<byte>   value);
+    public static Vector512<ulong> ConvertToVector512UInt64(Vector128<short>  value);
+    public static Vector512<ulong> ConvertToVector512UInt64(Vector256<int>    value);
+    public static Vector512<ulong> ConvertToVector512UInt64(Vector128<sbyte>  value);
+    public static Vector512<ulong> ConvertToVector512UInt64(Vector128<ushort> value);
+    public static Vector512<ulong> ConvertToVector512UInt64(Vector256<uint>   value);

     // Incorrect name due to differing semantics between 128 and 256/512
-    public static Vector512<double> MoveAndDuplicate(Vector512<double> value);
+    public static Vector512<double> DuplicateEvenIndexed(Vector512<double> value);

     // Missing Types
+    public static Vector128<byte>   ExtractVector128(Vector512<byte>   value, byte index);
+    public static Vector128<double> ExtractVector128(Vector512<double> value, byte index);
+    public static Vector128<short>  ExtractVector128(Vector512<short>  value, byte index);
+    public static Vector128<long>   ExtractVector128(Vector512<long>   value, byte index);
+    public static Vector128<sbyte>  ExtractVector128(Vector512<sbyte>  value, byte index);
+    public static Vector128<ushort> ExtractVector128(Vector512<ushort> value, byte index);
+    public static Vector128<uint>   ExtractVector128(Vector512<uint>   value, byte index);
+    public static Vector128<ulong>  ExtractVector128(Vector512<ulong>  value, byte index);

     // Missing Types
+    public static Vector256<byte>   ExtractVector256(Vector512<byte>   value, byte index);
+    public static Vector256<short>  ExtractVector256(Vector512<short>  value, byte index);
+    public static Vector256<int>    ExtractVector256(Vector512<int>    value, byte index);
+    public static Vector256<sbyte>  ExtractVector256(Vector512<sbyte>  value, byte index);
+    public static Vector256<float>  ExtractVector256(Vector512<float>  value, byte index);
+    public static Vector256<ushort> ExtractVector256(Vector512<ushort> value, byte index);
+    public static Vector256<uint>   ExtractVector256(Vector512<uint>   value, byte index);
+    public static Vector256<ulong>  ExtractVector256(Vector512<ulong>  value, byte index);

     // Missing APIs, different name from Fma
+    public static Vector512<double> FusedMultiplyAdd(Vector512<double> a, Vector512<double> b, Vector512<double> c);
+    public static Vector512<float>  FusedMultiplyAdd(Vector512<float>  a, Vector512<float>  b, Vector512<float>  c);

     // Missing APIs, different name from Fma
+    public static Vector512<double> FusedMultiplyAddNegated(Vector512<double> a, Vector512<double> b, Vector512<double> c);
+    public static Vector512<float>  FusedMultiplyAddNegated(Vector512<float>  a, Vector512<float>  b, Vector512<float>  c);

     // Missing APIs, different name from Fma
+    public static Vector512<double> FusedMultiplyAddSubtract(Vector512<double> a, Vector512<double> b, Vector512<double> c);
+    public static Vector512<float>  FusedMultiplyAddSubtract(Vector512<float>  a, Vector512<float>  b, Vector512<float>  c);

     // Missing APIs, different name from Fma
+    public static Vector512<double> FusedMultiplySubtract(Vector512<double> a, Vector512<double> b, Vector512<double> c);
+    public static Vector512<float>  FusedMultiplySubtract(Vector512<float>  a, Vector512<float>  b, Vector512<float>  c);

     // Missing APIs, different name from Fma
+    public static Vector512<double> FusedMultiplySubtractAdd(Vector512<double> a, Vector512<double> b, Vector512<double> c);
+    public static Vector512<float>  FusedMultiplySubtractAdd(Vector512<float>  a, Vector512<float>  b, Vector512<float>  c);

     // Missing APIs, different name from Fma
+    public static Vector512<double> FusedMultiplySubtractNegated(Vector512<double> a, Vector512<double> b, Vector512<double> c);
+    public static Vector512<float>  FusedMultiplySubtractNegated(Vector512<float>  a, Vector512<float>  b, Vector512<float>  c);

     // Different signature due to complications in handling two constants and unique among intrinsics
-    public static Vector512<double> GetMantissa(Vector512<double> value, byte interval, byte signControl);
-    public static Vector512<float>  GetMantissa(Vector512<float>  value, byte interval, byte signControl);
+    public static Vector512<double> GetMantissa(Vector512<double> value, byte control);
+    public static Vector512<float>  GetMantissa(Vector512<float>  value, byte control);

     // Missing APIs, scalars expose variants that take and do not take upper
+    public static Vector128<float>  GetMantissaScalar(Vector128<float>  value, byte control);
+    public static Vector128<double> GetMantissaScalar(Vector128<double> value, byte control);

-    public static Vector128<double> GetMantissaScalar(Vector128<double> upper, Vector128<double> value, byte interval, byte signControl);
-    public static Vector128<float>  GetMantissaScalar(Vector128<float>  upper, Vector128<float>  value, byte interval, byte signControl);
+    public static Vector128<double> GetMantissaScalar(Vector128<double> upper, Vector128<double> value, byte control);
+    public static Vector128<float>  GetMantissaScalar(Vector128<float>  upper, Vector128<float>  value, byte control);

     // Incorrect signature, to be reviewed later taking VectorMask<T>
-    public static Vector512<double> GatherVector512(double* baseAddress, Vector256<int>   index, byte scale);
-    public static Vector512<double> GatherVector512(double* baseAddress, Vector512<long>  index, byte scale);
-    public static Vector512<int>    GatherVector512(int*    baseAddress, Vector256<int>   index, byte scale);
-    public static Vector512<int>    GatherVector512(int*    baseAddress, Vector512<int>   index, byte scale);
-    public static Vector512<long>   GatherVector512(void*   baseAddress, Vector256<int>   index, byte scale);
-    public static Vector512<long>   GatherVector512(void*   baseAddress, Vector512<long>  index, byte scale);
-    public static Vector512<float>  GatherVector512(float*  baseAddress, Vector256<int>   index, byte scale);
-    public static Vector512<float>  GatherVector512(void*   baseAddress, Vector512<long>  index, byte scale);

     // Missing Types
+    public static Vector512<byte>   InsertVector128(Vector512<byte>   value, Vector128<byte>   data, byte index);
+    public static Vector512<double> InsertVector128(Vector512<double> value, Vector128<double> data, byte index);
+    public static Vector512<short>  InsertVector128(Vector512<short>  value, Vector128<short>  data, byte index);
+    public static Vector512<long>   InsertVector128(Vector512<long>   value, Vector128<long>   data, byte index);
+    public static Vector512<sbyte>  InsertVector128(Vector512<sbyte>  value, Vector128<sbyte>  data, byte index);
+    public static Vector512<ushort> InsertVector128(Vector512<ushort> value, Vector128<ushort> data, byte index);
+    public static Vector512<uint>   InsertVector128(Vector512<uint>   value, Vector128<uint>   data, byte index);
+    public static Vector512<ulong>  InsertVector128(Vector512<ulong>  value, Vector128<ulong>  data, byte index);

+    public static Vector512<byte>   InsertVector256(Vector512<byte>   value, Vector256<byte>   data, byte index);
+    public static Vector512<short>  InsertVector256(Vector512<short>  value, Vector256<short>  data, byte index);
+    public static Vector512<int>    InsertVector256(Vector512<int>    value, Vector256<int>    data, byte index);
+    public static Vector512<sbyte>  InsertVector256(Vector512<sbyte>  value, Vector256<sbyte>  data, byte index);
+    public static Vector512<float>  InsertVector256(Vector512<float>  value, Vector256<float>  data, byte index);
+    public static Vector512<ushort> InsertVector256(Vector512<ushort> value, Vector256<ushort> data, byte index);
+    public static Vector512<uint>   InsertVector256(Vector512<uint>   value, Vector256<uint>   data, byte index);
+    public static Vector512<ulong>  InsertVector256(Vector512<ulong>  value, Vector256<ulong>  data, byte index);

     // Returning wrong type
-    public static Vector512<int>   Multiply(Vector512<int>  left, Vector512<int>  right);
-    public static Vector512<uint>  Multiply(Vector512<uint> left, Vector512<uint> right);
+    public static Vector512<long>  Multiply(Vector512<int>  left, Vector512<int>  right);
+    public static Vector512<ulong> Multiply(Vector512<uint> left, Vector512<uint> right);

     // Missing unsigned type
+    public static Vector512<uint> MultiplyLow(Vector512<uint> left, Vector512<uint> right);

     // Missing types
+    public static Vector512<byte>   Or(Vector512<byte>   left, Vector512<byte>   right);
+    public static Vector512<short>  Or(Vector512<short>  left, Vector512<short>  right);
+    public static Vector512<sbyte>  Or(Vector512<sbyte>  left, Vector512<sbyte>  right);
+    public static Vector512<ushort> Or(Vector512<ushort> left, Vector512<ushort> right);
+    public static Vector512<uint>   Or(Vector512<uint>   left, Vector512<uint>   right);
+    public static Vector512<ulong>  Or(Vector512<ulong>  left, Vector512<ulong>  right);

     // Incorrect name due to differing semantics
-    public static Vector512<float> Permute    (Vector512<float> value, byte control);
+    public static Vector512<float> Permute4x32(Vector512<float> value, byte control);

     // Incorrect name due to differing semantics
-    public static Vector512<double> Permute    (Vector512<double> value, byte control);
+    public static Vector512<double> Permute4x64(Vector512<double> value, byte control);

     // Missing unsigned
+    public static Vector512<ulong> Permute4x64(Vector512<ulong> value, byte control);

     // Incorrect name due to differing semantics
-    public static Vector512<double> PermuteVar    (Vector512<double> value, Vector512<long> control);
-    public static Vector512<float>  PermuteVar    (Vector512<float>  value, Vector512<int>  control);
+    public static Vector512<double> PermuteVar2x64(Vector512<double> value, Vector512<long> control);
+    public static Vector512<float>  PermuteVar4x32(Vector512<float>  value, Vector512<int>  control);

     // Incorrect name, missing unsigned overload
-    public static Vector512<long>  PermuteVar4x64(Vector512<long>  value, Vector512<long>  control);
+    public static Vector512<long>  PermuteVar8x64(Vector512<long>  value, Vector512<long>  control);
+    public static Vector512<ulong> PermuteVar8x64(Vector512<ulong> value, Vector512<ulong> control);

     // Incorrect name, missing unsigned overload
-    public static Vector512<double> PermuteVar8x64(Vector512<double>   left,  Vector512<double> right,   Vector512<double> control);
-    public static Vector512<long>   PermuteVar8x64(Vector512<long>     left,  Vector512<long>   right,   Vector512<long>   control);
+    public static Vector512<double> PermuteVar8x64x2(Vector512<double> lower, Vector512<long>   indices, Vector512<double> upper);
+    public static Vector512<long>   PermuteVar8x64x2(Vector512<long>   lower, Vector512<long>   indices, Vector512<long>   upper);
+    public static Vector512<ulong>  PermuteVar8x64x2(Vector512<ulong>  lower, Vector512<ulong>  indices, Vector512<ulong>  upper);

     // Incorrect name, missing unsigned overload
-    public static Vector512<int>  PermuteVar8x32 (Vector512<int>  value, Vector512<int>  control);
+    public static Vector512<int>  PermuteVar16x32(Vector512<int>  left,  Vector512<int>  control);
+    public static Vector512<uint> PermuteVar16x32(Vector512<uint> left,  Vector512<uint> control);

     // Incorrect name, missing unsigned overload
-    public static Vector512<int>   PermuteVar16x32  (Vector512<int>   left,  Vector512<int>   right,   Vector512<int>   control);
-    public static Vector512<float> PermuteVar16x32  (Vector512<float> left,  Vector512<float> right,   Vector512<float> control);
+    public static Vector512<int>   PermuteVar16x32x2(Vector512<int>   lower, Vector512<int>   indices, Vector512<int>   upper);
+    public static Vector512<float> PermuteVar16x32x2(Vector512<float> lower, Vector512<int>   indices, Vector512<float> upper);
+    public static Vector512<uint>  PermuteVar16x32x2(Vector512<uint>  lower, Vector512<uint>  indices, Vector512<uint>  upper);

     // Missing scalar overloads that don't take upper
+    public static Vector128<double> Reciprocal14Scalar(Vector128<double> value);
+    public static Vector128<float>  Reciprocal14Scalar(Vector128<float>  value);

     // Missing scalar overloads that don't take upper
+    public static Vector128<double> ReciprocalSqrt14Scalar(Vector128<double> value);
+    public static Vector128<float>  ReciprocalSqrt14Scalar(Vector128<float>  value);

     // Missing unsigned types
+    public static Vector512<uint>  RotateLeft(Vector512<uint>  value, byte count);
+    public static Vector512<ulong> RotateLeft(Vector512<ulong> value, byte count);

     // Missing unsigned types
+    public static Vector512<uint>  RotateLeftVariable(Vector512<uint>  value, Vector512<uint>  count);
+    public static Vector512<ulong> RotateLeftVariable(Vector512<ulong> value, Vector512<ulong> count);

     // Missing unsigned types
+    public static Vector512<uint>  RotateRight(Vector512<uint>  value, byte count);
+    public static Vector512<ulong> RotateRight(Vector512<ulong> value, byte count);

     // Missing unsigned types
+    public static Vector512<uint>  RotateRightVariable(Vector512<uint>  value, Vector512<uint>  count);
+    public static Vector512<ulong> RotateRightVariable(Vector512<ulong> value, Vector512<ulong> count);

     // Missing scalar overloads that don't take upper
+    public static Vector128<double> RoundScaleScalar(Vector128<double> value, byte control);
+    public static Vector128<float>  RoundScaleScalar(Vector128<float>  value, byte control);

     // Missing scalar overloads that don't take upper
+    public static Vector128<double> ScaleScalar(Vector128<double> left, Vector128<double> right);
+    public static Vector128<float>  ScaleScalar(Vector128<float>  left, Vector128<float>  right);

     // Incorrect signature, to be reviewed later taking VectorMask<T>
-    public static void Scatter(double* baseAddress, Vector256<int> index,  byte scale, Vector512<double> value);
-    public static void Scatter(double* baseAddress, Vector512<long> index, byte scale, Vector512<double> value);
-    public static void Scatter(int*    baseAddress, Vector512<int> index,  byte scale, Vector512<int>    value);
-    public static void Scatter(int*    baseAddress, Vector512<long> index, byte scale, Vector256<int>    value);
-    public static void Scatter(long*   baseAddress, Vector256<int> index,  byte scale, Vector512<long>   value);
-    public static void Scatter(long*   baseAddress, Vector512<long> index, byte scale, Vector512<long>   value);
-    public static void Scatter(float*  baseAddress, Vector512<int> index,  byte scale, Vector512<float>  value);
-    public static void Scatter(float*  baseAddress, Vector512<long> index, byte scale, Vector256<float>  value);

     // Missing unsigned types
+    public static Vector512<uint>  ShiftLeftLogical(Vector512<uint>  value, byte             count);
+    public static Vector512<uint>  ShiftLeftLogical(Vector512<uint>  value, Vector128<uint>  count);
+    public static Vector512<ulong> ShiftLeftLogical(Vector512<ulong> value, byte             count);
+    public static Vector512<ulong> ShiftLeftLogical(Vector512<ulong> value, Vector128<ulong> count);

     // Missing unsigned types
+    public static Vector512<uint>  ShiftLeftLogicalVariable(Vector512<uint>  value, Vector512<uint>  count);
+    public static Vector512<ulong> ShiftLeftLogicalVariable(Vector512<ulong> value, Vector512<ulong> count);

     // Missing unsigned types
+    public static Vector512<uint>  ShiftRightLogical(Vector512<uint>  value, byte             count);
+    public static Vector512<uint>  ShiftRightLogical(Vector512<uint>  value, Vector128<uint>  count);
+    public static Vector512<ulong> ShiftRightLogical(Vector512<ulong> value, byte             count);
+    public static Vector512<ulong> ShiftRightLogical(Vector512<ulong> value, Vector128<ulong> count);

     // Missing unsigned types
+    public static Vector512<uint>  ShiftRightLogicalVariable(Vector512<uint>  value, Vector512<uint>  count);
+    public static Vector512<ulong> ShiftRightLogicalVariable(Vector512<ulong> value, Vector512<ulong> count);

     // Missing API
+    public static Vector512<uint> Shuffle(Vector512<uint> value, byte control);

     // Incorrect name due to differing semantics
-    public static Vector512<double> Shuffle     (Vector512<double> left, Vector512<double> right, byte control);
-    public static Vector512<int>    Shuffle     (Vector512<int>    left, Vector512<int>    right, byte control);
-    public static Vector512<long>   Shuffle     (Vector512<long>   left, Vector512<long>   right, byte control);
-    public static Vector512<float>  Shuffle     (Vector512<float>  left, Vector512<float>  right, byte control);
+    public static Vector512<double> Shuffle4x128(Vector512<double> left, Vector512<double> right, byte control);
+    public static Vector512<int>    Shuffle4x128(Vector512<int>    left, Vector512<int>    right, byte control);
+    public static Vector512<long>   Shuffle4x128(Vector512<long>   left, Vector512<long>   right, byte control);
+    public static Vector512<float>  Shuffle4x128(Vector512<float>  left, Vector512<float>  right, byte control);
+    public static Vector512<uint>   Shuffle4x128(Vector512<uint>   left, Vector512<uint>   right, byte control);
+    public static Vector512<ulong>  Shuffle4x128(Vector512<ulong>  left, Vector512<ulong>  right, byte control);

     // Missing unsigned types
+    public static Vector512<uint>  Subtract(Vector512<uint>  left, Vector512<uint>  right);
+    public static Vector512<ulong> Subtract(Vector512<ulong> left, Vector512<ulong> right);

     // Missing types
+    public static Vector512<byte>   TernaryLogic(Vector512<byte>   a, Vector512<byte>   b, Vector512<byte>   c, byte control);
+    public static Vector512<double> TernaryLogic(Vector512<double> a, Vector512<double> b, Vector512<double> c, byte control);
+    public static Vector512<short>  TernaryLogic(Vector512<short>  a, Vector512<short>  b, Vector512<short>  c, byte control);
+    public static Vector512<sbyte>  TernaryLogic(Vector512<sbyte>  a, Vector512<sbyte>  b, Vector512<sbyte>  c, byte control);
+    public static Vector512<float>  TernaryLogic(Vector512<float>  a, Vector512<float>  b, Vector512<float>  c, byte control);
+    public static Vector512<ushort> TernaryLogic(Vector512<ushort> a, Vector512<ushort> b, Vector512<ushort> c, byte control);
+    public static Vector512<uint>   TernaryLogic(Vector512<uint>   a, Vector512<uint>   b, Vector512<uint>   c, byte control);
+    public static Vector512<ulong>  TernaryLogic(Vector512<ulong>  a, Vector512<ulong>  b, Vector512<ulong>  c, byte control);

     // Missing unsigned types
+    public static Vector512<uint> UnpackHigh(Vector512<uint> left, Vector512<uint> right);
+    public static Vector512<ulong> UnpackHigh(Vector512<ulong> left, Vector512<ulong> right);

     // Missing unsigned types
+    public static Vector512<uint> UnpackLow(Vector512<uint> left, Vector512<uint> right);
+    public static Vector512<ulong> UnpackLow(Vector512<ulong> left, Vector512<ulong> right);

     // Missing types
+    public static Vector512<byte>   Xor(Vector512<byte>   left, Vector512<byte>   right);
+    public static Vector512<short>  Xor(Vector512<short>  left, Vector512<short>  right);
+    public static Vector512<sbyte>  Xor(Vector512<sbyte>  left, Vector512<sbyte>  right);
+    public static Vector512<ushort> Xor(Vector512<ushort> left, Vector512<ushort> right);
+    public static Vector512<uint>   Xor(Vector512<uint>   left, Vector512<uint>   right);
+    public static Vector512<ulong>  Xor(Vector512<ulong>  left, Vector512<ulong>  right);

     public new abstract partial class VL
     {
         // Missing APIs
+        public static Vector128<ulong> Abs(Vector128<long> value);
+        public static Vector256<ulong> Abs(Vector256<long> value);

         // Missing APIs
+        public static Vector128<int>   AlignRight32(Vector128<int>   left, Vector128<int>   right, byte mask);
+        public static Vector128<uint>  AlignRight32(Vector128<uint>  left, Vector128<uint>  right, byte mask);
+        public static Vector256<int>   AlignRight32(Vector256<int>   left, Vector256<int>   right, byte mask);
+        public static Vector256<uint>  AlignRight32(Vector256<uint>  left, Vector256<uint>  right, byte mask);

         // Missing APIs
+        public static Vector128<long>  AlignRight64(Vector128<long>  left, Vector128<long>  right, byte mask);
+        public static Vector128<ulong> AlignRight64(Vector128<ulong> left, Vector128<ulong> right, byte mask);
+        public static Vector256<long>  AlignRight64(Vector256<long>  left, Vector256<long>  right, byte mask);
+        public static Vector256<ulong> AlignRight64(Vector256<ulong> left, Vector256<ulong> right, byte mask);

         // Incorrect APIs, exposed in AVX2
-        public static Vector256<int> BroadcastToVector256(Vector128<int>     value);
-        public static Vector256<uint> BroadcastToVector256(Vector128<uint>   value);
-        public static Vector256<float> BroadcastToVector256(Vector128<float> value);

         // Missing API
+        public static Vector128<uint> ConvertToVector128UInt32(Vector128<double> value);

         // Missing APIs
+        public static Vector128<uint> ConvertToVector128UInt32WithTruncation(Vector128<double> value);
+        public static Vector128<uint> ConvertToVector128UInt32WithTruncation(Vector128<float>  value);
+        public static Vector128<uint> ConvertToVector128UInt32WithTruncation(Vector256<double> value);

         // Incorrect input type
-        public static Vector256<float> ConvertToVector256Single(Vector128<uint> value);
+        public static Vector256<float> ConvertToVector256Single(Vector256<uint> value);

         // Missing API
+        public static Vector256<uint> ConvertToVector256UInt32(Vector256<float> value);

         // Missing API
+        public static Vector256<uint> ConvertToVector256UInt32WithTruncation(Vector256<float> value);

         // Different signature due to complications in handling two constants and unique among intrinsics
-        public static Vector128<double> GetMantissa(Vector128<double> value, byte interval, byte signControl);
-        public static Vector128<float>  GetMantissa(Vector128<float>  value, byte interval, byte signControl);
+        public static Vector128<double> GetMantissa(Vector128<double> value, byte control);
+        public static Vector128<float>  GetMantissa(Vector128<float>  value, byte control);

         // Different signature due to complications in handling two constants and unique among intrinsics
-        public static Vector256<double> GetMantissa(Vector256<double> value, byte interval, byte signControl);
-        public static Vector256<float>  GetMantissa(Vector256<float>  value, byte interval, byte signControl);
+        public static Vector256<double> GetMantissa(Vector256<double> value, byte control);
+        public static Vector256<float>  GetMantissa(Vector256<float>  value, byte control);

         // Missing APIs
+        public static Vector128<long>  Max(Vector128<long>  left, Vector128<long>  right);
+        public static Vector128<ulong> Max(Vector128<ulong> left, Vector128<ulong> right);
+        public static Vector256<long>  Max(Vector256<long>  left, Vector256<long>  right);
+        public static Vector256<ulong> Max(Vector256<ulong> left, Vector256<ulong> right);

         // Missing APIs
+        public static Vector128<long>  Min(Vector128<long>  left, Vector128<long>  right);
+        public static Vector128<ulong> Min(Vector128<ulong> left, Vector128<ulong> right);
+        public static Vector256<long>  Min(Vector256<long>  left, Vector256<long>  right);
+        public static Vector256<ulong> Min(Vector256<ulong> left, Vector256<ulong> right);

         // Incorrect name due to differing semantics
-        public static Vector128<double> PermuteVar2x64  (Vector128<double> left,  Vector128<double> right,  Vector128<long>    control);
-        public static Vector128<long>   PermuteVar2x64  (Vector128<long>   left,  Vector128<long>   right,  Vector128<long>    control);
-        public static Vector128<ulong>  PermuteVar2x64  (Vector128<ulong>  left,  Vector128<ulong>  right,  Vector128<ulong>   control);
+        public static Vector128<double> PermuteVar2x64x2(Vector128<double> lower, Vector128<long>   indices, Vector128<double> upper);
+        public static Vector128<long>   PermuteVar2x64x2(Vector128<long>   lower, Vector128<long>   indices, Vector128<long>   upper);
+        public static Vector128<ulong>  PermuteVar2x64x2(Vector128<ulong>  lower, Vector128<ulong>  indices, Vector128<ulong>  upper);

         // Incorrect name due to differing semantics
-        public static Vector128<float> PermuteVar4x32  (Vector128<float> left,  Vector128<float> right,   Vector128<int>   control);
-        public static Vector128<int>   PermuteVar4x32  (Vector128<int>   left,  Vector128<int>   right,   Vector128<int>   control);
-        public static Vector128<uint>  PermuteVar4x32  (Vector128<uint>  left,  Vector128<uint>  right,   Vector128<uint>  control);
+        public static Vector128<int>   PermuteVar4x32x2(Vector128<int>   lower, Vector128<int>   indices, Vector128<int>   upper);
+        public static Vector128<float> PermuteVar4x32x2(Vector128<float> lower, Vector128<int>   indices, Vector128<float> upper);
+        public static Vector128<uint>  PermuteVar4x32x2(Vector128<uint>  lower, Vector128<uint>  indices, Vector128<uint>  upper);

         // Missing API
+        public static Vector256<double> PermuteVar4x64(Vector256<double> value, Vector256<long> control);

         // Incorrect name due to differing semantics
-        public static Vector256<double> PermuteVar4x64 (Vector256<double>  left,  Vector256<double> right,  Vector256<long>    control);
-        public static Vector256<long>   PermuteVar4x64 (Vector256<long>    left,  Vector256<long>   right,  Vector256<long>    control);
-        public static Vector256<ulong>  PermuteVar4x64 (Vector256<ulong>   left,  Vector256<ulong>  right,  Vector256<ulong>   control);
+        public static Vector256<double> PermuteVar4x64x2(Vector256<double> lower, Vector256<long>   indices, Vector256<double> upper);
+        public static Vector256<long>   PermuteVar4x64x2(Vector256<long>   lower, Vector256<long>   indices, Vector256<long>   upper);
+        public static Vector256<ulong>  PermuteVar4x64x2(Vector256<ulong>  lower, Vector256<ulong>  indices, Vector256<ulong>  upper);

         // Missing API
+        public static Vector256<float> PermuteVar8x32(Vector256<float> value, Vector256<int> control);

         // Incorrect name due to differing semantics
-        public static Vector256<float> PermuteVar8x32  (Vector256<float> left,  Vector256<float> right,   Vector256<int>   control);
-        public static Vector256<int>   PermuteVar8x32  (Vector256<int>   left,  Vector256<int>   right,   Vector256<int>   control);
-        public static Vector256<uint>  PermuteVar8x32  (Vector256<uint>  left,  Vector256<uint>  right,   Vector256<uint>  control);
+        public static Vector256<int>   PermuteVar8x32x2(Vector256<int>   lower, Vector256<int>   indices, Vector256<int>   upper);
+        public static Vector256<float> PermuteVar8x32x2(Vector256<float> lower, Vector256<int>   indices, Vector256<float> upper);
+        public static Vector256<uint>  PermuteVar8x32x2(Vector256<uint>  lower, Vector256<uint>  indices, Vector256<uint>  upper);

         // Missing API
+        public static Vector128<long> ShiftRightArithmetic(Vector128<long> value, byte count);
+        public static Vector256<long> ShiftRightArithmetic(Vector256<long> value, byte count);

         // Incorrect signature, to be reviewed later taking VectorMask<T>
-        public static void Scatter(double* baseAddress, Vector128<int>    index, byte scale, Vector256<double> value);
-        public static void Scatter(double* baseAddress, Vector128<int>    index, byte scale, Vector128<double> value);
-        public static void Scatter(double* baseAddress, Vector256<double> index, byte scale, Vector256<double> value);
-        public static void Scatter(double* baseAddress, Vector128<double> index, byte scale, Vector128<double> value);
-        public static void Scatter(int*    baseAddress, Vector256<int>    index, byte scale, Vector256<int>    value);
-        public static void Scatter(int*    baseAddress, Vector128<int>    index, byte scale, Vector128<int>    value);
-        public static void Scatter(int*    baseAddress, Vector256<long>   index, byte scale, Vector128<int>    value);
-        public static void Scatter(int*    baseAddress, Vector128<long>   index, byte scale, Vector128<int>    value);
-        public static void Scatter(long*   baseAddress, Vector128<int>    index, byte scale, Vector256<long>   value);
-        public static void Scatter(long*   baseAddress, Vector128<int>    index, byte scale, Vector128<long>   value);
-        public static void Scatter(long*   baseAddress, Vector256<long>   index, byte scale, Vector256<long>   value);
-        public static void Scatter(long*   baseAddress, Vector128<long>   index, byte scale, Vector128<long>   value);
-        public static void Scatter(float*  baseAddress, Vector256<float>  index, byte scale, Vector256<float>  value);
-        public static void Scatter(float*  baseAddress, Vector128<float>  index, byte scale, Vector128<float>  value);
-        public static void Scatter(float*  baseAddress, Vector256<long>   index, byte scale, Vector128<float>  value);
-        public static void Scatter(float*  baseAddress, Vector128<long>   index, byte scale, Vector128<float>  value);

         // Missing APIs and incorrect name due to differing semantics
-        public static Vector256<int>    Shuffle     (Vector256<int>    left, Vector256<int>    right, byte control);
+        public static Vector256<double> Shuffle2x128(Vector256<double> left, Vector256<double> right, byte control);
+        public static Vector256<int>    Shuffle2x128(Vector256<int>    left, Vector256<int>    right, byte control);
+        public static Vector256<long>   Shuffle2x128(Vector256<long>   left, Vector256<long>   right, byte control);
+        public static Vector256<float>  Shuffle2x128(Vector256<float>  left, Vector256<float>  right, byte control);
+        public static Vector256<uint>   Shuffle2x128(Vector256<uint>   left, Vector256<uint>   right, byte control);
+        public static Vector256<ulong>  Shuffle2x128(Vector256<ulong>  left, Vector256<ulong>  right, byte control);

         // Missing types
+        public static Vector128<byte>   TernaryLogic(Vector128<byte>   a, Vector128<byte>   b, Vector128<byte>   c, byte control);
+        public static Vector128<double> TernaryLogic(Vector128<double> a, Vector128<double> b, Vector128<double> c, byte control);
+        public static Vector128<short>  TernaryLogic(Vector128<short>  a, Vector128<short>  b, Vector128<short>  c, byte control);
+        public static Vector128<sbyte>  TernaryLogic(Vector128<sbyte>  a, Vector128<sbyte>  b, Vector128<sbyte>  c, byte control);
+        public static Vector128<float>  TernaryLogic(Vector128<float>  a, Vector128<float>  b, Vector128<float>  c, byte control);
+        public static Vector128<ushort> TernaryLogic(Vector128<ushort> a, Vector128<ushort> b, Vector128<ushort> c, byte control);
+        public static Vector128<uint>   TernaryLogic(Vector128<uint>   a, Vector128<uint>   b, Vector128<uint>   c, byte control);
+        public static Vector128<ulong>  TernaryLogic(Vector128<ulong>  a, Vector128<ulong>  b, Vector128<ulong>  c, byte control);

         // Missing types
+        public static Vector256<byte>   TernaryLogic(Vector256<byte>   a, Vector256<byte>   b, Vector256<byte>   c, byte control);
+        public static Vector256<double> TernaryLogic(Vector256<double> a, Vector256<double> b, Vector256<double> c, byte control);
+        public static Vector256<short>  TernaryLogic(Vector256<short>  a, Vector256<short>  b, Vector256<short>  c, byte control);
+        public static Vector256<sbyte>  TernaryLogic(Vector256<sbyte>  a, Vector256<sbyte>  b, Vector256<sbyte>  c, byte control);
+        public static Vector256<float>  TernaryLogic(Vector256<float>  a, Vector256<float>  b, Vector256<float>  c, byte control);
+        public static Vector256<ushort> TernaryLogic(Vector256<ushort> a, Vector256<ushort> b, Vector256<ushort> c, byte control);
+        public static Vector256<uint>   TernaryLogic(Vector256<uint>   a, Vector256<uint>   b, Vector256<uint>   c, byte control);
+        public static Vector256<ulong>  TernaryLogic(Vector256<ulong>  a, Vector256<ulong>  b, Vector256<ulong>  c, byte control);
     }
 }

 // Avx512Vbmi APIs were not covered
+public abstract partial class Avx512Vbmi : Avx512BW
+{
+    public static new bool IsSupported { get; }
+
+    // Vector512<byte> result = Vector512<byte>.Zero;
+    //
+    // for (int i = 0; i < Vector512<long>.Count; i++)
+    // {
+    //    for (int j = 0; j < 8; j++)
+    //    {
+    //        ulong tmpRes = 0;
+    //        ulong tmpVal = value.GetElement(i);
+    //        byte  tmpCtl = control.GetElement((i * 8) + j) & 0b0011_1111;
+    //
+    //        for (int k = 0; k < 8; k++)
+    //        {
+    //            ulong bit = (tmpVal >> ((tmpCtl + k) & 0b0011_1111)) & 1;
+    //            tmpRes |= (bit << k);
+    //        }
+    //
+    //        result = result.WithElement((i * 8) + j, (byte)tmpRes);
+    //    }
+    // }
+    //
+    // return result;
+    public static Vector512<byte>  MultiShift(Vector512<byte>  control, Vector512<ulong> value);
+    public static Vector512<sbyte> MultiShift(Vector512<sbyte> control, Vector512<long>  value);
+
+    public static Vector512<byte>  PermuteVar64x8  (Vector512<byte>  left,  Vector512<byte>  control);
+    public static Vector512<sbyte> PermuteVar64x8  (Vector512<sbyte> left,  Vector512<sbyte> control);
+
+    public static Vector512<byte>  PermuteVar64x8x2(Vector512<byte>  lower, Vector512<byte>  indices, Vector512<byte>  upper);
+    public static Vector512<sbyte> PermuteVar64x8x2(Vector512<sbyte> lower, Vector512<sbyte> indices, Vector512<sbyte> upper);
+
+    public new abstract partial class VL : Avx512BW.VL
+    {
+        public static new bool IsSupported { get; }
+
+        public static Vector128<byte>  MultiShift(Vector128<byte>  control, Vector128<ulong> value);
+        public static Vector128<sbyte> MultiShift(Vector128<sbyte> control, Vector128<long>  value);
+        public static Vector256<byte>  MultiShift(Vector256<byte>  control, Vector256<ulong> value);
+        public static Vector256<sbyte> MultiShift(Vector256<sbyte> control, Vector256<long>  value);
+
+        public static Vector128<byte>  PermuteVar16x8(Vector128<byte>  left, Vector128<byte>  control);
+        public static Vector128<sbyte> PermuteVar16x8(Vector128<sbyte> left, Vector128<sbyte> control);
+
+        public static Vector128<byte>  PermuteVar16x8x2(Vector128<byte>  lower, Vector128<byte>  indices, Vector128<byte>  upper);
+        public static Vector128<sbyte> PermuteVar16x8x2(Vector128<sbyte> lower, Vector128<sbyte> indices, Vector128<sbyte> upper);
+
+        public static Vector256<byte>  PermuteVar32x8(Vector256<byte>  left, Vector256<byte>  control);
+        public static Vector256<sbyte> PermuteVar32x8(Vector256<sbyte> left, Vector256<sbyte> control);
+
+        public static Vector256<byte>  PermuteVar32x8x2(Vector256<byte>  lower, Vector256<byte>  indices, Vector256<byte>  upper);
+        public static Vector256<sbyte> PermuteVar32x8x2(Vector256<sbyte> lower, Vector256<sbyte> indices, Vector256<sbyte> upper);
+    }
+    public new abstract partial class X64 : Avx512BW.X64
+    {
+        public static new bool IsSupported { get; }
+    }
+}

Metadata

Metadata

Assignees

No one assigned

    Labels

    api-approved: API was approved in API review, it can be implemented
    area-System.Runtime.Intrinsics
    avx512: Related to the AVX-512 architecture
    in-pr: There is an active PR which will close this issue when it is merged

    Type

    No type

    Projects

    No projects

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions