SixLabors · brianpopow · Nov 2, 2021 · Nov 1, 2021 · Nov 1, 2021 · Nov 2, 2021
diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
@@ -27,6 +27,10 @@ internal static unsafe class LosslessUtils
 
         private const double Log2Reciprocal = 1.44269504088896338700465094007086;
 
+#if SUPPORTS_RUNTIME_INTRINSICS
+        private static readonly Vector128<byte> Zero = Vector128<byte>.Zero; // all-zero vector; Select interleaves it with byte data to widen bytes to 16-bit lanes
+#endif
+
         /// <summary>
         /// Returns the exact index where array1 and array2 are different. For an index
         /// inferior or equal to bestLenMatch, the return value just has to be strictly
@@ -551,6 +555,7 @@ public static void PredictorInverseTransform(
                     int mask = tileWidth - 1;
                     int tilesPerRow = SubSampleSize(width, transform.Bits);
                     int predictorModeIdxBase = (y >> transform.Bits) * tilesPerRow;
+                    Span<short> scratch = stackalloc short[8];
                     while (y < yEnd)
                     {
                         int predictorModeIdx = predictorModeIdxBase;
@@ -608,7 +613,7 @@ public static void PredictorInverseTransform(
                                     PredictorAdd10(input + x, output + x - width, xEnd - x, output + x);
                                     break;
                                 case 11:
-                                    PredictorAdd11(input + x, output + x - width, xEnd - x, output + x);
+                                    PredictorAdd11(input + x, output + x - width, xEnd - x, output + x, scratch);
                                     break;
                                 case 12:
                                     PredictorAdd12(input + x, output + x - width, xEnd - x, output + x);
@@ -974,11 +979,11 @@ private static void PredictorAdd10(uint* input, uint* upper, int numberOfPixels,
         }
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        private static void PredictorAdd11(uint* input, uint* upper, int numberOfPixels, uint* output)
+        private static void PredictorAdd11(uint* input, uint* upper, int numberOfPixels, uint* output, Span<short> scratch) // scratch: caller-owned temporary storage, forwarded unchanged to Predictor11
         {
             for (int x = 0; x < numberOfPixels; x++)
             {
-                uint pred = Predictor11(output[x - 1], upper + x);
+                uint pred = Predictor11(output[x - 1], upper + x, scratch); // left neighbour is read back from output; the same scratch span is reused for every pixel
                 output[x] = AddPixels(input[x], pred);
             }
         }
@@ -1031,7 +1036,7 @@ private static void PredictorAdd13(uint* input, uint* upper, int numberOfPixels,
         public static uint Predictor10(uint left, uint* top) => Average4(left, top[-1], top[0], top[1]);
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static uint Predictor11(uint left, uint* top) => Select(top[0], left, top[-1]);
+        public static uint Predictor11(uint left, uint* top, Span<short> scratch) => Select(top[0], left, top[-1], scratch); // scratch is only written by Select's SSE2 path
 
         [MethodImpl(InliningOptions.ShortMethod)]
         public static uint Predictor12(uint left, uint* top) => ClampedAddSubtractFull(left, top[0], top[-1]);
@@ -1148,11 +1153,11 @@ public static void PredictorSub10(uint* input, uint* upper, int numPixels, uint*
         }
 
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static void PredictorSub11(uint* input, uint* upper, int numPixels, uint* output)
+        public static void PredictorSub11(uint* input, uint* upper, int numPixels, uint* output, Span<short> scratch) // scratch: caller-owned temporary storage, forwarded unchanged to Predictor11
         {
             for (int x = 0; x < numPixels; x++)
             {
-                uint pred = Predictor11(input[x - 1], upper + x);
+                uint pred = Predictor11(input[x - 1], upper + x, scratch); // left neighbour comes from input here, whereas PredictorAdd11 reads it from output
                 output[x] = SubPixels(input[x], pred);
             }
         }
@@ -1240,14 +1245,43 @@ private static uint ClampedAddSubtractHalf(uint c0, uint c1, uint c2)
         private static Vector128<int> MkCst16(int hi, int lo) => Vector128.Create((hi << 16) | (lo & 0xffff));
 #endif
 
-        private static uint Select(uint a, uint b, uint c)
+        private static uint Select(uint a, uint b, uint c, Span<short> scratch) // scratch: caller-owned temporary storage of at least 8 shorts; only the SSE2 path writes to it
         {
-            int paMinusPb =
-                Sub3((int)(a >> 24), (int)(b >> 24), (int)(c >> 24)) +
-                Sub3((int)((a >> 16) & 0xff), (int)((b >> 16) & 0xff), (int)((c >> 16) & 0xff)) +
-                Sub3((int)((a >> 8) & 0xff), (int)((b >> 8) & 0xff), (int)((c >> 8) & 0xff)) +
-                Sub3((int)(a & 0xff), (int)(b & 0xff), (int)(c & 0xff));
-            return paMinusPb <= 0 ? a : b;
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Sse2.IsSupported)
+            {
+                System.Diagnostics.Debug.Assert(scratch.Length >= 8, "scratch must hold at least 8 shorts (one 128-bit vector).");
+                fixed (short* p = scratch)
+                {
+                    Vector128<byte> a0 = Sse2.ConvertScalarToVector128UInt32(a).AsByte(); // pixel occupies the lowest 32 bits, remaining lanes are zero
+                    Vector128<byte> b0 = Sse2.ConvertScalarToVector128UInt32(b).AsByte();
+                    Vector128<byte> c0 = Sse2.ConvertScalarToVector128UInt32(c).AsByte();
+                    Vector128<byte> ac0 = Sse2.SubtractSaturate(a0, c0);
+                    Vector128<byte> ca0 = Sse2.SubtractSaturate(c0, a0);
+                    Vector128<byte> bc0 = Sse2.SubtractSaturate(b0, c0);
+                    Vector128<byte> cb0 = Sse2.SubtractSaturate(c0, b0);
+                    Vector128<byte> ac = Sse2.Or(ac0, ca0); // at least one of the two saturated differences is zero, so OR yields the absolute difference per byte
+                    Vector128<byte> bc = Sse2.Or(bc0, cb0);
+                    Vector128<byte> pa = Sse2.UnpackLow(ac, Zero); // |a - c|, widened to 16-bit lanes
+                    Vector128<byte> pb = Sse2.UnpackLow(bc, Zero); // |b - c|, widened to 16-bit lanes
+                    Vector128<ushort> diff = Sse2.Subtract(pb.AsUInt16(), pa.AsUInt16()); // per channel: |b - c| - |a - c|
+                    Sse2.Store((ushort*)p, diff); // writes all 8 lanes (16 bytes) into scratch
+                }
+
+                int paMinusPb = scratch[0] + scratch[1] + scratch[2] + scratch[3]; // only the low four lanes carry the pixel's channels; read back as signed shorts
+
+                return (paMinusPb <= 0) ? a : b;
+            }
+            else
+#endif
+            {
+                int paMinusPb =
+                    Sub3((int)(a >> 24), (int)(b >> 24), (int)(c >> 24)) +
+                    Sub3((int)((a >> 16) & 0xff), (int)((b >> 16) & 0xff), (int)((c >> 16) & 0xff)) +
+                    Sub3((int)((a >> 8) & 0xff), (int)((b >> 8) & 0xff), (int)((c >> 8) & 0xff)) +
+                    Sub3((int)(a & 0xff), (int)(b & 0xff), (int)(c & 0xff));
+                return paMinusPb <= 0 ? a : b;
+            }
         }
 
         [MethodImpl(InliningOptions.ShortMethod)]

diff --git a/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs b/src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs
@@ -50,6 +50,7 @@ public static void ResidualImage(
             int tilesPerRow = LosslessUtils.SubSampleSize(width, bits);
             int tilesPerCol = LosslessUtils.SubSampleSize(height, bits);
             int maxQuantization = 1 << LosslessUtils.NearLosslessBits(nearLosslessQuality);
+            Span<short> scratch = stackalloc short[8];
 
             // TODO: Can we optimize this?
             int[][] histo = new int[4][];
@@ -84,7 +85,8 @@ public static void ResidualImage(
                             transparentColorMode,
                             usedSubtractGreen,
                             nearLossless,
-                            image);
+                            image,
+                            scratch);
 
                         image[(tileY * tilesPerRow) + tileX] = (uint)(WebpConstants.ArgbBlack | (pred << 8));
                     }
@@ -192,7 +194,8 @@ private static int GetBestPredictorForTile(
             WebpTransparentColorMode transparentColorMode,
             bool usedSubtractGreen,
             bool nearLossless,
-            Span<uint> modes)
+            Span<uint> modes,
+            Span<short> scratch)
         {
             const int numPredModes = 14;
             int startX = tileX << bits;
@@ -272,7 +275,7 @@ private static int GetBestPredictorForTile(
                         }
                     }
 
-                    GetResidual(width, height, upperRow, currentRow, maxDiffs, mode, startX, startX + maxX, y, maxQuantization, transparentColorMode, usedSubtractGreen, nearLossless, residuals);
+                    GetResidual(width, height, upperRow, currentRow, maxDiffs, mode, startX, startX + maxX, y, maxQuantization, transparentColorMode, usedSubtractGreen, nearLossless, residuals, scratch);
                     for (int relativeX = 0; relativeX < maxX; ++relativeX)
                     {
                         UpdateHisto(histoArgb, residuals[relativeX]);
@@ -333,11 +336,12 @@ private static void GetResidual(
             WebpTransparentColorMode transparentColorMode,
             bool usedSubtractGreen,
             bool nearLossless,
-            Span<uint> output)
+            Span<uint> output,
+            Span<short> scratch)
         {
             if (transparentColorMode == WebpTransparentColorMode.Preserve)
             {
-                PredictBatch(mode, xStart, y, xEnd - xStart, currentRowSpan, upperRowSpan, output);
+                PredictBatch(mode, xStart, y, xEnd - xStart, currentRowSpan, upperRowSpan, output, scratch);
             }
             else
             {
@@ -395,7 +399,7 @@ private static void GetResidual(
                                     predict = LosslessUtils.Predictor10(currentRow[x - 1], upperRow + x);
                                     break;
                                 case 11:
-                                    predict = LosslessUtils.Predictor11(currentRow[x - 1], upperRow + x);
+                                    predict = LosslessUtils.Predictor11(currentRow[x - 1], upperRow + x, scratch);
                                     break;
                                 case 12:
                                     predict = LosslessUtils.Predictor12(currentRow[x - 1], upperRow + x);
@@ -583,6 +587,7 @@ private static void CopyImageWithPrediction(
             Span<byte> currentMaxDiffs = MemoryMarshal.Cast<uint, byte>(currentRow.Slice(width + 1));
 
             Span<byte> lowerMaxDiffs = currentMaxDiffs.Slice(width);
+            Span<short> scratch = stackalloc short[8];
             for (int y = 0; y < height; y++)
             {
                 Span<uint> tmp32 = upperRow;
@@ -593,7 +598,7 @@ private static void CopyImageWithPrediction(
 
                 if (lowEffort)
                 {
-                    PredictBatch(PredLowEffort, 0, y, width, currentRow, upperRow, argb.Slice(y * width));
+                    PredictBatch(PredLowEffort, 0, y, width, currentRow, upperRow, argb.Slice(y * width), scratch);
                 }
                 else
                 {
@@ -634,7 +639,8 @@ private static void CopyImageWithPrediction(
                             transparentColorMode,
                             usedSubtractGreen,
                             nearLossless,
-                            argb.Slice((y * width) + x));
+                            argb.Slice((y * width) + x),
+                            scratch);
 
                         x = xEnd;
                     }
@@ -649,7 +655,8 @@ private static void PredictBatch(
             int numPixels,
             Span<uint> currentSpan,
             Span<uint> upperSpan,
-            Span<uint> outputSpan)
+            Span<uint> outputSpan,
+            Span<short> scratch)
         {
 #pragma warning disable SA1503 // Braces should not be omitted
             fixed (uint* current = currentSpan)
@@ -718,7 +725,7 @@ private static void PredictBatch(
                             LosslessUtils.PredictorSub10(current + xStart, upper + xStart, numPixels, output);
                             break;
                         case 11:
-                            LosslessUtils.PredictorSub11(current + xStart, upper + xStart, numPixels, output);
+                            LosslessUtils.PredictorSub11(current + xStart, upper + xStart, numPixels, output, scratch);
                             break;
                         case 12:
                             LosslessUtils.PredictorSub12(current + xStart, upper + xStart, numPixels, output);