Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 47 additions & 13 deletions src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ internal static unsafe class LosslessUtils

private const double Log2Reciprocal = 1.44269504088896338700465094007086;

#if SUPPORTS_RUNTIME_INTRINSICS
private static readonly Vector128<byte> Zero = Vector128.Create(0).AsByte();
#endif

/// <summary>
/// Returns the exact index where array1 and array2 are different. For an index
/// inferior or equal to bestLenMatch, the return value just has to be strictly
Expand Down Expand Up @@ -551,6 +555,7 @@ public static void PredictorInverseTransform(
int mask = tileWidth - 1;
int tilesPerRow = SubSampleSize(width, transform.Bits);
int predictorModeIdxBase = (y >> transform.Bits) * tilesPerRow;
Span<short> scratch = stackalloc short[8];
while (y < yEnd)
{
int predictorModeIdx = predictorModeIdxBase;
Expand Down Expand Up @@ -608,7 +613,7 @@ public static void PredictorInverseTransform(
PredictorAdd10(input + x, output + x - width, xEnd - x, output + x);
break;
case 11:
PredictorAdd11(input + x, output + x - width, xEnd - x, output + x);
PredictorAdd11(input + x, output + x - width, xEnd - x, output + x, scratch);
break;
case 12:
PredictorAdd12(input + x, output + x - width, xEnd - x, output + x);
Expand Down Expand Up @@ -974,11 +979,11 @@ private static void PredictorAdd10(uint* input, uint* upper, int numberOfPixels,
}

/// <summary>
/// Applies predictor mode 11 to a run of pixels: for every index, a prediction is computed with
/// <c>Predictor11</c> and combined with the corresponding input value via <c>AddPixels</c>.
/// </summary>
/// <param name="input">Pointer to the values that are added to the prediction.</param>
/// <param name="upper">Pointer into the row above; <c>upper + x</c> is handed to <c>Predictor11</c>.</param>
/// <param name="numberOfPixels">Number of pixels to process.</param>
/// <param name="output">Pointer to the destination; <c>output[x - 1]</c> is also read as the left neighbor.</param>
/// <param name="scratch">Scratch buffer that is forwarded unchanged to <c>Predictor11</c>.</param>
// NOTE(review): this is a diff view; the signature/call without `scratch` appears to be the removed line
// and the one with `scratch` the added line - confirm against the merged file.
[MethodImpl(InliningOptions.ShortMethod)]
private static void PredictorAdd11(uint* input, uint* upper, int numberOfPixels, uint* output)
private static void PredictorAdd11(uint* input, uint* upper, int numberOfPixels, uint* output, Span<short> scratch)
{
for (int x = 0; x < numberOfPixels; x++)
{
// The left neighbor is the previously written output pixel, so for x == 0 this reads the
// element located just before `output`.
uint pred = Predictor11(output[x - 1], upper + x);
uint pred = Predictor11(output[x - 1], upper + x, scratch);
output[x] = AddPixels(input[x], pred);
}
}
Expand Down Expand Up @@ -1031,7 +1036,7 @@ private static void PredictorAdd13(uint* input, uint* upper, int numberOfPixels,
public static uint Predictor10(uint left, uint* top) => Average4(left, top[-1], top[0], top[1]);

/// <summary>
/// Predictor mode 11: returns the result of <c>Select</c> for the pixel above (<c>top[0]</c>),
/// the left pixel and the pixel at <c>top[-1]</c>.
/// </summary>
/// <param name="left">The left neighbor pixel.</param>
/// <param name="top">Pointer into the row above; both <c>top[0]</c> and <c>top[-1]</c> are read.</param>
/// <param name="scratch">Scratch buffer that is forwarded unchanged to <c>Select</c>.</param>
/// <returns>The value returned by <c>Select</c>.</returns>
// NOTE(review): this is a diff view; the overload without `scratch` appears to be the removed line
// and the one with `scratch` the added line - confirm against the merged file.
[MethodImpl(InliningOptions.ShortMethod)]
public static uint Predictor11(uint left, uint* top) => Select(top[0], left, top[-1]);
public static uint Predictor11(uint left, uint* top, Span<short> scratch) => Select(top[0], left, top[-1], scratch);

[MethodImpl(InliningOptions.ShortMethod)]
public static uint Predictor12(uint left, uint* top) => ClampedAddSubtractFull(left, top[0], top[-1]);
Expand Down Expand Up @@ -1148,11 +1153,11 @@ public static void PredictorSub10(uint* input, uint* upper, int numPixels, uint*
}

/// <summary>
/// Computes predictor mode 11 differences for a run of pixels: for every index, a prediction is
/// computed with <c>Predictor11</c> and combined with the input pixel via <c>SubPixels</c>.
/// </summary>
/// <param name="input">Pointer to the source pixels; <c>input[x - 1]</c> is also read as the left neighbor.</param>
/// <param name="upper">Pointer into the row above; <c>upper + x</c> is handed to <c>Predictor11</c>.</param>
/// <param name="numPixels">Number of pixels to process.</param>
/// <param name="output">Pointer to the destination that receives <c>SubPixels(input[x], pred)</c>.</param>
/// <param name="scratch">Scratch buffer that is forwarded unchanged to <c>Predictor11</c>.</param>
// NOTE(review): this is a diff view; the signature/call without `scratch` appears to be the removed line
// and the one with `scratch` the added line - confirm against the merged file.
[MethodImpl(InliningOptions.ShortMethod)]
public static void PredictorSub11(uint* input, uint* upper, int numPixels, uint* output)
public static void PredictorSub11(uint* input, uint* upper, int numPixels, uint* output, Span<short> scratch)
{
for (int x = 0; x < numPixels; x++)
{
// Unlike the additive variant, the left neighbor comes from the input row, so for x == 0
// this reads the element located just before `input`.
uint pred = Predictor11(input[x - 1], upper + x);
uint pred = Predictor11(input[x - 1], upper + x, scratch);
output[x] = SubPixels(input[x], pred);
}
}
Expand Down Expand Up @@ -1240,14 +1245,43 @@ private static uint ClampedAddSubtractHalf(uint c0, uint c1, uint c2)
private static Vector128<int> MkCst16(int hi, int lo) => Vector128.Create((hi << 16) | (lo & 0xffff));
#endif

private static uint Select(uint a, uint b, uint c)
private static uint Select(uint a, uint b, uint c, Span<short> scratch)
{
int paMinusPb =
Sub3((int)(a >> 24), (int)(b >> 24), (int)(c >> 24)) +
Sub3((int)((a >> 16) & 0xff), (int)((b >> 16) & 0xff), (int)((c >> 16) & 0xff)) +
Sub3((int)((a >> 8) & 0xff), (int)((b >> 8) & 0xff), (int)((c >> 8) & 0xff)) +
Sub3((int)(a & 0xff), (int)(b & 0xff), (int)(c & 0xff));
return paMinusPb <= 0 ? a : b;
#if SUPPORTS_RUNTIME_INTRINSICS
if (Sse2.IsSupported)
{
Span<short> output = scratch;
fixed (short* p = output)
{
Vector128<byte> a0 = Sse2.ConvertScalarToVector128UInt32(a).AsByte();
Vector128<byte> b0 = Sse2.ConvertScalarToVector128UInt32(b).AsByte();
Vector128<byte> c0 = Sse2.ConvertScalarToVector128UInt32(c).AsByte();
Vector128<byte> ac0 = Sse2.SubtractSaturate(a0, c0);
Vector128<byte> ca0 = Sse2.SubtractSaturate(c0, a0);
Vector128<byte> bc0 = Sse2.SubtractSaturate(b0, c0);
Vector128<byte> cb0 = Sse2.SubtractSaturate(c0, b0);
Vector128<byte> ac = Sse2.Or(ac0, ca0);
Vector128<byte> bc = Sse2.Or(bc0, cb0);
Vector128<byte> pa = Sse2.UnpackLow(ac, Zero); // |a - c|
Copy link
Contributor

@br3aker br3aker Nov 1, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On pre-.NET 6 runtimes, vector creation compiles to a really bad sequence of scalar sets, whereas an explicit Vector128<T>.Zero compiles to a single xor instruction, which would also be a little clearer imo. Plus, it might be a tiny bit faster than a static variable read.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks @br3aker

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FYI: using `default` for the vector generates the same code as Vector128<T>.Zero

(I prefer Vector128.Zero).

Vector128<byte> pb = Sse2.UnpackLow(bc, Zero); // |b - c|
Vector128<ushort> diff = Sse2.Subtract(pb.AsUInt16(), pa.AsUInt16());
Sse2.Store((ushort*)p, diff);
}

int paMinusPb = output[0] + output[1] + output[2] + output[3];
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can put this into the fixed block and access it via the pointer to avoid bounds checks*.
If output were too small, then there would be a bug somewhere else 😉 (fortunately there's none).

* or reverse the order to read output[3] first, then [2], ...; then there's only one bounds check.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ahhh, too late....

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

or reverse the order to read output[3] first, then [2], ...; then there's only one bounds check.

Ah yeah, I always forget about that trick, thx. Will do in a follow-up PR.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll put it (incl. the return) within the fixed block here.


return (paMinusPb <= 0) ? a : b;
}
else
#endif
{
int paMinusPb =
Sub3((int)(a >> 24), (int)(b >> 24), (int)(c >> 24)) +
Sub3((int)((a >> 16) & 0xff), (int)((b >> 16) & 0xff), (int)((c >> 16) & 0xff)) +
Sub3((int)((a >> 8) & 0xff), (int)((b >> 8) & 0xff), (int)((c >> 8) & 0xff)) +
Sub3((int)(a & 0xff), (int)(b & 0xff), (int)(c & 0xff));
return paMinusPb <= 0 ? a : b;
}
}

[MethodImpl(InliningOptions.ShortMethod)]
Expand Down
27 changes: 17 additions & 10 deletions src/ImageSharp/Formats/Webp/Lossless/PredictorEncoder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ public static void ResidualImage(
int tilesPerRow = LosslessUtils.SubSampleSize(width, bits);
int tilesPerCol = LosslessUtils.SubSampleSize(height, bits);
int maxQuantization = 1 << LosslessUtils.NearLosslessBits(nearLosslessQuality);
Span<short> scratch = stackalloc short[8];

// TODO: Can we optimize this?
int[][] histo = new int[4][];
Expand Down Expand Up @@ -84,7 +85,8 @@ public static void ResidualImage(
transparentColorMode,
usedSubtractGreen,
nearLossless,
image);
image,
scratch);

image[(tileY * tilesPerRow) + tileX] = (uint)(WebpConstants.ArgbBlack | (pred << 8));
}
Expand Down Expand Up @@ -192,7 +194,8 @@ private static int GetBestPredictorForTile(
WebpTransparentColorMode transparentColorMode,
bool usedSubtractGreen,
bool nearLossless,
Span<uint> modes)
Span<uint> modes,
Span<short> scratch)
{
const int numPredModes = 14;
int startX = tileX << bits;
Expand Down Expand Up @@ -272,7 +275,7 @@ private static int GetBestPredictorForTile(
}
}

GetResidual(width, height, upperRow, currentRow, maxDiffs, mode, startX, startX + maxX, y, maxQuantization, transparentColorMode, usedSubtractGreen, nearLossless, residuals);
GetResidual(width, height, upperRow, currentRow, maxDiffs, mode, startX, startX + maxX, y, maxQuantization, transparentColorMode, usedSubtractGreen, nearLossless, residuals, scratch);
for (int relativeX = 0; relativeX < maxX; ++relativeX)
{
UpdateHisto(histoArgb, residuals[relativeX]);
Expand Down Expand Up @@ -333,11 +336,12 @@ private static void GetResidual(
WebpTransparentColorMode transparentColorMode,
bool usedSubtractGreen,
bool nearLossless,
Span<uint> output)
Span<uint> output,
Span<short> scratch)
{
if (transparentColorMode == WebpTransparentColorMode.Preserve)
{
PredictBatch(mode, xStart, y, xEnd - xStart, currentRowSpan, upperRowSpan, output);
PredictBatch(mode, xStart, y, xEnd - xStart, currentRowSpan, upperRowSpan, output, scratch);
}
else
{
Expand Down Expand Up @@ -395,7 +399,7 @@ private static void GetResidual(
predict = LosslessUtils.Predictor10(currentRow[x - 1], upperRow + x);
break;
case 11:
predict = LosslessUtils.Predictor11(currentRow[x - 1], upperRow + x);
predict = LosslessUtils.Predictor11(currentRow[x - 1], upperRow + x, scratch);
break;
case 12:
predict = LosslessUtils.Predictor12(currentRow[x - 1], upperRow + x);
Expand Down Expand Up @@ -583,6 +587,7 @@ private static void CopyImageWithPrediction(
Span<byte> currentMaxDiffs = MemoryMarshal.Cast<uint, byte>(currentRow.Slice(width + 1));

Span<byte> lowerMaxDiffs = currentMaxDiffs.Slice(width);
Span<short> scratch = stackalloc short[8];
for (int y = 0; y < height; y++)
{
Span<uint> tmp32 = upperRow;
Expand All @@ -593,7 +598,7 @@ private static void CopyImageWithPrediction(

if (lowEffort)
{
PredictBatch(PredLowEffort, 0, y, width, currentRow, upperRow, argb.Slice(y * width));
PredictBatch(PredLowEffort, 0, y, width, currentRow, upperRow, argb.Slice(y * width), scratch);
}
else
{
Expand Down Expand Up @@ -634,7 +639,8 @@ private static void CopyImageWithPrediction(
transparentColorMode,
usedSubtractGreen,
nearLossless,
argb.Slice((y * width) + x));
argb.Slice((y * width) + x),
scratch);

x = xEnd;
}
Expand All @@ -649,7 +655,8 @@ private static void PredictBatch(
int numPixels,
Span<uint> currentSpan,
Span<uint> upperSpan,
Span<uint> outputSpan)
Span<uint> outputSpan,
Span<short> scratch)
{
#pragma warning disable SA1503 // Braces should not be omitted
fixed (uint* current = currentSpan)
Expand Down Expand Up @@ -718,7 +725,7 @@ private static void PredictBatch(
LosslessUtils.PredictorSub10(current + xStart, upper + xStart, numPixels, output);
break;
case 11:
LosslessUtils.PredictorSub11(current + xStart, upper + xStart, numPixels, output);
LosslessUtils.PredictorSub11(current + xStart, upper + xStart, numPixels, output, scratch);
break;
case 12:
LosslessUtils.PredictorSub12(current + xStart, upper + xStart, numPixels, output);
Expand Down