Skip to content
Merged
184 changes: 152 additions & 32 deletions src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,37 @@

using System;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
#if SUPPORTS_RUNTIME_INTRINSICS
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
#endif

namespace SixLabors.ImageSharp.Formats.Webp.Lossy
{
/// <summary>
/// Quantization methods.
/// </summary>
internal static class QuantEnc
internal static unsafe class QuantEnc
{
private static readonly byte[] Zigzag = { 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 };

private static readonly ushort[] WeightY = { 38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2 };

private const int MaxLevel = 2047;

#if SUPPORTS_RUNTIME_INTRINSICS
private static readonly Vector128<short> MaxCoeff2047 = Vector128.Create((short)MaxLevel);

private static readonly Vector128<byte> CstLo = Vector128.Create(0, 1, 2, 3, 8, 9, 254, 255, 10, 11, 4, 5, 6, 7, 12, 13);

private static readonly Vector128<byte> Cst7 = Vector128.Create(254, 255, 254, 255, 254, 255, 254, 255, 14, 15, 254, 255, 254, 255, 254, 255);

private static readonly Vector128<byte> CstHi = Vector128.Create(2, 3, 8, 9, 10, 11, 4, 5, 254, 255, 6, 7, 12, 13, 14, 15);

private static readonly Vector128<byte> Cst8 = Vector128.Create(254, 255, 254, 255, 254, 255, 0, 1, 254, 255, 254, 255, 254, 255, 254, 255);
#endif

// Diffusion weights. We under-correct a bit (15/16th of the error is actually
// diffused) to avoid 'rainbow' chessboard pattern of blocks at q~=0.
private const int C1 = 7; // fraction of error sent to the 4x4 block below
Expand Down Expand Up @@ -510,51 +527,154 @@ public static void RefineUsingDistortion(Vp8EncIterator it, Vp8SegmentInfo[] seg
[MethodImpl(InliningOptions.ShortMethod)]
public static int Quantize2Blocks(Span<short> input, Span<short> output, Vp8Matrix mtx)
Copy link
Member

@antonfirsov antonfirsov Nov 9, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You need to pass Vp8Matrix by reference now to avoid full stack copy.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ah yes, good spot

{
int nz = QuantizeBlock(input, output, mtx) << 0;
nz |= QuantizeBlock(input.Slice(1 * 16), output.Slice(1 * 16), mtx) << 1;
int nz = QuantizeBlock(input.Slice(0, 16), output.Slice(0, 16), mtx) << 0;
nz |= QuantizeBlock(input.Slice(1 * 16, 16), output.Slice(1 * 16, 16), mtx) << 1;
return nz;
}

public static int QuantizeBlock(Span<short> input, Span<short> output, Vp8Matrix mtx)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here too: ref Vp8Matrix mtx

{
int last = -1;
int n;
for (n = 0; n < 16; ++n)
#if SUPPORTS_RUNTIME_INTRINSICS
if (Sse41.IsSupported)
{
int j = Zigzag[n];
bool sign = input[j] < 0;
uint coeff = (uint)((sign ? -input[j] : input[j]) + mtx.Sharpen[j]);
if (coeff > mtx.ZThresh[j])
#pragma warning disable SA1503 // Braces should not be omitted
// Load all inputs.
Vector128<short> input0 = Unsafe.As<short, Vector128<short>>(ref MemoryMarshal.GetReference(input));
Vector128<short> input8 = Unsafe.As<short, Vector128<short>>(ref MemoryMarshal.GetReference(input.Slice(8, 8)));
Vector128<ushort> iq0 = Unsafe.As<ushort, Vector128<ushort>>(ref MemoryMarshal.GetReference(mtx.IQ.AsSpan(0, 8)));
Vector128<ushort> iq8 = Unsafe.As<ushort, Vector128<ushort>>(ref MemoryMarshal.GetReference(mtx.IQ.AsSpan(8, 8)));
Vector128<ushort> q0 = Unsafe.As<ushort, Vector128<ushort>>(ref MemoryMarshal.GetReference(mtx.Q.AsSpan(0, 8)));
Vector128<ushort> q8 = Unsafe.As<ushort, Vector128<ushort>>(ref MemoryMarshal.GetReference(mtx.Q.AsSpan(8, 8)));

// coeff = abs(in)
Vector128<ushort> coeff0 = Ssse3.Abs(input0);
Vector128<ushort> coeff8 = Ssse3.Abs(input8);

// coeff = abs(in) + sharpen
Vector128<short> sharpen0 = Unsafe.As<short, Vector128<short>>(ref MemoryMarshal.GetReference(mtx.Sharpen.AsSpan(0, 8)));
Vector128<short> sharpen8 = Unsafe.As<short, Vector128<short>>(ref MemoryMarshal.GetReference(mtx.Sharpen.AsSpan(8, 8)));
Sse2.Add(coeff0.AsInt16(), sharpen0);
Sse2.Add(coeff8.AsInt16(), sharpen8);

// out = (coeff * iQ + B) >> QFIX
// doing calculations with 32b precision (QFIX=17)
// out = (coeff * iQ)
Vector128<ushort> coeffiQ0H = Sse2.MultiplyHigh(coeff0, iq0);
Vector128<ushort> coeffiQ0L = Sse2.MultiplyLow(coeff0, iq0);
Vector128<ushort> coeffiQ8H = Sse2.MultiplyHigh(coeff8, iq8);
Vector128<ushort> coeffiQ8L = Sse2.MultiplyLow(coeff8, iq8);
Vector128<ushort> out00 = Sse2.UnpackLow(coeffiQ0L, coeffiQ0H);
Vector128<ushort> out04 = Sse2.UnpackHigh(coeffiQ0L, coeffiQ0H);
Vector128<ushort> out08 = Sse2.UnpackLow(coeffiQ8L, coeffiQ8H);
Vector128<ushort> out12 = Sse2.UnpackHigh(coeffiQ8L, coeffiQ8H);

// out = (coeff * iQ + B)
Vector128<uint> bias00 = Unsafe.As<uint, Vector128<uint>>(ref MemoryMarshal.GetReference(mtx.Bias.AsSpan(0, 4)));
Vector128<uint> bias04 = Unsafe.As<uint, Vector128<uint>>(ref MemoryMarshal.GetReference(mtx.Bias.AsSpan(4, 4)));
Vector128<uint> bias08 = Unsafe.As<uint, Vector128<uint>>(ref MemoryMarshal.GetReference(mtx.Bias.AsSpan(8, 4)));
Vector128<uint> bias12 = Unsafe.As<uint, Vector128<uint>>(ref MemoryMarshal.GetReference(mtx.Bias.AsSpan(12, 4)));
out00 = Sse2.Add(out00.AsInt32(), bias00.AsInt32()).AsUInt16();
out04 = Sse2.Add(out04.AsInt32(), bias04.AsInt32()).AsUInt16();
out08 = Sse2.Add(out08.AsInt32(), bias08.AsInt32()).AsUInt16();
out12 = Sse2.Add(out12.AsInt32(), bias12.AsInt32()).AsUInt16();

// out = QUANTDIV(coeff, iQ, B, QFIX)
out00 = Sse2.ShiftRightArithmetic(out00.AsInt32(), WebpConstants.QFix).AsUInt16();
out04 = Sse2.ShiftRightArithmetic(out04.AsInt32(), WebpConstants.QFix).AsUInt16();
out08 = Sse2.ShiftRightArithmetic(out08.AsInt32(), WebpConstants.QFix).AsUInt16();
out12 = Sse2.ShiftRightArithmetic(out12.AsInt32(), WebpConstants.QFix).AsUInt16();

// pack result as 16b
Vector128<short> out0 = Sse2.PackSignedSaturate(out00.AsInt32(), out04.AsInt32());
Vector128<short> out8 = Sse2.PackSignedSaturate(out08.AsInt32(), out12.AsInt32());

// if (coeff > 2047) coeff = 2047
out0 = Sse2.Min(out0, MaxCoeff2047);
out8 = Sse2.Min(out8, MaxCoeff2047);

// put sign back
out0 = Ssse3.Sign(out0, input0);
out8 = Ssse3.Sign(out8, input8);

// in = out * Q
input0 = Sse2.MultiplyLow(out0, q0.AsInt16());
input8 = Sse2.MultiplyLow(out8, q8.AsInt16());

fixed (short* inputPtr = input)
{
uint q = mtx.Q[j];
uint iQ = mtx.IQ[j];
uint b = mtx.Bias[j];
int level = QuantDiv(coeff, iQ, b);
if (level > MaxLevel)
{
level = MaxLevel;
}
// in = out * Q
Sse2.Store(inputPtr, input0);
Sse2.Store(inputPtr + 8, input8);
}

// zigzag the output before storing it. The re-ordering is:
// 0 1 2 3 4 5 6 7 | 8 9 10 11 12 13 14 15
// -> 0 1 4[8]5 2 3 6 | 9 12 13 10 [7]11 14 15
// There's only two misplaced entries ([8] and [7]) that are crossing the
// reg's boundaries.
// We use pshufb instead of pshuflo/pshufhi.
Vector128<byte> tmpLo = Ssse3.Shuffle(out0.AsByte(), CstLo);
Vector128<byte> tmp7 = Ssse3.Shuffle(out0.AsByte(), Cst7); // extract #7
Vector128<byte> tmpHi = Ssse3.Shuffle(out8.AsByte(), CstHi);
Vector128<byte> tmp8 = Ssse3.Shuffle(out8.AsByte(), Cst8); // extract #8
Vector128<byte> outZ0 = Sse2.Or(tmpLo, tmp8);
Vector128<byte> outZ8 = Sse2.Or(tmpHi, tmp7);

fixed (short* outputPtr = output)
{
Sse2.Store(outputPtr, outZ0.AsInt16());
Sse2.Store(outputPtr + 8, outZ8.AsInt16());
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We still need to pin for the final Sse2.Store right?

Haven't checked if the following compiles & works, but since MemoryMarshal.GetReference and Unsafe.As return references, you can use them on the left side of an assignment:

Suggested change
fixed (short* outputPtr = output)
{
Sse2.Store(outputPtr, outZ0.AsInt16());
Sse2.Store(outputPtr + 8, outZ8.AsInt16());
}
ref short outputRef = ref MemoryMarshal.GetReference(output);
Unsafe.As<short, Vector128<short>>(ref outputRef) = outZ0.AsInt16();
Unsafe.As<short, Vector128<short>>(ref Unsafe.Add(ref outputRef, 8)) = outZ8.AsInt16();

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah this works 👍


if (sign)
Vector128<sbyte> packedOutput = Sse2.PackSignedSaturate(outZ0.AsInt16(), outZ8.AsInt16());

// Detect if all 'out' values are zeroes or not.
Vector128<sbyte> cmpeq = Sse2.CompareEqual(packedOutput, Vector128<sbyte>.Zero);
return Sse2.MoveMask(cmpeq) != 0xffff ? 1 : 0;
#pragma warning restore SA1503 // Braces should not be omitted
}
else
#endif
{
int last = -1;
int n;
for (n = 0; n < 16; ++n)
{
int j = Zigzag[n];
bool sign = input[j] < 0;
uint coeff = (uint)((sign ? -input[j] : input[j]) + mtx.Sharpen[j]);
if (coeff > mtx.ZThresh[j])
{
level = -level;
}
uint q = mtx.Q[j];
uint iQ = mtx.IQ[j];
uint b = mtx.Bias[j];
int level = QuantDiv(coeff, iQ, b);
if (level > MaxLevel)
{
level = MaxLevel;
}

input[j] = (short)(level * (int)q);
output[n] = (short)level;
if (level != 0)
if (sign)
{
level = -level;
}

input[j] = (short)(level * (int)q);
output[n] = (short)level;
if (level != 0)
{
last = n;
}
}
else
{
last = n;
output[n] = 0;
input[j] = 0;
}
}
else
{
output[n] = 0;
input[j] = 0;
}
}

return last >= 0 ? 1 : 0;
return last >= 0 ? 1 : 0;
}
}

// Quantize as usual, but also compute and return the quantization error.
Expand Down
9 changes: 9 additions & 0 deletions src/ImageSharp/Formats/Webp/Lossy/Vp8Matrix.cs
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,15 @@ public Vp8Matrix()
this.Sharpen = new short[16];
}

public Vp8Matrix(ushort[] q, ushort[] iq, uint[] bias, uint[] zThresh, short[] sharpen)
{
this.Q = q;
this.IQ = iq;
this.Bias = bias;
this.ZThresh = zThresh;
this.Sharpen = sharpen;
}

/// <summary>
/// Gets the quantizer steps.
/// </summary>
Expand Down
56 changes: 56 additions & 0 deletions tests/ImageSharp.Tests/Formats/WebP/QuantEncTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
// Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0.

using System.Linq;
using SixLabors.ImageSharp.Formats.Webp.Lossy;
using SixLabors.ImageSharp.Tests.TestUtilities;
using Xunit;

namespace SixLabors.ImageSharp.Tests.Formats.WebP
{
[Trait("Format", "Webp")]
public class QuantEncTests
{
private static void RunQuantizeBlockTest()
{
// arrange
short[] input = { 378, 777, -851, 888, 259, 148, 0, -111, -185, -185, -74, -37, 148, 74, 111, 74 };
short[] output = new short[16];
ushort[] q = { 42, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37 };
ushort[] iq = { 3120, 3542, 3542, 3542, 3542, 3542, 3542, 3542, 3542, 3542, 3542, 3542, 3542, 3542, 3542, 3542 };
uint[] bias =
{
49152, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296, 55296,
55296, 55296
};
uint[] zthresh = { 26, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 };
short[] expectedOutput = { 9, 21, 7, -5, 4, -23, 24, 0, -5, 4, 2, -2, -3, -1, 3, 2 };
int expectedResult = 1;
var vp8Matrix = new Vp8Matrix(q, iq, bias, zthresh, new short[16]);

// act
int actualResult = QuantEnc.QuantizeBlock(input, output, vp8Matrix);

// assert
Assert.True(output.SequenceEqual(expectedOutput));
Assert.Equal(expectedResult, actualResult);
}

[Fact]
public void QuantizeBlock_Works() => RunQuantizeBlockTest();

#if SUPPORTS_RUNTIME_INTRINSICS
[Fact]
public void QuantizeBlock_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunQuantizeBlockTest, HwIntrinsics.AllowAll);

[Fact]
public void QuantizeBlock_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunQuantizeBlockTest, HwIntrinsics.DisableSSE2);

[Fact]
public void QuantizeBlock_WithoutSSSE3_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunQuantizeBlockTest, HwIntrinsics.DisableSSSE3);

[Fact]
public void QuantizeBlock_WithoutSSE2AndSSSE3_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunQuantizeBlockTest, HwIntrinsics.DisableSSE2 | HwIntrinsics.DisableSSSE3);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see why do we need all these variants.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You think its enough to test only without SSE2? Can it not be that the CPU supports SSE2, but not SSSE3?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What I meant is that the feature needs SSE4.1, so unless we really believe we will add SSE2 and SSSE3 versions of the algorithm later, I don't see why is it necessary to test switching off those particular instruction sets.

We should just switch off SSE4.1 or all intrinsics in general instead I think.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, i haved changed it to test with and without intrinsics

#endif
}
}