diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index a10ec6eabb..b8986f66ff 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -704,28 +704,7 @@ public static int TTransformSse41(Span inputA, Span inputB, Span transpose00 = Sse2.UnpackLow(b0, b1); - Vector128 transpose01 = Sse2.UnpackLow(b2, b3); - Vector128 transpose02 = Sse2.UnpackHigh(b0, b1); - Vector128 transpose03 = Sse2.UnpackHigh(b2, b3); - - // a00 a10 a01 a11 a02 a12 a03 a13 - // a20 a30 a21 a31 a22 a32 a23 a33 - // b00 b10 b01 b11 b02 b12 b03 b13 - // b20 b30 b21 b31 b22 b32 b23 b33 - Vector128 transpose10 = Sse2.UnpackLow(transpose00.AsInt32(), transpose01.AsInt32()); - Vector128 transpose11 = Sse2.UnpackLow(transpose02.AsInt32(), transpose03.AsInt32()); - Vector128 transpose12 = Sse2.UnpackHigh(transpose00.AsInt32(), transpose01.AsInt32()); - Vector128 transpose13 = Sse2.UnpackHigh(transpose02.AsInt32(), transpose03.AsInt32()); - - // a00 a10 a20 a30 a01 a11 a21 a31 - // b00 b10 b20 b30 b01 b11 b21 b31 - // a02 a12 a22 a32 a03 a13 a23 a33 - // b02 b12 a22 b32 b03 b13 b23 b33 - Vector128 output0 = Sse2.UnpackLow(transpose10.AsInt64(), transpose11.AsInt64()); - Vector128 output1 = Sse2.UnpackHigh(transpose10.AsInt64(), transpose11.AsInt64()); - Vector128 output2 = Sse2.UnpackLow(transpose12.AsInt64(), transpose13.AsInt64()); - Vector128 output3 = Sse2.UnpackHigh(transpose12.AsInt64(), transpose13.AsInt64()); + Vp8Transpose_2_4x4_16b(b0, b1, b2, b3, out Vector128 output0, out Vector128 output1, out Vector128 output2, out Vector128 output3); // a00 a10 a20 a30 b00 b10 b20 b30 // a01 a11 a21 a31 b01 b11 b21 b31 @@ -769,6 +748,44 @@ public static int TTransformSse41(Span inputA, Span inputB, Span b0, Vector128 b1, Vector128 b2, Vector128 b3, out Vector128 output0, out Vector128 output1, out Vector128 output2, out Vector128 output3) + { + // Transpose the two 4x4. 
+ // a00 a01 a02 a03 b00 b01 b02 b03 + // a10 a11 a12 a13 b10 b11 b12 b13 + // a20 a21 a22 a23 b20 b21 b22 b23 + // a30 a31 a32 a33 b30 b31 b32 b33 + Vector128 transpose00 = Sse2.UnpackLow(b0, b1); + Vector128 transpose01 = Sse2.UnpackLow(b2, b3); + Vector128 transpose02 = Sse2.UnpackHigh(b0, b1); + Vector128 transpose03 = Sse2.UnpackHigh(b2, b3); + + // a00 a10 a01 a11 a02 a12 a03 a13 + // a20 a30 a21 a31 a22 a32 a23 a33 + // b00 b10 b01 b11 b02 b12 b03 b13 + // b20 b30 b21 b31 b22 b32 b23 b33 + Vector128 transpose10 = Sse2.UnpackLow(transpose00.AsInt32(), transpose01.AsInt32()); + Vector128 transpose11 = Sse2.UnpackLow(transpose02.AsInt32(), transpose03.AsInt32()); + Vector128 transpose12 = Sse2.UnpackHigh(transpose00.AsInt32(), transpose01.AsInt32()); + Vector128 transpose13 = Sse2.UnpackHigh(transpose02.AsInt32(), transpose03.AsInt32()); + + // a00 a10 a20 a30 a01 a11 a21 a31 + // b00 b10 b20 b30 b01 b11 b21 b31 + // a02 a12 a22 a32 a03 a13 a23 a33 + // b02 b12 b22 b32 b03 b13 b23 b33 + output0 = Sse2.UnpackLow(transpose10.AsInt64(), transpose11.AsInt64()); + output1 = Sse2.UnpackHigh(transpose10.AsInt64(), transpose11.AsInt64()); + output2 = Sse2.UnpackLow(transpose12.AsInt64(), transpose13.AsInt64()); + output3 = Sse2.UnpackHigh(transpose12.AsInt64(), transpose13.AsInt64()); + + // a00 a10 a20 a30 b00 b10 b20 b30 + // a01 a11 a21 a31 b01 b11 b21 b31 + // a02 a12 a22 a32 b02 b12 b22 b32 + // a03 a13 a23 a33 b03 b13 b23 b33 + } #endif public static void TransformTwo(Span src, Span dst, Span scratch) diff --git a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs index 38ed80590d..2fcea8ceea 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs @@ -329,7 +329,7 @@ public static int ReconstructIntra16(Vp8EncIterator it, Vp8SegmentInfo dqm, Vp8M LossyUtils.TransformWht(dcTmp, tmp, scratch); for (n = 0; n < 16; n += 2) { - 
Vp8Encoding.ITransform(reference.Slice(WebpLookupTables.Vp8Scan[n]), tmp.Slice(n * 16, 32), yuvOut.Slice(WebpLookupTables.Vp8Scan[n]), true, scratch); + Vp8Encoding.ITransform(reference.Slice(WebpLookupTables.Vp8Scan[n]), tmp.Slice(n * 16, 32), yuvOut.Slice(WebpLookupTables.Vp8Scan[n]), scratch); } return nz; @@ -342,7 +342,7 @@ public static int ReconstructIntra4(Vp8EncIterator it, Vp8SegmentInfo dqm, Span< Span scratch = it.Scratch3.AsSpan(0, 16); Vp8Encoding.FTransform(src, reference, tmp, scratch); int nz = QuantizeBlock(tmp, levels, ref dqm.Y1); - Vp8Encoding.ITransform(reference, tmp, yuvOut, false, scratch); + Vp8Encoding.ITransformOne(reference, tmp, yuvOut, scratch); return nz; } @@ -375,7 +375,7 @@ public static int ReconstructUv(Vp8EncIterator it, Vp8SegmentInfo dqm, Vp8ModeSc for (n = 0; n < 8; n += 2) { - Vp8Encoding.ITransform(reference.Slice(WebpLookupTables.Vp8ScanUv[n]), tmp.Slice(n * 16, 32), yuvOut.Slice(WebpLookupTables.Vp8ScanUv[n]), true, scratch); + Vp8Encoding.ITransform(reference.Slice(WebpLookupTables.Vp8ScanUv[n]), tmp.Slice(n * 16, 32), yuvOut.Slice(WebpLookupTables.Vp8ScanUv[n]), scratch); } return nz << 16; diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs index af7e8eaa36..aa4ab5767b 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs @@ -4,6 +4,11 @@ using System; using System.Buffers.Binary; using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +#if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif namespace SixLabors.ImageSharp.Formats.Webp.Lossy { @@ -60,6 +65,14 @@ internal static class Vp8Encoding public static readonly int[] Vp8I4ModeOffsets = { I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4 }; +#if SUPPORTS_RUNTIME_INTRINSICS + public static readonly Vector128 K1 = 
Vector128.Create((short)20091).AsInt16(); + + public static readonly Vector128 K2 = Vector128.Create((short)-30068).AsInt16(); + + public static readonly Vector128 Four = Vector128.Create((short)4); +#endif + static Vp8Encoding() { for (int i = -255; i <= 255 + 255; i++) @@ -68,51 +81,299 @@ static Vp8Encoding() } } - public static void ITransform(Span reference, Span input, Span dst, bool doTwo, Span scratch) + // Transforms (Paragraph 14.4) + // Does two inverse transforms. + public static void ITransform(Span reference, Span input, Span dst, Span scratch) { - ITransformOne(reference, input, dst, scratch); - if (doTwo) +#if SUPPORTS_RUNTIME_INTRINSICS + if (Sse2.IsSupported) + { + // This implementation makes use of 16-bit fixed point versions of two + // multiply constants: + // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16 + // K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16 + // + // To be able to use signed 16-bit integers, we use the following trick to + // have constants within range: + // - Associated constants are obtained by subtracting the 16-bit fixed point + // version of one: + // k = K - (1 << 16) => K = k + (1 << 16) + // K1 = 85627 => k1 = 20091 + // K2 = 35468 => k2 = -30068 + // - The multiplication of a variable by a constant becomes the sum of the + // variable and the multiplication of that variable by the associated + // constant: + // (x * K) >> 16 = (x * (k + (1 << 16))) >> 16 = ((x * k ) >> 16) + x + + // Load and concatenate the transform coefficients (we'll do two inverse + // transforms in parallel). The first block's coefficients go into the + // lower half of each vector and the second block's coefficients into the + // upper half. 
+ ref short inputRef = ref MemoryMarshal.GetReference(input); + var in0 = Vector128.Create(Unsafe.As(ref inputRef), 0); + var in1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 4)), 0); + var in2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 8)), 0); + var in3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 12)), 0); + + // a00 a10 a20 a30 x x x x + // a01 a11 a21 a31 x x x x + // a02 a12 a22 a32 x x x x + // a03 a13 a23 a33 x x x x + var inb0 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 16)), 0); + var inb1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 20)), 0); + var inb2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 24)), 0); + var inb3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 28)), 0); + + in0 = Sse2.UnpackLow(in0, inb0); + in1 = Sse2.UnpackLow(in1, inb1); + in2 = Sse2.UnpackLow(in2, inb2); + in3 = Sse2.UnpackLow(in3, inb3); + + // a00 a10 a20 a30 b00 b10 b20 b30 + // a01 a11 a21 a31 b01 b11 b21 b31 + // a02 a12 a22 a32 b02 b12 b22 b32 + // a03 a13 a23 a33 b03 b13 b23 b33 + + // Vertical pass and subsequent transpose. + // First pass, c and d calculations are longer because of the "trick" multiplications. + InverseTransformVerticalPass(in0, in2, in1, in3, out Vector128 tmp0, out Vector128 tmp1, out Vector128 tmp2, out Vector128 tmp3); + + // Transpose the two 4x4. + LossyUtils.Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128 t0, out Vector128 t1, out Vector128 t2, out Vector128 t3); + + // Horizontal pass and subsequent transpose. + // First pass, c and d calculations are longer because of the "trick" multiplications. + InverseTransformHorizontalPass(t0, t2, t1, t3, out Vector128 shifted0, out Vector128 shifted1, out Vector128 shifted2, out Vector128 shifted3); + + // Transpose the two 4x4. + LossyUtils.Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); + + // Add inverse transform to 'ref' and store. 
+ // Load the reference(s). + Vector128 ref0 = Vector128.Zero; + Vector128 ref1 = Vector128.Zero; + Vector128 ref2 = Vector128.Zero; + Vector128 ref3 = Vector128.Zero; + ref byte referenceRef = ref MemoryMarshal.GetReference(reference); + + // Load eight bytes/pixels per line. + ref0 = Vector128.Create(Unsafe.As(ref referenceRef), 0).AsByte(); + ref1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps)), 0).AsByte(); + ref2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2)), 0).AsByte(); + ref3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0).AsByte(); + + // Convert to 16b. + ref0 = Sse2.UnpackLow(ref0, Vector128.Zero); + ref1 = Sse2.UnpackLow(ref1, Vector128.Zero); + ref2 = Sse2.UnpackLow(ref2, Vector128.Zero); + ref3 = Sse2.UnpackLow(ref3, Vector128.Zero); + + // Add the inverse transform(s). + Vector128 ref0InvAdded = Sse2.Add(ref0.AsInt16(), t0.AsInt16()); + Vector128 ref1InvAdded = Sse2.Add(ref1.AsInt16(), t1.AsInt16()); + Vector128 ref2InvAdded = Sse2.Add(ref2.AsInt16(), t2.AsInt16()); + Vector128 ref3InvAdded = Sse2.Add(ref3.AsInt16(), t3.AsInt16()); + + // Unsigned saturate to 8b. + ref0 = Sse2.PackUnsignedSaturate(ref0InvAdded, ref0InvAdded); + ref1 = Sse2.PackUnsignedSaturate(ref1InvAdded, ref1InvAdded); + ref2 = Sse2.PackUnsignedSaturate(ref2InvAdded, ref2InvAdded); + ref3 = Sse2.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded); + + // Get a reference to the start of the destination buffer. + ref byte outputRef = ref MemoryMarshal.GetReference(dst); + + // Store eight bytes/pixels per line. 
+ Unsafe.As>(ref outputRef) = ref0.GetLower(); + Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = ref1.GetLower(); + Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = ref2.GetLower(); + Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = ref3.GetLower(); + } + else +#endif { + ITransformOne(reference, input, dst, scratch); ITransformOne(reference.Slice(4), input.Slice(16), dst.Slice(4), scratch); } } public static void ITransformOne(Span reference, Span input, Span dst, Span scratch) { - int i; - Span tmp = scratch.Slice(0, 16); - for (i = 0; i < 4; i++) +#if SUPPORTS_RUNTIME_INTRINSICS + if (Sse2.IsSupported) { - // vertical pass. - int a = input[0] + input[8]; - int b = input[0] - input[8]; - int c = Mul(input[4], KC2) - Mul(input[12], KC1); - int d = Mul(input[4], KC1) + Mul(input[12], KC2); - tmp[0] = a + d; - tmp[1] = b + c; - tmp[2] = b - c; - tmp[3] = a - d; - tmp = tmp.Slice(4); - input = input.Slice(1); + // Load the transform coefficients. Only one inverse transform is done + // here, so just the lower half of each vector holds coefficients; the + // upper half is initialized to zero and whatever ends up there is never + // stored. + ref short inputRef = ref MemoryMarshal.GetReference(input); + var in0 = Vector128.Create(Unsafe.As(ref inputRef), 0); + var in1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 4)), 0); + var in2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 8)), 0); + var in3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 12)), 0); + + // a00 a10 a20 a30 x x x x + // a01 a11 a21 a31 x x x x + // a02 a12 a22 a32 x x x x + // a03 a13 a23 a33 x x x x + + // Vertical pass and subsequent transpose. + // First pass, c and d calculations are longer because of the "trick" multiplications. + InverseTransformVerticalPass(in0, in2, in1, in3, out Vector128 tmp0, out Vector128 tmp1, out Vector128 tmp2, out Vector128 tmp3); + + // Transpose the two 4x4. 
+ LossyUtils.Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128 t0, out Vector128 t1, out Vector128 t2, out Vector128 t3); + + // Horizontal pass and subsequent transpose. + // First pass, c and d calculations are longer because of the "trick" multiplications. + InverseTransformHorizontalPass(t0, t2, t1, t3, out Vector128 shifted0, out Vector128 shifted1, out Vector128 shifted2, out Vector128 shifted3); + + // Transpose the two 4x4. + LossyUtils.Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); + + // Add inverse transform to 'ref' and store. + // Load the reference(s). + Vector128 ref0 = Vector128.Zero; + Vector128 ref1 = Vector128.Zero; + Vector128 ref2 = Vector128.Zero; + Vector128 ref3 = Vector128.Zero; + ref byte referenceRef = ref MemoryMarshal.GetReference(reference); + + // Load four bytes/pixels per line. + ref0 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref referenceRef)).AsByte(); + ref1 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps))).AsByte(); + ref2 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2))).AsByte(); + ref3 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3))).AsByte(); + + // Convert to 16b. + ref0 = Sse2.UnpackLow(ref0, Vector128.Zero); + ref1 = Sse2.UnpackLow(ref1, Vector128.Zero); + ref2 = Sse2.UnpackLow(ref2, Vector128.Zero); + ref3 = Sse2.UnpackLow(ref3, Vector128.Zero); + + // Add the inverse transform(s). + Vector128 ref0InvAdded = Sse2.Add(ref0.AsInt16(), t0.AsInt16()); + Vector128 ref1InvAdded = Sse2.Add(ref1.AsInt16(), t1.AsInt16()); + Vector128 ref2InvAdded = Sse2.Add(ref2.AsInt16(), t2.AsInt16()); + Vector128 ref3InvAdded = Sse2.Add(ref3.AsInt16(), t3.AsInt16()); + + // Unsigned saturate to 8b. 
+ ref0 = Sse2.PackUnsignedSaturate(ref0InvAdded, ref0InvAdded); + ref1 = Sse2.PackUnsignedSaturate(ref1InvAdded, ref1InvAdded); + ref2 = Sse2.PackUnsignedSaturate(ref2InvAdded, ref2InvAdded); + ref3 = Sse2.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded); + + // Get a reference to the start of the destination buffer. + ref byte outputRef = ref MemoryMarshal.GetReference(dst); + + // Store four bytes/pixels per line. + int output0 = Sse2.ConvertToInt32(ref0.AsInt32()); + int output1 = Sse2.ConvertToInt32(ref1.AsInt32()); + int output2 = Sse2.ConvertToInt32(ref2.AsInt32()); + int output3 = Sse2.ConvertToInt32(ref3.AsInt32()); + + Unsafe.As(ref outputRef) = output0; + Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = output1; + Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = output2; + Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = output3; } - - tmp = scratch; - for (i = 0; i < 4; i++) + else +#endif { - // horizontal pass. - int dc = tmp[0] + 4; - int a = dc + tmp[8]; - int b = dc - tmp[8]; - int c = Mul(tmp[4], KC2) - Mul(tmp[12], KC1); - int d = Mul(tmp[4], KC1) + Mul(tmp[12], KC2); - Store(dst, reference, 0, i, a + d); - Store(dst, reference, 1, i, b + c); - Store(dst, reference, 2, i, b - c); - Store(dst, reference, 3, i, a - d); - tmp = tmp.Slice(1); + int i; + Span tmp = scratch.Slice(0, 16); + for (i = 0; i < 4; i++) + { + // vertical pass. + int a = input[0] + input[8]; + int b = input[0] - input[8]; + int c = Mul(input[4], KC2) - Mul(input[12], KC1); + int d = Mul(input[4], KC1) + Mul(input[12], KC2); + tmp[0] = a + d; + tmp[1] = b + c; + tmp[2] = b - c; + tmp[3] = a - d; + tmp = tmp.Slice(4); + input = input.Slice(1); + } + + tmp = scratch; + for (i = 0; i < 4; i++) + { + // horizontal pass. 
+ int dc = tmp[0] + 4; + int a = dc + tmp[8]; + int b = dc - tmp[8]; + int c = Mul(tmp[4], KC2) - Mul(tmp[12], KC1); + int d = Mul(tmp[4], KC1) + Mul(tmp[12], KC2); + Store(dst, reference, 0, i, a + d); + Store(dst, reference, 1, i, b + c); + Store(dst, reference, 2, i, b - c); + Store(dst, reference, 3, i, a - d); + tmp = tmp.Slice(1); + } } } +#if SUPPORTS_RUNTIME_INTRINSICS + private static void InverseTransformVerticalPass(Vector128 in0, Vector128 in2, Vector128 in1, Vector128 in3, out Vector128 tmp0, out Vector128 tmp1, out Vector128 tmp2, out Vector128 tmp3) + { + Vector128 a = Sse2.Add(in0.AsInt16(), in2.AsInt16()); + Vector128 b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16()); + + // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3 + Vector128 c1 = Sse2.MultiplyHigh(in1.AsInt16(), K2); + Vector128 c2 = Sse2.MultiplyHigh(in3.AsInt16(), K1); + Vector128 c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16()); + Vector128 c4 = Sse2.Subtract(c1, c2); + Vector128 c = Sse2.Add(c3, c4); + + // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3 + Vector128 d1 = Sse2.MultiplyHigh(in1.AsInt16(), K1); + Vector128 d2 = Sse2.MultiplyHigh(in3.AsInt16(), K2); + Vector128 d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16()); + Vector128 d4 = Sse2.Add(d1, d2); + Vector128 d = Sse2.Add(d3, d4); + + // Second pass. 
+ tmp0 = Sse2.Add(a, d); + tmp1 = Sse2.Add(b, c); + tmp2 = Sse2.Subtract(b, c); + tmp3 = Sse2.Subtract(a, d); + } + + private static void InverseTransformHorizontalPass(Vector128 t0, Vector128 t2, Vector128 t1, Vector128 t3, out Vector128 shifted0, out Vector128 shifted1, out Vector128 shifted2, out Vector128 shifted3) + { + Vector128 dc = Sse2.Add(t0.AsInt16(), Four); + Vector128 a = Sse2.Add(dc, t2.AsInt16()); + Vector128 b = Sse2.Subtract(dc, t2.AsInt16()); + + // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3 + Vector128 c1 = Sse2.MultiplyHigh(t1.AsInt16(), K2); + Vector128 c2 = Sse2.MultiplyHigh(t3.AsInt16(), K1); + Vector128 c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16()); + Vector128 c4 = Sse2.Subtract(c1, c2); + Vector128 c = Sse2.Add(c3, c4); + + // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3 + Vector128 d1 = Sse2.MultiplyHigh(t1.AsInt16(), K1); + Vector128 d2 = Sse2.MultiplyHigh(t3.AsInt16(), K2); + Vector128 d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16()); + Vector128 d4 = Sse2.Add(d1, d2); + Vector128 d = Sse2.Add(d3, d4); + + // Second pass. + Vector128 tmp0 = Sse2.Add(a, d); + Vector128 tmp1 = Sse2.Add(b, c); + Vector128 tmp2 = Sse2.Subtract(b, c); + Vector128 tmp3 = Sse2.Subtract(a, d); + shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3); + shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3); + shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3); + shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3); + } +#endif + public static void FTransform2(Span src, Span reference, Span output, Span output2, Span scratch) { FTransform(src, reference, output, scratch); diff --git a/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs b/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs new file mode 100644 index 0000000000..17c9beb9b7 --- /dev/null +++ b/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs @@ -0,0 +1,98 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. 
+ +using System.Linq; +using SixLabors.ImageSharp.Formats.Webp.Lossy; +using SixLabors.ImageSharp.Tests.TestUtilities; +using Xunit; + +namespace SixLabors.ImageSharp.Tests.Formats.WebP +{ + [Trait("Format", "Webp")] + public class Vp8EncodingTests + { + private static void RunOneInverseTransformTest() + { + // arrange + byte[] reference = + { + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, + 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, + 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, 129, 129, 129, 129, + 129, 129, 129, 129, 129, 129, 129, 129 + }; + short[] input = { 1, 216, -48, 0, 96, -24, -48, 24, 0, -24, 24, 0, 0, 0, 0, 0, 38, -240, -72, -24, 0, -24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + byte[] dst = new byte[128]; + byte[] expected = + { + 161, 160, 149, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 160, 160, 133, 85, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 156, 147, 109, 76, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 152, 128, 87, 83, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0 + }; + int[] scratch = new int[16]; + + // act + Vp8Encoding.ITransformOne(reference, input, dst, scratch); + + // assert + Assert.True(dst.SequenceEqual(expected)); + } + + private static void RunTwoInverseTransformTest() + { + // arrange + byte[] reference = + { + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, + 129, 129, 
129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, + 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, 129, 129, 129, 129, + 129, 129, 129, 129, 129, 129, 129, 129 + }; + short[] input = { 1, 216, -48, 0, 96, -24, -48, 24, 0, -24, 24, 0, 0, 0, 0, 0, 38, -240, -72, -24, 0, -24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + byte[] dst = new byte[128]; + byte[] expected = + { + 161, 160, 149, 105, 78, 127, 156, 170, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 160, 160, 133, 85, 81, 129, 155, 167, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 156, 147, 109, 76, 85, 130, 153, 163, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 152, 128, 87, 83, 88, 132, 152, 159, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }; + int[] scratch = new int[16]; + + // act + Vp8Encoding.ITransform(reference, input, dst, scratch); + + // assert + Assert.True(dst.SequenceEqual(expected)); + } + + [Fact] + public void OneInverseTransform_Works() => RunOneInverseTransformTest(); + + [Fact] + public void TwoInverseTransform_Works() => RunTwoInverseTransformTest(); + +#if SUPPORTS_RUNTIME_INTRINSICS + [Fact] + public void OneInverseTransform_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunOneInverseTransformTest, HwIntrinsics.AllowAll); + + [Fact] + public void OneInverseTransform_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunOneInverseTransformTest, HwIntrinsics.DisableHWIntrinsic); + + [Fact] + public void 
TwoInverseTransform_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTwoInverseTransformTest, HwIntrinsics.AllowAll); + + [Fact] + public void TwoInverseTransform_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTwoInverseTransformTest, HwIntrinsics.DisableHWIntrinsic); +#endif + } +}