Skip to content

Commit 7d74c4c

Browse files
Merge pull request #1817 from SixLabors/bp/sse4X4
Add SSE2 version of Vp8Sse4X4
2 parents 255226b + 1997d59 commit 7d74c4c

File tree

6 files changed

+102
-39
lines changed

6 files changed

+102
-39
lines changed

src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs

Lines changed: 53 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -19,17 +19,63 @@ internal static class LossyUtils
1919
private static readonly Vector128<byte> Mean16x4Mask = Vector128.Create((short)0x00ff).AsByte();
2020
#endif
2121

22+
// Note: this method is named VP8SSE16x16 in the libwebp reference implementation.
2223
[MethodImpl(InliningOptions.ShortMethod)]
23-
public static int Vp8Sse16X16(Span<byte> a, Span<byte> b) => GetSse(a, b, 16, 16);
24+
public static int Vp8_Sse16X16(Span<byte> a, Span<byte> b)
{
    // Delegates to the generic NxN routine with a width of 16 and a height of 16.
    const int width = 16;
    const int height = 16;
    return Vp8_SseNxN(a, b, width, height);
}
2425

26+
// Note: this method is named VP8SSE16x8 in the libwebp reference implementation.
2527
[MethodImpl(InliningOptions.ShortMethod)]
26-
public static int Vp8Sse16X8(Span<byte> a, Span<byte> b) => GetSse(a, b, 16, 8);
28+
public static int Vp8_Sse16X8(Span<byte> a, Span<byte> b)
{
    // Delegates to the generic NxN routine with a width of 16 and a height of 8.
    const int width = 16;
    const int height = 8;
    return Vp8_SseNxN(a, b, width, height);
}
2729

30+
// Note: this method is named VP8SSE4x4 in the libwebp reference implementation.
2831
[MethodImpl(InliningOptions.ShortMethod)]
29-
public static int Vp8Sse4X4(Span<byte> a, Span<byte> b) => GetSse(a, b, 4, 4);
32+
public static int Vp8_Sse4X4(Span<byte> a, Span<byte> b)
{
    // Sums the squared differences between the first four bytes of four consecutive rows
    // of a and b, where consecutive rows start WebpConstants.Bps bytes apart. Uses SSE2 when
    // available and otherwise delegates to Vp8_SseNxN with a width and height of 4.
#if SUPPORTS_RUNTIME_INTRINSICS
    if (Sse2.IsSupported)
    {
        // Load values. Each load reads a full 16 bytes even though only the low four bytes
        // of every row contribute to the result.
        // NOTE(review): assumes both spans hold at least (3 * Bps) + 16 bytes - confirm at call sites.
        ref byte aRef = ref MemoryMarshal.GetReference(a);
        Vector128<byte> a0 = Unsafe.As<byte, Vector128<byte>>(ref aRef);
        Vector128<byte> a1 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref aRef, WebpConstants.Bps));
        Vector128<byte> a2 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref aRef, WebpConstants.Bps * 2));
        Vector128<byte> a3 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref aRef, WebpConstants.Bps * 3));
        ref byte bRef = ref MemoryMarshal.GetReference(b);
        Vector128<byte> b0 = Unsafe.As<byte, Vector128<byte>>(ref bRef);
        Vector128<byte> b1 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref bRef, WebpConstants.Bps));
        Vector128<byte> b2 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref bRef, WebpConstants.Bps * 2));
        Vector128<byte> b3 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref bRef, WebpConstants.Bps * 3));

        // Combine pair of lines: the low 64 bits now hold four pixels from each of two rows.
        Vector128<int> a01 = Sse2.UnpackLow(a0.AsInt32(), a1.AsInt32());
        Vector128<int> a23 = Sse2.UnpackLow(a2.AsInt32(), a3.AsInt32());
        Vector128<int> b01 = Sse2.UnpackLow(b0.AsInt32(), b1.AsInt32());
        Vector128<int> b23 = Sse2.UnpackLow(b2.AsInt32(), b3.AsInt32());

        // Convert to 16b by interleaving the low eight bytes with zero.
        Vector128<short> a01s = Sse2.UnpackLow(a01.AsByte(), Vector128<byte>.Zero).AsInt16();
        Vector128<short> a23s = Sse2.UnpackLow(a23.AsByte(), Vector128<byte>.Zero).AsInt16();
        Vector128<short> b01s = Sse2.UnpackLow(b01.AsByte(), Vector128<byte>.Zero).AsInt16();
        Vector128<short> b23s = Sse2.UnpackLow(b23.AsByte(), Vector128<byte>.Zero).AsInt16();

        // Subtract, square and accumulate. The subtraction must be the signed 16-bit one:
        // the unsigned byte-wise saturating subtract clamps every lane where a < b to zero,
        // which silently drops that pixel's error from the sum.
        Vector128<short> d0 = Sse2.SubtractSaturate(a01s, b01s);
        Vector128<short> d1 = Sse2.SubtractSaturate(a23s, b23s);
        Vector128<int> e0 = Sse2.MultiplyAddAdjacent(d0, d0);
        Vector128<int> e1 = Sse2.MultiplyAddAdjacent(d1, d1);
        Vector128<int> sum = Sse2.Add(e0, e1);

        return Numerics.ReduceSum(sum);
    }
    else
#endif
    {
        return Vp8_SseNxN(a, b, 4, 4);
    }
}
3076

3177
[MethodImpl(InliningOptions.ShortMethod)]
32-
public static int GetSse(Span<byte> a, Span<byte> b, int w, int h)
78+
public static int Vp8_SseNxN(Span<byte> a, Span<byte> b, int w, int h)
3379
{
3480
int count = 0;
3581
int aOffset = 0;
@@ -88,7 +134,7 @@ public static int Vp8Disto4X4(Span<byte> a, Span<byte> b, Span<ushort> w, Span<i
88134
#if SUPPORTS_RUNTIME_INTRINSICS
89135
if (Sse41.IsSupported)
90136
{
91-
int diffSum = TTransformSse41(a, b, w, scratch);
137+
int diffSum = TTransformSse41(a, b, w);
92138
return Math.Abs(diffSum) >> 5;
93139
}
94140
else
@@ -615,11 +661,8 @@ public static int TTransform(Span<byte> input, Span<ushort> w, Span<int> scratch
615661
/// Returns the weighted sum of the absolute value of transformed coefficients.
616662
/// w[] contains a row-major 4 by 4 symmetric matrix.
617663
/// </summary>
618-
public static int TTransformSse41(Span<byte> inputA, Span<byte> inputB, Span<ushort> w, Span<int> scratch)
664+
public static int TTransformSse41(Span<byte> inputA, Span<byte> inputB, Span<ushort> w)
619665
{
620-
Span<int> sum = scratch.Slice(0, 4);
621-
sum.Clear();
622-
623666
// Load and combine inputs.
624667
Vector128<byte> ina0 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA));
625668
Vector128<byte> ina1 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA.Slice(WebpConstants.Bps, 16)));
@@ -724,9 +767,7 @@ public static int TTransformSse41(Span<byte> inputA, Span<byte> inputB, Span<ush
724767
// difference of weighted sums.
725768
Vector128<int> result = Sse2.Subtract(ab0ab2Sum.AsInt32(), b0w0bb2w8Sum.AsInt32());
726769

727-
ref int outputRef = ref MemoryMarshal.GetReference(sum);
728-
Unsafe.As<int, Vector128<int>>(ref outputRef) = result.AsInt32();
729-
return sum[3] + sum[2] + sum[1] + sum[0];
770+
return Numerics.ReduceSum(result);
730771
}
731772
#endif
732773

@@ -739,7 +780,6 @@ public static void TransformTwo(Span<short> src, Span<byte> dst, Span<int> scrat
739780
public static void TransformOne(Span<short> src, Span<byte> dst, Span<int> scratch)
740781
{
741782
Span<int> tmp = scratch.Slice(0, 16);
742-
tmp.Clear();
743783
int tmpOffset = 0;
744784
for (int srcOffset = 0; srcOffset < 4; srcOffset++)
745785
{

src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ public static void PickBestIntra16(Vp8EncIterator it, ref Vp8ModeScore rd, Vp8Se
6666
rdCur.Nz = (uint)ReconstructIntra16(it, dqm, rdCur, tmpDst, mode);
6767

6868
// Measure RD-score.
69-
rdCur.D = LossyUtils.Vp8Sse16X16(src, tmpDst);
69+
rdCur.D = LossyUtils.Vp8_Sse16X16(src, tmpDst);
7070
rdCur.SD = tlambda != 0 ? Mult8B(tlambda, LossyUtils.Vp8Disto16X16(src, tmpDst, WeightY, scratch)) : 0;
7171
rdCur.H = WebpConstants.Vp8FixedCostsI16[mode];
7272
rdCur.R = it.GetCostLuma16(rdCur, proba, res);
@@ -160,7 +160,7 @@ public static bool PickBestIntra4(Vp8EncIterator it, ref Vp8ModeScore rd, Vp8Seg
160160
rdTmp.Nz = (uint)ReconstructIntra4(it, dqm, tmpLevels, src, tmpDst, mode);
161161

162162
// Compute RD-score.
163-
rdTmp.D = LossyUtils.Vp8Sse4X4(src, tmpDst);
163+
rdTmp.D = LossyUtils.Vp8_Sse4X4(src, tmpDst);
164164
rdTmp.SD = tlambda != 0 ? Mult8B(tlambda, LossyUtils.Vp8Disto4X4(src, tmpDst, WeightY, scratch)) : 0;
165165
rdTmp.H = modeCosts[mode];
166166

@@ -251,7 +251,7 @@ public static void PickBestUv(Vp8EncIterator it, ref Vp8ModeScore rd, Vp8Segment
251251
rdUv.Nz = (uint)ReconstructUv(it, dqm, rdUv, tmpDst, mode);
252252

253253
// Compute RD-score
254-
rdUv.D = LossyUtils.Vp8Sse16X8(src, tmpDst);
254+
rdUv.D = LossyUtils.Vp8_Sse16X8(src, tmpDst);
255255
rdUv.SD = 0; // not calling TDisto here: it tends to flatten areas.
256256
rdUv.H = WebpConstants.Vp8FixedCostsUv[mode];
257257
rdUv.R = it.GetCostUv(rdUv, proba, res);
@@ -340,8 +340,6 @@ public static int ReconstructIntra4(Vp8EncIterator it, Vp8SegmentInfo dqm, Span<
340340
Span<byte> reference = it.YuvP.AsSpan(Vp8Encoding.Vp8I4ModeOffsets[mode]);
341341
Span<short> tmp = it.Scratch2.AsSpan(0, 16);
342342
Span<int> scratch = it.Scratch3.AsSpan(0, 16);
343-
tmp.Clear();
344-
scratch.Clear();
345343
Vp8Encoding.FTransform(src, reference, tmp, scratch);
346344
int nz = QuantizeBlock(tmp, levels, ref dqm.Y1);
347345
Vp8Encoding.ITransform(reference, tmp, yuvOut, false, scratch);
@@ -357,8 +355,6 @@ public static int ReconstructUv(Vp8EncIterator it, Vp8SegmentInfo dqm, Vp8ModeSc
357355
int n;
358356
Span<short> tmp = it.Scratch2.AsSpan(0, 8 * 16);
359357
Span<int> scratch = it.Scratch3.AsSpan(0, 16);
360-
tmp.Clear();
361-
scratch.Clear();
362358

363359
for (n = 0; n < 8; n += 2)
364360
{
@@ -411,7 +407,7 @@ public static void RefineUsingDistortion(Vp8EncIterator it, Vp8SegmentInfo[] seg
411407
for (mode = 0; mode < WebpConstants.NumPredModes; ++mode)
412408
{
413409
Span<byte> reference = it.YuvP.AsSpan(Vp8Encoding.Vp8I16ModeOffsets[mode]);
414-
long score = (LossyUtils.Vp8Sse16X16(src, reference) * WebpConstants.RdDistoMult) + (WebpConstants.Vp8FixedCostsI16[mode] * lambdaDi16);
410+
long score = (LossyUtils.Vp8_Sse16X16(src, reference) * WebpConstants.RdDistoMult) + (WebpConstants.Vp8FixedCostsI16[mode] * lambdaDi16);
415411

416412
if (mode > 0 && WebpConstants.Vp8FixedCostsI16[mode] > bitLimit)
417413
{
@@ -458,7 +454,7 @@ public static void RefineUsingDistortion(Vp8EncIterator it, Vp8SegmentInfo[] seg
458454
for (mode = 0; mode < WebpConstants.NumBModes; ++mode)
459455
{
460456
Span<byte> reference = it.YuvP.AsSpan(Vp8Encoding.Vp8I4ModeOffsets[mode]);
461-
long score = (LossyUtils.Vp8Sse4X4(src, reference) * WebpConstants.RdDistoMult) + (modeCosts[mode] * lambdaDi4);
457+
long score = (LossyUtils.Vp8_Sse4X4(src, reference) * WebpConstants.RdDistoMult) + (modeCosts[mode] * lambdaDi4);
462458
if (score < bestI4Score)
463459
{
464460
bestI4Mode = mode;
@@ -507,7 +503,7 @@ public static void RefineUsingDistortion(Vp8EncIterator it, Vp8SegmentInfo[] seg
507503
for (mode = 0; mode < WebpConstants.NumPredModes; ++mode)
508504
{
509505
Span<byte> reference = it.YuvP.AsSpan(Vp8Encoding.Vp8UvModeOffsets[mode]);
510-
long score = (LossyUtils.Vp8Sse16X8(src, reference) * WebpConstants.RdDistoMult) + (WebpConstants.Vp8FixedCostsUv[mode] * lambdaDuv);
506+
long score = (LossyUtils.Vp8_Sse16X8(src, reference) * WebpConstants.RdDistoMult) + (WebpConstants.Vp8FixedCostsUv[mode] * lambdaDuv);
511507
if (score < bestUvScore)
512508
{
513509
bestMode = mode;

src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,6 @@ public static void ITransformOne(Span<byte> reference, Span<short> input, Span<b
8181
{
8282
int i;
8383
Span<int> tmp = scratch.Slice(0, 16);
84-
tmp.Clear();
8584
for (i = 0; i < 4; i++)
8685
{
8786
// vertical pass.
@@ -124,7 +123,6 @@ public static void FTransform(Span<byte> src, Span<byte> reference, Span<short>
124123
{
125124
int i;
126125
Span<int> tmp = scratch.Slice(0, 16);
127-
tmp.Clear();
128126

129127
int srcIdx = 0;
130128
int refIdx = 0;
@@ -163,7 +161,6 @@ public static void FTransform(Span<byte> src, Span<byte> reference, Span<short>
163161
public static void FTransformWht(Span<short> input, Span<short> output, Span<int> scratch)
164162
{
165163
Span<int> tmp = scratch.Slice(0, 16);
166-
tmp.Clear();
167164

168165
int i;
169166
int inputIdx = 0;

src/ImageSharp/Formats/Webp/Lossy/Vp8Histogram.cs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,6 @@ public void CollectHistogram(Span<byte> reference, Span<byte> pred, int startBlo
4949
this.distribution.AsSpan().Clear();
5050
for (j = startBlock; j < endBlock; j++)
5151
{
52-
this.output.AsSpan().Clear();
5352
this.Vp8FTransform(reference.Slice(WebpLookupTables.Vp8DspScan[j]), pred.Slice(WebpLookupTables.Vp8DspScan[j]), this.output);
5453

5554
// Convert coefficients to bin.

src/ImageSharp/Formats/Webp/Lossy/Vp8ModeScore.cs

Lines changed: 5 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -97,18 +97,11 @@ public Vp8ModeScore()
9797

9898
// Zeroes the YDcLevels, YAcLevels, UvLevels, ModesI4 and Derr arrays in place.
public void Clear()
9999
{
100-
this.YDcLevels.AsSpan().Clear();
101-
this.YAcLevels.AsSpan().Clear();
102-
this.UvLevels.AsSpan().Clear();
103-
this.ModesI4.AsSpan().Clear();
104-
105-
for (int i = 0; i < 2; i++)
106-
{
107-
for (int j = 0; j < 3; j++)
108-
{
109-
this.Derr[i, j] = 0;
110-
}
111-
}
100+
Array.Clear(this.YDcLevels, 0, this.YDcLevels.Length);
101+
Array.Clear(this.YAcLevels, 0, this.YAcLevels.Length);
102+
Array.Clear(this.UvLevels, 0, this.UvLevels.Length);
103+
Array.Clear(this.ModesI4, 0, this.ModesI4.Length);
104+
// Derr is two-dimensional (indexed as [i, j]); Length counts the elements of all
// dimensions, so this single call covers what the nested loops did.
Array.Clear(this.Derr, 0, this.Derr.Length);
112105
}
113106

114107
public void InitScore()

tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,35 @@ namespace SixLabors.ImageSharp.Tests.Formats.WebP
1111
[Trait("Format", "Webp")]
1212
public class LossyUtilsTests
1313
{
14+
private static void RunVp8Sse4X4Test()
{
    // arrange
    // Rows of the compared 4x4 block start every 32 bytes; only the first four bytes of
    // each row are expected to contribute to the result.
    byte[] a =
    {
        27, 27, 28, 29, 29, 28, 27, 27, 27, 28, 28, 29, 29, 28, 28, 27, 129, 129, 129, 129, 129, 129, 129,
        129, 128, 128, 128, 128, 128, 128, 128, 128, 27, 27, 27, 27, 27, 27, 27, 27, 27, 28, 28, 29, 29, 28,
        28, 27, 129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128, 27, 27, 26,
        26, 26, 26, 27, 27, 27, 28, 28, 29, 29, 28, 28, 27, 129, 129, 129, 129, 129, 129, 129, 129, 128,
        128, 128, 128, 128, 128, 128, 128, 28, 27, 27, 26, 26, 27, 27, 28, 27, 28, 28, 29, 29, 28, 28, 27,
        129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128
    };

    byte[] b =
    {
        26, 26, 26, 26, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 204, 204, 204, 204, 204, 204, 204,
        204, 204, 204, 204, 204, 204, 204, 204, 204, 26, 26, 26, 26, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
        28, 28, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 26, 26, 26,
        26, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 204, 204, 204, 204, 204, 204, 204, 204, 204,
        204, 204, 204, 204, 204, 204, 204, 26, 26, 26, 26, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
        204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204
    };

    int expected = 27;

    // act
    int actual = LossyUtils.Vp8_Sse4X4(a, b);

    // Every compared pixel of a is >= the matching pixel of b, so the call above only sees
    // non-negative differences. A squared error is symmetric, so swapping the arguments must
    // give the same result while exercising negative differences.
    int actualSwapped = LossyUtils.Vp8_Sse4X4(b, a);

    // assert
    Assert.Equal(expected, actual);
    Assert.Equal(expected, actualSwapped);
}
42+
1443
private static void RunMean16x4Test()
1544
{
1645
// arrange
@@ -61,13 +90,22 @@ private static void RunHadamardTransformTest()
6190
Assert.Equal(expected, actual);
6291
}
6392

93+
[Fact]
94+
public void Vp8Sse4X4_Works()
{
    // Runs the shared test body directly in the current process.
    RunVp8Sse4X4Test();
}
95+
6496
[Fact]
6597
public void Mean16x4_Works() => RunMean16x4Test();
6698

6799
[Fact]
68100
public void HadamardTransform_Works() => RunHadamardTransformTest();
69101

70102
#if SUPPORTS_RUNTIME_INTRINSICS
103+
[Fact]
104+
public void Vp8Sse4X4_WithHardwareIntrinsics_Works()
{
    // Runs the shared test body through the feature runner with HwIntrinsics.AllowAll.
    FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.AllowAll);
}
105+
106+
[Fact]
107+
public void Vp8Sse4X4_WithoutHardwareIntrinsics_Works()
{
    // Runs the shared test body through the feature runner with HwIntrinsics.DisableHWIntrinsic.
    FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.DisableHWIntrinsic);
}
108+
71109
[Fact]
72110
public void Mean16x4_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.AllowAll);
73111

0 commit comments

Comments
 (0)