diff --git a/shared-infrastructure b/shared-infrastructure
index 33cb12ca77..a042aba176 160000
--- a/shared-infrastructure
+++ b/shared-infrastructure
@@ -1 +1 @@
-Subproject commit 33cb12ca77f919b44de56f344d2627cc2a108c3a
+Subproject commit a042aba176cdb840d800c6ed4cfe41a54fb7b1e3
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
index 9d49b8c45f..27bb2fc3cd 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
@@ -337,6 +337,64 @@ public nint GetLastNonZeroIndex()
}
}
+ /// <summary>
+ /// Transposes the block in place, exchanging rows with columns.
+ /// </summary>
+ /// <remarks>
+ /// Each element above the main diagonal at index <c>row * 8 + col</c> is swapped
+ /// with its mirror at <c>col * 8 + row</c>; diagonal elements are left untouched.
+ /// All 28 swaps are written out explicitly instead of being looped.
+ /// </remarks>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public void TransposeInplace()
+ {
+ // Both type arguments are spelled out: the target element type
+ // cannot be inferred from the 'ref this' argument alone.
+ ref short elemRef = ref Unsafe.As<Block8x8, short>(ref this);
+
+ // row #0
+ Swap(ref Unsafe.Add(ref elemRef, 1), ref Unsafe.Add(ref elemRef, 8));
+ Swap(ref Unsafe.Add(ref elemRef, 2), ref Unsafe.Add(ref elemRef, 16));
+ Swap(ref Unsafe.Add(ref elemRef, 3), ref Unsafe.Add(ref elemRef, 24));
+ Swap(ref Unsafe.Add(ref elemRef, 4), ref Unsafe.Add(ref elemRef, 32));
+ Swap(ref Unsafe.Add(ref elemRef, 5), ref Unsafe.Add(ref elemRef, 40));
+ Swap(ref Unsafe.Add(ref elemRef, 6), ref Unsafe.Add(ref elemRef, 48));
+ Swap(ref Unsafe.Add(ref elemRef, 7), ref Unsafe.Add(ref elemRef, 56));
+
+ // row #1
+ Swap(ref Unsafe.Add(ref elemRef, 10), ref Unsafe.Add(ref elemRef, 17));
+ Swap(ref Unsafe.Add(ref elemRef, 11), ref Unsafe.Add(ref elemRef, 25));
+ Swap(ref Unsafe.Add(ref elemRef, 12), ref Unsafe.Add(ref elemRef, 33));
+ Swap(ref Unsafe.Add(ref elemRef, 13), ref Unsafe.Add(ref elemRef, 41));
+ Swap(ref Unsafe.Add(ref elemRef, 14), ref Unsafe.Add(ref elemRef, 49));
+ Swap(ref Unsafe.Add(ref elemRef, 15), ref Unsafe.Add(ref elemRef, 57));
+
+ // row #2
+ Swap(ref Unsafe.Add(ref elemRef, 19), ref Unsafe.Add(ref elemRef, 26));
+ Swap(ref Unsafe.Add(ref elemRef, 20), ref Unsafe.Add(ref elemRef, 34));
+ Swap(ref Unsafe.Add(ref elemRef, 21), ref Unsafe.Add(ref elemRef, 42));
+ Swap(ref Unsafe.Add(ref elemRef, 22), ref Unsafe.Add(ref elemRef, 50));
+ Swap(ref Unsafe.Add(ref elemRef, 23), ref Unsafe.Add(ref elemRef, 58));
+
+ // row #3
+ Swap(ref Unsafe.Add(ref elemRef, 28), ref Unsafe.Add(ref elemRef, 35));
+ Swap(ref Unsafe.Add(ref elemRef, 29), ref Unsafe.Add(ref elemRef, 43));
+ Swap(ref Unsafe.Add(ref elemRef, 30), ref Unsafe.Add(ref elemRef, 51));
+ Swap(ref Unsafe.Add(ref elemRef, 31), ref Unsafe.Add(ref elemRef, 59));
+
+ // row #4
+ Swap(ref Unsafe.Add(ref elemRef, 37), ref Unsafe.Add(ref elemRef, 44));
+ Swap(ref Unsafe.Add(ref elemRef, 38), ref Unsafe.Add(ref elemRef, 52));
+ Swap(ref Unsafe.Add(ref elemRef, 39), ref Unsafe.Add(ref elemRef, 60));
+
+ // row #5
+ Swap(ref Unsafe.Add(ref elemRef, 46), ref Unsafe.Add(ref elemRef, 53));
+ Swap(ref Unsafe.Add(ref elemRef, 47), ref Unsafe.Add(ref elemRef, 61));
+
+ // row #6
+ Swap(ref Unsafe.Add(ref elemRef, 55), ref Unsafe.Add(ref elemRef, 62));
+
+ // Exchanges the two referenced elements through a temporary.
+ static void Swap(ref short a, ref short b)
+ {
+ short tmp = a;
+ a = b;
+ b = tmp;
+ }
+ }
+
///
/// Calculate the total sum of absolute differences of elements in 'a' and 'b'.
///
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanDecoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanDecoder.cs
index ad09b50655..6f104351c8 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanDecoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanDecoder.cs
@@ -502,7 +502,7 @@ private void DecodeBlockBaseline(
{
i += r;
s = buffer.Receive(s);
- Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[i++]) = (short)s;
+ Unsafe.Add(ref blockDataRef, ZigZag.TransposingOrder[i++]) = (short)s;
}
else
{
@@ -571,7 +571,7 @@ private void DecodeBlockProgressiveAC(ref Block8x8 block, ref HuffmanTable acTab
if (s != 0)
{
s = buffer.Receive(s);
- Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[i]) = (short)(s << low);
+ Unsafe.Add(ref blockDataRef, ZigZag.TransposingOrder[i]) = (short)(s << low);
}
else
{
@@ -647,7 +647,7 @@ private void DecodeBlockProgressiveACRefined(ref short blockDataRef, ref Huffman
do
{
- ref short coef = ref Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[k]);
+ ref short coef = ref Unsafe.Add(ref blockDataRef, ZigZag.TransposingOrder[k]);
if (coef != 0)
{
buffer.CheckBits();
@@ -673,7 +673,7 @@ private void DecodeBlockProgressiveACRefined(ref short blockDataRef, ref Huffman
if ((s != 0) && (k < 64))
{
- Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[k]) = (short)s;
+ Unsafe.Add(ref blockDataRef, ZigZag.TransposingOrder[k]) = (short)s;
}
}
}
@@ -682,7 +682,7 @@ private void DecodeBlockProgressiveACRefined(ref short blockDataRef, ref Huffman
{
for (; k <= end; k++)
{
- ref short coef = ref Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[k]);
+ ref short coef = ref Unsafe.Add(ref blockDataRef, ZigZag.TransposingOrder[k]);
if (coef != 0)
{
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs
index 085cd4a291..15f212b400 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs
@@ -18,11 +18,6 @@ internal struct JpegBlockPostProcessor
///
public Block8x8F SourceBlock;
- ///
- /// Temporal block to store intermediate computation results.
- ///
- public Block8x8F WorkspaceBlock;
-
///
/// The quantization table as <see cref="Block8x8F"/>.
///
@@ -45,7 +40,6 @@ public JpegBlockPostProcessor(IRawJpegData decoder, IJpegComponent component)
this.subSamplingDivisors = component.SubSamplingDivisors;
this.SourceBlock = default;
- this.WorkspaceBlock = default;
}
///
@@ -71,7 +65,7 @@ public void ProcessBlockColorsInto(
// Dequantize:
block.MultiplyInPlace(ref this.DequantiazationTable);
- FastFloatingPointDCT.TransformIDCT(ref block, ref this.WorkspaceBlock);
+ FastFloatingPointDCT.TransformIDCT(ref block);
// To conform better to libjpeg we actually NEED TO lose precision here.
// This is because they store blocks as Int16 between all the operations.
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
index ab9462632f..94864005ec 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
@@ -2,9 +2,6 @@
// Licensed under the Apache License, Version 2.0.
#if SUPPORTS_RUNTIME_INTRINSICS
-using System.Diagnostics;
-using System.Numerics;
-using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
@@ -12,149 +9,147 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
{
internal static partial class FastFloatingPointDCT
{
-#pragma warning disable SA1310, SA1311, IDE1006 // naming rules violation warnings
+#pragma warning disable SA1310, SA1311, IDE1006 // naming rule violation warnings
private static readonly Vector256 mm256_F_0_7071 = Vector256.Create(0.707106781f);
private static readonly Vector256 mm256_F_0_3826 = Vector256.Create(0.382683433f);
private static readonly Vector256 mm256_F_0_5411 = Vector256.Create(0.541196100f);
private static readonly Vector256 mm256_F_1_3065 = Vector256.Create(1.306562965f);
- private static readonly Vector256 mm256_F_1_1758 = Vector256.Create(1.175876f);
- private static readonly Vector256 mm256_F_n1_9615 = Vector256.Create(-1.961570560f);
- private static readonly Vector256 mm256_F_n0_3901 = Vector256.Create(-0.390180644f);
- private static readonly Vector256 mm256_F_n0_8999 = Vector256.Create(-0.899976223f);
- private static readonly Vector256 mm256_F_n2_5629 = Vector256.Create(-2.562915447f);
- private static readonly Vector256 mm256_F_0_2986 = Vector256.Create(0.298631336f);
- private static readonly Vector256 mm256_F_2_0531 = Vector256.Create(2.053119869f);
- private static readonly Vector256 mm256_F_3_0727 = Vector256.Create(3.072711026f);
- private static readonly Vector256 mm256_F_1_5013 = Vector256.Create(1.501321110f);
- private static readonly Vector256 mm256_F_n1_8477 = Vector256.Create(-1.847759065f);
- private static readonly Vector256 mm256_F_0_7653 = Vector256.Create(0.765366865f);
+ private static readonly Vector256 mm256_F_1_4142 = Vector256.Create(1.414213562f);
+ private static readonly Vector256 mm256_F_1_8477 = Vector256.Create(1.847759065f);
+ private static readonly Vector256 mm256_F_n1_0823 = Vector256.Create(-1.082392200f);
+ private static readonly Vector256 mm256_F_n2_6131 = Vector256.Create(-2.613125930f);
#pragma warning restore SA1310, SA1311, IDE1006
///
/// Apply floating point FDCT inplace using simd operations.
///
- /// Input matrix.
- private static void ForwardTransform_Avx(ref Block8x8F block)
+ /// Input block.
+ private static void FDCT8x8_Avx(ref Block8x8F block)
{
DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation.");
// First pass - process rows
block.TransposeInplace();
- FDCT8x8_Avx(ref block);
+ FDCT8x8_1D_Avx(ref block);
// Second pass - process columns
block.TransposeInplace();
- FDCT8x8_Avx(ref block);
+ FDCT8x8_1D_Avx(ref block);
+
+ // Applies 1D floating point FDCT inplace
+ static void FDCT8x8_1D_Avx(ref Block8x8F block)
+ {
+ Vector256 tmp0 = Avx.Add(block.V0, block.V7);
+ Vector256 tmp7 = Avx.Subtract(block.V0, block.V7);
+ Vector256 tmp1 = Avx.Add(block.V1, block.V6);
+ Vector256 tmp6 = Avx.Subtract(block.V1, block.V6);
+ Vector256 tmp2 = Avx.Add(block.V2, block.V5);
+ Vector256 tmp5 = Avx.Subtract(block.V2, block.V5);
+ Vector256 tmp3 = Avx.Add(block.V3, block.V4);
+ Vector256 tmp4 = Avx.Subtract(block.V3, block.V4);
+
+ // Even part
+ Vector256 tmp10 = Avx.Add(tmp0, tmp3);
+ Vector256 tmp13 = Avx.Subtract(tmp0, tmp3);
+ Vector256 tmp11 = Avx.Add(tmp1, tmp2);
+ Vector256 tmp12 = Avx.Subtract(tmp1, tmp2);
+
+ block.V0 = Avx.Add(tmp10, tmp11);
+ block.V4 = Avx.Subtract(tmp10, tmp11);
+
+ Vector256 z1 = Avx.Multiply(Avx.Add(tmp12, tmp13), mm256_F_0_7071);
+ block.V2 = Avx.Add(tmp13, z1);
+ block.V6 = Avx.Subtract(tmp13, z1);
+
+ // Odd part
+ tmp10 = Avx.Add(tmp4, tmp5);
+ tmp11 = Avx.Add(tmp5, tmp6);
+ tmp12 = Avx.Add(tmp6, tmp7);
+
+ Vector256 z5 = Avx.Multiply(Avx.Subtract(tmp10, tmp12), mm256_F_0_3826);
+ Vector256 z2 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, mm256_F_0_5411, tmp10);
+ Vector256 z4 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, mm256_F_1_3065, tmp12);
+ Vector256 z3 = Avx.Multiply(tmp11, mm256_F_0_7071);
+
+ Vector256 z11 = Avx.Add(tmp7, z3);
+ Vector256 z13 = Avx.Subtract(tmp7, z3);
+
+ block.V5 = Avx.Add(z13, z2);
+ block.V3 = Avx.Subtract(z13, z2);
+ block.V1 = Avx.Add(z11, z4);
+ block.V7 = Avx.Subtract(z11, z4);
+ }
}
///
- /// Apply 1D floating point FDCT inplace using AVX operations on 8x8 matrix.
+ /// Apply floating point IDCT inplace using simd operations.
///
- ///
- /// Requires Avx support.
- ///
- /// Input matrix.
- public static void FDCT8x8_Avx(ref Block8x8F block)
+ /// Transposed input block.
+ private static void IDCT8x8_Avx(ref Block8x8F transposedBlock)
{
DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation.");
- Vector256 tmp0 = Avx.Add(block.V0, block.V7);
- Vector256 tmp7 = Avx.Subtract(block.V0, block.V7);
- Vector256 tmp1 = Avx.Add(block.V1, block.V6);
- Vector256 tmp6 = Avx.Subtract(block.V1, block.V6);
- Vector256 tmp2 = Avx.Add(block.V2, block.V5);
- Vector256 tmp5 = Avx.Subtract(block.V2, block.V5);
- Vector256 tmp3 = Avx.Add(block.V3, block.V4);
- Vector256 tmp4 = Avx.Subtract(block.V3, block.V4);
-
- // Even part
- Vector256 tmp10 = Avx.Add(tmp0, tmp3);
- Vector256 tmp13 = Avx.Subtract(tmp0, tmp3);
- Vector256 tmp11 = Avx.Add(tmp1, tmp2);
- Vector256 tmp12 = Avx.Subtract(tmp1, tmp2);
-
- block.V0 = Avx.Add(tmp10, tmp11);
- block.V4 = Avx.Subtract(tmp10, tmp11);
-
- Vector256 z1 = Avx.Multiply(Avx.Add(tmp12, tmp13), mm256_F_0_7071);
- block.V2 = Avx.Add(tmp13, z1);
- block.V6 = Avx.Subtract(tmp13, z1);
-
- // Odd part
- tmp10 = Avx.Add(tmp4, tmp5);
- tmp11 = Avx.Add(tmp5, tmp6);
- tmp12 = Avx.Add(tmp6, tmp7);
-
- Vector256 z5 = Avx.Multiply(Avx.Subtract(tmp10, tmp12), mm256_F_0_3826);
- Vector256 z2 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, mm256_F_0_5411, tmp10);
- Vector256 z4 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, mm256_F_1_3065, tmp12);
- Vector256 z3 = Avx.Multiply(tmp11, mm256_F_0_7071);
-
- Vector256 z11 = Avx.Add(tmp7, z3);
- Vector256 z13 = Avx.Subtract(tmp7, z3);
-
- block.V5 = Avx.Add(z13, z2);
- block.V3 = Avx.Subtract(z13, z2);
- block.V1 = Avx.Add(z11, z4);
- block.V7 = Avx.Subtract(z11, z4);
- }
-
- ///
- /// Combined operation of and
- /// using AVX commands.
- ///
- /// Source
- /// Destination
- public static void IDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d)
- {
- Debug.Assert(Avx.IsSupported, "AVX is required to execute this method");
-
- Vector256 my1 = s.V1;
- Vector256 my7 = s.V7;
- Vector256 mz0 = Avx.Add(my1, my7);
-
- Vector256 my3 = s.V3;
- Vector256 mz2 = Avx.Add(my3, my7);
- Vector256 my5 = s.V5;
- Vector256 mz1 = Avx.Add(my3, my5);
- Vector256 mz3 = Avx.Add(my1, my5);
-
- Vector256 mz4 = Avx.Multiply(Avx.Add(mz0, mz1), mm256_F_1_1758);
-
- mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz2, mm256_F_n1_9615);
- mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz3, mm256_F_n0_3901);
- mz0 = Avx.Multiply(mz0, mm256_F_n0_8999);
- mz1 = Avx.Multiply(mz1, mm256_F_n2_5629);
-
- Vector256 mb3 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my7, mm256_F_0_2986), mz2);
- Vector256 mb2 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my5, mm256_F_2_0531), mz3);
- Vector256 mb1 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my3, mm256_F_3_0727), mz2);
- Vector256 mb0 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my1, mm256_F_1_5013), mz3);
-
- Vector256 my2 = s.V2;
- Vector256 my6 = s.V6;
- mz4 = Avx.Multiply(Avx.Add(my2, my6), mm256_F_0_5411);
- Vector256 my0 = s.V0;
- Vector256 my4 = s.V4;
- mz0 = Avx.Add(my0, my4);
- mz1 = Avx.Subtract(my0, my4);
- mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my6, mm256_F_n1_8477);
- mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my2, mm256_F_0_7653);
-
- my0 = Avx.Add(mz0, mz3);
- my3 = Avx.Subtract(mz0, mz3);
- my1 = Avx.Add(mz1, mz2);
- my2 = Avx.Subtract(mz1, mz2);
-
- d.V0 = Avx.Add(my0, mb0);
- d.V7 = Avx.Subtract(my0, mb0);
- d.V1 = Avx.Add(my1, mb1);
- d.V6 = Avx.Subtract(my1, mb1);
- d.V2 = Avx.Add(my2, mb2);
- d.V5 = Avx.Subtract(my2, mb2);
- d.V3 = Avx.Add(my3, mb3);
- d.V4 = Avx.Subtract(my3, mb3);
+ // First pass - process columns
+ IDCT8x8_1D_Avx(ref transposedBlock);
+
+ // Second pass - process rows
+ transposedBlock.TransposeInplace();
+ IDCT8x8_1D_Avx(ref transposedBlock);
+
+ // Applies 1D floating point IDCT inplace
+ static void IDCT8x8_1D_Avx(ref Block8x8F block)
+ {
+ // Even part
+ Vector256 tmp0 = block.V0;
+ Vector256 tmp1 = block.V2;
+ Vector256 tmp2 = block.V4;
+ Vector256 tmp3 = block.V6;
+
+ Vector256 z5 = tmp0;
+ Vector256 tmp10 = Avx.Add(z5, tmp2);
+ Vector256 tmp11 = Avx.Subtract(z5, tmp2);
+
+ Vector256 tmp13 = Avx.Add(tmp1, tmp3);
+ Vector256 tmp12 = SimdUtils.HwIntrinsics.MultiplySubstract(tmp13, Avx.Subtract(tmp1, tmp3), mm256_F_1_4142);
+
+ tmp0 = Avx.Add(tmp10, tmp13);
+ tmp3 = Avx.Subtract(tmp10, tmp13);
+ tmp1 = Avx.Add(tmp11, tmp12);
+ tmp2 = Avx.Subtract(tmp11, tmp12);
+
+ // Odd part
+ Vector256 tmp4 = block.V1;
+ Vector256 tmp5 = block.V3;
+ Vector256 tmp6 = block.V5;
+ Vector256 tmp7 = block.V7;
+
+ Vector256 z13 = Avx.Add(tmp6, tmp5);
+ Vector256 z10 = Avx.Subtract(tmp6, tmp5);
+ Vector256 z11 = Avx.Add(tmp4, tmp7);
+ Vector256 z12 = Avx.Subtract(tmp4, tmp7);
+
+ tmp7 = Avx.Add(z11, z13);
+ tmp11 = Avx.Multiply(Avx.Subtract(z11, z13), mm256_F_1_4142);
+
+ z5 = Avx.Multiply(Avx.Add(z10, z12), mm256_F_1_8477);
+
+ tmp10 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, z12, mm256_F_n1_0823);
+ tmp12 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, z10, mm256_F_n2_6131);
+
+ tmp6 = Avx.Subtract(tmp12, tmp7);
+ tmp5 = Avx.Subtract(tmp11, tmp6);
+ tmp4 = Avx.Subtract(tmp10, tmp5);
+
+ block.V0 = Avx.Add(tmp0, tmp7);
+ block.V7 = Avx.Subtract(tmp0, tmp7);
+ block.V1 = Avx.Add(tmp1, tmp6);
+ block.V6 = Avx.Subtract(tmp1, tmp6);
+ block.V2 = Avx.Add(tmp2, tmp5);
+ block.V5 = Avx.Subtract(tmp2, tmp5);
+ block.V3 = Avx.Add(tmp3, tmp4);
+ block.V4 = Avx.Subtract(tmp3, tmp4);
+ }
}
}
}
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
index 6963c36369..81bfe2135d 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
@@ -3,6 +3,7 @@
using System.Numerics;
using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
#if SUPPORTS_RUNTIME_INTRINSICS
using System.Runtime.Intrinsics.X86;
#endif
@@ -15,102 +16,202 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
///
internal static partial class FastFloatingPointDCT
{
-#pragma warning disable SA1310 // FieldNamesMustNotContainUnderscore
- private const float C_1_175876 = 1.175875602f;
- private const float C_1_961571 = -1.961570560f;
- private const float C_0_390181 = -0.390180644f;
- private const float C_0_899976 = -0.899976223f;
- private const float C_2_562915 = -2.562915447f;
- private const float C_0_298631 = 0.298631336f;
- private const float C_2_053120 = 2.053119869f;
- private const float C_3_072711 = 3.072711026f;
- private const float C_1_501321 = 1.501321110f;
- private const float C_0_541196 = 0.541196100f;
- private const float C_1_847759 = -1.847759065f;
- private const float C_0_765367 = 0.765366865f;
-
- private const float C_0_125 = 0.1250f;
-
-#pragma warning disable SA1311, IDE1006 // naming rules violation warnings
- private static readonly Vector4 mm128_F_0_7071 = new Vector4(0.707106781f);
- private static readonly Vector4 mm128_F_0_3826 = new Vector4(0.382683433f);
- private static readonly Vector4 mm128_F_0_5411 = new Vector4(0.541196100f);
- private static readonly Vector4 mm128_F_1_3065 = new Vector4(1.306562965f);
-#pragma warning restore SA1311, IDE1006
-
-#pragma warning restore SA1310 // FieldNamesMustNotContainUnderscore
+#pragma warning disable SA1310, SA1311, IDE1006 // naming rules violation warnings
+ private static readonly Vector4 mm128_F_0_7071 = new(0.707106781f);
+ private static readonly Vector4 mm128_F_0_3826 = new(0.382683433f);
+ private static readonly Vector4 mm128_F_0_5411 = new(0.541196100f);
+ private static readonly Vector4 mm128_F_1_3065 = new(1.306562965f);
+
+ private static readonly Vector4 mm128_F_1_4142 = new(1.414213562f);
+ private static readonly Vector4 mm128_F_1_8477 = new(1.847759065f);
+ private static readonly Vector4 mm128_F_n1_0823 = new(-1.082392200f);
+ private static readonly Vector4 mm128_F_n2_6131 = new(-2.613125930f);
+#pragma warning restore SA1310, SA1311, IDE1006
///
- /// Gets reciprocal coefficients for jpeg quantization tables calculation.
+ /// Gets adjustment table for quantization tables.
///
///
///
- /// Current FDCT implementation expects its results to be multiplied by
- /// a reciprocal quantization table. To get 8x8 reciprocal block values in this
- /// table must be divided by quantization table values scaled with quality settings.
+ /// Current IDCT and FDCT implementations are based on Arai, Agui,
+ /// and Nakajima's algorithm. Neither DCT method produces
+ /// a finished DCT output; the final scaling step is fused into the
+ /// quantization step. Quantization and de-quantization coefficients
+ /// must be adjusted by these values.
///
///
- /// These values were calculates with this formula:
- ///
- /// value[row * 8 + col] = scalefactor[row] * scalefactor[col] * 8;
- ///
- /// Where:
+ /// Given values were generated by formula:
///
+ /// scalefactor[row] * scalefactor[col], where
/// scalefactor[0] = 1
- ///
- ///
/// scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7
///
- /// Values are also scaled by 8 so DCT code won't do extra division/multiplication.
///
///
- internal static readonly float[] DctReciprocalAdjustmentCoefficients = new float[]
+ private static readonly float[] AdjustmentCoefficients = new float[]
{
- 0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f,
- 0.09011998f, 0.064972885f, 0.068974845f, 0.07664074f, 0.09011998f, 0.11470097f, 0.16652f, 0.32664075f,
- 0.09567086f, 0.068974845f, 0.07322331f, 0.081361376f, 0.09567086f, 0.121765904f, 0.17677669f, 0.34675997f,
- 0.10630376f, 0.07664074f, 0.081361376f, 0.09040392f, 0.10630376f, 0.13529903f, 0.19642374f, 0.38529903f,
- 0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f,
- 0.15909483f, 0.11470097f, 0.121765904f, 0.13529903f, 0.15909483f, 0.2024893f, 0.2939689f, 0.5766407f,
- 0.23096988f, 0.16652f, 0.17677669f, 0.19642374f, 0.23096988f, 0.2939689f, 0.4267767f, 0.8371526f,
- 0.45306373f, 0.32664075f, 0.34675997f, 0.38529903f, 0.45306373f, 0.5766407f, 0.8371526f, 1.642134f,
+ 1f, 1.3870399f, 1.306563f, 1.1758755f, 1f, 0.78569496f, 0.5411961f, 0.27589938f,
+ 1.3870399f, 1.9238797f, 1.812255f, 1.6309863f, 1.3870399f, 1.0897902f, 0.7506606f, 0.38268346f,
+ 1.306563f, 1.812255f, 1.707107f, 1.5363555f, 1.306563f, 1.02656f, 0.7071068f, 0.36047992f,
+ 1.1758755f, 1.6309863f, 1.5363555f, 1.3826833f, 1.1758755f, 0.9238795f, 0.63637924f, 0.32442334f,
+ 1f, 1.3870399f, 1.306563f, 1.1758755f, 1f, 0.78569496f, 0.5411961f, 0.27589938f,
+ 0.78569496f, 1.0897902f, 1.02656f, 0.9238795f, 0.78569496f, 0.61731654f, 0.42521507f, 0.21677275f,
+ 0.5411961f, 0.7506606f, 0.7071068f, 0.63637924f, 0.5411961f, 0.42521507f, 0.29289323f, 0.14931567f,
+ 0.27589938f, 0.38268346f, 0.36047992f, 0.32442334f, 0.27589938f, 0.21677275f, 0.14931567f, 0.076120466f,
};
///
- /// Adjusts given quantization table to be complient with FDCT implementation.
+ /// Adjusts given quantization table for usage with .
+ ///
+ /// Quantization table to adjust.
+ public static void AdjustToIDCT(ref Block8x8F quantTable)
+ {
+ ref float tableRef = ref Unsafe.As(ref quantTable);
+ ref float multipliersRef = ref MemoryMarshal.GetReference(AdjustmentCoefficients);
+ for (nint i = 0; i < Block8x8F.Size; i++)
+ {
+ tableRef = 0.125f * tableRef * Unsafe.Add(ref multipliersRef, i);
+ tableRef = ref Unsafe.Add(ref tableRef, 1);
+ }
+
+ // Spectral macroblocks are transposed before quantization
+ // so we must transpose quantization table
+ quantTable.TransposeInplace();
+ }
+
+ /// <summary>
+ /// Adjusts given quantization table for usage with <see cref="TransformFDCT"/>.
+ /// </summary>
+ /// <remarks>
+ /// Every entry is replaced by <c>0.125f / (entry * AdjustmentCoefficients[i])</c>,
+ /// i.e. the table is stored in reciprocal form. Unlike the IDCT adjustment,
+ /// the table is not transposed.
+ /// </remarks>
+ /// <param name="quantTable">Quantization table to adjust.</param>
+ public static void AdjustToFDCT(ref Block8x8F quantTable)
+ {
+ // Type arguments are explicit: neither the float view of the block nor the
+ // element type of the array-to-span conversion can be inferred by the compiler.
+ ref float tableRef = ref Unsafe.As<Block8x8F, float>(ref quantTable);
+ ref float multipliersRef = ref MemoryMarshal.GetReference<float>(AdjustmentCoefficients);
+ for (nint i = 0; i < Block8x8F.Size; i++)
+ {
+ // Store the scaled reciprocal, then advance the table reference to the next element.
+ tableRef = 0.125f / (tableRef * Unsafe.Add(ref multipliersRef, i));
+ tableRef = ref Unsafe.Add(ref tableRef, 1);
+ }
+ }
+
+ ///
+ /// Apply 2D floating point IDCT inplace.
///
///
- /// See docs for explanation.
+ /// Input block must be dequantized before this method with table
+ /// adjusted by <see cref="AdjustToIDCT"/>.
///
- /// Quantization table to adjust.
- public static void AdjustToFDCT(ref Block8x8F quantizationtable)
+ /// Input block.
+ public static void TransformIDCT(ref Block8x8F block)
{
- for (int i = 0; i < Block8x8F.Size; i++)
+#if SUPPORTS_RUNTIME_INTRINSICS
+ if (Avx.IsSupported)
{
- quantizationtable[i] = DctReciprocalAdjustmentCoefficients[i] / quantizationtable[i];
+ IDCT8x8_Avx(ref block);
+ }
+ else
+#endif
+ {
+ IDCT_Vector4(ref block);
}
}
///
- /// Apply 2D floating point FDCT inplace.
+ /// Apply 2D floating point FDCT inplace.
///
- /// Input matrix.
+ ///
+ /// The resulting block must be quantized after this method with a table adjusted
+ /// by <see cref="AdjustToFDCT"/>.
+ ///
+ /// Input block.
public static void TransformFDCT(ref Block8x8F block)
{
#if SUPPORTS_RUNTIME_INTRINSICS
if (Avx.IsSupported)
{
- ForwardTransform_Avx(ref block);
+ FDCT8x8_Avx(ref block);
}
else
#endif
if (Vector.IsHardwareAccelerated)
{
- ForwardTransform_Vector4(ref block);
+ FDCT_Vector4(ref block);
}
else
{
- ForwardTransform_Scalar(ref block);
+ FDCT_Scalar(ref block);
+ }
+ }
+
+ /// <summary>
+ /// Apply floating point IDCT inplace using <see cref="Vector4"/> API.
+ /// </summary>
+ /// <param name="transposedBlock">Input block; presumably already transposed, as the parameter name suggests.</param>
+ private static void IDCT_Vector4(ref Block8x8F transposedBlock)
+ {
+ DebugGuard.IsTrue(Vector.IsHardwareAccelerated, "Scalar implementation should be called for non-accelerated hardware.");
+
+ // First pass - process columns (one call per 8x4 half, starting at V0L and V0R)
+ IDCT8x4_Vector4(ref transposedBlock.V0L);
+ IDCT8x4_Vector4(ref transposedBlock.V0R);
+
+ // Second pass - process rows (the block is transposed first so the same 1D routine can be reused)
+ transposedBlock.TransposeInplace();
+ IDCT8x4_Vector4(ref transposedBlock.V0L);
+ IDCT8x4_Vector4(ref transposedBlock.V0R);
+
+ // Applies 1D floating point IDCT inplace on 8x4 part of 8x8 block; input k is read from and written to vecRef + k * 2
+ static void IDCT8x4_Vector4(ref Vector4 vecRef)
+ {
+ // Even part (inputs 0, 2, 4, 6)
+ Vector4 tmp0 = Unsafe.Add(ref vecRef, 0 * 2);
+ Vector4 tmp1 = Unsafe.Add(ref vecRef, 2 * 2);
+ Vector4 tmp2 = Unsafe.Add(ref vecRef, 4 * 2);
+ Vector4 tmp3 = Unsafe.Add(ref vecRef, 6 * 2);
+
+ Vector4 z5 = tmp0;
+ Vector4 tmp10 = z5 + tmp2;
+ Vector4 tmp11 = z5 - tmp2;
+
+ Vector4 tmp13 = tmp1 + tmp3;
+ Vector4 tmp12 = ((tmp1 - tmp3) * mm128_F_1_4142) - tmp13;
+
+ tmp0 = tmp10 + tmp13;
+ tmp3 = tmp10 - tmp13;
+ tmp1 = tmp11 + tmp12;
+ tmp2 = tmp11 - tmp12;
+
+ // Odd part (inputs 1, 3, 5, 7)
+ Vector4 tmp4 = Unsafe.Add(ref vecRef, 1 * 2);
+ Vector4 tmp5 = Unsafe.Add(ref vecRef, 3 * 2);
+ Vector4 tmp6 = Unsafe.Add(ref vecRef, 5 * 2);
+ Vector4 tmp7 = Unsafe.Add(ref vecRef, 7 * 2);
+
+ Vector4 z13 = tmp6 + tmp5;
+ Vector4 z10 = tmp6 - tmp5;
+ Vector4 z11 = tmp4 + tmp7;
+ Vector4 z12 = tmp4 - tmp7;
+
+ tmp7 = z11 + z13;
+ tmp11 = (z11 - z13) * mm128_F_1_4142;
+
+ z5 = (z10 + z12) * mm128_F_1_8477;
+
+ tmp10 = (z12 * mm128_F_n1_0823) + z5;
+ tmp12 = (z10 * mm128_F_n2_6131) + z5;
+
+ tmp6 = tmp12 - tmp7;
+ tmp5 = tmp11 - tmp6;
+ tmp4 = tmp10 - tmp5;
+
+ Unsafe.Add(ref vecRef, 0 * 2) = tmp0 + tmp7;
+ Unsafe.Add(ref vecRef, 7 * 2) = tmp0 - tmp7;
+ Unsafe.Add(ref vecRef, 1 * 2) = tmp1 + tmp6;
+ Unsafe.Add(ref vecRef, 6 * 2) = tmp1 - tmp6;
+ Unsafe.Add(ref vecRef, 2 * 2) = tmp2 + tmp5;
+ Unsafe.Add(ref vecRef, 5 * 2) = tmp2 - tmp5;
+ Unsafe.Add(ref vecRef, 3 * 2) = tmp3 + tmp4;
+ Unsafe.Add(ref vecRef, 4 * 2) = tmp3 - tmp4;
}
}
@@ -120,8 +221,8 @@ public static void TransformFDCT(ref Block8x8F block)
///
/// Ported from libjpeg-turbo https://github.com/libjpeg-turbo/libjpeg-turbo/blob/main/jfdctflt.c.
///
- /// Input matrix.
- private static void ForwardTransform_Scalar(ref Block8x8F block)
+ /// Input block.
+ private static void FDCT_Scalar(ref Block8x8F block)
{
const int dctSize = 8;
@@ -130,17 +231,17 @@ private static void ForwardTransform_Scalar(ref Block8x8F block)
float z1, z2, z3, z4, z5, z11, z13;
// First pass - process rows
- ref float dataRef = ref Unsafe.As(ref block);
+ ref float blockRef = ref Unsafe.As(ref block);
for (int ctr = 7; ctr >= 0; ctr--)
{
- tmp0 = Unsafe.Add(ref dataRef, 0) + Unsafe.Add(ref dataRef, 7);
- tmp7 = Unsafe.Add(ref dataRef, 0) - Unsafe.Add(ref dataRef, 7);
- tmp1 = Unsafe.Add(ref dataRef, 1) + Unsafe.Add(ref dataRef, 6);
- tmp6 = Unsafe.Add(ref dataRef, 1) - Unsafe.Add(ref dataRef, 6);
- tmp2 = Unsafe.Add(ref dataRef, 2) + Unsafe.Add(ref dataRef, 5);
- tmp5 = Unsafe.Add(ref dataRef, 2) - Unsafe.Add(ref dataRef, 5);
- tmp3 = Unsafe.Add(ref dataRef, 3) + Unsafe.Add(ref dataRef, 4);
- tmp4 = Unsafe.Add(ref dataRef, 3) - Unsafe.Add(ref dataRef, 4);
+ tmp0 = Unsafe.Add(ref blockRef, 0) + Unsafe.Add(ref blockRef, 7);
+ tmp7 = Unsafe.Add(ref blockRef, 0) - Unsafe.Add(ref blockRef, 7);
+ tmp1 = Unsafe.Add(ref blockRef, 1) + Unsafe.Add(ref blockRef, 6);
+ tmp6 = Unsafe.Add(ref blockRef, 1) - Unsafe.Add(ref blockRef, 6);
+ tmp2 = Unsafe.Add(ref blockRef, 2) + Unsafe.Add(ref blockRef, 5);
+ tmp5 = Unsafe.Add(ref blockRef, 2) - Unsafe.Add(ref blockRef, 5);
+ tmp3 = Unsafe.Add(ref blockRef, 3) + Unsafe.Add(ref blockRef, 4);
+ tmp4 = Unsafe.Add(ref blockRef, 3) - Unsafe.Add(ref blockRef, 4);
// Even part
tmp10 = tmp0 + tmp3;
@@ -148,12 +249,12 @@ private static void ForwardTransform_Scalar(ref Block8x8F block)
tmp11 = tmp1 + tmp2;
tmp12 = tmp1 - tmp2;
- Unsafe.Add(ref dataRef, 0) = tmp10 + tmp11;
- Unsafe.Add(ref dataRef, 4) = tmp10 - tmp11;
+ Unsafe.Add(ref blockRef, 0) = tmp10 + tmp11;
+ Unsafe.Add(ref blockRef, 4) = tmp10 - tmp11;
z1 = (tmp12 + tmp13) * 0.707106781f;
- Unsafe.Add(ref dataRef, 2) = tmp13 + z1;
- Unsafe.Add(ref dataRef, 6) = tmp13 - z1;
+ Unsafe.Add(ref blockRef, 2) = tmp13 + z1;
+ Unsafe.Add(ref blockRef, 6) = tmp13 - z1;
// Odd part
tmp10 = tmp4 + tmp5;
@@ -168,26 +269,26 @@ private static void ForwardTransform_Scalar(ref Block8x8F block)
z11 = tmp7 + z3;
z13 = tmp7 - z3;
- Unsafe.Add(ref dataRef, 5) = z13 + z2;
- Unsafe.Add(ref dataRef, 3) = z13 - z2;
- Unsafe.Add(ref dataRef, 1) = z11 + z4;
- Unsafe.Add(ref dataRef, 7) = z11 - z4;
+ Unsafe.Add(ref blockRef, 5) = z13 + z2;
+ Unsafe.Add(ref blockRef, 3) = z13 - z2;
+ Unsafe.Add(ref blockRef, 1) = z11 + z4;
+ Unsafe.Add(ref blockRef, 7) = z11 - z4;
- dataRef = ref Unsafe.Add(ref dataRef, dctSize);
+ blockRef = ref Unsafe.Add(ref blockRef, dctSize);
}
// Second pass - process columns
- dataRef = ref Unsafe.As(ref block);
+ blockRef = ref Unsafe.As(ref block);
for (int ctr = 7; ctr >= 0; ctr--)
{
- tmp0 = Unsafe.Add(ref dataRef, dctSize * 0) + Unsafe.Add(ref dataRef, dctSize * 7);
- tmp7 = Unsafe.Add(ref dataRef, dctSize * 0) - Unsafe.Add(ref dataRef, dctSize * 7);
- tmp1 = Unsafe.Add(ref dataRef, dctSize * 1) + Unsafe.Add(ref dataRef, dctSize * 6);
- tmp6 = Unsafe.Add(ref dataRef, dctSize * 1) - Unsafe.Add(ref dataRef, dctSize * 6);
- tmp2 = Unsafe.Add(ref dataRef, dctSize * 2) + Unsafe.Add(ref dataRef, dctSize * 5);
- tmp5 = Unsafe.Add(ref dataRef, dctSize * 2) - Unsafe.Add(ref dataRef, dctSize * 5);
- tmp3 = Unsafe.Add(ref dataRef, dctSize * 3) + Unsafe.Add(ref dataRef, dctSize * 4);
- tmp4 = Unsafe.Add(ref dataRef, dctSize * 3) - Unsafe.Add(ref dataRef, dctSize * 4);
+ tmp0 = Unsafe.Add(ref blockRef, dctSize * 0) + Unsafe.Add(ref blockRef, dctSize * 7);
+ tmp7 = Unsafe.Add(ref blockRef, dctSize * 0) - Unsafe.Add(ref blockRef, dctSize * 7);
+ tmp1 = Unsafe.Add(ref blockRef, dctSize * 1) + Unsafe.Add(ref blockRef, dctSize * 6);
+ tmp6 = Unsafe.Add(ref blockRef, dctSize * 1) - Unsafe.Add(ref blockRef, dctSize * 6);
+ tmp2 = Unsafe.Add(ref blockRef, dctSize * 2) + Unsafe.Add(ref blockRef, dctSize * 5);
+ tmp5 = Unsafe.Add(ref blockRef, dctSize * 2) - Unsafe.Add(ref blockRef, dctSize * 5);
+ tmp3 = Unsafe.Add(ref blockRef, dctSize * 3) + Unsafe.Add(ref blockRef, dctSize * 4);
+ tmp4 = Unsafe.Add(ref blockRef, dctSize * 3) - Unsafe.Add(ref blockRef, dctSize * 4);
// Even part
tmp10 = tmp0 + tmp3;
@@ -195,12 +296,12 @@ private static void ForwardTransform_Scalar(ref Block8x8F block)
tmp11 = tmp1 + tmp2;
tmp12 = tmp1 - tmp2;
- Unsafe.Add(ref dataRef, dctSize * 0) = tmp10 + tmp11;
- Unsafe.Add(ref dataRef, dctSize * 4) = tmp10 - tmp11;
+ Unsafe.Add(ref blockRef, dctSize * 0) = tmp10 + tmp11;
+ Unsafe.Add(ref blockRef, dctSize * 4) = tmp10 - tmp11;
z1 = (tmp12 + tmp13) * 0.707106781f;
- Unsafe.Add(ref dataRef, dctSize * 2) = tmp13 + z1;
- Unsafe.Add(ref dataRef, dctSize * 6) = tmp13 - z1;
+ Unsafe.Add(ref blockRef, dctSize * 2) = tmp13 + z1;
+ Unsafe.Add(ref blockRef, dctSize * 6) = tmp13 - z1;
// Odd part
tmp10 = tmp4 + tmp5;
@@ -215,12 +316,12 @@ private static void ForwardTransform_Scalar(ref Block8x8F block)
z11 = tmp7 + z3;
z13 = tmp7 - z3;
- Unsafe.Add(ref dataRef, dctSize * 5) = z13 + z2;
- Unsafe.Add(ref dataRef, dctSize * 3) = z13 - z2;
- Unsafe.Add(ref dataRef, dctSize * 1) = z11 + z4;
- Unsafe.Add(ref dataRef, dctSize * 7) = z11 - z4;
+ Unsafe.Add(ref blockRef, dctSize * 5) = z13 + z2;
+ Unsafe.Add(ref blockRef, dctSize * 3) = z13 - z2;
+ Unsafe.Add(ref blockRef, dctSize * 1) = z11 + z4;
+ Unsafe.Add(ref blockRef, dctSize * 7) = z11 - z4;
- dataRef = ref Unsafe.Add(ref dataRef, 1);
+ blockRef = ref Unsafe.Add(ref blockRef, 1);
}
}
@@ -230,11 +331,11 @@ private static void ForwardTransform_Scalar(ref Block8x8F block)
///
/// This implementation must be called only if hardware supports 4
/// floating point numbers vector. Otherwise explicit scalar
- /// implementation is faster
- /// because it does not rely on matrix transposition.
+ /// implementation is faster
+ /// because it does not rely on block transposition.
///
- /// Input matrix.
- private static void ForwardTransform_Vector4(ref Block8x8F block)
+ /// <param name="block">Input block.</param>
+ public static void FDCT_Vector4(ref Block8x8F block)
{
DebugGuard.IsTrue(Vector.IsHardwareAccelerated, "Scalar implementation should be called for non-accelerated hardware.");
@@ -247,209 +348,50 @@ private static void ForwardTransform_Vector4(ref Block8x8F block)
block.TransposeInplace();
FDCT8x4_Vector4(ref block.V0L);
FDCT8x4_Vector4(ref block.V0R);
- }
- ///
- /// Apply 1D floating point FDCT inplace on 8x4 part of 8x8 matrix.
- ///
- ///
- /// Implemented using Vector4 API operations for either scalar or sse hardware implementation.
- /// Must be called on both 8x4 matrix parts for the full FDCT transform.
- ///
- /// Input reference to the first
- private static void FDCT8x4_Vector4(ref Vector4 blockRef)
- {
- Vector4 tmp0 = Unsafe.Add(ref blockRef, 0) + Unsafe.Add(ref blockRef, 14);
- Vector4 tmp7 = Unsafe.Add(ref blockRef, 0) - Unsafe.Add(ref blockRef, 14);
- Vector4 tmp1 = Unsafe.Add(ref blockRef, 2) + Unsafe.Add(ref blockRef, 12);
- Vector4 tmp6 = Unsafe.Add(ref blockRef, 2) - Unsafe.Add(ref blockRef, 12);
- Vector4 tmp2 = Unsafe.Add(ref blockRef, 4) + Unsafe.Add(ref blockRef, 10);
- Vector4 tmp5 = Unsafe.Add(ref blockRef, 4) - Unsafe.Add(ref blockRef, 10);
- Vector4 tmp3 = Unsafe.Add(ref blockRef, 6) + Unsafe.Add(ref blockRef, 8);
- Vector4 tmp4 = Unsafe.Add(ref blockRef, 6) - Unsafe.Add(ref blockRef, 8);
-
- // Even part
- Vector4 tmp10 = tmp0 + tmp3;
- Vector4 tmp13 = tmp0 - tmp3;
- Vector4 tmp11 = tmp1 + tmp2;
- Vector4 tmp12 = tmp1 - tmp2;
-
- Unsafe.Add(ref blockRef, 0) = tmp10 + tmp11;
- Unsafe.Add(ref blockRef, 8) = tmp10 - tmp11;
-
- Vector4 z1 = (tmp12 + tmp13) * mm128_F_0_7071;
- Unsafe.Add(ref blockRef, 4) = tmp13 + z1;
- Unsafe.Add(ref blockRef, 12) = tmp13 - z1;
-
- // Odd part
- tmp10 = tmp4 + tmp5;
- tmp11 = tmp5 + tmp6;
- tmp12 = tmp6 + tmp7;
-
- Vector4 z5 = (tmp10 - tmp12) * mm128_F_0_3826;
- Vector4 z2 = (mm128_F_0_5411 * tmp10) + z5;
- Vector4 z4 = (mm128_F_1_3065 * tmp12) + z5;
- Vector4 z3 = tmp11 * mm128_F_0_7071;
-
- Vector4 z11 = tmp7 + z3;
- Vector4 z13 = tmp7 - z3;
-
- Unsafe.Add(ref blockRef, 10) = z13 + z2;
- Unsafe.Add(ref blockRef, 6) = z13 - z2;
- Unsafe.Add(ref blockRef, 2) = z11 + z4;
- Unsafe.Add(ref blockRef, 14) = z11 - z4;
- }
+ // Applies 1D floating point FDCT inplace on 8x4 part of 8x8 block
+ static void FDCT8x4_Vector4(ref Vector4 vecRef)
+ {
+ Vector4 tmp0 = Unsafe.Add(ref vecRef, 0) + Unsafe.Add(ref vecRef, 14);
+ Vector4 tmp7 = Unsafe.Add(ref vecRef, 0) - Unsafe.Add(ref vecRef, 14);
+ Vector4 tmp1 = Unsafe.Add(ref vecRef, 2) + Unsafe.Add(ref vecRef, 12);
+ Vector4 tmp6 = Unsafe.Add(ref vecRef, 2) - Unsafe.Add(ref vecRef, 12);
+ Vector4 tmp2 = Unsafe.Add(ref vecRef, 4) + Unsafe.Add(ref vecRef, 10);
+ Vector4 tmp5 = Unsafe.Add(ref vecRef, 4) - Unsafe.Add(ref vecRef, 10);
+ Vector4 tmp3 = Unsafe.Add(ref vecRef, 6) + Unsafe.Add(ref vecRef, 8);
+ Vector4 tmp4 = Unsafe.Add(ref vecRef, 6) - Unsafe.Add(ref vecRef, 8);
- ///
- /// Apply floating point IDCT inplace.
- /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239.
- ///
- /// Input matrix.
- /// Matrix to store temporal results.
- public static void TransformIDCT(ref Block8x8F block, ref Block8x8F temp)
- {
- block.TransposeInplace();
- IDCT8x8(ref block, ref temp);
- temp.TransposeInplace();
- IDCT8x8(ref temp, ref block);
+ // Even part
+ Vector4 tmp10 = tmp0 + tmp3;
+ Vector4 tmp13 = tmp0 - tmp3;
+ Vector4 tmp11 = tmp1 + tmp2;
+ Vector4 tmp12 = tmp1 - tmp2;
- // TODO: This can be fused into quantization table step
- block.MultiplyInPlace(C_0_125);
- }
+ Unsafe.Add(ref vecRef, 0) = tmp10 + tmp11;
+ Unsafe.Add(ref vecRef, 8) = tmp10 - tmp11;
- ///
- /// Performs 8x8 matrix Inverse Discrete Cosine Transform
- ///
- /// Source
- /// Destination
- private static void IDCT8x8(ref Block8x8F s, ref Block8x8F d)
- {
-#if SUPPORTS_RUNTIME_INTRINSICS
- if (Avx.IsSupported)
- {
- IDCT8x8_Avx(ref s, ref d);
- }
- else
-#endif
- {
- IDCT8x4_LeftPart(ref s, ref d);
- IDCT8x4_RightPart(ref s, ref d);
- }
- }
+ Vector4 z1 = (tmp12 + tmp13) * mm128_F_0_7071;
+ Unsafe.Add(ref vecRef, 4) = tmp13 + z1;
+ Unsafe.Add(ref vecRef, 12) = tmp13 - z1;
- ///
- /// Do IDCT internal operations on the left part of the block. Original src:
- /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
- ///
- /// The source block
- /// Destination block
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- public static void IDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d)
- {
- Vector4 my1 = s.V1L;
- Vector4 my7 = s.V7L;
- Vector4 mz0 = my1 + my7;
-
- Vector4 my3 = s.V3L;
- Vector4 mz2 = my3 + my7;
- Vector4 my5 = s.V5L;
- Vector4 mz1 = my3 + my5;
- Vector4 mz3 = my1 + my5;
-
- Vector4 mz4 = (mz0 + mz1) * C_1_175876;
-
- mz2 = (mz2 * C_1_961571) + mz4;
- mz3 = (mz3 * C_0_390181) + mz4;
- mz0 = mz0 * C_0_899976;
- mz1 = mz1 * C_2_562915;
-
- Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2;
- Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3;
- Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2;
- Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3;
-
- Vector4 my2 = s.V2L;
- Vector4 my6 = s.V6L;
- mz4 = (my2 + my6) * C_0_541196;
- Vector4 my0 = s.V0L;
- Vector4 my4 = s.V4L;
- mz0 = my0 + my4;
- mz1 = my0 - my4;
-
- mz2 = mz4 + (my6 * C_1_847759);
- mz3 = mz4 + (my2 * C_0_765367);
-
- my0 = mz0 + mz3;
- my3 = mz0 - mz3;
- my1 = mz1 + mz2;
- my2 = mz1 - mz2;
-
- d.V0L = my0 + mb0;
- d.V7L = my0 - mb0;
- d.V1L = my1 + mb1;
- d.V6L = my1 - mb1;
- d.V2L = my2 + mb2;
- d.V5L = my2 - mb2;
- d.V3L = my3 + mb3;
- d.V4L = my3 - mb3;
- }
+ // Odd part
+ tmp10 = tmp4 + tmp5;
+ tmp11 = tmp5 + tmp6;
+ tmp12 = tmp6 + tmp7;
- ///
- /// Do IDCT internal operations on the right part of the block.
- /// Original src:
- /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
- ///
- /// The source block
- /// The destination block
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- public static void IDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d)
- {
- Vector4 my1 = s.V1R;
- Vector4 my7 = s.V7R;
- Vector4 mz0 = my1 + my7;
-
- Vector4 my3 = s.V3R;
- Vector4 mz2 = my3 + my7;
- Vector4 my5 = s.V5R;
- Vector4 mz1 = my3 + my5;
- Vector4 mz3 = my1 + my5;
-
- Vector4 mz4 = (mz0 + mz1) * C_1_175876;
-
- mz2 = (mz2 * C_1_961571) + mz4;
- mz3 = (mz3 * C_0_390181) + mz4;
- mz0 = mz0 * C_0_899976;
- mz1 = mz1 * C_2_562915;
-
- Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2;
- Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3;
- Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2;
- Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3;
-
- Vector4 my2 = s.V2R;
- Vector4 my6 = s.V6R;
- mz4 = (my2 + my6) * C_0_541196;
- Vector4 my0 = s.V0R;
- Vector4 my4 = s.V4R;
- mz0 = my0 + my4;
- mz1 = my0 - my4;
-
- mz2 = mz4 + (my6 * C_1_847759);
- mz3 = mz4 + (my2 * C_0_765367);
-
- my0 = mz0 + mz3;
- my3 = mz0 - mz3;
- my1 = mz1 + mz2;
- my2 = mz1 - mz2;
-
- d.V0R = my0 + mb0;
- d.V7R = my0 - mb0;
- d.V1R = my1 + mb1;
- d.V6R = my1 - mb1;
- d.V2R = my2 + mb2;
- d.V5R = my2 - mb2;
- d.V3R = my3 + mb3;
- d.V4R = my3 - mb3;
+ Vector4 z5 = (tmp10 - tmp12) * mm128_F_0_3826;
+ Vector4 z2 = (mm128_F_0_5411 * tmp10) + z5;
+ Vector4 z4 = (mm128_F_1_3065 * tmp12) + z5;
+ Vector4 z3 = tmp11 * mm128_F_0_7071;
+
+ Vector4 z11 = tmp7 + z3;
+ Vector4 z13 = tmp7 - z3;
+
+ Unsafe.Add(ref vecRef, 10) = z13 + z2;
+ Unsafe.Add(ref vecRef, 6) = z13 - z2;
+ Unsafe.Add(ref vecRef, 2) = z11 + z4;
+ Unsafe.Add(ref vecRef, 14) = z11 - z4;
+ }
}
}
}
diff --git a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.cs b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.cs
index e519a8a1dc..ab80b3ae67 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.cs
@@ -35,5 +35,34 @@ internal static partial class ZigZag
63, 63, 63, 63, 63, 63, 63, 63,
63, 63, 63, 63, 63, 63, 63, 63
};
+
+ /// <summary>
+ /// Gets a span of zig-zag ordering indices with a fused transpose step.
+ /// </summary>
+ /// <remarks>
+ /// When reading corrupted data, the Huffman decoders could attempt
+ /// to reference an entry beyond the end of this array (if the decoded
+ /// zero run length reaches past the end of the block). To prevent
+ /// wild stores without adding an inner-loop test, we put some extra
+ /// "63"s after the real entries. This will cause the extra coefficient
+ /// to be stored in location 63 of the block, not somewhere random.
+ /// The worst case would be a run-length of 15, which means we need 16
+ /// fake entries.
+ /// </remarks>
+ public static ReadOnlySpan TransposingOrder => new byte[]
+ {
+ 0, 8, 1, 2, 9, 16, 24, 17,
+ 10, 3, 4, 11, 18, 25, 32, 40,
+ 33, 26, 19, 12, 5, 6, 13, 20,
+ 27, 34, 41, 48, 56, 49, 42, 35,
+ 28, 21, 14, 7, 15, 22, 29, 36,
+ 43, 50, 57, 58, 51, 44, 37, 30,
+ 23, 31, 38, 45, 52, 59, 60, 53,
+ 46, 39, 47, 54, 61, 62, 55, 63,
+
+ // Extra entries for safety in decoder
+ 63, 63, 63, 63, 63, 63, 63, 63,
+ 63, 63, 63, 63, 63, 63, 63, 63
+ };
}
}
diff --git a/src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs b/src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs
index 9a9e5eb799..73763f4ab8 100644
--- a/src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs
+++ b/src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs
@@ -942,6 +942,9 @@ private void ProcessDefineQuantizationTablesMarker(BufferedReadStream stream, in
break;
}
}
+
+ // Adjusting table for IDCT step during decompression
+ FastFloatingPointDCT.AdjustToIDCT(ref table);
}
}
diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpeg.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpeg.cs
index cb89f90829..9665ca42d6 100644
--- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpeg.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpeg.cs
@@ -66,16 +66,17 @@ public void Cleanup()
/*
-BenchmarkDotNet=v0.13.0, OS=Windows 10.0.19042.1288 (20H2/October2020Update)
+BenchmarkDotNet=v0.13.0, OS=Windows 10.0.19042.1348 (20H2/October2020Update)
Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores
.NET SDK=6.0.100-preview.3.21202.5
[Host] : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT
DefaultJob : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT
+
| Method | Mean | Error | StdDev |
|------------------------------------ |----------:|----------:|----------:|
-| 'Baseline 4:4:4 Interleaved' | 11.781 ms | 0.0737 ms | 0.0654 ms |
-| 'Baseline 4:2:0 Interleaved' | 8.688 ms | 0.0345 ms | 0.0306 ms |
-| 'Baseline 4:0:0 (grayscale)' | 1.643 ms | 0.0092 ms | 0.0086 ms |
-| 'Progressive 4:2:0 Non-Interleaved' | 13.770 ms | 0.0928 ms | 0.0823 ms |
+| 'Baseline 4:4:4 Interleaved' | 11.127 ms | 0.0659 ms | 0.0550 ms |
+| 'Baseline 4:2:0 Interleaved' | 8.458 ms | 0.0289 ms | 0.0256 ms |
+| 'Baseline 4:0:0 (grayscale)' | 1.550 ms | 0.0050 ms | 0.0044 ms |
+| 'Progressive 4:2:0 Non-Interleaved' | 13.220 ms | 0.0449 ms | 0.0398 ms |
*/
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs
index e5dc0ba01f..8f5f10f195 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs
@@ -183,9 +183,12 @@ static void RunTest()
Assert.Equal(expected, actual);
}
+ // This method has only 2 implementations:
+ // 1. AVX
+ // 2. Scalar
FeatureTestRunner.RunWithHwIntrinsicsFeature(
RunTest,
- HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX | HwIntrinsics.DisableHWIntrinsic);
+ HwIntrinsics.AllowAll | HwIntrinsics.DisableHWIntrinsic);
}
private static float[] Create8x8ColorCropTestData()
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs
index 3737cce804..b13a196cb9 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs
@@ -276,5 +276,31 @@ static void RunTest(string seedSerialized)
seed,
HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2);
}
+
+ [Fact]
+ public void TransposeInplace()
+ {
+ static void RunTest()
+ {
+ short[] expected = Create8x8ShortData();
+ ReferenceImplementations.Transpose8x8(expected);
+
+ var block8x8 = default(Block8x8);
+ block8x8.LoadFrom(Create8x8ShortData());
+
+ block8x8.TransposeInplace();
+
+ short[] actual = new short[64];
+ block8x8.CopyTo(actual);
+
+ Assert.Equal(expected, actual);
+ }
+
+ // This method has only 1 implementation:
+ // 1. Scalar
+ FeatureTestRunner.RunWithHwIntrinsicsFeature(
+ RunTest,
+ HwIntrinsics.DisableHWIntrinsic);
+ }
}
}
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
index 0a49d20cd4..85f30d28d7 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
@@ -2,9 +2,6 @@
// Licensed under the Apache License, Version 2.0.
using System;
-#if SUPPORTS_RUNTIME_INTRINSICS
-using System.Runtime.Intrinsics.X86;
-#endif
using SixLabors.ImageSharp.Formats.Jpeg.Components;
using SixLabors.ImageSharp.Tests.Formats.Jpg.Utils;
using SixLabors.ImageSharp.Tests.TestUtilities;
@@ -17,6 +14,20 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
[Trait("Format", "Jpg")]
public static class DCTTests
{
+ private const int MaxAllowedValue = short.MaxValue;
+ private const int MinAllowedValue = short.MinValue;
+
+ internal static Block8x8F CreateBlockFromScalar(float value)
+ {
+ Block8x8F result = default;
+ for (int i = 0; i < Block8x8F.Size; i++)
+ {
+ result[i] = value;
+ }
+
+ return result;
+ }
+
public class FastFloatingPoint : JpegFixture
{
public FastFloatingPoint(ITestOutputHelper output)
@@ -24,130 +35,75 @@ public FastFloatingPoint(ITestOutputHelper output)
{
}
- // Reference tests
[Theory]
[InlineData(1)]
[InlineData(2)]
[InlineData(3)]
public void LLM_TransformIDCT_CompareToNonOptimized(int seed)
{
- float[] sourceArray = Create8x8RoundedRandomFloatData(-1000, 1000, seed);
+ float[] sourceArray = Create8x8RoundedRandomFloatData(MinAllowedValue, MaxAllowedValue, seed);
var srcBlock = Block8x8F.Load(sourceArray);
+ // reference
Block8x8F expected = ReferenceImplementations.LLM_FloatingPoint_DCT.TransformIDCT(ref srcBlock);
- var temp = default(Block8x8F);
- FastFloatingPointDCT.TransformIDCT(ref srcBlock, ref temp);
-
- this.CompareBlocks(expected, srcBlock, 1f);
- }
-
- [Theory]
- [InlineData(1)]
- [InlineData(2)]
- [InlineData(3)]
- public void LLM_TransformIDCT_CompareToAccurate(int seed)
- {
- float[] sourceArray = Create8x8RoundedRandomFloatData(-1000, 1000, seed);
+ // testee
+ // Part of the IDCT calculations is fused into the quantization step
+ // We must multiply input block with adjusted no-quantization matrix
+ // before applying IDCT
+ // Dequantization using unit matrix - no values are upscaled
+ Block8x8F dequantMatrix = CreateBlockFromScalar(1);
- var srcBlock = Block8x8F.Load(sourceArray);
+ // This step is needed to apply adjusting multipliers to the input block
+ FastFloatingPointDCT.AdjustToIDCT(ref dequantMatrix);
- Block8x8F expected = ReferenceImplementations.AccurateDCT.TransformIDCT(ref srcBlock);
+ // IDCT implementation transforms blocks after transposition
+ srcBlock.TransposeInplace();
+ srcBlock.MultiplyInPlace(ref dequantMatrix);
- var temp = default(Block8x8F);
- FastFloatingPointDCT.TransformIDCT(ref srcBlock, ref temp);
+ // IDCT calculation
+ FastFloatingPointDCT.TransformIDCT(ref srcBlock);
this.CompareBlocks(expected, srcBlock, 1f);
}
- // Inverse transform
- [Theory]
- [InlineData(1)]
- [InlineData(2)]
- public void IDCT8x4_LeftPart(int seed)
- {
- Span src = Create8x8RoundedRandomFloatData(-200, 200, seed);
- var srcBlock = default(Block8x8F);
- srcBlock.LoadFrom(src);
-
- var destBlock = default(Block8x8F);
-
- var expectedDest = new float[64];
-
- // reference
- ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D8x4_32f(src, expectedDest);
-
- // testee
- FastFloatingPointDCT.IDCT8x4_LeftPart(ref srcBlock, ref destBlock);
-
- var actualDest = new float[64];
- destBlock.ScaledCopyTo(actualDest);
-
- Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
- }
-
[Theory]
[InlineData(1)]
[InlineData(2)]
- public void IDCT8x4_RightPart(int seed)
+ [InlineData(3)]
+ public void LLM_TransformIDCT_CompareToAccurate(int seed)
{
- Span src = Create8x8RoundedRandomFloatData(-200, 200, seed);
- var srcBlock = default(Block8x8F);
- srcBlock.LoadFrom(src);
+ float[] sourceArray = Create8x8RoundedRandomFloatData(MinAllowedValue, MaxAllowedValue, seed);
- var destBlock = default(Block8x8F);
-
- var expectedDest = new float[64];
+ var srcBlock = Block8x8F.Load(sourceArray);
// reference
- ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D8x4_32f(src.Slice(4), expectedDest.AsSpan(4));
+ Block8x8F expected = ReferenceImplementations.AccurateDCT.TransformIDCT(ref srcBlock);
// testee
- FastFloatingPointDCT.IDCT8x4_RightPart(ref srcBlock, ref destBlock);
-
- var actualDest = new float[64];
- destBlock.ScaledCopyTo(actualDest);
-
- Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
- }
-
- [Theory]
- [InlineData(1)]
- [InlineData(2)]
- public void IDCT8x8_Avx(int seed)
- {
-#if SUPPORTS_RUNTIME_INTRINSICS
- if (!Avx.IsSupported)
- {
- this.Output.WriteLine("No AVX present, skipping test!");
- return;
- }
-
- Span src = Create8x8RoundedRandomFloatData(-200, 200, seed);
- Block8x8F srcBlock = default;
- srcBlock.LoadFrom(src);
+ // Part of the IDCT calculations is fused into the quantization step
+ // We must multiply input block with adjusted no-quantization matrix
+ // before applying IDCT
+ // Dequantization using unit matrix - no values are upscaled
+ Block8x8F dequantMatrix = CreateBlockFromScalar(1);
- Block8x8F destBlock = default;
+ // This step is needed to apply adjusting multipliers to the input block
+ FastFloatingPointDCT.AdjustToIDCT(ref dequantMatrix);
- float[] expectedDest = new float[64];
+ // IDCT implementation transforms blocks after transposition
+ srcBlock.TransposeInplace();
+ srcBlock.MultiplyInPlace(ref dequantMatrix);
- // reference, left part
- ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D8x4_32f(src, expectedDest);
+ // IDCT calculation
+ FastFloatingPointDCT.TransformIDCT(ref srcBlock);
- // reference, right part
- ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D8x4_32f(src.Slice(4), expectedDest.AsSpan(4));
-
- // testee, whole 8x8
- FastFloatingPointDCT.IDCT8x8_Avx(ref srcBlock, ref destBlock);
-
- float[] actualDest = new float[64];
- destBlock.ScaledCopyTo(actualDest);
-
- Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
-#endif
+ this.CompareBlocks(expected, srcBlock, 1f);
}
+ // Inverse transform
+ // This test covers entire IDCT conversion chain
+ // This test checks all hardware implementations
[Theory]
[InlineData(1)]
[InlineData(2)]
@@ -157,41 +113,53 @@ static void RunTest(string serialized)
{
int seed = FeatureTestRunner.Deserialize(serialized);
- Span src = Create8x8RoundedRandomFloatData(-200, 200, seed);
+ Span src = Create8x8RoundedRandomFloatData(MinAllowedValue, MaxAllowedValue, seed);
var srcBlock = default(Block8x8F);
srcBlock.LoadFrom(src);
- var expectedDest = new float[64];
- var temp1 = new float[64];
- var temp2 = default(Block8x8F);
+ float[] expectedDest = new float[64];
+ float[] temp = new float[64];
// reference
- ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D_llm(src, expectedDest, temp1);
+ ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D_llm(src, expectedDest, temp);
// testee
- FastFloatingPointDCT.TransformIDCT(ref srcBlock, ref temp2);
+ // Part of the IDCT calculations is fused into the quantization step
+ // We must multiply input block with adjusted no-quantization matrix
+ // before applying IDCT
+ Block8x8F dequantMatrix = CreateBlockFromScalar(1);
+
+ // Dequantization using unit matrix - no values are upscaled
+ // as quant matrix is all 1's
+ // This step is needed to apply adjusting multipliers to the input block
+ FastFloatingPointDCT.AdjustToIDCT(ref dequantMatrix);
+ srcBlock.MultiplyInPlace(ref dequantMatrix);
+
+ // IDCT implementation transforms blocks after transposition
+ srcBlock.TransposeInplace();
- var actualDest = new float[64];
- srcBlock.ScaledCopyTo(actualDest);
+ // IDCT calculation
+ FastFloatingPointDCT.TransformIDCT(ref srcBlock);
+
+ float[] actualDest = srcBlock.ToArray();
Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
}
- // 3 paths:
+ // 4 paths:
// 1. AllowAll - call avx/fma implementation
- // 2. DisableFMA - call avx implementation without fma acceleration
- // 3. DisableAvx - call fallback code of Vector4 implementation
- //
- // DisableSSE isn't needed because fallback Vector4 code will compile to either sse or fallback code with same result
+ // 2. DisableFMA - call avx without fma implementation
+ // 3. DisableAvx - call sse Vector4 implementation
+ // 4. DisableHWIntrinsic - call scalar fallback implementation
FeatureTestRunner.RunWithHwIntrinsicsFeature(
RunTest,
seed,
- HwIntrinsics.AllowAll | HwIntrinsics.DisableFMA | HwIntrinsics.DisableAVX);
+ HwIntrinsics.AllowAll | HwIntrinsics.DisableFMA | HwIntrinsics.DisableAVX | HwIntrinsics.DisableHWIntrinsic);
}
// Forward transform
- // This test covers entire FDCT conversions chain
- // This test checks all implementations: intrinsic and scalar fallback
+ // This test covers entire FDCT conversion chain
+ // This test checks all hardware implementations
[Theory]
[InlineData(1)]
[InlineData(2)]
@@ -201,7 +169,7 @@ static void RunTest(string serialized)
{
int seed = FeatureTestRunner.Deserialize(serialized);
- Span src = Create8x8RoundedRandomFloatData(-200, 200, seed);
+ Span src = Create8x8RoundedRandomFloatData(MinAllowedValue, MaxAllowedValue, seed);
var block = default(Block8x8F);
block.LoadFrom(src);
@@ -212,23 +180,24 @@ static void RunTest(string serialized)
ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D_llm(src, expectedDest, temp1, downscaleBy8: true);
// testee
- // Part of the FDCT calculations is fused into the quantization step
- // We must multiply transformed block with reciprocal values from FastFloatingPointDCT.ANN_DCT_reciprocalAdjustmen
FastFloatingPointDCT.TransformFDCT(ref block);
- for (int i = 0; i < 64; i++)
- {
- block[i] = block[i] * FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i];
- }
+
+ // Part of the FDCT calculations is fused into the quantization step
+ // We must multiply transformed block with adjusted no-quantization matrix
+ // after applying FDCT
+ Block8x8F quantMatrix = CreateBlockFromScalar(1);
+ FastFloatingPointDCT.AdjustToFDCT(ref quantMatrix);
+ block.MultiplyInPlace(ref quantMatrix);
float[] actualDest = block.ToArray();
Assert.Equal(expectedDest, actualDest, new ApproximateFloatComparer(1f));
}
- // 3 paths:
+ // 4 paths:
// 1. AllowAll - call avx/fma implementation
- // 2. DisableFMA - call avx implementation without fma acceleration
- // 3. DisableAvx - call sse implementation
+ // 2. DisableFMA - call avx without fma implementation
+ // 3. DisableAvx - call sse Vector4 implementation
// 4. DisableHWIntrinsic - call scalar fallback implementation
FeatureTestRunner.RunWithHwIntrinsicsFeature(
RunTest,
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Utils/JpegFixture.cs b/tests/ImageSharp.Tests/Formats/Jpg/Utils/JpegFixture.cs
index a76b2bf2ef..1bdfc6ecad 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/Utils/JpegFixture.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/Utils/JpegFixture.cs
@@ -172,7 +172,7 @@ internal void CompareBlocks(Span a, Span b, float tolerance)
bool failed = false;
- for (int i = 0; i < 64; i++)
+ for (int i = 0; i < Block8x8F.Size; i++)
{
float expected = a[i];
float actual = b[i];
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs b/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs
index adbd695c0e..5c00b39af8 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs
@@ -48,6 +48,12 @@ public ComponentData(int widthInBlocks, int heightInBlocks, int index)
public short MaxVal { get; private set; } = short.MinValue;
+ internal void MakeBlock(Block8x8 block, int y, int x)
+ {
+ block.TransposeInplace();
+ this.MakeBlock(block.ToArray(), y, x);
+ }
+
internal void MakeBlock(short[] data, int y, int x)
{
this.MinVal = Math.Min(this.MinVal, data.Min());
@@ -66,11 +72,7 @@ public void LoadSpectralStride(Buffer2D data, int strideIndex)
Span blockRow = data.GetRowSpan(y - startIndex);
for (int x = 0; x < this.WidthInBlocks; x++)
{
- short[] block = blockRow[x].ToArray();
-
- // x coordinate stays the same - we load entire stride
- // y coordinate is tricky as we load single stride to full buffer - offset is needed
- this.MakeBlock(block, y, x);
+ this.MakeBlock(blockRow[x], y, x);
}
}
}
@@ -83,8 +85,7 @@ public void LoadSpectral(JpegComponent c)
Span blockRow = data.GetRowSpan(y);
for (int x = 0; x < this.WidthInBlocks; x++)
{
- short[] block = blockRow[x].ToArray();
- this.MakeBlock(block, y, x);
+ this.MakeBlock(blockRow[x], y, x);
}
}
}
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Utils/ReferenceImplementations.cs b/tests/ImageSharp.Tests/Formats/Jpg/Utils/ReferenceImplementations.cs
index c9741521c6..8dc1c83d45 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/Utils/ReferenceImplementations.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/Utils/ReferenceImplementations.cs
@@ -40,6 +40,23 @@ internal static void Transpose8x8(Span data)
}
}
+ /// <summary>
+ /// Transpose 8x8 block stored linearly in a span of <see cref="short"/> values (inplace)
+ /// </summary>
+ internal static void Transpose8x8(Span data)
+ {
+ for (int i = 1; i < 8; i++)
+ {
+ int i8 = i * 8;
+ for (int j = 0; j < i; j++)
+ {
+ short tmp = data[i8 + j];
+ data[i8 + j] = data[(j * 8) + i];
+ data[(j * 8) + i] = tmp;
+ }
+ }
+ }
+
///
/// Transpose 8x8 block stored linearly in a
///
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/ZigZagTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/ZigZagTests.cs
index 39046438a8..b67ad85eea 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/ZigZagTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/ZigZagTests.cs
@@ -1,6 +1,7 @@
// Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0.
+using System;
using SixLabors.ImageSharp.Formats.Jpeg.Components;
using Xunit;
@@ -9,8 +10,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg
[Trait("Format", "Jpg")]
public class ZigZagTests
{
- [Fact]
- public void ZigZagCanHandleAllPossibleCoefficients()
+ private static void CanHandleAllPossibleCoefficients(ReadOnlySpan order)
{
// Mimic the behaviour of the huffman scan decoder using all possible byte values
short[] block = new short[64];
@@ -26,7 +26,7 @@ public void ZigZagCanHandleAllPossibleCoefficients()
if (s != 0)
{
i += r;
- block[ZigZag.ZigZagOrder[i++]] = (short)s;
+ block[order[i++]] = (short)s;
}
else
{
@@ -40,5 +40,13 @@ public void ZigZagCanHandleAllPossibleCoefficients()
}
}
}
+
+ [Fact]
+ public static void ZigZagCanHandleAllPossibleCoefficients() =>
+ CanHandleAllPossibleCoefficients(ZigZag.ZigZagOrder);
+
+ [Fact]
+ public static void TrasposingZigZagCanHandleAllPossibleCoefficients() =>
+ CanHandleAllPossibleCoefficients(ZigZag.TransposingOrder);
}
}