-
-
Notifications
You must be signed in to change notification settings - Fork 888
Add sse2 version of select #1804
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -27,6 +27,10 @@ internal static unsafe class LosslessUtils | |
|
|
||
| private const double Log2Reciprocal = 1.44269504088896338700465094007086; | ||
|
|
||
| #if SUPPORTS_RUNTIME_INTRINSICS | ||
| private static readonly Vector128<byte> Zero = Vector128.Create(0).AsByte(); | ||
| #endif | ||
|
|
||
| /// <summary> | ||
| /// Returns the exact index where array1 and array2 are different. For an index | ||
| /// inferior or equal to bestLenMatch, the return value just has to be strictly | ||
|
|
@@ -551,6 +555,7 @@ public static void PredictorInverseTransform( | |
| int mask = tileWidth - 1; | ||
| int tilesPerRow = SubSampleSize(width, transform.Bits); | ||
| int predictorModeIdxBase = (y >> transform.Bits) * tilesPerRow; | ||
| Span<short> scratch = stackalloc short[8]; | ||
| while (y < yEnd) | ||
| { | ||
| int predictorModeIdx = predictorModeIdxBase; | ||
|
|
@@ -608,7 +613,7 @@ public static void PredictorInverseTransform( | |
| PredictorAdd10(input + x, output + x - width, xEnd - x, output + x); | ||
| break; | ||
| case 11: | ||
| PredictorAdd11(input + x, output + x - width, xEnd - x, output + x); | ||
| PredictorAdd11(input + x, output + x - width, xEnd - x, output + x, scratch); | ||
| break; | ||
| case 12: | ||
| PredictorAdd12(input + x, output + x - width, xEnd - x, output + x); | ||
|
|
@@ -974,11 +979,11 @@ private static void PredictorAdd10(uint* input, uint* upper, int numberOfPixels, | |
| } | ||
|
|
||
| [MethodImpl(InliningOptions.ShortMethod)] | ||
| private static void PredictorAdd11(uint* input, uint* upper, int numberOfPixels, uint* output) | ||
| private static void PredictorAdd11(uint* input, uint* upper, int numberOfPixels, uint* output, Span<short> scratch) | ||
| { | ||
| for (int x = 0; x < numberOfPixels; x++) | ||
| { | ||
| uint pred = Predictor11(output[x - 1], upper + x); | ||
| uint pred = Predictor11(output[x - 1], upper + x, scratch); | ||
| output[x] = AddPixels(input[x], pred); | ||
| } | ||
| } | ||
|
|
@@ -1031,7 +1036,7 @@ private static void PredictorAdd13(uint* input, uint* upper, int numberOfPixels, | |
| public static uint Predictor10(uint left, uint* top) => Average4(left, top[-1], top[0], top[1]); | ||
|
|
||
| [MethodImpl(InliningOptions.ShortMethod)] | ||
| public static uint Predictor11(uint left, uint* top) => Select(top[0], left, top[-1]); | ||
| public static uint Predictor11(uint left, uint* top, Span<short> scratch) => Select(top[0], left, top[-1], scratch); | ||
|
|
||
| [MethodImpl(InliningOptions.ShortMethod)] | ||
| public static uint Predictor12(uint left, uint* top) => ClampedAddSubtractFull(left, top[0], top[-1]); | ||
|
|
@@ -1148,11 +1153,11 @@ public static void PredictorSub10(uint* input, uint* upper, int numPixels, uint* | |
| } | ||
|
|
||
| [MethodImpl(InliningOptions.ShortMethod)] | ||
| public static void PredictorSub11(uint* input, uint* upper, int numPixels, uint* output) | ||
| public static void PredictorSub11(uint* input, uint* upper, int numPixels, uint* output, Span<short> scratch) | ||
| { | ||
| for (int x = 0; x < numPixels; x++) | ||
| { | ||
| uint pred = Predictor11(input[x - 1], upper + x); | ||
| uint pred = Predictor11(input[x - 1], upper + x, scratch); | ||
| output[x] = SubPixels(input[x], pred); | ||
| } | ||
| } | ||
|
|
@@ -1240,14 +1245,43 @@ private static uint ClampedAddSubtractHalf(uint c0, uint c1, uint c2) | |
| private static Vector128<int> MkCst16(int hi, int lo) => Vector128.Create((hi << 16) | (lo & 0xffff)); | ||
| #endif | ||
|
|
||
| private static uint Select(uint a, uint b, uint c) | ||
| private static uint Select(uint a, uint b, uint c, Span<short> scratch) | ||
| { | ||
| int paMinusPb = | ||
| Sub3((int)(a >> 24), (int)(b >> 24), (int)(c >> 24)) + | ||
| Sub3((int)((a >> 16) & 0xff), (int)((b >> 16) & 0xff), (int)((c >> 16) & 0xff)) + | ||
| Sub3((int)((a >> 8) & 0xff), (int)((b >> 8) & 0xff), (int)((c >> 8) & 0xff)) + | ||
| Sub3((int)(a & 0xff), (int)(b & 0xff), (int)(c & 0xff)); | ||
| return paMinusPb <= 0 ? a : b; | ||
| #if SUPPORTS_RUNTIME_INTRINSICS | ||
| if (Sse2.IsSupported) | ||
| { | ||
| Span<short> output = scratch; | ||
| fixed (short* p = output) | ||
| { | ||
| Vector128<byte> a0 = Sse2.ConvertScalarToVector128UInt32(a).AsByte(); | ||
| Vector128<byte> b0 = Sse2.ConvertScalarToVector128UInt32(b).AsByte(); | ||
| Vector128<byte> c0 = Sse2.ConvertScalarToVector128UInt32(c).AsByte(); | ||
| Vector128<byte> ac0 = Sse2.SubtractSaturate(a0, c0); | ||
| Vector128<byte> ca0 = Sse2.SubtractSaturate(c0, a0); | ||
| Vector128<byte> bc0 = Sse2.SubtractSaturate(b0, c0); | ||
| Vector128<byte> cb0 = Sse2.SubtractSaturate(c0, b0); | ||
| Vector128<byte> ac = Sse2.Or(ac0, ca0); | ||
| Vector128<byte> bc = Sse2.Or(bc0, cb0); | ||
| Vector128<byte> pa = Sse2.UnpackLow(ac, Zero); // |a - c| | ||
| Vector128<byte> pb = Sse2.UnpackLow(bc, Zero); // |b - c| | ||
| Vector128<ushort> diff = Sse2.Subtract(pb.AsUInt16(), pa.AsUInt16()); | ||
| Sse2.Store((ushort*)p, diff); | ||
| } | ||
|
|
||
| int paMinusPb = output[0] + output[1] + output[2] + output[3]; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You can put this into the * or reverse the order to read
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ahhh, too late....
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
ah yeah, always forget about that trick, thx. Will do with a follow up PR.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'll put it (incl. the return) within the fixed block here. |
||
|
|
||
| return (paMinusPb <= 0) ? a : b; | ||
| } | ||
| else | ||
| #endif | ||
| { | ||
| int paMinusPb = | ||
| Sub3((int)(a >> 24), (int)(b >> 24), (int)(c >> 24)) + | ||
| Sub3((int)((a >> 16) & 0xff), (int)((b >> 16) & 0xff), (int)((c >> 16) & 0xff)) + | ||
| Sub3((int)((a >> 8) & 0xff), (int)((b >> 8) & 0xff), (int)((c >> 8) & 0xff)) + | ||
| Sub3((int)(a & 0xff), (int)(b & 0xff), (int)(c & 0xff)); | ||
| return paMinusPb <= 0 ? a : b; | ||
| } | ||
| } | ||
|
|
||
| [MethodImpl(InliningOptions.ShortMethod)] | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
While vector creation on pre-.NET 6 runtimes is compiled to a really bad sequence of scalar sets, an explicit
`Vector128<T>.Zero` compiles to a xor instruction, which would be a little clearer imo. Plus it might be a very tiny bit faster than a static variable read.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
thanks @br3aker
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
FYI:
`default` for the vector creates the same code as `Vector128<T>.Zero` (I prefer `Vector128.Zero`).