@@ -18,12 +18,13 @@ pub const Ghash = struct {
1818 pub const mac_length = 16 ;
1919 pub const key_length = 16 ;
2020
21- const pc_count = if (builtin .mode != .ReleaseSmall ) 16 else 4 ;
22- const agg_2_treshold = 5 ;
21+ const pc_count = if (builtin .mode != .ReleaseSmall ) 16 else 2 ;
2322 const agg_4_treshold = 22 ;
2423 const agg_8_treshold = 84 ;
2524 const agg_16_treshold = 328 ;
2625
26+ const mul_algorithm = if (builtin .cpu .arch == .x86 ) .karatsuba else .textbook ;
27+
2728 hx : [pc_count ]Precomp ,
2829 acc : u128 = 0 ,
2930
@@ -43,10 +44,10 @@ pub const Ghash = struct {
4344 var hx : [pc_count ]Precomp = undefined ;
4445 hx [0 ] = h ;
4546 hx [1 ] = gcmReduce (clsq128 (hx [0 ])); // h^2
46- hx [2 ] = gcmReduce (clmul128 (hx [1 ], h )); // h^3
47- hx [3 ] = gcmReduce (clsq128 (hx [1 ])); // h^4 = h^2^2
4847
4948 if (builtin .mode != .ReleaseSmall ) {
49+ hx [2 ] = gcmReduce (clmul128 (hx [1 ], h )); // h^3
50+ hx [3 ] = gcmReduce (clsq128 (hx [1 ])); // h^4 = h^2^2
5051 if (block_count >= agg_8_treshold ) {
5152 hx [4 ] = gcmReduce (clmul128 (hx [3 ], h )); // h^5
5253 hx [5 ] = gcmReduce (clsq128 (hx [2 ])); // h^6 = h^3^2
@@ -69,24 +70,32 @@ pub const Ghash = struct {
6970 return Ghash .initForBlockCount (key , math .maxInt (usize ));
7071 }
7172
72- const Selector = enum { lo , hi };
73+ const Selector = enum { lo , hi , hi_lo };
7374
7475 // Carryless multiplication of two 64-bit integers for x86_64.
7576 inline fn clmulPclmul (x : u128 , y : u128 , comptime half : Selector ) u128 {
7677 if (half == .hi ) {
7778 const product = asm (
7879 \\ vpclmulqdq $0x11, %[x], %[y], %[out]
7980 : [out ] "=x" (- > @Vector (2 , u64 )),
80- : [x ] "x" (@bitCast (@Vector (2 , u64 ), @as ( u128 , x ) )),
81- [y ] "x" (@bitCast (@Vector (2 , u64 ), @as ( u128 , y ) )),
81+ : [x ] "x" (@bitCast (@Vector (2 , u64 ), x )),
82+ [y ] "x" (@bitCast (@Vector (2 , u64 ), y )),
8283 );
8384 return @bitCast (u128 , product );
84- } else {
85+ } else if ( half == .lo ) {
8586 const product = asm (
8687 \\ vpclmulqdq $0x00, %[x], %[y], %[out]
8788 : [out ] "=x" (- > @Vector (2 , u64 )),
88- : [x ] "x" (@bitCast (@Vector (2 , u64 ), @as (u128 , x ))),
89- [y ] "x" (@bitCast (@Vector (2 , u64 ), @as (u128 , y ))),
89+ : [x ] "x" (@bitCast (@Vector (2 , u64 ), x )),
90+ [y ] "x" (@bitCast (@Vector (2 , u64 ), y )),
91+ );
92+ return @bitCast (u128 , product );
93+ } else {
94+ const product = asm (
95+ \\ vpclmulqdq $0x10, %[x], %[y], %[out]
96+ : [out ] "=x" (- > @Vector (2 , u64 )),
97+ : [x ] "x" (@bitCast (@Vector (2 , u64 ), x )),
98+ [y ] "x" (@bitCast (@Vector (2 , u64 ), y )),
9099 );
91100 return @bitCast (u128 , product );
92101 }
@@ -98,16 +107,24 @@ pub const Ghash = struct {
98107 const product = asm (
99108 \\ pmull2 %[out].1q, %[x].2d, %[y].2d
100109 : [out ] "=w" (- > @Vector (2 , u64 )),
101- : [x ] "w" (@bitCast (@Vector (2 , u64 ), @as (u128 , x ))),
102- [y ] "w" (@bitCast (@Vector (2 , u64 ), @as (u128 , y ))),
110+ : [x ] "w" (@bitCast (@Vector (2 , u64 ), x )),
111+ [y ] "w" (@bitCast (@Vector (2 , u64 ), y )),
112+ );
113+ return @bitCast (u128 , product );
114+ } else if (half == .lo ) {
115+ const product = asm (
116+ \\ pmull %[out].1q, %[x].1d, %[y].1d
117+ : [out ] "=w" (- > @Vector (2 , u64 )),
118+ : [x ] "w" (@bitCast (@Vector (2 , u64 ), x )),
119+ [y ] "w" (@bitCast (@Vector (2 , u64 ), y )),
103120 );
104121 return @bitCast (u128 , product );
105122 } else {
106123 const product = asm (
107124 \\ pmull %[out].1q, %[x].1d, %[y].1d
108125 : [out ] "=w" (- > @Vector (2 , u64 )),
109- : [x ] "w" (@bitCast (@Vector (2 , u64 ), @as ( u128 , x ) )),
110- [y ] "w" (@bitCast (@Vector (2 , u64 ), @as ( u128 , y ) )),
126+ : [x ] "w" (@bitCast (@Vector (2 , u64 ), x >> 64 )),
127+ [y ] "w" (@bitCast (@Vector (2 , u64 ), y )),
111128 );
112129 return @bitCast (u128 , product );
113130 }
@@ -144,38 +161,63 @@ pub const Ghash = struct {
144161 (z3 & 0x88888888888888888888888888888888 ) ^ extra ;
145162 }
146163
164+ const I256 = struct {
165+ hi : u128 ,
166+ lo : u128 ,
167+ mid : u128 ,
168+ };
169+
170+ inline fn xor256 (x : * I256 , y : I256 ) void {
171+ x .* = I256 {
172+ .hi = x .hi ^ y .hi ,
173+ .lo = x .lo ^ y .lo ,
174+ .mid = x .mid ^ y .mid ,
175+ };
176+ }
177+
147178 // Square a 128-bit integer in GF(2^128).
148- fn clsq128 (x : u128 ) u256 {
149- const lo = @truncate (u64 , x );
150- const hi = @truncate (u64 , x >> 64 );
151- const mid = lo ^ hi ;
152- const r_lo = clmul (x , x , .lo );
153- const r_hi = clmul (x , x , .hi );
154- const r_mid = clmul (mid , mid , .lo ) ^ r_lo ^ r_hi ;
155- return (@as (u256 , r_hi ) << 128 ) ^ (@as (u256 , r_mid ) << 64 ) ^ r_lo ;
179+ fn clsq128 (x : u128 ) I256 {
180+ return .{
181+ .hi = clmul (x , x , .hi ),
182+ .lo = clmul (x , x , .lo ),
183+ .mid = 0 ,
184+ };
156185 }
157186
158187 // Multiply two 128-bit integers in GF(2^128).
159- inline fn clmul128 (x : u128 , y : u128 ) u256 {
160- const x_hi = @truncate (u64 , x >> 64 );
161- const y_hi = @truncate (u64 , y >> 64 );
162- const r_lo = clmul (x , y , .lo );
163- const r_hi = clmul (x , y , .hi );
164- const r_mid = clmul (x ^ x_hi , y ^ y_hi , .lo ) ^ r_lo ^ r_hi ;
165- return (@as (u256 , r_hi ) << 128 ) ^ (@as (u256 , r_mid ) << 64 ) ^ r_lo ;
188+ inline fn clmul128 (x : u128 , y : u128 ) I256 {
189+ if (mul_algorithm == .karatsuba ) {
190+ const x_hi = @truncate (u64 , x >> 64 );
191+ const y_hi = @truncate (u64 , y >> 64 );
192+ const r_lo = clmul (x , y , .lo );
193+ const r_hi = clmul (x , y , .hi );
194+ const r_mid = clmul (x ^ x_hi , y ^ y_hi , .lo ) ^ r_lo ^ r_hi ;
195+ return .{
196+ .hi = r_hi ,
197+ .lo = r_lo ,
198+ .mid = r_mid ,
199+ };
200+ } else {
201+ return .{
202+ .hi = clmul (x , y , .hi ),
203+ .lo = clmul (x , y , .lo ),
204+ .mid = clmul (x , y , .hi_lo ) ^ clmul (y , x , .hi_lo ),
205+ };
206+ }
166207 }
167208
168209 // Reduce a 256-bit representative of a polynomial modulo the irreducible polynomial x^128 + x^127 + x^126 + x^121 + 1.
169210 // This is done *without reversing the bits*, using Shay Gueron's black magic demystified here:
170211 // https://blog.quarkslab.com/reversing-a-finite-field-multiplication-optimization.html
171- inline fn gcmReduce (x : u256 ) u128 {
212+ inline fn gcmReduce (x : I256 ) u128 {
213+ const hi = x .hi ^ (x .mid >> 64 );
214+ const lo = x .lo ^ (x .mid << 64 );
172215 const p64 = (((1 << 121 ) | (1 << 126 ) | (1 << 127 )) >> 64 );
173- const lo = @truncate (u128 , x );
174216 const a = clmul (lo , p64 , .lo );
175217 const b = ((lo << 64 ) | (lo >> 64 )) ^ a ;
176218 const c = clmul (b , p64 , .lo );
177219 const d = ((b << 64 ) | (b >> 64 )) ^ c ;
178- return d ^ @truncate ( u128 , x >> 128 ) ;
220+ return d ^ hi ;
179221 }
180222
181223 const has_pclmul = std .Target .x86 .featureSetHas (builtin .cpu .features , .pclmul );
@@ -202,7 +244,7 @@ pub const Ghash = struct {
202244 var u = clmul128 (acc ^ mem .readIntBig (u128 , msg [i .. ][0.. 16]), st .hx [15 - 0 ]);
203245 comptime var j = 1 ;
204246 inline while (j < 16 ) : (j += 1 ) {
205- u ^= clmul128 (mem .readIntBig (u128 , msg [i .. ][j * 16 .. ][0.. 16]), st .hx [15 - j ]);
247+ xor256 ( & u , clmul128 (mem .readIntBig (u128 , msg [i .. ][j * 16 .. ][0.. 16]), st .hx [15 - j ]) );
206248 }
207249 acc = gcmReduce (u );
208250 }
@@ -212,7 +254,7 @@ pub const Ghash = struct {
212254 var u = clmul128 (acc ^ mem .readIntBig (u128 , msg [i .. ][0.. 16]), st .hx [7 - 0 ]);
213255 comptime var j = 1 ;
214256 inline while (j < 8 ) : (j += 1 ) {
215- u ^= clmul128 (mem .readIntBig (u128 , msg [i .. ][j * 16 .. ][0.. 16]), st .hx [7 - j ]);
257+ xor256 ( & u , clmul128 (mem .readIntBig (u128 , msg [i .. ][j * 16 .. ][0.. 16]), st .hx [7 - j ]) );
216258 }
217259 acc = gcmReduce (u );
218260 }
@@ -222,31 +264,25 @@ pub const Ghash = struct {
222264 var u = clmul128 (acc ^ mem .readIntBig (u128 , msg [i .. ][0.. 16]), st .hx [3 - 0 ]);
223265 comptime var j = 1 ;
224266 inline while (j < 4 ) : (j += 1 ) {
225- u ^= clmul128 (mem .readIntBig (u128 , msg [i .. ][j * 16 .. ][0.. 16]), st .hx [3 - j ]);
267+ xor256 ( & u , clmul128 (mem .readIntBig (u128 , msg [i .. ][j * 16 .. ][0.. 16]), st .hx [3 - j ]) );
226268 }
227269 acc = gcmReduce (u );
228270 }
229- } else if (msg .len >= agg_2_treshold * block_length ) {
230- // 2-blocks aggregated reduction
231- while (i + 32 <= msg .len ) : (i += 32 ) {
232- var u = clmul128 (acc ^ mem .readIntBig (u128 , msg [i .. ][0.. 16]), st .hx [1 - 0 ]);
233- comptime var j = 1 ;
234- inline while (j < 2 ) : (j += 1 ) {
235- u ^= clmul128 (mem .readIntBig (u128 , msg [i .. ][j * 16 .. ][0.. 16]), st .hx [1 - j ]);
236- }
237- acc = gcmReduce (u );
271+ }
272+ // 2-blocks aggregated reduction
273+ while (i + 32 <= msg .len ) : (i += 32 ) {
274+ var u = clmul128 (acc ^ mem .readIntBig (u128 , msg [i .. ][0.. 16]), st .hx [1 - 0 ]);
275+ comptime var j = 1 ;
276+ inline while (j < 2 ) : (j += 1 ) {
277+ xor256 (& u , clmul128 (mem .readIntBig (u128 , msg [i .. ][j * 16 .. ][0.. 16]), st .hx [1 - j ]));
238278 }
279+ acc = gcmReduce (u );
239280 }
240281 // remaining blocks
241282 if (i < msg .len ) {
242- const n = (msg .len - i ) / 16 ;
243- var u = clmul128 (acc ^ mem .readIntBig (u128 , msg [i .. ][0.. 16]), st .hx [n - 1 - 0 ]);
244- var j : usize = 1 ;
245- while (j < n ) : (j += 1 ) {
246- u ^= clmul128 (mem .readIntBig (u128 , msg [i .. ][j * 16 .. ][0.. 16]), st .hx [n - 1 - j ]);
247- }
248- i += n * 16 ;
283+ const u = clmul128 (acc ^ mem .readIntBig (u128 , msg [i .. ][0.. 16]), st .hx [0 ]);
249284 acc = gcmReduce (u );
285+ i += 16 ;
250286 }
251287 assert (i == msg .len );
252288 st .acc = acc ;
0 commit comments