Commit 45d8998

std.crypto.onetimeauth.ghash: faster GHASH on modern CPUs
Carryless multiplication used to be slow on older Intel CPUs, which justified using Karatsuba multiplication. That is no longer the case: multiplying two 128-bit numbers with 4 carryless multiplications is now faster than 3 multiplications plus the extra shifts and additions, and the same holds on aarch64. Keep Karatsuba only when targeting x86 (admittedly a blunt shortcut; we should really list the CPU models whose clmul instruction was slow).

Also remove the useless agg_2 threshold and restore the ability to precompute only H and H^2 in ReleaseSmall.

Finally, avoid u256: staying within 128-bit registers is actually faster.
1 parent a09a5ad commit 45d8998
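To illustrate the trade-off the message describes, here is a small standalone Zig sketch (not part of this commit) comparing the two ways of splitting a 128x128 carryless multiplication into 64x64 products. clmul64 is a hypothetical software stand-in for the hardware pclmulqdq/pmull instructions that the real code uses, and the other names are made up for the example.

const std = @import("std");

// Hypothetical portable stand-in for a 64x64 -> 128-bit carryless multiply
// (the real code uses the pclmulqdq / pmull instructions instead).
fn clmul64(a: u64, b: u64) u128 {
    var r: u128 = 0;
    var i: u7 = 0;
    while (i < 64) : (i += 1) {
        if (((b >> @truncate(u6, i)) & 1) != 0) r ^= @as(u128, a) << i;
    }
    return r;
}

// Schoolbook split: 4 carryless multiplications, and the middle term comes
// out directly, with no extra shifts or additions.
fn clmul128Textbook(x: u128, y: u128) [2]u128 {
    const x_lo = @truncate(u64, x);
    const x_hi = @truncate(u64, x >> 64);
    const y_lo = @truncate(u64, y);
    const y_hi = @truncate(u64, y >> 64);
    const lo = clmul64(x_lo, y_lo);
    const hi = clmul64(x_hi, y_hi);
    const mid = clmul64(x_hi, y_lo) ^ clmul64(x_lo, y_hi);
    return .{ lo ^ (mid << 64), hi ^ (mid >> 64) };
}

// Karatsuba split: only 3 carryless multiplications, but the middle term has
// to be recovered with extra xors; this only wins when the multiply is slow.
fn clmul128Karatsuba(x: u128, y: u128) [2]u128 {
    const x_lo = @truncate(u64, x);
    const x_hi = @truncate(u64, x >> 64);
    const y_lo = @truncate(u64, y);
    const y_hi = @truncate(u64, y >> 64);
    const lo = clmul64(x_lo, y_lo);
    const hi = clmul64(x_hi, y_hi);
    const mid = clmul64(x_lo ^ x_hi, y_lo ^ y_hi) ^ lo ^ hi;
    return .{ lo ^ (mid << 64), hi ^ (mid >> 64) };
}

test "both decompositions produce the same 256-bit product" {
    const x: u128 = 0x0123456789abcdef_fedcba9876543210;
    const y: u128 = 0x0f1e2d3c4b5a6978_8796a5b4c3d2e1f0;
    try std.testing.expectEqual(clmul128Textbook(x, y), clmul128Karatsuba(x, y));
}

With a fast carryless-multiply instruction, the fourth multiplication in the schoolbook split costs less than the extra xors Karatsuba needs to rebuild the middle term, which is why this commit keeps Karatsuba only when targeting x86.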


lib/std/crypto/ghash.zig

Lines changed: 87 additions & 51 deletions
@@ -18,12 +18,13 @@ pub const Ghash = struct {
     pub const mac_length = 16;
     pub const key_length = 16;

-    const pc_count = if (builtin.mode != .ReleaseSmall) 16 else 4;
-    const agg_2_treshold = 5;
+    const pc_count = if (builtin.mode != .ReleaseSmall) 16 else 2;
     const agg_4_treshold = 22;
     const agg_8_treshold = 84;
     const agg_16_treshold = 328;

+    const mul_algorithm = if (builtin.cpu.arch == .x86) .karatsuba else .textbook;
+
     hx: [pc_count]Precomp,
     acc: u128 = 0,

@@ -43,10 +44,10 @@ pub const Ghash = struct {
         var hx: [pc_count]Precomp = undefined;
         hx[0] = h;
         hx[1] = gcmReduce(clsq128(hx[0])); // h^2
-        hx[2] = gcmReduce(clmul128(hx[1], h)); // h^3
-        hx[3] = gcmReduce(clsq128(hx[1])); // h^4 = h^2^2

         if (builtin.mode != .ReleaseSmall) {
+            hx[2] = gcmReduce(clmul128(hx[1], h)); // h^3
+            hx[3] = gcmReduce(clsq128(hx[1])); // h^4 = h^2^2
             if (block_count >= agg_8_treshold) {
                 hx[4] = gcmReduce(clmul128(hx[3], h)); // h^5
                 hx[5] = gcmReduce(clsq128(hx[2])); // h^6 = h^3^2
@@ -69,24 +70,32 @@ pub const Ghash = struct {
         return Ghash.initForBlockCount(key, math.maxInt(usize));
     }

-    const Selector = enum { lo, hi };
+    const Selector = enum { lo, hi, hi_lo };

     // Carryless multiplication of two 64-bit integers for x86_64.
     inline fn clmulPclmul(x: u128, y: u128, comptime half: Selector) u128 {
         if (half == .hi) {
             const product = asm (
                 \\ vpclmulqdq $0x11, %[x], %[y], %[out]
                 : [out] "=x" (-> @Vector(2, u64)),
-                : [x] "x" (@bitCast(@Vector(2, u64), @as(u128, x))),
-                  [y] "x" (@bitCast(@Vector(2, u64), @as(u128, y))),
+                : [x] "x" (@bitCast(@Vector(2, u64), x)),
+                  [y] "x" (@bitCast(@Vector(2, u64), y)),
             );
             return @bitCast(u128, product);
-        } else {
+        } else if (half == .lo) {
             const product = asm (
                 \\ vpclmulqdq $0x00, %[x], %[y], %[out]
                 : [out] "=x" (-> @Vector(2, u64)),
-                : [x] "x" (@bitCast(@Vector(2, u64), @as(u128, x))),
-                  [y] "x" (@bitCast(@Vector(2, u64), @as(u128, y))),
+                : [x] "x" (@bitCast(@Vector(2, u64), x)),
+                  [y] "x" (@bitCast(@Vector(2, u64), y)),
+            );
+            return @bitCast(u128, product);
+        } else {
+            const product = asm (
+                \\ vpclmulqdq $0x10, %[x], %[y], %[out]
+                : [out] "=x" (-> @Vector(2, u64)),
+                : [x] "x" (@bitCast(@Vector(2, u64), x)),
+                  [y] "x" (@bitCast(@Vector(2, u64), y)),
             );
             return @bitCast(u128, product);
         }
@@ -98,16 +107,24 @@ pub const Ghash = struct {
             const product = asm (
                 \\ pmull2 %[out].1q, %[x].2d, %[y].2d
                 : [out] "=w" (-> @Vector(2, u64)),
-                : [x] "w" (@bitCast(@Vector(2, u64), @as(u128, x))),
-                  [y] "w" (@bitCast(@Vector(2, u64), @as(u128, y))),
+                : [x] "w" (@bitCast(@Vector(2, u64), x)),
+                  [y] "w" (@bitCast(@Vector(2, u64), y)),
+            );
+            return @bitCast(u128, product);
+        } else if (half == .lo) {
+            const product = asm (
+                \\ pmull %[out].1q, %[x].1d, %[y].1d
+                : [out] "=w" (-> @Vector(2, u64)),
+                : [x] "w" (@bitCast(@Vector(2, u64), x)),
+                  [y] "w" (@bitCast(@Vector(2, u64), y)),
             );
             return @bitCast(u128, product);
         } else {
             const product = asm (
                 \\ pmull %[out].1q, %[x].1d, %[y].1d
                 : [out] "=w" (-> @Vector(2, u64)),
-                : [x] "w" (@bitCast(@Vector(2, u64), @as(u128, x))),
-                  [y] "w" (@bitCast(@Vector(2, u64), @as(u128, y))),
+                : [x] "w" (@bitCast(@Vector(2, u64), x >> 64)),
+                  [y] "w" (@bitCast(@Vector(2, u64), y)),
             );
             return @bitCast(u128, product);
         }
@@ -144,38 +161,63 @@ pub const Ghash = struct {
             (z3 & 0x88888888888888888888888888888888) ^ extra;
     }

+    const I256 = struct {
+        hi: u128,
+        lo: u128,
+        mid: u128,
+    };
+
+    inline fn xor256(x: *I256, y: I256) void {
+        x.* = I256{
+            .hi = x.hi ^ y.hi,
+            .lo = x.lo ^ y.lo,
+            .mid = x.mid ^ y.mid,
+        };
+    }
+
     // Square a 128-bit integer in GF(2^128).
-    fn clsq128(x: u128) u256 {
-        const lo = @truncate(u64, x);
-        const hi = @truncate(u64, x >> 64);
-        const mid = lo ^ hi;
-        const r_lo = clmul(x, x, .lo);
-        const r_hi = clmul(x, x, .hi);
-        const r_mid = clmul(mid, mid, .lo) ^ r_lo ^ r_hi;
-        return (@as(u256, r_hi) << 128) ^ (@as(u256, r_mid) << 64) ^ r_lo;
+    fn clsq128(x: u128) I256 {
+        return .{
+            .hi = clmul(x, x, .hi),
+            .lo = clmul(x, x, .lo),
+            .mid = 0,
+        };
     }

     // Multiply two 128-bit integers in GF(2^128).
-    inline fn clmul128(x: u128, y: u128) u256 {
-        const x_hi = @truncate(u64, x >> 64);
-        const y_hi = @truncate(u64, y >> 64);
-        const r_lo = clmul(x, y, .lo);
-        const r_hi = clmul(x, y, .hi);
-        const r_mid = clmul(x ^ x_hi, y ^ y_hi, .lo) ^ r_lo ^ r_hi;
-        return (@as(u256, r_hi) << 128) ^ (@as(u256, r_mid) << 64) ^ r_lo;
+    inline fn clmul128(x: u128, y: u128) I256 {
+        if (mul_algorithm == .karatsuba) {
+            const x_hi = @truncate(u64, x >> 64);
+            const y_hi = @truncate(u64, y >> 64);
+            const r_lo = clmul(x, y, .lo);
+            const r_hi = clmul(x, y, .hi);
+            const r_mid = clmul(x ^ x_hi, y ^ y_hi, .lo) ^ r_lo ^ r_hi;
+            return .{
+                .hi = r_hi,
+                .lo = r_lo,
+                .mid = r_mid,
+            };
+        } else {
+            return .{
+                .hi = clmul(x, y, .hi),
+                .lo = clmul(x, y, .lo),
+                .mid = clmul(x, y, .hi_lo) ^ clmul(y, x, .hi_lo),
+            };
+        }
     }

     // Reduce a 256-bit representative of a polynomial modulo the irreducible polynomial x^128 + x^127 + x^126 + x^121 + 1.
     // This is done *without reversing the bits*, using Shay Gueron's black magic demysticated here:
     // https://blog.quarkslab.com/reversing-a-finite-field-multiplication-optimization.html
-    inline fn gcmReduce(x: u256) u128 {
+    inline fn gcmReduce(x: I256) u128 {
+        const hi = x.hi ^ (x.mid >> 64);
+        const lo = x.lo ^ (x.mid << 64);
         const p64 = (((1 << 121) | (1 << 126) | (1 << 127)) >> 64);
-        const lo = @truncate(u128, x);
         const a = clmul(lo, p64, .lo);
         const b = ((lo << 64) | (lo >> 64)) ^ a;
         const c = clmul(b, p64, .lo);
         const d = ((b << 64) | (b >> 64)) ^ c;
-        return d ^ @truncate(u128, x >> 128);
+        return d ^ hi;
     }

     const has_pclmul = std.Target.x86.featureSetHas(builtin.cpu.features, .pclmul);
@@ -202,7 +244,7 @@ pub const Ghash = struct {
                 var u = clmul128(acc ^ mem.readIntBig(u128, msg[i..][0..16]), st.hx[15 - 0]);
                 comptime var j = 1;
                 inline while (j < 16) : (j += 1) {
-                    u ^= clmul128(mem.readIntBig(u128, msg[i..][j * 16 ..][0..16]), st.hx[15 - j]);
+                    xor256(&u, clmul128(mem.readIntBig(u128, msg[i..][j * 16 ..][0..16]), st.hx[15 - j]));
                 }
                 acc = gcmReduce(u);
             }
@@ -212,7 +254,7 @@ pub const Ghash = struct {
                 var u = clmul128(acc ^ mem.readIntBig(u128, msg[i..][0..16]), st.hx[7 - 0]);
                 comptime var j = 1;
                 inline while (j < 8) : (j += 1) {
-                    u ^= clmul128(mem.readIntBig(u128, msg[i..][j * 16 ..][0..16]), st.hx[7 - j]);
+                    xor256(&u, clmul128(mem.readIntBig(u128, msg[i..][j * 16 ..][0..16]), st.hx[7 - j]));
                 }
                 acc = gcmReduce(u);
             }
@@ -222,31 +264,25 @@ pub const Ghash = struct {
                 var u = clmul128(acc ^ mem.readIntBig(u128, msg[i..][0..16]), st.hx[3 - 0]);
                 comptime var j = 1;
                 inline while (j < 4) : (j += 1) {
-                    u ^= clmul128(mem.readIntBig(u128, msg[i..][j * 16 ..][0..16]), st.hx[3 - j]);
+                    xor256(&u, clmul128(mem.readIntBig(u128, msg[i..][j * 16 ..][0..16]), st.hx[3 - j]));
                 }
                 acc = gcmReduce(u);
             }
-        } else if (msg.len >= agg_2_treshold * block_length) {
-            // 2-blocks aggregated reduction
-            while (i + 32 <= msg.len) : (i += 32) {
-                var u = clmul128(acc ^ mem.readIntBig(u128, msg[i..][0..16]), st.hx[1 - 0]);
-                comptime var j = 1;
-                inline while (j < 2) : (j += 1) {
-                    u ^= clmul128(mem.readIntBig(u128, msg[i..][j * 16 ..][0..16]), st.hx[1 - j]);
-                }
-                acc = gcmReduce(u);
+        }
+        // 2-blocks aggregated reduction
+        while (i + 32 <= msg.len) : (i += 32) {
+            var u = clmul128(acc ^ mem.readIntBig(u128, msg[i..][0..16]), st.hx[1 - 0]);
+            comptime var j = 1;
+            inline while (j < 2) : (j += 1) {
+                xor256(&u, clmul128(mem.readIntBig(u128, msg[i..][j * 16 ..][0..16]), st.hx[1 - j]));
             }
+            acc = gcmReduce(u);
         }
         // remaining blocks
         if (i < msg.len) {
-            const n = (msg.len - i) / 16;
-            var u = clmul128(acc ^ mem.readIntBig(u128, msg[i..][0..16]), st.hx[n - 1 - 0]);
-            var j: usize = 1;
-            while (j < n) : (j += 1) {
-                u ^= clmul128(mem.readIntBig(u128, msg[i..][j * 16 ..][0..16]), st.hx[n - 1 - j]);
-            }
-            i += n * 16;
+            const u = clmul128(acc ^ mem.readIntBig(u128, msg[i..][0..16]), st.hx[0]);
             acc = gcmReduce(u);
+            i += 16;
         }
         assert(i == msg.len);
         st.acc = acc;
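As a sanity check on the u256 removal, here is a standalone sketch (not from the commit; foldMid is an illustrative name) showing that folding the mid limb into the hi and lo limbs, as the new gcmReduce does, reconstructs exactly what the old code assembled as a single u256:

const std = @import("std");

// Fold the middle 128-bit limb of a (hi, mid, lo) carryless product into the
// high and low halves, mirroring the first two lines of the new gcmReduce.
fn foldMid(hi: u128, mid: u128, lo: u128) struct { hi: u128, lo: u128 } {
    return .{ .hi = hi ^ (mid >> 64), .lo = lo ^ (mid << 64) };
}

test "mid-limb folding matches the old u256 assembly" {
    const hi: u128 = 0x0123456789abcdef_0011223344556677;
    const mid: u128 = 0x8899aabbccddeeff_fedcba9876543210;
    const lo: u128 = 0x0f0e0d0c0b0a0908_0706050403020100;
    // Old representation: hi*2^128 ^ mid*2^64 ^ lo held in one u256 value.
    const wide = (@as(u256, hi) << 128) ^ (@as(u256, mid) << 64) ^ lo;
    const folded = foldMid(hi, mid, lo);
    try std.testing.expectEqual(@truncate(u128, wide), folded.lo);
    try std.testing.expectEqual(@truncate(u128, wide >> 128), folded.hi);
}

Keeping the product as three 128-bit limbs also lets xor256 aggregate partial products limb by limb, so the hot path never has to leave 128-bit registers.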
