1 /// -*- mode: asm; asm-comment-char: ?/ -*-
3 /// GCM acceleration for x86 processors
5 /// (c) 2018 Straylight/Edgeware
8 ///----- Licensing notice ---------------------------------------------------
10 /// This file is part of Catacomb.
12 /// Catacomb is free software: you can redistribute it and/or modify it
13 /// under the terms of the GNU Library General Public License as published
14 /// by the Free Software Foundation; either version 2 of the License, or
15 /// (at your option) any later version.
17 /// Catacomb is distributed in the hope that it will be useful, but
18 /// WITHOUT ANY WARRANTY; without even the implied warranty of
19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 /// Library General Public License for more details.
22 /// You should have received a copy of the GNU Library General Public
23 /// License along with Catacomb. If not, write to the Free Software
24 /// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
27 ///--------------------------------------------------------------------------
31 #include "asm-common.h"
37 ///--------------------------------------------------------------------------
38 /// Common register allocation.
43 #elif CPUFAM_AMD64 && ABI_SYSV
46 #elif CPUFAM_AMD64 && ABI_WIN
51 ///--------------------------------------------------------------------------
52 /// Multiplication macros.
54 // The good news is that we have a fancy instruction to do the
55 // multiplications. The bad news is that it's not particularly well-
58 // For one thing, it only does a 64-bit multiplication, so in general
59 // we'll need to synthesize the full-width multiply by hand. For
60 // another thing, it doesn't help with the reduction, so we have to
61 // do that by hand too. And, finally, GCM has crazy bit ordering,
62 // and the instruction does nothing useful for that at all.
64 // Focusing on that last problem first: the bits aren't in monotonic
65 // significance order unless we permute them. If we reverse the byte
66 // order, then we'll have the bits in monotonic order, but backwards,
67 // so the degree-0 coefficient will be in the most-significant bit.
69 // This is less of a difficulty than it seems at first, because
70 // algebra. Suppose we are given u = SUM_{0<=i<n} u_i t^i and v =
71 // SUM_{0<=j<n} v_j t^j; then
73 // u v = SUM_{0<=i,j<n} u_i v_j t^{i+j}
75 // Suppose instead that we're given ũ = SUM_{0<=i<n} u_{n-i-1} t^i
76 // and ṽ = SUM_{0<=j<n} v_{n-j-1} t^j, so the bits are backwards.
79 // ũ ṽ = SUM_{0<=i,j<n} u_{n-i-1} v_{n-j-1} t^{i+j}
80 // = SUM_{0<=i,j<n} u_i v_j t^{2n-2-(i+j)}
82 // which is almost the bit-reversal of u v, only it's shifted right
83 // by one place. Oh, well: we'll have to shift it back later.
85 // That was important to think about, but there's not a great deal to
86 // do about it yet other than to convert what we've got from the
87 // blockcipher's byte-ordering convention to our big-endian
88 // convention. Since this depends on the blockcipher convention,
89 // we'll leave the caller to cope with this: the macros here will
90 // assume that the operands are in `register' format, which is the
91 // byte-reversal of the external representation, padded at the
92 // most-significant end except for 96-bit blocks, which are
93 // zero-padded at the least-significant end (see `mul96' for the
94 // details). In the commentary, pieces of polynomial are numbered
95 // according to the degree of the coefficients, so the unit
96 // coefficient of some polynomial a is in a_0.
98 // The commentary for `mul128' is the most detailed. The other
99 // macros assume that you've already read and understood that.
102 // Enter with u and v in xmm0 and xmm1 respectively; leave with z =
103 // u v in xmm0. Clobbers xmm1--xmm4.
105 // First for the double-precision multiplication. It's tempting to
106 // use Karatsuba's identity here, but I suspect that loses more in
107 // the shifting, bit-twiddling, and dependency chains that it gains
108 // in saving a multiplication which otherwise pipelines well.
109 // xmm0 = // (u_1; u_0)
110 // xmm1 = // (v_1; v_0)
111 movdqa xmm2, xmm1 // (v_1; v_0) again
112 movdqa xmm3, xmm0 // (u_1; u_0) again
113 movdqa xmm4, xmm0 // (u_1; u_0) yet again
114 pclmulhqlqdq xmm2, xmm0 // u_1 v_0
115 pclmullqlqdq xmm0, xmm1 // u_1 v_1
116 pclmulhqlqdq xmm3, xmm1 // u_0 v_1
117 pclmulhqhqdq xmm4, xmm1 // u_0 v_0
119 // Arrange the pieces to form a double-precision polynomial.
120 pxor xmm2, xmm3 // (m_1; m_0) = u_1 v_0 + u_0 v_1
121 movdqa xmm1, xmm2 // (m_1; m_0) again
122 pslldq xmm2, 8 // (0; m_1)
123 psrldq xmm1, 8 // (m_0; 0)
124 pxor xmm0, xmm2 // x_1 = u_1 v_1 + m_1
125 pxor xmm1, xmm4 // x_0 = u_0 v_0 + t^64 m_0
127 // Two problems remain. The first is that this product is shifted
128 // left (from GCM's backwards perspective) by one place, which is
129 // annoying. Let's take care of that now. Once this is done, we'll
130 // be properly in GCM's backwards bit-ordering, so xmm1 will hold the
131 // low half of the product and xmm0 the high half. (The following
132 // diagrams show bit 0 consistently on the right.)
135 // ,-------------.-------------.-------------.-------------.
136 // | 0 x_0-x_30 | x_31-x_62 | x_63-x_94 | x_95-x_126 |
137 // `-------------^-------------^-------------^-------------'
140 // ,-------------.-------------.-------------.-------------.
141 // | x_127-x_158 | x_159-x_190 | x_191-x_222 | x_223-x_254 |
142 // `-------------^-------------^-------------^-------------'
144 // We start by shifting each 32-bit lane right (from GCM's point of
145 // view -- physically, left) by one place, which gives us this:
148 // ,-------------.-------------.-------------.-------------.
149 // | x_0-x_30 0 | x_32-x_62 0 | x_64-x_94 0 | x_96-x_126 0|
150 // `-------------^-------------^-------------^-------------'
153 // ,-------------.-------------.-------------.-------------.
154 // |x_128-x_158 0|x_160-x_190 0|x_192-x_222 0|x_224-x_254 0|
155 // `-------------^-------------^-------------^-------------'
157 // but we've lost a bunch of bits. We separately shift each lane
158 // left by 31 places to give us the bits we lost.
161 // ,-------------.-------------.-------------.-------------.
162 // | 0...0 | 0...0 x_31 | 0...0 x_63 | 0...0 x_95 |
163 // `-------------^-------------^-------------^-------------'
166 // ,-------------.-------------.-------------.-------------.
167 // | 0...0 x_127 | 0...0 x_159 | 0...0 x_191 | 0...0 x_223 |
168 // `-------------^-------------^-------------^-------------'
170 // Which is close, but we don't get a cigar yet. To get the missing
171 // bits into position, we shift each of these right by a lane, but,
172 // alas, the x_127 falls off, so, separately, we shift the high
173 // register left by three lanes, so that everything is lined up
174 // properly when we OR them all together:
177 // ,-------------.-------------.-------------.-------------.
178 // ? 0...0 x_31 | 0...0 x_63 | 0...0 x_95 | 0...0 |
179 // `-------------^-------------^-------------^-------------'
182 // ,-------------.-------------.-------------.-------------.
183 // | 0...0 | 0...0 | 0...0 | 0...0 x_127 |
184 // `-------------^-------------^-------------^-------------'
187 // ,-------------.-------------.-------------.-------------.
188 // | 0...0 x_159 | 0...0 x_191 | 0...0 x_223 | 0...0 |
189 // `-------------^-------------^-------------^-------------'
191 // The `low' and `wrap' registers (xmm1, xmm3, xmm4) then collect the
192 // low 128 coefficients, while the `high' registers (xmm0, xmm2)
193 // collect the high 127 registers, leaving a zero bit at the most
194 // significant end as we expect.
196 // xmm0 = // (x_7, x_6; x_5, x_4)
197 // xmm1 = // (x_3, x_2; x_1, x_0)
198 movdqa xmm3, xmm1 // (x_3, x_2; x_1, x_0) again
199 movdqa xmm2, xmm0 // (x_7, x_6; x_5, x_4) again
200 psrld xmm1, 31 // shifted left; just the carries
202 pslld xmm3, 1 // shifted right, but dropped carries
204 movdqa xmm4, xmm0 // another copy for the carry around
205 pslldq xmm1, 4 // move carries over
207 psrldq xmm4, 12 // the big carry wraps around
209 por xmm0, xmm2 // (y_7, y_6; y_5, y_4)
210 por xmm1, xmm4 // (y_3, y_2; y_1, y_0)
212 // And the other problem is that the result needs to be reduced
213 // modulo p(t) = t^128 + t^7 + t^2 + t + 1. Let R = t^128 = t^7 +
214 // t^2 + t + 1 in our field. So far, we've calculated z_0 and z_1
215 // such that z_0 + z_1 R = u v using the identity R = t^128: now we
216 // must collapse the two halves of z together using the other
217 // identity R = t^7 + t^2 + t + 1.
219 // We do this by working on each 32-bit word of the high half of z
220 // separately, so consider y_i, for some 4 <= i < 8. Certainly, y_i
221 // t^{32i} = y_i R t^{32(i-4)} = (t^7 + t^2 + t + 1) y_i t^{32(i-4)},
222 // but we can't use that directly without breaking up the 32-bit word
223 // structure. Instead, we start by considering just y_i t^7
224 // t^{32(i-4)}, which again looks tricky. Now, split y_i = a_i +
225 // t^25 b_i, with deg a_i < 25; then
227 // y_i t^7 t^{32(i-4)} = a_i t^7 t^{32(i-4)} + b_i t^{32(i-3)}
229 // We can similarly decompose y_i t^2 and y_i t into a pair of 32-bit
230 // contributions to the t^{32(i-4)} and t^{32(i-3)} words, but the
231 // splits are different. This is lovely, with one small snag: when
232 // we do this to y_7, we end up with a contribution back into the
233 // t^128 coefficient word. But notice that only the low seven bits
234 // of this word are affected, so there's no knock-on contribution
235 // into the t^32 word. Therefore, if we handle the high bits of each
236 // word together, and then the low bits, everything will be fine.
238 // First, shift the high bits down.
239 movdqa xmm2, xmm0 // (y_7, y_6; y_5, y_4) again
240 movdqa xmm3, xmm0 // (y_7, y_6; y_5, y_4) yet again
241 movdqa xmm4, xmm0 // (y_7, y_6; y_5, y_4) again again
242 pslld xmm2, 31 // the b_i for t
243 pslld xmm3, 30 // the b_i for t^2
244 pslld xmm4, 25 // the b_i for t^7
245 pxor xmm2, xmm3 // add them all together
247 movdqa xmm3, xmm2 // and a copy for later
248 psrldq xmm2, 4 // contribution into low half
249 pslldq xmm3, 12 // and high half
253 // And then shift the low bits up.
256 pxor xmm1, xmm0 // mix in the unit contribution
260 pxor xmm1, xmm2 // low half, unit, and t^2 contribs
261 pxor xmm0, xmm3 // t and t^7 contribs
262 pxor xmm0, xmm1 // mix them together and we're done
266 // Enter with u and v in the low halves of xmm0 and xmm1
267 // respectively; leave with z = u v in xmm0. Clobbers xmm1--xmm4.
269 // The multiplication is thankfully easy.
270 pclmullqlqdq xmm0, xmm1 // u v
272 // Shift the product up by one place. After this, we're in GCM
274 movdqa xmm1, xmm0 // u v again
275 psrld xmm0, 31 // shifted left; just the carries
276 pslld xmm1, 1 // shifted right, but dropped carries
277 pslldq xmm0, 4 // move carries over
278 por xmm1, xmm0 // (y_3, y_2; y_1, y_0)
280 // Now we must reduce. This is essentially the same as the 128-bit
281 // case above, but mostly simpler because everything is smaller. The
282 // polynomial this time is p(t) = t^64 + t^4 + t^3 + t + 1.
284 // First, we must detach the top (`low'!) half of the result.
285 movdqa xmm0, xmm1 // (y_3, y_2; y_1, y_0) again
286 psrldq xmm1, 8 // (y_1, y_0; 0, 0)
288 // Next, shift the high bits down.
289 movdqa xmm2, xmm0 // (y_3, y_2; ?, ?) again
290 movdqa xmm3, xmm0 // (y_3, y_2; ?, ?) yet again
291 movdqa xmm4, xmm0 // (y_3, y_2; ?, ?) again again
292 pslld xmm2, 31 // b_i for t
293 pslld xmm3, 29 // b_i for t^3
294 pslld xmm4, 28 // b_i for t^4
295 pxor xmm2, xmm3 // add them all together
297 movdqa xmm3, xmm2 // and a copy for later
298 movq xmm2, xmm2 // zap high half
299 pslldq xmm3, 4 // contribution into high half
300 psrldq xmm2, 4 // and low half
304 // And then shift the low bits up.
307 pxor xmm1, xmm0 // mix in the unit contribution
311 pxor xmm1, xmm2 // low half, unit, and t^3 contribs
312 pxor xmm0, xmm3 // t and t^4 contribs
313 pxor xmm0, xmm1 // mix them together and we're done
317 // Enter with u and v in the /high/ three words of xmm0 and xmm1
318 // respectively (and zero in the low word); leave with z = u v in the
319 // high three words of xmm0, and /junk/ in the low word. Clobbers
322 // This is an inconvenient size. There's nothing for it but to do
323 // four multiplications, as if for the 128-bit case. It's possible
324 // that there's cruft in the top 32 bits of the input registers, so
325 // shift both of them up by four bytes before we start. This will
326 // mean that the high 64 bits of the result (from GCM's viewpoint)
328 // xmm0 = // (0, u_2; u_1, u_0)
329 // xmm1 = // (0, v_2; v_1, v_0)
330 movdqa xmm2, xmm1 // (0, v_2; v_1, v_0) again
331 movdqa xmm3, xmm0 // (0, u_2; u_1, u_0) again
332 movdqa xmm4, xmm0 // (0, u_2; u_1, u_0) yet again
333 pclmulhqlqdq xmm2, xmm0 // u_2 (v_1 t^32 + v_0) = e_0
334 pclmullqlqdq xmm0, xmm1 // u_2 v_2 = d = (0; d)
335 pclmulhqlqdq xmm3, xmm1 // v_2 (u_1 t^32 + u_0) = e_1
336 pclmulhqhqdq xmm4, xmm1 // u_0 v_0 + (u_1 v_0 + u_0 v_1) t^32
337 // + u_1 v_1 t^64 = f
339 // Extract the high and low halves of the 192-bit result. We don't
340 // need be too picky about the unused high words of the result
341 // registers. The answer we want is d t^128 + e t^64 + f, where e =
344 // The place values for the two halves are (t^160, t^128; t^96, ?)
345 // and (?, t^64; t^32, 1).
346 psrldq xmm0, 8 // (d; 0) = d t^128
347 pxor xmm2, xmm3 // e = (e_0 + e_1)
348 movdqa xmm1, xmm4 // f again
349 pxor xmm0, xmm2 // d t^128 + e t^64
350 psrldq xmm2, 12 // e[31..0] t^64
351 psrldq xmm1, 4 // f[95..0]
352 pslldq xmm4, 8 // f[127..96]
353 pxor xmm1, xmm2 // low 96 bits of result
354 pxor xmm0, xmm4 // high 96 bits of result
356 // Next, shift everything one bit to the left to compensate for GCM's
357 // strange ordering. This will be easier if we shift up the high
358 // half by a word before we start. After this we're in GCM bizarro-
360 movdqa xmm3, xmm1 // low half again
361 pslldq xmm0, 4 // shift high half
362 psrld xmm1, 31 // shift low half down: just carries
363 movdqa xmm2, xmm0 // copy high half
364 pslld xmm3, 1 // shift low half down: drop carries
365 psrld xmm0, 31 // shift high half up: just carries
366 pslld xmm2, 1 // shift high half down: drop carries
367 movdqa xmm4, xmm0 // copy high carries for carry-around
368 pslldq xmm0, 4 // shift carries down
370 psrldq xmm4, 12 // the big carry wraps around
375 // Finally, the reduction. This is essentially the same as the
376 // 128-bit case, except that the polynomial is p(t) = t^96 + t^10 +
377 // t^9 + t^6 + 1. The degrees are larger but not enough to cause
378 // trouble for the general approach.
380 // First, shift the high bits down.
381 movdqa xmm2, xmm0 // copies of the high part
384 pslld xmm2, 26 // b_i for t^6
385 pslld xmm3, 23 // b_i for t^9
386 pslld xmm4, 22 // b_i for t^10
387 pxor xmm2, xmm3 // add them all together
388 pslldq xmm1, 4 // shift low part up to match
390 movdqa xmm3, xmm2 // and a copy for later
391 pslldq xmm2, 8 // contribution to high half
392 psrldq xmm3, 4 // contribution to low half
396 // And then shift the low bits up.
397 movdqa xmm2, xmm0 // copies of the high part
399 pxor xmm1, xmm0 // mix in the unit contribution
403 pxor xmm1, xmm2 // low half, unit, and t^9 contribs
404 pxor xmm0, xmm3 // t^6 and t^10 contribs
405 pxor xmm0, xmm1 // mix them together and we're done
409 // Enter with u and v in xmm0/xmm1 and xmm2/xmm3 respectively; leave
410 // with z = u v in xmm0/xmm1 -- the top halves of the high registers
411 // are unimportant. Clobbers xmm2--xmm7.
413 // Start multiplying and accumulating pieces of product.
414 // xmm0 = // (u_2; u_1)
415 // xmm1 = // (u_0; ?)
416 // xmm2 = // (v_2; v_1)
417 // xmm3 = // (v_0; ?)
418 movdqa xmm4, xmm0 // (u_2; u_1) again
419 movdqa xmm5, xmm0 // (u_2; u_1) yet again
420 movdqa xmm6, xmm0 // (u_2; u_1) again again
421 movdqa xmm7, xmm1 // (u_0; ?) again
422 punpcklqdq xmm1, xmm3 // (u_0; v_0)
423 pclmulhqhqdq xmm4, xmm2 // u_1 v_1
424 pclmullqlqdq xmm3, xmm0 // u_2 v_0
425 pclmullqhqdq xmm5, xmm2 // u_2 v_1
426 pclmulhqlqdq xmm6, xmm2 // u_1 v_2
427 pxor xmm4, xmm3 // u_2 v_0 + u_1 v_1
428 pclmullqlqdq xmm7, xmm2 // u_0 v_2
429 pxor xmm5, xmm6 // b = u_2 v_1 + u_1 v_2
430 movdqa xmm6, xmm0 // (u_2; u_1) like a bad penny
431 pxor xmm4, xmm7 // c = u_0 v_2 + u_1 v_1 + u_2 v_0
432 pclmullqlqdq xmm0, xmm2 // a = u_2 v_2
433 pclmulhqhqdq xmm6, xmm1 // u_1 v_0
434 pclmulhqlqdq xmm2, xmm1 // u_0 v_1
435 pclmullqhqdq xmm1, xmm1 // e = u_0 v_0
436 pxor xmm2, xmm6 // d = u_1 v_0 + u_0 v_1
438 // Next, the piecing together of the product.
439 // xmm0 = // (a_1; a_0) = a = u_2 v_2
440 // xmm5 = // (b_1; b_0) = b = u_1 v_2 + u_2 v_1
441 // xmm4 = // (c_1; c_0) = c = u_0 v_2 +
443 // xmm2 = // (d_1; d_0) = d = u_0 v_1 + u_1 v_0
444 // xmm1 = // (e_1; e_0) = e = u_0 v_0
445 // xmm3, xmm6, xmm7 spare
446 movdqa xmm3, xmm2 // (d_1; d_0) again
447 movdqa xmm6, xmm5 // (b_1; b_0) again
448 pslldq xmm2, 8 // (0; d_1)
449 psrldq xmm5, 8 // (b_0; 0)
450 psrldq xmm3, 8 // (d_0; 0)
451 pslldq xmm6, 8 // (0; b_1)
452 pxor xmm5, xmm2 // (b_0; d_1)
453 pxor xmm0, xmm6 // x_2 = (a_1; a_0 + b_1)
454 pxor xmm3, xmm1 // x_0 = (e_1 + d_0; e_0)
455 pxor xmm4, xmm5 // x_1 = (b_0 + c_1; c_0 + d_1)
457 // Now, shift it right (from GCM's point of view) by one bit, and try
458 // to leave the result in less random registers. After this, we'll
459 // be in GCM bizarro-world.
460 // xmm1, xmm2, xmm5, xmm6, xmm7 spare
461 movdqa xmm5, xmm0 // copy x_2
462 movdqa xmm1, xmm4 // copy x_1
463 movdqa xmm2, xmm3 // copy x_0
464 psrld xmm0, 31 // x_2 carries
465 psrld xmm4, 31 // x_1 carries
466 psrld xmm3, 31 // x_0 carries
467 pslld xmm5, 1 // x_2 shifted
468 pslld xmm1, 1 // x_1 shifted
469 pslld xmm2, 1 // x_0 shifted
470 movdqa xmm6, xmm0 // x_2 carry copy
471 movdqa xmm7, xmm4 // x_1 carry copy
472 pslldq xmm0, 4 // x_2 carry shifted
473 pslldq xmm4, 4 // x_1 carry shifted
474 pslldq xmm3, 4 // x_0 carry shifted
475 psrldq xmm6, 12 // x_2 carry out
476 psrldq xmm7, 12 // x_1 carry out
477 por xmm0, xmm5 // (y_5; y_4)
480 por xmm1, xmm6 // (y_3; y_2)
481 por xmm2, xmm7 // (y_1; y_0)
483 // Next, the reduction. Our polynomial this time is p(x) = t^192 +
484 // t^7 + t^2 + t + 1. Yes, the magic numbers are the same as the
485 // 128-bit case. I don't know why.
487 // First, shift the high bits down.
488 // xmm0 = // (y_5; y_4)
489 // xmm1 = // (y_3; y_2)
490 // xmm2 = // (y_1; y_0)
492 movdqa xmm3, xmm0 // (y_5; y_4) copy
493 movdqa xmm4, xmm0 // (y_5; y_4) copy
494 movdqa xmm5, xmm0 // (y_5; y_4) copy
495 pslld xmm3, 31 // (y_5; y_4) b_i for t
496 pslld xmm4, 30 // (y_5; y_4) b_i for t^2
497 pslld xmm5, 25 // (y_5; y_4) b_i for t^7
498 movq xmm6, xmm1 // (y_3; 0) copy
500 movq xmm7, xmm1 // (y_3; 0) copy
502 movq xmm5, xmm1 // (y_3; 0) copy
503 movdqa xmm4, xmm3 // (y_5; y_4) b_i combined
504 pslld xmm6, 31 // (y_3; 0) b_i for t
505 pslld xmm7, 30 // (y_3; 0) b_i for t^2
506 pslld xmm5, 25 // (y_3; 0) b_i for t^7
507 psrldq xmm3, 12 // (y_5; y_4) low contrib
508 pslldq xmm4, 4 // (y_5; y_4) high contrib
516 // And finally shift the low bits up. Unfortunately, we also have to
517 // split the low bits out.
518 // xmm0 = // (y'_5; y'_4)
519 // xmm1 = // (y'_3; y'_2)
520 // xmm2 = // (y'_1; y'_0)
521 movdqa xmm5, xmm1 // copies of (y'_3; y'_2)
524 psrldq xmm1, 8 // bring down (y'_2; ?)
525 movdqa xmm3, xmm0 // copies of (y'_5; y'_4)
527 punpcklqdq xmm1, xmm2 // (y'_2; y'_1)
528 psrldq xmm2, 8 // (y'_0; ?)
529 pxor xmm2, xmm5 // low half and unit contrib
537 pxor xmm2, xmm6 // low half, unit, t^2 contribs
539 pxor xmm5, xmm7 // t and t^7 contribs
541 pxor xmm5, xmm2 // mix everything together
543 movq xmm1, xmm5 // shunt (z_0; ?) into proper place
547 // Enter with u and v in xmm0/xmm1 and xmm2/xmm3 respectively; leave
548 // with z = u v in xmm0/xmm1. Clobbers xmm2--xmm7. On 32-bit x86,
549 // requires 16 bytes aligned space at SP; on amd64, also clobbers
552 // Now it's starting to look worthwhile to do Karatsuba. Suppose
553 // u = u_0 + u_1 B and v = v_0 + v_1 B. Then
555 // u v = (u_0 v_0) + (u_0 v_1 + u_1 v_0) B + (u_1 v_1) B^2
557 // Name these coefficients of B^i be a, b, and c, respectively, and
558 // let r = u_0 + u_1 and s = v_0 + v_1. Then observe that
560 // q = r s = (u_0 + u_1) (v_0 + v_1)
561 // = (u_0 v_0) + (u1 v_1) + (u_0 v_1 + u_1 v_0)
564 // The first two terms we've already calculated; the last is the
565 // remaining one we want. We'll set B = t^128. We know how to do
566 // 128-bit multiplications already, and Karatsuba is too annoying
567 // there, so there'll be 12 multiplications altogether, rather than
568 // the 16 we'd have if we did this the naïve way.
570 // On x86, there aren't quite enough registers, so spill one for a
571 // bit. On AMD64, we can keep on going, so it's all good.
573 // xmm0 = // u_1 = (u_11; u_10)
574 // xmm1 = // u_0 = (u_01; u_00)
575 // xmm2 = // v_1 = (v_11; v_10)
576 // xmm3 = // v_0 = (v_01; v_00)
577 movdqa xmm4, xmm0 // u_1 again
579 movdqa [SP + 0], xmm3
584 pxor xmm4, xmm1 // u_* = (u_01 + u_11; u_00 + u_10)
585 pxor xmm3, xmm2 // v_* = (v_01 + v_11; v_00 + v_10)
587 // Start by building the cross product, q = u_* v_*.
588 movdqa xmm7, xmm4 // more copies of u_*
591 pclmullqhqdq xmm4, xmm3 // u_*1 v_*0
592 pclmulhqlqdq xmm7, xmm3 // u_*0 v_*1
593 pclmullqlqdq xmm5, xmm3 // u_*1 v_*1
594 pclmulhqhqdq xmm6, xmm3 // u_*0 v_*0
595 pxor xmm4, xmm7 // u_*1 v_*0 + u_*0 v_*1
599 pxor xmm5, xmm4 // q_1
600 pxor xmm6, xmm7 // q_0
602 // Next, work on the high half, a = u_1 v_1.
603 movdqa xmm3, xmm0 // more copies of u_1
606 pclmullqhqdq xmm0, xmm2 // u_11 v_10
607 pclmulhqlqdq xmm3, xmm2 // u_10 v_11
608 pclmullqlqdq xmm4, xmm2 // u_11 v_11
609 pclmulhqhqdq xmm7, xmm2 // u_10 v_10
611 movdqa xmm2, [SP + 0]
614 pxor xmm0, xmm3 // u_10 v_11 + u_11 v_10
618 pxor xmm4, xmm0 // x_1 = a_1
619 pxor xmm7, xmm3 // a_0
621 // Mix that into the product now forming in xmm4--xmm7.
622 pxor xmm5, xmm4 // a_1 + q_1
623 pxor xmm6, xmm7 // a_0 + q_0
624 pxor xmm5, xmm7 // a_0 + (a_1 + q_1)
626 // Finally, the low half, c = u_0 v_0.
627 movdqa xmm0, xmm1 // more copies of u_0
630 pclmullqhqdq xmm1, V0 // u_01 v_00
631 pclmulhqlqdq xmm0, V0 // u_00 v_01
632 pclmullqlqdq xmm3, V0 // u_01 v_01
633 pclmulhqhqdq xmm7, V0 // u_00 v_00
634 pxor xmm0, xmm1 // u_10 v_11 + u_11 v_10
638 pxor xmm3, xmm0 // c_1
639 pxor xmm7, xmm1 // x_0 = c_0
641 // And mix that in to complete the product.
642 pxor xmm6, xmm3 // (a_0 + q_0) + c_1
643 pxor xmm5, xmm3 // x_2 = a_0 + (a_1 + c_1 + q_1) = a_0 + b_1
644 pxor xmm6, xmm7 // x_1 = (a_0 + c_0 + q_0) + c_1 = b_0 + c_1
648 // Now we need to shift that whole lot one bit to the left. This
649 // will also give us an opportunity to put the product back in
650 // xmm0--xmm3. This is a slightly merry dance because it's nearly
651 // pipelined but we don't have enough registers.
653 // After this, we'll be in GCM bizarro-world.
654 movdqa xmm0, xmm4 // x_3 again
655 psrld xmm4, 31 // x_3 carries
656 pslld xmm0, 1 // x_3 shifted left
657 movdqa xmm3, xmm4 // x_3 copy carries
658 movdqa xmm1, xmm5 // x_2 again
659 pslldq xmm4, 4 // x_3 carries shifted up
660 psrld xmm5, 31 // x_2 carries
661 psrldq xmm3, 12 // x_3 big carry out
662 pslld xmm1, 1 // x_2 shifted left
663 por xmm0, xmm4 // x_3 mixed together
664 movdqa xmm4, xmm5 // x_2 copy carries
665 movdqa xmm2, xmm6 // x_1 again
666 pslldq xmm5, 4 // x_2 carries shifted up
667 psrld xmm6, 31 // x_1 carries
668 psrldq xmm4, 12 // x_2 big carry out
669 pslld xmm2, 1 // x_1 shifted
670 por xmm1, xmm5 // x_2 mixed together
671 movdqa xmm5, xmm6 // x_1 copy carries
672 por xmm1, xmm3 // x_2 with carry from x_3
673 movdqa xmm3, xmm7 // x_0 again
674 pslldq xmm6, 4 // x_1 carries shifted up
675 psrld xmm7, 31 // x_2 carries
676 psrldq xmm5, 12 // x_1 big carry out
677 pslld xmm3, 1 // x_0 shifted
678 por xmm2, xmm6 // x_1 mixed together
679 pslldq xmm7, 4 // x_0 carries shifted up
680 por xmm2, xmm4 // x_1 with carry from x_2
681 por xmm3, xmm7 // x_0 mixed together
682 por xmm3, xmm5 // x_0 with carry from x_1
684 // Now we must reduce. This is essentially the same as the 128-bit
685 // case above, but more complicated because everything is bigger.
686 // The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1.
688 // First, shift the high bits down.
689 movdqa xmm4, xmm0 // y_3 again
690 movdqa xmm5, xmm0 // y_3 yet again
691 movdqa xmm6, xmm0 // y_3 again again
692 pslld xmm4, 30 // y_3: b_i for t^2
693 pslld xmm5, 27 // y_3: b_i for t^5
694 pslld xmm6, 22 // y_3: b_i for t^10
695 movdqa xmm7, xmm1 // y_2 again
697 movdqa xmm5, xmm1 // y_2 again
699 movdqa xmm6, xmm1 // y_2 again
700 pslld xmm7, 30 // y_2: b_i for t^2
701 pslld xmm5, 27 // y_2: b_i for t^5
702 pslld xmm6, 22 // y_2: b_i for t^10
716 // And then shift the low bits up.
717 movdqa xmm4, xmm0 // y_3 again
718 movdqa xmm5, xmm1 // y_2 again
719 movdqa xmm6, xmm0 // y_3 yet again
720 movdqa xmm7, xmm1 // y_2 yet again
721 pxor xmm2, xmm0 // y_1 and unit contrib from y_3
722 pxor xmm3, xmm1 // y_0 and unit contrib from y_2
729 pxor xmm0, xmm2 // y_1, with y_3 units and t^2
730 pxor xmm1, xmm3 // y_0, with y_2 units and t^2
731 pxor xmm4, xmm6 // y_3 t^5 and t^10 contribs
732 pxor xmm5, xmm7 // y_2 t^5 and t^10 contribs
733 pxor xmm0, xmm4 // high half of reduced result
734 pxor xmm1, xmm5 // low half; all done
737 ///--------------------------------------------------------------------------
740 // There are a number of representations of field elements in this code and
741 // it can be confusing.
743 // * The `external format' consists of a sequence of contiguous bytes in
744 // memory called a `block'. The GCM spec explains how to interpret this
745 // block as an element of a finite field. As discussed extensively, this
746 // representation is very annoying for a number of reasons. On the other
747 // hand, this code never actually deals with it directly.
749 // * The `register format' consists of one or more XMM registers, depending
750 // on the block size. The bytes in these registers are in reverse order
751 // -- so the least-significant byte of the lowest-numbered register holds
752 // the /last/ byte in the block. If the block size is not a multiple of
753 // 16 bytes, then there must be padding. 96-bit blocks are weird: the
754 // padding is inserted at the /least/ significant end, so the register
755 // holds (0, x_0; x_1, x_2); otherwise, the padding goes at the most
758 // * The `words' format consists of a sequence of bytes, as in the
759 // `external format', but, according to the blockcipher in use, the bytes
760 // within each 32-bit word may be reversed (`big-endian') or not
761 // (`little-endian'). Accordingly, there are separate entry points for
762 // each variant, identified with `b' or `l'.
765 FUNC(f##_avx); vzeroupper; endprologue; ENDFUNC; \
768 SSEFUNC(gcm_mulk_128b_x86ish_pclmul)
769 // On entry, A points to a 128-bit field element in big-endian words
770 // format; K points to a field-element in register format. On exit,
771 // A is updated with the product A K.
780 pshufd xmm0, xmm0, SHUF(3, 2, 1, 0)
782 pshufd xmm0, xmm0, SHUF(3, 2, 1, 0)
787 SSEFUNC(gcm_mulk_128l_x86ish_pclmul)
788 // On entry, A points to a 128-bit field element in little-endian
789 // words format; K points to a field-element in register format. On
790 // exit, A is updated with the product A K.
798 movdqa xmm7, [INTADDR(swaptab_128l, ecx)]
808 SSEFUNC(gcm_mulk_64b_x86ish_pclmul)
809 // On entry, A points to a 64-bit field element in big-endian words
810 // format; K points to a field-element in register format. On exit,
811 // A is updated with the product A K.
820 pshufd xmm0, xmm0, SHUF(1, 0, 3, 3)
822 pshufd xmm0, xmm0, SHUF(1, 0, 3, 3)
827 SSEFUNC(gcm_mulk_64l_x86ish_pclmul)
828 // On entry, A points to a 64-bit field element in little-endian
829 // words format; K points to a field-element in register format. On
830 // exit, A is updated with the product A K.
838 movdqa xmm7, [INTADDR(swaptab_64l, ecx)]
848 SSEFUNC(gcm_mulk_96b_x86ish_pclmul)
849 // On entry, A points to a 96-bit field element in big-endian words
850 // format; K points to a field-element in register format (i.e., 16
851 // bytes, with the first four bytes zero). On exit, A is updated
852 // with the product A K.
862 punpcklqdq xmm0, xmm2
863 pshufd xmm0, xmm0, SHUF(3, 2, 1, 0)
865 pshufd xmm1, xmm0, SHUF(3, 2, 1, 0)
872 SSEFUNC(gcm_mulk_96l_x86ish_pclmul)
873 // On entry, A points to a 96-bit field element in little-endian
874 // words format; K points to a field-element in register format
875 // (i.e., 16 bytes, with the first four bytes zero). On exit, A is
876 // updated with the product A K.
884 movdqa xmm7, [INTADDR(swaptab_128l, ecx)]
888 punpcklqdq xmm0, xmm2
898 SSEFUNC(gcm_mulk_192b_x86ish_pclmul)
899 // On entry, A points to a 192-bit field element in big-endian words
900 // format; K points to a field-element in register format. On exit,
901 // A is updated with the product A K.
907 #if CPUFAM_AMD64 && ABI_WIN
917 pshufd xmm0, xmm0, SHUF(3, 2, 1, 0)
918 pshufd xmm1, xmm1, SHUF(1, 0, 3, 3)
920 pshufd xmm0, xmm0, SHUF(3, 2, 1, 0)
921 pshufd xmm1, xmm1, SHUF(1, 0, 3, 3)
924 #if CPUFAM_AMD64 && ABI_WIN
932 SSEFUNC(gcm_mulk_192l_x86ish_pclmul)
933 // On entry, A points to a 192-bit field element in little-endian
934 // words format; K points to a field-element in register format. On
935 // exit, A is updated with the product A K.
942 #if CPUFAM_AMD64 && ABI_WIN
952 pshufb xmm0, [INTADDR(swaptab_128l, ecx)]
953 pshufb xmm1, [INTADDR(swaptab_64l, ecx)]
955 pshufb xmm0, [INTADDR(swaptab_128l, ecx)]
956 pshufb xmm1, [INTADDR(swaptab_64l, ecx)]
959 #if CPUFAM_AMD64 && ABI_WIN
967 SSEFUNC(gcm_mulk_256b_x86ish_pclmul)
968 // On entry, A points to a 256-bit field element in big-endian words
969 // format; K points to a field-element in register format. On exit,
970 // A is updated with the product A K.
980 #if CPUFAM_AMD64 && ABI_WIN
987 movdqu xmm0, [A + 16]
990 movdqu xmm3, [K + 16]
991 pshufd xmm0, xmm0, SHUF(3, 2, 1, 0)
992 pshufd xmm1, xmm1, SHUF(3, 2, 1, 0)
994 pshufd xmm0, xmm0, SHUF(3, 2, 1, 0)
995 pshufd xmm1, xmm1, SHUF(3, 2, 1, 0)
996 movdqu [A + 16], xmm0
1002 #if CPUFAM_AMD64 && ABI_WIN
1011 SSEFUNC(gcm_mulk_256l_x86ish_pclmul)
1012 // On entry, A points to a 256-bit field element in little-endian
1013 // words format; K points to a field-element in register format. On
1014 // exit, A is updated with the product A K.
1025 #if CPUFAM_AMD64 && ABI_WIN
1032 movdqa xmm7, [INTADDR(swaptab_128l, ecx)]
1033 movdqu xmm0, [A + 16]
1034 movdqu xmm1, [A + 0]
1035 movdqu xmm2, [K + 0]
1036 movdqu xmm3, [K + 16]
1040 movdqa xmm7, [INTADDR(swaptab_128l, ecx)]
1043 movdqu [A + 16], xmm0
1044 movdqu [A + 0], xmm1
1049 #if CPUFAM_AMD64 && ABI_WIN
1062 // Table for byte-swapping little-endian words-format blocks larger
1064 .byte 15, 14, 13, 12, 11, 10, 9, 8
1065 .byte 7, 6, 5, 4, 3, 2, 1, 0
1069 // Table for byte-swapping 64-bit little-endian words-format blocks.
1070 .byte 7, 6, 5, 4, 3, 2, 1, 0
1071 .byte 255, 255, 255, 255, 255, 255, 255, 255
1073 ///----- That's all, folks --------------------------------------------------