/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// GCM acceleration for ARM processors
///
/// (c) 2019 Straylight/Edgeware
///
///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software: you can redistribute it and/or modify it
/// under the terms of the GNU Library General Public License as published
/// by the Free Software Foundation; either version 2 of the License, or
/// (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful, but
/// WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
/// Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb.  If not, write to the Free Software
/// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
/// USA.
///
///--------------------------------------------------------------------------
#include "asm-common.h"

	.arch	armv8-a
	.fpu	crypto-neon-fp-armv8

	.text
///--------------------------------------------------------------------------
/// Multiplication macros.

	// The good news is that we have a fancy instruction to do the
	// multiplications.  The bad news is that it's not particularly
	// well-suited to the job.
	//
	// For one thing, it only does a 64-bit multiplication, so in
	// general we'll need to synthesize the full-width multiply by
	// hand.  For another thing, it doesn't help with the reduction,
	// so we have to do that by hand too.  And, finally, GCM has crazy
	// bit ordering, and the instruction does nothing useful for that
	// at all.
	// Focusing on that last problem first: the bits aren't in
	// monotonic significance order unless we permute them.  If we
	// reverse the byte order, then we'll have the bits in monotonic
	// order, but backwards, so the degree-0 coefficient will be in
	// the most-significant bit.
	//
	// This is less of a difficulty than it seems at first, because
	// algebra.  Suppose we are given u = SUM_{0<=i<n} u_i t^i and
	// v = SUM_{0<=j<n} v_j t^j; then
	//
	//	u v = SUM_{0<=i,j<n} u_i v_j t^{i+j}
	//
	// Suppose instead that we're given ũ = SUM_{0<=i<n} u_{n-i-1} t^i
	// and ṽ = SUM_{0<=j<n} v_{n-j-1} t^j, so the bits are backwards.
	// Then
	//
	//	ũ ṽ = SUM_{0<=i,j<n} u_{n-i-1} v_{n-j-1} t^{i+j}
	//	    = SUM_{0<=i,j<n} u_i v_j t^{2n-2-(i+j)}
	//
	// which is almost the bit-reversal of u v, only it's shifted
	// right by one place.  Oh, well: we'll have to shift it back
	// later.
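	//
	// (A quick way to convince yourself of this identity: the
	// following C sketch, with a toy 8-bit carry-less multiply
	// standing in for `vmull.p64', checks it exhaustively.)
	//
	//	#include <stdint.h>
	//	#include <stdio.h>
	//
	//	/* Carry-less product of 8-bit u and v: 15 bits. */
	//	static uint32_t clmul8(uint32_t u, uint32_t v)
	//	{
	//		uint32_t z = 0;
	//		for (int i = 0; i < 8; i++)
	//			if ((u >> i) & 1) z ^= v << i;
	//		return z;
	//	}
	//
	//	/* Reverse the low n bits of x. */
	//	static uint32_t bitrev(uint32_t x, int n)
	//	{
	//		uint32_t y = 0;
	//		for (int i = 0; i < n; i++)
	//			if ((x >> i) & 1) y |= 1u << (n - 1 - i);
	//		return y;
	//	}
	//
	//	int main(void)
	//	{
	//		for (uint32_t u = 0; u < 256; u++)
	//			for (uint32_t v = 0; v < 256; v++)
	//				if (clmul8(bitrev(u, 8), bitrev(v, 8)) !=
	//				    bitrev(clmul8(u, v), 16) >> 1) {
	//					printf("mismatch\n");
	//					return 1;
	//				}
	//		printf("ok\n");
	//		return 0;
	//	}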
	// That was important to think about, but there's not a great deal
	// to do about it yet other than to convert what we've got from
	// the blockcipher's byte-ordering convention to our big-endian
	// convention.  Since this depends on the blockcipher convention,
	// we'll leave the caller to cope with this: the macros here will
	// assume that the operands are in `register' format, which is the
	// same as the external representation, except that the bytes
	// within each 64-bit piece are reversed.  In the commentary,
	// pieces of polynomial are numbered according to the degree of
	// the coefficients, so the unit coefficient of some polynomial a
	// is in a_0.
	//
	// The commentary for `mul128' is the most detailed.  The other
	// macros assume that you've already read and understood that.

.macro	mul128
	// Enter with u and v in q0 and q1 respectively; leave with
	// z = u v in q0.  Clobbers q1--q3, q8, q9.

	// First for the double-precision multiplication.  It's tempting
	// to use Karatsuba's identity here, but I suspect that it loses
	// more in the shifting, bit-twiddling, and dependency chains than
	// it gains in saving a multiplication which otherwise pipelines
	// well.
	vmull.p64 q2, d1, d2		// u_1 v_0
	vmull.p64 q3, d0, d3		// u_0 v_1
	vmull.p64 q8, d1, d3		// (x_3; t_1) = u_1 v_1
	vmull.p64 q9, d0, d2		// (t_0; x_0) = u_0 v_0

	// Arrange the pieces to form a double-precision polynomial.
	veor	q2, q2, q3		// (m_1; m_0) = u_0 v_1 + u_1 v_0
	veor	d17, d17, d4		// x_2 = t_1 + m_1
	veor	d18, d18, d5		// x_1 = t_0 + m_0
					// q8 =			// (x_3; x_2)
					// q9 =			// (x_1; x_0)
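	// For reference, here's the same piecing-together as a C sketch,
	// in plain monotonic bit order (the registers above hold the
	// bit-reflected versions of these quantities); `poly128' and
	// `clmul64' are portable stand-ins for a NEON register pair and
	// `vmull.p64':
	//
	//	#include <stdint.h>
	//
	//	typedef struct { uint64_t lo, hi; } poly128;
	//
	//	/* 64x64 -> 127-bit carry-less multiply, bit by bit. */
	//	static poly128 clmul64(uint64_t u, uint64_t v)
	//	{
	//		poly128 z = { 0, 0 };
	//		for (int i = 0; i < 64; i++)
	//			if ((v >> i) & 1) {
	//				z.lo ^= u << i;
	//				if (i) z.hi ^= u >> (64 - i);
	//			}
	//		return z;
	//	}
	//
	//	/* x = u v, as four 64-bit words x_0 (low) .. x_3 (high). */
	//	static void mul128_c(uint64_t x[4],
	//			     const uint64_t u[2], const uint64_t v[2])
	//	{
	//		poly128 a = clmul64(u[0], v[0]);	/* (t_0; x_0) */
	//		poly128 c = clmul64(u[1], v[1]);	/* (x_3; t_1) */
	//		poly128 m0 = clmul64(u[1], v[0]);	/* u_1 v_0 */
	//		poly128 m1 = clmul64(u[0], v[1]);	/* u_0 v_1 */
	//		uint64_t mlo = m0.lo ^ m1.lo, mhi = m0.hi ^ m1.hi;
	//		x[0] = a.lo;
	//		x[1] = a.hi ^ mlo;	/* x_1 = t_0 + m_0 */
	//		x[2] = c.lo ^ mhi;	/* x_2 = t_1 + m_1 */
	//		x[3] = c.hi;
	//	}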
	// Two-and-a-half problems remain.  The first is that this product
	// is shifted left by one place, which is annoying.  Let's take
	// care of that now.  Once this is done, we'll be properly in
	// GCM's backwards bit-ordering.
	//
	// The half a problem is that the result wants to have its 64-bit
	// halves switched.  Here turns out to be the best place to
	// arrange for this.
	//
	//		 low (q9)			high (q8)
	// ,-------------.-------------.  ,-------------.-------------.
	// | 0  x_0-x_62 | x_63-x_126  |  | x_127-x_190 | x_191-x_254 |
	// `-------------^-------------'  `-------------^-------------'
	//
	// We start by shifting each 64-bit lane right (from GCM's point
	// of view -- physically, left) by one place, which gives us this:
	//
	//		 low (q9)			high (q8)
	// ,-------------.-------------.  ,-------------.-------------.
	// | x_0-x_62  0 | x_64-x_126 0|  |x_128-x_190 0|x_192-x_254 0|
	// `-------------^-------------'  `-------------^-------------'
	//
	// but we've lost a bunch of bits.  We separately shift each lane
	// left by 63 places to give us the bits we lost.
	//
	//		 low (q3)			high (q2)
	// ,-------------.-------------.  ,-------------.-------------.
	// |    0...0    | 0...0  x_63 |  | 0...0 x_127 | 0...0 x_191 |
	// `-------------^-------------'  `-------------^-------------'
	//
	// Since we can address each of these pieces individually, putting
	// them together is relatively straightforward.
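	//
	// (In C, and in terms of the physical lanes, the carry-propagating
	// shift is just the following; the `vorr' instructions below fold
	// in the carries and perform the half-swap at the same time.)
	//
	//	/* Shift a 256-bit value x[0] (low) .. x[3] (high) left by
	//	 * one place, carrying between the 64-bit words. */
	//	static void shl256_1(uint64_t x[4])
	//	{
	//		for (int i = 3; i > 0; i--)
	//			x[i] = (x[i] << 1) | (x[i - 1] >> 63);
	//		x[0] <<= 1;
	//	}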
	vshr.u64 d6, d18, #63		// shifted left; just the carries
	vshl.u64 q9, q9, #1		// shifted right, but dropped carries
	vshr.u64 q2, q8, #63
	vshl.u64 q8, q8, #1
	vorr	d0, d19, d6		// y_0
	vorr	d1, d18, d5		// y_1
	vorr	d2, d17, d4		// y_2
	vmov	d3, d16			// y_3
	// And the other one is that the result needs to be reduced modulo
	// p(t) = t^128 + t^7 + t^2 + t + 1.  Let R = t^128 = t^7 + t^2 +
	// t + 1 in our field.  So far, we've calculated z_0 and z_1 such
	// that z_0 + z_1 R = u v using the identity R = t^128: now we
	// must collapse the two halves of y together using the other
	// identity R = t^7 + t^2 + t + 1.
	//
	// We do this by working on y_2 and y_3 separately, so consider
	// y_i for i = 2 or 3.  Certainly, y_i t^{64i} = y_i R t^{64(i-2)}
	// = (t^7 + t^2 + t + 1) y_i t^{64(i-2)}, but we can't use that
	// directly without breaking up the 64-bit word structure.
	// Instead, we start by considering just y_i t^7 t^{64(i-2)},
	// which again looks tricky.  Now, split y_i = a_i + t^57 b_i,
	// with deg a_i < 57; then
	//
	//	y_i t^7 t^{64(i-2)} = a_i t^7 t^{64(i-2)} + b_i t^{64(i-1)}
	//
	// We can similarly decompose y_i t^2 and y_i t into a pair of
	// 64-bit contributions to the t^{64(i-2)} and t^{64(i-1)} words,
	// but the splits are different.  This is lovely, with one small
	// snag: when we do this to y_3, we end up with a contribution
	// back into the t^128 coefficient word.  But notice that only the
	// low seven bits of this word are affected, so there's no
	// knock-on contribution into the t^64 word.  Therefore, if we
	// handle the high bits of each word together, and then the low
	// bits, everything will be fine.
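	//
	// The whole reduction, as a C sketch in monotonic bit order (so
	// the shifts run the opposite way to the vshl/vshr below), with
	// y = y_0 + y_1 t^64 + y_2 t^128 + y_3 t^192:
	//
	//	/* Reduce y modulo p(t) = t^128 + t^7 + t^2 + t + 1. */
	//	static void gf128_reduce(uint64_t z[2], const uint64_t y[4])
	//	{
	//		uint64_t y2 = y[2], y3 = y[3];
	//
	//		/* The b_i of y_3 land in the low seven bits of the
	//		 * t^128 word, so there's no knock-on carry. */
	//		y2 ^= (y3 >> 63) ^ (y3 >> 62) ^ (y3 >> 57);
	//
	//		/* Fold both high words down by t^7 + t^2 + t + 1. */
	//		z[1] = y[1] ^ y3 ^ (y3 << 1) ^ (y3 << 2) ^ (y3 << 7)
	//			^ (y2 >> 63) ^ (y2 >> 62) ^ (y2 >> 57);
	//		z[0] = y[0] ^ y2 ^ (y2 << 1) ^ (y2 << 2) ^ (y2 << 7);
	//	}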
	// First, shift the high bits down.
	vshl.u64 q2, q1, #63		// the b_i for t
	vshl.u64 q3, q1, #62		// the b_i for t^2
	vshl.u64 q8, q1, #57		// the b_i for t^7
	veor	q2, q2, q3		// add them all together
	veor	q2, q2, q8
	veor	d2, d2, d5		// contribution into low half
	veor	d1, d1, d4		// and high half

	// And then shift the low bits up.
	vshr.u64 q2, q1, #1		// the a_i for t
	vshr.u64 q3, q1, #2		// the a_i for t^2
	vshr.u64 q8, q1, #7		// the a_i for t^7
	veor	q0, q0, q1		// mix in the unit contribution
	veor	q2, q2, q3		// t and t^2 contribs
	veor	q0, q0, q8		// low, unit, and t^7 contribs
	veor	q0, q0, q2		// mix them together and we're done
.endm
.macro	mul64
	// Enter with u and v in d0 and d1 respectively; leave with
	// z = u v in d0.  Clobbers d1--d5.

	// The multiplication is thankfully easy.
	vmull.p64 q0, d0, d1		// u v
	// Shift the product up by one place, and swap the two halves.
	// After this, we're in GCM bizarro-world.
	vshr.u64 d2, d0, #63		// shifted left; just the carries
	vshl.u64 d3, d1, #1		// low half shifted right
	vshl.u64 d1, d0, #1		// high half shifted right
	vorr	d0, d3, d2		// mix in the carries

	// Now we must reduce.  This is essentially the same as the
	// 128-bit case above, but mostly simpler because everything is
	// smaller.  The polynomial this time is
	// p(t) = t^64 + t^4 + t^3 + t + 1.
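	//
	// The same fold as a C sketch, monotonic bit order, with
	// y = y_0 + y_1 t^64:
	//
	//	/* Reduce y modulo p(t) = t^64 + t^4 + t^3 + t + 1. */
	//	static uint64_t gf64_reduce(const uint64_t y[2])
	//	{
	//		uint64_t y1 = y[1];
	//		/* b_i wrap into the t^64 word's low four bits. */
	//		y1 ^= (y1 >> 63) ^ (y1 >> 61) ^ (y1 >> 60);
	//		return y[0] ^ y1 ^ (y1 << 1) ^ (y1 << 3) ^ (y1 << 4);
	//	}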
	// First, shift the high bits down.
	vshl.u64 d2, d1, #63		// b_i for t
	vshl.u64 d3, d1, #61		// b_i for t^3
	vshl.u64 d4, d1, #60		// b_i for t^4
	veor	d2, d2, d3		// add them all together
	veor	d2, d2, d4
	veor	d1, d1, d2		// contribution back into high half

	// And then shift the low bits up.
	vshr.u64 d2, d1, #1		// a_i for t
	vshr.u64 d3, d1, #3		// a_i for t^3
	vshr.u64 d4, d1, #4		// a_i for t^4
	veor	d0, d0, d1		// mix in the unit contribution
	veor	d2, d2, d3		// t and t^3 contribs
	veor	d0, d0, d4		// low, unit, and t^4
	veor	d0, d0, d2		// mix them together and we're done
.endm
.macro	mul96
	// Enter with u and v in the most-significant three words of q0
	// and q1 respectively, and zero in the low words, and zero in
	// q15; leave with z = u v in the high three words of q0, and
	// /junk/ in the low word.  Clobbers ???.

	// This is an inconvenient size.  There's nothing for it but to do
	// four multiplications, as if for the 128-bit case.  It's
	// possible that there's cruft in the top 32 bits of the input
	// registers, so shift both of them up by four bytes before we
	// start.  This will mean that the high 64 bits of the result
	// (from GCM's viewpoint) will be zero.
	// q0 =					// (u_0 + u_1 t^32; u_2)
	// q1 =					// (v_0 + v_1 t^32; v_2)
	vmull.p64 q8, d1, d2		// u_2 (v_0 + v_1 t^32) = e_0
	vmull.p64 q9, d0, d3		// v_2 (u_0 + u_1 t^32) = e_1
	vmull.p64 q3, d1, d3		// u_2 v_2 t^64 = d = (0; d)
	vmull.p64 q0, d0, d2		// u_0 v_0 + (u_0 v_1 + u_1 v_0) t^32
					//   + u_1 v_1 t^64 = f

	// Extract the high and low halves of the 192-bit result.  The
	// answer we want is d t^128 + e t^64 + f, where e = e_0 + e_1.
	// The low 96 bits of the answer will end up in q0, and the high
	// 96 bits will end up in q1; we'll need both of these to have
	// zero in their top 32 bits.
	//
	// Here, bot(x) is the low 96 bits of a 192-bit quantity x,
	// arranged in the low 96 bits of a SIMD register, with junk in
	// the top 32 bits; and top(x) is the high 96 bits, also arranged
	// in the low 96 bits of a register, with /zero/ in the top 32
	// bits.
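	//
	// In C, with u = u_0 + u_2 t^64 and v = v_0 + v_2 t^64 (u_0, v_0
	// being 64 bits and u_2, v_2 being 32), the decomposition into
	// d, e, and f looks like this sketch, reusing the `clmul64' and
	// `poly128' stand-ins from the `mul128' commentary:
	//
	//	/* z = u v = d t^128 + e t^64 + f, three 64-bit words. */
	//	static void mul96_c(uint64_t z[3],
	//			    const uint64_t u[2], const uint64_t v[2])
	//	{
	//		poly128 f = clmul64(u[0], v[0]);
	//		poly128 e0 = clmul64(u[1], v[0]);
	//		poly128 e1 = clmul64(u[0], v[1]);
	//		poly128 d = clmul64(u[1], v[1]);  /* <= 63 bits */
	//		z[0] = f.lo;
	//		z[1] = f.hi ^ e0.lo ^ e1.lo;
	//		z[2] = d.lo ^ e0.hi ^ e1.hi;
	//	}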
	veor	q8, q8, q9		// e_0 + e_1 = e
	vshr128	q1, q3, 32		// top(d t^128)
	vext.8	d19, d16, d17, #4	// top(e t^64)
	vshl.u64 d16, d0, #32		// top(f), sort of
	veor	d3, d3, d19		// q1 = top(d t^128 + e t^64)
	veor	d0, d0, d17		// q0 = bot([d t^128] + e t^64 + f)
	veor	d3, d3, d16		// q1 = top(d t^128 + e t^64 + f)
	// Shift the product right by one place (from GCM's point of
	// view), but, unusually, don't swap the halves, because we need
	// to work on the 32-bit pieces later.  After this, we're in GCM
	// bizarro-world.
	// q0 =					// (?, x_2; x_1, x_0)
	// q1 =					// (0, x_5; x_4, x_3)
	vshr.u64 d4, d0, #63		// carry from d0 to d1
	vshr.u64 d5, d2, #63		// carry from d2 to d3
	vshr.u32 d6, d3, #31		// carry from d3 to d0
	vshl.u64 q0, q0, #1		// shift low half
	vshl.u64 q1, q1, #1		// shift high half
	vorr	d1, d1, d4		// and mix in the carries
	vorr	d3, d3, d5
	vorr	d0, d0, d6
	// Finally, the reduction.  This is essentially the same as the
	// 128-bit case, except that the polynomial is
	// p(t) = t^96 + t^10 + t^9 + t^6 + 1.  The degrees are larger but
	// not enough to cause trouble for the general approach.

	// First, shift the high bits down.
	vshl.u32 q2, q1, #26		// b_i for t^6
	vshl.u32 q3, q1, #23		// b_i for t^9
	vshl.u32 q8, q1, #22		// b_i for t^10
	veor	q2, q2, q3		// add them all together
	veor	q2, q2, q8
	vshl128	q3, q2, 64		// contribution into high half
	vshr128	q2, q2, 32		// and low half
	veor	q1, q1, q3		// mix them in
	veor	q0, q0, q2

	// And then shift the low bits up.
	vshr.u32 q2, q1, #6		// a_i for t^6
	vshr.u32 q3, q1, #9		// a_i for t^9
	vshr.u32 q8, q1, #10		// a_i for t^10
	veor	q0, q0, q1		// mix in the unit contribution
	veor	q2, q2, q3		// mix together t^6 and t^9
	veor	q0, q0, q8		// mix in t^10
	veor	q0, q0, q2		// and the rest
	// And finally swap the two halves.
	vswp	d0, d1
.endm

.macro	mul192
	// Enter with u and v in d0--d2 and d3--d5 respectively; leave
	// with z = u v in d0--d2.  Clobbers q8--q15.

	// Start multiplying and accumulating pieces of product.
	// (d0; d1; d2) =			// (u_0; u_1; u_2)
	// (d3; d4; d5) =			// (v_0; v_1; v_2)
	vmull.p64 q10, d0, d3		// e = u_0 v_0

	vmull.p64 q12, d0, d4		// u_0 v_1
	vmull.p64 q13, d1, d3		// u_1 v_0

	vmull.p64 q9, d0, d5		// u_0 v_2
	vmull.p64 q14, d1, d4		// u_1 v_1
	vmull.p64 q15, d2, d3		// u_2 v_0
	veor	q12, q12, q13		// d = u_0 v_1 + u_1 v_0

	vmull.p64 q11, d1, d5		// u_1 v_2
	vmull.p64 q13, d2, d4		// u_2 v_1
	veor	q9, q9, q14		// u_0 v_2 + u_1 v_1

	vmull.p64 q8, d2, d5		// a = u_2 v_2
	veor	q9, q9, q15		// c = u_0 v_2 + u_1 v_1 + u_2 v_0
	veor	q11, q11, q13		// b = u_1 v_2 + u_2 v_1

	// Piece the product together.
	veor	d17, d17, d22		// q8 =		// (x_5; x_4)
	veor	d18, d18, d23
	veor	d19, d19, d24		// q9 =		// (x_3; x_2)
	veor	d20, d20, d25		// q10 =	// (x_1; x_0)
	// Shift the product right by one place (from GCM's point of
	// view).
	vshr.u64 q11, q8, #63		// carry from d16/d17 to d17/d18
	vshr.u64 q12, q9, #63		// carry from d18/d19 to d19/d20
	vshr.u64 d26, d20, #63		// carry from d20 to d21
	vshl.u64 q8, q8, #1		// shift everything down
	vshl.u64 q9, q9, #1
	vshl.u64 q10, q10, #1
	vorr	d17, d17, d22		// and mix in the carries
	vorr	d18, d18, d23
	vorr	d19, d19, d24
	vorr	d20, d20, d25
	vorr	d21, d21, d26
	// Next, the reduction.  Our polynomial this time is
	// p(t) = t^192 + t^7 + t^2 + t + 1.  Yes, the magic numbers are
	// the same as the 128-bit case.  I don't know why.

	// First, shift the high bits down.
	// q8 =					// (y_5; y_4)
	// q9 =					// (y_3; y_2)
	// q10 =				// (y_1; y_0)
	vshl.u64 q11, q8, #63		// (y_5; y_4) b_i for t
	vshl.u64 d28, d18, #63		// y_3 b_i for t
	vshl.u64 q12, q8, #62		// (y_5; y_4) b_i for t^2
	vshl.u64 d29, d18, #62		// y_3 b_i for t^2
	vshl.u64 q13, q8, #57		// (y_5; y_4) b_i for t^7
	vshl.u64 d30, d18, #57		// y_3 b_i for t^7
	veor	q11, q11, q12		// mix them all together
	veor	d28, d28, d29
	veor	q11, q11, q13
	veor	d28, d28, d30
	veor	q9, q9, q11		// contributions into y_3, y_2
	veor	d20, d20, d28		// and into y_1
	// And finally shift the low bits up.  Also, switch the order of
	// the pieces for output.
	// q8 =					// (y'_5; y'_4)
	// q9 =					// (y'_3; y'_2)
	// q10 =				// (y'_1; y'_0)
	vshr.u64 q11, q8, #1		// (y_5; y_4) a_i for t
	vshr.u64 d28, d18, #1		// y'_3 a_i for t
	vshr.u64 q12, q8, #2		// (y_5; y_4) a_i for t^2
	vshr.u64 d29, d18, #2		// y'_3 a_i for t^2
	vshr.u64 q13, q8, #7		// (y_5; y_4) a_i for t^7
	vshr.u64 d30, d18, #7		// y'_3 a_i for t^7
	veor	q11, q11, q12		// mix the contributions together
	veor	d28, d28, d29
	veor	q11, q11, q13
	veor	d28, d28, d30
	veor	q8, q8, q11		// mix in the unit contributions
	veor	d18, d18, d28
	veor	d2, d19, d16		// and output
	veor	d1, d20, d17
	veor	d0, d21, d18
.endm

.macro	mul256
	// Enter with u and v in q0/q1 and q2/q3 respectively; leave
	// with z = u v in q0/q1.  Clobbers q8--q15.

	// Now it's starting to look worthwhile to do Karatsuba.  Suppose
	// u = u_0 + u_1 B and v = v_0 + v_1 B.  Then
	//
	//	u v = (u_0 v_0) + (u_0 v_1 + u_1 v_0) B + (u_1 v_1) B^2
	//
	// Call the coefficients of B^i a, b, and c, respectively, and
	// let r = u_0 + u_1 and s = v_0 + v_1.  Then observe that
	//
	//	q = r s = (u_0 + u_1) (v_0 + v_1)
	//	    = (u_0 v_0) + (u_1 v_1) + (u_0 v_1 + u_1 v_0)
	//	    = a + c + b
	//
	// The first two terms we've already calculated; the last is the
	// remaining one we want.  We'll set B = t^128.  We know how to do
	// 128-bit multiplications already, and Karatsuba is too annoying
	// there, so there'll be 12 multiplications altogether, rather
	// than the 16 we'd have if we did this the naïve way.
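	//
	// The identity as a C sketch, one level down (B = t^64, so one
	// 128-bit product from three 64-bit multiplications; reusing the
	// `clmul64' and `poly128' stand-ins from the `mul128'
	// commentary):
	//
	//	static void mul128_kara(uint64_t x[4],
	//			const uint64_t u[2], const uint64_t v[2])
	//	{
	//		poly128 a = clmul64(u[0], v[0]);
	//		poly128 c = clmul64(u[1], v[1]);
	//		poly128 q = clmul64(u[0] ^ u[1], v[0] ^ v[1]);
	//		uint64_t blo = q.lo ^ a.lo ^ c.lo; /* b = q + a + c */
	//		uint64_t bhi = q.hi ^ a.hi ^ c.hi;
	//		x[0] = a.lo;
	//		x[1] = a.hi ^ blo;
	//		x[2] = c.lo ^ bhi;
	//		x[3] = c.hi;
	//	}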
	// q0 =					// u_0 = (u_00; u_01)
	// q1 =					// u_1 = (u_10; u_11)
	// q2 =					// v_0 = (v_00; v_01)
	// q3 =					// v_1 = (v_10; v_11)

	veor	q8, q0, q1		// u_* = (u_00 + u_10; u_01 + u_11)
	veor	q9, q2, q3		// v_* = (v_00 + v_10; v_01 + v_11)

	// Start by building the cross product, q = u_* v_*.
	vmull.p64 q14, d16, d19		// u_*0 v_*1
	vmull.p64 q15, d17, d18		// u_*1 v_*0
	vmull.p64 q12, d17, d19		// u_*1 v_*1
	vmull.p64 q13, d16, d18		// u_*0 v_*0
	veor	q14, q14, q15		// u_*0 v_*1 + u_*1 v_*0
	veor	d25, d25, d28		// q12 =	// q_1
	veor	d26, d26, d29		// q13 =	// q_0
	// Next, work on the low half, a = u_0 v_0.
	vmull.p64 q14, d0, d5		// u_00 v_01
	vmull.p64 q15, d1, d4		// u_01 v_00
	vmull.p64 q10, d1, d5		// u_01 v_01
	vmull.p64 q11, d0, d4		// u_00 v_00
	veor	q14, q14, q15		// u_00 v_01 + u_01 v_00
	veor	d21, d21, d28		// q10 =	// a_1
	veor	d22, d22, d29		// q11 =	// a_0

	// Mix the pieces we have so far.
	veor	q12, q12, q10
	veor	q13, q13, q11

	// Finally, the high half, c = u_1 v_1.
	vmull.p64 q14, d2, d7		// u_10 v_11
	vmull.p64 q15, d3, d6		// u_11 v_10
	vmull.p64 q8, d3, d7		// u_11 v_11
	vmull.p64 q9, d2, d6		// u_10 v_10
	veor	q14, q14, q15		// u_10 v_11 + u_11 v_10
	veor	d17, d17, d28		// q8 =		// c_1
	veor	d18, d18, d29		// q9 =		// c_0

	// Finish mixing the product together.
	veor	q12, q12, q8		// q12 =	// b_1
	veor	q13, q13, q9		// q13 =	// b_0
	veor	q9, q9, q12
	veor	q10, q10, q13
	// Shift the product right by one place (from GCM's point of
	// view).
	vshr.u64 q0, q8, #63		// carry from d16/d17 to d17/d18
	vshr.u64 q1, q9, #63		// carry from d18/d19 to d19/d20
	vshr.u64 q2, q10, #63		// carry from d20/d21 to d21/d22
	vshr.u64 d6, d22, #63		// carry from d22 to d23
	vshl.u64 q8, q8, #1		// shift everything down
	vshl.u64 q9, q9, #1
	vshl.u64 q10, q10, #1
	vshl.u64 q11, q11, #1
	vorr	d17, d17, d0		// and mix in the carries
	vorr	d18, d18, d1
	vorr	d19, d19, d2
	vorr	d20, d20, d3
	vorr	d21, d21, d4
	vorr	d22, d22, d5
	vorr	d23, d23, d6
	// Now we must reduce.  This is essentially the same as the
	// 192-bit case above, but more complicated because everything is
	// bigger.  The polynomial this time is
	// p(t) = t^256 + t^10 + t^5 + t^2 + 1.

	// First, shift the high bits down.
	// q8 =					// (y_7; y_6)
	// q9 =					// (y_5; y_4)
	// q10 =				// (y_3; y_2)
	// q11 =				// (y_1; y_0)
	vshl.u64 q0, q8, #62		// (y_7; y_6) b_i for t^2
	vshl.u64 q12, q9, #62		// (y_5; y_4) b_i for t^2
	vshl.u64 q1, q8, #59		// (y_7; y_6) b_i for t^5
	vshl.u64 q13, q9, #59		// (y_5; y_4) b_i for t^5
	vshl.u64 q2, q8, #54		// (y_7; y_6) b_i for t^10
	vshl.u64 q14, q9, #54		// (y_5; y_4) b_i for t^10
	veor	q0, q0, q1		// mix the contributions together
	veor	q12, q12, q13
	veor	q0, q0, q2
	veor	q12, q12, q14
	veor	d19, d19, d0		// and combine into the lower pieces
	veor	d20, d20, d1
	veor	d21, d21, d24
	veor	d22, d22, d25
	// And then shift the low bits up.  Also, switch the order of the
	// pieces for output.
	// q8 =					// (y'_7; y'_6)
	// q9 =					// (y'_5; y'_4)
	// q10 =				// (y'_3; y'_2)
	// q11 =				// (y'_1; y'_0)
	vshr.u64 q0, q8, #2		// (y_7; y_6) a_i for t^2
	vshr.u64 q12, q9, #2		// (y_5; y'_4) a_i for t^2
	vshr.u64 q1, q8, #5		// (y_7; y_6) a_i for t^5
	vshr.u64 q13, q9, #5		// (y_5; y'_4) a_i for t^5
	vshr.u64 q2, q8, #10		// (y_7; y_6) a_i for t^10
	vshr.u64 q14, q9, #10		// (y_5; y'_4) a_i for t^10

	veor	q8, q8, q0		// mix the contributions together
	veor	q9, q9, q12
	veor	q8, q8, q1
	veor	q9, q9, q13
	veor	q8, q8, q2
	veor	q9, q9, q14
	veor	d3, d20, d16		// and output
	veor	d2, d21, d17
	veor	d1, d22, d18
	veor	d0, d23, d19
.endm
///--------------------------------------------------------------------------
/// Main code.

	// There are a number of representations of field elements in this
	// code and it can be confusing.
	//
	//   * The `external format' consists of a sequence of contiguous
	//     bytes in memory called a `block'.  The GCM spec explains
	//     how to interpret this block as an element of a finite
	//     field.  As discussed extensively, this representation is
	//     very annoying for a number of reasons.  On the other hand,
	//     this code never actually deals with it directly.
	//
	//   * The `register format' consists of one or more NEON
	//     registers, depending on the block size.  The bytes in each
	//     64-bit lane of these registers are in reverse order,
	//     compared to the external format.
	//
	//   * The `words' format consists of a sequence of bytes, as in
	//     the `external format', but, according to the blockcipher in
	//     use, the bytes within each 32-bit word may be reversed
	//     (`big-endian') or not (`little-endian').  Accordingly,
	//     there are separate entry points for each variant,
	//     identified with `b' or `l'.
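	//
	// For concreteness, converting a 128-bit block from external
	// format to register format means byte-reversing each 8-byte
	// piece, as in this C sketch (the entry points below get the
	// same effect with vrev-family permutations, according to the
	// words variant in use):
	//
	//	static void to_register_format(uint64_t lane[2],
	//				       const uint8_t block[16])
	//	{
	//		for (int i = 0; i < 2; i++) {
	//			uint64_t x = 0;
	//			for (int j = 0; j < 8; j++)
	//				x = (x << 8) | block[8*i + j];
	//			lane[i] = x;  /* first byte -> top of lane */
	//		}
	//	}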
FUNC(gcm_mulk_128b_arm_crypto)
	// On entry, r0 points to a 128-bit field element A in big-endian
	// words format; r1 points to a field-element K in register
	// format.  On exit, A is updated with the product A K.

FUNC(gcm_mulk_128l_arm_crypto)
	// On entry, r0 points to a 128-bit field element A in
	// little-endian words format; r1 points to a field-element K in
	// register format.  On exit, A is updated with the product A K.
FUNC(gcm_mulk_64b_arm_crypto)
	// On entry, r0 points to a 64-bit field element A in big-endian
	// words format; r1 points to a field-element K in register
	// format.  On exit, A is updated with the product A K.

FUNC(gcm_mulk_64l_arm_crypto)
	// On entry, r0 points to a 64-bit field element A in
	// little-endian words format; r1 points to a field-element K in
	// register format.  On exit, A is updated with the product A K.
FUNC(gcm_mulk_96b_arm_crypto)
	// On entry, r0 points to a 96-bit field element A in big-endian
	// words format; r1 points to a field-element K in register
	// format.  On exit, A is updated with the product A K.
FUNC(gcm_mulk_96l_arm_crypto)
	// On entry, r0 points to a 96-bit field element A in
	// little-endian words format; r1 points to a field-element K in
	// register format.  On exit, A is updated with the product A K.
FUNC(gcm_mulk_192b_arm_crypto)
	// On entry, r0 points to a 192-bit field element A in big-endian
	// words format; r1 points to a field-element K in register
	// format.  On exit, A is updated with the product A K.

FUNC(gcm_mulk_192l_arm_crypto)
	// On entry, r0 points to a 192-bit field element A in
	// little-endian words format; r1 points to a field-element K in
	// register format.  On exit, A is updated with the product A K.
FUNC(gcm_mulk_256b_arm_crypto)
	// On entry, r0 points to a 256-bit field element A in big-endian
	// words format; r1 points to a field-element K in register
	// format.  On exit, A is updated with the product A K.

	vld1.8	{q0, q1}, [r0]
	vld1.8	{q2, q3}, [r1]
	vrev64.32 q0, q0
	vrev64.32 q1, q1
	mul256
	vrev64.32 q0, q0
	vrev64.32 q1, q1
	vst1.8	{q0, q1}, [r0]
	RET
ENDFUNC
FUNC(gcm_mulk_256l_arm_crypto)
	// On entry, r0 points to a 256-bit field element A in
	// little-endian words format; r1 points to a field-element K in
	// register format.  On exit, A is updated with the product A K.

	vld1.8	{q0, q1}, [r0]
	vld1.8	{q2, q3}, [r1]
	vrev64.8 q0, q0
	vrev64.8 q1, q1
	mul256
	vrev64.8 q0, q0
	vrev64.8 q1, q1
	vst1.8	{q0, q1}, [r0]
	RET
ENDFUNC
///----- That's all, folks --------------------------------------------------