Commit | Line | Data |
---|---|---|
444083ae MW |
1 | /// -*- mode: asm; asm-comment-char: ?/; comment-start: "// " -*- |
2 | /// | |
3 | /// Large SIMD-based multiplications | |
4 | /// | |
5 | /// (c) 2016 Straylight/Edgeware | |
6 | ||
7 | ///----- Licensing notice --------------------------------------------------- | |
8 | /// | |
9 | /// This file is part of Catacomb. | |
10 | /// | |
11 | /// Catacomb is free software; you can redistribute it and/or modify | |
12 | /// it under the terms of the GNU Library General Public License as | |
13 | /// published by the Free Software Foundation; either version 2 of the | |
14 | /// License, or (at your option) any later version. | |
15 | /// | |
16 | /// Catacomb is distributed in the hope that it will be useful, | |
17 | /// but WITHOUT ANY WARRANTY; without even the implied warranty of | |
18 | /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
19 | /// GNU Library General Public License for more details. | |
20 | /// | |
21 | /// You should have received a copy of the GNU Library General Public | |
22 | /// License along with Catacomb; if not, write to the Free | |
23 | /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, | |
24 | /// MA 02111-1307, USA. | |
25 | ||
26 | ///-------------------------------------------------------------------------- | |
df07f2c0 | 27 | /// Preliminaries. |
444083ae MW |
28 | |
29 | #include "config.h" | |
30 | #include "asm-common.h" | |
31 | ||
444083ae | 32 | .arch pentium4 |
df07f2c0 | 33 | |
444083ae MW |
34 | .text |
35 | ||
36 | ///-------------------------------------------------------------------------- | |
37 | /// Theory. | |
38 | /// | |
39 | /// We define a number of primitive fixed-size multipliers from which we can | |
40 | /// construct more general variable-length multipliers. | |
41 | /// | |
42 | /// The basic trick is the same throughout. In an operand-scanning | |
9599917f MW |
43 | /// multiplication, the inner multiplication loop multiplies a multiple- |
44 | /// precision operand by a single-precision factor, and adds the product, | |
45 | /// appropriately shifted, to the result. A `finely integrated operand | |
46 | /// scanning' implementation of Montgomery multiplication also adds the | |
47 | /// product of a single-precision `Montgomery factor' and the modulus, | |
444083ae MW |
48 | /// calculated in the same pass. The more common `coarsely integrated |
49 | /// operand scanning' alternates main multiplication and Montgomery passes, | |
50 | /// which requires additional carry propagation. | |
51 | /// | |
52 | /// Throughout both plain-multiplication and Montgomery stages, then, one of | |
53 | /// the factors remains constant throughout the operation, so we can afford | |
54 | /// to take a little time to preprocess it. The transformation we perform is | |
55 | /// as follows. Let b = 2^16, and B = b^2 = 2^32. Suppose we're given a | |
56 | /// 128-bit factor v = v_0 + v_1 B + v_2 B^2 + v_3 B^3. Split each v_i into | |
57 | /// two sixteen-bit pieces, so v_i = v'_i + v''_i b. These eight 16-bit | |
58 | /// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE | |
59 | /// operands, as follows. | |
60 | /// | |
981a9e5d MW |
61 | /// Offset 12 8 4 0 |
62 | /// 0 v''_1 v''_0 v'_1 v'_0 | |
63 | /// 16 v''_3 v''_2 v'_3 v'_2 | |
444083ae | 64 | /// |
2aaa07f8 | 65 | /// A `pmuludq' instruction ignores the odd positions in its operands; thus, |
444083ae MW |
66 | /// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting |
67 | /// this vector right by 4 bytes brings v'_1 and v''_1 into position. We can | |
68 | /// multiply such a vector by a full 32-bit scalar to produce two 48-bit | |
69 | /// results in 64-bit fields. The sixteen bits of headroom allows us to add | |
70 | /// many products together before we must deal with carrying; it also allows | |
71 | /// for some calculations to be performed on the above expanded form. | |
72 | /// | |
9599917f MW |
73 | /// We maintain four `carry' registers XMM4--XMM7 accumulating intermediate |
74 | /// results. The registers' precise roles rotate during the computation; we | |
75 | /// name them `c0', `c1', `c2', and `c3'. Each carry register holds two | |
76 | /// 64-bit halves: the register c0, for example, holds c'_0 (low half) and | |
77 | /// c''_0 (high half), and represents the value c_0 = c'_0 + c''_0 b; the | |
78 | /// carry registers collectively represent the value c_0 + c_1 B + c_2 B^2 + | |
79 | /// c_3 B^3. The `pmuludq' instruction acting on a scalar operand (broadcast | |
80 | /// across all lanes of its vector) and an operand in the expanded form above | |
81 | /// produces a result which can be added directly to the appropriate carry | |
82 | /// register. Following a pass of four multiplications, we perform some | |
83 | /// limited carry propagation: let t = c''_0 mod B, and let d = c'_0 + t b; | |
84 | /// then we output z = d mod B, add (floor(d/B), floor(c''_0/B)) to c1, and | |
85 | /// cycle the carry registers around, so that c1 becomes c0, and the old | |
86 | /// (implicitly) zeroed c0 becomes c3. | |
87 | /// | |
444083ae | 88 | /// On 32-bit x86, we are register starved: the expanded operands are kept in |
9599917f MW |
89 | /// memory, typically in warm L1 cache. The packed operands are read from |
90 | /// memory into working registers XMM0--XMM3 and processed immediately. | |
91 | /// The following conventional argument names and locations are used | |
92 | /// throughout. | |
93 | /// | |
94 | /// Arg Format Location Notes | |
95 | /// | |
96 | /// U packed [EAX] | |
97 | /// X packed [EBX] In Montgomery multiplication, X = N | |
98 | /// V expanded [ECX] | |
99 | /// Y expanded [EDX] In Montgomery multiplication, Y = (A + U V) M | |
100 | /// M expanded [ESI] -N^{-1} (mod B^4) | |
101 | /// N Modulus, for Montgomery multiplication | |
102 | /// A packed [EDI] Destination/accumulator | |
103 | /// C carry XMM4--XMM7 | |
104 | /// | |
105 | /// The calculation is some variant of | |
106 | /// | |
107 | /// A' + C' B^4 <- U V + X Y + A + C | |
108 | /// | |
109 | /// The low-level functions fit into a fairly traditional (finely-integrated) | |
110 | /// operand scanning loop over operand pairs (U, X) (indexed by j) and (V, Y) | |
111 | /// (indexed by i). | |
112 | /// | |
113 | /// The variants are as follows. | |
114 | /// | |
115 | /// Function Variant Use i j | |
116 | /// | |
117 | /// mmul4 A = C = 0 Montgomery 0 0 | |
118 | /// dmul4 A = 0 Montgomery 0 + | |
119 | /// mmla4 C = 0 Montgomery + 0 | |
120 | /// dmla4 exactly as shown Montgomery + + | |
121 | /// mont4 U = V = C = 0 Montgomery any 0 | |
122 | /// | |
123 | /// mul4zc U = V = A = C = 0 Plain 0 0 | |
124 | /// mul4 U = V = A = 0 Plain 0 + | |
125 | /// mla4zc U = V = C = 0 Plain + 0 | |
126 | /// mla4 U = V = 0 Plain + + | |
127 | /// | |
128 | /// The `mmul4' and `mmla4' functions are also responsible for calculating | |
129 | /// the Montgomery reduction factor Y = (A + U V) M used by the rest of the | |
130 | /// inner loop. | |
444083ae MW |
131 | |
132 | ///-------------------------------------------------------------------------- | |
133 | /// Macro definitions. | |
134 | ||
71ac8e5e | 135 | .macro mulcore r, s, d0, d1=nil, d2=nil, d3=nil |
444083ae MW |
136 | // Load a word r_i from R, multiply by the expanded operand [S], and |
137 | // leave the pieces of the product in registers D0, D1, D2, D3. | |
981a9e5d | 138 | movd \d0, \r // (0, 0; 0, r_i) |
444083ae | 139 | .ifnes "\d1", "nil" |
981a9e5d | 140 | movdqa \d1, [\s] // (s''_1, s''_0; s'_1, s'_0) |
444083ae MW |
141 | .endif |
142 | .ifnes "\d3", "nil" | |
981a9e5d | 143 | movdqa \d3, [\s + 16] // (s''_3, s''_2; s'_3, s'_2) |
444083ae | 144 | .endif |
981a9e5d | 145 | pshufd \d0, \d0, SHUF(3, 0, 3, 0) // (?, r_i; ?, r_i) |
444083ae | 146 | .ifnes "\d1", "nil" |
981a9e5d | 147 | psrldq \d1, 4 // (0, s''_1; s''_0, s'_1) |
444083ae MW |
148 | .endif |
149 | .ifnes "\d2", "nil" | |
150 | .ifnes "\d3", "nil" | |
8e91d6e5 | 151 | movdqa \d2, \d3 // another copy of (s''_3, s''_2; s'_3, s'_2) |
444083ae | 152 | .else |
981a9e5d | 153 | movdqa \d2, \d0 // another copy of (?, r_i; ?, r_i) |
444083ae MW |
154 | .endif |
155 | .endif | |
156 | .ifnes "\d3", "nil" | |
981a9e5d | 157 | psrldq \d3, 4 // (0, s''_3; s''_2, s'_3) |
444083ae MW |
158 | .endif |
159 | .ifnes "\d1", "nil" | |
981a9e5d | 160 | pmuludq \d1, \d0 // (r_i s''_1; r_i s'_1) |
444083ae MW |
161 | .endif |
162 | .ifnes "\d3", "nil" | |
981a9e5d | 163 | pmuludq \d3, \d0 // (r_i s''_3; r_i s'_3) |
444083ae MW |
164 | .endif |
165 | .ifnes "\d2", "nil" | |
166 | .ifnes "\d3", "nil" | |
981a9e5d | 167 | pmuludq \d2, \d0 // (r_i s''_2; r_i s'_2) |
444083ae | 168 | .else |
2aaa07f8 | 169 | pmuludq \d2, [\s + 16] |
444083ae MW |
170 | .endif |
171 | .endif | |
981a9e5d | 172 | pmuludq \d0, [\s] // (r_i s''_0; r_i s'_0) |
444083ae MW |
173 | .endm |
174 | ||
71ac8e5e MW |
175 | .macro accum c0, c1=nil, c2=nil, c3=nil |
176 | // Accumulate 64-bit pieces in XMM0--XMM3 into the corresponding | |
177 | // carry registers C0--C3. Any or all of C1--C3 may be `nil' to skip | |
178 | // updating that register. | |
444083ae MW |
179 | paddq \c0, xmm0 |
180 | .ifnes "\c1", "nil" | |
181 | paddq \c1, xmm1 | |
182 | .endif | |
183 | .ifnes "\c2", "nil" | |
184 | paddq \c2, xmm2 | |
185 | .endif | |
186 | .ifnes "\c3", "nil" | |
187 | paddq \c3, xmm3 | |
188 | .endif | |
189 | .endm | |
190 | ||
71ac8e5e | 191 | .macro mulacc r, s, c0, c1, c2, c3, z3p=nil |
444083ae MW |
192 | // Load a word r_i from R, multiply by the expanded operand [S], |
193 | // and accumulate in carry registers C0, C1, C2, C3. If Z3P is `t' | |
194 | // then C3 notionally contains zero, but needs clearing; in practice, | |
195 | // we store the product directly rather than attempting to add. On | |
196 | // completion, XMM0, XMM1, and XMM2 are clobbered, as is XMM3 if Z3P | |
197 | // is not `t'. | |
198 | .ifeqs "\z3p", "t" | |
199 | mulcore \r, \s, xmm0, xmm1, xmm2, \c3 | |
71ac8e5e | 200 | accum \c0, \c1, \c2 |
444083ae MW |
201 | .else |
202 | mulcore \r, \s, xmm0, xmm1, xmm2, xmm3 | |
203 | accum \c0, \c1, \c2, \c3 | |
204 | .endif | |
205 | .endm | |
206 | ||
71ac8e5e | 207 | .macro propout d, c, cc=nil |
444083ae MW |
208 | // Calculate an output word from C, and store it in D; propagate |
209 | // carries out from C to CC in preparation for a rotation of the | |
210 | // carry registers. On completion, XMM3 is clobbered. If CC is | |
211 | // `nil', then the contribution which would have been added to it is | |
212 | // left in C. | |
981a9e5d MW |
213 | pshufd xmm3, \c, SHUF(2, 3, 3, 3) // (t = c'' mod B, ?; ?, ?) |
214 | psrldq xmm3, 12 // (0, 0; 0, t) = (0; t) | |
215 | pslldq xmm3, 2 // (0; t b) | |
216 | paddq \c, xmm3 // (c''; c' + t b) | |
444083ae MW |
217 | movd \d, \c |
218 | psrlq \c, 32 // floor(c/B) | |
219 | .ifnes "\cc", "nil" | |
220 | paddq \cc, \c // propagate up | |
221 | .endif | |
222 | .endm | |
223 | ||
224 | .macro endprop d, c, t | |
225 | // On entry, C contains a carry register. On exit, the low 32 bits | |
226 | // of the value represented in C are written to D, and the remaining | |
227 | // bits are left at the bottom of T. | |
228 | movdqa \t, \c | |
981a9e5d MW |
229 | psllq \t, 16 // (c'' b; ?) |
230 | pslldq \c, 8 // (c'; 0) | |
231 | paddq \t, \c // (c' + c'' b; ?) | |
232 | psrldq \t, 8 // (0; c' + c'' b) = (0; c) | |
444083ae | 233 | movd \d, \t |
8e91d6e5 | 234 | psrldq \t, 4 // (0; floor(c/B)) |
444083ae MW |
235 | .endm |
236 | ||
71ac8e5e | 237 | .macro expand z, a, b, c=nil, d=nil |
444083ae MW |
238 | // On entry, A and C hold packed 128-bit values, and Z is zero. On |
239 | // exit, A:B and C:D together hold the same values in expanded | |
240 | // form. If C is `nil', then only expand A to A:B. | |
981a9e5d | 241 | movdqa \b, \a // (a_3, a_2; a_1, a_0) |
444083ae | 242 | .ifnes "\c", "nil" |
981a9e5d | 243 | movdqa \d, \c // (c_3, c_2; c_1, c_0) |
444083ae | 244 | .endif |
981a9e5d MW |
245 | punpcklwd \a, \z // (a''_1, a'_1; a''_0, a'_0) |
246 | punpckhwd \b, \z // (a''_3, a'_3; a''_2, a'_2) | |
444083ae | 247 | .ifnes "\c", "nil" |
981a9e5d MW |
248 | punpcklwd \c, \z // (c''_1, c'_1; c''_0, c'_0) |
249 | punpckhwd \d, \z // (c''_3, c'_3; c''_2, c'_2) | |
444083ae | 250 | .endif |
981a9e5d MW |
251 | pshufd \a, \a, SHUF(3, 1, 2, 0) // (a''_1, a''_0; a'_1, a'_0) |
252 | pshufd \b, \b, SHUF(3, 1, 2, 0) // (a''_3, a''_2; a'_3, a'_2) | |
444083ae | 253 | .ifnes "\c", "nil" |
981a9e5d MW |
254 | pshufd \c, \c, SHUF(3, 1, 2, 0) // (c''_1, c''_0; c'_1, c'_0) |
255 | pshufd \d, \d, SHUF(3, 1, 2, 0) // (c''_3, c''_2; c'_3, c'_2) | |
444083ae MW |
256 | .endif |
257 | .endm | |
258 | ||
71ac8e5e | 259 | .macro squash c0, c1, c2, c3, t, u, lo, hi=nil |
444083ae | 260 | // On entry, C0, C1, C2, C3 are carry registers representing a value |
4b30aca5 | 261 | // Y. On exit, LO holds the low 128 bits of the carry value; C1, C2, |
444083ae | 262 | // C3, T, and U are clobbered; and the high bits of Y are stored in |
4b30aca5 | 263 | // HI, if this is not `nil'. |
444083ae MW |
264 | |
265 | // The first step is to eliminate the `double-prime' pieces -- i.e., | |
266 | // the ones offset by 16 bytes from a 32-bit boundary -- by carrying | |
267 | // them into the 32-bit-aligned pieces above and below. But before | |
268 | // we can do that, we must gather them together. | |
269 | movdqa \t, \c0 | |
270 | movdqa \u, \c1 | |
981a9e5d MW |
271 | punpcklqdq \t, \c2 // (y'_2; y'_0) |
272 | punpckhqdq \c0, \c2 // (y''_2; y''_0) | |
273 | punpcklqdq \u, \c3 // (y'_3; y'_1) | |
274 | punpckhqdq \c1, \c3 // (y''_3; y''_1) | |
444083ae MW |
275 | |
276 | // Now split the double-prime pieces. The high (up to) 48 bits will | |
277 | // go up; the low 16 bits go down. | |
278 | movdqa \c2, \c0 | |
279 | movdqa \c3, \c1 | |
280 | psllq \c2, 48 | |
281 | psllq \c3, 48 | |
981a9e5d MW |
282 | psrlq \c0, 16 // high parts of (y''_2; y''_0) |
283 | psrlq \c1, 16 // high parts of (y''_3; y''_1) | |
284 | psrlq \c2, 32 // low parts of (y''_2; y''_0) | |
285 | psrlq \c3, 32 // low parts of (y''_3; y''_1) | |
4b30aca5 MW |
286 | .ifnes "\hi", "nil" |
287 | movdqa \hi, \c1 | |
444083ae | 288 | .endif |
981a9e5d | 289 | pslldq \c1, 8 // high part of (y''_1; 0) |
444083ae MW |
290 | |
291 | paddq \t, \c2 // propagate down | |
292 | paddq \u, \c3 | |
981a9e5d MW |
293 | paddq \t, \c1 // and up: (y_2; y_0) |
294 | paddq \u, \c0 // (y_3; y_1) | |
4b30aca5 | 295 | .ifnes "\hi", "nil" |
981a9e5d | 296 | psrldq \hi, 8 // high part of (0; y''_3) |
444083ae MW |
297 | .endif |
298 | ||
299 | // Finally extract the answer. This complicated dance is better than | |
300 | // storing to memory and loading, because the piecemeal stores | |
301 | // inhibit store forwarding. | |
981a9e5d MW |
302 | movdqa \c3, \t // (?; y_0) |
303 | movdqa \lo, \t // (?, ?; ?, y^*_0) | |
304 | psrldq \t, 8 // (0; y_2) | |
8e91d6e5 MW |
305 | psrlq \c3, 32 // (?; floor(y_0/B)) |
306 | paddq \c3, \u // (?; y_1 + floor(y_0/B)) | |
981a9e5d MW |
307 | movdqa \c1, \c3 // (?, ?; ?, y^*_1) |
308 | psrldq \u, 8 // (0; y_3) | |
8e91d6e5 MW |
309 | psrlq \c3, 32 // (?; floor((y_1 B + y_0)/B^2)) |
310 | paddq \c3, \t // (?; y_2 + floor((y_1 B + y_0)/B^2)) | |
981a9e5d | 311 | punpckldq \lo, \c3 // (?, ?; y^*_2, y^*_0) |
8e91d6e5 MW |
312 | psrlq \c3, 32 // (?; floor((y_2 B^2 + y_1 B + y_0)/B^3)) |
313 | paddq \c3, \u // (?; y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3)) | |
4b30aca5 | 314 | .ifnes "\hi", "nil" |
d2269179 | 315 | movdqa \t, \c3 |
444083ae MW |
316 | pxor \u, \u |
317 | .endif | |
981a9e5d | 318 | punpckldq \c1, \c3 // (?, ?; y^*_3, y^*_1) |
4b30aca5 | 319 | .ifnes "\hi", "nil" |
444083ae | 320 | psrlq \t, 32 // very high bits of y |
4b30aca5 MW |
321 | paddq \hi, \t |
322 | punpcklqdq \hi, \u // carry up | |
444083ae | 323 | .endif |
4b30aca5 | 324 | punpckldq \lo, \c1 // y mod B^4 |
444083ae MW |
325 | .endm |
326 | ||
327 | .macro carryadd | |
328 | // On entry, EDI points to a packed addend A, and XMM4, XMM5, XMM6 | |
329 | // hold the incoming carry registers c0, c1, and c2 representing a | |
330 | // carry-in C. | |
331 | // | |
332 | // On exit, the carry registers, including XMM7, are updated to hold | |
333 | // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other | |
334 | // registers are preserved. | |
981a9e5d MW |
335 | movd xmm0, [edi + 0] // (0; a_0) |
336 | movd xmm1, [edi + 4] // (0; a_1) | |
337 | movd xmm2, [edi + 8] // (0; a_2) | |
338 | movd xmm7, [edi + 12] // (0; a_3) | |
339 | ||
340 | paddq xmm4, xmm0 // (c''_0; c'_0 + a_0) | |
341 | paddq xmm5, xmm1 // (c''_1; c'_1 + a_1) | |
342 | paddq xmm6, xmm2 // (c''_2 + a_3 b; c'_2 + a_2) | |
444083ae MW |
343 | .endm |
344 | ||
345 | ///-------------------------------------------------------------------------- | |
346 | /// Primitive multipliers and related utilities. | |
347 | ||
1a517bb3 | 348 | INTFUNC(carryprop) |
444083ae MW |
349 | // On entry, XMM4, XMM5, and XMM6 hold a 144-bit carry in an expanded |
350 | // form. Store the low 128 bits of the represented carry to [EDI] as | |
351 | // a packed 128-bit value, and leave the remaining 16 bits in the low | |
352 | // 32 bits of XMM4. On exit, XMM3, XMM5 and XMM6 are clobbered. | |
0923a413 MW |
353 | endprologue |
354 | ||
444083ae MW |
355 | propout [edi + 0], xmm4, xmm5 |
356 | propout [edi + 4], xmm5, xmm6 | |
357 | propout [edi + 8], xmm6, nil | |
358 | endprop [edi + 12], xmm6, xmm4 | |
359 | ret | |
1a517bb3 MW |
360 | ENDFUNC |
361 | ||
362 | INTFUNC(dmul4) | |
444083ae MW |
363 | // On entry, EDI points to the destination buffer; EAX and EBX point |
364 | // to the packed operands U and X; ECX and EDX point to the expanded | |
365 | // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry | |
366 | // registers c0, c1, and c2; c3 is assumed to be zero. | |
367 | // | |
368 | // On exit, we write the low 128 bits of the sum C + U V + X Y to | |
369 | // [EDI], and update the carry registers with the carry out. The | |
370 | // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the | |
371 | // general-purpose registers are preserved. | |
0923a413 MW |
372 | endprologue |
373 | ||
444083ae | 374 | mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7, t |
71ac8e5e | 375 | mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7 |
444083ae MW |
376 | propout [edi + 0], xmm4, xmm5 |
377 | ||
378 | mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t | |
71ac8e5e | 379 | mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4 |
444083ae MW |
380 | propout [edi + 4], xmm5, xmm6 |
381 | ||
382 | mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t | |
71ac8e5e | 383 | mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5 |
444083ae MW |
384 | propout [edi + 8], xmm6, xmm7 |
385 | ||
386 | mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t | |
71ac8e5e | 387 | mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6 |
444083ae MW |
388 | propout [edi + 12], xmm7, xmm4 |
389 | ||
390 | ret | |
1a517bb3 MW |
391 | ENDFUNC |
392 | ||
393 | INTFUNC(dmla4) | |
444083ae MW |
394 | // On entry, EDI points to the destination buffer, which also |
395 | // contains an addend A to accumulate; EAX and EBX point to the | |
396 | // packed operands U and X; ECX and EDX point to the expanded | |
397 | // operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry | |
398 | // registers c0, c1, and c2 representing a carry-in C; c3 is assumed | |
399 | // to be zero. | |
400 | // | |
401 | // On exit, we write the low 128 bits of the sum A + C + U V + X Y to | |
402 | // [EDI], and update the carry registers with the carry out. The | |
403 | // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the | |
404 | // general-purpose registers are preserved. | |
0923a413 MW |
405 | endprologue |
406 | ||
444083ae MW |
407 | carryadd |
408 | ||
71ac8e5e MW |
409 | mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7 |
410 | mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7 | |
444083ae MW |
411 | propout [edi + 0], xmm4, xmm5 |
412 | ||
413 | mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t | |
71ac8e5e | 414 | mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4 |
444083ae MW |
415 | propout [edi + 4], xmm5, xmm6 |
416 | ||
417 | mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t | |
71ac8e5e | 418 | mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5 |
444083ae MW |
419 | propout [edi + 8], xmm6, xmm7 |
420 | ||
421 | mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t | |
71ac8e5e | 422 | mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6 |
444083ae MW |
423 | propout [edi + 12], xmm7, xmm4 |
424 | ||
425 | ret | |
1a517bb3 MW |
426 | ENDFUNC |
427 | ||
428 | INTFUNC(mul4zc) | |
444083ae MW |
429 | // On entry, EDI points to the destination buffer; EBX points to a |
430 | // packed operand X; and EDX points to an expanded operand Y. | |
431 | // | |
432 | // On exit, we write the low 128 bits of the product X Y to [EDI], | |
433 | // and set the carry registers XMM4, XMM5, XMM6 to the carry out. | |
434 | // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the | |
435 | // general-purpose registers are preserved. | |
0923a413 MW |
436 | endprologue |
437 | ||
444083ae MW |
438 | mulcore [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7 |
439 | propout [edi + 0], xmm4, xmm5 | |
440 | ||
441 | mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t | |
442 | propout [edi + 4], xmm5, xmm6 | |
443 | ||
444 | mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t | |
445 | propout [edi + 8], xmm6, xmm7 | |
446 | ||
447 | mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t | |
448 | propout [edi + 12], xmm7, xmm4 | |
449 | ||
450 | ret | |
1a517bb3 MW |
451 | ENDFUNC |
452 | ||
453 | INTFUNC(mul4) | |
444083ae MW |
454 | // On entry, EDI points to the destination buffer; EBX points to a |
455 | // packed operand X; EDX points to an expanded operand Y; and XMM4, | |
456 | // XMM5, XMM6 hold the incoming carry registers c0, c1, and c2, | |
457 | // representing a carry-in C; c3 is assumed to be zero. | |
458 | // | |
459 | // On exit, we write the low 128 bits of the sum C + X Y to [EDI], | |
460 | // and update the carry registers with the carry out. The registers | |
461 | // XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the | |
462 | // general-purpose registers are preserved. | |
0923a413 MW |
463 | endprologue |
464 | ||
444083ae MW |
465 | mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7, t |
466 | propout [edi + 0], xmm4, xmm5 | |
467 | ||
468 | mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t | |
469 | propout [edi + 4], xmm5, xmm6 | |
470 | ||
471 | mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t | |
472 | propout [edi + 8], xmm6, xmm7 | |
473 | ||
474 | mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t | |
475 | propout [edi + 12], xmm7, xmm4 | |
476 | ||
477 | ret | |
1a517bb3 MW |
478 | ENDFUNC |
479 | ||
480 | INTFUNC(mla4zc) | |
444083ae MW |
481 | // On entry, EDI points to the destination buffer, which also |
482 | // contains an addend A to accumulate; EBX points to a packed operand | |
483 | // X; and EDX points to an expanded operand Y. | |
484 | // | |
485 | // On exit, we write the low 128 bits of the sum A + X Y to [EDI], | |
486 | // and set the carry registers XMM4, XMM5, XMM6 to the carry out. | |
487 | // The registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the | |
488 | // general-purpose registers are preserved. | |
0923a413 MW |
489 | endprologue |
490 | ||
444083ae MW |
491 | movd xmm4, [edi + 0] |
492 | movd xmm5, [edi + 4] | |
493 | movd xmm6, [edi + 8] | |
494 | movd xmm7, [edi + 12] | |
495 | ||
71ac8e5e | 496 | mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7 |
444083ae MW |
497 | propout [edi + 0], xmm4, xmm5 |
498 | ||
499 | mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t | |
500 | propout [edi + 4], xmm5, xmm6 | |
501 | ||
502 | mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t | |
503 | propout [edi + 8], xmm6, xmm7 | |
504 | ||
505 | mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t | |
506 | propout [edi + 12], xmm7, xmm4 | |
507 | ||
508 | ret | |
1a517bb3 MW |
509 | ENDFUNC |
510 | ||
511 | INTFUNC(mla4) | |
444083ae MW |
512 | // On entry, EDI points to the destination buffer, which also |
513 | // contains an addend A to accumulate; EBX points to a packed operand | |
514 | // X; EDX points to an expanded operand Y; and XMM4, XMM5, XMM6 hold | |
515 | // the incoming carry registers c0, c1, and c2, representing a | |
516 | // carry-in C; c3 is assumed to be zero. | |
517 | // | |
518 | // On exit, we write the low 128 bits of the sum A + C + X Y to | |
519 | // [EDI], and update the carry registers with the carry out. The | |
520 | // registers XMM0, XMM1, XMM2, XMM3, and XMM7 are clobbered; the | |
521 | // general-purpose registers are preserved. | |
0923a413 MW |
522 | endprologue |
523 | ||
444083ae MW |
524 | carryadd |
525 | ||
71ac8e5e | 526 | mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7 |
444083ae MW |
527 | propout [edi + 0], xmm4, xmm5 |
528 | ||
529 | mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t | |
530 | propout [edi + 4], xmm5, xmm6 | |
531 | ||
532 | mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t | |
533 | propout [edi + 8], xmm6, xmm7 | |
534 | ||
535 | mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t | |
536 | propout [edi + 12], xmm7, xmm4 | |
537 | ||
538 | ret | |
1a517bb3 MW |
539 | ENDFUNC |
540 | ||
541 | INTFUNC(mmul4) | |
444083ae MW |
542 | // On entry, EDI points to the destination buffer; EAX and EBX point |
543 | // to the packed operands U and N; ECX and ESI point to the expanded | |
544 | // operands V and M; and EDX points to a place to store an expanded | |
545 | // result Y (32 bytes, at a 16-byte boundary). The stack pointer | |
6ecc0b8f | 546 | // must be 12 modulo 16, as is usual for modern x86 ABIs. |
444083ae MW |
547 | // |
548 | // On exit, we write Y = U V M mod B to [EDX], and the low 128 bits | |
549 | // of the sum U V + N Y to [EDI], leaving the remaining carry in | |
550 | // XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and | |
551 | // XMM7 are clobbered; the general-purpose registers are preserved. | |
6ecc0b8f | 552 | stalloc 48 + 12 // space for the carries |
0923a413 | 553 | endprologue |
444083ae MW |
554 | |
555 | // Calculate W = U V, and leave it in the destination. Stash the | |
556 | // carry pieces for later. | |
557 | mulcore [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7 | |
558 | propout [edi + 0], xmm4, xmm5 | |
559 | jmp 5f | |
1a517bb3 MW |
560 | ENDFUNC |
561 | ||
562 | INTFUNC(mmla4) | |
444083ae | 563 | // On entry, EDI points to the destination buffer, which also |
14e7b1f5 MW |
564 | // contains an addend A to accumulate; EAX and EBX point to the |
565 | // packed operands U and N; ECX and ESI point to the expanded | |
444083ae MW |
566 | // operands V and M; and EDX points to a place to store an expanded |
567 | // result Y (32 bytes, at a 16-byte boundary). The stack pointer | |
6ecc0b8f | 568 | // must be 12 modulo 16, as is usual for modern x86 ABIs. |
444083ae MW |
569 | // |
570 | // On exit, we write Y = (A + U V) M mod B to [EDX], and the low 128 | |
571 | // bits of the sum A + U V + N Y to [EDI], leaving the remaining | |
572 | // carry in XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, | |
573 | // XMM3, and XMM7 are clobbered; the general-purpose registers are | |
574 | // preserved. | |
6ecc0b8f | 575 | stalloc 48 + 12 // space for the carries |
0923a413 MW |
576 | endprologue |
577 | ||
444083ae MW |
578 | movd xmm4, [edi + 0] |
579 | movd xmm5, [edi + 4] | |
580 | movd xmm6, [edi + 8] | |
581 | movd xmm7, [edi + 12] | |
ba12677b MW |
582 | |
583 | // Calculate W = A + U V, and leave it in the destination. Stash the | |
584 | // carry pieces for later. | |
71ac8e5e | 585 | mulacc [eax + 0], ecx, xmm4, xmm5, xmm6, xmm7 |
444083ae MW |
586 | propout [edi + 0], xmm4, xmm5 |
587 | ||
588 | 5: mulacc [eax + 4], ecx, xmm5, xmm6, xmm7, xmm4, t | |
589 | propout [edi + 4], xmm5, xmm6 | |
590 | ||
591 | mulacc [eax + 8], ecx, xmm6, xmm7, xmm4, xmm5, t | |
592 | propout [edi + 8], xmm6, xmm7 | |
593 | ||
594 | mulacc [eax + 12], ecx, xmm7, xmm4, xmm5, xmm6, t | |
595 | propout [edi + 12], xmm7, xmm4 | |
596 | ||
a90d420c MW |
597 | movdqa [SP + 0], xmm4 |
598 | movdqa [SP + 16], xmm5 | |
599 | movdqa [SP + 32], xmm6 | |
444083ae MW |
600 | |
601 | // Calculate Y = W M. | |
602 | mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7 | |
603 | ||
71ac8e5e MW |
604 | mulcore [edi + 4], esi, xmm0, xmm1, xmm2 |
605 | accum xmm5, xmm6, xmm7 | |
444083ae | 606 | |
71ac8e5e MW |
607 | mulcore [edi + 8], esi, xmm0, xmm1 |
608 | accum xmm6, xmm7 | |
444083ae | 609 | |
71ac8e5e MW |
610 | mulcore [edi + 12], esi, xmm0 |
611 | accum xmm7 | |
444083ae MW |
612 | |
613 | // That's lots of pieces. Now we have to assemble the answer. | |
71ac8e5e | 614 | squash xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4 |
444083ae MW |
615 | |
616 | // Expand it. | |
617 | pxor xmm2, xmm2 | |
71ac8e5e | 618 | expand xmm2, xmm4, xmm1 |
444083ae MW |
619 | movdqa [edx + 0], xmm4 |
620 | movdqa [edx + 16], xmm1 | |
621 | ||
622 | // Initialize the carry from the value for W we calculated earlier. | |
623 | movd xmm4, [edi + 0] | |
624 | movd xmm5, [edi + 4] | |
625 | movd xmm6, [edi + 8] | |
626 | movd xmm7, [edi + 12] | |
627 | ||
628 | // Finish the calculation by adding the Montgomery product. | |
71ac8e5e | 629 | mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7 |
444083ae MW |
630 | propout [edi + 0], xmm4, xmm5 |
631 | ||
632 | mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t | |
633 | propout [edi + 4], xmm5, xmm6 | |
634 | ||
635 | mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t | |
636 | propout [edi + 8], xmm6, xmm7 | |
637 | ||
638 | mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t | |
639 | propout [edi + 12], xmm7, xmm4 | |
640 | ||
641 | // Add on the carry we calculated earlier. | |
a90d420c MW |
642 | paddq xmm4, [SP + 0] |
643 | paddq xmm5, [SP + 16] | |
644 | paddq xmm6, [SP + 32] | |
444083ae MW |
645 | |
646 | // And, with that, we're done. | |
6ecc0b8f | 647 | stfree 48 + 12 |
444083ae | 648 | ret |
1a517bb3 MW |
649 | ENDFUNC |
650 | ||
651 | INTFUNC(mont4) | |
444083ae | 652 | // On entry, EDI points to the destination buffer holding a packed |
8e5386aa | 653 | // value W; EBX points to a packed operand N; ESI points to an |
444083ae MW |
654 | // expanded operand M; and EDX points to a place to store an expanded |
655 | // result Y (32 bytes, at a 16-byte boundary). | |
656 | // | |
657 | // On exit, we write Y = W M mod B to [EDX], and the low 128 bits | |
658 | // of the sum W + N Y to [EDI], leaving the remaining carry in | |
659 | // XMM4, XMM5, and XMM6. The registers XMM0, XMM1, XMM2, XMM3, and | |
660 | // XMM7 are clobbered; the general-purpose registers are preserved. | |
0923a413 | 661 | endprologue |
444083ae MW |
662 | |
663 | // Calculate Y = W M. | |
664 | mulcore [edi + 0], esi, xmm4, xmm5, xmm6, xmm7 | |
665 | ||
71ac8e5e MW |
666 | mulcore [edi + 4], esi, xmm0, xmm1, xmm2 |
667 | accum xmm5, xmm6, xmm7 | |
444083ae | 668 | |
71ac8e5e MW |
669 | mulcore [edi + 8], esi, xmm0, xmm1 |
670 | accum xmm6, xmm7 | |
444083ae | 671 | |
71ac8e5e MW |
672 | mulcore [edi + 12], esi, xmm0 |
673 | accum xmm7 | |
444083ae MW |
674 | |
675 | // That's lots of pieces. Now we have to assemble the answer. | |
71ac8e5e | 676 | squash xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm4 |
444083ae MW |
677 | |
678 | // Expand it. | |
679 | pxor xmm2, xmm2 | |
71ac8e5e | 680 | expand xmm2, xmm4, xmm1 |
444083ae MW |
681 | movdqa [edx + 0], xmm4 |
682 | movdqa [edx + 16], xmm1 | |
683 | ||
684 | // Initialize the carry from W. | |
685 | movd xmm4, [edi + 0] | |
686 | movd xmm5, [edi + 4] | |
687 | movd xmm6, [edi + 8] | |
688 | movd xmm7, [edi + 12] | |
689 | ||
690 | // Finish the calculation by adding the Montgomery product. | |
71ac8e5e | 691 | mulacc [ebx + 0], edx, xmm4, xmm5, xmm6, xmm7 |
444083ae MW |
692 | propout [edi + 0], xmm4, xmm5 |
693 | ||
694 | mulacc [ebx + 4], edx, xmm5, xmm6, xmm7, xmm4, t | |
695 | propout [edi + 4], xmm5, xmm6 | |
696 | ||
697 | mulacc [ebx + 8], edx, xmm6, xmm7, xmm4, xmm5, t | |
698 | propout [edi + 8], xmm6, xmm7 | |
699 | ||
700 | mulacc [ebx + 12], edx, xmm7, xmm4, xmm5, xmm6, t | |
701 | propout [edi + 12], xmm7, xmm4 | |
702 | ||
703 | // And, with that, we're done. | |
704 | ret | |
1a517bb3 MW |
705 | ENDFUNC |
706 | ||
444083ae MW |
707 | ///-------------------------------------------------------------------------- |
708 | /// Bulk multipliers. | |
709 | ||
b9b279b4 MW |
710 | FUNC(mpx_umul4_x86_avx) |
711 | .arch .avx | |
712 | vzeroupper | |
713 | endprologue | |
714 | // and drop through... | |
715 | .arch pentium4 | |
716 | ENDFUNC | |
717 | ||
444083ae MW |
718 | FUNC(mpx_umul4_x86_sse2) |
719 | // void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl, | |
720 | // const mpw *bv, const mpw *bvl); | |
721 | ||
a90d420c | 722 | // Build a stack frame. Arguments will be relative to BP, as |
444083ae MW |
723 | // follows. |
724 | // | |
a90d420c MW |
725 | // BP + 20 dv |
726 | // BP + 24 av | |
727 | // BP + 28 avl | |
728 | // BP + 32 bv | |
729 | // BP + 36 bvl | |
444083ae | 730 | // |
a90d420c | 731 | // Locals are relative to SP, as follows. |
444083ae | 732 | // |
a90d420c MW |
733 | // SP + 0 expanded Y (32 bytes) |
734 | // SP + 32 (top of locals) | |
735 | pushreg BP | |
0923a413 MW |
736 | pushreg ebx |
737 | pushreg esi | |
738 | pushreg edi | |
42c44b27 | 739 | setfp |
6d2bd7f1 | 740 | stalloc 32 |
a90d420c | 741 | and SP, ~15 |
0923a413 | 742 | endprologue |
444083ae MW |
743 | |
744 | // Prepare for the first iteration. | |
a90d420c | 745 | mov esi, [BP + 32] // -> bv[0] |
444083ae MW |
746 | pxor xmm7, xmm7 |
747 | movdqu xmm0, [esi] // bv[0] | |
a90d420c | 748 | mov edi, [BP + 20] // -> dv[0] |
444083ae | 749 | mov ecx, edi // outer loop dv cursor |
71ac8e5e | 750 | expand xmm7, xmm0, xmm1 |
a90d420c MW |
751 | mov ebx, [BP + 24] // -> av[0] |
752 | mov eax, [BP + 28] // -> av[m] = av limit | |
753 | mov edx, SP // -> expanded Y = bv[0] | |
754 | movdqa [SP + 0], xmm0 // bv[0] expanded low | |
755 | movdqa [SP + 16], xmm1 // bv[0] expanded high | |
444083ae MW |
756 | call mul4zc |
757 | add ebx, 16 | |
758 | add edi, 16 | |
759 | add ecx, 16 | |
760 | add esi, 16 | |
761 | cmp ebx, eax // all done? | |
762 | jae 8f | |
763 | ||
764 | .p2align 4 | |
765 | // Continue with the first iteration. | |
766 | 0: call mul4 | |
767 | add ebx, 16 | |
768 | add edi, 16 | |
769 | cmp ebx, eax // all done? | |
770 | jb 0b | |
771 | ||
772 | // Write out the leftover carry. There can be no tail here. | |
773 | 8: call carryprop | |
a90d420c | 774 | cmp esi, [BP + 36] // more passes to do? |
444083ae MW |
775 | jae 9f |
776 | ||
777 | .p2align 4 | |
778 | // Set up for the next pass. | |
779 | 1: movdqu xmm0, [esi] // bv[i] | |
780 | mov edi, ecx // -> dv[i] | |
781 | pxor xmm7, xmm7 | |
71ac8e5e | 782 | expand xmm7, xmm0, xmm1 |
a90d420c MW |
783 | mov ebx, [BP + 24] // -> av[0] |
784 | movdqa [SP + 0], xmm0 // bv[i] expanded low | |
785 | movdqa [SP + 16], xmm1 // bv[i] expanded high | |
444083ae MW |
786 | call mla4zc |
787 | add edi, 16 | |
788 | add ebx, 16 | |
789 | add ecx, 16 | |
790 | add esi, 16 | |
791 | cmp ebx, eax // done yet? | |
792 | jae 8f | |
793 | ||
794 | .p2align 4 | |
795 | // Continue... | |
796 | 0: call mla4 | |
797 | add ebx, 16 | |
798 | add edi, 16 | |
799 | cmp ebx, eax | |
800 | jb 0b | |
801 | ||
802 | // Finish off this pass. There was no tail on the previous pass, and | |
803 | // there can be none on this pass. | |
804 | 8: call carryprop | |
a90d420c | 805 | cmp esi, [BP + 36] |
444083ae MW |
806 | jb 1b |
807 | ||
808 | // All over. | |
0923a413 | 809 | 9: dropfp |
444083ae MW |
810 | pop edi |
811 | pop esi | |
812 | pop ebx | |
a90d420c | 813 | pop BP |
444083ae | 814 | ret |
444083ae MW |
815 | ENDFUNC |
816 | ||
b9b279b4 MW |
817 | FUNC(mpxmont_mul4_x86_avx) |
818 | .arch .avx | |
819 | vzeroupper | |
820 | endprologue | |
821 | // and drop through... | |
822 | .arch pentium4 | |
823 | ENDFUNC | |
824 | ||
444083ae MW |
825 | FUNC(mpxmont_mul4_x86_sse2) |
826 | // void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv, | |
827 | // const mpw *nv, size_t n, const mpw *mi); | |
828 | ||
a90d420c | 829 | // Build a stack frame. Arguments will be relative to BP, as |
444083ae MW |
830 | // follows. |
831 | // | |
a90d420c MW |
832 | // BP + 20 dv |
833 | // BP + 24 av | |
834 | // BP + 28 bv | |
835 | // BP + 32 nv | |
836 | // BP + 36 n (nonzero multiple of 4) | |
837 | // BP + 40 mi | |
444083ae | 838 | // |
a90d420c | 839 | // Locals are relative to SP, which 16-byte aligned, as follows. |
444083ae | 840 | // |
a90d420c MW |
841 | // SP + 0 expanded V (32 bytes) |
842 | // SP + 32 expanded M (32 bytes) | |
843 | // SP + 64 expanded Y (32 bytes) | |
844 | // SP + 96 outer loop dv | |
845 | // SP + 100 outer loop bv | |
846 | // SP + 104 av limit (mostly in ESI) | |
847 | // SP + 108 bv limit | |
848 | // SP + 112 (top of locals) | |
849 | pushreg BP | |
0923a413 MW |
850 | pushreg ebx |
851 | pushreg esi | |
852 | pushreg edi | |
42c44b27 | 853 | setfp |
6d2bd7f1 | 854 | stalloc 112 |
a90d420c | 855 | and SP, ~15 |
0923a413 | 856 | endprologue |
444083ae MW |
857 | |
858 | // Establish the expanded operands. | |
859 | pxor xmm7, xmm7 | |
a90d420c MW |
860 | mov ecx, [BP + 28] // -> bv |
861 | mov edx, [BP + 40] // -> mi | |
444083ae MW |
862 | movdqu xmm0, [ecx] // bv[0] |
863 | movdqu xmm2, [edx] // mi | |
71ac8e5e | 864 | expand xmm7, xmm0, xmm1, xmm2, xmm3 |
a90d420c MW |
865 | movdqa [SP + 0], xmm0 // bv[0] expanded low |
866 | movdqa [SP + 16], xmm1 // bv[0] expanded high | |
867 | movdqa [SP + 32], xmm2 // mi expanded low | |
868 | movdqa [SP + 48], xmm3 // mi expanded high | |
444083ae MW |
869 | |
870 | // Set up the outer loop state and prepare for the first iteration. | |
a90d420c MW |
871 | mov edx, [BP + 36] // n |
872 | mov eax, [BP + 24] // -> U = av[0] | |
873 | mov ebx, [BP + 32] // -> X = nv[0] | |
874 | mov edi, [BP + 20] // -> Z = dv[0] | |
875 | mov [SP + 100], ecx | |
444083ae MW |
876 | lea ecx, [ecx + 4*edx] // -> bv[n/4] = bv limit |
877 | lea edx, [eax + 4*edx] // -> av[n/4] = av limit | |
a90d420c MW |
878 | mov [SP + 96], edi |
879 | mov [SP + 104], edx | |
880 | mov [SP + 108], ecx | |
881 | lea ecx, [SP + 0] // -> expanded V = bv[0] | |
882 | lea esi, [SP + 32] // -> expanded M = mi | |
883 | lea edx, [SP + 64] // -> space for Y | |
444083ae | 884 | call mmul4 |
a90d420c | 885 | mov esi, [SP + 104] // recover av limit |
444083ae MW |
886 | add edi, 16 |
887 | add eax, 16 | |
888 | add ebx, 16 | |
889 | cmp eax, esi // done already? | |
890 | jae 8f | |
a90d420c | 891 | mov [SP + 96], edi |
444083ae MW |
892 | |
893 | .p2align 4 | |
894 | // Complete the first inner loop. | |
895 | 0: call dmul4 | |
896 | add edi, 16 | |
897 | add eax, 16 | |
898 | add ebx, 16 | |
899 | cmp eax, esi // done yet? | |
900 | jb 0b | |
901 | ||
902 | // Still have carries left to propagate. | |
903 | call carryprop | |
904 | movd [edi + 16], xmm4 | |
905 | ||
906 | .p2align 4 | |
907 | // Embark on the next iteration. (There must be one. If n = 1, then | |
908 | // we would have bailed above, to label 8. Similarly, the subsequent | |
909 | // iterations can fall into the inner loop immediately.) | |
a90d420c MW |
910 | 1: mov eax, [SP + 100] // -> bv[i - 1] |
911 | mov edi, [SP + 96] // -> Z = dv[i] | |
444083ae MW |
912 | add eax, 16 // -> bv[i] |
913 | pxor xmm7, xmm7 | |
a90d420c MW |
914 | mov [SP + 100], eax |
915 | cmp eax, [SP + 108] // done yet? | |
444083ae | 916 | jae 9f |
6ecc0b8f | 917 | movdqu xmm0, [eax] // bv[i] |
a90d420c MW |
918 | mov ebx, [BP + 32] // -> X = nv[0] |
919 | lea esi, [SP + 32] // -> expanded M = mi | |
920 | mov eax, [BP + 24] // -> U = av[0] | |
71ac8e5e | 921 | expand xmm7, xmm0, xmm1 |
a90d420c MW |
922 | movdqa [SP + 0], xmm0 // bv[i] expanded low |
923 | movdqa [SP + 16], xmm1 // bv[i] expanded high | |
444083ae | 924 | call mmla4 |
a90d420c | 925 | mov esi, [SP + 104] // recover av limit |
444083ae MW |
926 | add edi, 16 |
927 | add eax, 16 | |
928 | add ebx, 16 | |
a90d420c | 929 | mov [SP + 96], edi |
444083ae MW |
930 | |
931 | .p2align 4 | |
932 | // Complete the next inner loop. | |
933 | 0: call dmla4 | |
934 | add edi, 16 | |
935 | add eax, 16 | |
936 | add ebx, 16 | |
937 | cmp eax, esi | |
938 | jb 0b | |
939 | ||
940 | // Still have carries left to propagate, and they overlap the | |
941 | // previous iteration's final tail, so read that in and add it. | |
942 | movd xmm0, [edi] | |
943 | paddq xmm4, xmm0 | |
944 | call carryprop | |
945 | movd [edi + 16], xmm4 | |
946 | ||
947 | // Back again. | |
948 | jmp 1b | |
949 | ||
950 | // First iteration was short. Write out the carries and we're done. | |
951 | // (This could be folded into the main loop structure, but that would | |
952 | // penalize small numbers more.) | |
953 | 8: call carryprop | |
954 | movd [edi + 16], xmm4 | |
955 | ||
956 | // All done. | |
0923a413 MW |
957 | 9: dropfp |
958 | popreg edi | |
959 | popreg esi | |
960 | popreg ebx | |
a90d420c | 961 | popreg BP |
444083ae | 962 | ret |
444083ae MW |
963 | ENDFUNC |
964 | ||
b9b279b4 MW |
965 | FUNC(mpxmont_redc4_x86_avx) |
966 | .arch .avx | |
967 | vzeroupper | |
968 | endprologue | |
969 | // and drop through... | |
970 | .arch pentium4 | |
971 | ENDFUNC | |
972 | ||
444083ae MW |
973 | FUNC(mpxmont_redc4_x86_sse2) |
974 | // void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv, | |
975 | // size_t n, const mpw *mi); | |
976 | ||
a90d420c | 977 | // Build a stack frame. Arguments will be relative to BP, as |
444083ae MW |
978 | // follows. |
979 | // | |
a90d420c MW |
980 | // BP + 20 dv |
981 | // BP + 24 dvl | |
982 | // BP + 28 nv | |
983 | // BP + 32 n (nonzero multiple of 4) | |
984 | // BP + 36 mi | |
444083ae | 985 | // |
a90d420c | 986 | // Locals are relative to SP, as follows. |
444083ae | 987 | // |
a90d420c MW |
988 | // SP + 0 outer loop dv |
989 | // SP + 4 outer dv limit | |
990 | // SP + 8 blocks-of-4 dv limit | |
991 | // SP + 12 expanded M (32 bytes) | |
992 | // SP + 44 expanded Y (32 bytes) | |
993 | // SP + 76 (top of locals) | |
994 | pushreg BP | |
0923a413 MW |
995 | pushreg ebx |
996 | pushreg esi | |
997 | pushreg edi | |
42c44b27 | 998 | setfp |
a90d420c | 999 | and SP, ~15 |
6d2bd7f1 | 1000 | stalloc 76 |
0923a413 | 1001 | endprologue |
444083ae MW |
1002 | |
1003 | // Establish the expanded operands and the blocks-of-4 dv limit. | |
a90d420c | 1004 | mov edi, [BP + 20] // -> Z = dv[0] |
444083ae | 1005 | pxor xmm7, xmm7 |
a90d420c | 1006 | mov eax, [BP + 24] // -> dv[n] = dv limit |
444083ae | 1007 | sub eax, edi // length of dv in bytes |
a90d420c | 1008 | mov edx, [BP + 36] // -> mi |
444083ae MW |
1009 | movdqu xmm0, [edx] // mi |
1010 | and eax, ~15 // mask off the tail end | |
71ac8e5e | 1011 | expand xmm7, xmm0, xmm1 |
444083ae | 1012 | add eax, edi // find limit |
a90d420c MW |
1013 | movdqa [SP + 12], xmm0 // mi expanded low |
1014 | movdqa [SP + 28], xmm1 // mi expanded high | |
1015 | mov [SP + 8], eax | |
444083ae MW |
1016 | |
1017 | // Set up the outer loop state and prepare for the first iteration. | |
a90d420c MW |
1018 | mov ecx, [BP + 32] // n |
1019 | mov ebx, [BP + 28] // -> X = nv[0] | |
444083ae MW |
1020 | lea edx, [edi + 4*ecx] // -> dv[n/4] = outer dv limit |
1021 | lea ecx, [ebx + 4*ecx] // -> nv[n/4] = nv limit | |
a90d420c MW |
1022 | mov [SP + 0], edi |
1023 | mov [SP + 4], edx | |
1024 | lea esi, [SP + 12] // -> expanded M = mi | |
1025 | lea edx, [SP + 44] // -> space for Y | |
444083ae | 1026 | call mont4 |
444083ae | 1027 | add ebx, 16 |
a87d6f26 | 1028 | add edi, 16 |
444083ae MW |
1029 | cmp ebx, ecx // done already? |
1030 | jae 8f | |
1031 | ||
1032 | .p2align 4 | |
1033 | // Complete the first inner loop. | |
1034 | 5: call mla4 | |
1035 | add ebx, 16 | |
1036 | add edi, 16 | |
1037 | cmp ebx, ecx // done yet? | |
1038 | jb 5b | |
1039 | ||
1040 | // Still have carries left to propagate. | |
1041 | 8: carryadd | |
a90d420c MW |
1042 | mov esi, [SP + 8] // -> dv blocks limit |
1043 | mov edx, [BP + 24] // dv limit | |
444083ae MW |
1044 | psllq xmm7, 16 |
1045 | pslldq xmm7, 8 | |
1046 | paddq xmm6, xmm7 | |
1047 | call carryprop | |
1048 | movd eax, xmm4 | |
1049 | add edi, 16 | |
1050 | cmp edi, esi | |
1051 | jae 7f | |
1052 | ||
1053 | .p2align 4 | |
1054 | // Continue carry propagation until the end of the buffer. | |
1055 | 0: add [edi], eax | |
1056 | mov eax, 0 // preserves flags | |
bd6d65e3 MW |
1057 | adc dword ptr [edi + 4], 0 |
1058 | adc dword ptr [edi + 8], 0 | |
1059 | adc dword ptr [edi + 12], 0 | |
444083ae MW |
1060 | adc eax, 0 |
1061 | add edi, 16 | |
1062 | cmp edi, esi | |
1063 | jb 0b | |
1064 | ||
6966e7a6 MW |
1065 | // Deal with the tail end. Note that the actual destination length |
1066 | // won't be an exact number of blocks of four, so it's safe to just | |
1067 | // drop through here. | |
444083ae | 1068 | 7: add [edi], eax |
92edc356 | 1069 | mov eax, 0 |
444083ae MW |
1070 | add edi, 4 |
1071 | adc eax, 0 | |
1072 | cmp edi, edx | |
1073 | jb 7b | |
1074 | ||
6966e7a6 | 1075 | // All done for this iteration. Start the next. |
a90d420c MW |
1076 | 8: mov edi, [SP + 0] // -> dv[i - 1] |
1077 | mov ebx, [BP + 28] // -> X = nv[0] | |
1078 | lea edx, [SP + 44] // -> space for Y | |
1079 | lea esi, [SP + 12] // -> expanded M = mi | |
444083ae | 1080 | add edi, 16 // -> Z = dv[i] |
a90d420c | 1081 | cmp edi, [SP + 4] // all done yet? |
444083ae | 1082 | jae 9f |
a90d420c | 1083 | mov [SP + 0], edi |
444083ae MW |
1084 | call mont4 |
1085 | add edi, 16 | |
1086 | add ebx, 16 | |
1087 | jmp 5b | |
1088 | ||
1089 | // All over. | |
0923a413 MW |
1090 | 9: dropfp |
1091 | popreg edi | |
1092 | popreg esi | |
1093 | popreg ebx | |
a90d420c | 1094 | popreg BP |
444083ae | 1095 | ret |
444083ae MW |
1096 | ENDFUNC |
1097 | ||
1098 | ///-------------------------------------------------------------------------- | |
1099 | /// Testing and performance measurement. | |
1100 | ||
1101 | #ifdef TEST_MUL4 | |
1102 | ||
1103 | .macro cysetup c | |
1104 | rdtsc | |
1105 | mov [\c], eax | |
1106 | mov [\c + 4], edx | |
1107 | .endm | |
1108 | ||
1109 | .macro cystore c, v, n | |
1110 | rdtsc | |
1111 | sub eax, [\c] | |
1112 | sbb edx, [\c + 4] | |
1113 | mov ebx, [\v] | |
1114 | mov ecx, [\n] | |
1115 | dec ecx | |
1116 | mov [\n], ecx | |
1117 | mov [ebx + ecx*8], eax | |
1118 | mov [ebx + ecx*8 + 4], edx | |
1119 | .endm | |
1120 | ||
6d19758a | 1121 | .macro testprologue n |
a90d420c | 1122 | pushreg BP |
0923a413 MW |
1123 | pushreg ebx |
1124 | pushreg esi | |
1125 | pushreg edi | |
42c44b27 | 1126 | setfp |
6d2bd7f1 | 1127 | stalloc 3*32 + 4*4 |
a90d420c | 1128 | and SP, ~15 |
0923a413 | 1129 | endprologue |
6d19758a | 1130 | mov eax, \n |
a90d420c | 1131 | mov [SP + 104], eax |
444083ae | 1132 | // vars: |
a90d420c MW |
1133 | // SP + 0 = v expanded |
1134 | // SP + 32 = y expanded | |
1135 | // SP + 64 = ? expanded | |
1136 | // SP + 96 = cycles | |
1137 | // SP + 104 = count | |
444083ae MW |
1138 | .endm |
1139 | ||
1140 | .macro testepilogue | |
0923a413 MW |
1141 | dropfp |
1142 | popreg edi | |
1143 | popreg esi | |
1144 | popreg ebx | |
a90d420c | 1145 | popreg BP |
444083ae MW |
1146 | ret |
1147 | .endm | |
1148 | ||
1149 | .macro testldcarry c | |
1150 | mov ecx, \c // -> c | |
981a9e5d MW |
1151 | movdqu xmm4, [ecx + 0] // (c''_0; c'_0) |
1152 | movdqu xmm5, [ecx + 16] // (c''_1; c'_1) | |
1153 | movdqu xmm6, [ecx + 32] // (c''_2; c'_2) | |
444083ae MW |
1154 | .endm |
1155 | ||
71ac8e5e | 1156 | .macro testexpand v=nil, y=nil |
444083ae MW |
1157 | pxor xmm7, xmm7 |
1158 | .ifnes "\v", "nil" | |
1159 | mov ecx, \v | |
1160 | movdqu xmm0, [ecx] | |
71ac8e5e | 1161 | expand xmm7, xmm0, xmm1 |
a90d420c MW |
1162 | movdqa [SP + 0], xmm0 |
1163 | movdqa [SP + 16], xmm1 | |
444083ae MW |
1164 | .endif |
1165 | .ifnes "\y", "nil" | |
1166 | mov edx, \y | |
1167 | movdqu xmm2, [edx] | |
71ac8e5e | 1168 | expand xmm7, xmm2, xmm3 |
a90d420c MW |
1169 | movdqa [SP + 32], xmm2 |
1170 | movdqa [SP + 48], xmm3 | |
444083ae MW |
1171 | .endif |
1172 | .endm | |
1173 | ||
71ac8e5e | 1174 | .macro testtop u=nil, x=nil, mode=nil |
444083ae MW |
1175 | .p2align 4 |
1176 | 0: | |
1177 | .ifnes "\u", "nil" | |
a90d420c | 1178 | lea ecx, [SP + 0] |
444083ae MW |
1179 | .endif |
1180 | mov ebx, \x | |
1181 | .ifeqs "\mode", "mont" | |
a90d420c | 1182 | lea esi, [SP + 32] |
444083ae | 1183 | .endif |
a90d420c | 1184 | cysetup SP + 96 |
444083ae MW |
1185 | .ifnes "\u", "nil" |
1186 | mov eax, \u | |
1187 | .endif | |
1188 | .ifeqs "\mode", "mont" | |
a90d420c | 1189 | lea edx, [SP + 64] |
444083ae | 1190 | .else |
a90d420c | 1191 | lea edx, [SP + 32] |
444083ae MW |
1192 | .endif |
1193 | .endm | |
1194 | ||
6d19758a | 1195 | .macro testtail cyv |
a90d420c | 1196 | cystore SP + 96, \cyv, SP + 104 |
444083ae MW |
1197 | jnz 0b |
1198 | .endm | |
1199 | ||
1200 | .macro testcarryout c | |
1201 | mov ecx, \c | |
1202 | movdqu [ecx + 0], xmm4 | |
1203 | movdqu [ecx + 16], xmm5 | |
1204 | movdqu [ecx + 32], xmm6 | |
1205 | .endm | |
1206 | ||
0923a413 | 1207 | FUNC(test_dmul4) |
a90d420c MW |
1208 | testprologue [BP + 44] |
1209 | testldcarry [BP + 24] | |
1210 | testexpand [BP + 36], [BP + 40] | |
1211 | mov edi, [BP + 20] | |
1212 | testtop [BP + 28], [BP + 32] | |
444083ae | 1213 | call dmul4 |
a90d420c MW |
1214 | testtail [BP + 48] |
1215 | testcarryout [BP + 24] | |
444083ae | 1216 | testepilogue |
0923a413 | 1217 | ENDFUNC |
444083ae | 1218 | |
0923a413 | 1219 | FUNC(test_dmla4) |
a90d420c MW |
1220 | testprologue [BP + 44] |
1221 | testldcarry [BP + 24] | |
1222 | testexpand [BP + 36], [BP + 40] | |
1223 | mov edi, [BP + 20] | |
1224 | testtop [BP + 28], [BP + 32] | |
444083ae | 1225 | call dmla4 |
a90d420c MW |
1226 | testtail [BP + 48] |
1227 | testcarryout [BP + 24] | |
444083ae | 1228 | testepilogue |
0923a413 | 1229 | ENDFUNC |
444083ae | 1230 | |
0923a413 | 1231 | FUNC(test_mul4) |
a90d420c MW |
1232 | testprologue [BP + 36] |
1233 | testldcarry [BP + 24] | |
1234 | testexpand nil, [BP + 32] | |
1235 | mov edi, [BP + 20] | |
1236 | testtop nil, [BP + 28] | |
444083ae | 1237 | call mul4 |
a90d420c MW |
1238 | testtail [BP + 40] |
1239 | testcarryout [BP + 24] | |
444083ae | 1240 | testepilogue |
0923a413 | 1241 | ENDFUNC |
444083ae | 1242 | |
d0d41c6e | 1243 | FUNC(test_mul4zc) |
a90d420c MW |
1244 | testprologue [BP + 36] |
1245 | testldcarry [BP + 24] | |
1246 | testexpand nil, [BP + 32] | |
1247 | mov edi, [BP + 20] | |
1248 | testtop nil, [BP + 28] | |
d0d41c6e | 1249 | call mul4zc |
a90d420c MW |
1250 | testtail [BP + 40] |
1251 | testcarryout [BP + 24] | |
d0d41c6e MW |
1252 | testepilogue |
1253 | ENDFUNC | |
1254 | ||
0923a413 | 1255 | FUNC(test_mla4) |
a90d420c MW |
1256 | testprologue [BP + 36] |
1257 | testldcarry [BP + 24] | |
1258 | testexpand nil, [BP + 32] | |
1259 | mov edi, [BP + 20] | |
1260 | testtop nil, [BP + 28] | |
444083ae | 1261 | call mla4 |
a90d420c MW |
1262 | testtail [BP + 40] |
1263 | testcarryout [BP + 24] | |
444083ae | 1264 | testepilogue |
0923a413 | 1265 | ENDFUNC |
444083ae | 1266 | |
d0d41c6e | 1267 | FUNC(test_mla4zc) |
a90d420c MW |
1268 | testprologue [BP + 36] |
1269 | testldcarry [BP + 24] | |
1270 | testexpand nil, [BP + 32] | |
1271 | mov edi, [BP + 20] | |
1272 | testtop nil, [BP + 28] | |
d0d41c6e | 1273 | call mla4zc |
a90d420c MW |
1274 | testtail [BP + 40] |
1275 | testcarryout [BP + 24] | |
d0d41c6e MW |
1276 | testepilogue |
1277 | ENDFUNC | |
1278 | ||
0923a413 | 1279 | FUNC(test_mmul4) |
a90d420c MW |
1280 | testprologue [BP + 48] |
1281 | testexpand [BP + 40], [BP + 44] | |
1282 | mov edi, [BP + 20] | |
1283 | testtop [BP + 32], [BP + 36], mont | |
444083ae | 1284 | call mmul4 |
a90d420c MW |
1285 | testtail [BP + 52] |
1286 | mov edi, [BP + 28] | |
1287 | movdqa xmm0, [SP + 64] | |
1288 | movdqa xmm1, [SP + 80] | |
981a9e5d MW |
1289 | pshufd xmm0, xmm0, SHUF(3, 1, 2, 0) |
1290 | pshufd xmm1, xmm1, SHUF(3, 1, 2, 0) | |
444083ae MW |
1291 | movdqu [edi], xmm0 |
1292 | movdqu [edi + 16], xmm1 | |
a90d420c | 1293 | testcarryout [BP + 24] |
444083ae | 1294 | testepilogue |
0923a413 | 1295 | ENDFUNC |
444083ae | 1296 | |
0923a413 | 1297 | FUNC(test_mmla4) |
a90d420c MW |
1298 | testprologue [BP + 48] |
1299 | testexpand [BP + 40], [BP + 44] | |
1300 | mov edi, [BP + 20] | |
1301 | testtop [BP + 32], [BP + 36], mont | |
444083ae | 1302 | call mmla4 |
a90d420c MW |
1303 | testtail [BP + 52] |
1304 | mov edi, [BP + 28] | |
1305 | movdqa xmm0, [SP + 64] | |
1306 | movdqa xmm1, [SP + 80] | |
981a9e5d MW |
1307 | pshufd xmm0, xmm0, SHUF(3, 1, 2, 0) |
1308 | pshufd xmm1, xmm1, SHUF(3, 1, 2, 0) | |
444083ae MW |
1309 | movdqu [edi], xmm0 |
1310 | movdqu [edi + 16], xmm1 | |
a90d420c | 1311 | testcarryout [BP + 24] |
444083ae | 1312 | testepilogue |
0923a413 | 1313 | ENDFUNC |
444083ae | 1314 | |
0923a413 | 1315 | FUNC(test_mont4) |
a90d420c MW |
1316 | testprologue [BP + 40] |
1317 | testexpand nil, [BP + 36] | |
1318 | mov edi, [BP + 20] | |
1319 | testtop nil, [BP + 32], mont | |
444083ae | 1320 | call mont4 |
a90d420c MW |
1321 | testtail [BP + 44] |
1322 | mov edi, [BP + 28] | |
1323 | movdqa xmm0, [SP + 64] | |
1324 | movdqa xmm1, [SP + 80] | |
981a9e5d MW |
1325 | pshufd xmm0, xmm0, SHUF(3, 1, 2, 0) |
1326 | pshufd xmm1, xmm1, SHUF(3, 1, 2, 0) | |
444083ae MW |
1327 | movdqu [edi], xmm0 |
1328 | movdqu [edi + 16], xmm1 | |
a90d420c | 1329 | testcarryout [BP + 24] |
444083ae | 1330 | testepilogue |
0923a413 | 1331 | ENDFUNC |
444083ae MW |
1332 | |
1333 | #endif | |
1334 | ||
1335 | ///----- That's all, folks -------------------------------------------------- |