Commit | Line | Data |
---|---|---|
57496a50 MW |
1 | /* -*-c-*- |
2 | * | |
3 | * Poly1305 message authentication code | |
4 | * | |
5 | * (c) 2017 Straylight/Edgeware | |
6 | */ | |
7 | ||
8 | /*----- Licensing notice --------------------------------------------------* | |
9 | * | |
10 | * This file is part of Catacomb. | |
11 | * | |
12 | * Catacomb is free software; you can redistribute it and/or modify | |
13 | * it under the terms of the GNU Library General Public License as | |
14 | * published by the Free Software Foundation; either version 2 of the | |
15 | * License, or (at your option) any later version. | |
16 | * | |
17 | * Catacomb is distributed in the hope that it will be useful, | |
18 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
19 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
20 | * GNU Library General Public License for more details. | |
21 | * | |
22 | * You should have received a copy of the GNU Library General Public | |
23 | * License along with Catacomb; if not, write to the Free | |
24 | * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, | |
25 | * MA 02111-1307, USA. | |
26 | */ | |
27 | ||
28 | /*----- Header files ------------------------------------------------------*/ | |
29 | ||
30 | #include "config.h" | |
31 | ||
32 | #include <assert.h> | |
33 | #include <string.h> | |
34 | ||
35 | #include "poly1305.h" | |
6a0eb244 | 36 | #include "rsvr.h" |
57496a50 MW |
37 | |
38 | /*----- Global variables --------------------------------------------------*/ | |
39 | ||
40 | const octet poly1305_keysz[] = { KSZ_SET, 16, 0 }; | |
41 | ||
42 | /*----- Low-level implementation for 32/64-bit targets --------------------*/ | |
43 | ||
44 | #if !defined(POLY1305_IMPL) && defined(HAVE_UINT64) | |
45 | # define POLY1305_IMPL 26 | |
46 | #endif | |
47 | ||
48 | #if POLY1305_IMPL == 26 | |
49 | ||
50 | /* Elements x of GF(2^130 - 5) are represented by five integers x_i: x = | |
51 | * SUM_{0<=i<5} x_i 2^{26i}. | |
52 | * | |
53 | * Not all elements are represented canonically. We have 0 <= r_i, s_i < | |
54 | * 2^26 by construction. We maintain 0 <= h_i < 2^27. When we read a | |
55 | * message block m, we have 0 <= m_i < 2^26 by construction again. When we | |
56 | * update the hash state, we calculate h' = r (h + m). Addition is done | |
57 | * componentwise; let t = h + m, and we will have 0 <= t_i < 3*2^26. | |
58 | */ | |
59 | typedef uint32 felt[5]; | |
60 | #define M26 0x03ffffff | |
61 | #define P p26 | |
62 | ||
63 | /* Convert 32-bit words into field-element pieces. */ | |
8c3c0886 MW |
64 | #define P26W0(x) (((x##0) << 0)&0x03ffffff) |
65 | #define P26W1(x) ((((x##1) << 6)&0x03ffffc0) | (((x##0) >> 26)&0x0000003f)) | |
66 | #define P26W2(x) ((((x##2) << 12)&0x03ffffff) | (((x##1) >> 20)&0x00000fff)) | |
67 | #define P26W3(x) ((((x##3) << 18)&0x03fc0000) | (((x##2) >> 14)&0x0003ffff)) | |
6fb4ecfb | 68 | #define P26W4(x) (((x##3) >> 8)&0x00ffffff) |
57496a50 MW |
69 | |
70 | /* Propagate carries in parallel. If 0 <= u_i < 2^26 c_i, then we shall have | |
71 | * 0 <= v_0 < 2^26 + 5 c_4, and 0 <= v_i < 2^26 + c_{i-1} for 1 <= i < 5. | |
72 | */ | |
73 | #define CARRY_REDUCE(v, u) do { \ | |
74 | (v##0) = ((u##0)&M26) + 5*((u##4) >> 26); \ | |
75 | (v##1) = ((u##1)&M26) + ((u##0) >> 26); \ | |
76 | (v##2) = ((u##2)&M26) + ((u##1) >> 26); \ | |
77 | (v##3) = ((u##3)&M26) + ((u##2) >> 26); \ | |
78 | (v##4) = ((u##4)&M26) + ((u##3) >> 26); \ | |
79 | } while (0) | |
80 | ||
81 | /* General multiplication, used by `concat'. */ | |
82 | static void mul(felt z, const felt x, const felt y) | |
83 | { | |
84 | /* Initial bounds: we assume x_i, y_i < 2^27. On exit, z_i < 2^27. */ | |
85 | ||
86 | uint32 x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3], x4 = x[4]; | |
87 | uint32 y0 = y[0], y1 = y[1], y2 = y[2], y3 = y[3], y4 = y[4]; | |
88 | uint64 u0, u1, u2, u3, u4; | |
89 | uint64 v0, v1, v2, v3, v4; | |
90 | uint32 z0, z1, z2, z3, z4; | |
91 | ||
92 | /* Do the multiplication: u = h x mod 2^130 - 5. We will have u_i < | |
93 | * 2^27 (5 (4 - i) + i + 1) 2^27 = 2^54 (21 - 4 i) = 2^52 (84 - 16 i). In | |
94 | * all cases we have u_i < 84*2^52 < 2^59. Notably, u_4 < 5*2^54 = | |
95 | * 20*2^52. | |
96 | */ | |
97 | #define M(x, y) ((uint64)(x)*(y)) | |
98 | u0 = M(x0, y0) + (M(x1, y4) + M(x2, y3) + M(x3, y2) + M(x4, y1))*5; | |
99 | u1 = M(x0, y1) + M(x1, y0) + (M(x2, y4) + M(x3, y3) + M(x4, y2))*5; | |
100 | u2 = M(x0, y2) + M(x1, y1) + M(x2, y0) + (M(x3, y4) + M(x4, y3))*5; | |
101 | u3 = M(x0, y3) + M(x1, y2) + M(x2, y1) + M(x3, y0) + (M(x4, y4))*5; | |
102 | u4 = M(x0, y4) + M(x1, y3) + M(x2, y2) + M(x3, y1) + M(x4, y0); | |
103 | #undef M | |
104 | ||
105 | /* Now we must reduce the coefficients. We do this in an approximate | |
106 | * manner which avoids long data-dependency chains, but requires two | |
107 | * passes. | |
108 | * | |
109 | * The reduced carry down from u_4 to u_0 in the first pass will be c_0 < | |
110 | * 100*2^26; the remaining c_i are smaller: c_i < 2^26 (84 - 16 i). This | |
111 | * leaves 0 <= v_i < 101*2^26. The carries in the second pass are bounded | |
112 | * above by 180. | |
113 | */ | |
114 | CARRY_REDUCE(v, u); CARRY_REDUCE(z, v); | |
115 | z[0] = z0; z[1] = z1; z[2] = z2; z[3] = z3; z[4] = z4; | |
116 | } | |
117 | ||
118 | /* General squaring, used by `concat'. */ | |
119 | static void sqr(felt z, const felt x) | |
120 | { | |
121 | /* Initial bounds: we assume x_i < 2^27. On exit, z_i < 2^27. */ | |
122 | ||
123 | uint32 x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3], x4 = x[4]; | |
124 | uint64 u0, u1, u2, u3, u4; | |
125 | uint64 v0, v1, v2, v3, v4; | |
126 | uint32 z0, z1, z2, z3, z4; | |
127 | ||
128 | /* Do the squaring. See `mul' for bounds. */ | |
129 | #define M(x, y) ((uint64)(x)*(y)) | |
130 | u0 = M(x0, x0) + 10*(M(x1, x4) + M(x2, x3)); | |
131 | u1 = 2* M(x0, x1) + 5*(M(x3, x3) + 2*M(x2, x4)); | |
132 | u2 = M(x1, x1) + 2* M(x0, x2) + 10* M(x3, x4); | |
133 | u3 = 2*(M(x0, x3) + M(x1, x2)) + 5* M(x4, x4); | |
134 | u4 = M(x2, x2) + 2*(M(x0, x4) + M(x1, x3)); | |
135 | #undef M | |
136 | ||
137 | /* Now we must reduce the coefficients. See `mul' for bounds. */ | |
138 | CARRY_REDUCE(v, u); CARRY_REDUCE(z, v); | |
139 | z[0] = z0; z[1] = z1; z[2] = z2; z[3] = z3; z[4] = z4; | |
140 | } | |
141 | ||
142 | /* Multiplication by r, using precomputation. */ | |
143 | static void mul_r(const poly1305_ctx *ctx, felt z, const felt x) | |
144 | { | |
145 | /* Initial bounds: by construction, r_i < 2^26. We assume x_i < 3*2^26. | |
146 | * On exit, z_i < 2^27. | |
147 | */ | |
148 | ||
149 | uint32 | |
150 | r0 = ctx->k.u.p26.r0, | |
151 | r1 = ctx->k.u.p26.r1, rr1 = ctx->k.u.p26.rr1, | |
152 | r2 = ctx->k.u.p26.r2, rr2 = ctx->k.u.p26.rr2, | |
153 | r3 = ctx->k.u.p26.r3, rr3 = ctx->k.u.p26.rr3, | |
154 | r4 = ctx->k.u.p26.r4, rr4 = ctx->k.u.p26.rr4; | |
155 | uint32 x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3], x4 = x[4]; | |
156 | uint64 u0, u1, u2, u3, u4; | |
157 | uint64 v0, v1, v2, v3, v4; | |
158 | uint32 z0, z1, z2, z3, z4; | |
159 | ||
160 | /* Do the multiplication: u = h x mod 2^130 - 5. We will have u_i < | |
161 | * 2^26 (5 (4 - i) + i + 1) 3*2^26 = 2^52 (63 - 12 i). In all cases | |
162 | * we have u_i < 63*2^52 < 2^58. Notably, u_4 < 15*2^52. | |
163 | */ | |
164 | #define M(x, y) ((uint64)(x)*(y)) | |
165 | u0 = M(x0, r0) + M(x1, rr4) + M(x2, rr3) + M(x3, rr2) + M(x4, rr1); | |
166 | u1 = M(x0, r1) + M(x1, r0) + M(x2, rr4) + M(x3, rr3) + M(x4, rr2); | |
167 | u2 = M(x0, r2) + M(x1, r1) + M(x2, r0) + M(x3, rr4) + M(x4, rr3); | |
168 | u3 = M(x0, r3) + M(x1, r2) + M(x2, r1) + M(x3, r0) + M(x4, rr4); | |
169 | u4 = M(x0, r4) + M(x1, r3) + M(x2, r2) + M(x3, r1) + M(x4, r0); | |
170 | #undef M | |
171 | ||
172 | /* Now we must reduce the coefficients. We do this in an approximate | |
173 | * manner which avoids long data-dependency chains, but requires two | |
174 | * passes. | |
175 | * | |
176 | * The reduced carry down from u_4 to u_0 in the first pass will be c_0 < | |
177 | * 75*2^26; the remaining c_i are smaller: c_i < 2^26 (63 - 12 i). This | |
178 | * leaves 0 <= v_i < 76*2^26. The carries in the second pass are bounded | |
179 | * above by 135. | |
180 | */ | |
181 | CARRY_REDUCE(v, u); CARRY_REDUCE(z, v); | |
182 | z[0] = z0; z[1] = z1; z[2] = z2; z[3] = z3; z[4] = z4; | |
183 | } | |
184 | ||
185 | #endif | |
186 | ||
6ddc1550 | 187 | /*----- Low-level implementation for 16/32-bit targets --------------------*/ |
57496a50 MW |
188 | |
189 | #ifndef POLY1305_IMPL | |
190 | # define POLY1305_IMPL 11 | |
191 | #endif | |
192 | ||
193 | #if POLY1305_IMPL == 11 | |
194 | ||
195 | /* Elements x of GF(2^130 - 5) are represented by 12 integers x_i: x = | |
196 | * SUM_{0<=i<12} x_i 2^P_i, where P_i = SUM_{0<=j<i} w_j, and w_5 = w_11 = | |
197 | * 10, and w_i = 11 for i in { 0, 1, 2, 3, 4, 6, 7, 8, 9, 10 }. | |
198 | * | |
199 | * Not all elements are represented canonically. We have 0 <= r_i, s_i < | |
200 | * 2^w_i <= 2^11 by construction. We maintain 0 <= h_i < 2^12. When we read | |
201 | * a message block m, we have 0 <= m_i < 2^w_i by construction again. When | |
202 | * we update the hash state, we calculate h' = r (h + m). Addition is done | |
203 | * componentwise; let t = h + m, and we will have 0 <= t_i < 3*2^11. | |
204 | */ | |
205 | typedef uint16 felt[12]; | |
206 | #define M10 0x3ff | |
207 | #define M11 0x7ff | |
208 | #define P p11 | |
209 | ||
210 | /* Load a field element from an octet string. */ | |
211 | static void load_p11(felt d, const octet *s) | |
212 | { | |
213 | unsigned i, j, n, w; | |
214 | uint16 m; | |
215 | uint32 a; | |
216 | ||
217 | for (i = j = n = 0, a = 0; j < 12; j++) { | |
218 | if (j == 5 || j == 11) { w = 10; m = M10; } | |
219 | else { w = 11; m = M11; } | |
220 | while (n < w && i < 16) { a |= s[i++] << n; n += 8; } | |
221 | d[j] = a&m; a >>= w; n -= w; | |
222 | } | |
223 | } | |
224 | ||
225 | /* Reduce a field-element's pieces to manageable size. */ | |
226 | static void carry_reduce(uint32 u[12]) | |
227 | { | |
228 | /* Initial bounds: we assume u_i < 636*2^22. On exit, u_i < 2^11. */ | |
229 | ||
230 | unsigned i; | |
231 | uint32 c; | |
232 | ||
233 | /* Do sequential carry propagation (16-bit CPUs are less likely to benefit | |
57e7040b MW |
234 | * from instruction-level parallelism). Start at u_9; truncate it to 11 |
235 | * bits, and add the carry onto u_10. Truncate u10 to 11 bits, and add the | |
236 | * carry onto u_11. Truncate u_11 to 10 bits, and add five times the carry | |
237 | * onto u_0. And so on. | |
57496a50 MW |
238 | * |
239 | * The carry is larger than the pieces we're leaving behind. Let c_i be | |
240 | * the high portion of u_i, to be carried onto u_{i+1}. I claim that c_i < | |
241 | * 2557*2^10. Then the carry /into/ any u_i is at most 12785*2^10 < 2^24 | |
242 | * (allowing for the reduction as we carry from u_11 to u_0), and u_i after | |
243 | * carry is bounded above by 636*2^22 + 12785*2^10 < 2557*2^20. Hence, the | |
244 | * carry out is at most 2557*2^10, as claimed. | |
245 | * | |
57e7040b MW |
246 | * Once we reach u_9 for the second time, we start with u_9 < 2^11. The |
247 | * carry into u_9 is at most 2557*2^10 < 1279*2^11 as calculated above; so | |
248 | * the carry out into u_10 is at most 1280. Since u_10 < 2^11 prior to | |
249 | * this carry in, we now have u_10 < 2^11 + 1280 < 2^12; so the carry out | |
250 | * into u_11 is at most 1. The final reduction therefore only needs a | |
251 | * conditional subtraction. | |
57496a50 | 252 | */ |
57e7040b MW |
253 | { c = u[9] >> 11; u[9] &= M11; } |
254 | { u[10] += c; c = u[10] >> 11; u[10] &= M11; } | |
57496a50 MW |
255 | { u[11] += c; c = u[11] >> 10; u[11] &= M10; } |
256 | { u[0] += 5*c; c = u[0] >> 11; u[0] &= M11; } | |
257 | for (i = 1; i < 5; i++) { u[i] += c; c = u[i] >> 11; u[i] &= M11; } | |
258 | { u[5] += c; c = u[5] >> 10; u[5] &= M10; } | |
259 | for (i = 6; i < 11; i++) { u[i] += c; c = u[i] >> 11; u[i] &= M11; } | |
260 | u[11] += c; | |
261 | } | |
262 | ||
263 | /* General multiplication. */ | |
264 | static void mul(felt z, const felt x, const felt y) | |
265 | { | |
266 | /* Initial bounds: we assume x_i < 3*2^11, and y_i < 2^12. On exit, | |
267 | * z_i < 2^12. | |
268 | */ | |
269 | ||
270 | uint32 u[12]; | |
271 | unsigned i, j, k; | |
272 | ||
273 | /* Do the main multiplication. After this, we shall have | |
274 | * | |
275 | * { 2^22 (636 - 184 i) for 0 <= i < 6 | |
276 | * u_i < { | |
277 | * { 2^22 (732 - 60 i) for 6 <= i < 12 | |
278 | * | |
279 | * In particular, u_0 < 636*2^22 < 2^32, and u_11 < 72*2^22. | |
280 | * | |
281 | * The irregularly positioned pieces are annoying. Because we fold the | |
282 | * reduction into the multiplication, it's also important to see where the | |
283 | * reduced products fit. Finally, products don't align with the piece | |
284 | * boundaries, and sometimes need to be doubled. The following table | |
285 | * tracks all of this. | |
286 | * | |
287 | * piece width offset second | |
288 | * 0 11 0 130 | |
289 | * 1 11 11 141 | |
290 | * 2 11 22 152 | |
291 | * 3 11 33 163 | |
292 | * 4 11 44 174 | |
293 | * 5 10 55 185 | |
294 | * 6 11 65 195 | |
295 | * 7 11 76 206 | |
296 | * 8 11 87 217 | |
297 | * 9 11 98 228 | |
298 | * 10 11 109 239 | |
299 | * 11 10 120 250 | |
300 | * | |
301 | * The next table tracks exactly which products end up being multiplied by | |
302 | * which constants and accumulated into which destination pieces. | |
303 | * | |
304 | * u_k = t_i r_j + 2 t_i r_j + 5 t_i r_j + 10 t_i r_j | |
305 | * 0 0/0 -- 6/6 1-5/11-7 7-11/5-1 | |
306 | * 1 0-1/1-0 -- 6-7/7-6 2-5/11-8 8-11/5-2 | |
307 | * 2 0-2/2-0 -- 6-8/8-6 3-5/11-9 9-11/5-3 | |
308 | * 3 0-3/3-0 -- 6-9/9-6 4-5/11-10 10-11/5-4 | |
309 | * 4 0-4/4-0 -- 6-10/10-6 5/11 11/5 | |
310 | * 5 0-5/5-0 -- 6-11/11-6 -- | |
311 | * 6 0/6 6/0 1-5/5-1 -- 7-11/11-7 | |
312 | * 7 0-1/7-6 6-7/1-0 2-5/5-2 -- 8-11/11-8 | |
313 | * 8 0-2/8-6 6-8/2-0 3-5/5-3 -- 9-11/11-9 | |
314 | * 9 0-3/9-6 6-9/3-0 4-5/5-4 -- 10-11/11-10 | |
315 | * 10 0-4/10-6 6-10/4-0 5/5 -- 11/11 | |
316 | * 11 0-11/11-0 -- -- -- | |
317 | * | |
318 | * And, finally, trying to bound the multiple of 6*2^22 in each destination | |
319 | * piece is fiddly, so here's a tableau showing the calculation. | |
320 | * | |
321 | * k 1* + 2* + 5* +10* = 1* + 5* = | |
322 | * 0 1 -- 1 10 1 21 106 | |
323 | * 1 2 -- 2 8 2 18 92 | |
324 | * 2 3 -- 3 6 3 15 78 | |
325 | * 3 4 -- 4 4 4 12 64 | |
326 | * 4 5 -- 5 2 5 9 50 | |
327 | * 5 6 -- 6 -- 6 6 36 | |
328 | * 6 2 5 -- 5 12 10 62 | |
329 | * 7 4 4 -- 4 12 8 52 | |
330 | * 8 6 3 -- 3 12 6 42 | |
331 | * 9 8 2 -- 2 12 4 32 | |
332 | * 10 10 1 -- 1 12 2 22 | |
333 | * 11 12 -- -- -- 12 0 12 | |
334 | */ | |
335 | ||
336 | for (i = 0; i < 12; i++) u[i] = 0; | |
337 | ||
338 | #define M(i, j) ((uint32)x[i]*y[j]) | |
339 | ||
340 | /* Product terms we must multiply by 10. */ | |
341 | for (k = 0; k < 5; k++) { | |
342 | for (i = k + 1; i < 6; i++) { | |
343 | j = 12 + k - i; | |
344 | u[k] += M(i, j) + M(j, i); | |
345 | u[k + 6] += M(i + 6, j); | |
346 | } | |
347 | } | |
348 | for (k = 0; k < 5; k++) u[k] *= 2; | |
349 | for (k = 6; k < 11; k++) u[k] *= 5; | |
350 | ||
351 | /* Product terms we must multiply by 5. */ | |
352 | for (k = 0; k < 6; k++) { | |
353 | for (i = k + 6; i >= 6; i--) { | |
354 | j = 12 + k - i; | |
355 | u[k] += M(i, j); | |
356 | } | |
357 | } | |
358 | for (k = 0; k < 6; k++) u[k] *= 5; | |
359 | ||
360 | /* Product terms we must multiply by 2. */ | |
361 | for (k = 6; k < 11; k++) { | |
362 | for (i = k - 5; i < 6; i++) { | |
363 | j = k - i; | |
364 | u[k] += M(i, j); | |
365 | } | |
366 | } | |
367 | for (k = 6; k < 11; k++) u[k] *= 2; | |
368 | ||
369 | /* Remaining product terms. */ | |
370 | for (k = 0; k < 6; k++) { | |
371 | for (i = k; i < 6; i--) { | |
372 | j = k - i; | |
373 | u[k] += M(i, j); | |
374 | u[k + 6] += M(i + 6, j) + M(i, j + 6); | |
375 | } | |
376 | } | |
377 | ||
378 | #undef M | |
379 | ||
380 | /* Do the reduction. Currently, `carry_reduce' does more than we need, but | |
381 | * that's fine. | |
382 | */ | |
383 | carry_reduce(u); | |
384 | ||
385 | /* Done. Write out the answer. */ | |
386 | for (i = 0; i < 12; i++) z[i] = u[i]; | |
387 | } | |
388 | ||
389 | /* General squaring, used by `concat'. */ | |
390 | static void sqr(felt z, const felt x) | |
391 | { mul(z, x, x); } | |
392 | ||
393 | /* Multiplication by r. */ | |
394 | static void mul_r(const poly1305_ctx *ctx, felt z, const felt x) | |
395 | { mul(z, x, ctx->k.u.p11.r); } | |
396 | ||
397 | #endif | |
398 | ||
399 | /*----- Interface functions -----------------------------------------------*/ | |
400 | ||
401 | /* --- @poly1305_keyinit@ --- * | |
402 | * | |
403 | * Arguments: @poly1305_key *key@ = key structure to fill in | |
404 | * @const void *k@ = pointer to key material | |
405 | * @size_t ksz@ = length of key (must be @POLY1305_KEYSZ == 16@) | |
406 | * | |
407 | * Returns: --- | |
408 | * | |
409 | * Use: Records a Poly1305 key and performs (minimal) | |
410 | * precomputations. | |
411 | */ | |
412 | ||
413 | void poly1305_keyinit(poly1305_key *key, const void *k, size_t ksz) | |
414 | { | |
415 | const octet *r = k; | |
416 | #if POLY1305_IMPL == 11 | |
417 | octet rr[16]; | |
418 | #endif | |
419 | ||
420 | KSZ_ASSERT(poly1305, ksz); | |
421 | ||
422 | #if POLY1305_IMPL == 26 | |
423 | uint32 r0 = LOAD32_L(r + 0), r1 = LOAD32_L(r + 4), | |
424 | r2 = LOAD32_L(r + 8), r3 = LOAD32_L(r + 12); | |
425 | ||
426 | r0 &= 0x0fffffff; r1 &= 0x0ffffffc; r2 &= 0x0ffffffc; r3 &= 0x0ffffffc; | |
427 | key->u.p26.r0 = P26W0(r); key->u.p26.r1 = P26W1(r); | |
428 | key->u.p26.r2 = P26W2(r); key->u.p26.r3 = P26W3(r); | |
429 | key->u.p26.r4 = P26W4(r); | |
430 | ||
431 | key->u.p26.rr1 = 5*key->u.p26.r1; key->u.p26.rr2 = 5*key->u.p26.r2; | |
432 | key->u.p26.rr3 = 5*key->u.p26.r3; key->u.p26.rr4 = 5*key->u.p26.r4; | |
433 | #else | |
434 | memcpy(rr, r, 16); | |
435 | rr[ 3] &= 0x0f; | |
436 | rr[ 4] &= 0xfc; rr[ 7] &= 0x0f; | |
437 | rr[ 8] &= 0xfc; rr[11] &= 0x0f; | |
438 | rr[12] &= 0xfc; rr[15] &= 0x0f; | |
439 | load_p11(key->u.p11.r, rr); | |
440 | #endif | |
441 | } | |
442 | ||
443 | /* --- @poly1305_macinit@ --- * | |
444 | * | |
445 | * Arguments: @poly1305_ctx *ctx@ = MAC context to fill in | |
446 | * @const poly1305_key *key@ = pointer to key structure to use | |
447 | * @const void *iv@ = pointer to mask string | |
448 | * | |
449 | * Returns: --- | |
450 | * | |
451 | * Use: Initializes a MAC context for use. The key can be discarded | |
452 | * at any time. | |
453 | * | |
454 | * It is permitted for @iv@ to be null, though it is not then | |
455 | * possible to complete the MAC computation on @ctx@. The | |
456 | * resulting context may still be useful, e.g., as an operand to | |
457 | * @poly1305_concat@. | |
458 | */ | |
459 | ||
460 | void poly1305_macinit(poly1305_ctx *ctx, | |
461 | const poly1305_key *key, const void *iv) | |
462 | { | |
463 | const octet *s = iv; | |
464 | #if POLY1305_IMPL == 26 | |
465 | uint32 s0, s1, s2, s3; | |
466 | #else | |
467 | unsigned i; | |
468 | #endif | |
469 | ||
470 | #if POLY1305_IMPL == 26 | |
471 | if (s) { | |
472 | s0 = LOAD32_L(s + 0); s1 = LOAD32_L(s + 4); | |
473 | s2 = LOAD32_L(s + 8); s3 = LOAD32_L(s + 12); | |
474 | ctx->u.p26.s0 = P26W0(s); ctx->u.p26.s1 = P26W1(s); | |
475 | ctx->u.p26.s2 = P26W2(s); ctx->u.p26.s3 = P26W3(s); | |
476 | ctx->u.p26.s4 = P26W4(s); | |
477 | } | |
478 | ctx->u.p26.h[0] = ctx->u.p26.h[1] = ctx->u.p26.h[2] = | |
479 | ctx->u.p26.h[3] = ctx->u.p26.h[4] = 0; | |
480 | #else | |
481 | if (s) load_p11(ctx->u.p11.s, s); | |
482 | for (i = 0; i < 12; i++) ctx->u.p11.h[i] = 0; | |
483 | #endif | |
484 | ctx->k = *key; | |
485 | ctx->nbuf = 0; | |
486 | ctx->count = 0; | |
487 | } | |
488 | ||
489 | /* --- @poly1305_copy@ --- * | |
490 | * | |
491 | * Arguments: @poly1305_ctx *to@ = destination context | |
492 | * @const poly1305_ctx *from@ = source context | |
493 | * | |
494 | * Returns: --- | |
495 | * | |
496 | * Use: Duplicates a Poly1305 MAC context. The destination need not | |
497 | * have been initialized. Both contexts can be used | |
498 | * independently afterwards. | |
499 | */ | |
500 | ||
501 | void poly1305_copy(poly1305_ctx *ctx, const poly1305_ctx *from) | |
502 | { *ctx = *from; } | |
503 | ||
504 | /* --- @poly1305_hash@ --- * | |
505 | * | |
506 | * Arguments: @poly1305_ctx *ctx@ = MAC context to update | |
507 | * @const void *p@ = pointer to message data | |
508 | * @size_t sz@ = length of message data | |
509 | * | |
510 | * Returns: --- | |
511 | * | |
512 | * Use: Processes a chunk of message. The message pieces may have | |
513 | * arbitrary lengths, and may be empty. | |
514 | */ | |
515 | ||
516 | static void update_full(poly1305_ctx *ctx, const octet *p) | |
517 | { | |
518 | felt t; | |
519 | #if POLY1305_IMPL == 26 | |
520 | uint32 | |
521 | m0 = LOAD32_L(p + 0), m1 = LOAD32_L(p + 4), | |
522 | m2 = LOAD32_L(p + 8), m3 = LOAD32_L(p + 12); | |
523 | ||
524 | t[0] = ctx->u.p26.h[0] + P26W0(m); | |
525 | t[1] = ctx->u.p26.h[1] + P26W1(m); | |
526 | t[2] = ctx->u.p26.h[2] + P26W2(m); | |
527 | t[3] = ctx->u.p26.h[3] + P26W3(m); | |
528 | t[4] = ctx->u.p26.h[4] + P26W4(m) + 0x01000000; | |
529 | #else | |
530 | unsigned i; | |
531 | ||
532 | load_p11(t, p); t[11] += 0x100; | |
533 | for (i = 0; i < 12; i++) t[i] += ctx->u.p11.h[i]; | |
534 | #endif | |
535 | ||
536 | mul_r(ctx, ctx->u.P.h, t); | |
537 | ctx->count++; | |
538 | } | |
539 | ||
6a0eb244 MW |
540 | static const rsvr_policy pol = { 0, 16, 16 }; |
541 | ||
57496a50 MW |
542 | void poly1305_hash(poly1305_ctx *ctx, const void *p, size_t sz) |
543 | { | |
6a0eb244 MW |
544 | rsvr_state st; |
545 | const octet *q = p; | |
546 | ||
547 | rsvr_setup(&st, &pol, &ctx->buf, &ctx->nbuf, p, sz); | |
548 | RSVR_DO(&st) while ((q = RSVR_NEXT(&st, 16)) != 0) update_full(ctx, q); | |
57496a50 MW |
549 | } |
550 | ||
551 | /* --- @poly1305_flush@ --- * | |
552 | * | |
553 | * Arguments: @poly1305_ctx *ctx@ = MAC context to flush | |
554 | * | |
555 | * Returns: --- | |
556 | * | |
557 | * Use: Forces any buffered message data in the context to be | |
558 | * processed. This has no effect if the message processed so | |
559 | * far is a whole number of blocks. Flushing is performed | |
560 | * automatically by @poly1305_done@, but it may be necessary to | |
561 | * force it by hand when using @poly1305_concat@. | |
20400191 | 562 | * (Alternatively, you might use @poly1305_flushzero@ instead.) |
57496a50 MW |
563 | * |
564 | * Flushing a partial block has an observable effect on the | |
565 | * computation: the resulting state is (with high probability) | |
566 | * dissimilar to any state reachable with a message which is a | |
567 | * whole number of blocks long. | |
568 | */ | |
569 | ||
570 | void poly1305_flush(poly1305_ctx *ctx) | |
571 | { | |
572 | felt t; | |
573 | #if POLY1305_IMPL == 26 | |
574 | uint32 m0, m1, m2, m3; | |
575 | #else | |
576 | unsigned i; | |
577 | #endif | |
578 | ||
579 | if (!ctx->nbuf) return; | |
580 | ctx->buf[ctx->nbuf++] = 1; memset(ctx->buf + ctx->nbuf, 0, 16 - ctx->nbuf); | |
581 | #if POLY1305_IMPL == 26 | |
582 | m0 = LOAD32_L(ctx->buf + 0); m1 = LOAD32_L(ctx->buf + 4); | |
583 | m2 = LOAD32_L(ctx->buf + 8); m3 = LOAD32_L(ctx->buf + 12); | |
584 | ||
585 | t[0] = ctx->u.p26.h[0] + P26W0(m); | |
586 | t[1] = ctx->u.p26.h[1] + P26W1(m); | |
587 | t[2] = ctx->u.p26.h[2] + P26W2(m); | |
588 | t[3] = ctx->u.p26.h[3] + P26W3(m); | |
589 | t[4] = ctx->u.p26.h[4] + P26W4(m); | |
590 | #else | |
591 | load_p11(t, ctx->buf); | |
592 | for (i = 0; i < 12; i++) t[i] += ctx->u.p11.h[i]; | |
593 | #endif | |
594 | ||
595 | mul_r(ctx, ctx->u.P.h, t); | |
20400191 MW |
596 | ctx->nbuf = 0; ctx->count++; |
597 | } | |
598 | ||
599 | /* --- @poly1305_flushzero@ --- * | |
600 | * | |
601 | * Arguments: @poly1305_ctx *ctx@ = MAC context to flush | |
602 | * | |
603 | * Returns: --- | |
604 | * | |
605 | * Use: Forces any buffered message data in the context to be | |
606 | * processed, by hashing between zero and fifteen additional | |
607 | * zero bytes. Like @poly1305_flush@, this has no effect if the | |
608 | * the message processed so far is a whole number of blocks. | |
609 | * Unlike @poly1305_flush@, the behaviour if the message is not | |
610 | * a whole number of blocks is equivalent to actually hashing | |
611 | * some extra data. | |
612 | */ | |
613 | ||
614 | void poly1305_flushzero(poly1305_ctx *ctx) | |
615 | { | |
616 | if (!ctx->nbuf) return; | |
617 | memset(ctx->buf + ctx->nbuf, 0, 16 - ctx->nbuf); | |
618 | update_full(ctx, ctx->buf); | |
57496a50 MW |
619 | ctx->nbuf = 0; |
620 | } | |
621 | ||
622 | /* --- @poly1305_concat@ --- * | |
623 | * | |
624 | * Arguments: @poly1305_ctx *ctx@ = destination context | |
625 | * @const poly1305_ctx *prefix, *suffix@ = two operand contexts | |
626 | * | |
627 | * Returns: --- | |
628 | * | |
629 | * Use: The two operand contexts @prefix@ and @suffix@ represent | |
630 | * processing of two messages %$m$% and %$m'$%; the effect is to | |
631 | * set @ctx@ to the state corresponding to their concatenation | |
632 | * %$m \cat m'$%. | |
633 | * | |
634 | * All three contexts must have been initialized using the same | |
635 | * key value (though not necessarily from the same key | |
636 | * structure). The mask values associated with the input | |
637 | * contexts are irrelevant. The @prefix@ message %$m$% must be | |
638 | * a whole number of blocks long: this can be arranged by | |
639 | * flushing the context. The @suffix@ message need not be a | |
640 | * whole number of blocks long. All of the contexts remain | |
641 | * operational and can be used independently afterwards. | |
642 | */ | |
643 | ||
644 | void poly1305_concat(poly1305_ctx *ctx, | |
645 | const poly1305_ctx *prefix, const poly1305_ctx *suffix) | |
646 | { | |
647 | /* Assume that lengths are public, so it's safe to behave conditionally on | |
648 | * the bits of ctx->count. | |
649 | */ | |
650 | unsigned long n; | |
651 | unsigned i; | |
652 | felt x; | |
653 | #if POLY1305_IMPL == 26 | |
654 | uint32 x0, x1, x2, x3, x4, y0, y1, y2, y3, y4; | |
655 | #else | |
656 | uint32 y[12]; | |
657 | #endif | |
658 | ||
659 | /* We can only concatenate if the prefix is block-aligned. */ | |
660 | assert(!prefix->nbuf); | |
661 | ||
662 | /* The hash for a message m = m_{k-1} m_{k-2} ... m_1 m_0 is h_r(m) = | |
663 | * SUM_{0<=i<k} m_i r^{i+1}. If we have two messages, m, m', of lengths k | |
664 | * and k' blocks respectively, then | |
665 | * | |
666 | * h_r(m || m') = SUM_{0<=i<k} m_i r^{k'+i+1} + | |
667 | * SUM_{0<=i<k'} m'_i r^{i+1} | |
668 | * = r^{k'} h_r(m) + h_r(m') | |
669 | * | |
670 | * This is simple left-to-right square-and-multiply exponentiation. | |
671 | */ | |
672 | n = suffix->count; | |
673 | x[0] = 1; | |
674 | #if POLY1305_IMPL == 26 | |
675 | x[1] = x[2] = x[3] = x[4] = 0; | |
676 | #else | |
677 | for (i = 1; i < 12; i++) x[i] = 0; | |
678 | #endif | |
ac082cc9 | 679 | #define BIT (1ul << (ULONG_BITS - 1)) |
57496a50 MW |
680 | if (n) { |
681 | i = ULONG_BITS; | |
682 | while (!(n & BIT)) { n <<= 1; i--; } | |
683 | mul_r(prefix, x, x); n <<= 1; i--; | |
684 | while (i--) { sqr(x, x); if (n & BIT) mul_r(prefix, x, x); n <<= 1; } | |
685 | } | |
686 | #undef BIT | |
687 | mul(x, prefix->u.P.h, x); | |
688 | ||
689 | /* Add on the suffix hash. */ | |
690 | #if POLY1305_IMPL == 26 | |
691 | /* We're going to add the two hashes elementwise. Both h' = h_r(m') and | |
692 | * x = r^{k'} h_r(m) are bounded above by 2^27, so the sum will be bounded | |
693 | * by 2^28; but this is too large to leave in the accumulator. (Strictly, | |
694 | * we could get away with it, but the caller can in theory chain an | |
695 | * arbitrary number of concatenations and expect us to cope, and we'd | |
696 | * definitely overflow eventually.) So we reduce. Since the excess is so | |
697 | * small, a single round of `CARRY_REDUCE' is enough. | |
698 | */ | |
699 | x0 = x[0] + suffix->u.p26.h[0]; x1 = x[1] + suffix->u.p26.h[1]; | |
700 | x2 = x[2] + suffix->u.p26.h[2]; x3 = x[3] + suffix->u.p26.h[3]; | |
701 | x4 = x[4] + suffix->u.p26.h[4]; | |
702 | CARRY_REDUCE(y, x); | |
703 | ctx->u.p26.h[0] = y0; ctx->u.p26.h[1] = y1; ctx->u.p26.h[2] = y2; | |
704 | ctx->u.p26.h[3] = y3; ctx->u.p26.h[4] = y4; | |
705 | #else | |
706 | /* We'll add the two hashes elementwise and have to reduce again. The | |
707 | * numbers are different, but the reasoning is basically the same. | |
708 | */ | |
709 | for (i = 0; i < 12; i++) y[i] = x[i] + suffix->u.p11.h[i]; | |
710 | carry_reduce(y); | |
711 | for (i = 0; i < 12; i++) ctx->u.p11.h[i] = y[i]; | |
712 | #endif | |
713 | ||
714 | /* Copy the remaining pieces of the context to set up the result. */ | |
715 | if (ctx != suffix) { | |
716 | memcpy(ctx->buf, suffix->buf, suffix->nbuf); | |
717 | ctx->nbuf = suffix->nbuf; | |
718 | } | |
719 | ctx->count = prefix->count + suffix->count; | |
720 | } | |
721 | ||
722 | /* --- @poly1305_done@ --- * | |
723 | * | |
724 | * Arguments: @poly1305_ctx *ctx@ = MAC context to finish | |
725 | * @void *h@ = buffer to write the tag to | |
726 | * | |
727 | * Returns: --- | |
728 | * | |
729 | * Use: Completes a Poly1305 MAC tag computation. | |
730 | */ | |
731 | ||
732 | void poly1305_done(poly1305_ctx *ctx, void *h) | |
733 | { | |
734 | octet *p = h; | |
735 | ||
736 | #if POLY1305_IMPL == 26 | |
737 | uint32 m_sub, t, c; | |
738 | uint32 h0, h1, h2, h3, h4, hh0, hh1, hh2, hh3, hh4; | |
739 | ||
740 | /* If there's anything left over in the buffer, pad it to form a final | |
741 | * coefficient and update the evaluation one last time. | |
742 | */ | |
743 | poly1305_flush(ctx); | |
744 | ||
745 | /* Collect the final hash state. */ | |
746 | h0 = ctx->u.p26.h[0]; | |
747 | h1 = ctx->u.p26.h[1]; | |
748 | h2 = ctx->u.p26.h[2]; | |
749 | h3 = ctx->u.p26.h[3]; | |
750 | h4 = ctx->u.p26.h[4]; | |
751 | ||
752 | /* Reduce the final value mod 2^130 - 5. First pass: set h <- h + | |
753 | * 5 floor(h/2^130). After this, the low pieces of h will be normalized: | |
754 | * 0 <= h_i < 2^26 for 0 <= i < 4; and 0 <= h_4 < 2^26 + 1. In the | |
755 | * (highly unlikely) event that h_4 >= 2^26, set c and truncate to 130 | |
756 | * bits. | |
757 | */ | |
758 | c = h4 >> 26; h4 &= M26; | |
759 | h0 += 5*c; c = h0 >> 26; h0 &= M26; | |
760 | h1 += c; c = h1 >> 26; h1 &= M26; | |
761 | h2 += c; c = h2 >> 26; h2 &= M26; | |
762 | h3 += c; c = h3 >> 26; h3 &= M26; | |
763 | h4 += c; c = h4 >> 26; h4 &= M26; | |
764 | ||
765 | /* Calculate h' = h - (2^130 - 5). If h' >= 0 then t ends up 1; otherwise | |
766 | * it's zero. | |
767 | */ | |
768 | t = h0 + 5; hh0 = t&M26; t >>= 26; | |
769 | t += h1; hh1 = t&M26; t >>= 26; | |
770 | t += h2; hh2 = t&M26; t >>= 26; | |
771 | t += h3; hh3 = t&M26; t >>= 26; | |
772 | t += h4; hh4 = t&M26; t >>= 26; | |
773 | ||
774 | /* Keep the subtraction result above if t or c is set. */ | |
775 | m_sub = -(t | c); | |
776 | h0 = (hh0&m_sub) | (h0&~m_sub); | |
777 | h1 = (hh1&m_sub) | (h1&~m_sub); | |
778 | h2 = (hh2&m_sub) | (h2&~m_sub); | |
779 | h3 = (hh3&m_sub) | (h3&~m_sub); | |
780 | h4 = (hh4&m_sub) | (h4&~m_sub); | |
781 | ||
782 | /* Add the mask onto the hash result. */ | |
783 | t = h0 + ctx->u.p26.s0; h0 = t&M26; t >>= 26; | |
784 | t += h1 + ctx->u.p26.s1; h1 = t&M26; t >>= 26; | |
785 | t += h2 + ctx->u.p26.s2; h2 = t&M26; t >>= 26; | |
786 | t += h3 + ctx->u.p26.s3; h3 = t&M26; t >>= 26; | |
787 | t += h4 + ctx->u.p26.s4; h4 = t&M26; t >>= 26; | |
788 | ||
789 | /* Convert this mess back into 32-bit words. We lose the top two bits, | |
790 | * but that's fine. | |
791 | */ | |
792 | h0 = (h0 >> 0) | ((h1 & 0x0000003f) << 26); | |
793 | h1 = (h1 >> 6) | ((h2 & 0x00000fff) << 20); | |
794 | h2 = (h2 >> 12) | ((h3 & 0x0003ffff) << 14); | |
795 | h3 = (h3 >> 18) | ((h4 & 0x00ffffff) << 8); | |
796 | ||
797 | /* All done. */ | |
798 | STORE32_L(p + 0, h0); STORE32_L(p + 4, h1); | |
799 | STORE32_L(p + 8, h2); STORE32_L(p + 12, h3); | |
800 | #else | |
801 | uint16 hh[12], hi[12], c, t, m_sub; | |
802 | uint32 a; | |
803 | unsigned i, j, n; | |
804 | ||
805 | /* If there's anything left over in the buffer, pad it to form a final | |
806 | * coefficient and update the evaluation one last time. | |
807 | */ | |
808 | poly1305_flush(ctx); | |
809 | ||
810 | /* Collect the final hash state. */ | |
811 | for (i = 0; i < 12; i++) hh[i] = ctx->u.p11.h[i]; | |
812 | ||
813 | /* Reduce the final value mod 2^130 - 5. First pass: set h <- h + | |
814 | * 5 floor(h/2^130). After this, the low pieces of h will be normalized: | |
815 | * 0 <= h_i < 2^{w_i} for 0 <= i < 11; and 0 <= h_{11} < 2^10 + 1. In the | |
816 | * (highly unlikely) event that h_{11} >= 2^10, set c and truncate to 130 | |
817 | * bits. | |
818 | */ | |
819 | c = 5*(hh[11] >> 10); hh[11] &= M10; | |
820 | for (i = 0; i < 12; i++) { | |
821 | if (i == 5 || i == 11) { c += hh[i]; hh[i] = c&M10; c >>= 10; } | |
822 | else { c += hh[i]; hh[i] = c&M11; c >>= 11; } | |
823 | } | |
824 | ||
825 | /* Calculate h' = h - (2^130 - 5). If h' >= 0 then t ends up 1; otherwise | |
826 | * it's zero. | |
827 | */ | |
828 | for (i = 0, t = 5; i < 12; i++) { | |
829 | t += hh[i]; | |
830 | if (i == 5 || i == 11) { hi[i] = t&M10; t >>= 10; } | |
831 | else { hi[i] = t&M11; t >>= 11; } | |
832 | } | |
833 | ||
834 | /* Keep the subtraction result above if t or c is set. */ | |
835 | m_sub = -(t | c); | |
836 | for (i = 0; i < 12; i++) hh[i] = (hi[i]&m_sub) | (hh[i]&~m_sub); | |
837 | ||
838 | /* Add the mask onto the hash result. */ | |
839 | for (i = 0, t = 0; i < 12; i++) { | |
840 | t += hh[i] + ctx->u.p11.s[i]; | |
841 | if (i == 5 || i == 11) { hh[i] = t&M10; t >>= 10; } | |
842 | else { hh[i] = t&M11; t >>= 11; } | |
843 | } | |
844 | ||
845 | /* Convert this mess back into bytes. We lose the top two bits, but that's | |
846 | * fine. | |
847 | */ | |
848 | for (i = j = n = 0, a = 0; i < 16; i++) { | |
849 | if (n < 8) { | |
850 | a |= hh[j] << n; | |
851 | n += (j == 5 || j == 11) ? 10 : 11; | |
852 | j++; | |
853 | } | |
854 | p[i] = a&0xff; a >>= 8; n -= 8; | |
855 | } | |
856 | ||
857 | #endif | |
858 | } | |
859 | ||
860 | /*----- Test rig ----------------------------------------------------------*/ | |
861 | ||
862 | #ifdef TEST_RIG | |
863 | ||
141c1284 | 864 | #include <mLib/macros.h> |
57496a50 MW |
865 | #include <mLib/testrig.h> |
866 | ||
1aaccf40 | 867 | #include "ct.h" |
47af781c MW |
868 | #include "rijndael-ecb.h" |
869 | ||
57496a50 MW |
870 | static int vrf_hash(dstr v[]) |
871 | { | |
872 | poly1305_key k; | |
873 | poly1305_ctx ctx; | |
874 | dstr t = DSTR_INIT; | |
875 | unsigned i, j; | |
876 | ||
877 | if (v[0].len != 16) { fprintf(stderr, "bad key length\n"); exit(2); } | |
878 | if (v[1].len != 16) { fprintf(stderr, "bad mask length\n"); exit(2); } | |
879 | if (v[3].len != 16) { fprintf(stderr, "bad tag length\n"); exit(2); } | |
880 | dstr_ensure(&t, 16); t.len = 16; | |
881 | ||
1aaccf40 | 882 | ct_poison(v[0].buf, v[0].len); |
57496a50 MW |
883 | poly1305_keyinit(&k, v[0].buf, v[0].len); |
884 | for (i = 0; i < v[2].len; i++) { | |
885 | for (j = i; j < v[2].len; j++) { | |
886 | poly1305_macinit(&ctx, &k, v[1].buf); | |
887 | poly1305_hash(&ctx, v[2].buf, i); | |
888 | poly1305_hash(&ctx, v[2].buf + i, j - i); | |
889 | poly1305_hash(&ctx, v[2].buf + j, v[2].len - j); | |
890 | poly1305_done(&ctx, t.buf); | |
1aaccf40 | 891 | ct_remedy(t.buf, t.len); |
141c1284 | 892 | if (MEMCMP(t.buf, !=, v[3].buf, 16)) { |
57496a50 MW |
893 | fprintf(stderr, "failed..."); |
894 | fprintf(stderr, "\n\tkey = "); type_hex.dump(&v[0], stderr); | |
895 | fprintf(stderr, "\n\tmask = "); type_hex.dump(&v[1], stderr); | |
896 | fprintf(stderr, "\n\tmsg = "); type_hex.dump(&v[2], stderr); | |
897 | fprintf(stderr, "\n\texp = "); type_hex.dump(&v[3], stderr); | |
898 | fprintf(stderr, "\n\tcalc = "); type_hex.dump(&t, stderr); | |
899 | fprintf(stderr, "\n\tsplits = 0 .. %u .. %u .. %lu\n", | |
900 | i, j, (unsigned long)v[1].len); | |
901 | return (0); | |
902 | } | |
903 | } | |
904 | } | |
905 | return (1); | |
906 | } | |
907 | ||
908 | static int vrf_cat(dstr v[]) | |
909 | { | |
910 | poly1305_key k; | |
911 | poly1305_ctx ctx, cc[3]; | |
912 | dstr t = DSTR_INIT; | |
913 | unsigned i; | |
914 | int ok = 1; | |
915 | ||
916 | if (v[0].len != 16) { fprintf(stderr, "bad key length\n"); exit(2); } | |
917 | if (v[1].len != 16) { fprintf(stderr, "bad mask length\n"); exit(2); } | |
918 | if (v[5].len != 16) { fprintf(stderr, "bad tag length\n"); exit(2); } | |
919 | dstr_ensure(&t, 16); t.len = 16; | |
920 | ||
921 | poly1305_keyinit(&k, v[0].buf, v[0].len); | |
922 | poly1305_macinit(&ctx, &k, v[1].buf); | |
923 | for (i = 0; i < 3; i++) { | |
924 | poly1305_macinit(&cc[i], &k, 0); | |
925 | poly1305_hash(&cc[i], v[i + 2].buf, v[i + 2].len); | |
926 | } | |
927 | for (i = 0; i < 2; i++) { | |
928 | if (!i) { | |
929 | poly1305_concat(&ctx, &cc[1], &cc[2]); | |
930 | poly1305_concat(&ctx, &cc[0], &ctx); | |
931 | } else { | |
932 | poly1305_concat(&ctx, &cc[0], &cc[1]); | |
933 | poly1305_concat(&ctx, &ctx, &cc[2]); | |
934 | } | |
935 | poly1305_done(&ctx, t.buf); | |
141c1284 | 936 | if (MEMCMP(t.buf, !=, v[5].buf, 16)) { |
57496a50 MW |
937 | fprintf(stderr, "failed..."); |
938 | fprintf(stderr, "\n\tkey = "); type_hex.dump(&v[0], stderr); | |
939 | fprintf(stderr, "\n\tmask = "); type_hex.dump(&v[1], stderr); | |
940 | fprintf(stderr, "\n\tmsg[0] = "); type_hex.dump(&v[2], stderr); | |
941 | fprintf(stderr, "\n\tmsg[1] = "); type_hex.dump(&v[3], stderr); | |
942 | fprintf(stderr, "\n\tmsg[2] = "); type_hex.dump(&v[4], stderr); | |
943 | fprintf(stderr, "\n\texp = "); type_hex.dump(&v[5], stderr); | |
944 | fprintf(stderr, "\n\tcalc = "); type_hex.dump(&t, stderr); | |
945 | fprintf(stderr, "\n\tassoc = %s\n", | |
946 | !i ? "msg[0] || (msg[1] || msg[2])" : | |
947 | "(msg[0] || msg[1]) || msg[2]"); | |
948 | ok = 0; | |
949 | } | |
950 | } | |
951 | return (ok); | |
952 | } | |
953 | ||
47af781c MW |
954 | #define MSZMAX 1000 |
955 | ||
956 | static int vrf_mct(dstr v[]) | |
957 | { | |
958 | unsigned j, msz; | |
2e572d83 | 959 | unsigned long i, start_iter, end_iter; |
47af781c MW |
960 | rijndael_ecbctx rij; |
961 | poly1305_key key; | |
962 | poly1305_ctx mac; | |
a9a5bfa0 MW |
963 | dstr dk = DSTR_INIT, dr = DSTR_INIT, dn = DSTR_INIT, |
964 | dt = DSTR_INIT, dm = DSTR_INIT; | |
965 | octet *k, *r, s[16], *n, *t, *m; | |
47af781c MW |
966 | int ok = 1; |
967 | ||
a9a5bfa0 MW |
968 | DENSURE(&dk, 16); k = (octet *)dk.buf; dk.len = 16; |
969 | DENSURE(&dr, 16); r = (octet *)dr.buf; dr.len = 16; | |
970 | DENSURE(&dn, 16); n = (octet *)dn.buf; dn.len = 16; | |
971 | DENSURE(&dt, 16); t = (octet *)dt.buf; dt.len = 16; | |
972 | DENSURE(&dm, MSZMAX); m = (octet *)dm.buf; dm.len = MSZMAX; | |
973 | memset(m, 0, MSZMAX); | |
974 | ||
2e572d83 MW |
975 | if (v[0].len != 16) { fprintf(stderr, "AES key len\n"); exit(2); } |
976 | if (v[1].len != 16) { fprintf(stderr, "poly key len\n"); exit(2); } | |
977 | if (v[2].len != 16) { fprintf(stderr, "nonce len\n"); exit(2); } | |
978 | if (v[3].len != MSZMAX) { fprintf(stderr, "msgbuf len\n"); exit(2); } | |
979 | if (v[6].len != 16) { fprintf(stderr, "result len\n"); exit(2); } | |
4741bd9f MW |
980 | memcpy(k, v[0].buf, 16); |
981 | memcpy(r, v[1].buf, 16); | |
982 | memcpy(n, v[2].buf, 16); | |
2e572d83 MW |
983 | memcpy(m, v[3].buf, MSZMAX); |
984 | start_iter = *(unsigned long *)v[4].buf; | |
985 | end_iter = *(unsigned long *)v[5].buf; | |
986 | if (end_iter < start_iter) { fprintf(stderr, "iter bounds\n"); exit(2); } | |
47af781c | 987 | |
4741bd9f MW |
988 | rijndael_ecbinit(&rij, k, 16, 0); |
989 | poly1305_keyinit(&key, r, 16); | |
2e572d83 | 990 | for (i = start_iter; i < end_iter; i++) { |
47af781c MW |
991 | msz = 0; |
992 | for (;;) { | |
993 | rijndael_ecbencrypt(&rij, n, s, 16); | |
994 | poly1305_macinit(&mac, &key, s); | |
995 | poly1305_hash(&mac, m, msz); | |
996 | poly1305_done(&mac, t); | |
997 | if (msz >= MSZMAX) break; | |
998 | n[0] ^= i&0xff; | |
999 | for (j = 0; j < 16; j++) n[j] ^= t[j]; | |
1000 | if (msz%2) { | |
1001 | for (j = 0; j < 16; j++) k[j] ^= t[j]; | |
4741bd9f | 1002 | rijndael_ecbinit(&rij, k, 16, 0); |
47af781c MW |
1003 | } |
1004 | if (msz%3) { | |
1005 | for (j = 0; j < 16; j++) r[j] ^= t[j]; | |
4741bd9f | 1006 | poly1305_keyinit(&key, r, 16); |
47af781c MW |
1007 | } |
1008 | m[msz++] ^= t[0]; | |
1009 | } | |
1010 | } | |
1011 | ||
2e572d83 | 1012 | if (MEMCMP(t, !=, v[6].buf, 16)) { |
47af781c MW |
1013 | ok = 0; |
1014 | fprintf(stderr, "failed..."); | |
1015 | fprintf(stderr, "\n\tinitial k = "); type_hex.dump(&v[0], stderr); | |
1016 | fprintf(stderr, "\n\tinitial r = "); type_hex.dump(&v[1], stderr); | |
1017 | fprintf(stderr, "\n\tinitial n = "); type_hex.dump(&v[2], stderr); | |
2e572d83 MW |
1018 | fprintf(stderr, "\n\tinitial m = "); type_hex.dump(&v[3], stderr); |
1019 | fprintf(stderr, "\n\tstart iter = %lu", start_iter); | |
1020 | fprintf(stderr, "\n\tend iter = %lu", end_iter); | |
1021 | fprintf(stderr, "\n\tfinal k = "); type_hex.dump(&dk, stderr); | |
1022 | fprintf(stderr, "\n\tfinal r = "); type_hex.dump(&dr, stderr); | |
1023 | fprintf(stderr, "\n\tfinal n = "); type_hex.dump(&dn, stderr); | |
1024 | fprintf(stderr, "\n\tfinal m = "); type_hex.dump(&dm, stderr); | |
1025 | fprintf(stderr, "\n\texpected = "); type_hex.dump(&v[6], stderr); | |
a9a5bfa0 | 1026 | fprintf(stderr, "\n\tcalculated = "); type_hex.dump(&dt, stderr); |
47af781c MW |
1027 | fputc('\n', stderr); |
1028 | } | |
1029 | ||
a9a5bfa0 MW |
1030 | dstr_destroy(&dk); |
1031 | dstr_destroy(&dr); | |
1032 | dstr_destroy(&dn); | |
1033 | dstr_destroy(&dt); | |
1034 | dstr_destroy(&dm); | |
47af781c MW |
1035 | return (ok); |
1036 | } | |
1037 | ||
57496a50 MW |
1038 | static const struct test_chunk tests[] = { |
1039 | { "poly1305-hash", vrf_hash, | |
1040 | { &type_hex, &type_hex, &type_hex, &type_hex } }, | |
1041 | { "poly1305-cat", vrf_cat, | |
1042 | { &type_hex, &type_hex, &type_hex, &type_hex, &type_hex, &type_hex } }, | |
47af781c | 1043 | { "poly1305-mct", vrf_mct, |
2e572d83 MW |
1044 | { &type_hex, &type_hex, &type_hex, &type_hex, |
1045 | &type_ulong, &type_ulong, &type_hex } }, | |
57496a50 MW |
1046 | { 0, 0, { 0 } } |
1047 | }; | |
1048 | ||
1049 | int main(int argc, char *argv[]) | |
1050 | { | |
1051 | test_run(argc, argv, tests, SRCDIR "/t/poly1305"); | |
1052 | return (0); | |
1053 | } | |
1054 | ||
1055 | #endif | |
1056 | ||
1057 | /*----- That's all, folks -------------------------------------------------*/ |