chiark - git - mdw - catacomb/blob - symm/salsa20-core.h

   1 /* -*-c-*-
   2  *
   3  * Salsa20 core definitions
   4  *
   5  * (c) 2015 Straylight/Edgeware
   6  */
   7
   8 #ifndef CATACOMB_SALSA20_CORE_H
   9 #define CATACOMB_SALSA20_CORE_H
  10
  11 #ifdef __cplusplus
  12   extern "C" {
  13 #endif
  14
  15 /*----- Header files ------------------------------------------------------*/
  16
  17 #include <mLib/bits.h>
  18 #include <mLib/macros.h>
  19
  20 #ifndef CATACOMB_SALSA20_H
  21 #  include "salsa20.h"
  22 #endif
  23
  24 /*----- Magic constants ---------------------------------------------------*/
  25
  26 /* The magic Salsa20 constants, for 256-bit keys... */
  27 #define SALSA20_A256 0x61707865         /* e x p a */
  28 #define SALSA20_B256 0x3320646e         /* n d   3 */
  29 #define SALSA20_C256 0x79622d32         /* 2 - b y */
  30 #define SALSA20_D256 0x6b206574         /* t e   k */
  31
  32 /* ... and for 128-bit keys ... */
  33 #define SALSA20_A128 SALSA20_A256       /* e x p a */
  34 #define SALSA20_B128 0x3120646e         /* n d   1 */
  35 #define SALSA20_C128 0x79622d36         /* 6 - b y */
  36 #define SALSA20_D128 SALSA20_D256       /* t e   k */
  37
  38 /* ... and for 80-bit keys, for completeness's sake. */
  39 #define SALSA20_A80 SALSA20_A128        /* e x p a */
  40 #define SALSA20_B80 SALSA20_B128        /* n d   1 */
  41 #define SALSA20_C80 0x79622d30          /* 0 - b y */
  42 #define SALSA20_D80 SALSA20_D128        /* t e   k */
  43
  44 /*----- The Salsa20 core function -----------------------------------------*/
  45
  46 /* It makes life somewhat easier if we don't actually store and maintain the
  47  * input matrix in the textbook order.  Instead, we rotate the columns other
  48  * than the leftmost one upwards, so that the constants which were originally
  49  * along the diagonal end up on the top row.  We'll need to undo this
  50  * permutation on output, but that's not too terrible an imposition.
  51  *
  52  * The permutation we're applying to the matrix elements is this:
  53  *
  54  * [  0  1  2  3 ]       [  0  5 10 15 ]
  55  * [  4  5  6  7 ]  -->  [  4  9 14  3 ]
  56  * [  8  9 10 11 ]       [  8 13  2  7 ]
  57  * [ 12 13 14 15 ]       [ 12  1  6 11 ]
  58  *
 * and as a result, we need to apply this inverse permutation to figure out
 * which indices to use in the double-round macro and elsewhere.
  61  *
  62  * [  0 13 10  7 ]
  63  * [  4  1 14 11 ]
  64  * [  8  5  2 15 ]
  65  * [ 12  9  6  3 ]
  66  */
  67
  68 /* The Salsa20 quarter-round.  Read from the matrix @y@ at indices @a@, @b@,
  69  * @c@, and @d@; and write to the corresponding elements of @z@.
  70  */
  71 #define SALSA20_QR(z, y, a, b, c, d) do {                               \
  72   (z)[b] = (y)[b] ^ ROL32((y)[a] + (y)[d],  7);                         \
  73   (z)[c] = (y)[c] ^ ROL32((z)[b] + (y)[a],  9);                         \
  74   (z)[d] = (y)[d] ^ ROL32((z)[c] + (z)[b], 13);                         \
  75   (z)[a] = (y)[a] ^ ROL32((z)[d] + (z)[c], 18);                         \
  76 } while (0)
  77
  78 /* The Salsa20 double-round.  Read from matrix @y@, writing the result to
  79  * @z@.
  80  */
  81 #define SALSA20_DR(z, y) do {                                           \
  82   SALSA20_QR(z, y,  0,  4,  8, 12);                                     \
  83   SALSA20_QR(z, y,  1,  5,  9, 13);                                     \
  84   SALSA20_QR(z, y,  2,  6, 10, 14);                                     \
  85   SALSA20_QR(z, y,  3,  7, 11, 15);                                     \
  86   SALSA20_QR(z, z,  0, 13, 10,  7);                                     \
  87   SALSA20_QR(z, z,  1, 14, 11,  4);                                     \
  88   SALSA20_QR(z, z,  2, 15,  8,  5);                                     \
  89   SALSA20_QR(z, z,  3, 12,  9,  6);                                     \
  90 } while (0)
  91
  92 /* The Salsa20 feedforward step, used at the end of the core function.  Here,
  93  * @y@ contains the original input matrix; @z@ contains the final one, and is
  94  * updated.  The output is rendered in canonical order, ready for output.
  95  */
  96 #define SALSA20_FFWD(z, y) do {                                         \
  97   const uint32 *_y = (y);                                               \
  98   uint32 *_z = (z);                                                     \
  99   int _t;                                                               \
 100   _z[ 0] = _z[ 0] + _y[ 0]; _z[ 4] = _z[ 4] + _y[ 4];                   \
 101   _z[ 8] = _z[ 8] + _y[ 8]; _z[12] = _z[12] + _y[12];                   \
 102       _t = _z[ 1] + _y[ 1]; _z[ 1] = _z[13] + _y[13];                   \
 103   _z[13] = _z[ 9] + _y[ 9]; _z[ 9] = _z[ 5] + _y[ 5]; _z[ 5] = _t;      \
 104       _t = _z[ 2] + _y[ 2]; _z[ 2] = _z[10] + _y[10]; _z[10] = _t;      \
 105       _t = _z[ 6] + _y[ 6]; _z[ 6] = _z[14] + _y[14]; _z[14] = _t;      \
 106       _t = _z[ 3] + _y[ 3]; _z[ 3] = _z[ 7] + _y[ 7];                   \
 107   _z[ 7] = _z[11] + _y[11]; _z[11] = _z[15] + _y[15]; _z[15] = _t;      \
 108 } while (0)
 109
 110 /* Various numbers of rounds, unrolled.  Read from @y@, and write to @z@. */
 111 #define SALSA20_4R(z, y)                                                \
 112   do { SALSA20_DR(z, y); SALSA20_DR(z, z); } while (0)
 113 #define SALSA20_8R(z, y)                                                \
 114   do { SALSA20_4R(z, y); SALSA20_4R(z, z); } while (0)
 115 #define SALSA20_12R(z, y)                                               \
 116   do { SALSA20_8R(z, y); SALSA20_4R(z, z); } while (0)
 117 #define SALSA20_20R(z, y)                                               \
 118   do { SALSA20_12R(z, y); SALSA20_8R(z, z); } while (0)
 119
 120 /* Apply @n@ (must be even) rounds, rolled.  (This seems to be faster,
 121  * probably because it fits in cache better).  Read from @y@, and write to
 122  * @z@.
 123  */
 124 #define SALSA20_nR(z, y, n) do {                                        \
 125   int _i;                                                               \
 126   SALSA20_DR(z, y);                                                     \
 127   for (_i = 0; _i < (n)/2 - 1; _i++) SALSA20_DR(z, z);                  \
 128 } while (0)
 129
 130 /* Step the counter in the Salsa20 state matrix @a@. */
 131 #define SALSA20_STEP(a)                                                 \
 132   do { (a)[8] = U32((a)[8] + 1); (a)[5] += !(a)[8]; } while (0)
 133
 134 /*----- Buffering and output ----------------------------------------------*
 135  *
 136  * These macros are also used by ChaCha.
 137  */
 138
 139 /* Copy the Salsa20 matrix @a@ to the output buffer at @d@, advancing @d@
 140  * past the new material.
 141  */
 142 #define SALSA20_GENFULL(a, d) do {                                      \
 143   int _i;                                                               \
 144                                                                         \
 145   for (_i = 0; _i < 16; _i++) { STORE32_L((d), (a)[_i]); (d) += 4; }    \
 146 } while (0)
 147
 148 /* XOR the contents the input buffer at @s@ with the Salsa20 matrix @a@,
 149  * writing the result to @d@ and advance @s@ and @d@.
 150  */
 151 #define SALSA20_MIXFULL(a, d, s) do {                                   \
 152   uint32 _x;                                                            \
 153   int _i;                                                               \
 154                                                                         \
 155   for (_i = 0; _i < 16; _i++) {                                         \
 156     _x = LOAD32_L(s); (s) += 4;                                         \
 157     _x ^= (a)[_i];                                                      \
 158     STORE32_L((d), _x); (d) += 4;                                       \
 159   }                                                                     \
 160 } while (0)
 161
 162 /* Fill the context @ctx@'s buffer from the matrix @a@ in preparation for
 163  * emitting partial blocks of output.
 164  */
 165 #define SALSA20_PREPBUF(ctx, a) do {                                    \
 166   int _i;                                                               \
 167   for (_i = 0; _i < 16; _i++) STORE32_L((ctx)->buf + 4*_i, (a)[_i]);    \
 168   (ctx)->bufi = 0;                                                      \
 169 } while (0)
 170
 171 /* Write at most @n@ bytes of buffered output from the context @ctx@ to the
 172  * output buffer @d@ (if it's not null), XORing it with the input buffer @s@
 173  * (if that's not null).  Both @s@ and @d@ are advanced if they aren't null;
 174  * @n@ is decreased appropriately.
 175  */
 176 #define SALSA20_OUTBUF(ctx, d, s, n) do {                               \
 177   size_t _n = (n), _left = SALSA20_OUTSZ - (ctx)->bufi;                 \
 178   if (_n > _left) _n = _left;                                           \
 179   (n) -= _n;                                                            \
 180   if (!(d)) (ctx)->bufi += _n;                                          \
 181   else if (s) while (_n--) *(d)++ = (ctx)->buf[(ctx)->bufi++] ^ *(s)++; \
 182   else while (_n--) *(d)++ = (ctx)->buf[(ctx)->bufi++];                 \
 183 } while (0)
 184
 185 /*----- Variants and naming -----------------------------------------------*/
 186
 187 /* Common numbers of rounds, for which we generate definitions. */
 188 #define SALSA20_VARS(_) _(8) _(12) _(20)
 189
 190 /* Constructing externally-facing names. */
 191 #define SALSA20_DECOR(base, r, suff) SALSA20__DECOR_##r(base, suff)
 192 #define SALSA20__DECOR_20(base, suff) GLUE(base, suff)
 193 #define SALSA20__DECOR_12(base, suff) GLUE(base##12, suff)
 194 #define SALSA20__DECOR_8(base, suff) GLUE(base##8, suff)
 195
 196 /* Preprocessor-time table of the standard names. */
 197 #define SALSA20_NAME_20 "salsa20"
 198 #define SALSA20_NAME_12 "salsa20/12"
 199 #define SALSA20_NAME_8 "salsa20/8"
 200
 201 /*----- That's all, folks -------------------------------------------------*/
 202
 203 #ifdef __cplusplus
 204   }
 205 #endif
 206
 207 #endif