chiark - git - mdw - catacomb/blob - symm/salsa20-x86ish-sse2.S

   1 /// -*- mode: asm; asm-comment-char: ?/ -*-
   2 ///
   3 /// Fancy SIMD implementation of Salsa20
   4 ///
   5 /// (c) 2015 Straylight/Edgeware
   6 ///
   7
   8 ///----- Licensing notice ---------------------------------------------------
   9 ///
  10 /// This file is part of Catacomb.
  11 ///
  12 /// Catacomb is free software; you can redistribute it and/or modify
  13 /// it under the terms of the GNU Library General Public License as
  14 /// published by the Free Software Foundation; either version 2 of the
  15 /// License, or (at your option) any later version.
  16 ///
  17 /// Catacomb is distributed in the hope that it will be useful,
  18 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 /// GNU Library General Public License for more details.
  21 ///
  22 /// You should have received a copy of the GNU Library General Public
  23 /// License along with Catacomb; if not, write to the Free
  24 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  25 /// MA 02111-1307, USA.
  26
  27 ///--------------------------------------------------------------------------
  28 /// External definitions.
  29
  30 #include "config.h"
  31 #include "asm-common.h"
  32
  33 ///--------------------------------------------------------------------------
  34 /// Local utilities.
  35 // Magic constants for shuffling: `pshufd' immediates which rotate the four
  36 // words [a b c d] to ROTL = [d a b c], ROT2 = [c d a b], ROTR = [b c d a].
  37 #define ROTL 0x93
  38 #define ROT2 0x4e
  39 #define ROTR 0x39
  40
  41 ///--------------------------------------------------------------------------
  42 /// Main code.
  43         // Have the assembler accept the SSE2 instructions used below.
  44         .arch pentium4
  45         .section .text
  46
  47 FUNC(salsa20_core_x86ish_sse2)
  48
  49         // Arguments, in order: NR = round count (two rounds per loop pass),
  50         // IN = permuted 16-word input matrix, OUT = 16-word output buffer.
  51 #if CPUFAM_X86
  52         // Arguments come in on the stack, and will need to be collected.  We
  53         // can get away with just the scratch registers for integer work,
  54         // but we'll run out of XMM registers and will need some properly
  55         // aligned space which we'll steal from the stack.  I don't trust the
  56         // stack pointer's alignment, so I'll have to mask the stack pointer,
  57         // which in turn means I'll need to keep track of the old value.
  58         // Hence I'm making a full i386-style stack frame here.
  59         //
  60         // The Windows and SysV ABIs are sufficiently similar that we don't
  61         // need to worry about the differences here.
  62
  63 #  define NR ecx
  64 #  define IN eax
  65 #  define OUT edx
  66 #  define SAVE0 xmm6
  67 #  define SAVE1 xmm7
  68 #  define SAVE2 [esp + 0]
  69 #  define SAVE3 [esp + 16]
  70
  71         push    ebp
  72         mov     ebp, esp
  73         sub     esp, 32
  74         mov     IN, [ebp + 12]
  75         mov     OUT, [ebp + 16]
  76         and     esp, ~15
  77         mov     NR, [ebp + 8]
  78 #endif
  79
  80 #if CPUFAM_AMD64 && ABI_SYSV
  81         // This is nice.  We have plenty of XMM registers, and the arguments
  82         // are in useful places.  There's no need to spill anything and we
  83         // can just get on with the code.
  84
  85 #  define NR edi
  86 #  define IN rsi
  87 #  define OUT rdx
  88 #  define SAVE0 xmm6
  89 #  define SAVE1 xmm7
  90 #  define SAVE2 xmm8
  91 #  define SAVE3 xmm9
  92 #endif
  93
  94 #if CPUFAM_AMD64 && ABI_WIN
  95         // Arguments come in registers, but they're different between Windows
  96         // and everyone else (and everyone else is saner).
  97         //
  98         // The Windows ABI insists that we preserve some of the XMM
  99         // registers, but we want more than we can use as scratch space.  Two
 100         // places we only need to save a copy of the input for the
 101         // feedforward at the end; but the other two we want for the final
 102         // permutation, so save the old values on the stack.  (We need an
 103         // extra 8 bytes to align the stack.)
 104
 105 #  define NR ecx
 106 #  define IN rdx
 107 #  define OUT r8
 108 #  define SAVE0 xmm6
 109 #  define SAVE1 xmm7
 110 #  define SAVE2 [rsp + 32]
 111 #  define SAVE3 [rsp + 48]
 112
 113         sub     rsp, 64 + 8
 114         movdqa  [rsp +  0], xmm6
 115         movdqa  [rsp + 16], xmm7
 116 #endif
 117
 118         // First job is to slurp the matrix into XMM registers.  The words
 119         // have already been permuted conveniently to make them line up
 120         // better for SIMD processing.
 121         //
 122         // The textbook arrangement of the matrix is this.
 123         //
 124         //      [C K K K]
 125         //      [K C N N]
 126         //      [T T C K]
 127         //      [K K K C]
 128         //
 129         // But we've rotated the columns up so that the main diagonal with
 130         // the constants on it end up in the first row, giving something more
 131         // like
 132         //
 133         //      [C C C C]
 134         //      [K T K K]
 135         //      [T K K N]
 136         //      [K K N K]
 137         //
 138         // so the transformation looks like this:
 139         //
 140         //      [ 0  1  2  3]           [ 0  5 10 15] (a, xmm0)
 141         //      [ 4  5  6  7]    -->    [ 4  9 14  3] (b, xmm1)
 142         //      [ 8  9 10 11]           [ 8 13  2  7] (c, xmm2)
 143         //      [12 13 14 15]           [12  1  6 11] (d, xmm3)
 144         movdqu  xmm0, [IN +  0]
 145         movdqu  xmm1, [IN + 16]
 146         movdqu  xmm2, [IN + 32]
 147         movdqu  xmm3, [IN + 48]
 148
 149         // Take a copy for later.
 150         movdqa  SAVE0, xmm0
 151         movdqa  SAVE1, xmm1
 152         movdqa  SAVE2, xmm2
 153         movdqa  SAVE3, xmm3
 154
 155 loop:
 156         // Apply a column quarterround to each of the columns simultaneously.
 157         // Alas, there doesn't seem to be a packed doubleword rotate, so we
 158         // have to synthesize it.
 159
 160         // b ^= (a + d) <<<  7
 161         movdqa  xmm4, xmm0
 162         paddd   xmm4, xmm3
 163         movdqa  xmm5, xmm4
 164         pslld   xmm4, 7
 165         psrld   xmm5, 25
 166         por     xmm4, xmm5
 167         pxor    xmm1, xmm4
 168
 169         // c ^= (b + a) <<<  9
 170         movdqa  xmm4, xmm1
 171         paddd   xmm4, xmm0
 172         movdqa  xmm5, xmm4
 173         pslld   xmm4, 9
 174         psrld   xmm5, 23
 175         por     xmm4, xmm5
 176         pxor    xmm2, xmm4
 177
 178         // d ^= (c + b) <<< 13
 179         movdqa  xmm4, xmm2
 180         paddd   xmm4, xmm1
 181         pshufd  xmm1, xmm1, ROTL
 182         movdqa  xmm5, xmm4
 183         pslld   xmm4, 13
 184         psrld   xmm5, 19
 185         por     xmm4, xmm5
 186         pxor    xmm3, xmm4
 187
 188         // a ^= (d + c) <<< 18
 189         movdqa  xmm4, xmm3
 190         pshufd  xmm3, xmm3, ROTR
 191         paddd   xmm4, xmm2
 192         pshufd  xmm2, xmm2, ROT2
 193         movdqa  xmm5, xmm4
 194         pslld   xmm4, 18
 195         psrld   xmm5, 14
 196         por     xmm4, xmm5
 197         pxor    xmm0, xmm4
 198
 199         // The transpose conveniently only involves reordering elements of
 200         // individual rows, which can be done quite easily, and reordering
 201         // the rows themselves, which is a trivial renaming.  It doesn't
 202         // involve any movement of elements between rows.
 203         //
 204         //      [ 0  5 10 15]           [ 0  5 10 15] (a, xmm0)
 205         //      [ 4  9 14  3]    -->    [ 1  6 11 12] (b, xmm3)
 206         //      [ 8 13  2  7]           [ 2  7  8 13] (c, xmm2)
 207         //      [12  1  6 11]           [ 3  4  9 14] (d, xmm1)
 208         //
 209         // The shuffles have quite high latency, so they've been pushed
 210         // backwards into the main instruction list.
 211
 212         // Apply the row quarterround to each of the columns (yes!)
 213         // simultaneously.
 214
 215         // b ^= (a + d) <<<  7
 216         movdqa  xmm4, xmm0
 217         paddd   xmm4, xmm1
 218         movdqa  xmm5, xmm4
 219         pslld   xmm4, 7
 220         psrld   xmm5, 25
 221         por     xmm4, xmm5
 222         pxor    xmm3, xmm4
 223
 224         // c ^= (b + a) <<<  9
 225         movdqa  xmm4, xmm3
 226         paddd   xmm4, xmm0
 227         movdqa  xmm5, xmm4
 228         pslld   xmm4, 9
 229         psrld   xmm5, 23
 230         por     xmm4, xmm5
 231         pxor    xmm2, xmm4
 232
 233         // d ^= (c + b) <<< 13
 234         movdqa  xmm4, xmm2
 235         paddd   xmm4, xmm3
 236         pshufd  xmm3, xmm3, ROTL
 237         movdqa  xmm5, xmm4
 238         pslld   xmm4, 13
 239         psrld   xmm5, 19
 240         por     xmm4, xmm5
 241         pxor    xmm1, xmm4
 242
 243         // a ^= (d + c) <<< 18
 244         movdqa  xmm4, xmm1
 245         pshufd  xmm1, xmm1, ROTR
 246         paddd   xmm4, xmm2
 247         pshufd  xmm2, xmm2, ROT2
 248         movdqa  xmm5, xmm4
 249         pslld   xmm4, 18
 250         psrld   xmm5, 14
 251         por     xmm4, xmm5
 252         pxor    xmm0, xmm4
 253
 254         // The transpose has now been undone ready for the next loop; again,
 255         // the shuffles were pushed back because they take a long time coming
 256         // through.  Knock two rounds off the counter and go round again if
 257         // any are left.  Later processors fuse this pair into a single uop.
 258         sub     NR, 2
 259         ja      loop
 260
 261         // Almost there.  Firstly, the feedforward addition, and then we have
 262         // to write out the result.  Here we have to undo the permutation
 263         // which was already applied to the input.  Shuffling has quite high
 264         // latency, so arrange to start a new shuffle into a temporary as
 265         // soon as we've written out the old value.
 266         paddd   xmm0, SAVE0
 267         pshufd  xmm4, xmm0, ROTR
 268         movd    [OUT +  0], xmm0
 269
 270         paddd   xmm1, SAVE1
 271         pshufd  xmm5, xmm1, ROTL
 272         movd    [OUT + 16], xmm1
 273
 274         paddd   xmm2, SAVE2
 275         pshufd  xmm6, xmm2, ROT2
 276         movd    [OUT + 32], xmm2
 277
 278         paddd   xmm3, SAVE3
 279         pshufd  xmm7, xmm3, ROTR
 280         movd    [OUT + 48], xmm3
 281
 282         movd    [OUT +  4], xmm7
 283         pshufd  xmm7, xmm3, ROT2
 284         movd    [OUT + 24], xmm7
 285         pshufd  xmm3, xmm3, ROTL
 286         movd    [OUT + 44], xmm3
 287
 288         movd    [OUT +  8], xmm6
 289         pshufd  xmm6, xmm2, ROTL
 290         movd    [OUT + 28], xmm6
 291         pshufd  xmm2, xmm2, ROTR
 292         movd    [OUT + 52], xmm2
 293
 294         movd    [OUT + 12], xmm5
 295         pshufd  xmm5, xmm1, ROTR
 296         movd    [OUT + 36], xmm5
 297         pshufd  xmm1, xmm1, ROT2
 298         movd    [OUT + 56], xmm1
 299
 300         movd    [OUT + 20], xmm4
 301         pshufd  xmm4, xmm0, ROT2
 302         movd    [OUT + 40], xmm4
 303         pshufd  xmm0, xmm0, ROTL
 304         movd    [OUT + 60], xmm0
 305
 306         // Tidy things up.
 307
 308 #if CPUFAM_X86
 309         mov     esp, ebp
 310         pop     ebp
 311 #endif
 312 #if CPUFAM_AMD64 && ABI_WIN
 313         movdqa  xmm6, [rsp +  0]
 314         movdqa  xmm7, [rsp + 16]
 315         add     rsp, 64 + 8
 316 #endif
 317
 318         // And with that, we're done.
 319         ret
 320
 321 #undef NR
 322 #undef IN
 323 #undef OUT
 324 #undef SAVE0
 325 #undef SAVE1
 326 #undef SAVE2
 327 #undef SAVE3
 328
 329 ENDFUNC
 330
 331 ///----- That's all, folks --------------------------------------------------