chiark - git - mdw - catacomb/blob - symm/salsa20-x86ish-sse2.S

   1 /// -*- mode: asm; asm-comment-char: ?/ -*-
   2 ///
   3 /// Fancy SIMD implementation of Salsa20
   4 ///
   5 /// (c) 2015 Straylight/Edgeware
   6 ///
   7
   8 ///----- Licensing notice ---------------------------------------------------
   9 ///
  10 /// This file is part of Catacomb.
  11 ///
  12 /// Catacomb is free software; you can redistribute it and/or modify
  13 /// it under the terms of the GNU Library General Public License as
  14 /// published by the Free Software Foundation; either version 2 of the
  15 /// License, or (at your option) any later version.
  16 ///
  17 /// Catacomb is distributed in the hope that it will be useful,
  18 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 /// GNU Library General Public License for more details.
  21 ///
  22 /// You should have received a copy of the GNU Library General Public
  23 /// License along with Catacomb; if not, write to the Free
  24 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  25 /// MA 02111-1307, USA.
  26
  27 ///--------------------------------------------------------------------------
  28 /// Preliminaries.
  29
  30 #include "config.h"
  31 #include "asm-common.h"
  32
  33         .text
  34
  35 ///--------------------------------------------------------------------------
  36 /// Main code.
  37
  38 FUNC(salsa20_core_x86ish_avx)
             // AVX-aware entry point.  There is no `ret' in this function:
             // after the `vzeroupper' it runs straight on into
             // salsa20_core_x86ish_sse2, so that function must remain the
             // very next thing in this file.  Nothing here touches the
             // integer registers or the stack, so the arguments are left
             // exactly as the caller passed them.
             //
             // Tell the assembler to accept AVX instructions, just for the
             // `vzeroupper' below.
  39         .arch   .avx
             // Clear the upper halves of the YMM registers.
             // NOTE(review): presumably this is to avoid the penalty for
             // running the legacy-SSE code below with dirty upper state --
             // confirm.
  40         vzeroupper
  41   endprologue
  42         // drop through...
  43 ENDFUNC
  44
             // Put the assembler back to its baseline instruction set, so that
             // the SSE2 code which follows can't pick up AVX instructions by
             // accident.
  45         .arch   pentium4
  46
  47 FUNC(salsa20_core_x86ish_sse2)
             // Salsa20 core function, using SSE2.
             //
             // Arguments, in order (where each one arrives is sorted out
             // per-ABI below):
             //
             //   NR    number of rounds to apply; the loop performs two
             //         rounds (a column round and a row round) per trip
             //   IN    address of the 16-word input matrix, already permuted
             //         as described below; read with unaligned loads
             //   OUT   address at which to store the 16-word result, in the
             //         ordinary (unpermuted) word order; written with
             //         unaligned stores
             //
             // The result is the output of the rounds plus the original input
             // (the feedforward).  xmm0--xmm5 and the SAVE0--SAVE3 locations
             // are used as working storage.
             //
             // NOTE(review): the C prototype isn't visible from here; judging
             // by the argument order it is presumably something like
             // `void (unsigned nr, const void *in, void *out)' -- confirm
             // against the header.
  48
  49         // Initial setup.
  50
  51 #if CPUFAM_X86
  52         // Arguments come in on the stack, and will need to be collected.  We
  53         // can get away with just the scratch registers for integer work, but
  54         // we'll run out of XMM registers and will need some properly aligned
  55         // space which we'll steal from the stack.  I don't trust the stack
  56         // pointer's alignment, so I'll have to mask the stack pointer, which
  57         // in turn means I'll need to keep track of the old value.  Hence I'm
  58         // making a full i386-style stack frame here.
  59         //
  60         // The Windows and SysV ABIs are sufficiently similar that we don't
  61         // need to worry about the differences here.
  62
  63 #  define NR ecx
  64 #  define IN eax
  65 #  define OUT edx
  66 #  define SAVE0 xmm6
  67 #  define SAVE1 xmm7
  68 #  define SAVE2 [esp + 0]
  69 #  define SAVE3 [esp + 16]
  70
             // The three arguments are read relative to the frame pointer:
             // [ebp + 8] is the round count, [ebp + 12] the input address and
             // [ebp + 16] the output address.  The 32 bytes reserved hold
             // SAVE2 and SAVE3; masking esp afterwards can only move it
             // further down, so both slots stay inside the reserved space and
             // end up 16-byte aligned as `movdqa' requires.
  71         pushreg ebp
  72         setfp
  73         sub     esp, 32
  74         mov     IN, [ebp + 12]
  75         mov     OUT, [ebp + 16]
  76         and     esp, ~15
  77         mov     NR, [ebp + 8]
  78 #endif
  79
  80 #if CPUFAM_AMD64 && ABI_SYSV
  81         // This is nice.  We have plenty of XMM registers, and the arguments
  82         // are in useful places.  There's no need to spill anything and we
  83         // can just get on with the code.
  84
  85 #  define NR edi
  86 #  define IN rsi
  87 #  define OUT rdx
  88 #  define SAVE0 xmm6
  89 #  define SAVE1 xmm7
  90 #  define SAVE2 xmm8
  91 #  define SAVE3 xmm9
  92 #endif
  93
  94 #  if CPUFAM_AMD64 && ABI_WIN
  95         // Arguments come in registers, but they're different between Windows
  96         // and everyone else (and everyone else is saner).
  97         //
  98         // The Windows ABI insists that we preserve some of the XMM
  99         // registers, but we want more than we can use as scratch space.  Two
 100         // places we only need to save a copy of the input for the
 101         // feedforward at the end; but the other two we want for the final
 102         // permutation, so save the old values on the stack.  (We need an
 103         // extra 8 bytes to align the stack.)
 104
 105 #  define NR ecx
 106 #  define IN rdx
 107 #  define OUT r8
 108 #  define SAVE0 xmm6
 109 #  define SAVE1 xmm7
 110 #  define SAVE2 [rsp + 32]
 111 #  define SAVE3 [rsp + 48]
 112
             // Resulting frame layout, assuming that the `savexmm' offsets are
             // relative to the new rsp (TODO confirm against asm-common.h):
             //
             //      [rsp +  0]      caller's xmm6
             //      [rsp + 16]      caller's xmm7
             //      [rsp + 32]      SAVE2
             //      [rsp + 48]      SAVE3
             //      [rsp + 64]      8 bytes of alignment padding
 113         stalloc 64 + 8
 114         savexmm xmm6, 0
 115         savexmm xmm7, 16
 116 #endif
 117
 118   endprologue
 119
 120         // First job is to slurp the matrix into XMM registers.  The words
 121         // have already been permuted conveniently to make them line up
 122         // better for SIMD processing.
 123         //
 124         // The textbook arrangement of the matrix is this.
 125         //
 126         //      [C K K K]
 127         //      [K C N N]
 128         //      [T T C K]
 129         //      [K K K C]
 130         //
 131         // But we've rotated the columns up so that the main diagonal with
 132         // the constants on it end up in the first row, giving something more
 133         // like
 134         //
 135         //      [C C C C]
 136         //      [K T K K]
 137         //      [T K K N]
 138         //      [K K N K]
 139         //
 140         // so the transformation looks like this:
 141         //
 142         //      [ 0  1  2  3]           [ 0  5 10 15] (a, xmm0)
 143         //      [ 4  5  6  7]    -->    [ 4  9 14  3] (b, xmm1)
 144         //      [ 8  9 10 11]           [ 8 13  2  7] (c, xmm2)
 145         //      [12 13 14 15]           [12  1  6 11] (d, xmm3)
             //
             // No alignment is assumed for IN, hence the unaligned loads.
 146         movdqu  xmm0, [IN +  0]
 147         movdqu  xmm1, [IN + 16]
 148         movdqu  xmm2, [IN + 32]
 149         movdqu  xmm3, [IN + 48]
 150
 151         // Take a copy for later: these are the values added back in by the
             // feedforward after the loop.  Each SAVE location is either a
             // register or one of the aligned stack slots set up above, so the
             // aligned move is fine here.
 152         movdqa  SAVE0, xmm0
 153         movdqa  SAVE1, xmm1
 154         movdqa  SAVE2, xmm2
 155         movdqa  SAVE3, xmm3
 156
             // Top of the main loop.  Each trip round does one column round
             // and one row round, i.e., two rounds' worth of work.
 157 0:
 158         // Apply a column quarterround to each of the columns simultaneously.
 159         // Alas, there doesn't seem to be a packed doubleword rotate, so we
 160         // have to synthesize it.
             //
             // Each step below forms the sum in xmm4, copies it to xmm5, and
             // builds the left-rotation by n as (x << n) | (x >> (32 - n))
             // before XORing it into the target row.
 161
 162         // b ^= (a + d) <<<  7
 163         movdqa  xmm4, xmm0
 164         paddd   xmm4, xmm3
 165         movdqa  xmm5, xmm4
 166         pslld   xmm4, 7
 167         psrld   xmm5, 25
 168         por     xmm4, xmm5
 169         pxor    xmm1, xmm4
 170
 171         // c ^= (b + a) <<<  9
 172         movdqa  xmm4, xmm1
 173         paddd   xmm4, xmm0
 174         movdqa  xmm5, xmm4
 175         pslld   xmm4, 9
 176         psrld   xmm5, 23
 177         por     xmm4, xmm5
 178         pxor    xmm2, xmm4
 179
 180         // d ^= (c + b) <<< 13
             //
             // From here on, the instructions indented by one extra column are
             // the row rotations belonging to the transpose described further
             // down.  Each is issued as soon as this round has finished
             // reading the old value of the row in question.
 181         movdqa  xmm4, xmm2
 182         paddd   xmm4, xmm1
 183          pshufd xmm1, xmm1, SHUF(3, 0, 1, 2)
 184         movdqa  xmm5, xmm4
 185         pslld   xmm4, 13
 186         psrld   xmm5, 19
 187         por     xmm4, xmm5
 188         pxor    xmm3, xmm4
 189
 190         // a ^= (d + c) <<< 18
 191         movdqa  xmm4, xmm3
 192          pshufd xmm3, xmm3, SHUF(1, 2, 3, 0)
 193         paddd   xmm4, xmm2
 194          pshufd xmm2, xmm2, SHUF(2, 3, 0, 1)
 195         movdqa  xmm5, xmm4
 196         pslld   xmm4, 18
 197         psrld   xmm5, 14
 198         por     xmm4, xmm5
 199         pxor    xmm0, xmm4
 200
 201         // The transpose conveniently only involves reordering elements of
 202         // individual rows, which can be done quite easily, and reordering
 203         // the rows themselves, which is a trivial renaming.  It doesn't
 204         // involve any movement of elements between rows.
 205         //
 206         //      [ 0  5 10 15]           [ 0  5 10 15] (a, xmm0)
 207         //      [ 4  9 14  3]    -->    [ 1  6 11 12] (b, xmm3)
 208         //      [ 8 13  2  7]           [ 2  7  8 13] (c, xmm2)
 209         //      [12  1  6 11]           [ 3  4  9 14] (d, xmm1)
 210         //
 211         // The shuffles have quite high latency, so they've been pushed
 212         // backwards into the main instruction list.
 213
 214         // Apply the row quarterround to each of the columns (yes!)
 215         // simultaneously.
             //
             // Note the renaming: for this half, b lives in xmm3 and d in
             // xmm1, as in the right-hand matrix above.
 216
 217         // b ^= (a + d) <<<  7
 218         movdqa  xmm4, xmm0
 219         paddd   xmm4, xmm1
 220         movdqa  xmm5, xmm4
 221         pslld   xmm4, 7
 222         psrld   xmm5, 25
 223         por     xmm4, xmm5
 224         pxor    xmm3, xmm4
 225
 226         // c ^= (b + a) <<<  9
 227         movdqa  xmm4, xmm3
 228         paddd   xmm4, xmm0
 229         movdqa  xmm5, xmm4
 230         pslld   xmm4, 9
 231         psrld   xmm5, 23
 232         por     xmm4, xmm5
 233         pxor    xmm2, xmm4
 234
 235         // d ^= (c + b) <<< 13
 236         movdqa  xmm4, xmm2
 237         paddd   xmm4, xmm3
 238          pshufd xmm3, xmm3, SHUF(3, 0, 1, 2)
 239         movdqa  xmm5, xmm4
 240         pslld   xmm4, 13
 241         psrld   xmm5, 19
 242         por     xmm4, xmm5
 243         pxor    xmm1, xmm4
 244
 245         // a ^= (d + c) <<< 18
 246         movdqa  xmm4, xmm1
 247          pshufd xmm1, xmm1, SHUF(1, 2, 3, 0)
 248         paddd   xmm4, xmm2
 249          pshufd xmm2, xmm2, SHUF(2, 3, 0, 1)
 250         movdqa  xmm5, xmm4
 251         pslld   xmm4, 18
 252         psrld   xmm5, 14
 253         por     xmm4, xmm5
 254         pxor    xmm0, xmm4
 255
 256         // We had to undo the transpose ready for the next loop.  Again, push
 257         // back the shuffles because they take a long time coming through.
 258         // Decrement the loop counter and see if we should go round again.
 259         // Later processors fuse this pair into a single uop.
             //
             // NR counts single rounds, so it drops by two per trip.  The `ja'
             // falls through once the count reaches zero or wraps below it.
             // The test is at the bottom, so the body above always runs at
             // least once, whatever NR was on entry.
 260         sub     NR, 2
 261         ja      0b
 262
 263         // Almost there.  Firstly, the feedforward addition.
 264         paddd   xmm0, SAVE0                     //  0,  5, 10, 15
 265         paddd   xmm1, SAVE1                     //  4,  9, 14,  3
 266         paddd   xmm2, SAVE2                     //  8, 13,  2,  7
 267         paddd   xmm3, SAVE3                     // 12,  1,  6, 11
 268
 269         // Next we must undo the permutation which was already applied to the
 270         // input.  This can be done by juggling values in registers, with the
 271         // following fancy footwork: some row rotations, a transpose, and
 272         // some more rotations.
             //
             // The trailing comments give the matrix word indices held in each
             // destination register, lowest lane first.
 273         pshufd  xmm1, xmm1, SHUF(3, 0, 1, 2)    //  3,  4,  9, 14
 274         pshufd  xmm2, xmm2, SHUF(2, 3, 0, 1)    //  2,  7,  8, 13
 275         pshufd  xmm3, xmm3, SHUF(1, 2, 3, 0)    //  1,  6, 11, 12
 276
 277         movdqa  xmm4, xmm0
 278         movdqa  xmm5, xmm3
 279         punpckldq xmm0, xmm2                    //  0,  2,  5,  7
 280         punpckldq xmm3, xmm1                    //  1,  3,  6,  4
 281         punpckhdq xmm4, xmm2                    //  10, 8, 15, 13
 282         punpckhdq xmm5, xmm1                    //  11, 9, 12, 14
 283
 284         movdqa  xmm1, xmm0
 285         movdqa  xmm2, xmm4
 286         punpckldq xmm0, xmm3                    //  0,  1,  2,  3
 287         punpckldq xmm4, xmm5                    // 10, 11,  8,  9
 288         punpckhdq xmm1, xmm3                    //  5,  6,  7,  4
 289         punpckhdq xmm2, xmm5                    // 15, 12, 13, 14
 290
 291         pshufd  xmm1, xmm1, SHUF(3, 0, 1, 2)    //  4,  5,  6,  7
 292         pshufd  xmm4, xmm4, SHUF(2, 3, 0, 1)    //  8,  9, 10, 11
 293         pshufd  xmm2, xmm2, SHUF(1, 2, 3, 0)    // 12, 13, 14, 15
 294
 295         // Finally we have to write out the result.  Note that the rows have
             // ended up in xmm0, xmm1, xmm4, xmm2, in that order.  No alignment
             // is assumed for OUT, hence the unaligned stores.
 296         movdqu  [OUT +  0], xmm0
 297         movdqu  [OUT + 16], xmm1
 298         movdqu  [OUT + 32], xmm4
 299         movdqu  [OUT + 48], xmm2
 300
 301         // Tidy things up.
 302 #if CPUFAM_X86
             // NOTE(review): `dropfp' presumably puts esp back from ebp, which
             // is what discards the masked scratch area -- confirm against
             // asm-common.h.
 303         dropfp
 304         popreg  ebp
 305 #endif
 306 #if CPUFAM_AMD64 && ABI_WIN
             // Put back the caller's xmm6 and xmm7 from the slots they were
             // saved in by the prologue, and release the frame.
 307         rstrxmm xmm6, 0
 308         rstrxmm xmm7, 16
 309         stfree  64 + 8
 310 #endif
 311
 312         // And with that, we're done.
 313         ret
 314
             // Clear out the per-ABI register and save-slot names.
 315 #undef NR
 316 #undef IN
 317 #undef OUT
 318 #undef SAVE0
 319 #undef SAVE1
 320 #undef SAVE2
 321 #undef SAVE3
 322
 323 ENDFUNC
 324
 325 ///----- That's all, folks --------------------------------------------------