chiark - git - mdw - catacomb/blob - symm/salsa20-x86ish-sse2.S

   1 /// -*- mode: asm; asm-comment-char: ?/ -*-
   2 ///
   3 /// Fancy SIMD implementation of Salsa20
   4 ///
   5 /// (c) 2015 Straylight/Edgeware
   6 ///
   7
   8 ///----- Licensing notice ---------------------------------------------------
   9 ///
  10 /// This file is part of Catacomb.
  11 ///
  12 /// Catacomb is free software; you can redistribute it and/or modify
  13 /// it under the terms of the GNU Library General Public License as
  14 /// published by the Free Software Foundation; either version 2 of the
  15 /// License, or (at your option) any later version.
  16 ///
  17 /// Catacomb is distributed in the hope that it will be useful,
  18 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 /// GNU Library General Public License for more details.
  21 ///
  22 /// You should have received a copy of the GNU Library General Public
  23 /// License along with Catacomb; if not, write to the Free
  24 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  25 /// MA 02111-1307, USA.
  26
  27 ///--------------------------------------------------------------------------
  28 /// External definitions.
  29
  30 #include "config.h"
  31 #include "asm-common.h"
  32
  33 ///--------------------------------------------------------------------------
  34 /// Main code.
  35
  36         .arch pentium4
  37         .text
  38
  39 FUNC(salsa20_core_x86ish_sse2)
  40
  41         // Initial setup.
  42
  43 #if CPUFAM_X86
  44         // Arguments come in on the stack, and will need to be collected.  We
  45         // we can get away with just the scratch registers for integer work,
  46         // but we'll run out of XMM registers and will need some properly
  47         // aligned space which we'll steal from the stack.  I don't trust the
  48         // stack pointer's alignment, so I'll have to mask the stack pointer,
  49         // which in turn means I'll need to keep track of the old value.
  50         // Hence I'm making a full i386-style stack frame here.
  51         //
  52         // The Windows and SysV ABIs are sufficiently similar that we don't
  53         // need to worry about the differences here.
  54
  55 #  define NR ecx
  56 #  define IN eax
  57 #  define OUT edx
  58 #  define SAVE0 xmm6
  59 #  define SAVE1 xmm7
  60 #  define SAVE2 [esp + 0]
  61 #  define SAVE3 [esp + 16]
  62
  63         pushreg ebp
  64         setfp   ebp
  65         sub     esp, 32
  66         mov     IN, [ebp + 12]
  67         mov     OUT, [ebp + 16]
  68         and     esp, ~15
  69         mov     NR, [ebp + 8]
  70 #endif
  71
  72 #if CPUFAM_AMD64 && ABI_SYSV
  73         // This is nice.  We have plenty of XMM registers, and the arguments
  74         // are in useful places.  There's no need to spill anything and we
  75         // can just get on with the code.
  76
  77 #  define NR edi
  78 #  define IN rsi
  79 #  define OUT rdx
  80 #  define SAVE0 xmm6
  81 #  define SAVE1 xmm7
  82 #  define SAVE2 xmm8
  83 #  define SAVE3 xmm9
  84 #endif
  85
  86 #  if CPUFAM_AMD64 && ABI_WIN
  87         // Arguments come in registers, but they're different between Windows
  88         // and everyone else (and everyone else is saner).
  89         //
  90         // The Windows ABI insists that we preserve some of the XMM
  91         // registers, but we want more than we can use as scratch space.  Two
  92         // places we only need to save a copy of the input for the
  93         // feedforward at the end; but the other two we want for the final
  94         // permutation, so save the old values on the stack.  (We need an
  95         // extra 8 bytes to align the stack.)
  96
  97 #  define NR ecx
  98 #  define IN rdx
  99 #  define OUT r8
 100 #  define SAVE0 xmm6
 101 #  define SAVE1 xmm7
 102 #  define SAVE2 [rsp + 32]
 103 #  define SAVE3 [rsp + 48]
 104
 105         stalloc 64 + 8
 106         savexmm xmm6, 0
 107         savexmm xmm7, 16
 108 #endif
 109
 110   endprologue
 111
 112         // First job is to slurp the matrix into XMM registers.  The words
 113         // have already been permuted conveniently to make them line up
 114         // better for SIMD processing.
 115         //
 116         // The textbook arrangement of the matrix is this.
 117         //
 118         //      [C K K K]
 119         //      [K C N N]
 120         //      [T T C K]
 121         //      [K K K C]
 122         //
 123         // But we've rotated the columns up so that the main diagonal with
 124         // the constants on it end up in the first row, giving something more
 125         // like
 126         //
 127         //      [C C C C]
 128         //      [K T K K]
 129         //      [T K K N]
 130         //      [K K N K]
 131         //
 132         // so the transformation looks like this:
 133         //
 134         //      [ 0  1  2  3]           [ 0  5 10 15] (a, xmm0)
 135         //      [ 4  5  6  7]    -->    [ 4  9 14  3] (b, xmm1)
 136         //      [ 8  9 10 11]           [ 8 13  2  7] (c, xmm2)
 137         //      [12 13 14 15]           [12  1  6 11] (d, xmm3)
 138         movdqu  xmm0, [IN +  0]
 139         movdqu  xmm1, [IN + 16]
 140         movdqu  xmm2, [IN + 32]
 141         movdqu  xmm3, [IN + 48]
 142
 143         // Take a copy for later.
 144         movdqa  SAVE0, xmm0
 145         movdqa  SAVE1, xmm1
 146         movdqa  SAVE2, xmm2
 147         movdqa  SAVE3, xmm3
 148
 149 0:
 150         // Apply a column quarterround to each of the columns simultaneously.
 151         // Alas, there doesn't seem to be a packed doubleword rotate, so we
 152         // have to synthesize it.
 153
 154         // b ^= (a + d) <<<  7
 155         movdqa  xmm4, xmm0
 156         paddd   xmm4, xmm3
 157         movdqa  xmm5, xmm4
 158         pslld   xmm4, 7
 159         psrld   xmm5, 25
 160         por     xmm4, xmm5
 161         pxor    xmm1, xmm4
 162
 163         // c ^= (b + a) <<<  9
 164         movdqa  xmm4, xmm1
 165         paddd   xmm4, xmm0
 166         movdqa  xmm5, xmm4
 167         pslld   xmm4, 9
 168         psrld   xmm5, 23
 169         por     xmm4, xmm5
 170         pxor    xmm2, xmm4
 171
 172         // d ^= (c + b) <<< 13
 173         movdqa  xmm4, xmm2
 174         paddd   xmm4, xmm1
 175          pshufd xmm1, xmm1, SHUF(2, 1, 0, 3)
 176         movdqa  xmm5, xmm4
 177         pslld   xmm4, 13
 178         psrld   xmm5, 19
 179         por     xmm4, xmm5
 180         pxor    xmm3, xmm4
 181
 182         // a ^= (d + c) <<< 18
 183         movdqa  xmm4, xmm3
 184          pshufd xmm3, xmm3, SHUF(0, 3, 2, 1)
 185         paddd   xmm4, xmm2
 186          pshufd xmm2, xmm2, SHUF(1, 0, 3, 2)
 187         movdqa  xmm5, xmm4
 188         pslld   xmm4, 18
 189         psrld   xmm5, 14
 190         por     xmm4, xmm5
 191         pxor    xmm0, xmm4
 192
 193         // The transpose conveniently only involves reordering elements of
 194         // individual rows, which can be done quite easily, and reordering
 195         // the rows themselves, which is a trivial renaming.  It doesn't
 196         // involve any movement of elements between rows.
 197         //
 198         //      [ 0  5 10 15]           [ 0  5 10 15] (a, xmm0)
 199         //      [ 4  9 14  3]    -->    [ 1  6 11 12] (b, xmm3)
 200         //      [ 8 13  2  7]           [ 2  7  8 13] (c, xmm2)
 201         //      [12  1  6 11]           [ 3  4  9 14] (d, xmm1)
 202         //
 203         // The shuffles have quite high latency, so they've been pushed
 204         // backwards into the main instruction list.
 205
 206         // Apply the row quarterround to each of the columns (yes!)
 207         // simultaneously.
 208
 209         // b ^= (a + d) <<<  7
 210         movdqa  xmm4, xmm0
 211         paddd   xmm4, xmm1
 212         movdqa  xmm5, xmm4
 213         pslld   xmm4, 7
 214         psrld   xmm5, 25
 215         por     xmm4, xmm5
 216         pxor    xmm3, xmm4
 217
 218         // c ^= (b + a) <<<  9
 219         movdqa  xmm4, xmm3
 220         paddd   xmm4, xmm0
 221         movdqa  xmm5, xmm4
 222         pslld   xmm4, 9
 223         psrld   xmm5, 23
 224         por     xmm4, xmm5
 225         pxor    xmm2, xmm4
 226
 227         // d ^= (c + b) <<< 13
 228         movdqa  xmm4, xmm2
 229         paddd   xmm4, xmm3
 230          pshufd xmm3, xmm3, SHUF(2, 1, 0, 3)
 231         movdqa  xmm5, xmm4
 232         pslld   xmm4, 13
 233         psrld   xmm5, 19
 234         por     xmm4, xmm5
 235         pxor    xmm1, xmm4
 236
 237         // a ^= (d + c) <<< 18
 238         movdqa  xmm4, xmm1
 239          pshufd xmm1, xmm1, SHUF(0, 3, 2, 1)
 240         paddd   xmm4, xmm2
 241          pshufd xmm2, xmm2, SHUF(1, 0, 3, 2)
 242         movdqa  xmm5, xmm4
 243         pslld   xmm4, 18
 244         psrld   xmm5, 14
 245         por     xmm4, xmm5
 246         pxor    xmm0, xmm4
 247
 248         // We had to undo the transpose ready for the next loop.  Again, push
 249         // back the shuffles because they take a long time coming through.
 250         // Decrement the loop counter and see if we should go round again.
 251         // Later processors fuse this pair into a single uop.
 252         sub     NR, 2
 253         ja      0b
 254
 255         // Almost there.  Firstly, the feedforward addition.
 256         paddd   xmm0, SAVE0                     //  0,  5, 10, 15
 257         paddd   xmm1, SAVE1                     //  4,  9, 14,  3
 258         paddd   xmm2, SAVE2                     //  8, 13,  2,  7
 259         paddd   xmm3, SAVE3                     // 12,  1,  6, 11
 260
 261         // Next we must undo the permutation which was already applied to the
 262         // input.  This can be done by juggling values in registers, with the
 263         // following fancy footwork: some row rotations, a transpose, and
 264         // some more rotations.
 265         pshufd  xmm1, xmm1, SHUF(2, 1, 0, 3)    //  3,  4,  9, 14
 266         pshufd  xmm2, xmm2, SHUF(1, 0, 3, 2)    //  2,  7,  8, 13
 267         pshufd  xmm3, xmm3, SHUF(0, 3, 2, 1)    //  1,  6, 11, 12
 268
 269         movdqa  xmm4, xmm0
 270         movdqa  xmm5, xmm3
 271         punpckldq xmm0, xmm2                    //  0,  2,  5,  7
 272         punpckldq xmm3, xmm1                    //  1,  3,  6,  4
 273         punpckhdq xmm4, xmm2                    //  10, 8, 15, 13
 274         punpckhdq xmm5, xmm1                    //  11, 9, 12, 14
 275
 276         movdqa  xmm1, xmm0
 277         movdqa  xmm2, xmm4
 278         punpckldq xmm0, xmm3                    //  0,  1,  2,  3
 279         punpckldq xmm4, xmm5                    // 10, 11,  8,  9
 280         punpckhdq xmm1, xmm3                    //  5,  6,  7,  4
 281         punpckhdq xmm2, xmm5                    // 15, 12, 13, 14
 282
 283         pshufd  xmm1, xmm1, SHUF(2, 1, 0, 3)    //  4,  5,  6,  7
 284         pshufd  xmm4, xmm4, SHUF(1, 0, 3, 2)    //  8,  9, 10, 11
 285         pshufd  xmm2, xmm2, SHUF(0, 3, 2, 1)    // 12, 13, 14, 15
 286
 287         // Finally we have to write out the result.
 288         movdqu  [OUT +  0], xmm0
 289         movdqu  [OUT + 16], xmm1
 290         movdqu  [OUT + 32], xmm4
 291         movdqu  [OUT + 48], xmm2
 292
 293         // Tidy things up.
 294 #if CPUFAM_X86
 295         dropfp
 296         popreg  ebp
 297 #endif
 298 #if CPUFAM_AMD64 && ABI_WIN
 299         rstrxmm xmm6, 0
 300         rsrrxmm xmm7, 16
 301         stfree  64 + 8
 302 #endif
 303
 304         // And with that, we're done.
 305         ret
 306
 307 #undef NR
 308 #undef IN
 309 #undef OUT
 310 #undef SAVE0
 311 #undef SAVE1
 312 #undef SAVE2
 313 #undef SAVE3
 314
 315 ENDFUNC
 316
 317 ///----- That's all, folks --------------------------------------------------