chiark - git - mdw - catacomb/blob - symm/salsa20-x86ish-sse2.S

   1 /// -*- mode: asm; asm-comment-char: ?/ -*-
   2 ///
   3 /// Fancy SIMD implementation of Salsa20
   4 ///
   5 /// (c) 2015 Straylight/Edgeware
   6 ///
   7
   8 ///----- Licensing notice ---------------------------------------------------
   9 ///
  10 /// This file is part of Catacomb.
  11 ///
  12 /// Catacomb is free software; you can redistribute it and/or modify
  13 /// it under the terms of the GNU Library General Public License as
  14 /// published by the Free Software Foundation; either version 2 of the
  15 /// License, or (at your option) any later version.
  16 ///
  17 /// Catacomb is distributed in the hope that it will be useful,
  18 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 /// GNU Library General Public License for more details.
  21 ///
  22 /// You should have received a copy of the GNU Library General Public
  23 /// License along with Catacomb; if not, write to the Free
  24 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  25 /// MA 02111-1307, USA.
  26
  27 ///--------------------------------------------------------------------------
  28 /// External definitions.
  29
  30 #include "config.h"
  31 #include "asm-common.h"
  32
  33 ///--------------------------------------------------------------------------
  34 /// Main code.
  35         // Target the Pentium 4 instruction set; assemble into `.text'.
  36         .arch pentium4
  37         .text
  38
  39 FUNC(salsa20_core_x86ish_sse2)
  40         // Salsa20 core using SSE2.  Arguments, in order: NR, the number of
  41         // rounds (each loop pass does two); IN, the 64-byte input matrix,
  42         // already permuted as shown below; OUT, the 64-byte output.
  43 #if CPUFAM_X86
  44         // Arguments come in on the stack, and will need to be collected.  We
  45         // can get away with just the scratch registers for integer work,
  46         // but we'll run out of XMM registers and will need some properly
  47         // aligned space which we'll steal from the stack.  I don't trust the
  48         // stack pointer's alignment, so I'll have to mask the stack pointer,
  49         // which in turn means I'll need to keep track of the old value.
  50         // Hence I'm making a full i386-style stack frame here.
  51         //
  52         // The Windows and SysV ABIs are sufficiently similar that we don't
  53         // need to worry about the differences here.
  54
  55 #  define NR ecx
  56 #  define IN eax
  57 #  define OUT edx
  58 #  define SAVE0 xmm6
  59 #  define SAVE1 xmm7
  60 #  define SAVE2 [esp + 0]
  61 #  define SAVE3 [esp + 16]
  62
  63         push    ebp
  64         mov     ebp, esp
  65         sub     esp, 32
  66         mov     IN, [ebp + 12]
  67         mov     OUT, [ebp + 16]
  68         and     esp, ~15
  69         mov     NR, [ebp + 8]
  70 #endif
  71
  72 #if CPUFAM_AMD64 && ABI_SYSV
  73         // This is nice.  We have plenty of XMM registers, and the arguments
  74         // are in useful places.  There's no need to spill anything and we
  75         // can just get on with the code.
  76
  77 #  define NR edi
  78 #  define IN rsi
  79 #  define OUT rdx
  80 #  define SAVE0 xmm6
  81 #  define SAVE1 xmm7
  82 #  define SAVE2 xmm8
  83 #  define SAVE3 xmm9
  84 #endif
  85
  86 #if CPUFAM_AMD64 && ABI_WIN
  87         // Arguments come in registers, but they're different between Windows
  88         // and everyone else (and everyone else is saner).
  89         //
  90         // The Windows ABI insists that we preserve some of the XMM
  91         // registers, but we want more than we can use as scratch space.  Two
  92         // places we only need to save a copy of the input for the
  93         // feedforward at the end; but the other two we want for the final
  94         // permutation, so save the old values on the stack.  (We need an
  95         // extra 8 bytes to align the stack.)
  96
  97 #  define NR ecx
  98 #  define IN rdx
  99 #  define OUT r8
 100 #  define SAVE0 xmm6
 101 #  define SAVE1 xmm7
 102 #  define SAVE2 [rsp + 32]
 103 #  define SAVE3 [rsp + 48]
 104
 105         sub     rsp, 64 + 8
 106           .seh_stackalloc 64 + 8
 107         movdqa  [rsp +  0], xmm6
 108           .seh_savexmm xmm6, 0
 109         movdqa  [rsp + 16], xmm7
 110           .seh_savexmm xmm7, 16
 111   .seh_endprologue
 112 #endif
 113
 114         // First job is to slurp the matrix into XMM registers.  The words
 115         // have already been permuted conveniently to make them line up
 116         // better for SIMD processing.
 117         //
 118         // The textbook arrangement of the matrix is this.
 119         //
 120         //      [C K K K]
 121         //      [K C N N]
 122         //      [T T C K]
 123         //      [K K K C]
 124         //
 125         // But we've rotated the columns up so that the main diagonal with
 126         // the constants on it ends up in the first row, giving something more
 127         // like
 128         //
 129         //      [C C C C]
 130         //      [K T K K]
 131         //      [T K K N]
 132         //      [K K N K]
 133         //
 134         // so the transformation looks like this:
 135         //
 136         //      [ 0  1  2  3]           [ 0  5 10 15] (a, xmm0)
 137         //      [ 4  5  6  7]    -->    [ 4  9 14  3] (b, xmm1)
 138         //      [ 8  9 10 11]           [ 8 13  2  7] (c, xmm2)
 139         //      [12 13 14 15]           [12  1  6 11] (d, xmm3)
 140         movdqu  xmm0, [IN +  0]
 141         movdqu  xmm1, [IN + 16]
 142         movdqu  xmm2, [IN + 32]
 143         movdqu  xmm3, [IN + 48]
 144
 145         // Take a copy for later.
 146         movdqa  SAVE0, xmm0
 147         movdqa  SAVE1, xmm1
 148         movdqa  SAVE2, xmm2
 149         movdqa  SAVE3, xmm3
 150
 151 0:
 152         // Apply a column quarterround to each of the columns simultaneously.
 153         // Alas, there doesn't seem to be a packed doubleword rotate, so we
 154         // have to synthesize it.
 155
 156         // b ^= (a + d) <<<  7
 157         movdqa  xmm4, xmm0
 158         paddd   xmm4, xmm3
 159         movdqa  xmm5, xmm4
 160         pslld   xmm4, 7
 161         psrld   xmm5, 25
 162         por     xmm4, xmm5
 163         pxor    xmm1, xmm4
 164
 165         // c ^= (b + a) <<<  9
 166         movdqa  xmm4, xmm1
 167         paddd   xmm4, xmm0
 168         movdqa  xmm5, xmm4
 169         pslld   xmm4, 9
 170         psrld   xmm5, 23
 171         por     xmm4, xmm5
 172         pxor    xmm2, xmm4
 173
 174         // d ^= (c + b) <<< 13
 175         movdqa  xmm4, xmm2
 176         paddd   xmm4, xmm1
 177          pshufd xmm1, xmm1, SHUF(2, 1, 0, 3)
 178         movdqa  xmm5, xmm4
 179         pslld   xmm4, 13
 180         psrld   xmm5, 19
 181         por     xmm4, xmm5
 182         pxor    xmm3, xmm4
 183
 184         // a ^= (d + c) <<< 18
 185         movdqa  xmm4, xmm3
 186          pshufd xmm3, xmm3, SHUF(0, 3, 2, 1)
 187         paddd   xmm4, xmm2
 188          pshufd xmm2, xmm2, SHUF(1, 0, 3, 2)
 189         movdqa  xmm5, xmm4
 190         pslld   xmm4, 18
 191         psrld   xmm5, 14
 192         por     xmm4, xmm5
 193         pxor    xmm0, xmm4
 194
 195         // The transpose conveniently only involves reordering elements of
 196         // individual rows, which can be done quite easily, and reordering
 197         // the rows themselves, which is a trivial renaming.  It doesn't
 198         // involve any movement of elements between rows.
 199         //
 200         //      [ 0  5 10 15]           [ 0  5 10 15] (a, xmm0)
 201         //      [ 4  9 14  3]    -->    [ 1  6 11 12] (b, xmm3)
 202         //      [ 8 13  2  7]           [ 2  7  8 13] (c, xmm2)
 203         //      [12  1  6 11]           [ 3  4  9 14] (d, xmm1)
 204         //
 205         // The shuffles have quite high latency, so they've been pushed
 206         // backwards into the main instruction list.
 207
 208         // Apply the row quarterround to each of the columns (yes!)
 209         // simultaneously.
 210
 211         // b ^= (a + d) <<<  7
 212         movdqa  xmm4, xmm0
 213         paddd   xmm4, xmm1
 214         movdqa  xmm5, xmm4
 215         pslld   xmm4, 7
 216         psrld   xmm5, 25
 217         por     xmm4, xmm5
 218         pxor    xmm3, xmm4
 219
 220         // c ^= (b + a) <<<  9
 221         movdqa  xmm4, xmm3
 222         paddd   xmm4, xmm0
 223         movdqa  xmm5, xmm4
 224         pslld   xmm4, 9
 225         psrld   xmm5, 23
 226         por     xmm4, xmm5
 227         pxor    xmm2, xmm4
 228
 229         // d ^= (c + b) <<< 13
 230         movdqa  xmm4, xmm2
 231         paddd   xmm4, xmm3
 232          pshufd xmm3, xmm3, SHUF(2, 1, 0, 3)
 233         movdqa  xmm5, xmm4
 234         pslld   xmm4, 13
 235         psrld   xmm5, 19
 236         por     xmm4, xmm5
 237         pxor    xmm1, xmm4
 238
 239         // a ^= (d + c) <<< 18
 240         movdqa  xmm4, xmm1
 241          pshufd xmm1, xmm1, SHUF(0, 3, 2, 1)
 242         paddd   xmm4, xmm2
 243          pshufd xmm2, xmm2, SHUF(1, 0, 3, 2)
 244         movdqa  xmm5, xmm4
 245         pslld   xmm4, 18
 246         psrld   xmm5, 14
 247         por     xmm4, xmm5
 248         pxor    xmm0, xmm4
 249
 250         // We had to undo the transpose ready for the next loop.  Again, push
 251         // back the shuffles because they take a long time coming through.
 252         // Count off the two rounds just done; loop while the counter stays
 253         // above zero.  Later processors fuse this pair into a single uop.
 254         sub     NR, 2
 255         ja      0b
 256
 257         // Almost there.  Firstly, the feedforward addition, and then we have
 258         // to write out the result.  Here we have to undo the permutation
 259         // which was already applied to the input.  Shuffling has quite high
 260         // latency, so arrange to start a new shuffle into a temporary as
 261         // soon as we've written out the old value.
 262         paddd   xmm0, SAVE0
 263          pshufd xmm4, xmm0, SHUF(0, 3, 2, 1)
 264         movd    [OUT +  0], xmm0
 265
 266         paddd   xmm1, SAVE1
 267          pshufd xmm5, xmm1, SHUF(2, 1, 0, 3)
 268         movd    [OUT + 16], xmm1
 269
 270         paddd   xmm2, SAVE2
 271          pshufd xmm6, xmm2, SHUF(1, 0, 3, 2)
 272         movd    [OUT + 32], xmm2
 273
 274         paddd   xmm3, SAVE3
 275          pshufd xmm7, xmm3, SHUF(0, 3, 2, 1)
 276         movd    [OUT + 48], xmm3
 277
 278         movd    [OUT +  4], xmm7
 279          pshufd xmm7, xmm3, SHUF(1, 0, 3, 2)
 280         movd    [OUT + 24], xmm7
 281          pshufd xmm3, xmm3, SHUF(2, 1, 0, 3)
 282         movd    [OUT + 44], xmm3
 283
 284         movd    [OUT +  8], xmm6
 285          pshufd xmm6, xmm2, SHUF(2, 1, 0, 3)
 286         movd    [OUT + 28], xmm6
 287          pshufd xmm2, xmm2, SHUF(0, 3, 2, 1)
 288         movd    [OUT + 52], xmm2
 289
 290         movd    [OUT + 12], xmm5
 291          pshufd xmm5, xmm1, SHUF(0, 3, 2, 1)
 292         movd    [OUT + 36], xmm5
 293          pshufd xmm1, xmm1, SHUF(1, 0, 3, 2)
 294         movd    [OUT + 56], xmm1
 295
 296         movd    [OUT + 20], xmm4
 297          pshufd xmm4, xmm0, SHUF(1, 0, 3, 2)
 298         movd    [OUT + 40], xmm4
 299          pshufd xmm0, xmm0, SHUF(2, 1, 0, 3)
 300         movd    [OUT + 60], xmm0
 301
 302         // Tidy things up.
 303 #if CPUFAM_X86
 304         mov     esp, ebp
 305         pop     ebp
 306 #endif
 307 #if CPUFAM_AMD64 && ABI_WIN
 308         movdqa  xmm6, [rsp +  0]
 309         movdqa  xmm7, [rsp + 16]
 310         add     rsp, 64 + 8
 311 #endif
 312
 313         // And with that, we're done.
 314         ret
 315
 316 #undef NR
 317 #undef IN
 318 #undef OUT
 319 #undef SAVE0
 320 #undef SAVE1
 321 #undef SAVE2
 322 #undef SAVE3
 323
 324 ENDFUNC
 325
 326 ///----- That's all, folks --------------------------------------------------