chiark - git - mdw - catacomb/blob - symm/salsa20-x86ish-sse2.S

   1 /// -*- mode: asm; asm-comment-char: ?/ -*-
   2 ///
   3 /// Fancy SIMD implementation of Salsa20
   4 ///
   5 /// (c) 2015 Straylight/Edgeware
   6 ///
   7
   8 ///----- Licensing notice ---------------------------------------------------
   9 ///
  10 /// This file is part of Catacomb.
  11 ///
  12 /// Catacomb is free software; you can redistribute it and/or modify
  13 /// it under the terms of the GNU Library General Public License as
  14 /// published by the Free Software Foundation; either version 2 of the
  15 /// License, or (at your option) any later version.
  16 ///
  17 /// Catacomb is distributed in the hope that it will be useful,
  18 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 /// GNU Library General Public License for more details.
  21 ///
  22 /// You should have received a copy of the GNU Library General Public
  23 /// License along with Catacomb; if not, write to the Free
  24 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  25 /// MA 02111-1307, USA.
  26
  27 ///--------------------------------------------------------------------------
  28 /// External definitions.
  29
  30 #include "config.h"
  31 #include "asm-common.h"
  32
  33 ///--------------------------------------------------------------------------
  34 /// Local utilities.
  35
  36 // pshufd immediates: rotate words up one lane, by two, or down one lane.
  37 #define ROTL 0x93
  38 #define ROT2 0x4e
  39 #define ROTR 0x39
  40
  41 ///--------------------------------------------------------------------------
  42 /// Main code.
  43
  44         .arch pentium4
  45         .text
  46
  47 FUNC(salsa20_core_x86ish_sse2)
  48         // Initial setup.  The arguments, in C order, are NR (round count),
  49         // IN (64-byte permuted input matrix) and OUT (64-byte output).
  50
  51 #if CPUFAM_X86
  52         // Arguments come in on the stack, and will need to be collected.  We
  53         // can get away with just the scratch registers for integer work,
  54         // but we'll run out of XMM registers and will need some properly
  55         // aligned space which we'll steal from the stack.  I don't trust the
  56         // stack pointer's alignment, so I'll have to mask the stack pointer,
  57         // which in turn means I'll need to keep track of the old value.
  58         // Hence I'm making a full i386-style stack frame here.
  59         //
  60         // The Windows and SysV ABIs are sufficiently similar that we don't
  61         // need to worry about the differences here.
  62
  63 #  define NR ecx
  64 #  define IN eax
  65 #  define OUT edx
  66 #  define SAVE0 xmm6
  67 #  define SAVE1 xmm7
  68 #  define SAVE2 [esp + 0]
  69 #  define SAVE3 [esp + 16]
  70
  71         push    ebp
  72         mov     ebp, esp
  73         sub     esp, 32
  74         mov     IN, [ebp + 12]
  75         mov     OUT, [ebp + 16]
  76         and     esp, ~15
  77         mov     NR, [ebp + 8]
  78 #endif
  79
  80 #if CPUFAM_AMD64 && ABI_SYSV
  81         // This is nice.  We have plenty of XMM registers, and the arguments
  82         // are in useful places.  There's no need to spill anything and we
  83         // can just get on with the code.
  84
  85 #  define NR edi
  86 #  define IN rsi
  87 #  define OUT rdx
  88 #  define SAVE0 xmm6
  89 #  define SAVE1 xmm7
  90 #  define SAVE2 xmm8
  91 #  define SAVE3 xmm9
  92 #endif
  93
  94 #if CPUFAM_AMD64 && ABI_WIN
  95         // Arguments come in registers, but they're different between Windows
  96         // and everyone else (and everyone else is saner).
  97         //
  98         // The Windows ABI insists that we preserve some of the XMM
  99         // registers, but we want more than we can use as scratch space.  Two
 100         // of the places only hold a copy of the input for the feedforward
 101         // at the end, so stack slots will do; but xmm6 and xmm7 are also
 102         // wanted for the final permutation, so save the caller's values on
 103         // the stack.  (We need an extra 8 bytes to align the stack.)
 104
 105 #  define NR ecx
 106 #  define IN rdx
 107 #  define OUT r8
 108 #  define SAVE0 xmm6
 109 #  define SAVE1 xmm7
 110 #  define SAVE2 [rsp + 32]
 111 #  define SAVE3 [rsp + 48]
 112
 113         sub     rsp, 64 + 8
 114           .seh_stackalloc 64 + 8
 115         movdqa  [rsp +  0], xmm6
 116           .seh_savexmm xmm6, 0
 117         movdqa  [rsp + 16], xmm7
 118           .seh_savexmm xmm7, 16
 119   .seh_endprologue
 120 #endif
 121
 122         // First job is to slurp the matrix into XMM registers.  The words
 123         // have already been permuted conveniently to make them line up
 124         // better for SIMD processing.
 125         //
 126         // The textbook arrangement of the matrix is this.
 127         //
 128         //      [C K K K]
 129         //      [K C N N]
 130         //      [T T C K]
 131         //      [K K K C]
 132         //
 133         // But we've rotated the columns up so that the main diagonal with
 134         // the constants on it end up in the first row, giving something more
 135         // like
 136         //
 137         //      [C C C C]
 138         //      [K T K K]
 139         //      [T K K N]
 140         //      [K K N K]
 141         //
 142         // so the transformation looks like this:
 143         //
 144         //      [ 0  1  2  3]           [ 0  5 10 15] (a, xmm0)
 145         //      [ 4  5  6  7]    -->    [ 4  9 14  3] (b, xmm1)
 146         //      [ 8  9 10 11]           [ 8 13  2  7] (c, xmm2)
 147         //      [12 13 14 15]           [12  1  6 11] (d, xmm3)
 148         movdqu  xmm0, [IN +  0]
 149         movdqu  xmm1, [IN + 16]
 150         movdqu  xmm2, [IN + 32]
 151         movdqu  xmm3, [IN + 48]
 152
 153         // Take a copy for later.
 154         movdqa  SAVE0, xmm0
 155         movdqa  SAVE1, xmm1
 156         movdqa  SAVE2, xmm2
 157         movdqa  SAVE3, xmm3
 158
 159 0:
 160         // Apply a column quarterround to each of the columns simultaneously.
 161         // Alas, there doesn't seem to be a packed doubleword rotate, so we
 162         // have to synthesize it.
 163
 164         // b ^= (a + d) <<<  7
 165         movdqa  xmm4, xmm0
 166         paddd   xmm4, xmm3
 167         movdqa  xmm5, xmm4
 168         pslld   xmm4, 7
 169         psrld   xmm5, 25
 170         por     xmm4, xmm5
 171         pxor    xmm1, xmm4
 172
 173         // c ^= (b + a) <<<  9
 174         movdqa  xmm4, xmm1
 175         paddd   xmm4, xmm0
 176         movdqa  xmm5, xmm4
 177         pslld   xmm4, 9
 178         psrld   xmm5, 23
 179         por     xmm4, xmm5
 180         pxor    xmm2, xmm4
 181
 182         // d ^= (c + b) <<< 13
 183         movdqa  xmm4, xmm2
 184         paddd   xmm4, xmm1
 185         pshufd  xmm1, xmm1, ROTL
 186         movdqa  xmm5, xmm4
 187         pslld   xmm4, 13
 188         psrld   xmm5, 19
 189         por     xmm4, xmm5
 190         pxor    xmm3, xmm4
 191
 192         // a ^= (d + c) <<< 18
 193         movdqa  xmm4, xmm3
 194         pshufd  xmm3, xmm3, ROTR
 195         paddd   xmm4, xmm2
 196         pshufd  xmm2, xmm2, ROT2
 197         movdqa  xmm5, xmm4
 198         pslld   xmm4, 18
 199         psrld   xmm5, 14
 200         por     xmm4, xmm5
 201         pxor    xmm0, xmm4
 202
 203         // The transpose conveniently only involves reordering elements of
 204         // individual rows, which can be done quite easily, and reordering
 205         // the rows themselves, which is a trivial renaming.  It doesn't
 206         // involve any movement of elements between rows.
 207         //
 208         //      [ 0  5 10 15]           [ 0  5 10 15] (a, xmm0)
 209         //      [ 4  9 14  3]    -->    [ 1  6 11 12] (b, xmm3)
 210         //      [ 8 13  2  7]           [ 2  7  8 13] (c, xmm2)
 211         //      [12  1  6 11]           [ 3  4  9 14] (d, xmm1)
 212         //
 213         // The shuffles have quite high latency, so they've been pushed
 214         // backwards into the main instruction list.
 215
 216         // Apply the row quarterround to each of the columns (yes!)
 217         // simultaneously.
 218
 219         // b ^= (a + d) <<<  7
 220         movdqa  xmm4, xmm0
 221         paddd   xmm4, xmm1
 222         movdqa  xmm5, xmm4
 223         pslld   xmm4, 7
 224         psrld   xmm5, 25
 225         por     xmm4, xmm5
 226         pxor    xmm3, xmm4
 227
 228         // c ^= (b + a) <<<  9
 229         movdqa  xmm4, xmm3
 230         paddd   xmm4, xmm0
 231         movdqa  xmm5, xmm4
 232         pslld   xmm4, 9
 233         psrld   xmm5, 23
 234         por     xmm4, xmm5
 235         pxor    xmm2, xmm4
 236
 237         // d ^= (c + b) <<< 13
 238         movdqa  xmm4, xmm2
 239         paddd   xmm4, xmm3
 240         pshufd  xmm3, xmm3, ROTL
 241         movdqa  xmm5, xmm4
 242         pslld   xmm4, 13
 243         psrld   xmm5, 19
 244         por     xmm4, xmm5
 245         pxor    xmm1, xmm4
 246
 247         // a ^= (d + c) <<< 18
 248         movdqa  xmm4, xmm1
 249         pshufd  xmm1, xmm1, ROTR
 250         paddd   xmm4, xmm2
 251         pshufd  xmm2, xmm2, ROT2
 252         movdqa  xmm5, xmm4
 253         pslld   xmm4, 18
 254         psrld   xmm5, 14
 255         por     xmm4, xmm5
 256         pxor    xmm0, xmm4
 257
 258         // The transpose has been undone ready for the next iteration; those
 259         // shuffles were pushed back into the code above to hide latency.
 260         // Each iteration does two rounds, so count down by two; ja exits on
 261         // zero or borrow.  Later processors fuse this pair into a single uop.
 262         sub     NR, 2
 263         ja      0b
 264
 265         // Almost there.  Firstly, the feedforward addition, and then we have
 266         // to write out the result.  Here we have to undo the permutation
 267         // which was already applied to the input.  Shuffling has quite high
 268         // latency, so arrange to start a new shuffle into a temporary as
 269         // soon as we've written out the old value.
 270         paddd   xmm0, SAVE0
 271         pshufd  xmm4, xmm0, ROTR
 272         movd    [OUT +  0], xmm0
 273
 274         paddd   xmm1, SAVE1
 275         pshufd  xmm5, xmm1, ROTL
 276         movd    [OUT + 16], xmm1
 277
 278         paddd   xmm2, SAVE2
 279         pshufd  xmm6, xmm2, ROT2
 280         movd    [OUT + 32], xmm2
 281
 282         paddd   xmm3, SAVE3
 283         pshufd  xmm7, xmm3, ROTR
 284         movd    [OUT + 48], xmm3
 285
 286         movd    [OUT +  4], xmm7
 287         pshufd  xmm7, xmm3, ROT2
 288         movd    [OUT + 24], xmm7
 289         pshufd  xmm3, xmm3, ROTL
 290         movd    [OUT + 44], xmm3
 291
 292         movd    [OUT +  8], xmm6
 293         pshufd  xmm6, xmm2, ROTL
 294         movd    [OUT + 28], xmm6
 295         pshufd  xmm2, xmm2, ROTR
 296         movd    [OUT + 52], xmm2
 297
 298         movd    [OUT + 12], xmm5
 299         pshufd  xmm5, xmm1, ROTR
 300         movd    [OUT + 36], xmm5
 301         pshufd  xmm1, xmm1, ROT2
 302         movd    [OUT + 56], xmm1
 303
 304         movd    [OUT + 20], xmm4
 305         pshufd  xmm4, xmm0, ROT2
 306         movd    [OUT + 40], xmm4
 307         pshufd  xmm0, xmm0, ROTL
 308         movd    [OUT + 60], xmm0
 309
 310         // Tidy things up.
 311 #if CPUFAM_X86
 312         mov     esp, ebp
 313         pop     ebp
 314 #endif
 315 #if CPUFAM_AMD64 && ABI_WIN
 316         movdqa  xmm6, [rsp +  0]
 317         movdqa  xmm7, [rsp + 16]
 318         add     rsp, 64 + 8
 319 #endif
 320
 321         // And with that, we're done.
 322         ret
 323
 324 #undef NR
 325 #undef IN
 326 #undef OUT
 327 #undef SAVE0
 328 #undef SAVE1
 329 #undef SAVE2
 330 #undef SAVE3
 331
 332 ENDFUNC
 333
 334 ///----- That's all, folks --------------------------------------------------