chiark - git - mdw - catacomb/blob - symm/chacha-x86ish-sse2.S

   1 /// -*- mode: asm; asm-comment-char: ?/ -*-
   2 ///
   3 /// Fancy SIMD implementation of ChaCha
   4 ///
   5 /// (c) 2015 Straylight/Edgeware
   6 ///
   7
   8 ///----- Licensing notice ---------------------------------------------------
   9 ///
  10 /// This file is part of Catacomb.
  11 ///
  12 /// Catacomb is free software; you can redistribute it and/or modify
  13 /// it under the terms of the GNU Library General Public License as
  14 /// published by the Free Software Foundation; either version 2 of the
  15 /// License, or (at your option) any later version.
  16 ///
  17 /// Catacomb is distributed in the hope that it will be useful,
  18 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 /// GNU Library General Public License for more details.
  21 ///
  22 /// You should have received a copy of the GNU Library General Public
  23 /// License along with Catacomb; if not, write to the Free
  24 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  25 /// MA 02111-1307, USA.
  26
  27 ///--------------------------------------------------------------------------
  28 /// External definitions.
  29
  30 #include "config.h"
  31 #include "asm-common.h"
  32
  33 ///--------------------------------------------------------------------------
  34 /// Main code.
  35
  36         .arch pentium4
  37         .text
  38
  39 FUNC(chacha_core_x86ish_sse2)
  40
             // Run the ChaCha core on one matrix of 16 doublewords (64 bytes).
             //
             // There are three arguments, in this order: NR, a count which
             // the main loop reduces by two for each doubleround; IN, the
             // address of the 64-byte input matrix; and OUT, the address at
             // which the 64-byte result is stored.  Neither IN nor OUT needs
             // to be aligned: both are accessed using `movdqu`.
             //
             // NOTE(review): presumably NR is the total number of rounds, and
             // the C prototype is something like `void (unsigned nr, const
             // uint32_t *in, uint32_t *out)` -- confirm against the caller's
             // declaration.
             //
             // Register use: xmm0--xmm3 hold the rows a, b, c, d of the
             // working matrix; xmm4 is scratch for synthesizing rotations;
             // and the SAVEn operands hold a copy of the input rows for the
             // feedforward addition at the end.
             //
  41         // Initial setup.
  42
  43 #if CPUFAM_X86
  44         // Arguments come in on the stack, and will need to be collected.  We
  45         // can get away with just the scratch registers for integer work,
  46         // but we'll run out of XMM registers and will need some properly
  47         // aligned space which we'll steal from the stack.  I don't trust the
  48         // stack pointer's alignment, so I'll have to mask the stack pointer,
  49         // which in turn means I'll need to keep track of the old value.
  50         // Hence I'm making a full i386-style stack frame here.
  51         //
  52         // The Windows and SysV ABIs are sufficiently similar that we don't
  53         // need to worry about the differences here.
  54
  55 #  define NR ecx
  56 #  define IN eax
  57 #  define OUT edx
  58 #  define SAVE0 xmm5
  59 #  define SAVE1 xmm6
  60 #  define SAVE2 xmm7
  61 #  define SAVE3 [esp]
  62
             // Build the frame, reserve 16 bytes for SAVE3, and round the
             // stack pointer down to a 16-byte boundary so that `[esp]` is
             // suitably aligned.  The argument loads are interleaved with
             // the stack adjustment: they address through ebp, so they are
             // unaffected by the changes to esp.  (`pushreg` and `setfp` are
             // macros, presumably from asm-common.h; they aren't defined
             // here.)
  63         pushreg ebp
  64         setfp   ebp
  65         sub     esp, 16
  66         mov     IN, [ebp + 12]
  67         mov     OUT, [ebp + 16]
  68         and     esp, ~15
  69         mov     NR, [ebp + 8]
  70 #endif
  71
  72 #if CPUFAM_AMD64 && ABI_SYSV
  73         // This is nice.  We have plenty of XMM registers, and the arguments
  74         // are in useful places.  There's no need to spill anything and we
  75         // can just get on with the code.
  76
  77 #  define NR edi
  78 #  define IN rsi
  79 #  define OUT rdx
  80 #  define SAVE0 xmm5
  81 #  define SAVE1 xmm6
  82 #  define SAVE2 xmm7
  83 #  define SAVE3 xmm8
  84 #endif
  85
  86 #if CPUFAM_AMD64 && ABI_WIN
  87         // Arguments come in registers, but they're different between Windows
  88         // and everyone else (and everyone else is saner).
  89         //
  90         // The Windows ABI insists that we preserve some of the XMM
  91         // registers, but we want more than we can use as scratch space.  We
  92         // only need to save a copy of the input for the feedforward at the
  93         // end, so we might as well use memory rather than spill extra
  94         // registers.  (We need an extra 8 bytes to align the stack.)
             //
             // Only xmm0--xmm5 are touched in this configuration: xmm6 and up
             // are callee-saved on Windows, so three of the four saved rows
             // live in the 48 bytes of stack allocated below.
  95
  96 #  define NR ecx
  97 #  define IN rdx
  98 #  define OUT r8
  99 #  define SAVE0 xmm5
 100 #  define SAVE1 [rsp +  0]
 101 #  define SAVE2 [rsp + 16]
 102 #  define SAVE3 [rsp + 32]
 103
 104         stalloc 48 + 8
 105 #endif
 106
 107   endprologue
 108
 109         // First job is to slurp the matrix into XMM registers.  Be careful:
 110         // the input matrix isn't likely to be properly aligned.
 111         //
 112         //      [ 0  1  2  3] (a, xmm0)
 113         //      [ 4  5  6  7] (b, xmm1)
 114         //      [ 8  9 10 11] (c, xmm2)
 115         //      [12 13 14 15] (d, xmm3)
 116         movdqu  xmm0, [IN +  0]
 117         movdqu  xmm1, [IN + 16]
 118         movdqu  xmm2, [IN + 32]
 119         movdqu  xmm3, [IN + 48]
 120
 121         // Take a copy for later.  This one is aligned properly, by
 122         // construction.
 123         movdqa  SAVE0, xmm0
 124         movdqa  SAVE1, xmm1
 125         movdqa  SAVE2, xmm2
 126         movdqa  SAVE3, xmm3
 127
             // Main loop: each pass performs one doubleround, i.e., a column
             // round followed by a diagonal round.
 128 0:
 129         // Apply a column quarterround to each of the columns simultaneously.
 130         // Alas, there doesn't seem to be a packed doubleword rotate, so we
 131         // have to synthesize it.
             //
             // Each rotation by n is built as (x << n) | (x >> (32 - n)),
             // using xmm4 to hold the right-shifted copy.
 132
 133         // a += b; d ^= a; d <<<= 16
 134         paddd   xmm0, xmm1
 135         pxor    xmm3, xmm0
 136         movdqa  xmm4, xmm3
 137         pslld   xmm3, 16
 138         psrld   xmm4, 16
 139         por     xmm3, xmm4
 140
 141         // c += d; b ^= c; b <<<= 12
 142         paddd   xmm2, xmm3
 143         pxor    xmm1, xmm2
 144         movdqa  xmm4, xmm1
 145         pslld   xmm1, 12
 146         psrld   xmm4, 20
 147         por     xmm1, xmm4
 148
 149         // a += b; d ^= a; d <<<=  8
 150         paddd   xmm0, xmm1
 151         pxor    xmm3, xmm0
 152         movdqa  xmm4, xmm3
 153         pslld   xmm3, 8
 154         psrld   xmm4, 24
 155         por     xmm3, xmm4
 156
 157         // c += d; b ^= c; b <<<=  7
             //
             // The extra-indented shuffles are not part of the quarterround:
             // they are the reorderings of rows d and c described below,
             // issued as soon as each row has been read for the last time in
             // this step.
 158         paddd   xmm2, xmm3
 159          pshufd xmm3, xmm3, SHUF(2, 1, 0, 3)
 160         pxor    xmm1, xmm2
 161          pshufd xmm2, xmm2, SHUF(1, 0, 3, 2)
 162         movdqa  xmm4, xmm1
 163         pslld   xmm1, 7
 164         psrld   xmm4, 25
 165         por     xmm1, xmm4
 166
 167         // The not-quite-transpose conveniently only involves reordering
 168         // elements of individual rows, which can be done quite easily.  It
 169         // doesn't involve any movement of elements between rows, or even
 170         // renaming of the rows.
 171         //
 172         //      [ 0  1  2  3]           [ 0  1  2  3] (a, xmm0)
 173         //      [ 4  5  6  7]    -->    [ 5  6  7  4] (b, xmm1)
 174         //      [ 8  9 10 11]           [10 11  8  9] (c, xmm2)
 175         //      [12 13 14 15]           [15 12 13 14] (d, xmm3)
 176         //
 177         // The shuffles have quite high latency, so they've mostly been
 178         // pushed upwards.  The remaining one can't be moved, though.
 179         pshufd  xmm1, xmm1, SHUF(0, 3, 2, 1)
 180
 181         // Apply the diagonal quarterround to each of the columns
 182         // simultaneously.
             //
             // Following the reordering above, each column of the registers
             // holds one diagonal of the original matrix, so this is exactly
             // the same instruction sequence as the column round.
 183
 184         // a += b; d ^= a; d <<<= 16
 185         paddd   xmm0, xmm1
 186         pxor    xmm3, xmm0
 187         movdqa  xmm4, xmm3
 188         pslld   xmm3, 16
 189         psrld   xmm4, 16
 190         por     xmm3, xmm4
 191
 192         // c += d; b ^= c; b <<<= 12
 193         paddd   xmm2, xmm3
 194         pxor    xmm1, xmm2
 195         movdqa  xmm4, xmm1
 196         pslld   xmm1, 12
 197         psrld   xmm4, 20
 198         por     xmm1, xmm4
 199
 200         // a += b; d ^= a; d <<<=  8
 201         paddd   xmm0, xmm1
 202         pxor    xmm3, xmm0
 203         movdqa  xmm4, xmm3
 204         pslld   xmm3, 8
 205         psrld   xmm4, 24
 206         por     xmm3, xmm4
 207
 208         // c += d; b ^= c; b <<<=  7
             //
             // As before, the extra-indented shuffles are hoisted: here they
             // begin putting rows d and c back in their original order.  The
             // selectors for d and b are exchanged relative to the first
             // half, while c uses the same selector both times.
 209         paddd   xmm2, xmm3
 210          pshufd xmm3, xmm3, SHUF(0, 3, 2, 1)
 211         pxor    xmm1, xmm2
 212          pshufd xmm2, xmm2, SHUF(1, 0, 3, 2)
 213         movdqa  xmm4, xmm1
 214         pslld   xmm1, 7
 215         psrld   xmm4, 25
 216         por     xmm1, xmm4
 217
 218         // Finally, finish off undoing the transpose, and we're done for this
 219         // doubleround.  Again, most of this was done above so we don't have
 220         // to wait for the shuffles.
 221         pshufd  xmm1, xmm1, SHUF(2, 1, 0, 3)
 222
 223         // Decrement the loop counter and see if we should go round again.
             // `ja` is taken only if the subtraction neither borrowed nor
             // left zero, so the loop body always runs at least once, and
             // stops as soon as the counter has been used up.
 224         sub     NR, 2
 225         ja      0b
 226
 227         // Almost there.  Firstly, the feedforward addition.
 228         paddd   xmm0, SAVE0
 229         paddd   xmm1, SAVE1
 230         paddd   xmm2, SAVE2
 231         paddd   xmm3, SAVE3
 232
 233         // And now we write out the result.  This one won't be aligned
 234         // either.
 235         movdqu  [OUT +  0], xmm0
 236         movdqu  [OUT + 16], xmm1
 237         movdqu  [OUT + 32], xmm2
 238         movdqu  [OUT + 48], xmm3
 239
 240         // Tidy things up.
             //
             // Each configuration undoes what its own prologue did.  The
             // SysV AMD64 prologue contains no instructions, so it needs no
             // cleanup here.
 241 #if CPUFAM_X86
 242         dropfp
 243         popreg  ebp
 244 #endif
 245 #if CPUFAM_AMD64 && ABI_WIN
 246         stfree  48 + 8
 247 #endif
 248
 249         // And with that, we're done.
 250         ret
 251
 252 ENDFUNC
 253
 254 ///----- That's all, folks --------------------------------------------------