chiark - git - mdw - catacomb/blob - symm/chacha-x86ish-sse2.S

   1 /// -*- mode: asm; asm-comment-char: ?/ -*-
   2 ///
   3 /// Fancy SIMD implementation of ChaCha
   4 ///
   5 /// (c) 2015 Straylight/Edgeware
   6 ///
   7
   8 ///----- Licensing notice ---------------------------------------------------
   9 ///
  10 /// This file is part of Catacomb.
  11 ///
  12 /// Catacomb is free software; you can redistribute it and/or modify
  13 /// it under the terms of the GNU Library General Public License as
  14 /// published by the Free Software Foundation; either version 2 of the
  15 /// License, or (at your option) any later version.
  16 ///
  17 /// Catacomb is distributed in the hope that it will be useful,
  18 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 /// GNU Library General Public License for more details.
  21 ///
  22 /// You should have received a copy of the GNU Library General Public
  23 /// License along with Catacomb; if not, write to the Free
  24 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  25 /// MA 02111-1307, USA.
  26
  27 ///--------------------------------------------------------------------------
  28 /// Preliminaries.
  29
  30 #include "config.h"
  31 #include "asm-common.h"
  32
  33         .text
  34
  35 ///--------------------------------------------------------------------------
  36 /// Main code.
  37
  38 FUNC(chacha_core_x86ish_avx)
             // Alternative entry point.  All this does is execute vzeroupper and
             // then run off the end into chacha_core_x86ish_sse2, which is
             // assembled immediately below; the argument registers and the stack
             // are left untouched, so the arguments are exactly those of the
             // SSE2 function.
             //
             // NOTE(review): presumably the point is to avoid the penalty which
             // some processors impose for running legacy-encoded SSE
             // instructions while the upper halves of the YMM registers are
             // dirty -- confirm against the code which selects this entry point.
             //
             // FUNC, ENDFUNC and endprologue are macros which aren't defined in
             // this file (presumably they come from asm-common.h).  The
             // fall-through relies on nothing other than padding being emitted
             // between ENDFUNC here and the next FUNC -- TODO confirm.

             // Let the assembler accept AVX mnemonics: vzeroupper is one.
  39         .arch   .avx
  40         vzeroupper
  41   endprologue
  42         // drop through...
  43 ENDFUNC
  44
             // Back to the baseline instruction set for the SSE2 code below, so
             // that nothing newer can creep in there unnoticed.
  45         .arch   pentium4
  46
  47 FUNC(chacha_core_x86ish_sse2)
             // Run the ChaCha core permutation on a 4x4 matrix of 32-bit words,
             // and add the original input back in (the `feedforward').
             //
             // Arguments, in order; the names are the macros defined below for
             // each target:
             //
             //   NR    32-bit round count; the loop does one doubleround (two
             //         rounds) per iteration and subtracts 2 each time
             //   IN    address of the 64-byte input matrix; needn't be aligned
             //   OUT   address of the 64-byte output buffer; needn't be aligned
             //
             // NOTE(review): the C prototype isn't visible from here; presumably
             // it's something like (unsigned nr, const uint32_t *in,
             // uint32_t *out) -- confirm against the C declaration.
             //
             // The loop test is at the bottom, so at least one doubleround is
             // always done, and an odd NR behaves like the next even number up.
             //
             // Register use: xmm0--xmm3 hold rows a, b, c, d of the matrix; xmm4
             // is scratch for the rotations; SAVE0--SAVE3 hold a copy of the
             // input rows for the final addition.  NR is modified; IN and OUT
             // are only read.
  48
  49         // Initial setup.
  50
  51 #if CPUFAM_X86
  52         // Arguments come in on the stack, and will need to be collected.  We
  53         // can get away with just the scratch registers for integer work, but
  54         // we'll run out of XMM registers and will need some properly aligned
  55         // space which we'll steal from the stack.  I don't trust the stack
  56         // pointer's alignment, so I'll have to mask the stack pointer, which
  57         // in turn means I'll need to keep track of the old value.  Hence I'm
  58         // making a full i386-style stack frame here.
  59         //
  60         // The Windows and SysV ABIs are sufficiently similar that we don't
  61         // need to worry about the differences here.
  62
  63 #  define NR ecx
  64 #  define IN eax
  65 #  define OUT edx
  66 #  define SAVE0 xmm5
  67 #  define SAVE1 xmm6
  68 #  define SAVE2 xmm7
  69 #  define SAVE3 [esp]
  70
             // Reserve 16 bytes and then round esp down to a 16-byte boundary, so
             // that SAVE3 = [esp] is an aligned slot of the right size.  The
             // arguments are fetched relative to ebp, which the masking doesn't
             // disturb; the loads are simply interleaved with the stack
             // adjustment.  (pushreg and setfp aren't defined in this file;
             // presumably they push ebp and copy esp into it, which is what the
             // argument offsets 8, 12, 16 require.)
  71         pushreg ebp
  72         setfp
  73         sub     esp, 16
  74         mov     IN, [ebp + 12]
  75         mov     OUT, [ebp + 16]
  76         and     esp, ~15
  77         mov     NR, [ebp + 8]
  78 #endif
  79
  80 #if CPUFAM_AMD64 && ABI_SYSV
  81         // This is nice.  We have plenty of XMM registers, and the arguments
  82         // are in useful places.  There's no need to spill anything and we
  83         // can just get on with the code.
  84
  85 #  define NR edi
  86 #  define IN rsi
  87 #  define OUT rdx
  88 #  define SAVE0 xmm5
  89 #  define SAVE1 xmm6
  90 #  define SAVE2 xmm7
  91 #  define SAVE3 xmm8
  92 #endif
  93
  94 #if CPUFAM_AMD64 && ABI_WIN
  95         // Arguments come in registers, but they're different between Windows
  96         // and everyone else (and everyone else is saner).
  97         //
  98         // The Windows ABI insists that we preserve some of the XMM
  99         // registers, but we want more than we can use as scratch space.  We
 100         // only need to save a copy of the input for the feedforward at the
 101         // end, so we might as well use memory rather than spill extra
 102         // registers.  (We need an extra 8 bytes to align the stack.)
             //
             // With these definitions, the only XMM registers written on this
             // target are xmm0--xmm5; the other three saved rows live in the
             // 48 bytes of stack allocated below.
 103
 104 #  define NR ecx
 105 #  define IN rdx
 106 #  define OUT r8
 107 #  define SAVE0 xmm5
 108 #  define SAVE1 [rsp +  0]
 109 #  define SAVE2 [rsp + 16]
 110 #  define SAVE3 [rsp + 32]
 111
 112         stalloc 48 + 8
 113 #endif
 114
 115   endprologue
 116
 117         // First job is to slurp the matrix into XMM registers.  Be careful:
 118         // the input matrix isn't likely to be properly aligned.
 119         //
 120         //      [ 0  1  2  3] (a, xmm0)
 121         //      [ 4  5  6  7] (b, xmm1)
 122         //      [ 8  9 10 11] (c, xmm2)
 123         //      [12 13 14 15] (d, xmm3)
 124         movdqu  xmm0, [IN +  0]
 125         movdqu  xmm1, [IN + 16]
 126         movdqu  xmm2, [IN + 32]
 127         movdqu  xmm3, [IN + 48]
 128
 129         // Take a copy for later.  This one is aligned properly, by
 130         // construction.
 131         movdqa  SAVE0, xmm0
 132         movdqa  SAVE1, xmm1
 133         movdqa  SAVE2, xmm2
 134         movdqa  SAVE3, xmm3
 135
             // Top of the main loop.  Each pass does one doubleround: a round on
             // the columns, a reordering within the rows, a second round on what
             // are now the columns, and then the reordering is undone.
 136 0:
 137         // Apply a column quarterround to each of the columns simultaneously.
 138         // Alas, there doesn't seem to be a packed doubleword rotate, so we
 139         // have to synthesize it.
             //
             // Each rotation left by k is done the same way: copy the row into
             // xmm4, shift the row left by k and the copy right by 32 - k, and
             // OR the two halves back together.
 140
 141         // a += b; d ^= a; d <<<= 16
 142         paddd   xmm0, xmm1
 143         pxor    xmm3, xmm0
 144         movdqa  xmm4, xmm3
 145         pslld   xmm3, 16
 146         psrld   xmm4, 16
 147         por     xmm3, xmm4
 148
 149         // c += d; b ^= c; b <<<= 12
 150         paddd   xmm2, xmm3
 151         pxor    xmm1, xmm2
 152         movdqa  xmm4, xmm1
 153         pslld   xmm1, 12
 154         psrld   xmm4, 20
 155         por     xmm1, xmm4
 156
 157         // a += b; d ^= a; d <<<=  8
 158         paddd   xmm0, xmm1
 159         pxor    xmm3, xmm0
 160         movdqa  xmm4, xmm3
 161         pslld   xmm3, 8
 162         psrld   xmm4, 24
 163         por     xmm3, xmm4
 164
 165         // c += d; b ^= c; b <<<=  7
             //
             // The two shuffles with the extra space of indentation aren't part
             // of the quarterround: they're the d and c parts of the row
             // reordering described below, issued as soon as the final values
             // of d and c have been consumed.
 166         paddd   xmm2, xmm3
 167          pshufd xmm3, xmm3, SHUF(3, 0, 1, 2)
 168         pxor    xmm1, xmm2
 169          pshufd xmm2, xmm2, SHUF(2, 3, 0, 1)
 170         movdqa  xmm4, xmm1
 171         pslld   xmm1, 7
 172         psrld   xmm4, 25
 173         por     xmm1, xmm4
 174
 175         // The not-quite-transpose conveniently only involves reordering
 176         // elements of individual rows, which can be done quite easily.  It
 177         // doesn't involve any movement of elements between rows, or even
 178         // renaming of the rows.
 179         //
 180         //      [ 0  1  2  3]           [ 0  1  2  3] (a, xmm0)
 181         //      [ 4  5  6  7]    -->    [ 5  6  7  4] (b, xmm1)
 182         //      [ 8  9 10 11]           [10 11  8  9] (c, xmm2)
 183         //      [12 13 14 15]           [15 12 13 14] (d, xmm3)
 184         //
 185         // The shuffles have quite high latency, so they've mostly been
 186         // pushed upwards.  The remaining one can't be moved, though.
             //
             // (It has to wait for the rotation of b just above to finish.  SHUF
             // isn't defined in this file, so the order of its arguments can't
             // be checked from here; the picture above says what the three
             // shuffles are meant to achieve between them.)
 187         pshufd  xmm1, xmm1, SHUF(1, 2, 3, 0)
 188
 189         // Apply the diagonal quarterround to each of the columns
 190         // simultaneously.
             //
             // Following the reordering, each column of the registers holds what
             // was a diagonal of the original matrix, so the instruction sequence
             // is the same as for the column round above.
 191
 192         // a += b; d ^= a; d <<<= 16
 193         paddd   xmm0, xmm1
 194         pxor    xmm3, xmm0
 195         movdqa  xmm4, xmm3
 196         pslld   xmm3, 16
 197         psrld   xmm4, 16
 198         por     xmm3, xmm4
 199
 200         // c += d; b ^= c; b <<<= 12
 201         paddd   xmm2, xmm3
 202         pxor    xmm1, xmm2
 203         movdqa  xmm4, xmm1
 204         pslld   xmm1, 12
 205         psrld   xmm4, 20
 206         por     xmm1, xmm4
 207
 208         // a += b; d ^= a; d <<<=  8
 209         paddd   xmm0, xmm1
 210         pxor    xmm3, xmm0
 211         movdqa  xmm4, xmm3
 212         pslld   xmm3, 8
 213         psrld   xmm4, 24
 214         por     xmm3, xmm4
 215
 216         // c += d; b ^= c; b <<<=  7
             //
             // As before, the extra-indented shuffles are hoisted: this time they
             // put d and c back into their original order.
 217         paddd   xmm2, xmm3
 218          pshufd xmm3, xmm3, SHUF(1, 2, 3, 0)
 219         pxor    xmm1, xmm2
 220          pshufd xmm2, xmm2, SHUF(2, 3, 0, 1)
 221         movdqa  xmm4, xmm1
 222         pslld   xmm1, 7
 223         psrld   xmm4, 25
 224         por     xmm1, xmm4
 225
 226         // Finally, finish off undoing the transpose, and we're done for this
 227         // doubleround.  Again, most of this was done above so we don't have
 228         // to wait for the shuffles.
 229         pshufd  xmm1, xmm1, SHUF(3, 0, 1, 2)
 230
 231         // Decrement the loop counter and see if we should go round again.
             //
             // The branch is taken only if the subtraction neither borrowed nor
             // left zero, i.e., if NR was greater than 2 (as an unsigned number)
             // beforehand.  So NR = 0, 1 or 2 all give exactly one doubleround.
 232         sub     NR, 2
 233         ja      0b
 234
 235         // Almost there.  Firstly, the feedforward addition.
             //
             // Where SAVEn is a memory operand rather than a register, it's one
             // of the stack slots set up in the prologue.
 236         paddd   xmm0, SAVE0
 237         paddd   xmm1, SAVE1
 238         paddd   xmm2, SAVE2
 239         paddd   xmm3, SAVE3
 240
 241         // And now we write out the result.  This one won't be aligned
 242         // either.
 243         movdqu  [OUT +  0], xmm0
 244         movdqu  [OUT + 16], xmm1
 245         movdqu  [OUT + 32], xmm2
 246         movdqu  [OUT + 48], xmm3
 247
 248         // Tidy things up.
             //
             // Each target undoes exactly what its prologue did; on SysV AMD64
             // there was no prologue, so there's nothing to undo.  (dropfp and
             // stfree aren't defined in this file: presumably dropfp restores the
             // stack pointer from ebp, which is what's needed to undo the
             // masking of esp -- TODO confirm.)
 249 #if CPUFAM_X86
 250         dropfp
 251         popreg  ebp
 252 #endif
 253 #if CPUFAM_AMD64 && ABI_WIN
 254         stfree  48 + 8
 255 #endif
 256
 257         // And with that, we're done.
 258         ret
 259
 260 ENDFUNC
 261
 262 ///----- That's all, folks --------------------------------------------------