chiark - git - mdw - catacomb/blob - symm/chacha-x86-sse2.s

   1 ### -*- mode: asm; asm-comment-char: ?# -*-
   2 ###
   3 ### Fancy SIMD implementation of ChaCha
   4 ###
   5 ### (c) 2015 Straylight/Edgeware
   6 ###
   7
   8 ###----- Licensing notice ---------------------------------------------------
   9 ###
  10 ### This file is part of Catacomb.
  11 ###
  12 ### Catacomb is free software; you can redistribute it and/or modify
  13 ### it under the terms of the GNU Library General Public License as
  14 ### published by the Free Software Foundation; either version 2 of the
  15 ### License, or (at your option) any later version.
  16 ###
  17 ### Catacomb is distributed in the hope that it will be useful,
  18 ### but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 ### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 ### GNU Library General Public License for more details.
  21 ###
  22 ### You should have received a copy of the GNU Library General Public
  23 ### License along with Catacomb; if not, write to the Free
  24 ### Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  25 ### MA 02111-1307, USA.
  26
  27         .intel_syntax noprefix          # Intel operand order, bare register names
  28         .arch pentium4                  # Pentium 4 instruction set: includes SSE2
  29
  30         .section .text
  31
  32         .globl  chacha_core_x86_sse2
  33         .type   chacha_core_x86_sse2, STT_FUNC
             ## chacha_core_x86_sse2 -- the ChaCha core function, using SSE2
             ##
             ## Loads a 4x4 matrix of 32-bit words (64 bytes) from the input,
             ## applies ChaCha doublerounds to it, adds the original matrix back
             ## in (the feedforward), and stores the 64-byte result to the
             ## output.
             ##
             ## Arguments, all on the stack (offsets as seen once the frame
             ## pointer is set up):
             ##   [ebp +  8]  number of rounds.  Each pass of the main loop is a
             ##               doubleround and counts as two; the loop body
             ##               always runs at least once, and an odd count is
             ##               effectively rounded up to the next even number.
             ##   [ebp + 12]  address of the input matrix (need not be aligned)
             ##   [ebp + 16]  address of the output matrix (need not be aligned)
             ##
             ## All 64 input bytes are loaded before any output is stored, so
             ## the two buffers may coincide or overlap.
             ##
             ## Clobbers ecx, edx, xmm0--xmm7 and the flags.  ebp and esp are
             ## restored before returning.  Uses one 16-byte-aligned, 16-byte
             ## scratch slot on the stack.
             ##
             ## NOTE(review): presumably called from C as something like
             ##   void chacha_core_x86_sse2(unsigned r, const uint32 *src,
             ##                             uint32 *dest);
             ## -- confirm against the prototype used by the callers.
  34 chacha_core_x86_sse2:
  35
  36         ## Set up a frame so that the arguments stay addressable through
  37         ## ebp, then carve out a 16-byte scratch slot and round esp down
  38         ## to a 16-byte boundary so that `movdqa' can be used on it.  The
  39         ## input pointer is fetched in the middle of this.
  40         push    ebp
  41         mov     ebp, esp
  42         sub     esp, 16                 # room for one saved row
  43         mov     edx, [ebp + 12]         # edx = input matrix
  44         and     esp, ~15                # align the scratch slot
  45
  46         ## First job is to slurp the matrix into XMM registers.  Be careful:
  47         ## the input matrix isn't likely to be properly aligned.
  48         ##
  49         ##      [ 0  1  2  3] (a, xmm0)
  50         ##      [ 4  5  6  7] (b, xmm1)
  51         ##      [ 8  9 10 11] (c, xmm2)
  52         ##      [12 13 14 15] (d, xmm3)
  53         movdqu  xmm0, [edx +  0]
  54         movdqu  xmm1, [edx + 16]
  55         movdqu  xmm2, [edx + 32]
  56         movdqu  xmm3, [edx + 48]
  57
  58         ## Prepare for the main loop.
  59         mov     ecx, [ebp + 8]          # ecx = rounds still to do
  60
  61         ## Take a copy for later.  xmm4 is needed as scratch by the
  62         ## rotations, which leaves only xmm5--xmm7 for the copies of rows
  63         ## b, c and d; row a goes to the (aligned) stack slot instead.
  64         movdqa  xmm5, xmm1
  65         movdqa  xmm6, xmm2
  66         movdqa  xmm7, xmm3
             movdqa  [esp], xmm0
  67
  68 .Lloop:
  69         ## Apply a column quarterround to each of the columns simultaneously.
  70         ## Alas, there doesn't seem to be a packed doubleword rotate, so we
  71         ## have to synthesize it: copy to xmm4, shift the two copies in
  72         ## opposite directions by n and 32 - n, and OR them together.
  73         ## a += b; d ^= a; d <<<= 16
  74         paddd   xmm0, xmm1
  75         pxor    xmm3, xmm0
  76         movdqa  xmm4, xmm3
  77         pslld   xmm3, 16
  78         psrld   xmm4, 16
  79         por     xmm3, xmm4
  80
  81         ## c += d; b ^= c; b <<<= 12
  82         paddd   xmm2, xmm3
  83         pxor    xmm1, xmm2
  84         movdqa  xmm4, xmm1
  85         pslld   xmm1, 12
  86         psrld   xmm4, 20
  87         por     xmm1, xmm4
  88
  89         ## a += b; d ^= a; d <<<=  8
  90         paddd   xmm0, xmm1
  91         pxor    xmm3, xmm0
  92         movdqa  xmm4, xmm3
  93         pslld   xmm3, 8
  94         psrld   xmm4, 24
  95         por     xmm3, xmm4
  96
  97         ## c += d; b ^= c; b <<<=  7
  98         paddd   xmm2, xmm3
  99         pshufd  xmm3, xmm3, 0x93        # d: [12 13 14 15] -> [15 12 13 14]
  100         pxor    xmm1, xmm2
  101         pshufd  xmm2, xmm2, 0x4e        # c: [ 8  9 10 11] -> [10 11  8  9]
  102         movdqa  xmm4, xmm1
  103         pslld   xmm1, 7
  104         psrld   xmm4, 25
  105         por     xmm1, xmm4
  106
  107         ## The not-quite-transpose conveniently only involves reordering
  108         ## elements of individual rows, which can be done quite easily.  It
  109         ## doesn't involve any movement of elements between rows, or even
  110         ## renaming of the rows.
  111         ##
  112         ##      [ 0  1  2  3]           [ 0  1  2  3] (a, xmm0)
  113         ##      [ 4  5  6  7]    -->    [ 5  6  7  4] (b, xmm1)
  114         ##      [ 8  9 10 11]           [10 11  8  9] (c, xmm2)
  115         ##      [12 13 14 15]           [15 12 13 14] (d, xmm3)
  116         ##
  117         ## The shuffles have quite high latency, so they've mostly been
  118         ## pushed upwards.  The remaining one can't be moved, though.
  119         pshufd  xmm1, xmm1, 0x39        # b: [ 4  5  6  7] -> [ 5  6  7  4]
  120
  121         ## Apply the diagonal quarterround.  After the shuffles above, each
  122         ## column of the registers holds one diagonal of the matrix, so the
  123         ## same column-wise code handles all four diagonals simultaneously.
  124         ## a += b; d ^= a; d <<<= 16
  125         paddd   xmm0, xmm1
  126         pxor    xmm3, xmm0
  127         movdqa  xmm4, xmm3
  128         pslld   xmm3, 16
  129         psrld   xmm4, 16
  130         por     xmm3, xmm4
  131
  132         ## c += d; b ^= c; b <<<= 12
  133         paddd   xmm2, xmm3
  134         pxor    xmm1, xmm2
  135         movdqa  xmm4, xmm1
  136         pslld   xmm1, 12
  137         psrld   xmm4, 20
  138         por     xmm1, xmm4
  139
  140         ## a += b; d ^= a; d <<<=  8
  141         paddd   xmm0, xmm1
  142         pxor    xmm3, xmm0
  143         movdqa  xmm4, xmm3
  144         pslld   xmm3, 8
  145         psrld   xmm4, 24
  146         por     xmm3, xmm4
  147
  148         ## c += d; b ^= c; b <<<=  7
  149         paddd   xmm2, xmm3
  150         pshufd  xmm3, xmm3, 0x39        # d: [15 12 13 14] -> [12 13 14 15]
  151         pxor    xmm1, xmm2
  152         pshufd  xmm2, xmm2, 0x4e        # c: [10 11  8  9] -> [ 8  9 10 11]
  153         movdqa  xmm4, xmm1
  154         pslld   xmm1, 7
  155         psrld   xmm4, 25
  156         por     xmm1, xmm4
  157
  158         ## Finally, finish off undoing the transpose, and we're done for this
  159         ## doubleround.  Again, most of this was done above so we don't have
  160         ## to wait for the shuffles.
  161         pshufd  xmm1, xmm1, 0x93        # b: [ 5  6  7  4] -> [ 4  5  6  7]
  162
  163         ## Decrement the loop counter and see if we should go round again.
             ## That was two rounds.  `ja' only continues if the subtraction
             ## left a nonzero result without borrowing, so a remaining count
             ## of 2 or less ends the loop rather than wrapping around.
  164         sub     ecx, 2
  165         ja      .Lloop
  166
  167         ## Almost there.  Firstly, the feedforward addition: add the saved
  168         ## copy of the input (row a from the stack, the rest from
             ## xmm5--xmm7) to the permuted state.
             mov     edx, [ebp + 16]         # edx = output matrix
  169         paddd   xmm0, [esp]
  170         paddd   xmm1, xmm5
  171         paddd   xmm2, xmm6
  172         paddd   xmm3, xmm7
  173
  174         ## And now we write out the result.  This one won't be aligned
  175         ## either.
  176         movdqu  [edx +  0], xmm0
  177         movdqu  [edx + 16], xmm1
  178         movdqu  [edx + 32], xmm2
  179         movdqu  [edx + 48], xmm3
  180
  181         ## And with that, we're done.  Reloading esp from ebp discards the
             ## scratch slot and the alignment padding in one go.
  182         mov     esp, ebp
  183         pop     ebp
  184         ret
  185
  186         .size   chacha_core_x86_sse2, . - chacha_core_x86_sse2
 187
 188 ###----- That's all, folks --------------------------------------------------