chiark - git - mdw - catacomb/blob - symm/salsa20-x86-sse2.s

   1 ### -*- mode: asm; asm-comment-char: ?# -*-
   2 ###
   3 ### Fancy SIMD implementation of Salsa20
   4 ###
   5 ### (c) 2015 Straylight/Edgeware
   6 ###
   7
   8 ###----- Licensing notice ---------------------------------------------------
   9 ###
  10 ### This file is part of Catacomb.
  11 ###
  12 ### Catacomb is free software; you can redistribute it and/or modify
  13 ### it under the terms of the GNU Library General Public License as
  14 ### published by the Free Software Foundation; either version 2 of the
  15 ### License, or (at your option) any later version.
  16 ###
  17 ### Catacomb is distributed in the hope that it will be useful,
  18 ### but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 ### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 ### GNU Library General Public License for more details.
  21 ###
  22 ### You should have received a copy of the GNU Library General Public
  23 ### License along with Catacomb; if not, write to the Free
  24 ### Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  25 ### MA 02111-1307, USA.
  26
  27         .intel_syntax noprefix
  28         .arch pentium4
  29
  30         .section .text
  31
  32         .globl  salsa20_core_x86_sse2
  33         .type   salsa20_core_x86_sse2, STT_FUNC
  34 salsa20_core_x86_sse2:
  35
  36         ## Initial state.  We have three arguments:
  37         ## [ebp +  8] is the number of rounds to do
  38         ## [ebp + 12] points to the input matrix
  39         ## [ebp + 16] points to the output matrix
  40         push    ebp
  41         mov     ebp, esp
  42         sub     esp, 32
  43         mov     edx, [ebp + 12]
  44         and     esp, ~15
  45
  46         ## Prepare for the main loop.
  47         mov     ecx, [ebp + 8]
  48
  49         ## First job is to slurp the matrix into XMM registers.  The words
  50         ## have already been permuted conveniently to make them line up
  51         ## better for SIMD processing.
  52         ##
  53         ## The textbook arrangement of the matrix is this.
  54         ##
  55         ##      [C K K K]
  56         ##      [K C N N]
  57         ##      [T T C K]
  58         ##      [K K K C]
  59         ##
  60         ## But we've rotated the columns up so that the main diagonal with
  61         ## the constants on it end up in the first row, giving something more
  62         ## like
  63         ##
  64         ##      [C C C C]
  65         ##      [K T K K]
  66         ##      [T K K N]
  67         ##      [K K N K]
  68         ##
  69         ## so the transformation looks like this:
  70         ##
  71         ##      [ 0  1  2  3]           [ 0  5 10 15] (a, xmm0)
  72         ##      [ 4  5  6  7]    -->    [ 4  9 14  3] (b, xmm1)
  73         ##      [ 8  9 10 11]           [ 8 13  2  7] (c, xmm2)
  74         ##      [12 13 14 15]           [12  1  6 11] (d, xmm3)
  75         movdqu  xmm0, [edx +  0]
  76         movdqu  xmm1, [edx + 16]
  77         movdqu  xmm2, [edx + 32]
  78         movdqu  xmm3, [edx + 48]
  79
  80         ## Take a copy for later.
  81         movdqa  [esp +  0], xmm0
  82         movdqa  [esp + 16], xmm1
  83         movdqa  xmm6, xmm2
  84         movdqa  xmm7, xmm3
  85
  86 loop:
  87
  88         ## Apply a column quarterround to each of the columns simultaneously.
  89         ## Alas, there doesn't seem to be a packed doubleword rotate, so we
  90         ## have to synthesize it.
  91
  92         ## b ^= (a + d) <<<  7
  93         movdqa  xmm4, xmm0
  94         paddd   xmm4, xmm3
  95         movdqa  xmm5, xmm4
  96         pslld   xmm4, 7
  97         psrld   xmm5, 25
  98         por     xmm4, xmm5
  99         pxor    xmm1, xmm4
 100
 101         ## c ^= (b + a) <<<  9
 102         movdqa  xmm4, xmm1
 103         paddd   xmm4, xmm0
 104         movdqa  xmm5, xmm4
 105         pslld   xmm4, 9
 106         psrld   xmm5, 23
 107         por     xmm4, xmm5
 108         pxor    xmm2, xmm4
 109
 110         ## d ^= (c + b) <<< 13
 111         movdqa  xmm4, xmm2
 112         paddd   xmm4, xmm1
 113         pshufd  xmm1, xmm1, 0x93
 114         movdqa  xmm5, xmm4
 115         pslld   xmm4, 13
 116         psrld   xmm5, 19
 117         por     xmm4, xmm5
 118         pxor    xmm3, xmm4
 119
 120         ## a ^= (d + c) <<< 18
 121         movdqa  xmm4, xmm3
 122         pshufd  xmm3, xmm3, 0x39
 123         paddd   xmm4, xmm2
 124         pshufd  xmm2, xmm2, 0x4e
 125         movdqa  xmm5, xmm4
 126         pslld   xmm4, 18
 127         psrld   xmm5, 14
 128         por     xmm4, xmm5
 129         pxor    xmm0, xmm4
 130
 131         ## The transpose conveniently only involves reordering elements of
 132         ## individual rows, which can be done quite easily, and reordering
 133         ## the rows themselves, which is a trivial renaming.  It doesn't
 134         ## involve any movement of elements between rows.
 135         ##
 136         ##      [ 0  5 10 15]           [ 0  5 10 15] (a, xmm0)
 137         ##      [ 4  9 14  3]    -->    [ 1  6 11 12] (b, xmm3)
 138         ##      [ 8 13  2  7]           [ 2  7  8 13] (c, xmm2)
 139         ##      [12  1  6 11]           [ 3  4  9 14] (d, xmm1)
 140         ##
 141         ## The shuffles have quite high latency, so they've been pushed
 142         ## backwards into the main instruction list.
 143
 144         ## Apply the row quarterround to each of the columns (yes!)
 145         ## simultaneously.
 146
 147         ## b ^= (a + d) <<<  7
 148         movdqa  xmm4, xmm0
 149         paddd   xmm4, xmm1
 150         movdqa  xmm5, xmm4
 151         pslld   xmm4, 7
 152         psrld   xmm5, 25
 153         por     xmm4, xmm5
 154         pxor    xmm3, xmm4
 155
 156         ## c ^= (b + a) <<<  9
 157         movdqa  xmm4, xmm3
 158         paddd   xmm4, xmm0
 159         movdqa  xmm5, xmm4
 160         pslld   xmm4, 9
 161         psrld   xmm5, 23
 162         por     xmm4, xmm5
 163         pxor    xmm2, xmm4
 164
 165         ## d ^= (c + b) <<< 13
 166         movdqa  xmm4, xmm2
 167         paddd   xmm4, xmm3
 168         pshufd  xmm3, xmm3, 0x93
 169         movdqa  xmm5, xmm4
 170         pslld   xmm4, 13
 171         psrld   xmm5, 19
 172         por     xmm4, xmm5
 173         pxor    xmm1, xmm4
 174
 175         ## a ^= (d + c) <<< 18
 176         movdqa  xmm4, xmm1
 177         pshufd  xmm1, xmm1, 0x39
 178         paddd   xmm4, xmm2
 179         pshufd  xmm2, xmm2, 0x4e
 180         movdqa  xmm5, xmm4
 181         pslld   xmm4, 18
 182         psrld   xmm5, 14
 183         por     xmm4, xmm5
 184         pxor    xmm0, xmm4
 185
 186         ## We had to undo the transpose ready for the next loop.  Again, push
 187         ## back the shuffles because they take a long time coming through.
 188         ## Decrement the loop counter and see if we should go round again.
 189         ## Later processors fuse this pair into a single uop.
 190         sub     ecx, 2
 191         ja      loop
 192
 193         ## Almost there.  Firstly, the feedforward addition, and then we have
 194         ## to write out the result.  Here we have to undo the permutation
 195         ## which was already applied to the input.  Shuffling has quite high
 196         ## latency, so arrange to start a new shuffle into a temporary as
 197         ## soon as we've written out the old value.
 198         mov     edx, [ebp + 16]
 199
 200         paddd   xmm0, [esp +  0]
 201         pshufd  xmm4, xmm0, 0x39
 202         movd    [edx +  0], xmm0
 203
 204         paddd   xmm1, [esp + 16]
 205         pshufd  xmm5, xmm1, 0x93
 206         movd    [edx + 16], xmm1
 207
 208         paddd   xmm2, xmm6
 209         pshufd  xmm6, xmm2, 0x4e
 210         movd    [edx + 32], xmm2
 211
 212         paddd   xmm3, xmm7
 213         pshufd  xmm7, xmm3, 0x39
 214         movd    [edx + 48], xmm3
 215
 216         movd    [edx +  4], xmm7
 217         pshufd  xmm7, xmm3, 0x4e
 218         movd    [edx + 24], xmm7
 219         pshufd  xmm3, xmm3, 0x93
 220         movd    [edx + 44], xmm3
 221
 222         movd    [edx +  8], xmm6
 223         pshufd  xmm6, xmm2, 0x93
 224         movd    [edx + 28], xmm6
 225         pshufd  xmm2, xmm2, 0x39
 226         movd    [edx + 52], xmm2
 227
 228         movd    [edx + 12], xmm5
 229         pshufd  xmm5, xmm1, 0x39
 230         movd    [edx + 36], xmm5
 231         pshufd  xmm1, xmm1, 0x4e
 232         movd    [edx + 56], xmm1
 233
 234         movd    [edx + 20], xmm4
 235         pshufd  xmm4, xmm0, 0x4e
 236         movd    [edx + 40], xmm4
 237         pshufd  xmm0, xmm0, 0x93
 238         movd    [edx + 60], xmm0
 239
 240         ## And with that, we're done.
 241         mov     esp, ebp
 242         pop     ebp
 243         ret
 244
 245         .size   salsa20_core_x86_sse2, . - salsa20_core_x86_sse2
 246
 247 ###----- That's all, folks --------------------------------------------------