chiark - git - mdw - catacomb/blob - symm/salsa20-x86-sse2.S

   1 /// -*- mode: asm; asm-comment-char: ?/ -*-
   2 ///
   3 /// Fancy SIMD implementation of Salsa20
   4 ///
   5 /// (c) 2015 Straylight/Edgeware
   6 ///
   7
   8 ///----- Licensing notice ---------------------------------------------------
   9 ///
  10 /// This file is part of Catacomb.
  11 ///
  12 /// Catacomb is free software; you can redistribute it and/or modify
  13 /// it under the terms of the GNU Library General Public License as
  14 /// published by the Free Software Foundation; either version 2 of the
  15 /// License, or (at your option) any later version.
  16 ///
  17 /// Catacomb is distributed in the hope that it will be useful,
  18 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 /// GNU Library General Public License for more details.
  21 ///
  22 /// You should have received a copy of the GNU Library General Public
  23 /// License along with Catacomb; if not, write to the Free
  24 /// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  25 /// MA 02111-1307, USA.
  26
  27 ///--------------------------------------------------------------------------
  28 /// External definitions.
  29
  30 #include "config.h"
  31 #include "asm-common.h"
  32
  33 ///--------------------------------------------------------------------------
  34 /// Local utilities.
  35
  36 // Immediate operands for pshufd; each rotates the four 32-bit lanes.
  37 #define ROTL 0x93                       // [w x y z] -> [z w x y] (lane 0 first)
  38 #define ROT2 0x4e                       // [w x y z] -> [y z w x]
  39 #define ROTR 0x39                       // [w x y z] -> [x y z w]
  40
  41 ///--------------------------------------------------------------------------
  42 /// Main code.
  43
  44         .arch pentium4                  // accept Pentium 4 instructions (SSE2)
  45         .section .text                  // what follows is code
  46
  47 FUNC(salsa20_core_x86_sse2)
  48         // Salsa20 core: permuted 16-word matrix in, textbook-order result out.
  49         // Arguments, addressed from ebp once the frame below is set up:
  50         // [ebp +  8] is the number of rounds to do
  51         // [ebp + 12] points to the input matrix
  52         // [ebp + 16] points to the output matrix
  53         push    ebp
  54         mov     ebp, esp
  55         sub     esp, 32                 // scratch for two 16-byte rows
  56         mov     edx, [ebp + 12]         // edx = input matrix
  57         and     esp, ~15                // 16-byte align so movdqa works on it
  58
  59         // Prepare for the main loop.
  60         mov     ecx, [ebp + 8]          // ecx = rounds still to do
  61
  62         // First job is to slurp the matrix into XMM registers.  The words
  63         // have already been permuted conveniently to make them line up
  64         // better for SIMD processing.
  65         //
  66         // The textbook arrangement of the matrix is this.
  67         //
  68         //      [C K K K]
  69         //      [K C N N]
  70         //      [T T C K]
  71         //      [K K K C]
  72         //
  73         // But we've rotated the columns up so that the main diagonal with
  74         // the constants on it ends up in the first row, giving something more
  75         // like
  76         //
  77         //      [C C C C]
  78         //      [K T K K]
  79         //      [T K K N]
  80         //      [K K N K]
  81         //
  82         // so the transformation looks like this:
  83         //
  84         //      [ 0  1  2  3]           [ 0  5 10 15] (a, xmm0)
  85         //      [ 4  5  6  7]    -->    [ 4  9 14  3] (b, xmm1)
  86         //      [ 8  9 10 11]           [ 8 13  2  7] (c, xmm2)
  87         //      [12 13 14 15]           [12  1  6 11] (d, xmm3)
  88         movdqu  xmm0, [edx +  0]        // movdqu: input need not be aligned
  89         movdqu  xmm1, [edx + 16]
  90         movdqu  xmm2, [edx + 32]
  91         movdqu  xmm3, [edx + 48]
  92
  93         // Take a copy for later.
  94         movdqa  [esp +  0], xmm0        // a and b go to the aligned scratch
  95         movdqa  [esp + 16], xmm1
  96         movdqa  xmm6, xmm2              // c and d are kept in spare registers
  97         movdqa  xmm7, xmm3
  98
  99 loop:
 100
 101         // Apply a column quarterround to each of the columns simultaneously.
 102         // Alas, there doesn't seem to be a packed doubleword rotate, so we
 103         // have to synthesize it.
 104
 105         // b ^= (a + d) <<<  7
 106         movdqa  xmm4, xmm0
 107         paddd   xmm4, xmm3
 108         movdqa  xmm5, xmm4
 109         pslld   xmm4, 7
 110         psrld   xmm5, 25
 111         por     xmm4, xmm5              // (x << 7) | (x >> 25) = x <<< 7
 112         pxor    xmm1, xmm4
 113
 114         // c ^= (b + a) <<<  9
 115         movdqa  xmm4, xmm1
 116         paddd   xmm4, xmm0
 117         movdqa  xmm5, xmm4
 118         pslld   xmm4, 9
 119         psrld   xmm5, 23
 120         por     xmm4, xmm5
 121         pxor    xmm2, xmm4
 122
 123         // d ^= (c + b) <<< 13
 124         movdqa  xmm4, xmm2
 125         paddd   xmm4, xmm1
 126         pshufd  xmm1, xmm1, ROTL        // b is final: lay out as [ 3  4  9 14]
 127         movdqa  xmm5, xmm4
 128         pslld   xmm4, 13
 129         psrld   xmm5, 19
 130         por     xmm4, xmm5
 131         pxor    xmm3, xmm4
 132
 133         // a ^= (d + c) <<< 18
 134         movdqa  xmm4, xmm3
 135         pshufd  xmm3, xmm3, ROTR        // d copied above: now [ 1  6 11 12]
 136         paddd   xmm4, xmm2
 137         pshufd  xmm2, xmm2, ROT2        // c already added: now [ 2  7  8 13]
 138         movdqa  xmm5, xmm4
 139         pslld   xmm4, 18
 140         psrld   xmm5, 14
 141         por     xmm4, xmm5
 142         pxor    xmm0, xmm4
 143
 144         // The transpose conveniently only involves reordering elements of
 145         // individual rows, which can be done quite easily, and reordering
 146         // the rows themselves, which is a trivial renaming.  It doesn't
 147         // involve any movement of elements between rows.
 148         //
 149         //      [ 0  5 10 15]           [ 0  5 10 15] (a, xmm0)
 150         //      [ 4  9 14  3]    -->    [ 1  6 11 12] (b, xmm3)
 151         //      [ 8 13  2  7]           [ 2  7  8 13] (c, xmm2)
 152         //      [12  1  6 11]           [ 3  4  9 14] (d, xmm1)
 153         //
 154         // The shuffles have quite high latency, so they've been pushed
 155         // backwards into the main instruction list.
 156
 157         // Apply the row quarterround to each of the columns (yes!)
 158         // simultaneously.
 159
 160         // b ^= (a + d) <<<  7
 161         movdqa  xmm4, xmm0
 162         paddd   xmm4, xmm1
 163         movdqa  xmm5, xmm4
 164         pslld   xmm4, 7
 165         psrld   xmm5, 25
 166         por     xmm4, xmm5
 167         pxor    xmm3, xmm4
 168
 169         // c ^= (b + a) <<<  9
 170         movdqa  xmm4, xmm3
 171         paddd   xmm4, xmm0
 172         movdqa  xmm5, xmm4
 173         pslld   xmm4, 9
 174         psrld   xmm5, 23
 175         por     xmm4, xmm5
 176         pxor    xmm2, xmm4
 177
 178         // d ^= (c + b) <<< 13
 179         movdqa  xmm4, xmm2
 180         paddd   xmm4, xmm3
 181         pshufd  xmm3, xmm3, ROTL        // back to layout [12  1  6 11]
 182         movdqa  xmm5, xmm4
 183         pslld   xmm4, 13
 184         psrld   xmm5, 19
 185         por     xmm4, xmm5
 186         pxor    xmm1, xmm4
 187
 188         // a ^= (d + c) <<< 18
 189         movdqa  xmm4, xmm1
 190         pshufd  xmm1, xmm1, ROTR        // back to layout [ 4  9 14  3]
 191         paddd   xmm4, xmm2
 192         pshufd  xmm2, xmm2, ROT2        // back to layout [ 8 13  2  7]
 193         movdqa  xmm5, xmm4
 194         pslld   xmm4, 18
 195         psrld   xmm5, 14
 196         por     xmm4, xmm5
 197         pxor    xmm0, xmm4
 198
 199         // The transpose must be undone ready for the next iteration; those
 200         // shuffles were pushed back into the code above to hide their latency.
 201         // Decrement the loop counter and see if we should go round again.
 202         // Later processors fuse this pair into a single uop.
 203         sub     ecx, 2                  // one pass = column round + row round
 204         ja      loop                    // unsigned: stop on zero or on borrow
 205
 206         // Almost there.  Firstly, the feedforward addition, and then we have
 207         // to write out the result.  Here we have to undo the permutation
 208         // which was already applied to the input.  Shuffling has quite high
 209         // latency, so arrange to start a new shuffle into a temporary as
 210         // soon as we've written out the old value.
 211         mov     edx, [ebp + 16]         // edx = output matrix
 212
 213         paddd   xmm0, [esp +  0]        // a += saved input row a
 214         pshufd  xmm4, xmm0, ROTR        // lane 0 = word 5
 215         movd    [edx +  0], xmm0        // word 0
 216
 217         paddd   xmm1, [esp + 16]        // b += saved input row b
 218         pshufd  xmm5, xmm1, ROTL        // lane 0 = word 3
 219         movd    [edx + 16], xmm1        // word 4
 220
 221         paddd   xmm2, xmm6              // c += saved input row c
 222         pshufd  xmm6, xmm2, ROT2        // lane 0 = word 2
 223         movd    [edx + 32], xmm2        // word 8
 224
 225         paddd   xmm3, xmm7              // d += saved input row d
 226         pshufd  xmm7, xmm3, ROTR        // lane 0 = word 1
 227         movd    [edx + 48], xmm3        // word 12
 228
 229         movd    [edx +  4], xmm7        // word 1
 230         pshufd  xmm7, xmm3, ROT2
 231         movd    [edx + 24], xmm7        // word 6
 232         pshufd  xmm3, xmm3, ROTL
 233         movd    [edx + 44], xmm3        // word 11
 234
 235         movd    [edx +  8], xmm6        // word 2
 236         pshufd  xmm6, xmm2, ROTL
 237         movd    [edx + 28], xmm6        // word 7
 238         pshufd  xmm2, xmm2, ROTR
 239         movd    [edx + 52], xmm2        // word 13
 240
 241         movd    [edx + 12], xmm5        // word 3
 242         pshufd  xmm5, xmm1, ROTR
 243         movd    [edx + 36], xmm5        // word 9
 244         pshufd  xmm1, xmm1, ROT2
 245         movd    [edx + 56], xmm1        // word 14
 246
 247         movd    [edx + 20], xmm4        // word 5
 248         pshufd  xmm4, xmm0, ROT2
 249         movd    [edx + 40], xmm4        // word 10
 250         pshufd  xmm0, xmm0, ROTL
 251         movd    [edx + 60], xmm0        // word 15
 252
 253         // Tidy things up.
 254         mov     esp, ebp                // drop the aligned scratch area
 255         pop     ebp
 256
 257         // And with that, we're done.
 258         ret
 259
 260 ENDFUNC
 261
 262 ///----- That's all, folks --------------------------------------------------