chiark / gitweb /
(x86 asm): Zero the high parts of the ?MM registers if available.
author Mark Wooding <mdw@distorted.org.uk>
Thu, 23 Aug 2018 04:13:55 +0000 (05:13 +0100)
committer Mark Wooding <mdw@distorted.org.uk>
Thu, 23 Aug 2018 06:23:31 +0000 (07:23 +0100)
There's a performance penalty to trying to preserve the upper parts of
the SSE/AVX vector registers, and it's pointless because we don't need
to preserve them.  (Earlier AVX-capable processors would carefully snip
off the upper parts of the registers and put them in a box, and then
glue them back on when they were wanted, which isn't so bad.  Later
processors instead just track the upper part of the register as an
additional operand, which leads to unnecessary latency.)

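Add AVX-specific entry points to the necessary routines, and call them
when AVX is detected.  Each new entry point just executes `vzeroupper'
and then drops through into the existing SSE2 or AES-NI code.  This
would all be easier if Intel had allocated the encoding for `vzeroupper'
from an existing `nop' encoding space: presumably it could then be
executed harmlessly on processors without AVX, and no separate entry
points or run-time dispatch would be needed.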

13 files changed:
base/dispatch.c
base/dispatch.h
math/mpmont.c
math/mpx-mul4-amd64-sse2.S
math/mpx-mul4-x86-sse2.S
math/mpx.c
symm/chacha-x86ish-sse2.S
symm/chacha.c
symm/rijndael-base.c
symm/rijndael-x86ish-aesni.S
symm/rijndael.c
symm/salsa20-x86ish-sse2.S
symm/salsa20.c

index 908a4e3189e8e570a396cb3649f0b781d8f6d390..9ba6a7cd3cd79b55b298ed6fb9f39d2de1de5128 100644 (file)
@@ -47,6 +47,7 @@
 #  define CPUID1D_SSE2 (1u << 26)
 #  define CPUID1D_FXSR (1u << 24)
 #  define CPUID1C_AESNI (1u << 25)
+#  define CPUID1C_AVX (1u << 28)
 #  define CPUID1C_RDRAND (1u << 30)
 
 struct cpuid { unsigned a, b, c, d; };
@@ -545,6 +546,9 @@ int cpu_feature_p(int feat)
                 cpuid_features_p(CPUID1D_SSE2, CPUID1C_AESNI));
     CASE_CPUFEAT(X86_RDRAND, "x86:rdrand",
                 cpuid_features_p(0, CPUID1C_RDRAND));
+    CASE_CPUFEAT(X86_AVX, "x86:avx",
+                xmm_registers_available_p() &&
+                cpuid_features_p(0, CPUID1C_AVX));
 #endif
 #ifdef CAPMAP
 #  define FEATP__CASE(feat, tok)                                       \
index f778068cd4030d26ea36066ba9305349fa906a90..dae6a6894d735a9149cf2ad00143b8c7bb5af26b 100644 (file)
@@ -181,7 +181,8 @@ enum {
   CPUFEAT_ARM_V4,                      /* VFPv4 and/or SIMD v2 */
   CPUFEAT_ARM_D32,                     /* 32 double registers, not 16 */
   CPUFEAT_X86_RDRAND,                  /* Built-in entropy source */
-  CPUFEAT_ARM_AES                      /* AES instructions */
+  CPUFEAT_ARM_AES,                     /* AES instructions */
+  CPUFEAT_X86_AVX                      /* AVX 1 (i.e., 256-bit YMM regs) */
 };
 
 extern int cpu_feature_p(int /*feat*/);
index f8a26119bf03492f8d0afd8281f1b11d29d5571d..094ac401698918cc8a6fc5c5cdd37729c6ba80da 100644 (file)
@@ -90,19 +90,25 @@ static void simple_redccore(mpw *dv, mpw *dvl, const mpw *mv,
 
 #if CPUFAM_X86
   MAYBE_REDC4(x86_sse2)
+  MAYBE_REDC4(x86_avx)
 #endif
 
 #if CPUFAM_AMD64
   MAYBE_REDC4(amd64_sse2)
+  MAYBE_REDC4(amd64_avx)
 #endif
 
 static redccore__functype *pick_redccore(void)
 {
 #if CPUFAM_X86
+  DISPATCH_PICK_COND(mpmont_reduce, maybe_redc4_x86_avx,
+                    cpu_feature_p(CPUFEAT_X86_AVX));
   DISPATCH_PICK_COND(mpmont_reduce, maybe_redc4_x86_sse2,
                     cpu_feature_p(CPUFEAT_X86_SSE2));
 #endif
 #if CPUFAM_AMD64
+  DISPATCH_PICK_COND(mpmont_reduce, maybe_redc4_amd64_avx,
+                    cpu_feature_p(CPUFEAT_X86_AVX));
   DISPATCH_PICK_COND(mpmont_reduce, maybe_redc4_amd64_sse2,
                     cpu_feature_p(CPUFEAT_X86_SSE2));
 #endif
@@ -190,19 +196,25 @@ static void simple_mulcore(mpw *dv, mpw *dvl,
 
 #if CPUFAM_X86
   MAYBE_MUL4(x86_sse2)
+  MAYBE_MUL4(x86_avx)
 #endif
 
 #if CPUFAM_AMD64
   MAYBE_MUL4(amd64_sse2)
+  MAYBE_MUL4(amd64_avx)
 #endif
 
 static mulcore__functype *pick_mulcore(void)
 {
 #if CPUFAM_X86
+  DISPATCH_PICK_COND(mpmont_mul, maybe_mul4_x86_avx,
+                    cpu_feature_p(CPUFEAT_X86_AVX));
   DISPATCH_PICK_COND(mpmont_mul, maybe_mul4_x86_sse2,
                     cpu_feature_p(CPUFEAT_X86_SSE2));
 #endif
 #if CPUFAM_AMD64
+  DISPATCH_PICK_COND(mpmont_mul, maybe_mul4_amd64_avx,
+                    cpu_feature_p(CPUFEAT_X86_AVX));
   DISPATCH_PICK_COND(mpmont_mul, maybe_mul4_amd64_sse2,
                     cpu_feature_p(CPUFEAT_X86_SSE2));
 #endif
index 2d78a992709da29a8b3241c06b41ca34c0d68370..d8f54e1f9be07723d0b769ec50c287629316282d 100644 (file)
@@ -752,6 +752,13 @@ ENDFUNC
 ///--------------------------------------------------------------------------
 /// Bulk multipliers.
 
+FUNC(mpx_umul4_amd64_avx)
+       .arch   .avx
+       vzeroupper
+  endprologue
+       .arch   pentium4
+ENDFUNC
+
 FUNC(mpx_umul4_amd64_sse2)
        // void mpx_umul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *avl,
        //                         const mpw *bv, const mpw *bvl);
@@ -901,6 +908,13 @@ FUNC(mpx_umul4_amd64_sse2)
 
 ENDFUNC
 
+FUNC(mpxmont_mul4_amd64_avx)
+       .arch   .avx
+       vzeroupper
+  endprologue
+       .arch   pentium4
+ENDFUNC
+
 FUNC(mpxmont_mul4_amd64_sse2)
        // void mpxmont_mul4_amd64_sse2(mpw *dv, const mpw *av, const mpw *bv,
        //                           const mpw *nv, size_t n, const mpw *mi);
@@ -1095,6 +1109,13 @@ FUNC(mpxmont_mul4_amd64_sse2)
 
 ENDFUNC
 
+FUNC(mpxmont_redc4_amd64_avx)
+       .arch   .avx
+       vzeroupper
+  endprologue
+       .arch   pentium4
+ENDFUNC
+
 FUNC(mpxmont_redc4_amd64_sse2)
        // void mpxmont_redc4_amd64_sse2(mpw *dv, mpw *dvl, const mpw *nv,
        //                             size_t n, const mpw *mi);
index f6c81673ebde740740941e43d03a8eebcda873c6..cdc359678a048ddc38cdeb5e2fc118945a389539 100644 (file)
@@ -678,6 +678,14 @@ ENDFUNC
 ///--------------------------------------------------------------------------
 /// Bulk multipliers.
 
+FUNC(mpx_umul4_x86_avx)
+       .arch   .avx
+       vzeroupper
+  endprologue
+       // and drop through...
+       .arch   pentium4
+ENDFUNC
+
 FUNC(mpx_umul4_x86_sse2)
        // void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
        //                         const mpw *bv, const mpw *bvl);
@@ -778,6 +786,14 @@ FUNC(mpx_umul4_x86_sse2)
 
 ENDFUNC
 
+FUNC(mpxmont_mul4_x86_avx)
+       .arch   .avx
+       vzeroupper
+  endprologue
+       // and drop through...
+       .arch   pentium4
+ENDFUNC
+
 FUNC(mpxmont_mul4_x86_sse2)
        // void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
        //                           const mpw *nv, size_t n, const mpw *mi);
@@ -919,6 +935,14 @@ FUNC(mpxmont_mul4_x86_sse2)
 
 ENDFUNC
 
+FUNC(mpxmont_redc4_x86_avx)
+       .arch   .avx
+       vzeroupper
+  endprologue
+       // and drop through...
+       .arch   pentium4
+ENDFUNC
+
 FUNC(mpxmont_redc4_x86_sse2)
        // void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
        //                             size_t n, const mpw *mi);
index 3983e7cad60e623b40902c0c68bf576cad8f6559..429484575c8065eb76411e7bb8cec591673e6508 100644 (file)
@@ -923,19 +923,25 @@ static void simple_umul(mpw *dv, mpw *dvl, const mpw *av, const mpw *avl,
 
 #if CPUFAM_X86
   MAYBE_UMUL4(x86_sse2)
+  MAYBE_UMUL4(x86_avx)
 #endif
 
 #if CPUFAM_AMD64
   MAYBE_UMUL4(amd64_sse2)
+  MAYBE_UMUL4(amd64_avx)
 #endif
 
 static mpx_umul__functype *pick_umul(void)
 {
 #if CPUFAM_X86
+  DISPATCH_PICK_COND(mpx_umul, maybe_umul4_x86_avx,
+                    cpu_feature_p(CPUFEAT_X86_AVX));
   DISPATCH_PICK_COND(mpx_umul, maybe_umul4_x86_sse2,
                     cpu_feature_p(CPUFEAT_X86_SSE2));
 #endif
 #if CPUFAM_AMD64
+  DISPATCH_PICK_COND(mpx_umul, maybe_umul4_amd64_avx,
+                    cpu_feature_p(CPUFEAT_X86_AVX));
   DISPATCH_PICK_COND(mpx_umul, maybe_umul4_amd64_sse2,
                     cpu_feature_p(CPUFEAT_X86_SSE2));
 #endif
index 2dab283b37365e1b848a0f3866b7e8c0a7e3e928..b8f72d538b5a179fc50e83dcd4b26d36a75f6ae5 100644 (file)
 ///--------------------------------------------------------------------------
 /// Main code.
 
-       .arch pentium4
        .text
 
+FUNC(chacha_core_x86ish_avx)
+       .arch   .avx
+       vzeroupper
+  endprologue
+       // drop through...
+ENDFUNC
+
+       .arch   pentium4
+
 FUNC(chacha_core_x86ish_sse2)
 
        // Initial setup.
index 3419861873f522d888b6bb817184c25f778b29a1..9b83eea5bff7a8bcef143b4a3a39a3b194dd5fc5 100644 (file)
@@ -72,6 +72,7 @@ static void simple_core(unsigned r, const chacha_matrix src,
 
 #if CPUFAM_X86 || CPUFAM_AMD64
 extern core__functype chacha_core_x86ish_sse2;
+extern core__functype chacha_core_x86ish_avx;
 #endif
 
 #if CPUFAM_ARMEL
@@ -85,6 +86,8 @@ extern core__functype chacha_core_arm64;
 static core__functype *pick_core(void)
 {
 #if CPUFAM_X86 || CPUFAM_AMD64
+  DISPATCH_PICK_COND(chacha_core, chacha_core_x86ish_avx,
+                    cpu_feature_p(CPUFEAT_X86_AVX));
   DISPATCH_PICK_COND(chacha_core, chacha_core_x86ish_sse2,
                     cpu_feature_p(CPUFEAT_X86_SSE2));
 #endif
index 83a49e929bd27e25c4ee3c0eafbadefacf8e3902..2f6519187ef1d1e5d8485d56b0e6302f5962d821 100644 (file)
@@ -118,6 +118,7 @@ CPU_DISPATCH(static, EMPTY, void, setup,
 
 #if CPUFAM_X86 || CPUFAM_AMD64
 extern setup__functype rijndael_setup_x86ish_aesni;
+extern setup__functype rijndael_setup_x86ish_aesni_avx;
 #endif
 #if CPUFAM_ARMEL && HAVE_AS_ARMV8_CRYPTO
 extern setup__functype rijndael_setup_arm_crypto;
@@ -129,6 +130,9 @@ extern setup__functype rijndael_setup_arm64_crypto;
 static setup__functype *pick_setup(void)
 {
 #if CPUFAM_X86 || CPUFAM_AMD64
+  DISPATCH_PICK_COND(rijndael_setup, rijndael_setup_x86ish_aesni_avx,
+                    cpu_feature_p(CPUFEAT_X86_AVX) &&
+                    cpu_feature_p(CPUFEAT_X86_AESNI));
   DISPATCH_PICK_COND(rijndael_setup, rijndael_setup_x86ish_aesni,
                     cpu_feature_p(CPUFEAT_X86_AESNI));
 #endif
index e556aa53e5d47f6f4673de2ed3c50f1693c5878a..a7a1ece3d3823cbd7ad135b91ac1353d76d62b8b 100644 (file)
 ///--------------------------------------------------------------------------
 /// Key setup.
 
+FUNC(rijndael_setup_x86ish_aesni_avx)
+       vzeroupper                    // avoid penalty on `legacy' XMM access
+  endprologue
+       // and drop through...
+ENDFUNC
+
 FUNC(rijndael_setup_x86ish_aesni)
 
 #define SI WHOLE(si)
@@ -365,6 +371,12 @@ ENDFUNC
 /// Encrypting and decrypting blocks.
 
 .macro encdec  op, aes, koff
+  FUNC(rijndael_\op\()_x86ish_aesni_avx)
+       vzeroupper                      // avoid XMM penalties
+  endprologue
+       // and drop through...
+  ENDFUNC
+
   FUNC(rijndael_\op\()_x86ish_aesni)
 
 #if CPUFAM_X86
index 02cfb76b10775359d9243a3061fcfca013e24421..7db9e0124378f447617bb32da8432143d564ec8f 100644 (file)
@@ -83,6 +83,8 @@ CPU_DISPATCH(EMPTY, EMPTY, void, rijndael_dblk,
 #if CPUFAM_X86 || CPUFAM_AMD64
 extern rijndael_eblk__functype rijndael_eblk_x86ish_aesni;
 extern rijndael_dblk__functype rijndael_dblk_x86ish_aesni;
+extern rijndael_eblk__functype rijndael_eblk_x86ish_aesni_avx;
+extern rijndael_dblk__functype rijndael_dblk_x86ish_aesni_avx;
 #endif
 #if CPUFAM_ARMEL && HAVE_AS_ARMV8_CRYPTO
 extern rijndael_eblk__functype rijndael_eblk_arm_crypto;
@@ -96,6 +98,9 @@ extern rijndael_dblk__functype rijndael_dblk_arm64_crypto;
 static rijndael_eblk__functype *pick_eblk(void)
 {
 #if CPUFAM_X86 || CPUFAM_AMD64
+  DISPATCH_PICK_COND(rijndael_eblk, rijndael_eblk_x86ish_aesni_avx,
+                    cpu_feature_p(CPUFEAT_X86_AVX) &&
+                    cpu_feature_p(CPUFEAT_X86_AESNI));
   DISPATCH_PICK_COND(rijndael_eblk, rijndael_eblk_x86ish_aesni,
                     cpu_feature_p(CPUFEAT_X86_AESNI));
 #endif
@@ -113,6 +118,9 @@ static rijndael_eblk__functype *pick_eblk(void)
 static rijndael_dblk__functype *pick_dblk(void)
 {
 #if CPUFAM_X86 || CPUFAM_AMD64
+  DISPATCH_PICK_COND(rijndael_dblk, rijndael_dblk_x86ish_aesni_avx,
+                    cpu_feature_p(CPUFEAT_X86_AVX) &&
+                    cpu_feature_p(CPUFEAT_X86_AESNI));
   DISPATCH_PICK_COND(rijndael_dblk, rijndael_dblk_x86ish_aesni,
                     cpu_feature_p(CPUFEAT_X86_AESNI));
 #endif
index 9cbaeff4ce5f62b23a3338970a9d6f0759f5e099..76ac0ed96e27284007fe9e5decef636fafd6882b 100644 (file)
 ///--------------------------------------------------------------------------
 /// Main code.
 
-       .arch pentium4
        .text
 
+FUNC(salsa20_core_x86ish_avx)
+       .arch   .avx
+       vzeroupper
+  endprologue
+       // drop through...
+ENDFUNC
+
+       .arch   pentium4
+
 FUNC(salsa20_core_x86ish_sse2)
 
        // Initial setup.
index 03fcf469e8f46dc5de86f5aafaf717a21b157c9d..e78baf0534671cfcc321e24503c587a78aab8cc1 100644 (file)
@@ -72,6 +72,7 @@ static void simple_core(unsigned r, const salsa20_matrix src,
 
 #if CPUFAM_X86 || CPUFAM_AMD64
 extern core__functype salsa20_core_x86ish_sse2;
+extern core__functype salsa20_core_x86ish_avx;
 #endif
 
 #if CPUFAM_ARMEL
@@ -85,6 +86,8 @@ extern core__functype salsa20_core_arm64;
 static core__functype *pick_core(void)
 {
 #if CPUFAM_X86 || CPUFAM_AMD64
+  DISPATCH_PICK_COND(salsa20_core, salsa20_core_x86ish_avx,
+                    cpu_feature_p(CPUFEAT_X86_AVX));
   DISPATCH_PICK_COND(salsa20_core, salsa20_core_x86ish_sse2,
                     cpu_feature_p(CPUFEAT_X86_SSE2));
 #endif