pkginclude_HEADERS += ct.h
libbase_la_SOURCES += ct.c
+## CPU-specific dispatch.
+pkginclude_HEADERS += dispatch.h
+libbase_la_SOURCES += dispatch.c
+
## Acceptable key-size descriptions.
pkginclude_HEADERS += keysz.h
libbase_la_SOURCES += keysz.c keysz-conv.c
## Clearing secrets from memory.
pkginclude_HEADERS += paranoia.h
+## Base definitions for assembler source.
+EXTRA_DIST += asm-common.h
+
###----- That's all, folks --------------------------------------------------
--- /dev/null
+/// -*- mode: asm; asm-comment-char: ?/ -*-
+///
+/// Fancy SIMD implementation of Salsa20
+///
+/// (c) 2015 Straylight/Edgeware
+///
+
+///----- Licensing notice ---------------------------------------------------
+///
+/// This file is part of Catacomb.
+///
+/// Catacomb is free software; you can redistribute it and/or modify
+/// it under the terms of the GNU Library General Public License as
+/// published by the Free Software Foundation; either version 2 of the
+/// License, or (at your option) any later version.
+///
+/// Catacomb is distributed in the hope that it will be useful,
+/// but WITHOUT ANY WARRANTY; without even the implied warranty of
+/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+/// GNU Library General Public License for more details.
+///
+/// You should have received a copy of the GNU Library General Public
+/// License along with Catacomb; if not, write to the Free
+/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+/// MA 02111-1307, USA.
+
+///--------------------------------------------------------------------------
+/// General definitions.
+
+// Announcing an external function.
+#define FUNC(name) \
+ .globl F(name); \
+ TYPE_FUNC(name); \
+ .macro ENDFUNC; _ENDFUNC(name); .endm; \
+ FUNC_PREHOOK(name); \
+F(name): \
+ FUNC_POSTHOOK(name)
+
+// Marking the end of a function.
+#define _ENDFUNC(name) \
+ .purgem ENDFUNC; \
+ SIZE_OBJ(name); \
+ ENDFUNC_HOOK(name)
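+
+// For example (cf. the SIMD implementations later in this series), an
+// assembler source brackets each function like this:
+//
+//	FUNC(chacha_core_x86ish_sse2)
+//		...function body...
+//	ENDFUNC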
+
+///--------------------------------------------------------------------------
+/// ELF-specific hacking.
+
+#if __ELF__
+
+#if __PIC__ || __PIE__
+# define WANT_PIC 1
+#endif
+
+#define TYPE_FUNC(name) .type name, STT_FUNC
+
+#define SIZE_OBJ(name) .size name, . - name
+
+#endif
+
+///--------------------------------------------------------------------------
+/// Windows-specific hacking.
+
+#if ABI_WIN
+
+#if CPUFAM_X86
+# define F(name) _##name
+#endif
+
+#endif
+
+///--------------------------------------------------------------------------
+/// x86- and amd64-specific hacking.
+///
+/// It's (slightly) easier to deal with both of these in one go.
+
+#if CPUFAM_X86 || CPUFAM_AMD64
+
+// Set the function hooks.
+#define FUNC_PREHOOK(_) .balign 16
+
+// Don't use the wretched AT&T syntax. It's festooned with pointless
+// punctuation, and all of the data movement is backwards. Ugh!
+ .intel_syntax noprefix
+
+// Call external subroutine at ADDR, possibly via PLT.
+ .macro callext addr
+#if WANT_PIC
+ call \addr@PLT
+#else
+ call \addr
+#endif
+ .endm
+
+// Do I need to arrange a spare GOT register?
+#if WANT_PIC && CPUFAM_X86
+# define NEED_GOT 1
+#endif
+#define GOTREG ebx // Not needed in AMD64 so don't care.
+
+// Maybe load GOT address into GOT.
+ .macro ldgot got=GOTREG
+#if WANT_PIC && CPUFAM_X86
+ call _where_am_i.\got
+ add \got, offset _GLOBAL_OFFSET_TABLE_
+#endif
+ .endm
+
+// Maybe build a helper subroutine for `ldgot GOT'.
+ .macro gotaux got=GOTREG
+#if WANT_PIC && CPUFAM_X86
+ .align 16
+_where_am_i.\got :
+ mov \got, [esp]
+ ret
+#endif
+ .endm
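+
+// Note that an object which uses `ldgot' must also instantiate the
+// matching helper exactly once, by saying `gotaux' with the same
+// register: the AESNI code later in this series pairs its `ldgot ecx'
+// calls with a single trailing `gotaux ecx'.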
+
+// Load address of external symbol ADDR into REG, maybe using GOT.
+ .macro leaext reg, addr, got=GOTREG
+#if WANT_PIC
+# if CPUFAM_X86
+ mov \reg, [\got + \addr@GOT]
+# endif
+# if CPUFAM_AMD64
+ mov \reg, \addr@GOTPCREL[rip]
+# endif
+#else
+# if CPUFAM_X86
+ mov \reg, offset \addr
+# endif
+# if CPUFAM_AMD64
+ lea \reg, \addr[rip]
+# endif
+#endif
+ .endm
+
+// Address expression (possibly using a base register, and a displacement)
+// referring to ADDR, which is within our module, maybe using GOT.
+#define INTADDR(...) INTADDR__0(__VA_ARGS__, GOTREG, dummy)
+#define INTADDR__0(addr, got, ...) INTADDR__1(addr, got)
+#if CPUFAM_AMD64
+# define INTADDR__1(addr, got) addr + rip
+#elif WANT_PIC
+# define INTADDR__1(addr, got) got + addr@GOTOFF
+#else
+# define INTADDR__1(addr, got) addr
+#endif
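+
+// For instance, the AESNI code below fetches a local table with
+// `movdqa xmm5, [INTADDR(endswap_tab, ecx)]', which expands to an
+// absolute, GOT-relative, or RIP-relative operand as appropriate.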
+
+#endif
+
+///--------------------------------------------------------------------------
+/// ARM-specific hacking.
+
+#if CPUFAM_ARM
+
+// Set the function hooks.
+#define FUNC_PREHOOK(_) .balign 4
+#define ENDFUNC_HOOK(name) .ltorg
+
+// Call external subroutine at ADDR, possibly via PLT.
+ .macro callext addr, cond=
+#if WANT_PIC
+ bl\cond \addr(PLT)
+#else
+ bl\cond \addr
+#endif
+ .endm
+
+// Do I need to arrange a spare GOT register?
+#if WANT_PIC
+# define NEED_GOT 1
+#endif
+#define GOTREG r9
+
+// Maybe load GOT address into GOT.
+ .macro ldgot got=r9
+#if WANT_PIC
+ ldr \got, =_GLOBAL_OFFSET_TABLE_ - . - 12
+ add \got, pc, \got
+#endif
+ .endm
+
+// Load address of external symbol ADDR into REG, maybe using GOT.
+ .macro leaext reg, addr, cond=, got=GOTREG
+#if WANT_PIC
+ ldr \reg, =\addr(GOT)
+ ldr \reg, [\got, \reg]
+#else
+ ldr \reg, =\addr
+#endif
+ .endm
+
+#endif
+
+///--------------------------------------------------------------------------
+/// Final stuff.
+
+// Default values for the various hooks.
+#ifndef FUNC_PREHOOK
+# define FUNC_PREHOOK(name)
+#endif
+#ifndef FUNC_POSTHOOK
+# define FUNC_POSTHOOK(name)
+#endif
+#ifndef ENDFUNC_HOOK
+# define ENDFUNC_HOOK(name)
+#endif
+
+#ifndef F
+# define F(name) name
+#endif
+
+#ifndef TYPE_FUNC
+# define TYPE_FUNC(name)
+#endif
+
+#ifndef SIZE_OBJ
+# define SIZE_OBJ(name)
+#endif
+
+///----- That's all, folks --------------------------------------------------
--- /dev/null
+/* -*-c-*-
+ *
+ * CPU-specific dispatch
+ *
+ * (c) 2015 Straylight/Edgeware
+ */
+
+/*----- Licensing notice --------------------------------------------------*
+ *
+ * This file is part of Catacomb.
+ *
+ * Catacomb is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Library General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * Catacomb is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with Catacomb; if not, write to the Free
+ * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ * MA 02111-1307, USA.
+ */
+
+/*----- Header files ------------------------------------------------------*/
+
+#include "config.h"
+
+#include <ctype.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <mLib/macros.h>
+
+#include "dispatch.h"
+
+/*----- Intel x86/AMD64 feature probing -----------------------------------*/
+
+#if CPUFAM_X86 || CPUFAM_AMD64
+
+# define EFLAGS_ID (1u << 21)
+# define CPUID1D_SSE2 (1u << 26)
+# define CPUID1D_FXSR (1u << 24)
+# define CPUID1C_AESNI (1u << 25)
+
+struct cpuid { unsigned a, b, c, d; };
+
+/* --- @cpuid@ --- *
+ *
+ * Arguments: @struct cpuid *cc@ = where to write the result
+ * @unsigned a, c@ = EAX and ECX registers to set
+ *
+ * Returns: ---
+ *
+ * Use: Minimal C wrapper around the x86 `CPUID' instruction. Checks
+ * that the instruction is actually available before invoking
+ * it; fills the output structure with zero if it's not going to
+ * work.
+ */
+
+#ifdef __GNUC__
+# if CPUFAM_X86
+static __inline__ unsigned getflags(void)
+ { unsigned f; __asm__ ("pushf; popl %0" : "=g" (f)); return (f); }
+static __inline__ unsigned setflags(unsigned f)
+{
+ unsigned ff;
+ __asm__ ("pushf; pushl %1; popf; pushf; popl %0; popf"
+ : "=g" (ff)
+ : "g" (f));
+ return (ff);
+}
+# else
+static __inline__ unsigned long getflags(void)
+ { unsigned long f; __asm__ ("pushf; popq %0" : "=g" (f)); return (f); }
+static __inline__ unsigned long setflags(unsigned long f)
+{
+ unsigned long ff;
+ __asm__ ("pushf; pushq %1; popf; pushf; popq %0; popf"
+ : "=g" (ff)
+ : "g" (f));
+ return (ff);
+}
+# endif
+#endif
+
+static void cpuid(struct cpuid *cc, unsigned a, unsigned c)
+{
+#ifdef __GNUC__
+ unsigned f;
+#endif
+
+ cc->a = cc->b = cc->c = cc->d = 0;
+
+#ifdef __GNUC__
+ /* Stupid dance to detect whether the CPUID instruction is available. */
+ f = getflags();
+ if (!(setflags(f | EFLAGS_ID) & EFLAGS_ID) ||
+ setflags(f & ~EFLAGS_ID) & EFLAGS_ID) {
+ dispatch_debug("CPUID instruction not available");
+ return;
+ }
+ setflags(f);
+
+ /* Alas, EBX is magical in PIC code, so abuse ESI instead. This isn't
+ * pretty, but it works.
+ */
+# if CPUFAM_X86
+ __asm__ ("pushl %%ebx; cpuid; movl %%ebx, %%esi; popl %%ebx"
+ : "=a" (cc->a), "=S" (cc->b), "=c" (cc->c), "=d" (cc->d)
+ : "a" (a) , "c" (c));
+# elif CPUFAM_AMD64
+ __asm__ ("pushq %%rbx; cpuid; movl %%ebx, %%esi; popq %%rbx"
+ : "=a" (cc->a), "=S" (cc->b), "=c" (cc->c), "=d" (cc->d)
+ : "a" (a) , "c" (c));
+# else
+# error "I'm confused."
+# endif
+ dispatch_debug("CPUID(%08x, %08x) -> %08x, %08x, %08x, %08x",
+ a, c, cc->a, cc->b, cc->c, cc->d);
+#else
+ dispatch_debug("GNU inline assembler not available; can't CPUID");
+#endif
+}
+
+static unsigned cpuid_maxleaf(void)
+ { struct cpuid c; cpuid(&c, 0, 0); return (c.a); }
+
+/* --- @cpuid_features_p@ --- *
+ *
+ * Arguments: @unsigned dbits@ = bits to check in EDX
+ * @unsigned cbits@ = bits to check in ECX
+ *
+ * Returns: Nonzero if all the requested bits are set in the CPUID result
+ * on leaf 1.
+ */
+
+static int cpuid_features_p(unsigned dbits, unsigned cbits)
+{
+ struct cpuid c;
+ if (cpuid_maxleaf() < 1) return (0);
+ cpuid(&c, 1, 0);
+ return ((c.d & dbits) == dbits && (c.c & cbits) == cbits);
+}
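+
+/* For example, `cpuid_features_p(CPUID1D_SSE2, 0)' checks only the SSE2
+ * bit in EDX, while `cpuid_features_p(CPUID1D_SSE2, CPUID1C_AESNI)'
+ * additionally requires the AESNI bit in ECX; both probes appear in
+ * `cpu_feature_p' below.
+ */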
+
+/* --- @xmm_registers_available_p@ --- *
+ *
+ * Arguments: ---
+ *
+ * Returns: Nonzero if the operating system has made the XMM registers
+ * available for use.
+ */
+
+static int xmm_registers_available_p(void)
+{
+#ifdef __GNUC__
+ unsigned f;
+ /* This hack is by Agner Fog. Use FXSAVE/FXRSTOR to figure out whether the
+ * XMM registers are actually alive.
+ */
+ if (!cpuid_features_p(CPUID1D_FXSR, 0)) return (0);
+# if CPUFAM_X86
+ __asm__ ("movl %%esp, %%edx; subl $512, %%esp; andl $~15, %%esp\n"
+ "fxsave (%%esp)\n"
+ "movl 160(%%esp), %%eax; xorl $0xaaaa5555, 160(%%esp)\n"
+ "fxrstor (%%esp); fxsave (%%esp)\n"
+ "movl 160(%%esp), %%ecx; movl %%eax, 160(%%esp)\n"
+ "fxrstor (%%esp); movl %%edx, %%esp\n"
+ "xorl %%ecx, %%eax"
+ : "=a" (f)
+ : /* no inputs */
+ : "%ecx", "%edx");
+# elif CPUFAM_AMD64
+ __asm__ ("movq %%rsp, %%rdx; subq $512, %%rsp; andq $~15, %%rsp\n"
+ "fxsave (%%rsp)\n"
+ "movl 160(%%rsp), %%eax; xorl $0xaaaa5555, 160(%%rsp)\n"
+ "fxrstor (%%rsp); fxsave (%%rsp)\n"
+ "movl 160(%%rsp), %%ecx; movl %%eax, 160(%%rsp)\n"
+ "fxrstor (%%rsp); movq %%rdx, %%rsp\n"
+ "xorl %%ecx, %%eax"
+ : "=a" (f)
+ : /* no inputs */
+ : "%ecx", "%rdx");
+# else
+# error "I'm confused."
+# endif
+ dispatch_debug("XMM registers %savailable", f ? "" : "not ");
+ return (f);
+#else
+ dispatch_debug("GNU inline assembler not available; can't check for XMM");
+ return (0);
+#endif
+}
+
+#endif
+
+/*----- General feature probing using auxiliary vectors -------------------*/
+
+/* Try to find the system's definitions for auxiliary vector entries. */
+#ifdef HAVE_SYS_AUXV_H
+# include <sys/auxv.h>
+#else
+# ifdef HAVE_LINUX_AUXVEC_H
+# include <linux/auxvec.h>
+# endif
+# ifdef HAVE_ASM_HWCAP_H
+# include <asm/hwcap.h>
+# endif
+#endif
+
+/* The type of entries in the auxiliary vector. I'm assuming that `unsigned
+ * long' matches each platform's word length; if this is false then we'll
+ * need some host-specific tweaking here.
+ */
+union auxval { long i; unsigned long u; const void *p; };
+struct auxentry { unsigned long type; union auxval value; };
+
+/* Register each CPU family's interest in the auxiliary vector. Make sure
+ * that the necessary entry types are defined. This is primarily ordered by
+ * entry type to minimize duplication.
+ */
+#if defined(AT_HWCAP) && CPUFAM_ARMEL
+# define WANT_ANY 1
+# define WANT_AT_HWCAP(_) _(AT_HWCAP, u, hwcap)
+#endif
+
+/* If we couldn't find any interesting entries then we can switch all of this
+ * machinery off. Also do that if we have no means for atomic updates.
+ */
+#if WANT_ANY && CPU_DISPATCH_P
+
+/* The main output of this section is a bitmask of detected features. The
+ * least significant bit will be set if we've tried to probe. Always access
+ * this using `DISPATCH_LOAD' and `DISPATCH_STORE'.
+ */
+static unsigned hwcaps = 0;
+
+/* For each potentially interesting type which turned out not to exist or be
+ * wanted, define a dummy macro for the sake of the next step.
+ */
+#ifndef WANT_AT_HWCAP
+# define WANT_AT_HWCAP(_)
+#endif
+
+/* For each CPU family, define two lists.
+ *
+ * * `WANTAUX' is a list of the `WANT_AT_MUMBLE' macros which the CPU
+ * family tried to register interest in above. Each entry contains the
+ * interesting auxiliary vector entry type, the name of the union branch
+ * for its value, and the name of the slot in `struct auxprobe' in which
+ * to store the value.
+ *
+ * * `CAPMAP' is a list describing the output features which the CPU family
+ * intends to satisfy from the auxiliary vector. Each entry contains a
+ * feature name suffix, and the token name (for `check_env').
+ */
+#if CPUFAM_ARMEL
+# define WANTAUX(_) \
+ WANT_AT_HWCAP(_)
+# define CAPMAP(_) \
+ _(ARM_VFP, "arm:vfp") \
+ _(ARM_NEON, "arm:neon") \
+ _(ARM_V4, "arm:v4") \
+ _(ARM_D32, "arm:d32")
+#endif
+
+/* Build the bitmask for `hwcaps' from the `CAPMAP' list. */
+enum {
+ HFI_PROBED = 0,
+#define HFI__ENUM(feat, tok) HFI_##feat,
+ CAPMAP(HFI__ENUM)
+#undef HFI__ENUM
+ HFI__END
+};
+enum {
+ HF_PROBED = 1,
+#define HF__FLAG(feat, tok) HF_##feat = 1 << HFI_##feat,
+ CAPMAP(HF__FLAG)
+#undef HF__FLAG
+ HF__END
+};
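+
+/* For instance, given the ARM `CAPMAP' above, these expansions yield
+ * `HFI_ARM_VFP = 1' and hence `HF_ARM_VFP = (1 << 1)', and so on for
+ * the remaining features; bit zero (`HF_PROBED') merely records that
+ * probing has happened.
+ */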
+
+/* Build a structure in which we can capture the interesting data from the
+ * auxiliary vector.
+ */
+#define AUXUTYPE_i long
+#define AUXUTYPE_u unsigned long
+#define AUXUTYPE_p const void *
+struct auxprobe {
+#define AUXPROBE__SLOT(type, ubranch, slot) AUXUTYPE_##ubranch slot;
+ WANTAUX(AUXPROBE__SLOT)
+#undef AUXPROBE__SLOT
+};
+
+/* --- @probe_hwcaps@ --- *
+ *
+ * Arguments: ---
+ *
+ * Returns: ---
+ *
+ * Use: Attempt to find the auxiliary vector (which is well hidden)
+ * and discover interesting features from it.
+ */
+
+static void probe_hwcaps(void)
+{
+ unsigned hw = HF_PROBED;
+ struct auxprobe probed = { 0 };
+
+ /* Populate `probed' with the information we manage to retrieve from the
+ * auxiliary vector. Slots we couldn't find are left zero-valued.
+ */
+#if defined(HAVE_GETAUXVAL)
+ /* Shiny new libc lets us request individual entry types. This is almost
+ * too easy.
+ */
+# define CAP__GET(type, ubranch, slot) \
+ probed.slot = (AUXUTYPE_##ubranch)getauxval(type);
+ WANTAUX(CAP__GET)
+#else
+ /* Otherwise we're a bit stuck, really. Modern Linux kernels make a copy
+ * of the vector available in `/proc', so we could try that.
+ *
+ * The usual place is stuck on the end of the environment vector, but that
+ * may well have moved, and we have no way of telling whether it has or
+ * whether there was ever an auxiliary vector there at all; so don't do
+ * that.
+ */
+ {
+ FILE *fp = 0;
+ unsigned char *p = 0, *q = 0;
+ const struct auxentry *a;
+ size_t sz, off, n;
+
+ /* Open the file and read it into a memory chunk. */
+ if ((fp = fopen("/proc/self/auxv", "rb")) == 0) goto clean;
+ sz = 4096; off = 0;
+ if ((p = malloc(sz)) == 0) goto clean;
+ for (;;) {
+ n = fread(p + off, 1, sz - off, fp);
+ off += n;
+ if (off < sz) break;
+ sz *= 2; if ((q = realloc(p, sz)) == 0) break;
+ p = q;
+ }
+
+ /* Work through the vector (or as much of it as we found) and extract the
+ * types we're interested in.
+ */
+ for (a = (const struct auxentry *)p,
+ n = off/sizeof(struct auxentry);
+ n--; a++) {
+ switch (a->type) {
+#define CAP__SWITCH(type, ubranch, slot) \
+ case type: probed.slot = a->value.ubranch; break;
+ WANTAUX(CAP__SWITCH)
+ }
+ }
+
+ clean:
+ if (p) free(p);
+ if (fp) fclose(fp);
+ }
+#endif
+
+ /* Each CPU family now has to pick through what was found and stashed in
+ * `probed', and set the appropriate flag bits in `hw'.
+ */
+#if CPUFAM_ARMEL
+ if (probed.hwcap & HWCAP_VFPv3) hw |= HF_ARM_VFP;
+ if (probed.hwcap & HWCAP_NEON) hw |= HF_ARM_NEON;
+ if (probed.hwcap & HWCAP_VFPD32) hw |= HF_ARM_D32;
+ if (probed.hwcap & HWCAP_VFPv4) hw |= HF_ARM_V4;
+#endif
+
+ /* Store the bitmask of features we probed for everyone to see. */
+ DISPATCH_STORE(hwcaps, hw);
+
+ /* Finally, make a report about the things we found. (Doing this earlier
+ * will pointlessly widen the window in which multiple threads will do the
+ * above auxiliary-vector probing.)
+ */
+#define CAP__DEBUG(feat, tok) \
+ dispatch_debug("check auxv for feature `%s': %s", tok, \
+ hw & HF_##feat ? "available" : "absent");
+ CAPMAP(CAP__DEBUG)
+#undef CAP__DEBUG
+}
+
+/* --- @get_hwcaps@ --- *
+ *
+ * Arguments: ---
+ *
+ * Returns: A mask of hardware capabilities and other features, as probed
+ * from the auxiliary vector.
+ */
+
+static unsigned get_hwcaps(void)
+{
+ unsigned hw;
+
+ DISPATCH_LOAD(hwcaps, hw);
+ if (!(hw & HF_PROBED)) { probe_hwcaps(); DISPATCH_LOAD(hwcaps, hw); }
+ return (hw);
+}
+
+#endif
+
+/*----- External interface ------------------------------------------------*/
+
+/* --- @dispatch_debug@ --- *
+ *
+ * Arguments: @const char *fmt@ = a format string
+ * @...@ = additional arguments
+ *
+ * Returns: ---
+ *
+ * Use: Writes a formatted message to standard error if dispatch
+ * debugging is enabled.
+ */
+
+void dispatch_debug(const char *fmt, ...)
+{
+ va_list ap;
+ const char *e = getenv("CATACOMB_CPUDISPATCH_DEBUG");
+
+ if (e && *e != 'n' && *e != '0') {
+ va_start(ap, fmt);
+ fputs("Catacomb CPUDISPATCH: ", stderr);
+ vfprintf(stderr, fmt, ap);
+ fputc('\n', stderr);
+ va_end(ap);
+ }
+}
+
+/* --- @check_env@ --- *
+ *
+ * Arguments: @const char *ftok@ = feature token
+ *
+ * Returns: Zero if the feature is forced off; positive if it's forced
+ * on; negative if the user hasn't decided.
+ *
+ * Use: Checks the environment variable `CATACOMB_CPUFEAT' for the
+ * feature token @ftok@. The variable, if it exists, should be
+ * a space-separated sequence of `+tok' and `-tok' items. These
+ * tokens may end in `*', which matches any suffix.
+ */
+
+static int IGNORABLE check_env(const char *ftok)
+{
+ const char *p, *q, *pp;
+ int d;
+
+ p = getenv("CATACOMB_CPUFEAT");
+ if (!p) return (-1);
+
+ for (;;) {
+ while (isspace((unsigned char)*p)) p++;
+ if (!*p) return (-1);
+ switch (*p) {
+ case '+': d = +1; p++; break;
+ case '-': d = 0; p++; break;
+ default: d = -1; break;
+ }
+ for (q = p; *q && !isspace((unsigned char)*q); q++);
+ if (d >= 0) {
+ for (pp = ftok; p < q && *pp && *p == *pp; p++, pp++);
+ if ((p == q && !*pp) || (*p == '*' && p + 1 == q)) return (d);
+ }
+ p = q;
+ }
+ return (-1);
+}
+
+/* --- @cpu_feature_p@ --- *
+ *
+ * Arguments: @int feat@ = a @CPUFEAT_...@ code
+ *
+ * Returns: Nonzero if the feature is available.
+ */
+
+static int IGNORABLE
+ feat_debug(const char *ftok, const char *check, int verdict)
+{
+ if (verdict >= 0) {
+ dispatch_debug("feature `%s': %s -> %s", ftok, check,
+ verdict ? "available" : "absent");
+ }
+ return (verdict);
+}
+
+int cpu_feature_p(int feat)
+{
+ int IGNORABLE f;
+ IGNORE(f);
+#define CASE_CPUFEAT(feat, ftok, cond) case CPUFEAT_##feat: \
+ if ((f = feat_debug(ftok, "environment override", \
+ check_env(ftok))) >= 0) \
+ return (f); \
+ else \
+ return (feat_debug(ftok, "runtime probe", cond));
+
+ switch (feat) {
+#if CPUFAM_X86 || CPUFAM_AMD64
+ CASE_CPUFEAT(X86_SSE2, "x86:sse2",
+ xmm_registers_available_p() &&
+ cpuid_features_p(CPUID1D_SSE2, 0));
+ CASE_CPUFEAT(X86_AESNI, "x86:aesni",
+ xmm_registers_available_p() &&
+ cpuid_features_p(CPUID1D_SSE2, CPUID1C_AESNI));
+#endif
+#ifdef CAPMAP
+# define FEATP__CASE(feat, tok) \
+ CASE_CPUFEAT(feat, tok, get_hwcaps() & HF_##feat)
+ CAPMAP(FEATP__CASE)
+#undef FEATP__CASE
+#endif
+ default:
+ dispatch_debug("denying unknown feature %d", feat);
+ return (0);
+ }
+#undef CASE_CPUFEAT
+}
+
+/*----- That's all, folks -------------------------------------------------*/
--- /dev/null
+/* -*-c-*-
+ *
+ * CPU-specific dispatch
+ *
+ * (c) 2015 Straylight/Edgeware
+ */
+
+/*----- Licensing notice --------------------------------------------------*
+ *
+ * This file is part of Catacomb.
+ *
+ * Catacomb is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Library General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * Catacomb is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with Catacomb; if not, write to the Free
+ * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ * MA 02111-1307, USA.
+ */
+
+#ifndef CATACOMB_DISPATCH_H
+#define CATACOMB_DISPATCH_H
+
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+/*----- Header files ------------------------------------------------------*/
+
+#include <mLib/macros.h>
+
+/*----- Macros ------------------------------------------------------------*/
+
+/* --- Atomic data access machinery --- *
+ *
+ * If they're available, use GCC's `__atomic_*' intrinsics. If that doesn't
+ * work and we're using one of a small number of processors I'm sure won't
+ * mind, then just stick with simple memory access. Otherwise turn
+ * dispatching off, because it probably isn't thread-safe.
+ */
+
+#if GCC_VERSION_P(4, 7)
+# define CPU_DISPATCH_P 1
+# define DISPATCH_LOAD(g, v) \
+ ((v) = __atomic_load_n(&(g), __ATOMIC_RELAXED))
+# define DISPATCH_STORE(g, v) \
+ (__atomic_store_n(&(g), (v), __ATOMIC_RELAXED))
+#elif defined(__i386__) || defined(__amd64__) || \
+ defined(__arm__) || defined(__aarch64__) || \
+ defined(__mips__)
+# define CPU_DISPATCH_P 1
+# define DISPATCH_LOAD(g, v) ((v) = (g))
+# define DISPATCH_STORE(g, v) ((g) = (v))
+#endif
+
+/* --- A simple hack --- */
+
+#ifndef EMPTY
+# define EMPTY
+#endif
+
+/* --- @CPU_DISPATCH@ --- *
+ *
+ * Arguments: @stcls@ = storage class for the main @ext@ function
+ * (typically either @static@ or @EMPTY@)
+ * @rtn@ = prefix for tail-calling a function of the appropriate
+ * type (either @(void)@ or @return@)
+ * @ret@ = return type for the function
+ * @ext@ = name for the main function (other names are derived
+ * from this)
+ * @argdecls@ = parenthesis-enclosed list of argument types
+ * @args@ = parenthesis-enclosed list of argument names only
+ * @pick@ = function to select appropriate implementation
+ * @dflt@ = fallback implementation
+ *
+ * Use: Main machinery for CPU-specific dispatching.
+ *
+ * The macro defines a function
+ *
+ * @stcls ret ext argdecls@
+ *
+ * The first time @ext@ is called, it will invoke @pick@ to
+ * select and return a pointer to an appropriate
+ * implementation for the runtime environment. Subsequent calls
+ * to @ext@ will (usually) call this preferred implementation
+ * directly.
+ *
+ * Some target platforms may not be able to establish the
+ * necessary function pointer in a threadsafe way. On such
+ * platforms, the dispatch machinery is disabled and @ext@ will
+ * simply call @dflt@.
+ *
+ * Some additional declarations are made. As a convenience,
+ * @ext__functype@ is the function type of @ext@. Declarations
+ * are made for @pick@ and @dflt@, as @static@ functions.
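+ *
+ * As a concrete example, the ChaCha hookup later in this series says
+ *
+ * CPU_DISPATCH(static, (void), void, core,
+ * (unsigned r, const chacha_matrix src,
+ * chacha_matrix dest),
+ * (r, src, dest), pick_core, simple_core)
+ *
+ * which defines a static function @core@ that chooses between
+ * @simple_core@ and whatever @pick_core@ selects at runtime.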
+ */
+
+#ifdef CPU_DISPATCH_P
+
+#define CPU_DISPATCH(stcls, rtn, ret, ext, argdecls, args, pick, dflt) \
+ \
+typedef ret ext##__functype argdecls; \
+static ret dflt argdecls; \
+static ret ext##__dispatch argdecls; \
+static ext##__functype *pick(void); \
+static ext##__functype *ext##__ptr = ext##__dispatch; \
+ \
+static ret ext##__dispatch argdecls \
+{ \
+ ext##__functype *f = pick(); \
+ DISPATCH_STORE(ext##__ptr, f); \
+ rtn f args; \
+} \
+ \
+stcls ret ext argdecls \
+{ \
+ ext##__functype *f; \
+ DISPATCH_LOAD(ext##__ptr, f); \
+ rtn f args; \
+}
+
+#else
+
+#define CPU_DISPATCH(stcls, rtn, ret, ext, argdecls, args, pick, dflt) \
+ \
+typedef ret ext##__functype argdecls; \
+static ret dflt argdecls; \
+static ext##__functype *pick(void) IGNORABLE; \
+ \
+stcls ret ext argdecls { rtn dflt args; }
+
+#endif
+
+/* --- Some macros for producing useful debugging --- */
+
+#define DISPATCH_PICK_COND(what, func, cond) do { \
+ if (cond) { \
+ dispatch_debug("picked `%s' for `%s'", #func, #what); \
+ return (func); \
+ } \
+} while (0)
+#define DISPATCH_PICK_FALLBACK(what, func) do { \
+ dispatch_debug("using default `%s'", #what); \
+ return (func); \
+} while (0)
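+
+/* A typical `pick' function (cf. `pick_core' in the ChaCha code later
+ * in this series) simply chains these macros:
+ *
+ * DISPATCH_PICK_COND(chacha_core, chacha_core_x86ish_sse2,
+ * cpu_feature_p(CPUFEAT_X86_SSE2));
+ * DISPATCH_PICK_FALLBACK(chacha_core, simple_core);
+ */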
+
+/*----- Functions provided ------------------------------------------------*/
+
+/* --- @dispatch_debug@ --- *
+ *
+ * Arguments: @const char *fmt@ = a format string
+ * @...@ = additional arguments
+ *
+ * Returns: ---
+ *
+ * Use: Writes a formatted message to standard error if dispatch
+ * debugging is enabled.
+ */
+
+extern void dispatch_debug(const char */*fmt*/, ...);
+
+/* --- @cpu_feature_p@ --- *
+ *
+ * Arguments: @int feat@ = a @CPUFEAT_...@ code
+ *
+ * Returns: Nonzero if the feature is available.
+ */
+
+enum {
+ CPUFEAT_X86_SSE2, /* Streaming SIMD Extensions 2 */
+ CPUFEAT_X86_AESNI, /* AES Native Instructions */
+ CPUFEAT_ARM_VFP, /* VFP floating-point (v3 or v4) */
+ CPUFEAT_ARM_NEON, /* Advanced SIMD (v1 or v2) */
+ CPUFEAT_ARM_V4, /* VFPv4 and/or SIMD v2 */
+ CPUFEAT_ARM_D32 /* 32 double registers, not 16 */
+};
+
+extern int cpu_feature_p(int /*feat*/);
+
+/*----- That's all, folks -------------------------------------------------*/
+
+#ifdef __cplusplus
+ }
+#endif
+
+#endif
AC_CONFIG_SRCDIR([catacomb.pc.in])
AC_CONFIG_AUX_DIR([config])
AM_INIT_AUTOMAKE([foreign parallel-tests color-tests subdir-objects])
+AC_CANONICAL_HOST
mdw_SILENT_RULES
AC_PROG_CC
AM_PROG_LIBTOOL
mdw_LIBTOOL_VERSION_INFO
+AM_PROG_AS
+
AC_PROG_YACC
AC_SUBST(AM_CFLAGS)
+dnl--------------------------------------------------------------------------
+dnl Host-specific configuration.
+
+AC_MSG_CHECKING([CPU family and ABI])
+
+dnl The table of CPU families and ABIs which we might support. Support is
+dnl not uniform: each dispatched function might or might not have an
+dnl implementation for any particular CPU/ABI combination.
+AC_DEFUN([catacomb_CPU_FAMILIES],
+ [$1([i[[3-6]]86,cygwin], [x86], [win])
+ $1([i[[3-6]]86,*], [x86], [sysv])
+ $1([x86_64,cygwin], [amd64], [win])
+ $1([x86_64,*], [amd64], [sysv])
+ $1([armv*,*-gnueabi | armv*,*-gnueabihf], [armel], [gnueabi])])
+
+dnl A utility to clear the `seen' flags, used so as to process each CPU or
+dnl ABI once.
+m4_define([catacomb_CLEAR_FLAGS],
+[m4_ifdef([catacomb_seen_cpu/$2],
+ [m4_undefine([catacomb_seen_cpu/$2])])dnl
+m4_ifdef([catacomb_seen_abi/$3],
+ [m4_undefine([catacomb_seen_abi/$3])])])
+
+dnl Identify the current host.
+case $host_cpu,$host_os in
+ m4_define([catacomb_CPU_CASE],
+ [$1) CPUFAM=$2 ABI=$3 ;;
+])
+ catacomb_CPU_FAMILIES([catacomb_CPU_CASE])
+ *) CPUFAM=nil ABI=nil ;;
+esac
+
+dnl Figure out the current CPU.
+catacomb_CPU_FAMILIES([catacomb_CLEAR_FLAGS])
+case $CPUFAM in
+ m4_define([catacomb_DEFINE_CPU],
+ [m4_ifdef([catacomb_seen_cpu/$2], [],
+ [$2)
+ AC_DEFINE([CPUFAM_]m4_translit([$2], [a-z], [A-Z]), [1],
+ [Define if host CPU family is \`$2\'.])
+ ;;m4_define([catacomb_seen_cpu/$2], [t])])])
+ catacomb_CPU_FAMILIES([catacomb_DEFINE_CPU])
+ nil) ;;
+ *) AC_MSG_ERROR([BUG: unexpected cpufam \`$CPUFAM']) ;;
+esac
+AC_SUBST([CPUFAM])
+
+dnl Figure out the current ABI.
+catacomb_CPU_FAMILIES([catacomb_CLEAR_FLAGS])
+case $ABI in
+ m4_define([catacomb_DEFINE_ABI],
+ [m4_ifdef([catacomb_seen_abi/$3], [],
+ [$3)
+ AC_DEFINE([ABI_]m4_translit([$3], [a-z], [A-Z]), [1],
+ [Define if host ABI variant is \`$3\'.])
+ ;;m4_define([catacomb_seen_abi/$3], [t])])])
+ catacomb_CPU_FAMILIES([catacomb_DEFINE_ABI])
+ nil) ;;
+ *) AC_MSG_ERROR([BUG: unexpected ABI \`$ABI']) ;;
+esac
+AC_SUBST([ABI])
+
+dnl Establish Automake conditions for things.
+catacomb_CPU_FAMILIES([catacomb_CLEAR_FLAGS])
+m4_define([catacomb_COND_CPU],
+[m4_define([_CPU], m4_translit([$2], [a-z], [A-Z]))
+m4_define([_ABI], m4_translit([$3], [a-z], [A-Z]))
+AM_CONDITIONAL([CPUABI_]_CPU[_]_ABI, [test x$CPUFAM/$ABI = x$2/$3])
+m4_ifdef([catacomb_seen_cpu/$2], [],
+[AM_CONDITIONAL([CPUFAM_]_CPU, [test x$CPUFAM = x$2])dnl
+m4_define([catacomb_seen_cpu/$2], [t])])
+m4_ifdef([catacomb_seen_abi/$3], [],
+[AM_CONDITIONAL([ABI_]_ABI, [test x$ABI = x$3])dnl
+m4_define([catacomb_seen_abi/$3], [t])])])
+catacomb_CPU_FAMILIES([catacomb_COND_CPU])
+AM_CONDITIONAL([KNOWN_CPUFAM], [test x$CPUFAM != xnil])
+
+dnl Report on what we found.
+case $CPUFAM in
+ nil) AC_MSG_RESULT([not supported]) ;;
+ *) AC_MSG_RESULT([$CPUFAM/$ABI]) ;;
+esac
+
+dnl Some equipment wanted for checking CPU features at runtime.
+AC_CHECK_HEADERS([asm/hwcap.h])
+AC_CHECK_HEADERS([sys/auxv.h])
+AC_CHECK_HEADERS([linux/auxvec.h])
+AC_CHECK_FUNCS([getauxval])
+
dnl--------------------------------------------------------------------------
dnl C programming environment.
#! /usr/bin/make -f
%:; dh $@ --parallel -Bdebian/build
+
+override_dh_auto_test:
+ dh_auto_test --parallel -Bdebian/build
+ CATACOMB_CPUFEAT="-*" dh_auto_test --parallel -Bdebian/build
## Daemen and Rijmen's `Rijndael' block cipher, selected as AES.
BLKCS += rijndael rijndael192 rijndael256
libsymm_la_SOURCES += rijndael-base.h rijndael-base.c
+if CPUFAM_X86
+libsymm_la_SOURCES += rijndael-x86ish-aesni.S
+endif
+if CPUFAM_AMD64
+libsymm_la_SOURCES += rijndael-x86ish-aesni.S
+endif
nodist_libsymm_la_SOURCES += ../precomp/symm/rijndael-tab.c
PRECOMPS += $(precomp)/symm/rijndael-tab.c
PRECOMP_PROGS += rijndael-mktab
EXTRA_DIST += salsa20-tvconv
pkginclude_HEADERS += salsa20.h salsa20-core.h
libsymm_la_SOURCES += salsa20.c
+if CPUFAM_X86
+libsymm_la_SOURCES += salsa20-x86ish-sse2.S
+endif
+if CPUFAM_AMD64
+libsymm_la_SOURCES += salsa20-x86ish-sse2.S
+endif
TESTS += salsa20.t$(EXEEXT)
ALL_CIPHERS += salsa20 salsa2012 salsa208
ALL_CIPHERS += xsalsa20 xsalsa2012 xsalsa208
## Bernstein's `ChaCha' stream cipher.
pkginclude_HEADERS += chacha.h chacha-core.h
libsymm_la_SOURCES += chacha.c
+if CPUFAM_X86
+libsymm_la_SOURCES += chacha-x86ish-sse2.S
+endif
+if CPUFAM_AMD64
+libsymm_la_SOURCES += chacha-x86ish-sse2.S
+endif
TESTS += chacha.t$(EXEEXT)
EXTRA_DIST += t/chacha
ALL_CIPHERS += chacha20 chacha12 chacha8
/* The ChaCha feedforward step, used at the end of the core function. Here,
* @y@ contains the original input matrix; @z@ contains the final one, and is
- * updated. This is the same as Salsa20.
+ * updated. This is the same as Salsa20, only without the final permutation.
*/
-#define CHACHA_FFWD(z, y) SALSA20_FFWD(z, y)
+#define CHACHA_FFWD(z, y) do { \
+ int _i; \
+ for (_i = 0; _i < 16; _i++) (z)[_i] += (y)[_i]; \
+} while (0)
/* Various numbers of rounds, unrolled. Read from @y@, and write to @z@. */
#define CHACHA_4R(z, y) \
--- /dev/null
+/// -*- mode: asm; asm-comment-char: ?/ -*-
+///
+/// Fancy SIMD implementation of ChaCha
+///
+/// (c) 2015 Straylight/Edgeware
+///
+
+///----- Licensing notice ---------------------------------------------------
+///
+/// This file is part of Catacomb.
+///
+/// Catacomb is free software; you can redistribute it and/or modify
+/// it under the terms of the GNU Library General Public License as
+/// published by the Free Software Foundation; either version 2 of the
+/// License, or (at your option) any later version.
+///
+/// Catacomb is distributed in the hope that it will be useful,
+/// but WITHOUT ANY WARRANTY; without even the implied warranty of
+/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+/// GNU Library General Public License for more details.
+///
+/// You should have received a copy of the GNU Library General Public
+/// License along with Catacomb; if not, write to the Free
+/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+/// MA 02111-1307, USA.
+
+///--------------------------------------------------------------------------
+/// External definitions.
+
+#include "config.h"
+#include "asm-common.h"
+
+///--------------------------------------------------------------------------
+/// Local utilities.
+
+// Magic constants for shuffling.
+#define ROTL 0x93
+#define ROT2 0x4e
+#define ROTR 0x39
+
+///--------------------------------------------------------------------------
+/// Main code.
+
+ .arch pentium4
+ .section .text
+
+FUNC(chacha_core_x86ish_sse2)
+
+ // Initial setup.
+
+#if CPUFAM_X86
+ // Arguments come in on the stack, and will need to be collected. We
+ // can get away with just the scratch registers for integer work,
+ // but we'll run out of XMM registers and will need some properly
+ // aligned space which we'll steal from the stack. I don't trust the
+ // stack pointer's alignment, so I'll have to mask the stack pointer,
+ // which in turn means I'll need to keep track of the old value.
+ // Hence I'm making a full i386-style stack frame here.
+ //
+ // The Windows and SysV ABIs are sufficiently similar that we don't
+ // need to worry about the differences here.
+
+# define NR ecx
+# define IN eax
+# define OUT edx
+# define SAVE0 xmm5
+# define SAVE1 xmm6
+# define SAVE2 xmm7
+# define SAVE3 [esp]
+
+ push ebp
+ mov ebp, esp
+ sub esp, 16
+ mov IN, [ebp + 12]
+ mov OUT, [ebp + 16]
+ and esp, ~15
+ mov NR, [ebp + 8]
+#endif
+
+#if CPUFAM_AMD64 && ABI_SYSV
+ // This is nice. We have plenty of XMM registers, and the arguments
+ // are in useful places. There's no need to spill anything and we
+ // can just get on with the code.
+
+# define NR edi
+# define IN rsi
+# define OUT rdx
+# define SAVE0 xmm5
+# define SAVE1 xmm6
+# define SAVE2 xmm7
+# define SAVE3 xmm8
+#endif
+
+#if CPUFAM_AMD64 && ABI_WIN
+ // Arguments come in registers, but they're different between Windows
+ // and everyone else (and everyone else is saner).
+ //
+ // The Windows ABI insists that we preserve some of the XMM
+ // registers, but we want more than we can use as scratch space. We
+ // only need to save a copy of the input for the feedforward at the
+ // end, so we might as well use memory rather than spill extra
+ // registers. (We need an extra 8 bytes to align the stack.)
+
+# define NR ecx
+# define IN rdx
+# define OUT r8
+# define SAVE0 xmm5
+# define SAVE1 [rsp + 0]
+# define SAVE2 [rsp + 16]
+# define SAVE3 [rsp + 32]
+
+ sub rsp, 48 + 8
+#endif
+
+ // First job is to slurp the matrix into XMM registers. Be careful:
+ // the input matrix isn't likely to be properly aligned.
+ //
+ // [ 0 1 2 3] (a, xmm0)
+ // [ 4 5 6 7] (b, xmm1)
+ // [ 8 9 10 11] (c, xmm2)
+ // [12 13 14 15] (d, xmm3)
+ movdqu xmm0, [IN + 0]
+ movdqu xmm1, [IN + 16]
+ movdqu xmm2, [IN + 32]
+ movdqu xmm3, [IN + 48]
+
+ // Take a copy for later. This one is aligned properly, by
+ // construction.
+ movdqa SAVE0, xmm0
+ movdqa SAVE1, xmm1
+ movdqa SAVE2, xmm2
+ movdqa SAVE3, xmm3
+
+loop:
+ // Apply a column quarterround to each of the columns simultaneously.
+ // Alas, there doesn't seem to be a packed doubleword rotate, so we
+ // have to synthesize it.
+
+ // a += b; d ^= a; d <<<= 16
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm4, xmm3
+ pslld xmm3, 16
+ psrld xmm4, 16
+ por xmm3, xmm4
+
+ // c += d; b ^= c; b <<<= 12
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm4, xmm1
+ pslld xmm1, 12
+ psrld xmm4, 20
+ por xmm1, xmm4
+
+ // a += b; d ^= a; d <<<= 8
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm4, xmm3
+ pslld xmm3, 8
+ psrld xmm4, 24
+ por xmm3, xmm4
+
+ // c += d; b ^= c; b <<<= 7
+ paddd xmm2, xmm3
+ pshufd xmm3, xmm3, ROTL
+ pxor xmm1, xmm2
+ pshufd xmm2, xmm2, ROT2
+ movdqa xmm4, xmm1
+ pslld xmm1, 7
+ psrld xmm4, 25
+ por xmm1, xmm4
+
+ // The not-quite-transpose conveniently only involves reordering
+ // elements of individual rows, which can be done quite easily. It
+ // doesn't involve any movement of elements between rows, or even
+ // renaming of the rows.
+ //
+ // [ 0 1 2 3] [ 0 1 2 3] (a, xmm0)
+ // [ 4 5 6 7] --> [ 5 6 7 4] (b, xmm1)
+ // [ 8 9 10 11] [10 11 8 9] (c, xmm2)
+ // [12 13 14 15] [15 12 13 14] (d, xmm3)
+ //
+ // The shuffles have quite high latency, so they've mostly been
+ // pushed upwards. The remaining one can't be moved, though.
+ pshufd xmm1, xmm1, ROTR
+
+ // Apply the diagonal quarterround to each of the diagonals
+ // simultaneously.
+
+ // a += b; d ^= a; d <<<= 16
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm4, xmm3
+ pslld xmm3, 16
+ psrld xmm4, 16
+ por xmm3, xmm4
+
+ // c += d; b ^= c; b <<<= 12
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm4, xmm1
+ pslld xmm1, 12
+ psrld xmm4, 20
+ por xmm1, xmm4
+
+ // a += b; d ^= a; d <<<= 8
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm4, xmm3
+ pslld xmm3, 8
+ psrld xmm4, 24
+ por xmm3, xmm4
+
+ // c += d; b ^= c; b <<<= 7
+ paddd xmm2, xmm3
+ pshufd xmm3, xmm3, ROTR
+ pxor xmm1, xmm2
+ pshufd xmm2, xmm2, ROT2
+ movdqa xmm4, xmm1
+ pslld xmm1, 7
+ psrld xmm4, 25
+ por xmm1, xmm4
+
+ // Finally, finish off undoing the transpose, and we're done for this
+ // doubleround. Again, most of this was done above so we don't have
+ // to wait for the shuffles.
+ pshufd xmm1, xmm1, ROTL
+
+ // Decrement the loop counter and see if we should go round again.
+ sub NR, 2
+ ja loop
+
+ // Almost there. Firstly, the feedforward addition.
+ paddd xmm0, SAVE0
+ paddd xmm1, SAVE1
+ paddd xmm2, SAVE2
+ paddd xmm3, SAVE3
+
+ // And now we write out the result. This one won't be aligned
+ // either.
+ movdqu [OUT + 0], xmm0
+ movdqu [OUT + 16], xmm1
+ movdqu [OUT + 32], xmm2
+ movdqu [OUT + 48], xmm3
+
+ // Tidy things up.
+#if CPUFAM_X86
+ mov esp, ebp
+ pop ebp
+#endif
+#if CPUFAM_AMD64 && ABI_WIN
+ add rsp, 48 + 8
+#endif
+
+ // And with that, we're done.
+ ret
+
+ENDFUNC
+
+///----- That's all, folks --------------------------------------------------
/*----- Header files ------------------------------------------------------*/
+#include "config.h"
+
#include <stdarg.h>
#include <mLib/bits.h>
#include "arena.h"
#include "chacha.h"
#include "chacha-core.h"
+#include "dispatch.h"
#include "gcipher.h"
#include "grand.h"
#include "keysz.h"
* the feedforward step.
*/
-static void core(unsigned r, const chacha_matrix src, chacha_matrix dest)
+CPU_DISPATCH(static, (void),
+ void, core, (unsigned r, const chacha_matrix src,
+ chacha_matrix dest),
+ (r, src, dest),
+ pick_core, simple_core)
+
+static void simple_core(unsigned r, const chacha_matrix src,
+ chacha_matrix dest)
{ CHACHA_nR(dest, src, r); CHACHA_FFWD(dest, src); }
+#if CPUFAM_X86 || CPUFAM_AMD64
+extern core__functype chacha_core_x86ish_sse2;
+#endif
+
+static core__functype *pick_core(void)
+{
+#if CPUFAM_X86 || CPUFAM_AMD64
+ DISPATCH_PICK_COND(chacha_core, chacha_core_x86ish_sse2,
+ cpu_feature_p(CPUFEAT_X86_SSE2));
+#endif
+ DISPATCH_PICK_FALLBACK(chacha_core, simple_core);
+}
+
/* --- @populate@ --- *
*
* Arguments: @chacha_matrix a@ = a matrix to fill in
/*----- Header files ------------------------------------------------------*/
+#include "config.h"
+
#include <assert.h>
#include <stdio.h>
#include <mLib/bits.h>
#include "blkc.h"
+#include "dispatch.h"
#include "gcipher.h"
#include "rijndael.h"
#include "rijndael-base.h"
* Use: Low-level key-scheduling.
*/
-void rijndael_setup(rijndael_ctx *k, unsigned nb, const void *buf, size_t sz)
+static void simple_setup(rijndael_ctx *k, unsigned nb,
+ const void *buf, unsigned nk)
{
- unsigned nk, nr, nw;
+ unsigned nr = k->nr, nw;
unsigned i, j, jj;
const octet *p;
uint32 ww;
- /* --- Sort out the key size --- */
-
- KSZ_ASSERT(rijndael, sz);
- nk = sz / 4;
-
- /* --- Select the number of rounds --- */
-
- nr = (nk > nb ? nk : nb) + 6;
- if (nr < 10)
- nr = 10;
- k->nr = nr;
-
/* --- Fetch the first key words out --- */
p = buf;
k->wi[i] = k->w[j + jj++];
}
+CPU_DISPATCH(static, EMPTY, void, setup, (rijndael_ctx *k, unsigned nb,
+ const void *buf, unsigned nk),
+ (k, nb, buf, nk), pick_setup, simple_setup)
+
+#if CPUFAM_X86 || CPUFAM_AMD64
+extern setup__functype rijndael_setup_x86ish_aesni;
+#endif
+
+static setup__functype *pick_setup(void)
+{
+#if CPUFAM_X86 || CPUFAM_AMD64
+ DISPATCH_PICK_COND(rijndael_setup, rijndael_setup_x86ish_aesni,
+ cpu_feature_p(CPUFEAT_X86_AESNI));
+#endif
+ DISPATCH_PICK_FALLBACK(rijndael_setup, simple_setup);
+}
+
+void rijndael_setup(rijndael_ctx *k, unsigned nb, const void *buf, size_t sz)
+{
+ unsigned nk, nr;
+
+ /* --- Sort out the key size --- */
+
+ KSZ_ASSERT(rijndael, sz);
+ nk = sz / 4;
+
+ /* --- Select the number of rounds --- */
+
+ nr = (nk > nb ? nk : nb) + 6;
+ if (nr < 10)
+ nr = 10;
+ k->nr = nr;
+
+ /* --- Do the main setup --- */
+
+ setup(k, nb, buf, nk);
+}
+
/*----- That's all, folks -------------------------------------------------*/
--- /dev/null
+/// -*- mode: asm; asm-comment-char: ?/ -*-
+///
+/// AESNI-based implementation of Rijndael
+///
+/// (c) 2015 Straylight/Edgeware
+///
+
+///----- Licensing notice ---------------------------------------------------
+///
+/// This file is part of Catacomb.
+///
+/// Catacomb is free software; you can redistribute it and/or modify
+/// it under the terms of the GNU Library General Public License as
+/// published by the Free Software Foundation; either version 2 of the
+/// License, or (at your option) any later version.
+///
+/// Catacomb is distributed in the hope that it will be useful,
+/// but WITHOUT ANY WARRANTY; without even the implied warranty of
+/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+/// GNU Library General Public License for more details.
+///
+/// You should have received a copy of the GNU Library General Public
+/// License along with Catacomb; if not, write to the Free
+/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+/// MA 02111-1307, USA.
+
+///--------------------------------------------------------------------------
+/// External definitions.
+
+#include "config.h"
+#include "asm-common.h"
+
+///--------------------------------------------------------------------------
+/// External symbols.
+
+ .globl F(abort)
+ .globl F(rijndael_rcon)
+
+///--------------------------------------------------------------------------
+/// Local utilities.
+
+// Magic constants for shuffling.
+#define ROTL 0x93
+#define ROT2 0x4e
+#define ROTR 0x39
+
+///--------------------------------------------------------------------------
+/// Main code.
+
+ .arch .aes
+ .section .text
+
+/// The AESNI instructions implement a little-endian version of AES, but
+/// Catacomb's internal interface presents as big-endian so as to work better
+/// with things like GCM. We therefore maintain the round keys in
+/// little-endian form, and have to end-swap blocks in and out.
+///
+/// For added amusement, the AESNI instructions don't implement the
+/// larger-block versions of Rijndael, so we have to end-swap the keys if
+/// we're preparing for one of those.
+
+ // Useful constants.
+ .equ maxrounds, 16 // maximum number of rounds
+ .equ maxblksz, 32 // maximum block size, in bytes
+ .equ kbufsz, maxblksz*(maxrounds + 1) // size of a key-schedule buffer
+
+ // Context structure.
+ .equ nr, 0 // number of rounds
+ .equ w, nr + 4 // encryption key words
+ .equ wi, w + kbufsz // decryption key words
+
+///--------------------------------------------------------------------------
+/// Key setup.
+
+FUNC(rijndael_setup_x86ish_aesni)
+
+#if CPUFAM_X86
+ // Arguments are on the stack. We'll need to stack the caller's
+ // register variables, but we'll manage.
+
+# define CTX ebp // context pointer
+# define BLKSZ [esp + 24] // block size
+
+# define SI esi // source pointer
+# define DI edi // destination pointer
+
+# define KSZ ebx // key size
+# define KSZo ebx // ... as address offset
+# define NKW edx // total number of key words
+# define NKW_NEEDS_REFRESH 1 // ... needs recalculating
+# define RCON ecx // round constants table
+# define LIM edx // limit pointer
+# define LIMn edx // ... as integer offset from base
+
+# define NR ecx // number of rounds
+# define LRK eax // distance to last key
+# define LRKo eax // ... as address offset
+# define BLKOFF edx // block size in bytes
+# define BLKOFFo edx // ... as address offset
+
+ // Stack the caller's registers.
+ push ebp
+ push ebx
+ push esi
+ push edi
+
+ // Set up our own variables.
+ mov CTX, [esp + 20] // context base pointer
+ mov SI, [esp + 28] // key material
+ mov KSZ, [esp + 32] // key size, in words
+#endif
+
+#if CPUFAM_AMD64 && ABI_SYSV
+ // Arguments are in registers. We have plenty, but, to be honest,
+ // the initial register allocation is a bit annoying.
+
+# define CTX r8 // context pointer
+# define BLKSZ r9d // block size
+
+# define SI rsi // source pointer
+# define DI rdi // destination pointer
+
+# define KSZ edx // key size
+# define KSZo rdx // ... as address offset
+# define NKW r10d // total number of key words
+# define RCON rdi // round constants table
+# define LIM rcx // limit pointer
+# define LIMn ecx // ... as integer offset from base
+
+# define NR ecx // number of rounds
+# define LRK eax // distance to last key
+# define LRKo rax // ... as address offset
+# define BLKOFF r9d // block size in bytes
+# define BLKOFFo r9 // ... as address offset
+
+ // Move arguments to more useful places.
+ mov CTX, rdi // context base pointer
+ mov BLKSZ, esi // block size in words
+ mov SI, rdx // key material
+ mov KSZ, ecx // key size, in words
+#endif
+
+#if CPUFAM_AMD64 && ABI_WIN
+ // Arguments are in different registers, and they're a little tight.
+
+# define CTX r8 // context pointer
+# define BLKSZ edx // block size
+
+# define SI rsi // source pointer
+# define DI rdi // destination pointer
+
+# define KSZ r9d // key size
+# define KSZo r9 // ... as address offset
+# define NKW r10d // total number of key words
+# define RCON rdi // round constants table
+# define LIM rcx // limit pointer
+# define LIMn ecx // ... as integer offset from base
+
+# define NR ecx // number of rounds
+# define LRK eax // distance to last key
+# define LRKo rax // ... as address offset
+# define BLKOFF edx // block size in bytes
+# define BLKOFFo rdx // ... as address offset
+
+ // We'll need the index registers, which belong to the caller in this
+ // ABI.
+ push rsi
+ push rdi
+
+ // Move arguments to more useful places.
+ mov SI, r8 // key material
+ mov CTX, rcx // context base pointer
+#endif
+
+ // The initial round key material is taken directly from the input
+ // key, so copy it over.
+#if CPUFAM_AMD64 && ABI_SYSV
+ // We've been lucky. We already have a copy of the context pointer
+ // in rdi, and the key size in ecx.
+ add DI, w
+#else
+ lea DI, [CTX + w]
+ mov ecx, KSZ
+#endif
+ rep movsd
+
+ // Find out other useful things.
+ mov NKW, [CTX + nr] // number of rounds
+ add NKW, 1
+ imul NKW, BLKSZ // total key size in words
+#if !NKW_NEEDS_REFRESH
+ // If we can't keep NKW for later, then we use the same register for
+ // it and LIM, so this move is unnecessary.
+ mov LIMn, NKW
+#endif
+ sub LIMn, KSZ // offset by the key size
+
+ // Find the round constants.
+ ldgot ecx
+ leaext RCON, rijndael_rcon, ecx
+
+ // Prepare for the main loop.
+ lea SI, [CTX + w]
+ mov eax, [SI + 4*KSZo - 4] // most recent key word
+ lea LIM, [SI + 4*LIM] // limit, offset by one key expansion
+
+ // Main key expansion loop. The first word of each key-length chunk
+ // needs special treatment.
+ //
+ // This is rather tedious because the Intel `AESKEYGENASSIST'
+ // instruction is very strangely shaped. Firstly, it wants to
+ // operate on vast SSE registers, even though we're data-blocked from
+ // doing more than one operation at a time unless we're doing two key
+ // schedules simultaneously -- and even then we can't do more than
+ // two, because the instruction ignores two of its input words
+ // entirely, and produces two different outputs for each of the other
+ // two. And secondly it insists on taking the magic round constant
+ // as an immediate, so it's kind of annoying if you're not
+ // open-coding the whole thing. It's much easier to leave that as
+ // zero and XOR in the round constant by hand.
+9: movd xmm0, eax
+ pshufd xmm0, xmm0, ROTR
+ aeskeygenassist xmm1, xmm0, 0
+ pshufd xmm1, xmm1, ROTL
+ movd eax, xmm1
+ xor eax, [SI]
+ xor al, [RCON]
+ inc RCON
+ mov [SI + 4*KSZo], eax
+ add SI, 4
+ cmp SI, LIM
+ jae 8f
+
+ // The next three words are simple...
+ xor eax, [SI]
+ mov [SI + 4*KSZo], eax
+ add SI, 4
+ cmp SI, LIM
+ jae 8f
+
+ // (Word 2...)
+ xor eax, [SI]
+ mov [SI + 4*KSZo], eax
+ add SI, 4
+ cmp SI, LIM
+ jae 8f
+
+ // (Word 3...)
+ xor eax, [SI]
+ mov [SI + 4*KSZo], eax
+ add SI, 4
+ cmp SI, LIM
+ jae 8f
+
+ // Word 4. If the key is /more/ than 6 words long, then we must
+ // apply a substitution here.
+ cmp KSZ, 5
+ jb 9b
+ cmp KSZ, 7
+ jb 0f
+ movd xmm0, eax
+ pshufd xmm0, xmm0, ROTL
+ aeskeygenassist xmm1, xmm0, 0
+ movd eax, xmm1
+0: xor eax, [SI]
+ mov [SI + 4*KSZo], eax
+ add SI, 4
+ cmp SI, LIM
+ jae 8f
+
+ // (Word 5...)
+ cmp KSZ, 6
+ jb 9b
+ xor eax, [SI]
+ mov [SI + 4*KSZo], eax
+ add SI, 4
+ cmp SI, LIM
+ jae 8f
+
+ // (Word 6...)
+ cmp KSZ, 7
+ jb 9b
+ xor eax, [SI]
+ mov [SI + 4*KSZo], eax
+ add SI, 4
+ cmp SI, LIM
+ jae 8f
+
+ // (Word 7...)
+ cmp KSZ, 8
+ jb 9b
+ xor eax, [SI]
+ mov [SI + 4*KSZo], eax
+ add SI, 4
+ cmp SI, LIM
+ jae 8f
+
+ // Must be done by now.
+ jmp 9b
+
+ // Next job is to construct the decryption keys. The keys for the
+ // first and last rounds don't need to be mangled, but the remaining
+ // ones do -- and they all need to be reordered too.
+ //
+ // The plan of action, then, is to copy the final encryption round's
+ // keys into place first, then to do each of the intermediate rounds
+ // in reverse order, and finally do the first round.
+ //
+ // Do all of the heavy lifting with SSE registers. The order we're
+ // doing this in means that it's OK if we read or write too much, and
+ // there's easily enough buffer space for the over-enthusiastic reads
+ // and writes because the context has space for 32-byte blocks, which
+ // is our maximum and an exact fit for two SSE registers.
+8: mov NR, [CTX + nr] // number of rounds
+#if NKW_NEEDS_REFRESH
+ mov BLKOFF, BLKSZ
+ mov LRK, NR
+ imul LRK, BLKOFF
+#else
+ // If we retain NKW, then BLKSZ and BLKOFF are the same register
+ // because we won't need the former again.
+ mov LRK, NKW
+ sub LRK, BLKSZ
+#endif
+ lea DI, [CTX + wi]
+ lea SI, [CTX + w + 4*LRKo] // last round's keys
+ shl BLKOFF, 2 // block size (in bytes now)
+
+ // Copy the last encryption round's keys.
+ movdqu xmm0, [SI]
+ movdqu [DI], xmm0
+ cmp BLKOFF, 16
+ jbe 9f
+ movdqu xmm0, [SI + 16]
+ movdqu [DI + 16], xmm0
+
+ // Update the loop variables and stop if we've finished.
+9: add DI, BLKOFFo
+ sub SI, BLKOFFo
+ sub NR, 1
+ jbe 0f
+
+ // Do another middle round's keys...
+ movdqu xmm0, [SI]
+ aesimc xmm0, xmm0
+ movdqu [DI], xmm0
+ cmp BLKOFF, 16
+ jbe 9b
+ movdqu xmm0, [SI + 16]
+ aesimc xmm0, xmm0
+ movdqu [DI + 16], xmm0
+ jmp 9b
+
+ // Finally do the first encryption round.
+0: movdqu xmm0, [SI]
+ movdqu [DI], xmm0
+ cmp BLKOFF, 16
+ jbe 0f
+ movdqu xmm0, [SI + 16]
+ movdqu [DI + 16], xmm0
+
+ // If the block size is not exactly four words then we must end-swap
+ // everything. We can use fancy SSE toys for this.
+0: cmp BLKOFF, 16
+ je 0f
+
+ // Find the byte-reordering table.
+ ldgot ecx
+ movdqa xmm5, [INTADDR(endswap_tab, ecx)]
+
+#if NKW_NEEDS_REFRESH
+ // Calculate the number of subkey words again. (It's a good job
+ // we've got a fast multiplier.)
+ mov NKW, [CTX + nr]
+ add NKW, 1
+ imul NKW, BLKSZ
+#endif
+
+ // End-swap the encryption keys.
+ mov ecx, NKW
+ lea SI, [CTX + w]
+ call endswap_block
+
+ // And the decryption keys.
+ mov ecx, NKW
+ lea SI, [CTX + wi]
+ call endswap_block
+
+0: // All done.
+#if CPUFAM_X86
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+#endif
+#if CPUFAM_AMD64 && ABI_WIN
+ pop rdi
+ pop rsi
+#endif
+ ret
+
+ .align 16
+endswap_block:
+ // End-swap ECX words starting at SI. The end-swapping table is
+ // already loaded into XMM5; and it's OK to work in 16-byte chunks.
+ movdqu xmm1, [SI]
+ pshufb xmm1, xmm5
+ movdqu [SI], xmm1
+ add SI, 16
+ sub ecx, 4
+ ja endswap_block
+ ret
+
+#undef CTX
+#undef BLKSZ
+#undef SI
+#undef DI
+#undef KSZ
+#undef KSZo
+#undef NKW
+#undef NKW_NEEDS_REFRESH
+#undef RCON
+#undef LIMn
+#undef LIM
+#undef NR
+#undef LRK
+#undef LRKo
+#undef BLKOFF
+#undef BLKOFFo
+
+ENDFUNC
+
+///--------------------------------------------------------------------------
+/// Encrypting and decrypting blocks.
+
+ .macro encdec op, aes, koff
+FUNC(rijndael_\op\()_x86ish_aesni)
+
+ // Find the magic endianness-swapping table.
+ ldgot ecx
+ movdqa xmm5, [INTADDR(endswap_tab, ecx)]
+
+#if CPUFAM_X86
+ // Arguments come in on the stack, and need to be collected. We
+ // don't have a shortage of registers.
+
+# define K ecx
+# define SRC edx
+# define DST edx
+# define NR eax
+
+ mov K, [esp + 4]
+ mov SRC, [esp + 8]
+#endif
+
+#if CPUFAM_AMD64 && ABI_SYSV
+ // Arguments come in registers. All is good.
+
+# define K rdi
+# define SRC rsi
+# define DST rdx
+# define NR eax
+#endif
+
+#if CPUFAM_AMD64 && ABI_WIN
+ // Arguments come in different registers.
+
+# define K rcx
+# define SRC rdx
+# define DST r8
+# define NR eax
+#endif
+
+ // Initial setup.
+ movdqu xmm0, [SRC]
+ pshufb xmm0, xmm5
+ mov NR, [K + nr]
+ add K, \koff
+
+ // Initial whitening.
+ movdqu xmm1, [K]
+ add K, 16
+ pxor xmm0, xmm1
+
+ // Dispatch to the correct code.
+ cmp NR, 10
+ je 10f
+ jb bogus
+ cmp NR, 14
+ je 14f
+ ja bogus
+ cmp NR, 12
+ je 12f
+ jb 11f
+ jmp 13f
+
+ .align 2
+
+ // 14 rounds...
+14: movdqu xmm1, [K]
+ add K, 16
+ \aes xmm0, xmm1
+
+ // 13 rounds...
+13: movdqu xmm1, [K]
+ add K, 16
+ \aes xmm0, xmm1
+
+ // 12 rounds...
+12: movdqu xmm1, [K]
+ add K, 16
+ \aes xmm0, xmm1
+
+ // 11 rounds...
+11: movdqu xmm1, [K]
+ add K, 16
+ \aes xmm0, xmm1
+
+ // 10 rounds...
+10: movdqu xmm1, [K]
+ \aes xmm0, xmm1
+
+ // 9 rounds...
+ movdqu xmm1, [K + 16]
+ \aes xmm0, xmm1
+
+ // 8 rounds...
+ movdqu xmm1, [K + 32]
+ \aes xmm0, xmm1
+
+ // 7 rounds...
+ movdqu xmm1, [K + 48]
+ \aes xmm0, xmm1
+
+ // 6 rounds...
+ movdqu xmm1, [K + 64]
+ \aes xmm0, xmm1
+
+ // 5 rounds...
+ movdqu xmm1, [K + 80]
+ \aes xmm0, xmm1
+
+ // 4 rounds...
+ movdqu xmm1, [K + 96]
+ \aes xmm0, xmm1
+
+ // 3 rounds...
+ movdqu xmm1, [K + 112]
+ \aes xmm0, xmm1
+
+ // 2 rounds...
+ movdqu xmm1, [K + 128]
+ \aes xmm0, xmm1
+
+ // Final round...
+ movdqu xmm1, [K + 144]
+ \aes\()last xmm0, xmm1
+
+ // Unpermute the ciphertext block and store it.
+ pshufb xmm0, xmm5
+#if CPUFAM_X86
+ mov DST, [esp + 12]
+#endif
+ movdqu [DST], xmm0
+
+ // And we're done.
+ ret
+
+#undef K
+#undef SRC
+#undef DST
+#undef NR
+
+ENDFUNC
+ .endm
+
+ encdec eblk, aesenc, w
+ encdec dblk, aesdec, wi
+
+///--------------------------------------------------------------------------
+/// Random utilities.
+
+ .align 16
+ // Abort the process because of a programming error. Indirecting
+ // through this point serves several purposes: (a) by CALLing, rather
+ // than branching to, `abort', we can save the return address, which
+ // might at least provide a hint as to what went wrong; (b) we don't
+ // have conditional CALLs (and they'd be big anyway); and (c) we can
+ // write a HLT here as a backstop against `abort' being mad.
+bogus: callext F(abort)
+0: hlt
+ jmp 0b
+
+ gotaux ecx
+
+///--------------------------------------------------------------------------
+/// Data tables.
+
+ .align 16
+endswap_tab:
+ .byte 3, 2, 1, 0
+ .byte 7, 6, 5, 4
+ .byte 11, 10, 9, 8
+ .byte 15, 14, 13, 12
+
+///----- That's all, folks --------------------------------------------------
/*----- Header files ------------------------------------------------------*/
+#include "config.h"
+
#include <assert.h>
#include <stdio.h>
#include <mLib/bits.h>
#include "blkc.h"
+#include "dispatch.h"
#include "gcipher.h"
#include "rijndael.h"
#include "rijndael-base.h"
* Use: Low-level block encryption and decryption.
*/
+CPU_DISPATCH(EMPTY, EMPTY, void, rijndael_eblk, (const rijndael_ctx *k,
+ const uint32 s[4],
+ uint32 d[4]),
+ (k, s, d), pick_eblk, simple_eblk)
+
+CPU_DISPATCH(EMPTY, EMPTY, void, rijndael_dblk, (const rijndael_ctx *k,
+ const uint32 s[4],
+ uint32 d[4]),
+ (k, s, d), pick_dblk, simple_dblk)
+
+#if CPUFAM_X86 || CPUFAM_AMD64
+extern rijndael_eblk__functype rijndael_eblk_x86ish_aesni;
+extern rijndael_dblk__functype rijndael_dblk_x86ish_aesni;
+#endif
+
+static rijndael_eblk__functype *pick_eblk(void)
+{
+#if CPUFAM_X86 || CPUFAM_AMD64
+ DISPATCH_PICK_COND(rijndael_eblk, rijndael_eblk_x86ish_aesni,
+ cpu_feature_p(CPUFEAT_X86_AESNI));
+#endif
+ DISPATCH_PICK_FALLBACK(rijndael_eblk, simple_eblk);
+}
+
+static rijndael_dblk__functype *pick_dblk(void)
+{
+#if CPUFAM_X86 || CPUFAM_AMD64
+ DISPATCH_PICK_COND(rijndael_dblk, rijndael_dblk_x86ish_aesni,
+ cpu_feature_p(CPUFEAT_X86_AESNI));
+#endif
+ DISPATCH_PICK_FALLBACK(rijndael_dblk, simple_dblk);
+}
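+
+/* Callers keep using @rijndael_eblk@ and @rijndael_dblk@ exactly as
+ * before: the dispatcher selects the AESNI implementation when
+ * @cpu_feature_p@ reports that the CPU supports it, and falls back to the
+ * portable @simple_eblk@ and @simple_dblk@ otherwise.
+ */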
+
#define DO(what, t, aa, bb, cc, dd, a, b, c, d, w) do { \
aa = what(t, a, b, c, d) ^ *w++; \
bb = what(t, b, c, d, a) ^ *w++; \
  cc = what(t, c, d, a, b) ^ *w++; \
  dd = what(t, d, a, b, c) ^ *w++; \
} while (0)
-void rijndael_eblk(const rijndael_ctx *k, const uint32 *s, uint32 *dst)
+static void simple_eblk(const rijndael_ctx *k, const uint32 *s, uint32 *dst)
{
uint32 a = s[0], b = s[1], c = s[2], d = s[3];
uint32 aa, bb, cc, dd;
dst[0] = a; dst[1] = b; dst[2] = c; dst[3] = d;
}
-void rijndael_dblk(const rijndael_ctx *k, const uint32 *s, uint32 *dst)
+static void simple_dblk(const rijndael_ctx *k, const uint32 *s, uint32 *dst)
{
uint32 a = s[0], b = s[1], c = s[2], d = s[3];
uint32 aa, bb, cc, dd;
/*----- The Salsa20 core function -----------------------------------------*/
+/* It makes life somewhat easier if we don't actually store and maintain the
+ * input matrix in the textbook order. Instead, we rotate the columns other
+ * than the leftmost one upwards, so that the constants which were originally
+ * along the diagonal end up on the top row. We'll need to undo this
+ * permutation on output, but that's not too terrible an imposition.
+ *
+ * The permutation we're applying to the matrix elements is this:
+ *
+ * [ 0 1 2 3 ] [ 0 5 10 15 ]
+ * [ 4 5 6 7 ] --> [ 4 9 14 3 ]
+ * [ 8 9 10 11 ] [ 8 13 2 7 ]
+ * [ 12 13 14 15 ] [ 12 1 6 11 ]
+ *
+ * and as a result, we need to apply this inverse permutation to figure out
+ * which indices to use in the doublerow function and elsewhere.
+ *
+ * [ 0 13 10 7 ]
+ * [ 4 1 14 11 ]
+ * [ 8 5 2 15 ]
+ * [ 12 9 6 3 ]
+ */
+
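+/* As a quick sanity check (an illustrative sketch, not part of the
+ * library), the two permutations can be tabulated and verified to be
+ * mutually inverse:
+ *
+ *	static const int fwd[16] =
+ *	  {  0,  5, 10, 15,  4,  9, 14,  3,  8, 13,  2,  7, 12,  1,  6, 11 };
+ *	static const int inv[16] =
+ *	  {  0, 13, 10,  7,  4,  1, 14, 11,  8,  5,  2, 15, 12,  9,  6,  3 };
+ *	for (i = 0; i < 16; i++) assert(inv[fwd[i]] == i && fwd[inv[i]] == i);
+ *
+ * Here @fwd[i]@ is the textbook index of the element which ends up in slot
+ * @i@ of the permuted matrix, and @inv[j]@ is the permuted slot holding
+ * textbook element @j@.
+ */
+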
/* The Salsa20 quarter-round. Read from the matrix @y@ at indices @a@, @b@,
* @c@, and @d@; and write to the corresponding elements of @z@.
*/
*/
#define SALSA20_DR(z, y) do { \
SALSA20_QR(z, y, 0, 4, 8, 12); \
- SALSA20_QR(z, y, 5, 9, 13, 1); \
- SALSA20_QR(z, y, 10, 14, 2, 6); \
- SALSA20_QR(z, y, 15, 3, 7, 11); \
- SALSA20_QR(z, z, 0, 1, 2, 3); \
- SALSA20_QR(z, z, 5, 6, 7, 4); \
- SALSA20_QR(z, z, 10, 11, 8, 9); \
- SALSA20_QR(z, z, 15, 12, 13, 14); \
+ SALSA20_QR(z, y, 1, 5, 9, 13); \
+ SALSA20_QR(z, y, 2, 6, 10, 14); \
+ SALSA20_QR(z, y, 3, 7, 11, 15); \
+ SALSA20_QR(z, z, 0, 13, 10, 7); \
+ SALSA20_QR(z, z, 1, 14, 11, 4); \
+ SALSA20_QR(z, z, 2, 15, 8, 5); \
+ SALSA20_QR(z, z, 3, 12, 9, 6); \
} while (0)
/* The Salsa20 feedforward step, used at the end of the core function. Here,
* @y@ contains the original input matrix; @z@ contains the final one, and is
- * updated.
+ * updated.  The result is left in canonical order, ready for output.
*/
#define SALSA20_FFWD(z, y) do { \
- int _i; \
- for (_i = 0; _i < 16; _i++) (z)[_i] += (y)[_i]; \
+ const uint32 *_y = (y); \
+ uint32 *_z = (z); \
+  uint32 _t; \
+ _z[ 0] = _z[ 0] + _y[ 0]; _z[ 4] = _z[ 4] + _y[ 4]; \
+ _z[ 8] = _z[ 8] + _y[ 8]; _z[12] = _z[12] + _y[12]; \
+ _t = _z[ 1] + _y[ 1]; _z[ 1] = _z[13] + _y[13]; \
+ _z[13] = _z[ 9] + _y[ 9]; _z[ 9] = _z[ 5] + _y[ 5]; _z[ 5] = _t; \
+ _t = _z[ 2] + _y[ 2]; _z[ 2] = _z[10] + _y[10]; _z[10] = _t; \
+ _t = _z[ 6] + _y[ 6]; _z[ 6] = _z[14] + _y[14]; _z[14] = _t; \
+ _t = _z[ 3] + _y[ 3]; _z[ 3] = _z[ 7] + _y[ 7]; \
+ _z[ 7] = _z[11] + _y[11]; _z[11] = _z[15] + _y[15]; _z[15] = _t; \
} while (0)
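+
+/* In other words, with @inv@ as tabulated above, this computes
+ *
+ *	z_canonical[i] = z[inv[i]] + y[inv[i]]
+ *
+ * for each textbook index @i@, overwriting @z@ in place; the unrolled code
+ * above just chases the permutation's cycles, a single temporary word
+ * sufficing for each cycle.
+ */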
/* Various numbers of rounds, unrolled. Read from @y@, and write to @z@. */
/* Step the counter in the Salsa20 state matrix @a@. */
#define SALSA20_STEP(a) \
- do { (a)[8] = U32((a)[8] + 1); (a)[9] += !(a)[8]; } while (0)
+ do { (a)[8] = U32((a)[8] + 1); (a)[5] += !(a)[8]; } while (0)
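+
+/* (The 64-bit block counter occupies textbook slots 8 and 9; slot 9's
+ * permuted home is 5, hence the carry into @a[5]@.) */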
/*----- Buffering and output ----------------------------------------------*
*
--- /dev/null
+/// -*- mode: asm; asm-comment-char: ?/ -*-
+///
+/// Fancy SIMD implementation of Salsa20
+///
+/// (c) 2015 Straylight/Edgeware
+///
+
+///----- Licensing notice ---------------------------------------------------
+///
+/// This file is part of Catacomb.
+///
+/// Catacomb is free software; you can redistribute it and/or modify
+/// it under the terms of the GNU Library General Public License as
+/// published by the Free Software Foundation; either version 2 of the
+/// License, or (at your option) any later version.
+///
+/// Catacomb is distributed in the hope that it will be useful,
+/// but WITHOUT ANY WARRANTY; without even the implied warranty of
+/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+/// GNU Library General Public License for more details.
+///
+/// You should have received a copy of the GNU Library General Public
+/// License along with Catacomb; if not, write to the Free
+/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+/// MA 02111-1307, USA.
+
+///--------------------------------------------------------------------------
+/// External definitions.
+
+#include "config.h"
+#include "asm-common.h"
+
+///--------------------------------------------------------------------------
+/// Local utilities.
+
+// Magic constants for shuffling.
+#define ROTL 0x93
+#define ROT2 0x4e
+#define ROTR 0x39
+
+///--------------------------------------------------------------------------
+/// Main code.
+
+ .arch pentium4
+ .section .text
+
+FUNC(salsa20_core_x86ish_sse2)
+
+ // Initial setup.
+
+#if CPUFAM_X86
+ // Arguments come in on the stack, and will need to be collected. We
+	// can get away with just the scratch registers for integer work,
+ // but we'll run out of XMM registers and will need some properly
+ // aligned space which we'll steal from the stack. I don't trust the
+ // stack pointer's alignment, so I'll have to mask the stack pointer,
+ // which in turn means I'll need to keep track of the old value.
+ // Hence I'm making a full i386-style stack frame here.
+ //
+ // The Windows and SysV ABIs are sufficiently similar that we don't
+ // need to worry about the differences here.
+
+# define NR ecx
+# define IN eax
+# define OUT edx
+# define SAVE0 xmm6
+# define SAVE1 xmm7
+# define SAVE2 [esp + 0]
+# define SAVE3 [esp + 16]
+
+ push ebp
+ mov ebp, esp
+ sub esp, 32
+ mov IN, [ebp + 12]
+ mov OUT, [ebp + 16]
+ and esp, ~15
+ mov NR, [ebp + 8]
+#endif
+
+#if CPUFAM_AMD64 && ABI_SYSV
+ // This is nice. We have plenty of XMM registers, and the arguments
+ // are in useful places. There's no need to spill anything and we
+ // can just get on with the code.
+
+# define NR edi
+# define IN rsi
+# define OUT rdx
+# define SAVE0 xmm6
+# define SAVE1 xmm7
+# define SAVE2 xmm8
+# define SAVE3 xmm9
+#endif
+
+#if CPUFAM_AMD64 && ABI_WIN
+ // Arguments come in registers, but they're different between Windows
+ // and everyone else (and everyone else is saner).
+ //
+ // The Windows ABI insists that we preserve some of the XMM
+	// registers, but we want more than we can use as scratch space.  In
+	// two of the save slots we only need a copy of the input for the
+	// feedforward at the end, so those can live on the stack; but the
+	// other two registers we want as scratch for the final permutation,
+	// so save their old values on the stack too.  (We need an extra 8
+	// bytes to align the stack.)
+
+# define NR ecx
+# define IN rdx
+# define OUT r8
+# define SAVE0 xmm6
+# define SAVE1 xmm7
+# define SAVE2 [rsp + 32]
+# define SAVE3 [rsp + 48]
+
+ sub rsp, 64 + 8
+ movdqa [rsp + 0], xmm6
+ movdqa [rsp + 16], xmm7
+#endif
+
+ // First job is to slurp the matrix into XMM registers. The words
+ // have already been permuted conveniently to make them line up
+ // better for SIMD processing.
+ //
+ // The textbook arrangement of the matrix is this.
+ //
+ // [C K K K]
+ // [K C N N]
+ // [T T C K]
+ // [K K K C]
+ //
+ // But we've rotated the columns up so that the main diagonal with
+	// the constants on it ends up in the first row, giving something more
+ // like
+ //
+ // [C C C C]
+ // [K T K K]
+ // [T K K N]
+ // [K K N K]
+ //
+ // so the transformation looks like this:
+ //
+ // [ 0 1 2 3] [ 0 5 10 15] (a, xmm0)
+ // [ 4 5 6 7] --> [ 4 9 14 3] (b, xmm1)
+ // [ 8 9 10 11] [ 8 13 2 7] (c, xmm2)
+ // [12 13 14 15] [12 1 6 11] (d, xmm3)
+ movdqu xmm0, [IN + 0]
+ movdqu xmm1, [IN + 16]
+ movdqu xmm2, [IN + 32]
+ movdqu xmm3, [IN + 48]
+
+	// Take a copy for later.
+ movdqa SAVE0, xmm0
+ movdqa SAVE1, xmm1
+ movdqa SAVE2, xmm2
+ movdqa SAVE3, xmm3
+
+loop:
+ // Apply a column quarterround to each of the columns simultaneously.
+ // Alas, there doesn't seem to be a packed doubleword rotate, so we
+ // have to synthesize it.
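+	// The usual identity serves: x <<< n == (x << n) | (x >> (32 - n)),
+	// at the cost of a copy, two shifts, and an OR for each rotation.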
+
+ // b ^= (a + d) <<< 7
+ movdqa xmm4, xmm0
+ paddd xmm4, xmm3
+ movdqa xmm5, xmm4
+ pslld xmm4, 7
+ psrld xmm5, 25
+ por xmm4, xmm5
+ pxor xmm1, xmm4
+
+ // c ^= (b + a) <<< 9
+ movdqa xmm4, xmm1
+ paddd xmm4, xmm0
+ movdqa xmm5, xmm4
+ pslld xmm4, 9
+ psrld xmm5, 23
+ por xmm4, xmm5
+ pxor xmm2, xmm4
+
+ // d ^= (c + b) <<< 13
+ movdqa xmm4, xmm2
+ paddd xmm4, xmm1
+ pshufd xmm1, xmm1, ROTL
+ movdqa xmm5, xmm4
+ pslld xmm4, 13
+ psrld xmm5, 19
+ por xmm4, xmm5
+ pxor xmm3, xmm4
+
+ // a ^= (d + c) <<< 18
+ movdqa xmm4, xmm3
+ pshufd xmm3, xmm3, ROTR
+ paddd xmm4, xmm2
+ pshufd xmm2, xmm2, ROT2
+ movdqa xmm5, xmm4
+ pslld xmm4, 18
+ psrld xmm5, 14
+ por xmm4, xmm5
+ pxor xmm0, xmm4
+
+ // The transpose conveniently only involves reordering elements of
+ // individual rows, which can be done quite easily, and reordering
+ // the rows themselves, which is a trivial renaming. It doesn't
+ // involve any movement of elements between rows.
+ //
+ // [ 0 5 10 15] [ 0 5 10 15] (a, xmm0)
+ // [ 4 9 14 3] --> [ 1 6 11 12] (b, xmm3)
+ // [ 8 13 2 7] [ 2 7 8 13] (c, xmm2)
+ // [12 1 6 11] [ 3 4 9 14] (d, xmm1)
+ //
+ // The shuffles have quite high latency, so they've been pushed
+ // backwards into the main instruction list.
+
+ // Apply the row quarterround to each of the columns (yes!)
+ // simultaneously.
+
+ // b ^= (a + d) <<< 7
+ movdqa xmm4, xmm0
+ paddd xmm4, xmm1
+ movdqa xmm5, xmm4
+ pslld xmm4, 7
+ psrld xmm5, 25
+ por xmm4, xmm5
+ pxor xmm3, xmm4
+
+ // c ^= (b + a) <<< 9
+ movdqa xmm4, xmm3
+ paddd xmm4, xmm0
+ movdqa xmm5, xmm4
+ pslld xmm4, 9
+ psrld xmm5, 23
+ por xmm4, xmm5
+ pxor xmm2, xmm4
+
+ // d ^= (c + b) <<< 13
+ movdqa xmm4, xmm2
+ paddd xmm4, xmm3
+ pshufd xmm3, xmm3, ROTL
+ movdqa xmm5, xmm4
+ pslld xmm4, 13
+ psrld xmm5, 19
+ por xmm4, xmm5
+ pxor xmm1, xmm4
+
+ // a ^= (d + c) <<< 18
+ movdqa xmm4, xmm1
+ pshufd xmm1, xmm1, ROTR
+ paddd xmm4, xmm2
+ pshufd xmm2, xmm2, ROT2
+ movdqa xmm5, xmm4
+ pslld xmm4, 18
+ psrld xmm5, 14
+ por xmm4, xmm5
+ pxor xmm0, xmm4
+
+ // We had to undo the transpose ready for the next loop. Again, push
+ // back the shuffles because they take a long time coming through.
+ // Decrement the loop counter and see if we should go round again.
+ // Later processors fuse this pair into a single uop.
+ sub NR, 2
+ ja loop
+
+ // Almost there. Firstly, the feedforward addition, and then we have
+ // to write out the result. Here we have to undo the permutation
+ // which was already applied to the input. Shuffling has quite high
+ // latency, so arrange to start a new shuffle into a temporary as
+ // soon as we've written out the old value.
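+	// (Each permuted row holds one diagonal of the canonical matrix, so
+	// its four words scatter to OUT with a stride of 20 bytes, mod 64.)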
+ paddd xmm0, SAVE0
+	pshufd	xmm4, xmm0, ROTR
+ movd [OUT + 0], xmm0
+
+ paddd xmm1, SAVE1
+ pshufd xmm5, xmm1, ROTL
+ movd [OUT + 16], xmm1
+
+ paddd xmm2, SAVE2
+ pshufd xmm6, xmm2, ROT2
+ movd [OUT + 32], xmm2
+
+ paddd xmm3, SAVE3
+ pshufd xmm7, xmm3, ROTR
+ movd [OUT + 48], xmm3
+
+ movd [OUT + 4], xmm7
+ pshufd xmm7, xmm3, ROT2
+ movd [OUT + 24], xmm7
+ pshufd xmm3, xmm3, ROTL
+ movd [OUT + 44], xmm3
+
+ movd [OUT + 8], xmm6
+ pshufd xmm6, xmm2, ROTL
+ movd [OUT + 28], xmm6
+ pshufd xmm2, xmm2, ROTR
+ movd [OUT + 52], xmm2
+
+ movd [OUT + 12], xmm5
+ pshufd xmm5, xmm1, ROTR
+ movd [OUT + 36], xmm5
+ pshufd xmm1, xmm1, ROT2
+ movd [OUT + 56], xmm1
+
+ movd [OUT + 20], xmm4
+ pshufd xmm4, xmm0, ROT2
+ movd [OUT + 40], xmm4
+ pshufd xmm0, xmm0, ROTL
+ movd [OUT + 60], xmm0
+
+ // Tidy things up.
+
+#if CPUFAM_X86
+ mov esp, ebp
+ pop ebp
+#endif
+#if CPUFAM_AMD64 && ABI_WIN
+ movdqa xmm6, [rsp + 0]
+ movdqa xmm7, [rsp + 16]
+ add rsp, 64 + 8
+#endif
+
+ // And with that, we're done.
+ ret
+
+#undef NR
+#undef IN
+#undef OUT
+#undef SAVE0
+#undef SAVE1
+#undef SAVE2
+#undef SAVE3
+
+ENDFUNC
+
+///----- That's all, folks --------------------------------------------------
/*----- Header files ------------------------------------------------------*/
+#include "config.h"
+
#include <stdarg.h>
#include <mLib/bits.h>
#include "arena.h"
+#include "dispatch.h"
#include "gcipher.h"
#include "grand.h"
#include "keysz.h"
* the feedforward step.
*/
-static void core(unsigned r, const salsa20_matrix src, salsa20_matrix dest)
+CPU_DISPATCH(static, EMPTY,
+ void, core, (unsigned r, const salsa20_matrix src,
+ salsa20_matrix dest),
+ (r, src, dest),
+	     pick_core, simple_core)
+
+static void simple_core(unsigned r, const salsa20_matrix src,
+ salsa20_matrix dest)
{ SALSA20_nR(dest, src, r); SALSA20_FFWD(dest, src); }
+#if CPUFAM_X86 || CPUFAM_AMD64
+extern core__functype salsa20_core_x86ish_sse2;
+#endif
+
+static core__functype *pick_core(void)
+{
+#if CPUFAM_X86 || CPUFAM_AMD64
+ DISPATCH_PICK_COND(salsa20_core, salsa20_core_x86ish_sse2,
+ cpu_feature_p(CPUFEAT_X86_SSE2));
+#endif
+ DISPATCH_PICK_FALLBACK(salsa20_core, simple_core);
+}
+
/* --- @populate@ --- *
*
* Arguments: @salsa20_matrix a@ = a matrix to fill in
KSZ_ASSERT(salsa20, ksz);
- a[ 1] = LOAD32_L(k + 0);
- a[ 2] = LOAD32_L(k + 4);
+ /* Here's the pattern of key, constant, nonce, and counter pieces in the
+ * matrix, before and after our permutation.
+ *
+ * [ C0 K0 K1 K2 ] [ C0 C1 C2 C3 ]
+ * [ K3 C1 N0 N1 ] --> [ K3 T1 K7 K2 ]
+ * [ T0 T1 C2 K4 ] [ T0 K6 K1 N1 ]
+ * [ K5 K6 K7 C3 ] [ K5 K0 N0 K4 ]
+ */
+
+ a[13] = LOAD32_L(k + 0);
+ a[10] = LOAD32_L(k + 4);
if (ksz == 10) {
- a[ 3] = LOAD16_L(k + 8);
+ a[ 7] = LOAD16_L(k + 8);
a[ 4] = 0;
} else {
- a[ 3] = LOAD32_L(k + 8);
+ a[ 7] = LOAD32_L(k + 8);
a[ 4] = LOAD32_L(k + 12);
}
if (ksz <= 16) {
- a[11] = a[ 1];
- a[12] = a[ 2];
- a[13] = a[ 3];
- a[14] = a[ 4];
+ a[15] = a[13];
+ a[12] = a[10];
+ a[ 9] = a[ 7];
+ a[ 6] = a[ 4];
a[ 0] = SALSA20_A128;
- a[ 5] = SALSA20_B128;
- a[10] = ksz == 10 ? SALSA20_C80 : SALSA20_C128;
- a[15] = SALSA20_D128;
+ a[ 1] = SALSA20_B128;
+ a[ 2] = ksz == 10 ? SALSA20_C80 : SALSA20_C128;
+ a[ 3] = SALSA20_D128;
} else {
- a[11] = LOAD32_L(k + 16);
+ a[15] = LOAD32_L(k + 16);
a[12] = LOAD32_L(k + 20);
- a[13] = LOAD32_L(k + 24);
- a[14] = LOAD32_L(k + 28);
+ a[ 9] = LOAD32_L(k + 24);
+ a[ 6] = LOAD32_L(k + 28);
a[ 0] = SALSA20_A256;
- a[ 5] = SALSA20_B256;
- a[10] = SALSA20_C256;
- a[15] = SALSA20_D256;
+ a[ 1] = SALSA20_B256;
+ a[ 2] = SALSA20_C256;
+ a[ 3] = SALSA20_D256;
}
}
{
const octet *n = nonce;
- ctx->a[6] = LOAD32_L(n + 0);
- ctx->a[7] = LOAD32_L(n + 4);
+ ctx->a[14] = LOAD32_L(n + 0);
+ ctx->a[11] = LOAD32_L(n + 4);
salsa20_seek(ctx, 0);
}
void salsa20_seeku64(salsa20_ctx *ctx, kludge64 i)
{
- ctx->a[8] = LO64(i); ctx->a[9] = HI64(i);
+ ctx->a[8] = LO64(i); ctx->a[5] = HI64(i);
ctx->bufi = SALSA20_OUTSZ;
}
{ kludge64 i = salsa20_tellu64(ctx); return (GET64(unsigned long, i)); }
kludge64 salsa20_tellu64(salsa20_ctx *ctx)
- { kludge64 i; SET64(i, ctx->a[9], ctx->a[8]); return (i); }
+ { kludge64 i; SET64(i, ctx->a[5], ctx->a[8]); return (i); }
/* --- @salsa20{,12,8}_encrypt@ --- *
*
* speed critical, so we do it the harder way. \
*/ \
\
- for (i = 0; i < 4; i++) k[i + 6] = src[i]; \
+  for (i = 0; i < 4; i++) k[14 - 3*i] = src[i];  /* slots 6--9 */ \
core(r, k, a); \
- for (i = 0; i < 4; i++) dest[i] = a[5*i] - k[5*i]; \
- for (i = 4; i < 8; i++) dest[i] = a[i + 2] - k[i + 2]; \
+  for (i = 0; i < 4; i++) dest[i] = a[5*i] - k[i];  /* slots 0, 5, 10, 15 */ \
+  for (i = 4; i < 8; i++) dest[i] = a[i + 2] - k[26 - 3*i];  /* slots 6--9 */ \
} \
\
void HSALSA20_PRF(r, salsa20_ctx *ctx, const void *src, void *dest) \
\
populate(ctx->k, key, ksz); \
ctx->s.a[ 0] = SALSA20_A256; \
- ctx->s.a[ 5] = SALSA20_B256; \
- ctx->s.a[10] = SALSA20_C256; \
- ctx->s.a[15] = SALSA20_D256; \
+ ctx->s.a[ 1] = SALSA20_B256; \
+ ctx->s.a[ 2] = SALSA20_C256; \
+ ctx->s.a[ 3] = SALSA20_D256; \
XSALSA20_SETNONCE(r, ctx, nonce ? nonce : zerononce); \
}
SALSA20_VARS(DEFXINIT)
\
for (i = 0; i < 4; i++) in[i] = LOAD32_L(n + 4*i); \
HSALSA20_RAW(r, ctx->k, in, out); \
- for (i = 0; i < 4; i++) ctx->s.a[i + 1] = out[i]; \
- for (i = 4; i < 8; i++) ctx->s.a[i + 7] = out[i]; \
+    for (i = 0; i < 4; i++) ctx->s.a[13 - 3*i] = out[i];  /* slots 1--4 */ \
+    for (i = 4; i < 8; i++) ctx->s.a[27 - 3*i] = out[i];  /* slots 11--14 */ \
salsa20_setnonce(&ctx->s, n + 16); \
}
SALSA20_VARS(DEFXNONCE)
#include <mLib/quis.h>
#include <mLib/testrig.h>
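+/* Where each element of the textbook Salsa20 input matrix lives after the
+ * permutation described in salsa20-core.h: textbook element @j@ ends up at
+ * index @perm[j]@ of the permuted matrix.
+ */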
+static const int perm[] = {
+ 0, 13, 10, 7,
+ 4, 1, 14, 11,
+ 8, 5, 2, 15,
+ 12, 9, 6, 3
+};
+
#define DEFVCORE(r) \
static int v_core_##r(dstr *v) \
{ \
salsa20_matrix a, b; \
dstr d = DSTR_INIT; \
- int i, n; \
+ int i, j, n; \
int ok = 1; \
\
DENSURE(&d, SALSA20_OUTSZ); d.len = SALSA20_OUTSZ; \
n = *(int *)v[0].buf; \
for (i = 0; i < SALSA20_OUTSZ/4; i++) \
- a[i] = LOAD32_L(v[1].buf + 4*i); \
+ b[i] = LOAD32_L(v[1].buf + 4*i); \
for (i = 0; i < n; i++) { \
+ for (j = 0; j < 16; j++) a[perm[j]] = b[j]; \
core(r, a, b); \
-	  memcpy(a, b, sizeof(a));					\
} \
- for (i = 0; i < SALSA20_OUTSZ/4; i++) STORE32_L(d.buf + 4*i, a[i]); \
+ for (i = 0; i < SALSA20_OUTSZ/4; i++) STORE32_L(d.buf + 4*i, b[i]); \
\
if (d.len != v[2].len || memcmp(d.buf, v[2].buf, v[2].len) != 0) { \
ok = 0; \