///--------------------------------------------------------------------------
/// Primitive multipliers and related utilities.
- .p2align 4
-carryprop:
+INTFUNC(carryprop)
// On entry, XMM4, XMM5, and XMM6 hold a 144-bit carry in an expanded
// form. Store the low 128 bits of the represented carry to [EDI] as
// a packed 128-bit value, and leave the remaining 16 bits in the low
endprop [edi + 12], xmm6, xmm4
ret
- .p2align 4
-dmul4:
+ENDFUNC
+
+INTFUNC(dmul4)
// On entry, EDI points to the destination buffer; EAX and EBX point
// to the packed operands U and X; ECX and EDX point to the expanded
// operands V and Y; and XMM4, XMM5, XMM6 hold the incoming carry
ret
- .p2align 4
-dmla4:
+ENDFUNC
+
+INTFUNC(dmla4)
// On entry, EDI points to the destination buffer, which also
// contains an addend A to accumulate; EAX and EBX point to the
// packed operands U and X; ECX and EDX point to the expanded
ret
- .p2align 4
-mul4zc:
+ENDFUNC
+
+INTFUNC(mul4zc)
// On entry, EDI points to the destination buffer; EBX points to a
// packed operand X; and EDX points to an expanded operand Y.
//
ret
- .p2align 4
-mul4:
+ENDFUNC
+
+INTFUNC(mul4)
// On entry, EDI points to the destination buffer; EBX points to a
// packed operand X; EDX points to an expanded operand Y; and XMM4,
// XMM5, XMM6 hold the incoming carry registers c0, c1, and c2,
ret
- .p2align 4
-mla4zc:
+ENDFUNC
+
+INTFUNC(mla4zc)
// On entry, EDI points to the destination buffer, which also
// contains an addend A to accumulate; EBX points to a packed operand
// X; and EDX points to an expanded operand Y.
ret
- .p2align 4
-mla4:
+ENDFUNC
+
+INTFUNC(mla4)
// On entry, EDI points to the destination buffer, which also
// contains an addend A to accumulate; EBX points to a packed operand
// X; EDX points to an expanded operand Y; and XMM4, XMM5, XMM6 hold
ret
- .p2align 4
-mmul4:
+ENDFUNC
+
+INTFUNC(mmul4)
// On entry, EDI points to the destination buffer; EAX and EBX point
// to the packed operands U and N; ECX and ESI point to the expanded
// operands V and M; and EDX points to a place to store an expanded
propout [edi + 0], xmm4, xmm5
jmp 5f
- .p2align 4
-mmla4:
+ENDFUNC
+
+INTFUNC(mmla4)
// On entry, EDI points to the destination buffer, which also
// contains an addend A to accumulate; EAX and EBX point
// to the packed operands U and N; ECX and ESI point to the expanded
add esp, 64
ret
- .p2align 4
-mont4:
+ENDFUNC
+
+INTFUNC(mont4)
// On entry, EDI points to the destination buffer holding a packed
// value A; EBX points to a packed operand N; ESI points to an
// expanded operand M; and EDX points to a place to store an expanded
// And, with that, we're done.
ret
+ENDFUNC
+
///--------------------------------------------------------------------------
/// Bulk multipliers.
#endif
ret
- .align 16
-endswap_block:
+ENDFUNC
+
+INTFUNC(endswap_block)
// End-swap NKW words starting at SI, in place. The end-swapping table
// is already loaded into XMM5; and it's OK to work in 16-byte chunks.
//
// In: SI = address of the words to swap; XMM5 = PSHUFB byte-permutation
// table.
// Clobbers: ECX, XMM1, SI (left pointing past the last chunk processed),
// and the flags.
+#if CPUFAM_AMD64 && ABI_WIN
+ .seh_endprologue
+#endif
+
mov ecx, NKW
// Each pass handles one 16-byte chunk (four words): load it, permute
// its bytes through the table in XMM5, and write it back where it came
// from.
0: movdqu xmm1, [SI]
pshufb xmm1, xmm5
// Store the swapped chunk; without this the loop would have no effect
// on memory.
movdqu [SI], xmm1
add SI, 16
sub ecx, 4
// Go round again while words remain (nonzero result and no borrow).
ja 0b
+
ret
+ENDFUNC
+
#undef CTX
#undef BLKSZ
#undef SI
#undef LRK
#undef BLKOFF
-ENDFUNC
-
///--------------------------------------------------------------------------
/// Encrypting and decrypting blocks.
///--------------------------------------------------------------------------
/// Random utilities.
- .align 16
+INTFUNC(bogus)
// Abort the process because of a programming error. Indirecting
// through this point serves several purposes: (a) by CALLing, rather
// than branching to, `abort', we can save the return address, which
// might at least provide a hint as to what went wrong; (b) we don't
// have conditional CALLs (and they'd be big anyway); and (c) we can
// write a HLT here as a backstop against `abort' being mad.
//
// There is no RET in this block: once entered, control only leaves
// through the call below.
-bogus: callext F(abort)
// NOTE(review): the directive below is only assembled for AMD64 with
// the Windows ABI; presumably INTFUNC opens the matching unwind
// record there -- confirm against the macro definition.
+#if CPUFAM_AMD64 && ABI_WIN
+ .seh_endprologue
+#endif
+
+ callext F(abort)
// Only reached if the call above returns: halt, and go straight back
// to halting if execution ever continues past the HLT.
0: hlt
jmp 0b
+ENDFUNC
+
///--------------------------------------------------------------------------
/// Data tables.