Also, rearrange the arguments so the destination(s) are at the start.
-.macro squash c0, c1, c2, c3, h, t, u
+.macro squash lo, hi, c0, c1, c2, c3, t, u
	// On entry, C0, C1, C2, C3 are carry registers representing a value
- // Y. On exit, C0 holds the low 128 bits of the carry value; C1, C2,
+ // Y. On exit, LO holds the low 128 bits of the carry value; C1, C2,
	// C3, T, and U are clobbered; and the high bits of Y are stored in
- // H, if this is not `nil'.
+ // HI, if this is not `nil'.
	// The first step is to eliminate the `double-prime' pieces -- i.e.,
	// the ones offset by 16 bytes from a 32-bit boundary -- by carrying
	psrlq	\c1, 16			// high parts of (y''_1, y''_3)
	psrlq	\c2, 32			// low parts of (y''_0, y''_2)
	psrlq	\c3, 32			// low parts of (y''_1, y''_3)
- .ifnes "\h", "nil"
- movdqa \h, \c1
+ .ifnes "\hi", "nil"
+ movdqa \hi, \c1
	.endif
	pslldq	\c1, 8			// high part of (0, y''_1)
	paddq	\u, \c3
	paddq	\t, \c1			// and up: (y_0, y_2)
	paddq	\u, \c0			// (y_1, y_3)
- .ifnes "\h", "nil"
- psrldq \h, 8 // high part of (y''_3, 0)
+ .ifnes "\hi", "nil"
+ psrldq \hi, 8 // high part of (y''_3, 0)
	.endif

	// Finally extract the answer. This complicated dance is better than
	// storing to memory and loading, because the piecemeal stores
	// inhibit store forwarding.
	movdqa	\c3, \t			// (y_0, y_1)
- movdqa \c0, \t // (y^*_0, ?, ?, ?)
+ movdqa \lo, \t // (y^*_0, ?, ?, ?)
	psrldq	\t, 8			// (y_2, 0)
	psrlq	\c3, 32			// (floor(y_0/B), ?)
	paddq	\c3, \u			// (y_1 + floor(y_0/B), ?)
	psrldq	\u, 8			// (y_3, 0)
	psrlq	\c3, 32			// (floor((y_1 B + y_0)/B^2), ?)
	paddq	\c3, \t			// (y_2 + floor((y_1 B + y_0)/B^2), ?)
- punpckldq \c0, \c3 // (y^*_0, y^*_2, ?, ?)
+ punpckldq \lo, \c3 // (y^*_0, y^*_2, ?, ?)
	psrlq	\c3, 32			// (floor((y_2 B^2 + y_1 B + y_0)/B^3), ?)
	paddq	\c3, \u			// (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3), ?)
	movdqa	\t, \c3
	pxor	\u, \u
	.endif
	punpckldq \c1, \c3		// (y^*_1, y^*_3, ?, ?)
	psrlq	\t, 32			// very high bits of y
- paddq \h, \t
- punpcklqdq \h, \u // carry up
+ paddq \hi, \t
+ punpcklqdq \hi, \u // carry up
- punpckldq \c0, \c1 // y mod B^4
+ punpckldq \lo, \c1 // y mod B^4
	accum	xmm7, nil, nil, nil

	// That's lots of pieces. Now we have to assemble the answer.
- squash xmm4, xmm5, xmm6, xmm7, nil, xmm0, xmm1
+ squash xmm4, nil, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1
	// Expand it.
	pxor	xmm2, xmm2
	accum	xmm7, nil, nil, nil

	// That's lots of pieces. Now we have to assemble the answer.
- squash xmm4, xmm5, xmm6, xmm7, nil, xmm0, xmm1
+ squash xmm4, nil, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1
	// Expand it.
	pxor	xmm2, xmm2