.endm
.arch armv7-a
+ .fpu neon
#elif defined(__aarch64__)
0: movzx ebx, cx
imul ebx, ebx
- ror ecx, 0x10
- movzx edx, cx
+ mov edx, ecx
+ shr edx, 0x10
imul edx, edx
- rol ecx, 0x10
add ebx, edx // see?
sbb eax, 0
proc x28
+ // divide c-byte big-endian bignum at rsi by 2 (rounding down)
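+ //
+ // e.g., the two bytes 01 01 (i.e., 257) become 00 80 (i.e., 128).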
+
#if defined(__x86_64__)
- notimpl
+ clc // clear carry: shift zero into the
+ // most significant byte's top bit
+0: rcr byte ptr [rsi], 1 // shift byte right; the dropped bit
+ // lands in the carry flag
+ inc rsi // advance (leaves carry alone)
+ loop 0b // count down (leaves carry alone)
#elif defined(__i386__)
- notimpl
+ clc
+0: rcr byte ptr [esi], 1
+ inc esi
+ loop 0b
#elif defined(__arm__)
- notimpl
+ // we could hack this a word at a time using rrx
+ mov r3, #0 // initial carry-in is zero
+0: ldrb r12, [r4] // fetch the next byte
+ subs r2, r2, #1 // decrement counter (sets flags)
+ orr r3, r3, r12, lsr #1 // fold carry-in into shifted byte
+ strb r3, [r4], #1 // store it, and advance
+ mov r3, r12, lsl #7 // dropped bit is the next carry-in
+ bne 0b // loop until the counter runs out
#elif defined(__aarch64__)
- notimpl
+ mov w16, #0
+0: ldrb w17, [x4]
+ sub x2, x2, #1
+ orr w16, w16, w17, lsr #1
+ strb w16, [x4], #1
+ lsl w16, w17, #7
+ cbnz x2, 0b
#else
notimpl
#endif
+ ret
+
endproc
proc x29
+ // fill a buffer with a 3-byte pattern
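+ //
+ // e.g., with the pattern `a b c' at the start of the buffer and
+ // c = 5, the buffer ends up `a b c a b c a b'.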
+
#if defined(__x86_64__)
- notimpl
+ lea rdi, [rsi + 3] // copy with overlap: each byte lands
+ rep movsb // three places after its source
#elif defined(__i386__)
- notimpl
+ lea edi, [esi + 3]
+ rep movsb
#elif defined(__arm__)
- notimpl
+ add r5, r4, #3
+0: subs r2, r2, #1
+ ldrhsb r12, [r4], #1
+ strhsb r12, [r5], #1
+ bhs 0b
#elif defined(__aarch64__)
- notimpl
+ cbz x2, 9f
+ add x5, x4, #3
+0: sub x2, x2, #1
+ ldrb w16, [x4], #1
+ strb w16, [x5], #1
+ cbnz x2, 0b
+9:
#else
notimpl
#endif
+ ret
+
endproc
proc x2a
+ // rotate the words in a buffer, so that the last word comes first,
+ // the first comes second, and so on. this isn't a good way to do
+ // it.
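+ //
+ // e.g., the four-word buffer 1 2 3 4 becomes 4 1 2 3.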
+
#if defined(__x86_64__)
- notimpl
+ mov rsi, rbx // set string pointers
+ mov rdi, rbx
+0: lodsq // fetch next word
+ xchg rax, qword ptr [rbx] // stash it for next iteration and
+ // replace it with the previously
+ // stashed word
+ stosq // store in output
+ // (note that the first iteration doesn't actually do anything)
+ loop 0b // continue until all done
#elif defined(__i386__)
- notimpl
+ mov esi, ebx // set string pointers
+ mov edi, ebx
+0: lodsd // fetch next word
+ xchg eax, dword ptr [ebx] // stash it for next iteration and
+ // replace it with the previously
+ // stashed word
+ stosd // store in output
+ loop 0b // continue until all done
#elif defined(__arm__)
- notimpl
+ // let's do this a sensible way. (we could go faster using ldm/stm.)
+ add r0, r1, r2, lsl #2 // find the end of the buffer
+ ldr r0, [r0, #-4] // collect final element
+0: subs r2, r2, #1
+ ldr r12, [r1]
+ str r0, [r1], #4
+ mov r0, r12
+ bne 0b
#elif defined(__aarch64__)
- notimpl
+ add x0, x1, x2, lsl #3 // find the end of the buffer
+ ldr x0, [x0, #-8] // collect final element
+0: sub x2, x2, #1
+ ldr x16, [x1]
+ str x0, [x1], #8
+ mov x0, x16
+ cbnz x2, 0b
#else
notimpl
#endif
+ ret
+
endproc
proc x2b
+ // find a cycle in a function f: B -> B, where B = {0, 1, ..., 255}
+
#if defined(__x86_64__)
-
- notimpl
+ // this is floyd's cycle-finding algorithm.
+ //
+ // consider the sequence s_0 = 0, s_1 = f(0), s_2 = f(f(0)), ...,
+ // s_{i+1} = f(s_i). since B is finite, there must be some smallest
+ // t and c such that s(t) = s(t + c); then we have s_i = s_j iff
+ // i >= t, j >= t, and i == j (mod c).
+ //
+ // the algorithm sets two cursors advancing through the sequence: a
+ // /tortoise/ which advances one step at a time, and a /hare/ which
+ // advances by two, so when the tortoise is at element s_i, the hare
+ // is at s_{2i}. the hare will run around the cycle and catch the
+ // tortoise when i >= t and i == 2 i (mod c); the latter is simply i
+ // == 0 (mod c), which therefore happens first when i = k = t +
+ // (-t mod c).
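+ //
+ // for example, suppose f(0) = 1, f(1) = 2, f(2) = 3, and f(3) = 1,
+ // so t = 1 and c = 3: the sequence runs 0, 1, 2, 3, 1, 2, 3, ...,
+ // and the two meet at i = k = 3, where s_3 = s_6 = 3.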
+ //
+ // i'm not sure what good xlatb does here that mov al, [rbx + rax]
+ // doesn't, given that the high bits of rax stay zero throughout.
+
+ xor eax, eax // tortoise starts at 0
+ xor edx, edx // hare starts at 0
+0: xlatb // advance tortoise
+ xchg rax, rdx // switch to hare
+ xlatb // advance hare ...
+ xlatb // ... twice
+ xchg rax, rdx // switch back
+ cmp al, dl // hare caught the tortoise?
+ jnz 0b // no -- go around again
+
+ // now we trace the initial tail: reset the tortoise to s_0, and slow
+ // the hare down so that both take only a single step in each
+ // iteration. this loop terminates when i >= t and i == i + 2 k
+ // (mod c). we know k is a multiple of c, so the latter condition
+ // always holds, so this finds the first step of the cycle.
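+ //
+ // in the example above, the hare carries on from s_6 while the
+ // tortoise restarts at s_0; they first coincide at i = t = 1, with
+ // s_1 = s_7 = 1.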
+
+ xor eax, eax // reset the tortoise
+0: xlatb // advance tortoise
+ xchg rax, rdx // switch to hare
+ xlatb // advance hare
+ xchg rax, rdx // and switch back
+ cmp al, dl // done?
+ jnz 0b // no -- iterate
+
#elif defined(__i386__)
- notimpl
+
+ xor eax, eax // tortoise starts at 0
+ xor edx, edx // hare starts at 0
+0: xlatb // advance tortoise
+ xchg eax, edx // switch to hare
+ xlatb // advance hare ...
+ xlatb // ... twice
+ xchg eax, edx // switch back
+ cmp al, dl // hare caught the tortoise?
+ jnz 0b // no -- go around again
+
+ xor eax, eax // reset the tortoise
+0: xlatb // advance tortoise
+ xchg eax, edx // switch to hare
+ xlatb // advance hare
+ xchg eax, edx // and switch back
+ cmp al, dl // done?
+ jnz 0b // no -- iterate
#elif defined(__arm__)
- notimpl
+ mov r0, #0
+ mov r3, #0
+0: ldrb r0, [r1, r0]
+ ldrb r3, [r1, r3]
+ ldrb r3, [r1, r3]
+ cmp r0, r3
+ bne 0b
+
+ mov r0, #0
+0: ldrb r0, [r1, r0]
+ ldrb r3, [r1, r3]
+ cmp r0, r3
+ bne 0b
#elif defined(__aarch64__)
- notimpl
+ mov w0, #0
+ mov w3, #0
+0: ldrb w0, [x1, x0]
+ ldrb w3, [x1, x3]
+ ldrb w3, [x1, x3]
+ cmp w0, w3
+ b.ne 0b
+
+ mov w0, #0
+0: ldrb w0, [x1, x0]
+ ldrb w3, [x1, x3]
+ cmp w0, w3
+ b.ne 0b
#else
notimpl
#endif
+ ret
+
endproc
proc x2c
+ // a convoluted, but branchless, way to set a = c == d ? u : t
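+ //
+ // e.g., if c = 2 and d = 5 then b[2] = 0 and b[5] = 1, so a' =
+ // b[c] = 0 and finally a' = b[0] = t; but if c = d then the second
+ // store clobbers the first, so a' = b[c] = 1 and finally a' =
+ // b[1] = u.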
+
#if defined(__x86_64__)
- notimpl
+ mov qword ptr [rbx + 8*rcx], 0 // b[c] = 0
+ mov qword ptr [rbx + 8*rdx], 1 // b[d] = 1
+ mov rax, [rbx + 8*rcx] // a' = b[c] = c == d ? 1 : 0
+
+ mov [rbx], rsi // b[0] = t
+ mov [rbx + 8], rdi // b[1] = u
+ mov rax, [rbx + 8*rax] // a' = b[a'] = c == d ? u : t
#elif defined(__i386__)
- notimpl
+ mov dword ptr [ebx + 8*ecx], 0 // b[c] = 0
+ mov dword ptr [ebx + 8*edx], 1 // b[d] = 1
+ mov eax, [ebx + 8*ecx] // a' = b[c] = c == d ? 1 : 0
+
+ mov [ebx], esi // b[0] = t
+ mov [ebx + 8], edi // b[1] = u
+ mov eax, [ebx + 8*eax] // a' = b[a'] = c == d ? u : t
#elif defined(__arm__)
- notimpl
+ mov r0, #0
+ mov r12, #1
+
+ str r0, [r1, r2, lsl #2]
+ str r12, [r1, r3, lsl #2]
+ ldr r0, [r1, r2, lsl #2]
+
+ str r4, [r1]
+ str r5, [r1, #4]
+ ldr r0, [r1, r0, lsl #2]
#elif defined(__aarch64__)
- notimpl
+ mov x16, #1
+
+ str xzr, [x1, x2, lsl #3]
+ str x16, [x1, x3, lsl #3]
+ ldr x0, [x1, x2, lsl #3]
+
+ str x4, [x1]
+ str x5, [x1, #8]
+ ldr x0, [x1, x0, lsl #3]
#else
notimpl
#endif
+ ret
+
endproc
proc x2d
+ // clear the least significant set bit in a, by calculating a' =
+ // a AND (a - 1).
+ //
+ // if a = 0 then a' = 0. otherwise, a - 1 differs from a exactly in
+ // the least significant /set/ bit of a, and all bits of lesser
+ // significance. to put it another way: write a = u 2^{k+1} + 2^k;
+ // then a - 1 = u 2^{k+1} + 2^{k-1} + ... + 2 + 1. taking the
+ // bitwise AND of these leaves only the bits common to both, i.e.,
+ // u 2^{k+1}.
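+ //
+ // e.g., if a = 01101100 (base 2), then a - 1 = 01101011, and
+ // a AND (a - 1) = 01101000.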
+
#if defined(__x86_64__)
- notimpl
+ mov rdx, rax // d' = a
+ dec rax // a' = a - 1
+ and rax, rdx // a' = a AND (a - 1)
#elif defined(__i386__)
- notimpl
+ mov edx, eax // d' = a
+ dec eax // a' = a - 1
+ and eax, edx // a' = a AND (a - 1)
#elif defined(__arm__)
- notimpl
+ sub r3, r0, #1
+ and r0, r0, r3
#elif defined(__aarch64__)
- notimpl
+ sub x3, x0, #1
+ and x0, x0, x3
#else
notimpl
#endif
+ ret
+
endproc
proc x2e
+ // compute a mask of one bits in exactly the positions of the
+ // low-order run of zero bits in a
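+ //
+ // e.g., if a = 01011000 (base 2) then we want 00000111: we have
+ // a - 1 = 01010111 and a XOR (a - 1) = 00001111, and shifting
+ // right one place leaves 00000111.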
+
#if defined(__x86_64__)
- notimpl
+ mov rdx, rax // d' = a
+ dec rdx // d' = a - 1
+ xor rax, rdx // a = a XOR (a - 1)
+ // set bits are least significant
+ // set bit of a, and all bits of
+ // lesser significance
+ shr rax, 1 // now only bits of lesser
+ // significance; a' = 0 iff a odd
+ cmp rax, rdx // equal iff a = 2^k for some k;
+ // otherwise (including a = 0)
+ // strictly less, unsigned
#elif defined(__i386__)
- notimpl
+ mov edx, eax
+ dec edx
+ xor eax, edx
+ shr eax, 1
+ cmp eax, edx
#elif defined(__arm__)
- notimpl
+ sub r3, r0, #1
+ eor r0, r0, r3
+ mov r0, r0, lsr #1 // probably fold shift into next inst
+ cmp r0, r3
#elif defined(__aarch64__)
- notimpl
+ sub x3, x0, #1
+ eor x0, x0, x3
+ lsr x0, x0, #1 // probably fold shift into next inst
+ cmp x0, x3
#else
notimpl
#endif
+ ret
+
endproc
proc x2f
+ // a slow population count
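+ //
+ // (the official version is the loop at the end; the popcnt
+ // instruction and the divide-and-conquer version are here for
+ // comparison. the latter adds bit-pair counts, then nibble counts,
+ // and so on: e.g., the byte 11011001 turns into 10 01 01 01, then
+ // 0011 0010, and finally 00000101 = 5.)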
+
#if defined(__x86_64__)
- notimpl
+ popcnt rbx, rcx // the easy way
+
+ // a fast version in software, by divide and conquer
+ mov rax, rcx
+
+ mov rdx, rcx // split into even and odd bits
+ shr rdx, 1
+ mov rsi, 0x5555555555555555
+ and rax, rsi
+ and rdx, rsi
+ add rax, rdx // 2-bit counts in 2-bit fields
+
+ mov rdx, rax
+ shr rdx, 2
+ mov rsi, 0x3333333333333333
+ and rax, rsi
+ and rdx, rsi
+ add rax, rdx // 4-bit counts in 4-bit fields
+
+ mov rdx, rax // fold the top half onto the bottom
+ shr rdx, 32 // half; the nibble counts are at
+ add rax, rdx // most 8, so they don't overflow
+
+ mov rdx, rax
+ shr rdx, 4
+ and rax, 0x0f0f0f0f
+ and rdx, 0x0f0f0f0f
+ add rax, rdx // byte counts, each at most 16
+
+ mov rdx, rax // the counts now all fit in a
+ shr rdx, 8 // byte, so just pile them up
+ add rax, rdx
+
+ mov rdx, rax
+ shr rdx, 16
+ add rax, rdx
+ movzx rsi, al // mask off the total
+
+ // the official version: count iterations of x2d's trick for
+ // clearing the least significant set bit
+ xor eax, eax // clear iteration counter
+0: jrcxz 9f // bail if c = 0
+ inc rax // bump iteration count
+ mov rdx, rcx // d' = c
+ dec rdx // d' = c - 1
+ and rcx, rdx // zap least significant set bit of c
+ jmp 0b // and go again
+9:
#elif defined(__i386__)
- notimpl
+ popcnt ebx, ecx // the easy way
+
+ mov eax, ecx
+
+ mov edx, ecx
+ shr edx, 1
+ and eax, 0x55555555
+ and edx, 0x55555555
+ add eax, edx
+
+ mov edx, eax
+ shr edx, 2
+ and eax, 0x33333333
+ and edx, 0x33333333
+ add eax, edx
+
+ mov edx, eax
+ shr edx, 4
+ add eax, edx
+
+ mov edx, eax
+ shr edx, 8
+ and eax, 0x000f000f
+ and edx, 0x000f000f
+ add eax, edx
+
+ mov edx, eax
+ shr edx, 16
+ add eax, edx
+ movzx esi, al
+
+ xor eax, eax
+0: jecxz 9f
+ inc eax
+ mov edx, ecx
+ dec edx
+ and ecx, edx
+ jmp 0b
+9:
#elif defined(__arm__)
- notimpl
+ // the easy-ish way
+ vmov d0[0], r2 // (the top half of d0 is junk, but
+ // we only read lane 0 back)
+ vcnt.8 d0, d0 // per-byte population counts
+ vmov r1, d0[0]
+ add r1, r1, r1, lsl #8 // sum the byte counts into the top
+ add r1, r1, r1, lsl #16 // byte, and extract it
+ mov r1, r1, lsr #24
+
+ // the hard way
+ movw r12, #0x5555
+ movt r12, #0x5555
+ and r3, r12, r2, lsr #1
+ and r0, r12, r2
+ add r0, r0, r3
+
+ movw r12, #0x3333
+ movt r12, #0x3333
+ and r3, r12, r0, lsr #2
+ and r0, r12, r0
+ add r0, r0, r3
+
+ add r0, r0, r0, lsl #16
+
+ movt r12, #0x0f0f // (the bottom half of r12 is still
+ // 0x3333; the stray bits this lets
+ // through can't carry into the top
+ // byte, which is all we extract)
+ and r3, r12, r0, lsr #4
+ and r0, r12, r0
+ add r0, r0, r3
+
+ add r0, r0, r0, lsl #8
+
+ mov r4, r0, lsr #24
+
+ // and following the exercise
+ mov r0, #0
+ cmp r2, #0
+ beq 9f
+0: add r0, r0, #1
+ sub r3, r2, #1
+ ands r2, r2, r3
+ bne 0b
+9:
#elif defined(__aarch64__)
- notimpl
+ // the easy-ish way
+ mov v0.d[0], x2
+ cnt v0.8b, v0.8b
+ mov x1, v0.d[0]
+ add x1, x1, x1, lsl #8
+ add x1, x1, x1, lsl #16
+ add x1, x1, x1, lsl #32
+ lsr x1, x1, #56
+
+ // the hard way -- though arm64's immediate constant encodings and
+ // shifting make this actually rather pleasant.
+ and x3, x2, #0xaaaaaaaaaaaaaaaa
+ and x0, x2, #0x5555555555555555
+ add x0, x0, x3, lsr #1
+
+ and x3, x0, #0xcccccccccccccccc
+ and x0, x0, #0x3333333333333333
+ add x0, x0, x3, lsr #2
+
+ add x0, x0, x0, lsr #4
+
+ and x3, x0, #0x0f000f000f000f00
+ and x0, x0, #0x000f000f000f000f
+ add x0, x3, x0, lsl #8 // align the two sets of counts and
+ // add: byte-pair counts land in the
+ // top byte of each 16-bit lane
+
+ add x0, x0, x0, lsl #16
+ add x0, x0, x0, lsl #32
+ lsr x4, x0, #56
+
+ // and the official way
+ mov x0, #0
+ cbz x2, 9f
+0: add x0, x0, #1
+ sub x3, x2, #1
+ and x2, x2, x3
+ cbnz x2, 0b
+9:
#else
notimpl
#endif
+ ret
+
endproc
///--------------------------------------------------------------------------