chiark - git - mdw - xchg-rax-rax/blob - xchg.S

   1 /// -*- mode: asm; asm-comment-char: 0 -*-
   2
   3 ///--------------------------------------------------------------------------
   4 /// Preliminaries.
   5
   6 #include <sys/syscall.h>
   7
   8 #if defined(__i386__) || defined(__x86_64__)
   9
             // x86: intel operand order (destination first), bare register names
  10         .intel_syntax noprefix
  11
  12 #elif defined(__arm__)
  13
     // a32 has no ret mnemonic: provide one which branches to the link register
     // (r14), so that procedures can end the same way on every architecture
  14 .macro  ret
  15         bx      r14
  16 .endm
  17
  18         .arch   armv7-a
  19
  20 #elif defined(__aarch64__)
  21
     // cmov rd, rn, cc: set rd = rn if condition cc holds; otherwise rd keeps
     // its value
  22 .macro  cmov    rd, rn, cc
  23         csel    \rd, \rn, \rd, \cc
  24 .endm
     // condition-suffix forms: for every instruction in _INST and every
     // condition in _COND, define a macro INST.CC which expands to the plain
     // instruction with CC appended as its final operand; e.g.,
     // cset.ne x1 -> cset x1, ne.  _definstvar is a temporary helper which is
     // purged again after each instruction has been handled.
  25 #define _COND(_)                                                        \
  26         _(eq) _(ne) _(cs) _(cc) _(vs) _(vc) _(mi) _(pl)                 \
  27         _(ge) _(lt) _(gt) _(le) _(hi) _(ls) _(al) _(nv)                 \
  28         _(hs) _(lo)
  29 #define _INST(_)                                                        \
  30         _(ccmp) _(ccmn)                                                 \
  31         _(csel) _(cmov)                                                 \
  32         _(csinc) _(cinc) _(cset)                                        \
  33         _(csneg) _(cneg)                                                \
  34         _(csinv) _(cinv) _(csetm)
  35 #define _CONDVAR(cc) _definstvar cc;
  36 #define _INSTVARS(inst)                                                 \
  37         .macro _definstvar cc;                                          \
  38           .macro inst.\cc args:vararg; inst \args, \cc; .endm;          \
  39         .endm;                                                          \
  40         _COND(_CONDVAR);                                                \
  41         .purgem _definstvar;
  42         _INST(_INSTVARS)
  43 #undef _COND
  44 #undef _INST
  45 #undef _CONDVAR
  46 #undef _INSTVARS
  47
     // individual flag bits of the immediate nzcv operand taken by ccmp/ccmn
  48 #define CCMP_N 8
  49 #define CCMP_Z 4
  50 #define CCMP_C 2
  51 #define CCMP_V 1
  52
     // CCMP_cc: a flags immediate under which condition cc holds, for use as
     // the `otherwise' flags of a conditional compare
  53 #define CCMP_MI CCMP_N
  54 #define CCMP_PL 0
  55 #define CCMP_EQ CCMP_Z
  56 #define CCMP_NE 0
  57 #define CCMP_CS CCMP_C
  58 #define CCMP_HS CCMP_C
  59 #define CCMP_CC 0
  60 #define CCMP_LO 0
  61 #define CCMP_VS CCMP_V
  62 #define CCMP_VC 0
  63 #define CCMP_HI CCMP_C
  64 #define CCMP_LS 0
  65 #define CCMP_LT CCMP_N
  66 #define CCMP_GE 0
  67 #define CCMP_LE CCMP_N
  68 #define CCMP_GT 0
  69
  70 #else
  71 #  error "not supported"
  72 #endif
  73
  74 .macro  proc    name
             // begin the global function NAME: export the symbol, mark it as a
             // function, align it to a 16-byte boundary and place its label.
  75         .globl  \name
  76         .type   \name, STT_FUNC
  77         .p2align 4
  78 \name\():
             // define the matching endproc, which records the size of NAME and
             // then purges itself so that the next proc can define it afresh.
  79   .macro endproc
  80         .size   \name, . - \name
  81         .purgem endproc
  82   .endm
  83 .endm
  84
  85 .macro ch c
             // debugging aid: write the character C to stdout (putchar) and
             // flush it (fflush).  the registers saved below and the condition
             // flags are restored before the macro ends; the stack pointer is
             // realigned for the calls and put back afterwards.
  86 #if defined(__i386__)
  87
  88         pushf
  89         push    eax
  90         push    ebx
  91         push    ecx
  92         push    edx
  93         push    ebp
  94         mov     ebp, esp                // remember the unaligned esp
  95         and     esp, -16
             sub     esp, 12                 // with the push below, esp is
                                             // 16-byte aligned at both calls
  96
  97         push    \c                      // argument slot: the character
             call    get_pc_ebx              // ebx = address of the add below
             add     ebx, offset _GLOBAL_OFFSET_TABLE_ // ebx -> got, as calls
                                             // through the plt require
  98         call    putchar@plt
  99
 102         mov     eax, [ebx + stdout@GOT] // eax -> stdout
 103         mov     eax, [eax]              // eax = stdout
             mov     [esp], eax              // reuse the argument slot
 104         call    fflush@plt
 105
 106         mov     esp, ebp
 107         pop     ebp
 108         pop     edx
 109         pop     ecx
 110         pop     ebx
 111         pop     eax
 112         popf
 113
 114 #elif defined(__x86_64__)
 115
 116         pushf
 117         push    rax
 118         push    rcx
 119         push    rdx
 120         push    rsi
 121         push    rdi
 122         push    r8
 123         push    r9
             push    r10                     // caller-saved too: the callees
             push    r11                     // are free to clobber these
 124         push    rbp
 125         mov     rbp, rsp                // remember the unaligned rsp
 126         and     rsp, -16
 127
 128         mov     rdi, \c
 129         call    putchar@plt
 130
 131         mov     rdi, [rip + stdout]
 132         call    fflush@plt
 133
 134         mov     rsp, rbp
 135         pop     rbp
             pop     r11
             pop     r10
 136         pop     r9
 137         pop     r8
 138         pop     rdi
 139         pop     rsi
 140         pop     rdx
 141         pop     rcx
 142         pop     rax
 143         popf
 144
 145 #elif defined(__arm__)
 146
 147         stmfd   r13!, {r0-r5, r12, r14}
             mrs     r5, cpsr                // keep the flags in callee-saved
                                             // r5 across the calls
 148
 149         mov     r4, r13                 // remember the unaligned sp
 150         bic     r14, r4, #15
 151         mov     r13, r14
 152
 153         mov     r0, #\c
 154         bl      putchar@plt
 155
 156         ldr     r14, .L$_c$gotoff$\@
 157 .L$_c$gotpc$\@:
 158         add     r14, pc, r14            // r14 -> got (pc reads as . + 8)
 159         b       .L$_c$cont$\@
 160 .L$_c$gotoff$\@:
 161         .word   _GLOBAL_OFFSET_TABLE_ - .L$_c$gotpc$\@ - 8
     .L$_c$stdout$\@:
             .word   stdout(GOT)             // offset of the stdout got entry
 162 .L$_c$cont$\@:
             ldr     r0, .L$_c$stdout$\@
             ldr     r0, [r14, r0]           // r0 -> stdout
             ldr     r0, [r0]                // r0 = stdout
 163         bl      fflush@plt
 164
             msr     cpsr_f, r5              // put the flags back
 165         mov     r13, r4
 166         ldmfd   r13!, {r0-r5, r12, r14}
 167
 168 #elif defined(__aarch64__)
 169
             // frame: x0--x17 at 0--143, nzcv image at 144, x30 at 152
 170         sub     sp, sp, #20*8
 171         stp      x0,  x1, [sp,   #0]
 172         stp      x2,  x3, [sp,  #16]
 173         stp      x4,  x5, [sp,  #32]
 174         stp      x6,  x7, [sp,  #48]
 175         stp      x8,  x9, [sp,  #64]
 176         stp     x10, x11, [sp,  #80]
 177         stp     x12, x13, [sp,  #96]
 178         stp     x14, x15, [sp, #112]
 179         stp     x16, x17, [sp, #128]
 180         mrs     x16, nzcv
 181         stp     x16, x30, [sp, #144]
 182
 183         mov     w0, #\c
 184         bl      putchar
 185         adrp    x0, :got:stdout
 186         ldr     x0, [x0, #:got_lo12:stdout] // x0 -> stdout
 187         ldr     x0, [x0]                // x0 = stdout
 188         bl      fflush
 189
 190         ldp     x16, x30, [sp, #144]
 191         msr     nzcv, x16
 192         ldp     x16, x17, [sp, #128]
 193         ldp     x14, x15, [sp, #112]
 194         ldp     x12, x13, [sp,  #96]
 195         ldp     x10, x11, [sp,  #80]
 196         ldp      x8,  x9, [sp,  #64]
 197         ldp      x6,  x7, [sp,  #48]
 198         ldp      x4,  x5, [sp,  #32]
 199         ldp      x2,  x3, [sp,  #16]
 200         ldp      x0,  x1, [sp,   #0]
 201         add     sp, sp, #20*8
 202
 203 #else
 204 #  error "not supported"
 205 #endif
 206 .endm
 207
 208 .macro  notimpl
             // placeholder body for an architecture with no implementation:
             // emit a single instruction which is presumably meant to fault if
             // it is ever executed (undefined-instruction encodings on x86 and
             // a32, hlt on a64).
 209 #if defined(__i386__) || defined(__x86_64__)
 210         ud2
 211 #elif defined(__arm__)
 212         udf
 213 #elif defined(__aarch64__)
 214         hlt     #0
 215 #else
 216 #  error "not supported"
 217 #endif
 218 .endm
 219
 220         .section .note.GNU-stack, "", %progbits
 221
 222         .text
 223
 224 #if defined(__i386__)
     // return with ebx = the return address, i.e., the address of the
     // instruction following the call.  used to form pc-relative addresses.
 225 get_pc_ebx:
 226         mov     ebx, [esp]
 227         ret
 228 #endif
 229
 230
 231 proc    call_example
 232
             // call FUNC with registers loaded from the block REGS, then store
             // the registers it came back with into REGS again.  arguments:
             // FUNC first, REGS second, in the usual c argument positions.
             //
             // the register slots of REGS are those given by the offsets used
             // below; the slot after the last register holds the flags.  on x86
             // only the flag bits in 0x0cd5 (cf, pf, af, zf, sf, df, of) are
             // taken from REGS.  stack-layout comments list the top of the
             // stack first.

 233 #if defined(__i386__)
 234
 235         push    ebx                     // ebx
 236         push    esi                     // esi, ebx
 237         push    edi                     // edi, esi, ebx
 238         push    ebp                     // ebp, ..., ebx
 239         pushf                           // flags, ebp, ..., ebx
 240
 241         mov     edi, [esp + 4*6]        // edi = func (above return addr)
 242         mov     esi, [esp + 4*7]        // esi = regs
 243         push    esi                     // regs, flags, ebp, ..., ebx
 244
 245         call    get_pc_ebx              // ebx = address of the lea
 246         lea     eax, [ebx + 9f - .]     // eax = address of continuation
 247         push    eax                     // cont, regs, flags, ebp, ..., ebx
 248         push    edi                 // func, cont, regs, flags, ebp, ..., ebx
 249
 250         mov     eax, [esi + 28]         // flags image from regs
 251         pushf
 252         pop     ecx                     // current flags
 253         and     eax,  0x0cd5            // arithmetic and direction flags
 254         and     ecx, ~0x0cd5            // everything else stays as it is
 255         or      eax, ecx
 256         push    eax
 257         popf
 258         mov     eax, [esi +  0]
 259         mov     ebx, [esi +  4]
 260         mov     ecx, [esi +  8]
 261         mov     edx, [esi + 12]
 262         mov     edi, [esi + 20]
 263         mov     ebp, [esi + 24]
 264         mov     esi, [esi + 16]         // last: esi was the base pointer
 265
 266         ret                            // -> func; regs, flags, ebp, ..., ebx
 267
 268 9:      pushf                           // eflags, regs, flags, ebp, ..., ebx
 269         push    esi                // esi, eflags, regs, flags, ebp, ..., ebx
 270         mov     esi, [esp + 8]          // esi = regs again
 271         mov     [esi +  0], eax
 272         mov     [esi +  4], ebx
 273         mov     [esi +  8], ecx
 274         mov     [esi + 12], edx
 275         mov     [esi + 20], edi
 276         mov     [esi + 24], ebp
 277         pop     eax                     // eflags, regs, flags, ebp, ..., ebx
 278         mov     [esi + 16], eax         // the esi which func left behind
 279         pop     eax                     // regs, flags, ebp, ..., ebx
 280         mov     [esi + 28], eax         // the flags which func left behind
 281
 282         add     esp, 4                  // flags, ebp, ..., ebx
 283         popf                            // ebp, ..., ebx
 284         pop     ebp                     // ..., ebx
 285         pop     edi
 286         pop     esi
 287         pop     ebx                     //
 288         ret
 289
 290 #elif defined(__x86_64__)
 291
 292         push    rbx                     // rbx
 293         push    r10
 294         push    r11
 295         push    r12
 296         push    r13
 297         push    r14
 298         push    r15
 299         push    rbp                     // rbp, ..., rbx
 300         pushf                           // flags, rbp, ..., rbx
 301
 302         push    rsi                     // regs, flags, rbp, ..., rbx
 303
 304         lea     rax, [rip + 9f]         // rax = address of continuation
 305         push    rax                     // cont, regs, flags, rbp, ..., rbx
 306         push    rdi                 // func, cont, regs, flags, rbp, ..., rbx
 307
 308         mov     rax, [rsi + 8*15]       // flags image from regs
 309         pushf
 310         pop     rcx                     // current flags
 311         and     rax,  0x0cd5            // arithmetic and direction flags
 312         and     rcx, ~0x0cd5            // everything else stays as it is
 313         or      rax, rcx
 314         push    rax
 315         popf
 316         mov     rax, [rsi +   0]
 317         mov     rbx, [rsi +   8]
 318         mov     rcx, [rsi +  16]
 319         mov     rdx, [rsi +  24]
 320         mov     rdi, [rsi +  40]
 321         mov     rbp, [rsi +  48]
 322         mov     r8,  [rsi +  56]
 323         mov     r9,  [rsi +  64]
 324         mov     r10, [rsi +  72]
 325         mov     r11, [rsi +  80]
 326         mov     r12, [rsi +  88]
 327         mov     r13, [rsi +  96]
 328         mov     r14, [rsi + 104]
 329         mov     r15, [rsi + 112]
 330         mov     rsi, [rsi +  32]        // last: rsi was the base pointer
 331
             // NOTE(review): eleven words lie between our return address and
             // the top of stack here, so func appears to start with rsp a
             // multiple of 16 rather than 8 (mod 16) -- confirm this is
             // acceptable for every func before relying on stack alignment.
 332         ret                            // -> func; regs, flags, rbp, ..., rbx
 333
 334 9:      pushf                           // rflags, regs, flags, rbp, ..., rbx
 335         push    rsi                // rsi, rflags, regs, flags, rbp, ..., rbx
 336         mov     rsi, [rsp + 16]         // rsi = regs again
 337         mov     [rsi +   0], rax
 338         mov     [rsi +   8], rbx
 339         mov     [rsi +  16], rcx
 340         mov     [rsi +  24], rdx
 341         mov     [rsi +  40], rdi
 342         mov     [rsi +  48], rbp
 343         mov     [rsi +  56],  r8
 344         mov     [rsi +  64],  r9
 345         mov     [rsi +  72], r10
 346         mov     [rsi +  80], r11
 347         mov     [rsi +  88], r12
 348         mov     [rsi +  96], r13
 349         mov     [rsi + 104], r14
 350         mov     [rsi + 112], r15
 351         pop     rax                     // rflags, regs, flags, rbp, ..., rbx
 352         mov     [rsi +  32], rax        // the rsi which func left behind
 353         pop     rax                     // regs, flags, rbp, ..., rbx
 354         mov     [rsi + 120], rax        // the flags which func left behind
 355
 356         add     rsp, 8                  // flags, rbp, ..., rbx
 357         popf                            // rbp, ..., rbx
 358         pop     rbp                     // ..., rbx
 359         pop     r15
 360         pop     r14
 361         pop     r13
 362         pop     r12
 363         pop     r11
 364         pop     r10
 365         pop     rbx                     //
 366         ret
 367
 368 #elif defined(__arm__)
 369
             // stack after the first instruction, lowest address first: func,
             // regs, r4--r11, return address.  regs holds r0--r12 followed by
             // the flags word.
 370         stmfd   r13!, {r0, r1, r4-r11, r14}
 371         ldmia   r1, {r0-r12, r14}       // r14 = flags word for now
 372         msr     cpsr, r14
 373         mov     r14, pc                 // return to the ldr r14 below
 374         ldr     pc, [r13], #4           // pop func and jump to it
 375         ldr     r14, [r13], #4          // pop regs
 376         stmia   r14!, {r0-r12}
 377         mrs     r0, cpsr
 378         str     r0, [r14]               // flags word follows r12
 379         ldmfd   r13!, {r4-r11, pc}      // restore and return
 380
 381 #elif defined(__aarch64__)
 382
             // frame: x29, x30 at 0; x19--x28 at 16--95; regs pointer at 96.
             // that is 13 words, but the frame is 14 words long so that sp
             // stays 16-byte aligned for our own sp-relative accesses and for
             // func.
 383         stp     x29, x30, [sp, #-14*8]!
 384         mov     x29, sp
 385         stp     x19, x20, [sp,  #16]
 386         stp     x21, x22, [sp,  #32]
 387         stp     x23, x24, [sp,  #48]
 388         stp     x25, x26, [sp,  #64]
 389         stp     x27, x28, [sp,  #80]
 390         str     x1, [sp, #96]
 391
 392         mov     x16, x0                 // func: x0 is about to be loaded
 393
 394         ldr     x17,      [x1, #128]    // flags image
 395         ldp     x14, x15, [x1, #112]
 396         ldp     x12, x13, [x1,  #96]
 397         ldp     x10, x11, [x1,  #80]
 398         ldp      x8,  x9, [x1,  #64]
 399         ldp      x6,  x7, [x1,  #48]
 400         ldp      x4,  x5, [x1,  #32]
 401         ldp      x2,  x3, [x1,  #16]
 402         ldp      x0,  x1, [x1,   #0]    // last: x1 was the base pointer
 403         msr     nzcv, x17
 404
 405         blr     x16
 406
 407         ldr     x16, [sp, #96]          // x16 = regs again
 408         mrs     x17, nzcv
 409         str     x17,      [x16, #128]
 410         stp     x14, x15, [x16, #112]
 411         stp     x12, x13, [x16,  #96]
 412         stp     x10, x11, [x16,  #80]
 413         stp      x8,  x9, [x16,  #64]
 414         stp      x6,  x7, [x16,  #48]
 415         stp      x4,  x5, [x16,  #32]
 416         stp      x2,  x3, [x16,  #16]
 417         stp      x0,  x1, [x16,   #0]
 418
 419         ldp     x19, x20, [sp,  #16]
 420         ldp     x21, x22, [sp,  #32]
 421         ldp     x23, x24, [sp,  #48]
 422         ldp     x25, x26, [sp,  #64]
 423         ldp     x27, x28, [sp,  #80]
 424         ldp     x29, x30, [sp], #14*8
 425
 426         ret
 427
 428 #else
 429 #  error "not supported"
 430 #endif
 431
 432 endproc
 433
 434 proc    nop
 435
             // do nothing: return to the caller at once.  (on a32, ret is the
             // macro defined in the preliminaries.)
 436         ret
 437
 438 endproc
 439
 440 ///--------------------------------------------------------------------------
 441 /// 0x00--0x0f
 442
 443 proc    x00
 444
 445         // clear all 64 bits of extended traditional registers
             //
             // each register is zeroed by a different idiom.  the a32 and a64
             // versions use r0--r5 and w0--w5 where x86 uses a, b, c, d, si, di.
 446
 447 #if defined(__x86_64__)
 448
 449         xor      eax, eax               // clear rax
 450         lea      rbx, [0]               // rbx -> _|_
 451         loop     .                      // iterate, decrement rcx until zero
 452         mov      rdx, 0                 // set rdx = 0
 453         and      esi, 0                 // clear all bits of rsi
 454         sub      edi, edi               // set rdi = edi - edi = 0
 455         push     0
 456         pop      rbp                    // pop 0 into rbp
 457
 458 #elif defined(__i386__)
 459
 460         xor     eax, eax                // a xor a = 0
 461         lea     ebx, [0]                // address of location zero
 462         loop    .                       // decrement ecx until zero
 463         mov     edx, 0
 464         and     esi, 0
 465         sub     edi, edi
 466         push    0
 467         pop     ebp                     // pop 0 into ebp
 468
 469 #elif defined(__arm__)
 470
 471         eor     r0, r0, r0              // a xor a = 0
 472         rsb     r1, r1, r1              // b - b = 0, reversed
 473 0:      subs    r2, r2, #1              // count c down ...
 474         bne     0b                      // ... until it reaches zero
 475         mov     r3, #0
 476         and     r4, r4, #0
 477         sub     r5, r5, r5
 478
 479 #elif defined(__aarch64__)
 480
             // writing a w register also clears the top half of the x register
 481         eor     w0, w0, w0
 482         mov     w1, wzr
 483 0:      sub     w2, w2, #1              // count c down ...
 484         cbnz    w2, 0b                  // ... until it reaches zero
 485         mov     w3, #0
 486         and     w4, w4, wzr
 487         sub     w5, w5, w5
 488
 489 #else
 490         notimpl
 491 #endif
 492
 493         ret
 494
 495 endproc
 496
 497 proc    x01
 498
 499         // advance a fibonacci pair by c steps
 500         //
 501         // on entry, a and d are f_{i+1} and f_i; on exit, they are f_{i+c+1}
 502         // and f_{i+c}, where f_{i+1} = f_i + f_{i-1}
             //
             // the a32 and a64 versions have a in r0/x0, c in r2/x2 and d in
             // r3/x3, and take two steps per iteration.
 503
 504 #if defined(__x86_64__)
 505
 506 0:      xadd    rax, rdx                // a, d = a + d, a
 507                                         //      = f_{i+1} + f_i, f_{i+1}
 508                                         //      = f_{i+2}, f_{i+1}
 509         loop    0b                      // advance i, decrement c, iterate
 510
 511 #elif defined(__i386__)
 512
 513 0:      xadd    eax, edx                // a, d = a + d, a
 514         loop    0b                      // decrement c, iterate
 515
 516 #elif defined(__arm__)
 517
 518 0:      subs    r2, r2, #2              // c -= 2; flags used three times
 519         add     r3, r3, r0              // d += a: first step
 520         blo     8f                      // c was below 2: stop half way
 521         add     r0, r0, r3              // a += d: second step
 522         bhi     0b                      // more to do if c is still nonzero
 523
             // ne here only if we arrived by the blo: the newest value is then
             // in d, so copy it to a.
             // NOTE(review): in that case d ends equal to a rather than
             // holding f_{i+c} as the x86 version leaves it -- confirm callers
             // only look at a.
 524 8:      movne   r0, r3
 525
 526 #elif defined(__aarch64__)
 527
 528 0:      subs    x2, x2, #2              // c -= 2; flags used three times
 529         add     x3, x3, x0              // d += a: first step
 530         b.lo    8f                      // c was below 2: stop half way
 531         add     x0, x0, x3              // a += d: second step
 532         b.hi    0b                      // more to do if c is still nonzero
 533
 534 8:      cmov.ne x0, x3                  // ne only via b.lo: answer is in d
 535
 536 #else
 537         notimpl
 538 #endif
 539
 540         ret
 541
 542 endproc
 543
 544 proc    x02
 545
 546         // boolean canonify a: if a = 0 on entry, leave it zero; otherwise
 547         // set a = 1
             //
             // the a32 and a64 versions compute the same answer four ways, into
             // r1/x1, r2/x2, r3/x3 and finally r0/x0 itself.
 548
 549 #if defined(__x86_64__)
 550
 551         neg     rax                     // set cf iff a /= 0
 552         sbb     rax, rax                // a = a - a - cf = -cf
 553         neg     rax                     // a = cf
 554
 555 #elif defined(__i386__)
 556
 557         neg     eax                     // set cf iff a /= 0
 558         sbb     eax, eax                // a = -cf
 559         neg     eax                     // a = cf
 560
 561 #elif defined(__arm__)
 562
 563         movs    r1, r0                  // the easy way
 564         movne   r1, #1                  // mvnne r1, #0 for mask
 565
 566         cmp     r0, #1                  // clear cf iff a == 0
 567         sbc     r2, r0, r0              // c' = a - a - 1 + cf = cf - 1
 568         add     r2, r2, #1              // c' = cf
 569
 570         sub     r3, r0, r0, lsr #1      // d' top bit clear; d' = 0 iff a = 0
 571         rsb     r3, r3, #0              // d' top bit set iff a /= 0
 572         mov     r3, r3, lsr #31         // asr for mask
 573
             // literal translation of the x86 sequence; a32 carry is the
             // inverse of x86 borrow
 574         rsbs    r0, r0, #0
 575         sbc     r0, r0, r0
 576         rsb     r0, r0, #0
 577
 578 #elif defined(__aarch64__)
 579
 580         cmp     x0, #0                  // trivial
 581         cset.ne x1                      // csetm for mask
 582
 583         cmp     xzr, x0                 // set cf iff a == 0
 584         sbc     x2, x0, x0              // c' = a - a - 1 + cf = cf - 1
 585         neg     x2, x2                  // c' = 1 - cf
 586
 587         sub     x3, x0, x0, lsr #1      // if a < 2^63 then d' = ceil(a/2) <
 588                                         // 2^63
 589                                         // if a >= 2^63, write a = 2^63 + t
 590                                         // with t < 2^63; d' = 2^63 - 2^62 +
 591                                         // ceil(t/2) = 2^62 + ceil(t/2), and
 592                                         // ceil(t/2) < 2^62
 593                                         // anyway d' < 2^63 and d' = 0 iff
 594                                         // a = 0
 595         neg     x3, x3                  // d' top bit set iff a /= 0
 596         lsr     x3, x3, #63             // asr for mask
 597
 598         cmp     x0, #1                  // set cf iff a /= 0
 599         adc     x0, xzr, xzr            // a' = 0 + 0 + cf = cf
 600
 601 #else
 602         notimpl
 603 #endif
 604
 605         ret
 606
 607 endproc
 608
 609 proc    x03
 610
 611         // set a = min(a, d) (unsigned); clobber c, d
             //
             // the a32 and a64 versions have a in r0/x0 and d in r3/x3; they
             // first put the minimum in r1/x1 by comparison, then repeat the
             // branch-free x86 method using r12/x16 as the temporary.
 612
 613 #if defined(__x86_64__)
 614
 615         sub     rdx, rax                // d' = d - a; set cf if a > d
 616         sbb     rcx, rcx                // c = -cf = -[a > d]
 617         and     rcx, rdx                // c = a > d ? d - a : 0
 618         add     rax, rcx                // a' = a > d ? d : a
 619
 620 #elif defined(__i386__)
 621
 622         sub     edx, eax                // d' = d - a; set cf if a > d
 623         sbb     ecx, ecx                // c = -[a > d]
 624         and     ecx, edx                // c = a > d ? d - a : 0
 625         add     eax, ecx                // a' = a > d ? d : a
 626
 627 #elif defined(__arm__)
 628
 629         cmp     r0, r3                  // the easy way
 630         movlo   r1, r0                  // only needed for out-of-place
 631         movhs   r1, r3
 632
 633         subs    r3, r3, r0              // d' = d - a; set cf if d >= a
 634         sbc     r12, r12, r12           // t = -1 + cf = -[a > d]
 635         and     r12, r12, r3            // t = a > d ? d - a : 0
 636         add     r0, r0, r12             // a' = a > d ? d : a
 637
 638 #elif defined(__aarch64__)
 639
 640         cmp     x0, x3                  // the easy way
 641         csel.lo x1, x0, x3
 642
 643         subs    x3, x3, x0              // d' = d - a; set cf if d >= a
 644         sbc     x16, xzr, xzr           // t = -1 + cf = -[a > d]
 645         and     x16, x16, x3            // t = a > d ? d - a : 0
 646         add     x0, x0, x16             // a' = a > d ? d : a
 647
 648 #else
 649         notimpl
 650 #endif
 651
 652         ret
 653
 654 endproc
 655
 656 proc    x04
 657
 658         // switch case?
             //
             // the exercise itself is the final instruction of each version:
             // flip bit 5 (0x20) of the low byte of a.  the `unrelated playing'
             // before it decodes a as a hexadecimal digit: b (r1/x1 on arm) is
             // set to the digit value, or left as -1 if a is not 0-9, a-f or
             // A-F.
 659
 660 #if defined(__x86_64__)
 661
 662   // unrelated playing
 663   mov   ecx, eax
 664   mov   rbx, -1
 665   mov   edx, ecx
 666   sub   edx, '0'
 667   cmp   edx, 10
 668   cmovb rbx, rdx
 669   or    ecx, 0x20
 670   mov   edx, ecx
 671   sub   edx, 'a'
 672   sub   ecx, 'a' - 10
 673   cmp   edx, 6
 674   cmovb rbx, rcx
 675
 676         xor     al, 0x20
 677
 678 #elif defined(__i386__)
 679
 680   // unrelated playing
 681   mov   ecx, eax
 682   mov   ebx, -1
 683   mov   edx, ecx
 684   sub   edx, '0'
 685   cmp   edx, 10
 686   cmovb ebx, edx
 687   or    ecx, 0x20
 688   mov   edx, ecx
 689   sub   edx, 'a'
 690   sub   ecx, 'a' - 10
 691   cmp   edx, 6
 692   cmovb ebx, ecx
 693
 694         xor     al, 0x20
 695
 696 #elif defined(__arm__)
 697
 698   // unrelated playing
 699   mvn   r1, #0
 700   sub   r12, r0, #'0'
 701   cmp   r12, #10
 702   movlo r1, r12
 703   orr   r12, r0, #0x20
 704   sub   r12, r12, #'a'
 705   cmp   r12, #6
 706   addlo r1, r12, #10
 707
 708         eor     r0, r0, #0x20
 709
 710 #elif defined(__aarch64__)
 711
 712   // unrelated playing
 713   mov   x1, #-1
 714   sub   w16, w0, #'0'
 715   cmp   w16, #10
 716   cmov.lo       x1, x16
 717   orr   w16, w0, #0x20
 718   sub   w16, w16, #'a' - 10
     // range check 10 <= w16 < 16 in one go: if w16 >= 10 then compare it with
     // 16, otherwise force the flags to `hs' so that the lo test below fails
 719   cmp   w16, #10
 720   ccmp.hs       w16, #16, #CCMP_HS
 721   cmov.lo       x1, x16
 722
 723         eor     w0, w0, #0x20
 724
 725 #else
 726         notimpl
 727 #endif
 728
 729         ret
 730
 731 endproc
 732
 733 proc    x05
 734
 735         // answer whether 5 <= a </<= 9.
             //
             // the answer is left in the flags: the caller picks the strict or
             // non-strict upper bound by the condition it then tests, as the
             // table below sets out.
 736
 737 #if defined(__x86_64__)
 738
 739         sub     rax, 5                  // a' = a - 5
 740         cmp     rax, 4                  // is a' = a - 5 </<= 4?
 741
 742         // cc           a'                      a
 743         //
 744         // z/e          a' = 4                  a = 9
 745         // nz/ne        a' /= 4                 a /= 9
 746         //
 747         // a/nbe        a' > 4                  a > 9 or a < 5
 748         // nc/ae/nb     a' >= 4                 a >= 9 or a < 5
 749         // c/b/nae      a' < 4                  5 <= a < 9
 750         // be/na        a' <= 4                 5 <= a <= 9
 751         //
 752         // o            a' < -2^63 + 4          -2^63 + 5 <= a < -2^63 + 9
 753         // no           a' >= -2^63 + 4         a >= -2^63 + 9 or
 754         //                                              a < -2^63 + 5
 755         // s            -2^63 + 4 <= a' < 4     -2^63 + 9 <= a < 9
 756         // ns           a' < -2^63 + 4 or       a < -2^63 + 9 or a >= 9
 757         //                      a' >= 4
 758         // ge/nl        a' >= 4                 a >= 9 or a < -2^63 + 5
 759         // l/nge        a' < 4                  -2^63 + 5 <= a < 9
 760         // g/nle        a' > 4                  a > 9 or a < -2^63 + 5
 761         // le/ng        a' <= 4                 -2^63 + 5 <= a <= 9
 762
 763 #elif defined(__i386__)
 764
 765         sub     eax, 5                  // a' = a - 5
 766         cmp     eax, 4                  // is a' </<= 4?
 767
 768 #elif defined(__arm__)
 769
 770         // i dimly remember having a slick way to do this way back in the
 771         // day, but i can't figure it out any more.
 772         sub     r0, #5
 773         cmp     r0, #4
 774
 775 #elif defined(__aarch64__)
 776
 777         // literal translation is too obvious
             // if a >= 5 then compare a with 9; otherwise force the flags to
             // `hs', so that neither lo nor ls holds.  a is left unchanged.
 778         cmp     x0, #5
 779         ccmp.hs x0, #9, #CCMP_HS
 780
 781 #else
 782         notimpl
 783 #endif
 784
 785         ret
 786
 787 endproc
 788
 789 proc    x06
 790
 791         // leave a unchanged, but set zf if a = 0, cf if a /= 0, clear of,
 792         // set sf to msb(a)
             //
             // complement-and-increment forms -a; the final negation restores
             // a and is the instruction which leaves the flags.
 793
 794 #if defined(__x86_64__)
 795
 796         not     rax                     // a' = -a - 1
 797         inc     rax                     // a' = -a
 798         neg     rax                     // a' = a
 799
 800 #elif defined(__i386__)
 801
 802         not     eax                     // a' = -a - 1
 803         inc     eax                     // a' = -a
 804         neg     eax                     // a' = a
 805
 806 #elif defined(__arm__)
 807
 808         mvn     r0, r0                  // a' = -a - 1
 809         add     r0, r0, #1              // a' = -a
 810         rsbs    r0, r0, #0              // cf has opposite sense
 811
 812 #elif defined(__aarch64__)
 813
 814         mvn     x0, x0                  // a' = -a - 1
 815         add     x0, x0, #1              // a' = -a
 816         negs    x0, x0                  // cf has opposite sense
 817
 818 #else
 819         notimpl
 820 #endif
 821
 822         ret
 823
 824 endproc
 825
 826 proc    x07
 827
 828         // same as before (?)
             //
             // a is again returned unchanged, this time by two rounds of
             // increment-and-negate; the final negation leaves the flags.
 829
 830 #if defined(__x86_64__)
 831
 832         inc     rax                     // a' = a + 1
 833         neg     rax                     // a' = -a - 1
 834         inc     rax                     // a' = -a
 835         neg     rax                     // a' = a
 836
 837 #elif defined(__i386__)
 838
 839         inc     eax                     // a' = a + 1
 840         neg     eax                     // a' = -a - 1
 841         inc     eax                     // a' = -a
 842         neg     eax                     // a' = a
 843
 844 #elif defined(__arm__)
 845
 846         add     r0, r0, #1              // a' = a + 1
 847         rsb     r0, r0, #0              // a' = -a - 1
 848         add     r0, r0, #1              // a' = -a
 849         rsbs    r0, r0, #0              // a' = a; only this sets flags
 850
 851 #elif defined(__aarch64__)
 852
 853         add     x0, x0, #1              // a' = a + 1
 854         neg     x0, x0                  // a' = -a - 1
 855         add     x0, x0, #1              // a' = -a
 856         negs    x0, x0                  // cf has opposite sense
 857
 858 #else
 859         notimpl
 860 #endif
 861
 862         ret
 863
 864 endproc
 865
 866 proc    x08
 867
 868         // floor((a + d)/2), correctly handling overflow conditions; final cf
 869         // is lsb(a + d), probably uninteresting
             //
             // the a32 and a64 versions have a in r0/x0 and d in r3/x3, and
             // compute the answer twice: once into r1/x1 and once into r0/x0.
 870
 871 #if defined(__x86_64__)
 872
 873         add     rax, rdx                // cf || a' = a + d
 874         rcr     rax, 1                  // shift 65-bit result right by one
 875                                         // place; lsb moves into carry
 876
 877 #elif defined(__i386__)
 878
 879         add     eax, edx                // cf || a' = a + d
 880         rcr     eax, 1                  // shift 33-bit result right by one
 881
 882 #elif defined(__arm__)
 883
 884         // like the two-instruction a64 version
 885         sub     r1, r3, r0              // compute difference
 886         add     r1, r0, r1, lsr #1      // add half of it (rounded down)
 887
 888         // the slick version, similar to the above
 889         adds    r0, r0, r3              // cf || a' = a + d
 890         mov     r0, r0, rrx             // shift right one place through cf
 891
 892 #elif defined(__aarch64__)
 893
 894         // a64 lacks a32's rrx.  literal translation.
 895         adds    x1, x0, x3              // cf || a' = a + d
 896         adc     x16, xzr, xzr           // realize cf in extra register
 897         extr    x1, x16, x1, #1         // shift down one place
 898
 899         // two instruction version: clobbers additional register.  (if you
 900         // wanted the answer in any other register, even overwriting d, then
 901         // this is unnecessary.)  also depends on d >= a.
 902         sub     x16, x3, x0             // compute difference
 903         add     x0, x0, x16, lsr #1     // add half of it (rounded down)
 904
 905 #else
 906         notimpl
 907 #endif
 908
 909         ret
 910
 911 endproc
 912
 913 proc    x09
 914
 915         // a = a/8, rounded to nearest; i.e., floor(a/8) if a == 0, 1, 2, 3
 916         // (mod 8), or ceil(a/8) if a == 4, 5, 6, 7 (mod 8).
             //
             // the last bit shifted out (bit 2 of a) is the rounding bit; it is
             // added back on to the quotient.
 917
 918 #if defined(__x86_64__)
 919
 920         shr     rax, 3                  // a' = floor(a/8); cf = 1 if a ==
 921                                         // 4, 5, 6, 7 (mod 8)
 922         adc     rax, 0                  // a' = floor(a/8) + cf
 923
 924 #elif defined(__i386__)
 925
 926         shr     eax, 3                  // a' = floor(a/8); cf = bit 2 of a
 927         adc     eax, 0                  // a' = floor(a/8) + cf
 928
 929 #elif defined(__arm__)
 930
 931         movs    r0, r0, lsr #3          // a' = floor(a/8); cf = bit 2 of a
 932         adc     r0, r0, #0              // a' = floor(a/8) + cf
 933
 934 #elif defined(__aarch64__)
 935
             // a64 shifts do not set the carry, so test the rounding bit first
 936         tst     x0, #4                  // ne iff bit 2 of a is set
 937         orr     x0, xzr, x0, lsr #3     // a' = floor(a/8); flags untouched
 938         cinc.ne x0, x0                  // add one if the rounding bit was set
 939
 940 #else
 941         notimpl
 942 #endif
 943
 944         ret
 945
 946 endproc
 947
 948 proc    x0a
 949
 950         // increment c-byte little-endian bignum at rdi
             //
             // the a32 and a64 versions take the pointer in r5/x5 and the count
             // in r2/x2.
             // NOTE(review): the x86 loops add 1 to the byte at di and then run
             // c times over the following bytes, touching c + 1 bytes in all,
             // whereas the arm loops touch exactly c bytes -- confirm which
             // length is intended.
 951
 952 #if defined(__x86_64__)
 953
 954         add     byte ptr [rdi], 1       // increment low byte; cf = carry out
 955 0:      inc     rdi                     // inc leaves cf alone
 956         adc     byte ptr [rdi], 0       // propagate the carry
 957         loop    0b                      // loop leaves cf alone too
 958
 959 #elif defined(__i386__)
 960
 961         add     byte ptr [edi], 1       // increment low byte; cf = carry out
 962 0:      inc     edi                     // inc leaves cf alone
 963         adc     byte ptr [edi], 0       // propagate the carry
 964         loop    0b
 965
 966 #elif defined(__arm__)
 967
             // the carry lives in bit 8 of r12 rather than in the flags, which
             // the loop counter needs
 968         mov     r12, #256               // set initial carry
 969 0:      ldrb    r0, [r5]
 970         subs    r2, r2, #1
 971         add     r12, r0, r12, lsr #8    // byte + carry; new carry in bit 8
 972         strb    r12, [r5], #1           // store low 8 bits; advance pointer
 973         bne     0b                      // flags still from the subs
 974
 975 #elif defined(__aarch64__)
 976
 977         mov     w17, #256               // set initial carry
 978 0:      ldrb    w16, [x5]
 979         sub     x2, x2, #1
 980         add     w17, w16, w17, lsr #8   // byte + carry; new carry in bit 8
 981         strb    w17, [x5], #1           // store low 8 bits; advance pointer
 982         cbnz    x2, 0b
 983
 984 #else
 985         notimpl
 986 #endif
 987
 988         ret
 989
 990 endproc
 991
 992 proc    x0b
 993
 994         // negate double-precision d:a
             //
             // d is the high half and a the low half; the a32 and a64 versions
             // have a in r0/x0 and d in r3/x3.
 995
 996 #if defined(__x86_64__)
 997
 998         not     rdx                     // d' = -d - 1
 999         neg     rax                     // a' = -a;
1000                                         // cf = 1 iff a /= 0
1001         sbb     rdx, -1                 // d' = -d - cf
1002
1003 #elif defined(__i386__)
1004
1005         not     edx                     // d' = -d - 1
1006         neg     eax                     // a' = -a; cf = 1 iff a /= 0
1007         sbb     edx, -1                 // d' = -d - cf
1008
1009 #elif defined(__arm__)
1010
1011         // reverse subtract is awesome
1012         rsbs    r0, r0, #0              // a' = 0 - a; cf clear iff a /= 0
1013         rsc     r3, r3, #0              // d' = 0 - d - 1 + cf
1014
1015 #elif defined(__aarch64__)
1016
1017         // easy way: everything is better with zero registers.
1018         negs    x0, x0                  // a' = 0 - a; cf clear iff a /= 0
1019         ngc     x3, x3                  // d' = 0 - d - 1 + cf
1020
1021 #else
1022         notimpl
1023 #endif
1024
1025         ret
1026
1027 endproc
1028
1029 proc    x0c
1030
1031         // rotate is distributive over xor.
             //
             // compute ror(a xor b, 13) and ror(a, 13) xor ror(b, 13), and
             // compare them.  in the comments, a_0 and b_0 are the low 13 bits
             // of a and b, which the rotation moves to the top; a_1 and b_1 are
             // the remaining bits.  the a32 and a64 versions have a in r0/x0
             // and b in r1/x1, with the first result in r2/x2.
1032
1033 #if defined(__x86_64__)
1034
1035         // rax                          // = a_1 || a_0
1036         // rbx                          // = b_1 || b_0
1037         mov     rcx, rax                // = a_1 || a_0
1038
1039         xor     rcx, rbx                // = (a_1 XOR b_1) || (a_0 XOR b_0)
1040         ror     rcx, 0xd                // = (a_0 XOR b_0) || (a_1 XOR b_1)
1041
1042         ror     rax, 0xd                // = a_0 || a_1
1043         ror     rbx, 0xd                // = b_0 || b_1
1044         xor     rax, rbx                // = (a_0 XOR b_0) || (a_1 XOR b_1)
1045
1046         cmp     rax, rcx                // always equal
1047
1048 #elif defined(__i386__)
1049
1050         mov     ecx, eax                // = a_1 || a_0
1051
1052         xor     ecx, ebx                // = (a_1 XOR b_1) || (a_0 XOR b_0)
1053         ror     ecx, 0xd                // = (a_0 XOR b_0) || (a_1 XOR b_1)
1054
1055         ror     eax, 0xd                // = a_0 || a_1
1056         ror     ebx, 0xd                // = b_0 || b_1
1057         xor     eax, ebx                // = (a_0 XOR b_0) || (a_1 XOR b_1)
1058
1059         cmp     eax, ecx                // always equal
1060
1061 #elif defined(__arm__)
1062
1063
1064         // r0                           // = a_1 || a_0
1065         // r1                           // = b_1 || b_0
1066         eor     r2, r0, r1              // = (a_1 XOR b_1) || (a_0 XOR b_0)
1067         mov     r2, r2, ror #13         // = (a_0 XOR b_0) || (a_1 XOR b_1)
1068
1069         mov     r1, r1, ror #13         // = b_0 || b_1
             // the rotation of a is folded into the second operand of the eor
1070         eor     r0, r1, r0, ror #13     // = (a_0 XOR b_0) || (a_1 XOR b_1)
1071
1072         cmp     r0, r2                  // always equal
1073
1074 #elif defined(__aarch64__)
1075
1076         // x0                           // = a_1 || a_0
1077         // x1                           // = b_1 || b_0
1078         eor     x2, x0, x1              // = (a_1 XOR b_1) || (a_0 XOR b_0)
1079         ror     x2, x2, #13             // = (a_0 XOR b_0) || (a_1 XOR b_1)
1080
1081         ror     x1, x1, #13             // = b_0 || b_1
             // the rotation of a is folded into the second operand of the eor
1082         eor     x0, x1, x0, ror #13     // = (a_0 XOR b_0) || (a_1 XOR b_1)
1083
1084         cmp     x0, x2                  // always equal
1085
1086 #else
1087         notimpl
1088 #endif
1089
1090         ret
1091
1092 endproc
1093
1094 proc    x0d
1095         // and is distributive over xor.  inputs: a, b, c in rax, rbx, rcx
1096         // (eax, ebx, ecx; r0, r1, r2; x0, x1, x2); the compare sets `equal'.
1097
1098 #if defined(__x86_64__)
1099
1100         mov     rdx, rbx                // = b
1101
1102         xor     rbx, rcx                // = b XOR c
1103         and     rbx, rax                // = a AND (b XOR c)
1104
1105         and     rdx, rax                // = a AND b
1106         and     rax, rcx                // = a AND c
1107         xor     rax, rdx                // = (a AND b) XOR (a AND c)
1108                                         // = a AND (b XOR c)
1109
1110         cmp     rax, rbx                // always equal
1111
1112 #elif defined(__i386__)
1113
1114         mov     edx, ebx                // = b
1115
1116         xor     ebx, ecx                // = b XOR c
1117         and     ebx, eax                // = a AND (b XOR c)
1118
1119         and     edx, eax                // = a AND b
1120         and     eax, ecx                // = a AND c
1121         xor     eax, edx                // = (a AND b) XOR (a AND c)
1122                                         // = a AND (b XOR c)
1123
1124         cmp     eax, ebx                // always equal
1125
1126 #elif defined(__arm__)
1127         // three-address form: a AND b goes straight to r3, no copy of b.
1128         and     r3, r0, r1              // = a AND b
1129
1130         eor     r1, r1, r2              // = b XOR c
1131         and     r1, r1, r0              // = a AND (b XOR c)
1132
1133         and     r0, r0, r2              // = a AND c
1134         eor     r0, r0, r3              // = (a AND b) XOR (a AND c)
1135                                         // = a AND (b XOR c)
1136
1137         cmp     r0, r1                  // always equal
1138
1139 #elif defined(__aarch64__)
1140
1141         and     x3, x0, x1              // = a AND b
1142
1143         eor     x1, x1, x2              // = b XOR c
1144         and     x1, x1, x0              // = a AND (b XOR c)
1145
1146         and     x0, x0, x2              // = a AND c
1147         eor     x0, x0, x3              // = (a AND b) XOR (a AND c)
1148                                         // = a AND (b XOR c)
1149
1150         cmp     x0, x1                  // always equal
1151
1152 #else
1153         notimpl
1154 #endif
1155
1156         ret
1157
1158 endproc
1159
1160 proc    x0e
1161         // de morgan's law: NOT (a AND b) = (NOT a) OR (NOT b).  inputs: a, b
1162         // in rax, rbx (eax, ebx; r0, r1; x0, x1); the compare sets `equal'.
1163
1164 #if defined(__x86_64__)
1165
1166         mov     rcx, rax                // = a
1167
1168         and     rcx, rbx                // = a AND b
1169         not     rcx                     // = NOT (a AND b)
1170
1171         not     rax                     // = NOT a
1172         not     rbx                     // = NOT b
1173         or      rax, rbx                // = (NOT a) OR (NOT b)
1174                                         // = NOT (a AND b)
1175
1176         cmp     rax, rcx                // always equal
1177
1178 #elif defined(__i386__)
1179
1180         mov     ecx, eax                // = a
1181
1182         and     ecx, ebx                // = a AND b
1183         not     ecx                     // = NOT (a AND b)
1184
1185         not     eax                     // = NOT a
1186         not     ebx                     // = NOT b
1187         or      eax, ebx                // = (NOT a) OR (NOT b)
1188                                         // = NOT (a AND b)
1189
1190         cmp     eax, ecx                // always equal
1191
1192 #elif defined(__arm__)
1193
1194         and     r2, r0, r1              // = a AND b
1195         mvn     r2, r2                  // = NOT (a AND b)
1196
1197         mvn     r0, r0                  // = NOT a
1198         mvn     r1, r1                  // = NOT b
1199         orr     r0, r0, r1              // = (NOT a) OR (NOT b)
1200                                         // = NOT (a AND b)
1201         cmp     r0, r2                  // always equal
1202
1203 #elif defined(__aarch64__)
1204
1205         and     x2, x0, x1              // = a AND b
1206         mvn     x2, x2                  // = NOT (a AND b)
1207
1208         mvn     x0, x0                  // = NOT a
1209         orn     x0, x0, x1              // = (NOT a) OR (NOT b); orn inverts b
1210                                         // itself, so x1 is left unmodified
1211         cmp     x0, x2                  // always equal
1212
1213 #else
1214         notimpl
1215 #endif
1216
1217         ret
1218
1219 endproc
1220
1221 proc    x0f
1222         // replace input buffer bytes with cumulative XORs with initial a;
1223         // final a is XOR of all buffer bytes and initial a.  a is al (r0, w0),
1224         // the buffer is at rsi/esi (r4, x4), the count is rcx/ecx (r2, x2).
1225         //
1226         // not sure why you'd do this.
1227
1228 #if defined(__x86_64__)
1229
1230 0:      xor     [rsi], al               // buffer byte ^= a
1231         lodsb                           // a = the updated byte; step rsi
1232         loop    0b                      // count is tested after the body
1233
1234 #elif defined(__i386__)
1235
1236 0:      xor     [esi], al               // buffer byte ^= a
1237         lodsb                           // a = the updated byte; step esi
1238         loop    0b                      // count is tested after the body
1239
1240 #elif defined(__arm__)
1241
1242 0:      ldrb    r12, [r4]               // fetch the next buffer byte
1243         subs    r2, r2, #1              // count down; z feeds the bne below
1244         eor     r0, r0, r12             // a ^= byte (flags left alone)
1245         strb    r0, [r4], #1            // store it back and advance
1246         bne     0b
1247
1248 #elif defined(__aarch64__)
1249
1250 0:      ldrb    w16, [x4]               // fetch the next buffer byte
1251         sub     x2, x2, #1              // count down
1252         eor     w0, w0, w16             // a ^= byte
1253         strb    w0, [x4], #1            // store it back and advance
1254         cbnz    x2, 0b
1255
1256 #else
1257         notimpl
1258 #endif
1259
1260         ret
1261
1262 endproc
1263
1264 ///--------------------------------------------------------------------------
1265 /// 0x10--0x1f
1266
1267 proc    x10
1268         // operates on a and c (rax, rcx; eax, ecx; r0, r2; x0, x2).
1269         // four different ways to swap a pair of registers.
1270
1271 #if defined(__x86_64__)
1272         // 1: through the stack.
1273         push    rax
1274         push    rcx
1275         pop     rax
1276         pop     rcx
1277         // 2: the xor trick.
1278         xor     rax, rcx
1279         xor     rcx, rax
1280         xor     rax, rcx
1281         // 3: add/subtract, with a final negation to fix the sign.
1282         add     rax, rcx                // = a + c
1283         sub     rcx, rax                // = -a
1284         add     rax, rcx                // = c
1285         neg     rcx                     // = a
1286         // 4: the dedicated instruction.
1287         xchg    rax, rcx
1288
1289 #elif defined(__i386__)
1290
1291         push    eax
1292         push    ecx
1293         pop     eax
1294         pop     ecx
1295
1296         xor     eax, ecx
1297         xor     ecx, eax
1298         xor     eax, ecx
1299
1300         add     eax, ecx                // = a + c
1301         sub     ecx, eax                // = -a
1302         add     eax, ecx                // = c
1303         neg     ecx                     // = a
1304
1305         xchg    eax, ecx
1306
1307 #elif defined(__arm__)
1308
1309         stmfd   r13!, {r0, r2}          // r0 at the lower address, r2 above
1310         ldr     r0, [r13, #4]           // r0 = old r2
1311         ldr     r2, [r13], #8           // r2 = old r0; pop both words
1312
1313         eor     r0, r0, r2
1314         eor     r2, r2, r0
1315         eor     r0, r0, r2
1316
1317         sub     r0, r0, r2
1318         add     r2, r2, r0
1319         rsb     r0, r0, r2              // don't need 3-addr with reverse-sub
1320
1321         mov     r12, r0                 // stash r0 in the scratch register
1322         mov     r0, r2
1323         mov     r2, r12                 // restore from the stash, not from r0
1324
1325 #elif defined(__aarch64__)
1326
1327         // anything you can do
1328         stp     x0, x2, [sp, #-16]!
1329         ldp     x2, x0, [sp], #16
1330
1331         eor     x0, x0, x2
1332         eor     x2, x2, x0
1333         eor     x0, x0, x2
1334
1335         // the add/sub/add thing was daft.  you can do it in three if you're
1336         // clever -- and have three-address operations.
1337         sub     x0, x0, x2
1338         add     x2, x2, x0
1339         sub     x0, x2, x0
1340
1341         // but we lack a fourth.  we can't do this in fewer than three
1342         // instructions without hitting memory.  only `ldp' will modify two
1343         // registers at a time, so we need at least two instructions -- but
1344         // if the first one sets one of our two registers to its final value
1345         // then we lose the other input value with no way to recover it, so
1346         // we must either write a fresh third register, or write something
1347         // other than the final value, and in both cases we need a third
1348         // instruction to fix everything up.  we've done the wrong-something-
1349         // other trick twice, so here's the captain-obvious use-a-third-
1350         // register version.
1351         mov     x16, x0
1352         mov     x0, x2
1353         mov     x2, x16
1354
1355 #else
1356         notimpl
1357 #endif
1358
1359         ret
1360
1361 endproc
1362
1363 proc    x11
1364
1365         // assuming a is initialized to zero, set a to the inclusive or of
1366         // the xor-differences of corresponding bytes in the c-byte strings
1367         // at si and di.
1368         //
1369         // in particular, a will be zero (and zf set) if and only if the two
1370         // strings are equal.
1371
1372 #if defined(__x86_64__)
1373
1374 0:      mov     dl, [rsi]               // next byte of one string
1375         xor     dl, [rdi]               // its difference from the other
1376         inc     rsi
1377         inc     rdi
1378         or      al, dl                  // accumulate; last to write zf
1379         loop    0b                      // `loop' leaves the flags alone
1380
1381 #elif defined(__i386__)
1382
1383 0:      mov     dl, [esi]               // next byte of one string
1384         xor     dl, [edi]               // its difference from the other
1385         inc     esi
1386         inc     edi
1387         or      al, dl                  // accumulate; last to write zf
1388         loop    0b                      // `loop' leaves the flags alone
1389
1390 #elif defined(__arm__)
1391         // strings at r4 and r5, count in r2, result in r0.
1392 0:      ldrb    r1, [r4], #1
1393         ldrb    r12, [r5], #1
1394         subs    r2, r2, #1              // the only flag-setter in the loop,
1395         eor     r12, r12, r1            // so z reflects the count on exit,
1396         orr     r0, r0, r12             // not the result: test r0 instead
1397         bne     0b
1398
1399 #elif defined(__aarch64__)
1400         // strings at x4 and x5, count in x2, result in w0; flags untouched.
1401 0:      ldrb    w16, [x4], #1
1402         ldrb    w17, [x5], #1
1403         sub     x2, x2, #1
1404         eor     w16, w16, w17           // difference of this byte pair
1405         orr     w0, w0, w16             // accumulate
1406         cbnz    x2, 0b
1407
1408 #else
1409         notimpl
1410 #endif
1411
1412         ret
1413
1414 endproc
1415
1416 proc    x12
1417
1418         // an obtuse way of adding two registers.  for any bit position, a
1419         // OR d is set if and only if at least one of a and d has a bit set
1420         // in that position, and a AND d is set if and only if both have a
1421         // bit set in that position.  essentially, then, what we've done is
1422         // move all of the set bits in d to a, unless there's already a bit
1423         // there.  this clearly doesn't change the sum.
1424
1425 #if defined(__x86_64__)
1426
1427         mov     rcx, rdx                // c' = d
1428         and     rdx, rax                // d' = a AND d
1429         or      rax, rcx                // a' = a OR d
1430         add     rax, rdx                // = (a OR d) + (a AND d) = a + d
1431
1432 #elif defined(__i386__)
1433
1434         mov     ecx, edx                // c' = d
1435         and     edx, eax                // d' = a AND d
1436         or      eax, ecx                // a' = a OR d
1437         add     eax, edx                // = (a OR d) + (a AND d) = a + d
1438
1439 #elif defined(__arm__)
1440         // a is r0 and d is r3; d itself is left unmodified here.
1441         and     r2, r0, r3              // c' = a AND d
1442         orr     r0, r0, r3              // a' = a OR d
1443         add     r0, r0, r2              // = (a OR d) + (a AND d) = a + d
1444
1445 #elif defined(__aarch64__)
1446         // a is x0 and d is x3; d itself is left unmodified here.
1447         and     x2, x0, x3              // c' = a AND d
1448         orr     x0, x0, x3              // a' = a OR d
1449         add     x0, x0, x2              // = (a OR d) + (a AND d) = a + d
1450
1451 #else
1452         notimpl
1453 #endif
1454
1455         ret
1456
1457 endproc
1458
1459 proc    x13
1460
1461         // ok, so this is a really obtuse way of adding a and b; the result
1462         // is in a and d.  but why does it work?
1463         // (each round, a' + b' is unchanged, and b' gains a low zero bit.)
1464 #if defined(__x86_64__)
1465
1466         mov     rcx, 0x40               // carry chains at most 64 long
1467 0:      mov     rdx, rax                // copy a'
1468         xor     rax, rbx                // low bits of each bitwise sum
1469         and     rbx, rdx                // carry bits from each bitwise sum
1470         shl     rbx, 1                  // carry them into next position
1471         loop    0b
1472
1473 #elif defined(__i386__)
1474
1475         mov     ecx, 0x40               // 64 rounds; 32 would suffice here
1476 0:      mov     edx, eax                // copy a'
1477         xor     eax, ebx                // low bits of each bitwise sum
1478         and     ebx, edx                // carry bits from each bitwise sum
1479         shl     ebx, 1                  // carry them into next position
1480         loop    0b
1481
1482 #elif defined(__arm__)
1483         // a is r0, b is r1; r3 holds the carries and r2 counts rounds.
1484         mov     r2, #0x40               // 64 rounds; 32 would suffice here
1485 0:      and     r3, r0, r1              // carry bits from each bitwise sum
1486         subs    r2, r2, #1              // count down; z feeds the bne below
1487         eor     r0, r0, r1              // low bits of each bitwise sum
1488         lsl     r1, r3, #1              // carry them into next position
1489         bne     0b
1490
1491 #elif defined(__aarch64__)
1492         // a is x0, b is x1; x3 holds the carries and x2 counts rounds.
1493         mov     x2, #0x40               // carry chains at most 64 long
1494 0:      and     x3, x0, x1              // carry bits from each bitwise sum
1495         sub     x2, x2, #1
1496         eor     x0, x0, x1              // low bits of each bitwise sum
1497         lsl     x1, x3, #1              // carry them into next position
1498         cbnz    x2, 0b
1499
1500 #else
1501         notimpl
1502 #endif
1503
1504         ret
1505
1506 endproc
1507
1508 proc    x14
1509
1510         // floor((a + d)/2), like x08.
1511         // uses a + d = 2 (a AND d) + (a XOR d), so the sum never overflows.
1512 #if defined(__x86_64__)
1513
1514         mov     rcx, rax                // copy a for later
1515         and     rcx, rdx                // carry bits
1516
1517         xor     rax, rdx                // low bits of each bitwise sum
1518         shr     rax, 1                  // divide by 2; carries now in place
1519
1520         add     rax, rcx                // add the carries; done
1521
1522 #elif defined(__i386__)
1523
1524         mov     ecx, eax                // copy a for later
1525         and     ecx, edx                // carry bits
1526
1527         xor     eax, edx                // low bits of each bitwise sum
1528         shr     eax, 1                  // divide by 2; carries now in place
1529
1530         add     eax, ecx                // add the carries; done
1531
1532 #elif defined(__arm__)
1533         // a is r0, d is r3.
1534         and     r2, r0, r3              // carry bits
1535         eor     r0, r0, r3              // low bits of each bitwise sum
1536         add     r0, r2, r0, lsr #1      // halve those and add the carries
1537
1538 #elif defined(__aarch64__)
1539         // a is x0, d is x3.
1540         and     x2, x0, x3              // carry bits
1541         eor     x0, x0, x3              // low bits of each bitwise sum
1542         add     x0, x2, x0, lsr #1      // halve those and add the carries
1543
1544 #else
1545         notimpl
1546 #endif
1547
1548         ret
1549
1550 endproc
1551
1552 proc    x15
1553         // (the 32-bit targets here extend 16 -> 32 bits instead.)
1554         // sign extension 32 -> 64 bits.
1555
1556 #if defined(__x86_64__)
1557
1558         movsx   rbx, eax                // like this?
1559
1560         mov     rdx, 0xffffffff80000000 // assumes a < 2^32 on entry
1561         add     rax, rdx                // if bit 31 of a is set then bits
1562                                         // 31--63 of a' are clear; otherwise,
1563                                         // these bits are all set -- which is
1564                                         // exactly backwards
1565         xor     rax, rdx                // so fix it
1566
1567 #elif defined(__i386__)
1568
1569         movsx   ebx, ax                 // like this?
1570
1571         mov     edx, 0xffff8000         // assumes a < 2^16 on entry
1572         add     eax, edx                // if bit 15 of a is set then bits
1573                                         // 15--31 of a' are clear; otherwise,
1574                                         // these bits are all set -- which is
1575                                         // exactly backwards
1576         xor     eax, edx                // so fix it
1577
1578 #elif defined(__arm__)
1579
1580         sxth    r1, r0                  // like this
1581
1582         mov     r12, #0x80000000        // r12, asr #16 = 0xffff8000
1583         add     r0, r0, r12, asr #16    // same add-then-xor as on i386
1584         eor     r0, r0, r12, asr #16    // flip bits 15--31 back
1585
1586 #elif defined(__aarch64__)
1587
1588         sxtw    x1, w0                  // like this
1589
1590         mov     x16, #0xffffffff80000000
1591         add     x0, x0, x16             // same add-then-xor as on x86-64
1592         eor     x0, x0, x16             // flip bits 31--63 back
1593
1594 #else
1595         notimpl
1596 #endif
1597
1598         ret
1599
1600 endproc
1601
1602 proc    x16
1603
1604         // ??? i don't know why you'd want to calculate this.
1605         // inputs a, b, c; the final compare sets flags from a' - t.
1606 #if defined(__x86_64__)
1607
1608         xor     rax, rbx                // a' = a XOR b
1609         xor     rbx, rcx                // b' = b XOR c
1610         mov     rsi, rax                // t = a XOR b
1611         add     rsi, rbx                // t = (a XOR b) + (b XOR c)
1612         cmovc   rax, rbx                // a' = cf ? b XOR c : a XOR b
1613         xor     rax, rbx                // a' = cf ? 0 : a XOR c
1614         cmp     rax, rsi                // flags from a' - t
1615
1616 #elif defined(__i386__)
1617
1618         xor     eax, ebx                // a' = a XOR b
1619         xor     ebx, ecx                // b' = b XOR c
1620         mov     esi, eax                // t = a XOR b
1621         add     esi, ebx                // t = (a XOR b) + (b XOR c)
1622         cmovc   eax, ebx                // a' = cf ? b XOR c : a XOR b
1623         xor     eax, ebx                // a' = cf ? 0 : a XOR c
1624         cmp     eax, esi                // flags from a' - t
1625
1626 #elif defined(__arm__)
1627         // a, b, c are r0, r1, r2; t is r4.
1628         eor     r0, r0, r1              // a' = a XOR b
1629         eor     r1, r1, r2              // b' = b XOR c
1630         adds    r4, r0, r1              // t = (a XOR b) + (b XOR c)
1631         movcs   r0, r1                  // a' = cf ? b XOR c : a XOR b
1632         eor     r0, r0, r1              // a' = cf ? 0 : a XOR c
1633         cmp     r0, r4                  // flags from a' - t
1634
1635 #elif defined(__aarch64__)
1636         // a, b, c are x0, x1, x2; t is x4.
1637         eor     x0, x0, x1              // a' = a XOR b
1638         eor     x1, x1, x2              // b' = b XOR c
1639         adds    x4, x0, x1              // t = (a XOR b) + (b XOR c)
1640         cmov.cs x0, x1                  // a' = cf ? b XOR c : a XOR b
1641         eor     x0, x0, x1              // a' = cf ? 0 : a XOR c
1642         cmp     x0, x4                  // flags from a' - t
1643
1644 #else
1645         notimpl
1646 #endif
1647
1648         ret
1649
1650 endproc
1651
1652 proc    x17
1653
1654         // absolute value
1655         // (the most negative input is its own negation, so it comes back as is)
1656 #if defined(__x86_64__)
1657
1658         cqo                             // d = a < 0 ? -1 : 0
1659         xor     rax, rdx                // a' = a < 0 ? -a - 1 : a
1660         sub     rax, rdx                // a' = a < 0 ? -a : a
1661
1662 #elif defined(__i386__)
1663
1664         cdq                             // d = a < 0 ? -1 : 0
1665         xor     eax, edx                // a' = a < 0 ? -a - 1 : a
1666         sub     eax, edx                // a' = a < 0 ? -a : a
1667
1668 #elif defined(__arm__)
1669
1670         // direct approach
1671         movs    r1, r0                  // r1 = a; n set iff a < 0
1672         rsbmi   r1, r0, #0              // r1 = a < 0 ? -a : a
1673
1674         // faithful-ish conversion
1675         eor     r3, r0, r0, asr #31     // = a < 0 ? -a - 1 : a
1676         sub     r0, r3, r0, asr #31     // a' = a < 0 ? -a : a
1677
1678 #elif defined(__aarch64__)
1679
1680         // direct approach
1681         tst     x0, #1 << 63            // z clear iff the sign bit is set
1682         cneg.ne x1, x0                  // x1 = a < 0 ? -a : a
1683
1684         // faithful-ish conversion
1685         eor     x3, x0, x0, asr #63     // = a < 0 ? -a - 1 : a
1686         sub     x0, x3, x0, asr #63     // a' = a < 0 ? -a : a
1687
1688 #else
1689         notimpl
1690 #endif
1691
1692         ret
1693
1694 endproc
1695
1696 proc    x18
1697         // read the cycle counter twice and subtract the later from the earlier.
1698         // should always set sf, clear zf, unless we get rescheduled to a
1699         // different core.
1700
1701 #if defined(__x86_64__)
1702
1703         rdtsc                           // d || a = cycles
1704         shl     rdx, 0x20
1705         or      rax, rdx                // a = cycles
1706         mov     rcx, rax                // c = cycles
1707
1708         rdtsc                           // d || a = cycles'
1709         shl     rdx, 0x20
1710         or      rax, rdx                // a = cycles'
1711
1712         cmp     rcx, rax                // flags from cycles - cycles'
1713
1714 #elif defined(__i386__)
1715
1716         rdtsc                           // d || a = cycles
1717         mov     ebx, eax
1718         mov     ecx, edx                // c || b = cycles
1719
1720         rdtsc                           // d || a = cycles'
1721
1722         sub     ebx, eax                // low half; borrow goes to cf
1723         sbb     ecx, edx                // c || b = cycles - cycles'
1724
1725 #elif defined(__arm__)
1726
1727         // cycle clock not available in user mode
1728         mrrc    p15, 0, r0, r1, c9      // r1 || r0 = first reading
1729         mrrc    p15, 0, r2, r3, c9      // r3 || r2 = second reading
1730         subs    r0, r0, r2              // low half; borrow goes to the carry
1731         sbcs    r1, r1, r3              // r1 || r0 = first - second
1732
1733 #elif defined(__aarch64__)
1734
1735         // cycle clock not available in user mode
1736         mrs     x0, pmccntr_el0         // first reading
1737         mrs     x1, pmccntr_el0         // second reading
1738         cmp     x0, x1                  // flags from first - second
1739
1740 #else
1741         notimpl
1742 #endif
1743
1744         ret
1745
1746 endproc
1747
1748 proc    x19
1749
1750         // stupid way to capture a pointer to inline data and jump past it.
1751         // confuses the return-address predictor something chronic.  worse
1752         // because amd64 calling convention doesn't usually pass arguments on
1753         // the stack.
1754
1755 #if defined(__x86_64__)
1756
1757         call    8f                      // pushes the string's address
1758         .string "hello world!\n\0"
1759 8:      call    print_str
1760         add     rsp, 8                  // discard the string's address
1761         ret
1762
1763 print_str:
1764         // actually implement this ridiculous thing
1765         mov     rsi, [rsp + 8]          // string is above our return address
1766         xor     edx, edx
1767 0:      mov     al, [rsi + rdx]         // scan for the terminating zero
1768         inc     rdx
1769         cmp     al, 0
1770         jnz     0b
1771         mov     eax, SYS_write
1772         mov     edi, 1                  // first syscall argument
1773         dec     rdx                     // don't count the terminator
1774         syscall                         // clobbers r11 :-(
1775         ret
1776
1777 #elif defined(__i386__)
1778
1779         call    8f                      // pushes the string's address
1780         .string "hello world!\n\0"
1781 8:      call    print_str
1782         add     esp, 4                  // discard the string's address
1783         ret
1784
1785 print_str:
1786         // actually implement this ridiculous thing
1787         mov     ecx, [esp + 4]          // string is above our return address
1788         xor     edx, edx
1789 0:      mov     al, [ecx + edx]         // scan for the terminating zero
1790         inc     edx
1791         cmp     al, 0
1792         jnz     0b
1793         mov     eax, SYS_write
1794         mov     ebx, 1                  // first syscall argument
1795         dec     edx                     // don't count the terminator
1796         int     0x80
1797         ret
1798
1799 #elif defined(__arm__)
1800
1801         // why am i doing this?
1802         stmfd   r13!, {r14}             // keep the real return address
1803         bl      8f                      // r14 = address of the string
1804         .string "hello world!\n\0"
1805         .balign 4
1806 8:      mov     r1, r14               // might as well make it easy on myself
1807         bl      print_str
1808         ldmfd   r13!, {pc}              // return via the saved r14
1809
1810 print_str:
1811         mov     r2, #0                  // r2 counts the string's length
1812 0:      ldrb    r0, [r1, r2]
1813         cmp     r0, #0
1814         addne   r2, r2, #1              // terminator is not counted
1815         bne     0b
1816         mov     r0, #1                  // first syscall argument
1817         mov     r7, #SYS_write
1818         swi     0
1819         bx      r14
1820
1821 #elif defined(__aarch64__)
1822
1823         // why am i doing this?
1824         str     x30, [sp, #-16]!        // keep the real return address
1825         bl      8f                      // x30 = address of the string
1826         .string "hello world!\n\0"
1827         .balign 4
1828 8:      mov     x1, x30               // might as well make it easy on myself
1829         bl      print_str
1830         ldr     x30, [sp], #16          // recover the real return address
1831         ret
1832
1833 print_str:
1834         mov     x2, #0                  // x2 counts the string's length
1835 0:      ldrb    w0, [x1, x2]
1836         cmp     w0, #0
1837         cinc.ne x2, x2                  // terminator is not counted
1838         b.ne    0b
1839         mov     x0, #1                  // first syscall argument
1840         mov     x8, #SYS_write
1841         svc     #0
1842         ret
1843
1844 #else
1845         notimpl
1846 #endif
1847
1848 endproc
1849
1850 proc    x1a
1851
1852         // collect the current instruction-pointer address.  this was an old
1853         // 32-bit i386 trick for position-independent code, but (a) it
1854         // confuses the return predictor, and (b) amd64 has true pc-relative
1855         // addressing.
1856
1857 #if defined(__x86_64__)
1858
1859         // the actual example
1860         call    0f                      // pushes the address of 0
1861 0:      pop     rax                     // rax = address of 0
1862
1863         // the modern i386 trick doesn't confuse the return-address
1864         // predictor.
1865         call    calladdr_rbx            // rbx = address of the next insn
1866         sub     rbx, . - 0b             // step back: rbx = address of 0
1867
1868         // but rip-relative addressing is even better
1869         lea     rcx, [rip + 0b]         // rcx = address of 0
1870
1871         ret
1872
1873 calladdr_rbx:
1874         mov     rbx, [rsp]              // copy our own return address
1875         ret
1876
1877 #elif defined(__i386__)
1878
1879         // the actual example
1880         call    0f                      // pushes the address of 0
1881 0:      pop     eax                     // eax = address of 0
1882
1883         // the modern i386 trick doesn't confuse the return-address
1884         // predictor.
1885         call    get_pc_ebx              // helper is defined outside this proc
1886         sub     ebx, . - 0b
1887
1888         ret
1889
1890 #elif defined(__arm__)
1891
1892         stmfd   r13!, {r14}             // the bl's below overwrite r14
1893
1894         bl      0f                      // r14 = address of 0
1895 0:      mov     r0, r14
1896
1897         bl      return                  // r14 = address of the next insn
1898         sub     r1, r14, #. - 0b        // step back: r1 = address of 0
1899
1900         adr     r2, 0b                  // pc-relative: r2 = address of 0
1901
1902         ldmfd   r13!, {pc}
1903
1904 return: bx      r14
1905
1906 #elif defined(__aarch64__)
1907
1908         str     x30, [sp, #-16]!        // the bl's below overwrite x30
1909
1910         // we can do all of the above using a64
1911         bl      0f                      // x30 = address of 0
1912 0:      mov     x0, x30
1913
1914         bl      return                  // x30 = address of the next insn
1915         sub     x1, x30, #. - 0b        // step back: x1 = address of 0
1916
1917         adr     x2, 0b                  // pc-relative: x2 = address of 0
1918
1919         ldr     x30, [sp], #16          // then fall through into the ret
1920 return: ret
1921
1922 #else
1923         notimpl
1924 #endif
1925
1926 endproc
1927
1928 proc    x1b
1929
1930 #if defined(__x86_64__)
1931
1932         // retpolines: a mitigation against adversarially influenced
1933         // speculative execution at indirect branches.  if an adversary can
1934         // prepare a branch-target buffer entry matching an indirect branch
1935         // in the victim's address space then they can cause the victim to
1936         // /speculatively/ (but not architecturally) execute any code in
1937         // their address space, possibly leading to leaking secrets through
1938         // the cache.  retpolines aren't susceptible to this because the
1939         // predicted destination address is from the return-prediction stack
1940         // which the adversary can't prime.  the performance penalty is still
1941         // essentially a branch misprediction -- for this return, and
1942         // possibly all others already stacked.
1943
1944         // (try not to crash)
1945         lea     rax, [rip + 9f]         // the `indirect branch' target
1946
1947         push    rax                     // push the target, then `return' to
1948 9:      ret                             // it; the second pass really returns
1949
1950 #elif defined(__i386__)
1951
1952         call    get_pc_ebx              // helper is defined outside this proc
1953         lea     eax, [ebx + 9f - .]     // the `indirect branch' target
1954
1955         push    eax                     // push the target, then `return' to
1956 9:      ret                             // it; the second pass really returns
1957
1958 #elif defined(__arm__)
1959
1960         stmfd   r13!, {r14}             // keep the real return address
1961
1962         adr     r14, 8f                 // the `indirect branch' target
1963         bx      r14                     // `return' to it
1964
1965 8:      ldmfd   r13!, {pc}              // the real return
1966
1967 #elif defined(__aarch64__)
1968
1969         str     x30, [sp, #-16]!        // keep the real return address
1970
1971         adr     x30, 8f                 // the `indirect branch' target
1972         ret                             // `return' to it
1973
1974 8:      ldr     x30, [sp], #16          // recover the real return address
1975         ret
1976
1977 #else
1978         notimpl
1979 #endif
1980
1981 endproc
1982
1983 proc    x1c
1984
1985         // ok, having a hard time seeing a use for this.  the most important
1986         // thing to note is that sp is set from `pop' /after/ it's
1987         // incremented.
1988
1989 #if defined(__x86_64__)
1990
1991         // try not to crash
1992         mov     rax, rsp                // a = the original stack pointer
1993         and     rsp, -16                // round sp down to a multiple of 16
1994         push    rax                     // top of stack = the original sp
1995
1996         pop     rsp                     // sp = the popped value
1997
1998         // check it worked
1999         mov     rbx, rsp                // b = sp, for comparison against a
2000         ret
2001
2002 #elif defined(__i386__)
2003
2004         // try not to crash
2005         mov     eax, esp                // a = the original stack pointer
2006         and     esp, -16                // round sp down to a multiple of 16
2007         push    eax                     // top of stack = the original sp
2008
2009         pop     esp                     // sp = the popped value
2010
2011         // check it worked
2012         mov     ebx, esp                // b = sp, for comparison against a
2013         ret
2014
2015 #elif defined(__arm__)
2016
2017         // not even going to dignify this
2018         notimpl
2019
2020 #elif defined(__aarch64__)
2021
2022         // not even going to dignify this
2023         notimpl
2024
2025 #else
2026         notimpl
2027 #endif
2028
2029 endproc
2030
2031 proc    x1d
2032
2033         // monumentally cheesy way to copy 8 n bytes from buff1 to buff2.
2034         // also clobbers words at buff2 + 8 n and buff2 - 8 for good measure.
2035         // (the word size is 4 rather than 8 on the 32-bit targets.)
2036         n = 4
2037
2038 #if defined(__x86_64__)
2039
2040         mov     rax, rsp                        // safekeeping
2041
2042         // we're toast if we get hit by a signal now.  fingers crossed...
2043   .if 0
2044         mov     rsp, buff2 + 8*n + 8
2045         mov     rbp, buff1 + 8*n
2046   .else
2047         lea     rsp, [rdi + 8*n + 16]
2048         lea     rbp, [rsi + 8*n]
2049   .endif
2050         enter   0, n + 1                // nesting level n + 1 does the copy
2051
2052         // precise action:
2053         //
2054         //         +---------+                  +---------+
2055         //  rbp -> |   ???   |           rsp -> |   ???   |
2056         //         +---------+                  +---------+
2057         //         | w_{n-1} |                  |   rbp   | <- rbp'
2058         //         +---------+                  +---------+
2059         //         |   ...   |                  | w_{n-1} |
2060         //         +---------+                  +---------+
2061         //         |   w_1   |                  |   ...   |
2062         //         +---------+                  +---------+
2063         //         |   w_0   |                  |   w_1   |
2064         //         +---------+                  +---------+
2065         //                                      |   w_0   |
2066         //                                      +---------+
2067         //                                      |   rbp'  | <- rsp'
2068         //                                      +---------+
2069
2070         mov     rdx, rsp                // d = rsp' from the diagram
2071         mov     rsp, rax                // back to the real stack
2072
2073 #elif defined(__i386__)
2074
2075         mov     eax, esp                        // safekeeping
2076
2077         // we're toast if we get hit by a signal now.  fingers crossed...
2078   .if 0
2079         mov     esp, buff2 + 4*n + 4
2080         mov     ebp, buff1 + 4*n
2081   .else
2082         lea     esp, [edi + 4*n + 8]
2083         lea     ebp, [esi + 4*n]
2084   .endif
2085         enter   0, n + 1                // nesting level n + 1 does the copy
2086
2087         mov     edx, esp                // d = the final stack pointer
2088         mov     esp, eax                // back to the real stack
2089
2090 #elif defined(__arm__)
2091         // r4 and r5 stand in for rbp and rsp; the real sp is left alone.
2092         add     r4, r4, #4*n
2093         add     r5, r5, #4*n + 8
2094
2095         str     r4, [r5, #-4]!          // the `push rbp' step
2096   .rept n/2
2097         ldrd    r0, r1, [r4, #-8]!      // copy two words at a time,
2098         strd    r0, r1, [r5, #-8]!      // working downwards
2099   .endr
2100         add     r4, r5, #4*n            // address of the first word stored
2101         str     r4, [r5, #-4]!          // the final `push rbp'' step
2102
2103 #elif defined(__aarch64__)
2104
2105         // omgwtf.  let's not actually screw with the stack pointer.
2106         // x4 and x5 stand in for rbp and rsp.
2107         add     x4, x4, #8*n
2108         add     x5, x5, #8*n + 16
2109
2110         str     x4, [x5, #-8]!          // the `push rbp' step
2111   .rept n/2
2112         ldp     x16, x17, [x4, #-16]!   // copy two words at a time,
2113         stp     x16, x17, [x5, #-16]!   // working downwards
2114   .endr
2115         add     x4, x5, #8*n            // address of the first word stored
2116         str     x4, [x5, #-8]!          // the final `push rbp'' step
2117
2118 #else
2119         notimpl
2120 #endif
2121
2122         ret
2123
2124 endproc
2125
2126 proc    x1e
2127
2128         // convert nibble value to (uppercase) hex; other input values yield
2129         // nonsense.
2130
2131 #if defined(__x86_64__)
2132
2133         // das doesn't work in 64-bit mode; best i can come up with
2134         mov     edx, eax
2135         add     al, '0'                 // the answer if a is a decimal digit
2136         add     dl, 'A' - 10            // the answer if a is a letter
2137         cmp     al, '9' + 1
2138         cmovae  eax, edx                // past '9', so choose the letter
2139
2140 #elif defined(__i386__)
2141
2142         cmp     al, 0x0a                // cf = 1 iff a < 10
2143         sbb     al, 0x69                // if 0 <= a < 10, a' = a - 0x6a, so
2144                                         // 0x96 <= a' < 0xa0, setting af, cf
2145                                         // if 10 <= a < 16, a' = a - 0x69, so
2146                                         // 0xa1 <= a' < 0xa7, setting cf but
2147                                         // clearing af
2148         das                             // if 0 <= a < 10, then af and cf are
2149                                         // both set, so subtract 0x66
2150                                         // from a' leaving 0x30 <= a' < 0x3a;
2151                                         // if 10 <= a < 16 then af clear but
2152                                         // cf set, so subtract 0x60 from a'
2153                                         // leaving 0x41 <= a' < 0x47
2154
2155 #elif defined(__arm__)
2156
2157         // significantly less tricksy
2158         cmp     r0, #10
2159         addlo   r0, r0, #'0'            // a < 10: decimal digit
2160         addhs   r0, r0, #'A' - 10       // a >= 10: letter
2161
2162 #elif defined(__aarch64__)
2163
2164         // with less versatile conditional execution this is the best we can
2165         // do
2166         cmp     w0, #10
2167         add     w16, w0, #'A' - 10      // the answer if a is a letter
2168         add     w0, w0, #'0'            // the answer if a is a decimal digit
2169         cmov.hs w0, w16                 // a >= 10, so choose the letter
2170
2171 #else
2172         notimpl
2173 #endif
2174
2175         ret
2176
2177 endproc
2178
2179 proc    x1f
2180
2181         // verify collatz conjecture starting at a; assume a /= 0!
             //
             // a is rax/eax/r0/w0.  each pass strips the trailing zero bits from a
             // and then, unless that leaves 1, replaces a by 3 a + 1; the routine
             // returns once a = 1.
             //
             // the odd value reached on each pass is also stored through a pointer
             // (rdi/edi, r5, x5) for as long as a down-counter (rdx/edx, r3, x3)
             // is nonzero -- presumably a trace buffer and its capacity supplied
             // by the caller; confirm against the driver.
2182
2183 #if defined(__x86_64__)
2184
2185 0:      bsf     rcx, rax                // clobber c if a = 0
2186         shr     rax, cl                 // a = 2^c a'
2187   cmp rdx, 0                            // store count d still nonzero?
2188   je 1f                                 // no -- skip the store
2189   stosq                                 // store a' at [rdi], step rdi
2190   dec rdx                               // d' = d - 1
2191 1:
2192         cmp     rax, 1                  // done?
2193         je      9f
2194         lea     rax, [2*rax + rax + 1]  // a' = 3 a' + 1
2195         jmp     0b                      // again
2196
2197 9:      ret
2198
2199 #elif defined(__i386__)
2200
2201 0:      bsf     ecx, eax                // clobber c if a = 0
2202         shr     eax, cl                 // a = 2^c a'
2203   cmp edx, 0                            // store count d still nonzero?
2204   je 1f                                 // no -- skip the store
2205   stosd                                 // store a' at [edi], step edi
2206   dec edx                               // d' = d - 1
2207 1:
2208         cmp     eax, 1                  // done?
2209         je      9f
2210         lea     eax, [2*eax + eax + 1]  // a' = 3 a' + 1
2211         jmp     0b                      // again
2212
2213 9:      ret
2214
2215 #elif defined(__arm__)
2216
2217         // rbit introduced in armv7
2218 0:      rbit    r2, r0                  // reverse, so trailing zeros lead
2219         clz     r2, r2                  // c = number of trailing zeros of a
2220         mov     r0, r0, lsr r2          // a = 2^c a'
2221   cmp r3, #0                            // store count still nonzero?
2222   strne r0, [r5], #4                    // yes -- store a', step r5
2223   subne r3, r3, #1                      // and count it
2224         cmp     r0, #1                  // done?  (sets c, since a' >= 1)
2225         adcne   r0, r0, r0, lsl #1      // a' = 3 a' + 1 (because c set)
2226         bne     0b                      // again
2227
2228         ret
2229
2230 #elif defined(__aarch64__)
2231
2232 0:      rbit    w2, w0                  // reverse, so trailing zeros lead
2233         clz     w2, w2                  // c = number of trailing zeros of a
2234         lsr     w0, w0, w2              // a = 2^c a'
2235   cmp x3, #0                            // store count still nonzero?
2236   beq 1f                                // no -- skip the store
2237   str x0, [x5], #8                      // store a' (64 bits wide), step x5
2238   sub x3, x3, #1                        // and count it
2239 1:
2240         cmp     w0, #1                  // done?
2241         add     w16, w0, w0, lsl #1     // t = 3 a'
2242         csinc.eq w0, w0, w16            // a' = t + 1 = 3 a' + 1 unless a' = 1
2243         b.ne    0b                      // again
2244
2245         ret
2246
2247 #else
2248         notimpl
2249 #endif
2250
2251 endproc
2252
2253 ///--------------------------------------------------------------------------
2254 /// 0x20--0x2f
2255
2256 proc    x20
2257
2258         // calculate 1337 a slowly
             //
             // a is rax/eax/r0/x0 and is left untouched.  the step-by-step product
             // ends up in rcx/ecx/r2/x2; a shorter sequence then leaves the same
             // product in rbx/ebx/r1/x1 (with rdx/edx as scratch on x86).
2259
2260 #if defined(__x86_64__)
2261
2262         // original version: work along the bits of 1337 = 0b10100111001,
             // shifting up to the next set bit and adding a for it
2263         mov     rcx, rax                // c = a
2264         shl     rcx, 2                  // c = 4 a
2265         add     rcx, rax                // c = 5 a
2266         shl     rcx, 3                  // c = 40 a
2267         add     rcx, rax                // c = 41 a
2268         shl     rcx, 1                  // c = 82 a
2269         add     rcx, rax                // c = 83 a
2270         shl     rcx, 1                  // c = 166 a
2271         add     rcx, rax                // c = 167 a
2272         shl     rcx, 3                  // c = 1336 a
2273         add     rcx, rax                // c = 1337 a
2274
2275         // a quick way: 1337 = 7 * 191 and 191 = 3 * 64 - 1
2276         lea     rdx, [2*rax + rax]      // t = 3 a
2277         shl     rdx, 6                  // t = 192 a
2278         sub     rdx, rax                // t = 191 a
2279         lea     rbx, [8*rdx]            // b = 1528 a
2280         sub     rbx, rdx                // b = 1337 a
2281
2282 #elif defined(__i386__)
2283
2284         // original version: work along the bits of 1337 = 0b10100111001,
             // shifting up to the next set bit and adding a for it
2285         mov     ecx, eax                // c = a
2286         shl     ecx, 2                  // c = 4 a
2287         add     ecx, eax                // c = 5 a
2288         shl     ecx, 3                  // c = 40 a
2289         add     ecx, eax                // c = 41 a
2290         shl     ecx, 1                  // c = 82 a
2291         add     ecx, eax                // c = 83 a
2292         shl     ecx, 1                  // c = 166 a
2293         add     ecx, eax                // c = 167 a
2294         shl     ecx, 3                  // c = 1336 a
2295         add     ecx, eax                // c = 1337 a
2296
2297         // a quick way: 1337 = 7 * 191 and 191 = 3 * 64 - 1
2298         lea     edx, [2*eax + eax]      // t = 3 a
2299         shl     edx, 6                  // t = 192 a
2300         sub     edx, eax                // t = 191 a
2301         lea     ebx, [8*edx]            // b = 1528 a
2302         sub     ebx, edx                // b = 1337 a
2303
2304 #elif defined(__arm__)
2305
2306         // original version, ish: each shift-and-add pair is one instruction
2307         add     r2, r0, r0, lsl #2      // c = 5 a
2308         add     r2, r0, r2, lsl #3      // c = 41 a
2309         add     r2, r0, r2, lsl #1      // c = 83 a
2310         add     r2, r0, r2, lsl #1      // c = 167 a
2311         add     r2, r0, r2, lsl #3      // c = 1337 a
2312
2313         // quicker way: 1337 = 7 * 191 and 191 = 3 * 64 - 1
2314         add     r1, r0, r0, lsl #1      // b = 3 a
2315         rsb     r1, r0, r1, lsl #6      // b = 191 a
2316         rsb     r1, r1, r1, lsl #3      // b = 1337 a
2317
2318 #elif defined(__aarch64__)
2319
2320         // original version, ish: each shift-and-add pair is one instruction
2321         add     x2, x0, x0, lsl #2      // c = 5 a
2322         add     x2, x0, x2, lsl #3      // c = 41 a
2323         add     x2, x0, x2, lsl #1      // c = 83 a
2324         add     x2, x0, x2, lsl #1      // c = 167 a
2325         add     x2, x0, x2, lsl #3      // c = 1337 a
2326
2327         // sleazy because no rsb
             // (so the intermediate comes out negated, and the second subtraction
             // negates it back: -191 a - 8 (-191 a) = 7 * 191 a)
2328         add     x1, x0, x0, lsl #1      // b = 3 a
2329         sub     x1, x0, x1, lsl #6      // b = -191 a
2330         sub     x1, x1, x1, lsl #3      // b = 1337 a
2331
2332 #else
2333         notimpl
2334 #endif
2335
2336         ret
2337
2338 endproc
2339
2340 proc    x21
2341
2342         // multiply complex numbers a + b i and c + d i
2343         //
2344         //      (a + b i) (c + d i) = (a c - b d) + (a d + b c) i
2345         //
2346         // somewhat slick approach uses only three multiplications
             //
             // on x86, a, b, c, d are rax, rbx, rcx, rdx (or the e-registers); on
             // arm and aarch64 they are r0--r3 / x0--x3.  the real part is left
             // where a was and the imaginary part where b was.  x86 overwrites
             // rdx, rsi and rdi on the way; arm/aarch64 overwrite registers 2--5.
2347
2348 #if defined(__x86_64__)
2349
2350         mov     rsi, rax                // t = a
2351         add     rax, rbx                // a' = a + b
2352         mov     rdi, rdx                // u = d
2353         sub     rdx, rcx                // d' = d - c
2354         add     rdi, rcx                // u = c + d
2355
2356         imul    rax, rcx                // a' = c (a + b)
2357         imul    rsi, rdx                // t = a (d - c)
2358         imul    rdi, rbx                // u = b (c + d)
2359
2360         add     rsi, rax                // t = a (d - c) + c (a + b)
2361         mov     rbx, rsi                // b' = a (d - c) + c (a + b)
2362                                         //      = a d + b c
2363         sub     rax, rdi                // a' = c (a + b) - b (c + d)
2364                                         //      = a c - b d
2365
2366 #elif defined(__i386__)
2367
2368         mov     esi, eax                // t = a
2369         add     eax, ebx                // a' = a + b
2370         mov     edi, edx                // u = d
2371         sub     edx, ecx                // d' = d - c
2372         add     edi, ecx                // u = c + d
2373
2374         imul    eax, ecx                // a' = c (a + b)
2375         imul    esi, edx                // t = a (d - c)
2376         imul    edi, ebx                // u = b (c + d)
2377
2378         add     esi, eax                // t = a (d - c) + c (a + b)
2379         mov     ebx, esi                // b' = a (d - c) + c (a + b)
2380                                         //      = a d + b c
2381         sub     eax, edi                // a' = c (a + b) - b (c + d)
2382                                         //      = a c - b d
2383
2384 #elif defined(__arm__)
2385
2386         add     r4, r0, r1              // t = a + b
2387         add     r5, r2, r3              // u = c + d
2388         sub     r3, r3, r2              // d' = d - c
2389
2390         // mls introduced in armv7
2391         mul     r4, r4, r2              // t = c (a + b)
2392         mov     r2, r1                  // c' = b (bah!): keep b, since the
                                             // next instruction overwrites r1
2393         mla     r1, r0, r3, r4          // b' = a (d - c) + c (a + b)
2394                                         //      = a d + b c
2395         mls     r0, r2, r5, r4          // a' = c (a + b) - b (c + d)
2396                                         //      = a c - b d
2397
2398 #elif defined(__aarch64__)
2399
2400         add     x4, x0, x1              // t = a + b
2401         add     x5, x2, x3              // u = c + d
2402         sub     x3, x3, x2              // d' = d - c
2403
2404         // madd/msub here do the job of mla/mls in the arm version
2405         mul     x4, x4, x2              // t = c (a + b)
2406         mov     x2, x1                  // c' = b (bah!): keep b, since the
                                             // next instruction overwrites x1
2407         madd    x1, x0, x3, x4          // b' = a (d - c) + c (a + b)
2408                                         //      = a d + b c
2409         msub    x0, x2, x5, x4          // a' = c (a + b) - b (c + d)
2410                                         //      = a c - b d
2411
2412 #else
2413         notimpl
2414 #endif
2415
2416         ret
2417
2418 endproc
2419
2420 proc    x22
2421
2422         // divide by 3
             //
             // unsigned a in rax/eax/r0/x0 is replaced by floor(a/3) without a
             // divide instruction: multiply by f = ceil(2/3 2^w), where w is the
             // register width, keep only the high word of the double-width
             // product, and halve it.  rdx/edx, r12 and x16 are overwritten.
2423
2424 #if defined(__x86_64__)
2425
2426         mov     rdx, 0xaaaaaaaaaaaaaaab // = ceil(2/3 2^64)
2427         mul     rdx                     // d' || a' =~ 2/3 a 2^64
2428         shr     rdx, 1                  // d' = floor(a/3)
2429         mov     rax, rdx                // a' = floor(a/3)
2430
2431         // we start with 0 <= a < 2^64.  write f = ceil(2/3 2^64), so that
2432         // 2/3 < f/2^64 < 2/3 + 1/2^64.  then floor(2/3 a) <= floor(a f/2^64)
2433         // <= floor(2/3 a + a/2^64), but a < 2^64 so a/2^64 < 1 and
2434         // floor(a f/2^64) = floor(2/3 a).
             // halving that and rounding down again gives floor(a/3).
2435
2436 #elif defined(__i386__)
2437
2438         mov     edx, 0xaaaaaaab         // = ceil(2/3 2^32)
2439         mul     edx                     // d' || a' =~ 2/3 a 2^32
2440         shr     edx, 1                  // d' = floor(a/3)
2441         mov     eax, edx                // a' = floor(a/3)
2442
2443 #elif defined(__arm__)
2444
2445         ldr     r12, =0xaaaaaaab        // = ceil(2/3 2^32)
2446         umull   r12, r0, r0, r12        // a' || t =~ 2/3 a 2^32
2447         mov     r0, r0, lsr #1          // a' = floor(a/3)
2448
2449 #elif defined(__aarch64__)
2450
2451         ldr     x16, =0xaaaaaaaaaaaaaaab  // = ceil(2/3 2^64)
2452         umulh   x0, x0, x16             // a' = high half of a f
2453         lsr     x0, x0, #1              // a' = floor(a/3)
2454
2455 #else
2456         notimpl
2457 #endif
2458
2459         ret
2460
2461 endproc
2462
2463 proc    x23
2464
             // reduce a mod 3 using only shifts, masks, adds and subtracts
             //
             // unsigned a in rax/eax/r0/x0 is replaced by a mod 3.  since
             // 4 == 1 (mod 3), writing a = 4 q + r gives a == q + r (mod 3), and
             // q + r is much smaller than a; repeat until the value is below 6
             // and then knock off one final 3 if needed.  rdx/edx, r12 and x16
             // are overwritten.

2465 #if defined(__x86_64__)
2466
2467         // main loop: shorten a preserving residue class mod 3
2468 0:      cmp     rax, 5
2469         jbe     8f
2470         // a > 5
2471         mov     rdx, rax                // d' = a
2472         shr     rdx, 2                  // d' = floor(a/4)
2473         and     rax, 3                  // a = 4 d' + a' (0 <= a' < 4)
2474         add     rax, rdx                // a' == a (mod 3) but a' < a/4 + 4
2475         jmp     0b
2476
2477         // fix up final value 0 <= a < 6: want 0 <= a < 3
2478         //
2479         // the tricky part is actually a = 3; but the other final cases take
2480         // additional iterations which we can avoid.
2481 8:      cmp     rax, 3                  // set cf iff a < 3
2482         cmc                             // set cf iff a >= 3
2483         sbb     rdx, rdx                // d' = a >= 3 ? -1 : 0
2484         and     rdx, 3                  // d' = a >= 3 ? 3 : 0
2485         sub     rax, rdx                // a' = a - (a >= 3 ? 3 : 0)
2486                                         //      = a (mod 3)
2487
2488 #elif defined(__i386__)
2489
2490         // main loop: shorten a preserving residue class mod 3
2491 0:      cmp     eax, 5
2492         jbe     8f
2493         // a > 5
2494         mov     edx, eax                // d' = a
2495         shr     edx, 2                  // d' = floor(a/4)
2496         and     eax, 3                  // a = 4 d' + a' (0 <= a' < 4)
2497         add     eax, edx                // a' == a (mod 3) but a' < a/4 + 4
2498         jmp     0b
2499
2500         // fix up final value 0 <= a < 6: want 0 <= a < 3
2501         //
2502         // the tricky part is actually a = 3; but the other final cases take
2503         // additional iterations which we can avoid.
2504 8:      cmp     eax, 3                  // set cf iff a < 3
2505         cmc                             // set cf iff a >= 3
2506         sbb     edx, edx                // d' = a >= 3 ? -1 : 0
2507         and     edx, 3                  // d' = a >= 3 ? 3 : 0
2508         sub     eax, edx                // a' = a - (a >= 3 ? 3 : 0)
2509                                         //      = a (mod 3)
2510
2511 #elif defined(__arm__)
2512
2513 0:      cmp     r0, #6                  // a >= 6?
2514         andhs   r12, r0, #3             // if so, t = a mod 4
2515         addhs   r0, r12, r0, lsr #2     // a' = floor(a/4) + t == a (mod 3)
2516         bhs     0b                      // and go round again
2517
2518         cmp     r0, #3                  // now 0 <= a < 6
2519         subhs   r0, r0, #3              // a' = a - (a >= 3 ? 3 : 0)
2520
2521 #elif defined(__aarch64__)
2522
2523 0:      cmp     x0, #6                  // a >= 6?  (decides the b.hs below)
2524         // blunder on through regardless since this doesn't affect the result
             // (the step keeps the residue mod 3, and a value below 6 stays below
             // 6; neither and nor add touches the flags set by the cmp)
2525         and     x16, x0, #3             // t = a mod 4
2526         add     x0, x16, x0, lsr #2     // a' = floor(a/4) + t == a (mod 3)
2527         b.hs    0b                      // repeat if a was >= 6
2528
2529         subs    x16, x0, #3             // t = a - 3; hs iff a >= 3
2530         cmov.hs x0, x16                 // a' = a - (a >= 3 ? 3 : 0)
2531
2532 #else
2533         notimpl
2534 #endif
2535
2536         ret
2537
2538 endproc
2539
2540 proc    x24
2541
2542         // invert (odd) a mod 2^64
2543         //
2544         // suppose a a_i == 1 (mod 2^{2^i})
2545         //
2546         // clearly good for i = 0, since 2^i = 1 and 2^{2^i} = 2, and a_0 =
2547         // a == 1 (mod 2) by assumption
2548         //
2549         // write a a_i == b_i 2^{2^i} + 1 (mod 2^{2^{i+1}})
2550         // then b_i == (a a_i - 1)/2^{2^i} (mod 2^{2^i})
2551         // to lift inverse, we want x such that a x == -b_i (mod 2^{2^i});
2552         // clearly x = -a_i b_i will do, since a a_i == 1 (mod 2^{2^i})
2553         // then:
2554         // a_{i+1} = a_i - a_i b_i 2^{2^i} = a_i (1 - (a a_i - 1))
2555         //      = 2 a_i - a a_i^2
2556         //
2557         // check:
2558         // a a_{i+1} = 2 a a_i - a^2 a_i^2
2559         //      == 2 a a_i - (b_i 2^{2^i} + 1)^2
2560         //      == 2 (b_i 2^{2^i} + 1) -
2561         //              (b_i^2 2^{2^{i+1}} + 2 b_i 2^{2^i} + 1)
2562         //      == 1 (mod 2^{2^{i+1}})
             //
             // a is rax/eax/r0/x0 and is replaced by its inverse; the i386 and arm
             // variants use 32-bit registers and so work mod 2^32.  every variant
             // stops once the truncated product a a_i is exactly 1.
             //
             // at the top of each pass the current a_i is also stored through a
             // pointer (rdi/edi, r5, x5) for as long as a down-counter (rbp/ebp,
             // r6, x6) is nonzero -- presumably a trace buffer and its capacity
             // supplied by the caller; confirm against the driver.
2563
2564 #if defined(__x86_64__)
2565
2566         // rax                          // a_0 = a
2567         mov     rbx, rax                // b' = a
2568         mov     rsi, rax                // t = a_0
2569
2570 0:
2571   cmp rbp, 0                            // store count still nonzero?
2572   je 1f                                 // no -- skip the store
2573   stosq                                 // store a_i at [rdi], step rdi
2574   dec rbp                               // and count it
2575 1:
2576         mul     rbx                     // a' = a a_i
2577         mov     rcx, rax                // c = a a_i
2578
2579         sub     rax, 2                  // a' = a a_i - 2
2580         neg     rax                     // a' = 2 - a a_i
2581         mul     rsi                     // a_{i+1} = a_i (2 - a a_i)
2582                                         //      = 2 a_i - a a_i^2
2583         mov     rsi, rax                // t = a_{i+1}
2584
2585         cmp     rcx, 1                  // done?  (if a a_i = 1 then the
                                             // product above was just a_i)
2586         ja      0b                      // no -- iterate
2587
2588 #elif defined(__i386__)
2589
2590         // eax                          // a_0 = a
2591         mov     ebx, eax                // b' = a
2592         mov     esi, eax                // t = a_0
2593
2594 0:
2595   cmp ebp, 0                            // store count still nonzero?
2596   je 1f                                 // no -- skip the store
2597   stosd                                 // store a_i at [edi], step edi
2598   dec ebp                               // and count it
2599 1:
2600         mul     ebx                     // a' = a a_i
2601         mov     ecx, eax                // c = a a_i
2602
2603         sub     eax, 2                  // a' = a a_i - 2
2604         jb      9f                      // done if < 2
2605         neg     eax                     // a' = 2 - a a_i
2606         mul     esi                     // a_{i+1} = a_i (2 - a a_i)
2607                                         //      = 2 a_i - a a_i^2
2608         mov     esi, eax                // t = a_{i+1}
2609
2610         jmp     0b                      // and iterate
2611 9:      mov     eax, esi                // restore
2612
2613 #elif defined(__arm__)
2614
2615         // r0                           // a_0 = a
2616         mov     r1, r0                  // b' = a
2617
2618 0:
2619   cmp r6, #0                            // store count still nonzero?
2620   strne r0, [r5], #4                    // yes -- store a_i, step r5
2621   subne r6, r6, #1                      // and count it
2622         mul     r2, r0, r1              // c = a a_i
2623         rsbs    r2, r2, #2              // c = 2 - a a_i
2624         mul     r0, r0, r2              // a_{i+1} = a_i (2 - a a_i)
2625                                         //      = 2 a_i - a a_i^2
2626         blo     0b                      // rsbs borrowed, so a a_i > 2:
                                             // iterate (mul left the flags alone)
2627
2628 #elif defined(__aarch64__)
2629
2630         // x0                           // a_0 = a
2631         mov     x1, x0                  // b' = a
2632         mov     x16, #2                 // because we have no rsb
2633
2634 0:
2635   cmp x6, #0                            // store count still nonzero?
2636   b.eq 1f                               // no -- skip the store
2637   str x0, [x5], #8                      // store a_i, step x5
2638   sub x6, x6, #1                        // and count it
2639 1:
2640         mul     x2, x0, x1              // c = a a_i
2641         subs    x2, x16, x2             // c = 2 - a a_i
2642         mul     x0, x0, x2              // a_{i+1} = a_i (2 - a a_i)
2643                                         //      = 2 a_i - a a_i^2
2644         b.lo    0b                      // subs borrowed, so a a_i > 2:
                                             // iterate (mul left the flags alone)
2645
2646 #else
2647         notimpl
2648 #endif
2649
2650         ret
2651
2652 endproc
2653
2654 proc    x25
2655
2656         // a poor approximation to pi/4
2657         //
2658         // think of x and y as being in 16.16 fixed-point format.  we sample
2659         // points in the unit square, and determine how many of them are
2660         // within a unit quarter-circle centred at the origin.  the area of
2661         // the quarter-circle is pi/4.
             //
             // a counter is stepped through all 2^32 values of its low 32 bits;
             // bits 0--15 give x and bits 16--31 give y.  the number of points
             // with x^2 + y^2 < 2^32 is left in rax/eax/r0/w0 (so it is only
             // known mod 2^32 in the 32-bit variants).  the other registers
             // named below are overwritten.
2662
2663 #if defined(__x86_64__)
2664
2665         xor     eax, eax                // a = 0
2666         mov     rcx, 1
2667         shl     rcx, 0x20               // c =~ 4 billion
2668
2669 0:      movzx   rbx, cx                 // x = low 16 bits of c
2670         imul    rbx, rbx                // b = x^2
2671
2672         ror     rcx, 0x10               // switch halves of c
2673         movzx   rdx, cx                 // y = high 16 bits of c
2674         imul    rdx, rdx                // d = y^2
2675         rol     rcx, 0x10               // switch back
2676
2677         add     rbx, rdx                // r^2 = x^2 + y^2
2678         shr     rbx, 0x20               // r^2 >= 1?
2679         cmp     rbx, 1                  // set cf iff r^2 < 1
2680         adc     rax, 0                  // and add onto accumulator
2681         loop    0b                      // c' = c - 1; again unless c' = 0
2682
2683 #elif defined(__i386__)
2684
2685         // this is actually better done in 32 bits.  the carry has the wrong
2686         // sense here, so instead deduct one for each point outside the
2687         // quarter-circle rather than adding one for each point inside it.
             // (inside + outside = 2^32 == 0, so minus the outside count is the
             // inside count mod 2^32.)
2688         xor     eax, eax                // a = 0
2689         xor     ecx, ecx                // c = 0: loop wraps, 2^32 passes
2690
2691 0:      movzx   ebx, cx                 // x = low 16 bits of c
2692         imul    ebx, ebx                // b = x^2
2693
2694         ror     ecx, 0x10               // switch halves of c
2695         movzx   edx, cx                 // y = high 16 bits of c
2696         imul    edx, edx                // d = y^2
2697         rol     ecx, 0x10               // switch back
2698
2699         add     ebx, edx                // see?
2700         sbb     eax, 0                  // a' = a - 1 if the sum carried
2701         loop    0b                      // c' = c - 1; again unless c' = 0
2702
2703 #elif defined(__arm__)
2704
2705         mov     r0, #0                  // a = 0
2706         mov     r2, #0                  // c = 0
2707
2708 0:      uxth    r1, r2, ror #0          // x = low 16 bits of c
2709         uxth    r3, r2, ror #16         // y = high 16 bits of c
2710         mul     r1, r1, r1              // b = x^2
2711         mul     r3, r3, r3              // d = y^2
2712         cmn     r1, r3                  // mlas doesn't set cf usefully
2713         addcc   r0, r0, #1              // no carry, so r^2 < 1: count it
2714         adds    r2, r2, #1              // c' = c + 1
2715         bne     0b                      // again until c wraps to 0
2716
2717 #elif defined(__aarch64__)
2718
2719         mov     w0, #0                  // a = 0
2720         mov     w2, #0                  // c = 0
2721
2722 0:      ubfx    w1, w2, #0, #16         // x = low 16 bits of c
2723         ubfx    w3, w2, #16, #16        // y = high 16 bits of c
2724         sub     w2, w2, #1              // c' = c - 1 (counts down instead)
2725         mul     w1, w1, w1              // b = x^2
2726         mul     w3, w3, w3              // d = y^2
2727         cmn     w1, w3                  // carry iff x^2 + y^2 >= 2^32
2728         cinc.cc w0, w0                  // no carry, so r^2 < 1: count it
2729         cbnz    w2, 0b                  // again until c is back at 0
2730
2731 #else
2732         notimpl
2733 #endif
2734
2735         ret
2736
2737 endproc
2738
proc    x26

        // placeholder: nothing has been written for this one yet, so the
        // same notimpl is assembled whatever the target architecture.
        notimpl

endproc
2762
proc    x27

        // placeholder: nothing has been written for this one yet, so the
        // same notimpl is assembled whatever the target architecture.
        notimpl

endproc
2786
proc    x28

        // placeholder: nothing has been written for this one yet, so the
        // same notimpl is assembled whatever the target architecture.
        notimpl

endproc
2810
proc    x29

        // placeholder: nothing has been written for this one yet, so the
        // same notimpl is assembled whatever the target architecture.
        notimpl

endproc
2834
proc    x2a

        // placeholder: nothing has been written for this one yet, so the
        // same notimpl is assembled whatever the target architecture.
        notimpl

endproc
2858
proc    x2b

        // placeholder: nothing has been written for this one yet, so the
        // same notimpl is assembled whatever the target architecture.
        notimpl

endproc
2882
proc    x2c

        // placeholder: nothing has been written for this one yet, so the
        // same notimpl is assembled whatever the target architecture.
        notimpl

endproc
2906
proc    x2d

        // placeholder: nothing has been written for this one yet, so the
        // same notimpl is assembled whatever the target architecture.
        notimpl

endproc
2930
proc    x2e

        // placeholder: nothing has been written for this one yet, so the
        // same notimpl is assembled whatever the target architecture.
        notimpl

endproc
2954
proc    x2f

        // placeholder: nothing has been written for this one yet, so the
        // same notimpl is assembled whatever the target architecture.
        notimpl

endproc
2978
2979 ///--------------------------------------------------------------------------
2980 /// 0x30--0x3f
2981
proc    x30

        // placeholder: nothing has been written for this one yet, so the
        // same notimpl is assembled whatever the target architecture.
        notimpl

        // unlike the other placeholders, this one is followed by a return
        ret

endproc
3007
proc    x31

        // placeholder: nothing has been written for this one yet, so the
        // same notimpl is assembled whatever the target architecture.
        notimpl

endproc
3031
proc    x32

        // placeholder: nothing has been written for this one yet, so the
        // same notimpl is assembled whatever the target architecture.
        notimpl

endproc
3055
proc    x33

        // placeholder: nothing has been written for this one yet, so the
        // same notimpl is assembled whatever the target architecture.
        notimpl

endproc
3079
proc    x34

        // placeholder: nothing has been written for this one yet, so the
        // same notimpl is assembled whatever the target architecture.
        notimpl

endproc
3103
proc    x35

        // placeholder: nothing has been written for this one yet, so the
        // same notimpl is assembled whatever the target architecture.
        notimpl

endproc
3127
proc    x36

        // placeholder: nothing has been written for this one yet, so the
        // same notimpl is assembled whatever the target architecture.
        notimpl

endproc
3151
proc    x37

        // placeholder: nothing has been written for this one yet, so the
        // same notimpl is assembled whatever the target architecture.
        notimpl

endproc
3175
proc    x38

        // placeholder: nothing has been written for this one yet, so the
        // same notimpl is assembled whatever the target architecture.
        notimpl

endproc
3199
proc    x39

        // placeholder: nothing has been written for this one yet, so the
        // same notimpl is assembled whatever the target architecture.
        notimpl

endproc
3223
proc    x3a

        // placeholder: nothing has been written for this one yet, so the
        // same notimpl is assembled whatever the target architecture.
        notimpl

endproc
3247
proc    x3b

        // placeholder: nothing has been written for this one yet, so the
        // same notimpl is assembled whatever the target architecture.
        notimpl

endproc
3271
proc    x3c

        // placeholder: nothing has been written for this one yet, so the
        // same notimpl is assembled whatever the target architecture.
        notimpl

endproc
3295
proc    x3d

        // placeholder: nothing has been written for this one yet, so the
        // same notimpl is assembled whatever the target architecture.
        notimpl

endproc
3319
proc    x3e

        // placeholder: nothing has been written for this one yet, so the
        // same notimpl is assembled whatever the target architecture.
        notimpl

endproc
3343
proc    x3f

        // placeholder: nothing has been written for this one yet, so the
        // same notimpl is assembled whatever the target architecture.
        notimpl

endproc
3367
3368 ///----- That's all, folks --------------------------------------------------