chiark - git - mdw - xchg-rax-rax/blob - xchg.S

   1 /// -*- mode: asm; asm-comment-char: 0 -*-
   2
   3 ///--------------------------------------------------------------------------
   4 /// Preliminaries.
   5
   6 #include <sys/syscall.h>
   7
   8 #if defined(__i386__) || defined(__x86_64__)
   9
     // x86: operands are written in intel order (destination first), with
     // bare register names.
  10         .intel_syntax noprefix
  11
  12 #elif defined(__arm__)
  13
     // a32: provide an x86-style `ret', which branches to r14 (the link
     // register).
  14 .macro  ret
  15         bx      r14
  16 .endm
  17
  18         .arch   armv7-a
  19
  20 #elif defined(__aarch64__)
  21
     // a64: `cmov RD, RN, CC' sets RD = RN if condition CC holds, and leaves
     // RD unchanged otherwise.
  22 .macro  cmov    rd, rn, cc
  23         csel    \rd, \rn, \rd, \cc
  24 .endm
     // for each instruction INST listed in _INST and each condition CC listed
     // in _COND, define an assembler macro `INST.CC ARGS' which expands to
     // `INST ARGS, CC'.  the cpp helper macros are undefined again once the
     // assembler macros have been generated.
  25 #define _COND(_)                                                        \
  26         _(eq) _(ne) _(cs) _(cc) _(vs) _(vc) _(mi) _(pl)                 \
  27         _(ge) _(lt) _(gt) _(le) _(hi) _(ls) _(al) _(nv)                 \
  28         _(hs) _(lo)
  29 #define _INST(_)                                                        \
  30         _(ccmp) _(ccmn)                                                 \
  31         _(csel) _(cmov)                                                 \
  32         _(csinc) _(cinc) _(cset)                                        \
  33         _(csneg) _(cneg)                                                \
  34         _(csinv) _(cinv) _(csetm)
  35 #define _CONDVAR(cc) _definstvar cc;
  36 #define _INSTVARS(inst)                                                 \
  37         .macro _definstvar cc;                                          \
  38           .macro inst.\cc args:vararg; inst \args, \cc; .endm;          \
  39         .endm;                                                          \
  40         _COND(_CONDVAR);                                                \
  41         .purgem _definstvar;
  42         _INST(_INSTVARS)
  43 #undef _COND
  44 #undef _INST
  45 #undef _CONDVAR
  46 #undef _INSTVARS
  47
     // individual flag bits, for building the nzcv immediate operand of
     // ccmp/ccmn.
  48 #define CCMP_N 8
  49 #define CCMP_Z 4
  50 #define CCMP_C 2
  51 #define CCMP_V 1
  52
     // CCMP_cc is a flags value under which condition cc holds.
  53 #define CCMP_MI CCMP_N
  54 #define CCMP_PL 0
  55 #define CCMP_EQ CCMP_Z
  56 #define CCMP_NE 0
  57 #define CCMP_CS CCMP_C
  58 #define CCMP_HS CCMP_C
  59 #define CCMP_CC 0
  60 #define CCMP_LO 0
  61 #define CCMP_VS CCMP_V
  62 #define CCMP_VC 0
  63 #define CCMP_HI CCMP_C
  64 #define CCMP_LS 0
  65 #define CCMP_LT CCMP_N
  66 #define CCMP_GE 0
  67 #define CCMP_LE CCMP_N
  68 #define CCMP_GT 0
  69
  70 #else
  71 #  error "not supported"
  72 #endif
  73
  74 .macro  proc    name
             // begin a global function called NAME: export the symbol, mark
             // it as a function, align it to a 16-byte boundary, and define
             // the label.  also define a one-shot `endproc' macro which
             // records the function's size and then removes itself, so each
             // `proc' must be closed by `endproc' before the next begins.
  75         .globl  \name
  76         .type   \name, STT_FUNC
  77         .p2align 4
  78 \name\():
  79   .macro endproc
  80         .size   \name, . - \name
  81         .purgem endproc
  82   .endm
  83 .endm
  84
  85 .macro ch c
             // debugging aid: write the character c to stdout with putchar
             // and then flush stdout, saving and restoring the flags and the
             // caller-saved integer registers around the calls so that the
             // macro can be dropped into the middle of a code sequence.  the
             // stack pointer is realigned for the calls and restored
             // afterwards.  c must be an immediate on arm and aarch64.
  86 #if defined(__i386__)
  87
  88         pushf
  89         push    eax
  90         push    ebx
  91         push    ecx
  92         push    edx
  93         push    ebp
  94         mov     ebp, esp
  95         and     esp, -16
             sub     esp, 12                 // esp is 16-aligned after the push
  96
  97         push    \c                      // argument slot: the character
             call    get_pc_ebx              // ebx = address of the `add'
             add     ebx, offset _GLOBAL_OFFSET_TABLE_
                                             // ebx -> got, as needed by
                                             // position-independent plt calls
  98         call    putchar@plt
  99
             mov     eax, [ebx + stdout@GOT] // eax = address of stdout
             mov     eax, [eax]              // eax = stdout
             mov     [esp], eax              // reuse the slot for the stream
 104         call    fflush@plt
 105
 106         mov     esp, ebp
 107         pop     ebp
 108         pop     edx
 109         pop     ecx
 110         pop     ebx
 111         pop     eax
 112         popf
 113
 114 #elif defined(__x86_64__)
 115
 116         pushf
 117         push    rax
 118         push    rcx
 119         push    rdx
 120         push    rsi
 121         push    rdi
 122         push    r8
 123         push    r9
             push    r10                     // also caller-saved, so the
             push    r11                     // callees may clobber them
 124         push    rbp
 125         mov     rbp, rsp
 126         and     rsp, -16
 127
 128         mov     rdi, \c
 129         call    putchar@plt
 130
 131         mov     rdi, [rip + stdout]
 132         call    fflush@plt
 133
 134         mov     rsp, rbp
 135         pop     rbp
             pop     r11
             pop     r10
 136         pop     r9
 137         pop     r8
 138         pop     rdi
 139         pop     rsi
 140         pop     rdx
 141         pop     rcx
 142         pop     rax
 143         popf
 144
 145 #elif defined(__arm__)
 146
             stmfd   r13!, {r0-r5, r12, r14}
 148
             mrs     r5, cpsr                // keep the caller's flags in r5
 149         mov     r4, r13                 // keep the original sp in r4
 150         bic     r14, r4, #15
 151         mov     r13, r14
 152
 153         mov     r0, #\c
 154         bl      putchar@plt
 155
 156         ldr     r14, .L$_c$gotoff$\@
             ldr     r0, .L$_c$stdout$\@     // r0 = got offset of stdout entry
 157 .L$_c$gotpc$\@:
 158         add     r14, pc, r14            // r14 -> got
             ldr     r0, [r14, r0]           // r0 = address of stdout
             ldr     r0, [r0]                // r0 = stdout
 159         b       .L$_c$cont$\@
 160 .L$_c$gotoff$\@:
             .word   _GLOBAL_OFFSET_TABLE_ - .L$_c$gotpc$\@ - 8
     .L$_c$stdout$\@:
             .word   stdout(GOT)
 162 .L$_c$cont$\@:
 163         bl      fflush@plt
 164
             msr     cpsr_f, r5              // restore the caller's flags
 165         mov     r13, r4
             ldmfd   r13!, {r0-r5, r12, r14}
 167
 168 #elif defined(__aarch64__)
 169
 170         sub     sp, sp, #20*8
 171         stp      x0,  x1, [sp,   #0]
 172         stp      x2,  x3, [sp,  #16]
 173         stp      x4,  x5, [sp,  #32]
 174         stp      x6,  x7, [sp,  #48]
 175         stp      x8,  x9, [sp,  #64]
 176         stp     x10, x11, [sp,  #80]
 177         stp     x12, x13, [sp,  #96]
 178         stp     x14, x15, [sp, #112]
 179         stp     x16, x17, [sp, #128]
 180         mrs     x16, nzcv
 181         stp     x16, x30, [sp, #144]    // flags and link register
 182
 183         mov     w0, #\c
 184         bl      putchar
 185         adrp    x0, :got:stdout
 186         ldr     x0, [x0, #:got_lo12:stdout] // x0 = address of stdout
 187         ldr     x0, [x0]                // x0 = stdout
 188         bl      fflush
 189
 190         ldp     x16, x30, [sp, #144]
 191         msr     nzcv, x16
 192         ldp     x16, x17, [sp, #128]
 193         ldp     x14, x15, [sp, #112]
 194         ldp     x12, x13, [sp,  #96]
 195         ldp     x10, x11, [sp,  #80]
 196         ldp      x8,  x9, [sp,  #64]
 197         ldp      x6,  x7, [sp,  #48]
 198         ldp      x4,  x5, [sp,  #32]
 199         ldp      x2,  x3, [sp,  #16]
 200         ldp      x0,  x1, [sp,   #0]
 201         add     sp, sp, #20*8
 202
 203 #else
 204 #  error "not supported"
 205 #endif
 206 .endm
 207
 208 .macro  notimpl
             // placeholder for a variant which has not been written: emit an
             // instruction which faults rather than letting execution
             // continue.
 209 #if defined(__i386__) || defined(__x86_64__)
 210         ud2
 211 #elif defined(__arm__)
 212         udf
 213 #elif defined(__aarch64__)
 214         hlt     #0
 215 #else
 216 #  error "not supported"
 217 #endif
 218 .endm
 219
 220         .section .note.GNU-stack, "", %progbits
             // the empty flags on the note above mark this object as not
             // needing an executable stack.  everything below is code.
 221
 222         .text
 223
 224 #if defined(__i386__)
     // i386 has no pc-relative addressing, so fetch the caller's return
     // address -- the address of the instruction after the `call' -- into
     // ebx.  nothing else is changed.
 225 get_pc_ebx:
 226         mov     ebx, [esp]
 227         ret
 228 #endif
 229
 230
 231 proc    call_example
 232
             // call_example(func, regs): load the general-purpose registers
             // and arithmetic flags from the block at regs, call func, and
             // store the registers and flags it leaves behind back into the
             // same block.  callee-saved registers are preserved for our own
             // caller.
             //
             // register block layout (in machine words unless noted):
             //   i386:    eax, ebx, ecx, edx, esi, edi, ebp, flags
             //   x86_64:  rax, rbx, rcx, rdx, rsi, rdi, rbp, r8--r15, flags
             //   arm:     r0--r12, flags
             //   aarch64: x0--x17, x19--x29, flags (64-bit slots)
             //
             // on x86, only the flag bits in the mask 0x0cd5 (cf, pf, af, zf,
             // sf, df, of) are taken from the block; the rest keep their
             // current values.
             //
             // the stack pictures in the x86 comments list the words on the
             // stack from the top downwards.
     
 233 #if defined(__i386__)
 234
 235         push    ebx                     // ebx
 236         push    esi                     // esi, ebx
 237         push    edi                     // edi, esi, ebx
             push    ebp                     // ebp, ..., ebx
             pushf                           // flags, ebp, ..., ebx
 240
 241         mov     edi, [esp + 4*6]        // edi = func
 242         mov     esi, [esp + 4*7]        // esi = regs
             sub     esp, 4                  // pad, flags, ebp, ..., ebx
                                             // (so that func is entered with
                                             // esp + 4 16-byte aligned, as for
                                             // an ordinary call)
             push    esi                     // regs, pad, flags, ebp, ..., ebx
 244
 245         call    get_pc_ebx
 246         lea     eax, [ebx + 9f - .]     // eax = address of 9 below
             push    eax                     // cont, regs, pad, flags, ...
             push    edi                     // func, cont, regs, pad, flags, ...
 249
 250         mov     eax, [esi + 28]
 251         pushf
 252         pop     ecx
 253         and     eax,  0x0cd5
 254         and     ecx, ~0x0cd5
 255         or      eax, ecx
 256         push    eax
 257         popf
 258         mov     eax, [esi +  0]
 259         mov     ebx, [esi +  4]
 260         mov     ecx, [esi +  8]
 261         mov     edx, [esi + 12]
 262         mov     edi, [esi + 20]
 263         mov     ebp, [esi + 24]
 264         mov     esi, [esi + 16]         // last, because esi is the base
 265
             ret                             // -> func; regs, pad, flags, ...
 267
     9:      pushf                           // eflags, regs, pad, flags, ...
             push    esi                     // esi, eflags, regs, pad, flags, ...
 270         mov     esi, [esp + 8]
 271         mov     [esi +  0], eax
 272         mov     [esi +  4], ebx
 273         mov     [esi +  8], ecx
 274         mov     [esi + 12], edx
 275         mov     [esi + 20], edi
 276         mov     [esi + 24], ebp
             pop     eax                     // eflags, regs, pad, flags, ...
 278         mov     [esi + 16], eax
             pop     eax                     // regs, pad, flags, ebp, ..., ebx
 280         mov     [esi + 28], eax
 281
             add     esp, 8                  // flags, ebp, ..., ebx
 283         popf                            // ebp, ..., ebx
 284         pop     ebp                     // ..., ebx
 285         pop     edi
 286         pop     esi
 287         pop     ebx                     //
 288         ret
 289
 290 #elif defined(__x86_64__)
 291
 292         push    rbx                     // rbx
 293         push    r10
 294         push    r11
 295         push    r12
 296         push    r13
 297         push    r14
 298         push    r15
             push    rbp                     // rbp, ..., rbx
             pushf                           // flags, rbp, ..., rbx
 301
             sub     rsp, 8                  // pad, flags, rbp, ..., rbx
                                             // (so that func is entered with
                                             // rsp + 8 16-byte aligned, as for
                                             // an ordinary call)
             push    rsi                     // regs, pad, flags, rbp, ..., rbx
 303
 304         lea     rax, [rip + 9f]
             push    rax                     // cont, regs, pad, flags, ...
             push    rdi                     // func, cont, regs, pad, flags, ...
 307
 308         mov     rax, [rsi + 8*15]
 309         pushf
 310         pop     rcx
 311         and     rax,  0x0cd5
 312         and     rcx, ~0x0cd5
 313         or      rax, rcx
 314         push    rax
 315         popf
 316         mov     rax, [rsi +   0]
 317         mov     rbx, [rsi +   8]
 318         mov     rcx, [rsi +  16]
 319         mov     rdx, [rsi +  24]
 320         mov     rdi, [rsi +  40]
 321         mov     rbp, [rsi +  48]
 322         mov     r8,  [rsi +  56]
 323         mov     r9,  [rsi +  64]
 324         mov     r10, [rsi +  72]
 325         mov     r11, [rsi +  80]
 326         mov     r12, [rsi +  88]
 327         mov     r13, [rsi +  96]
 328         mov     r14, [rsi + 104]
 329         mov     r15, [rsi + 112]
 330         mov     rsi, [rsi +  32]        // last, because rsi is the base
 331
             ret                             // -> func; regs, pad, flags, ...
 333
     9:      pushf                           // rflags, regs, pad, flags, ...
             push    rsi                     // rsi, rflags, regs, pad, flags, ...
 336         mov     rsi, [rsp + 16]
 337         mov     [rsi +   0], rax
 338         mov     [rsi +   8], rbx
 339         mov     [rsi +  16], rcx
 340         mov     [rsi +  24], rdx
 341         mov     [rsi +  40], rdi
 342         mov     [rsi +  48], rbp
 343         mov     [rsi +  56],  r8
 344         mov     [rsi +  64],  r9
 345         mov     [rsi +  72], r10
 346         mov     [rsi +  80], r11
 347         mov     [rsi +  88], r12
 348         mov     [rsi +  96], r13
 349         mov     [rsi + 104], r14
 350         mov     [rsi + 112], r15
             pop     rax                     // rflags, regs, pad, flags, ...
 352         mov     [rsi +  32], rax
             pop     rax                     // regs, pad, flags, rbp, ..., rbx
 354         mov     [rsi + 120], rax
 355
             add     rsp, 16                 // flags, rbp, ..., rbx
 357         popf                            // rbp, ..., rbx
 358         pop     rbp                     // ..., rbx
 359         pop     r15
 360         pop     r14
 361         pop     r13
 362         pop     r12
 363         pop     r11
 364         pop     r10
 365         pop     rbx                     //
 366         ret
 367
 368 #elif defined(__arm__)
 369
 370         stmfd   r13!, {r0, r1, r4-r11, r14} // func on top, then regs
 371         ldmia   r1, {r0-r12, r14}       // r14 receives the flags word
 372         msr     cpsr, r14
 373         mov     r14, pc                 // return to the second `ldr'
 374         ldr     pc, [r13], #4           // pop func and jump to it
 375         ldr     r14, [r13], #4          // pop regs
 376         stmia   r14!, {r0-r12}
 377         mrs     r0, cpsr
 378         str     r0, [r14]               // flags word follows r12
 379         ldmfd   r13!, {r4-r11, pc}
 380
 381 #elif defined(__aarch64__)
 382
 383         stp     x29, x30, [sp, #-14*8]!
 384         mov     x29, sp
 385         stp     x19, x20, [sp,  #16]
 386         stp     x21, x22, [sp,  #32]
 387         stp     x23, x24, [sp,  #48]
 388         stp     x25, x26, [sp,  #64]
 389         stp     x27, x28, [sp,  #80]
 390         str           x1, [sp, #104]    // remember regs across the call
 391
 392         ldp     x29, x30, [x1, #224]    // x30 receives the flags word
 393         msr     nzcv, x30
 394         mov     x30, x0                 // x30 = func
 395         ldp     x27, x28, [x1, #208]
 396         ldp     x25, x26, [x1, #192]
 397         ldp     x23, x24, [x1, #176]
 398         ldp     x21, x22, [x1, #160]
 399         ldp     x19, x20, [x1, #144]
 400         ldp     x16, x17, [x1, #128]
 401         ldp     x14, x15, [x1, #112]
 402         ldp     x12, x13, [x1,  #96]
 403         ldp     x10, x11, [x1,  #80]
 404         ldp      x8,  x9, [x1,  #64]
 405         ldp      x6,  x7, [x1,  #48]
 406         ldp      x4,  x5, [x1,  #32]
 407         ldp      x2,  x3, [x1,  #16]
 408         ldp      x0,  x1, [x1,   #0]    // last, because x1 is the base
 409
 410         blr     x30
 411
 412         ldr     x30, [sp, #104]         // x30 = regs
 413         stp     x27, x28, [x30, #208]
 414         stp     x25, x26, [x30, #192]
 415         stp     x23, x24, [x30, #176]
 416         stp     x21, x22, [x30, #160]
 417         stp     x19, x20, [x30, #144]
 418         stp     x16, x17, [x30, #128]
 419         stp     x14, x15, [x30, #112]
 420         stp     x12, x13, [x30,  #96]
 421         stp     x10, x11, [x30,  #80]
 422         stp      x8,  x9, [x30,  #64]
 423         stp      x6,  x7, [x30,  #48]
 424         stp      x4,  x5, [x30,  #32]
 425         stp      x2,  x3, [x30,  #16]
 426         stp      x0,  x1, [x30,   #0]
 427         mov     x0, x30                 // free x30 to hold the flags
 428         mrs     x30, nzcv
 429         stp     x29, x30,  [x0, #224]
 430
 431         ldp     x19, x20, [sp,  #16]
 432         ldp     x21, x22, [sp,  #32]
 433         ldp     x23, x24, [sp,  #48]
 434         ldp     x25, x26, [sp,  #64]
 435         ldp     x27, x28, [sp,  #80]
 436         ldp     x29, x30, [sp], #14*8
 437
 438         ret
 439
 440 #else
 441 #  error "not supported"
 442 #endif
 443
 444 endproc
 445
 446 proc    nop
 447
             // do nothing: return immediately, leaving registers and flags
             // untouched.
 448         ret
 449
 450 endproc
 451
 452 ///--------------------------------------------------------------------------
 453 /// 0x00--0x0f
 454
 455 proc    x00
 456
 457         // clear all 64 bits of extended traditional registers
             //
             // each register is zeroed using a different idiom.  on x86_64
             // and aarch64, writing a 32-bit register also clears the top
             // half of the corresponding 64-bit register.
 458
 459 #if defined(__x86_64__)
 460
 461         xor      eax, eax               // clear rax
 462         lea      rbx, [0]               // rbx -> _|_
 463         loop     .                      // iterate, decrement rcx until zero
 464         mov      rdx, 0                 // set rdx = 0
 465         and      esi, 0                 // clear all bits of rsi
 466         sub      edi, edi               // set rdi = edi - edi = 0
 467         push     0
 468         pop      rbp                    // pop 0 into rbp
 469
 470 #elif defined(__i386__)
 471
 472         xor     eax, eax                // a XOR a = 0
 473         lea     ebx, [0]                // address of absolute location 0
 474         loop    .                       // decrement ecx until zero
 475         mov     edx, 0
 476         and     esi, 0
 477         sub     edi, edi
 478         push    0
 479         pop     ebp                     // pop 0 into ebp
 480
 481 #elif defined(__arm__)
 482
 483         eor     r0, r0, r0              // a XOR a = 0
 484         rsb     r1, r1, r1              // b - b = 0
 485 0:      subs    r2, r2, #1              // count c down until it reaches
 486         bne     0b                      // zero
 487         mov     r3, #0
 488         and     r4, r4, #0
 489         sub     r5, r5, r5
 490
 491 #elif defined(__aarch64__)
 492
 493         eor     w0, w0, w0              // a XOR a = 0
 494         mov     w1, wzr                 // copy the zero register
 495 0:      sub     w2, w2, #1              // count the low half of c down
 496         cbnz    w2, 0b                  // until it reaches zero
 497         mov     w3, #0
 498         and     w4, w4, wzr
 499         sub     w5, w5, w5
 500
 501 #else
 502         notimpl
 503 #endif
 504
 505         ret
 506
 507 endproc
 508
 509 proc    x01
 510
 511         // advance a fibonacci pair by c steps
 512         //
 513         // on entry, a and d are f_{i+1} and f_i; on exit, they are f_{i+c+1}
 514         // and f_{i+c}, where f_{i+1} = f_i + f_{i-1}
             //
             // the arm and aarch64 versions take a in r0/x0, d in r3/x3 and c
             // in r2/x2, and advance two steps per iteration.  if c is odd
             // then the final half-iteration leaves the correct value in a,
             // but d is left equal to a rather than f_{i+c}.
 515
 516 #if defined(__x86_64__)
 517
 518 0:      xadd    rax, rdx                // a, d = a + d, a
 519                                         //      = f_{i+1} + f_i, f_{i+1}
 520                                         //      = f_{i+2}, f_{i+1}
 521         loop    0b                      // advance i, decrement c, iterate
 522
 523 #elif defined(__i386__)
 524
 525 0:      xadd    eax, edx                // a, d = a + d, a
 526         loop    0b                      // advance i, decrement c, iterate
 527
 528 #elif defined(__arm__)
 529
 530 0:      subs    r2, r2, #2              // c' = c - 2; borrow if c was 1
 531         add     r3, r3, r0              // d' = d + a = f_{i+2}
 532         blo     8f                      // only one step was wanted
 533         add     r0, r0, r3              // a' = a + d' = f_{i+3}
 534         bhi     0b                      // iterate while c' > 0
 535
 536 8:      movne   r0, r3                  // odd count: a' = d' = f_{i+2}
 537
 538 #elif defined(__aarch64__)
 539
 540 0:      subs    x2, x2, #2              // c' = c - 2; borrow if c was 1
 541         add     x3, x3, x0              // d' = d + a = f_{i+2}
 542         b.lo    8f                      // only one step was wanted
 543         add     x0, x0, x3              // a' = a + d' = f_{i+3}
 544         b.hi    0b                      // iterate while c' > 0
 545
 546 8:      cmov.ne x0, x3                  // odd count: a' = d' = f_{i+2}
 547
 548 #else
 549         notimpl
 550 #endif
 551
 552         ret
 553
 554 endproc
 555
 556 proc    x02
 557
 558         // boolean canonify a: if a = 0 on entry, leave it zero; otherwise
 559         // set a = 1
             //
             // the arm and aarch64 versions try several methods in turn, all
             // working from the same input a in r0/x0: the first three leave
             // their answers in r1/x1, r2/x2 and r3/x3, and the last
             // overwrites a itself.
 560
 561 #if defined(__x86_64__)
 562
 563         neg     rax                     // set cf iff a /= 0
 564         sbb     rax, rax                // a = a - a - cf = -cf
 565         neg     rax                     // a = cf
 566
 567 #elif defined(__i386__)
 568
 569         neg     eax                     // set cf iff a /= 0
 570         sbb     eax, eax                // a = a - a - cf = -cf
 571         neg     eax                     // a = cf
 572
 573 #elif defined(__arm__)
 574
 575         movs    r1, r0                  // the easy way
 576         movne   r1, #1                  // mvnne r1, #0 for mask
 577
 578         cmp     r0, #1                  // clear cf iff a == 0
 579         sbc     r2, r0, r0              // c' = a - a - 1 + cf = cf - 1
 580         add     r2, r2, #1              // c' = cf
 581
 582         sub     r3, r0, r0, lsr #1      // d' top bit clear; d' = 0 iff a = 0
 583         rsb     r3, r3, #0              // d' top bit set iff a /= 0
 584         mov     r3, r3, lsr #31         // asr for mask
 585
 586         rsbs    r0, r0, #0              // a' = -a; clear cf iff a /= 0
 587         sbc     r0, r0, r0              // a' = cf - 1 = -[a /= 0]
 588         rsb     r0, r0, #0              // a' = [a /= 0]
 589
 590 #elif defined(__aarch64__)
 591
 592         cmp     x0, #0                  // trivial
 593         cset.ne x1                      // csetm for mask
 594
 595         cmp     xzr, x0                 // set cf iff a == 0
 596         sbc     x2, x0, x0              // c' = a - a - 1 + cf = cf - 1
 597         neg     x2, x2                  // c' = 1 - cf
 598
 599         sub     x3, x0, x0, lsr #1      // if a < 2^63 then d' = ceil(a/2) <
 600                                         // 2^63
 601                                         // if a >= 2^63, write a = 2^63 + t
 602                                         // with t < 2^63; d' = 2^63 - 2^62 +
 603                                         // ceil(t/2) = 2^62 + ceil(t/2), and
 604                                         // ceil(t/2) < 2^62
 605                                         // anyway d' < 2^63 and d' = 0 iff
 606                                         // a = 0
 607         neg     x3, x3                  // d' top bit set iff a /= 0
 608         lsr     x3, x3, #63             // asr for mask
 609
 610         cmp     x0, #1                  // set cf iff a /= 0
 611         adc     x0, xzr, xzr            // a' = 0 + 0 + cf = cf
 612
 613 #else
 614         notimpl
 615 #endif
 616
 617         ret
 618
 619 endproc
 620
 621 proc    x03
 622
 623         // set a = min(a, d) (unsigned); clobber c, d
             //
             // the arm and aarch64 versions first compute the minimum into
             // r1/x1 with a conditional move, and then repeat the branch-free
             // arithmetic method, using r12/x16 as the scratch register in
             // place of c.
 624
 625 #if defined(__x86_64__)
 626
 627         sub     rdx, rax                // d' = d - a; set cf if a > d
 628         sbb     rcx, rcx                // c = -cf = -[a > d]
 629         and     rcx, rdx                // c = a > d ? d - a : 0
 630         add     rax, rcx                // a' = a > d ? d : a
 631
 632 #elif defined(__i386__)
 633
 634         sub     edx, eax                // d' = d - a; set cf if a > d
 635         sbb     ecx, ecx                // c = -cf = -[a > d]
 636         and     ecx, edx                // c = a > d ? d - a : 0
 637         add     eax, ecx                // a' = a > d ? d : a
 638
 639 #elif defined(__arm__)
 640
 641         cmp     r0, r3                  // the easy way
 642         movlo   r1, r0                  // only needed for out-of-place
 643         movhs   r1, r3
 644
 645         subs    r3, r3, r0              // d' = d - a; set cf if d >= a
 646         sbc     r12, r12, r12           // t = -1 + cf = -[a > d]
 647         and     r12, r12, r3            // t = a > d ? d - a : 0
 648         add     r0, r0, r12             // a' = a > d ? d : a
 649
 650 #elif defined(__aarch64__)
 651
 652         cmp     x0, x3                  // the easy way
 653         csel.lo x1, x0, x3
 654
 655         subs    x3, x3, x0              // d' = d - a; set cf if d >= a
 656         sbc     x16, xzr, xzr           // t = -1 + cf = -[a > d]
 657         and     x16, x16, x3            // t = a > d ? d - a : 0
 658         add     x0, x0, x16             // a' = a > d ? d : a
 659
 660 #else
 661         notimpl
 662 #endif
 663
 664         ret
 665
 666 endproc
 667
 668 proc    x04
 669
 670         // switch case?
             //
             // the final instruction of each variant flips bit 5 (0x20) of
             // a, which is the bit distinguishing upper- from lower-case
             // ascii letters.
             //
             // the `unrelated playing' which precedes it converts a, taken as
             // a character code, into a hex digit value: the result (in
             // rbx/ebx on x86, r1/x1 on arm) is 0--9 for `0'--`9', 10--15 for
             // `a'--`f' in either case, and -1 otherwise.
 671
 672 #if defined(__x86_64__)
 673
 674   // unrelated playing
 675   mov   ecx, eax
 676   mov   rbx, -1
 677   mov   edx, ecx
 678   sub   edx, '0'
 679   cmp   edx, 10
 680   cmovb rbx, rdx
 681   or    ecx, 0x20
 682   mov   edx, ecx
 683   sub   edx, 'a'
 684   sub   ecx, 'a' - 10
 685   cmp   edx, 6
 686   cmovb rbx, rcx
 687
 688         xor     al, 0x20
 689
 690 #elif defined(__i386__)
 691
 692   // unrelated playing
 693   mov   ecx, eax
 694   mov   ebx, -1
 695   mov   edx, ecx
 696   sub   edx, '0'
 697   cmp   edx, 10
 698   cmovb ebx, edx
 699   or    ecx, 0x20
 700   mov   edx, ecx
 701   sub   edx, 'a'
 702   sub   ecx, 'a' - 10
 703   cmp   edx, 6
 704   cmovb ebx, ecx
 705
 706         xor     al, 0x20
 707
 708 #elif defined(__arm__)
 709
 710   // unrelated playing
 711   mvn   r1, #0
 712   sub   r12, r0, #'0'
 713   cmp   r12, #10
 714   movlo r1, r12
 715   orr   r12, r0, #0x20
 716   sub   r12, r12, #'a'
 717   cmp   r12, #6
 718   addlo r1, r12, #10
 719
 720         eor     r0, r0, #0x20
 721
 722 #elif defined(__aarch64__)
 723
 724   // unrelated playing
       // (the cmp/ccmp pair tests 10 <= w16 < 16 with a single `lo' condition)
 725   mov   x1, #-1
 726   sub   w16, w0, #'0'
 727   cmp   w16, #10
 728   cmov.lo       x1, x16
 729   orr   w16, w0, #0x20
 730   sub   w16, w16, #'a' - 10
 731   cmp   w16, #10
 732   ccmp.hs       w16, #16, #CCMP_HS
 733   cmov.lo       x1, x16
 734
 735         eor     w0, w0, #0x20
 736
 737 #else
 738         notimpl
 739 #endif
 740
 741         ret
 742
 743 endproc
 744
 745 proc    x05
 746
 747         // answer whether 5 <= a </<= 9.
             //
             // the answer is left in the flags, to be read using an unsigned
             // condition: below/lo for the range 5 <= a < 9, or
             // below-or-equal/ls for 5 <= a <= 9.  on x86 and arm, a is
             // replaced by a - 5 as a side effect; the aarch64 version
             // leaves a alone.
 748
 749 #if defined(__x86_64__)
 750
 751         sub     rax, 5                  // a' = a - 5
 752         cmp     rax, 4                  // is a' = a - 5 </<= 4?
 753
 754         // cc           a'                      a
 755         //
 756         // z/e          a' = 4                  a = 9
 757         // nz/ne        a' /= 4                 a /= 9
 758         //
 759         // a/nbe        a' > 4                  a > 9 or a < 5
 760         // nc/ae/nb     a' >= 4                 a >= 9 or a < 5
 761         // c/b/nae      a' < 4                  5 <= a < 9
 762         // be/na        a' <= 4                 5 <= a <= 9
 763         //
 764         // o            a' < -2^63 + 4          -2^63 + 5 <= a < -2^63 + 9
 765         // no           a' >= -2^63 + 4         a >= -2^63 + 9 or
 766         //                                              a < -2^63 + 5
 767         // s            -2^63 + 4 <= a' < 4     -2^63 + 9 <= a < 9
 768         // ns           a' < -2^63 + 4 or       a < -2^63 + 9 or a >= 9
 769         //                      a' >= 4
 770         // ge/nl        a' >= 4                 a >= 9 or a < -2^63 + 5
 771         // l/nge        a' < 4                  -2^63 + 5 <= a < 9
 772         // g/nle        a' > 4                  a > 9 or a < -2^63 + 5
 773         // le/ng        a' <= 4                 -2^63 + 5 <= a <= 9
 774
 775 #elif defined(__i386__)
 776
 777         sub     eax, 5                  // a' = a - 5
 778         cmp     eax, 4                  // is a' </<= 4?
 779
 780 #elif defined(__arm__)
 781
 782         // i dimly remember having a slick way to do this way back in the
 783         // day, but i can't figure it out any more.
 784         sub     r0, #5                  // a' = a - 5
 785         cmp     r0, #4                  // is a' </<= 4?
 786
 787 #elif defined(__aarch64__)
 788
 789         // literal translation is too obvious
 790         cmp     x0, #5                  // is a >= 5?
 791         ccmp.hs x0, #9, #CCMP_HS        // if so, compare a with 9;
                                             // otherwise force `hs' (and `hi')
 792
 793 #else
 794         notimpl
 795 #endif
 796
 797         ret
 798
 799 endproc
 800
 801 proc    x06
 802
 803         // leave a unchanged, but set zf if a = 0, cf if a /= 0, clear of,
 804         // set sf to msb(a)
             //
             // (strictly, the final negation sets of rather than clearing it
             // in the one case where a is the most negative value, since
             // that value has no positive counterpart.)
 805
 806 #if defined(__x86_64__)
 807
 808         not     rax                     // a' = -a - 1
 809         inc     rax                     // a' = -a
 810         neg     rax                     // a' = a
 811
 812 #elif defined(__i386__)
 813
 814         not     eax                     // a' = -a - 1
 815         inc     eax                     // a' = -a
 816         neg     eax                     // a' = a
 817
 818 #elif defined(__arm__)
 819
 820         mvn     r0, r0                  // a' = -a - 1
 821         add     r0, r0, #1              // a' = -a
 822         rsbs    r0, r0, #0              // cf has opposite sense
 823
 824 #elif defined(__aarch64__)
 825
 826         mvn     x0, x0                  // a' = -a - 1
 827         add     x0, x0, #1              // a' = -a
 828         negs    x0, x0                  // cf has opposite sense
 829
 830 #else
 831         notimpl
 832 #endif
 833
 834         ret
 835
 836 endproc
 837
 838 proc    x07
 839
 840         // same as before (?)
             //
             // a is again left unchanged, and the flags are again those of a
             // final negation of -a.
 841
 842 #if defined(__x86_64__)
 843
 844         inc     rax                     // a' = a + 1
 845         neg     rax                     // a' = -a - 1
 846         inc     rax                     // a' = -a
 847         neg     rax                     // a' = a
 848
 849 #elif defined(__i386__)
 850
 851         inc     eax                     // a' = a + 1
 852         neg     eax                     // a' = -a - 1
 853         inc     eax                     // a' = -a
 854         neg     eax                     // a' = a
 855
 856 #elif defined(__arm__)
 857
 858         add     r0, r0, #1              // a' = a + 1
 859         rsb     r0, r0, #0              // a' = -a - 1
 860         add     r0, r0, #1              // a' = -a
 861         rsbs    r0, r0, #0              // cf has opposite sense
 862
 863 #elif defined(__aarch64__)
 864
 865         add     x0, x0, #1              // a' = a + 1
 866         neg     x0, x0                  // a' = -a - 1
 867         add     x0, x0, #1              // a' = -a
 868         negs    x0, x0                  // cf has opposite sense
 869
 870 #else
 871         notimpl
 872 #endif
 873
 874         ret
 875
 876 endproc
 877
 878 proc    x08
 879
 880         // floor((a + d)/2), correctly handling overflow conditions; final cf
 881         // is lsb(a + d), probably uninteresting
             //
             // the arm and aarch64 versions take a in r0/x0 and d in r3/x3,
             // and each shows two methods: one leaves its answer in r1/x1 and
             // the other overwrites a.
 882
 883 #if defined(__x86_64__)
 884
 885         add     rax, rdx                // cf || a' = a + d
 886         rcr     rax, 1                  // shift 65-bit result right by one
 887                                         // place; lsb moves into carry
 888
 889 #elif defined(__i386__)
 890
 891         add     eax, edx                // cf || a' = a + d
 892         rcr     eax, 1                  // shift 33-bit result right by one
                                             // place; lsb moves into carry
 893
 894 #elif defined(__arm__)
 895
 896         // like the two-instruction a64 version
 897         sub     r1, r3, r0              // compute difference
 898         add     r1, r0, r1, lsr #1      // add half of it (rounded down)
 899
 900         // the slick version, similar to the above
 901         adds    r0, r0, r3              // cf || a' = a + d
 902         mov     r0, r0, rrx             // shift right one place, bringing
                                             // cf in at the top
 903
 904 #elif defined(__aarch64__)
 905
 906         // a64 lacks a32's rrx.  literal translation.
 907         adds    x1, x0, x3              // cf || a' = a + d
 908         adc     x16, xzr, xzr           // realize cf in extra register
 909         extr    x1, x16, x1, #1         // shift down one place
 910
 911         // two instruction version: clobbers additional register.  (if you
 912         // wanted the answer in any other register, even overwriting d, then
 913         // this is unnecessary.)  also depends on d >= a.
 914         sub     x16, x3, x0             // compute difference
 915         add     x0, x0, x16, lsr #1     // add half of it (rounded down)
 916
 917 #else
 918         notimpl
 919 #endif
 920
 921         ret
 922
 923 endproc
 924
 925 proc    x09
 926
 927         // a = a/8, rounded to nearest; i.e., floor(a/8) if a == 0, 1, 2, 3
 928         // (mod 8), or ceil(a/8) if a == 4, 5, 6, 7 (mod 8).
             //
             // a is treated as unsigned.  the rounding decision depends only
             // on bit 2 of a, which is the last bit shifted out.
 929
 930 #if defined(__x86_64__)
 931
 932         shr     rax, 3                  // a' = floor(a/8); cf = 1 if a ==
 933                                         // 4, 5, 6, 7 (mod 8)
 934         adc     rax, 0                  // a' = floor(a/8) + cf
 935
 936 #elif defined(__i386__)
 937
 938         shr     eax, 3                  // a' = floor(a/8); cf = bit 2 of a
 939         adc     eax, 0                  // a' = floor(a/8) + cf
 940
 941 #elif defined(__arm__)
 942
 943         movs    r0, r0, lsr #3          // a' = floor(a/8); cf = bit 2 of a
 944         adc     r0, r0, #0              // a' = floor(a/8) + cf
 945
 946 #elif defined(__aarch64__)
 947
 948         tst     x0, #4                  // ne iff bit 2 of a is set
 949         orr     x0, xzr, x0, lsr #3     // a' = floor(a/8); flags unchanged
 950         cinc.ne x0, x0                  // round up if bit 2 was set
 951
 952 #else
 953         notimpl
 954 #endif
 955
 956         ret
 957
 958 endproc
 959
 960 proc    x0a
 961
 962         // increment c-byte little-endian bignum at rdi
             //
             // the x86 versions increment the first byte and then propagate
             // the carry through c further bytes, so they touch c + 1 bytes in
             // all.  the arm and aarch64 versions take the pointer in r5/x5
             // and the count in r2/x2, process exactly c bytes, and keep the
             // running carry in bit 8 of a scratch register.
 963
 964 #if defined(__x86_64__)
 965
 966         add     byte ptr [rdi], 1       // increment least significant byte
 967 0:      inc     rdi                     // next byte; inc leaves cf alone
 968         adc     byte ptr [rdi], 0       // propagate the carry
 969         loop    0b                      // iterate c times; flags untouched
 970
 971 #elif defined(__i386__)
 972
 973         add     byte ptr [edi], 1       // increment least significant byte
 974 0:      inc     edi                     // next byte; inc leaves cf alone
 975         adc     byte ptr [edi], 0       // propagate the carry
 976         loop    0b                      // iterate c times; flags untouched
 977
 978 #elif defined(__arm__)
 979
 980         mov     r12, #256               // set initial carry
 981 0:      ldrb    r0, [r5]                // fetch next byte
 982         subs    r2, r2, #1              // count down; flags used by bne
 983         add     r12, r0, r12, lsr #8    // add carry out of previous byte
 984         strb    r12, [r5], #1           // store low byte and advance
 985         bne     0b
 986
 987 #elif defined(__aarch64__)
 988
 989         mov     w17, #256               // set initial carry
 990 0:      ldrb    w16, [x5]               // fetch next byte
 991         sub     x2, x2, #1              // count down
 992         add     w17, w16, w17, lsr #8   // add carry out of previous byte
 993         strb    w17, [x5], #1           // store low byte and advance
 994         cbnz    x2, 0b
 995
 996 #else
 997         notimpl
 998 #endif
 999
1000         ret
1001
1002 endproc
1003
1004 proc    x0b
1005
1006         // negate double-precision d:a
             //
             // a is the low half and d (r3/x3 on arm) is the high half.  the
             // borrow out of negating the low half is fed into the high
             // half.
1007
1008 #if defined(__x86_64__)
1009
1010         not     rdx                     // d' = -d - 1
1011         neg     rax                     // a' = -a;
1012                                         // cf = 1 iff a /= 0
1013         sbb     rdx, -1                 // d' = -d - cf
1014
1015 #elif defined(__i386__)
1016
1017         not     edx                     // d' = -d - 1
1018         neg     eax                     // a' = -a; cf = 1 iff a /= 0
1019         sbb     edx, -1                 // d' = -d - cf
1020
1021 #elif defined(__arm__)
1022
1023         // reverse subtract is awesome
1024         rsbs    r0, r0, #0              // a' = 0 - a, setting carry
1025         rsc     r3, r3, #0              // d' = 0 - d with carry
1026
1027 #elif defined(__aarch64__)
1028
1029         // easy way: everything is better with zero registers.
1030         negs    x0, x0                  // a' = 0 - a, setting carry
1031         ngc     x3, x3                  // d' = 0 - d with carry
1032
1033 #else
1034         notimpl
1035 #endif
1036
1037         ret
1038
1039 endproc
1040
1041 proc    x0c
1042
1043         // rotate is distributive over xor.
             //
             // compute ror(a XOR b, 13) and ror(a, 13) XOR ror(b, 13) and
             // compare them: the final comparison always reports equality.
             // in the comments, a_0 and b_0 stand for the low 13 bits of a
             // and b, and a_1 and b_1 for the remaining high bits.  the
             // inputs a and b are overwritten; on arm and aarch64 they are
             // taken from r0/x0 and r1/x1.
1044
1045 #if defined(__x86_64__)
1046
1047         // rax                          // = a_1 || a_0
1048         // rbx                          // = b_1 || b_0
1049         mov     rcx, rax                // = a_1 || a_0
1050
1051         xor     rcx, rbx                // = (a_1 XOR b_1) || (a_0 XOR b_0)
1052         ror     rcx, 0xd                // = (a_0 XOR b_0) || (a_1 XOR b_1)
1053
1054         ror     rax, 0xd                // = a_0 || a_1
1055         ror     rbx, 0xd                // = b_0 || b_1
1056         xor     rax, rbx                // = (a_0 XOR b_0) || (a_1 XOR b_1)
1057
1058         cmp     rax, rcx                // always equal
1059
1060 #elif defined(__i386__)
1061
1062         mov     ecx, eax                // = a_1 || a_0
1063
1064         xor     ecx, ebx                // = (a_1 XOR b_1) || (a_0 XOR b_0)
1065         ror     ecx, 0xd                // = (a_0 XOR b_0) || (a_1 XOR b_1)
1066
1067         ror     eax, 0xd                // = a_0 || a_1
1068         ror     ebx, 0xd                // = b_0 || b_1
1069         xor     eax, ebx                // = (a_0 XOR b_0) || (a_1 XOR b_1)
1070
1071         cmp     eax, ecx                // always equal
1072
1073 #elif defined(__arm__)
1074
1075
1076         // r0                           // = a_1 || a_0
1077         // r1                           // = b_1 || b_0
1078         eor     r2, r0, r1              // = (a_1 XOR b_1) || (a_0 XOR b_0)
1079         mov     r2, r2, ror #13         // = (a_0 XOR b_0) || (a_1 XOR b_1)
1080
1081         mov     r1, r1, ror #13         // = b_0 || b_1
1082         eor     r0, r1, r0, ror #13     // = (a_0 XOR b_0) || (a_1 XOR b_1)
1083
1084         cmp     r0, r2                  // always equal
1085
1086 #elif defined(__aarch64__)
1087
1088         // x0                           // = a_1 || a_0
1089         // x1                           // = b_1 || b_0
1090         eor     x2, x0, x1              // = (a_1 XOR b_1) || (a_0 XOR b_0)
1091         ror     x2, x2, #13             // = (a_0 XOR b_0) || (a_1 XOR b_1)
1092
1093         ror     x1, x1, #13             // = b_0 || b_1
1094         eor     x0, x1, x0, ror #13     // = (a_0 XOR b_0) || (a_1 XOR b_1)
1095
1096         cmp     x0, x2                  // always equal
1097
1098 #else
1099         notimpl
1100 #endif
1101
1102         ret
1103
1104 endproc
1105
1106 proc    x0d
1107
1108         // and is distributive over xor.
             //
             // a AND (b XOR c) is computed directly (left in b; r1/x1 on arm)
             // and again as (a AND b) XOR (a AND c) (left in a; r0/x0 on arm);
             // the final compare therefore always sees equal operands.  the
             // x86 variants use d as scratch, the arm variants r3/x3.
1109
1110 #if defined(__x86_64__)
1111
1112         mov     rdx, rbx                // = b
1113
1114         xor     rbx, rcx                // = b XOR c
1115         and     rbx, rax                // = a AND (b XOR c)
1116
1117         and     rdx, rax                // = a AND b
1118         and     rax, rcx                // = a AND c
1119         xor     rax, rdx                // = (a AND b) XOR (a AND c)
1120                                         // = a AND (b XOR c)
1121
1122         cmp     rax, rbx                // always equal
1123
1124 #elif defined(__i386__)
1125
1126         mov     edx, ebx                // = b
1127
1128         xor     ebx, ecx                // = b XOR c
1129         and     ebx, eax                // = a AND (b XOR c)
1130
1131         and     edx, eax                // = a AND b
1132         and     eax, ecx                // = a AND c
1133         xor     eax, edx                // = (a AND b) XOR (a AND c)
1134                                         // = a AND (b XOR c)
1135
1136         cmp     eax, ebx                // always equal
1137
1138 #elif defined(__arm__)
1139
             // three-operand instructions: a AND b goes straight into r3, so
             // no preliminary copy of b is needed.
1140         and     r3, r0, r1              // = a AND b
1141
1142         eor     r1, r1, r2              // = b XOR c
1143         and     r1, r1, r0              // = a AND (b XOR c)
1144
1145         and     r0, r0, r2              // = a AND c
1146         eor     r0, r0, r3              // = (a AND b) XOR (a AND c)
1147                                         // = a AND (b XOR c)
1148
1149         cmp     r0, r1                  // always equal
1150
1151 #elif defined(__aarch64__)
1152
             // same shape as the arm variant, on 64-bit registers.
1153         and     x3, x0, x1              // = a AND b
1154
1155         eor     x1, x1, x2              // = b XOR c
1156         and     x1, x1, x0              // = a AND (b XOR c)
1157
1158         and     x0, x0, x2              // = a AND c
1159         eor     x0, x0, x3              // = (a AND b) XOR (a AND c)
1160                                         // = a AND (b XOR c)
1161
1162         cmp     x0, x1                  // always equal
1163
1164 #else
1165         notimpl
1166 #endif
1167
1168         ret
1169
1170 endproc
1171
1172 proc    x0e
1173
1174         // de morgan's law
             //
             // NOT (a AND b) is computed directly (left in c; r2/x2 on arm) and
             // again as (NOT a) OR (NOT b) (left in a; r0/x0 on arm), and the
             // two are compared.  the x86 and arm variants also leave NOT b
             // in b/r1; the aarch64 variant leaves x1 alone.
1175
1176 #if defined(__x86_64__)
1177
1178         mov     rcx, rax                // = a
1179
1180         and     rcx, rbx                // = a AND b
1181         not     rcx                     // = NOT (a AND b)
1182
1183         not     rax                     // = NOT a
1184         not     rbx                     // = NOT b
1185         or      rax, rbx                // = (NOT a) OR (NOT b)
1186                                         // = NOT (a AND b)
1187
1188         cmp     rax, rcx                // always equal
1189
1190 #elif defined(__i386__)
1191
1192         mov     ecx, eax                // = a
1193
1194         and     ecx, ebx                // = a AND b
1195         not     ecx                     // = NOT (a AND b)
1196
1197         not     eax                     // = NOT a
1198         not     ebx                     // = NOT b
1199         or      eax, ebx                // = (NOT a) OR (NOT b)
1200                                         // = NOT (a AND b)
1201
1202         cmp     eax, ecx                // always equal
1203
1204 #elif defined(__arm__)
1205
1206         and     r2, r0, r1              // = a AND b
1207         mvn     r2, r2                  // = NOT (a AND b)
1208
1209         mvn     r0, r0                  // = NOT a
1210         mvn     r1, r1                  // = NOT b
1211         orr     r0, r0, r1              // = (NOT a) OR (NOT b)
                                             // = NOT (a AND b)
1212
1213         cmp     r0, r2                  // always equal
1214
1215 #elif defined(__aarch64__)
1216
1217         and     x2, x0, x1              // = a AND b
1218         mvn     x2, x2                  // = NOT (a AND b)
1219
             // `orn' complements its second source operand itself, so b needs
             // no separate `mvn'.
1220         mvn     x0, x0                  // = NOT a
1221         orn     x0, x0, x1              // = (NOT a) OR (NOT b)
                                             // = NOT (a AND b)
1222
1223         cmp     x0, x2                  // always equal
1224
1225 #else
1226         notimpl
1227 #endif
1228
1229         ret
1230
1231 endproc
1232
1233 proc    x0f
1234
1235         // replace input buffer bytes with cumulative XORs with initial a;
1236         // final a is XOR of all buffer bytes and initial a.
1237         //
1238         // not sure why you'd do this.
             //
             // the buffer pointer is si (r4/x4 on arm) and the byte count is c
             // (r2/x2 on arm).  every variant tests the count only after
             // processing a byte, so a zero count on entry is not treated as
             // an empty buffer.
1239
1240 #if defined(__x86_64__)
1241
1242 0:      xor     [rsi], al               // *si = *si XOR a
1243         lodsb                           // a = *si; step si (forwards if df
                                             // is clear -- assumed here)
1244         loop    0b                      // c -= 1; again while c /= 0
1245
1246 #elif defined(__i386__)
1247
1248 0:      xor     [esi], al               // *si = *si XOR a
1249         lodsb                           // a = *si; step si (forwards if df
                                             // is clear -- assumed here)
1250         loop    0b                      // c -= 1; again while c /= 0
1251
1252 #elif defined(__arm__)
1253
1254 0:      ldrb    r12, [r4]               // t = *p
1255         subs    r2, r2, #1              // count down; sets zf for `bne'
1256         eor     r0, r0, r12             // a' = a XOR t
1257         strb    r0, [r4], #1            // *p++ = low byte of a'
1258         bne     0b                      // again while count /= 0
1259
1260 #elif defined(__aarch64__)
1261
1262 0:      ldrb    w16, [x4]               // t = *p
1263         sub     x2, x2, #1              // count down
1264         eor     w0, w0, w16             // a' = a XOR t
1265         strb    w0, [x4], #1            // *p++ = low byte of a'
1266         cbnz    x2, 0b                  // again while count /= 0
1267
1268 #else
1269         notimpl
1270 #endif
1271
1272         ret
1273
1274 endproc
1275
1276 ///--------------------------------------------------------------------------
1277 /// 0x10--0x1f
1278
1279 proc    x10
1280
1281         // four different ways to swap a pair of registers.
             //
             // the pair is a and c (r0 and r2 on arm, x0 and x2 on aarch64).
             // each variant performs four complete swaps back to back, so the
             // two registers end up holding their original values.
1282
1283 #if defined(__x86_64__)
1284
             // 1: through the stack.
1285         push    rax
1286         push    rcx
1287         pop     rax                     // a' = c
1288         pop     rcx                     // c' = a
1289
             // 2: the xor trick.
1290         xor     rax, rcx                // a' = a XOR c
1291         xor     rcx, rax                // c' = c XOR a XOR c = a
1292         xor     rax, rcx                // a' = a XOR c XOR a = c
1293
             // 3: the same idea with add and subtract.
1294         add     rax, rcx                // a' = a + c
1295         sub     rcx, rax                // c' = c - (a + c) = -a
1296         add     rax, rcx                // a' = a + c - a = c
1297         neg     rcx                     // c' = a
1298
             // 4: the instruction made for the job.
1299         xchg    rax, rcx
1300
1301 #elif defined(__i386__)
1302
             // 1: through the stack.
1303         push    eax
1304         push    ecx
1305         pop     eax                     // a' = c
1306         pop     ecx                     // c' = a
1307
             // 2: the xor trick.
1308         xor     eax, ecx                // a' = a XOR c
1309         xor     ecx, eax                // c' = c XOR a XOR c = a
1310         xor     eax, ecx                // a' = a XOR c XOR a = c
1311
             // 3: the same idea with add and subtract.
1312         add     eax, ecx                // a' = a + c
1313         sub     ecx, eax                // c' = c - (a + c) = -a
1314         add     eax, ecx                // a' = a + c - a = c
1315         neg     ecx                     // c' = a
1316
             // 4: the instruction made for the job.
1317         xchg    eax, ecx
1318
1319 #elif defined(__arm__)
1320
             // 1: through the stack.  `stmfd' puts r0 at [sp] and r2 at
             // [sp + 4]; load them back crosswise and pop both words.
1321         stmfd   r13!, {r0, r2}
1322         ldr     r0, [r13, #4]           // a' = c
1323         ldr     r2, [r13], #8           // c' = a; sp += 8
1324
             // 2: the xor trick.
1325         eor     r0, r0, r2              // a' = a XOR c
1326         eor     r2, r2, r0              // c' = c XOR a XOR c = a
1327         eor     r0, r0, r2              // a' = a XOR c XOR a = c
1328
             // 3: subtract/add/reverse-subtract: a' = a - c, c' = c + a' = a,
             // and finally a' = c' - a' = c.
1329         sub     r0, r0, r2
1330         add     r2, r2, r0
1331         rsb     r0, r0, r2              // don't need 3-addr with reverse-sub
1332
             // 4: via a scratch register.  the last move must read the saved
             // copy in r12: by then r0 has already been overwritten with c.
1333         mov     r12, r0                 // t = a
1334         mov     r0, r2                  // a' = c
1335         mov     r2, r12                 // c' = t = a
1336
1337 #elif defined(__aarch64__)
1338
1339         // anything you can do
1340         stp     x0, x2, [sp, #-16]!
1341         ldp     x2, x0, [sp], #16
1342
1343         eor     x0, x0, x2
1344         eor     x2, x2, x0
1345         eor     x0, x0, x2
1346
1347         // the add/sub/add thing was daft.  you can do it in three if you're
1348         // clever -- and have three-address operations.
1349         sub     x0, x0, x2
1350         add     x2, x2, x0
1351         sub     x0, x2, x0
1352
1353         // but we lack a fourth.  we can't do this in fewer than three
1354         // instructions without hitting memory.  only `ldp' will modify two
1355         // registers at a time, so we need at least two instructions -- but
1356         // if the first one sets one of our two registers to its final value
1357         // then we lose the other input value with no way to recover it, so
1358         // we must either write a fresh third register, or write something
1359         // other than the final value, and in both cases we need a third
1360         // instruction to fix everything up.  we've done the wrong-something-
1361         // other trick twice, so here's the captain-obvious use-a-third-
1362         // register version.
1363         mov     x16, x0
1364         mov     x0, x2
1365         mov     x2, x16
1366
1367 #else
1368         notimpl
1369 #endif
1370
1371         ret
1372
1373 endproc
1374
1375 proc    x11
1376
1377         // assuming a is initialized to zero, set a to the inclusive or of
1378         // the xor-differences of corresponding bytes in the c-byte strings
1379         // at si and di.
1380         //
1381         // in particular, a will be zero (and zf set) if and only if the two
1382         // strings are equal.
             //
             // every byte is examined regardless of where the first difference
             // occurs: there is no early exit from the loop.  on arm the string
             // pointers are r4/r5 (x4/x5) and the count is r2 (x2).  the count
             // is only tested after a byte pair has been processed.
1383
1384 #if defined(__x86_64__)
1385
1386 0:      mov     dl, [rsi]               // d = *si
1387         xor     dl, [rdi]               // d = *si XOR *di
1388         inc     rsi
1389         inc     rdi
1390         or      al, dl                  // accumulate any difference bits
1391         loop    0b                      // c -= 1; again while c /= 0
1392
1393 #elif defined(__i386__)
1394
1395 0:      mov     dl, [esi]               // d = *si
1396         xor     dl, [edi]               // d = *si XOR *di
1397         inc     esi
1398         inc     edi
1399         or      al, dl                  // accumulate any difference bits
1400         loop    0b                      // c -= 1; again while c /= 0
1401
1402 #elif defined(__arm__)
1403
1404 0:      ldrb    r1, [r4], #1            // fetch next byte of each string
1405         ldrb    r12, [r5], #1
1406         subs    r2, r2, #1              // count down; sets zf for `bne'
1407         eor     r12, r12, r1            // difference of the two bytes
1408         orr     r0, r0, r12             // accumulate any difference bits
1409         bne     0b                      // again while count /= 0
1410
1411 #elif defined(__aarch64__)
1412
1413 0:      ldrb    w16, [x4], #1           // fetch next byte of each string
1414         ldrb    w17, [x5], #1
1415         sub     x2, x2, #1              // count down
1416         eor     w16, w16, w17           // difference of the two bytes
1417         orr     w0, w0, w16             // accumulate any difference bits
1418         cbnz    x2, 0b                  // again while count /= 0
1419
1420 #else
1421         notimpl
1422 #endif
1423
1424         ret
1425
1426 endproc
1427
1428 proc    x12
1429
1430         // an obtuse way of adding two registers.  for any bit position, a
1431         // OR d is set if and only if at least one of a and d has a bit set
1432         // in that position, and a AND d is set if and only if both have a
1433         // bit set in that position.  essentially, then, what we've done is
1434         // move all of the set bits in d to a, unless there's already a bit
1435         // there.  this clearly doesn't change the sum.
             //
             // that is: a + d = (a OR d) + (a AND d).  the sum is left in a
             // (r0/x0 on arm).
1436
1437 #if defined(__x86_64__)
1438
1439         mov     rcx, rdx                // c' = d
1440         and     rdx, rax                // d' = a AND d
1441         or      rax, rcx                // a' = a OR d
1442         add     rax, rdx                // a' = (a OR d) + (a AND d) = a + d
1443
1444 #elif defined(__i386__)
1445
1446         mov     ecx, edx                // c' = d
1447         and     edx, eax                // d' = a AND d
1448         or      eax, ecx                // a' = a OR d
1449         add     eax, edx                // a' = (a OR d) + (a AND d) = a + d
1450
1451 #elif defined(__arm__)
1452
1453         and     r2, r0, r3              // c' = a AND d
1454         orr     r0, r0, r3              // a' = a OR d
1455         add     r0, r0, r2              // a' = (a OR d) + (a AND d) = a + d
1456
1457 #elif defined(__aarch64__)
1458
1459         and     x2, x0, x3              // c' = a AND d
1460         orr     x0, x0, x3              // a' = a OR d
1461         add     x0, x0, x2              // a' = (a OR d) + (a AND d) = a + d
1462
1463 #else
1464         notimpl
1465 #endif
1466
1467         ret
1468
1469 endproc
1470
1471 proc    x13
1472
1473         // ok, so this is a really obtuse way of adding a and b; the result
1474         // is in a and d.  but why does it work?
             //
             // each pass replaces the pair (a, b) by (a XOR b, (a AND b) << 1),
             // which leaves a + b unchanged (mod the word size): the xor is
             // the sum without carries, and the shifted and is the carries.
             // every pass gives b at least one more zero bit at the bottom, so
             // b becomes zero after at most one pass per bit in the register,
             // whereupon a holds the sum and further passes change nothing.
             // (the arm variants keep the carries in r3/x3 rather than copying
             // a into d.)
1475
1476 #if defined(__x86_64__)
1477
1478         mov     rcx, 0x40               // carry chains at most 64 long
1479 0:      mov     rdx, rax                // copy a'
1480         xor     rax, rbx                // low bits of each bitwise sum
1481         and     rbx, rdx                // carry bits from each bitwise sum
1482         shl     rbx, 1                  // carry them into next position
1483         loop    0b                      // c -= 1; again while c /= 0
1484
1485 #elif defined(__i386__)
1486
1487         mov     ecx, 0x40               // 64 passes: more than the 32 needed
                                             // for 32-bit registers, but harmless
1488 0:      mov     edx, eax                // copy a'
1489         xor     eax, ebx                // low bits of each bitwise sum
1490         and     ebx, edx                // carry bits from each bitwise sum
1491         shl     ebx, 1                  // carry them into next position
1492         loop    0b                      // c -= 1; again while c /= 0
1493
1494 #elif defined(__arm__)
1495
1496         mov     r2, #0x40               // pass count, as for x86
1497 0:      and     r3, r0, r1              // carry bits from each bitwise sum
1498         subs    r2, r2, #1              // count down; sets zf for `bne'
1499         eor     r0, r0, r1              // low bits of each bitwise sum
1500         lsl     r1, r3, #1              // carry them into next position
1501         bne     0b                      // again while count /= 0
1502
1503 #elif defined(__aarch64__)
1504
1505         mov     x2, #0x40               // carry chains at most 64 long
1506 0:      and     x3, x0, x1              // carry bits from each bitwise sum
1507         sub     x2, x2, #1              // count down
1508         eor     x0, x0, x1              // low bits of each bitwise sum
1509         lsl     x1, x3, #1              // carry them into next position
1510         cbnz    x2, 0b                  // again while count /= 0
1511
1512 #else
1513         notimpl
1514 #endif
1515
1516         ret
1517
1518 endproc
1519
1520 proc    x14
1521
1522         // floor((a + d)/2), like x08.
             //
             // uses a + d = 2 (a AND d) + (a XOR d): halving the xor term with
             // a logical shift and adding the and term gives the average
             // without ever forming the full-width sum, so nothing is lost to
             // overflow.  the result is left in a (r0/x0 on arm).
1523
1524 #if defined(__x86_64__)
1525
1526         mov     rcx, rax                // copy a for later
1527         and     rcx, rdx                // carry bits
1528
1529         xor     rax, rdx                // low bits of each bitwise sum
1530         shr     rax, 1                  // divide by 2; carries now in place
1531
1532         add     rax, rcx                // add the carries; done
1533
1534 #elif defined(__i386__)
1535
1536         mov     ecx, eax                // copy a for later
1537         and     ecx, edx                // carry bits
1538
1539         xor     eax, edx                // low bits of each bitwise sum
1540         shr     eax, 1                  // divide by 2; carries now in place
1541
1542         add     eax, ecx                // add the carries; done
1543
1544 #elif defined(__arm__)
1545
1546         and     r2, r0, r3              // carry bits
1547         eor     r0, r0, r3              // low bits of each bitwise sum
1548         add     r0, r2, r0, lsr #1      // halve the low bits and add the
                                             // carries in one instruction
1549
1550 #elif defined(__aarch64__)
1551
1552         and     x2, x0, x3              // carry bits
1553         eor     x0, x0, x3              // low bits of each bitwise sum
1554         add     x0, x2, x0, lsr #1      // halve the low bits and add the
                                             // carries in one instruction
1555
1556 #else
1557         notimpl
1558 #endif
1559
1560         ret
1561
1562 endproc
1563
1564 proc    x15
1565
1566         // sign extension 32 -> 64 bits.
             //
             // (the 32-bit variants, i386 and arm, extend 16 -> 32 bits
             // instead.)  each variant first extends into b (r1/x1 on arm)
             // with the dedicated instruction, then does it by hand in a
             // (r0/x0): add a constant whose bits from the source sign bit
             // upwards are all set, then xor with the same constant.  the
             // by-hand method only gives a sign extension if the bits of a
             // above the source width are clear on entry.
1567
1568 #if defined(__x86_64__)
1569
1570         movsx   rbx, eax                // like this?
1571
1572         mov     rdx, 0xffffffff80000000
1573         add     rax, rdx                // if bit 31 of a is set then bits
1574                                         // 31--63 of a' are clear; otherwise,
1575                                         // these bits are all set -- which is
1576                                         // exactly backwards
1577         xor     rax, rdx                // so fix it
1578
1579 #elif defined(__i386__)
1580
1581         movsx   ebx, ax                 // like this?
1582
1583         mov     edx, 0xffff8000
1584         add     eax, edx                // if bit 15 of a is set then bits
1585                                         // 15--31 of a' are clear; otherwise,
1586                                         // these bits are all set -- which is
1587                                         // exactly backwards
1588         xor     eax, edx                // so fix it
1589
1590 #elif defined(__arm__)
1591
1592         sxth    r1, r0                  // like this
1593
             // 0x80000000 shifted right arithmetically by 16 places is
             // 0xffff8000, the same constant as the i386 variant uses.
1594         mov     r12, #0x80000000
1595         add     r0, r0, r12, asr #16    // bits 15--31 now backwards
1596         eor     r0, r0, r12, asr #16    // so fix them
1597
1598 #elif defined(__aarch64__)
1599
1600         sxtw    x1, w0                  // like this
1601
1602         mov     x16, #0xffffffff80000000
1603         add     x0, x0, x16             // bits 31--63 now backwards
1604         eor     x0, x0, x16             // so fix them
1605
1606 #else
1607         notimpl
1608 #endif
1609
1610         ret
1611
1612 endproc
1613
1614 proc    x16
1615
1616         // ??? i don't know why you'd want to calculate this.
             //
             // what it does: with t = (a XOR b) + (b XOR c), a is set to zero
             // if that addition carried out of the register and to a XOR c
             // otherwise, and is then compared with t.  b is left holding
             // b XOR c, and t is left in si (r4/x4 on arm).
1617
1618 #if defined(__x86_64__)
1619
1620         xor     rax, rbx                // a' = a XOR b
1621         xor     rbx, rcx                // b' = b XOR c
1622         mov     rsi, rax                // t = a XOR b
1623         add     rsi, rbx                // t = (a XOR b) + (b XOR c)
1624         cmovc   rax, rbx                // a' = cf ? b XOR c : a XOR b
1625         xor     rax, rbx                // a' = cf ? 0 : a XOR c
1626         cmp     rax, rsi                // flags from a' - t
1627
1628 #elif defined(__i386__)
1629
1630         xor     eax, ebx                // a' = a XOR b
1631         xor     ebx, ecx                // b' = b XOR c
1632         mov     esi, eax                // t = a XOR b
1633         add     esi, ebx                // t = (a XOR b) + (b XOR c)
1634         cmovc   eax, ebx                // a' = cf ? b XOR c : a XOR b
1635         xor     eax, ebx                // a' = cf ? 0 : a XOR c
1636         cmp     eax, esi                // flags from a' - t
1637
1638 #elif defined(__arm__)
1639
1640         eor     r0, r0, r1              // a' = a XOR b
1641         eor     r1, r1, r2              // b' = b XOR c
1642         adds    r4, r0, r1              // t = (a XOR b) + (b XOR c)
1643         movcs   r0, r1                  // a' = cf ? b XOR c : a XOR b
1644         eor     r0, r0, r1              // a' = cf ? 0 : a XOR c
1645         cmp     r0, r4                  // flags from a' - t
1646
1647 #elif defined(__aarch64__)
1648
1649         eor     x0, x0, x1              // a' = a XOR b
1650         eor     x1, x1, x2              // b' = b XOR c
1651         adds    x4, x0, x1              // t = (a XOR b) + (b XOR c)
1652         cmov.cs x0, x1                  // a' = cf ? b XOR c : a XOR b
1653         eor     x0, x0, x1              // a' = cf ? 0 : a XOR c
1654         cmp     x0, x4                  // flags from a' - t
1655
1656 #else
1657         notimpl
1658 #endif
1659
1660         ret
1661
1662 endproc
1663
1664 proc    x17
1665
1666         // absolute value
             //
             // branch-free: build a mask m which is -1 if a is negative and 0
             // otherwise; then (a XOR m) - m negates a exactly when a is
             // negative.  the result is left in a (r0/x0 on arm).  the most
             // negative value has no positive counterpart and comes back
             // unchanged.  the arm variants additionally compute the same
             // thing the obvious way into r1/x1.
1667
1668 #if defined(__x86_64__)
1669
1670         cqo                             // d = a < 0 ? -1 : 0
1671         xor     rax, rdx                // a' = a < 0 ? -a - 1 : a
1672         sub     rax, rdx                // a' = a < 0 ? -a : a
1673
1674 #elif defined(__i386__)
1675
1676         cdq                             // d = a < 0 ? -1 : 0
1677         xor     eax, edx                // a' = a < 0 ? -a - 1 : a
1678         sub     eax, edx                // a' = a < 0 ? -a : a
1679
1680 #elif defined(__arm__)
1681
1682         // direct approach
1683         movs    r1, r0                  // copy a, setting nf from its sign
1684         rsbmi   r1, r0, #0              // if negative, r1 = 0 - a
1685
1686         // faithful-ish conversion
             // `a asr 31' is the mask: -1 if a < 0, otherwise 0.
1687         eor     r3, r0, r0, asr #31     // t = a < 0 ? -a - 1 : a
1688         sub     r0, r3, r0, asr #31     // a' = a < 0 ? -a : a
1689
1690 #elif defined(__aarch64__)
1691
1692         // direct approach
1693         tst     x0, #1 << 63            // `ne' iff the sign bit is set
1694         cneg.ne x1, x0                  // x1 = a < 0 ? -a : a
1695
1696         // faithful-ish conversion
             // `a asr 63' is the mask: -1 if a < 0, otherwise 0.
1697         eor     x3, x0, x0, asr #63     // t = a < 0 ? -a - 1 : a
1698         sub     x0, x3, x0, asr #63     // a' = a < 0 ? -a : a
1699
1700 #else
1701         notimpl
1702 #endif
1703
1704         ret
1705
1706 endproc
1707
1708 proc    x18
1709
1710         // should always set sf, clear zf, unless we get rescheduled to a
1711         // different core.
             //
             // reads the cycle counter twice and subtracts the second reading
             // from the first (the 64-bit variants with a compare which keeps
             // only the flags, the 32-bit ones with a two-word subtraction):
             // the earlier reading should be the smaller, so the difference
             // should be negative.
1712
1713 #if defined(__x86_64__)
1714
1715         rdtsc                           // d || a = cycles
1716         shl     rdx, 0x20
1717         or      rax, rdx                // a = cycles
1718         mov     rcx, rax                // c = cycles
1719
1720         rdtsc                           // d || a = cycles'
1721         shl     rdx, 0x20
1722         or      rax, rdx                // a = cycles'
1723
1724         cmp     rcx, rax                // flags from cycles - cycles'
1725
1726 #elif defined(__i386__)
1727
1728         rdtsc                           // d || a = cycles
1729         mov     ebx, eax
1730         mov     ecx, edx                // c || b = cycles
1731
1732         rdtsc                           // d || a = cycles'
1733
             // 64-bit subtraction in two halves; the flags on exit are those
             // of the high-half `sbb'.
1734         sub     ebx, eax
1735         sbb     ecx, edx                // c || b = cycles - cycles'
1736
1737 #elif defined(__arm__)
1738
1739         // cycle clock not available in user mode
1740         mrrc    p15, 0, r0, r1, c9      // r1 || r0 = first reading
1741         mrrc    p15, 0, r2, r3, c9      // r3 || r2 = second reading
1742         subs    r0, r0, r2              // subtract low words ...
1743         sbcs    r1, r1, r3              // ... then high words with borrow
1744
1745 #elif defined(__aarch64__)
1746
1747         // cycle clock not available in user mode
1748         mrs     x0, pmccntr_el0         // first reading
1749         mrs     x1, pmccntr_el0         // second reading
1750         cmp     x0, x1                  // flags from first - second
1751
1752 #else
1753         notimpl
1754 #endif
1755
1756         ret
1757
1758 endproc
1759
1760 proc    x19
1761
1762         // stupid way to capture a pointer to inline data and jump past it.
1763         // confuses the return-address predictor something chronic.  worse
1764         // because amd64 calling convention doesn't usually pass arguments on
1765         // the stack.
             //
             // the string sits in the instruction stream immediately after a
             // call (or `bl') which jumps over it, so the `return address'
             // saved by that call is really the address of the string.
             // `print_str' measures the string up to its zero terminator and
             // writes it to file descriptor 1 with a raw `write' system call.
             // each variant returns from within its own #if branch.
1766
1767 #if defined(__x86_64__)
1768
1769         call    8f                      // pushes the string's address
1770         .string "hello world!\n\0"
1771 8:      call    print_str
1772         add     rsp, 8                  // discard the string pointer
1773         ret
1774
1775 print_str:
1776         // actually implement this ridiculous thing
1777         mov     rsi, [rsp + 8]          // string pointer, from above our
                                             // own return address
1778         xor     edx, edx                // length counter
1779 0:      mov     al, [rsi + rdx]         // scan for the terminator ...
1780         inc     rdx                     // ... counting it too
1781         cmp     al, 0
1782         jnz     0b
1783         mov     eax, SYS_write
1784         mov     edi, 1                  // fd 1
1785         dec     rdx                     // don't write the terminator
1786         syscall                         // clobbers r11 :-(
1787         ret
1788
1789 #elif defined(__i386__)
1790
1791         call    8f                      // pushes the string's address
1792         .string "hello world!\n\0"
1793 8:      call    print_str
1794         add     esp, 4                  // discard the string pointer
1795         ret
1796
1797 print_str:
1798         // actually implement this ridiculous thing
1799         mov     ecx, [esp + 4]          // string pointer, from above our
                                             // own return address
1800         xor     edx, edx                // length counter
1801 0:      mov     al, [ecx + edx]         // scan for the terminator ...
1802         inc     edx                     // ... counting it too
1803         cmp     al, 0
1804         jnz     0b
1805         mov     eax, SYS_write
1806         mov     ebx, 1                  // fd 1
1807         dec     edx                     // don't write the terminator
1808         int     0x80
1809         ret
1810
1811 #elif defined(__arm__)
1812
1813         // why am i doing this?
1814         stmfd   r13!, {r14}             // `bl' below overwrites r14
1815         bl      8f                      // r14 = address of the string
1816         .string "hello world!\n\0"
1817         .balign 4                       // realign for the code that follows
1818 8:      mov     r1, r14               // might as well make it easy on myself
1819         bl      print_str
1820         ldmfd   r13!, {pc}              // return via the saved r14
1821
1822 print_str:
1823         mov     r2, #0                  // length counter
1824 0:      ldrb    r0, [r1, r2]            // scan for the terminator, counting
1825         cmp     r0, #0                  // only the bytes before it
1826         addne   r2, r2, #1
1827         bne     0b
1828         mov     r0, #1                  // fd 1
1829         mov     r7, #SYS_write
1830         swi     0
1831         bx      r14
1832
1833 #elif defined(__aarch64__)
1834
1835         // why am i doing this?
1836         str     x30, [sp, #-16]!        // `bl' below overwrites x30
1837         bl      8f                      // x30 = address of the string
1838         .string "hello world!\n\0"
1839         .balign 4                       // realign for the code that follows
1840 8:      mov     x1, x30               // might as well make it easy on myself
1841         bl      print_str
1842         ldr     x30, [sp], #16          // recover our own return address
1843         ret
1844
1845 print_str:
1846         mov     x2, #0                  // length counter
1847 0:      ldrb    w0, [x1, x2]            // scan for the terminator, counting
1848         cmp     w0, #0                  // only the bytes before it
1849         cinc.ne x2, x2
1850         b.ne    0b
1851         mov     x0, #1                  // fd 1
1852         mov     x8, #SYS_write
1853         svc     #0
1854         ret
1855
1856 #else
1857         notimpl
1858 #endif
1859
1860 endproc
1861
1862 proc    x1a
1863
1864         // collect the current instruction-pointer address.  this was an old
1865         // 32-bit i386 trick for position-independent code, but (a) it
1866         // confuses the return predictor, and (b) amd64 has true pc-relative
1867         // addressing.
             //
             // every method below is arranged to produce the same value, the
             // address of the local label `0': a call to the very next
             // instruction, a call to a helper which hands back its return
             // address (corrected by the distance back to the label), and --
             // where available -- pc-relative address formation.
1868
1869 #if defined(__x86_64__)
1870
1871         // the actual example
1872         call    0f
1873 0:      pop     rax                     // a = address of label 0
1874
1875         // the modern i386 trick doesn't confuse the return-address
1876         // predictor.
1877         call    calladdr_rbx            // b = address of the next insn
1878         sub     rbx, . - 0b             // step back to label 0
1879
1880         // but rip-relative addressing is even better
1881         lea     rcx, [rip + 0b]         // c = address of label 0
1882
1883         ret
1884
1885 calladdr_rbx:
1886         mov     rbx, [rsp]              // b = our return address
1887         ret
1888
1889 #elif defined(__i386__)
1890
1891         // the actual example
1892         call    0f
1893 0:      pop     eax                     // a = address of label 0
1894
1895         // the modern i386 trick doesn't confuse the return-address
1896         // predictor.
             // (`get_pc_ebx' is defined elsewhere; presumably it returns its
             // return address in ebx, like `calladdr_rbx' above -- confirm.)
1897         call    get_pc_ebx
1898         sub     ebx, . - 0b             // step back to label 0
1899
1900         ret
1901
1902 #elif defined(__arm__)
1903
1904         stmfd   r13!, {r14}             // `bl' below overwrites r14
1905
1906         bl      0f
1907 0:      mov     r0, r14                 // r0 = address of label 0
1908
1909         bl      return                  // r14 = address of the next insn
1910         sub     r1, r14, #. - 0b        // step back to label 0
1911
1912         adr     r2, 0b                  // pc-relative: r2 = label 0
1913
1914         ldmfd   r13!, {pc}              // return via the saved r14
1915
1916 return: bx      r14                     // helper: just go straight back
1917
1918 #elif defined(__aarch64__)
1919
1920         str     x30, [sp, #-16]!        // `bl' below overwrites x30
1921
1922         // we can do all of the above using a64
1923         bl      0f
1924 0:      mov     x0, x30                 // x0 = address of label 0
1925
1926         bl      return                  // x30 = address of the next insn
1927         sub     x1, x30, #. - 0b        // step back to label 0
1928
1929         adr     x2, 0b                  // pc-relative: x2 = label 0
1930
             // the final `ret' does double duty: it is both the helper called
             // above and, by fall-through, this procedure's own return.
1931         ldr     x30, [sp], #16
1932 return: ret
1933
1934 #else
1935         notimpl
1936 #endif
1937
1938 endproc
1939
1940 proc    x1b
1941
1942 #if defined(__x86_64__)
1943
1944         // retpolines: a mitigation against adversarially influenced
1945         // speculative execution at indirect branches.  if an adversary can
1946         // prepare a branch-target buffer entry matching an indirect branch
1947         // in the victim's address space then they can cause the victim to
1948         // /speculatively/ (but not architecturally) execute any code in
1949         // their address space, possibly leading to leaking secrets through
1950         // the cache.  retpolines aren't susceptible to this because the
1951         // predicted destination address is from the return-prediction stack
1952         // which the adversary can't prime.  the performance penalty is still
1953         // essentially a branch misprediction -- for this return, and
1954         // possibly all others already stacked.
             //
             // the demonstration pushes the address of its own `ret' and then
             // executes that `ret' twice: the first pops the pushed address
             // and so `returns' to itself, and the second pops the caller's
             // return address and really returns.
1955
1956         // (try not to crash)
1957         lea     rax, [rip + 9f]         // a = address of the `ret' below
1958
1959         push    rax
1960 9:      ret
1961
1962 #elif defined(__i386__)
1963
             // no pc-relative addressing here, so find label 9 relative to the
             // address left in ebx by `get_pc_ebx' (defined elsewhere;
             // presumably its return address, i.e., that of the `lea' -- confirm).
1964         call    get_pc_ebx
1965         lea     eax, [ebx + 9f - .]     // a = address of the `ret' below
1966
1967         push    eax
1968 9:      ret
1969
1970 #elif defined(__arm__)
1971
             // the arm analogue: branch through the link register to a chosen
             // address, then return properly using the saved copy.
1972         stmfd   r13!, {r14}
1973
1974         adr     r14, 8f                 // `return address' = label 8
1975         bx      r14
1976
1977 8:      ldmfd   r13!, {pc}              // the real return
1978
1979 #elif defined(__aarch64__)
1980
             // as for arm: `ret' goes wherever x30 points.
1981         str     x30, [sp, #-16]!
1982
1983         adr     x30, 8f                 // `return address' = label 8
1984         ret
1985
1986 8:      ldr     x30, [sp], #16          // recover the real return address
1987         ret
1988
1989 #else
1990         notimpl
1991 #endif
1992
1993 endproc
1994
1995 proc    x1c
1996
1997         // ok, having a hard time seeing a use for this.  the most important
1998         // thing to note is that sp is set from `pop' /after/ it's
1999         // incremented.
             //
             // the snippet of interest is the lone `pop rsp'/`pop esp'.  the
             // instructions before it arrange for the word on top of the stack
             // to be the original stack pointer, so that the pop restores it
             // and the `ret' still finds its return address.  b receives a
             // copy of the resulting stack pointer for inspection.
2000
2001 #if defined(__x86_64__)
2002
2003         // try not to crash
2004         mov     rax, rsp                // a = original sp
2005         and     rsp, -16                // round sp down to a multiple of 16
2006         push    rax                     // top of stack = original sp
2007
2008         pop     rsp                     // sp = popped value = original sp
2009
2010         // check it worked
2011         mov     rbx, rsp
2012         ret
2013
2014 #elif defined(__i386__)
2015
2016         // try not to crash
2017         mov     eax, esp                // a = original sp
2018         and     esp, -16                // round sp down to a multiple of 16
2019         push    eax                     // top of stack = original sp
2020
2021         pop     esp                     // sp = popped value = original sp
2022
2023         // check it worked
2024         mov     ebx, esp
2025         ret
2026
2027 #elif defined(__arm__)
2028
2029         // not even going to dignify this
2030         notimpl
2031
2032 #elif defined(__aarch64__)
2033
2034         // not even going to dignify this
2035         notimpl
2036
2037 #else
2038         notimpl
2039 #endif
2040
2041 endproc
2042
2043 proc    x1d
2044
2045         // monumentally cheesy way to copy 8 n bytes from buff1 to buff2.
2046         // also clobbers words at buff2 + 8 n and buff2 - 8 for good measure.
             //
             // (the words are 4 bytes each in the 32-bit variants.)  the x86
             // variants point the stack pointer at the destination and the
             // frame pointer at the source, and let `enter' with a nesting
             // level of n + 1 do the copying; the arm variants imitate the
             // resulting memory traffic with ordinary loads and stores.  the
             // source and destination addresses are taken from si and di
             // (r4/x4 and r5/x5 on arm).
2047
2048         n = 4
2049
2050 #if defined(__x86_64__)
2051
2052         mov     rax, rsp                        // safekeeping
2053
2054         // we're toast if we get hit by a signal now.  fingers crossed...
2055   .if 0
2056         mov     rsp, buff2 + 8*n + 8
2057         mov     rbp, buff1 + 8*n
2058   .else
2059         lea     rsp, [rdi + 8*n + 16]
2060         lea     rbp, [rsi + 8*n]
2061   .endif
2062         enter   0, n + 1                        // see the picture below
2063
2064         // precise action:
2065         //
2066         //         +---------+                  +---------+
2067         //  rbp -> |   ???   |           rsp -> |   ???   |
2068         //         +---------+                  +---------+
2069         //         | w_{n-1} |                  |   rbp   | <- rbp'
2070         //         +---------+                  +---------+
2071         //         |   ...   |                  | w_{n-1} |
2072         //         +---------+                  +---------+
2073         //         |   w_1   |                  |   ...   |
2074         //         +---------+                  +---------+
2075         //         |   w_0   |                  |   w_1   |
2076         //         +---------+                  +---------+
2077         //                                      |   w_0   |
2078         //                                      +---------+
2079         //                                      |   rbp'  | <- rsp'
2080         //                                      +---------+
2081
2082         mov     rdx, rsp                        // d = final `stack' pointer
2083         mov     rsp, rax                        // back to the real stack
2084
2085 #elif defined(__i386__)
2086
2087         mov     eax, esp                        // safekeeping
2088
2089         // we're toast if we get hit by a signal now.  fingers crossed...
2090   .if 0
2091         mov     esp, buff2 + 4*n + 4
2092         mov     ebp, buff1 + 4*n
2093   .else
2094         lea     esp, [edi + 4*n + 8]
2095         lea     ebp, [esi + 4*n]
2096   .endif
2097         enter   0, n + 1                        // as above, with 4-byte words
2098
2099         mov     edx, esp                        // d = final `stack' pointer
2100         mov     esp, eax                        // back to the real stack
2101
2102 #elif defined(__arm__)
2103
             // r4 plays the part of the frame pointer and r5 that of the
             // stack pointer; the real stack pointer is left alone.
2104         add     r4, r4, #4*n
2105         add     r5, r5, #4*n + 8
2106
2107         str     r4, [r5, #-4]!          // `push' the old frame pointer
2108   .rept n/2
2109         ldrd    r0, r1, [r4, #-8]!      // copy downwards, two words at a
2110         strd    r0, r1, [r5, #-8]!      // time
2111   .endr
2112         add     r4, r5, #4*n            // address of the first word pushed
2113         str     r4, [r5, #-4]!          // `push' the new frame pointer
2114
2115 #elif defined(__aarch64__)
2116
2117         // omgwtf.  let's not actually screw with the stack pointer.
2118
2119         add     x4, x4, #8*n
2120         add     x5, x5, #8*n + 16
2121
2122         str     x4, [x5, #-8]!          // `push' the old frame pointer
2123   .rept n/2
2124         ldp     x16, x17, [x4, #-16]!   // copy downwards, two words at a
2125         stp     x16, x17, [x5, #-16]!   // time
2126   .endr
2127         add     x4, x5, #8*n            // address of the first word pushed
2128         str     x4, [x5, #-8]!          // `push' the new frame pointer
2129
2130 #else
2131         notimpl
2132 #endif
2133
2134         ret
2135
2136 endproc
2137
2138 proc    x1e
2139
2140         // convert nibble value to (uppercase) hex; other input values yield
2141         // nonsense.
             //
             // the input is taken from, and the ascii digit is left in, the low
             // byte of a (r0/w0 on arm): 0--9 map to '0'--'9', and 10--15 map
             // to 'A'--'F'.
2142
2143 #if defined(__x86_64__)
2144
2145         // das doesn't work in 64-bit mode; best i can come up with
2146         mov     edx, eax
2147         add     al, '0'                 // candidate for 0 <= a < 10
2148         add     dl, 'A' - 10            // candidate for 10 <= a < 16
2149         cmp     al, '9' + 1             // did the first overshoot '9'?
2150         cmovae  eax, edx                // if so, take the letter instead
2151
2152 #elif defined(__i386__)
2153
2154         cmp     al, 0x0a                // cf = 1 iff a < 10
2155         sbb     al, 0x69                // if 0 <= a < 10, a' = a - 0x6a, so
2156                                         // 0x96 <= a' < 0xa0, setting af, cf
2157                                         // if 10 <= a < 16, a' = a - 0x69, so
2158                                         // 0xa1 <= a' < 0xa7, setting cf but
2159                                         // clearing af
2160         das                             // if 0 <= a < 10, then af and cf are
2161                                         // both set, so subtract 0x66
2162                                         // from a' leaving 0x30 <= a' < 0x3a;
2163                                         // if 10 <= a < 16 then af clear but
2164                                         // cf set, so subtract 0x60 from a'
2165                                         // leaving 0x41 <= a' < 0x47
2166
2167 #elif defined(__arm__)
2168
2169         // significantly less tricksy
2170         cmp     r0, #10
2171         addlo   r0, r0, #'0'            // a < 10: make a digit
2172         addhs   r0, r0, #'A' - 10       // a >= 10: make a letter
2173
2174 #elif defined(__aarch64__)
2175
2176         // with less versatile conditional execution this is the best we can
2177         // do
2178         cmp     w0, #10
2179         add     w16, w0, #'A' - 10      // candidate for a >= 10
2180         add     w0, w0, #'0'            // candidate for a < 10
2181         cmov.hs w0, w16                 // a >= 10: take the letter
2182
2183 #else
2184         notimpl
2185 #endif
2186
2187         ret
2188
2189 endproc
2190
2191 proc    x1f
2192
2193         // verify collatz conjecture starting at a; assume a /= 0!
2194
2195 #if defined(__x86_64__)
2196
2197 0:      bsf     rcx, rax                // clobber c if a = 0
2198         shr     rax, cl                 // a = 2^c a'
2199   cmp rdx, 0                            // trace slots left in d?
2200   je 1f                                 // no: skip the logging
2201   stosq                                 // log odd a' at [rdi], step rdi
2202   dec rdx                               // one slot used
2203 1:
2204         cmp     rax, 1                  // done?
2205         je      9f
2206         lea     rax, [2*rax + rax + 1]  // a' = 3 a' + 1
2207         jmp     0b                      // again
2208
2209 9:      ret
2210
2211 #elif defined(__i386__)
2212
2213 0:      bsf     ecx, eax                // clobber c if a = 0
2214         shr     eax, cl                 // a = 2^c a'
2215   cmp edx, 0                            // trace slots left in d?
2216   je 1f                                 // no: skip the logging
2217   stosd                                 // log odd a' at [edi], step edi
2218   dec edx                               // one slot used
2219 1:
2220         cmp     eax, 1                  // done?
2221         je      9f
2222         lea     eax, [2*eax + eax + 1]  // a' = 3 a' + 1
2223         jmp     0b                      // again
2224
2225 9:      ret
2226
2227 #elif defined(__arm__)
2228
2229         // rbit introduced in armv7
2230 0:      rbit    r2, r0                  // reverse the bits of a, so that
2231         clz     r2, r2                  // c = number of trailing zeros
2232         mov     r0, r0, lsr r2          // a = 2^c a'
2233   cmp r3, #0                            // trace slots left in r3?
2234   strne r0, [r5], #4                    // yes: log odd a', r5 += 4
2235   subne r3, r3, #1                      // ... and use up a slot
2236         cmp     r0, #1                  // done?  (sets c since a' >= 1)
2237         adcne   r0, r0, r0, lsl #1      // a' = 3 a' + 1 (because c set)
2238         bne     0b                      // again
2239
2240         ret
2241
2242 #elif defined(__aarch64__)
2243
2244 0:      rbit    w2, w0                  // reverse the bits of a, so that
2245         clz     w2, w2                  // c = number of trailing zeros
2246         lsr     w0, w0, w2              // a = 2^c a'
2247   cmp x3, #0                            // trace slots left in x3?
2248   b.eq 1f                               // no: skip the logging
2249   str x0, [x5], #8                      // log odd a' (64 bits), x5 += 8
2250   sub x3, x3, #1                        // one slot used
2251 1:
2252         cmp     w0, #1                  // done?
2253         add     w16, w0, w0, lsl #1     // t = 3 a'
2254         csinc.eq w0, w0, w16            // a' = 1 ? a' : t + 1
2255         b.ne    0b                      // again
2256
2257         ret
2258
2259 #else
2260         notimpl
2261 #endif
2262
2263 endproc
2264
2265 ///--------------------------------------------------------------------------
2266 /// 0x20--0x2f
2267
2268 proc    x20
2269
2270         // calculate 1337 a slowly: once by shift-and-add (into c), and then
2271         // again, quicker, via 1337 = 7 * 191 = (8 - 1) (3 * 64 - 1) (into b)
2272 #if defined(__x86_64__)
2273
2274         // original version: feeds in 1337 = 0b10100111001 from the top bit
2275         mov     rcx, rax                // c = a
2276         shl     rcx, 2                  // c = 4 a
2277         add     rcx, rax                // c = 5 a
2278         shl     rcx, 3                  // c = 40 a
2279         add     rcx, rax                // c = 41 a
2280         shl     rcx, 1                  // c = 82 a
2281         add     rcx, rax                // c = 83 a
2282         shl     rcx, 1                  // c = 166 a
2283         add     rcx, rax                // c = 167 a
2284         shl     rcx, 3                  // c = 1336 a
2285         add     rcx, rax                // c = 1337 a
2286
2287         // a quick way: 191 = 3 * 64 - 1, then 1337 = 8 * 191 - 191
2288         lea     rdx, [2*rax + rax]      // t = 3 a
2289         shl     rdx, 6                  // t = 192 a
2290         sub     rdx, rax                // t = 191 a
2291         lea     rbx, [8*rdx]            // b = 1528 a
2292         sub     rbx, rdx                // b = 1337 a
2293
2294 #elif defined(__i386__)
2295
2296         // original version: feeds in 1337 = 0b10100111001 from the top bit
2297         mov     ecx, eax                // c = a
2298         shl     ecx, 2                  // c = 4 a
2299         add     ecx, eax                // c = 5 a
2300         shl     ecx, 3                  // c = 40 a
2301         add     ecx, eax                // c = 41 a
2302         shl     ecx, 1                  // c = 82 a
2303         add     ecx, eax                // c = 83 a
2304         shl     ecx, 1                  // c = 166 a
2305         add     ecx, eax                // c = 167 a
2306         shl     ecx, 3                  // c = 1336 a
2307         add     ecx, eax                // c = 1337 a
2308
2309         // a quick way: 191 = 3 * 64 - 1, then 1337 = 8 * 191 - 191
2310         lea     edx, [2*eax + eax]      // t = 3 a
2311         shl     edx, 6                  // t = 192 a
2312         sub     edx, eax                // t = 191 a
2313         lea     ebx, [8*edx]            // b = 1528 a
2314         sub     ebx, edx                // b = 1337 a
2315
2316 #elif defined(__arm__)
2317
2318         // original version, ish: each add folds in one shift-and-add step
2319         add     r2, r0, r0, lsl #2      // c = 5 a
2320         add     r2, r0, r2, lsl #3      // c = 41 a
2321         add     r2, r0, r2, lsl #1      // c = 83 a
2322         add     r2, r0, r2, lsl #1      // c = 167 a
2323         add     r2, r0, r2, lsl #3      // c = 1337 a
2324
2325         // quicker way: 191 = 3 * 64 - 1, then 1337 = 8 * 191 - 191
2326         add     r1, r0, r0, lsl #1      // b = 3 a
2327         rsb     r1, r0, r1, lsl #6      // b = 191 a
2328         rsb     r1, r1, r1, lsl #3      // b = 1337 a
2329
2330 #elif defined(__aarch64__)
2331
2332         // original version, ish: each add folds in one shift-and-add step
2333         add     x2, x0, x0, lsl #2      // c = 5 a
2334         add     x2, x0, x2, lsl #3      // c = 41 a
2335         add     x2, x0, x2, lsl #1      // c = 83 a
2336         add     x2, x0, x2, lsl #1      // c = 167 a
2337         add     x2, x0, x2, lsl #3      // c = 1337 a
2338
2339         // sleazy because no rsb: go via -191 a; -191 a + 1528 a = 1337 a
2340         add     x1, x0, x0, lsl #1      // b = 3 a
2341         sub     x1, x0, x1, lsl #6      // b = -191 a
2342         sub     x1, x1, x1, lsl #3      // b = 1337 a
2343
2344 #else
2345         notimpl
2346 #endif
2347
2348         ret
2349
2350 endproc
2351
2352 proc    x21
2353
2354         // multiply complex numbers a + b i and c + d i
2355         //
2356         //      (a + b i) (c + d i) = (a c - b d) + (a d + b c) i
2357         //
2358         // somewhat slick approach uses only three multiplications: c (a + b),
2359         // a (d - c) and b (c + d).  the product is left as a' + b' i.
2360 #if defined(__x86_64__)
2361
2362         mov     rsi, rax                // t = a
2363         add     rax, rbx                // a' = a + b
2364         mov     rdi, rdx                // u = d
2365         sub     rdx, rcx                // d' = d - c
2366         add     rdi, rcx                // u = c + d
2367
2368         imul    rax, rcx                // a' = c (a + b)
2369         imul    rsi, rdx                // t = a (d - c)
2370         imul    rdi, rbx                // u = b (c + d)
2371
2372         add     rsi, rax                // t = a (d - c) + c (a + b)
2373         mov     rbx, rsi                // b' = a (d - c) + c (a + b)
2374                                         //      = a d + b c
2375         sub     rax, rdi                // a' = c (a + b) - b (c + d)
2376                                         //      = a c - b d
2377
2378 #elif defined(__i386__)
2379
2380         mov     esi, eax                // t = a
2381         add     eax, ebx                // a' = a + b
2382         mov     edi, edx                // u = d
2383         sub     edx, ecx                // d' = d - c
2384         add     edi, ecx                // u = c + d
2385
2386         imul    eax, ecx                // a' = c (a + b)
2387         imul    esi, edx                // t = a (d - c)
2388         imul    edi, ebx                // u = b (c + d)
2389
2390         add     esi, eax                // t = a (d - c) + c (a + b)
2391         mov     ebx, esi                // b' = a (d - c) + c (a + b)
2392                                         //      = a d + b c
2393         sub     eax, edi                // a' = c (a + b) - b (c + d)
2394                                         //      = a c - b d
2395
2396 #elif defined(__arm__)
2397
2398         add     r4, r0, r1              // t = a + b
2399         add     r5, r2, r3              // u = c + d
2400         sub     r3, r3, r2              // d' = d - c
2401
2402         // mls introduced in armv7
2403         mul     r4, r4, r2              // t = c (a + b)
2404         mov     r2, r1                  // c' = b (bah!), as b' is next
2405         mla     r1, r0, r3, r4          // b' = a (d - c) + c (a + b)
2406                                         //      = a d + b c
2407         mls     r0, r2, r5, r4          // a' = c (a + b) - b (c + d)
2408                                         //      = a c - b d
2409
2410 #elif defined(__aarch64__)
2411
2412         add     x4, x0, x1              // t = a + b
2413         add     x5, x2, x3              // u = c + d
2414         sub     x3, x3, x2              // d' = d - c
2415
2416         // madd and msub are the a64 counterparts of arm's mla and mls
2417         mul     x4, x4, x2              // t = c (a + b)
2418         mov     x2, x1                  // c' = b (bah!), as b' is next
2419         madd    x1, x0, x3, x4          // b' = a (d - c) + c (a + b)
2420                                         //      = a d + b c
2421         msub    x0, x2, x5, x4          // a' = c (a + b) - b (c + d)
2422                                         //      = a c - b d
2423
2424 #else
2425         notimpl
2426 #endif
2427
2428         ret
2429
2430 endproc
2431
2432 proc    x22
2433
2434         // divide by 3: multiply by f =~ 2/3 2^n, keep the high word, halve
2435
2436 #if defined(__x86_64__)
2437
2438         mov     rdx, 0xaaaaaaaaaaaaaaab // = ceil(2/3 2^64)
2439         mul     rdx                     // d' || a' =~ 2/3 a 2^64
2440         shr     rdx, 1                  // d' = floor(a/3)
2441         mov     rax, rdx                // a' = floor(a/3)
2442
2443         // we start with 0 <= a < 2^64.  write f = ceil(2/3 2^64), so that
2444         // f/2^64 = 2/3 + 1/(3 2^64).  then a f/2^64 = 2/3 a + e, where
2445         // 0 <= e = a/(3 2^64) < 1/3; the fractional part of 2/3 a is at most
2446         // 2/3, so e can't carry, and floor(a f/2^64) = floor(2/3 a).
2447
2448 #elif defined(__i386__)
2449
2450         mov     edx, 0xaaaaaaab         // = ceil(2/3 2^32)
2451         mul     edx                     // d' || a' =~ 2/3 a 2^32
2452         shr     edx, 1                  // d' = floor(a/3)
2453         mov     eax, edx                // a' = floor(a/3)
2454
2455 #elif defined(__arm__)
2456
2457         ldr     r12, =0xaaaaaaab        // = ceil(2/3 2^32)
2458         umull   r12, r0, r0, r12        // a' || t =~ 2/3 a 2^32
2459         mov     r0, r0, lsr #1          // a' = floor(a/3)
2460
2461 #elif defined(__aarch64__)
2462
2463         ldr     x16, =0xaaaaaaaaaaaaaaab // = ceil(2/3 2^64)
2464         umulh   x0, x0, x16             // a' = floor(2/3 a): high word only
2465         lsr     x0, x0, #1              // a' = floor(a/3)
2466
2467 #else
2468         notimpl
2469 #endif
2470
2471         ret
2472
2473 endproc
2474
2475 proc    x23
2476         // reduce a mod 3 without dividing, using 4 == 1 (mod 3)
2477 #if defined(__x86_64__)
2478
2479         // main loop: shorten a preserving residue class mod 3
2480 0:      cmp     rax, 5
2481         jbe     8f                      // small enough: go and finish off
2482         // a > 5
2483         mov     rdx, rax                // d' = a
2484         shr     rdx, 2                  // d' = floor(a/4)
2485         and     rax, 3                  // a = 4 d' + a' (0 <= a' < 4)
2486         add     rax, rdx                // a' == a (mod 3) but a' < a/4 + 4
2487         jmp     0b                      // and go round again
2488
2489         // fix up final value 0 <= a < 6: want 0 <= a < 3
2490         //
2491         // the tricky part is actually a = 3; but the other final cases take
2492         // additional iterations which we can avoid.
2493 8:      cmp     rax, 3                  // set cf iff a < 3
2494         cmc                             // set cf iff a >= 3
2495         sbb     rdx, rdx                // d' = a >= 3 ? -1 : 0
2496         and     rdx, 3                  // d' = a >= 3 ? 3 : 0
2497         sub     rax, rdx                // a' = a - (a >= 3 ? 3 : 0)
2498                                         //      = a (mod 3)
2499
2500 #elif defined(__i386__)
2501
2502         // main loop: shorten a preserving residue class mod 3
2503 0:      cmp     eax, 5
2504         jbe     8f                      // small enough: go and finish off
2505         // a > 5
2506         mov     edx, eax                // d' = a
2507         shr     edx, 2                  // d' = floor(a/4)
2508         and     eax, 3                  // a = 4 d' + a' (0 <= a' < 4)
2509         add     eax, edx                // a' == a (mod 3) but a' < a/4 + 4
2510         jmp     0b                      // and go round again
2511
2512         // fix up final value 0 <= a < 6: want 0 <= a < 3
2513         //
2514         // the tricky part is actually a = 3; but the other final cases take
2515         // additional iterations which we can avoid.
2516 8:      cmp     eax, 3                  // set cf iff a < 3
2517         cmc                             // set cf iff a >= 3
2518         sbb     edx, edx                // d' = a >= 3 ? -1 : 0
2519         and     edx, 3                  // d' = a >= 3 ? 3 : 0
2520         sub     eax, edx                // a' = a - (a >= 3 ? 3 : 0)
2521                                         //      = a (mod 3)
2522
2523 #elif defined(__arm__)
2524
2525 0:      cmp     r0, #6                  // a >= 6?
2526         andhs   r12, r0, #3             // yes: t = a mod 4
2527         addhs   r0, r12, r0, lsr #2     // a' = t + floor(a/4)
2528         bhs     0b                      // and go round again
2529
2530         cmp     r0, #3                  // 0 <= a < 6 here
2531         subhs   r0, r0, #3              // a' = a - 3 if a >= 3
2532
2533 #elif defined(__aarch64__)
2534
2535 0:      cmp     x0, #6                  // a >= 6?  (used by b.hs below)
2536         // blunder on through regardless since this doesn't affect the result
2537         and     x16, x0, #3             // t = a mod 4
2538         add     x0, x16, x0, lsr #2     // a' = t + floor(a/4)
2539         b.hs    0b                      // loop if the old a was >= 6
2540
2541         subs    x16, x0, #3             // t = a - 3; hs iff a >= 3
2542         cmov.hs x0, x16                 // a' = a - 3 if a >= 3
2543
2544 #else
2545         notimpl
2546 #endif
2547
2548         ret
2549
2550 endproc
2551
2552 proc    x24
2553
2554         // invert (odd) a mod 2^64 (mod 2^32 on i386 and arm)
2555         //
2556         // suppose a a_i == 1 (mod 2^{2^i})
2557         //
2558         // clearly good for i = 0, since 2^i = 1 and 2^{2^i} = 2, and a_0 =
2559         // a == 1 (mod 2) by assumption
2560         //
2561         // write a a_i == b_i 2^{2^i} + 1 (mod 2^{2^{i+1}})
2562         // then b_i == (a a_i - 1)/2^{2^i} (mod 2^{2^i})
2563         // to lift inverse, we want x such that a x == -b_i (mod 2^{2^i});
2564         // clearly x = -a_i b_i will do, since a a_i == 1 (mod 2^{2^i})
2565         // then:
2566         // a_{i+1} = a_i - a_i b_i 2^{2^i} = a_i (1 - (a a_i - 1))
2567         //      = 2 a_i - a a_i^2
2568         //
2569         // check:
2570         // a a_{i+1} = 2 a a_i - a^2 a_i^2
2571         //      == 2 a a_i - (b_i 2^{2^i} + 1)^2
2572         //      == 2 (b_i 2^{2^i} + 1) -
2573         //              (b_i^2 2^{2^{i+1}} + 2 b_i 2^{2^i} + 1)
2574         //      == 1 (mod 2^{2^{i+1}})
2575
2576 #if defined(__x86_64__)
2577
2578         // rax                          // a_0 = a
2579         mov     rbx, rax                // b' = a
2580         mov     rsi, rax                // t = a_0
2581
2582 0:
2583   cmp rbp, 0                            // trace slots left in rbp?
2584   je 1f                                 // no: skip the logging
2585   stosq                                 // log a_i at [rdi], step rdi
2586   dec rbp                               // one slot used
2587 1:
2588         mul     rbx                     // a' = a a_i
2589         mov     rcx, rax                // c = a a_i
2590
2591         sub     rax, 2                  // a' = a a_i - 2
2592         neg     rax                     // a' = 2 - a a_i
2593         mul     rsi                     // a_{i+1} = a_i (2 - a a_i)
2594                                         //      = 2 a_i - a a_i^2
2595         mov     rsi, rax                // t = a_{i+1}
2596
2597         cmp     rcx, 1                  // done?
2598         ja      0b                      // no -- iterate
2599
2600 #elif defined(__i386__)
2601
2602         // eax                          // a_0 = a
2603         mov     ebx, eax                // b' = a
2604         mov     esi, eax                // t = a_0
2605
2606 0:
2607   cmp ebp, 0                            // trace slots left in ebp?
2608   je 1f                                 // no: skip the logging
2609   stosd                                 // log a_i at [edi], step edi
2610   dec ebp                               // one slot used
2611 1:
2612         mul     ebx                     // a' = a a_i
2613         mov     ecx, eax                // c = a a_i
2614
2615         sub     eax, 2                  // a' = a a_i - 2
2616         jb      9f                      // done if < 2
2617         neg     eax                     // a' = 2 - a a_i
2618         mul     esi                     // a_{i+1} = a_i (2 - a a_i)
2619                                         //      = 2 a_i - a a_i^2
2620         mov     esi, eax                // t = a_{i+1}
2621
2622         jmp     0b                      // and iterate
2623 9:      mov     eax, esi                // restore
2624
2625 #elif defined(__arm__)
2626
2627         // r0                           // a_0 = a
2628         mov     r1, r0                  // b' = a
2629
2630 0:
2631   cmp r6, #0                            // trace slots left in r6?
2632   strne r0, [r5], #4                    // yes: log a_i, r5 += 4
2633   subne r6, r6, #1                      // ... and use up a slot
2634         mul     r2, r0, r1              // c = a a_i
2635         rsbs    r2, r2, #2              // c = 2 - a a_i
2636         mul     r0, r0, r2              // a_{i+1} = a_i (2 - a a_i)
2637                                         //      = 2 a_i - a a_i^2
2638         blo     0b                      // rsbs borrowed: a a_i > 2; iterate
2639
2640 #elif defined(__aarch64__)
2641
2642         // x0                           // a_0 = a
2643         mov     x1, x0                  // b' = a
2644         mov     x16, #2                 // because we have no rsb
2645
2646 0:
2647   cmp x6, #0                            // trace slots left in x6?
2648   b.eq 1f                               // no: skip the logging
2649   str x0, [x5], #8                      // log a_i at [x5], x5 += 8
2650   sub x6, x6, #1                        // one slot used
2651 1:
2652         mul     x2, x0, x1              // c = a a_i
2653         subs    x2, x16, x2             // c = 2 - a a_i
2654         mul     x0, x0, x2              // a_{i+1} = a_i (2 - a a_i)
2655                                         //      = 2 a_i - a a_i^2
2656         b.lo    0b                      // subs borrowed: a a_i > 2; iterate
2657
2658 #else
2659         notimpl
2660 #endif
2661
2662         ret
2663
2664 endproc
2665
2666 proc    x25
2667
2668         // a poor approximation to pi/4
2669         //
2670         // think of x and y as being in 16.16 fixed-point format.  we sample
2671         // points in the unit square, and determine how many of them are
2672         // within a unit quarter-circle centred at the origin.  the area of
2673         // the quarter-circle is pi/4.
2674
2675 #if defined(__x86_64__)
2676
2677         xor     eax, eax                // a = 0
2678         mov     rcx, 1
2679         shl     rcx, 0x20               // c =~ 4 billion
2680
2681 0:      movzx   rbx, cx                 // x = low 16 bits of c
2682         imul    rbx, rbx                // b = x^2
2683
2684         ror     rcx, 0x10               // switch halves of c
2685         movzx   rdx, cx                 // y = high 16 bits of c
2686         imul    rdx, rdx                // d = y^2
2687         rol     rcx, 0x10               // switch back
2688
2689         add     rbx, rdx                // r^2 = x^2 + y^2
2690         shr     rbx, 0x20               // r^2 >= 1?
2691         cmp     rbx, 1                  // set cf iff r^2 < 1
2692         adc     rax, 0                  // and add onto accumulator
2693         loop    0b                      // c -= 1; again unless c = 0
2694
2695 #elif defined(__i386__)
2696
2697         // this is actually better done in 32 bits.  the carry has the wrong
2698         // sense here, so instead deduct one for each point outside the
2699         // quarter-circle rather than adding one for each point inside it.
2700         xor     eax, eax                // a = 0 (stands in for 2^32)
2701         xor     ecx, ecx                // c = 0: loop wraps right round
2702
2703 0:      movzx   ebx, cx                 // x = low 16 bits of c
2704         imul    ebx, ebx                // b = x^2
2705
2706         ror     ecx, 0x10               // switch halves of c
2707         movzx   edx, cx                 // y = high 16 bits of c
2708         imul    edx, edx                // d = y^2
2709         rol     ecx, 0x10               // switch back
2710
2711         add     ebx, edx                // see?  cf set iff r^2 >= 1
2712         sbb     eax, 0                  // deduct one if outside
2713         loop    0b                      // c -= 1; again unless c = 0
2714
2715 #elif defined(__arm__)
2716
2717         mov     r0, #0                  // a = 0
2718         mov     r2, #0                  // c = 0
2719
2720 0:      uxth    r1, r2, ror #0          // x = low 16 bits of c
2721         uxth    r3, r2, ror #16         // y = high 16 bits of c
2722         mul     r1, r1, r1              // x^2
2723         mul     r3, r3, r3              // y^2
2724         cmn     r1, r3                  // mlas doesn't set cf usefully
2725         addcc   r0, r0, #1              // no carry: r^2 < 1, so count it
2726         adds    r2, r2, #1              // next c; z set on wrap to 0
2727         bne     0b                      // again until c wraps
2728
2729 #elif defined(__aarch64__)
2730
2731         mov     w0, #0                  // a = 0
2732         mov     w2, #0                  // c = 0
2733
2734 0:      ubfx    w1, w2, #0, #16         // x = low 16 bits of c
2735         ubfx    w3, w2, #16, #16        // y = high 16 bits of c
2736         sub     w2, w2, #1              // next c (counts down, wrapping)
2737         mul     w1, w1, w1              // x^2
2738         mul     w3, w3, w3              // y^2
2739         cmn     w1, w3                  // cf set iff x^2 + y^2 >= 2^32
2740         cinc.cc w0, w0                  // no carry: r^2 < 1, so count it
2741         cbnz    w2, 0b                  // again until c is back at 0
2742
2743 #else
2744         notimpl
2745 #endif
2746
2747         ret
2748
2749 endproc
2750
2751 proc    x26
2752         // placeholder: every architecture branch below is just notimpl
2753 #if defined(__x86_64__)
2754
2755         notimpl
2756
2757 #elif defined(__i386__)
2758
2759         notimpl
2760
2761 #elif defined(__arm__)
2762
2763         notimpl
2764
2765 #elif defined(__aarch64__)
2766
2767         notimpl
2768
2769 #else
2770         notimpl
2771 #endif
2772
2773 endproc
2774
2775 proc    x27
2776         // placeholder: every architecture branch below is just notimpl
2777 #if defined(__x86_64__)
2778
2779         notimpl
2780
2781 #elif defined(__i386__)
2782
2783         notimpl
2784
2785 #elif defined(__arm__)
2786
2787         notimpl
2788
2789 #elif defined(__aarch64__)
2790
2791         notimpl
2792
2793 #else
2794         notimpl
2795 #endif
2796
2797 endproc
2798
2799 proc    x28
2800         // placeholder: every architecture branch below is just notimpl
2801 #if defined(__x86_64__)
2802
2803         notimpl
2804
2805 #elif defined(__i386__)
2806
2807         notimpl
2808
2809 #elif defined(__arm__)
2810
2811         notimpl
2812
2813 #elif defined(__aarch64__)
2814
2815         notimpl
2816
2817 #else
2818         notimpl
2819 #endif
2820
2821 endproc
2822
2823 proc    x29
2824         // placeholder: every architecture branch below is just notimpl
2825 #if defined(__x86_64__)
2826
2827         notimpl
2828
2829 #elif defined(__i386__)
2830
2831         notimpl
2832
2833 #elif defined(__arm__)
2834
2835         notimpl
2836
2837 #elif defined(__aarch64__)
2838
2839         notimpl
2840
2841 #else
2842         notimpl
2843 #endif
2844
2845 endproc
2846
2847 proc    x2a
2848         // placeholder: every architecture branch below is just notimpl
2849 #if defined(__x86_64__)
2850
2851         notimpl
2852
2853 #elif defined(__i386__)
2854
2855         notimpl
2856
2857 #elif defined(__arm__)
2858
2859         notimpl
2860
2861 #elif defined(__aarch64__)
2862
2863         notimpl
2864
2865 #else
2866         notimpl
2867 #endif
2868
2869 endproc
2870
2871 proc    x2b
2872         // placeholder: every architecture branch below is just notimpl
2873 #if defined(__x86_64__)
2874
2875         notimpl
2876
2877 #elif defined(__i386__)
2878
2879         notimpl
2880
2881 #elif defined(__arm__)
2882
2883         notimpl
2884
2885 #elif defined(__aarch64__)
2886
2887         notimpl
2888
2889 #else
2890         notimpl
2891 #endif
2892
2893 endproc
2894
2895 proc    x2c
2896         // placeholder: every architecture branch below is just notimpl
2897 #if defined(__x86_64__)
2898
2899         notimpl
2900
2901 #elif defined(__i386__)
2902
2903         notimpl
2904
2905 #elif defined(__arm__)
2906
2907         notimpl
2908
2909 #elif defined(__aarch64__)
2910
2911         notimpl
2912
2913 #else
2914         notimpl
2915 #endif
2916
2917 endproc
2918
2919 proc    x2d
2920         // placeholder: every architecture branch below is just notimpl
2921 #if defined(__x86_64__)
2922
2923         notimpl
2924
2925 #elif defined(__i386__)
2926
2927         notimpl
2928
2929 #elif defined(__arm__)
2930
2931         notimpl
2932
2933 #elif defined(__aarch64__)
2934
2935         notimpl
2936
2937 #else
2938         notimpl
2939 #endif
2940
2941 endproc
2942
2943 proc    x2e
2944         // placeholder: every architecture branch below is just notimpl
2945 #if defined(__x86_64__)
2946
2947         notimpl
2948
2949 #elif defined(__i386__)
2950
2951         notimpl
2952
2953 #elif defined(__arm__)
2954
2955         notimpl
2956
2957 #elif defined(__aarch64__)
2958
2959         notimpl
2960
2961 #else
2962         notimpl
2963 #endif
2964
2965 endproc
2966
2967 proc    x2f
2968         // placeholder: every architecture branch below is just notimpl
2969 #if defined(__x86_64__)
2970
2971         notimpl
2972
2973 #elif defined(__i386__)
2974
2975         notimpl
2976
2977 #elif defined(__arm__)
2978
2979         notimpl
2980
2981 #elif defined(__aarch64__)
2982
2983         notimpl
2984
2985 #else
2986         notimpl
2987 #endif
2988
2989 endproc
2990
2991 ///--------------------------------------------------------------------------
2992 /// 0x30--0x3f
2993
2994 proc    x30
2995         // placeholder: every architecture branch below is just notimpl
2996 #if defined(__x86_64__)
2997
2998         notimpl
2999
3000 #elif defined(__i386__)
3001
3002         notimpl
3003
3004 #elif defined(__arm__)
3005
3006         notimpl
3007
3008 #elif defined(__aarch64__)
3009
3010         notimpl
3011
3012 #else
3013         notimpl
3014 #endif
3015         // NOTE(review): the neighbouring stubs have no ret -- confirm intent
3016         ret
3017
3018 endproc
3019
3020 proc    x31
3021         // placeholder: every architecture branch below is just notimpl
3022 #if defined(__x86_64__)
3023
3024         notimpl
3025
3026 #elif defined(__i386__)
3027
3028         notimpl
3029
3030 #elif defined(__arm__)
3031
3032         notimpl
3033
3034 #elif defined(__aarch64__)
3035
3036         notimpl
3037
3038 #else
3039         notimpl
3040 #endif
3041
3042 endproc
3043
3044 proc    x32
3045         // placeholder: every architecture branch below is just notimpl
3046 #if defined(__x86_64__)
3047
3048         notimpl
3049
3050 #elif defined(__i386__)
3051
3052         notimpl
3053
3054 #elif defined(__arm__)
3055
3056         notimpl
3057
3058 #elif defined(__aarch64__)
3059
3060         notimpl
3061
3062 #else
3063         notimpl
3064 #endif
3065
3066 endproc
3067
3068 proc    x33
3069         // placeholder: every architecture branch below is just notimpl
3070 #if defined(__x86_64__)
3071
3072         notimpl
3073
3074 #elif defined(__i386__)
3075
3076         notimpl
3077
3078 #elif defined(__arm__)
3079
3080         notimpl
3081
3082 #elif defined(__aarch64__)
3083
3084         notimpl
3085
3086 #else
3087         notimpl
3088 #endif
3089
3090 endproc
3091
3092 proc    x34
3093         // placeholder: every architecture branch below is just notimpl
3094 #if defined(__x86_64__)
3095
3096         notimpl
3097
3098 #elif defined(__i386__)
3099
3100         notimpl
3101
3102 #elif defined(__arm__)
3103
3104         notimpl
3105
3106 #elif defined(__aarch64__)
3107
3108         notimpl
3109
3110 #else
3111         notimpl
3112 #endif
3113
3114 endproc
3115
3116 proc    x35
3117         // placeholder: every architecture branch below is just notimpl
3118 #if defined(__x86_64__)
3119
3120         notimpl
3121
3122 #elif defined(__i386__)
3123
3124         notimpl
3125
3126 #elif defined(__arm__)
3127
3128         notimpl
3129
3130 #elif defined(__aarch64__)
3131
3132         notimpl
3133
3134 #else
3135         notimpl
3136 #endif
3137
3138 endproc
3139
3140 proc    x36
3141         // placeholder: every architecture branch below is just notimpl
3142 #if defined(__x86_64__)
3143
3144         notimpl
3145
3146 #elif defined(__i386__)
3147
3148         notimpl
3149
3150 #elif defined(__arm__)
3151
3152         notimpl
3153
3154 #elif defined(__aarch64__)
3155
3156         notimpl
3157
3158 #else
3159         notimpl
3160 #endif
3161
3162 endproc
3163
3164 proc    x37
3165         // placeholder: every architecture branch below is just notimpl
3166 #if defined(__x86_64__)
3167
3168         notimpl
3169
3170 #elif defined(__i386__)
3171
3172         notimpl
3173
3174 #elif defined(__arm__)
3175
3176         notimpl
3177
3178 #elif defined(__aarch64__)
3179
3180         notimpl
3181
3182 #else
3183         notimpl
3184 #endif
3185
3186 endproc
3187
3188 proc    x38
3189         // placeholder: every architecture branch below is just notimpl
3190 #if defined(__x86_64__)
3191
3192         notimpl
3193
3194 #elif defined(__i386__)
3195
3196         notimpl
3197
3198 #elif defined(__arm__)
3199
3200         notimpl
3201
3202 #elif defined(__aarch64__)
3203
3204         notimpl
3205
3206 #else
3207         notimpl
3208 #endif
3209
3210 endproc
3211
3212 proc    x39
3213         // placeholder: every architecture branch below is just notimpl
3214 #if defined(__x86_64__)
3215
3216         notimpl
3217
3218 #elif defined(__i386__)
3219
3220         notimpl
3221
3222 #elif defined(__arm__)
3223
3224         notimpl
3225
3226 #elif defined(__aarch64__)
3227
3228         notimpl
3229
3230 #else
3231         notimpl
3232 #endif
3233
3234 endproc
3235
3236 proc    x3a
3237         // placeholder: every architecture branch below is just notimpl
3238 #if defined(__x86_64__)
3239
3240         notimpl
3241
3242 #elif defined(__i386__)
3243
3244         notimpl
3245
3246 #elif defined(__arm__)
3247
3248         notimpl
3249
3250 #elif defined(__aarch64__)
3251
3252         notimpl
3253
3254 #else
3255         notimpl
3256 #endif
3257
3258 endproc
3259
3260 proc    x3b
3261         // placeholder: every architecture branch below is just notimpl
3262 #if defined(__x86_64__)
3263
3264         notimpl
3265
3266 #elif defined(__i386__)
3267
3268         notimpl
3269
3270 #elif defined(__arm__)
3271
3272         notimpl
3273
3274 #elif defined(__aarch64__)
3275
3276         notimpl
3277
3278 #else
3279         notimpl
3280 #endif
3281
3282 endproc
3283
3284 proc    x3c
3285         // placeholder: every architecture branch below is just notimpl
3286 #if defined(__x86_64__)
3287
3288         notimpl
3289
3290 #elif defined(__i386__)
3291
3292         notimpl
3293
3294 #elif defined(__arm__)
3295
3296         notimpl
3297
3298 #elif defined(__aarch64__)
3299
3300         notimpl
3301
3302 #else
3303         notimpl
3304 #endif
3305
3306 endproc
3307
3308 proc    x3d
3309         // placeholder: every architecture branch below is just notimpl
3310 #if defined(__x86_64__)
3311
3312         notimpl
3313
3314 #elif defined(__i386__)
3315
3316         notimpl
3317
3318 #elif defined(__arm__)
3319
3320         notimpl
3321
3322 #elif defined(__aarch64__)
3323
3324         notimpl
3325
3326 #else
3327         notimpl
3328 #endif
3329
3330 endproc
3331
3332 proc    x3e
3333         // placeholder: every architecture branch below is just notimpl
3334 #if defined(__x86_64__)
3335
3336         notimpl
3337
3338 #elif defined(__i386__)
3339
3340         notimpl
3341
3342 #elif defined(__arm__)
3343
3344         notimpl
3345
3346 #elif defined(__aarch64__)
3347
3348         notimpl
3349
3350 #else
3351         notimpl
3352 #endif
3353
3354 endproc
3355
3356 proc    x3f
3357         // placeholder: every architecture branch below is just notimpl
3358 #if defined(__x86_64__)
3359
3360         notimpl
3361
3362 #elif defined(__i386__)
3363
3364         notimpl
3365
3366 #elif defined(__arm__)
3367
3368         notimpl
3369
3370 #elif defined(__aarch64__)
3371
3372         notimpl
3373
3374 #else
3375         notimpl
3376 #endif
3377
3378 endproc
3379
3380 ///----- That's all, folks --------------------------------------------------