chiark / gitweb /
tidy up conflict
[pcre3.git] / sljit / sljitNativeX86_common.c
1 /*
2  *    Stack-less Just-In-Time compiler
3  *
4  *    Copyright 2009-2012 Zoltan Herczeg (hzmester@freemail.hu). All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without modification, are
7  * permitted provided that the following conditions are met:
8  *
9  *   1. Redistributions of source code must retain the above copyright notice, this list of
10  *      conditions and the following disclaimer.
11  *
12  *   2. Redistributions in binary form must reproduce the above copyright notice, this list
13  *      of conditions and the following disclaimer in the documentation and/or other materials
14  *      provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY
17  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
19  * SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
21  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
22  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
24  * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  */
26
/* Returns a human-readable name of the architecture this backend targets.
   SLJIT_CPUINFO is a string-literal suffix appended at compile time. */
SLJIT_API_FUNC_ATTRIBUTE SLJIT_CONST char* sljit_get_platform_name(void)
{
        return "x86" SLJIT_CPUINFO;
}
31
32 /*
33    32b register indexes:
34      0 - EAX
35      1 - ECX
36      2 - EDX
37      3 - EBX
38      4 - none
39      5 - EBP
40      6 - ESI
41      7 - EDI
42 */
43
44 /*
45    64b register indexes:
46      0 - RAX
47      1 - RCX
48      2 - RDX
49      3 - RBX
50      4 - none
51      5 - RBP
52      6 - RSI
53      7 - RDI
54      8 - R8   - From now on REX prefix is required
55      9 - R9
56     10 - R10
57     11 - R11
58     12 - R12
59     13 - R13
60     14 - R14
61     15 - R15
62 */
63
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)

/* Last register + 1. */
#define TMP_REG1        (SLJIT_NO_REGISTERS + 1)

/* Maps abstract sljit register indexes to the x86-32 hardware register
   encodings listed in the table above. Index 0 (SLJIT_UNUSED) and the
   extra (E) registers map to 0; the E registers live on the stack. */
static SLJIT_CONST sljit_ub reg_map[SLJIT_NO_REGISTERS + 2] = {
        0, 0, 2, 1, 0, 0, 3, 6, 7, 0, 0, 4, 5
};

/* On x86-32 the extra (E) registers have no hardware register: rewrite
   such an operand (p, w) in place into a local-stack memory access at
   the register's home slot, then run `do`. */
#define CHECK_EXTRA_REGS(p, w, do) \
        if (p >= SLJIT_TEMPORARY_EREG1 && p <= SLJIT_TEMPORARY_EREG2) { \
                w = compiler->scratches_start + (p - SLJIT_TEMPORARY_EREG1) * sizeof(sljit_sw); \
                p = SLJIT_MEM1(SLJIT_LOCALS_REG); \
                do; \
        } \
        else if (p >= SLJIT_SAVED_EREG1 && p <= SLJIT_SAVED_EREG2) { \
                w = compiler->saveds_start + (p - SLJIT_SAVED_EREG1) * sizeof(sljit_sw); \
                p = SLJIT_MEM1(SLJIT_LOCALS_REG); \
                do; \
        }

#else /* SLJIT_CONFIG_X86_32 */

/* Last register + 1. */
#define TMP_REG1        (SLJIT_NO_REGISTERS + 1)
#define TMP_REG2        (SLJIT_NO_REGISTERS + 2)
#define TMP_REG3        (SLJIT_NO_REGISTERS + 3)

/* Note: r12 & 0x7 == 0b100, which is decoded as "SIB byte present".
   Note: avoid using r12 and r13 for memory addressing,
   therefore r12 is better for SAVED_EREG than SAVED_REG. */
#ifndef _WIN64
/* System V AMD64 ABI: 1st argument passed in rdi, 2nd in rsi, 3rd in rdx. */
static SLJIT_CONST sljit_ub reg_map[SLJIT_NO_REGISTERS + 4] = {
        0, 0, 6, 1, 8, 11, 3, 15, 14, 13, 12, 4, 2, 7, 9
};
/* low-map. reg_map & 0x7 (the 3-bit field encoded in ModRM/SIB;
   the high bit goes into the REX prefix). */
static SLJIT_CONST sljit_ub reg_lmap[SLJIT_NO_REGISTERS + 4] = {
        0, 0, 6, 1, 0, 3,  3, 7,  6,  5,  4,  4, 2, 7, 1
};
#else
/* Win64 ABI: 1st argument passed in rcx, 2nd in rdx, 3rd in r8. */
static SLJIT_CONST sljit_ub reg_map[SLJIT_NO_REGISTERS + 4] = {
        0, 0, 2, 1, 11, 13, 3, 6, 7, 14, 15, 4, 10, 8, 9
};
/* low-map. reg_map & 0x7. */
static SLJIT_CONST sljit_ub reg_lmap[SLJIT_NO_REGISTERS + 4] = {
        0, 0, 2, 1, 3,  5,  3, 6, 7,  6,  7, 4, 2,  0, 1
};
#endif

/* REX prefix byte and its W/R/X/B extension bits (x86-64 only). */
#define REX_W           0x48
#define REX_R           0x44
#define REX_X           0x42
#define REX_B           0x41
#define REX             0x40

#ifndef _WIN64
#define HALFWORD_MAX 0x7fffffffl
#define HALFWORD_MIN -0x80000000l
#else
#define HALFWORD_MAX 0x7fffffffll
#define HALFWORD_MIN -0x80000000ll
#endif

/* True when x fits into a sign-extended 32-bit immediate/displacement. */
#define IS_HALFWORD(x)          ((x) <= HALFWORD_MAX && (x) >= HALFWORD_MIN)
#define NOT_HALFWORD(x)         ((x) > HALFWORD_MAX || (x) < HALFWORD_MIN)

/* All sljit registers are real hardware registers on x86-64: no-op. */
#define CHECK_EXTRA_REGS(p, w, do)

#endif /* SLJIT_CONFIG_X86_32 */
135
#if (defined SLJIT_SSE2 && SLJIT_SSE2)
/* Index of the scratch floating point register. */
#define TMP_FREG        (0)
#endif

/* Size flags for emit_x86_instruction: */
#define EX86_BIN_INS            0x0010
#define EX86_SHIFT_INS          0x0020
#define EX86_REX                0x0040
#define EX86_NO_REXW            0x0080
#define EX86_BYTE_ARG           0x0100
#define EX86_HALF_ARG           0x0200
#define EX86_PREF_66            0x0400

#if (defined SLJIT_SSE2 && SLJIT_SSE2)
#define EX86_SSE2               0x0800
#define EX86_PREF_F2            0x1000
#define EX86_PREF_F3            0x2000
#endif

/* --------------------------------------------------------------------- */
/*  Instruction forms                                                    */
/* --------------------------------------------------------------------- */

/* Raw opcode bytes. Values wrapped in GROUP_* comments are /digit
   extensions placed in the reg field of the ModRM byte, or the second
   byte of a two-byte (0F xx) opcode. */
#define ADD             (/* BINARY */ 0 << 3)
#define ADD_EAX_i32     0x05
#define ADD_r_rm        0x03
#define ADD_rm_r        0x01
#define ADDSD_x_xm      0x58
#define ADC             (/* BINARY */ 2 << 3)
#define ADC_EAX_i32     0x15
#define ADC_r_rm        0x13
#define ADC_rm_r        0x11
#define AND             (/* BINARY */ 4 << 3)
#define AND_EAX_i32     0x25
#define AND_r_rm        0x23
#define AND_rm_r        0x21
#define ANDPD_x_xm      0x54
#define BSR_r_rm        (/* GROUP_0F */ 0xbd)
#define CALL_i32        0xe8
#define CALL_rm         (/* GROUP_FF */ 2 << 3)
#define CDQ             0x99
#define CMOVNE_r_rm     (/* GROUP_0F */ 0x45)
#define CMP             (/* BINARY */ 7 << 3)
#define CMP_EAX_i32     0x3d
#define CMP_r_rm        0x3b
#define CMP_rm_r        0x39
#define DIV             (/* GROUP_F7 */ 6 << 3)
#define DIVSD_x_xm      0x5e
#define INT3            0xcc
#define IDIV            (/* GROUP_F7 */ 7 << 3)
#define IMUL            (/* GROUP_F7 */ 5 << 3)
#define IMUL_r_rm       (/* GROUP_0F */ 0xaf)
#define IMUL_r_rm_i8    0x6b
#define IMUL_r_rm_i32   0x69
#define JE_i8           0x74
#define JMP_i8          0xeb
#define JMP_i32         0xe9
#define JMP_rm          (/* GROUP_FF */ 4 << 3)
#define LEA_r_m         0x8d
#define MOV_r_rm        0x8b
#define MOV_r_i32       0xb8
#define MOV_rm_r        0x89
#define MOV_rm_i32      0xc7
#define MOV_rm8_i8      0xc6
#define MOV_rm8_r8      0x88
#define MOVSD_x_xm      0x10
#define MOVSD_xm_x      0x11
#define MOVSXD_r_rm     0x63
#define MOVSX_r_rm8     (/* GROUP_0F */ 0xbe)
#define MOVSX_r_rm16    (/* GROUP_0F */ 0xbf)
#define MOVZX_r_rm8     (/* GROUP_0F */ 0xb6)
#define MOVZX_r_rm16    (/* GROUP_0F */ 0xb7)
#define MUL             (/* GROUP_F7 */ 4 << 3)
#define MULSD_x_xm      0x59
#define NEG_rm          (/* GROUP_F7 */ 3 << 3)
#define NOP             0x90
#define NOT_rm          (/* GROUP_F7 */ 2 << 3)
#define OR              (/* BINARY */ 1 << 3)
#define OR_r_rm         0x0b
#define OR_EAX_i32      0x0d
#define OR_rm_r         0x09
#define OR_rm8_r8       0x08
#define POP_r           0x58
#define POP_rm          0x8f
#define POPF            0x9d
#define PUSH_i32        0x68
#define PUSH_r          0x50
#define PUSH_rm         (/* GROUP_FF */ 6 << 3)
#define PUSHF           0x9c
#define RET_near        0xc3
#define RET_i16         0xc2
#define SBB             (/* BINARY */ 3 << 3)
#define SBB_EAX_i32     0x1d
#define SBB_r_rm        0x1b
#define SBB_rm_r        0x19
#define SAR             (/* SHIFT */ 7 << 3)
#define SHL             (/* SHIFT */ 4 << 3)
#define SHR             (/* SHIFT */ 5 << 3)
#define SUB             (/* BINARY */ 5 << 3)
#define SUB_EAX_i32     0x2d
#define SUB_r_rm        0x2b
#define SUB_rm_r        0x29
#define SUBSD_x_xm      0x5c
#define TEST_EAX_i32    0xa9
#define TEST_rm_r       0x85
#define UCOMISD_x_xm    0x2e
#define XCHG_EAX_r      0x90
#define XCHG_r_rm       0x87
#define XOR             (/* BINARY */ 6 << 3)
#define XOR_EAX_i32     0x35
#define XOR_r_rm        0x33
#define XOR_rm_r        0x31
#define XORPD_x_xm      0x57

/* Opcode bytes whose ModRM reg field selects the operation (/digit). */
#define GROUP_0F        0x0f
#define GROUP_F7        0xf7
#define GROUP_FF        0xff
#define GROUP_BINARY_81 0x81
#define GROUP_BINARY_83 0x83
#define GROUP_SHIFT_1   0xd1
#define GROUP_SHIFT_N   0xc1
#define GROUP_SHIFT_CL  0xd3

/* ModRM mod field values: register-direct / 8-bit displacement. */
#define MOD_REG         0xc0
#define MOD_DISP8       0x40

/* Writes the record length byte consumed by sljit_generate_code and
   accounts for the emitted instruction size. */
#define INC_SIZE(s)                     (*inst++ = (s), compiler->size += (s))

#define PUSH_REG(r)                     (*inst++ = (PUSH_r + (r)))
#define POP_REG(r)                      (*inst++ = (POP_r + (r)))
#define RET()                           (*inst++ = (RET_near))
/* Note: only the low byte of n is emitted; the high byte is always 0,
   so n must fit into 8 bits. */
#define RET_I16(n)                      (*inst++ = (RET_i16), *inst++ = n, *inst++ = 0)
/* r32, r/m32 */
#define MOV_RM(mod, reg, rm)            (*inst++ = (MOV_r_rm), *inst++ = (mod) << 6 | (reg) << 3 | (rm))
270
/* Multithreading does not affect these static variables, since they store
   built-in CPU features. Therefore they can be overwritten by different threads
   if they detect the CPU features at the same time (all writers store the
   same values). -1 means "not yet detected"; get_cpu_features() sets 0/1. */
#if (defined SLJIT_SSE2 && SLJIT_SSE2) && (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
static sljit_si cpu_has_sse2 = -1;
#endif
static sljit_si cpu_has_cmov = -1;

#if defined(_MSC_VER) && _MSC_VER >= 1400
#include <intrin.h>
#endif
282
/* Executes CPUID leaf 1 and caches the EDX feature bits in the
   cpu_has_* statics above. Three variants: the MSVC intrinsic,
   GCC-style inline assembly, and MSVC Intel-syntax inline assembly. */
static void get_cpu_features(void)
{
        sljit_ui features;

#if defined(_MSC_VER) && _MSC_VER >= 1400

        /* MSVC 2005+ intrinsic; EDX comes back in CPUInfo[3]. */
        int CPUInfo[4];
        __cpuid(CPUInfo, 1);
        features = (sljit_ui)CPUInfo[3];

#elif defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__SUNPRO_C)

        /* AT&T syntax. */
        __asm__ (
                "movl $0x1, %%eax\n"
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                /* On x86-32, there is no red zone, so this
                   should work (no need for a local variable).
                   EBX is saved/restored manually rather than listed
                   as clobbered (it may serve as the PIC register). */
                "push %%ebx\n"
#endif
                "cpuid\n"
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                "pop %%ebx\n"
#endif
                "movl %%edx, %0\n"
                : "=g" (features)
                :
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                : "%eax", "%ecx", "%edx"
#else
                : "%rax", "%rbx", "%rcx", "%rdx"
#endif
        );

#else /* _MSC_VER && _MSC_VER >= 1400 */

        /* Intel syntax. */
        __asm {
                mov eax, 1
                cpuid
                mov features, edx
        }

#endif /* _MSC_VER && _MSC_VER >= 1400 */

#if (defined SLJIT_SSE2 && SLJIT_SSE2) && (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
        cpu_has_sse2 = (features >> 26) & 0x1; /* CPUID.1:EDX bit 26 = SSE2. */
#endif
        cpu_has_cmov = (features >> 15) & 0x1; /* CPUID.1:EDX bit 15 = CMOV. */
}
333
/* Maps an sljit condition code to the second byte of the corresponding
   two-byte (0F xx) Jcc rel32 opcode. The rel8 form is obtained by
   subtracting 0x10 from the returned byte (see generate_near_jump_code).
   Floating point conditions share the unsigned branch opcodes (UCOMISD
   sets CF/ZF/PF like an unsigned compare). Returns 0 for unknown types. */
static sljit_ub get_jump_code(sljit_si type)
{
        switch (type) {
        case SLJIT_C_EQUAL:
        case SLJIT_C_FLOAT_EQUAL:
                return 0x84 /* je */;

        case SLJIT_C_NOT_EQUAL:
        case SLJIT_C_FLOAT_NOT_EQUAL:
                return 0x85 /* jne */;

        case SLJIT_C_LESS:
        case SLJIT_C_FLOAT_LESS:
                return 0x82 /* jc */;

        case SLJIT_C_GREATER_EQUAL:
        case SLJIT_C_FLOAT_GREATER_EQUAL:
                return 0x83 /* jae */;

        case SLJIT_C_GREATER:
        case SLJIT_C_FLOAT_GREATER:
                return 0x87 /* jnbe */;

        case SLJIT_C_LESS_EQUAL:
        case SLJIT_C_FLOAT_LESS_EQUAL:
                return 0x86 /* jbe */;

        case SLJIT_C_SIG_LESS:
                return 0x8c /* jl */;

        case SLJIT_C_SIG_GREATER_EQUAL:
                return 0x8d /* jnl */;

        case SLJIT_C_SIG_GREATER:
                return 0x8f /* jnle */;

        case SLJIT_C_SIG_LESS_EQUAL:
                return 0x8e /* jle */;

        case SLJIT_C_OVERFLOW:
        case SLJIT_C_MUL_OVERFLOW:
                return 0x80 /* jo */;

        case SLJIT_C_NOT_OVERFLOW:
        case SLJIT_C_MUL_NOT_OVERFLOW:
                return 0x81 /* jno */;

        case SLJIT_C_FLOAT_UNORDERED:
                return 0x8a /* jp */;

        case SLJIT_C_FLOAT_ORDERED:
                return 0x8b /* jpo */;
        }
        return 0;
}
389
390 static sljit_ub* generate_far_jump_code(struct sljit_jump *jump, sljit_ub *code_ptr, sljit_si type);
391
392 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
393 static sljit_ub* generate_fixed_jump(sljit_ub *code_ptr, sljit_sw addr, sljit_si type);
394 #endif
395
/* Emits the opcode of a pc-relative jump or call, picking the short
   (rel8) form when the estimated distance fits into a signed byte.
   Only the opcode bytes are written here; the displacement field is
   reserved and filled in later by the PATCH_MB / PATCH_MW pass in
   sljit_generate_code. jump->addr is advanced to the displacement. */
static sljit_ub* generate_near_jump_code(struct sljit_jump *jump, sljit_ub *code_ptr, sljit_ub *code, sljit_si type)
{
        sljit_si short_jump;
        sljit_uw label_addr;

        if (jump->flags & JUMP_LABEL)
                label_addr = (sljit_uw)(code + jump->u.label->size);
        else
                label_addr = jump->u.target;
        /* Distance measured from the end of a two byte short jump. */
        short_jump = (sljit_sw)(label_addr - (jump->addr + 2)) >= -128 && (sljit_sw)(label_addr - (jump->addr + 2)) <= 127;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
        /* Target outside rel32 range: use an absolute far jump instead. */
        if ((sljit_sw)(label_addr - (jump->addr + 1)) > HALFWORD_MAX || (sljit_sw)(label_addr - (jump->addr + 1)) < HALFWORD_MIN)
                return generate_far_jump_code(jump, code_ptr, type);
#endif

        if (type == SLJIT_JUMP) {
                if (short_jump)
                        *code_ptr++ = JMP_i8;
                else
                        *code_ptr++ = JMP_i32;
                jump->addr++;
        }
        else if (type >= SLJIT_FAST_CALL) {
                /* CALL has no rel8 form. */
                short_jump = 0;
                *code_ptr++ = CALL_i32;
                jump->addr++;
        }
        else if (short_jump) {
                /* Jcc rel8 opcode is the (0F xx) rel32 opcode byte - 0x10. */
                *code_ptr++ = get_jump_code(type) - 0x10;
                jump->addr++;
        }
        else {
                *code_ptr++ = GROUP_0F;
                *code_ptr++ = get_jump_code(type);
                jump->addr += 2;
        }

        /* Reserve room for the displacement and tag it for patching. */
        if (short_jump) {
                jump->flags |= PATCH_MB;
                code_ptr += sizeof(sljit_sb);
        } else {
                jump->flags |= PATCH_MW;
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                code_ptr += sizeof(sljit_sw);
#else
                code_ptr += sizeof(sljit_si);
#endif
        }

        return code_ptr;
}
448
/* Second pass of code generation: copies the instruction records
   collected in the compiler buffers into a single executable memory
   block, assigns final addresses to labels/consts, and patches all
   jump displacements. Returns the executable code or NULL on failure. */
SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compiler)
{
        struct sljit_memory_fragment *buf;
        sljit_ub *code;
        sljit_ub *code_ptr;
        sljit_ub *buf_ptr;
        sljit_ub *buf_end;
        sljit_ub len;

        struct sljit_label *label;
        struct sljit_jump *jump;
        struct sljit_const *const_;

        CHECK_ERROR_PTR();
        check_sljit_generate_code(compiler);
        reverse_buf(compiler);

        /* Second code generation pass. */
        code = (sljit_ub*)SLJIT_MALLOC_EXEC(compiler->size);
        PTR_FAIL_WITH_EXEC_IF(code);
        buf = compiler->buf;

        code_ptr = code;
        label = compiler->labels;
        jump = compiler->jumps;
        const_ = compiler->consts;
        do {
                buf_ptr = buf->memory;
                buf_end = buf_ptr + buf->used_size;
                do {
                        /* Record format: a non-zero length byte is followed by
                           that many bytes of already-generated machine code;
                           a zero length byte introduces a tag byte:
                           0 = label, 1 = const, 2/3 = fixed call/jump,
                           >= 4 = jump whose type is (tag - 4). */
                        len = *buf_ptr++;
                        if (len > 0) {
                                /* The code is already generated. */
                                SLJIT_MEMMOVE(code_ptr, buf_ptr, len);
                                code_ptr += len;
                                buf_ptr += len;
                        }
                        else {
                                if (*buf_ptr >= 4) {
                                        jump->addr = (sljit_uw)code_ptr;
                                        if (!(jump->flags & SLJIT_REWRITABLE_JUMP))
                                                code_ptr = generate_near_jump_code(jump, code_ptr, code, *buf_ptr - 4);
                                        else
                                                code_ptr = generate_far_jump_code(jump, code_ptr, *buf_ptr - 4);
                                        jump = jump->next;
                                }
                                else if (*buf_ptr == 0) {
                                        label->addr = (sljit_uw)code_ptr;
                                        label->size = code_ptr - code;
                                        label = label->next;
                                }
                                else if (*buf_ptr == 1) {
                                        /* The constant occupies the last word of
                                           the instruction emitted just before. */
                                        const_->addr = ((sljit_uw)code_ptr) - sizeof(sljit_sw);
                                        const_ = const_->next;
                                }
                                else {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                                        /* Fixed call/jump: emit opcode plus the
                                           pc-relative displacement immediately. */
                                        *code_ptr++ = (*buf_ptr == 2) ? CALL_i32 : JMP_i32;
                                        buf_ptr++;
                                        *(sljit_sw*)code_ptr = *(sljit_sw*)buf_ptr - ((sljit_sw)code_ptr + sizeof(sljit_sw));
                                        code_ptr += sizeof(sljit_sw);
                                        buf_ptr += sizeof(sljit_sw) - 1;
#else
                                        code_ptr = generate_fixed_jump(code_ptr, *(sljit_sw*)(buf_ptr + 1), *buf_ptr);
                                        buf_ptr += sizeof(sljit_sw);
#endif
                                }
                                buf_ptr++;
                        }
                } while (buf_ptr < buf_end);
                SLJIT_ASSERT(buf_ptr == buf_end);
                buf = buf->next;
        } while (buf);

        /* Every label, jump and const record must have been consumed. */
        SLJIT_ASSERT(!label);
        SLJIT_ASSERT(!jump);
        SLJIT_ASSERT(!const_);

        /* Patch displacements now that all label addresses are final. */
        jump = compiler->jumps;
        while (jump) {
                if (jump->flags & PATCH_MB) {
                        /* 8-bit pc-relative displacement. */
                        SLJIT_ASSERT((sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_sb))) >= -128 && (sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_sb))) <= 127);
                        *(sljit_ub*)jump->addr = (sljit_ub)(jump->u.label->addr - (jump->addr + sizeof(sljit_sb)));
                } else if (jump->flags & PATCH_MW) {
                        /* Word-sized (rel32 on x86-64) displacement. */
                        if (jump->flags & JUMP_LABEL) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                                *(sljit_sw*)jump->addr = (sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_sw)));
#else
                                SLJIT_ASSERT((sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_si))) >= HALFWORD_MIN && (sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_si))) <= HALFWORD_MAX);
                                *(sljit_si*)jump->addr = (sljit_si)(jump->u.label->addr - (jump->addr + sizeof(sljit_si)));
#endif
                        }
                        else {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                                *(sljit_sw*)jump->addr = (sljit_sw)(jump->u.target - (jump->addr + sizeof(sljit_sw)));
#else
                                SLJIT_ASSERT((sljit_sw)(jump->u.target - (jump->addr + sizeof(sljit_si))) >= HALFWORD_MIN && (sljit_sw)(jump->u.target - (jump->addr + sizeof(sljit_si))) <= HALFWORD_MAX);
                                *(sljit_si*)jump->addr = (sljit_si)(jump->u.target - (jump->addr + sizeof(sljit_si)));
#endif
                        }
                }
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
                else if (jump->flags & PATCH_MD)
                        /* Absolute 64-bit address. */
                        *(sljit_sw*)jump->addr = jump->u.label->addr;
#endif

                jump = jump->next;
        }

        /* Maybe we waste some space because of short jumps. */
        SLJIT_ASSERT(code_ptr <= code + compiler->size);
        compiler->error = SLJIT_ERR_COMPILED;
        compiler->executable_size = code_ptr - code;
        return (void*)code;
}
564
/* --------------------------------------------------------------------- */
/*  Operators                                                            */
/* --------------------------------------------------------------------- */

/* Forward declarations; definitions follow later in this file and in the
   included 32/64-bit specific source. */

/* Commutative binary op (add/and/or/xor/...): operands may be swapped. */
static sljit_si emit_cum_binary(struct sljit_compiler *compiler,
        sljit_ub op_rm, sljit_ub op_mr, sljit_ub op_imm, sljit_ub op_eax_imm,
        sljit_si dst, sljit_sw dstw,
        sljit_si src1, sljit_sw src1w,
        sljit_si src2, sljit_sw src2w);

/* Non-commutative binary op (sub/sbb/...): operand order is preserved. */
static sljit_si emit_non_cum_binary(struct sljit_compiler *compiler,
        sljit_ub op_rm, sljit_ub op_mr, sljit_ub op_imm, sljit_ub op_eax_imm,
        sljit_si dst, sljit_sw dstw,
        sljit_si src1, sljit_sw src1w,
        sljit_si src2, sljit_sw src2w);

static sljit_si emit_mov(struct sljit_compiler *compiler,
        sljit_si dst, sljit_sw dstw,
        sljit_si src, sljit_sw srcw);
584
/* Saves the CPU status flags into the word at the current stack top:
   LEA first moves esp/rsp one word up (LEA is used because, unlike
   add/sub, it does not modify the flags), then PUSHF stores the flags
   and restores the original stack pointer.
   NOTE(review): this overwrites the word at [esp]/[rsp] — assumes that
   slot is reserved as scratch by the calling convention; confirm with
   the frame layout in sljitNativeX86_32/64.c. */
static SLJIT_INLINE sljit_si emit_save_flags(struct sljit_compiler *compiler)
{
        sljit_ub *inst;

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
        inst = (sljit_ub*)ensure_buf(compiler, 1 + 5);
        FAIL_IF(!inst);
        INC_SIZE(5);
#else
        /* REX.W makes the LEA operate on the full 64-bit rsp. */
        inst = (sljit_ub*)ensure_buf(compiler, 1 + 6);
        FAIL_IF(!inst);
        INC_SIZE(6);
        *inst++ = REX_W;
#endif
        *inst++ = LEA_r_m; /* lea esp/rsp, [esp/rsp + sizeof(sljit_sw)] */
        *inst++ = 0x64;
        *inst++ = 0x24;
        *inst++ = (sljit_ub)sizeof(sljit_sw);
        *inst++ = PUSHF;
        compiler->flags_saved = 1;
        return SLJIT_SUCCESS;
}
607
/* Restores the CPU status flags saved by emit_save_flags: POPF reloads
   the flags from the stack top, then LEA moves esp/rsp back down one
   word without disturbing the just-restored flags (the exact mirror of
   the save sequence). keep_flags becomes the new flags_saved state. */
static SLJIT_INLINE sljit_si emit_restore_flags(struct sljit_compiler *compiler, sljit_si keep_flags)
{
        sljit_ub *inst;

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
        inst = (sljit_ub*)ensure_buf(compiler, 1 + 5);
        FAIL_IF(!inst);
        INC_SIZE(5);
        *inst++ = POPF;
#else
        /* REX.W applies to the LEA that follows POPF. */
        inst = (sljit_ub*)ensure_buf(compiler, 1 + 6);
        FAIL_IF(!inst);
        INC_SIZE(6);
        *inst++ = POPF;
        *inst++ = REX_W;
#endif
        *inst++ = LEA_r_m; /* lea esp/rsp, [esp/rsp - sizeof(sljit_sw)] */
        *inst++ = 0x64;
        *inst++ = 0x24;
        *inst++ = (sljit_ub)-(sljit_sb)sizeof(sljit_sw);
        compiler->flags_saved = keep_flags;
        return SLJIT_SUCCESS;
}
631
632 #ifdef _WIN32
633 #include <malloc.h>
634
static void SLJIT_CALL sljit_grow_stack(sljit_sw local_size)
{
        /* Workaround for calling the internal _chkstk() function on Windows.
        This function touches all 4k pages belonging to the requested stack
        space, whose size is passed in local_size. This is necessary on
        Windows where the stack can only grow in 4k steps. However, this
        function just burns CPU cycles if the stack is large enough.
        Unfortunately, you don't know that in advance, so it must always be
        called. I think this is a bad design in general even if it has some
        reasons. The volatile store keeps the alloca() from being optimized
        away. */
        *(volatile sljit_si*)alloca(local_size) = 0;
}
646
647 #endif
648
649 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
650 #include "sljitNativeX86_32.c"
651 #else
652 #include "sljitNativeX86_64.c"
653 #endif
654
/* Emits a move between any combination of register, memory and
   immediate operands. Memory-to-memory moves are split into a load
   into TMP_REG1 followed by a store. Returns an sljit error code. */
static sljit_si emit_mov(struct sljit_compiler *compiler,
        sljit_si dst, sljit_sw dstw,
        sljit_si src, sljit_sw srcw)
{
        sljit_ub* inst;

        if (dst == SLJIT_UNUSED) {
                /* No destination, doesn't need to setup flags. */
                if (src & SLJIT_MEM) {
                        /* The load is still emitted even though the value is
                           discarded — presumably to preserve the memory
                           access itself; confirm with callers. */
                        inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src, srcw);
                        FAIL_IF(!inst);
                        *inst = MOV_r_rm;
                }
                return SLJIT_SUCCESS;
        }
        if (FAST_IS_REG(src)) {
                /* Register source: store with MOV r/m, r. */
                inst = emit_x86_instruction(compiler, 1, src, 0, dst, dstw);
                FAIL_IF(!inst);
                *inst = MOV_rm_r;
                return SLJIT_SUCCESS;
        }
        if (src & SLJIT_IMM) {
                if (FAST_IS_REG(dst)) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                        return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
#else
                        if (!compiler->mode32) {
                                /* Immediates wider than 32 bits need the
                                   64-bit load sequence. */
                                if (NOT_HALFWORD(srcw))
                                        return emit_load_imm64(compiler, dst, srcw);
                        }
                        else
                                return emit_do_imm32(compiler, (reg_map[dst] >= 8) ? REX_B : 0, MOV_r_i32 + reg_lmap[dst], srcw);
#endif
                }
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
                if (!compiler->mode32 && NOT_HALFWORD(srcw)) {
                        /* No 64-bit immediate-to-memory form exists: load the
                           immediate into TMP_REG2, then store the register. */
                        FAIL_IF(emit_load_imm64(compiler, TMP_REG2, srcw));
                        inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, dst, dstw);
                        FAIL_IF(!inst);
                        *inst = MOV_rm_r;
                        return SLJIT_SUCCESS;
                }
#endif
                inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, dstw);
                FAIL_IF(!inst);
                *inst = MOV_rm_i32;
                return SLJIT_SUCCESS;
        }
        if (FAST_IS_REG(dst)) {
                /* Memory source, register destination: MOV r, r/m. */
                inst = emit_x86_instruction(compiler, 1, dst, 0, src, srcw);
                FAIL_IF(!inst);
                *inst = MOV_r_rm;
                return SLJIT_SUCCESS;
        }

        /* Memory to memory move. Requires two instructions. */
        inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src, srcw);
        FAIL_IF(!inst);
        *inst = MOV_r_rm;
        inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
        FAIL_IF(!inst);
        *inst = MOV_rm_r;
        return SLJIT_SUCCESS;
}
719
720 #define EMIT_MOV(compiler, dst, dstw, src, srcw) \
721         FAIL_IF(emit_mov(compiler, dst, dstw, src, srcw));
722
/* Emits a zero-operand operation: breakpoint (INT3), NOP, or one of
   the widening multiply / divide ops that implicitly use the
   EAX/EDX (RAX/RDX) register pair. Returns an sljit error code. */
SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler, sljit_si op)
{
        sljit_ub *inst;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
        sljit_si size;
#endif

        CHECK_ERROR();
        check_sljit_emit_op0(compiler, op);

        switch (GET_OPCODE(op)) {
        case SLJIT_BREAKPOINT:
                inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
                FAIL_IF(!inst);
                INC_SIZE(1);
                *inst = INT3;
                break;
        case SLJIT_NOP:
                inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
                FAIL_IF(!inst);
                INC_SIZE(1);
                *inst = NOP;
                break;
        case SLJIT_UMUL:
        case SLJIT_SMUL:
        case SLJIT_UDIV:
        case SLJIT_SDIV:
                /* MUL/DIV clobber the flags; any saved-flags state is void. */
                compiler->flags_saved = 0;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
#ifdef _WIN64
                /* The code below hard-wires the rax/rdx pair; verify the
                   register mapping matches at compile time. */
                SLJIT_COMPILE_ASSERT(
                        reg_map[SLJIT_SCRATCH_REG1] == 0
                        && reg_map[SLJIT_SCRATCH_REG2] == 2
                        && reg_map[TMP_REG1] > 7,
                        invalid_register_assignment_for_div_mul);
#else
                SLJIT_COMPILE_ASSERT(
                        reg_map[SLJIT_SCRATCH_REG1] == 0
                        && reg_map[SLJIT_SCRATCH_REG2] < 7
                        && reg_map[TMP_REG1] == 2,
                        invalid_register_assignment_for_div_mul);
#endif
                compiler->mode32 = op & SLJIT_INT_OP;
#endif

                op = GET_OPCODE(op);
                if (op == SLJIT_UDIV) {
                        /* Unsigned divide: the high half of the dividend
                           (edx/rdx) must be zeroed with XOR first. */
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) || defined(_WIN64)
                        /* SCRATCH_REG2 is edx here: preserve its value in
                           TMP_REG1 before clearing it. */
                        EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_SCRATCH_REG2, 0);
                        inst = emit_x86_instruction(compiler, 1, SLJIT_SCRATCH_REG2, 0, SLJIT_SCRATCH_REG2, 0);
#else
                        /* TMP_REG1 is rdx (see the compile assert above). */
                        inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
#endif
                        FAIL_IF(!inst);
                        *inst = XOR_r_rm;
                }

                if (op == SLJIT_SDIV) {
                        /* Signed divide: sign-extend eax into edx (CDQ),
                           or rax into rdx (REX.W + CDQ == CQO). */
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) || defined(_WIN64)
                        EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_SCRATCH_REG2, 0);
#endif

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                        inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
                        FAIL_IF(!inst);
                        INC_SIZE(1);
                        *inst = CDQ;
#else
                        if (compiler->mode32) {
                                inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
                                FAIL_IF(!inst);
                                INC_SIZE(1);
                                *inst = CDQ;
                        } else {
                                inst = (sljit_ub*)ensure_buf(compiler, 1 + 2);
                                FAIL_IF(!inst);
                                INC_SIZE(2);
                                *inst++ = REX_W;
                                *inst = CDQ;
                        }
#endif
                }

                /* Emit the F7 /digit instruction; the /digit (MUL, IMUL,
                   DIV or IDIV) is OR-ed into the ModRM byte below. */
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                inst = (sljit_ub*)ensure_buf(compiler, 1 + 2);
                FAIL_IF(!inst);
                INC_SIZE(2);
                *inst++ = GROUP_F7;
                *inst = MOD_REG | ((op >= SLJIT_UDIV) ? reg_map[TMP_REG1] : reg_map[SLJIT_SCRATCH_REG2]);
#else
#ifdef _WIN64
                size = (!compiler->mode32 || op >= SLJIT_UDIV) ? 3 : 2;
#else
                size = (!compiler->mode32) ? 3 : 2;
#endif
                inst = (sljit_ub*)ensure_buf(compiler, 1 + size);
                FAIL_IF(!inst);
                INC_SIZE(size);
#ifdef _WIN64
                if (!compiler->mode32)
                        *inst++ = REX_W | ((op >= SLJIT_UDIV) ? REX_B : 0);
                else if (op >= SLJIT_UDIV)
                        *inst++ = REX_B;
                *inst++ = GROUP_F7;
                *inst = MOD_REG | ((op >= SLJIT_UDIV) ? reg_lmap[TMP_REG1] : reg_lmap[SLJIT_SCRATCH_REG2]);
#else
                if (!compiler->mode32)
                        *inst++ = REX_W;
                *inst++ = GROUP_F7;
                *inst = MOD_REG | reg_map[SLJIT_SCRATCH_REG2];
#endif
#endif
                switch (op) {
                case SLJIT_UMUL:
                        *inst |= MUL;
                        break;
                case SLJIT_SMUL:
                        *inst |= IMUL;
                        break;
                case SLJIT_UDIV:
                        *inst |= DIV;
                        break;
                case SLJIT_SDIV:
                        *inst |= IDIV;
                        break;
                }
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) && !defined(_WIN64)
                /* TMP_REG1 is rdx: move the high result / remainder back
                   into the register callers read it from. */
                EMIT_MOV(compiler, SLJIT_SCRATCH_REG2, 0, TMP_REG1, 0);
#endif
                break;
        }

        return SLJIT_SUCCESS;
}
857
/* Emits a single raw byte (an instruction prefix or a one-byte opcode,
   e.g. XCHG_EAX_r + reg) directly into the code buffer.  Uses the local
   'inst' variable and may return from the enclosing function on
   allocation failure (via FAIL_IF). */
#define ENCODE_PREFIX(prefix) \
	do { \
		inst = (sljit_ub*)ensure_buf(compiler, 1 + 1); \
		FAIL_IF(!inst); \
		INC_SIZE(1); \
		*inst = (prefix); \
	} while (0)
865
/* Emits an 8-bit move from 'src'/'srcw' to 'dst'/'dstw'; when the
   destination is wider than a byte the value is widened with MOVSX
   (sign != 0) or MOVZX (sign == 0).
   On x86-32 only registers mapped to 0..3 (EAX..EBX) have byte forms
   (AL..BL), so values living in ESI/EDI/EBP need extra shuffling:
   either a copy through a low register, a shift/mask emulation, or an
   XCHG with a temporarily borrowed scratch register.  Returns an sljit
   error code. */
static sljit_si emit_mov_byte(struct sljit_compiler *compiler, sljit_si sign,
	sljit_si dst, sljit_sw dstw,
	sljit_si src, sljit_sw srcw)
{
	sljit_ub* inst;
	sljit_si dst_r;
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
	sljit_si work_r;
#endif

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	compiler->mode32 = 0;
#endif

	if (dst == SLJIT_UNUSED && !(src & SLJIT_MEM))
		return SLJIT_SUCCESS; /* Empty instruction. */

	if (src & SLJIT_IMM) {
		if (FAST_IS_REG(dst)) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
			/* Full-width mov reg, imm32: cheapest way to load a byte value. */
			return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
#else
			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, 0);
			FAIL_IF(!inst);
			*inst = MOV_rm_i32;
			return SLJIT_SUCCESS;
#endif
		}
		/* Byte store of an immediate into memory. */
		inst = emit_x86_instruction(compiler, 1 | EX86_BYTE_ARG | EX86_NO_REXW, SLJIT_IMM, srcw, dst, dstw);
		FAIL_IF(!inst);
		*inst = MOV_rm8_i8;
		return SLJIT_SUCCESS;
	}

	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;

	if ((dst & SLJIT_MEM) && FAST_IS_REG(src)) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		if (reg_map[src] >= 4) {
			/* src has no byte form; copy it into TMP_REG1 first. */
			SLJIT_ASSERT(dst_r == TMP_REG1);
			EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
		} else
			dst_r = src;
#else
		dst_r = src;
#endif
	}
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
	else if (FAST_IS_REG(src) && reg_map[src] >= 4) {
		/* src, dst are registers. */
		SLJIT_ASSERT(SLOW_IS_REG(dst));
		if (reg_map[dst] < 4) {
			/* dst is byte-addressable: copy, then extend in place. */
			if (dst != src)
				EMIT_MOV(compiler, dst, 0, src, 0);
			inst = emit_x86_instruction(compiler, 2, dst, 0, dst, 0);
			FAIL_IF(!inst);
			*inst++ = GROUP_0F;
			*inst = sign ? MOVSX_r_rm8 : MOVZX_r_rm8;
		}
		else {
			/* Neither register has a byte form: emulate the extension
			   with shifts (signed) or a mask (unsigned). */
			if (dst != src)
				EMIT_MOV(compiler, dst, 0, src, 0);
			if (sign) {
				/* shl reg, 24 */
				inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 24, dst, 0);
				FAIL_IF(!inst);
				*inst |= SHL;
				/* sar reg, 24 */
				inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 24, dst, 0);
				FAIL_IF(!inst);
				*inst |= SAR;
			}
			else {
				/* and reg, 0xff */
				inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 0xff, dst, 0);
				FAIL_IF(!inst);
				*(inst + 1) |= AND;
			}
		}
		return SLJIT_SUCCESS;
	}
#endif
	else {
		/* src can be memory addr or reg_map[src] < 4 on x86_32 architectures. */
		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
		FAIL_IF(!inst);
		*inst++ = GROUP_0F;
		*inst = sign ? MOVSX_r_rm8 : MOVZX_r_rm8;
	}

	if (dst & SLJIT_MEM) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		if (dst_r == TMP_REG1) {
			/* Find a non-used register, whose reg_map[src] < 4. */
			if ((dst & REG_MASK) == SLJIT_SCRATCH_REG1) {
				if ((dst & OFFS_REG_MASK) == TO_OFFS_REG(SLJIT_SCRATCH_REG2))
					work_r = SLJIT_SCRATCH_REG3;
				else
					work_r = SLJIT_SCRATCH_REG2;
			}
			else {
				if ((dst & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_SCRATCH_REG1))
					work_r = SLJIT_SCRATCH_REG1;
				else if ((dst & REG_MASK) == SLJIT_SCRATCH_REG2)
					work_r = SLJIT_SCRATCH_REG3;
				else
					work_r = SLJIT_SCRATCH_REG2;
			}

			/* Borrow work_r: swap the value in, store its low byte,
			   then swap back to restore work_r's original content.
			   XCHG with EAX has a dedicated one-byte encoding. */
			if (work_r == SLJIT_SCRATCH_REG1) {
				ENCODE_PREFIX(XCHG_EAX_r + reg_map[TMP_REG1]);
			}
			else {
				inst = emit_x86_instruction(compiler, 1, work_r, 0, dst_r, 0);
				FAIL_IF(!inst);
				*inst = XCHG_r_rm;
			}

			inst = emit_x86_instruction(compiler, 1, work_r, 0, dst, dstw);
			FAIL_IF(!inst);
			*inst = MOV_rm8_r8;

			if (work_r == SLJIT_SCRATCH_REG1) {
				ENCODE_PREFIX(XCHG_EAX_r + reg_map[TMP_REG1]);
			}
			else {
				inst = emit_x86_instruction(compiler, 1, work_r, 0, dst_r, 0);
				FAIL_IF(!inst);
				*inst = XCHG_r_rm;
			}
		}
		else {
			inst = emit_x86_instruction(compiler, 1, dst_r, 0, dst, dstw);
			FAIL_IF(!inst);
			*inst = MOV_rm8_r8;
		}
#else
		/* On x86-64 a REX prefix makes every register byte-addressable. */
		inst = emit_x86_instruction(compiler, 1 | EX86_REX | EX86_NO_REXW, dst_r, 0, dst, dstw);
		FAIL_IF(!inst);
		*inst = MOV_rm8_r8;
#endif
	}

	return SLJIT_SUCCESS;
}
1010
/* Emits a 16-bit move from 'src'/'srcw' to 'dst'/'dstw', widening with
   MOVSX (sign != 0) or MOVZX (sign == 0) when loading into a register.
   16-bit memory accesses carry the 0x66 operand-size prefix
   (EX86_PREF_66).  Returns an sljit error code. */
static sljit_si emit_mov_half(struct sljit_compiler *compiler, sljit_si sign,
	sljit_si dst, sljit_sw dstw,
	sljit_si src, sljit_sw srcw)
{
	sljit_ub* inst;
	sljit_si dst_r;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	compiler->mode32 = 0;
#endif

	if (dst == SLJIT_UNUSED && !(src & SLJIT_MEM))
		return SLJIT_SUCCESS; /* Empty instruction. */

	if (src & SLJIT_IMM) {
		if (FAST_IS_REG(dst)) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
			/* Full-width mov reg, imm32 also covers the halfword case. */
			return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
#else
			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, 0);
			FAIL_IF(!inst);
			*inst = MOV_rm_i32;
			return SLJIT_SUCCESS;
#endif
		}
		/* 16-bit immediate store into memory. */
		inst = emit_x86_instruction(compiler, 1 | EX86_HALF_ARG | EX86_NO_REXW | EX86_PREF_66, SLJIT_IMM, srcw, dst, dstw);
		FAIL_IF(!inst);
		*inst = MOV_rm_i32;
		return SLJIT_SUCCESS;
	}

	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;

	if ((dst & SLJIT_MEM) && FAST_IS_REG(src))
		/* Register -> memory: the store below writes 'src' directly. */
		dst_r = src;
	else {
		/* Widen with movsx/movzx r, r/m16 into dst_r. */
		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
		FAIL_IF(!inst);
		*inst++ = GROUP_0F;
		*inst = sign ? MOVSX_r_rm16 : MOVZX_r_rm16;
	}

	if (dst & SLJIT_MEM) {
		inst = emit_x86_instruction(compiler, 1 | EX86_NO_REXW | EX86_PREF_66, dst_r, 0, dst, dstw);
		FAIL_IF(!inst);
		*inst = MOV_rm_r;
	}

	return SLJIT_SUCCESS;
}
1061
1062 static sljit_si emit_unary(struct sljit_compiler *compiler, sljit_ub opcode,
1063         sljit_si dst, sljit_sw dstw,
1064         sljit_si src, sljit_sw srcw)
1065 {
1066         sljit_ub* inst;
1067
1068         if (dst == SLJIT_UNUSED) {
1069                 EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
1070                 inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
1071                 FAIL_IF(!inst);
1072                 *inst++ = GROUP_F7;
1073                 *inst |= opcode;
1074                 return SLJIT_SUCCESS;
1075         }
1076         if (dst == src && dstw == srcw) {
1077                 /* Same input and output */
1078                 inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
1079                 FAIL_IF(!inst);
1080                 *inst++ = GROUP_F7;
1081                 *inst |= opcode;
1082                 return SLJIT_SUCCESS;
1083         }
1084         if (FAST_IS_REG(dst)) {
1085                 EMIT_MOV(compiler, dst, 0, src, srcw);
1086                 inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
1087                 FAIL_IF(!inst);
1088                 *inst++ = GROUP_F7;
1089                 *inst |= opcode;
1090                 return SLJIT_SUCCESS;
1091         }
1092         EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
1093         inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
1094         FAIL_IF(!inst);
1095         *inst++ = GROUP_F7;
1096         *inst |= opcode;
1097         EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1098         return SLJIT_SUCCESS;
1099 }
1100
1101 static sljit_si emit_not_with_flags(struct sljit_compiler *compiler,
1102         sljit_si dst, sljit_sw dstw,
1103         sljit_si src, sljit_sw srcw)
1104 {
1105         sljit_ub* inst;
1106
1107         if (dst == SLJIT_UNUSED) {
1108                 EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
1109                 inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
1110                 FAIL_IF(!inst);
1111                 *inst++ = GROUP_F7;
1112                 *inst |= NOT_rm;
1113                 inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
1114                 FAIL_IF(!inst);
1115                 *inst = OR_r_rm;
1116                 return SLJIT_SUCCESS;
1117         }
1118         if (FAST_IS_REG(dst)) {
1119                 EMIT_MOV(compiler, dst, 0, src, srcw);
1120                 inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
1121                 FAIL_IF(!inst);
1122                 *inst++ = GROUP_F7;
1123                 *inst |= NOT_rm;
1124                 inst = emit_x86_instruction(compiler, 1, dst, 0, dst, 0);
1125                 FAIL_IF(!inst);
1126                 *inst = OR_r_rm;
1127                 return SLJIT_SUCCESS;
1128         }
1129         EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
1130         inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
1131         FAIL_IF(!inst);
1132         *inst++ = GROUP_F7;
1133         *inst |= NOT_rm;
1134         inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
1135         FAIL_IF(!inst);
1136         *inst = OR_r_rm;
1137         EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1138         return SLJIT_SUCCESS;
1139 }
1140
/* Emits a count-leading-zeros sequence using BSR.  BSR sets ZF and
   leaves its destination undefined when the input is zero, so the code
   preloads dst_r with an "all bits" fallback (mov does not clobber the
   flags) and overwrites it with the BSR result via CMOVNE — or an
   emulated JE-over-MOV when CMOV is unavailable.  The final XOR with
   31 (or 63) converts the bit index into the leading-zero count; for a
   zero input the preloaded 32+31 (resp. 64+63) XORs to 32 (resp. 64). */
static sljit_si emit_clz(struct sljit_compiler *compiler, sljit_si op_flags,
	sljit_si dst, sljit_sw dstw,
	sljit_si src, sljit_sw srcw)
{
	sljit_ub* inst;
	sljit_si dst_r;

	SLJIT_UNUSED_ARG(op_flags);
	if (SLJIT_UNLIKELY(dst == SLJIT_UNUSED)) {
		/* Just set the zero flag. */
		EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
		inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
		FAIL_IF(!inst);
		*inst++ = GROUP_F7;
		*inst |= NOT_rm;
		/* NOT + SHR by (width-1): leaves 1 iff the input was zero,
		   and SHR sets ZF accordingly. */
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 31, TMP_REG1, 0);
#else
		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, !(op_flags & SLJIT_INT_OP) ? 63 : 31, TMP_REG1, 0);
#endif
		FAIL_IF(!inst);
		*inst |= SHR;
		return SLJIT_SUCCESS;
	}

	if (SLJIT_UNLIKELY(src & SLJIT_IMM)) {
		/* BSR needs a register/memory operand; materialize immediates. */
		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcw);
		src = TMP_REG1;
		srcw = 0;
	}

	/* bsr tmp, src — sets ZF when src is zero. */
	inst = emit_x86_instruction(compiler, 2, TMP_REG1, 0, src, srcw);
	FAIL_IF(!inst);
	*inst++ = GROUP_0F;
	*inst = BSR_r_rm;

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
	if (FAST_IS_REG(dst))
		dst_r = dst;
	else {
		/* Find an unused temporary register. */
		if ((dst & REG_MASK) != SLJIT_SCRATCH_REG1 && (dst & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_SCRATCH_REG1))
			dst_r = SLJIT_SCRATCH_REG1;
		else if ((dst & REG_MASK) != SLJIT_SCRATCH_REG2 && (dst & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_SCRATCH_REG2))
			dst_r = SLJIT_SCRATCH_REG2;
		else
			dst_r = SLJIT_SCRATCH_REG3;
		/* Save the borrowed register's value into dst; the XCHG at the
		   end swaps the result into dst and restores dst_r. */
		EMIT_MOV(compiler, dst, dstw, dst_r, 0);
	}
	EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, 32 + 31);
#else
	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG2;
	compiler->mode32 = 0;
	EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, !(op_flags & SLJIT_INT_OP) ? 64 + 63 : 32 + 31);
	compiler->mode32 = op_flags & SLJIT_INT_OP;
#endif

	if (cpu_has_cmov == -1)
		get_cpu_features();

	if (cpu_has_cmov) {
		/* cmovne dst_r, tmp — take the BSR result when src != 0. */
		inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG1, 0);
		FAIL_IF(!inst);
		*inst++ = GROUP_0F;
		*inst = CMOVNE_r_rm;
	} else {
		/* No CMOV: hand-encode "je +N; mov dst_r, tmp". */
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		inst = (sljit_ub*)ensure_buf(compiler, 1 + 4);
		FAIL_IF(!inst);
		INC_SIZE(4);

		*inst++ = JE_i8;
		*inst++ = 2;
		*inst++ = MOV_r_rm;
		*inst++ = MOD_REG | (reg_map[dst_r] << 3) | reg_map[TMP_REG1];
#else
		inst = (sljit_ub*)ensure_buf(compiler, 1 + 5);
		FAIL_IF(!inst);
		INC_SIZE(5);

		*inst++ = JE_i8;
		*inst++ = 3;
		*inst++ = REX_W | (reg_map[dst_r] >= 8 ? REX_R : 0) | (reg_map[TMP_REG1] >= 8 ? REX_B : 0);
		*inst++ = MOV_r_rm;
		*inst++ = MOD_REG | (reg_lmap[dst_r] << 3) | reg_lmap[TMP_REG1];
#endif
	}

	/* xor dst_r, width-1: bit index -> leading-zero count. */
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
	inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 31, dst_r, 0);
#else
	inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, !(op_flags & SLJIT_INT_OP) ? 63 : 31, dst_r, 0);
#endif
	FAIL_IF(!inst);
	*(inst + 1) |= XOR;

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
	if (dst & SLJIT_MEM) {
		/* Swap the result into memory and restore the borrowed register. */
		inst = emit_x86_instruction(compiler, 1, dst_r, 0, dst, dstw);
		FAIL_IF(!inst);
		*inst = XCHG_r_rm;
	}
#else
	if (dst & SLJIT_MEM)
		EMIT_MOV(compiler, dst, dstw, TMP_REG2, 0);
#endif
	return SLJIT_SUCCESS;
}
1249
/* Single-operand sljit entry point: all MOV/MOVU variants plus NOT,
   NEG and CLZ.  MOVU variants additionally update the base register of
   a memory operand with LEA (pre/post address update).  On x86-32,
   "extra registers" (eregs) live in the stack frame; CHECK_EXTRA_REGS
   rewrites such operands into SLJIT_MEM1(SLJIT_LOCALS_REG) accesses and
   sets dst_is_ereg/src_is_ereg.  Returns an sljit error code. */
SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op1(struct sljit_compiler *compiler, sljit_si op,
	sljit_si dst, sljit_sw dstw,
	sljit_si src, sljit_sw srcw)
{
	sljit_ub* inst;
	sljit_si update = 0;
	sljit_si op_flags = GET_ALL_FLAGS(op);
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
	sljit_si dst_is_ereg = 0;
	sljit_si src_is_ereg = 0;
#else
	/* No eregs on x86-64; constant 0 keeps the code below identical. */
#	define src_is_ereg 0
#endif

	CHECK_ERROR();
	check_sljit_emit_op1(compiler, op, dst, dstw, src, srcw);
	ADJUST_LOCAL_OFFSET(dst, dstw);
	ADJUST_LOCAL_OFFSET(src, srcw);

	CHECK_EXTRA_REGS(dst, dstw, dst_is_ereg = 1);
	CHECK_EXTRA_REGS(src, srcw, src_is_ereg = 1);
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	compiler->mode32 = op_flags & SLJIT_INT_OP;
#endif

	op = GET_OPCODE(op);
	if (op >= SLJIT_MOV && op <= SLJIT_MOVU_P) {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
		compiler->mode32 = 0;
#endif

		if (op_flags & SLJIT_INT_OP) {
			if (FAST_IS_REG(src) && src == dst) {
				if (!TYPE_CAST_NEEDED(op))
					return SLJIT_SUCCESS;
			}
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
			/* Normalize 32-bit moves: memory loads zero-extend anyway,
			   and immediates are stored sign-extended. */
			if (op == SLJIT_MOV_SI && (src & SLJIT_MEM))
				op = SLJIT_MOV_UI;
			if (op == SLJIT_MOVU_SI && (src & SLJIT_MEM))
				op = SLJIT_MOVU_UI;
			if (op == SLJIT_MOV_UI && (src & SLJIT_IMM))
				op = SLJIT_MOV_SI;
			if (op == SLJIT_MOVU_UI && (src & SLJIT_IMM))
				op = SLJIT_MOVU_SI;
#endif
		}

		SLJIT_COMPILE_ASSERT(SLJIT_MOV + 8 == SLJIT_MOVU, movu_offset);
		if (op >= SLJIT_MOVU) {
			/* Strip the "update" property and emit LEAs separately below. */
			update = 1;
			op -= 8;
		}

		if (src & SLJIT_IMM) {
			/* Pre-truncate the immediate to the operand width. */
			switch (op) {
			case SLJIT_MOV_UB:
				srcw = (sljit_ub)srcw;
				break;
			case SLJIT_MOV_SB:
				srcw = (sljit_sb)srcw;
				break;
			case SLJIT_MOV_UH:
				srcw = (sljit_uh)srcw;
				break;
			case SLJIT_MOV_SH:
				srcw = (sljit_sh)srcw;
				break;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
			case SLJIT_MOV_UI:
				srcw = (sljit_ui)srcw;
				break;
			case SLJIT_MOV_SI:
				srcw = (sljit_si)srcw;
				break;
#endif
			}
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
			if (SLJIT_UNLIKELY(dst_is_ereg))
				return emit_mov(compiler, dst, dstw, src, srcw);
#endif
		}

		/* Pre-update the source base register (MOVU semantics). */
		if (SLJIT_UNLIKELY(update) && (src & SLJIT_MEM) && !src_is_ereg && (src & REG_MASK) && (srcw != 0 || (src & OFFS_REG_MASK) != 0)) {
			inst = emit_x86_instruction(compiler, 1, src & REG_MASK, 0, src, srcw);
			FAIL_IF(!inst);
			*inst = LEA_r_m;
			src &= SLJIT_MEM | 0xf;
			srcw = 0;
		}

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		/* An ereg destination that needs a converting move goes through
		   TMP_REG1 and is spilled to the stack frame afterwards. */
		if (SLJIT_UNLIKELY(dst_is_ereg) && (!(op == SLJIT_MOV || op == SLJIT_MOV_UI || op == SLJIT_MOV_SI || op == SLJIT_MOV_P) || (src & SLJIT_MEM))) {
			SLJIT_ASSERT(dst == SLJIT_MEM1(SLJIT_LOCALS_REG));
			dst = TMP_REG1;
		}
#endif

		switch (op) {
		case SLJIT_MOV:
		case SLJIT_MOV_P:
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		case SLJIT_MOV_UI:
		case SLJIT_MOV_SI:
#endif
			FAIL_IF(emit_mov(compiler, dst, dstw, src, srcw));
			break;
		case SLJIT_MOV_UB:
			FAIL_IF(emit_mov_byte(compiler, 0, dst, dstw, src, srcw));
			break;
		case SLJIT_MOV_SB:
			FAIL_IF(emit_mov_byte(compiler, 1, dst, dstw, src, srcw));
			break;
		case SLJIT_MOV_UH:
			FAIL_IF(emit_mov_half(compiler, 0, dst, dstw, src, srcw));
			break;
		case SLJIT_MOV_SH:
			FAIL_IF(emit_mov_half(compiler, 1, dst, dstw, src, srcw));
			break;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
		case SLJIT_MOV_UI:
			FAIL_IF(emit_mov_int(compiler, 0, dst, dstw, src, srcw));
			break;
		case SLJIT_MOV_SI:
			FAIL_IF(emit_mov_int(compiler, 1, dst, dstw, src, srcw));
			break;
#endif
		}

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		/* Spill the converted value back into the ereg's stack slot. */
		if (SLJIT_UNLIKELY(dst_is_ereg) && dst == TMP_REG1)
			return emit_mov(compiler, SLJIT_MEM1(SLJIT_LOCALS_REG), dstw, TMP_REG1, 0);
#endif

		/* Post-update the destination base register (MOVU semantics). */
		if (SLJIT_UNLIKELY(update) && (dst & SLJIT_MEM) && (dst & REG_MASK) && (dstw != 0 || (dst & OFFS_REG_MASK) != 0)) {
			inst = emit_x86_instruction(compiler, 1, dst & REG_MASK, 0, dst, dstw);
			FAIL_IF(!inst);
			*inst = LEA_r_m;
		}
		return SLJIT_SUCCESS;
	}

	if (SLJIT_UNLIKELY(GET_FLAGS(op_flags)))
		compiler->flags_saved = 0;

	switch (op) {
	case SLJIT_NOT:
		if (SLJIT_UNLIKELY(op_flags & SLJIT_SET_E))
			return emit_not_with_flags(compiler, dst, dstw, src, srcw);
		return emit_unary(compiler, NOT_rm, dst, dstw, src, srcw);

	case SLJIT_NEG:
		if (SLJIT_UNLIKELY(op_flags & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
			FAIL_IF(emit_save_flags(compiler));
		return emit_unary(compiler, NEG_rm, dst, dstw, src, srcw);

	case SLJIT_CLZ:
		if (SLJIT_UNLIKELY(op_flags & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
			FAIL_IF(emit_save_flags(compiler));
		return emit_clz(compiler, op_flags, dst, dstw, src, srcw);
	}

	return SLJIT_SUCCESS;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
#	undef src_is_ereg
#endif
}
1418
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)

/* Emits "op r/m, imm".  On x86-64 an immediate wider than 32 bits has
   no encoding, so it is first loaded into TMP_REG2 and emitted as a
   register-to-memory form instead.  Uses the local 'inst' variable and
   may return via FAIL_IF. */
#define BINARY_IMM(op_imm, op_mr, immw, arg, argw) \
	if (IS_HALFWORD(immw) || compiler->mode32) { \
		inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, immw, arg, argw); \
		FAIL_IF(!inst); \
		*(inst + 1) |= (op_imm); \
	} \
	else { \
		FAIL_IF(emit_load_imm64(compiler, TMP_REG2, immw)); \
		inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, arg, argw); \
		FAIL_IF(!inst); \
		*inst = (op_mr); \
	}

/* Emits the short "op eax, imm32" form (one-byte opcode, no ModRM). */
#define BINARY_EAX_IMM(op_eax_imm, immw) \
	FAIL_IF(emit_do_imm32(compiler, (!compiler->mode32) ? REX_W : 0, (op_eax_imm), immw))

#else

/* Emits "op r/m, imm" (any 32-bit immediate is encodable on x86-32). */
#define BINARY_IMM(op_imm, op_mr, immw, arg, argw) \
	inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, immw, arg, argw); \
	FAIL_IF(!inst); \
	*(inst + 1) |= (op_imm);

/* Emits the short "op eax, imm32" form. */
#define BINARY_EAX_IMM(op_eax_imm, immw) \
	FAIL_IF(emit_do_imm(compiler, (op_eax_imm), immw))

#endif
1448
/* Emits a commutative (cumulative) two-operand ALU operation
   dst = src1 OP src2.  The four opcode bytes select the encoding:
   op_rm (reg, r/m), op_mr (r/m, reg), op_imm (GROUP sub-opcode for an
   immediate) and op_eax_imm (short accumulator form, used when dst is
   EAX/RAX and the immediate does not fit a signed byte).  Because the
   operation is commutative, dst aliasing either source can be handled
   in place.  Returns an sljit error code. */
static sljit_si emit_cum_binary(struct sljit_compiler *compiler,
	sljit_ub op_rm, sljit_ub op_mr, sljit_ub op_imm, sljit_ub op_eax_imm,
	sljit_si dst, sljit_sw dstw,
	sljit_si src1, sljit_sw src1w,
	sljit_si src2, sljit_sw src2w)
{
	sljit_ub* inst;

	if (dst == SLJIT_UNUSED) {
		/* Result discarded; computed in TMP_REG1 for the flags only. */
		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
		if (src2 & SLJIT_IMM) {
			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
		}
		else {
			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
			FAIL_IF(!inst);
			*inst = op_rm;
		}
		return SLJIT_SUCCESS;
	}

	if (dst == src1 && dstw == src1w) {
		/* In-place: dst OP= src2. */
		if (src2 & SLJIT_IMM) {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
			if ((dst == SLJIT_SCRATCH_REG1) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
#else
			if ((dst == SLJIT_SCRATCH_REG1) && (src2w > 127 || src2w < -128)) {
#endif
				/* Short accumulator form; imm8 forms are shorter still,
				   hence the -128..127 exclusion. */
				BINARY_EAX_IMM(op_eax_imm, src2w);
			}
			else {
				BINARY_IMM(op_imm, op_mr, src2w, dst, dstw);
			}
		}
		else if (FAST_IS_REG(dst)) {
			inst = emit_x86_instruction(compiler, 1, dst, dstw, src2, src2w);
			FAIL_IF(!inst);
			*inst = op_rm;
		}
		else if (FAST_IS_REG(src2)) {
			/* Special exception for sljit_emit_op_flags. */
			inst = emit_x86_instruction(compiler, 1, src2, src2w, dst, dstw);
			FAIL_IF(!inst);
			*inst = op_mr;
		}
		else {
			/* Both operands in memory: stage src2 in TMP_REG1. */
			EMIT_MOV(compiler, TMP_REG1, 0, src2, src2w);
			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
			FAIL_IF(!inst);
			*inst = op_mr;
		}
		return SLJIT_SUCCESS;
	}

	/* Only for cumulative operations. */
	if (dst == src2 && dstw == src2w) {
		/* In-place with swapped operands: dst OP= src1. */
		if (src1 & SLJIT_IMM) {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
			if ((dst == SLJIT_SCRATCH_REG1) && (src1w > 127 || src1w < -128) && (compiler->mode32 || IS_HALFWORD(src1w))) {
#else
			if ((dst == SLJIT_SCRATCH_REG1) && (src1w > 127 || src1w < -128)) {
#endif
				BINARY_EAX_IMM(op_eax_imm, src1w);
			}
			else {
				BINARY_IMM(op_imm, op_mr, src1w, dst, dstw);
			}
		}
		else if (FAST_IS_REG(dst)) {
			inst = emit_x86_instruction(compiler, 1, dst, dstw, src1, src1w);
			FAIL_IF(!inst);
			*inst = op_rm;
		}
		else if (FAST_IS_REG(src1)) {
			inst = emit_x86_instruction(compiler, 1, src1, src1w, dst, dstw);
			FAIL_IF(!inst);
			*inst = op_mr;
		}
		else {
			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
			FAIL_IF(!inst);
			*inst = op_mr;
		}
		return SLJIT_SUCCESS;
	}

	/* General version. */
	if (FAST_IS_REG(dst)) {
		/* Compute directly in the destination register. */
		EMIT_MOV(compiler, dst, 0, src1, src1w);
		if (src2 & SLJIT_IMM) {
			BINARY_IMM(op_imm, op_mr, src2w, dst, 0);
		}
		else {
			inst = emit_x86_instruction(compiler, 1, dst, 0, src2, src2w);
			FAIL_IF(!inst);
			*inst = op_rm;
		}
	}
	else {
		/* This version requires less memory writing. */
		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
		if (src2 & SLJIT_IMM) {
			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
		}
		else {
			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
			FAIL_IF(!inst);
			*inst = op_rm;
		}
		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
	}

	return SLJIT_SUCCESS;
}
1564
/* Emits a non-commutative binary ALU operation (SUB / SBB).
   op_rm      - opcode for the "reg, r/m" instruction form.
   op_mr      - opcode for the "r/m, reg" instruction form.
   op_imm     - opcode extension (/digit) for the "r/m, imm" form.
   op_eax_imm - short-form opcode for "eax/rax, imm32".
   Operands use the usual sljit (reg|mem|imm, word-arg) convention. */
static sljit_si emit_non_cum_binary(struct sljit_compiler *compiler,
	sljit_ub op_rm, sljit_ub op_mr, sljit_ub op_imm, sljit_ub op_eax_imm,
	sljit_si dst, sljit_sw dstw,
	sljit_si src1, sljit_sw src1w,
	sljit_si src2, sljit_sw src2w)
{
	sljit_ub* inst;

	/* No destination: still perform the operation into TMP_REG1,
	   because the caller may depend on the resulting flags. */
	if (dst == SLJIT_UNUSED) {
		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
		if (src2 & SLJIT_IMM) {
			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
		}
		else {
			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
			FAIL_IF(!inst);
			*inst = op_rm;
		}
		return SLJIT_SUCCESS;
	}

	/* In-place form: dst = dst <op> src2. Only the dst == src1 pairing
	   is usable here, since the operation is not commutative. */
	if (dst == src1 && dstw == src1w) {
		if (src2 & SLJIT_IMM) {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
			/* Short eax/rax form only pays off when the immediate does not fit
			   in a sign-extended byte; on x86-64 it must also fit in 32 bits. */
			if ((dst == SLJIT_SCRATCH_REG1) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
#else
			if ((dst == SLJIT_SCRATCH_REG1) && (src2w > 127 || src2w < -128)) {
#endif
				BINARY_EAX_IMM(op_eax_imm, src2w);
			}
			else {
				BINARY_IMM(op_imm, op_mr, src2w, dst, dstw);
			}
		}
		else if (FAST_IS_REG(dst)) {
			inst = emit_x86_instruction(compiler, 1, dst, dstw, src2, src2w);
			FAIL_IF(!inst);
			*inst = op_rm;
		}
		else if (FAST_IS_REG(src2)) {
			/* dst is in memory, src2 in a register: use the "m, r" direction. */
			inst = emit_x86_instruction(compiler, 1, src2, src2w, dst, dstw);
			FAIL_IF(!inst);
			*inst = op_mr;
		}
		else {
			/* Both operands in memory: go through TMP_REG1. */
			EMIT_MOV(compiler, TMP_REG1, 0, src2, src2w);
			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
			FAIL_IF(!inst);
			*inst = op_mr;
		}
		return SLJIT_SUCCESS;
	}

	/* General version. */
	if (FAST_IS_REG(dst) && dst != src2) {
		/* Compute directly in the destination register. */
		EMIT_MOV(compiler, dst, 0, src1, src1w);
		if (src2 & SLJIT_IMM) {
			BINARY_IMM(op_imm, op_mr, src2w, dst, 0);
		}
		else {
			inst = emit_x86_instruction(compiler, 1, dst, 0, src2, src2w);
			FAIL_IF(!inst);
			*inst = op_rm;
		}
	}
	else {
		/* This version requires less memory writing. */
		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
		if (src2 & SLJIT_IMM) {
			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
		}
		else {
			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
			FAIL_IF(!inst);
			*inst = op_rm;
		}
		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
	}

	return SLJIT_SUCCESS;
}
1646
1647 static sljit_si emit_mul(struct sljit_compiler *compiler,
1648         sljit_si dst, sljit_sw dstw,
1649         sljit_si src1, sljit_sw src1w,
1650         sljit_si src2, sljit_sw src2w)
1651 {
1652         sljit_ub* inst;
1653         sljit_si dst_r;
1654
1655         dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
1656
1657         /* Register destination. */
1658         if (dst_r == src1 && !(src2 & SLJIT_IMM)) {
1659                 inst = emit_x86_instruction(compiler, 2, dst_r, 0, src2, src2w);
1660                 FAIL_IF(!inst);
1661                 *inst++ = GROUP_0F;
1662                 *inst = IMUL_r_rm;
1663         }
1664         else if (dst_r == src2 && !(src1 & SLJIT_IMM)) {
1665                 inst = emit_x86_instruction(compiler, 2, dst_r, 0, src1, src1w);
1666                 FAIL_IF(!inst);
1667                 *inst++ = GROUP_0F;
1668                 *inst = IMUL_r_rm;
1669         }
1670         else if (src1 & SLJIT_IMM) {
1671                 if (src2 & SLJIT_IMM) {
1672                         EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, src2w);
1673                         src2 = dst_r;
1674                         src2w = 0;
1675                 }
1676
1677                 if (src1w <= 127 && src1w >= -128) {
1678                         inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
1679                         FAIL_IF(!inst);
1680                         *inst = IMUL_r_rm_i8;
1681                         inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
1682                         FAIL_IF(!inst);
1683                         INC_SIZE(1);
1684                         *inst = (sljit_sb)src1w;
1685                 }
1686 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1687                 else {
1688                         inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
1689                         FAIL_IF(!inst);
1690                         *inst = IMUL_r_rm_i32;
1691                         inst = (sljit_ub*)ensure_buf(compiler, 1 + 4);
1692                         FAIL_IF(!inst);
1693                         INC_SIZE(4);
1694                         *(sljit_sw*)inst = src1w;
1695                 }
1696 #else
1697                 else if (IS_HALFWORD(src1w)) {
1698                         inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
1699                         FAIL_IF(!inst);
1700                         *inst = IMUL_r_rm_i32;
1701                         inst = (sljit_ub*)ensure_buf(compiler, 1 + 4);
1702                         FAIL_IF(!inst);
1703                         INC_SIZE(4);
1704                         *(sljit_si*)inst = (sljit_si)src1w;
1705                 }
1706                 else {
1707                         EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, src1w);
1708                         if (dst_r != src2)
1709                                 EMIT_MOV(compiler, dst_r, 0, src2, src2w);
1710                         inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
1711                         FAIL_IF(!inst);
1712                         *inst++ = GROUP_0F;
1713                         *inst = IMUL_r_rm;
1714                 }
1715 #endif
1716         }
1717         else if (src2 & SLJIT_IMM) {
1718                 /* Note: src1 is NOT immediate. */
1719
1720                 if (src2w <= 127 && src2w >= -128) {
1721                         inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
1722                         FAIL_IF(!inst);
1723                         *inst = IMUL_r_rm_i8;
1724                         inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
1725                         FAIL_IF(!inst);
1726                         INC_SIZE(1);
1727                         *inst = (sljit_sb)src2w;
1728                 }
1729 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1730                 else {
1731                         inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
1732                         FAIL_IF(!inst);
1733                         *inst = IMUL_r_rm_i32;
1734                         inst = (sljit_ub*)ensure_buf(compiler, 1 + 4);
1735                         FAIL_IF(!inst);
1736                         INC_SIZE(4);
1737                         *(sljit_sw*)inst = src2w;
1738                 }
1739 #else
1740                 else if (IS_HALFWORD(src2w)) {
1741                         inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
1742                         FAIL_IF(!inst);
1743                         *inst = IMUL_r_rm_i32;
1744                         inst = (sljit_ub*)ensure_buf(compiler, 1 + 4);
1745                         FAIL_IF(!inst);
1746                         INC_SIZE(4);
1747                         *(sljit_si*)inst = (sljit_si)src2w;
1748                 }
1749                 else {
1750                         EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, src1w);
1751                         if (dst_r != src1)
1752                                 EMIT_MOV(compiler, dst_r, 0, src1, src1w);
1753                         inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
1754                         FAIL_IF(!inst);
1755                         *inst++ = GROUP_0F;
1756                         *inst = IMUL_r_rm;
1757                 }
1758 #endif
1759         }
1760         else {
1761                 /* Neither argument is immediate. */
1762                 if (ADDRESSING_DEPENDS_ON(src2, dst_r))
1763                         dst_r = TMP_REG1;
1764                 EMIT_MOV(compiler, dst_r, 0, src1, src1w);
1765                 inst = emit_x86_instruction(compiler, 2, dst_r, 0, src2, src2w);
1766                 FAIL_IF(!inst);
1767                 *inst++ = GROUP_0F;
1768                 *inst = IMUL_r_rm;
1769         }
1770
1771         if (dst_r == TMP_REG1)
1772                 EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1773
1774         return SLJIT_SUCCESS;
1775 }
1776
/* Tries to emit an addition as a single LEA instruction (reg + reg or
   reg + imm), which leaves the CPU flags untouched. Returns
   SLJIT_ERR_UNSUPPORTED when the operand combination cannot be encoded
   as an LEA, so the caller can fall back to a normal ADD/SUB. */
static sljit_si emit_lea_binary(struct sljit_compiler *compiler, sljit_si keep_flags,
	sljit_si dst, sljit_sw dstw,
	sljit_si src1, sljit_sw src1w,
	sljit_si src2, sljit_sw src2w)
{
	sljit_ub* inst;
	sljit_si dst_r, done = 0;

	/* These cases better be left to handled by normal way. */
	if (!keep_flags) {
		if (dst == src1 && dstw == src1w)
			return SLJIT_ERR_UNSUPPORTED;
		if (dst == src2 && dstw == src2w)
			return SLJIT_ERR_UNSUPPORTED;
	}

	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;

	if (FAST_IS_REG(src1)) {
		if (FAST_IS_REG(src2)) {
			/* lea dst_r, [src1 + src2] */
			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM2(src1, src2), 0);
			FAIL_IF(!inst);
			*inst = LEA_r_m;
			done = 1;
		}
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
		/* On x86-64 the displacement must fit in a signed 32 bit field. */
		if ((src2 & SLJIT_IMM) && (compiler->mode32 || IS_HALFWORD(src2w))) {
			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src1), (sljit_si)src2w);
#else
		if (src2 & SLJIT_IMM) {
			/* lea dst_r, [src1 + src2w] */
			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src1), src2w);
#endif
			FAIL_IF(!inst);
			*inst = LEA_r_m;
			done = 1;
		}
	}
	else if (FAST_IS_REG(src2)) {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
		if ((src1 & SLJIT_IMM) && (compiler->mode32 || IS_HALFWORD(src1w))) {
			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src2), (sljit_si)src1w);
#else
		if (src1 & SLJIT_IMM) {
			/* Mirror case: lea dst_r, [src2 + src1w]. */
			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src2), src1w);
#endif
			FAIL_IF(!inst);
			*inst = LEA_r_m;
			done = 1;
		}
	}

	if (done) {
		/* Flush TMP_REG1 to a memory destination if one was used. */
		if (dst_r == TMP_REG1)
			return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
		return SLJIT_SUCCESS;
	}
	return SLJIT_ERR_UNSUPPORTED;
}
1835
/* Emits a CMP of src1 against src2 (result is only the flags; there is
   no destination operand). Picks the shortest usable encoding:
   CMP eax/rax, imm32 -> CMP r/m, imm -> CMP r, r/m -> CMP r/m, r. */
static sljit_si emit_cmp_binary(struct sljit_compiler *compiler,
	sljit_si src1, sljit_sw src1w,
	sljit_si src2, sljit_sw src2w)
{
	sljit_ub* inst;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	/* Short eax/rax form: worthwhile only for immediates that do not fit in
	   a sign-extended byte; on x86-64 the immediate must fit in 32 bits. */
	if (src1 == SLJIT_SCRATCH_REG1 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
#else
	if (src1 == SLJIT_SCRATCH_REG1 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128)) {
#endif
		BINARY_EAX_IMM(CMP_EAX_i32, src2w);
		return SLJIT_SUCCESS;
	}

	if (FAST_IS_REG(src1)) {
		if (src2 & SLJIT_IMM) {
			BINARY_IMM(CMP, CMP_rm_r, src2w, src1, 0);
		}
		else {
			/* cmp src1, src2 ("r, r/m" direction). */
			inst = emit_x86_instruction(compiler, 1, src1, 0, src2, src2w);
			FAIL_IF(!inst);
			*inst = CMP_r_rm;
		}
		return SLJIT_SUCCESS;
	}

	/* src1 is in memory, src2 is a register: use the "r/m, r" direction. */
	if (FAST_IS_REG(src2) && !(src1 & SLJIT_IMM)) {
		inst = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w);
		FAIL_IF(!inst);
		*inst = CMP_rm_r;
		return SLJIT_SUCCESS;
	}

	if (src2 & SLJIT_IMM) {
		/* An immediate src1 must be materialized first; CMP cannot
		   take two immediates. */
		if (src1 & SLJIT_IMM) {
			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
			src1 = TMP_REG1;
			src1w = 0;
		}
		BINARY_IMM(CMP, CMP_rm_r, src2w, src1, src1w);
	}
	else {
		/* Both operands in memory: load src1 into TMP_REG1. */
		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
		inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
		FAIL_IF(!inst);
		*inst = CMP_r_rm;
	}
	return SLJIT_SUCCESS;
}
1886
1887 static sljit_si emit_test_binary(struct sljit_compiler *compiler,
1888         sljit_si src1, sljit_sw src1w,
1889         sljit_si src2, sljit_sw src2w)
1890 {
1891         sljit_ub* inst;
1892
1893 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1894         if (src1 == SLJIT_SCRATCH_REG1 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1895 #else
1896         if (src1 == SLJIT_SCRATCH_REG1 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128)) {
1897 #endif
1898                 BINARY_EAX_IMM(TEST_EAX_i32, src2w);
1899                 return SLJIT_SUCCESS;
1900         }
1901
1902 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1903         if (src2 == SLJIT_SCRATCH_REG1 && (src2 & SLJIT_IMM) && (src1w > 127 || src1w < -128) && (compiler->mode32 || IS_HALFWORD(src1w))) {
1904 #else
1905         if (src2 == SLJIT_SCRATCH_REG1 && (src1 & SLJIT_IMM) && (src1w > 127 || src1w < -128)) {
1906 #endif
1907                 BINARY_EAX_IMM(TEST_EAX_i32, src1w);
1908                 return SLJIT_SUCCESS;
1909         }
1910
1911         if (FAST_IS_REG(src1)) {
1912                 if (src2 & SLJIT_IMM) {
1913 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1914                         if (IS_HALFWORD(src2w) || compiler->mode32) {
1915                                 inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, 0);
1916                                 FAIL_IF(!inst);
1917                                 *inst = GROUP_F7;
1918                         }
1919                         else {
1920                                 FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
1921                                 inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, src1, 0);
1922                                 FAIL_IF(!inst);
1923                                 *inst = TEST_rm_r;
1924                         }
1925 #else
1926                         inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, 0);
1927                         FAIL_IF(!inst);
1928                         *inst = GROUP_F7;
1929 #endif
1930                 }
1931                 else {
1932                         inst = emit_x86_instruction(compiler, 1, src1, 0, src2, src2w);
1933                         FAIL_IF(!inst);
1934                         *inst = TEST_rm_r;
1935                 }
1936                 return SLJIT_SUCCESS;
1937         }
1938
1939         if (FAST_IS_REG(src2)) {
1940                 if (src1 & SLJIT_IMM) {
1941 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1942                         if (IS_HALFWORD(src1w) || compiler->mode32) {
1943                                 inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src1w, src2, 0);
1944                                 FAIL_IF(!inst);
1945                                 *inst = GROUP_F7;
1946                         }
1947                         else {
1948                                 FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src1w));
1949                                 inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, src2, 0);
1950                                 FAIL_IF(!inst);
1951                                 *inst = TEST_rm_r;
1952                         }
1953 #else
1954                         inst = emit_x86_instruction(compiler, 1, src1, src1w, src2, 0);
1955                         FAIL_IF(!inst);
1956                         *inst = GROUP_F7;
1957 #endif
1958                 }
1959                 else {
1960                         inst = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w);
1961                         FAIL_IF(!inst);
1962                         *inst = TEST_rm_r;
1963                 }
1964                 return SLJIT_SUCCESS;
1965         }
1966
1967         EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1968         if (src2 & SLJIT_IMM) {
1969 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1970                 if (IS_HALFWORD(src2w) || compiler->mode32) {
1971                         inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REG1, 0);
1972                         FAIL_IF(!inst);
1973                         *inst = GROUP_F7;
1974                 }
1975                 else {
1976                         FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
1977                         inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, TMP_REG1, 0);
1978                         FAIL_IF(!inst);
1979                         *inst = TEST_rm_r;
1980                 }
1981 #else
1982                 inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REG1, 0);
1983                 FAIL_IF(!inst);
1984                 *inst = GROUP_F7;
1985 #endif
1986         }
1987         else {
1988                 inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1989                 FAIL_IF(!inst);
1990                 *inst = TEST_rm_r;
1991         }
1992         return SLJIT_SUCCESS;
1993 }
1994
/* Emits a shift/rotate instruction. 'mode' is the opcode extension
   (SHL/SHR/SAR) OR-ed into the emitted GROUP_SHIFT byte. On x86 a
   variable shift count must live in ecx (SLJIT_PREF_SHIFT_REG), so most
   of this function is careful juggling to get the count into ecx while
   preserving ecx's previous value and any operand addressing that may
   itself use ecx. */
static sljit_si emit_shift(struct sljit_compiler *compiler,
	sljit_ub mode,
	sljit_si dst, sljit_sw dstw,
	sljit_si src1, sljit_sw src1w,
	sljit_si src2, sljit_sw src2w)
{
	sljit_ub* inst;

	/* Easy cases: the count is an immediate, or already in ecx. */
	if ((src2 & SLJIT_IMM) || (src2 == SLJIT_PREF_SHIFT_REG)) {
		if (dst == src1 && dstw == src1w) {
			/* Shift dst in place. */
			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, dst, dstw);
			FAIL_IF(!inst);
			*inst |= mode;
			return SLJIT_SUCCESS;
		}
		if (dst == SLJIT_UNUSED) {
			/* Still perform the shift (for the flags) into TMP_REG1. */
			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, TMP_REG1, 0);
			FAIL_IF(!inst);
			*inst |= mode;
			return SLJIT_SUCCESS;
		}
		if (dst == SLJIT_PREF_SHIFT_REG && src2 == SLJIT_PREF_SHIFT_REG) {
			/* ecx is both count and destination: compute in TMP_REG1,
			   then move the result into ecx. */
			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
			FAIL_IF(!inst);
			*inst |= mode;
			EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
			return SLJIT_SUCCESS;
		}
		if (FAST_IS_REG(dst)) {
			EMIT_MOV(compiler, dst, 0, src1, src1w);
			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, dst, 0);
			FAIL_IF(!inst);
			*inst |= mode;
			return SLJIT_SUCCESS;
		}

		/* Memory destination: shift in TMP_REG1, then store. */
		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, TMP_REG1, 0);
		FAIL_IF(!inst);
		*inst |= mode;
		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
		return SLJIT_SUCCESS;
	}

	/* From here the count must be moved into ecx first. */
	if (dst == SLJIT_PREF_SHIFT_REG) {
		/* Destination is ecx itself: compute in TMP_REG1, then move in. */
		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
		FAIL_IF(!inst);
		*inst |= mode;
		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
	}
	else if (FAST_IS_REG(dst) && dst != src2 && !ADDRESSING_DEPENDS_ON(src2, dst)) {
		/* Register destination: save ecx in TMP_REG1, shift, restore ecx. */
		if (src1 != dst)
			EMIT_MOV(compiler, dst, 0, src1, src1w);
		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_PREF_SHIFT_REG, 0);
		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, dst, 0);
		FAIL_IF(!inst);
		*inst |= mode;
		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
	}
	else {
		/* This case is really difficult, since ecx itself may used for
		   addressing, and we must ensure to work even in that case. */
		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
		/* Save ecx in TMP_REG2 (enough free registers on x86-64). */
		EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_PREF_SHIFT_REG, 0);
#else
		/* [esp+0] contains the flags. */
		EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_LOCALS_REG), sizeof(sljit_sw), SLJIT_PREF_SHIFT_REG, 0);
#endif
		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
		FAIL_IF(!inst);
		*inst |= mode;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG2, 0);
#else
		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, SLJIT_MEM1(SLJIT_LOCALS_REG), sizeof(sljit_sw));
#endif
		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
	}

	return SLJIT_SUCCESS;
}
2083
/* Like emit_shift, but compensates for the x86 quirk that shift
   instructions leave the flags unchanged when the (masked) count is 0.
   When the caller needs flags and the count is zero, an OR with 0 (or a
   CMP against 0) is emitted instead, so the Z/S flags reflect the
   (unshifted) value. */
static sljit_si emit_shift_with_flags(struct sljit_compiler *compiler,
	sljit_ub mode, sljit_si set_flags,
	sljit_si dst, sljit_sw dstw,
	sljit_si src1, sljit_sw src1w,
	sljit_si src2, sljit_sw src2w)
{
	/* The CPU does not set flags if the shift count is 0. */
	if (src2 & SLJIT_IMM) {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
		/* NOTE(review): the second clause is redundant (src2w & 0x1f != 0
		   implies src2w & 0x3f != 0); the intent was presumably to mask with
		   0x1f only when mode32 — e.g. mode32 with count 32 takes the
		   emit_shift path although the CPU masks the count to 0. Confirm
		   against upstream before changing. */
		if ((src2w & 0x3f) != 0 || (compiler->mode32 && (src2w & 0x1f) != 0))
			return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
#else
		if ((src2w & 0x1f) != 0)
			return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
#endif
		if (!set_flags)
			return emit_mov(compiler, dst, dstw, src1, src1w);
		/* OR dst, src, 0 */
		return emit_cum_binary(compiler, OR_r_rm, OR_rm_r, OR, OR_EAX_i32,
			dst, dstw, src1, src1w, SLJIT_IMM, 0);
	}

	if (!set_flags)
		return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);

	/* Variable count with flags requested: emit a CMP against 0 so the
	   flags are meaningful even when the runtime count turns out to be 0.
	   For a memory dst the CMP must precede the shift (the shift sequence
	   may clobber TMP_REG1); for a register dst it can follow it. */
	if (!FAST_IS_REG(dst))
		FAIL_IF(emit_cmp_binary(compiler, src1, src1w, SLJIT_IMM, 0));

	FAIL_IF(emit_shift(compiler,mode, dst, dstw, src1, src1w, src2, src2w));

	if (FAST_IS_REG(dst))
		return emit_cmp_binary(compiler, dst, dstw, SLJIT_IMM, 0);
	return SLJIT_SUCCESS;
}
2118
/* Public entry point for two-operand operations (ADD/ADDC/SUB/SUBC/MUL/
   AND/OR/XOR/SHL/LSHR/ASHR). Dispatches to the specialized emitters
   above and maintains the lazy flag save/restore protocol:
   flags_saved tracks whether the CPU flags are currently spilled, and
   SLJIT_KEEP_FLAGS forces a save before an instruction that would
   clobber them. */
SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op2(struct sljit_compiler *compiler, sljit_si op,
	sljit_si dst, sljit_sw dstw,
	sljit_si src1, sljit_sw src1w,
	sljit_si src2, sljit_sw src2w)
{
	CHECK_ERROR();
	check_sljit_emit_op2(compiler, op, dst, dstw, src1, src1w, src2, src2w);
	ADJUST_LOCAL_OFFSET(dst, dstw);
	ADJUST_LOCAL_OFFSET(src1, src1w);
	ADJUST_LOCAL_OFFSET(src2, src2w);

	CHECK_EXTRA_REGS(dst, dstw, (void)0);
	CHECK_EXTRA_REGS(src1, src1w, (void)0);
	CHECK_EXTRA_REGS(src2, src2w, (void)0);
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	/* 32 bit operand size when SLJIT_INT_OP is requested. */
	compiler->mode32 = op & SLJIT_INT_OP;
#endif

	/* MUL/AND/OR/XOR/shifts always clobber the flags; save them first if
	   the caller asked to keep them, or mark them dead if the op sets them. */
	if (GET_OPCODE(op) >= SLJIT_MUL) {
		if (SLJIT_UNLIKELY(GET_FLAGS(op)))
			compiler->flags_saved = 0;
		else if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
			FAIL_IF(emit_save_flags(compiler));
	}

	switch (GET_OPCODE(op)) {
	case SLJIT_ADD:
		/* Prefer LEA when no flags are needed (it does not touch them). */
		if (!GET_FLAGS(op)) {
			if (emit_lea_binary(compiler, op & SLJIT_KEEP_FLAGS, dst, dstw, src1, src1w, src2, src2w) != SLJIT_ERR_UNSUPPORTED)
				return compiler->error;
		}
		else
			compiler->flags_saved = 0;
		if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
			FAIL_IF(emit_save_flags(compiler));
		return emit_cum_binary(compiler, ADD_r_rm, ADD_rm_r, ADD, ADD_EAX_i32,
			dst, dstw, src1, src1w, src2, src2w);
	case SLJIT_ADDC:
		if (SLJIT_UNLIKELY(compiler->flags_saved)) /* C flag must be restored. */
			FAIL_IF(emit_restore_flags(compiler, 1));
		else if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS))
			FAIL_IF(emit_save_flags(compiler));
		if (SLJIT_UNLIKELY(GET_FLAGS(op)))
			compiler->flags_saved = 0;
		return emit_cum_binary(compiler, ADC_r_rm, ADC_rm_r, ADC, ADC_EAX_i32,
			dst, dstw, src1, src1w, src2, src2w);
	case SLJIT_SUB:
		/* Subtracting an immediate can also be an LEA with a negated offset. */
		if (!GET_FLAGS(op)) {
			if ((src2 & SLJIT_IMM) && emit_lea_binary(compiler, op & SLJIT_KEEP_FLAGS, dst, dstw, src1, src1w, SLJIT_IMM, -src2w) != SLJIT_ERR_UNSUPPORTED)
				return compiler->error;
		}
		else
			compiler->flags_saved = 0;
		if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
			FAIL_IF(emit_save_flags(compiler));
		/* SUB with unused destination is just a CMP. */
		if (dst == SLJIT_UNUSED)
			return emit_cmp_binary(compiler, src1, src1w, src2, src2w);
		return emit_non_cum_binary(compiler, SUB_r_rm, SUB_rm_r, SUB, SUB_EAX_i32,
			dst, dstw, src1, src1w, src2, src2w);
	case SLJIT_SUBC:
		if (SLJIT_UNLIKELY(compiler->flags_saved)) /* C flag must be restored. */
			FAIL_IF(emit_restore_flags(compiler, 1));
		else if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS))
			FAIL_IF(emit_save_flags(compiler));
		if (SLJIT_UNLIKELY(GET_FLAGS(op)))
			compiler->flags_saved = 0;
		return emit_non_cum_binary(compiler, SBB_r_rm, SBB_rm_r, SBB, SBB_EAX_i32,
			dst, dstw, src1, src1w, src2, src2w);
	case SLJIT_MUL:
		return emit_mul(compiler, dst, dstw, src1, src1w, src2, src2w);
	case SLJIT_AND:
		/* AND with unused destination is just a TEST. */
		if (dst == SLJIT_UNUSED)
			return emit_test_binary(compiler, src1, src1w, src2, src2w);
		return emit_cum_binary(compiler, AND_r_rm, AND_rm_r, AND, AND_EAX_i32,
			dst, dstw, src1, src1w, src2, src2w);
	case SLJIT_OR:
		return emit_cum_binary(compiler, OR_r_rm, OR_rm_r, OR, OR_EAX_i32,
			dst, dstw, src1, src1w, src2, src2w);
	case SLJIT_XOR:
		return emit_cum_binary(compiler, XOR_r_rm, XOR_rm_r, XOR, XOR_EAX_i32,
			dst, dstw, src1, src1w, src2, src2w);
	case SLJIT_SHL:
		return emit_shift_with_flags(compiler, SHL, GET_FLAGS(op),
			dst, dstw, src1, src1w, src2, src2w);
	case SLJIT_LSHR:
		return emit_shift_with_flags(compiler, SHR, GET_FLAGS(op),
			dst, dstw, src1, src1w, src2, src2w);
	case SLJIT_ASHR:
		return emit_shift_with_flags(compiler, SAR, GET_FLAGS(op),
			dst, dstw, src1, src1w, src2, src2w);
	}

	return SLJIT_SUCCESS;
}
2213
2214 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_get_register_index(sljit_si reg)
2215 {
2216         check_sljit_get_register_index(reg);
2217 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2218         if (reg == SLJIT_TEMPORARY_EREG1 || reg == SLJIT_TEMPORARY_EREG2
2219                         || reg == SLJIT_SAVED_EREG1 || reg == SLJIT_SAVED_EREG2)
2220                 return -1;
2221 #endif
2222         return reg_map[reg];
2223 }
2224
/* Floating point registers are mapped one-to-one on x86,
   so the sljit index is returned unchanged. */
SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_get_float_register_index(sljit_si reg)
{
	check_sljit_get_float_register_index(reg);
	return reg;
}
2230
/* Copies caller-supplied machine code bytes verbatim into the
   instruction stream. */
SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op_custom(struct sljit_compiler *compiler,
	void *instruction, sljit_si size)
{
	sljit_ub *inst;

	CHECK_ERROR();
	check_sljit_emit_op_custom(compiler, instruction, size);
	/* An x86 instruction is at most 15 bytes long. */
	SLJIT_ASSERT(size > 0 && size < 16);

	inst = (sljit_ub*)ensure_buf(compiler, 1 + size);
	FAIL_IF(!inst);
	INC_SIZE(size);
	SLJIT_MEMMOVE(inst, instruction, size);
	return SLJIT_SUCCESS;
}
2246
2247 /* --------------------------------------------------------------------- */
2248 /*  Floating point operators                                             */
2249 /* --------------------------------------------------------------------- */
2250
#if (defined SLJIT_SSE2 && SLJIT_SSE2)

/* Alignment + 2 * 16 bytes. */
static sljit_si sse2_data[3 + (4 + 4) * 2];
static sljit_si *sse2_buffer;

/* Prepares the 16-byte aligned constant masks used by the SSE2
   NEG (sign-flip) and ABS (sign-clear) implementations in
   sljit_emit_fop1. */
static void init_compiler(void)
{
	/* Round up to the next 16 byte boundary inside sse2_data. */
	sse2_buffer = (sljit_si*)(((sljit_uw)sse2_data + 15) & ~0xf);
	/* Single precision constants. */
	sse2_buffer[0] = 0x80000000;  /* sign bit mask (NEG). */
	sse2_buffer[4] = 0x7fffffff;  /* clear-sign mask (ABS). */
	/* Double precision constants. */
	sse2_buffer[8] = 0;
	sse2_buffer[9] = 0x80000000;  /* sign bit lives in the high word. */
	sse2_buffer[12] = 0xffffffff;
	sse2_buffer[13] = 0x7fffffff;
}

#endif
2271
/* Reports whether floating point operations can be compiled.
   The answer is fixed at build time unless runtime SSE2
   detection was requested. */
SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_is_fpu_available(void)
{
#ifdef SLJIT_IS_FPU_AVAILABLE
	/* The embedder decided at compile time. */
	return SLJIT_IS_FPU_AVAILABLE;
#elif (defined SLJIT_SSE2 && SLJIT_SSE2)
#if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
	/* Lazy runtime probe; cpu_has_sse2 stays -1 until checked. */
	if (cpu_has_sse2 == -1)
		get_cpu_features();
	return cpu_has_sse2;
#else /* SLJIT_DETECT_SSE2 */
	return 1;
#endif /* SLJIT_DETECT_SSE2 */
#else /* SLJIT_SSE2 */
	return 0;
#endif
}
2288
2289 #if (defined SLJIT_SSE2 && SLJIT_SSE2)
2290
2291 static sljit_si emit_sse2(struct sljit_compiler *compiler, sljit_ub opcode,
2292         sljit_si single, sljit_si xmm1, sljit_si xmm2, sljit_sw xmm2w)
2293 {
2294         sljit_ub *inst;
2295
2296         inst = emit_x86_instruction(compiler, 2 | (single ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, xmm1, 0, xmm2, xmm2w);
2297         FAIL_IF(!inst);
2298         *inst++ = GROUP_0F;
2299         *inst = opcode;
2300         return SLJIT_SUCCESS;
2301 }
2302
2303 static sljit_si emit_sse2_logic(struct sljit_compiler *compiler, sljit_ub opcode,
2304         sljit_si pref66, sljit_si xmm1, sljit_si xmm2, sljit_sw xmm2w)
2305 {
2306         sljit_ub *inst;
2307
2308         inst = emit_x86_instruction(compiler, 2 | (pref66 ? EX86_PREF_66 : 0) | EX86_SSE2, xmm1, 0, xmm2, xmm2w);
2309         FAIL_IF(!inst);
2310         *inst++ = GROUP_0F;
2311         *inst = opcode;
2312         return SLJIT_SUCCESS;
2313 }
2314
/* Loads a float/double from src/srcw into xmm register dst. */
static SLJIT_INLINE sljit_si emit_sse2_load(struct sljit_compiler *compiler,
	sljit_si single, sljit_si dst, sljit_si src, sljit_sw srcw)
{
	return emit_sse2(compiler, MOVSD_x_xm, single, dst, src, srcw);
}
2320
/* Stores xmm register src to dst/dstw. Note the swapped operand
   order: MOVSD_xm_x takes the destination as the rm operand. */
static SLJIT_INLINE sljit_si emit_sse2_store(struct sljit_compiler *compiler,
	sljit_si single, sljit_si dst, sljit_sw dstw, sljit_si src)
{
	return emit_sse2(compiler, MOVSD_xm_x, single, src, dst, dstw);
}
2326
/* Single-operand floating point operation (SSE2 path):
   compare (CMPD), move (MOVD) or negate/absolute (NEGD/ABSD). */
SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_fop1(struct sljit_compiler *compiler, sljit_si op,
	sljit_si dst, sljit_sw dstw,
	sljit_si src, sljit_sw srcw)
{
	sljit_si dst_r;

	CHECK_ERROR();
	check_sljit_emit_fop1(compiler, op, dst, dstw, src, srcw);

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	/* Operate in 32 bit (no REX.W) instruction mode. */
	compiler->mode32 = 1;
#endif

	if (GET_OPCODE(op) == SLJIT_CMPD) {
		/* The compare overwrites the CPU flags, invalidating any
		   lazily saved flags. */
		compiler->flags_saved = 0;
		if (FAST_IS_REG(dst))
			dst_r = dst;
		else {
			/* UCOMISD needs a register first operand. */
			dst_r = TMP_FREG;
			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, dst_r, dst, dstw));
		}
		return emit_sse2_logic(compiler, UCOMISD_x_xm, !(op & SLJIT_SINGLE_OP), dst_r, src, srcw);
	}

	if (op == SLJIT_MOVD) {
		/* Plain move: direct when either side is a register,
		   otherwise bounce through TMP_FREG for mem->mem. */
		if (FAST_IS_REG(dst))
			return emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, dst, src, srcw);
		if (FAST_IS_REG(src))
			return emit_sse2_store(compiler, op & SLJIT_SINGLE_OP, dst, dstw, src);
		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, TMP_FREG, src, srcw));
		return emit_sse2_store(compiler, op & SLJIT_SINGLE_OP, dst, dstw, TMP_FREG);
	}

	/* NEG / ABS: pick the working register and load the source
	   unless it is already in place. */
	if (SLOW_IS_REG(dst)) {
		dst_r = dst;
		if (dst != src)
			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, dst_r, src, srcw));
	}
	else {
		dst_r = TMP_FREG;
		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, dst_r, src, srcw));
	}

	switch (GET_OPCODE(op)) {
	case SLJIT_NEGD:
		/* Flip the sign bit with the mask built in init_compiler. */
		FAIL_IF(emit_sse2_logic(compiler, XORPD_x_xm, 1, dst_r, SLJIT_MEM0(), (sljit_sw)(op & SLJIT_SINGLE_OP ? sse2_buffer : sse2_buffer + 8)));
		break;

	case SLJIT_ABSD:
		/* Clear the sign bit with the mask built in init_compiler. */
		FAIL_IF(emit_sse2_logic(compiler, ANDPD_x_xm, 1, dst_r, SLJIT_MEM0(), (sljit_sw)(op & SLJIT_SINGLE_OP ? sse2_buffer + 4 : sse2_buffer + 12)));
		break;
	}

	if (dst_r == TMP_FREG)
		return emit_sse2_store(compiler, op & SLJIT_SINGLE_OP, dst, dstw, TMP_FREG);
	return SLJIT_SUCCESS;
}
2384
/* Two-operand floating point arithmetic (SSE2 path):
   ADD / SUB / MUL / DIV in single or double precision. */
SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_fop2(struct sljit_compiler *compiler, sljit_si op,
	sljit_si dst, sljit_sw dstw,
	sljit_si src1, sljit_sw src1w,
	sljit_si src2, sljit_sw src2w)
{
	sljit_si dst_r;

	CHECK_ERROR();
	check_sljit_emit_fop2(compiler, op, dst, dstw, src1, src1w, src2, src2w);

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	/* Operate in 32 bit (no REX.W) instruction mode. */
	compiler->mode32 = 1;
#endif

	/* SSE2 arithmetic is two-operand (dst op= src2), so get src1
	   into the destination register, via TMP_FREG when needed. */
	if (FAST_IS_REG(dst)) {
		dst_r = dst;
		if (dst == src1)
			; /* Do nothing here. */
		else if (dst == src2 && (op == SLJIT_ADDD || op == SLJIT_MULD)) {
			/* Swap arguments (addition and multiplication
			   are commutative). */
			src2 = src1;
			src2w = src1w;
		}
		else if (dst != src2)
			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, dst_r, src1, src1w));
		else {
			/* dst == src2 for a non-commutative op: work in
			   TMP_FREG so src2 stays intact. */
			dst_r = TMP_FREG;
			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, TMP_FREG, src1, src1w));
		}
	}
	else {
		dst_r = TMP_FREG;
		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, TMP_FREG, src1, src1w));
	}

	switch (GET_OPCODE(op)) {
	case SLJIT_ADDD:
		FAIL_IF(emit_sse2(compiler, ADDSD_x_xm, op & SLJIT_SINGLE_OP, dst_r, src2, src2w));
		break;

	case SLJIT_SUBD:
		FAIL_IF(emit_sse2(compiler, SUBSD_x_xm, op & SLJIT_SINGLE_OP, dst_r, src2, src2w));
		break;

	case SLJIT_MULD:
		FAIL_IF(emit_sse2(compiler, MULSD_x_xm, op & SLJIT_SINGLE_OP, dst_r, src2, src2w));
		break;

	case SLJIT_DIVD:
		FAIL_IF(emit_sse2(compiler, DIVSD_x_xm, op & SLJIT_SINGLE_OP, dst_r, src2, src2w));
		break;
	}

	if (dst_r == TMP_FREG)
		return emit_sse2_store(compiler, op & SLJIT_SINGLE_OP, dst, dstw, TMP_FREG);
	return SLJIT_SUCCESS;
}
2442
2443 #else
2444
2445 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_fop1(struct sljit_compiler *compiler, sljit_si op,
2446         sljit_si dst, sljit_sw dstw,
2447         sljit_si src, sljit_sw srcw)
2448 {
2449         CHECK_ERROR();
2450         /* Should cause an assertion fail. */
2451         check_sljit_emit_fop1(compiler, op, dst, dstw, src, srcw);
2452         compiler->error = SLJIT_ERR_UNSUPPORTED;
2453         return SLJIT_ERR_UNSUPPORTED;
2454 }
2455
2456 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_fop2(struct sljit_compiler *compiler, sljit_si op,
2457         sljit_si dst, sljit_sw dstw,
2458         sljit_si src1, sljit_sw src1w,
2459         sljit_si src2, sljit_sw src2w)
2460 {
2461         CHECK_ERROR();
2462         /* Should cause an assertion fail. */
2463         check_sljit_emit_fop2(compiler, op, dst, dstw, src1, src1w, src2, src2w);
2464         compiler->error = SLJIT_ERR_UNSUPPORTED;
2465         return SLJIT_ERR_UNSUPPORTED;
2466 }
2467
2468 #endif
2469
2470 /* --------------------------------------------------------------------- */
2471 /*  Conditional instructions                                             */
2472 /* --------------------------------------------------------------------- */
2473
/* Creates (or reuses) a label at the current position in the
   generated code. */
SLJIT_API_FUNC_ATTRIBUTE struct sljit_label* sljit_emit_label(struct sljit_compiler *compiler)
{
	sljit_ub *inst;
	struct sljit_label *label;

	CHECK_ERROR_PTR();
	check_sljit_emit_label(compiler);

	/* We should restore the flags before the label,
	   since other taken jumps has their own flags as well. */
	if (SLJIT_UNLIKELY(compiler->flags_saved))
		PTR_FAIL_IF(emit_restore_flags(compiler, 0));

	/* Reuse the previous label if no code was emitted since. */
	if (compiler->last_label && compiler->last_label->size == compiler->size)
		return compiler->last_label;

	label = (struct sljit_label*)ensure_abuf(compiler, sizeof(struct sljit_label));
	PTR_FAIL_IF(!label);
	set_label(label, compiler);

	/* Insert a (0, 0) marker record into the code buffer; it is
	   resolved when the final code is generated. */
	inst = (sljit_ub*)ensure_buf(compiler, 2);
	PTR_FAIL_IF(!inst);

	*inst++ = 0;
	*inst++ = 0;

	return label;
}
2502
/* Emits a (conditional) jump or call whose target is set later via
   the returned sljit_jump record. */
SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_jump(struct sljit_compiler *compiler, sljit_si type)
{
	sljit_ub *inst;
	struct sljit_jump *jump;

	CHECK_ERROR_PTR();
	check_sljit_emit_jump(compiler, type);

	if (SLJIT_UNLIKELY(compiler->flags_saved)) {
		/* Restore lazily saved flags for plain and conditional
		   jumps; calls do not depend on them. */
		if ((type & 0xff) <= SLJIT_JUMP)
			PTR_FAIL_IF(emit_restore_flags(compiler, 0));
		compiler->flags_saved = 0;
	}

	jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
	PTR_FAIL_IF_NULL(jump);
	set_jump(jump, compiler, type & SLJIT_REWRITABLE_JUMP);
	type &= 0xff;

	/* Calls with arguments need the argument setup code first. */
	if (type >= SLJIT_CALL1)
		PTR_FAIL_IF(call_with_args(compiler, type));

	/* Worst case size. */
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
	compiler->size += (type >= SLJIT_JUMP) ? 5 : 6;
#else
	compiler->size += (type >= SLJIT_JUMP) ? (10 + 3) : (2 + 10 + 3);
#endif

	/* Insert a (0, type + 4) jump record into the code buffer;
	   the actual instruction is emitted during code generation. */
	inst = (sljit_ub*)ensure_buf(compiler, 2);
	PTR_FAIL_IF_NULL(inst);

	*inst++ = 0;
	*inst++ = type + 4;
	return jump;
}
2539
/* Emits an indirect jump or call: the target comes from a register,
   memory operand, or an immediate address. */
SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_ijump(struct sljit_compiler *compiler, sljit_si type, sljit_si src, sljit_sw srcw)
{
	sljit_ub *inst;
	struct sljit_jump *jump;

	CHECK_ERROR();
	check_sljit_emit_ijump(compiler, type, src, srcw);
	ADJUST_LOCAL_OFFSET(src, srcw);

	CHECK_EXTRA_REGS(src, srcw, (void)0);

	if (SLJIT_UNLIKELY(compiler->flags_saved)) {
		/* Restore lazily saved flags for jumps; calls do not
		   depend on them. */
		if (type <= SLJIT_JUMP)
			FAIL_IF(emit_restore_flags(compiler, 0));
		compiler->flags_saved = 0;
	}

	if (type >= SLJIT_CALL1) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
		/* The argument setup clobbers this register, so move the
		   target out of the way first. */
		if (src == SLJIT_SCRATCH_REG3) {
			EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
			src = TMP_REG1;
		}
		if (src == SLJIT_MEM1(SLJIT_LOCALS_REG) && type >= SLJIT_CALL3)
			srcw += sizeof(sljit_sw);
#endif
#endif
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) && defined(_WIN64)
		/* Same clobber protection for the Win64 calling convention. */
		if (src == SLJIT_SCRATCH_REG3) {
			EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
			src = TMP_REG1;
		}
#endif
		FAIL_IF(call_with_args(compiler, type));
	}

	if (src == SLJIT_IMM) {
		/* Constant target: record it as a jump with a fixed address. */
		jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
		FAIL_IF_NULL(jump);
		set_jump(jump, compiler, JUMP_ADDR);
		jump->u.target = srcw;

		/* Worst case size. */
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		compiler->size += 5;
#else
		compiler->size += 10 + 3;
#endif

		/* Insert a (0, type + 4) jump record into the code buffer. */
		inst = (sljit_ub*)ensure_buf(compiler, 2);
		FAIL_IF_NULL(inst);

		*inst++ = 0;
		*inst++ = type + 4;
	}
	else {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
		/* REX_W is not necessary (src is not immediate). */
		compiler->mode32 = 1;
#endif
		/* Indirect jump or call through a register/memory operand
		   (group FF opcode). */
		inst = emit_x86_instruction(compiler, 1, 0, 0, src, srcw);
		FAIL_IF(!inst);
		*inst++ = GROUP_FF;
		*inst |= (type >= SLJIT_FAST_CALL) ? CALL_rm : JMP_rm;
	}
	return SLJIT_SUCCESS;
}
2608
/* Materializes the CPU condition selected by 'type' as a 0/1 value
   and either moves it into dst or combines it with dst using op. */
SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op_flags(struct sljit_compiler *compiler, sljit_si op,
	sljit_si dst, sljit_sw dstw,
	sljit_si src, sljit_sw srcw,
	sljit_si type)
{
	sljit_ub *inst;
	sljit_ub cond_set = 0;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	sljit_si reg;
#else
	/* CHECK_EXTRA_REGS might overwrite these values. */
	sljit_si dst_save = dst;
	sljit_sw dstw_save = dstw;
#endif

	CHECK_ERROR();
	check_sljit_emit_op_flags(compiler, op, dst, dstw, src, srcw, type);

	if (dst == SLJIT_UNUSED)
		return SLJIT_SUCCESS;

	ADJUST_LOCAL_OFFSET(dst, dstw);
	CHECK_EXTRA_REGS(dst, dstw, (void)0);
	/* The setcc below reads the flags, so they must be live here. */
	if (SLJIT_UNLIKELY(compiler->flags_saved))
		FAIL_IF(emit_restore_flags(compiler, op & SLJIT_KEEP_FLAGS));

	/* setcc = jcc + 0x10. */
	cond_set = get_jump_code(type) + 0x10;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	/* Fast path: setcc into TMP_REG1 and OR its low byte into dst. */
	if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst) && dst == src) {
		inst = (sljit_ub*)ensure_buf(compiler, 1 + 4 + 3);
		FAIL_IF(!inst);
		INC_SIZE(4 + 3);
		/* Set low register to conditional flag. */
		*inst++ = (reg_map[TMP_REG1] <= 7) ? REX : REX_B;
		*inst++ = GROUP_0F;
		*inst++ = cond_set;
		*inst++ = MOD_REG | reg_lmap[TMP_REG1];
		*inst++ = REX | (reg_map[TMP_REG1] <= 7 ? 0 : REX_R) | (reg_map[dst] <= 7 ? 0 : REX_B);
		*inst++ = OR_rm8_r8;
		*inst++ = MOD_REG | (reg_lmap[TMP_REG1] << 3) | reg_lmap[dst];
		return SLJIT_SUCCESS;
	}

	reg = (op == SLJIT_MOV && FAST_IS_REG(dst)) ? dst : TMP_REG1;

	/* setcc on the low byte, then zero-extend to the full register. */
	inst = (sljit_ub*)ensure_buf(compiler, 1 + 4 + 4);
	FAIL_IF(!inst);
	INC_SIZE(4 + 4);
	/* Set low register to conditional flag. */
	*inst++ = (reg_map[reg] <= 7) ? REX : REX_B;
	*inst++ = GROUP_0F;
	*inst++ = cond_set;
	*inst++ = MOD_REG | reg_lmap[reg];
	*inst++ = REX_W | (reg_map[reg] <= 7 ? 0 : (REX_B | REX_R));
	*inst++ = GROUP_0F;
	*inst++ = MOVZX_r_rm8;
	*inst = MOD_REG | (reg_lmap[reg] << 3) | reg_lmap[reg];

	if (reg != TMP_REG1)
		return SLJIT_SUCCESS;

	if (GET_OPCODE(op) < SLJIT_ADD) {
		/* Move family: store the flag value directly. */
		compiler->mode32 = GET_OPCODE(op) != SLJIT_MOV;
		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
	}
#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) || (defined SLJIT_DEBUG && SLJIT_DEBUG)
	compiler->skip_checks = 1;
#endif
	/* Combine the flag value with dst via the requested binary op. */
	return sljit_emit_op2(compiler, op, dst, dstw, dst, dstw, TMP_REG1, 0);
#else /* SLJIT_CONFIG_X86_64 */
	if (GET_OPCODE(op) < SLJIT_ADD && FAST_IS_REG(dst)) {
		if (reg_map[dst] <= 4) {
			/* Low byte is accessible. */
			inst = (sljit_ub*)ensure_buf(compiler, 1 + 3 + 3);
			FAIL_IF(!inst);
			INC_SIZE(3 + 3);
			/* Set low byte to conditional flag. */
			*inst++ = GROUP_0F;
			*inst++ = cond_set;
			*inst++ = MOD_REG | reg_map[dst];

			*inst++ = GROUP_0F;
			*inst++ = MOVZX_r_rm8;
			*inst = MOD_REG | (reg_map[dst] << 3) | reg_map[dst];
			return SLJIT_SUCCESS;
		}

		/* Low byte is not accessible. */
		if (cpu_has_cmov == -1)
			get_cpu_features();

		if (cpu_has_cmov) {
			/* Materialize 0/1 with mov + cmovcc. */
			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, 1);
			/* a xor reg, reg operation would overwrite the flags. */
			EMIT_MOV(compiler, dst, 0, SLJIT_IMM, 0);

			inst = (sljit_ub*)ensure_buf(compiler, 1 + 3);
			FAIL_IF(!inst);
			INC_SIZE(3);

			*inst++ = GROUP_0F;
			/* cmovcc = setcc - 0x50. */
			*inst++ = cond_set - 0x50;
			*inst++ = MOD_REG | (reg_map[dst] << 3) | reg_map[TMP_REG1];
			return SLJIT_SUCCESS;
		}

		/* No cmov: go through eax (always byte addressable),
		   preserved around the sequence with xchg. */
		inst = (sljit_ub*)ensure_buf(compiler, 1 + 1 + 3 + 3 + 1);
		FAIL_IF(!inst);
		INC_SIZE(1 + 3 + 3 + 1);
		*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
		/* Set al to conditional flag. */
		*inst++ = GROUP_0F;
		*inst++ = cond_set;
		*inst++ = MOD_REG | 0 /* eax */;

		*inst++ = GROUP_0F;
		*inst++ = MOVZX_r_rm8;
		*inst++ = MOD_REG | (reg_map[dst] << 3) | 0 /* eax */;
		*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
		return SLJIT_SUCCESS;
	}

	/* OR-into-dst fast path when dst's low byte is addressable. */
	if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst) && dst == src && reg_map[dst] <= 4) {
		SLJIT_COMPILE_ASSERT(reg_map[SLJIT_SCRATCH_REG1] == 0, scratch_reg1_must_be_eax);
		if (dst != SLJIT_SCRATCH_REG1) {
			inst = (sljit_ub*)ensure_buf(compiler, 1 + 1 + 3 + 2 + 1);
			FAIL_IF(!inst);
			INC_SIZE(1 + 3 + 2 + 1);
			/* Set low register to conditional flag. */
			*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
			*inst++ = GROUP_0F;
			*inst++ = cond_set;
			*inst++ = MOD_REG | 0 /* eax */;
			*inst++ = OR_rm8_r8;
			*inst++ = MOD_REG | (0 /* eax */ << 3) | reg_map[dst];
			*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
		}
		else {
			/* dst is eax itself: use ecx as the scratch instead. */
			inst = (sljit_ub*)ensure_buf(compiler, 1 + 2 + 3 + 2 + 2);
			FAIL_IF(!inst);
			INC_SIZE(2 + 3 + 2 + 2);
			/* Set low register to conditional flag. */
			*inst++ = XCHG_r_rm;
			*inst++ = MOD_REG | (1 /* ecx */ << 3) | reg_map[TMP_REG1];
			*inst++ = GROUP_0F;
			*inst++ = cond_set;
			*inst++ = MOD_REG | 1 /* ecx */;
			*inst++ = OR_rm8_r8;
			*inst++ = MOD_REG | (1 /* ecx */ << 3) | 0 /* eax */;
			*inst++ = XCHG_r_rm;
			*inst++ = MOD_REG | (1 /* ecx */ << 3) | reg_map[TMP_REG1];
		}
		return SLJIT_SUCCESS;
	}

	/* Set TMP_REG1 to the bit. */
	inst = (sljit_ub*)ensure_buf(compiler, 1 + 1 + 3 + 3 + 1);
	FAIL_IF(!inst);
	INC_SIZE(1 + 3 + 3 + 1);
	*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
	/* Set al to conditional flag. */
	*inst++ = GROUP_0F;
	*inst++ = cond_set;
	*inst++ = MOD_REG | 0 /* eax */;

	*inst++ = GROUP_0F;
	*inst++ = MOVZX_r_rm8;
	*inst++ = MOD_REG | (0 << 3) /* eax */ | 0 /* eax */;

	*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];

	if (GET_OPCODE(op) < SLJIT_ADD)
		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);

#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) || (defined SLJIT_DEBUG && SLJIT_DEBUG)
	compiler->skip_checks = 1;
#endif
	/* Use the saved dst/dstw: CHECK_EXTRA_REGS may have remapped them. */
	return sljit_emit_op2(compiler, op, dst_save, dstw_save, dst_save, dstw_save, TMP_REG1, 0);
#endif /* SLJIT_CONFIG_X86_64 */
}
2792
/* Computes the address of a local variable area slot
   (locals register + offset) into dst. */
SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_get_local_base(struct sljit_compiler *compiler, sljit_si dst, sljit_sw dstw, sljit_sw offset)
{
	CHECK_ERROR();
	check_sljit_get_local_base(compiler, dst, dstw, offset);
	ADJUST_LOCAL_OFFSET(dst, dstw);

	CHECK_EXTRA_REGS(dst, dstw, (void)0);

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	/* Pointer arithmetic must be done in 64 bit mode. */
	compiler->mode32 = 0;
#endif

	/* Translate the offset to be relative to the locals register. */
	ADJUST_LOCAL_OFFSET(SLJIT_MEM1(SLJIT_LOCALS_REG), offset);

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	if (NOT_HALFWORD(offset)) {
		/* The offset does not fit in a 32 bit immediate; load it
		   into a register and use a register + register lea. */
		FAIL_IF(emit_load_imm64(compiler, TMP_REG1, offset));
#if (defined SLJIT_DEBUG && SLJIT_DEBUG)
		SLJIT_ASSERT(emit_lea_binary(compiler, SLJIT_KEEP_FLAGS, dst, dstw, SLJIT_LOCALS_REG, 0, TMP_REG1, 0) != SLJIT_ERR_UNSUPPORTED);
		return compiler->error;
#else
		return emit_lea_binary(compiler, SLJIT_KEEP_FLAGS, dst, dstw, SLJIT_LOCALS_REG, 0, TMP_REG1, 0);
#endif
	}
#endif

	/* lea when an addition is needed, plain mov otherwise. */
	if (offset != 0)
		return emit_lea_binary(compiler, SLJIT_KEEP_FLAGS, dst, dstw, SLJIT_LOCALS_REG, 0, SLJIT_IMM, offset);
	return emit_mov(compiler, dst, dstw, SLJIT_LOCALS_REG, 0);
}
2823
/* Emits a patchable constant load into dst and returns the record
   that lets sljit_set_const rewrite the immediate later. */
SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct sljit_compiler *compiler, sljit_si dst, sljit_sw dstw, sljit_sw init_value)
{
	sljit_ub *inst;
	struct sljit_const *const_;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	sljit_si reg;
#endif

	CHECK_ERROR_PTR();
	check_sljit_emit_const(compiler, dst, dstw, init_value);
	ADJUST_LOCAL_OFFSET(dst, dstw);

	CHECK_EXTRA_REGS(dst, dstw, (void)0);

	const_ = (struct sljit_const*)ensure_abuf(compiler, sizeof(struct sljit_const));
	PTR_FAIL_IF(!const_);
	set_const(const_, compiler);

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	/* A 64 bit immediate must be materialized in a register first. */
	compiler->mode32 = 0;
	reg = SLOW_IS_REG(dst) ? dst : TMP_REG1;

	if (emit_load_imm64(compiler, reg, init_value))
		return NULL;
#else
	if (dst == SLJIT_UNUSED)
		dst = TMP_REG1;

	if (emit_mov(compiler, dst, dstw, SLJIT_IMM, init_value))
		return NULL;
#endif

	/* Insert a (0, 1) marker record so the immediate's address can
	   be located and patched after code generation. */
	inst = (sljit_ub*)ensure_buf(compiler, 2);
	PTR_FAIL_IF(!inst);

	*inst++ = 0;
	*inst++ = 1;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	/* Memory destination: copy the materialized value out. */
	if (dst & SLJIT_MEM)
		if (emit_mov(compiler, dst, dstw, TMP_REG1, 0))
			return NULL;
#endif

	return const_;
}
2870
/* Patches the target of a rewritable jump at runtime.
   'addr' points at the jump's immediate operand. */
SLJIT_API_FUNC_ATTRIBUTE void sljit_set_jump_addr(sljit_uw addr, sljit_uw new_addr)
{
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
	/* 32 bit: rel32 displacement, relative to the end of the
	   4 byte immediate. */
	*(sljit_sw*)addr = new_addr - (addr + 4);
#else
	/* 64 bit: absolute 64 bit immediate. */
	*(sljit_uw*)addr = new_addr;
#endif
}
2879
/* Overwrites the immediate created by sljit_emit_const.
   'addr' points directly at the constant inside the code. */
SLJIT_API_FUNC_ATTRIBUTE void sljit_set_const(sljit_uw addr, sljit_sw new_constant)
{
	*(sljit_sw*)addr = new_constant;
}