sljit/sljitNativeX86_common.c
/*
 *    Stack-less Just-In-Time compiler
 *
 *    Copyright 2009-2012 Zoltan Herczeg (hzmester@freemail.hu). All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are
 * permitted provided that the following conditions are met:
 *
 *   1. Redistributions of source code must retain the above copyright notice, this list of
 *      conditions and the following disclaimer.
 *
 *   2. Redistributions in binary form must reproduce the above copyright notice, this list
 *      of conditions and the following disclaimer in the documentation and/or other materials
 *      provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
 * SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

SLJIT_API_FUNC_ATTRIBUTE const char* sljit_get_platform_name(void)
{
        return "x86" SLJIT_CPUINFO;
}

/*
   32b register indexes:
     0 - EAX
     1 - ECX
     2 - EDX
     3 - EBX
     4 - none
     5 - EBP
     6 - ESI
     7 - EDI
*/

/*
   64b register indexes:
     0 - RAX
     1 - RCX
     2 - RDX
     3 - RBX
     4 - none
     5 - RBP
     6 - RSI
     7 - RDI
     8 - R8   - From now on REX prefix is required
     9 - R9
    10 - R10
    11 - R11
    12 - R12
    13 - R13
    14 - R14
    15 - R15
*/
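
/* Worked example (standard x86-64 encoding, for illustration): encoding
   "mov rax, r8" requires REX.W for the 64 bit operand size and REX.B to
   extend the rm field to r8, giving the bytes 0x49 0x8b 0xc0
   (REX_W | REX_B, MOV_r_rm, ModRM(11, 0, 0)). Without REX.B the same
   bytes would reference RAX instead of R8. */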

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)

/* Last register + 1. */
#define TMP_REG1        (SLJIT_NUMBER_OF_REGISTERS + 2)

static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 3] = {
        0, 0, 2, 1, 0, 0, 0, 0, 7, 6, 3, 4, 5
};

#define CHECK_EXTRA_REGS(p, w, do) \
        if (p >= SLJIT_R3 && p <= SLJIT_R6) { \
                w = SLJIT_LOCALS_OFFSET + ((p) - (SLJIT_R3 + 4)) * sizeof(sljit_sw); \
                p = SLJIT_MEM1(SLJIT_SP); \
                do; \
        }
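
/* Illustrative note: x86-32 has too few general purpose registers, so
   SLJIT_R3..SLJIT_R6 ("extra" registers) live in stack slots rather than
   in machine registers (their reg_map entries above are 0). For such an
   operand, CHECK_EXTRA_REGS rewrites p into SLJIT_MEM1(SLJIT_SP) with the
   slot offset in w, and runs the "do" statement so the caller can record
   that the rewrite happened (see dst_is_ereg in sljit_emit_op1). */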

#else /* SLJIT_CONFIG_X86_32 */

/* Last register + 1. */
#define TMP_REG1        (SLJIT_NUMBER_OF_REGISTERS + 2)
#define TMP_REG2        (SLJIT_NUMBER_OF_REGISTERS + 3)
#define TMP_REG3        (SLJIT_NUMBER_OF_REGISTERS + 4)

/* Note: r12 & 0x7 == 0b100, which is decoded as "SIB byte present".
   Note: avoid using r12 and r13 for memory addressing;
   therefore r12 is better suited for SAVED_EREG than SAVED_REG. */
#ifndef _WIN64
/* 1st argument passed in rdi, 2nd in rsi, 3rd in rdx. */
static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 5] = {
        0, 0, 6, 1, 8, 11, 10, 12, 5, 13, 14, 15, 3, 4, 2, 7, 9
};
/* low-map. reg_map & 0x7. */
static const sljit_u8 reg_lmap[SLJIT_NUMBER_OF_REGISTERS + 5] = {
        0, 0, 6, 1, 0, 3,  2,  4,  5,  5,  6,  7, 3, 4, 2, 7, 1
};
#else
/* 1st argument passed in rcx, 2nd in rdx, 3rd in r8. */
static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 5] = {
        0, 0, 2, 1, 11, 12, 5, 13, 14, 15, 7, 6, 3, 4, 10, 8, 9
};
/* low-map. reg_map & 0x7. */
static const sljit_u8 reg_lmap[SLJIT_NUMBER_OF_REGISTERS + 5] = {
        0, 0, 2, 1, 3,  4,  5,  5, 6,  7,  7, 6, 3, 4, 2,  0, 1
};
#endif

#define REX_W           0x48
#define REX_R           0x44
#define REX_X           0x42
#define REX_B           0x41
#define REX             0x40
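
/* The REX prefix has the bit layout 0100WRXB: W (0x08) selects 64 bit
   operand size, R (0x04) extends the ModRM reg field, X (0x02) extends
   the SIB index field and B (0x01) extends the ModRM rm / SIB base field.
   The macros above already contain the 0x40 base, so prefixes are
   composed by or-ing them, e.g. REX_W | REX_R == 0x4c. */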

#ifndef _WIN64
#define HALFWORD_MAX 0x7fffffffl
#define HALFWORD_MIN -0x80000000l
#else
#define HALFWORD_MAX 0x7fffffffll
#define HALFWORD_MIN -0x80000000ll
#endif

#define IS_HALFWORD(x)          ((x) <= HALFWORD_MAX && (x) >= HALFWORD_MIN)
#define NOT_HALFWORD(x)         ((x) > HALFWORD_MAX || (x) < HALFWORD_MIN)
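
/* Example: most x86-64 instructions take at most a sign extended 32 bit
   immediate. 0x7fffffff IS_HALFWORD and can be encoded directly, while
   0x80000000 is NOT_HALFWORD (sign extension would produce
   0xffffffff80000000), so such constants must first be materialized in a
   register with a 64 bit move (see emit_load_imm64 in sljitNativeX86_64.c). */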

#define CHECK_EXTRA_REGS(p, w, do)

#endif /* SLJIT_CONFIG_X86_32 */

#define TMP_FREG        (0)

/* Size flags for emit_x86_instruction: */
#define EX86_BIN_INS            0x0010
#define EX86_SHIFT_INS          0x0020
#define EX86_REX                0x0040
#define EX86_NO_REXW            0x0080
#define EX86_BYTE_ARG           0x0100
#define EX86_HALF_ARG           0x0200
#define EX86_PREF_66            0x0400
#define EX86_PREF_F2            0x0800
#define EX86_PREF_F3            0x1000
#define EX86_SSE2_OP1           0x2000
#define EX86_SSE2_OP2           0x4000
#define EX86_SSE2               (EX86_SSE2_OP1 | EX86_SSE2_OP2)

/* --------------------------------------------------------------------- */
/*  Instruction forms                                                    */
/* --------------------------------------------------------------------- */

#define ADD             (/* BINARY */ 0 << 3)
#define ADD_EAX_i32     0x05
#define ADD_r_rm        0x03
#define ADD_rm_r        0x01
#define ADDSD_x_xm      0x58
#define ADC             (/* BINARY */ 2 << 3)
#define ADC_EAX_i32     0x15
#define ADC_r_rm        0x13
#define ADC_rm_r        0x11
#define AND             (/* BINARY */ 4 << 3)
#define AND_EAX_i32     0x25
#define AND_r_rm        0x23
#define AND_rm_r        0x21
#define ANDPD_x_xm      0x54
#define BSR_r_rm        (/* GROUP_0F */ 0xbd)
#define CALL_i32        0xe8
#define CALL_rm         (/* GROUP_FF */ 2 << 3)
#define CDQ             0x99
#define CMOVNE_r_rm     (/* GROUP_0F */ 0x45)
#define CMP             (/* BINARY */ 7 << 3)
#define CMP_EAX_i32     0x3d
#define CMP_r_rm        0x3b
#define CMP_rm_r        0x39
#define CVTPD2PS_x_xm   0x5a
#define CVTSI2SD_x_rm   0x2a
#define CVTTSD2SI_r_xm  0x2c
#define DIV             (/* GROUP_F7 */ 6 << 3)
#define DIVSD_x_xm      0x5e
#define INT3            0xcc
#define IDIV            (/* GROUP_F7 */ 7 << 3)
#define IMUL            (/* GROUP_F7 */ 5 << 3)
#define IMUL_r_rm       (/* GROUP_0F */ 0xaf)
#define IMUL_r_rm_i8    0x6b
#define IMUL_r_rm_i32   0x69
#define JE_i8           0x74
#define JNE_i8          0x75
#define JMP_i8          0xeb
#define JMP_i32         0xe9
#define JMP_rm          (/* GROUP_FF */ 4 << 3)
#define LEA_r_m         0x8d
#define MOV_r_rm        0x8b
#define MOV_r_i32       0xb8
#define MOV_rm_r        0x89
#define MOV_rm_i32      0xc7
#define MOV_rm8_i8      0xc6
#define MOV_rm8_r8      0x88
#define MOVSD_x_xm      0x10
#define MOVSD_xm_x      0x11
#define MOVSXD_r_rm     0x63
#define MOVSX_r_rm8     (/* GROUP_0F */ 0xbe)
#define MOVSX_r_rm16    (/* GROUP_0F */ 0xbf)
#define MOVZX_r_rm8     (/* GROUP_0F */ 0xb6)
#define MOVZX_r_rm16    (/* GROUP_0F */ 0xb7)
#define MUL             (/* GROUP_F7 */ 4 << 3)
#define MULSD_x_xm      0x59
#define NEG_rm          (/* GROUP_F7 */ 3 << 3)
#define NOP             0x90
#define NOT_rm          (/* GROUP_F7 */ 2 << 3)
#define OR              (/* BINARY */ 1 << 3)
#define OR_r_rm         0x0b
#define OR_EAX_i32      0x0d
#define OR_rm_r         0x09
#define OR_rm8_r8       0x08
#define POP_r           0x58
#define POP_rm          0x8f
#define POPF            0x9d
#define PUSH_i32        0x68
#define PUSH_r          0x50
#define PUSH_rm         (/* GROUP_FF */ 6 << 3)
#define PUSHF           0x9c
#define RET_near        0xc3
#define RET_i16         0xc2
#define SBB             (/* BINARY */ 3 << 3)
#define SBB_EAX_i32     0x1d
#define SBB_r_rm        0x1b
#define SBB_rm_r        0x19
#define SAR             (/* SHIFT */ 7 << 3)
#define SHL             (/* SHIFT */ 4 << 3)
#define SHR             (/* SHIFT */ 5 << 3)
#define SUB             (/* BINARY */ 5 << 3)
#define SUB_EAX_i32     0x2d
#define SUB_r_rm        0x2b
#define SUB_rm_r        0x29
#define SUBSD_x_xm      0x5c
#define TEST_EAX_i32    0xa9
#define TEST_rm_r       0x85
#define UCOMISD_x_xm    0x2e
#define UNPCKLPD_x_xm   0x14
#define XCHG_EAX_r      0x90
#define XCHG_r_rm       0x87
#define XOR             (/* BINARY */ 6 << 3)
#define XOR_EAX_i32     0x35
#define XOR_r_rm        0x33
#define XOR_rm_r        0x31
#define XORPD_x_xm      0x57

#define GROUP_0F        0x0f
#define GROUP_F7        0xf7
#define GROUP_FF        0xff
#define GROUP_BINARY_81 0x81
#define GROUP_BINARY_83 0x83
#define GROUP_SHIFT_1   0xd1
#define GROUP_SHIFT_N   0xc1
#define GROUP_SHIFT_CL  0xd3

#define MOD_REG         0xc0
#define MOD_DISP8       0x40

#define INC_SIZE(s)                     (*inst++ = (s), compiler->size += (s))

#define PUSH_REG(r)                     (*inst++ = (PUSH_r + (r)))
#define POP_REG(r)                      (*inst++ = (POP_r + (r)))
#define RET()                           (*inst++ = (RET_near))
#define RET_I16(n)                      (*inst++ = (RET_i16), *inst++ = n, *inst++ = 0)
/* r32, r/m32 */
#define MOV_RM(mod, reg, rm)            (*inst++ = (MOV_r_rm), *inst++ = (mod) << 6 | (reg) << 3 | (rm))
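
/* Worked example: the ModRM byte packs mod (2 bits), reg (3 bits) and rm
   (3 bits); mod == 0x3 (MOD_REG) makes rm a register operand, while
   MOD_DISP8 selects a memory operand with an 8 bit displacement. Thus
   MOV_RM(0x3, 0, 3) emits 0x8b 0xc3, i.e. "mov eax, ebx" (reg field 0 is
   EAX, rm field 3 is EBX in the index tables at the top of this file). */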

/* Multithreading does not affect these static variables, since they store
   built-in CPU features. Therefore they can safely be overwritten by different
   threads if they detect the CPU features at the same time. */
#if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
static sljit_s32 cpu_has_sse2 = -1;
#endif
static sljit_s32 cpu_has_cmov = -1;

#ifdef _WIN32_WCE
#include <cmnintrin.h>
#elif defined(_MSC_VER) && _MSC_VER >= 1400
#include <intrin.h>
#endif

static void get_cpu_features(void)
{
        sljit_u32 features;

#if defined(_MSC_VER) && _MSC_VER >= 1400

        int CPUInfo[4];
        __cpuid(CPUInfo, 1);
        features = (sljit_u32)CPUInfo[3];

#elif defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__SUNPRO_C)

        /* AT&T syntax. */
        __asm__ (
                "movl $0x1, %%eax\n"
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                /* On x86-32, there is no red zone, so this
                   should work (no need for a local variable). */
                "push %%ebx\n"
#endif
                "cpuid\n"
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                "pop %%ebx\n"
#endif
                "movl %%edx, %0\n"
                : "=g" (features)
                :
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                : "%eax", "%ecx", "%edx"
#else
                : "%rax", "%rbx", "%rcx", "%rdx"
#endif
        );

#else /* _MSC_VER && _MSC_VER >= 1400 */

        /* Intel syntax. */
        __asm {
                mov eax, 1
                cpuid
                mov features, edx
        }

#endif /* _MSC_VER && _MSC_VER >= 1400 */

#if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
        cpu_has_sse2 = (features >> 26) & 0x1;
#endif
        cpu_has_cmov = (features >> 15) & 0x1;
}
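
/* CPUID leaf 1 reports the feature bits in EDX: bit 26 is SSE2 and bit 15
   is CMOV, which is all this detection needs. As a rough sketch, the same
   query could be written with the GCC/clang <cpuid.h> helper (not used
   here, to keep the detection compiler independent):

        unsigned int eax, ebx, ecx, edx;
        if (__get_cpuid(1, &eax, &ebx, &ecx, &edx))
                features = edx;
*/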

static sljit_u8 get_jump_code(sljit_s32 type)
{
        switch (type) {
        case SLJIT_EQUAL:
        case SLJIT_EQUAL_F64:
                return 0x84 /* je */;

        case SLJIT_NOT_EQUAL:
        case SLJIT_NOT_EQUAL_F64:
                return 0x85 /* jne */;

        case SLJIT_LESS:
        case SLJIT_LESS_F64:
                return 0x82 /* jc */;

        case SLJIT_GREATER_EQUAL:
        case SLJIT_GREATER_EQUAL_F64:
                return 0x83 /* jae */;

        case SLJIT_GREATER:
        case SLJIT_GREATER_F64:
                return 0x87 /* jnbe */;

        case SLJIT_LESS_EQUAL:
        case SLJIT_LESS_EQUAL_F64:
                return 0x86 /* jbe */;

        case SLJIT_SIG_LESS:
                return 0x8c /* jl */;

        case SLJIT_SIG_GREATER_EQUAL:
                return 0x8d /* jnl */;

        case SLJIT_SIG_GREATER:
                return 0x8f /* jnle */;

        case SLJIT_SIG_LESS_EQUAL:
                return 0x8e /* jle */;

        case SLJIT_OVERFLOW:
        case SLJIT_MUL_OVERFLOW:
                return 0x80 /* jo */;

        case SLJIT_NOT_OVERFLOW:
        case SLJIT_MUL_NOT_OVERFLOW:
                return 0x81 /* jno */;

        case SLJIT_UNORDERED_F64:
                return 0x8a /* jp */;

        case SLJIT_ORDERED_F64:
                return 0x8b /* jpo */;
        }
        return 0;
}
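
/* The values above are the second opcode byte of the two byte
   (0x0f 0x8?) near jump forms. The corresponding one byte short form is
   always this opcode minus 0x10 (e.g. je: 0x0f 0x84 near, 0x74 short),
   a relationship generate_near_jump_code below relies on. */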

static sljit_u8* generate_far_jump_code(struct sljit_jump *jump, sljit_u8 *code_ptr, sljit_s32 type);

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
static sljit_u8* generate_fixed_jump(sljit_u8 *code_ptr, sljit_sw addr, sljit_s32 type);
#endif

static sljit_u8* generate_near_jump_code(struct sljit_jump *jump, sljit_u8 *code_ptr, sljit_u8 *code, sljit_s32 type)
{
        sljit_s32 short_jump;
        sljit_uw label_addr;

        if (jump->flags & JUMP_LABEL)
                label_addr = (sljit_uw)(code + jump->u.label->size);
        else
                label_addr = jump->u.target;
        short_jump = (sljit_sw)(label_addr - (jump->addr + 2)) >= -128 && (sljit_sw)(label_addr - (jump->addr + 2)) <= 127;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
        if ((sljit_sw)(label_addr - (jump->addr + 1)) > HALFWORD_MAX || (sljit_sw)(label_addr - (jump->addr + 1)) < HALFWORD_MIN)
                return generate_far_jump_code(jump, code_ptr, type);
#endif

        if (type == SLJIT_JUMP) {
                if (short_jump)
                        *code_ptr++ = JMP_i8;
                else
                        *code_ptr++ = JMP_i32;
                jump->addr++;
        }
        else if (type >= SLJIT_FAST_CALL) {
                short_jump = 0;
                *code_ptr++ = CALL_i32;
                jump->addr++;
        }
        else if (short_jump) {
                *code_ptr++ = get_jump_code(type) - 0x10;
                jump->addr++;
        }
        else {
                *code_ptr++ = GROUP_0F;
                *code_ptr++ = get_jump_code(type);
                jump->addr += 2;
        }

        if (short_jump) {
                jump->flags |= PATCH_MB;
                code_ptr += sizeof(sljit_s8);
        } else {
                jump->flags |= PATCH_MW;
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                code_ptr += sizeof(sljit_sw);
#else
                code_ptr += sizeof(sljit_s32);
#endif
        }

        return code_ptr;
}
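
/* Note: the short_jump test above measures the distance from the end of a
   two byte "opcode + disp8" instruction (jump->addr + 2); the actual
   displacement is written later by sljit_generate_code, relative to the
   end of the displacement field recorded in jump->addr. */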

SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compiler)
{
        struct sljit_memory_fragment *buf;
        sljit_u8 *code;
        sljit_u8 *code_ptr;
        sljit_u8 *buf_ptr;
        sljit_u8 *buf_end;
        sljit_u8 len;

        struct sljit_label *label;
        struct sljit_jump *jump;
        struct sljit_const *const_;

        CHECK_ERROR_PTR();
        CHECK_PTR(check_sljit_generate_code(compiler));
        reverse_buf(compiler);

        /* Second code generation pass. */
        code = (sljit_u8*)SLJIT_MALLOC_EXEC(compiler->size);
        PTR_FAIL_WITH_EXEC_IF(code);
        buf = compiler->buf;

        code_ptr = code;
        label = compiler->labels;
        jump = compiler->jumps;
        const_ = compiler->consts;
        do {
                buf_ptr = buf->memory;
                buf_end = buf_ptr + buf->used_size;
                do {
                        len = *buf_ptr++;
                        if (len > 0) {
                                /* The code is already generated. */
                                SLJIT_MEMMOVE(code_ptr, buf_ptr, len);
                                code_ptr += len;
                                buf_ptr += len;
                        }
                        else {
                                if (*buf_ptr >= 4) {
                                        jump->addr = (sljit_uw)code_ptr;
                                        if (!(jump->flags & SLJIT_REWRITABLE_JUMP))
                                                code_ptr = generate_near_jump_code(jump, code_ptr, code, *buf_ptr - 4);
                                        else
                                                code_ptr = generate_far_jump_code(jump, code_ptr, *buf_ptr - 4);
                                        jump = jump->next;
                                }
                                else if (*buf_ptr == 0) {
                                        label->addr = (sljit_uw)code_ptr;
                                        label->size = code_ptr - code;
                                        label = label->next;
                                }
                                else if (*buf_ptr == 1) {
                                        const_->addr = ((sljit_uw)code_ptr) - sizeof(sljit_sw);
                                        const_ = const_->next;
                                }
                                else {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                                        *code_ptr++ = (*buf_ptr == 2) ? CALL_i32 : JMP_i32;
                                        buf_ptr++;
                                        *(sljit_sw*)code_ptr = *(sljit_sw*)buf_ptr - ((sljit_sw)code_ptr + sizeof(sljit_sw));
                                        code_ptr += sizeof(sljit_sw);
                                        buf_ptr += sizeof(sljit_sw) - 1;
#else
                                        code_ptr = generate_fixed_jump(code_ptr, *(sljit_sw*)(buf_ptr + 1), *buf_ptr);
                                        buf_ptr += sizeof(sljit_sw);
#endif
                                }
                                buf_ptr++;
                        }
                } while (buf_ptr < buf_end);
                SLJIT_ASSERT(buf_ptr == buf_end);
                buf = buf->next;
        } while (buf);

        SLJIT_ASSERT(!label);
        SLJIT_ASSERT(!jump);
        SLJIT_ASSERT(!const_);

        jump = compiler->jumps;
        while (jump) {
                if (jump->flags & PATCH_MB) {
                        SLJIT_ASSERT((sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_s8))) >= -128 && (sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_s8))) <= 127);
                        *(sljit_u8*)jump->addr = (sljit_u8)(jump->u.label->addr - (jump->addr + sizeof(sljit_s8)));
                } else if (jump->flags & PATCH_MW) {
                        if (jump->flags & JUMP_LABEL) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                                *(sljit_sw*)jump->addr = (sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_sw)));
#else
                                SLJIT_ASSERT((sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_s32))) >= HALFWORD_MIN && (sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_s32))) <= HALFWORD_MAX);
                                *(sljit_s32*)jump->addr = (sljit_s32)(jump->u.label->addr - (jump->addr + sizeof(sljit_s32)));
#endif
                        }
                        else {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                                *(sljit_sw*)jump->addr = (sljit_sw)(jump->u.target - (jump->addr + sizeof(sljit_sw)));
#else
                                SLJIT_ASSERT((sljit_sw)(jump->u.target - (jump->addr + sizeof(sljit_s32))) >= HALFWORD_MIN && (sljit_sw)(jump->u.target - (jump->addr + sizeof(sljit_s32))) <= HALFWORD_MAX);
                                *(sljit_s32*)jump->addr = (sljit_s32)(jump->u.target - (jump->addr + sizeof(sljit_s32)));
#endif
                        }
                }
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
                else if (jump->flags & PATCH_MD)
                        *(sljit_sw*)jump->addr = jump->u.label->addr;
#endif

                jump = jump->next;
        }

        /* Some space may be wasted because of short jumps. */
        SLJIT_ASSERT(code_ptr <= code + compiler->size);
        compiler->error = SLJIT_ERR_COMPILED;
        compiler->executable_size = code_ptr - code;
        return (void*)code;
}
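
/* Code generation is a two pass process. The first pass (the emit_*
   helpers below) appends instruction bytes to memory fragments, each
   record prefixed by its length, with zero length records describing
   labels, constants and jumps. The pass above concatenates the fragments
   into executable memory and, once all target addresses are known,
   patches the jump displacements in place. */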

/* --------------------------------------------------------------------- */
/*  Operators                                                            */
/* --------------------------------------------------------------------- */

static sljit_s32 emit_cum_binary(struct sljit_compiler *compiler,
        sljit_u8 op_rm, sljit_u8 op_mr, sljit_u8 op_imm, sljit_u8 op_eax_imm,
        sljit_s32 dst, sljit_sw dstw,
        sljit_s32 src1, sljit_sw src1w,
        sljit_s32 src2, sljit_sw src2w);

static sljit_s32 emit_non_cum_binary(struct sljit_compiler *compiler,
        sljit_u8 op_rm, sljit_u8 op_mr, sljit_u8 op_imm, sljit_u8 op_eax_imm,
        sljit_s32 dst, sljit_sw dstw,
        sljit_s32 src1, sljit_sw src1w,
        sljit_s32 src2, sljit_sw src2w);

static sljit_s32 emit_mov(struct sljit_compiler *compiler,
        sljit_s32 dst, sljit_sw dstw,
        sljit_s32 src, sljit_sw srcw);

static SLJIT_INLINE sljit_s32 emit_save_flags(struct sljit_compiler *compiler)
{
        sljit_u8 *inst;

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
        inst = (sljit_u8*)ensure_buf(compiler, 1 + 5);
        FAIL_IF(!inst);
        INC_SIZE(5);
#else
        inst = (sljit_u8*)ensure_buf(compiler, 1 + 6);
        FAIL_IF(!inst);
        INC_SIZE(6);
        *inst++ = REX_W;
#endif
        *inst++ = LEA_r_m; /* lea esp/rsp, [esp/rsp + sizeof(sljit_sw)] */
        *inst++ = 0x64;
        *inst++ = 0x24;
        *inst++ = (sljit_u8)sizeof(sljit_sw);
        *inst++ = PUSHF;
        compiler->flags_saved = 1;
        return SLJIT_SUCCESS;
}

static SLJIT_INLINE sljit_s32 emit_restore_flags(struct sljit_compiler *compiler, sljit_s32 keep_flags)
{
        sljit_u8 *inst;

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
        inst = (sljit_u8*)ensure_buf(compiler, 1 + 5);
        FAIL_IF(!inst);
        INC_SIZE(5);
        *inst++ = POPF;
#else
        inst = (sljit_u8*)ensure_buf(compiler, 1 + 6);
        FAIL_IF(!inst);
        INC_SIZE(6);
        *inst++ = POPF;
        *inst++ = REX_W;
#endif
        *inst++ = LEA_r_m; /* lea esp/rsp, [esp/rsp - sizeof(sljit_sw)] */
        *inst++ = 0x64;
        *inst++ = 0x24;
        *inst++ = (sljit_u8)(-(sljit_s8)sizeof(sljit_sw));
        compiler->flags_saved = keep_flags;
        return SLJIT_SUCCESS;
}
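
/* emit_save_flags stores the status flags in the word at the current
   stack top without changing the stack pointer overall: the lea moves
   esp/rsp one word up and pushf moves it back down while writing the
   flags. emit_restore_flags mirrors this with popf followed by the
   opposite lea. lea is used instead of add/sub precisely because lea
   does not modify the very flags being saved. */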

#ifdef _WIN32
#include <malloc.h>

static void SLJIT_CALL sljit_grow_stack(sljit_sw local_size)
{
        /* Workaround for calling the internal _chkstk() function on Windows.
        This function touches all 4k pages belonging to the requested stack space,
        whose size is passed in local_size. This is necessary on Windows, where
        the stack can only grow in 4k steps. However, this function just burns
        CPU cycles if the stack is already large enough. Unfortunately, that cannot
        be known in advance, so it must always be called. I think this is a bad
        design in general, even if it has some reasons. */
        *(volatile sljit_s32*)alloca(local_size) = 0;
}

#endif

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
#include "sljitNativeX86_32.c"
#else
#include "sljitNativeX86_64.c"
#endif

static sljit_s32 emit_mov(struct sljit_compiler *compiler,
        sljit_s32 dst, sljit_sw dstw,
        sljit_s32 src, sljit_sw srcw)
{
        sljit_u8* inst;

        if (dst == SLJIT_UNUSED) {
                /* No destination, so there is no need to set up flags. */
                if (src & SLJIT_MEM) {
                        inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src, srcw);
                        FAIL_IF(!inst);
                        *inst = MOV_r_rm;
                }
                return SLJIT_SUCCESS;
        }
        if (FAST_IS_REG(src)) {
                inst = emit_x86_instruction(compiler, 1, src, 0, dst, dstw);
                FAIL_IF(!inst);
                *inst = MOV_rm_r;
                return SLJIT_SUCCESS;
        }
        if (src & SLJIT_IMM) {
                if (FAST_IS_REG(dst)) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                        return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
#else
                        if (!compiler->mode32) {
                                if (NOT_HALFWORD(srcw))
                                        return emit_load_imm64(compiler, dst, srcw);
                        }
                        else
                                return emit_do_imm32(compiler, (reg_map[dst] >= 8) ? REX_B : 0, MOV_r_i32 + reg_lmap[dst], srcw);
#endif
                }
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
                if (!compiler->mode32 && NOT_HALFWORD(srcw)) {
                        FAIL_IF(emit_load_imm64(compiler, TMP_REG2, srcw));
                        inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, dst, dstw);
                        FAIL_IF(!inst);
                        *inst = MOV_rm_r;
                        return SLJIT_SUCCESS;
                }
#endif
                inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, dstw);
                FAIL_IF(!inst);
                *inst = MOV_rm_i32;
                return SLJIT_SUCCESS;
        }
        if (FAST_IS_REG(dst)) {
                inst = emit_x86_instruction(compiler, 1, dst, 0, src, srcw);
                FAIL_IF(!inst);
                *inst = MOV_r_rm;
                return SLJIT_SUCCESS;
        }

        /* Memory to memory move. Requires two instructions. */
        inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src, srcw);
        FAIL_IF(!inst);
        *inst = MOV_r_rm;
        inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
        FAIL_IF(!inst);
        *inst = MOV_rm_r;
        return SLJIT_SUCCESS;
}

#define EMIT_MOV(compiler, dst, dstw, src, srcw) \
        FAIL_IF(emit_mov(compiler, dst, dstw, src, srcw));
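
/* EMIT_MOV is a convenience wrapper that propagates failures, e.g.
   EMIT_MOV(compiler, TMP_REG1, 0, src, srcw) loads any operand form
   (register, immediate or memory) into the temporary register. */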

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compiler, sljit_s32 op)
{
        sljit_u8 *inst;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
        sljit_s32 size;
#endif

        CHECK_ERROR();
        CHECK(check_sljit_emit_op0(compiler, op));

        switch (GET_OPCODE(op)) {
        case SLJIT_BREAKPOINT:
                inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
                FAIL_IF(!inst);
                INC_SIZE(1);
                *inst = INT3;
                break;
        case SLJIT_NOP:
                inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
                FAIL_IF(!inst);
                INC_SIZE(1);
                *inst = NOP;
                break;
        case SLJIT_LMUL_UW:
        case SLJIT_LMUL_SW:
        case SLJIT_DIVMOD_UW:
        case SLJIT_DIVMOD_SW:
        case SLJIT_DIV_UW:
        case SLJIT_DIV_SW:
                compiler->flags_saved = 0;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
#ifdef _WIN64
                SLJIT_COMPILE_ASSERT(
                        reg_map[SLJIT_R0] == 0
                        && reg_map[SLJIT_R1] == 2
                        && reg_map[TMP_REG1] > 7,
                        invalid_register_assignment_for_div_mul);
#else
                SLJIT_COMPILE_ASSERT(
                        reg_map[SLJIT_R0] == 0
                        && reg_map[SLJIT_R1] < 7
                        && reg_map[TMP_REG1] == 2,
                        invalid_register_assignment_for_div_mul);
#endif
                compiler->mode32 = op & SLJIT_I32_OP;
#endif
                SLJIT_COMPILE_ASSERT((SLJIT_DIVMOD_UW & 0x2) == 0 && SLJIT_DIV_UW - 0x2 == SLJIT_DIVMOD_UW, bad_div_opcode_assignments);

                op = GET_OPCODE(op);
                if ((op | 0x2) == SLJIT_DIV_UW) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) || defined(_WIN64)
                        EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_R1, 0);
                        inst = emit_x86_instruction(compiler, 1, SLJIT_R1, 0, SLJIT_R1, 0);
#else
                        inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
#endif
                        FAIL_IF(!inst);
                        *inst = XOR_r_rm;
                }

                if ((op | 0x2) == SLJIT_DIV_SW) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) || defined(_WIN64)
                        EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_R1, 0);
#endif

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                        inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
                        FAIL_IF(!inst);
                        INC_SIZE(1);
                        *inst = CDQ;
#else
                        if (compiler->mode32) {
                                inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
                                FAIL_IF(!inst);
                                INC_SIZE(1);
                                *inst = CDQ;
                        } else {
                                inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
                                FAIL_IF(!inst);
                                INC_SIZE(2);
                                *inst++ = REX_W;
                                *inst = CDQ;
                        }
#endif
                }

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
                FAIL_IF(!inst);
                INC_SIZE(2);
                *inst++ = GROUP_F7;
                *inst = MOD_REG | ((op >= SLJIT_DIVMOD_UW) ? reg_map[TMP_REG1] : reg_map[SLJIT_R1]);
#else
#ifdef _WIN64
                size = (!compiler->mode32 || op >= SLJIT_DIVMOD_UW) ? 3 : 2;
#else
                size = (!compiler->mode32) ? 3 : 2;
#endif
                inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
                FAIL_IF(!inst);
                INC_SIZE(size);
#ifdef _WIN64
                if (!compiler->mode32)
                        *inst++ = REX_W | ((op >= SLJIT_DIVMOD_UW) ? REX_B : 0);
                else if (op >= SLJIT_DIVMOD_UW)
                        *inst++ = REX_B;
                *inst++ = GROUP_F7;
                *inst = MOD_REG | ((op >= SLJIT_DIVMOD_UW) ? reg_lmap[TMP_REG1] : reg_lmap[SLJIT_R1]);
#else
                if (!compiler->mode32)
                        *inst++ = REX_W;
                *inst++ = GROUP_F7;
                *inst = MOD_REG | reg_map[SLJIT_R1];
#endif
#endif
                switch (op) {
                case SLJIT_LMUL_UW:
                        *inst |= MUL;
                        break;
                case SLJIT_LMUL_SW:
                        *inst |= IMUL;
                        break;
                case SLJIT_DIVMOD_UW:
                case SLJIT_DIV_UW:
                        *inst |= DIV;
                        break;
                case SLJIT_DIVMOD_SW:
                case SLJIT_DIV_SW:
                        *inst |= IDIV;
                        break;
                }
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) && !defined(_WIN64)
                if (op <= SLJIT_DIVMOD_SW)
                        EMIT_MOV(compiler, SLJIT_R1, 0, TMP_REG1, 0);
#else
                if (op >= SLJIT_DIV_UW)
                        EMIT_MOV(compiler, SLJIT_R1, 0, TMP_REG1, 0);
#endif
                break;
        }

        return SLJIT_SUCCESS;
}

#define ENCODE_PREFIX(prefix) \
        do { \
                inst = (sljit_u8*)ensure_buf(compiler, 1 + 1); \
                FAIL_IF(!inst); \
                INC_SIZE(1); \
                *inst = (prefix); \
        } while (0)
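
/* ENCODE_PREFIX emits a single raw byte. It is used below with
   XCHG_EAX_r + reg_map[r], the one byte 0x90+r form of "xchg eax, reg",
   to swap EAX with the temporary register around a byte store on x86-32. */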

static sljit_s32 emit_mov_byte(struct sljit_compiler *compiler, sljit_s32 sign,
        sljit_s32 dst, sljit_sw dstw,
        sljit_s32 src, sljit_sw srcw)
{
        sljit_u8* inst;
        sljit_s32 dst_r;
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
        sljit_s32 work_r;
#endif

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
        compiler->mode32 = 0;
#endif

        if (dst == SLJIT_UNUSED && !(src & SLJIT_MEM))
                return SLJIT_SUCCESS; /* Empty instruction. */

        if (src & SLJIT_IMM) {
                if (FAST_IS_REG(dst)) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                        return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
#else
                        inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, 0);
                        FAIL_IF(!inst);
                        *inst = MOV_rm_i32;
                        return SLJIT_SUCCESS;
#endif
                }
                inst = emit_x86_instruction(compiler, 1 | EX86_BYTE_ARG | EX86_NO_REXW, SLJIT_IMM, srcw, dst, dstw);
                FAIL_IF(!inst);
                *inst = MOV_rm8_i8;
                return SLJIT_SUCCESS;
        }

        dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;

        if ((dst & SLJIT_MEM) && FAST_IS_REG(src)) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                if (reg_map[src] >= 4) {
                        SLJIT_ASSERT(dst_r == TMP_REG1);
                        EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
                } else
                        dst_r = src;
#else
                dst_r = src;
#endif
        }
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
        else if (FAST_IS_REG(src) && reg_map[src] >= 4) {
                /* Both src and dst are registers. */
                SLJIT_ASSERT(SLOW_IS_REG(dst));
                if (reg_map[dst] < 4) {
                        if (dst != src)
                                EMIT_MOV(compiler, dst, 0, src, 0);
                        inst = emit_x86_instruction(compiler, 2, dst, 0, dst, 0);
                        FAIL_IF(!inst);
                        *inst++ = GROUP_0F;
                        *inst = sign ? MOVSX_r_rm8 : MOVZX_r_rm8;
                }
                else {
                        if (dst != src)
                                EMIT_MOV(compiler, dst, 0, src, 0);
                        if (sign) {
                                /* shl reg, 24 */
                                inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 24, dst, 0);
                                FAIL_IF(!inst);
                                *inst |= SHL;
                                /* sar reg, 24 */
                                inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 24, dst, 0);
                                FAIL_IF(!inst);
                                *inst |= SAR;
                        }
                        else {
                                inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 0xff, dst, 0);
                                FAIL_IF(!inst);
                                *(inst + 1) |= AND;
                        }
                }
                return SLJIT_SUCCESS;
        }
#endif
        else {
                /* src is either a memory operand or a register with reg_map[src] < 4 on x86-32. */
                inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
                FAIL_IF(!inst);
                *inst++ = GROUP_0F;
                *inst = sign ? MOVSX_r_rm8 : MOVZX_r_rm8;
        }

        if (dst & SLJIT_MEM) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                if (dst_r == TMP_REG1) {
                        /* Find an unused register whose reg_map value is < 4. */
                        if ((dst & REG_MASK) == SLJIT_R0) {
                                if ((dst & OFFS_REG_MASK) == TO_OFFS_REG(SLJIT_R1))
                                        work_r = SLJIT_R2;
                                else
                                        work_r = SLJIT_R1;
                        }
                        else {
                                if ((dst & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_R0))
                                        work_r = SLJIT_R0;
                                else if ((dst & REG_MASK) == SLJIT_R1)
                                        work_r = SLJIT_R2;
                                else
                                        work_r = SLJIT_R1;
                        }

                        if (work_r == SLJIT_R0) {
                                ENCODE_PREFIX(XCHG_EAX_r + reg_map[TMP_REG1]);
                        }
                        else {
                                inst = emit_x86_instruction(compiler, 1, work_r, 0, dst_r, 0);
                                FAIL_IF(!inst);
                                *inst = XCHG_r_rm;
                        }

                        inst = emit_x86_instruction(compiler, 1, work_r, 0, dst, dstw);
                        FAIL_IF(!inst);
                        *inst = MOV_rm8_r8;

                        if (work_r == SLJIT_R0) {
                                ENCODE_PREFIX(XCHG_EAX_r + reg_map[TMP_REG1]);
                        }
                        else {
                                inst = emit_x86_instruction(compiler, 1, work_r, 0, dst_r, 0);
                                FAIL_IF(!inst);
                                *inst = XCHG_r_rm;
                        }
                }
                else {
                        inst = emit_x86_instruction(compiler, 1, dst_r, 0, dst, dstw);
                        FAIL_IF(!inst);
                        *inst = MOV_rm8_r8;
                }
#else
                inst = emit_x86_instruction(compiler, 1 | EX86_REX | EX86_NO_REXW, dst_r, 0, dst, dstw);
                FAIL_IF(!inst);
                *inst = MOV_rm8_r8;
#endif
        }

        return SLJIT_SUCCESS;
}
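
/* Note: without a REX prefix only AL, CL, DL and BL are encodable as
   byte registers (reg_map value < 4). This is why the x86-32 paths above
   either shuffle the value through a low register (xchg with EAX or a
   free work register) or fall back to shl+sar for sign extension and
   "and 0xff" for zero extension. */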

static sljit_s32 emit_mov_half(struct sljit_compiler *compiler, sljit_s32 sign,
        sljit_s32 dst, sljit_sw dstw,
        sljit_s32 src, sljit_sw srcw)
{
        sljit_u8* inst;
        sljit_s32 dst_r;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
        compiler->mode32 = 0;
#endif

        if (dst == SLJIT_UNUSED && !(src & SLJIT_MEM))
                return SLJIT_SUCCESS; /* Empty instruction. */

        if (src & SLJIT_IMM) {
                if (FAST_IS_REG(dst)) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                        return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
#else
                        inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, 0);
                        FAIL_IF(!inst);
                        *inst = MOV_rm_i32;
                        return SLJIT_SUCCESS;
#endif
                }
                inst = emit_x86_instruction(compiler, 1 | EX86_HALF_ARG | EX86_NO_REXW | EX86_PREF_66, SLJIT_IMM, srcw, dst, dstw);
                FAIL_IF(!inst);
                *inst = MOV_rm_i32;
                return SLJIT_SUCCESS;
        }

        dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;

        if ((dst & SLJIT_MEM) && FAST_IS_REG(src))
                dst_r = src;
        else {
                inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
                FAIL_IF(!inst);
                *inst++ = GROUP_0F;
                *inst = sign ? MOVSX_r_rm16 : MOVZX_r_rm16;
        }

        if (dst & SLJIT_MEM) {
                inst = emit_x86_instruction(compiler, 1 | EX86_NO_REXW | EX86_PREF_66, dst_r, 0, dst, dstw);
                FAIL_IF(!inst);
                *inst = MOV_rm_r;
        }

        return SLJIT_SUCCESS;
}

static sljit_s32 emit_unary(struct sljit_compiler *compiler, sljit_u8 opcode,
        sljit_s32 dst, sljit_sw dstw,
        sljit_s32 src, sljit_sw srcw)
{
        sljit_u8* inst;

        if (dst == SLJIT_UNUSED) {
                EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
                inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
                FAIL_IF(!inst);
                *inst++ = GROUP_F7;
                *inst |= opcode;
                return SLJIT_SUCCESS;
        }
        if (dst == src && dstw == srcw) {
                /* Same input and output. */
                inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
                FAIL_IF(!inst);
                *inst++ = GROUP_F7;
                *inst |= opcode;
                return SLJIT_SUCCESS;
        }
        if (FAST_IS_REG(dst)) {
                EMIT_MOV(compiler, dst, 0, src, srcw);
                inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
                FAIL_IF(!inst);
                *inst++ = GROUP_F7;
                *inst |= opcode;
                return SLJIT_SUCCESS;
        }
        EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
        inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
        FAIL_IF(!inst);
        *inst++ = GROUP_F7;
        *inst |= opcode;
        EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
        return SLJIT_SUCCESS;
}

static sljit_s32 emit_not_with_flags(struct sljit_compiler *compiler,
        sljit_s32 dst, sljit_sw dstw,
        sljit_s32 src, sljit_sw srcw)
{
        sljit_u8* inst;

        if (dst == SLJIT_UNUSED) {
                EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
                inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
                FAIL_IF(!inst);
                *inst++ = GROUP_F7;
                *inst |= NOT_rm;
                inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
                FAIL_IF(!inst);
                *inst = OR_r_rm;
                return SLJIT_SUCCESS;
        }
        if (FAST_IS_REG(dst)) {
                EMIT_MOV(compiler, dst, 0, src, srcw);
                inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
                FAIL_IF(!inst);
                *inst++ = GROUP_F7;
                *inst |= NOT_rm;
                inst = emit_x86_instruction(compiler, 1, dst, 0, dst, 0);
                FAIL_IF(!inst);
                *inst = OR_r_rm;
                return SLJIT_SUCCESS;
        }
        EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
        inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
        FAIL_IF(!inst);
        *inst++ = GROUP_F7;
        *inst |= NOT_rm;
        inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
        FAIL_IF(!inst);
        *inst = OR_r_rm;
        EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
        return SLJIT_SUCCESS;
}
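
/* NOT itself does not update the status flags, so when the zero flag is
   requested the result is or-ed with itself (or reg, reg): OR with an
   identical operand leaves the value unchanged but sets the flags. */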

static sljit_s32 emit_clz(struct sljit_compiler *compiler, sljit_s32 op_flags,
        sljit_s32 dst, sljit_sw dstw,
        sljit_s32 src, sljit_sw srcw)
{
        sljit_u8* inst;
        sljit_s32 dst_r;

        SLJIT_UNUSED_ARG(op_flags);
        if (SLJIT_UNLIKELY(dst == SLJIT_UNUSED)) {
                /* Just set the zero flag. */
                EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
                inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
                FAIL_IF(!inst);
                *inst++ = GROUP_F7;
                *inst |= NOT_rm;
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 31, TMP_REG1, 0);
#else
                inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, !(op_flags & SLJIT_I32_OP) ? 63 : 31, TMP_REG1, 0);
#endif
                FAIL_IF(!inst);
                *inst |= SHR;
                return SLJIT_SUCCESS;
        }

        if (SLJIT_UNLIKELY(src & SLJIT_IMM)) {
                EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcw);
                src = TMP_REG1;
                srcw = 0;
        }

        inst = emit_x86_instruction(compiler, 2, TMP_REG1, 0, src, srcw);
        FAIL_IF(!inst);
        *inst++ = GROUP_0F;
        *inst = BSR_r_rm;

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
        if (FAST_IS_REG(dst))
                dst_r = dst;
        else {
                /* Find an unused temporary register. */
                if ((dst & REG_MASK) != SLJIT_R0 && (dst & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_R0))
                        dst_r = SLJIT_R0;
                else if ((dst & REG_MASK) != SLJIT_R1 && (dst & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_R1))
                        dst_r = SLJIT_R1;
                else
                        dst_r = SLJIT_R2;
                EMIT_MOV(compiler, dst, dstw, dst_r, 0);
        }
        EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, 32 + 31);
#else
        dst_r = FAST_IS_REG(dst) ? dst : TMP_REG2;
        compiler->mode32 = 0;
        EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, !(op_flags & SLJIT_I32_OP) ? 64 + 63 : 32 + 31);
        compiler->mode32 = op_flags & SLJIT_I32_OP;
#endif

        if (cpu_has_cmov == -1)
                get_cpu_features();

        if (cpu_has_cmov) {
                inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG1, 0);
                FAIL_IF(!inst);
                *inst++ = GROUP_0F;
                *inst = CMOVNE_r_rm;
        } else {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
                FAIL_IF(!inst);
                INC_SIZE(4);

                *inst++ = JE_i8;
                *inst++ = 2;
                *inst++ = MOV_r_rm;
                *inst++ = MOD_REG | (reg_map[dst_r] << 3) | reg_map[TMP_REG1];
#else
                inst = (sljit_u8*)ensure_buf(compiler, 1 + 5);
                FAIL_IF(!inst);
                INC_SIZE(5);

                *inst++ = JE_i8;
                *inst++ = 3;
                *inst++ = REX_W | (reg_map[dst_r] >= 8 ? REX_R : 0) | (reg_map[TMP_REG1] >= 8 ? REX_B : 0);
                *inst++ = MOV_r_rm;
                *inst++ = MOD_REG | (reg_lmap[dst_r] << 3) | reg_lmap[TMP_REG1];
#endif
        }

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
        inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 31, dst_r, 0);
#else
        inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, !(op_flags & SLJIT_I32_OP) ? 63 : 31, dst_r, 0);
#endif
        FAIL_IF(!inst);
        *(inst + 1) |= XOR;

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
        if (dst & SLJIT_MEM) {
                inst = emit_x86_instruction(compiler, 1, dst_r, 0, dst, dstw);
                FAIL_IF(!inst);
                *inst = XCHG_r_rm;
        }
#else
        if (dst & SLJIT_MEM)
                EMIT_MOV(compiler, dst, dstw, TMP_REG2, 0);
#endif
        return SLJIT_SUCCESS;
}
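
/* The CLZ lowering above relies on bsr returning the index of the highest
   set bit: for a nonzero 32 bit x, clz(x) == 31 - bsr(x), and since
   bsr(x) is in [0, 31] the subtraction can be computed as bsr(x) ^ 31
   (the xor emitted above; 63 in 64 bit mode). bsr leaves its destination
   undefined when the source is zero, so the destination is preloaded
   with 32 + 31 (or 64 + 63), which the final xor turns into 32 (or 64);
   cmovne, or the je-guarded mov on CPUs without cmov, overwrites that
   preload only when the source was nonzero. */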

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compiler, sljit_s32 op,
        sljit_s32 dst, sljit_sw dstw,
        sljit_s32 src, sljit_sw srcw)
{
        sljit_u8* inst;
        sljit_s32 update = 0;
        sljit_s32 op_flags = GET_ALL_FLAGS(op);
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
        sljit_s32 dst_is_ereg = 0;
        sljit_s32 src_is_ereg = 0;
#else
#       define src_is_ereg 0
#endif

        CHECK_ERROR();
        CHECK(check_sljit_emit_op1(compiler, op, dst, dstw, src, srcw));
        ADJUST_LOCAL_OFFSET(dst, dstw);
        ADJUST_LOCAL_OFFSET(src, srcw);

        CHECK_EXTRA_REGS(dst, dstw, dst_is_ereg = 1);
        CHECK_EXTRA_REGS(src, srcw, src_is_ereg = 1);
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
        compiler->mode32 = op_flags & SLJIT_I32_OP;
#endif

        op = GET_OPCODE(op);
        if (op >= SLJIT_MOV && op <= SLJIT_MOVU_P) {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
                compiler->mode32 = 0;
#endif

                if (op_flags & SLJIT_I32_OP) {
                        if (FAST_IS_REG(src) && src == dst) {
                                if (!TYPE_CAST_NEEDED(op))
                                        return SLJIT_SUCCESS;
                        }
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
                        if (op == SLJIT_MOV_S32 && (src & SLJIT_MEM))
                                op = SLJIT_MOV_U32;
                        if (op == SLJIT_MOVU_S32 && (src & SLJIT_MEM))
                                op = SLJIT_MOVU_U32;
                        if (op == SLJIT_MOV_U32 && (src & SLJIT_IMM))
                                op = SLJIT_MOV_S32;
                        if (op == SLJIT_MOVU_U32 && (src & SLJIT_IMM))
                                op = SLJIT_MOVU_S32;
#endif
                }

                SLJIT_COMPILE_ASSERT(SLJIT_MOV + 8 == SLJIT_MOVU, movu_offset);
                if (op >= SLJIT_MOVU) {
                        update = 1;
                        op -= 8;
                }

                if (src & SLJIT_IMM) {
                        switch (op) {
                        case SLJIT_MOV_U8:
                                srcw = (sljit_u8)srcw;
                                break;
                        case SLJIT_MOV_S8:
                                srcw = (sljit_s8)srcw;
                                break;
                        case SLJIT_MOV_U16:
                                srcw = (sljit_u16)srcw;
                                break;
                        case SLJIT_MOV_S16:
                                srcw = (sljit_s16)srcw;
                                break;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
                        case SLJIT_MOV_U32:
                                srcw = (sljit_u32)srcw;
                                break;
                        case SLJIT_MOV_S32:
                                srcw = (sljit_s32)srcw;
                                break;
#endif
                        }
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                        if (SLJIT_UNLIKELY(dst_is_ereg))
                                return emit_mov(compiler, dst, dstw, src, srcw);
#endif
                }

                if (SLJIT_UNLIKELY(update) && (src & SLJIT_MEM) && !src_is_ereg && (src & REG_MASK) && (srcw != 0 || (src & OFFS_REG_MASK) != 0)) {
                        inst = emit_x86_instruction(compiler, 1, src & REG_MASK, 0, src, srcw);
                        FAIL_IF(!inst);
                        *inst = LEA_r_m;
                        src &= SLJIT_MEM | 0xf;
                        srcw = 0;
                }

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                if (SLJIT_UNLIKELY(dst_is_ereg) && (!(op == SLJIT_MOV || op == SLJIT_MOV_U32 || op == SLJIT_MOV_S32 || op == SLJIT_MOV_P) || (src & SLJIT_MEM))) {
                        SLJIT_ASSERT(dst == SLJIT_MEM1(SLJIT_SP));
                        dst = TMP_REG1;
                }
#endif

                switch (op) {
                case SLJIT_MOV:
                case SLJIT_MOV_P:
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                case SLJIT_MOV_U32:
                case SLJIT_MOV_S32:
#endif
                        FAIL_IF(emit_mov(compiler, dst, dstw, src, srcw));
                        break;
                case SLJIT_MOV_U8:
                        FAIL_IF(emit_mov_byte(compiler, 0, dst, dstw, src, srcw));
                        break;
                case SLJIT_MOV_S8:
                        FAIL_IF(emit_mov_byte(compiler, 1, dst, dstw, src, srcw));
                        break;
                case SLJIT_MOV_U16:
                        FAIL_IF(emit_mov_half(compiler, 0, dst, dstw, src, srcw));
                        break;
                case SLJIT_MOV_S16:
                        FAIL_IF(emit_mov_half(compiler, 1, dst, dstw, src, srcw));
                        break;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
                case SLJIT_MOV_U32:
                        FAIL_IF(emit_mov_int(compiler, 0, dst, dstw, src, srcw));
                        break;
                case SLJIT_MOV_S32:
                        FAIL_IF(emit_mov_int(compiler, 1, dst, dstw, src, srcw));
                        break;
#endif
                }

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                if (SLJIT_UNLIKELY(dst_is_ereg) && dst == TMP_REG1)
                        return emit_mov(compiler, SLJIT_MEM1(SLJIT_SP), dstw, TMP_REG1, 0);
#endif

                if (SLJIT_UNLIKELY(update) && (dst & SLJIT_MEM) && (dst & REG_MASK) && (dstw != 0 || (dst & OFFS_REG_MASK) != 0)) {
                        inst = emit_x86_instruction(compiler, 1, dst & REG_MASK, 0, dst, dstw);
                        FAIL_IF(!inst);
                        *inst = LEA_r_m;
                }
                return SLJIT_SUCCESS;
        }

        if (SLJIT_UNLIKELY(GET_FLAGS(op_flags)))
                compiler->flags_saved = 0;

        switch (op) {
        case SLJIT_NOT:
                if (SLJIT_UNLIKELY(op_flags & SLJIT_SET_E))
                        return emit_not_with_flags(compiler, dst, dstw, src, srcw);
                return emit_unary(compiler, NOT_rm, dst, dstw, src, srcw);

        case SLJIT_NEG:
                if (SLJIT_UNLIKELY(op_flags & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
                        FAIL_IF(emit_save_flags(compiler));
                return emit_unary(compiler, NEG_rm, dst, dstw, src, srcw);

        case SLJIT_CLZ:
                if (SLJIT_UNLIKELY(op_flags & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
                        FAIL_IF(emit_save_flags(compiler));
                return emit_clz(compiler, op_flags, dst, dstw, src, srcw);
        }

        return SLJIT_SUCCESS;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
#       undef src_is_ereg
#endif
}
1426
1427 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1428
1429 #define BINARY_IMM(op_imm, op_mr, immw, arg, argw) \
1430         if (IS_HALFWORD(immw) || compiler->mode32) { \
1431                 inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, immw, arg, argw); \
1432                 FAIL_IF(!inst); \
1433                 *(inst + 1) |= (op_imm); \
1434         } \
1435         else { \
1436                 FAIL_IF(emit_load_imm64(compiler, TMP_REG2, immw)); \
1437                 inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, arg, argw); \
1438                 FAIL_IF(!inst); \
1439                 *inst = (op_mr); \
1440         }
1441
1442 #define BINARY_EAX_IMM(op_eax_imm, immw) \
1443         FAIL_IF(emit_do_imm32(compiler, (!compiler->mode32) ? REX_W : 0, (op_eax_imm), immw))
1444
1445 #else
1446
1447 #define BINARY_IMM(op_imm, op_mr, immw, arg, argw) \
1448         inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, immw, arg, argw); \
1449         FAIL_IF(!inst); \
1450         *(inst + 1) |= (op_imm);
1451
1452 #define BINARY_EAX_IMM(op_eax_imm, immw) \
1453         FAIL_IF(emit_do_imm(compiler, (op_eax_imm), immw))
1454
1455 #endif
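
/* Illustrative sketch (comment only, not compiled): on x86-64 an ALU
   instruction can encode at most a sign-extended 32 bit immediate, so
   BINARY_IMM falls back to a scratch load for wider values:

       BINARY_IMM(ADD, ADD_rm_r, 0x12345678, arg, argw);
           -> add arg, 0x12345678              (imm32 form, op_imm)

       BINARY_IMM(ADD, ADD_rm_r, 0x123456789, arg, argw);
           -> mov TMP_REG2, 0x123456789        (emit_load_imm64)
              add arg, TMP_REG2                (register form, op_mr)

   On x86-32 every immediate fits in 32 bits, hence the simpler
   variant without the fallback path. */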
1456
1457 static sljit_s32 emit_cum_binary(struct sljit_compiler *compiler,
1458         sljit_u8 op_rm, sljit_u8 op_mr, sljit_u8 op_imm, sljit_u8 op_eax_imm,
1459         sljit_s32 dst, sljit_sw dstw,
1460         sljit_s32 src1, sljit_sw src1w,
1461         sljit_s32 src2, sljit_sw src2w)
1462 {
1463         sljit_u8* inst;
1464
1465         if (dst == SLJIT_UNUSED) {
1466                 EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1467                 if (src2 & SLJIT_IMM) {
1468                         BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
1469                 }
1470                 else {
1471                         inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1472                         FAIL_IF(!inst);
1473                         *inst = op_rm;
1474                 }
1475                 return SLJIT_SUCCESS;
1476         }
1477
1478         if (dst == src1 && dstw == src1w) {
1479                 if (src2 & SLJIT_IMM) {
1480 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1481                         if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1482 #else
1483                         if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128)) {
1484 #endif
1485                                 BINARY_EAX_IMM(op_eax_imm, src2w);
1486                         }
1487                         else {
1488                                 BINARY_IMM(op_imm, op_mr, src2w, dst, dstw);
1489                         }
1490                 }
1491                 else if (FAST_IS_REG(dst)) {
1492                         inst = emit_x86_instruction(compiler, 1, dst, dstw, src2, src2w);
1493                         FAIL_IF(!inst);
1494                         *inst = op_rm;
1495                 }
1496                 else if (FAST_IS_REG(src2)) {
1497                         /* Special exception for sljit_emit_op_flags. */
1498                         inst = emit_x86_instruction(compiler, 1, src2, src2w, dst, dstw);
1499                         FAIL_IF(!inst);
1500                         *inst = op_mr;
1501                 }
1502                 else {
1503                         EMIT_MOV(compiler, TMP_REG1, 0, src2, src2w);
1504                         inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
1505                         FAIL_IF(!inst);
1506                         *inst = op_mr;
1507                 }
1508                 return SLJIT_SUCCESS;
1509         }
1510
1511         /* Only for cumulative operations. */
1512         if (dst == src2 && dstw == src2w) {
1513                 if (src1 & SLJIT_IMM) {
1514 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1515                         if ((dst == SLJIT_R0) && (src1w > 127 || src1w < -128) && (compiler->mode32 || IS_HALFWORD(src1w))) {
1516 #else
1517                         if ((dst == SLJIT_R0) && (src1w > 127 || src1w < -128)) {
1518 #endif
1519                                 BINARY_EAX_IMM(op_eax_imm, src1w);
1520                         }
1521                         else {
1522                                 BINARY_IMM(op_imm, op_mr, src1w, dst, dstw);
1523                         }
1524                 }
1525                 else if (FAST_IS_REG(dst)) {
1526                         inst = emit_x86_instruction(compiler, 1, dst, dstw, src1, src1w);
1527                         FAIL_IF(!inst);
1528                         *inst = op_rm;
1529                 }
1530                 else if (FAST_IS_REG(src1)) {
1531                         inst = emit_x86_instruction(compiler, 1, src1, src1w, dst, dstw);
1532                         FAIL_IF(!inst);
1533                         *inst = op_mr;
1534                 }
1535                 else {
1536                         EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1537                         inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
1538                         FAIL_IF(!inst);
1539                         *inst = op_mr;
1540                 }
1541                 return SLJIT_SUCCESS;
1542         }
1543
1544         /* General version. */
1545         if (FAST_IS_REG(dst)) {
1546                 EMIT_MOV(compiler, dst, 0, src1, src1w);
1547                 if (src2 & SLJIT_IMM) {
1548                         BINARY_IMM(op_imm, op_mr, src2w, dst, 0);
1549                 }
1550                 else {
1551                         inst = emit_x86_instruction(compiler, 1, dst, 0, src2, src2w);
1552                         FAIL_IF(!inst);
1553                         *inst = op_rm;
1554                 }
1555         }
1556         else {
1557                 /* This version requires fewer memory writes. */
1558                 EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1559                 if (src2 & SLJIT_IMM) {
1560                         BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
1561                 }
1562                 else {
1563                         inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1564                         FAIL_IF(!inst);
1565                         *inst = op_rm;
1566                 }
1567                 EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1568         }
1569
1570         return SLJIT_SUCCESS;
1571 }
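
/* Usage sketch (mirrors the dispatch in sljit_emit_op2 below): each
   caller passes the four encodings of one commutative ALU operation,
   e.g. for addition:

       emit_cum_binary(compiler, ADD_r_rm, ADD_rm_r, ADD, ADD_EAX_i32,
               dst, dstw, src1, src1w, src2, src2w);

   and the helper picks the cheapest form: the short EAX/RAX immediate
   encoding, a ModRM immediate, or a plain register-register opcode. */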
1572
1573 static sljit_s32 emit_non_cum_binary(struct sljit_compiler *compiler,
1574         sljit_u8 op_rm, sljit_u8 op_mr, sljit_u8 op_imm, sljit_u8 op_eax_imm,
1575         sljit_s32 dst, sljit_sw dstw,
1576         sljit_s32 src1, sljit_sw src1w,
1577         sljit_s32 src2, sljit_sw src2w)
1578 {
1579         sljit_u8* inst;
1580
1581         if (dst == SLJIT_UNUSED) {
1582                 EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1583                 if (src2 & SLJIT_IMM) {
1584                         BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
1585                 }
1586                 else {
1587                         inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1588                         FAIL_IF(!inst);
1589                         *inst = op_rm;
1590                 }
1591                 return SLJIT_SUCCESS;
1592         }
1593
1594         if (dst == src1 && dstw == src1w) {
1595                 if (src2 & SLJIT_IMM) {
1596 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1597                         if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1598 #else
1599                         if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128)) {
1600 #endif
1601                                 BINARY_EAX_IMM(op_eax_imm, src2w);
1602                         }
1603                         else {
1604                                 BINARY_IMM(op_imm, op_mr, src2w, dst, dstw);
1605                         }
1606                 }
1607                 else if (FAST_IS_REG(dst)) {
1608                         inst = emit_x86_instruction(compiler, 1, dst, dstw, src2, src2w);
1609                         FAIL_IF(!inst);
1610                         *inst = op_rm;
1611                 }
1612                 else if (FAST_IS_REG(src2)) {
1613                         inst = emit_x86_instruction(compiler, 1, src2, src2w, dst, dstw);
1614                         FAIL_IF(!inst);
1615                         *inst = op_mr;
1616                 }
1617                 else {
1618                         EMIT_MOV(compiler, TMP_REG1, 0, src2, src2w);
1619                         inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
1620                         FAIL_IF(!inst);
1621                         *inst = op_mr;
1622                 }
1623                 return SLJIT_SUCCESS;
1624         }
1625
1626         /* General version. */
1627         if (FAST_IS_REG(dst) && dst != src2) {
1628                 EMIT_MOV(compiler, dst, 0, src1, src1w);
1629                 if (src2 & SLJIT_IMM) {
1630                         BINARY_IMM(op_imm, op_mr, src2w, dst, 0);
1631                 }
1632                 else {
1633                         inst = emit_x86_instruction(compiler, 1, dst, 0, src2, src2w);
1634                         FAIL_IF(!inst);
1635                         *inst = op_rm;
1636                 }
1637         }
1638         else {
1639                 /* This version requires fewer memory writes. */
1640                 EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1641                 if (src2 & SLJIT_IMM) {
1642                         BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
1643                 }
1644                 else {
1645                         inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1646                         FAIL_IF(!inst);
1647                         *inst = op_rm;
1648                 }
1649                 EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1650         }
1651
1652         return SLJIT_SUCCESS;
1653 }
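
/* Note (summary): this mirrors emit_cum_binary, but the operands of
   SUB/SBB cannot be swapped, so the "dst == src2" fast path is absent
   and the general version must not clobber src2 before reading it:

       SUB R0, R1, R0   ->   mov TMP_REG1, R1
                             sub TMP_REG1, R0
                             mov R0, TMP_REG1
*/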
1654
1655 static sljit_s32 emit_mul(struct sljit_compiler *compiler,
1656         sljit_s32 dst, sljit_sw dstw,
1657         sljit_s32 src1, sljit_sw src1w,
1658         sljit_s32 src2, sljit_sw src2w)
1659 {
1660         sljit_u8* inst;
1661         sljit_s32 dst_r;
1662
1663         dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
1664
1665         /* Register destination. */
1666         if (dst_r == src1 && !(src2 & SLJIT_IMM)) {
1667                 inst = emit_x86_instruction(compiler, 2, dst_r, 0, src2, src2w);
1668                 FAIL_IF(!inst);
1669                 *inst++ = GROUP_0F;
1670                 *inst = IMUL_r_rm;
1671         }
1672         else if (dst_r == src2 && !(src1 & SLJIT_IMM)) {
1673                 inst = emit_x86_instruction(compiler, 2, dst_r, 0, src1, src1w);
1674                 FAIL_IF(!inst);
1675                 *inst++ = GROUP_0F;
1676                 *inst = IMUL_r_rm;
1677         }
1678         else if (src1 & SLJIT_IMM) {
1679                 if (src2 & SLJIT_IMM) {
1680                         EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, src2w);
1681                         src2 = dst_r;
1682                         src2w = 0;
1683                 }
1684
1685                 if (src1w <= 127 && src1w >= -128) {
1686                         inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
1687                         FAIL_IF(!inst);
1688                         *inst = IMUL_r_rm_i8;
1689                         inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
1690                         FAIL_IF(!inst);
1691                         INC_SIZE(1);
1692                         *inst = (sljit_s8)src1w;
1693                 }
1694 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1695                 else {
1696                         inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
1697                         FAIL_IF(!inst);
1698                         *inst = IMUL_r_rm_i32;
1699                         inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
1700                         FAIL_IF(!inst);
1701                         INC_SIZE(4);
1702                         *(sljit_sw*)inst = src1w;
1703                 }
1704 #else
1705                 else if (IS_HALFWORD(src1w)) {
1706                         inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
1707                         FAIL_IF(!inst);
1708                         *inst = IMUL_r_rm_i32;
1709                         inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
1710                         FAIL_IF(!inst);
1711                         INC_SIZE(4);
1712                         *(sljit_s32*)inst = (sljit_s32)src1w;
1713                 }
1714                 else {
1715                         EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, src1w);
1716                         if (dst_r != src2)
1717                                 EMIT_MOV(compiler, dst_r, 0, src2, src2w);
1718                         inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
1719                         FAIL_IF(!inst);
1720                         *inst++ = GROUP_0F;
1721                         *inst = IMUL_r_rm;
1722                 }
1723 #endif
1724         }
1725         else if (src2 & SLJIT_IMM) {
1726                 /* Note: src1 is NOT immediate. */
1727
1728                 if (src2w <= 127 && src2w >= -128) {
1729                         inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
1730                         FAIL_IF(!inst);
1731                         *inst = IMUL_r_rm_i8;
1732                         inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
1733                         FAIL_IF(!inst);
1734                         INC_SIZE(1);
1735                         *inst = (sljit_s8)src2w;
1736                 }
1737 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1738                 else {
1739                         inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
1740                         FAIL_IF(!inst);
1741                         *inst = IMUL_r_rm_i32;
1742                         inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
1743                         FAIL_IF(!inst);
1744                         INC_SIZE(4);
1745                         *(sljit_sw*)inst = src2w;
1746                 }
1747 #else
1748                 else if (IS_HALFWORD(src2w)) {
1749                         inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
1750                         FAIL_IF(!inst);
1751                         *inst = IMUL_r_rm_i32;
1752                         inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
1753                         FAIL_IF(!inst);
1754                         INC_SIZE(4);
1755                         *(sljit_s32*)inst = (sljit_s32)src2w;
1756                 }
1757                 else {
1758                         EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, src2w);
1759                         if (dst_r != src1)
1760                                 EMIT_MOV(compiler, dst_r, 0, src1, src1w);
1761                         inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
1762                         FAIL_IF(!inst);
1763                         *inst++ = GROUP_0F;
1764                         *inst = IMUL_r_rm;
1765                 }
1766 #endif
1767         }
1768         else {
1769                 /* Neither argument is immediate. */
1770                 if (ADDRESSING_DEPENDS_ON(src2, dst_r))
1771                         dst_r = TMP_REG1;
1772                 EMIT_MOV(compiler, dst_r, 0, src1, src1w);
1773                 inst = emit_x86_instruction(compiler, 2, dst_r, 0, src2, src2w);
1774                 FAIL_IF(!inst);
1775                 *inst++ = GROUP_0F;
1776                 *inst = IMUL_r_rm;
1777         }
1778
1779         if (dst_r == TMP_REG1)
1780                 EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1781
1782         return SLJIT_SUCCESS;
1783 }
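
/* Encoding note: the three IMUL forms used above are

       imul reg, r/m            0F AF /r    (IMUL_r_rm)
       imul reg, r/m, imm8      6B /r ib    (IMUL_r_rm_i8)
       imul reg, r/m, imm32     69 /r id    (IMUL_r_rm_i32)

   so a multiplier in [-128, 127] saves three bytes per instruction. */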
1784
1785 static sljit_s32 emit_lea_binary(struct sljit_compiler *compiler, sljit_s32 keep_flags,
1786         sljit_s32 dst, sljit_sw dstw,
1787         sljit_s32 src1, sljit_sw src1w,
1788         sljit_s32 src2, sljit_sw src2w)
1789 {
1790         sljit_u8* inst;
1791         sljit_s32 dst_r, done = 0;
1792
1793         /* These cases are better left to the normal code path. */
1794         if (!keep_flags) {
1795                 if (dst == src1 && dstw == src1w)
1796                         return SLJIT_ERR_UNSUPPORTED;
1797                 if (dst == src2 && dstw == src2w)
1798                         return SLJIT_ERR_UNSUPPORTED;
1799         }
1800
1801         dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
1802
1803         if (FAST_IS_REG(src1)) {
1804                 if (FAST_IS_REG(src2)) {
1805                         inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM2(src1, src2), 0);
1806                         FAIL_IF(!inst);
1807                         *inst = LEA_r_m;
1808                         done = 1;
1809                 }
1810 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1811                 if ((src2 & SLJIT_IMM) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1812                         inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src1), (sljit_s32)src2w);
1813 #else
1814                 if (src2 & SLJIT_IMM) {
1815                         inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src1), src2w);
1816 #endif
1817                         FAIL_IF(!inst);
1818                         *inst = LEA_r_m;
1819                         done = 1;
1820                 }
1821         }
1822         else if (FAST_IS_REG(src2)) {
1823 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1824                 if ((src1 & SLJIT_IMM) && (compiler->mode32 || IS_HALFWORD(src1w))) {
1825                         inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src2), (sljit_s32)src1w);
1826 #else
1827                 if (src1 & SLJIT_IMM) {
1828                         inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src2), src1w);
1829 #endif
1830                         FAIL_IF(!inst);
1831                         *inst = LEA_r_m;
1832                         done = 1;
1833                 }
1834         }
1835
1836         if (done) {
1837                 if (dst_r == TMP_REG1)
1838                         return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
1839                 return SLJIT_SUCCESS;
1840         }
1841         return SLJIT_ERR_UNSUPPORTED;
1842 }
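
/* Example (illustrative): a flag-less add of two registers becomes a
   single LEA, which leaves EFLAGS untouched:

       ADD R0, R1, R2 (no flags)   ->   lea R0, [R1 + R2]

   sljit_emit_op2 below tries this first and falls back to the normal
   arithmetic encodings when SLJIT_ERR_UNSUPPORTED is returned. */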
1843
1844 static sljit_s32 emit_cmp_binary(struct sljit_compiler *compiler,
1845         sljit_s32 src1, sljit_sw src1w,
1846         sljit_s32 src2, sljit_sw src2w)
1847 {
1848         sljit_u8* inst;
1849
1850 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1851         if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1852 #else
1853         if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128)) {
1854 #endif
1855                 BINARY_EAX_IMM(CMP_EAX_i32, src2w);
1856                 return SLJIT_SUCCESS;
1857         }
1858
1859         if (FAST_IS_REG(src1)) {
1860                 if (src2 & SLJIT_IMM) {
1861                         BINARY_IMM(CMP, CMP_rm_r, src2w, src1, 0);
1862                 }
1863                 else {
1864                         inst = emit_x86_instruction(compiler, 1, src1, 0, src2, src2w);
1865                         FAIL_IF(!inst);
1866                         *inst = CMP_r_rm;
1867                 }
1868                 return SLJIT_SUCCESS;
1869         }
1870
1871         if (FAST_IS_REG(src2) && !(src1 & SLJIT_IMM)) {
1872                 inst = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w);
1873                 FAIL_IF(!inst);
1874                 *inst = CMP_rm_r;
1875                 return SLJIT_SUCCESS;
1876         }
1877
1878         if (src2 & SLJIT_IMM) {
1879                 if (src1 & SLJIT_IMM) {
1880                         EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1881                         src1 = TMP_REG1;
1882                         src1w = 0;
1883                 }
1884                 BINARY_IMM(CMP, CMP_rm_r, src2w, src1, src1w);
1885         }
1886         else {
1887                 EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1888                 inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1889                 FAIL_IF(!inst);
1890                 *inst = CMP_r_rm;
1891         }
1892         return SLJIT_SUCCESS;
1893 }
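
/* Note (sketch): SUB with an unused destination is lowered to this
   CMP helper by sljit_emit_op2; the short

       cmp eax, imm32               (3D id, CMP_EAX_i32)

   form is chosen when the left operand is already in EAX/RAX and the
   immediate does not fit a sign-extended byte. */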
1894
1895 static sljit_s32 emit_test_binary(struct sljit_compiler *compiler,
1896         sljit_s32 src1, sljit_sw src1w,
1897         sljit_s32 src2, sljit_sw src2w)
1898 {
1899         sljit_u8* inst;
1900
1901 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1902         if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1903 #else
1904         if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128)) {
1905 #endif
1906                 BINARY_EAX_IMM(TEST_EAX_i32, src2w);
1907                 return SLJIT_SUCCESS;
1908         }
1909
1910 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1911         if (src2 == SLJIT_R0 && (src1 & SLJIT_IMM) && (src1w > 127 || src1w < -128) && (compiler->mode32 || IS_HALFWORD(src1w))) {
1912 #else
1913         if (src2 == SLJIT_R0 && (src1 & SLJIT_IMM) && (src1w > 127 || src1w < -128)) {
1914 #endif
1915                 BINARY_EAX_IMM(TEST_EAX_i32, src1w);
1916                 return SLJIT_SUCCESS;
1917         }
1918
1919         if (!(src1 & SLJIT_IMM)) {
1920                 if (src2 & SLJIT_IMM) {
1921 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1922                         if (IS_HALFWORD(src2w) || compiler->mode32) {
1923                                 inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, src1w);
1924                                 FAIL_IF(!inst);
1925                                 *inst = GROUP_F7;
1926                         }
1927                         else {
1928                                 FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
1929                                 inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, src1, src1w);
1930                                 FAIL_IF(!inst);
1931                                 *inst = TEST_rm_r;
1932                         }
1933 #else
1934                         inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, src1w);
1935                         FAIL_IF(!inst);
1936                         *inst = GROUP_F7;
1937 #endif
1938                         return SLJIT_SUCCESS;
1939                 }
1940                 else if (FAST_IS_REG(src1)) {
1941                         inst = emit_x86_instruction(compiler, 1, src1, 0, src2, src2w);
1942                         FAIL_IF(!inst);
1943                         *inst = TEST_rm_r;
1944                         return SLJIT_SUCCESS;
1945                 }
1946         }
1947
1948         if (!(src2 & SLJIT_IMM)) {
1949                 if (src1 & SLJIT_IMM) {
1950 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1951                         if (IS_HALFWORD(src1w) || compiler->mode32) {
1952                                 inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src1w, src2, src2w);
1953                                 FAIL_IF(!inst);
1954                                 *inst = GROUP_F7;
1955                         }
1956                         else {
1957                                 FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src1w));
1958                                 inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, src2, src2w);
1959                                 FAIL_IF(!inst);
1960                                 *inst = TEST_rm_r;
1961                         }
1962 #else
1963                         inst = emit_x86_instruction(compiler, 1, src1, src1w, src2, src2w);
1964                         FAIL_IF(!inst);
1965                         *inst = GROUP_F7;
1966 #endif
1967                         return SLJIT_SUCCESS;
1968                 }
1969                 else if (FAST_IS_REG(src2)) {
1970                         inst = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w);
1971                         FAIL_IF(!inst);
1972                         *inst = TEST_rm_r;
1973                         return SLJIT_SUCCESS;
1974                 }
1975         }
1976
1977         EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1978         if (src2 & SLJIT_IMM) {
1979 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1980                 if (IS_HALFWORD(src2w) || compiler->mode32) {
1981                         inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REG1, 0);
1982                         FAIL_IF(!inst);
1983                         *inst = GROUP_F7;
1984                 }
1985                 else {
1986                         FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
1987                         inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, TMP_REG1, 0);
1988                         FAIL_IF(!inst);
1989                         *inst = TEST_rm_r;
1990                 }
1991 #else
1992                 inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REG1, 0);
1993                 FAIL_IF(!inst);
1994                 *inst = GROUP_F7;
1995 #endif
1996         }
1997         else {
1998                 inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1999                 FAIL_IF(!inst);
2000                 *inst = TEST_rm_r;
2001         }
2002         return SLJIT_SUCCESS;
2003 }
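
/* Note (sketch): AND with an unused destination is lowered to TEST,
   which sets the same flags without writing a result; the short

       test eax, imm32              (A9 id, TEST_EAX_i32)

   form is used when either operand is already in EAX/RAX. */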
2004
2005 static sljit_s32 emit_shift(struct sljit_compiler *compiler,
2006         sljit_u8 mode,
2007         sljit_s32 dst, sljit_sw dstw,
2008         sljit_s32 src1, sljit_sw src1w,
2009         sljit_s32 src2, sljit_sw src2w)
2010 {
2011         sljit_u8* inst;
2012
2013         if ((src2 & SLJIT_IMM) || (src2 == SLJIT_PREF_SHIFT_REG)) {
2014                 if (dst == src1 && dstw == src1w) {
2015                         inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, dst, dstw);
2016                         FAIL_IF(!inst);
2017                         *inst |= mode;
2018                         return SLJIT_SUCCESS;
2019                 }
2020                 if (dst == SLJIT_UNUSED) {
2021                         EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2022                         inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, TMP_REG1, 0);
2023                         FAIL_IF(!inst);
2024                         *inst |= mode;
2025                         return SLJIT_SUCCESS;
2026                 }
2027                 if (dst == SLJIT_PREF_SHIFT_REG && src2 == SLJIT_PREF_SHIFT_REG) {
2028                         EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2029                         inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2030                         FAIL_IF(!inst);
2031                         *inst |= mode;
2032                         EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2033                         return SLJIT_SUCCESS;
2034                 }
2035                 if (FAST_IS_REG(dst)) {
2036                         EMIT_MOV(compiler, dst, 0, src1, src1w);
2037                         inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, dst, 0);
2038                         FAIL_IF(!inst);
2039                         *inst |= mode;
2040                         return SLJIT_SUCCESS;
2041                 }
2042
2043                 EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2044                 inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, TMP_REG1, 0);
2045                 FAIL_IF(!inst);
2046                 *inst |= mode;
2047                 EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
2048                 return SLJIT_SUCCESS;
2049         }
2050
2051         if (dst == SLJIT_PREF_SHIFT_REG) {
2052                 EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2053                 EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
2054                 inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2055                 FAIL_IF(!inst);
2056                 *inst |= mode;
2057                 EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2058         }
2059         else if (FAST_IS_REG(dst) && dst != src2 && !ADDRESSING_DEPENDS_ON(src2, dst)) {
2060                 if (src1 != dst)
2061                         EMIT_MOV(compiler, dst, 0, src1, src1w);
2062                 EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_PREF_SHIFT_REG, 0);
2063                 EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
2064                 inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, dst, 0);
2065                 FAIL_IF(!inst);
2066                 *inst |= mode;
2067                 EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2068         }
2069         else {
2070                 /* This case is really difficult, since ecx itself may be used
2071                    for addressing, and the code must work even in that case. */
2072                 EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2073 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2074                 EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_PREF_SHIFT_REG, 0);
2075 #else
2076                 /* [esp+0] contains the flags. */
2077                 EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_sw), SLJIT_PREF_SHIFT_REG, 0);
2078 #endif
2079                 EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
2080                 inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2081                 FAIL_IF(!inst);
2082                 *inst |= mode;
2083 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2084                 EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG2, 0);
2085 #else
2086                 EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_sw));
2087 #endif
2088                 EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
2089         }
2090
2091         return SLJIT_SUCCESS;
2092 }
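
/* Background (summary): x86 variable shifts take their count only in
   cl, so when src2 is not already SLJIT_PREF_SHIFT_REG (ecx) the last
   branch above saves ecx, installs the count, shifts and restores:

       mov TMP_REG1, src1
       save ecx                     (TMP_REG2 or [esp + sizeof(sljit_sw)])
       mov ecx, src2
       shl TMP_REG1, cl
       restore ecx
       mov dst, TMP_REG1
*/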
2093
2094 static sljit_s32 emit_shift_with_flags(struct sljit_compiler *compiler,
2095         sljit_u8 mode, sljit_s32 set_flags,
2096         sljit_s32 dst, sljit_sw dstw,
2097         sljit_s32 src1, sljit_sw src1w,
2098         sljit_s32 src2, sljit_sw src2w)
2099 {
2100         /* The CPU does not set flags if the shift count is 0. */
2101         if (src2 & SLJIT_IMM) {
2102 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2103                 if ((src2w & 0x3f) != 0 || (compiler->mode32 && (src2w & 0x1f) != 0))
2104                         return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
2105 #else
2106                 if ((src2w & 0x1f) != 0)
2107                         return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
2108 #endif
2109                 if (!set_flags)
2110                         return emit_mov(compiler, dst, dstw, src1, src1w);
2111                 /* OR dst, src, 0 */
2112                 return emit_cum_binary(compiler, OR_r_rm, OR_rm_r, OR, OR_EAX_i32,
2113                         dst, dstw, src1, src1w, SLJIT_IMM, 0);
2114         }
2115
2116         if (!set_flags)
2117                 return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
2118
2119         if (!FAST_IS_REG(dst))
2120                 FAIL_IF(emit_cmp_binary(compiler, src1, src1w, SLJIT_IMM, 0));
2121
2122         FAIL_IF(emit_shift(compiler,mode, dst, dstw, src1, src1w, src2, src2w));
2123
2124         if (FAST_IS_REG(dst))
2125                 return emit_cmp_binary(compiler, dst, dstw, SLJIT_IMM, 0);
2126         return SLJIT_SUCCESS;
2127 }
2128
2129 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compiler, sljit_s32 op,
2130         sljit_s32 dst, sljit_sw dstw,
2131         sljit_s32 src1, sljit_sw src1w,
2132         sljit_s32 src2, sljit_sw src2w)
2133 {
2134         CHECK_ERROR();
2135         CHECK(check_sljit_emit_op2(compiler, op, dst, dstw, src1, src1w, src2, src2w));
2136         ADJUST_LOCAL_OFFSET(dst, dstw);
2137         ADJUST_LOCAL_OFFSET(src1, src1w);
2138         ADJUST_LOCAL_OFFSET(src2, src2w);
2139
2140         CHECK_EXTRA_REGS(dst, dstw, (void)0);
2141         CHECK_EXTRA_REGS(src1, src1w, (void)0);
2142         CHECK_EXTRA_REGS(src2, src2w, (void)0);
2143 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2144         compiler->mode32 = op & SLJIT_I32_OP;
2145 #endif
2146
2147         if (GET_OPCODE(op) >= SLJIT_MUL) {
2148                 if (SLJIT_UNLIKELY(GET_FLAGS(op)))
2149                         compiler->flags_saved = 0;
2150                 else if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
2151                         FAIL_IF(emit_save_flags(compiler));
2152         }
2153
2154         switch (GET_OPCODE(op)) {
2155         case SLJIT_ADD:
2156                 if (!GET_FLAGS(op)) {
2157                         if (emit_lea_binary(compiler, op & SLJIT_KEEP_FLAGS, dst, dstw, src1, src1w, src2, src2w) != SLJIT_ERR_UNSUPPORTED)
2158                                 return compiler->error;
2159                 }
2160                 else
2161                         compiler->flags_saved = 0;
2162                 if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
2163                         FAIL_IF(emit_save_flags(compiler));
2164                 return emit_cum_binary(compiler, ADD_r_rm, ADD_rm_r, ADD, ADD_EAX_i32,
2165                         dst, dstw, src1, src1w, src2, src2w);
2166         case SLJIT_ADDC:
2167                 if (SLJIT_UNLIKELY(compiler->flags_saved)) /* C flag must be restored. */
2168                         FAIL_IF(emit_restore_flags(compiler, 1));
2169                 else if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS))
2170                         FAIL_IF(emit_save_flags(compiler));
2171                 if (SLJIT_UNLIKELY(GET_FLAGS(op)))
2172                         compiler->flags_saved = 0;
2173                 return emit_cum_binary(compiler, ADC_r_rm, ADC_rm_r, ADC, ADC_EAX_i32,
2174                         dst, dstw, src1, src1w, src2, src2w);
2175         case SLJIT_SUB:
2176                 if (!GET_FLAGS(op)) {
2177                         if ((src2 & SLJIT_IMM) && emit_lea_binary(compiler, op & SLJIT_KEEP_FLAGS, dst, dstw, src1, src1w, SLJIT_IMM, -src2w) != SLJIT_ERR_UNSUPPORTED)
2178                                 return compiler->error;
2179                 }
2180                 else
2181                         compiler->flags_saved = 0;
2182                 if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
2183                         FAIL_IF(emit_save_flags(compiler));
2184                 if (dst == SLJIT_UNUSED)
2185                         return emit_cmp_binary(compiler, src1, src1w, src2, src2w);
2186                 return emit_non_cum_binary(compiler, SUB_r_rm, SUB_rm_r, SUB, SUB_EAX_i32,
2187                         dst, dstw, src1, src1w, src2, src2w);
2188         case SLJIT_SUBC:
2189                 if (SLJIT_UNLIKELY(compiler->flags_saved)) /* C flag must be restored. */
2190                         FAIL_IF(emit_restore_flags(compiler, 1));
2191                 else if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS))
2192                         FAIL_IF(emit_save_flags(compiler));
2193                 if (SLJIT_UNLIKELY(GET_FLAGS(op)))
2194                         compiler->flags_saved = 0;
2195                 return emit_non_cum_binary(compiler, SBB_r_rm, SBB_rm_r, SBB, SBB_EAX_i32,
2196                         dst, dstw, src1, src1w, src2, src2w);
2197         case SLJIT_MUL:
2198                 return emit_mul(compiler, dst, dstw, src1, src1w, src2, src2w);
2199         case SLJIT_AND:
2200                 if (dst == SLJIT_UNUSED)
2201                         return emit_test_binary(compiler, src1, src1w, src2, src2w);
2202                 return emit_cum_binary(compiler, AND_r_rm, AND_rm_r, AND, AND_EAX_i32,
2203                         dst, dstw, src1, src1w, src2, src2w);
2204         case SLJIT_OR:
2205                 return emit_cum_binary(compiler, OR_r_rm, OR_rm_r, OR, OR_EAX_i32,
2206                         dst, dstw, src1, src1w, src2, src2w);
2207         case SLJIT_XOR:
2208                 return emit_cum_binary(compiler, XOR_r_rm, XOR_rm_r, XOR, XOR_EAX_i32,
2209                         dst, dstw, src1, src1w, src2, src2w);
2210         case SLJIT_SHL:
2211                 return emit_shift_with_flags(compiler, SHL, GET_FLAGS(op),
2212                         dst, dstw, src1, src1w, src2, src2w);
2213         case SLJIT_LSHR:
2214                 return emit_shift_with_flags(compiler, SHR, GET_FLAGS(op),
2215                         dst, dstw, src1, src1w, src2, src2w);
2216         case SLJIT_ASHR:
2217                 return emit_shift_with_flags(compiler, SAR, GET_FLAGS(op),
2218                         dst, dstw, src1, src1w, src2, src2w);
2219         }
2220
2221         return SLJIT_SUCCESS;
2222 }
2223
2224 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_register_index(sljit_s32 reg)
2225 {
2226         CHECK_REG_INDEX(check_sljit_get_register_index(reg));
2227 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2228         if (reg >= SLJIT_R3 && reg <= SLJIT_R6)
2229                 return -1;
2230 #endif
2231         return reg_map[reg];
2232 }
2233
2234 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_float_register_index(sljit_s32 reg)
2235 {
2236         CHECK_REG_INDEX(check_sljit_get_float_register_index(reg));
2237         return reg;
2238 }
2239
2240 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_custom(struct sljit_compiler *compiler,
2241         void *instruction, sljit_s32 size)
2242 {
2243         sljit_u8 *inst;
2244
2245         CHECK_ERROR();
2246         CHECK(check_sljit_emit_op_custom(compiler, instruction, size));
2247
2248         inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
2249         FAIL_IF(!inst);
2250         INC_SIZE(size);
2251         SLJIT_MEMMOVE(inst, instruction, size);
2252         return SLJIT_SUCCESS;
2253 }
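
/* Usage sketch (hypothetical caller, not part of this file): the
   bytes are copied verbatim into the instruction stream, so a single
   NOP could be injected as

       sljit_u8 nop = 0x90;
       sljit_emit_op_custom(compiler, &nop, 1);

   The caller is responsible for the bytes forming valid code. */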
2254
2255 /* --------------------------------------------------------------------- */
2256 /*  Floating point operators                                             */
2257 /* --------------------------------------------------------------------- */
2258
2259 /* Alignment + 4 * 16 bytes of constants (sign and absolute value masks). */
2260 static sljit_s32 sse2_data[3 + (4 + 4) * 2];
2261 static sljit_s32 *sse2_buffer;
2262
2263 static void init_compiler(void)
2264 {
2265         sse2_buffer = (sljit_s32*)(((sljit_uw)sse2_data + 15) & ~0xf);
2266         /* Single precision constants. */
2267         sse2_buffer[0] = 0x80000000;
2268         sse2_buffer[4] = 0x7fffffff;
2269         /* Double precision constants. */
2270         sse2_buffer[8] = 0;
2271         sse2_buffer[9] = 0x80000000;
2272         sse2_buffer[12] = 0xffffffff;
2273         sse2_buffer[13] = 0x7fffffff;
2274 }
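
/* Rationale (summary): these are IEEE-754 sign / absolute-value
   masks. XORPD with the 0x8000... pattern flips the sign bit
   (SLJIT_NEG_F64) and ANDPD with 0x7fff... clears it (SLJIT_ABS_F64);
   see sljit_emit_fop1 below. Each mask occupies its own 16 byte slot
   so it can be used directly as an aligned SSE2 memory operand. */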
2275
2276 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_is_fpu_available(void)
2277 {
2278 #ifdef SLJIT_IS_FPU_AVAILABLE
2279         return SLJIT_IS_FPU_AVAILABLE;
2280 #elif (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
2281         if (cpu_has_sse2 == -1)
2282                 get_cpu_features();
2283         return cpu_has_sse2;
2284 #else /* SLJIT_DETECT_SSE2 */
2285         return 1;
2286 #endif /* SLJIT_DETECT_SSE2 */
2287 }
2288
2289 static sljit_s32 emit_sse2(struct sljit_compiler *compiler, sljit_u8 opcode,
2290         sljit_s32 single, sljit_s32 xmm1, sljit_s32 xmm2, sljit_sw xmm2w)
2291 {
2292         sljit_u8 *inst;
2293
2294         inst = emit_x86_instruction(compiler, 2 | (single ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, xmm1, 0, xmm2, xmm2w);
2295         FAIL_IF(!inst);
2296         *inst++ = GROUP_0F;
2297         *inst = opcode;
2298         return SLJIT_SUCCESS;
2299 }
2300
2301 static sljit_s32 emit_sse2_logic(struct sljit_compiler *compiler, sljit_u8 opcode,
2302         sljit_s32 pref66, sljit_s32 xmm1, sljit_s32 xmm2, sljit_sw xmm2w)
2303 {
2304         sljit_u8 *inst;
2305
2306         inst = emit_x86_instruction(compiler, 2 | (pref66 ? EX86_PREF_66 : 0) | EX86_SSE2, xmm1, 0, xmm2, xmm2w);
2307         FAIL_IF(!inst);
2308         *inst++ = GROUP_0F;
2309         *inst = opcode;
2310         return SLJIT_SUCCESS;
2311 }
2312
2313 static SLJIT_INLINE sljit_s32 emit_sse2_load(struct sljit_compiler *compiler,
2314         sljit_s32 single, sljit_s32 dst, sljit_s32 src, sljit_sw srcw)
2315 {
2316         return emit_sse2(compiler, MOVSD_x_xm, single, dst, src, srcw);
2317 }
2318
2319 static SLJIT_INLINE sljit_s32 emit_sse2_store(struct sljit_compiler *compiler,
2320         sljit_s32 single, sljit_s32 dst, sljit_sw dstw, sljit_s32 src)
2321 {
2322         return emit_sse2(compiler, MOVSD_xm_x, single, src, dst, dstw);
2323 }
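
/* Encoding note (sketch): the scalar SSE2 opcodes are shared between
   the two precisions and only the mandatory prefix differs, e.g.

       F2 0F 10 /r    movsd xmm, xmm/m64    (single = 0)
       F3 0F 10 /r    movss xmm, xmm/m32    (single = 1)

   while the logic/compare ops (ANDPD, XORPD, UCOMISD) take the 66
   prefix via emit_sse2_logic. */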
2324
2325 static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_sw_from_f64(struct sljit_compiler *compiler, sljit_s32 op,
2326         sljit_s32 dst, sljit_sw dstw,
2327         sljit_s32 src, sljit_sw srcw)
2328 {
2329         sljit_s32 dst_r = SLOW_IS_REG(dst) ? dst : TMP_REG1;
2330         sljit_u8 *inst;
2331
2332 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2333         if (GET_OPCODE(op) == SLJIT_CONV_SW_FROM_F64)
2334                 compiler->mode32 = 0;
2335 #endif
2336
2337         inst = emit_x86_instruction(compiler, 2 | ((op & SLJIT_F32_OP) ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2_OP2, dst_r, 0, src, srcw);
2338         FAIL_IF(!inst);
2339         *inst++ = GROUP_0F;
2340         *inst = CVTTSD2SI_r_xm;
2341
2342         if (dst_r == TMP_REG1 && dst != SLJIT_UNUSED)
2343                 return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
2344         return SLJIT_SUCCESS;
2345 }
2346
2347 static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_f64_from_sw(struct sljit_compiler *compiler, sljit_s32 op,
2348         sljit_s32 dst, sljit_sw dstw,
2349         sljit_s32 src, sljit_sw srcw)
2350 {
2351         sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG;
2352         sljit_u8 *inst;
2353
2354 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2355         if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_SW)
2356                 compiler->mode32 = 0;
2357 #endif
2358
2359         if (src & SLJIT_IMM) {
2360 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2361                 if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_S32)
2362                         srcw = (sljit_s32)srcw;
2363 #endif
2364                 EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
2365                 src = TMP_REG1;
2366                 srcw = 0;
2367         }
2368
2369         inst = emit_x86_instruction(compiler, 2 | ((op & SLJIT_F32_OP) ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2_OP1, dst_r, 0, src, srcw);
2370         FAIL_IF(!inst);
2371         *inst++ = GROUP_0F;
2372         *inst = CVTSI2SD_x_rm;
2373
2374 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2375         compiler->mode32 = 1;
2376 #endif
2377         if (dst_r == TMP_FREG)
2378                 return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG);
2379         return SLJIT_SUCCESS;
2380 }
2381
2382 static SLJIT_INLINE sljit_s32 sljit_emit_fop1_cmp(struct sljit_compiler *compiler, sljit_s32 op,
2383         sljit_s32 src1, sljit_sw src1w,
2384         sljit_s32 src2, sljit_sw src2w)
2385 {
2386         compiler->flags_saved = 0;
2387         if (!FAST_IS_REG(src1)) {
2388                 FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, TMP_FREG, src1, src1w));
2389                 src1 = TMP_FREG;
2390         }
2391         return emit_sse2_logic(compiler, UCOMISD_x_xm, !(op & SLJIT_F32_OP), src1, src2, src2w);
2392 }
2393
2394 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop1(struct sljit_compiler *compiler, sljit_s32 op,
2395         sljit_s32 dst, sljit_sw dstw,
2396         sljit_s32 src, sljit_sw srcw)
2397 {
2398         sljit_s32 dst_r;
2399
2400 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2401         compiler->mode32 = 1;
2402 #endif
2403
2404         CHECK_ERROR();
2405         SELECT_FOP1_OPERATION_WITH_CHECKS(compiler, op, dst, dstw, src, srcw);
2406
2407         if (GET_OPCODE(op) == SLJIT_MOV_F64) {
2408                 if (FAST_IS_REG(dst))
2409                         return emit_sse2_load(compiler, op & SLJIT_F32_OP, dst, src, srcw);
2410                 if (FAST_IS_REG(src))
2411                         return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, src);
2412                 FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, TMP_FREG, src, srcw));
2413                 return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG);
2414         }
2415
2416         if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_F32) {
2417                 dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG;
2418                 if (FAST_IS_REG(src)) {
2419                         /* We overwrite the high bits of the source register. From the
2420                            SLJIT point of view this is not an issue.
2421                            Note: with SSE3, MOVDDUP and MOVSLDUP could also be used. */
2422                         FAIL_IF(emit_sse2_logic(compiler, UNPCKLPD_x_xm, op & SLJIT_F32_OP, src, src, 0));
2423                 }
2424                 else {
2425                         FAIL_IF(emit_sse2_load(compiler, !(op & SLJIT_F32_OP), TMP_FREG, src, srcw));
2426                         src = TMP_FREG;
2427                 }
2428
2429                 FAIL_IF(emit_sse2_logic(compiler, CVTPD2PS_x_xm, op & SLJIT_F32_OP, dst_r, src, 0));
2430                 if (dst_r == TMP_FREG)
2431                         return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG);
2432                 return SLJIT_SUCCESS;
2433         }
2434
2435         if (SLOW_IS_REG(dst)) {
2436                 dst_r = dst;
2437                 if (dst != src)
2438                         FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, dst_r, src, srcw));
2439         }
2440         else {
2441                 dst_r = TMP_FREG;
2442                 FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, dst_r, src, srcw));
2443         }
2444
2445         switch (GET_OPCODE(op)) {
2446         case SLJIT_NEG_F64:
2447                 FAIL_IF(emit_sse2_logic(compiler, XORPD_x_xm, 1, dst_r, SLJIT_MEM0(), (sljit_sw)(op & SLJIT_F32_OP ? sse2_buffer : sse2_buffer + 8)));
2448                 break;
2449
2450         case SLJIT_ABS_F64:
2451                 FAIL_IF(emit_sse2_logic(compiler, ANDPD_x_xm, 1, dst_r, SLJIT_MEM0(), (sljit_sw)(op & SLJIT_F32_OP ? sse2_buffer + 4 : sse2_buffer + 12)));
2452                 break;
2453         }
2454
2455         if (dst_r == TMP_FREG)
2456                 return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG);
2457         return SLJIT_SUCCESS;
2458 }
2459
2460 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2(struct sljit_compiler *compiler, sljit_s32 op,
2461         sljit_s32 dst, sljit_sw dstw,
2462         sljit_s32 src1, sljit_sw src1w,
2463         sljit_s32 src2, sljit_sw src2w)
2464 {
2465         sljit_s32 dst_r;
2466
2467         CHECK_ERROR();
2468         CHECK(check_sljit_emit_fop2(compiler, op, dst, dstw, src1, src1w, src2, src2w));
2469         ADJUST_LOCAL_OFFSET(dst, dstw);
2470         ADJUST_LOCAL_OFFSET(src1, src1w);
2471         ADJUST_LOCAL_OFFSET(src2, src2w);
2472
2473 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2474         compiler->mode32 = 1;
2475 #endif
2476
2477         if (FAST_IS_REG(dst)) {
2478                 dst_r = dst;
2479                 if (dst == src1)
2480                         ; /* Do nothing here. */
2481                 else if (dst == src2 && (op == SLJIT_ADD_F64 || op == SLJIT_MUL_F64)) {
2482                         /* Swap arguments. */
2483                         src2 = src1;
2484                         src2w = src1w;
2485                 }
2486                 else if (dst != src2)
2487                         FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, dst_r, src1, src1w));
2488                 else {
2489                         dst_r = TMP_FREG;
2490                         FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, TMP_FREG, src1, src1w));
2491                 }
2492         }
2493         else {
2494                 dst_r = TMP_FREG;
2495                 FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, TMP_FREG, src1, src1w));
2496         }
2497
2498         switch (GET_OPCODE(op)) {
2499         case SLJIT_ADD_F64:
2500                 FAIL_IF(emit_sse2(compiler, ADDSD_x_xm, op & SLJIT_F32_OP, dst_r, src2, src2w));
2501                 break;
2502
2503         case SLJIT_SUB_F64:
2504                 FAIL_IF(emit_sse2(compiler, SUBSD_x_xm, op & SLJIT_F32_OP, dst_r, src2, src2w));
2505                 break;
2506
2507         case SLJIT_MUL_F64:
2508                 FAIL_IF(emit_sse2(compiler, MULSD_x_xm, op & SLJIT_F32_OP, dst_r, src2, src2w));
2509                 break;
2510
2511         case SLJIT_DIV_F64:
2512                 FAIL_IF(emit_sse2(compiler, DIVSD_x_xm, op & SLJIT_F32_OP, dst_r, src2, src2w));
2513                 break;
2514         }
2515
2516         if (dst_r == TMP_FREG)
2517                 return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG);
2518         return SLJIT_SUCCESS;
2519 }
2520
2521 /* --------------------------------------------------------------------- */
2522 /*  Conditional instructions                                             */
2523 /* --------------------------------------------------------------------- */
2524
2525 SLJIT_API_FUNC_ATTRIBUTE struct sljit_label* sljit_emit_label(struct sljit_compiler *compiler)
2526 {
2527         sljit_u8 *inst;
2528         struct sljit_label *label;
2529
2530         CHECK_ERROR_PTR();
2531         CHECK_PTR(check_sljit_emit_label(compiler));
2532
2533         /* We should restore the flags before the label,
2534            since other jumps taken to this label may have their own flags. */
2535         if (SLJIT_UNLIKELY(compiler->flags_saved))
2536                 PTR_FAIL_IF(emit_restore_flags(compiler, 0));
2537
2538         if (compiler->last_label && compiler->last_label->size == compiler->size)
2539                 return compiler->last_label;
2540
2541         label = (struct sljit_label*)ensure_abuf(compiler, sizeof(struct sljit_label));
2542         PTR_FAIL_IF(!label);
2543         set_label(label, compiler);
2544
2545         inst = (sljit_u8*)ensure_buf(compiler, 2);
2546         PTR_FAIL_IF(!inst);
2547
2548         *inst++ = 0;
2549         *inst++ = 0;
2550
2551         return label;
2552 }
2553
2554 SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_jump(struct sljit_compiler *compiler, sljit_s32 type)
2555 {
2556         sljit_u8 *inst;
2557         struct sljit_jump *jump;
2558
2559         CHECK_ERROR_PTR();
2560         CHECK_PTR(check_sljit_emit_jump(compiler, type));
2561
2562         if (SLJIT_UNLIKELY(compiler->flags_saved)) {
2563                 if ((type & 0xff) <= SLJIT_JUMP)
2564                         PTR_FAIL_IF(emit_restore_flags(compiler, 0));
2565                 compiler->flags_saved = 0;
2566         }
2567
2568         jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
2569         PTR_FAIL_IF_NULL(jump);
2570         set_jump(jump, compiler, type & SLJIT_REWRITABLE_JUMP);
2571         type &= 0xff;
2572
2573         if (type >= SLJIT_CALL1)
2574                 PTR_FAIL_IF(call_with_args(compiler, type));
2575
2576         /* Worst case size. */
2577 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2578         compiler->size += (type >= SLJIT_JUMP) ? 5 : 6;
2579 #else
2580         compiler->size += (type >= SLJIT_JUMP) ? (10 + 3) : (2 + 10 + 3);
2581 #endif
2582
2583         inst = (sljit_u8*)ensure_buf(compiler, 2);
2584         PTR_FAIL_IF_NULL(inst);
2585
2586         *inst++ = 0;
2587         *inst++ = type + 4;
2588         return jump;
2589 }

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_ijump(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 src, sljit_sw srcw)
{
        sljit_u8 *inst;
        struct sljit_jump *jump;

        CHECK_ERROR();
        CHECK(check_sljit_emit_ijump(compiler, type, src, srcw));
        ADJUST_LOCAL_OFFSET(src, srcw);

        CHECK_EXTRA_REGS(src, srcw, (void)0);

        if (SLJIT_UNLIKELY(compiler->flags_saved)) {
                if (type <= SLJIT_JUMP)
                        FAIL_IF(emit_restore_flags(compiler, 0));
                compiler->flags_saved = 0;
        }

        if (type >= SLJIT_CALL1) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
                if (src == SLJIT_R2) {
                        EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
                        src = TMP_REG1;
                }
                if (src == SLJIT_MEM1(SLJIT_SP) && type >= SLJIT_CALL3)
                        srcw += sizeof(sljit_sw);
#endif
#endif
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) && defined(_WIN64)
                if (src == SLJIT_R2) {
                        EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
                        src = TMP_REG1;
                }
#endif
                FAIL_IF(call_with_args(compiler, type));
        }

        if (src == SLJIT_IMM) {
                jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
                FAIL_IF_NULL(jump);
                set_jump(jump, compiler, JUMP_ADDR);
                jump->u.target = srcw;

                /* Worst case size. */
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                compiler->size += 5;
#else
                compiler->size += 10 + 3;
#endif

                inst = (sljit_u8*)ensure_buf(compiler, 2);
                FAIL_IF_NULL(inst);

                *inst++ = 0;
                *inst++ = type + 4;
        }
        else {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
                /* REX_W is not necessary (src is not immediate). */
                compiler->mode32 = 1;
#endif
                inst = emit_x86_instruction(compiler, 1, 0, 0, src, srcw);
                FAIL_IF(!inst);
                *inst++ = GROUP_FF;
                *inst |= (type >= SLJIT_FAST_CALL) ? CALL_rm : JMP_rm;
        }
        return SLJIT_SUCCESS;
}
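
/* Background for the GROUP_FF path above (standard x86 encoding, offered
   as a sketch): opcode FF opens a group in which the /reg field of the
   ModRM byte selects the operation - FF /2 is "call r/m" and FF /4 is
   "jmp r/m" - so CALL_rm or JMP_rm only needs to be OR-ed into the ModRM
   byte produced by emit_x86_instruction, e.g.

       FF D1    call ecx        ; ModRM = MOD_REG | (2 << 3) | 1
       FF E1    jmp  ecx        ; ModRM = MOD_REG | (4 << 3) | 1
*/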

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *compiler, sljit_s32 op,
        sljit_s32 dst, sljit_sw dstw,
        sljit_s32 src, sljit_sw srcw,
        sljit_s32 type)
{
        sljit_u8 *inst;
        sljit_u8 cond_set = 0;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
        sljit_s32 reg;
#else
        /* CHECK_EXTRA_REGS might overwrite these values. */
        sljit_s32 dst_save = dst;
        sljit_sw dstw_save = dstw;
#endif

        CHECK_ERROR();
        CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, src, srcw, type));
        SLJIT_UNUSED_ARG(srcw);

        if (dst == SLJIT_UNUSED)
                return SLJIT_SUCCESS;

        ADJUST_LOCAL_OFFSET(dst, dstw);
        CHECK_EXTRA_REGS(dst, dstw, (void)0);
        if (SLJIT_UNLIKELY(compiler->flags_saved))
                FAIL_IF(emit_restore_flags(compiler, op & SLJIT_KEEP_FLAGS));

        type &= 0xff;
        /* setcc = jcc + 0x10. */
        cond_set = get_jump_code(type) + 0x10;
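        /* The same table of jump codes serves three instruction families
           (standard 0F-prefixed encodings): for a condition code cc,
               jcc rel32    = 0F (0x80 + cc)
               setcc r/m8   = 0F (0x90 + cc)  -> jcc byte + 0x10, as above
               cmovcc r,r/m = 0F (0x40 + cc)  -> setcc byte - 0x50, below */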

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
        if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst) && dst == src) {
                inst = (sljit_u8*)ensure_buf(compiler, 1 + 4 + 3);
                FAIL_IF(!inst);
                INC_SIZE(4 + 3);
                /* Set low register to conditional flag. */
                *inst++ = (reg_map[TMP_REG1] <= 7) ? REX : REX_B;
                *inst++ = GROUP_0F;
                *inst++ = cond_set;
                *inst++ = MOD_REG | reg_lmap[TMP_REG1];
                *inst++ = REX | (reg_map[TMP_REG1] <= 7 ? 0 : REX_R) | (reg_map[dst] <= 7 ? 0 : REX_B);
                *inst++ = OR_rm8_r8;
                *inst++ = MOD_REG | (reg_lmap[TMP_REG1] << 3) | reg_lmap[dst];
                return SLJIT_SUCCESS;
        }

        reg = (op == SLJIT_MOV && FAST_IS_REG(dst)) ? dst : TMP_REG1;

        inst = (sljit_u8*)ensure_buf(compiler, 1 + 4 + 4);
        FAIL_IF(!inst);
        INC_SIZE(4 + 4);
        /* Set low register to conditional flag. */
        *inst++ = (reg_map[reg] <= 7) ? REX : REX_B;
        *inst++ = GROUP_0F;
        *inst++ = cond_set;
        *inst++ = MOD_REG | reg_lmap[reg];
        *inst++ = REX_W | (reg_map[reg] <= 7 ? 0 : (REX_B | REX_R));
        *inst++ = GROUP_0F;
        *inst++ = MOVZX_r_rm8;
        *inst = MOD_REG | (reg_lmap[reg] << 3) | reg_lmap[reg];

        if (reg != TMP_REG1)
                return SLJIT_SUCCESS;

        if (GET_OPCODE(op) < SLJIT_ADD) {
                compiler->mode32 = GET_OPCODE(op) != SLJIT_MOV;
                return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
        }
#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
                || (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
        compiler->skip_checks = 1;
#endif
        return sljit_emit_op2(compiler, op, dst, dstw, dst, dstw, TMP_REG1, 0);
#else /* SLJIT_CONFIG_X86_64 */
        if (GET_OPCODE(op) < SLJIT_ADD && FAST_IS_REG(dst)) {
                if (reg_map[dst] <= 4) {
                        /* Low byte is accessible. */
                        inst = (sljit_u8*)ensure_buf(compiler, 1 + 3 + 3);
                        FAIL_IF(!inst);
                        INC_SIZE(3 + 3);
                        /* Set low byte to conditional flag. */
                        *inst++ = GROUP_0F;
                        *inst++ = cond_set;
                        *inst++ = MOD_REG | reg_map[dst];

                        *inst++ = GROUP_0F;
                        *inst++ = MOVZX_r_rm8;
                        *inst = MOD_REG | (reg_map[dst] << 3) | reg_map[dst];
                        return SLJIT_SUCCESS;
                }

                /* Low byte is not accessible. */
                if (cpu_has_cmov == -1)
                        get_cpu_features();

                if (cpu_has_cmov) {
                        EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, 1);
                        /* An xor reg, reg operation would overwrite the flags. */
                        EMIT_MOV(compiler, dst, 0, SLJIT_IMM, 0);

                        inst = (sljit_u8*)ensure_buf(compiler, 1 + 3);
                        FAIL_IF(!inst);
                        INC_SIZE(3);

                        *inst++ = GROUP_0F;
                        /* cmovcc = setcc - 0x50. */
                        *inst++ = cond_set - 0x50;
                        *inst++ = MOD_REG | (reg_map[dst] << 3) | reg_map[TMP_REG1];
                        return SLJIT_SUCCESS;
                }

                inst = (sljit_u8*)ensure_buf(compiler, 1 + 1 + 3 + 3 + 1);
                FAIL_IF(!inst);
                INC_SIZE(1 + 3 + 3 + 1);
                *inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
                /* Set al to conditional flag. */
                *inst++ = GROUP_0F;
                *inst++ = cond_set;
                *inst++ = MOD_REG | 0 /* eax */;

                *inst++ = GROUP_0F;
                *inst++ = MOVZX_r_rm8;
                *inst++ = MOD_REG | (reg_map[dst] << 3) | 0 /* eax */;
                *inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
                return SLJIT_SUCCESS;
        }

        if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst) && dst == src && reg_map[dst] <= 4) {
                SLJIT_COMPILE_ASSERT(reg_map[SLJIT_R0] == 0, scratch_reg1_must_be_eax);
                if (dst != SLJIT_R0) {
                        inst = (sljit_u8*)ensure_buf(compiler, 1 + 1 + 3 + 2 + 1);
                        FAIL_IF(!inst);
                        INC_SIZE(1 + 3 + 2 + 1);
                        /* Set low register to conditional flag. */
                        *inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
                        *inst++ = GROUP_0F;
                        *inst++ = cond_set;
                        *inst++ = MOD_REG | 0 /* eax */;
                        *inst++ = OR_rm8_r8;
                        *inst++ = MOD_REG | (0 /* eax */ << 3) | reg_map[dst];
                        *inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
                }
                else {
                        inst = (sljit_u8*)ensure_buf(compiler, 1 + 2 + 3 + 2 + 2);
                        FAIL_IF(!inst);
                        INC_SIZE(2 + 3 + 2 + 2);
                        /* Set low register to conditional flag. */
                        *inst++ = XCHG_r_rm;
                        *inst++ = MOD_REG | (1 /* ecx */ << 3) | reg_map[TMP_REG1];
                        *inst++ = GROUP_0F;
                        *inst++ = cond_set;
                        *inst++ = MOD_REG | 1 /* ecx */;
                        *inst++ = OR_rm8_r8;
                        *inst++ = MOD_REG | (1 /* ecx */ << 3) | 0 /* eax */;
                        *inst++ = XCHG_r_rm;
                        *inst++ = MOD_REG | (1 /* ecx */ << 3) | reg_map[TMP_REG1];
                }
                return SLJIT_SUCCESS;
        }

        /* Set TMP_REG1 to the conditional flag bit. */
        inst = (sljit_u8*)ensure_buf(compiler, 1 + 1 + 3 + 3 + 1);
        FAIL_IF(!inst);
        INC_SIZE(1 + 3 + 3 + 1);
        *inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
        /* Set al to conditional flag. */
        *inst++ = GROUP_0F;
        *inst++ = cond_set;
        *inst++ = MOD_REG | 0 /* eax */;

        *inst++ = GROUP_0F;
        *inst++ = MOVZX_r_rm8;
        *inst++ = MOD_REG | (0 << 3) /* eax */ | 0 /* eax */;

        *inst++ = XCHG_EAX_r + reg_map[TMP_REG1];

        if (GET_OPCODE(op) < SLJIT_ADD)
                return emit_mov(compiler, dst, dstw, TMP_REG1, 0);

#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
                || (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
        compiler->skip_checks = 1;
#endif
        return sljit_emit_op2(compiler, op, dst_save, dstw_save, dst_save, dstw_save, TMP_REG1, 0);
#endif /* SLJIT_CONFIG_X86_64 */
}

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_local_base(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw offset)
{
        CHECK_ERROR();
        CHECK(check_sljit_get_local_base(compiler, dst, dstw, offset));
        ADJUST_LOCAL_OFFSET(dst, dstw);

        CHECK_EXTRA_REGS(dst, dstw, (void)0);

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
        compiler->mode32 = 0;
#endif

        ADJUST_LOCAL_OFFSET(SLJIT_MEM1(SLJIT_SP), offset);

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
        if (NOT_HALFWORD(offset)) {
                FAIL_IF(emit_load_imm64(compiler, TMP_REG1, offset));
#if (defined SLJIT_DEBUG && SLJIT_DEBUG)
                SLJIT_ASSERT(emit_lea_binary(compiler, SLJIT_KEEP_FLAGS, dst, dstw, SLJIT_SP, 0, TMP_REG1, 0) != SLJIT_ERR_UNSUPPORTED);
                return compiler->error;
#else
                return emit_lea_binary(compiler, SLJIT_KEEP_FLAGS, dst, dstw, SLJIT_SP, 0, TMP_REG1, 0);
#endif
        }
#endif

        if (offset != 0)
                return emit_lea_binary(compiler, SLJIT_KEEP_FLAGS, dst, dstw, SLJIT_SP, 0, SLJIT_IMM, offset);
        return emit_mov(compiler, dst, dstw, SLJIT_SP, 0);
}
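
/* Usage sketch for sljit_get_local_base (hypothetical caller code, not
   part of this file): to load the address of the local at offset 16 into
   R0,

       sljit_get_local_base(compiler, SLJIT_R0, 0, 16);

   which typically becomes a single "lea dst, [esp/rsp + offset]". Offsets
   outside the signed 32-bit range on x86-64 are first loaded into TMP_REG1
   so the lea can take a register operand instead of an immediate. */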

SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw init_value)
{
        sljit_u8 *inst;
        struct sljit_const *const_;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
        sljit_s32 reg;
#endif

        CHECK_ERROR_PTR();
        CHECK_PTR(check_sljit_emit_const(compiler, dst, dstw, init_value));
        ADJUST_LOCAL_OFFSET(dst, dstw);

        CHECK_EXTRA_REGS(dst, dstw, (void)0);

        const_ = (struct sljit_const*)ensure_abuf(compiler, sizeof(struct sljit_const));
        PTR_FAIL_IF(!const_);
        set_const(const_, compiler);

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
        compiler->mode32 = 0;
        reg = SLOW_IS_REG(dst) ? dst : TMP_REG1;

        if (emit_load_imm64(compiler, reg, init_value))
                return NULL;
#else
        if (dst == SLJIT_UNUSED)
                dst = TMP_REG1;

        if (emit_mov(compiler, dst, dstw, SLJIT_IMM, init_value))
                return NULL;
#endif

        inst = (sljit_u8*)ensure_buf(compiler, 2);
        PTR_FAIL_IF(!inst);

        *inst++ = 0;
        *inst++ = 1;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
        if (dst & SLJIT_MEM)
                if (emit_mov(compiler, dst, dstw, TMP_REG1, 0))
                        return NULL;
#endif

        return const_;
}
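
/* Usage sketch for patchable constants (hypothetical caller code; assumes
   the sljit_get_const_addr helper from sljitLir): the immediate field of
   the emitted mov sits at a known address, so the value can be rewritten
   later without regenerating the function:

       struct sljit_const *c = sljit_emit_const(compiler, SLJIT_R0, 0, 0);
       ... generate and install the code ...
       sljit_set_const(sljit_get_const_addr(c), 42);

   On x86-32 the patched field is the imm32 of a "mov r/m32, imm32"; on
   x86-64 it is the imm64 of a "mov r64, imm64" (movabs). */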

SLJIT_API_FUNC_ATTRIBUTE void sljit_set_jump_addr(sljit_uw addr, sljit_uw new_addr)
{
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
        *(sljit_sw*)addr = new_addr - (addr + 4);
#else
        *(sljit_uw*)addr = new_addr;
#endif
}
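
/* Why "new_addr - (addr + 4)" on x86-32: addr points at the rel32 field
   of a jmp/call, and rel32 displacements are relative to the end of the
   instruction, which is addr + 4. Worked example (illustrative numbers):
   a field at 0x1000 targeting 0x1100 stores 0x1100 - 0x1004 = 0xFC. On
   x86-64 the target lives in a 64-bit immediate instead, so the absolute
   address is stored as-is. */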

SLJIT_API_FUNC_ATTRIBUTE void sljit_set_const(sljit_uw addr, sljit_sw new_constant)
{
        *(sljit_sw*)addr = new_constant;
}

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_x86_is_sse2_available(void)
{
#if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
        if (cpu_has_sse2 == -1)
                get_cpu_features();
        return cpu_has_sse2;
#else
        return 1;
#endif
}

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_x86_is_cmov_available(void)
{
        if (cpu_has_cmov == -1)
                get_cpu_features();
        return cpu_has_cmov;
}

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_x86_emit_cmov(struct sljit_compiler *compiler,
        sljit_s32 type,
        sljit_s32 dst_reg,
        sljit_s32 src, sljit_sw srcw)
{
        sljit_u8* inst;

        CHECK_ERROR();
#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
        CHECK_ARGUMENT(sljit_x86_is_cmov_available());
        CHECK_ARGUMENT(!(type & ~(0xff | SLJIT_I32_OP)));
        CHECK_ARGUMENT((type & 0xff) >= SLJIT_EQUAL && (type & 0xff) <= SLJIT_ORDERED_F64);
        CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(dst_reg & ~SLJIT_I32_OP));
        FUNCTION_CHECK_SRC(src, srcw);
#endif
#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
        if (SLJIT_UNLIKELY(!!compiler->verbose)) {
                fprintf(compiler->verbose, "  x86_cmov%s %s%s, ",
                        !(dst_reg & SLJIT_I32_OP) ? "" : ".i",
                        jump_names[type & 0xff], JUMP_POSTFIX(type));
                sljit_verbose_reg(compiler, dst_reg & ~SLJIT_I32_OP);
                fprintf(compiler->verbose, ", ");
                sljit_verbose_param(compiler, src, srcw);
                fprintf(compiler->verbose, "\n");
        }
#endif

        ADJUST_LOCAL_OFFSET(src, srcw);
        CHECK_EXTRA_REGS(src, srcw, (void)0);

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
        compiler->mode32 = dst_reg & SLJIT_I32_OP;
#endif
        dst_reg &= ~SLJIT_I32_OP;

        if (SLJIT_UNLIKELY(src & SLJIT_IMM)) {
                EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcw);
                src = TMP_REG1;
                srcw = 0;
        }

        inst = emit_x86_instruction(compiler, 2, dst_reg, 0, src, srcw);
        FAIL_IF(!inst);
        *inst++ = GROUP_0F;
        *inst = get_jump_code(type & 0xff) - 0x40;
        return SLJIT_SUCCESS;
}
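
/* Usage sketch for sljit_x86_emit_cmov (hypothetical caller code, written
   against the flag-setting conventions of this sljit version): a
   branch-free signed maximum of R0 and R1, guarded by the availability
   check that the argument checks above demand:

       if (sljit_x86_is_cmov_available()) {
           sljit_emit_op2(compiler, SLJIT_SUB | SLJIT_SET_S, SLJIT_UNUSED, 0,
                   SLJIT_R0, 0, SLJIT_R1, 0);
           sljit_x86_emit_cmov(compiler, SLJIT_SIG_LESS, SLJIT_R0,
                   SLJIT_R1, 0);
       }
*/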