diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 0528289f3..60fff3b23 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -22,7 +22,7 @@ jobs: fail-fast: false matrix: target: [linux, darwin, windows] - architecture: [32, 64, arm64] + architecture: [64, arm64] build_system: [make, cmake, cmake-mingw, cmake-clang-cl, vs2019, makegcc14] include: @@ -429,7 +429,7 @@ jobs: fail-fast: false matrix: os: [darwin, linux, windows] - architecture: [x86_32, x86_64, arm64] + architecture: [x86_64, arm64] include: - architecture: arm64 test-flags: --skip-hl-jit # not yet supported diff --git a/CMakeLists.txt b/CMakeLists.txt index ddd2fd260..5cbb7277a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.13) -set(HL_VERSION_MAJOR 1) -set(HL_VERSION_MINOR 16) +set(HL_VERSION_MAJOR 2) +set(HL_VERSION_MINOR 0) set(HL_VERSION_PATCH 0) set(HL_VERSION ${HL_VERSION_MAJOR}.${HL_VERSION_MINOR}.${HL_VERSION_PATCH}) @@ -20,7 +20,8 @@ include(FindPkgConfig) include(CTest) set(WITH_VM_DEFAULT ON) -if(CMAKE_SYSTEM_PROCESSOR MATCHES "arm|aarch64" AND (NOT CMAKE_OSX_ARCHITECTURES MATCHES "x86_64")) +# 32-bit ARM has no JIT backend; aarch64/arm64 uses src/jit_aarch64.c. 
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "^arm$|^armv[0-9]|^armhf|^armel|^armeb" AND (NOT CMAKE_OSX_ARCHITECTURES MATCHES "x86_64")) set(WITH_VM_DEFAULT OFF) endif() @@ -225,9 +226,18 @@ else() endif() if (WITH_VM) + if("${CMAKE_OSX_ARCHITECTURES}" STREQUAL "arm64" OR (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64|ARM64" AND NOT "${CMAKE_OSX_ARCHITECTURES}" STREQUAL "x86_64")) #[[ CMAKE_SYSTEM_PROCESSOR reports the host CPU on macOS, so an explicit single-arch CMAKE_OSX_ARCHITECTURES request decides the backend ]] + set(HL_JIT_BACKEND src/jit_aarch64.c src/jit_aarch64_emit.c) + else() + set(HL_JIT_BACKEND src/jit_x86_64.c) + endif() add_executable(hl src/code.c src/jit.c + src/jit_emit.c + src/jit_regs.c + ${HL_JIT_BACKEND} + src/jit_dump.c src/main.c src/module.c src/debugger.c diff --git a/Makefile b/Makefile index aded6c272..e1ffa169b 100644 --- a/Makefile +++ b/Makefile @@ -41,7 +41,13 @@ STD = src/std/array.o src/std/buffer.o src/std/bytes.o src/std/cast.o src/std/da src/std/socket.o src/std/string.o src/std/sys.o src/std/types.o src/std/ucs2.o src/std/thread.o src/std/process.o \ src/std/track.o -HL_OBJ = src/code.o src/jit.o src/main.o src/module.o src/debugger.o src/profile.o +ifeq ($(ARCH),arm64) +HL_JIT_BACKEND_OBJ = src/jit_aarch64.o src/jit_aarch64_emit.o +else +HL_JIT_BACKEND_OBJ = src/jit_x86_64.o +endif + +HL_OBJ = src/code.o src/jit.o src/jit_emit.o src/jit_regs.o $(HL_JIT_BACKEND_OBJ) src/jit_dump.o src/main.o src/module.o src/debugger.o src/profile.o FMT_CPPFLAGS = -I include/mikktspace -I include/minimp3 @@ -240,19 +246,12 @@ LIBHL = libhl.$(LIBEXT) HL = hl$(EXE_SUFFIX) HLC = hlc$(EXE_SUFFIX) -all: $(LIBHL) libs -ifeq ($(ARCH),arm64) - $(warning HashLink vm is not supported on arm64, skipping) -else -all: $(HL) -endif +all: $(LIBHL) libs $(HL) install: $(UNAME)==Darwin && ${MAKE} uninstall -ifneq ($(ARCH),arm64) mkdir -p $(INSTALL_BIN_DIR) cp $(HL) $(INSTALL_BIN_DIR) -endif mkdir -p $(INSTALL_LIB_DIR) cp *.hdll $(INSTALL_LIB_DIR) cp $(LIBHL) $(INSTALL_LIB_DIR) @@ -365,11 +364,7 @@ release_win: rm -rf $(PACKAGE_NAME) release_linux release_osx: -ifeq ($(ARCH),arm64) - cp $(LIBHL) *.hdll $(PACKAGE_NAME) -else cp $(HL) $(LIBHL) *.hdll $(PACKAGE_NAME) -endif tar -cvzf $(PACKAGE_NAME).tar.gz $(PACKAGE_NAME) rm -rf
$(PACKAGE_NAME) diff --git a/hl.vcxproj b/hl.vcxproj index 88e95b28b..fef4a909e 100644 --- a/hl.vcxproj +++ b/hl.vcxproj @@ -45,55 +45,55 @@ Application true Unicode - v142 + v143 Application true Unicode - v142 + v143 Application false true Unicode - v142 + v143 Application false true Unicode - v142 + v143 Application false true Unicode - v120 + v143 Application false true Unicode - v142 + v143 Application false true Unicode - v142 + v143 Application false true Unicode - v120 + v143 @@ -186,7 +186,7 @@ EnableAllWarnings Disabled WIN32;_DEBUG;_CONSOLE;HL_VTUNE;%(PreprocessorDefinitions) - /wd4456 /wd4100 /wd4204 /wd4702 /wd4457 %(AdditionalOptions) + /wd4456 /wd4100 /wd4204 /wd4702 /wd4457 /we4013 %(AdditionalOptions) true stdc11 @@ -196,6 +196,7 @@ libhl.lib;user32.lib;include/vtune/jitprofiling.lib false false + 4194304 PerMonitorHighDPIAware @@ -361,14 +362,20 @@ + + + + + + diff --git a/hl.vcxproj.filters b/hl.vcxproj.filters index f86723996..8a8395f72 100644 --- a/hl.vcxproj.filters +++ b/hl.vcxproj.filters @@ -4,14 +4,20 @@ - + + + + + + + \ No newline at end of file diff --git a/libhl.vcxproj b/libhl.vcxproj index 40f1a2eff..1f86fe1a7 100644 --- a/libhl.vcxproj +++ b/libhl.vcxproj @@ -36,40 +36,40 @@ DynamicLibrary true - v142 + v143 Unicode DynamicLibrary false - v142 + v143 true Unicode DynamicLibrary false - v120 + v143 true Unicode DynamicLibrary true - v142 + v143 Unicode DynamicLibrary false - v142 + v143 true Unicode DynamicLibrary false - v120 + v143 true Unicode diff --git a/src/allocator.c b/src/allocator.c index 47dfc8f41..f9bd63420 100644 --- a/src/allocator.c +++ b/src/allocator.c @@ -313,6 +313,8 @@ static void *gc_alloc_fixed( int part, int kind ) { for(i=0;iblock_size;i++) if( ptr[i] != 0xDD ) hl_fatal("assert"); + else + ptr[i] = 0xCD; } # endif gc_free_pages[pid] = ph; @@ -367,6 +369,8 @@ static void *gc_alloc_var( int part, int size, int kind ) { for(i=0;ibmp ) { diff --git a/src/data_struct.c b/src/data_struct.c new file mode 100644 index 
000000000..ed417770e --- /dev/null +++ b/src/data_struct.c @@ -0,0 +1,308 @@ +/* + * Copyright (C)2015-2026 Haxe Foundation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ +#ifdef S_TYPE + +// is included by data_struct.h + +#ifdef S_MAP +# define S_ARGS S_KEY k, S_VALUE v +#else +# define S_ARGS S_VALUE k +# define S_KEY S_VALUE +# define keys values +#endif + +#ifndef S_DEFVAL +# define S_DEFVAL (S_VALUE)0 +#endif + +#ifndef S_CMP +# define S_CMP(a,b) a > b +#endif + +typedef struct { + int cur; + int max; + S_KEY *keys; +# ifdef S_MAP + S_VALUE *values; +# endif +} S_TYPE; + +typedef S_VALUE S_NAME(_value); +#ifdef S_MAP +typedef S_KEY S_NAME(_key); +#endif + +INLINE static void S_NAME(check_size)( hl_alloc *alloc, S_TYPE *st ) { + if( st->cur == st->max ) { + int n = st->max ? 
(st->max << 1) : STRUCT_DEF_SIZE; + S_KEY *keys = (S_KEY*)hl_malloc(alloc,sizeof(S_KEY) * n); + memcpy(keys,st->keys,sizeof(S_KEY) * st->cur); + st->keys = keys; +# ifdef S_MAP + S_VALUE *vals = (S_VALUE*)hl_malloc(alloc,sizeof(S_VALUE) * n); + memcpy(vals,st->values,sizeof(S_VALUE) * st->cur); + st->values = vals; +# endif + st->max = n; + } +} + +#ifndef S_SORTED + +INLINE static void S_NAME(add_impl)( hl_alloc *alloc, S_TYPE *st, S_ARGS ) { + S_NAME(check_size)(alloc,st); + st->keys[st->cur] = k; +# ifdef S_MAP + st->values[st->cur] = v; +# endif + st->cur++; +} + +INLINE static bool S_NAME(exists)( S_TYPE st, S_KEY k ) { + for(int i=0;icur;i++) + if( st->keys[i] == k ) { + int pos = i; + memmove(st->keys + pos, st->keys + pos + 1, (st->cur - pos - 1) * sizeof(S_KEY)); +# ifdef S_MAP + memmove(st->values + pos, st->values + pos + 1, (st->cur - pos - 1) * sizeof(S_VALUE)); +# endif + st->cur--; + return true; + } + return false; +} + +INLINE static void S_NAME(remove_range)( S_TYPE *st, int pos, int count ) { + memmove(st->keys + pos, st->keys + pos + count, (st->cur - pos - count) * sizeof(S_KEY)); +# ifdef S_MAP + memmove(st->values + pos, st->values + pos + count, (st->cur - pos - count) * sizeof(S_VALUE)); +# endif + st->cur -= count; +} + +#ifdef S_MAP +static S_VALUE S_NAME(find)( S_TYPE st, S_KEY k ) { + for(int i=0;icur + count > st->max ) { + int n = st->max ? 
(st->max << 1) : STRUCT_DEF_SIZE; + while( n < st->cur + count ) n <<= 1; + S_KEY *keys = (S_KEY*)hl_malloc(alloc,sizeof(S_KEY) * n); + memcpy(keys,st->keys,sizeof(S_KEY) * st->cur); + st->keys = keys; + st->max = n; + } + S_VALUE *ptr = st->keys + st->cur; + st->cur += count; + return ptr; +} +#endif + + +#else + +INLINE static bool S_NAME(add_impl)( hl_alloc *alloc, S_TYPE *st, S_ARGS ) { + int min = 0; + int max = st->cur; + int pos; + while( min < max ) { + int mid = (min + max) >> 1; + S_KEY k2 = st->keys[mid]; + if( S_CMP(k,k2) ) min = mid + 1; else if( S_CMP(k2,k) ) max = mid; else return false; + } + S_NAME(check_size)(alloc,st); + pos = (min + max) >> 1; + memmove(st->keys + pos + 1, st->keys + pos, (st->cur - pos) * sizeof(S_KEY)); +# ifdef S_MAP + memmove(st->values + pos + 1, st->values + pos, (st->cur - pos) * sizeof(S_VALUE)); +# endif + st->keys[pos] = k; +# ifdef S_MAP + st->values[pos] = v; +# endif + st->cur++; + return true; +} + +#ifdef S_MAP +INLINE static void S_NAME(replace_impl)( hl_alloc *alloc, S_TYPE *st, S_ARGS ) { + int min = 0; + int max = st->cur; + int pos; + while( min < max ) { + int mid = (min + max) >> 1; + S_KEY k2 = st->keys[mid]; + if( k2 < k ) min = mid + 1; else if( k2 > k ) max = mid; else { + st->values[mid] = v; + return; + } + } + S_NAME(check_size)(alloc,st); + pos = (min + max) >> 1; + memmove(st->keys + pos + 1, st->keys + pos, (st->cur - pos) * sizeof(S_KEY)); + memmove(st->values + pos + 1, st->values + pos, (st->cur - pos) * sizeof(S_VALUE)); + st->keys[pos] = k; + st->values[pos] = v; + st->cur++; +} + +INLINE static bool S_NAME(add_pair_impl)( hl_alloc *alloc, S_TYPE *st, S_ARGS ) { + int min = 0; + int max = st->cur; + int pos; + while( min < max ) { + int mid = (min + max) >> 1; + S_KEY k2 = st->keys[mid]; + if( k2 < k ) min = mid + 1; else if( k2 > k ) max = mid; else { + S_VALUE v2 = st->values[mid]; + if( S_CMP(v,v2) ) min = mid+1; else if( S_CMP(v2,v) ) max = mid; else return false; + } + } + 
S_NAME(check_size)(alloc,st); + pos = (min + max) >> 1; + memmove(st->keys + pos + 1, st->keys + pos, (st->cur - pos) * sizeof(S_KEY)); + memmove(st->values + pos + 1, st->values + pos, (st->cur - pos) * sizeof(S_VALUE)); + st->keys[pos] = k; + st->values[pos] = v; + st->cur++; + return true; +} +#endif + +INLINE static bool S_NAME(exists)( S_TYPE st, S_KEY k ) { + int min = 0; + int max = st.cur; + while( min < max ) { + int mid = (min + max) >> 1; + S_KEY k2 = st.keys[mid]; + if( S_CMP(k,k2) ) min = mid + 1; else if( S_CMP(k2,k) ) max = mid; else return true; + } + return false; +} + +#ifdef S_MAP +INLINE static S_VALUE S_NAME(find)( S_TYPE st, S_KEY k ) { + int min = 0; + int max = st.cur; + while( min < max ) { + int mid = (min + max) >> 1; + S_KEY k2 = st.keys[mid]; + if( k2 < k ) min = mid + 1; else if( k2 > k ) max = mid; else return st.values[mid]; + } + return S_DEFVAL; +} +#endif + +INLINE static bool S_NAME(remove)( S_TYPE *st, S_KEY k ) { + int min = 0; + int max = st->cur; + while( min < max ) { + int mid = (min + max) >> 1; + S_KEY k2 = st->keys[mid]; + if( S_CMP(k,k2) ) min = mid + 1; else if( S_CMP(k2,k) ) max = mid; else { + int pos = mid; + memmove(st->keys + pos, st->keys + pos + 1, (st->cur - pos - 1) * sizeof(S_KEY)); +# ifdef S_MAP + memmove(st->values + pos, st->values + pos + 1, (st->cur - pos - 1) * sizeof(S_VALUE)); +# endif + st->cur--; + return true; + } + } + return false; +} + +#endif + +INLINE static void S_NAME(reset)( S_TYPE *st ) { + st->cur = 0; +} + +INLINE static S_VALUE *S_NAME(free)( S_TYPE *st ) { + st->cur = 0; + st->max = 0; + S_VALUE *vals = st->values; +# ifdef S_MAP + st->keys = NULL; +# endif + st->values = NULL; + return vals; +} + +INLINE static int S_NAME(count)( S_TYPE st ) { + return st.cur; +} + +INLINE static S_VALUE S_NAME(get)( S_TYPE st, int idx ) { + return st.values[idx]; +} + +INLINE static S_VALUE *S_NAME(addr)( S_TYPE st, int idx ) { + return &st.values[idx]; +} + +INLINE static S_VALUE S_NAME(first)( 
S_TYPE st ) { + return st.cur == 0 ? S_DEFVAL : st.values[0]; +} + +INLINE static bool S_NAME(iter_next)( S_TYPE st, S_VALUE *val, int idx ) { + if( idx < st.cur ) *val = st.values[idx]; + return idx < st.cur; +} + +#ifdef S_MAP +INLINE static bool S_NAME(iter_next_key)( S_TYPE st, S_KEY *key, int idx ) { + if( idx < st.cur ) *key = st.keys[idx]; + return idx < st.cur; +} +#endif + +INLINE static bool S_NAME(iter_prev)( S_TYPE st, S_VALUE *val, int idx ) { + if( idx >= 0 ) *val = st.values[idx]; + return idx >= 0; +} + +#undef S_NAME +#undef S_TYPE +#undef S_VALUE +#undef S_KEY +#undef S_ARGS +#undef STRUCT_NAME +#undef S_CMP +#undef S_DEFVAL +#undef keys + +#endif diff --git a/src/data_struct.h b/src/data_struct.h new file mode 100644 index 000000000..5c5b9fe4e --- /dev/null +++ b/src/data_struct.h @@ -0,0 +1,82 @@ +/* + * Copyright (C)2015-2026 Haxe Foundation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ +#ifndef HL_DATA_STRUCT_H +#define HL_DATA_STRUCT_H + +#include + +#if defined(__GNUC__) || defined(__clang__) +#define INLINE inline __attribute__((always_inline)) +#elif defined(_MSC_VER) +#define INLINE __forceinline +#else +#define INLINE inline +#endif + +#define STRUCT_DEF_SIZE 2 +#define for_iter(name,var,set) name##__value var; for(int __idx=0;name##_iter_next(set,&var,__idx);__idx++) +#define for_iter_key(name,var,set) name##__key var; for(int __idx=0;name##_iter_next_key(set,&var,__idx);__idx++) +#define for_iter_back(name,var,set) name##__value var; for(int __idx=(set).cur-1;name##_iter_prev(set,&var,__idx);__idx--) + +#define S_TYPE ptr_set +#define S_NAME(name) ptr_set_##name +#define S_VALUE void* +#include "data_struct.c" +#define ptr_set_add(set,v) ptr_set_add_impl(DEF_ALLOC,&(set),v) + +#define S_TYPE int_arr +#define S_NAME(name) int_arr_##name +#define S_VALUE int +#include "data_struct.c" +#define int_arr_add(set,v) int_arr_add_impl(DEF_ALLOC,&(set),v) +#define int_arr_reserve(set,v) int_arr_reserve_impl(DEF_ALLOC,&(set),v) + +#define S_SORTED + +#define S_TYPE int_set +#define S_NAME(name) int_set_##name +#define S_VALUE int +#include "data_struct.c" +#define int_set_add(set,v) int_set_add_impl(DEF_ALLOC,&(set),v) + +#define S_MAP + +#define S_TYPE int_map +#define S_NAME(name) int_map_##name +#define S_KEY int +#define S_VALUE int +#include "data_struct.c" +#define int_map_add(map,k,v) int_map_add_impl(DEF_ALLOC,&(map),k,v) +#define int_map_replace(map,k,v) int_map_replace_impl(DEF_ALLOC,&(map),k,v) + +#define S_TYPE ptr_map +#define S_NAME(name) ptr_map_##name +#define S_KEY int +#define S_VALUE void* +#include "data_struct.c" +#define ptr_map_add(map,k,v) ptr_map_add_impl(DEF_ALLOC,&(map),k,v) +#define ptr_map_replace(map,k,v) ptr_map_replace_impl(DEF_ALLOC,&(map),k,v) + +#undef S_MAP +#undef S_SORTED + +#endif diff --git a/src/hl.h b/src/hl.h index 6220eb369..e21be7f92 100644 --- a/src/hl.h +++ b/src/hl.h @@ -27,7 +27,7 @@ 
https://github.com/HaxeFoundation/hashlink/wiki/ **/ -#define HL_VERSION 0x011000 +#define HL_VERSION 0x020000 #if defined(_WIN32) # define HL_WIN diff --git a/src/hlmodule.h b/src/hlmodule.h index b2619f932..adf29f9bd 100644 --- a/src/hlmodule.h +++ b/src/hlmodule.h @@ -19,6 +19,9 @@ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ +#ifndef HL_MODULE_H +#define HL_MODULE_H + #include #include #include "opcodes.h" @@ -104,9 +107,6 @@ typedef struct { bool large; } hl_debug_infos; -typedef struct _jit_ctx jit_ctx; - - typedef struct { hl_code *code; int *types_hashes; @@ -124,6 +124,8 @@ typedef struct { #endif #endif +typedef struct _jit_ctx jit_ctx; + typedef struct { hl_code *code; int codesize; @@ -138,6 +140,7 @@ typedef struct { jit_ctx *jit_ctx; hl_module_context ctx; #ifdef WIN64_UNWIND_TABLES + int unwind_table_size; PRUNTIME_FUNCTION unwind_table; #endif } hl_module; @@ -165,10 +168,4 @@ hl_type *hl_module_resolve_type( hl_module *m, hl_type *t, bool err ); void hl_profile_setup( int sample_count ); void hl_profile_end(); -jit_ctx *hl_jit_alloc(); -void hl_jit_free( jit_ctx *ctx, h_bool can_reset ); -void hl_jit_reset( jit_ctx *ctx, hl_module *m ); -void hl_jit_init( jit_ctx *ctx, hl_module *m ); -int hl_jit_function( jit_ctx *ctx, hl_module *m, hl_function *f ); -void *hl_jit_code( jit_ctx *ctx, hl_module *m, int *codesize, hl_debug_infos **debug, hl_module *previous ); -void hl_jit_patch_method( void *old_fun, void **new_fun_table ); +#endif diff --git a/src/jit.c b/src/jit.c index b1f82b0fa..ddf9a187d 100644 --- a/src/jit.c +++ b/src/jit.c @@ -19,4753 +19,330 @@ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. 
*/ -#ifdef _MSC_VER -#pragma warning(disable:4820) -#endif -#include -#include -#include "hlsystem.h" - -#ifdef __arm__ -# error "JIT does not support ARM processors, only x86 and x86-64 are supported, please use HashLink/C native compilation instead" -#endif - -#ifdef HL_DEBUG -# define JIT_DEBUG -#endif - -typedef enum { - Eax = 0, - Ecx = 1, - Edx = 2, - Ebx = 3, - Esp = 4, - Ebp = 5, - Esi = 6, - Edi = 7, -#ifdef HL_64 - R8 = 8, - R9 = 9, - R10 = 10, - R11 = 11, - R12 = 12, - R13 = 13, - R14 = 14, - R15 = 15, -#endif - _LAST = 0xFF -} CpuReg; - -typedef enum { - MOV, - LEA, - PUSH, - ADD, - SUB, - IMUL, // only overflow flag changes compared to MUL - DIV, - IDIV, - CDQ, - CDQE, - POP, - RET, - CALL, - AND, - OR, - XOR, - CMP, - TEST, - NOP, - SHL, - SHR, - SAR, - INC, - DEC, - JMP, - // FPU - FSTP, - FSTP32, - FLD, - FLD32, - FLDCW, - // SSE - MOVSD, - MOVSS, - COMISD, - COMISS, - ADDSD, - SUBSD, - MULSD, - DIVSD, - ADDSS, - SUBSS, - MULSS, - DIVSS, - XORPD, - CVTSI2SD, - CVTSI2SS, - CVTSD2SI, - CVTSD2SS, - CVTSS2SD, - CVTSS2SI, - STMXCSR, - LDMXCSR, - // 8-16 bits - MOV8, - CMP8, - TEST8, - PUSH8, - MOV16, - CMP16, - TEST16, - // prefetchs - PREFETCHT0, - PREFETCHT1, - PREFETCHT2, - PREFETCHNTA, - PREFETCHW, - // -- - _CPU_LAST -} CpuOp; - -#define JAlways 0 -#define JOverflow 0x80 -#define JULt 0x82 -#define JUGte 0x83 -#define JEq 0x84 -#define JNeq 0x85 -#define JULte 0x86 -#define JUGt 0x87 -#define JParity 0x8A -#define JNParity 0x8B -#define JSLt 0x8C -#define JSGte 0x8D -#define JSLte 0x8E -#define JSGt 0x8F - -#define JCarry JLt -#define JZero JEq -#define JNotZero JNeq - -#define B(bv) *ctx->buf.b++ = (unsigned char)(bv) -#define W(wv) *ctx->buf.w++ = wv - -#ifdef HL_64 -# define W64(wv) *ctx->buf.w64++ = wv -#else -# define W64(wv) W(wv) -#endif - -static const int SIB_MULT[] = {-1, 0, 1, -1, 2, -1, -1, -1, 3}; - -#define MOD_RM(mod,reg,rm) B(((mod) << 6) | (((reg)&7) << 3) | ((rm)&7)) -#define SIB(mult,rmult,rbase) B((SIB_MULT[mult]<<6) | 
(((rmult)&7)<<3) | ((rbase)&7)) -#define IS_SBYTE(c) ( (c) >= -128 && (c) < 128 ) - -#define AddJump(how,local) { if( (how) == JAlways ) { B(0xE9); } else { B(0x0F); B(how); }; local = BUF_POS(); W(0); } -#define AddJump_small(how,local) { if( (how) == JAlways ) { B(0xEB); } else B(how - 0x10); local = BUF_POS() | 0x40000000; B(0); } -#define XJump(how,local) AddJump(how,local) -#define XJump_small(how,local) AddJump_small(how,local) +#include -#define MAX_OP_SIZE 256 +static jit_ctx *current_ctx = NULL; -#define BUF_POS() ((int)(ctx->buf.b - ctx->startBuf)) -#define RTYPE(r) r->t->kind - -#ifdef HL_64 -# define RESERVE_ADDRESS 0x8000000000000000 -#else -# define RESERVE_ADDRESS 0x80000000 -#endif - -#if defined(HL_WIN_CALL) && defined(HL_64) -# define IS_WINCALL64 1 -#else -# define IS_WINCALL64 0 -#endif - -typedef struct jlist jlist; -struct jlist { - int pos; - int target; - jlist *next; -}; - -typedef struct vreg vreg; - -typedef enum { - RCPU = 0, - RFPU = 1, - RSTACK = 2, - RCONST = 3, - RADDR = 4, - RMEM = 5, - RUNUSED = 6, - RCPU_CALL = 1 | 8, - RCPU_8BITS = 1 | 16 -} preg_kind; - -typedef struct { - preg_kind kind; - int id; - int lock; - vreg *holds; -} preg; - -struct vreg { - int stackPos; - int size; - hl_type *t; - preg *current; - preg stack; -}; - -#define REG_AT(i) (ctx->pregs + (i)) +void hl_jit_error( const char *msg, const char *func, int line ) { + printf("*** JIT ERROR %s:%d (%s)****\n", func, line, msg); + if( current_ctx ) { + jit_ctx *ctx = current_ctx; + current_ctx = NULL; + hl_emit_dump(ctx); + } + fflush(stdout); +} -#ifdef HL_64 -# define RCPU_COUNT 16 -# define RFPU_COUNT 16 -# ifdef HL_WIN_CALL -# define CALL_NREGS 4 -# define RCPU_SCRATCH_COUNT 7 -# define RFPU_SCRATCH_COUNT 6 -static const int RCPU_SCRATCH_REGS[] = { Eax, Ecx, Edx, R8, R9, R10, R11 }; -static const CpuReg CALL_REGS[] = { Ecx, Edx, R8, R9 }; -# else -# define CALL_NREGS 6 // TODO : XMM6+XMM7 are FPU reg parameters -# define RCPU_SCRATCH_COUNT 9 -# define 
RFPU_SCRATCH_COUNT 16 -static const int RCPU_SCRATCH_REGS[] = { Eax, Ecx, Edx, Esi, Edi, R8, R9, R10, R11 }; -static const CpuReg CALL_REGS[] = { Edi, Esi, Edx, Ecx, R8, R9 }; -# endif -#else -# define CALL_NREGS 0 -# define RCPU_COUNT 8 -# define RFPU_COUNT 8 -# define RCPU_SCRATCH_COUNT 3 -# define RFPU_SCRATCH_COUNT 8 -static const int RCPU_SCRATCH_REGS[] = { Eax, Ecx, Edx }; -#endif +void hl_jit_null_field_access( int fhash ) { + vbyte *field = hl_field_name(fhash); + hl_buffer *b = hl_alloc_buffer(); + hl_buffer_str(b, USTR("Null access .")); + hl_buffer_str(b, (uchar*)field); + vdynamic *d = hl_alloc_dynamic(&hlt_bytes); + d->v.ptr = hl_buffer_content(b,NULL); + hl_throw(d); +} -#define XMM(i) ((i) + RCPU_COUNT) -#define PXMM(i) REG_AT(XMM(i)) -#define REG_IS_FPU(i) ((i) >= RCPU_COUNT) +void hl_jit_assert() { + vdynamic *d = hl_alloc_dynamic(&hlt_bytes); + d->v.ptr = USTR("Assert"); + hl_throw(d); +} -#define PEAX REG_AT(Eax) -#define PESP REG_AT(Esp) -#define PEBP REG_AT(Ebp) +void hl_emit_alloc( jit_ctx *jit ); +void hl_emit_free( jit_ctx *jit ); +void hl_emit_function( jit_ctx *jit ); +void hl_emit_final( jit_ctx *jit ); -#define REG_COUNT (RCPU_COUNT + RFPU_COUNT) +void hl_regs_alloc( jit_ctx *jit ); +void hl_regs_free( jit_ctx *jit ); +void hl_regs_function( jit_ctx *jit ); -#define ID2(a,b) ((a) | ((b)<<8)) -#define R(id) (ctx->vregs + (id)) -#define ASSERT(i) { printf("JIT ERROR %d (jit.c line %d)\n",i,(int)__LINE__); jit_exit(); } -#define IS_FLOAT(r) ((r)->t->kind == HF64 || (r)->t->kind == HF32) -#define RLOCK(r) if( (r)->lock < ctx->currentPos ) (r)->lock = ctx->currentPos -#define RUNLOCK(r) if( (r)->lock == ctx->currentPos ) (r)->lock = 0 +void hl_codegen_alloc( jit_ctx *jit ); +void hl_codegen_init( jit_ctx *jit ); +void hl_codegen_free( jit_ctx *jit ); +void hl_codegen_flush_consts( jit_ctx *jit ); +void hl_codegen_function( jit_ctx *jit ); +void hl_codegen_final( jit_ctx *jit ); -#define BREAK() B(0xCC) +void hl_jit_init_regs( regs_config *cfg 
); -static preg _unused = { RUNUSED, 0, 0, NULL }; -static preg *UNUSED = &_unused; +jit_ctx *hl_jit_alloc() { /* Allocates a zero-filled jit_ctx, then initialises its register config, falloc allocator and the emit/regs/codegen stages; returns NULL when memory cannot be obtained. */ + jit_ctx *ctx = (jit_ctx*)calloc(1,sizeof(jit_ctx)); + if( ctx == NULL ) return NULL; /* previously the unchecked malloc result was handed straight to memset */ + hl_jit_init_regs(&ctx->cfg); + hl_alloc_init(&ctx->falloc); + hl_emit_alloc(ctx); + hl_regs_alloc(ctx); + hl_codegen_alloc(ctx); + return ctx; +} -struct _jit_ctx { - union { - unsigned char *b; - unsigned int *w; - unsigned long long *w64; - int *i; - double *d; - } buf; - vreg *vregs; - preg pregs[REG_COUNT]; - vreg *savedRegs[REG_COUNT]; - int savedLocks[REG_COUNT]; - int *opsPos; - int maxRegs; - int maxOps; - int bufSize; - int totalRegsSize; - int functionPos; - int allocOffset; - int currentPos; - int nativeArgsCount; - unsigned char *startBuf; - hl_module *m; - hl_function *f; - jlist *jumps; - jlist *calls; - jlist *switchs; - hl_alloc falloc; // cleared per-function - hl_alloc galloc; - vclosure *closure_list; - hl_debug_infos *debug; - int c2hl; - int hl2c; -#ifdef JIT_CUSTOM_LONGJUMP - int longjump; -#endif - void *static_functions[8]; - bool static_function_offset; +void hl_jit_define_function( jit_ctx *ctx, int start, int size ) { #ifdef WIN64_UNWIND_TABLES - int unwind_offset; - int nunwind; - PRUNTIME_FUNCTION unwind_table; + int fid = ctx->fdef_index++; + if( fid >= ctx->mod->unwind_table_size ) jit_assert(); + ctx->mod->unwind_table[fid].BeginAddress = start; + ctx->mod->unwind_table[fid].EndAddress = start + size; #endif -}; - -#ifdef WIN64_UNWIND_TABLES +} -typedef enum _UNWIND_OP_CODES -{ - UWOP_PUSH_NONVOL = 0, /* info == register number */ - UWOP_ALLOC_LARGE, /* no info, alloc size in next 2 slots */ - UWOP_ALLOC_SMALL, /* info == size of allocation / 8 - 1 */ - UWOP_SET_FPREG, /* no info, FP = RSP + UNWIND_INFO.FPRegOffset*16 */ - UWOP_SAVE_NONVOL, /* info == register number, offset in next slot */ - UWOP_SAVE_NONVOL_FAR, /* info == register number, offset in next 2 slots */ - UWOP_SAVE_XMM128 = 8, /* info == XMM reg number, offset in next slot */
- UWOP_SAVE_XMM128_FAR, /* info == XMM reg number, offset in next 2 slots */ - UWOP_PUSH_MACHFRAME /* info == 0: no error-code, 1: error-code */ -} UNWIND_CODE_OPS; +static bool jit_code_reserve( jit_ctx *ctx, int size ) { /* Makes sure ctx->output has room for `size` more bytes after ctx->out_pos, growing it by factors of 3 (4096 bytes initially); returns false if malloc fails. */ + int pos = ctx->out_pos; + if( pos + size > ctx->out_max ) { + int nsize = ctx->out_max ? ctx->out_max * 3 : 4096; + while( pos + size > nsize ) nsize *= 3; /* grow for the requested size, not ctx->code_size, so every caller gets the room it asked for */ + unsigned char *nout = malloc(nsize); + if( !nout ) return false; + if( ctx->output ) memcpy(nout,ctx->output,pos); /* nothing to copy on the first allocation; avoids memcpy from NULL */ + free(ctx->output); + ctx->output = nout; + ctx->out_max = nsize; + } + return true; +} -void write_uwcode(jit_ctx *ctx, unsigned char offset, UNWIND_CODE_OPS code, unsigned char info) -{ - B(offset); - B((code) | (info) << 4); +static bool jit_code_append( jit_ctx *ctx ) { + if( !jit_code_reserve(ctx,ctx->code_size) ) + return false; + int pos = ctx->out_pos; + memcpy(ctx->output + pos, ctx->code_instrs, ctx->code_size); + ctx->out_pos += ctx->code_size; + return true; } -void write_unwind_data(jit_ctx *ctx) -{ - // All generated functions use a frame pointer, so the same unwind info can be used for all of them +void hl_jit_init( jit_ctx *ctx, hl_module *m ) { + ctx->mod = m; +#ifdef WIN64_UNWIND_TABLES unsigned char version = 1; unsigned char flags = 0; unsigned char CountOfCodes = 2; unsigned char SizeOfProlog = 4; unsigned char FrameRegister = 5; // RBP unsigned char FrameOffset = 0; + jit_code_reserve(ctx,64); +# define B(v) ctx->output[ctx->out_pos++] = v +# define UW(offs,code,inf) B(offs); B((code) | (inf) << 4) B((version) | (flags) << 3); B(SizeOfProlog); B(CountOfCodes); B((FrameRegister) | (FrameOffset) << 4); - write_uwcode(ctx, 4, UWOP_SET_FPREG, 0); - write_uwcode(ctx, 1, UWOP_PUSH_NONVOL, 5); -} -#endif - -#define jit_exit() { hl_debug_break(); exit(-1); } -#define jit_error(msg) _jit_error(ctx,msg,__LINE__) - -#ifndef HL_64 -# ifdef HL_DEBUG -# define error_i64() jit_error("i64-32") -# else -void error_i64() { - printf("The module you are loading is using 64 bit ints
that are not supported by the HL32.\nPlease run using HL64 or compile with -D hl-legacy32"); - jit_exit(); -} -# endif -#endif - -static void _jit_error( jit_ctx *ctx, const char *msg, int line ); -static void on_jit_error( const char *msg, int_val line ); - -static preg *pmem( preg *r, CpuReg reg, int offset ) { - r->kind = RMEM; - r->id = 0 | (reg << 4) | (offset << 8); - return r; -} - -static preg *pmem2( preg *r, CpuReg reg, CpuReg reg2, int mult, int offset ) { - r->kind = RMEM; - r->id = mult | (reg << 4) | (reg2 << 8); - r->holds = (void*)(int_val)offset; - return r; -} - -#ifdef HL_64 -static preg *pcodeaddr( preg *r, int offset ) { - r->kind = RMEM; - r->id = 15 | (offset << 4); - return r; -} -#endif - -static preg *pconst( preg *r, int c ) { - r->kind = RCONST; - r->holds = NULL; - r->id = c; - return r; -} - -static preg *pconst64( preg *r, int_val c ) { -#ifdef HL_64 - if( ((int)c) == c ) - return pconst(r,(int)c); - r->kind = RCONST; - r->id = 0xC064C064; - r->holds = (vreg*)c; - return r; -#else - return pconst(r,(int)c); -#endif -} - -#ifndef HL_64 -// it is not possible to access direct 64 bit address in x86-64 -static preg *paddr( preg *r, void *p ) { - r->kind = RADDR; - r->holds = (vreg*)p; - return r; -} -#endif - -static void save_regs( jit_ctx *ctx ) { - int i; - for(i=0;isavedRegs[i] = ctx->pregs[i].holds; - ctx->savedLocks[i] = ctx->pregs[i].lock; - } -} - -static void restore_regs( jit_ctx *ctx ) { - int i; - for(i=0;imaxRegs;i++) - ctx->vregs[i].current = NULL; - for(i=0;isavedRegs[i]; - preg *p = ctx->pregs + i; - p->holds = r; - p->lock = ctx->savedLocks[i]; - if( r ) r->current = p; - } -} - -static void jit_buf( jit_ctx *ctx ) { - if( BUF_POS() > ctx->bufSize - MAX_OP_SIZE ) { - int nsize = ctx->bufSize * 4 / 3; - unsigned char *nbuf; - int curpos; - if( nsize == 0 ) { - int i; - for(i=0;im->code->nfunctions;i++) - nsize += ctx->m->code->functions[i].nops; - nsize *= 4; - } - if( nsize < ctx->bufSize + MAX_OP_SIZE * 4 ) nsize = 
ctx->bufSize + MAX_OP_SIZE * 4; - curpos = BUF_POS(); - nbuf = (unsigned char*)malloc(nsize); - if( nbuf == NULL ) ASSERT(nsize); - if( ctx->startBuf ) { - memcpy(nbuf,ctx->startBuf,curpos); - free(ctx->startBuf); - } - ctx->startBuf = nbuf; - ctx->buf.b = nbuf + curpos; - ctx->bufSize = nsize; - } -} - -static const char *KNAMES[] = { "cpu","fpu","stack","const","addr","mem","unused" }; -#define ERRIF(c) if( c ) { printf("%s(%s,%s)\n",f?f->name:"???",KNAMES[a->kind], KNAMES[b->kind]); ASSERT(0); } - -typedef struct { - const char *name; // single operand - int r_mem; // r32 / r/m32 r32 - int mem_r; // r/m32 / r32 r/m32 - int r_const; // r32 / imm32 imm32 - int r_i8; // r32 / imm8 imm8 - int mem_const; // r/m32 / imm32 N/A -} opform; - -#define FLAG_LONGOP 0x80000000 -#define FLAG_16B 0x40000000 -#define FLAG_8B 0x20000000 -#define FLAG_DUAL 0x10000000 - -#define RM(op,id) ((op) | (((id)+1)<<8)) -#define GET_RM(op) (((op) >> ((op) < 0 ? 24 : 8)) & 15) -#define SBYTE(op) ((op) << 16) -#define LONG_OP(op) ((op) | FLAG_LONGOP) -#define OP16(op) LONG_OP((op) | FLAG_16B) -#define LONG_RM(op,id) LONG_OP(op | (((id) + 1) << 24)) - -static opform OP_FORMS[_CPU_LAST] = { - { "MOV", 0x8B, 0x89, 0xB8, 0, RM(0xC7,0) }, - { "LEA", 0x8D }, - { "PUSH", 0x50, RM(0xFF,6), 0x68, 0x6A }, - { "ADD", 0x03, 0x01, RM(0x81,0), RM(0x83,0) }, - { "SUB", 0x2B, 0x29, RM(0x81,5), RM(0x83,5) }, - { "IMUL", LONG_OP(0x0FAF), 0, 0x69 | FLAG_DUAL, 0x6B | FLAG_DUAL }, - { "DIV", RM(0xF7,6), RM(0xF7,6) }, - { "IDIV", RM(0xF7,7), RM(0xF7,7) }, - { "CDQ", 0x99 }, - { "CDQE", 0x98 }, - { "POP", 0x58, RM(0x8F,0) }, - { "RET", 0xC3 }, - { "CALL", RM(0xFF,2), RM(0xFF,2), 0xE8 }, - { "AND", 0x23, 0x21, RM(0x81,4), RM(0x83,4) }, - { "OR", 0x0B, 0x09, RM(0x81,1), RM(0x83,1) }, - { "XOR", 0x33, 0x31, RM(0x81,6), RM(0x83,6) }, - { "CMP", 0x3B, 0x39, RM(0x81,7), RM(0x83,7) }, - { "TEST", 0x85, 0x85/*SWP?*/, RM(0xF7,0) }, - { "NOP", 0x90 }, - { "SHL", RM(0xD3,4), 0, 0, RM(0xC1,4) }, - { "SHR", RM(0xD3,5), 0, 0, 
RM(0xC1,5) }, - { "SAR", RM(0xD3,7), 0, 0, RM(0xC1,7) }, - { "INC", IS_64 ? RM(0xFF,0) : 0x40, RM(0xFF,0) }, - { "DEC", IS_64 ? RM(0xFF,1) : 0x48, RM(0xFF,1) }, - { "JMP", RM(0xFF,4) }, - // FPU - { "FSTP", 0, RM(0xDD,3) }, - { "FSTP32", 0, RM(0xD9,3) }, - { "FLD", 0, RM(0xDD,0) }, - { "FLD32", 0, RM(0xD9,0) }, - { "FLDCW", 0, RM(0xD9, 5) }, - // SSE - { "MOVSD", 0xF20F10, 0xF20F11 }, - { "MOVSS", 0xF30F10, 0xF30F11 }, - { "COMISD", 0x660F2F }, - { "COMISS", LONG_OP(0x0F2F) }, - { "ADDSD", 0xF20F58 }, - { "SUBSD", 0xF20F5C }, - { "MULSD", 0xF20F59 }, - { "DIVSD", 0xF20F5E }, - { "ADDSS", 0xF30F58 }, - { "SUBSS", 0xF30F5C }, - { "MULSS", 0xF30F59 }, - { "DIVSS", 0xF30F5E }, - { "XORPD", 0x660F57 }, - { "CVTSI2SD", 0xF20F2A }, - { "CVTSI2SS", 0xF30F2A }, - { "CVTSD2SI", 0xF20F2D }, - { "CVTSD2SS", 0xF20F5A }, - { "CVTSS2SD", 0xF30F5A }, - { "CVTSS2SI", 0xF30F2D }, - { "STMXCSR", 0, LONG_RM(0x0FAE,3) }, - { "LDMXCSR", 0, LONG_RM(0x0FAE,2) }, - // 8 bits, - { "MOV8", 0x8A, 0x88, 0, 0xB0, RM(0xC6,0) }, - { "CMP8", 0x3A, 0x38, 0, RM(0x80,7) }, - { "TEST8", 0x84, 0x84, RM(0xF6,0) }, - { "PUSH8", 0, 0, 0x6A | FLAG_8B }, - { "MOV16", OP16(0x8B), OP16(0x89), OP16(0xB8) }, - { "CMP16", OP16(0x3B), OP16(0x39) }, - { "TEST16", OP16(0x85) }, - // prefetchs - { "PREFETCHT0", 0, LONG_RM(0x0F18,1) }, - { "PREFETCHT1", 0, LONG_RM(0x0F18,2) }, - { "PREFETCHT2", 0, LONG_RM(0x0F18,3) }, - { "PREFETCHNTA", 0, LONG_RM(0x0F18,0) }, - { "PREFETCHW", 0, LONG_RM(0x0F0D,1) }, -}; - -#ifdef HL_64 -# define REX() if( r64 ) B(r64 | 0x40) -#else -# define REX() -#endif - -#define OP(b) \ - if( (b) & 0xFF0000 ) { \ - B((b)>>16); \ - if( r64 ) B(r64 | 0x40); /* also in 32 bits mode */ \ - B((b)>>8); \ - B(b); \ - } else { \ - if( (b) & FLAG_16B ) { \ - B(0x66); \ - REX(); \ - } else {\ - REX(); \ - if( (b) & FLAG_LONGOP ) B((b)>>8); \ - }\ - B(b); \ - } - -static bool is_reg8( preg *a ) { - return a->kind == RSTACK || a->kind == RMEM || a->kind == RCONST || (a->kind == RCPU && a->id != Esi && a->id 
!= Edi); -} - -static void op( jit_ctx *ctx, CpuOp o, preg *a, preg *b, bool mode64 ) { - opform *f = &OP_FORMS[o]; - int r64 = mode64 && (o != PUSH && o != POP && o != CALL && o != PUSH8 && o < PREFETCHT0) ? 8 : 0; - switch( o ) { - case CMP8: - case TEST8: - case MOV8: - if( !is_reg8(a) || !is_reg8(b) ) - ASSERT(0); - break; - default: - break; - } - switch( ID2(a->kind,b->kind) ) { - case ID2(RUNUSED,RUNUSED): - ERRIF(f->r_mem == 0); - OP(f->r_mem); - break; - case ID2(RCPU,RCPU): - case ID2(RFPU,RFPU): - ERRIF( f->r_mem == 0 ); - if( a->id > 7 ) r64 |= 4; - if( b->id > 7 ) r64 |= 1; - OP(f->r_mem); - MOD_RM(3,a->id,b->id); - break; - case ID2(RCPU,RFPU): - case ID2(RFPU,RCPU): - ERRIF( (f->r_mem>>16) == 0 ); - if( a->id > 7 ) r64 |= 4; - if( b->id > 7 ) r64 |= 1; - OP(f->r_mem); - MOD_RM(3,a->id,b->id); - break; - case ID2(RCPU,RUNUSED): - ERRIF( f->r_mem == 0 ); - if( a->id > 7 ) r64 |= 1; - if( GET_RM(f->r_mem) > 0 ) { - OP(f->r_mem); - MOD_RM(3, GET_RM(f->r_mem)-1, a->id); - } else - OP(f->r_mem + (a->id&7)); - break; - case ID2(RSTACK,RUNUSED): - ERRIF( f->mem_r == 0 || GET_RM(f->mem_r) == 0 ); - { - int stackPos = R(a->id)->stackPos; - OP(f->mem_r); - if( IS_SBYTE(stackPos) ) { - MOD_RM(1,GET_RM(f->mem_r)-1,Ebp); - B(stackPos); - } else { - MOD_RM(2,GET_RM(f->mem_r)-1,Ebp); - W(stackPos); - } - } - break; - case ID2(RCPU,RCONST): - ERRIF( f->r_const == 0 && f->r_i8 == 0 ); - if( a->id > 7 ) r64 |= 1; - { - int_val cval = b->holds ? 
(int_val)b->holds : b->id; - // short byte form - if( f->r_i8 && IS_SBYTE(cval) ) { - if( (f->r_i8&FLAG_DUAL) && a->id > 7 ) r64 |= 4; - OP(f->r_i8); - if( (f->r_i8&FLAG_DUAL) ) MOD_RM(3,a->id,a->id); else MOD_RM(3,GET_RM(f->r_i8)-1,a->id); - B((int)cval); - } else if( GET_RM(f->r_const) > 0 || (f->r_const&FLAG_DUAL) ) { - if( (f->r_i8&FLAG_DUAL) && a->id > 7 ) r64 |= 4; - OP(f->r_const&0xFF); - if( (f->r_i8&FLAG_DUAL) ) MOD_RM(3,a->id,a->id); else MOD_RM(3,GET_RM(f->r_const)-1,a->id); - if( mode64 && IS_64 && o == MOV ) W64(cval); else W((int)cval); - } else { - ERRIF( f->r_const == 0); - OP((f->r_const&0xFF) + (a->id&7)); - if( mode64 && IS_64 && o == MOV ) W64(cval); else W((int)cval); - } - } - break; - case ID2(RSTACK,RCPU): - case ID2(RSTACK,RFPU): - ERRIF( f->mem_r == 0 ); - if( b->id > 7 ) r64 |= 4; - { - int stackPos = R(a->id)->stackPos; - OP(f->mem_r); - if( IS_SBYTE(stackPos) ) { - MOD_RM(1,b->id,Ebp); - B(stackPos); - } else { - MOD_RM(2,b->id,Ebp); - W(stackPos); - } - } - break; - case ID2(RCPU,RSTACK): - case ID2(RFPU,RSTACK): - ERRIF( f->r_mem == 0 ); - if( a->id > 7 ) r64 |= 4; - { - int stackPos = R(b->id)->stackPos; - OP(f->r_mem); - if( IS_SBYTE(stackPos) ) { - MOD_RM(1,a->id,Ebp); - B(stackPos); - } else { - MOD_RM(2,a->id,Ebp); - W(stackPos); - } - } - break; - case ID2(RCONST,RUNUSED): - ERRIF( f->r_const == 0 ); - { - int_val cval = a->holds ? (int_val)a->holds : a->id; - OP(f->r_const); - if( f->r_const & FLAG_8B ) B((int)cval); else W((int)cval); - } - break; - case ID2(RMEM,RUNUSED): - ERRIF( f->mem_r == 0 ); - { - int mult = a->id & 0xF; - int regOrOffs = mult == 15 ? 
a->id >> 4 : a->id >> 8; - CpuReg reg = (a->id >> 4) & 0xF; - if( mult == 15 ) { - ERRIF(1); - } else if( mult == 0 ) { - if( reg > 7 ) r64 |= 1; - OP(f->mem_r); - if( regOrOffs == 0 && (reg&7) != Ebp ) { - MOD_RM(0,GET_RM(f->mem_r)-1,reg); - if( (reg&7) == Esp ) B(0x24); - } else if( IS_SBYTE(regOrOffs) ) { - MOD_RM(1,GET_RM(f->mem_r)-1,reg); - if( (reg&7) == Esp ) B(0x24); - B(regOrOffs); - } else { - MOD_RM(2,GET_RM(f->mem_r)-1,reg); - if( (reg&7) == Esp ) B(0x24); - W(regOrOffs); - } - } else { - // [eax + ebx * M] - ERRIF(1); - } - } - break; - case ID2(RCPU, RMEM): - case ID2(RFPU, RMEM): - ERRIF( f->r_mem == 0 ); - { - int mult = b->id & 0xF; - int regOrOffs = mult == 15 ? b->id >> 4 : b->id >> 8; - CpuReg reg = (b->id >> 4) & 0xF; - if( mult == 15 ) { - int pos; - if( a->id > 7 ) r64 |= 4; - OP(f->r_mem); - MOD_RM(0,a->id,5); - if( IS_64 ) { - // offset wrt current code - pos = BUF_POS() + 4; - W(regOrOffs - pos); - } else { - ERRIF(1); - } - } else if( mult == 0 ) { - if( a->id > 7 ) r64 |= 4; - if( reg > 7 ) r64 |= 1; - OP(f->r_mem); - if( regOrOffs == 0 && (reg&7) != Ebp ) { - MOD_RM(0,a->id,reg); - if( (reg&7) == Esp ) B(0x24); - } else if( IS_SBYTE(regOrOffs) ) { - MOD_RM(1,a->id,reg); - if( (reg&7) == Esp ) B(0x24); - B(regOrOffs); - } else { - MOD_RM(2,a->id,reg); - if( (reg&7) == Esp ) B(0x24); - W(regOrOffs); - } - } else { - int offset = (int)(int_val)b->holds; - if( a->id > 7 ) r64 |= 4; - if( reg > 7 ) r64 |= 1; - if( regOrOffs > 7 ) r64 |= 2; - OP(f->r_mem); - MOD_RM(offset == 0 ? 0 : IS_SBYTE(offset) ? 
1 : 2,a->id,4); - SIB(mult,regOrOffs,reg); - if( offset ) { - if( IS_SBYTE(offset) ) B(offset); else W(offset); - } - } - } - break; -# ifndef HL_64 - case ID2(RFPU,RADDR): -# endif - case ID2(RCPU,RADDR): - ERRIF( f->r_mem == 0 ); - if( a->id > 7 ) r64 |= 4; - OP(f->r_mem); - MOD_RM(0,a->id,5); - if( IS_64 ) - W64((int_val)b->holds); - else - W((int)(int_val)b->holds); - break; -# ifndef HL_64 - case ID2(RADDR,RFPU): -# endif - case ID2(RADDR,RCPU): - ERRIF( f->mem_r == 0 ); - if( b->id > 7 ) r64 |= 4; - OP(f->mem_r); - MOD_RM(0,b->id,5); - if( IS_64 ) - W64((int_val)a->holds); - else - W((int)(int_val)a->holds); - break; - case ID2(RMEM, RCPU): - case ID2(RMEM, RFPU): - ERRIF( f->mem_r == 0 ); - { - int mult = a->id & 0xF; - int regOrOffs = mult == 15 ? a->id >> 4 : a->id >> 8; - CpuReg reg = (a->id >> 4) & 0xF; - if( mult == 15 ) { - int pos; - if( b->id > 7 ) r64 |= 4; - OP(f->mem_r); - MOD_RM(0,b->id,5); - if( IS_64 ) { - // offset wrt current code - pos = BUF_POS() + 4; - W(regOrOffs - pos); - } else { - ERRIF(1); - } - } else if( mult == 0 ) { - if( b->id > 7 ) r64 |= 4; - if( reg > 7 ) r64 |= 1; - OP(f->mem_r); - if( regOrOffs == 0 && (reg&7) != Ebp ) { - MOD_RM(0,b->id,reg); - if( (reg&7) == Esp ) B(0x24); - } else if( IS_SBYTE(regOrOffs) ) { - MOD_RM(1,b->id,reg); - if( (reg&7) == Esp ) B(0x24); - B(regOrOffs); - } else { - MOD_RM(2,b->id,reg); - if( (reg&7) == Esp ) B(0x24); - W(regOrOffs); - } - } else { - int offset = (int)(int_val)a->holds; - if( b->id > 7 ) r64 |= 4; - if( reg > 7 ) r64 |= 1; - if( regOrOffs > 7 ) r64 |= 2; - OP(f->mem_r); - MOD_RM(offset == 0 ? 0 : IS_SBYTE(offset) ? 
1 : 2,b->id,4); - SIB(mult,regOrOffs,reg); - if( offset ) { - if( IS_SBYTE(offset) ) B(offset); else W(offset); - } - } - } - break; - default: - ERRIF(1); - } - if( ctx->debug && ctx->f && o == CALL ) { - preg p; - op(ctx,MOV,pmem(&p,Esp,-HL_WSIZE),PEBP,true); // erase EIP (clean stack report) - } -} - -static void op32( jit_ctx *ctx, CpuOp o, preg *a, preg *b ) { - op(ctx,o,a,b,false); -} - -static void op64( jit_ctx *ctx, CpuOp o, preg *a, preg *b ) { -#ifndef HL_64 - op(ctx,o,a,b,false); -#else - op(ctx,o,a,b,true); + UW(4, 3 /*UWOP_SET_FPREG*/, 0); + UW(1, 0 /*UWOP_PUSH_NONVOL*/, 5); + while( ctx->out_pos & 15 ) B(0); #endif -} - -static void patch_jump( jit_ctx *ctx, int p ) { - if( p == 0 ) return; - if( p & 0x40000000 ) { - int d; - p &= 0x3FFFFFFF; - d = BUF_POS() - (p + 1); - if( d < -128 || d >= 128 ) ASSERT(d); - *(char*)(ctx->startBuf + p) = (char)d; - } else { - *(int*)(ctx->startBuf + p) = BUF_POS() - (p + 4); - } -} - -static void patch_jump_to( jit_ctx *ctx, int p, int target ) { - if( p == 0 ) return; - if( p & 0x40000000 ) { - int d; - p &= 0x3FFFFFFF; - d = target - (p + 1); - if( d < -128 || d >= 128 ) ASSERT(d); - *(char*)(ctx->startBuf + p) = (char)d; - } else { - *(int*)(ctx->startBuf + p) = target - (p + 4); - } -} - -static int stack_size( hl_type *t ) { - switch( t->kind ) { - case HUI8: - case HUI16: - case HBOOL: -# ifdef HL_64 - case HI32: - case HF32: -# endif - return sizeof(int_val); - case HI64: - default: - return hl_type_size(t); + hl_codegen_init(ctx); + jit_code_append(ctx); + if( m->code->hasdebug ) { + m->jit_debug = (hl_debug_infos*)malloc(sizeof(hl_debug_infos) * m->code->nfunctions); + memset(m->jit_debug, -1, sizeof(hl_debug_infos) * m->code->nfunctions); } } -static int call_reg_index( int reg ) { -# ifdef HL_64 - int i; - for(i=0;ifalloc); + free(ctx); } -static bool is_call_reg( preg *p ) { -# ifdef HL_64 - int i; - if( p->kind == RFPU ) - return p->id < CALL_NREGS; - for(i=0;ikind == RCPU && p->id == CALL_REGS[i] ) - 
return true; - return false; -# else - return false; -# endif +void hl_jit_reset( jit_ctx *ctx, hl_module *m ) { } -static preg *alloc_reg( jit_ctx *ctx, preg_kind k ) { - int i; - preg *p; - switch( k ) { - case RCPU: - case RCPU_CALL: - case RCPU_8BITS: - { - int off = ctx->allocOffset++; - const int count = RCPU_SCRATCH_COUNT; - for(i=0;ipregs + r; - if( p->lock >= ctx->currentPos ) continue; - if( k == RCPU_CALL && is_call_reg(p) ) continue; - if( k == RCPU_8BITS && !is_reg8(p) ) continue; - if( p->holds == NULL ) { - RLOCK(p); - return p; - } - } - for(i=0;ipregs + RCPU_SCRATCH_REGS[(i + off)%count]; - if( p->lock >= ctx->currentPos ) continue; - if( k == RCPU_CALL && is_call_reg(p) ) continue; - if( k == RCPU_8BITS && !is_reg8(p) ) continue; - if( p->holds ) { - RLOCK(p); - p->holds->current = NULL; - p->holds = NULL; - return p; - } - } - } - break; - case RFPU: - { - int off = ctx->allocOffset++; - const int count = RFPU_SCRATCH_COUNT; - for(i=0;ilock >= ctx->currentPos ) continue; - if( p->holds == NULL ) { - RLOCK(p); - return p; - } - } - for(i=0;ilock >= ctx->currentPos ) continue; - if( p->holds ) { - RLOCK(p); - p->holds->current = NULL; - p->holds = NULL; - return p; - } - } +int hl_jit_function( jit_ctx *ctx, hl_module *m, hl_function *f ) { + hl_free(&ctx->falloc); + ctx->mod = m; + ctx->fun = f; + ctx->reg_instr_count = 0; + ctx->code_size = 0; + current_ctx = ctx; + hl_emit_function(ctx); + hl_regs_function(ctx); + hl_codegen_function(ctx); + int pos = ctx->out_pos; + hl_jit_define_function(ctx, pos, ctx->code_size); + if( m->jit_debug && ctx->code_pos_map ) { + bool compact = ctx->code_size < 0xFFFF; + void *debug = malloc((compact ? 
sizeof(unsigned short) : sizeof(int)) * (f->nops + 1)); + for(int i=0;i<=f->nops;i++) { + int ipos = ctx->emit_pos_map[i]; + int rpos = ctx->reg_pos_map[ipos]; + int cpos = ctx->code_pos_map[rpos]; + if( compact ) + ((unsigned short*)debug)[i] = (unsigned short)cpos; + else + ((int*)debug)[i] = cpos; } - break; - default: - ASSERT(k); - } - ASSERT(0); // out of registers ! - return NULL; -} - -static preg *fetch( vreg *r ) { - if( r->current ) - return r->current; - return &r->stack; -} - -static void scratch( preg *r ) { - if( r && r->holds ) { - r->holds->current = NULL; - r->holds = NULL; - r->lock = 0; + int fid = (int)(f - m->code->functions); + m->jit_debug[fid].start = pos; + m->jit_debug[fid].offsets = debug; + m->jit_debug[fid].large = !compact; } + if( !jit_code_append(ctx) ) + return -1; + current_ctx = NULL; + return pos; } -static preg *copy( jit_ctx *ctx, preg *to, preg *from, int size ); - -static void load( jit_ctx *ctx, preg *r, vreg *v ) { - preg *from = fetch(v); - if( from == r || v->size == 0 ) return; - if( r->holds ) r->holds->current = NULL; - if( v->current ) { - v->current->holds = NULL; - from = r; - } - r->holds = v; - v->current = r; - copy(ctx,r,from,v->size); -} +static void *call_jit_c2hl = hl_jit_assert; +static void *call_jit_hl2c = hl_jit_assert; +static int arg_reg_count = 0; +static int arg_fp_count = 0; -static preg *alloc_fpu( jit_ctx *ctx, vreg *r, bool andLoad ) { - preg *p = fetch(r); - if( p->kind != RFPU ) { - if( !IS_FLOAT(r) && (IS_64 || r->t->kind != HI64) ) ASSERT(r->t->kind); - p = alloc_reg(ctx, RFPU); - if( andLoad ) - load(ctx,p,r); - else { - if( r->current ) - r->current->holds = NULL; - r->current = p; - p->holds = r; +static int get_next_reg( hl_type *t, int *rp, int *fp ) { + if( t->kind == HF32 || t->kind == HF64 ) { + if( *fp < arg_fp_count ) { + int r = (*fp)++; + if( IS_WINCALL64 ) (*rp)++; + return r; } - } else - RLOCK(p); - return p; -} - -static void reg_bind( vreg *r, preg *p ) { - if( r->current ) - 
r->current->holds = NULL; - r->current = p; - p->holds = r; -} - -static preg *alloc_cpu( jit_ctx *ctx, vreg *r, bool andLoad ) { - preg *p = fetch(r); - if( p->kind != RCPU ) { -# ifndef HL_64 - if( r->t->kind == HI64 ) return alloc_fpu(ctx,r,andLoad); - if( r->size > 4 ) ASSERT(r->size); -# endif - p = alloc_reg(ctx, RCPU); - if( andLoad ) - load(ctx,p,r); - else - reg_bind(r,p); - } else - RLOCK(p); - return p; -} - -// allocate a register that is not a call parameter -static preg *alloc_cpu_call( jit_ctx *ctx, vreg *r ) { - preg *p = fetch(r); - if( p->kind != RCPU ) { -# ifndef HL_64 - if( r->t->kind == HI64 ) return alloc_fpu(ctx,r,true); - if( r->size > 4 ) ASSERT(r->size); -# endif - p = alloc_reg(ctx, RCPU_CALL); - load(ctx,p,r); - } else if( is_call_reg(p) ) { - preg *p2 = alloc_reg(ctx, RCPU_CALL); - op64(ctx,MOV,p2,p); - scratch(p); - reg_bind(r,p2); - return p2; - } else - RLOCK(p); - return p; -} - -static preg *fetch32( jit_ctx *ctx, vreg *r ) { - if( r->current ) - return r->current; - // make sure that the register is correctly erased - if( r->size < 4 ) { - preg *p = alloc_cpu(ctx, r, true); - RUNLOCK(p); - return p; + return -1; } - return fetch(r); -} - -// make sure higher bits are zeroes -static preg *alloc_cpu64( jit_ctx *ctx, vreg *r, bool andLoad ) { -# ifndef HL_64 - return alloc_cpu(ctx,r,andLoad); -# else - preg *p = fetch(r); - if( !andLoad ) ASSERT(0); - if( p->kind != RCPU ) { - p = alloc_reg(ctx, RCPU); - op64(ctx,XOR,p,p); - load(ctx,p,r); - } else { - // remove higher bits - preg tmp; - op64(ctx,SHL,p,pconst(&tmp,32)); - op64(ctx,SHR,p,pconst(&tmp,32)); - RLOCK(p); + if( *rp < arg_fp_count ) { + int r = (*rp)++; + if( IS_WINCALL64 ) (*fp)++; + return r; } - return p; -# endif + return -1; } -// make sure the register can be used with 8 bits access -static preg *alloc_cpu8( jit_ctx *ctx, vreg *r, bool andLoad ) { - preg *p = fetch(r); - if( p->kind != RCPU ) { - p = alloc_reg(ctx, RCPU_8BITS); - load(ctx,p,r); - } else if( 
!is_reg8(p) ) { - preg *p2 = alloc_reg(ctx, RCPU_8BITS); - op64(ctx,MOV,p2,p); - scratch(p); - reg_bind(r,p2); - return p2; - } else - RLOCK(p); - return p; +static void *default_wrapper( hl_type *ft ) { + return call_jit_hl2c; } -static preg *copy( jit_ctx *ctx, preg *to, preg *from, int size ) { - if( size == 0 || to == from ) return to; - switch( ID2(to->kind,from->kind) ) { - case ID2(RMEM,RCPU): - case ID2(RSTACK,RCPU): - case ID2(RCPU,RSTACK): - case ID2(RCPU,RMEM): - case ID2(RCPU,RCPU): -# ifndef HL_64 - case ID2(RCPU,RADDR): - case ID2(RADDR,RCPU): -# endif - switch( size ) { - case 1: - if( to->kind == RCPU ) { - op64(ctx,XOR,to,to); - if( !is_reg8(to) ) { - preg p; - op32(ctx,MOV16,to,from); - op32(ctx,SHL,to,pconst(&p,24)); - op32(ctx,SHR,to,pconst(&p,24)); - break; - } - } - if( !is_reg8(from) ) { - preg *r = alloc_reg(ctx, RCPU_CALL); - op32(ctx, MOV, r, from); - RUNLOCK(r); - op32(ctx,MOV8,to,r); - return from; - } - op32(ctx,MOV8,to,from); - break; - case 2: - if( to->kind == RCPU ) - op64(ctx,XOR,to,to); - op32(ctx,MOV16,to,from); +static void *callback_c2hl( void *f, hl_type *t, void **args, vdynamic *ret ) { + int nargs = t->fun->nargs; + if( nargs > MAX_ARGS ) + hl_error("Too many arguments for dynamic call"); + struct { + void *regs[MAX_ARGS]; + void *stack[MAX_ARGS]; + } vargs; + int rp = 0, fp = 0, sp = MAX_ARGS; + for(int i=0;ifun->nargs;i++) { + hl_type *at = t->fun->args[i]; + void *v = args[i]; + int r = get_next_reg(at,&rp,&fp); + int_val iv; + switch( at->kind ) { + case HBOOL: + case HUI8: + case HUI16: + case HI32: + case HF32: + iv = *(int*)v; break; - case 4: - op32(ctx,MOV,to,from); + case HI64: + case HGUID: + case HF64: + iv = *(int_val*)v; break; - case 8: - if( IS_64 ) { - op64(ctx,MOV,to,from); - break; - } default: - ASSERT(size); - } - return to->kind == RCPU ? 
to : from; - case ID2(RFPU,RFPU): - case ID2(RMEM,RFPU): - case ID2(RSTACK,RFPU): - case ID2(RFPU,RMEM): - case ID2(RFPU,RSTACK): - switch( size ) { - case 8: - op64(ctx,MOVSD,to,from); + iv = (int_val)v; break; - case 4: - op32(ctx,MOVSS,to,from); - break; - default: - ASSERT(size); } - return to->kind == RFPU ? to : from; - case ID2(RMEM,RSTACK): - { - vreg *rfrom = R(from->id); - if( IS_FLOAT(rfrom) ) - return copy(ctx,to,alloc_fpu(ctx,rfrom,true),size); - return copy(ctx,to,alloc_cpu(ctx,rfrom,true),size); - } - case ID2(RMEM,RMEM): - case ID2(RSTACK,RMEM): - case ID2(RSTACK,RSTACK): -# ifndef HL_64 - case ID2(RMEM,RADDR): - case ID2(RSTACK,RADDR): - case ID2(RADDR,RSTACK): -# endif - { - preg *tmp; - if( (!IS_64 && size == 8) || (to->kind == RSTACK && IS_FLOAT(R(to->id))) || (from->kind == RSTACK && IS_FLOAT(R(from->id))) ) { - tmp = alloc_reg(ctx, RFPU); - op64(ctx,size == 8 ? MOVSD : MOVSS,tmp,from); - } else { - tmp = alloc_reg(ctx, RCPU); - copy(ctx,tmp,from,size); - } - return copy(ctx,to,tmp,size); - } -# ifdef HL_64 - case ID2(RCPU,RADDR): - case ID2(RMEM,RADDR): - case ID2(RSTACK,RADDR): - { - preg p; - preg *tmp = alloc_reg(ctx, RCPU); - op64(ctx,MOV,tmp,pconst64(&p,(int_val)from->holds)); - return copy(ctx,to,pmem(&p,tmp->id,0),size); - } - case ID2(RADDR,RCPU): - case ID2(RADDR,RMEM): - case ID2(RADDR,RSTACK): - { - preg p; - preg *tmp = alloc_reg(ctx, RCPU); - op64(ctx,MOV,tmp,pconst64(&p,(int_val)to->holds)); - return copy(ctx,pmem(&p,tmp->id,0),from,size); - } -# endif + if( r >= 0 ) + vargs.regs[r + (at->kind == HF32 || at->kind == HF64 ? 
arg_reg_count : 0)] = (void*)iv; + else + vargs.stack[--sp] = (void*)iv; + } + switch( t->fun->ret->kind ) { + case HUI8: + case HUI16: + case HI32: + case HBOOL: + ret->v.i = ((int (*)(void *, void *, int))call_jit_c2hl)(f, &vargs, MAX_ARGS - sp); + return &ret->v.i; + case HI64: + case HGUID: + ret->v.i64 = ((int64 (*)(void *, void *, int))call_jit_c2hl)(f, &vargs, MAX_ARGS - sp); + return &ret->v.i64; + case HF32: + ret->v.f = ((float (*)(void *, void *, int))call_jit_c2hl)(f, &vargs, MAX_ARGS - sp); + return &ret->v.f; + case HF64: + ret->v.d = ((double (*)(void *, void *, int))call_jit_c2hl)(f, &vargs, MAX_ARGS - sp); + return &ret->v.d; default: - break; + return ((void *(*)(void *, void *, int))call_jit_c2hl)(f, &vargs, MAX_ARGS - sp); } - printf("copy(%s,%s)\n",KNAMES[to->kind], KNAMES[from->kind]); - ASSERT(0); - return NULL; } -static void store( jit_ctx *ctx, vreg *r, preg *v, bool bind ) { - if( r->current && r->current != v ) { - r->current->holds = NULL; - r->current = NULL; - } - v = copy(ctx,&r->stack,v,r->size); - if( IS_FLOAT(r) != (v->kind == RFPU) ) - ASSERT(0); - if( bind && r->current != v && (v->kind == RCPU || v->kind == RFPU) ) { - scratch(v); - r->current = v; - v->holds = r; +static vdynamic *callback_hl2c( vclosure_wrapper *c, char *stack_args, void **regs ) { + vdynamic *args[MAX_ARGS]; + int nargs = c->cl.t->fun->nargs; + if( nargs > MAX_ARGS ) + hl_error("Too many arguments for wrapped call"); + int rp = 0, fp = 0; + rp++; // skip fptr in HL64 - was passed as arg0 + if( IS_WINCALL64 ) fp++; + for(int i=0;icl.t->fun->args[i]; + int creg = get_next_reg(t,&rp,&fp); + if( creg < 0 ) { + args[i] = hl_is_dynamic(t) ? *(vdynamic**)stack_args : hl_make_dyn(stack_args,t); + stack_args += (t->kind == HF64 ? 
8 : HL_WSIZE); + } else if( hl_is_dynamic(t) ) { + args[i] = *(vdynamic**)(regs + creg); + } else if( t->kind == HF32 || t->kind == HF64 ) { + args[i] = hl_make_dyn(regs + arg_reg_count + creg,&hlt_f64); + } else { + args[i] = hl_make_dyn(regs + creg,t); + } } + return hl_dyn_call(c->wrappedFun,args,nargs); } -static void store_result( jit_ctx *ctx, vreg *r ) { -# ifndef HL_64 - switch( r->t->kind ) { - case HF64: - scratch(r->current); - op64(ctx,FSTP,&r->stack,UNUSED); - break; - case HF32: - scratch(r->current); - op64(ctx,FSTP32,&r->stack,UNUSED); - break; +void *hl_jit_wrapper_ptr( vclosure_wrapper *c, char *stack_args, void **regs ) { + vdynamic *ret = callback_hl2c(c, stack_args, regs); + hl_type *tret = c->cl.t->fun->ret; + switch( tret->kind ) { + case HVOID: + return NULL; + case HUI8: + case HUI16: + case HI32: + case HBOOL: + return (void*)(int_val)hl_dyn_casti(&ret,&hlt_dyn,tret); case HI64: - scratch(r->current); - error_i64(); - break; + case HGUID: + return (void*)(int_val)hl_dyn_casti64(&ret,&hlt_dyn); default: -# endif - store(ctx,r,IS_FLOAT(r) ? 
REG_AT(XMM(0)) : PEAX,true); -# ifndef HL_64 - break; - } -# endif -} - -static void op_mov( jit_ctx *ctx, vreg *to, vreg *from ) { - preg *r = fetch(from); -# ifndef HL_64 - if( to->t->kind == HI64 ) { - error_i64(); - return; + return hl_dyn_castp(&ret,&hlt_dyn,tret); } -# endif - if( from->t->kind == HF32 && r->kind != RFPU ) - r = alloc_fpu(ctx,from,true); - store(ctx, to, r, true); } -static void copy_to( jit_ctx *ctx, vreg *to, preg *from ) { - store(ctx,to,from,true); +double hl_jit_wrapper_d( vclosure_wrapper *c, char *stack_args, void **regs ) { + vdynamic *ret = callback_hl2c(c, stack_args, regs); + return hl_dyn_castd(&ret,&hlt_dyn); } -static void copy_from( jit_ctx *ctx, preg *to, vreg *from ) { - copy(ctx,to,fetch(from),from->size); +void *hl_jit_code( jit_ctx *ctx, hl_module *m, int *codesize, hl_debug_infos **debug, hl_module *previous ) { + hl_codegen_flush_consts(ctx); + jit_code_append(ctx); + int size = ctx->out_pos; + if( size & 4095 ) size += 4096 - (size&4095); + unsigned char *code = (unsigned char*)hl_alloc_executable_memory(size); + if( code == NULL ) return NULL; + memcpy(code,ctx->output,size); + *codesize = size; + *debug = m->jit_debug; + ctx->final_code = code; + hl_emit_final(ctx); + hl_codegen_final(ctx); + arg_reg_count = ctx->cfg.regs.nargs; + arg_fp_count = ctx->cfg.floats.nargs; + call_jit_c2hl = ctx->final_code + ctx->code_funs.c2hl; + call_jit_hl2c = ctx->final_code + ctx->code_funs.hl2c; +# ifdef WIN64_UNWIND_TABLES + ctx->mod->unwind_table_size = ctx->fdef_index; +# endif + hl_setup.get_wrapper = default_wrapper; + hl_setup.static_call = callback_c2hl; + return code; } -static void store_const( jit_ctx *ctx, vreg *r, int c ) { - preg p; - if( c == 0 ) - op(ctx,XOR,alloc_cpu(ctx,r,false),alloc_cpu(ctx,r,false),r->size == 8); - else if( r->size == 8 ) - op64(ctx,MOV,alloc_cpu(ctx,r,false),pconst64(&p,c)); - else - op32(ctx,MOV,alloc_cpu(ctx,r,false),pconst(&p,c)); - store(ctx,r,r->current,false); +void hl_jit_patch_method( 
void*fun, void**newt ) { + jit_assert(); } - -static void discard_regs( jit_ctx *ctx, bool native_call ) { - int i; - for(i=0;ipregs + RCPU_SCRATCH_REGS[i]; - if( r->holds ) { - r->holds->current = NULL; - r->holds = NULL; - } - } - for(i=0;ipregs + XMM(i); - if( r->holds ) { - r->holds->current = NULL; - r->holds = NULL; - } - } -} - -static int pad_before_call( jit_ctx *ctx, int size ) { - int total = size + ctx->totalRegsSize + HL_WSIZE * 2; // EIP+EBP - if( total & 15 ) { - int pad = 16 - (total & 15); - preg p; - if( pad ) op64(ctx,SUB,PESP,pconst(&p,pad)); - size += pad; - } - return size; -} - -static void push_reg( jit_ctx *ctx, vreg *r ) { - preg p; - switch( stack_size(r->t) ) { - case 1: - op64(ctx,SUB,PESP,pconst(&p,1)); - op32(ctx,MOV8,pmem(&p,Esp,0),alloc_cpu8(ctx,r,true)); - break; - case 2: - op64(ctx,SUB,PESP,pconst(&p,2)); - op32(ctx,MOV16,pmem(&p,Esp,0),alloc_cpu(ctx,r,true)); - break; - case 4: - if( r->size < 4 ) - alloc_cpu(ctx,r,true); // force fetch (higher bits set to 0) - if( !IS_64 ) { - if( r->current != NULL && r->current->kind == RFPU ) scratch(r->current); - op32(ctx,PUSH,fetch(r),UNUSED); - } else { - // pseudo push32 (not available) - op64(ctx,SUB,PESP,pconst(&p,4)); - op32(ctx,MOV,pmem(&p,Esp,0),alloc_cpu(ctx,r,true)); - } - break; - case 8: - if( fetch(r)->kind == RFPU ) { - op64(ctx,SUB,PESP,pconst(&p,8)); - op64(ctx,MOVSD,pmem(&p,Esp,0),fetch(r)); - } else if( IS_64 ) - op64(ctx,PUSH,fetch(r),UNUSED); - else if( r->stack.kind == RSTACK ) { - scratch(r->current); - r->stackPos += 4; - op32(ctx,PUSH,&r->stack,UNUSED); - r->stackPos -= 4; - op32(ctx,PUSH,&r->stack,UNUSED); - } else - ASSERT(0); - break; - default: - ASSERT(r->size); - } -} - -static int begin_native_call( jit_ctx *ctx, int nargs ) { - ctx->nativeArgsCount = nargs; - return pad_before_call(ctx, nargs > CALL_NREGS ? 
(nargs - CALL_NREGS) * HL_WSIZE : 0); -} - -static preg *alloc_native_arg( jit_ctx *ctx ) { -# ifdef HL_64 - int rid = ctx->nativeArgsCount - 1; - preg *r = rid < CALL_NREGS ? REG_AT(CALL_REGS[rid]) : alloc_reg(ctx,RCPU_CALL); - scratch(r); - return r; -# else - return alloc_reg(ctx, RCPU); -# endif -} - -static void set_native_arg( jit_ctx *ctx, preg *r ) { - if( r->kind == RSTACK ) { - vreg *v = ctx->vregs + r->id; - if( v->size < 4 ) - r = fetch32(ctx, v); - } -# ifdef HL_64 - if( r->kind == RFPU ) ASSERT(0); - int rid = --ctx->nativeArgsCount; - preg *target; - if( rid >= CALL_NREGS ) { - op64(ctx,PUSH,r,UNUSED); - return; - } - target = REG_AT(CALL_REGS[rid]); - if( target != r ) { - op64(ctx, MOV, target, r); - scratch(target); - } -# else - op32(ctx,PUSH,r,UNUSED); -# endif -} - -static void set_native_arg_fpu( jit_ctx *ctx, preg *r, bool isf32 ) { -# ifdef HL_64 - if( r->kind == RCPU ) ASSERT(0); - // can only be used if last argument !! - ctx->nativeArgsCount--; - preg *target = REG_AT(XMM(IS_WINCALL64 ? ctx->nativeArgsCount : 0)); - if( target != r ) { - op64(ctx, isf32 ? MOVSS : MOVSD, target, r); - scratch(target); - } -# else - op32(ctx,PUSH,r,UNUSED); -# endif -} - -typedef struct { - int nextCpu; - int nextFpu; - int mapped[REG_COUNT]; -} call_regs; - -static int select_call_reg( call_regs *regs, hl_type *t, int id ) { -# ifndef HL_64 - return -1; -#else - bool isFloat = t->kind == HF32 || t->kind == HF64; -# ifdef HL_WIN_CALL - int index = regs->nextCpu++; -# else - int index = isFloat ? regs->nextFpu++ : regs->nextCpu++; -# endif - if( index >= CALL_NREGS ) - return -1; - int reg = isFloat ? 
XMM(index) : CALL_REGS[index]; - regs->mapped[reg] = id + 1; - return reg; -#endif -} - -static int mapped_reg( call_regs *regs, int id ) { -# ifndef HL_64 - return -1; -#else - int i; - for(i=0;imapped[r] == id + 1 ) return r; - r = XMM(i); - if( regs->mapped[r] == id + 1 ) return r; - } - return -1; -#endif -} - -static int prepare_call_args( jit_ctx *ctx, int count, int *args, vreg *vregs, int extraSize ) { - int i; - int size = extraSize, paddedSize; - call_regs ctmp = {0}; - for(i=0;it, i); - if( cr >= 0 ) { - preg *c = REG_AT(cr); - preg *cur = fetch(r); - if( cur != c ) { - copy(ctx,c,cur,r->size); - scratch(c); - } - RLOCK(c); - continue; - } - size += stack_size(r->t); - } - paddedSize = pad_before_call(ctx,size); - for(i=0;i= 0 ) continue; - push_reg(ctx,r); - if( r->current ) RUNLOCK(r->current); - } - return paddedSize; -} - -static void op_call( jit_ctx *ctx, preg *r, int size ) { - preg p; -# ifdef JIT_DEBUG - if( IS_64 && size >= 0 ) { - int jchk; - op32(ctx,TEST,PESP,pconst(&p,15)); - XJump(JZero,jchk); - BREAK(); // unaligned ESP - patch_jump(ctx, jchk); - } -# endif - if( IS_WINCALL64 ) { - // MSVC requires 32bytes of free space here - op64(ctx,SUB,PESP,pconst(&p,32)); - if( size >= 0 ) size += 32; - } - op32(ctx, CALL, r, UNUSED); - if( size > 0 ) op64(ctx,ADD,PESP,pconst(&p,size)); -} - -static void call_native( jit_ctx *ctx, void *nativeFun, int size ) { - bool isExc = nativeFun == hl_assert || nativeFun == hl_throw || nativeFun == on_jit_error; - preg p; - // native function, already resolved - op64(ctx,MOV,PEAX,pconst64(&p,(int_val)nativeFun)); - op_call(ctx,PEAX, isExc ? -1 : size); - if( isExc ) - return; - discard_regs(ctx, true); -} - -static void op_call_fun( jit_ctx *ctx, vreg *dst, int findex, int count, int *args ) { - int fid = findex < 0 ? 
-1 : ctx->m->functions_indexes[findex]; - bool isNative = fid >= ctx->m->code->nfunctions; - int size = prepare_call_args(ctx,count,args,ctx->vregs,0); - preg p; - if( fid < 0 ) { - ASSERT(fid); - } else if( isNative ) { - call_native(ctx,ctx->m->functions_ptrs[findex],size); - } else { - int cpos = BUF_POS() + (IS_WINCALL64 ? 4 : 0); -# ifdef JIT_DEBUG - if( IS_64 ) cpos += 13; // ESP CHECK -# endif - if( ctx->m->functions_ptrs[findex] ) { - // already compiled - op_call(ctx,pconst(&p,(int)(int_val)ctx->m->functions_ptrs[findex] - (cpos + 5)), size); - } else if( ctx->m->code->functions + fid == ctx->f ) { - // our current function - op_call(ctx,pconst(&p, ctx->functionPos - (cpos + 5)), size); - } else { - // stage for later - jlist *j = (jlist*)hl_malloc(&ctx->galloc,sizeof(jlist)); - j->pos = cpos; - j->target = findex; - j->next = ctx->calls; - ctx->calls = j; - op_call(ctx,pconst(&p,0), size); - } - discard_regs(ctx, false); - } - if( dst ) - store_result(ctx,dst); -} - -static void op_enter( jit_ctx *ctx ) { - preg p; - op64(ctx, PUSH, PEBP, UNUSED); - op64(ctx, MOV, PEBP, PESP); - if( ctx->totalRegsSize ) op64(ctx, SUB, PESP, pconst(&p,ctx->totalRegsSize)); -} - -static void op_ret( jit_ctx *ctx, vreg *r ) { - preg p; - switch( r->t->kind ) { - case HF32: -# ifdef HL_64 - op64(ctx, MOVSS, PXMM(0), fetch(r)); -# else - op64(ctx,FLD32,&r->stack,UNUSED); -# endif - break; - case HF64: -# ifdef HL_64 - op64(ctx, MOVSD, PXMM(0), fetch(r)); -# else - op64(ctx,FLD,&r->stack,UNUSED); -# endif - break; - default: - if( r->size < 4 && !r->current ) - fetch32(ctx, r); - if( r->current != PEAX ) - op64(ctx,MOV,PEAX,fetch(r)); - break; - } - if( ctx->totalRegsSize ) op64(ctx, ADD, PESP, pconst(&p, ctx->totalRegsSize)); -# ifdef JIT_DEBUG - { - int jeq; - op64(ctx, CMP, PESP, PEBP); - XJump_small(JEq,jeq); - jit_error("invalid ESP"); - patch_jump(ctx,jeq); - } -# endif - op64(ctx, POP, PEBP, UNUSED); - op64(ctx, RET, UNUSED, UNUSED); -} - -static void call_native_consts( 
jit_ctx *ctx, void *nativeFun, int_val *args, int nargs ) { - int size = pad_before_call(ctx, IS_64 ? 0 : HL_WSIZE*nargs); - preg p; - int i; -# ifdef HL_64 - for(i=0;i=0;i--) - op32(ctx, PUSH, pconst64(&p, args[i]), UNUSED); -# endif - call_native(ctx, nativeFun, size); -} - -static void on_jit_error( const char *msg, int_val line ) { - char buf[256]; - int iline = (int)line; - sprintf(buf,"%s (line %d)",msg,iline); -#ifdef HL_WIN_DESKTOP - MessageBoxA(NULL,buf,"JIT ERROR",MB_OK); -#else - printf("JIT ERROR : %s\n",buf); -#endif - hl_debug_break(); - hl_throw(NULL); -} - -static void _jit_error( jit_ctx *ctx, const char *msg, int line ) { - int_val args[2] = { (int_val)msg, (int_val)line }; - call_native_consts(ctx,on_jit_error,args,2); -} - - -static preg *op_binop( jit_ctx *ctx, vreg *dst, vreg *a, vreg *b, hl_op bop ) { - preg *pa = fetch(a), *pb = fetch(b), *out = NULL; - CpuOp o; - if( IS_FLOAT(a) ) { - bool isf32 = a->t->kind == HF32; - switch( bop ) { - case OAdd: o = isf32 ? ADDSS : ADDSD; break; - case OSub: o = isf32 ? SUBSS : SUBSD; break; - case OMul: o = isf32 ? MULSS : MULSD; break; - case OSDiv: o = isf32 ? DIVSS : DIVSD; break; - case OJSLt: - case OJSGte: - case OJSLte: - case OJSGt: - case OJEq: - case OJNotEq: - case OJNotLt: - case OJNotGte: - o = isf32 ? 
COMISS : COMISD; - break; - case OSMod: - { - int args[] = { a->stack.id, b->stack.id }; - int size = prepare_call_args(ctx,2,args,ctx->vregs,0); - void *mod_fun; - if( isf32 ) mod_fun = fmodf; else mod_fun = fmod; - call_native(ctx,mod_fun,size); - store_result(ctx,dst); - return fetch(dst); - } - default: - printf("%s\n", hl_op_name(bop)); - ASSERT(bop); - } - } else { - bool is64 = a->t->kind == HI64; -# ifndef HL_64 - if( is64 ) { - error_i64(); - return fetch(a); - } -# endif - switch( bop ) { - case OAdd: o = ADD; break; - case OSub: o = SUB; break; - case OMul: o = IMUL; break; - case OAnd: o = AND; break; - case OOr: o = OR; break; - case OXor: o = XOR; break; - case OShl: - case OUShr: - case OSShr: - if( !b->current || b->current->kind != RCPU || b->current->id != Ecx ) { - scratch(REG_AT(Ecx)); - op(ctx,MOV,REG_AT(Ecx),pb,is64); - RLOCK(REG_AT(Ecx)); - pa = fetch(a); - } else - RLOCK(b->current); - if( pa->kind != RCPU ) { - pa = alloc_reg(ctx, RCPU); - op(ctx,MOV,pa,fetch(a), is64); - } - op(ctx,bop == OShl ? SHL : (bop == OUShr ? SHR : SAR), pa, UNUSED,is64); - if( dst ) store(ctx, dst, pa, true); - return pa; - case OSDiv: - case OUDiv: - case OSMod: - case OUMod: - { - preg *out = bop == OSMod || bop == OUMod ? REG_AT(Edx) : PEAX; - preg *r = pb; - preg p; - int jz, jz1 = 0, jend; - if( pa->kind == RCPU && pa->id == Eax ) RLOCK(pa); - // ensure b in CPU reg and not in Eax/Edx (for UI8/UI16) - if( pb->kind != RCPU || (pb->id == Eax || pb->id == Edx) ) { - scratch(REG_AT(Ecx)); - scratch(pb); - load(ctx,REG_AT(Ecx),b); - r = REG_AT(Ecx); - } - // integer div 0 => 0 - op(ctx,TEST,r,r,is64); - XJump_small(JZero, jz); - // Prevent MIN/-1 overflow exception - // OSMod: r = (b == 0 || b == -1) ? 0 : a % b - // OSDiv: r = (b == 0 || b == -1) ? 
a * b : a / b - if( bop == OSMod || bop == OSDiv ) { - op(ctx, CMP, r, pconst(&p,-1), is64); - XJump_small(JEq, jz1); - } - pa = fetch(a); - if( pa->kind != RCPU || pa->id != Eax ) { - scratch(PEAX); - scratch(pa); - load(ctx,PEAX,a); - } - scratch(REG_AT(Edx)); - scratch(REG_AT(Eax)); - if( bop == OUDiv || bop == OUMod ) - op(ctx, XOR, REG_AT(Edx), REG_AT(Edx), is64); - else - op(ctx, CDQ, UNUSED, UNUSED, is64); // sign-extend Eax into Eax:Edx - op(ctx, bop == OUDiv || bop == OUMod ? DIV : IDIV, r, UNUSED, is64); - XJump_small(JAlways, jend); - patch_jump(ctx, jz); - patch_jump(ctx, jz1); - if( bop != OSDiv ) { - op(ctx, XOR, out, out, is64); - } else { - load(ctx, out, a); - op(ctx, IMUL, out, r, is64); - } - patch_jump(ctx, jend); - if( dst ) store(ctx, dst, out, true); - return out; - } - case OJSLt: - case OJSGte: - case OJSLte: - case OJSGt: - case OJULt: - case OJUGte: - case OJEq: - case OJNotEq: - switch( a->t->kind ) { - case HUI8: - case HBOOL: - o = CMP8; - break; - case HUI16: - o = CMP16; - break; - default: - o = CMP; - break; - } - break; - default: - printf("%s\n", hl_op_name(bop)); - ASSERT(bop); - } - } - switch( RTYPE(a) ) { - case HI32: - case HUI8: - case HUI16: - case HBOOL: -# ifndef HL_64 - case HDYNOBJ: - case HVIRTUAL: - case HOBJ: - case HSTRUCT: - case HFUN: - case HMETHOD: - case HBYTES: - case HNULL: - case HENUM: - case HDYN: - case HTYPE: - case HABSTRACT: - case HARRAY: -# endif - switch( ID2(pa->kind, pb->kind) ) { - case ID2(RCPU,RCPU): - case ID2(RCPU,RSTACK): - op32(ctx, o, pa, pb); - scratch(pa); - out = pa; - break; - case ID2(RSTACK,RCPU): - if( dst == a && o != IMUL ) { - op32(ctx, o, pa, pb); - dst = NULL; - out = pa; - } else { - alloc_cpu(ctx,a, true); - return op_binop(ctx,dst,a,b,bop); - } - break; - case ID2(RSTACK,RSTACK): - alloc_cpu(ctx, a, true); - return op_binop(ctx, dst, a, b, bop); - default: - printf("%s(%d,%d)\n", hl_op_name(bop), pa->kind, pb->kind); - ASSERT(ID2(pa->kind, pb->kind)); - } - if( dst ) 
store(ctx, dst, out, true); - return out; -# ifdef HL_64 - case HOBJ: - case HSTRUCT: - case HDYNOBJ: - case HVIRTUAL: - case HFUN: - case HMETHOD: - case HBYTES: - case HNULL: - case HENUM: - case HDYN: - case HTYPE: - case HABSTRACT: - case HARRAY: - case HI64: - case HGUID: - switch( ID2(pa->kind, pb->kind) ) { - case ID2(RCPU,RCPU): - case ID2(RCPU,RSTACK): - op64(ctx, o, pa, pb); - scratch(pa); - out = pa; - break; - case ID2(RSTACK,RCPU): - if( dst == a && OP_FORMS[o].mem_r ) { - op64(ctx, o, pa, pb); - dst = NULL; - out = pa; - } else { - alloc_cpu(ctx,a, true); - return op_binop(ctx,dst,a,b,bop); - } - break; - case ID2(RSTACK,RSTACK): - alloc_cpu(ctx, a, true); - return op_binop(ctx, dst, a, b, bop); - default: - printf("%s(%d,%d)\n", hl_op_name(bop), pa->kind, pb->kind); - ASSERT(ID2(pa->kind, pb->kind)); - } - if( dst ) store(ctx, dst, out, true); - return out; -# endif - case HF64: - case HF32: - pa = alloc_fpu(ctx, a, true); - pb = alloc_fpu(ctx, b, true); - switch( ID2(pa->kind, pb->kind) ) { - case ID2(RFPU,RFPU): - op64(ctx,o,pa,pb); - if( (o == COMISD || o == COMISS) && bop != OJSGt ) { - int jnotnan; - XJump_small(JNParity,jnotnan); - switch( bop ) { - case OJSLt: - case OJNotLt: - { - preg *r = alloc_reg(ctx,RCPU); - // set CF=0, ZF=1 - op64(ctx,XOR,r,r); - RUNLOCK(r); - break; - } - case OJSGte: - case OJNotGte: - { - preg *r = alloc_reg(ctx,RCPU); - // set ZF=0, CF=1 - op64(ctx,XOR,r,r); - op64(ctx,CMP,r,PESP); - RUNLOCK(r); - break; - } - break; - case OJNotEq: - case OJEq: - // set ZF=0, CF=? 
- case OJSLte: - // set ZF=0, CF=0 - op64(ctx,TEST,PESP,PESP); - break; - default: - ASSERT(bop); - } - patch_jump(ctx,jnotnan); - } - scratch(pa); - out = pa; - break; - default: - printf("%s(%d,%d)\n", hl_op_name(bop), pa->kind, pb->kind); - ASSERT(ID2(pa->kind, pb->kind)); - } - if( dst ) store(ctx, dst, out, true); - return out; - default: - ASSERT(RTYPE(a)); - } - return NULL; -} - -static int do_jump( jit_ctx *ctx, hl_op op, bool isFloat ) { - int j; - switch( op ) { - case OJAlways: - XJump(JAlways,j); - break; - case OJSGte: - XJump(isFloat ? JUGte : JSGte,j); - break; - case OJSGt: - XJump(isFloat ? JUGt : JSGt,j); - break; - case OJUGte: - XJump(JUGte,j); - break; - case OJSLt: - XJump(isFloat ? JULt : JSLt,j); - break; - case OJSLte: - XJump(isFloat ? JULte : JSLte,j); - break; - case OJULt: - XJump(JULt,j); - break; - case OJEq: - XJump(JEq,j); - break; - case OJNotEq: - XJump(JNeq,j); - break; - case OJNotLt: - XJump(JUGte,j); - break; - case OJNotGte: - XJump(JULt,j); - break; - default: - j = 0; - printf("Unknown JUMP %d\n",op); - break; - } - return j; -} - -static void register_jump( jit_ctx *ctx, int pos, int target ) { - jlist *j = (jlist*)hl_malloc(&ctx->falloc, sizeof(jlist)); - j->pos = pos; - j->target = target; - j->next = ctx->jumps; - ctx->jumps = j; - if( target != 0 && ctx->opsPos[target] == 0 ) - ctx->opsPos[target] = -1; -} - -#define HDYN_VALUE 8 - -static void dyn_value_compare( jit_ctx *ctx, preg *a, preg *b, hl_type *t ) { - preg p; - switch( t->kind ) { - case HUI8: - case HBOOL: - op32(ctx,MOV8,a,pmem(&p,a->id,HDYN_VALUE)); - op32(ctx,MOV8,b,pmem(&p,b->id,HDYN_VALUE)); - op64(ctx,CMP8,a,b); - break; - case HUI16: - op32(ctx,MOV16,a,pmem(&p,a->id,HDYN_VALUE)); - op32(ctx,MOV16,b,pmem(&p,b->id,HDYN_VALUE)); - op64(ctx,CMP16,a,b); - break; - case HI32: - op32(ctx,MOV,a,pmem(&p,a->id,HDYN_VALUE)); - op32(ctx,MOV,b,pmem(&p,b->id,HDYN_VALUE)); - op64(ctx,CMP,a,b); - break; - case HF32: - { - preg *fa = alloc_reg(ctx, RFPU); - preg *fb 
= alloc_reg(ctx, RFPU); - op64(ctx,MOVSS,fa,pmem(&p,a->id,HDYN_VALUE)); - op64(ctx,MOVSS,fb,pmem(&p,b->id,HDYN_VALUE)); - op64(ctx,COMISD,fa,fb); - } - break; - case HF64: - { - preg *fa = alloc_reg(ctx, RFPU); - preg *fb = alloc_reg(ctx, RFPU); - op64(ctx,MOVSD,fa,pmem(&p,a->id,HDYN_VALUE)); - op64(ctx,MOVSD,fb,pmem(&p,b->id,HDYN_VALUE)); - op64(ctx,COMISD,fa,fb); - } - break; - case HI64: - default: - // ptr comparison - op64(ctx,MOV,a,pmem(&p,a->id,HDYN_VALUE)); - op64(ctx,MOV,b,pmem(&p,b->id,HDYN_VALUE)); - op64(ctx,CMP,a,b); - break; - } -} - -static void op_jump( jit_ctx *ctx, vreg *a, vreg *b, hl_opcode *op, int targetPos ) { - if( a->t->kind == HDYN || b->t->kind == HDYN || a->t->kind == HFUN || b->t->kind == HFUN ) { - int args[] = { a->stack.id, b->stack.id }; - int size = prepare_call_args(ctx,2,args,ctx->vregs,0); - call_native(ctx,hl_dyn_compare,size); - if( op->op == OJSGt || op->op == OJSGte ) { - preg p; - int jinvalid; - op32(ctx,CMP,PEAX,pconst(&p,hl_invalid_comparison)); - XJump_small(JEq,jinvalid); - op32(ctx,TEST,PEAX,PEAX); - register_jump(ctx,do_jump(ctx,op->op, IS_FLOAT(a)),targetPos); - patch_jump(ctx,jinvalid); - return; - } - op32(ctx,TEST,PEAX,PEAX); - } else switch( a->t->kind ) { - case HTYPE: - { - int args[] = { a->stack.id, b->stack.id }; - int size = prepare_call_args(ctx,2,args,ctx->vregs,0); - preg p; - call_native(ctx,hl_same_type,size); - op64(ctx,CMP8,PEAX,pconst(&p,1)); - } - break; - case HNULL: - { - preg *pa = hl_type_size(a->t->tparam) == 1 ? alloc_cpu8(ctx,a,true) : alloc_cpu(ctx,a,true); - preg *pb = hl_type_size(b->t->tparam) == 1 ? 
alloc_cpu8(ctx,b,true) : alloc_cpu(ctx,b,true); - if( op->op == OJEq ) { - // if( a == b || (a && b && a->v == b->v) ) goto - int ja, jb; - // if( a != b && (!a || !b || a->v != b->v) ) goto - op64(ctx,CMP,pa,pb); - register_jump(ctx,do_jump(ctx,OJEq,false),targetPos); - op64(ctx,TEST,pa,pa); - XJump_small(JZero,ja); - op64(ctx,TEST,pb,pb); - XJump_small(JZero,jb); - dyn_value_compare(ctx,pa,pb,a->t->tparam); - register_jump(ctx,do_jump(ctx,OJEq,false),targetPos); - scratch(pa); - scratch(pb); - patch_jump(ctx,ja); - patch_jump(ctx,jb); - } else if( op->op == OJNotEq ) { - int jeq, jcmp; - // if( a != b && (!a || !b || a->v != b->v) ) goto - op64(ctx,CMP,pa,pb); - XJump_small(JEq,jeq); - op64(ctx,TEST,pa,pa); - register_jump(ctx,do_jump(ctx,OJEq,false),targetPos); - op64(ctx,TEST,pb,pb); - register_jump(ctx,do_jump(ctx,OJEq,false),targetPos); - dyn_value_compare(ctx,pa,pb,a->t->tparam); - XJump_small(JZero,jcmp); - scratch(pa); - scratch(pb); - register_jump(ctx,do_jump(ctx,OJNotEq,false),targetPos); - patch_jump(ctx,jcmp); - patch_jump(ctx,jeq); - } else - ASSERT(op->op); - return; - } - case HVIRTUAL: - { - preg p; - preg *pa = alloc_cpu(ctx,a,true); - preg *pb = alloc_cpu(ctx,b,true); - int ja,jb,jav,jbv,jvalue; - if( b->t->kind == HOBJ ) { - if( op->op == OJEq ) { - // if( a ? (b && a->value == b) : (b == NULL) ) goto - op64(ctx,TEST,pa,pa); - XJump_small(JZero,ja); - op64(ctx,TEST,pb,pb); - XJump_small(JZero,jb); - op64(ctx,MOV,pa,pmem(&p,pa->id,HL_WSIZE)); - op64(ctx,CMP,pa,pb); - XJump_small(JAlways,jvalue); - patch_jump(ctx,ja); - op64(ctx,TEST,pb,pb); - patch_jump(ctx,jvalue); - register_jump(ctx,do_jump(ctx,OJEq,false),targetPos); - patch_jump(ctx,jb); - } else if( op->op == OJNotEq ) { - // if( a ? 
(b == NULL || a->value != b) : (b != NULL) ) goto - op64(ctx,TEST,pa,pa); - XJump_small(JZero,ja); - op64(ctx,TEST,pb,pb); - register_jump(ctx,do_jump(ctx,OJEq,false),targetPos); - op64(ctx,MOV,pa,pmem(&p,pa->id,HL_WSIZE)); - op64(ctx,CMP,pa,pb); - XJump_small(JAlways,jvalue); - patch_jump(ctx,ja); - op64(ctx,TEST,pb,pb); - patch_jump(ctx,jvalue); - register_jump(ctx,do_jump(ctx,OJNotEq,false),targetPos); - } else - ASSERT(op->op); - scratch(pa); - return; - } - op64(ctx,CMP,pa,pb); - if( op->op == OJEq ) { - // if( a == b || (a && b && a->value && b->value && a->value == b->value) ) goto - register_jump(ctx,do_jump(ctx,OJEq, false),targetPos); - op64(ctx,TEST,pa,pa); - XJump_small(JZero,ja); - op64(ctx,TEST,pb,pb); - XJump_small(JZero,jb); - op64(ctx,MOV,pa,pmem(&p,pa->id,HL_WSIZE)); - op64(ctx,TEST,pa,pa); - XJump_small(JZero,jav); - op64(ctx,MOV,pb,pmem(&p,pb->id,HL_WSIZE)); - op64(ctx,TEST,pb,pb); - XJump_small(JZero,jbv); - op64(ctx,CMP,pa,pb); - XJump_small(JNeq,jvalue); - register_jump(ctx,do_jump(ctx,OJEq, false),targetPos); - patch_jump(ctx,ja); - patch_jump(ctx,jb); - patch_jump(ctx,jav); - patch_jump(ctx,jbv); - patch_jump(ctx,jvalue); - } else if( op->op == OJNotEq ) { - int jnext; - // if( a != b && (!a || !b || !a->value || !b->value || a->value != b->value) ) goto - XJump_small(JEq,jnext); - op64(ctx,TEST,pa,pa); - XJump_small(JZero,ja); - op64(ctx,TEST,pb,pb); - XJump_small(JZero,jb); - op64(ctx,MOV,pa,pmem(&p,pa->id,HL_WSIZE)); - op64(ctx,TEST,pa,pa); - XJump_small(JZero,jav); - op64(ctx,MOV,pb,pmem(&p,pb->id,HL_WSIZE)); - op64(ctx,TEST,pb,pb); - XJump_small(JZero,jbv); - op64(ctx,CMP,pa,pb); - XJump_small(JEq,jvalue); - patch_jump(ctx,ja); - patch_jump(ctx,jb); - patch_jump(ctx,jav); - patch_jump(ctx,jbv); - register_jump(ctx,do_jump(ctx,OJAlways, false),targetPos); - patch_jump(ctx,jnext); - patch_jump(ctx,jvalue); - } else - ASSERT(op->op); - scratch(pa); - scratch(pb); - return; - } - break; - case HOBJ: - case HSTRUCT: - if( b->t->kind == 
HVIRTUAL ) { - op_jump(ctx,b,a,op,targetPos); // inverse - return; - } - if( hl_get_obj_rt(a->t)->compareFun ) { - preg *pa = alloc_cpu(ctx,a,true); - preg *pb = alloc_cpu(ctx,b,true); - preg p; - int jeq, ja, jb, jcmp; - int args[] = { a->stack.id, b->stack.id }; - switch( op->op ) { - case OJEq: - // if( a == b || (a && b && cmp(a,b) == 0) ) goto - op64(ctx,CMP,pa,pb); - XJump_small(JEq,jeq); - op64(ctx,TEST,pa,pa); - XJump_small(JZero,ja); - op64(ctx,TEST,pb,pb); - XJump_small(JZero,jb); - op_call_fun(ctx,NULL,(int)(int_val)a->t->obj->rt->compareFun,2,args); - op32(ctx,TEST,PEAX,PEAX); - XJump_small(JNotZero,jcmp); - patch_jump(ctx,jeq); - register_jump(ctx,do_jump(ctx,OJAlways,false),targetPos); - patch_jump(ctx,ja); - patch_jump(ctx,jb); - patch_jump(ctx,jcmp); - break; - case OJNotEq: - // if( a != b && (!a || !b || cmp(a,b) != 0) ) goto - op64(ctx,CMP,pa,pb); - XJump_small(JEq,jeq); - op64(ctx,TEST,pa,pa); - register_jump(ctx,do_jump(ctx,OJEq,false),targetPos); - op64(ctx,TEST,pb,pb); - register_jump(ctx,do_jump(ctx,OJEq,false),targetPos); - - op_call_fun(ctx,NULL,(int)(int_val)a->t->obj->rt->compareFun,2,args); - op32(ctx,TEST,PEAX,PEAX); - XJump_small(JZero,jcmp); - - register_jump(ctx,do_jump(ctx,OJNotEq,false),targetPos); - patch_jump(ctx,jcmp); - patch_jump(ctx,jeq); - break; - default: - // if( a && b && cmp(a,b) ?? 
0 ) goto - op64(ctx,TEST,pa,pa); - XJump_small(JZero,ja); - op64(ctx,TEST,pb,pb); - XJump_small(JZero,jb); - op_call_fun(ctx,NULL,(int)(int_val)a->t->obj->rt->compareFun,2,args); - op32(ctx,CMP,PEAX,pconst(&p,0)); - register_jump(ctx,do_jump(ctx,op->op,false),targetPos); - patch_jump(ctx,ja); - patch_jump(ctx,jb); - break; - } - return; - } - // fallthrough - default: - // make sure we have valid 8 bits registers - if( a->size == 1 ) alloc_cpu8(ctx,a,true); - if( b->size == 1 ) alloc_cpu8(ctx,b,true); - op_binop(ctx,NULL,a,b,op->op); - break; - } - register_jump(ctx,do_jump(ctx,op->op, IS_FLOAT(a)),targetPos); -} - -jit_ctx *hl_jit_alloc() { - int i; - jit_ctx *ctx = (jit_ctx*)malloc(sizeof(jit_ctx)); - if( ctx == NULL ) return NULL; - memset(ctx,0,sizeof(jit_ctx)); - hl_alloc_init(&ctx->falloc); - hl_alloc_init(&ctx->galloc); - for(i=0;iid = i; - r->kind = RCPU; - } - for(i=0;iid = i; - r->kind = RFPU; - } - return ctx; -} - -void hl_jit_free( jit_ctx *ctx, h_bool can_reset ) { - free(ctx->vregs); - free(ctx->opsPos); - free(ctx->startBuf); - ctx->maxRegs = 0; - ctx->vregs = NULL; - ctx->maxOps = 0; - ctx->opsPos = NULL; - ctx->startBuf = NULL; - ctx->bufSize = 0; - ctx->buf.b = NULL; - ctx->calls = NULL; - ctx->switchs = NULL; - ctx->closure_list = NULL; - hl_free(&ctx->falloc); - hl_free(&ctx->galloc); - if( !can_reset ) free(ctx); -} - -static void jit_nops( jit_ctx *ctx ) { - while( BUF_POS() & 15 ) - op32(ctx, NOP, UNUSED, UNUSED); -} - -#define MAX_ARGS 16 - -static void *call_jit_c2hl = NULL; -static void *call_jit_hl2c = NULL; - -static void *callback_c2hl( void *_f, hl_type *t, void **args, vdynamic *ret ) { - /* - prepare stack and regs according to prepare_call_args, but by reading runtime type information - from the function type. The stack and regs will be setup by the trampoline function. 
- */ - void **f = (void**)_f; - unsigned char stack[MAX_ARGS * 8]; - call_regs cregs = {0}; - if( t->fun->nargs > MAX_ARGS ) - hl_error("Too many arguments for dynamic call"); - int i, size = 0, pad = 0, pos = 0; - for(i=0;ifun->nargs;i++) { - hl_type *at = t->fun->args[i]; - int creg = select_call_reg(&cregs,at,i); - if( creg >= 0 ) - continue; - size += stack_size(at); - } - pad = (-size) & 15; - size += pad; - pos = 0; - for(i=0;ifun->nargs;i++) { - // RTL - hl_type *at = t->fun->args[i]; - void *v = args[i]; - int creg = mapped_reg(&cregs,i); - void *store; - if( creg >= 0 ) { - if( REG_IS_FPU(creg) ) { - store = stack + size + CALL_NREGS * HL_WSIZE + (creg - XMM(0)) * sizeof(double); - } else { - store = stack + size + call_reg_index(creg) * HL_WSIZE; - } - switch( at->kind ) { - case HBOOL: - case HUI8: - *(int_val*)store = *(unsigned char*)v; - break; - case HUI16: - *(int_val*)store = *(unsigned short*)v; - break; - case HI32: - *(int_val*)store = *(int*)v; - break; - case HF32: - *(void**)store = 0; - *(float*)store = *(float*)v; - break; - case HF64: - *(double*)store = *(double*)v; - break; - case HI64: - case HGUID: - *(int64*)store = *(int64*)v; - break; - default: - *(void**)store = v; - break; - } - } else { - int tsize = stack_size(at); - store = stack + pos; - pos += tsize; - switch( at->kind ) { - case HBOOL: - case HUI8: - *(int*)store = *(unsigned char*)v; - break; - case HUI16: - *(int*)store = *(unsigned short*)v; - break; - case HI32: - case HF32: - *(int*)store = *(int*)v; - break; - case HF64: - *(double*)store = *(double*)v; - break; - case HI64: - case HGUID: - *(int64*)store = *(int64*)v; - break; - default: - *(void**)store = v; - break; - } - } - } - pos += pad; - pos >>= IS_64 ? 
3 : 2; - switch( t->fun->ret->kind ) { - case HUI8: - case HUI16: - case HI32: - case HBOOL: - ret->v.i = ((int (*)(void *, void *, void *))call_jit_c2hl)(*f, (void**)&stack + pos, &stack); - return &ret->v.i; - case HI64: - case HGUID: - ret->v.i64 = ((int64 (*)(void *, void *, void *))call_jit_c2hl)(*f, (void**)&stack + pos, &stack); - return &ret->v.i64; - case HF32: - ret->v.f = ((float (*)(void *, void *, void *))call_jit_c2hl)(*f, (void**)&stack + pos, &stack); - return &ret->v.f; - case HF64: - ret->v.d = ((double (*)(void *, void *, void *))call_jit_c2hl)(*f, (void**)&stack + pos, &stack); - return &ret->v.d; - default: - return ((void *(*)(void *, void *, void *))call_jit_c2hl)(*f, (void**)&stack + pos, &stack); - } -} - -static void jit_c2hl( jit_ctx *ctx ) { - // create the function that will be called by callback_c2hl - // it will make sure to prepare the stack/regs according to native calling conventions - int jeq, jloop, jstart; - preg *fptr, *stack, *stend; - preg p; - - op64(ctx,PUSH,PEBP,UNUSED); - op64(ctx,MOV,PEBP,PESP); - -# ifdef HL_64 - - fptr = REG_AT(R10); - stack = PEAX; - stend = REG_AT(R11); - op64(ctx, MOV, fptr, REG_AT(CALL_REGS[0])); - op64(ctx, MOV, stack, REG_AT(CALL_REGS[1])); - op64(ctx, MOV, stend, REG_AT(CALL_REGS[2])); - - // set native call regs - int i; - for(i=0;iid,i*HL_WSIZE)); - for(i=0;iid,(i+CALL_NREGS)*HL_WSIZE)); - -# else - - // make sure the stack is aligned on 16 bytes - // the amount of push we will do afterwards is guaranteed to be a multiple of 16bytes by hl_callback -# ifdef HL_VCC - // VCC does not guarantee us an aligned stack... 
- op64(ctx,MOV,PEAX,PESP); - op64(ctx,AND,PEAX,pconst(&p,15)); - op64(ctx,SUB,PESP,PEAX); -# else - op64(ctx,SUB,PESP,pconst(&p,8)); -# endif - - // mov arguments to regs - fptr = REG_AT(Eax); - stack = REG_AT(Edx); - stend = REG_AT(Ecx); - op64(ctx,MOV,fptr,pmem(&p,Ebp,HL_WSIZE*2)); - op64(ctx,MOV,stack,pmem(&p,Ebp,HL_WSIZE*3)); - op64(ctx,MOV,stend,pmem(&p,Ebp,HL_WSIZE*4)); - -# endif - - // push stack args - jstart = BUF_POS(); - op64(ctx,CMP,stack,stend); - XJump(JEq,jeq); - op64(ctx,SUB,stack,pconst(&p,HL_WSIZE)); - op64(ctx,PUSH,pmem(&p,stack->id,0),UNUSED); - XJump(JAlways,jloop); - patch_jump(ctx,jeq); - patch_jump_to(ctx, jloop, jstart); - - op_call(ctx,fptr,0); - - // cleanup and ret - op64(ctx,MOV,PESP,PEBP); - op64(ctx,POP,PEBP, UNUSED); - op64(ctx,RET,UNUSED,UNUSED); -} - -static vdynamic *jit_wrapper_call( vclosure_wrapper *c, char *stack_args, void **regs ) { - vdynamic *args[MAX_ARGS]; - int i; - int nargs = c->cl.t->fun->nargs; - call_regs cregs = {0}; - if( nargs > MAX_ARGS ) - hl_error("Too many arguments for wrapped call"); - cregs.nextCpu++; // skip fptr in HL64 - was passed as arg0 - for(i=0;icl.t->fun->args[i]; - int creg = select_call_reg(&cregs,t,i); - if( creg < 0 ) { - args[i] = hl_is_dynamic(t) ? 
*(vdynamic**)stack_args : hl_make_dyn(stack_args,t); - stack_args += stack_size(t); - } else if( hl_is_dynamic(t) ) { - args[i] = *(vdynamic**)(regs + call_reg_index(creg)); - } else if( t->kind == HF32 || t->kind == HF64 ) { - args[i] = hl_make_dyn(regs + CALL_NREGS + creg - XMM(0),&hlt_f64); - } else { - args[i] = hl_make_dyn(regs + call_reg_index(creg),t); - } - } - return hl_dyn_call(c->wrappedFun,args,nargs); -} - -static void *jit_wrapper_ptr( vclosure_wrapper *c, char *stack_args, void **regs ) { - vdynamic *ret = jit_wrapper_call(c, stack_args, regs); - hl_type *tret = c->cl.t->fun->ret; - switch( tret->kind ) { - case HVOID: - return NULL; - case HUI8: - case HUI16: - case HI32: - case HBOOL: - return (void*)(int_val)hl_dyn_casti(&ret,&hlt_dyn,tret); - case HI64: - case HGUID: - return (void*)(int_val)hl_dyn_casti64(&ret,&hlt_dyn); - default: - return hl_dyn_castp(&ret,&hlt_dyn,tret); - } -} - -static double jit_wrapper_d( vclosure_wrapper *c, char *stack_args, void **regs ) { - vdynamic *ret = jit_wrapper_call(c, stack_args, regs); - return hl_dyn_castd(&ret,&hlt_dyn); -} - -static void jit_hl2c( jit_ctx *ctx ) { - // create a function that is called with a vclosure_wrapper* and native args - // and pack and pass the args to callback_hl2c - preg p; - int jfloat1, jfloat2, jexit; - hl_type_fun *ft = NULL; - int size; -# ifdef HL_64 - preg *cl = REG_AT(CALL_REGS[0]); - preg *tmp = REG_AT(CALL_REGS[1]); -# else - preg *cl = REG_AT(Ecx); - preg *tmp = REG_AT(Edx); -# endif - - op64(ctx,PUSH,PEBP,UNUSED); - op64(ctx,MOV,PEBP,PESP); - -# ifdef HL_64 - // push registers - int i; - op64(ctx,SUB,PESP,pconst(&p,CALL_NREGS*8)); - for(i=0;it->fun->ret->kind ) { - // case HF32: case HF64: return jit_wrapper_d(arg0,&args); - // default: return jit_wrapper_ptr(arg0,&args); - // } - if( !IS_64 ) - op64(ctx,MOV,cl,pmem(&p,Ebp,HL_WSIZE*2)); // load arg0 - op64(ctx,MOV,tmp,pmem(&p,cl->id,0)); // ->t - op64(ctx,MOV,tmp,pmem(&p,tmp->id,HL_WSIZE)); // ->fun - 
op64(ctx,MOV,tmp,pmem(&p,tmp->id,(int)(int_val)&ft->ret)); // ->ret - op32(ctx,MOV,tmp,pmem(&p,tmp->id,0)); // -> kind - - op32(ctx,CMP,tmp,pconst(&p,HF64)); - XJump_small(JEq,jfloat1); - op32(ctx,CMP,tmp,pconst(&p,HF32)); - XJump_small(JEq,jfloat2); - - // 64 bits : ESP + EIP (+WIN64PAD) - // 32 bits : ESP + EIP + PARAM0 - int args_pos = IS_64 ? ((IS_WINCALL64 ? 32 : 0) + HL_WSIZE * 2) : (HL_WSIZE*3); - - size = begin_native_call(ctx,3); - op64(ctx, LEA, tmp, pmem(&p,Ebp,-HL_WSIZE*CALL_NREGS*2)); - set_native_arg(ctx, tmp); - op64(ctx, LEA, tmp, pmem(&p,Ebp,args_pos)); - set_native_arg(ctx, tmp); - set_native_arg(ctx, cl); - call_native(ctx, jit_wrapper_ptr, size); - XJump_small(JAlways, jexit); - - patch_jump(ctx,jfloat1); - patch_jump(ctx,jfloat2); - size = begin_native_call(ctx,3); - op64(ctx, LEA, tmp, pmem(&p,Ebp,-HL_WSIZE*CALL_NREGS*2)); - set_native_arg(ctx, tmp); - op64(ctx, LEA, tmp, pmem(&p,Ebp,args_pos)); - set_native_arg(ctx, tmp); - set_native_arg(ctx, cl); - call_native(ctx, jit_wrapper_d, size); - - patch_jump(ctx,jexit); - op64(ctx,MOV,PESP,PEBP); - op64(ctx,POP,PEBP, UNUSED); - op64(ctx,RET,UNUSED,UNUSED); -} - -#ifdef JIT_CUSTOM_LONGJUMP -// Win64 debug CRT performs a Rtl stack check in debug mode, preventing from -// using longjump. This in an alternate implementation that follows the native -// setjump storage. -// -// Another more reliable way of handling this would be to use RtlAddFunctionTable -// but some platform does not have it. 
-static void jit_longjump( jit_ctx *ctx ) { - preg *buf = REG_AT(CALL_REGS[0]); - preg *ret = REG_AT(CALL_REGS[1]); - preg p; - int i; - op64(ctx,MOV,PEAX,ret); // return value - op64(ctx,MOV,REG_AT(Edx),pmem(&p,buf->id,0x0)); - op64(ctx,MOV,REG_AT(Ebx),pmem(&p,buf->id,0x8)); - op64(ctx,MOV,REG_AT(Esp),pmem(&p,buf->id,0x10)); - op64(ctx,MOV,REG_AT(Ebp),pmem(&p,buf->id,0x18)); - op64(ctx,MOV,REG_AT(Esi),pmem(&p,buf->id,0x20)); - op64(ctx,MOV,REG_AT(Edi),pmem(&p,buf->id,0x28)); - op64(ctx,MOV,REG_AT(R12),pmem(&p,buf->id,0x30)); - op64(ctx,MOV,REG_AT(R13),pmem(&p,buf->id,0x38)); - op64(ctx,MOV,REG_AT(R14),pmem(&p,buf->id,0x40)); - op64(ctx,MOV,REG_AT(R15),pmem(&p,buf->id,0x48)); - op64(ctx,LDMXCSR,pmem(&p,buf->id,0x58), UNUSED); - op64(ctx,FLDCW,pmem(&p,buf->id,0x5C), UNUSED); - for(i=0;i<10;i++) - op64(ctx,MOVSD,REG_AT(XMM(i+6)),pmem(&p,buf->id,0x60 + i * 16)); - op64(ctx,PUSH,pmem(&p,buf->id,0x50),UNUSED); - op64(ctx,RET,UNUSED,UNUSED); -} -#endif - -static void jit_fail( uchar *msg ) { - if( msg == NULL ) { - hl_debug_break(); - msg = USTR("assert"); - } - vdynamic *d = hl_alloc_dynamic(&hlt_bytes); - d->v.ptr = msg; - hl_throw(d); -} - -static void jit_null_access( jit_ctx *ctx ) { - op64(ctx,PUSH,PEBP,UNUSED); - op64(ctx,MOV,PEBP,PESP); - int_val arg = (int_val)USTR("Null access"); - call_native_consts(ctx, jit_fail, &arg, 1); -} - -static void jit_null_fail( int fhash ) { - vbyte *field = hl_field_name(fhash); - hl_buffer *b = hl_alloc_buffer(); - hl_buffer_str(b, USTR("Null access .")); - hl_buffer_str(b, (uchar*)field); - vdynamic *d = hl_alloc_dynamic(&hlt_bytes); - d->v.ptr = hl_buffer_content(b,NULL); - hl_throw(d); -} - -static void jit_null_field_access( jit_ctx *ctx ) { - preg p; - op64(ctx,PUSH,PEBP,UNUSED); - op64(ctx,MOV,PEBP,PESP); - int size = begin_native_call(ctx, 1); - int args_pos = (IS_WINCALL64 ? 
32 : 0) + HL_WSIZE*2; - set_native_arg(ctx, pmem(&p,Ebp,args_pos)); - call_native(ctx,jit_null_fail,size); -} - -static void jit_assert( jit_ctx *ctx ) { - op64(ctx,PUSH,PEBP,UNUSED); - op64(ctx,MOV,PEBP,PESP); - int_val arg = 0; - call_native_consts(ctx, jit_fail, &arg, 1); -} - -static int jit_build( jit_ctx *ctx, void (*fbuild)( jit_ctx *) ) { - int pos; - jit_buf(ctx); - jit_nops(ctx); - pos = BUF_POS(); - fbuild(ctx); - int endPos = BUF_POS(); - jit_nops(ctx); -#ifdef WIN64_UNWIND_TABLES - int fid = ctx->nunwind++; - ctx->unwind_table[fid].BeginAddress = pos; - ctx->unwind_table[fid].EndAddress = endPos; - ctx->unwind_table[fid].UnwindData = ctx->unwind_offset; -#endif - return pos; -} - -static void hl_jit_init_module( jit_ctx *ctx, hl_module *m ) { - int i; - ctx->m = m; - if( m->code->hasdebug ) { - ctx->debug = (hl_debug_infos*)malloc(sizeof(hl_debug_infos) * m->code->nfunctions); - memset(ctx->debug, -1, sizeof(hl_debug_infos) * m->code->nfunctions); - } - for(i=0;icode->nfloats;i++) { - jit_buf(ctx); - *ctx->buf.d++ = m->code->floats[i]; - } -#ifdef WIN64_UNWIND_TABLES - jit_buf(ctx); - ctx->unwind_offset = BUF_POS(); - write_unwind_data(ctx); - - ctx->unwind_table = malloc(sizeof(RUNTIME_FUNCTION) * (m->code->nfunctions + 10)); - memset(ctx->unwind_table, 0, sizeof(RUNTIME_FUNCTION) * (m->code->nfunctions + 10)); -#endif -} - -void hl_jit_init( jit_ctx *ctx, hl_module *m ) { - hl_jit_init_module(ctx,m); - ctx->c2hl = jit_build(ctx, jit_c2hl); - ctx->hl2c = jit_build(ctx, jit_hl2c); -# ifdef JIT_CUSTOM_LONGJUMP - ctx->longjump = jit_build(ctx, jit_longjump); -# endif - ctx->static_functions[0] = (void*)(int_val)jit_build(ctx,jit_null_access); - ctx->static_functions[1] = (void*)(int_val)jit_build(ctx,jit_assert); - ctx->static_functions[2] = (void*)(int_val)jit_build(ctx,jit_null_field_access); -} - -void hl_jit_reset( jit_ctx *ctx, hl_module *m ) { - ctx->debug = NULL; - hl_jit_init_module(ctx,m); -} - -static void *get_dyncast( hl_type *t ) { - switch( 
t->kind ) { - case HF32: - return hl_dyn_castf; - case HF64: - return hl_dyn_castd; - case HI64: - case HGUID: - return hl_dyn_casti64; - case HI32: - case HUI16: - case HUI8: - case HBOOL: - return hl_dyn_casti; - default: - return hl_dyn_castp; - } -} - -static void *get_dynset( hl_type *t ) { - switch( t->kind ) { - case HF32: - return hl_dyn_setf; - case HF64: - return hl_dyn_setd; - case HI64: - case HGUID: - return hl_dyn_seti64; - case HI32: - case HUI16: - case HUI8: - case HBOOL: - return hl_dyn_seti; - default: - return hl_dyn_setp; - } -} - -static void *get_dynget( hl_type *t ) { - switch( t->kind ) { - case HF32: - return hl_dyn_getf; - case HF64: - return hl_dyn_getd; - case HI64: - case HGUID: - return hl_dyn_geti64; - case HI32: - case HUI16: - case HUI8: - case HBOOL: - return hl_dyn_geti; - default: - return hl_dyn_getp; - } -} - -static double uint_to_double( unsigned int v ) { - return v; -} - -static vclosure *alloc_static_closure( jit_ctx *ctx, int fid ) { - hl_module *m = ctx->m; - vclosure *c = hl_malloc(&m->ctx.alloc,sizeof(vclosure)); - int fidx = m->functions_indexes[fid]; - c->hasValue = 0; - if( fidx >= m->code->nfunctions ) { - // native - c->t = m->code->natives[fidx - m->code->nfunctions].t; - c->fun = m->functions_ptrs[fid]; - c->value = NULL; - } else { - c->t = m->code->functions[fidx].type; - c->fun = (void*)(int_val)fid; - c->value = ctx->closure_list; - ctx->closure_list = c; - } - return c; -} - -static void make_dyn_cast( jit_ctx *ctx, vreg *dst, vreg *v ) { - int size; - preg p; - preg *tmp; - if( v->t->kind == HNULL && v->t->tparam->kind == dst->t->kind ) { - int jnull, jend; - preg *out; - switch( dst->t->kind ) { - case HUI8: - case HUI16: - case HI32: - case HBOOL: - case HI64: - case HGUID: - tmp = alloc_cpu(ctx, v, true); - op64(ctx, TEST, tmp, tmp); - XJump_small(JZero, jnull); - op64(ctx, MOV, tmp, pmem(&p,tmp->id,8)); - XJump_small(JAlways, jend); - patch_jump(ctx, jnull); - op64(ctx, XOR, tmp, tmp); - 
patch_jump(ctx, jend); - store(ctx, dst, tmp, true); - return; - case HF32: - case HF64: - tmp = alloc_cpu(ctx, v, true); - out = alloc_fpu(ctx, dst, false); - op64(ctx, TEST, tmp, tmp); - XJump_small(JZero, jnull); - op64(ctx, dst->t->kind == HF32 ? MOVSS : MOVSD, out, pmem(&p,tmp->id,8)); - XJump_small(JAlways, jend); - patch_jump(ctx, jnull); - op64(ctx, XORPD, out, out); - patch_jump(ctx, jend); - store(ctx, dst, out, true); - return; - default: - break; - } - } - switch( dst->t->kind ) { - case HF32: - case HF64: - case HI64: - case HGUID: - size = begin_native_call(ctx, 2); - set_native_arg(ctx, pconst64(&p,(int_val)v->t)); - break; - default: - size = begin_native_call(ctx, 3); - set_native_arg(ctx, pconst64(&p,(int_val)dst->t)); - set_native_arg(ctx, pconst64(&p,(int_val)v->t)); - break; - } - tmp = alloc_native_arg(ctx); - op64(ctx,MOV,tmp,REG_AT(Ebp)); - if( v->stackPos >= 0 ) - op64(ctx,ADD,tmp,pconst(&p,v->stackPos)); - else - op64(ctx,SUB,tmp,pconst(&p,-v->stackPos)); - set_native_arg(ctx,tmp); - call_native(ctx,get_dyncast(dst->t),size); - store_result(ctx, dst); -} - -int hl_jit_function( jit_ctx *ctx, hl_module *m, hl_function *f ) { - int i, size = 0, opCount; - int codePos = BUF_POS(); - int nargs = f->type->fun->nargs; - unsigned short *debug16 = NULL; - int *debug32 = NULL; - call_regs cregs = {0}; - hl_thread_info *tinf = NULL; - preg p; - ctx->f = f; - ctx->allocOffset = 0; - if( f->nregs > ctx->maxRegs ) { - free(ctx->vregs); - ctx->vregs = (vreg*)malloc(sizeof(vreg) * (f->nregs + 1)); - if( ctx->vregs == NULL ) { - ctx->maxRegs = 0; - return -1; - } - ctx->maxRegs = f->nregs; - } - if( f->nops > ctx->maxOps ) { - free(ctx->opsPos); - ctx->opsPos = (int*)malloc(sizeof(int) * (f->nops + 1)); - if( ctx->opsPos == NULL ) { - ctx->maxOps = 0; - return -1; - } - ctx->maxOps = f->nops; - } - memset(ctx->opsPos,0,(f->nops+1)*sizeof(int)); - for(i=0;inregs;i++) { - vreg *r = R(i); - r->t = f->regs[i]; - r->size = hl_type_size(r->t); - r->current = 
NULL; - r->stack.holds = NULL; - r->stack.id = i; - r->stack.kind = RSTACK; - } - size = 0; - int argsSize = 0; - for(i=0;it,i); - if( creg < 0 || IS_WINCALL64 ) { - // use existing stack storage - r->stackPos = argsSize + HL_WSIZE * 2; - argsSize += stack_size(r->t); - } else { - // make room in local vars - size += r->size; - size += hl_pad_size(size,r->t); - r->stackPos = -size; - } - } - for(i=nargs;inregs;i++) { - vreg *r = R(i); - size += r->size; - size += hl_pad_size(size,r->t); // align local vars - r->stackPos = -size; - } -# ifdef HL_64 - size += (-size) & 15; // align on 16 bytes -# else - size += hl_pad_size(size,&hlt_dyn); // align on word size -# endif - ctx->totalRegsSize = size; - jit_buf(ctx); - ctx->functionPos = BUF_POS(); - // make sure currentPos is > 0 before any reg allocations happen - // otherwise `alloc_reg` thinks that all registers are locked - ctx->currentPos = 1; - op_enter(ctx); -# ifdef HL_64 - { - // store in local var - for(i=0;isize); - p->holds = r; - r->current = p; - } - } -# endif - if( ctx->m->code->hasdebug ) { - debug16 = (unsigned short*)malloc(sizeof(unsigned short) * (f->nops + 1)); - debug16[0] = (unsigned short)(BUF_POS() - codePos); - } - ctx->opsPos[0] = BUF_POS(); - - for(opCount=0;opCountnops;opCount++) { - int jump; - hl_opcode *o = f->ops + opCount; - vreg *dst = R(o->p1); - vreg *ra = R(o->p2); - vreg *rb = R(o->p3); - ctx->currentPos = opCount + 1; - jit_buf(ctx); -# ifdef JIT_DEBUG - if( opCount == 0 || f->ops[opCount-1].op != OAsm ) { - int uid = opCount + (f->findex<<16); - op32(ctx, PUSH, pconst(&p,uid), UNUSED); - op64(ctx, ADD, PESP, pconst(&p,HL_WSIZE)); - } -# endif - // emit code - switch( o->op ) { - case OMov: - case OUnsafeCast: - op_mov(ctx, dst, ra); - break; - case OInt: - store_const(ctx, dst, m->code->ints[o->p2]); - break; - case OBool: - store_const(ctx, dst, o->p2); - break; - case OGetGlobal: - { - void *addr = m->globals_data + m->globals_indexes[o->p2]; -# ifdef HL_64 - preg *tmp = 
alloc_reg(ctx, RCPU); - op64(ctx, MOV, tmp, pconst64(&p,(int_val)addr)); - copy_to(ctx, dst, pmem(&p,tmp->id,0)); -# else - copy_to(ctx, dst, paddr(&p,addr)); -# endif - } - break; - case OSetGlobal: - { - void *addr = m->globals_data + m->globals_indexes[o->p1]; -# ifdef HL_64 - preg *tmp = alloc_reg(ctx, RCPU); - op64(ctx, MOV, tmp, pconst64(&p,(int_val)addr)); - copy_from(ctx, pmem(&p,tmp->id,0), ra); -# else - copy_from(ctx, paddr(&p,addr), ra); -# endif - } - break; - case OCall3: - { - int args[3] = { o->p3, o->extra[0], o->extra[1] }; - op_call_fun(ctx, dst, o->p2, 3, args); - } - break; - case OCall4: - { - int args[4] = { o->p3, o->extra[0], o->extra[1], o->extra[2] }; - op_call_fun(ctx, dst, o->p2, 4, args); - } - break; - case OCallN: - op_call_fun(ctx, dst, o->p2, o->p3, o->extra); - break; - case OCall0: - op_call_fun(ctx, dst, o->p2, 0, NULL); - break; - case OCall1: - op_call_fun(ctx, dst, o->p2, 1, &o->p3); - break; - case OCall2: - { - int args[2] = { o->p3, (int)(int_val)o->extra }; - op_call_fun(ctx, dst, o->p2, 2, args); - } - break; - case OSub: - case OAdd: - case OMul: - case OSDiv: - case OUDiv: - case OShl: - case OSShr: - case OUShr: - case OAnd: - case OOr: - case OXor: - case OSMod: - case OUMod: - op_binop(ctx, dst, ra, rb, o->op); - break; - case ONeg: - { - if( IS_FLOAT(ra) ) { - preg *pa = alloc_reg(ctx,RFPU); - preg *pb = alloc_fpu(ctx,ra,true); - op64(ctx,XORPD,pa,pa); - op64(ctx,ra->t->kind == HF32 ? 
SUBSS : SUBSD,pa,pb); - store(ctx,dst,pa,true); - } else if( ra->t->kind == HI64 ) { -# ifdef HL_64 - preg *pa = alloc_reg(ctx,RCPU); - preg *pb = alloc_cpu(ctx,ra,true); - op64(ctx,XOR,pa,pa); - op64(ctx,SUB,pa,pb); - store(ctx,dst,pa,true); -# else - error_i64(); -# endif - } else { - preg *pa = alloc_reg(ctx,RCPU); - preg *pb = alloc_cpu(ctx,ra,true); - op32(ctx,XOR,pa,pa); - op32(ctx,SUB,pa,pb); - store(ctx,dst,pa,true); - } - } - break; - case ONot: - { - preg *v = alloc_cpu(ctx,ra,true); - op32(ctx,XOR,v,pconst(&p,1)); - store(ctx,dst,v,true); - } - break; - case OJFalse: - case OJTrue: - case OJNotNull: - case OJNull: - { - preg *r = dst->t->kind == HBOOL ? alloc_cpu8(ctx, dst, true) : alloc_cpu(ctx, dst, true); - op64(ctx, dst->t->kind == HBOOL ? TEST8 : TEST, r, r); - XJump( o->op == OJFalse || o->op == OJNull ? JZero : JNotZero,jump); - register_jump(ctx,jump,(opCount + 1) + o->p2); - } - break; - case OJEq: - case OJNotEq: - case OJSLt: - case OJSGte: - case OJSLte: - case OJSGt: - case OJULt: - case OJUGte: - case OJNotLt: - case OJNotGte: - op_jump(ctx,dst,ra,o,(opCount + 1) + o->p3); - break; - case OJAlways: - jump = do_jump(ctx,o->op,false); - register_jump(ctx,jump,(opCount + 1) + o->p1); - break; - case OToDyn: - if( ra->t->kind == HBOOL ) { - int size = begin_native_call(ctx, 1); - set_native_arg(ctx, fetch(ra)); - call_native(ctx, hl_alloc_dynbool, size); - store(ctx, dst, PEAX, true); - } else { - int_val rt = (int_val)ra->t; - int jskip = 0; - if( hl_is_ptr(ra->t) ) { - int jnz; - preg *a = alloc_cpu(ctx,ra,true); - op64(ctx,TEST,a,a); - XJump_small(JNotZero,jnz); - op64(ctx,XOR,PEAX,PEAX); // will replace the result of alloc_dynamic at jump land - XJump_small(JAlways,jskip); - patch_jump(ctx,jnz); - } - call_native_consts(ctx, hl_alloc_dynamic, &rt, 1); - // copy value to dynamic - if( (IS_FLOAT(ra) || ra->size == 8) && !IS_64 ) { - preg *tmp = REG_AT(RCPU_SCRATCH_REGS[1]); - op64(ctx,MOV,tmp,&ra->stack); - 
op32(ctx,MOV,pmem(&p,Eax,HDYN_VALUE),tmp); - if( ra->t->kind == HF64 ) { - ra->stackPos += 4; - op64(ctx,MOV,tmp,&ra->stack); - op32(ctx,MOV,pmem(&p,Eax,HDYN_VALUE+4),tmp); - ra->stackPos -= 4; - } - } else { - preg *tmp = REG_AT(RCPU_SCRATCH_REGS[1]); - copy_from(ctx,tmp,ra); - op64(ctx,MOV,pmem(&p,Eax,HDYN_VALUE),tmp); - } - if( hl_is_ptr(ra->t) ) patch_jump(ctx,jskip); - store(ctx, dst, PEAX, true); - } - break; - case OToSFloat: - if( ra == dst ) break; - if (ra->t->kind == HI32 || ra->t->kind == HUI16 || ra->t->kind == HUI8) { - preg* r = alloc_cpu(ctx, ra, true); - preg* w = alloc_fpu(ctx, dst, false); - op32(ctx, dst->t->kind == HF64 ? CVTSI2SD : CVTSI2SS, w, r); - store(ctx, dst, w, true); - } else if (ra->t->kind == HI64 ) { - preg* r = alloc_cpu(ctx, ra, true); - preg* w = alloc_fpu(ctx, dst, false); - op64(ctx, dst->t->kind == HF64 ? CVTSI2SD : CVTSI2SS, w, r); - store(ctx, dst, w, true); - } else if( ra->t->kind == HF64 && dst->t->kind == HF32 ) { - preg *r = alloc_fpu(ctx,ra,true); - preg *w = alloc_fpu(ctx,dst,false); - op32(ctx,CVTSD2SS,w,r); - store(ctx, dst, w, true); - } else if( ra->t->kind == HF32 && dst->t->kind == HF64 ) { - preg *r = alloc_fpu(ctx,ra,true); - preg *w = alloc_fpu(ctx,dst,false); - op32(ctx,CVTSS2SD,w,r); - store(ctx, dst, w, true); - } else - ASSERT(0); - break; - case OToUFloat: - { - int size; - size = prepare_call_args(ctx,1,&o->p2,ctx->vregs,0); - call_native(ctx,uint_to_double,size); - store_result(ctx,dst); - } - break; - case OToInt: - if( ra == dst ) break; - if( ra->t->kind == HF64 ) { - preg *r = alloc_fpu(ctx,ra,true); - preg *w = alloc_cpu(ctx,dst,false); - preg *tmp = alloc_reg(ctx,RCPU); - op32(ctx,STMXCSR,pmem(&p,Esp,-4),UNUSED); - op32(ctx,MOV,tmp,&p); - op32(ctx,OR,tmp,pconst(&p,0x6000)); // set round towards 0 - op32(ctx,MOV,pmem(&p,Esp,-8),tmp); - op32(ctx,LDMXCSR,&p,UNUSED); - op32(ctx,CVTSD2SI,w,r); - op32(ctx,LDMXCSR,pmem(&p,Esp,-4),UNUSED); - store(ctx, dst, w, true); - } else if (ra->t->kind == HF32) { 
- preg *r = alloc_fpu(ctx, ra, true); - preg *w = alloc_cpu(ctx, dst, false); - preg *tmp = alloc_reg(ctx, RCPU); - op32(ctx, STMXCSR, pmem(&p, Esp, -4), UNUSED); - op32(ctx, MOV, tmp, &p); - op32(ctx, OR, tmp, pconst(&p, 0x6000)); // set round towards 0 - op32(ctx, MOV, pmem(&p, Esp, -8), tmp); - op32(ctx, LDMXCSR, &p, UNUSED); - op32(ctx, CVTSS2SI, w, r); - op32(ctx, LDMXCSR, pmem(&p, Esp, -4), UNUSED); - store(ctx, dst, w, true); - } else if( (dst->t->kind == HI64 || dst->t->kind == HGUID) && ra->t->kind == HI32 ) { - if( ra->current != PEAX ) { - op32(ctx, MOV, PEAX, fetch(ra)); - scratch(PEAX); - } -# ifdef HL_64 - op64(ctx, CDQE, UNUSED, UNUSED); // sign-extend Eax into Rax - store(ctx, dst, PEAX, true); -# else - op32(ctx, CDQ, UNUSED, UNUSED); // sign-extend Eax into Eax:Edx - scratch(REG_AT(Edx)); - op32(ctx, MOV, fetch(dst), PEAX); - dst->stackPos += 4; - op32(ctx, MOV, fetch(dst), REG_AT(Edx)); - dst->stackPos -= 4; - } else if( dst->t->kind == HI32 && ra->t->kind == HI64 ) { - error_i64(); -# endif - } else { - preg *r = alloc_cpu(ctx,dst,false); - copy_from(ctx, r, ra); - store(ctx, dst, r, true); - } - break; - case ORet: - op_ret(ctx, dst); - break; - case OIncr: - { - if( IS_FLOAT(dst) ) { - ASSERT(0); - } else { - preg *v = fetch32(ctx,dst); - op32(ctx,INC,v,UNUSED); - if( v->kind != RSTACK ) store(ctx, dst, v, false); - } - } - break; - case ODecr: - { - if( IS_FLOAT(dst) ) { - ASSERT(0); - } else { - preg *v = fetch32(ctx,dst); - op32(ctx,DEC,v,UNUSED); - if( v->kind != RSTACK ) store(ctx, dst, v, false); - } - } - break; - case OFloat: - { - if( m->code->floats[o->p2] == 0 ) { - preg *f = alloc_fpu(ctx,dst,false); - op64(ctx,XORPD,f,f); - } else switch( dst->t->kind ) { - case HF64: - case HF32: -# ifdef HL_64 - op64(ctx,dst->t->kind == HF32 ? CVTSD2SS : MOVSD,alloc_fpu(ctx,dst,false),pcodeaddr(&p,o->p2 * 8)); -# else - op64(ctx,dst->t->kind == HF32 ? 
MOVSS : MOVSD,alloc_fpu(ctx,dst,false),paddr(&p,m->code->floats + o->p2)); -# endif - break; - default: - ASSERT(dst->t->kind); - } - store(ctx,dst,dst->current,false); - } - break; - case OString: - op64(ctx,MOV,alloc_cpu(ctx, dst, false),pconst64(&p,(int_val)hl_get_ustring(m->code,o->p2))); - store(ctx,dst,dst->current,false); - break; - case OBytes: - { - char *b = m->code->version >= 5 ? m->code->bytes + m->code->bytes_pos[o->p2] : m->code->strings[o->p2]; - op64(ctx,MOV,alloc_cpu(ctx,dst,false),pconst64(&p,(int_val)b)); - store(ctx,dst,dst->current,false); - } - break; - case ONull: - { - op64(ctx,XOR,alloc_cpu(ctx, dst, false),alloc_cpu(ctx, dst, false)); - store(ctx,dst,dst->current,false); - } - break; - case ONew: - { - int_val args[] = { (int_val)dst->t }; - void *allocFun; - int nargs = 1; - switch( dst->t->kind ) { - case HOBJ: - case HSTRUCT: - allocFun = hl_alloc_obj; - break; - case HDYNOBJ: - allocFun = hl_alloc_dynobj; - nargs = 0; - break; - case HVIRTUAL: - allocFun = hl_alloc_virtual; - break; - default: - ASSERT(dst->t->kind); - } - call_native_consts(ctx, allocFun, args, nargs); - store(ctx, dst, PEAX, true); - } - break; - case OInstanceClosure: - { - preg *r = alloc_cpu(ctx, rb, true); - jlist *j = (jlist*)hl_malloc(&ctx->galloc,sizeof(jlist)); - int size = begin_native_call(ctx,3); - set_native_arg(ctx,r); - - j->pos = BUF_POS(); - j->target = o->p2; - j->next = ctx->calls; - ctx->calls = j; - - set_native_arg(ctx,pconst64(&p,RESERVE_ADDRESS)); - set_native_arg(ctx,pconst64(&p,(int_val)m->code->functions[m->functions_indexes[o->p2]].type)); - call_native(ctx,hl_alloc_closure_ptr,size); - store(ctx,dst,PEAX,true); - } - break; - case OVirtualClosure: - { - int size, i; - preg *r = alloc_cpu_call(ctx, ra); - hl_type *t = NULL; - hl_type *ot = ra->t; - while( t == NULL ) { - for(i=0;iobj->nproto;i++) { - hl_obj_proto *pp = ot->obj->proto + i; - if( pp->pindex == o->p3 ) { - t = m->code->functions[m->functions_indexes[pp->findex]].type; - 
break; - } - } - ot = ot->obj->super; - } - size = begin_native_call(ctx,3); - set_native_arg(ctx,r); - // read r->type->vobj_proto[i] for function address - op64(ctx,MOV,r,pmem(&p,r->id,0)); - op64(ctx,MOV,r,pmem(&p,r->id,HL_WSIZE*2)); - op64(ctx,MOV,r,pmem(&p,r->id,HL_WSIZE*o->p3)); - set_native_arg(ctx,r); - op64(ctx,MOV,r,pconst64(&p,(int_val)t)); - set_native_arg(ctx,r); - call_native(ctx,hl_alloc_closure_ptr,size); - store(ctx,dst,PEAX,true); - } - break; - case OCallClosure: - if( ra->t->kind == HDYN ) { - // ASM for { - // vdynamic *args[] = {args}; - // vdynamic *ret = hl_dyn_call(closure,args,nargs); - // dst = hl_dyncast(ret,t_dynamic,t_dst); - // } - int offset = o->p3 * HL_WSIZE; - preg *r = alloc_reg(ctx, RCPU_CALL); - if( offset & 15 ) offset += 16 - (offset & 15); - op64(ctx,SUB,PESP,pconst(&p,offset)); - op64(ctx,MOV,r,PESP); - for(i=0;ip3;i++) { - vreg *a = R(o->extra[i]); - if( !hl_is_dynamic(a->t) ) ASSERT(0); - preg *v = alloc_cpu(ctx,a,true); - op64(ctx,MOV,pmem(&p,r->id,i * HL_WSIZE),v); - RUNLOCK(v); - } -# ifdef HL_64 - int size = begin_native_call(ctx, 3) + offset; - set_native_arg(ctx, pconst(&p,o->p3)); - set_native_arg(ctx, r); - set_native_arg(ctx, fetch(ra)); -# else - int size = pad_before_call(ctx,HL_WSIZE*2 + sizeof(int) + offset); - op64(ctx,PUSH,pconst(&p,o->p3),UNUSED); - op64(ctx,PUSH,r,UNUSED); - op64(ctx,PUSH,alloc_cpu(ctx,ra,true),UNUSED); -# endif - call_native(ctx,hl_dyn_call,size); - if( dst->t->kind != HVOID ) { - store(ctx,dst,PEAX,true); - make_dyn_cast(ctx,dst,dst); - } - } else { - int jhasvalue, jend, size; - // ASM for if( c->hasValue ) c->fun(value,args) else c->fun(args) - preg *r = alloc_cpu(ctx,ra,true); - preg *tmp = alloc_reg(ctx, RCPU); - op32(ctx,MOV,tmp,pmem(&p,r->id,HL_WSIZE*2)); - op32(ctx,TEST,tmp,tmp); - scratch(tmp); - XJump_small(JNotZero,jhasvalue); - save_regs(ctx); - size = prepare_call_args(ctx,o->p3,o->extra,ctx->vregs,0); - preg *rr = r; - if( rr->holds != ra ) rr = alloc_cpu(ctx, ra, true); - 
op_call(ctx, pmem(&p,rr->id,HL_WSIZE), size); - XJump_small(JAlways,jend); - patch_jump(ctx,jhasvalue); - restore_regs(ctx); -# ifdef HL_64 - { - int regids[64]; - preg *pc = REG_AT(CALL_REGS[0]); - vreg *sc = R(f->nregs); // scratch register that we temporary rebind - if( o->p3 >= 63 ) jit_error("assert"); - memcpy(regids + 1, o->extra, o->p3 * sizeof(int)); - regids[0] = f->nregs; - sc->size = HL_WSIZE; - sc->t = &hlt_dyn; - op64(ctx, MOV, pc, pmem(&p,r->id,HL_WSIZE*3)); - scratch(pc); - sc->current = pc; - pc->holds = sc; - size = prepare_call_args(ctx,o->p3 + 1,regids,ctx->vregs,0); - if( r->holds != ra ) r = alloc_cpu(ctx, ra, true); - } -# else - size = prepare_call_args(ctx,o->p3,o->extra,ctx->vregs,HL_WSIZE); - if( r->holds != ra ) r = alloc_cpu(ctx, ra, true); - op64(ctx, PUSH,pmem(&p,r->id,HL_WSIZE*3),UNUSED); // push closure value -# endif - op_call(ctx, pmem(&p,r->id,HL_WSIZE), size); - discard_regs(ctx,false); - patch_jump(ctx,jend); - store_result(ctx, dst); - } - break; - case OStaticClosure: - { - vclosure *c = alloc_static_closure(ctx,o->p2); - preg *r = alloc_reg(ctx, RCPU); - op64(ctx, MOV, r, pconst64(&p,(int_val)c)); - store(ctx,dst,r,true); - } - break; - case OField: - { -# ifndef HL_64 - if( dst->t->kind == HI64 ) { - error_i64(); - break; - } -# endif - switch( ra->t->kind ) { - case HOBJ: - case HSTRUCT: - { - hl_runtime_obj *rt = hl_get_obj_rt(ra->t); - preg *rr = alloc_cpu(ctx,ra, true); - if( dst->t->kind == HSTRUCT ) { - hl_type *ft = hl_obj_field_fetch(ra->t,o->p3)->t; - if( ft->kind == HPACKED ) { - preg *r = alloc_reg(ctx,RCPU); - op64(ctx,LEA,r,pmem(&p,(CpuReg)rr->id,rt->fields_indexes[o->p3])); - store(ctx,dst,r,true); - break; - } - } - copy_to(ctx,dst,pmem(&p, (CpuReg)rr->id, rt->fields_indexes[o->p3])); - } - break; - case HVIRTUAL: - // ASM for --> if( hl_vfields(o)[f] ) r = *hl_vfields(o)[f]; else r = hl_dyn_get(o,hash(field),vt) - { - int jhasfield, jend, size; - bool need_type = !(IS_FLOAT(dst) || dst->t->kind == HI64); - 
preg *v = alloc_cpu_call(ctx,ra); - preg *r = alloc_reg(ctx,RCPU); - op64(ctx,MOV,r,pmem(&p,v->id,sizeof(vvirtual)+HL_WSIZE*o->p3)); - op64(ctx,TEST,r,r); - XJump_small(JNotZero,jhasfield); - size = begin_native_call(ctx, need_type ? 3 : 2); - if( need_type ) set_native_arg(ctx,pconst64(&p,(int_val)dst->t)); - set_native_arg(ctx,pconst64(&p,(int_val)ra->t->virt->fields[o->p3].hashed_name)); - set_native_arg(ctx,v); - call_native(ctx,get_dynget(dst->t),size); - store_result(ctx,dst); - XJump_small(JAlways,jend); - patch_jump(ctx,jhasfield); - copy_to(ctx, dst, pmem(&p,(CpuReg)r->id,0)); - patch_jump(ctx,jend); - scratch(dst->current); - } - break; - default: - ASSERT(ra->t->kind); - break; - } - } - break; - case OSetField: - { - switch( dst->t->kind ) { - case HOBJ: - case HSTRUCT: - { - hl_runtime_obj *rt = hl_get_obj_rt(dst->t); - preg *rr = alloc_cpu(ctx, dst, true); - if( rb->t->kind == HSTRUCT ) { - hl_type *ft = hl_obj_field_fetch(dst->t,o->p2)->t; - if( ft->kind == HPACKED ) { - hl_runtime_obj *frt = hl_get_obj_rt(ft->tparam); - preg *prb = alloc_cpu(ctx, rb, true); - preg *tmp = alloc_reg(ctx, RCPU_CALL); - int offset = 0; - while( offset < frt->size ) { - int remain = frt->size - offset; - int copy_size = remain >= HL_WSIZE ? HL_WSIZE : (remain >= 4 ? 4 : (remain >= 2 ? 
2 : 1)); - copy(ctx, tmp, pmem(&p, (CpuReg)prb->id, offset), copy_size); - copy(ctx, pmem(&p, (CpuReg)rr->id, rt->fields_indexes[o->p2]+offset), tmp, copy_size); - offset += copy_size; - } - break; - } - } - copy_from(ctx, pmem(&p, (CpuReg)rr->id, rt->fields_indexes[o->p2]), rb); - } - break; - case HVIRTUAL: - // ASM for --> if( hl_vfields(o)[f] ) *hl_vfields(o)[f] = v; else hl_dyn_set(o,hash(field),vt,v) - { - int jhasfield, jend; - preg *obj = alloc_cpu_call(ctx,dst); - preg *r = alloc_reg(ctx,RCPU); - op64(ctx,MOV,r,pmem(&p,obj->id,sizeof(vvirtual)+HL_WSIZE*o->p2)); - op64(ctx,TEST,r,r); - XJump_small(JNotZero,jhasfield); -# ifdef HL_64 - switch( rb->t->kind ) { - case HF64: - case HF32: - size = begin_native_call(ctx,3); - set_native_arg_fpu(ctx, fetch(rb), rb->t->kind == HF32); - break; - case HI64: - case HGUID: - size = begin_native_call(ctx,3); - set_native_arg(ctx, fetch(rb)); - break; - default: - size = begin_native_call(ctx, 4); - set_native_arg(ctx, fetch(rb)); - set_native_arg(ctx, pconst64(&p,(int_val)rb->t)); - break; - } - set_native_arg(ctx,pconst(&p,dst->t->virt->fields[o->p2].hashed_name)); - set_native_arg(ctx,obj); -# else - switch( rb->t->kind ) { - case HF64: - case HI64: - case HGUID: - size = pad_before_call(ctx,HL_WSIZE*2 + sizeof(double)); - push_reg(ctx,rb); - break; - case HF32: - size = pad_before_call(ctx,HL_WSIZE*2 + sizeof(float)); - push_reg(ctx,rb); - break; - default: - size = pad_before_call(ctx,HL_WSIZE*4); - op64(ctx,PUSH,fetch32(ctx,rb),UNUSED); - op64(ctx,MOV,r,pconst64(&p,(int_val)rb->t)); - op64(ctx,PUSH,r,UNUSED); - break; - } - op32(ctx,MOV,r,pconst(&p,dst->t->virt->fields[o->p2].hashed_name)); - op64(ctx,PUSH,r,UNUSED); - op64(ctx,PUSH,obj,UNUSED); -# endif - call_native(ctx,get_dynset(rb->t),size); - XJump_small(JAlways,jend); - patch_jump(ctx,jhasfield); - copy_from(ctx, pmem(&p,(CpuReg)r->id,0), rb); - patch_jump(ctx,jend); - scratch(rb->current); - } - break; - default: - ASSERT(dst->t->kind); - break; - } - } - 
break; - case OGetThis: - { - vreg *r = R(0); - hl_runtime_obj *rt = hl_get_obj_rt(r->t); - preg *rr = alloc_cpu(ctx,r, true); - if( dst->t->kind == HSTRUCT ) { - hl_type *ft = hl_obj_field_fetch(r->t,o->p2)->t; - if( ft->kind == HPACKED ) { - preg *r = alloc_reg(ctx,RCPU); - op64(ctx,LEA,r,pmem(&p,(CpuReg)rr->id,rt->fields_indexes[o->p2])); - store(ctx,dst,r,true); - break; - } - } - copy_to(ctx,dst,pmem(&p, (CpuReg)rr->id, rt->fields_indexes[o->p2])); - } - break; - case OSetThis: - { - vreg *r = R(0); - hl_runtime_obj *rt = hl_get_obj_rt(r->t); - preg *rr = alloc_cpu(ctx, r, true); - if( ra->t->kind == HSTRUCT ) { - hl_type *ft = hl_obj_field_fetch(r->t,o->p1)->t; - if( ft->kind == HPACKED ) { - hl_runtime_obj *frt = hl_get_obj_rt(ft->tparam); - preg *pra = alloc_cpu(ctx, ra, true); - preg *tmp = alloc_reg(ctx, RCPU_CALL); - int offset = 0; - while( offset < frt->size ) { - int remain = frt->size - offset; - int copy_size = remain >= HL_WSIZE ? HL_WSIZE : (remain >= 4 ? 4 : (remain >= 2 ? 
2 : 1)); - copy(ctx, tmp, pmem(&p, (CpuReg)pra->id, offset), copy_size); - copy(ctx, pmem(&p, (CpuReg)rr->id, rt->fields_indexes[o->p1]+offset), tmp, copy_size); - offset += copy_size; - } - break; - } - } - copy_from(ctx, pmem(&p, (CpuReg)rr->id, rt->fields_indexes[o->p1]), ra); - } - break; - case OCallThis: - { - int nargs = o->p3 + 1; - int *args = (int*)hl_malloc(&ctx->falloc,sizeof(int) * nargs); - int size; - preg *r = alloc_cpu(ctx, R(0), true); - preg *tmp; - tmp = alloc_reg(ctx, RCPU_CALL); - op64(ctx,MOV,tmp,pmem(&p,r->id,0)); // read type - op64(ctx,MOV,tmp,pmem(&p,tmp->id,HL_WSIZE*2)); // read proto - args[0] = 0; - for(i=1;iextra[i-1]; - size = prepare_call_args(ctx,nargs,args,ctx->vregs,0); - op_call(ctx,pmem(&p,tmp->id,o->p2*HL_WSIZE),size); - discard_regs(ctx, false); - store_result(ctx, dst); - } - break; - case OCallMethod: - switch( R(o->extra[0])->t->kind ) { - case HOBJ: { - int size; - preg *r = alloc_cpu(ctx, R(o->extra[0]), true); - preg *tmp; - tmp = alloc_reg(ctx, RCPU_CALL); - op64(ctx,MOV,tmp,pmem(&p,r->id,0)); // read type - op64(ctx,MOV,tmp,pmem(&p,tmp->id,HL_WSIZE*2)); // read proto - size = prepare_call_args(ctx,o->p3,o->extra,ctx->vregs,0); - op_call(ctx,pmem(&p,tmp->id,o->p2*HL_WSIZE),size); - discard_regs(ctx, false); - store_result(ctx, dst); - break; - } - case HVIRTUAL: - // ASM for --> if( hl_vfields(o)[f] ) dst = *hl_vfields(o)[f](o->value,args...); else dst = hl_dyn_call_obj(o->value,field,args,&ret) - { - int size; - int paramsSize; - int jhasfield, jend; - bool need_dyn; - bool obj_in_args = false; - vreg *obj = R(o->extra[0]); - preg *v = alloc_cpu_call(ctx,obj); - preg *r = alloc_reg(ctx,RCPU_CALL); - op64(ctx,MOV,r,pmem(&p,v->id,sizeof(vvirtual)+HL_WSIZE*o->p2)); - op64(ctx,TEST,r,r); - save_regs(ctx); - - if( o->p3 < 6 ) { - XJump_small(JNotZero,jhasfield); - } else { - XJump(JNotZero,jhasfield); - } - - need_dyn = !hl_is_ptr(dst->t) && dst->t->kind != HVOID; - paramsSize = (o->p3 - 1) * HL_WSIZE; - if( need_dyn ) 
paramsSize += sizeof(vdynamic); - if( paramsSize & 15 ) paramsSize += 16 - (paramsSize&15); - op64(ctx,SUB,PESP,pconst(&p,paramsSize)); - op64(ctx,MOV,r,PESP); - - for(i=0;ip3-1;i++) { - vreg *a = R(o->extra[i+1]); - if( hl_is_ptr(a->t) ) { - op64(ctx,MOV,pmem(&p,r->id,i*HL_WSIZE),alloc_cpu(ctx,a,true)); - if( a->current != v ) { - RUNLOCK(a->current); - } else - obj_in_args = true; - } else { - preg *r2 = alloc_reg(ctx,RCPU); - op64(ctx,LEA,r2,&a->stack); - op64(ctx,MOV,pmem(&p,r->id,i*HL_WSIZE),r2); - if( r2 != v ) RUNLOCK(r2); - } - } - - jit_buf(ctx); - - if( !need_dyn ) { - size = begin_native_call(ctx, 5); - set_native_arg(ctx, pconst(&p,0)); - } else { - preg *rtmp = alloc_reg(ctx,RCPU); - op64(ctx,LEA,rtmp,pmem(&p,Esp,paramsSize - sizeof(vdynamic))); - size = begin_native_call(ctx, 5); - set_native_arg(ctx,rtmp); - if( !IS_64 ) RUNLOCK(rtmp); - } - set_native_arg(ctx,r); - set_native_arg(ctx,pconst(&p,obj->t->virt->fields[o->p2].hashed_name)); // fid - set_native_arg(ctx,pconst64(&p,(int_val)obj->t->virt->fields[o->p2].t)); // ftype - set_native_arg(ctx,pmem(&p,v->id,HL_WSIZE)); // o->value - call_native(ctx,hl_dyn_call_obj,size + paramsSize); - if( need_dyn ) { - preg *r = IS_FLOAT(dst) ? 
REG_AT(XMM(0)) : PEAX; - copy(ctx,r,pmem(&p,Esp,HDYN_VALUE - (int)sizeof(vdynamic)),dst->size); - store(ctx, dst, r, false); - } else - store(ctx, dst, PEAX, false); - - XJump_small(JAlways,jend); - patch_jump(ctx,jhasfield); - restore_regs(ctx); - - if( !obj_in_args ) { - // o = o->value hack - if( v->holds ) v->holds->current = NULL; - obj->current = v; - v->holds = obj; - op64(ctx,MOV,v,pmem(&p,v->id,HL_WSIZE)); - size = prepare_call_args(ctx,o->p3,o->extra,ctx->vregs,0); - } else { - // keep o->value in R(f->nregs) - int regids[64]; - preg *pc = alloc_reg(ctx,RCPU_CALL); - vreg *sc = R(f->nregs); // scratch register that we temporary rebind - if( o->p3 >= 63 ) jit_error("assert"); - memcpy(regids, o->extra, o->p3 * sizeof(int)); - regids[0] = f->nregs; - sc->size = HL_WSIZE; - sc->t = &hlt_dyn; - op64(ctx, MOV, pc, pmem(&p,v->id,HL_WSIZE)); - scratch(pc); - sc->current = pc; - pc->holds = sc; - size = prepare_call_args(ctx,o->p3,regids,ctx->vregs,0); - } - - op_call(ctx,r,size); - discard_regs(ctx, false); - store_result(ctx, dst); - patch_jump(ctx,jend); - } - break; - default: - ASSERT(0); - break; - } - break; - case ORethrow: - { - int size = prepare_call_args(ctx,1,&o->p1,ctx->vregs,0); - call_native(ctx,hl_rethrow,size); - } - break; - case OThrow: - { - int size = prepare_call_args(ctx,1,&o->p1,ctx->vregs,0); - call_native(ctx,hl_throw,size); - } - break; - case OLabel: - // NOP for now - discard_regs(ctx,false); - break; - case OGetI8: - case OGetI16: - { - preg *base = alloc_cpu(ctx, ra, true); - preg *offset = alloc_cpu64(ctx, rb, true); - preg *r = alloc_reg(ctx,o->op == OGetI8 ? RCPU_8BITS : RCPU); - op64(ctx,XOR,r,r); - op32(ctx, o->op == OGetI8 ? 
MOV8 : MOV16,r,pmem2(&p,base->id,offset->id,1,0)); - store(ctx, dst, r, true); - } - break; - case OGetMem: - { - #ifndef HL_64 - if (dst->t->kind == HI64) { - error_i64(); - } - #endif - preg *base = alloc_cpu(ctx, ra, true); - preg *offset = alloc_cpu64(ctx, rb, true); - store(ctx, dst, pmem2(&p,base->id,offset->id,1,0), false); - } - break; - case OSetI8: - { - preg *base = alloc_cpu(ctx, dst, true); - preg *offset = alloc_cpu64(ctx, ra, true); - preg *value = alloc_cpu8(ctx, rb, true); - op32(ctx,MOV8,pmem2(&p,base->id,offset->id,1,0),value); - } - break; - case OSetI16: - { - preg *base = alloc_cpu(ctx, dst, true); - preg *offset = alloc_cpu64(ctx, ra, true); - preg *value = alloc_cpu(ctx, rb, true); - op32(ctx,MOV16,pmem2(&p,base->id,offset->id,1,0),value); - } - break; - case OSetMem: - { - preg *base = alloc_cpu(ctx, dst, true); - preg *offset = alloc_cpu64(ctx, ra, true); - preg *value; - switch( rb->t->kind ) { - case HI32: - value = alloc_cpu(ctx, rb, true); - op32(ctx,MOV,pmem2(&p,base->id,offset->id,1,0),value); - break; - case HF32: - value = alloc_fpu(ctx, rb, true); - op32(ctx,MOVSS,pmem2(&p,base->id,offset->id,1,0),value); - break; - case HF64: - value = alloc_fpu(ctx, rb, true); - op32(ctx,MOVSD,pmem2(&p,base->id,offset->id,1,0),value); - break; - case HI64: - case HGUID: - value = alloc_cpu(ctx, rb, true); - op64(ctx,MOV,pmem2(&p,base->id,offset->id,1,0),value); - break; - default: - ASSERT(rb->t->kind); - break; - } - } - break; - case OType: - { - op64(ctx,MOV,alloc_cpu(ctx, dst, false),pconst64(&p,(int_val)(m->code->types + o->p2))); - store(ctx,dst,dst->current,false); - } - break; - case OGetType: - { - int jnext, jend; - preg *r = alloc_cpu(ctx, ra, true); - preg *tmp = alloc_reg(ctx, RCPU); - op64(ctx,TEST,r,r); - XJump_small(JNotZero,jnext); - op64(ctx,MOV, tmp, pconst64(&p,(int_val)&hlt_void)); - XJump_small(JAlways,jend); - patch_jump(ctx,jnext); - op64(ctx, MOV, tmp, pmem(&p,r->id,0)); - patch_jump(ctx,jend); - store(ctx,dst,tmp,true); 
- } - break; - case OGetArray: - { - preg *rdst = IS_FLOAT(dst) ? alloc_fpu(ctx,dst,false) : alloc_cpu(ctx,dst,false); - if( ra->t->kind == HABSTRACT ) { - int osize; - bool isRead = dst->t->kind != HOBJ && dst->t->kind != HSTRUCT; - if( isRead ) - osize = sizeof(void*); - else { - hl_runtime_obj *rt = hl_get_obj_rt(dst->t); - osize = rt->size; - } - preg *idx = alloc_cpu64(ctx, rb, true); - op64(ctx, IMUL, idx, pconst(&p,osize)); - op64(ctx, isRead?MOV:LEA, rdst, pmem2(&p,alloc_cpu(ctx,ra, true)->id,idx->id,1,0)); - store(ctx,dst,dst->current,false); - scratch(idx); - } else { - copy(ctx, rdst, pmem2(&p,alloc_cpu(ctx,ra,true)->id,alloc_cpu64(ctx,rb,true)->id,hl_type_size(dst->t),sizeof(varray)), dst->size); - store(ctx,dst,dst->current,false); - } - } - break; - case OSetArray: - { - if( dst->t->kind == HABSTRACT ) { - int osize; - bool isWrite = rb->t->kind != HOBJ && rb->t->kind != HSTRUCT; - if( isWrite ) { - osize = sizeof(void*); - } else { - hl_runtime_obj *rt = hl_get_obj_rt(rb->t); - osize = rt->size; - } - preg *pdst = alloc_cpu(ctx,dst,true); - preg *pra = alloc_cpu64(ctx,ra,true); - op64(ctx, IMUL, pra, pconst(&p,osize)); - op64(ctx, ADD, pdst, pra); - scratch(pra); - preg *prb = alloc_cpu(ctx,rb,true); - preg *tmp = alloc_reg(ctx, RCPU_CALL); - int offset = 0; - while( offset < osize ) { - int remain = osize - offset; - int copy_size = remain >= HL_WSIZE ? HL_WSIZE : (remain >= 4 ? 4 : (remain >= 2 ? 2 : 1)); - copy(ctx, tmp, pmem(&p, prb->id, offset), copy_size); - copy(ctx, pmem(&p, pdst->id, offset), tmp, copy_size); - offset += copy_size; - } - scratch(pdst); - } else { - preg *rrb = IS_FLOAT(rb) ? alloc_fpu(ctx,rb,true) : alloc_cpu(ctx,rb,true); - copy(ctx, pmem2(&p,alloc_cpu(ctx,dst,true)->id,alloc_cpu64(ctx,ra,true)->id,hl_type_size(rb->t),sizeof(varray)), rrb, rb->size); - } - } - break; - case OArraySize: - { - op32(ctx,MOV,alloc_cpu(ctx,dst,false),pmem(&p,alloc_cpu(ctx,ra,true)->id,ra->t->kind == HABSTRACT ? 
HL_WSIZE + 4 : HL_WSIZE*2)); - store(ctx,dst,dst->current,false); - } - break; - case ORef: - { - scratch(ra->current); - op64(ctx,MOV,alloc_cpu(ctx,dst,false),REG_AT(Ebp)); - if( ra->stackPos < 0 ) - op64(ctx,SUB,dst->current,pconst(&p,-ra->stackPos)); - else - op64(ctx,ADD,dst->current,pconst(&p,ra->stackPos)); - store(ctx,dst,dst->current,false); - } - break; - case OUnref: - copy_to(ctx,dst,pmem(&p,alloc_cpu(ctx,ra,true)->id,0)); - break; - case OSetref: - copy_from(ctx,pmem(&p,alloc_cpu(ctx,dst,true)->id,0),ra); - break; - case ORefData: - switch( ra->t->kind ) { - case HARRAY: - { - preg *r = fetch(ra); - preg *d = alloc_cpu(ctx,dst,false); - op64(ctx,MOV,d,r); - op64(ctx,ADD,d,pconst(&p,sizeof(varray))); - store(ctx,dst,dst->current,false); - } - break; - default: - ASSERT(ra->t->kind); - } - break; - case ORefOffset: - { - preg *d = alloc_cpu(ctx,rb,true); - preg *r2 = alloc_cpu(ctx,dst,false); - preg *r = fetch(ra); - int size = hl_type_size(dst->t->tparam); - op64(ctx,MOV,r2,r); - switch( size ) { - case 1: - break; - case 2: - op64(ctx,SHL,d,pconst(&p,1)); - break; - case 4: - op64(ctx,SHL,d,pconst(&p,2)); - break; - case 8: - op64(ctx,SHL,d,pconst(&p,3)); - break; - default: - op64(ctx,IMUL,d,pconst(&p,size)); - break; - } - op64(ctx,ADD,r2,d); - scratch(d); - store(ctx,dst,dst->current,false); - } - break; - case OToVirtual: - { -# ifdef HL_64 - int size = pad_before_call(ctx, 0); - op64(ctx,MOV,REG_AT(CALL_REGS[1]),fetch(ra)); - op64(ctx,MOV,REG_AT(CALL_REGS[0]),pconst64(&p,(int_val)dst->t)); -# else - int size = pad_before_call(ctx, HL_WSIZE*2); - op32(ctx,PUSH,fetch(ra),UNUSED); - op32(ctx,PUSH,pconst(&p,(int)(int_val)dst->t),UNUSED); -# endif - if( ra->t->kind == HOBJ ) hl_get_obj_rt(ra->t); // ensure it's initialized - call_native(ctx,hl_to_virtual,size); - store(ctx,dst,PEAX,true); - } - break; - case OMakeEnum: - { - hl_enum_construct *c = &dst->t->tenum->constructs[o->p2]; - int_val args[] = { (int_val)dst->t, o->p2 }; - int i; - 
call_native_consts(ctx, hl_alloc_enum, args, 2); - RLOCK(PEAX); - for(i=0;inparams;i++) { - preg *r = fetch(R(o->extra[i])); - copy(ctx, pmem(&p,Eax,c->offsets[i]),r, R(o->extra[i])->size); - RUNLOCK(fetch(R(o->extra[i]))); - if ((i & 15) == 0) jit_buf(ctx); - } - store(ctx, dst, PEAX, true); - } - break; - case OEnumAlloc: - { - int_val args[] = { (int_val)dst->t, o->p2 }; - call_native_consts(ctx, hl_alloc_enum, args, 2); - store(ctx, dst, PEAX, true); - } - break; - case OEnumField: - { - hl_enum_construct *c = &ra->t->tenum->constructs[o->p3]; - preg *r = alloc_cpu(ctx,ra,true); - copy_to(ctx,dst,pmem(&p,r->id,c->offsets[(int)(int_val)o->extra])); - } - break; - case OSetEnumField: - { - hl_enum_construct *c = &dst->t->tenum->constructs[0]; - preg *r = alloc_cpu(ctx,dst,true); - switch( rb->t->kind ) { - case HF64: - { - preg *d = alloc_fpu(ctx,rb,true); - copy(ctx,pmem(&p,r->id,c->offsets[o->p2]),d,8); - break; - } - default: - copy(ctx,pmem(&p,r->id,c->offsets[o->p2]),alloc_cpu(ctx,rb,true),hl_type_size(c->params[o->p2])); - break; - } - } - break; - case ONullCheck: - { - int jz; - preg *r = alloc_cpu(ctx,dst,true); - op64(ctx,TEST,r,r); - XJump_small(JNotZero,jz); - - hl_opcode *next = f->ops + opCount + 1; - bool null_field_access = false; - int hashed_name = 0; - // skip const and operation between nullcheck and access - while( (next < f->ops + f->nops - 1) && (next->op >= OInt && next->op <= ODecr) ) { - next++; - } - if( (next->op == OField && next->p2 == o->p1) || (next->op == OSetField && next->p1 == o->p1) ) { - int fid = next->op == OField ? 
next->p3 : next->p2; - hl_obj_field *f = NULL; - if( dst->t->kind == HOBJ || dst->t->kind == HSTRUCT ) - f = hl_obj_field_fetch(dst->t, fid); - else if( dst->t->kind == HVIRTUAL ) - f = dst->t->virt->fields + fid; - if( f == NULL ) ASSERT(dst->t->kind); - null_field_access = true; - hashed_name = f->hashed_name; - } else if( (next->op >= OCall1 && next->op <= OCallN) && next->p3 == o->p1 ) { - int fid = next->p2 < 0 ? -1 : ctx->m->functions_indexes[next->p2]; - hl_function *cf = ctx->m->code->functions + fid; - const uchar *name = fun_field_name(cf); - null_field_access = true; - hashed_name = hl_hash_gen(name, true); - } - - if( null_field_access ) { - pad_before_call(ctx, HL_WSIZE); - if( hashed_name >= 0 && hashed_name < 256 ) - op64(ctx,PUSH8,pconst(&p,hashed_name),UNUSED); - else - op32(ctx,PUSH,pconst(&p,hashed_name),UNUSED); - } else { - pad_before_call(ctx, 0); - } - - jlist *j = (jlist*)hl_malloc(&ctx->galloc,sizeof(jlist)); - j->pos = BUF_POS(); - j->target = null_field_access ? 
-3 : -1; - j->next = ctx->calls; - ctx->calls = j; - - op64(ctx,MOV,PEAX,pconst64(&p,RESERVE_ADDRESS)); - op_call(ctx,PEAX,-1); - patch_jump(ctx,jz); - } - break; - case OSafeCast: - make_dyn_cast(ctx, dst, ra); - break; - case ODynGet: - { - int size; -# ifdef HL_64 - if( IS_FLOAT(dst) || dst->t->kind == HI64 ) { - size = begin_native_call(ctx,2); - } else { - size = begin_native_call(ctx,3); - set_native_arg(ctx,pconst64(&p,(int_val)dst->t)); - } - set_native_arg(ctx,pconst64(&p,(int_val)hl_hash_utf8(m->code->strings[o->p3]))); - set_native_arg(ctx,fetch(ra)); -# else - preg *r; - r = alloc_reg(ctx,RCPU); - if( IS_FLOAT(dst) || dst->t->kind == HI64 ) { - size = pad_before_call(ctx,HL_WSIZE*2); - } else { - size = pad_before_call(ctx,HL_WSIZE*3); - op64(ctx,MOV,r,pconst64(&p,(int_val)dst->t)); - op64(ctx,PUSH,r,UNUSED); - } - op64(ctx,MOV,r,pconst64(&p,(int_val)hl_hash_utf8(m->code->strings[o->p3]))); - op64(ctx,PUSH,r,UNUSED); - op64(ctx,PUSH,fetch(ra),UNUSED); -# endif - call_native(ctx,get_dynget(dst->t),size); - store_result(ctx,dst); - } - break; - case ODynSet: - { - int size; -# ifdef HL_64 - switch( rb->t->kind ) { - case HF32: - case HF64: - size = begin_native_call(ctx, 3); - set_native_arg_fpu(ctx,fetch(rb),rb->t->kind == HF32); - set_native_arg(ctx,pconst64(&p,hl_hash_gen(hl_get_ustring(m->code,o->p2),true))); - set_native_arg(ctx,fetch(dst)); - call_native(ctx,get_dynset(rb->t),size); - break; - case HI64: - case HGUID: - size = begin_native_call(ctx, 3); - set_native_arg(ctx,fetch(rb)); - set_native_arg(ctx,pconst64(&p,hl_hash_gen(hl_get_ustring(m->code,o->p2),true))); - set_native_arg(ctx,fetch(dst)); - call_native(ctx,get_dynset(rb->t),size); - break; - default: - size = begin_native_call(ctx,4); - set_native_arg(ctx,fetch(rb)); - set_native_arg(ctx,pconst64(&p,(int_val)rb->t)); - set_native_arg(ctx,pconst64(&p,hl_hash_gen(hl_get_ustring(m->code,o->p2),true))); - set_native_arg(ctx,fetch(dst)); - call_native(ctx,get_dynset(rb->t),size); - break; - 
} -# else - switch( rb->t->kind ) { - case HF32: - size = pad_before_call(ctx, HL_WSIZE*2 + sizeof(float)); - push_reg(ctx,rb); - op32(ctx,PUSH,pconst64(&p,hl_hash_gen(hl_get_ustring(m->code,o->p2),true)),UNUSED); - op32(ctx,PUSH,fetch(dst),UNUSED); - call_native(ctx,get_dynset(rb->t),size); - break; - case HF64: - case HI64: - case HGUID: - size = pad_before_call(ctx, HL_WSIZE*2 + sizeof(double)); - push_reg(ctx,rb); - op32(ctx,PUSH,pconst64(&p,hl_hash_gen(hl_get_ustring(m->code,o->p2),true)),UNUSED); - op32(ctx,PUSH,fetch(dst),UNUSED); - call_native(ctx,get_dynset(rb->t),size); - break; - default: - size = pad_before_call(ctx, HL_WSIZE*4); - op32(ctx,PUSH,fetch32(ctx,rb),UNUSED); - op32(ctx,PUSH,pconst64(&p,(int_val)rb->t),UNUSED); - op32(ctx,PUSH,pconst64(&p,hl_hash_gen(hl_get_ustring(m->code,o->p2),true)),UNUSED); - op32(ctx,PUSH,fetch(dst),UNUSED); - call_native(ctx,get_dynset(rb->t),size); - break; - } -# endif - } - break; - case OTrap: - { - int size, jenter, jtrap; - int offset = 0; - int trap_size = (sizeof(hl_trap_ctx) + 15) & 0xFFF0; - hl_trap_ctx *t = NULL; -# ifndef HL_THREADS - if( tinf == NULL ) tinf = hl_get_thread(); // single thread -# endif - -# ifdef HL_64 - preg *trap = REG_AT(CALL_REGS[0]); -# else - preg *trap = PEAX; -# endif - RLOCK(trap); - - preg *treg = alloc_reg(ctx, RCPU); - if( !tinf ) { - call_native(ctx, hl_get_thread, 0); - op64(ctx,MOV,treg,PEAX); - offset = (int)(int_val)&tinf->trap_current; - } else { - offset = 0; - op64(ctx,MOV,treg,pconst64(&p,(int_val)&tinf->trap_current)); - } - op64(ctx,MOV,trap,pmem(&p,treg->id,offset)); - op64(ctx,SUB,PESP,pconst(&p,trap_size)); - op64(ctx,MOV,pmem(&p,Esp,(int)(int_val)&t->prev),trap); - op64(ctx,MOV,trap,PESP); - op64(ctx,MOV,pmem(&p,treg->id,offset),trap); - - /* - trap E,@catch - catch g - catch g2 - ... 
- @:catch - - // Before haxe 5 - This is a bit hackshish : we want to detect the type of exception filtered by the catch so we check the following - sequence of HL opcodes: - - trap E,@catch - ... - @catch: - global R, _ - call _, ???(R,E) - - ??? is expected to be hl.BaseType.check - */ - hl_opcode *cat = f->ops + opCount + 1; - hl_opcode *next = f->ops + opCount + 1 + o->p2; - hl_opcode *next2 = f->ops + opCount + 2 + o->p2; - if( cat->op == OCatch || (next->op == OGetGlobal && next2->op == OCall2 && next2->p3 == next->p1 && dst->stack.id == (int)(int_val)next2->extra) ) { - int gindex = cat->op == OCatch ? cat->p1 : next->p2; - hl_type *gt = m->code->globals[gindex]; - while( gt->kind == HOBJ && gt->obj->super ) gt = gt->obj->super; - if( gt->kind == HOBJ && gt->obj->nfields && gt->obj->fields[0].t->kind == HTYPE ) { - void *addr = m->globals_data + m->globals_indexes[gindex]; -# ifdef HL_64 - op64(ctx,MOV,treg,pconst64(&p,(int_val)addr)); - op64(ctx,MOV,treg,pmem(&p,treg->id,0)); -# else - op64(ctx,MOV,treg,paddr(&p,addr)); -# endif - } else - op64(ctx,MOV,treg,pconst(&p,0)); - } else { - op64(ctx,MOV,treg,pconst(&p,0)); - } - op64(ctx,MOV,pmem(&p,Esp,(int)(int_val)&t->tcheck),treg); - - // On Win64 setjmp actually takes two arguments - // the jump buffer and the frame pointer (or the stack pointer if there is no FP) -#if defined(HL_WIN) && defined(HL_64) - size = begin_native_call(ctx, 2); - set_native_arg(ctx, REG_AT(Ebp)); -#else - size = begin_native_call(ctx, 1); -#endif - set_native_arg(ctx,trap); -#ifdef HL_MINGW - call_native(ctx,_setjmp,size); -#else - call_native(ctx,setjmp,size); -#endif - op64(ctx,TEST,PEAX,PEAX); - XJump_small(JZero,jenter); - op64(ctx,ADD,PESP,pconst(&p,trap_size)); - if( !tinf ) { - call_native(ctx, hl_get_thread, 0); - op64(ctx,MOV,PEAX,pmem(&p, Eax, (int)(int_val)&tinf->exc_value)); - } else { - op64(ctx,MOV,PEAX,pconst64(&p,(int_val)&tinf->exc_value)); - op64(ctx,MOV,PEAX,pmem(&p, Eax, 0)); - } - store(ctx,dst,PEAX,false); - - 
jtrap = do_jump(ctx,OJAlways,false); - register_jump(ctx,jtrap,(opCount + 1) + o->p2); - patch_jump(ctx,jenter); - } - break; - case OEndTrap: - { - int trap_size = (sizeof(hl_trap_ctx) + 15) & 0xFFF0; - hl_trap_ctx *tmp = NULL; - preg *addr,*r; - int offset; - if (!tinf) { - call_native(ctx, hl_get_thread, 0); - addr = PEAX; - RLOCK(addr); - offset = (int)(int_val)&tinf->trap_current; - } else { - offset = 0; - addr = alloc_reg(ctx, RCPU); - op64(ctx, MOV, addr, pconst64(&p, (int_val)&tinf->trap_current)); - } - r = alloc_reg(ctx, RCPU); - op64(ctx, MOV, r, pmem(&p,addr->id,offset)); - op64(ctx, MOV, r, pmem(&p,r->id,(int)(int_val)&tmp->prev)); - op64(ctx, MOV, pmem(&p,addr->id, offset), r); -# ifdef HL_WIN - // erase eip (prevent false positive) - { - _JUMP_BUFFER *b = NULL; -# ifdef HL_64 - op64(ctx,MOV,pmem(&p,Esp,(int)(int_val)&(b->Rip)),PEAX); -# else - op64(ctx,MOV,pmem(&p,Esp,(int)&(b->Eip)),PEAX); -# endif - } -# endif - op64(ctx,ADD,PESP,pconst(&p,trap_size)); - } - break; - case OEnumIndex: - { - preg *r = alloc_reg(ctx,RCPU); - op64(ctx,MOV,r,pmem(&p,alloc_cpu(ctx,ra,true)->id,HL_WSIZE)); - store(ctx,dst,r,true); - break; - } - break; - case OSwitch: - { - int jdefault; - int i; - preg *r = alloc_cpu(ctx, dst, true); - preg *r2 = alloc_reg(ctx, RCPU); - op32(ctx, CMP, r, pconst(&p,o->p2)); - XJump(JUGte,jdefault); - // r2 = r * 5 + eip -# ifdef HL_64 - op64(ctx, XOR, r2, r2); -# endif - op32(ctx, MOV, r2, r); - op32(ctx, SHL, r2, pconst(&p,2)); - op32(ctx, ADD, r2, r); -# ifdef HL_64 - preg *tmp = alloc_reg(ctx, RCPU); - op64(ctx, MOV, tmp, pconst64(&p,RESERVE_ADDRESS)); -# else - op64(ctx, ADD, r2, pconst64(&p,RESERVE_ADDRESS)); -# endif - { - jlist *s = (jlist*)hl_malloc(&ctx->galloc, sizeof(jlist)); - s->pos = BUF_POS() - sizeof(void*); - s->next = ctx->switchs; - ctx->switchs = s; - } -# ifdef HL_64 - op64(ctx, ADD, r2, tmp); -# endif - op64(ctx, JMP, r2, UNUSED); - for(i=0;ip2;i++) { - int j = do_jump(ctx,OJAlways,false); - 
register_jump(ctx,j,(opCount + 1) + o->extra[i]); - if( (i & 15) == 0 ) jit_buf(ctx); - } - patch_jump(ctx, jdefault); - } - break; - case OGetTID: - op32(ctx, MOV, alloc_cpu(ctx,dst,false), pmem(&p,alloc_cpu(ctx,ra,true)->id,0)); - store(ctx,dst,dst->current,false); - break; - case OAssert: - { - pad_before_call(ctx, 0); - jlist *j = (jlist*)hl_malloc(&ctx->galloc,sizeof(jlist)); - j->pos = BUF_POS(); - j->target = -2; - j->next = ctx->calls; - ctx->calls = j; - - op64(ctx,MOV,PEAX,pconst64(&p,RESERVE_ADDRESS)); - op_call(ctx,PEAX,-1); - } - break; - case ONop: - break; - case OPrefetch: - { - preg *r = alloc_cpu(ctx, dst, true); - if( o->p2 > 0 ) { - switch( dst->t->kind ) { - case HOBJ: - case HSTRUCT: - { - hl_runtime_obj *rt = hl_get_obj_rt(dst->t); - preg *r2 = alloc_reg(ctx, RCPU); - op64(ctx, LEA, r2, pmem(&p, r->id, rt->fields_indexes[o->p2-1])); - r = r2; - } - break; - default: - ASSERT(dst->t->kind); - break; - } - } - switch( o->p3 ) { - case 0: - op64(ctx, PREFETCHT0, pmem(&p,r->id,0), UNUSED); - break; - case 1: - op64(ctx, PREFETCHT1, pmem(&p,r->id,0), UNUSED); - break; - case 2: - op64(ctx, PREFETCHT2, pmem(&p,r->id,0), UNUSED); - break; - case 3: - op64(ctx, PREFETCHNTA, pmem(&p,r->id,0), UNUSED); - break; - case 4: - op64(ctx, PREFETCHW, pmem(&p,r->id,0), UNUSED); - break; - default: - ASSERT(o->p3); - break; - } - } - break; - case OAsm: - { - switch( o->p1 ) { - case 0: // byte output - B(o->p2); - break; - case 1: // scratch cpu reg - scratch(REG_AT(o->p2)); - break; - case 2: // read vm reg - rb--; - copy(ctx, REG_AT(o->p2), &rb->stack, rb->size); - scratch(REG_AT(o->p2)); - break; - case 3: // write vm reg - rb--; - copy(ctx, &rb->stack, REG_AT(o->p2), rb->size); - scratch(rb->current); - break; - case 4: - if( ctx->totalRegsSize != 0 ) - hl_fatal("Asm naked function should not have local variables"); - if( opCount != 0 ) - hl_fatal("Asm naked function should be on first opcode"); - ctx->buf.b -= BUF_POS() - ctx->functionPos; // reset to our 
function start - break; - default: - ASSERT(o->p1); - break; - } - } - break; - case OCatch: - // Only used by OTrap typing - break; - default: - jit_error(hl_op_name(o->op)); - break; - } - // we are landing at this position, assume we have lost our registers - if( ctx->opsPos[opCount+1] == -1 ) - discard_regs(ctx,true); - ctx->opsPos[opCount+1] = BUF_POS(); - - // write debug infos - size = BUF_POS() - codePos; - if( debug16 && size > 0xFF00 ) { - debug32 = malloc(sizeof(int) * (f->nops + 1)); - for(i=0;icurrentPos;i++) - debug32[i] = debug16[i]; - free(debug16); - debug16 = NULL; - } - if( debug16 ) debug16[ctx->currentPos] = (unsigned short)size; else if( debug32 ) debug32[ctx->currentPos] = size; - - } - // patch jumps - { - jlist *j = ctx->jumps; - while( j ) { - *(int*)(ctx->startBuf + j->pos) = ctx->opsPos[j->target] - (j->pos + 4); - j = j->next; - } - ctx->jumps = NULL; - } - int codeEndPos = BUF_POS(); - // add nops padding - jit_nops(ctx); - // clear regs - for(i=0;iholds = NULL; - r->lock = 0; - } - // save debug infos - if( ctx->debug ) { - int fid = (int)(f - m->code->functions); - ctx->debug[fid].start = codePos; - ctx->debug[fid].offsets = debug32 ? 
(void*)debug32 : (void*)debug16; - ctx->debug[fid].large = debug32 != NULL; - } - // unwind info -#ifdef WIN64_UNWIND_TABLES - int uw_idx = ctx->nunwind++; - ctx->unwind_table[uw_idx].BeginAddress = codePos; - ctx->unwind_table[uw_idx].EndAddress = codeEndPos; - ctx->unwind_table[uw_idx].UnwindData = ctx->unwind_offset; -#endif - // reset tmp allocator - hl_free(&ctx->falloc); - return codePos; -} - -static void *get_wrapper( hl_type *t ) { - return call_jit_hl2c; -} - -void hl_jit_patch_method( void *old_fun, void **new_fun_table ) { - // mov eax, addr - // jmp [eax] - unsigned char *b = (unsigned char*)old_fun; - unsigned long long addr = (unsigned long long)(int_val)new_fun_table; -# ifdef HL_64 - *b++ = 0x48; - *b++ = 0xB8; - *b++ = (unsigned char)addr; - *b++ = (unsigned char)(addr>>8); - *b++ = (unsigned char)(addr>>16); - *b++ = (unsigned char)(addr>>24); - *b++ = (unsigned char)(addr>>32); - *b++ = (unsigned char)(addr>>40); - *b++ = (unsigned char)(addr>>48); - *b++ = (unsigned char)(addr>>56); -# else - *b++ = 0xB8; - *b++ = (unsigned char)addr; - *b++ = (unsigned char)(addr>>8); - *b++ = (unsigned char)(addr>>16); - *b++ = (unsigned char)(addr>>24); -# endif - *b++ = 0xFF; - *b++ = 0x20; -} - -static void missing_closure() { - hl_error("Missing static closure"); -} - -void *hl_jit_code( jit_ctx *ctx, hl_module *m, int *codesize, hl_debug_infos **debug, hl_module *previous ) { - jlist *c; - int size = BUF_POS(); - unsigned char *code; - if( size & 4095 ) size += 4096 - (size&4095); - code = (unsigned char*)hl_alloc_executable_memory(size); - if( code == NULL ) return NULL; - memcpy(code,ctx->startBuf,BUF_POS()); - *codesize = size; - *debug = ctx->debug; - if( !call_jit_c2hl ) { - call_jit_c2hl = code + ctx->c2hl; - call_jit_hl2c = code + ctx->hl2c; - hl_setup.get_wrapper = get_wrapper; - hl_setup.static_call = callback_c2hl; - hl_setup.static_call_ref = true; -# ifdef JIT_CUSTOM_LONGJUMP - hl_setup.throw_jump = (void(*)(jmp_buf, int))(code + 
ctx->longjump); -# endif - } -#ifdef WIN64_UNWIND_TABLES - m->unwind_table = ctx->unwind_table; - RtlAddFunctionTable(m->unwind_table, ctx->nunwind, (DWORD64)code); -#endif - if( !ctx->static_function_offset ) { - int i; - ctx->static_function_offset = true; - for(i=0;i<(int)(sizeof(ctx->static_functions)/sizeof(void*));i++) - ctx->static_functions[i] = (void*)(code + (int)(int_val)ctx->static_functions[i]); - } - // patch calls - c = ctx->calls; - while( c ) { - void *fabs; - if( c->target < 0 ) - fabs = ctx->static_functions[-c->target-1]; - else { - fabs = m->functions_ptrs[c->target]; - if( fabs == NULL ) { - // read absolute address from previous module - int old_idx = m->hash->functions_hashes[m->functions_indexes[c->target]]; - if( old_idx < 0 ) - return NULL; - fabs = previous->functions_ptrs[(previous->code->functions + old_idx)->findex]; - } else { - // relative - fabs = (unsigned char*)code + (int)(int_val)fabs; - } - } - if( (code[c->pos]&~3) == (IS_64?0x48:0xB8) || code[c->pos] == 0x68 ) // MOV : absolute | PUSH - *(void**)(code + c->pos + (IS_64?2:1)) = fabs; - else { - int_val delta = (int_val)fabs - (int_val)code - (c->pos + 5); - int rpos = (int)delta; - if( (int_val)rpos != delta ) { - printf("Target code too far too rebase\n"); - return NULL; - } - *(int*)(code + c->pos + 1) = rpos; - } - c = c->next; - } - // patch switchs - c = ctx->switchs; - while( c ) { - *(void**)(code + c->pos) = code + c->pos + (IS_64 ? 
14 : 6); - c = c->next; - } - // patch closures - { - vclosure *c = ctx->closure_list; - while( c ) { - vclosure *next; - int fidx = (int)(int_val)c->fun; - void *fabs = m->functions_ptrs[fidx]; - if( fabs == NULL ) { - // read absolute address from previous module - int old_idx = m->hash->functions_hashes[m->functions_indexes[fidx]]; - if( old_idx < 0 ) - fabs = missing_closure; - else - fabs = previous->functions_ptrs[(previous->code->functions + old_idx)->findex]; - } else { - // relative - fabs = (unsigned char*)code + (int)(int_val)fabs; - } - c->fun = fabs; - next = (vclosure*)c->value; - c->value = NULL; - c = next; - } - } - return code; -} - diff --git a/src/jit.h b/src/jit.h new file mode 100644 index 000000000..69c609547 --- /dev/null +++ b/src/jit.h @@ -0,0 +1,302 @@ +/* + * Copyright (C)2005-2016 Haxe Foundation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */
+#ifndef JIT_H
+#define JIT_H
+
+#include
+#include
+
+typedef enum { // operation kinds; IS_CALL() below singles out the three CALL_* values
+	LOAD_ADDR,
+	LOAD_CONST,
+	LOAD_ARG,
+	LOAD_FUN,
+	STORE,
+	LEA,
+	TEST,
+	CMP,
+	JCOND,
+	JUMP,
+	JUMP_TABLE,
+	BINOP,
+	UNOP,
+	CONV,
+	CONV_UNSIGNED,
+	RET,
+	CALL_PTR,
+	CALL_REG,
+	CALL_FUN,
+	MOV,
+	CMOV,
+	XCHG,
+	CXCHG,
+	PUSH_CONST,
+	PUSH,
+	POP,
+	ALLOC_STACK,
+	PREFETCH,
+	DEBUG_BREAK,
+	BLOCK,
+	ENTER,
+	STACK_OFFS,
+	CATCH,
+	ADDRESS,
+	NOP,
+} emit_op;
+
+typedef enum { // operand modes; IS_FLOAT() below is true for M_F64 and M_F32 only
+	M_NONE,
+	M_UI8,
+	M_UI16,
+	M_I32,
+	M_PTR,
+	M_F64,
+	M_F32,
+	M_VOID,
+	M_NORET,
+} emit_mode;
+
+typedef int ereg; // encoded register/value reference; bit layout is described before STACK_REG
+
+typedef struct {
+	union {
+		struct {
+			unsigned char op; // presumably an emit_op value — TODO confirm
+			unsigned char mode; // presumably an emit_mode value — TODO confirm
+			unsigned char nargs;
+			unsigned char _unused;
+		};
+		int header; // overlays op/mode/nargs/_unused as a single int
+	};
+	int size_offs;
+	union {
+		struct {
+			ereg a;
+			ereg b;
+		};
+		uint64 value; // overlays the a/b pair
+	};
+} einstr;
+
+typedef enum { // kind tag of an ereg; REG_KIND() extracts it with mask 0x70000000
+	R_VALUE = 0,
+	R_REG = 0x40000000,
+	R_REG_PTR = 0x50000000,
+	R_CONST = 0x60000000,
+	R_PHI = 0x70000000,
+} rkind;
+
+// reg representation is :
+// higher bits
+//   0000 = positive value (for IR only VXXX)
+//   X100 = native register, lower 7 bits is the register, bits 8-28 are the offset (21 bits)
+//   X101 = same as above, but indirect address
+//   X110 = small constant value stored in offset
+//   1111 = negative value (for IR phi PXXX)
+//   10XX = unused
+
+#define STACK_REG 5 // register id used by MK_STACK_REG / MK_STACK_OFFS
+
+#define UNUSED ((ereg)0) // the value IS_NULL() tests for
+#define MK_REG(v,kind) (((v)&0x7F) | (kind)) // low 7 bits of v combined with a kind tag
+#define MK_REG_VAL(v,kind,val) (MK_REG(v,kind) | (((val) << 7)&0x8FFFFF80)) // MK_REG plus val stored above the register id
+
+#define REG_KIND(r) ((r)&0x70000000) // kind tag, comparable with rkind values
+#define REG_REG(r) ((r)&0x7F) // register id (low 7 bits)
+#define REG_VALUE(r) (((int)(((r) & 0x8000000) ? ((r) | 0xF0000000) : ((r)&0x0FFFFFFF)))>>7) // offset field, sign-extended when bit 0x8000000 is set
+#define REG_PTR(r) _reg_chk(r,R_REG,(r)|R_REG_PTR) // r must be R_REG (else jit_assert); yields r with the R_REG_PTR bits set
+#define REG_ADD_OFFSET(r,offs) _reg_chk(r,R_REG_PTR,MK_REG_VAL(r,REG_KIND(r),REG_VALUE(r)+(offs))) // r must be R_REG_PTR; yields r with offs added to its offset field
+#define REG_IS_VAL(r) (REG_KIND(r) == R_VALUE || REG_KIND(r) == R_PHI)
+
+#define IS_NULL(r) ((r) == 0)
+#define IS_REG(r) (REG_KIND(r) == R_REG)
+#define MK_STACK_REG(v) MK_REG_VAL(STACK_REG,R_REG_PTR,v) // R_REG_PTR on STACK_REG with offset v
+#define MK_STACK_OFFS(v) MK_REG_VAL(STACK_REG,R_REG,v) // R_REG on STACK_REG with offset v
+#define MK_CONST(v) MK_REG_VAL(0,R_CONST,v) // R_CONST carrying v in the offset field
+#define MK_ADDR(reg,offs) MK_REG_VAL(reg,R_REG_PTR,offs) // R_REG_PTR on reg with offset offs
+
+#define IS_CALL(op) ((op) == CALL_PTR || (op) == CALL_REG || (op) == CALL_FUN)
+#define IS_FLOAT(mode) ((mode) == M_F64 || (mode) == M_F32)
+
+#define MAX_ARGS 16
+
+#if defined(HL_WIN_CALL) && defined(HL_64)
+#	define IS_WINCALL64 1
+#else
+#	define IS_WINCALL64 0
+#endif
+
+typedef struct {
+	int *data;
+	int max; // presumably the capacity of data — TODO confirm
+	int cur; // presumably the number of entries in use — TODO confirm
+} int_alloc;
+
+typedef struct _ephi ephi;
+
+struct _ephi {
+	ereg value;
+	int nvalues;
+	emit_mode mode;
+	ereg *values;
+	int *blocks;
+};
+
+typedef struct _eblock {
+	int start_pos;
+	int end_pos;
+	int next_count;
+	int pred_count;
+	int phi_count;
+	int *nexts;
+	int *preds;
+	ephi *phis;
+} eblock;
+
+typedef struct _emit_ctx emit_ctx;
+typedef struct _regs_ctx regs_ctx;
+typedef struct _code_ctx code_ctx;
+typedef struct _jit_ctx jit_ctx;
+
+typedef struct { // one register class; the AArch64 backend fills one for integers and one for floats
+	int nscratchs; // number of entries in scratch[]
+	int npersists; // number of entries in persist[]
+	int nargs; // number of entries in arg[]
+	ereg ret; // the AArch64 backend sets this to its first scratch register (X0 / V0)
+	ereg *scratch;
+	ereg *persist;
+	ereg *arg;
+} reg_config;
+
+typedef struct {
+	reg_config regs; // integer registers
+	reg_config floats; // float registers
+	ereg stack_reg;
+	ereg stack_pos;
+	int stack_align;
+	// Minimum bytes consumed by each stack argument. Defaults to HL_WSIZE
+	// when 0. Backends like AArch64 set this to 16 because each PUSH must
+	// move SP by 16 bytes to keep SP 16-byte aligned (any [SP, ...] access
+	// with a misaligned SP traps under EL0).
+	int stack_arg_size;
+	int debug_prefix_size; // the AArch64 backend sets 4 when GEN_DEBUG is defined
+	ereg req_bit_shifts; // register pinning for shifts; the AArch64 backend sets 0 (no constraint)
+	ereg req_div_a; // register pinning for division; the AArch64 backend sets 0 (no constraint)
+	ereg req_div_b;
+} regs_config;
+
+typedef struct {
+	int c2hl;
+	int hl2c;
+} jit_special_funs;
+
+struct _jit_ctx {
+	hl_module *mod;
+	hl_function *fun;
+	hl_alloc falloc; // the allocator DEF_ALLOC points at
+	hl_alloc galloc;
+	emit_ctx *emit;
+	regs_ctx *regs;
+	code_ctx *code;
+	regs_config cfg;
+	// emit output
+	int instr_count;
+	int block_count;
+	int value_count;
+	int phi_count;
+	einstr *instrs;
+	eblock *blocks;
+	int *values_writes;
+	int *emit_pos_map;
+	// regs output
+	int reg_instr_count;
+	einstr *reg_instrs;
+	ereg *reg_writes;
+	int *reg_pos_map;
+	// codegen output
+	int code_size;
+	unsigned char *code_instrs;
+	int *code_pos_map;
+	jit_special_funs code_funs;
+	// accum output
+	int fdef_index;
+	int out_pos;
+	int out_max;
+	unsigned char *output;
+	unsigned char *final_code;
+};
+
+jit_ctx *hl_jit_alloc();
+void hl_jit_free( jit_ctx *ctx, h_bool can_reset );
+void hl_jit_reset( jit_ctx *ctx, hl_module *m );
+void hl_jit_init( jit_ctx *ctx, hl_module *m );
+int hl_jit_function( jit_ctx *ctx, hl_module *m, hl_function *f );
+void hl_jit_define_function( jit_ctx *ctx, int start, int size );
+
+void hl_jit_null_field_access( int fhash );
+void hl_jit_assert();
+void *hl_jit_wrapper_ptr( vclosure_wrapper *c, char *stack_args, void **regs );
+double hl_jit_wrapper_d( vclosure_wrapper *c, char *stack_args, void **regs );
+
+// emit & dump
+void hl_emit_dump( jit_ctx *ctx );
+const char *hl_emit_regstr( ereg v, emit_mode m );
+void hl_emit_store_args( emit_ctx *ctx, einstr *e, ereg *args, int count );
+void hl_emit_remap_jumps( emit_ctx *ctx, void *jumps, einstr *instrs, int *pos_map );
+ereg *hl_emit_get_args( emit_ctx *ctx, einstr *e );
+ereg **hl_emit_get_regs( einstr *e, int *count );
+void hl_emit_reg_iter( jit_ctx *jit, einstr *e, void *ctx, void (*iter_reg)( void *, ereg * ) );
+extern int hl_emit_mode_sizes[];
+extern bool hl_jit_dump_bin;
+#define val_str(v,m) hl_emit_regstr(v,m)
+
+#ifdef HL_DEBUG
+#	define JIT_DEBUG
+#endif
+
+#define jit_error(msg) { hl_jit_error(msg,__func__,__LINE__); hl_debug_break(); exit(-1); } // expands to a bare { } block that ends in exit(-1)
+#define jit_assert() jit_error("")
+
+#if defined(JIT_DEBUG)
+#	define jit_debug(...) printf(__VA_ARGS__)
+#else
+#	define jit_debug(...)
+#endif
+
+#define DEF_ALLOC &ctx->jit->falloc // needs a local `ctx` with a `jit` member in scope
+
+#define jit_pad_size(size,k) ((k == 0) ? 0 : ((-(size)) & (k - 1))) // bytes to add to size to reach a multiple of k (assumes k is a power of two); 0 when k == 0. NOTE(review): k is not parenthesized
+
+static void __ignore( void *value ) {} // empty body
+
+void hl_jit_error( const char *msg, const char *func, int line );
+
+void *hl_jit_code( jit_ctx *ctx, hl_module *m, int *codesize, hl_debug_infos **debug, hl_module *previous );
+void hl_jit_patch_method( void *old_fun, void **new_fun_table );
+
+static ereg _reg_chk( ereg r, rkind k, ereg ret ) { // returns ret unchanged after checking that r's kind tag is k
+	if( REG_KIND(r) != k ) jit_assert();
+	return ret;
+}
+
+
+#endif
diff --git a/src/jit_aarch64.c b/src/jit_aarch64.c
new file mode 100644
index 000000000..397f67104
--- /dev/null
+++ b/src/jit_aarch64.c
@@ -0,0 +1,1999 @@
+/*
+ * Copyright (C)2015-2026 Haxe Foundation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * AArch64 JIT backend for the HL2 IR JIT.
+ *
+ * Phase 2 + 3: function shell + simple ops + arithmetic + memory + conversions.
+ * Calls/trampolines and the constant pool are still phase 4.
+ */
+
+#if !defined(__aarch64__) && !defined(_M_ARM64)
+#	error "This file is for AArch64 architecture only."
+#endif
+
+#include
+#include
+#include "jit_aarch64_emit.h"
+#include
+#include
+
+#ifdef HL_DEBUG
+#	define GEN_DEBUG
+#endif
+
+// IR ereg encoding 5 is reserved (`STACK_REG` in jit.h) — the regs phase uses
+// it to label stack-bound vregs. ARM hardware register X5 happens to use the
+// same hardware encoding, which would create a fatal aliasing if we exposed
+// X5 through the regs configuration as encoding 5. Re-encode X5 as the
+// otherwise-unused IR slot 32; gpr_id maps it back to hardware X5 at emit
+// time. (FP regs encode in the 64..127 range and have no such conflict.)
+#define X5_LOGICAL 32
+
+#define R(id) MK_REG(id, R_REG) // integer register id as an R_REG ereg
+#define V(id) MK_REG((id) + 64, R_REG) // float register id, stored as 64 + id
+
+// ============================================================================
+// Register class declaration (AAPCS64, Linux + Apple)
+// ============================================================================
+
+void hl_jit_init_regs( regs_config *cfg ) { // fills cfg with the AArch64 register tables and stack settings
+	// Integer registers.
+	// X15/X16/X17 reserved as backend-private temporaries. X16/X17 are the
+	// linker IP0/IP1 (the Apple dynamic linker may clobber them at indirect
+	// branches). X15 (ARM_TMP3) is reserved as a third scratch for op
+	// handlers that need three independent temps at once — notably emit_store
+	// when both base and data are spilled (TMP1 holds base, TMP2 holds data,
+	// and emit_ld_st still needs a temp for the offset-register encoding).
+	// X18 reserved on Apple/Windows as platform register; conservatively skipped on Linux too.
+	// X29 = FP, X30 = LR, X31 = SP/XZR — special-purpose.
+	static int scratch_regs[] = {
+		R(X0), R(X1), R(X2), R(X3), R(X4), R(X5_LOGICAL), R(X6), R(X7),
+		R(X8), R(X9), R(X10), R(X11), R(X12), R(X13), R(X14)
+	};
+	static int persist_regs[] = {
+		R(X19), R(X20), R(X21), R(X22), R(X23),
+		R(X24), R(X25), R(X26), R(X27), R(X28)
+	};
+	static int arg_regs[] = {
+		R(X0), R(X1), R(X2), R(X3), R(X4), R(X5_LOGICAL), R(X6), R(X7)
+	};
+	cfg->regs.ret = scratch_regs[0]; // X0
+	cfg->regs.nscratchs = sizeof(scratch_regs) / sizeof(int);
+	cfg->regs.npersists = sizeof(persist_regs) / sizeof(int);
+	cfg->regs.nargs = sizeof(arg_regs) / sizeof(int);
+	cfg->regs.scratch = (ereg*)scratch_regs; // the tables are static, so these pointers stay valid after return
+	cfg->regs.persist = (ereg*)persist_regs;
+	cfg->regs.arg = (ereg*)arg_regs;
+
+	// Float registers (V0-V31; lower 64 bits of V8-V15 are callee-saved per AAPCS64).
+	static int float_scratch[] = {
+		V(0), V(1), V(2), V(3), V(4), V(5), V(6), V(7),
+		V(16), V(17), V(18), V(19), V(20), V(21), V(22), V(23),
+		V(24), V(25), V(26), V(27), V(28), V(29), V(30), V(31)
+	};
+	static int float_persist[] = {
+		V(8), V(9), V(10), V(11), V(12), V(13), V(14), V(15)
+	};
+	static int float_args[] = {
+		V(0), V(1), V(2), V(3), V(4), V(5), V(6), V(7)
+	};
+	cfg->floats.ret = float_scratch[0]; // V0
+	cfg->floats.nscratchs = sizeof(float_scratch) / sizeof(int);
+	cfg->floats.npersists = sizeof(float_persist) / sizeof(int);
+	cfg->floats.nargs = sizeof(float_args) / sizeof(int);
+	cfg->floats.scratch = (ereg*)float_scratch;
+	cfg->floats.persist = (ereg*)float_persist;
+	cfg->floats.arg = (ereg*)float_args;
+
+	// ARM has no register pinning constraints for shifts (LSLV/LSRV/ASRV accept
+	// any source) or division (SDIV/UDIV write any destination).
+	cfg->req_bit_shifts = 0;
+	cfg->req_div_a = 0;
+	cfg->req_div_b = 0;
+
+	cfg->stack_reg = R(SP_REG); // X31 (SP)
+	cfg->stack_pos = R(FP); // X29
+	cfg->stack_align = 16; // AAPCS64 mandates
+	// Each stack-passed arg consumes 16 bytes to keep SP 16-byte aligned —
+	// any [SP, ...] memory access with misaligned SP traps under EL0
+	// alignment enforcement on Linux/macOS. emit_push correspondingly moves
+	// SP by 16 per arg, so the IR's call-arg accounting matches.
+	cfg->stack_arg_size = 16;
+
+#ifdef GEN_DEBUG // without GEN_DEBUG, debug_prefix_size is not written by this function
+	cfg->debug_prefix_size = 4; // ARM instructions are fixed 4 bytes
+#endif
+}
+
+// ============================================================================
+// Disassembly helper
+// ============================================================================
+
+const char *hl_natreg_str( int reg, emit_mode m ) { // printable name of native register `reg` as seen in mode m
+	static char out[16]; // shared static buffer: the returned string is overwritten by the next call
+	int r = REG_REG(reg);
+	// Reverse the remappings used in gpr_id so debug output reflects the
+	// hardware register actually emitted.
+	int hw = (r == X5_LOGICAL) ? 5 : (r == STACK_REG) ? 29 : r;
+	switch( m ) {
+	case M_I32:
+	case M_UI16:
+	case M_UI8:
+		if( hw == 31 )
+			sprintf(out, "WZR");
+		else if( hw < 31 )
+			sprintf(out, "W%d", hw);
+		else
+			sprintf(out, "W%d???", hw); // id outside 0..31: marked with ???
+		break;
+	case M_F32:
+		hw = r - 64; // float registers are stored as 64 + index (see V())
+		sprintf(out, "S%d%s", hw, hw >= 0 && hw < 32 ? "" : "???");
+		break;
+	case M_F64:
+		hw = r - 64;
+		sprintf(out, "D%d%s", hw, hw >= 0 && hw < 32 ? "" : "???");
+		break;
+	default:
+		if( hw == 31 )
+			sprintf(out, "SP");
+		else if( hw == 29 )
+			sprintf(out, "FP");
+		else if( hw == 30 )
+			sprintf(out, "LR");
+		else if( hw < 31 )
+			sprintf(out, "X%d", hw);
+		else
+			sprintf(out, "X%d???", hw);
+		break;
+	}
+	return out;
+}
+
+// ============================================================================
+// Backend lifecycle
+// ============================================================================
+
+void hl_codegen_alloc( jit_ctx *jit ) { // allocates a zeroed code_ctx and links it with jit in both directions
+	code_ctx *ctx = (code_ctx*)malloc(sizeof(code_ctx)); // NOTE(review): the result is not checked before the memset below
+	memset(ctx, 0, sizeof(code_ctx));
+	jit->code = ctx;
+	ctx->jit = jit;
+}
+
+void hl_codegen_free( jit_ctx *jit ) { // frees jit->code if set; the jit->code pointer itself is not cleared here
+	code_ctx *ctx = jit->code;
+	if( ctx == NULL ) return;
+	free(ctx);
+}
+
+// ============================================================================
+// Helpers
+// ============================================================================
+
+#define ARM_TMP1 X16 // backend-private scratch (IP0)
+#define ARM_TMP2 X17 // backend-private scratch (IP1)
+#define ARM_TMP3 X15 // backend-private scratch (excluded from regalloc)
+
+// Map an IR ereg to a physical AArch64 GPR encoding (0..31).
+// IR encoding 5 (STACK_REG) → ARM FP (X29); the regs phase uses 5 as a
+// stack-bound-vreg marker, and after ENTER lowers `MOV stack_pos,
+// stack_reg` we keep that in X29.
+// IR encoding X5_LOGICAL (32) → ARM X5; the remap shifts X5 out of slot 5
+// so the IR's STACK_REG sentinel does not alias it.
+static Arm64Reg gpr_id( ereg r ) {
+	int v = REG_REG(r);
+	if( v == STACK_REG ) return FP;
+	if( v == X5_LOGICAL ) return X5;
+	return (Arm64Reg)v;
+}
+
+static Arm64FpReg fpr_id( ereg r ) { // float eregs are stored as 64 + hardware index (see V())
+	return (Arm64FpReg)(REG_REG(r) - 64);
+}
+
+// LDR/STR `size` field: 0=8b, 1=16b, 2=32b, 3=64b.
+static int ls_size_for( emit_mode m ) {
+	switch( m ) {
+	case M_UI8: return 0;
+	case M_UI16: return 1;
+	case M_I32:
+	case M_F32: return 2;
+	case M_PTR:
+	case M_F64: return 3;
+	default: return 3; // every other mode is treated as 64-bit
+	}
+}
+
+static int sf_for( emit_mode m ) {
+	// 1 = 64-bit (M_PTR, M_F64 only), 0 = 32-bit for every other mode, including the sub-word ones.
+	return (m == M_PTR || m == M_F64) ? 1 : 0;
+}
+
+static bool is_fp_mode( emit_mode m ) { return m == M_F32 || m == M_F64; }
+
+// ----------------------------------------------------------------------------
+// Stack pointer arithmetic with arbitrary signed delta.
+// `delta > 0` => SP += delta, `delta < 0` => SP -= |delta|.
+// Uses imm12 + optional LSL #12 when possible; falls back through ARM_TMP1.
+// ----------------------------------------------------------------------------
+static void emit_sp_offs( code_ctx *ctx, int delta ) {
+	if( delta == 0 ) return;
+	int op = (delta < 0) ? 1 : 0; // 0 = ADD, 1 = SUB
+	uint32_t mag = (uint32_t)(delta < 0 ? -delta : delta);
+	if( mag <= 0xFFF ) {
+		encode_add_sub_imm(ctx, 1, op, 0, 0, (int)mag, SP_REG, SP_REG);
+		return;
+	}
+	if( (mag & 0xFFF) == 0 && (mag >> 12) <= 0xFFF ) {
+		encode_add_sub_imm(ctx, 1, op, 0, 1, (int)(mag >> 12), SP_REG, SP_REG);
+		return;
+	}
+	// Try two-step imm: hi part (LSL #12) + lo part, both ≤ 0xFFF.
+	uint32_t mag_lo = mag & 0xFFF;
+	uint32_t mag_hi = mag >> 12;
+	if( mag_hi <= 0xFFF ) {
+		encode_add_sub_imm(ctx, 1, op, 0, 1, (int)mag_hi, SP_REG, SP_REG);
+		if( mag_lo )
+			encode_add_sub_imm(ctx, 1, op, 0, 0, (int)mag_lo, SP_REG, SP_REG);
+		return;
+	}
+	// Fall back to register form. Must use ADD/SUB (extended register) — the
+	// shifted-register form interprets register 31 as XZR, not SP, so
+	// `SUB SP, SP, X16` would silently become `SUB XZR, XZR, X16` (a NOP).
+	// Extended-register form with option=UXTX(011), imm3=0 treats Rd/Rn=31
+	// as SP, which is what we want.
+	load_immediate(ctx, (int64_t)mag, ARM_TMP1, true);
+	encode_add_sub_ext(ctx, 1, op, 0, ARM_TMP1, /*option=UXTX*/3, /*imm3=*/0, SP_REG, SP_REG);
+}
+
+// ----------------------------------------------------------------------------
+// ADD/SUB-imm with optional 12-bit shift, returns true if `mag` fits
+// (mag <= 0xFFFFFF); otherwise emits nothing and returns false. Emits 64-bit
+// `op (ADD/SUB) Rd, Rn, #mag` in one or two instructions; op is 0=ADD or 1=SUB.
+// ----------------------------------------------------------------------------
+static bool emit_addsub_imm_2step( code_ctx *ctx, int op, Arm64Reg Rd, Arm64Reg Rn, uint32_t mag ) {
+	if( mag <= 0xFFF ) {
+		encode_add_sub_imm(ctx, 1, op, 0, 0, (int)mag, Rn, Rd);
+		return true;
+	}
+	if( (mag >> 12) <= 0xFFF ) {
+		uint32_t hi = mag >> 12, lo = mag & 0xFFF;
+		encode_add_sub_imm(ctx, 1, op, 0, 1, (int)hi, Rn, Rd);
+		if( lo )
+			encode_add_sub_imm(ctx, 1, op, 0, 0, (int)lo, Rd, Rd); // second step accumulates onto Rd
+		return true;
+	}
+	return false;
+}
+
+// ----------------------------------------------------------------------------
+// Load/store with FP-relative or arbitrary base+offs.
+// Picks LDR/STR(unsigned imm scaled) when offset fits, else LDUR/STUR (signed,
+// unscaled, -256..255), else falls back to a register-offset form.
+//
+// Register-form offset requires a scratch register that must NOT collide with
+// reg_t (for STR Xt,[base,Xt] would store the offset value at the offset
+// location), with base, or with `avoid`. Candidates are tried in the order
+// ARM_TMP1, ARM_TMP2, ARM_TMP3; if none is usable the function stops with
+// jit_error. No fallback that folds the offset into base is implemented here,
+// so base is never modified by this function.
+// ----------------------------------------------------------------------------
+static void emit_ld_st_ex( code_ctx *ctx, bool is_load, emit_mode mode, int reg_t, Arm64Reg base, int offs, Arm64Reg avoid ) {
+	int size = ls_size_for(mode);
+	int V = is_fp_mode(mode) ? 1 : 0;
+	int opc = is_load ? 1 : 0; // 0=STR, 1=LDR (for V=0 GPR; same for V=1 FP)
+	int scale = 1 << size;
+	if( offs >= 0 && (offs & (scale - 1)) == 0 && (offs / scale) < 0x1000 ) {
+		encode_ldr_str_imm(ctx, size, V, opc, offs / scale, base, (Arm64Reg)reg_t);
+		return;
+	}
+	if( offs >= -256 && offs < 256 ) {
+		encode_ldur_stur(ctx, size, V, opc, offs, base, (Arm64Reg)reg_t);
+		return;
+	}
+	// Pick an offset temp. Constraints:
+	// - For GPR stores, off_tmp must not equal reg_t (else STR Xt,[base,Xt]
+	//   writes the offset value instead of the data). Loads are immune
+	//   since LDR reads the offset register before writing reg_t; for FP
+	//   loads/stores reg_t is a V-register, so V-vs-X never collides.
+	// - off_tmp must not equal base (the load/store needs base intact).
+	// - off_tmp must not equal `avoid` (a caller-supplied register the
+	//   caller has parked a live value in — typically the OUTER base in
+	//   emit_store/emit_load_addr while loading the data argument).
+	// base/avoid are GPRs in every mode, so those two checks also run for FP.
+	bool chk_t = (V == 0 && !is_load);
+	Arm64Reg off_tmp = ARM_TMP1;
+	bool bad_t1 = (chk_t && reg_t == ARM_TMP1) || base == ARM_TMP1 || avoid == ARM_TMP1;
+	if( bad_t1 ) off_tmp = ARM_TMP2;
+	bool bad_t2 = (chk_t && reg_t == off_tmp) || base == off_tmp || avoid == off_tmp;
+	if( bad_t2 ) off_tmp = ARM_TMP3;
+	bool bad_t3 = (chk_t && reg_t == off_tmp) || base == off_tmp || avoid == off_tmp;
+	if( bad_t3 ) jit_error("aarch64 emit_ld_st: no free offset temp");
+	load_immediate(ctx, offs, off_tmp, true);
+	encode_ldr_str_reg(ctx, size, V, opc, off_tmp, /*option=*/3 /*LSL*/, /*S=*/0, base, (Arm64Reg)reg_t);
+}
+
+static void emit_ld_st( code_ctx *ctx, bool is_load, emit_mode mode, int reg_t, Arm64Reg base, int offs ) { // emit_ld_st_ex with no extra register to protect
+	emit_ld_st_ex(ctx, is_load, mode, reg_t, base, offs, (Arm64Reg)-1 /*no avoid*/);
+}
+
+// MOV between two GPRs. Handles SP as source/dest (ARM disallows ORR with SP).
+static void emit_mov_gpr( code_ctx *ctx, Arm64Reg dst, Arm64Reg src, int sf ) {
+	if( dst == src ) return;
+	if( dst == SP_REG || src == SP_REG ) {
+		// ADD dst, src, #0 (only form that accepts SP).
+		encode_add_sub_imm(ctx, sf, 0, 0, 0, 0, src, dst);
+	} else {
+		// ORR dst, XZR, src
+		encode_logical_reg(ctx, sf, 0x01, 0, 0, src, 0, XZR, dst);
+	}
+}
+
+// MOV between two FP regs (preserves the lane size used by the mode).
+// Emitted as a scalar FMOV (register), single or double precision per mode;
+// a vector ORR.16B would also work but is not what this function encodes.
+static void emit_mov_fpr( code_ctx *ctx, Arm64FpReg dst, Arm64FpReg src, emit_mode mode ) {
+	if( dst == src ) return;
+	int type = (mode == M_F64) ? 1 : 0; // 1=double, 0=single
+	// FMOV (register) opcode = 0
+	encode_fp_1src(ctx, /*M=*/0, /*S=*/0, type, /*opcode=*/0, src, dst);
+}
+
+// Generic MOV that mirrors x86's emit_mov: handles reg/reg, reg/mem, mem/reg.
+// imm-to-reg goes through emit_load_const.
+static void emit_load_const( code_ctx *ctx, ereg out, uint64_t value, emit_mode mode ); + +// Phase 4 forward declarations (defined later in this file). +static int reserve_const_segment( code_ctx *ctx, int size, int align ); +static int alloc_const( code_ctx *ctx, uint64_t value, int adrp_pos ); +static void emit_const_load( code_ctx *ctx, Arm64Reg dst, uint64_t value ); +static void emit_const_addr( code_ctx *ctx, Arm64Reg dst, uint64_t value ); +static void emit_pool_offset_addr( code_ctx *ctx, Arm64Reg dst, int const_offset ); +static Arm64FpReg materialize_fpr( code_ctx *ctx, ereg src, emit_mode mode, Arm64FpReg tmp ); +static Arm64Reg materialize_gpr( code_ctx *ctx, ereg src, emit_mode mode, Arm64Reg tmp ); +static Arm64Reg materialize_gpr_ex( code_ctx *ctx, ereg src, emit_mode mode, Arm64Reg tmp, Arm64Reg avoid ); + +// LEA-like: out = base + offs. Used when an operand encodes an address as +// (R_REG, value=offs) — e.g. MK_STACK_OFFS, or the LEA-rewritten ADDRESS op. +static void emit_lea_imm( code_ctx *ctx, Arm64Reg out, Arm64Reg base, int offs ) { + if( offs == 0 ) { + emit_mov_gpr(ctx, out, base, 1); + } else if( offs > 0 && offs <= 0xFFF ) { + encode_add_sub_imm(ctx, 1, 0, 0, 0, offs, base, out); + } else if( offs < 0 && -offs <= 0xFFF ) { + encode_add_sub_imm(ctx, 1, 1, 0, 0, -offs, base, out); + } else { + load_immediate(ctx, offs, ARM_TMP1, true); + encode_add_sub_reg(ctx, 1, 0, 0, 0, ARM_TMP1, 0, base, out); + } +} + +static void emit_mov( code_ctx *ctx, ereg dst, ereg src, emit_mode mode ) { + int dst_kind = REG_KIND(dst); + int src_kind = REG_KIND(src); + + if( dst_kind == R_REG && src_kind == R_REG ) { + // MK_STACK_OFFS / LEA-rewritten ADDRESS: src encodes (reg, offs). + // Treat as an address computation: dst = src_reg + offs. 
+ if( !is_fp_mode(mode) && REG_VALUE(src) != 0 ) { + emit_lea_imm(ctx, gpr_id(dst), gpr_id(src), REG_VALUE(src)); + return; + } + if( is_fp_mode(mode) ) + emit_mov_fpr(ctx, fpr_id(dst), fpr_id(src), mode); + else + emit_mov_gpr(ctx, gpr_id(dst), gpr_id(src), sf_for(mode)); + return; + } + if( dst_kind == R_REG && src_kind == R_REG_PTR ) { + // LOAD: dst <- [base + offs] + Arm64Reg base = gpr_id(src); + int offs = REG_VALUE(src); + int reg_t = is_fp_mode(mode) ? fpr_id(dst) : gpr_id(dst); + emit_ld_st(ctx, /*is_load=*/true, mode, reg_t, base, offs); + return; + } + if( dst_kind == R_REG_PTR && src_kind == R_REG ) { + // STORE: [base + offs] <- src + Arm64Reg base = gpr_id(dst); + int offs = REG_VALUE(dst); + int reg_t = is_fp_mode(mode) ? fpr_id(src) : gpr_id(src); + emit_ld_st(ctx, /*is_load=*/false, mode, reg_t, base, offs); + return; + } + if( dst_kind == R_REG && src_kind == R_CONST ) { + emit_load_const(ctx, dst, (uint64_t)REG_VALUE(src), mode); + return; + } + if( dst_kind == R_REG_PTR && src_kind == R_REG_PTR ) { + // memory-to-memory: load through a scratch register, then store. + // Use V31 for FP modes and ARM_TMP1 for integer/pointer modes — both + // are reserved as backend-private scratch. + Arm64Reg sb = gpr_id(src); + int so = REG_VALUE(src); + Arm64Reg db = gpr_id(dst); + int doff = REG_VALUE(dst); + if( is_fp_mode(mode) ) { + emit_ld_st(ctx, /*is_load=*/true, mode, (Arm64FpReg)31, sb, so); + emit_ld_st(ctx, /*is_load=*/false, mode, (Arm64FpReg)31, db, doff); + } else { + emit_ld_st(ctx, /*is_load=*/true, mode, ARM_TMP1, sb, so); + emit_ld_st(ctx, /*is_load=*/false, mode, ARM_TMP1, db, doff); + } + return; + } + jit_error("aarch64 emit_mov: unhandled operand kinds"); +} + +// ---------------------------------------------------------------------------- +// LOAD_CONST: integer immediate or floating constant. +// Float constants need the literal pool (Phase 4). For Phase 2 only ints. 
+// ---------------------------------------------------------------------------- +static void emit_load_const( code_ctx *ctx, ereg out, uint64_t value, emit_mode mode ) { + if( REG_KIND(out) != R_REG ) { + // emit-into-memory: load the bit pattern into ARM_TMP1 and store as the + // requested width. For floats we treat the FP constant's bit pattern as + // an integer — the resulting STR writes the same bytes a FP STR would. + emit_mode store_mode = is_fp_mode(mode) ? (mode == M_F32 ? M_I32 : M_PTR) : mode; + load_immediate(ctx, (int64_t)value, ARM_TMP1, sf_for(store_mode) == 1); + Arm64Reg base = gpr_id(out); + int offs = REG_VALUE(out); + emit_ld_st(ctx, /*is_load=*/false, store_mode, ARM_TMP1, base, offs); + return; + } + if( is_fp_mode(mode) ) { + // Float constants live in the literal pool: ADRP+LDR into the FP reg. + // jit_emit.c packs F32 constants into the low 32 bits of `value` with + // the upper 32 bits zeroed, so we must use the matching width-encoding + // (size=2 → LDR Sd, ...). Loading 8 bytes would pull the zero high + // half into D and yield a subnormal double when read as F64. + Arm64FpReg fp_dst = fpr_id(out); + int adrp_pos = byte_count(ctx->code); + int size = (mode == M_F32) ? 2 : 3; + encode_adrp(ctx, 0, 0, ARM_TMP1); // ADRP X16, page + // LDR Sd|Dd, [X16, #lo12] V=1, opc=01, imm12=placeholder + encode_ldr_str_imm(ctx, size, 1, 1, 0, ARM_TMP1, (Arm64Reg)fp_dst); + alloc_const(ctx, value, adrp_pos); + return; + } + load_immediate(ctx, (int64_t)value, gpr_id(out), sf_for(mode) == 1); +} + +// ---------------------------------------------------------------------------- +// PUSH / POP. ARM has no explicit push/pop; we use STR/LDR with pre/post-index +// on SP. To match the x86 stack-offset accounting (which assumes 16 bytes are +// already consumed by RIP+RBP), PUSH X29 emits STP X29, X30, [SP, #-16]! so +// LR is implicitly saved as part of FP-save. POP X29 mirrors with LDP. 
+// All other PUSH/POPs use 16-byte SP movement (8 bytes wasted) to keep SP +// 16-byte aligned per AAPCS64. +// ---------------------------------------------------------------------------- +static void emit_push( code_ctx *ctx, ereg r, emit_mode mode ) { + if( is_fp_mode(mode) ) { + // SUB SP, SP, #16; STR Dn, [SP]. Materialize through V31 if r is not a register. + Arm64FpReg src = (REG_KIND(r) == R_REG) ? fpr_id(r) : materialize_fpr(ctx, r, mode, (Arm64FpReg)31); + emit_sp_offs(ctx, -16); + encode_ldr_str_imm(ctx, /*size=*/3, /*V=*/1, /*opc=*/0 /*STR*/, 0, SP_REG, (Arm64Reg)src); + return; + } + // materialize_gpr handles MK_STACK_OFFS by adding the offset; gpr_id alone + // would discard it (mapping STACK_REG->FP and ignoring REG_VALUE). + Arm64Reg src = materialize_gpr(ctx, r, mode, ARM_TMP1); + if( src == FP && REG_KIND(r) == R_REG && REG_VALUE(r) == 0 ) { + // True PUSH FP (prologue) — emit STP x29,x30,[sp,#-16]! to also save LR. + encode_ldp_stp(ctx, /*opc=*/2, /*V=*/0, /*mode=*/0x03, /*imm7=*/-2 & 0x7F, LR, SP_REG, FP); + return; + } + // SUB SP, SP, #16; STR Xn, [SP] + emit_sp_offs(ctx, -16); + encode_ldr_str_imm(ctx, /*size=*/3, /*V=*/0, /*opc=*/0, 0, SP_REG, src); +} + +static void emit_pop( code_ctx *ctx, ereg r, emit_mode mode ) { + if( REG_KIND(r) != R_REG ) jit_error("aarch64 POP non-reg not implemented"); + if( is_fp_mode(mode) ) { + encode_ldr_str_imm(ctx, /*size=*/3, /*V=*/1, /*opc=*/1 /*LDR*/, 0, SP_REG, (Arm64Reg)fpr_id(r)); + emit_sp_offs(ctx, 16); + return; + } + Arm64Reg dst = gpr_id(r); + if( dst == FP ) { + // LDP X29, X30, [SP], #16 opc=10, V=0, mode=01 (post-index load), imm7=2 + encode_ldp_stp(ctx, /*opc=*/2, /*V=*/0, /*mode=*/0x01, /*imm7=*/2, LR, SP_REG, FP); + return; + } + // LDR Xn, [SP]; ADD SP, SP, #16 + encode_ldr_str_imm(ctx, /*size=*/3, /*V=*/0, /*opc=*/1, 0, SP_REG, dst); + emit_sp_offs(ctx, 16); +} + +// ---------------------------------------------------------------------------- +// CMP / TEST. 
e->mode tells us int width / float; e->size_offs holds the +// upstream OJxxx opcode (consumed later by the JCOND/CMOV that follows). +// ---------------------------------------------------------------------------- +static void emit_cmp( code_ctx *ctx, einstr *e ) { + if( is_fp_mode(e->mode) ) { + // FCMP. NaN handling deferred; bare FCMP is correct for ordered compares + // and gives QNaN-as-unordered which matches ARM defaults. + Arm64FpReg ra = materialize_fpr(ctx, e->a, e->mode, (Arm64FpReg)29); + Arm64FpReg rb = materialize_fpr(ctx, e->b, e->mode, (Arm64FpReg)30); + int type = (e->mode == M_F64) ? 1 : 0; + encode_fp_compare(ctx, /*M=*/0, /*S=*/0, type, rb, /*op=*/0, ra); + return; + } + // Integer compare: SUBS XZR, Xa, Xb (or imm form). + // materialize_gpr handles R_REG (incl. MK_STACK_OFFS via emit_lea_imm), + // R_CONST, and R_REG_PTR — picking gpr_id alone would drop the FP+N + // offset for stack-allocated addresses. + int sf = sf_for(e->mode); + Arm64Reg a = materialize_gpr(ctx, e->a, e->mode, ARM_TMP1); + if( REG_KIND(e->b) == R_CONST ) { + int64_t val = (int64_t)REG_VALUE(e->b); + if( val >= 0 && val <= 0xFFF ) { + // CMP Xa, #imm (SUBS XZR, Xa, #imm; sf, op=1, S=1) + encode_add_sub_imm(ctx, sf, 1, 1, 0, (int)val, a, XZR); + return; + } + if( val < 0 && -val <= 0xFFF ) { + // CMN Xa, #imm (ADDS XZR, Xa, #imm) + encode_add_sub_imm(ctx, sf, 0, 1, 0, (int)-val, a, XZR); + return; + } + load_immediate(ctx, val, ARM_TMP2, sf == 1); + encode_add_sub_reg(ctx, sf, 1, 1, 0, ARM_TMP2, 0, a, XZR); + return; + } + Arm64Reg b = materialize_gpr_ex(ctx, e->b, e->mode, ARM_TMP2, a); + encode_add_sub_reg(ctx, sf, 1, 1, 0, b, 0, a, XZR); +} + +static void emit_test( code_ctx *ctx, einstr *e ) { + if( is_fp_mode(e->mode) ) jit_error("aarch64 TEST float not supported"); + int sf = sf_for(e->mode); + // materialize_gpr folds MK_STACK_OFFS (R_REG kind + non-zero REG_VALUE) + // into FP+N so we never TST raw FP for stack-allocated address operands. 
+ Arm64Reg a = materialize_gpr(ctx, e->a, e->mode, ARM_TMP1); + // TST Xa, Xa (ANDS XZR, Xa, Xa); opc=11 (ANDS), shift=0, N=0 + encode_logical_reg(ctx, sf, 0x03, 0, 0, a, 0, a, XZR); +} + +// ---------------------------------------------------------------------------- +// JCOND / JUMP — branch fixups patched after function emit. +// ---------------------------------------------------------------------------- +static void add_branch_fixup( code_ctx *ctx, int code_pos, int target_op, int is_cond ) { + int_arr_add_impl(&ctx->jit->galloc, &ctx->branch_fixups, code_pos); + int_arr_add_impl(&ctx->jit->galloc, &ctx->branch_fixups, target_op); + int_arr_add_impl(&ctx->jit->galloc, &ctx->branch_fixups, is_cond); +} + +static void emit_jump( code_ctx *ctx, int target_op_offset ) { + // target_op_offset is the IR-relative displacement, target = cur_op + 1 + offset + int target = ctx->cur_op + 1 + target_op_offset; + int pos = byte_count(ctx->code); + encode_branch_uncond(ctx, 0); // placeholder + add_branch_fixup(ctx, pos, target, 0); +} + +static void emit_jump_cond( code_ctx *ctx, ArmCondition cond, int target_op_offset ) { + int target = ctx->cur_op + 1 + target_op_offset; + int pos = byte_count(ctx->code); + encode_branch_cond(ctx, 0, cond); + add_branch_fixup(ctx, pos, target, 1); +} + +// Mirror x86 get_cond_jump: walk back through MOV/JCOND/CMOV/XCHG/CXCHG to find +// the comparison whose flags this JCOND/CMOV consumes. Translate the upstream +// OJxxx opcode into an ARM condition code. +static ArmCondition get_cond_jump( code_ctx *ctx ) { + int prev = 0; + einstr *p; + do { + p = ctx->jit->reg_instrs + ctx->cur_op - (++prev); + } while( p->op == MOV || p->op == JCOND || p->op == CMOV || p->op == XCHG || p->op == CXCHG ); + switch( p->size_offs ) { + case OJFalse: + case OJNull: + return COND_EQ; + case OJTrue: + case OJNotNull: + return COND_NE; + // For ARM64 FCMP, NaN sets N=0, Z=0, C=1, V=1. IEEE 754 ordered + // predicates need to evaluate FALSE for NaN. 
HS (C==1) and HI (C==1 + // && Z==0) both fire on NaN — wrong. GE (N==V) and GT (Z==0 && N==V) + // reject NaN since V differs from N. The x86 backend can use JUGte/JUGt + // for FP only because x86 UCOMISS sets CF=1 on NaN, making JAE/JA + // reject it; ARM's carry conventions are inverted from x86's. + // LO (C==0) and LS (C==0 || Z==1) already reject NaN on ARM (C=1). + case OJSGte: + return COND_GE; + case OJSGt: + return COND_GT; + case OJUGte: + return COND_HS; + case OJSLt: + return is_fp_mode(p->mode) ? COND_LO : COND_LT; + case OJSLte: + return is_fp_mode(p->mode) ? COND_LS : COND_LE; + case OJULt: + return COND_LO; + case OJEq: + return COND_EQ; + case OJNotEq: + return COND_NE; + case OJNotLt: + // HS (C==1) fires on NaN (C=1) and on ordered >= (C=1) ✓. + // GE (N==V) would reject NaN (V=1, N=0) — wrong, NaN means + // "not less than" should fire. + return COND_HS; + case OJNotGte: + // LT (N!=V) is signed less-than for INT, and for FP it fires on + // NaN (V=1) — the right semantics for "not >=". + // LO (C==0) would not fire on NaN (C=1) — wrong for FP. + return COND_LT; + case 0: + if( p->op == DEBUG_BREAK ) return COND_EQ; + // fallthrough + default: + jit_error("aarch64 get_cond_jump: unknown OJ opcode"); + return COND_AL; + } +} + +static void patch_branch( code_ctx *ctx, int pos, int target_byte_pos, int is_cond ) { + int delta = target_byte_pos - pos; + if( delta & 3 ) jit_error("aarch64 branch target not 4-byte aligned"); + int imm = delta >> 2; + unsigned int *insn = (unsigned int*)&ctx->code.values[pos]; + if( is_cond ) { + // imm19 lives in bits [23:5]; cond + 0x54000000 prefix retained. + if( imm < -(1 << 18) || imm >= (1 << 18) ) + jit_error("aarch64 B.cond out of range (Phase 2 limit)"); + *insn = (*insn & ~(0x7FFFF << 5)) | ((imm & 0x7FFFF) << 5); + } else { + // imm26 lives in bits [25:0]; opcode 000101. 
+ if( imm < -(1 << 25) || imm >= (1 << 25) ) + jit_error("aarch64 B out of range"); + *insn = (*insn & ~0x03FFFFFF) | (imm & 0x03FFFFFF); + } +} + +// ---------------------------------------------------------------------------- +// Operand materialization: ensure src is a live register; load through a temp +// if it's a constant or memory. Returns the GPR encoding to use. +// ---------------------------------------------------------------------------- +static Arm64Reg materialize_gpr_ex( code_ctx *ctx, ereg src, emit_mode mode, Arm64Reg tmp, Arm64Reg avoid ) { + if( REG_KIND(src) == R_REG ) { + Arm64Reg base = gpr_id(src); + int v = REG_VALUE(src); + if( v == 0 ) return base; + emit_lea_imm(ctx, tmp, base, v); + return tmp; + } + if( REG_KIND(src) == R_CONST ) { + load_immediate(ctx, (int64_t)REG_VALUE(src), tmp, sf_for(mode) == 1); + return tmp; + } + if( REG_KIND(src) == R_REG_PTR ) { + // Load directly via emit_ld_st_ex so the offset-temp picker can avoid + // `avoid` (typically the caller's outer base register). + emit_ld_st_ex(ctx, true, mode, tmp, gpr_id(src), REG_VALUE(src), avoid); + return tmp; + } + emit_mov(ctx, R(tmp), src, mode); + return tmp; +} + +static Arm64Reg materialize_gpr( code_ctx *ctx, ereg src, emit_mode mode, Arm64Reg tmp ) { + return materialize_gpr_ex(ctx, src, mode, tmp, (Arm64Reg)-1); +} + +static Arm64FpReg materialize_fpr( code_ctx *ctx, ereg src, emit_mode mode, Arm64FpReg tmp ) { + if( REG_KIND(src) == R_REG ) return fpr_id(src); + if( REG_KIND(src) == R_REG_PTR ) { + Arm64Reg base = gpr_id(src); + int offs = REG_VALUE(src); + emit_ld_st(ctx, /*is_load=*/true, mode, tmp, base, offs); + return tmp; + } + if( REG_KIND(src) == R_CONST ) { + // FP constants always live in the literal pool. 
+ int adrp_pos = byte_count(ctx->code); + encode_adrp(ctx, 0, 0, ARM_TMP1); + encode_ldr_str_imm(ctx, 3, 1, 1, 0, ARM_TMP1, (Arm64Reg)tmp); + alloc_const(ctx, (uint64_t)REG_VALUE(src), adrp_pos); + return tmp; + } + jit_error("aarch64 materialize_fpr: unsupported operand kind"); + return (Arm64FpReg)0; +} + +// ---------------------------------------------------------------------------- +// Bitfield helpers (SBFM / UBFM raw encoding) for sign/zero-extension. +// ---------------------------------------------------------------------------- +static void emit_bitfield( code_ctx *ctx, int sf, int opc, int immr, int imms, Arm64Reg Rn, Arm64Reg Rd ) { + // [31]=sf, [30:29]=opc (00=SBFM, 01=BFM, 10=UBFM), [28:23]=100110, [22]=N(=sf), + // [21:16]=immr, [15:10]=imms, [9:5]=Rn, [4:0]=Rd + unsigned int insn = ((unsigned)sf << 31) | ((unsigned)opc << 29) | (0x26u << 23) | + ((unsigned)sf << 22) | ((immr & 0x3F) << 16) | ((imms & 0x3F) << 10) | + ((Rn & 0x1F) << 5) | (Rd & 0x1F); + EMIT32(ctx, insn); +} + +static void emit_sxt_to_int( code_ctx *ctx, emit_mode in_mode, Arm64Reg Rn, Arm64Reg Rd ) { + // SXTB Wd, Wn / SXTH Wd, Wn — produce sign-extended 32-bit result. + switch( in_mode ) { + case M_UI8: emit_bitfield(ctx, 0, 0x00, 0, 7, Rn, Rd); break; + case M_UI16: emit_bitfield(ctx, 0, 0x00, 0, 15, Rn, Rd); break; + default: jit_error("emit_sxt_to_int unsupported in_mode"); + } +} + +static void emit_sxt_to_ptr( code_ctx *ctx, emit_mode in_mode, Arm64Reg Rn, Arm64Reg Rd ) { + // SBFM Xd, Xn, #0, #N — sign-extend to 64-bit. + switch( in_mode ) { + case M_UI8: emit_bitfield(ctx, 1, 0x00, 0, 7, Rn, Rd); break; + case M_UI16: emit_bitfield(ctx, 1, 0x00, 0, 15, Rn, Rd); break; + case M_I32: emit_bitfield(ctx, 1, 0x00, 0, 31, Rn, Rd); break; + default: jit_error("emit_sxt_to_ptr unsupported in_mode"); + } +} + +static void emit_uxt_to_w( code_ctx *ctx, emit_mode in_mode, Arm64Reg Rn, Arm64Reg Rd ) { + // UXTB Wd, Wn / UXTH Wd, Wn — implemented as AND Wd, Wn, #mask. 
+ switch( in_mode ) { + case M_UI8: encode_logical_imm(ctx, 0, 0x00, 0, 0, 7, Rn, Rd); break; // AND Wd, Wn, #0xFF + case M_UI16: encode_logical_imm(ctx, 0, 0x00, 0, 0, 15, Rn, Rd); break; // AND Wd, Wn, #0xFFFF + default: jit_error("emit_uxt_to_w unsupported in_mode"); + } +} + +// ---------------------------------------------------------------------------- +// BINOP / UNOP integer. e->size_offs encodes the upstream Haxe op (OAdd, ...). +// ARM has 3-operand ALU so we can write directly to `out` from `a, b`. +// ---------------------------------------------------------------------------- +static void emit_div_mod( code_ctx *ctx, hl_op op, Arm64Reg out, Arm64Reg a, Arm64Reg b, int sf ); + +static void emit_binop_int( code_ctx *ctx, hl_op op, ereg out_e, ereg a_e, ereg b_e, emit_mode mode ) { + int sf = sf_for(mode); + Arm64Reg out = (REG_KIND(out_e) == R_REG) ? gpr_id(out_e) : ARM_TMP1; + Arm64Reg a = materialize_gpr(ctx, a_e, mode, ARM_TMP1); + + // Constant-imm fast paths (ADD/SUB/AND/OR/XOR with small immediates). + if( REG_KIND(b_e) == R_CONST ) { + int64_t v = (int64_t)REG_VALUE(b_e); + if( (op == OAdd || op == OSub) && v >= 0 && v <= 0xFFF ) { + encode_add_sub_imm(ctx, sf, op == OSub ? 1 : 0, 0, 0, (int)v, a, out); + goto store_out; + } + if( (op == OAdd || op == OSub) && v < 0 && -v <= 0xFFF ) { + encode_add_sub_imm(ctx, sf, op == OSub ? 
0 : 1, 0, 0, (int)-v, a, out); + goto store_out; + } + } + + Arm64Reg b = materialize_gpr_ex(ctx, b_e, mode, ARM_TMP2, a); + + switch( op ) { + case OAdd: encode_add_sub_reg(ctx, sf, 0, 0, 0, b, 0, a, out); break; + case OSub: encode_add_sub_reg(ctx, sf, 1, 0, 0, b, 0, a, out); break; + case OMul: encode_madd_msub(ctx, sf, 0, b, XZR, a, out); break; + case OAnd: encode_logical_reg(ctx, sf, 0x00, 0, 0, b, 0, a, out); break; + case OOr: encode_logical_reg(ctx, sf, 0x01, 0, 0, b, 0, a, out); break; + case OXor: encode_logical_reg(ctx, sf, 0x02, 0, 0, b, 0, a, out); break; + case OShl: encode_shift_reg(ctx, sf, 0x00, b, a, out); break; // LSLV + case OUShr: encode_shift_reg(ctx, sf, 0x01, b, a, out); break; // LSRV + case OSShr: encode_shift_reg(ctx, sf, 0x02, b, a, out); break; // ASRV + case OSDiv: + case OUDiv: + case OSMod: + case OUMod: + emit_div_mod(ctx, op, out, a, b, sf); + break; + default: + jit_error("aarch64 emit_binop_int: unsupported op"); + } + + // Sub-word result truncation. Loads/stores already truncate, but ALU on + // 32-bit reg leaves upper W zero already; we only need a mask for 8/16-bit. + if( mode == M_UI8 ) { + encode_logical_imm(ctx, 0, 0x00, 0, 0, 7, out, out); // AND Wd, Wd, #0xFF + } else if( mode == M_UI16 ) { + encode_logical_imm(ctx, 0, 0x00, 0, 0, 15, out, out); // AND Wd, Wd, #0xFFFF + } + +store_out: + if( REG_KIND(out_e) != R_REG ) { + emit_mov(ctx, out_e, R(ARM_TMP1), mode); + } +} + +// ---------------------------------------------------------------------------- +// Integer divide / modulo with Haxe semantics: +// OUDiv: b == 0 => 0 +// OUMod: b == 0 => 0 +// OSDiv: b == 0 || -1 => a*b (matches x86; avoids INT_MIN/-1 overflow trap) +// OSMod: b == 0 || -1 => 0 +// ARM SDIV/UDIV give 0 for div/0, but mod via MSUB needs explicit guarding. 
+// ---------------------------------------------------------------------------- +static void emit_div_mod( code_ctx *ctx, hl_op op, Arm64Reg out, Arm64Reg a, Arm64Reg b, int sf ) { + bool unsign = (op == OUDiv || op == OUMod); + bool is_div = (op == OSDiv || op == OUDiv); + + // Test b for 0; signed ops also test for -1. + encode_logical_reg(ctx, sf, 0x03, 0, 0, b, 0, b, XZR); // TST b, b + int jz_pos = byte_count(ctx->code); + encode_branch_cond(ctx, 0, COND_EQ); // patched later + + int jneg_pos = -1; + if( !unsign ) { + // CMN b, #1 (= b + 1; sets Z if b == -1) + encode_add_sub_imm(ctx, sf, 0, 1, 0, 1, b, XZR); + jneg_pos = byte_count(ctx->code); + encode_branch_cond(ctx, 0, COND_EQ); + } + + // Mainline. encode_div's U bit is 0=UDIV, 1=SDIV (per the ARM ARM + // bit-10 encoding) — pass `unsign ? 0 : 1`, NOT the inverse. + if( is_div ) { + // SDIV/UDIV out, a, b + encode_div(ctx, sf, unsign ? 0 : 1, b, a, out); + } else { + // MSUB needs the ORIGINAL `a` and `b` after the divide; SDIV writes + // `out`, so any of {out==a, out==b} would clobber a source. Spill + // the aliased operand(s) to backend temps first. ARM_TMP3 is + // reserved precisely for cases like this where we need a third + // independent register. + Arm64Reg a_safe = a, b_safe = b; + if( out == a ) { + emit_mov_gpr(ctx, ARM_TMP3, a, sf); + a_safe = ARM_TMP3; + if( b == a ) b_safe = ARM_TMP3; // a==b too: same value in TMP3 + } + if( out == b && b_safe == b ) { + // Need a different temp from a_safe (which may be ARM_TMP3 already). + Arm64Reg t = (a_safe == ARM_TMP1) ? ARM_TMP2 : ARM_TMP1; + emit_mov_gpr(ctx, t, b, sf); + b_safe = t; + } + encode_div(ctx, sf, unsign ? 0 : 1, b_safe, a_safe, out); + // MSUB out, out, b_safe, a_safe => out = a_safe - out * b_safe + encode_madd_msub(ctx, sf, 1, b_safe, a_safe, out, out); + } + int jdone_pos = byte_count(ctx->code); + encode_branch_uncond(ctx, 0); + + // Special case path: result = 0 (mod or unsigned div) or a*b (signed div). 
+ int special_pos = byte_count(ctx->code); + if( op == OSDiv ) { + // out = a * b + encode_madd_msub(ctx, sf, 0, b, XZR, a, out); + } else { + // out = 0 + encode_logical_reg(ctx, sf, 0x01, 0, 0, XZR, 0, XZR, out); // ORR out, XZR, XZR + } + + int after = byte_count(ctx->code); + + // Patch branches. + int delta_jz = (special_pos - jz_pos) >> 2; + *(unsigned int*)&ctx->code.values[jz_pos] = + (*(unsigned int*)&ctx->code.values[jz_pos] & ~(0x7FFFF << 5)) | ((delta_jz & 0x7FFFF) << 5); + if( jneg_pos >= 0 ) { + int delta_jn = (special_pos - jneg_pos) >> 2; + *(unsigned int*)&ctx->code.values[jneg_pos] = + (*(unsigned int*)&ctx->code.values[jneg_pos] & ~(0x7FFFF << 5)) | ((delta_jn & 0x7FFFF) << 5); + } + int delta_done = (after - jdone_pos) >> 2; + *(unsigned int*)&ctx->code.values[jdone_pos] = + (*(unsigned int*)&ctx->code.values[jdone_pos] & ~0x03FFFFFF) | (delta_done & 0x03FFFFFF); +} + +// ---------------------------------------------------------------------------- +// BINOP / UNOP float. +// ---------------------------------------------------------------------------- +static void emit_binop_fp( code_ctx *ctx, hl_op op, ereg out_e, ereg a_e, ereg b_e, emit_mode mode ) { + bool out_to_mem = (REG_KIND(out_e) != R_REG); + Arm64FpReg out = out_to_mem ? (Arm64FpReg)31 : fpr_id(out_e); + // Use V29/V30 as scratch FP regs (in our scratch list, won't collide with `out`=V31). + Arm64FpReg a = materialize_fpr(ctx, a_e, mode, (Arm64FpReg)29); + Arm64FpReg b = materialize_fpr(ctx, b_e, mode, (Arm64FpReg)30); + int type = (mode == M_F64) ? 
1 : 0; + int opcode; + switch( op ) { + case OAdd: opcode = 0x02; break; // FADD + case OSub: opcode = 0x03; break; // FSUB + case OMul: opcode = 0x00; break; // FMUL + case OSDiv: opcode = 0x01; break; // FDIV + default: jit_error("aarch64 emit_binop_fp: unsupported op"); + } + encode_fp_arith(ctx, /*M=*/0, /*S=*/0, type, b, opcode, a, out); + if( out_to_mem ) { + Arm64Reg base = gpr_id(out_e); + int offs = REG_VALUE(out_e); + emit_ld_st(ctx, false, mode, out, base, offs); + } +} + +static void emit_unop( code_ctx *ctx, hl_op op, ereg out_e, ereg a_e, emit_mode mode ) { + if( is_fp_mode(mode) ) { + bool out_to_mem = (REG_KIND(out_e) != R_REG); + Arm64FpReg out = out_to_mem ? (Arm64FpReg)31 : fpr_id(out_e); + Arm64FpReg a = materialize_fpr(ctx, a_e, mode, (Arm64FpReg)29); + int type = (mode == M_F64) ? 1 : 0; + switch( op ) { + case ONeg: encode_fp_1src(ctx, 0, 0, type, /*FNEG*/2, a, out); break; + default: jit_error("aarch64 emit_unop float: unsupported op"); + } + if( out_to_mem ) { + Arm64Reg base = gpr_id(out_e); + int offs = REG_VALUE(out_e); + emit_ld_st(ctx, false, mode, out, base, offs); + } + return; + } + int sf = sf_for(mode); + Arm64Reg out = (REG_KIND(out_e) == R_REG) ? gpr_id(out_e) : ARM_TMP1; + Arm64Reg a = materialize_gpr(ctx, a_e, mode, ARM_TMP1); + switch( op ) { + case ONeg: + // SUB out, XZR, a (NEG alias) + encode_add_sub_reg(ctx, sf, 1, 0, 0, a, 0, XZR, out); + break; + case ONot: + // EOR out, a, #1 (boolean toggle). N must equal sf for value 1. 
+ encode_logical_imm(ctx, sf, 0x02, sf, 0, 0, a, out); + break; + case OIncr: + encode_add_sub_imm(ctx, sf, 0, 0, 0, 1, a, out); + break; + case ODecr: + encode_add_sub_imm(ctx, sf, 1, 0, 0, 1, a, out); + break; + default: + jit_error("aarch64 emit_unop: unsupported op"); + } + if( mode == M_UI8 ) { + encode_logical_imm(ctx, 0, 0x00, 0, 0, 7, out, out); + } else if( mode == M_UI16 ) { + encode_logical_imm(ctx, 0, 0x00, 0, 0, 15, out, out); + } + if( REG_KIND(out_e) != R_REG ) emit_mov(ctx, out_e, R(ARM_TMP1), mode); +} + +// ---------------------------------------------------------------------------- +// CONV / CONV_UNSIGNED. e->mode = output mode, e->size_offs = input mode. +// ---------------------------------------------------------------------------- +static void emit_conv( code_ctx *ctx, einstr *e, ereg out_e, bool unsign ) { + emit_mode out_mode = e->mode; + emit_mode in_mode = (emit_mode)e->size_offs; + bool out_fp = is_fp_mode(out_mode); + bool in_fp = is_fp_mode(in_mode); + + // Materialize source. + Arm64Reg a_gpr = 0; + Arm64FpReg a_fpr = (Arm64FpReg)0; + if( in_fp ) { + a_fpr = materialize_fpr(ctx, e->a, in_mode, (Arm64FpReg)29); + } else { + a_gpr = materialize_gpr(ctx, e->a, in_mode, ARM_TMP1); + } + + // Pick output register encoding. When the result lives in memory we route + // the value through a backend-private temporary in the appropriate class. + bool out_to_mem = REG_KIND(out_e) != R_REG; + Arm64Reg dst_gpr = (!out_fp && !out_to_mem) ? gpr_id(out_e) + : (!out_fp ? ARM_TMP2 : 0); + // V31 is in our scratch list and serves as an FP temp; we still need to + // emit a follow-up STR if the output is memory. + Arm64FpReg dst_fpr = (out_fp && !out_to_mem) ? fpr_id(out_e) + : (out_fp ? (Arm64FpReg)31 : (Arm64FpReg)0); + + if( in_fp && out_fp ) { + // FCVT between F32/F64 + int type = (in_mode == M_F64) ? 1 : 0; // input type + int opcode = (in_mode == M_F32) ? 
0x05 : 0x04; // F32->F64 = 0x05, F64->F32 = 0x04 + encode_fp_1src(ctx, 0, 0, type, opcode, a_fpr, dst_fpr); + } else if( in_fp && !out_fp ) { + // FP -> int. FCVTZS / FCVTZU (round toward zero). + int sf = sf_for(out_mode); + int type = (in_mode == M_F64) ? 1 : 0; + int rmode = 3; // round toward zero + int opc = unsign ? 1 : 0; // 0=FCVTZS, 1=FCVTZU + encode_fcvt_int(ctx, sf, 0, type, rmode, opc, a_fpr, dst_gpr); + } else if( !in_fp && out_fp ) { + // int -> FP. SCVTF / UCVTF. + int sf = sf_for(in_mode); + int type = (out_mode == M_F64) ? 1 : 0; + int rmode = 0; + int opc = unsign ? 3 : 2; // 2=SCVTF, 3=UCVTF + // First, widen sub-word inputs to full width. UI8/UI16 are + // unsigned regardless of the `unsign` flag (which here selects + // SCVTF vs UCVTF), so always zero-extend the byte/half before + // the FP conversion. + Arm64Reg src = a_gpr; + if( in_mode == M_UI8 || in_mode == M_UI16 ) { + emit_uxt_to_w(ctx, in_mode, src, ARM_TMP1); + src = ARM_TMP1; + } + encode_int_fcvt(ctx, sf, 0, type, rmode, opc, src, dst_fpr); + } else { + // int -> int. + switch( in_mode ) { + case M_UI8: + case M_UI16: + // UI8/UI16 are inherently unsigned in HL — widening to a larger + // integer must always zero-extend, matching x86's MOVZX. The + // `unsign` flag is only meaningful for FP conversions. 
+			if( out_mode == M_PTR || out_mode == M_I32 ) {
+				emit_uxt_to_w(ctx, in_mode, a_gpr, dst_gpr);
+			} else if( out_mode == M_UI16 || out_mode == M_UI8 ) {
+				emit_uxt_to_w(ctx, out_mode, a_gpr, dst_gpr);
+			}
+			break;
+		case M_I32:
+			if( out_mode == M_PTR ) {
+				if( unsign ) emit_mov_gpr(ctx, dst_gpr, a_gpr, 0); // MOV Wd, Wn — zero-extends to X
+				else emit_sxt_to_ptr(ctx, M_I32, a_gpr, dst_gpr);
+			} else {
+				emit_mov_gpr(ctx, dst_gpr, a_gpr, sf_for(out_mode));
+				if( out_mode == M_UI8 || out_mode == M_UI16 )
+					emit_uxt_to_w(ctx, out_mode, dst_gpr, dst_gpr);
+			}
+			break;
+		case M_PTR:
+			if( out_mode == M_I32 ) {
+				emit_mov_gpr(ctx, dst_gpr, a_gpr, 0); // truncate
+			} else if( out_mode == M_UI8 || out_mode == M_UI16 ) {
+				emit_uxt_to_w(ctx, out_mode, a_gpr, dst_gpr);
+			} else {
+				emit_mov_gpr(ctx, dst_gpr, a_gpr, 1);
+			}
+			break;
+		default:
+			jit_error("aarch64 emit_conv: unsupported int conversion");
+		}
+	}
+
+	if( out_to_mem ) {
+		if( out_fp ) {
+			// STR D31/S31, [base+offs] — base might be inside a register operand
+			// of `out_e`; use emit_ld_st with the FP class.
+			Arm64Reg base = gpr_id(out_e);
+			int offs = REG_VALUE(out_e);
+			emit_ld_st(ctx, false, out_mode, dst_fpr, base, offs);
+		} else {
+			emit_mov(ctx, out_e, R(ARM_TMP2), out_mode);
+		}
+	}
+}
+
+// ----------------------------------------------------------------------------
+// STORE / LOAD_ADDR / LEA.
+// ----------------------------------------------------------------------------
+
+// STORE: write operand e->b to memory at (e->a + e->size_offs) using e->mode.
+//
+// The base is taken straight from e->a when it is a register operand (its
+// REG_VALUE is folded into the offset); otherwise e->a is first moved into
+// ARM_TMP1. The value temp is always the scratch register that is NOT
+// holding the base, so base and value never alias.
+static void emit_store( code_ctx *ctx, einstr *e ) {
+	int offs = e->size_offs;
+	Arm64Reg base;
+	if( REG_KIND(e->a) == R_REG ) {
+		base = gpr_id(e->a);
+		// MK_STACK_OFFS(v) and MK_ADDR-like values encode the offset in the
+		// register's value field; combine it with size_offs. For regular
+		// register operands REG_VALUE is 0, so this is a no-op.
+		offs += REG_VALUE(e->a);
+	} else {
+		emit_mov(ctx, R(ARM_TMP1), e->a, M_PTR);
+		base = ARM_TMP1;
+	}
+	if( is_fp_mode(e->mode) ) {
+		if( REG_KIND(e->b) == R_REG ) {
+			emit_ld_st(ctx, false, e->mode, fpr_id(e->b), base, offs);
+		} else {
+			// Route the bit pattern through a GPR. STR writes the same bytes
+			// regardless of FP vs. INT class.
+			Arm64Reg tmp = (base == ARM_TMP1) ? ARM_TMP2 : ARM_TMP1;
+			emit_mode int_mode = (e->mode == M_F32) ? M_I32 : M_PTR;
+			if( REG_KIND(e->b) == R_CONST ) {
+				load_immediate(ctx, (int64_t)REG_VALUE(e->b), tmp, sf_for(int_mode) == 1);
+			} else if( REG_KIND(e->b) == R_REG_PTR ) {
+				// Spilled FP vreg: load via emit_ld_st_ex so the offset-temp picker
+				// can avoid clobbering `base` (parked in ARM_TMP1 when e->a was spilled).
+				emit_ld_st_ex(ctx, true, int_mode, tmp, gpr_id(e->b), REG_VALUE(e->b), base);
+			} else {
+				emit_mov(ctx, R(tmp), e->b, int_mode);
+			}
+			emit_ld_st_ex(ctx, false, int_mode, tmp, base, offs, (Arm64Reg)-1);
+		}
+		return;
+	}
+	int reg_t;
+	if( REG_KIND(e->b) == R_REG && REG_VALUE(e->b) == 0 ) {
+		// Plain register source: store it directly, no temp needed.
+		reg_t = gpr_id(e->b);
+	} else {
+		Arm64Reg tmp = (base == ARM_TMP1) ? ARM_TMP2 : ARM_TMP1;
+		if( REG_KIND(e->b) == R_REG ) {
+			// MK_STACK_OFFS / LEA-rewritten ADDRESS: source encodes (reg, offs).
+			// Materialize the effective address into tmp.
+			emit_lea_imm(ctx, tmp, gpr_id(e->b), REG_VALUE(e->b));
+		} else if( REG_KIND(e->b) == R_CONST ) {
+			load_immediate(ctx, (int64_t)REG_VALUE(e->b), tmp, sf_for(e->mode) == 1);
+		} else if( REG_KIND(e->b) == R_REG_PTR ) {
+			// Load directly via emit_ld_st_ex so we can tell it to avoid
+			// clobbering `base` (which lives in ARM_TMP1 when e->a was spilled).
+			emit_ld_st_ex(ctx, true, e->mode, tmp, gpr_id(e->b), REG_VALUE(e->b), base);
+		} else {
+			emit_mov(ctx, R(tmp), e->b, e->mode);
+		}
+		reg_t = tmp;
+	}
+	emit_ld_st_ex(ctx, false, e->mode, reg_t, base, offs, (Arm64Reg)-1);
+}
+
+// LOAD_ADDR: load a value from memory at (e->a + e->size_offs) into out_e.
+//
+// The width/class of the load (`lmode`) is carried in e->nargs. When out_e is
+// not a register, the value goes through a scratch register (V31 for FP,
+// ARM_TMP2 for integers) and is then written to out_e's memory slot.
+static void emit_load_addr( code_ctx *ctx, einstr *e, ereg out_e ) {
+	emit_mode lmode = (emit_mode)e->nargs;
+	Arm64Reg base;
+	int offs = e->size_offs;
+	if( REG_KIND(e->a) == R_REG ) {
+		base = gpr_id(e->a);
+		offs += REG_VALUE(e->a);
+	} else {
+		emit_mov(ctx, R(ARM_TMP1), e->a, M_PTR);
+		base = ARM_TMP1;
+	}
+	if( is_fp_mode(lmode) ) {
+		if( REG_KIND(out_e) == R_REG ) {
+			emit_ld_st(ctx, true, lmode, fpr_id(out_e), base, offs);
+		} else {
+			// FP load into V31 then STR to memory dst.
+			emit_ld_st(ctx, true, lmode, (Arm64FpReg)31, base, offs);
+			Arm64Reg out_base = gpr_id(out_e);
+			int out_offs = REG_VALUE(out_e);
+			emit_ld_st(ctx, false, lmode, (Arm64FpReg)31, out_base, out_offs);
+		}
+		return;
+	}
+	Arm64Reg dst = (REG_KIND(out_e) == R_REG) ? gpr_id(out_e) : ARM_TMP2;
+	emit_ld_st(ctx, true, lmode, dst, base, offs);
+	if( REG_KIND(out_e) != R_REG ) {
+		// NOTE(review): the write-back uses e->mode while the load (and the FP
+		// path above) use lmode — presumably e->mode is the destination vreg's
+		// mode; confirm against the IR producer.
+		emit_mov(ctx, out_e, R(ARM_TMP2), e->mode);
+	}
+}
+
+// LEA: out_e = e->a + (e->b * mult) + offs.
+//
+// e->size_offs packs the scale in its low 8 bits and the displacement in the
+// remaining bits; a register-operand base contributes its REG_VALUE to the
+// displacement. When mult is 0 or e->b is null the index term is dropped.
+// A non-register destination is computed in ARM_TMP1 and then moved to out_e.
+static void emit_lea( code_ctx *ctx, einstr *e, ereg out_e ) {
+	int mult = e->size_offs & 0xFF;
+	int offs = e->size_offs >> 8;
+	if( REG_KIND(e->a) == R_REG ) offs += REG_VALUE(e->a);
+
+	Arm64Reg out = (REG_KIND(out_e) == R_REG) ? gpr_id(out_e) : ARM_TMP1;
+	Arm64Reg a;
+	if( REG_KIND(e->a) == R_REG ) {
+		a = gpr_id(e->a);
+	} else {
+		emit_mov(ctx, R(ARM_TMP1), e->a, M_PTR);
+		a = ARM_TMP1;
+	}
+
+	if( mult == 0 || IS_NULL(e->b) ) {
+		// out = a + offs
+		if( offs == 0 ) {
+			emit_mov_gpr(ctx, out, a, 1);
+		} else if( offs > 0 && offs <= 0xFFF ) {
+			encode_add_sub_imm(ctx, 1, 0, 0, 0, offs, a, out);
+		} else if( offs < 0 && -offs <= 0xFFF ) {
+			encode_add_sub_imm(ctx, 1, 1, 0, 0, -offs, a, out);
+		} else {
+			load_immediate(ctx, offs, ARM_TMP2, true);
+			// ADD out, a, X17, UXTX — extended-register form. The shifted-register
+			// form would read a base of register 31 as XZR instead of SP (same
+			// pitfall as the SUB SP in emit_c2hl_trampoline); for any other base
+			// the two forms compute the same 64-bit sum.
+			encode_add_sub_ext(ctx, /*sf=*/1, /*op=*/0, /*S=*/0, ARM_TMP2, /*option=UXTX*/3, 0, a, out);
+		}
+	} else {
+		if( mult != 1 && mult != 2 && mult != 4 && mult != 8 )
+			jit_error("aarch64 LEA: unsupported scale");
+		int shift = (mult == 1) ? 0 : (mult == 2) ? 1 : (mult == 4) ? 2 : 3;
+		// Index width matches HL semantics — array indexes are M_I32. Materialize
+		// from a 32-bit slot so we don't read garbage from the adjacent vreg, and
+		// use the extended-register ADD with UXTW so only the lower 32 bits feed
+		// the address calculation.
+		Arm64Reg b = materialize_gpr_ex(ctx, e->b, M_I32, ARM_TMP2, a);
+		// out = a + UXTW(b) << shift
+		encode_add_sub_ext(ctx, /*sf=*/1, /*op=*/0, /*S=*/0, b, /*option=UXTW*/2, shift, a, out);
+		if( offs != 0 ) {
+			if( offs > 0 && offs <= 0xFFF ) {
+				encode_add_sub_imm(ctx, 1, 0, 0, 0, offs, out, out);
+			} else if( offs < 0 && -offs <= 0xFFF ) {
+				encode_add_sub_imm(ctx, 1, 1, 0, 0, -offs, out, out);
+			} else {
+				// The index in ARM_TMP2 has already been consumed, so it is free
+				// to hold the displacement.
+				load_immediate(ctx, offs, ARM_TMP2, true);
+				encode_add_sub_reg(ctx, 1, 0, 0, 0, ARM_TMP2, 0, out, out);
+			}
+		}
+	}
+
+	if( REG_KIND(out_e) != R_REG ) emit_mov(ctx, out_e, R(ARM_TMP1), M_PTR);
+}
+
+// ----------------------------------------------------------------------------
+// CMOV / XCHG / PUSH_CONST / PREFETCH.
+// ----------------------------------------------------------------------------
+
+// CMOV: conditionally replace the register `out_e` with `a_e`.
+// The destination must already be a register operand.
+static void emit_cmov_arm( code_ctx *ctx, ereg out_e, ereg a_e, ArmCondition cond ) {
+	if( REG_KIND(out_e) != R_REG )
+		jit_error("aarch64 CMOV non-reg out");
+	Arm64Reg dst = gpr_id(out_e);
+	Arm64Reg src = materialize_gpr(ctx, a_e, M_PTR, ARM_TMP1);
+	// CSEL dst, src, dst, cond — dst keeps its value when cond is false.
+	encode_cond_select(ctx, 1, 0, dst, cond, 0, src, dst);
+}
+
+// XCHG: swap two register operands through ARM_TMP1.
+static void emit_xchg( code_ctx *ctx, einstr *e ) {
+	bool both_regs = REG_KIND(e->a) == R_REG && REG_KIND(e->b) == R_REG;
+	if( !both_regs )
+		jit_error("aarch64 XCHG with non-reg operand");
+	Arm64Reg first = gpr_id(e->a);
+	Arm64Reg second = gpr_id(e->b);
+	emit_mov_gpr(ctx, ARM_TMP1, first, 1);   // tmp    = first
+	emit_mov_gpr(ctx, first, second, 1);     // first  = second
+	emit_mov_gpr(ctx, second, ARM_TMP1, 1);  // second = tmp
+}
+
+// PUSH_CONST: materialize e->value in ARM_TMP1, move SP down by 16 and
+// store the value at the new [SP]. Only pointer-mode constants are accepted.
+static void emit_push_const( code_ctx *ctx, einstr *e ) {
+	if( e->mode != M_PTR )
+		jit_error("aarch64 PUSH_CONST non-ptr mode");
+	load_immediate(ctx, (int64_t)e->value, ARM_TMP1, true);
+	emit_sp_offs(ctx, -16);
+	// STR X16, [SP]
+	encode_ldr_str_imm(ctx, 3, 0, 0, 0, SP_REG, ARM_TMP1);
+}
+
+// ----------------------------------------------------------------------------
+// Phase 4: constant-pool helpers.
+// ----------------------------------------------------------------------------
+
+// Reserve `size` bytes at the end of the constant table and return the offset
+// of the reserved region. A non-zero `align` first pads the table so that the
+// region starts on a multiple of `align` (the mask arithmetic requires `align`
+// to be a power of two).
+static int reserve_const_segment( code_ctx *ctx, int size, int align ) {
+	if( align ) {
+		int misalign = byte_count(ctx->const_table) & (align - 1);
+		if( misalign )
+			byte_reserve_impl(&ctx->jit->galloc, &ctx->const_table, align - misalign);
+	}
+	int start = byte_count(ctx->const_table);
+	byte_reserve_impl(&ctx->jit->galloc, &ctx->const_table, size);
+	return start;
+}
+
+// Insert (or find) a 64-bit value in the constant table; record the current
+// emission point as an ADRP+LDR (or ADRP+ADD) pair to be patched later.
+// Returns the byte offset of the value inside ctx->const_table.
+static void record_const_ref( code_ctx *ctx, int adrp_pos, int pool_offs );
+
+static int alloc_const( code_ctx *ctx, uint64_t value, int adrp_pos ) {
+	int slot = value_map_find(ctx->const_table_lookup, value);
+	if( slot < 0 ) {
+		// First use of this value: give it an 8-byte aligned slot and index it.
+		slot = reserve_const_segment(ctx, 8, 8);
+		*(uint64_t*)byte_addr(ctx->const_table, slot) = value;
+		value_map_add_impl(&ctx->jit->galloc, &ctx->const_table_lookup, value, slot);
+	}
+	record_const_ref(ctx, adrp_pos, slot);
+	return slot;
+}
+
+// Queue one (absolute ADRP position, constant-table offset) pair in
+// ctx->const_refs. `adrp_pos` is relative to ctx->code; the stored position
+// is rebased onto jit->out_pos.
+static void record_const_ref( code_ctx *ctx, int adrp_pos, int pool_offs ) {
+	int_arr_add_impl(&ctx->jit->galloc, &ctx->const_refs, ctx->jit->out_pos + adrp_pos);
+	int_arr_add_impl(&ctx->jit->galloc, &ctx->const_refs, pool_offs);
+}
+
+// Emit the placeholder pair `ADRP dst, #0 ; ADD Xdst, Xdst, #0` and return the
+// position of the ADRP inside ctx->code.
+static int emit_adrp_add_placeholder( code_ctx *ctx, Arm64Reg dst ) {
+	int adrp_pos = byte_count(ctx->code);
+	encode_adrp(ctx, 0, 0, dst);                        // imm21 placeholder
+	encode_add_sub_imm(ctx, 1, 0, 0, 0, 0, dst, dst);   // ADD Xd, Xd, #0
+	return adrp_pos;
+}
+
+// Load the 64-bit constant `value` from the pool:
+// ADRP dst, page ; LDR dst, [dst, #lo12] — both immediates are patched later.
+static void emit_const_load( code_ctx *ctx, Arm64Reg dst, uint64_t value ) {
+	int adrp_pos = byte_count(ctx->code);
+	encode_adrp(ctx, 0, 0, dst);                     // imm21 placeholder
+	encode_ldr_str_imm(ctx, 3, 0, 1, 0, dst, dst);   // LDR Xd, [Xd, #0]
+	alloc_const(ctx, value, adrp_pos);
+}
+
+// Load the ADDRESS of the pool entry holding `value`:
+// ADRP dst, page ; ADD dst, dst, #lo12.
+static void emit_const_addr( code_ctx *ctx, Arm64Reg dst, uint64_t value ) {
+	int adrp_pos = emit_adrp_add_placeholder(ctx, dst);
+	alloc_const(ctx, value, adrp_pos);
+}
+
+// Same ADRP+ADD pair, but aimed at a caller-supplied offset INSIDE the const
+// table (used for jump-table base addressing). The offset itself is recorded;
+// no value lookup or insertion takes place.
+static void emit_pool_offset_addr( code_ctx *ctx, Arm64Reg dst, int const_offset ) {
+	int adrp_pos = emit_adrp_add_placeholder(ctx, dst);
+	record_const_ref(ctx, adrp_pos, const_offset);
+}
+
+// ----------------------------------------------------------------------------
+// Phase 4: call ops, LOAD_FUN, JUMP_TABLE.
+// ----------------------------------------------------------------------------
+
+// CALL_FUN: emit BL with a deferred imm26 patch (resolved in flush_consts once
+// jit->mod->functions_ptrs[fid] holds the in-output offset).
+// Records the triple (absolute BL position, function id from e->a, kind=0).
+static void emit_call_fun( code_ctx *ctx, einstr *e ) {
+	int pos = byte_count(ctx->code);
+	encode_branch_link(ctx, 0); // imm26 placeholder
+	int_arr_add_impl(&ctx->jit->galloc, &ctx->funs, ctx->jit->out_pos + pos);
+	int_arr_add_impl(&ctx->jit->galloc, &ctx->funs, (int)e->a);
+	int_arr_add_impl(&ctx->jit->galloc, &ctx->funs, /*kind=BL*/0);
+}
+
+// LOAD_FUN: emit ADRP + ADD with a deferred imm21+imm12 patch — produces the
+// absolute address of the JIT-compiled function in `out`.
+// A non-register destination is computed in ARM_TMP1 and then moved to out_e.
+static void emit_load_fun( code_ctx *ctx, ereg out_e, int fid ) {
+	Arm64Reg out = (REG_KIND(out_e) == R_REG) ? gpr_id(out_e) : ARM_TMP1;
+	int pos = byte_count(ctx->code);
+	encode_adrp(ctx, 0, 0, out);
+	encode_add_sub_imm(ctx, 1, 0, 0, 0, 0, out, out);
+	int_arr_add_impl(&ctx->jit->galloc, &ctx->funs, ctx->jit->out_pos + pos);
+	int_arr_add_impl(&ctx->jit->galloc, &ctx->funs, fid);
+	int_arr_add_impl(&ctx->jit->galloc, &ctx->funs, /*kind=ADRP+ADD*/1);
+	if( REG_KIND(out_e) != R_REG ) emit_mov(ctx, out_e, R(ARM_TMP1), M_PTR);
+}
+
+// CALL_PTR: indirect call via constant pool, with shortcuts for the two known
+// near-call targets (hl_null_access, hl_jit_null_field_access).
+static void emit_call_ptr( code_ctx *ctx, einstr *e ) {
+	uint64_t target = (uint64_t)e->value;
+	int near_pos = -1;
+	if( target == (uint64_t)(uintptr_t)hl_null_access )
+		near_pos = ctx->null_access_pos;
+	else if( target == (uint64_t)(uintptr_t)hl_jit_null_field_access )
+		near_pos = ctx->null_field_pos;
+
+	if( near_pos >= 0 ) {
+		// BL stub — direct PC-relative call to the trampoline emitted in
+		// hl_codegen_init. Both source and target are within the same output
+		// buffer, so resolve the imm26 immediately.
+		int pos = ctx->jit->out_pos + byte_count(ctx->code);
+		intptr_t delta = (intptr_t)near_pos - (intptr_t)pos;
+		int imm26 = (int)(delta >> 2);
+		encode_branch_link(ctx, imm26);
+	} else {
+		emit_const_load(ctx, ARM_TMP1, target);
+		encode_branch_reg(ctx, /*BLR*/1, ARM_TMP1);
+	}
+	// Sub-word return masking to match x86's MOVZX behavior.
+	if( e->mode == M_UI8 )
+		encode_logical_imm(ctx, 0, 0x00, 0, 0, 7, X0, X0);   // AND W0, W0, #0xFF
+	else if( e->mode == M_UI16 )
+		encode_logical_imm(ctx, 0, 0x00, 0, 0, 15, X0, X0);  // AND W0, W0, #0xFFFF
+}
+
+// CALL_REG: BLR Xn, where Xn holds the pointer-mode operand e->a (materialized
+// into ARM_TMP1 when it is not already in a register).
+static void emit_call_reg( code_ctx *ctx, einstr *e ) {
+	Arm64Reg target = materialize_gpr(ctx, e->a, M_PTR, ARM_TMP1);
+	encode_branch_reg(ctx, /*BLR*/1, target);
+}
+
+// JUMP_TABLE: dispatch through a const_table-resident jump table whose entries
+// are absolute target addresses (filled in hl_codegen_final). Index value lives
+// in e->a (32-bit int). Falls through after BR — caller assumes no return.
+static void emit_jump_table( code_ctx *ctx, einstr *e ) {
+	int n = e->nargs;
+	int start = reserve_const_segment(ctx, 8 * n, 16);
+
+	// Materialize index as a zero-extended 64-bit value. IR convention: e->a
+	// holds an int (M_I32); MOV Wn, Wn zero-extends to X.
+	Arm64Reg idx;
+	if( REG_KIND(e->a) == R_REG ) {
+		Arm64Reg src = gpr_id(e->a);
+		// MOV W17, Wsrc — clears upper 32 bits.
+		encode_logical_reg(ctx, 0, 0x01, 0, 0, src, 0, XZR, ARM_TMP2);
+		idx = ARM_TMP2;
+	} else {
+		emit_mov(ctx, R(ARM_TMP2), e->a, M_I32);
+		// Re-zero-extend to be safe.
+		encode_logical_reg(ctx, 0, 0x01, 0, 0, ARM_TMP2, 0, XZR, ARM_TMP2);
+		idx = ARM_TMP2;
+	}
+
+	emit_pool_offset_addr(ctx, ARM_TMP1, start);
+	// LDR X16, [X16, idx, LSL #3] size=3, V=0, opc=1, option=3 (LSL/UXTX), S=1
+	encode_ldr_str_reg(ctx, 3, 0, 1, idx, /*option=*/3, /*S=*/1, ARM_TMP1, ARM_TMP1);
+	encode_branch_reg(ctx, /*BR*/0, ARM_TMP1);
+
+	// Queue (table slot offset, target op index) pairs; hl_codegen_function
+	// rewrites the op index into a byte offset once pos_map is complete.
+	ereg *args = hl_emit_get_args(ctx->jit->emit, e);
+	for( int k = 0; k < n; k++ ) {
+		int_arr_add_impl(&ctx->jit->galloc, &ctx->const_addr, start + k * 8);
+		int_arr_add_impl(&ctx->jit->galloc, &ctx->const_addr, ctx->cur_op + (int)args[k] + 1);
+	}
+}
+
+// PREFETCH: emit PRFM prfop, [base] where e->size_offs selects the prefetch
+// operation and e->a supplies the address.
+static void emit_prefetch( code_ctx *ctx, einstr *e ) {
+	// Initialized so the encoded value is defined even if jit_error returns.
+	int prfop = 0;
+	switch( e->size_offs ) {
+	case 0: prfop = 0; break;  // PLDL1KEEP
+	case 1: prfop = 2; break;  // PLDL2KEEP
+	case 2: prfop = 4; break;  // PLDL3KEEP
+	case 3: prfop = 1; break;  // PLDL1STRM
+	case 4: prfop = 16; break; // PSTL1KEEP
+	default: jit_error("aarch64 PREFETCH: bad size_offs");
+	}
+	Arm64Reg base;
+	if( REG_KIND(e->a) == R_REG ) {
+		base = gpr_id(e->a);
+	} else {
+		emit_mov(ctx, R(ARM_TMP1), e->a, M_PTR);
+		base = ARM_TMP1;
+	}
+	// PRFM: size=11, V=0, opc=10, imm12=0, Rn=base, Rt=prfop
+	encode_ldr_str_imm(ctx, 3, 0, 2, 0, base, (Arm64Reg)prfop);
+}
+
+// ============================================================================
+// hl_codegen_flush
+// ============================================================================
+
+// Publish the current code buffer through jit->code_size / code_instrs /
+// code_pos_map and record the end-of-code position in pos_map[cur_op + 1].
+// Does nothing if the buffer was already flushed since `flushed` was cleared.
+void hl_codegen_flush( jit_ctx *jit ) {
+	code_ctx *ctx = jit->code;
+	if( ctx->flushed ) return;
+	ctx->flushed = true;
+	jit->code_size = ctx->code.cur;
+	jit->code_instrs = ctx->code.values;
+	jit->code_pos_map = ctx->pos_map;
+	if( ctx->pos_map ) ctx->pos_map[ctx->cur_op + 1] = ctx->code.cur;
+}
+
+// ============================================================================
+// hl_codegen_function — the main per-IR-op switch
+// ============================================================================
+
+// Generate machine code for the register-allocated IR in jit->reg_instrs
+// (jit->reg_instr_count entries, destinations in jit->reg_writes).
+//
+// For every op the byte offset of its first instruction is stored in
+// ctx->pos_map; after the last op the buffer is flushed, in-function branches
+// are patched, and jump-table targets queued by emit_jump_table are converted
+// from op indexes to byte offsets in the output buffer.
+void hl_codegen_function( jit_ctx *jit ) {
+	code_ctx *ctx = jit->code;
+	ctx->flushed = false;
+	byte_free(&ctx->code);
+	int_arr_free(&ctx->branch_fixups);
+	free(ctx->pos_map);
+	ctx->pos_map = (int*)malloc((jit->reg_instr_count + 1) * sizeof(int));
+	if( !ctx->pos_map ) jit_error("aarch64: out of memory allocating pos_map");
+	ctx->pos_map[0] = 0;
+	// Drop the op index left over from the previous function: hl_codegen_flush
+	// writes pos_map[cur_op + 1], which must stay inside the array allocated
+	// above even when this function has no instructions.
+	ctx->cur_op = -1;
+	byte_reserve(ctx->code, 64);
+	ctx->code.cur -= 64;
+
+	int const_addr_prev = int_arr_count(ctx->const_addr);
+
+	for( int cur_pos = 0; cur_pos < jit->reg_instr_count; cur_pos++ ) {
+		einstr *e = jit->reg_instrs + cur_pos;
+		ereg out = jit->reg_writes[cur_pos];
+		// Make sure at least 64 bytes are available, without consuming them.
+		byte_reserve(ctx->code, 64);
+		ctx->code.cur -= 64;
+		ctx->cur_op = cur_pos;
+		if( cur_pos > 0 ) ctx->pos_map[cur_pos] = ctx->code.cur;
+
+		switch( e->op ) {
+		case LOAD_ARG:
+			// nop — argument lives in its allocated register already
+			continue;
+		case NOP:
+			// HINT #0 (NOP)
+			EMIT32(ctx, 0xD503201F);
+			break;
+		case MOV:
+			emit_mov(ctx, out, e->a, e->mode);
+			break;
+		case LOAD_CONST:
+			emit_load_const(ctx, out, e->value, e->mode);
+			break;
+		case RET:
+			// Result placement was handled by upstream regs phase via a preceding
+			// regs_emit_mov(out, e->a). Here we just emit the actual return.
+			encode_branch_reg(ctx, /*opc=*/2 /*RET*/, LR);
+			break;
+		case PUSH:
+			emit_push(ctx, e->a, e->mode);
+			break;
+		case POP:
+			emit_pop(ctx, e->a, e->mode);
+			break;
+		case STACK_OFFS:
+			emit_sp_offs(ctx, e->size_offs);
+			break;
+		case CMP:
+			emit_cmp(ctx, e);
+			break;
+		case TEST:
+			emit_test(ctx, e);
+			break;
+		case JCOND:
+			emit_jump_cond(ctx, get_cond_jump(ctx), e->size_offs);
+			break;
+		case JUMP:
+			emit_jump(ctx, e->size_offs);
+			break;
+		case DEBUG_BREAK:
+			// BRK #0 — encoded as 0xD4200000
+			EMIT32(ctx, 0xD4200000);
+			break;
+		case BINOP:
+			if( is_fp_mode(e->mode) )
+				emit_binop_fp(ctx, (hl_op)e->size_offs, out, e->a, e->b, e->mode);
+			else
+				emit_binop_int(ctx, (hl_op)e->size_offs, out, e->a, e->b, e->mode);
+			break;
+		case UNOP:
+			// jit_emit.c lowers `not b` and similar boolean toggles as a UNOP
+			// with two operands (a, b=immediate, op=OXor). Dispatch the
+			// two-operand form through the regular binop handler so OXor/OAnd/OOr
+			// don't need a second copy of the encoding.
+			if( !IS_NULL(e->b) ) {
+				if( is_fp_mode(e->mode) )
+					emit_binop_fp(ctx, (hl_op)e->size_offs, out, e->a, e->b, e->mode);
+				else
+					emit_binop_int(ctx, (hl_op)e->size_offs, out, e->a, e->b, e->mode);
+			} else {
+				emit_unop(ctx, (hl_op)e->size_offs, out, e->a, e->mode);
+			}
+			break;
+		case CONV:
+			emit_conv(ctx, e, out, /*unsign=*/false);
+			break;
+		case CONV_UNSIGNED:
+			emit_conv(ctx, e, out, /*unsign=*/true);
+			break;
+		case STORE:
+			emit_store(ctx, e);
+			break;
+		case LOAD_ADDR:
+			emit_load_addr(ctx, e, out);
+			break;
+		case LEA:
+			emit_lea(ctx, e, out);
+			break;
+		case CMOV:
+			emit_cmov_arm(ctx, out, e->a, get_cond_jump(ctx));
+			break;
+		case XCHG:
+			emit_xchg(ctx, e);
+			break;
+		case CXCHG:
+			// x86 emits BREAK() here too — atomic compare-exchange unimplemented.
+			EMIT32(ctx, 0xD4200000);
+			break;
+		case PUSH_CONST:
+			emit_push_const(ctx, e);
+			break;
+		case PREFETCH:
+			emit_prefetch(ctx, e);
+			break;
+		case CALL_FUN:
+			emit_call_fun(ctx, e);
+			break;
+		case CALL_PTR:
+			emit_call_ptr(ctx, e);
+			break;
+		case CALL_REG:
+			emit_call_reg(ctx, e);
+			break;
+		case LOAD_FUN:
+			emit_load_fun(ctx, out, e->size_offs);
+			break;
+		case JUMP_TABLE:
+			emit_jump_table(ctx, e);
+			break;
+		case ADDRESS:
+			// Rewritten to LEA in the regs phase; should never reach here.
+			jit_error("aarch64: ADDRESS reached backend (regs phase should rewrite)");
+			break;
+		case CATCH:
+			// IR marker only (mirrors x86) — no code emitted.
+			break;
+		default:
+			{
+				// NOTE(review): this table is indexed by e->op, so its order must
+				// match the IR op enum declaration — confirm when ops are added.
+				static const char *op_names[] = {
+					"LOAD_ADDR", "LOAD_CONST", "LOAD_ARG", "LOAD_FUN", "STORE",
+					"LEA", "TEST", "CMP", "JCOND", "JUMP", "JUMP_TABLE",
+					"BINOP", "UNOP", "CONV", "CONV_UNSIGNED", "RET",
+					"CALL_PTR", "CALL_REG", "CALL_FUN", "MOV", "CMOV",
+					"XCHG", "CXCHG", "PUSH_CONST", "PUSH", "POP",
+					"ALLOC_STACK", "PREFETCH", "DEBUG_BREAK", "BLOCK",
+					"ENTER", "STACK_OFFS", "CATCH", "ADDRESS", "NOP"
+				};
+				static char errbuf[128];
+				const char *name = (e->op < (int)(sizeof(op_names)/sizeof(*op_names)))
+					? op_names[e->op] : "?";
+				snprintf(errbuf, sizeof(errbuf), "aarch64: unhandled IR op %s (%d) at cur_op=%d",
+					name, e->op, cur_pos);
+				jit_error(errbuf);
+			}
+			break;
+		}
+
+		if( ctx->code.cur > ctx->code.max ) jit_error("aarch64 code buffer overrun");
+	}
+
+	// Functions are 4-byte aligned naturally on ARM; no padding needed for now.
+	hl_codegen_flush(jit);
+
+	// Patch all in-function branches.
+	// branch_fixups holds (position, target op index, is_cond) triples.
+	for( int i = 0; i < int_arr_count(ctx->branch_fixups); i += 3 ) {
+		int pos = int_arr_get(ctx->branch_fixups, i);
+		int target_op = int_arr_get(ctx->branch_fixups, i + 1);
+		int is_cond = int_arr_get(ctx->branch_fixups, i + 2);
+		int target_byte_pos = ctx->pos_map[target_op];
+		patch_branch(ctx, pos, target_byte_pos, is_cond);
+	}
+
+	// Convert any jump-table target_op_index entries recorded by emit_jump_table
+	// into absolute byte offsets in the output buffer.
+	for( int i = const_addr_prev; i < int_arr_count(ctx->const_addr); i += 2 ) {
+		int target_op = int_arr_get(ctx->const_addr, i + 1);
+		int offs = jit->out_pos + ctx->pos_map[target_op];
+		ctx->const_addr.values[i + 1] = offs;
+	}
+}
+
+// ============================================================================
+// Phase 4: module-level emission.
+// ============================================================================
+
+// Helper: finalize a freshly-emitted helper stub (null-access stubs, c2hl,
+// hl2c).
Mirrors x86's flush_function: reports the start/size to the unwind
+// machinery and rounds the function buffer to 16 bytes.
+// `start` is the stub's absolute start position (jit->out_pos based); the size
+// passed to hl_jit_define_function is measured before the NOP padding is added.
+static void flush_helper( code_ctx *ctx, int start ) {
+	hl_jit_define_function(ctx->jit, start, ctx->jit->out_pos + byte_count(ctx->code) - start);
+	while( byte_count(ctx->code) & 15 )
+		EMIT32(ctx, 0xD503201F); // NOP
+	if( byte_count(ctx->code) > ctx->code.max ) jit_error("aarch64 trampoline overrun");
+}
+
+// Patch a placeholder branch emitted at byte position `pos` to target byte
+// position `target` in the same buffer. B and BL (recognized by their top six
+// opcode bits) get an imm26; every other instruction is treated as carrying an
+// imm19 in bits [23:5] — that covers B.cond and also the CBZ placeholder that
+// emit_c2hl_trampoline patches through this function.
+// No range check is performed on the displacement.
+static void patch_helper_branch( code_ctx *ctx, int pos, int target ) {
+	int delta = (target - pos) >> 2; // displacement in 4-byte instructions
+	unsigned int *insn = (unsigned int*)&ctx->code.values[pos];
+	unsigned int op = (*insn >> 26) & 0x3F;
+	if( op == 0x05 || op == 0x25 ) {
+		// B / BL: imm26
+		*insn = (*insn & ~0x03FFFFFFu) | ((unsigned)delta & 0x03FFFFFF);
+	} else {
+		// B.cond / CBZ: imm19 in bits [23:5]
+		*insn = (*insn & ~(0x7FFFFu << 5)) | ((unsigned)(delta & 0x7FFFF) << 5);
+	}
+}
+
+// Emit a function prologue compatible with the Apple ARM64 + AAPCS64 ABI:
+// STP X29, X30, [SP, #-16]! ; MOV X29, SP.
+// NOTE(review): the `mode` argument here is 0x03, while the signed-offset
+// stores in emit_hl2c_trampoline pass 0x12 ("signed-offset STORE") and the
+// loads pass 0x02 — the load/store convention is defined by encode_ldp_stp,
+// which is not visible here; confirm 0x03 really encodes a pre-indexed STORE.
+static void emit_helper_prologue( code_ctx *ctx ) {
+	encode_ldp_stp(ctx, /*opc=*/2, /*V=*/0, /*mode=*/0x03, /*imm7=*/-2 & 0x7F, LR, SP_REG, FP);
+	emit_mov_gpr(ctx, FP, SP_REG, 1);
+}
+
+// Emit the standard epilogue used by all helpers/trampolines:
+// MOV SP, X29 ; LDP X29, X30, [SP], #16 ; RET.
+// Resetting SP from X29 first discards whatever the helper allocated below
+// its frame record.
+static void emit_helper_epilogue( code_ctx *ctx ) {
+	emit_mov_gpr(ctx, SP_REG, FP, 1);
+	encode_ldp_stp(ctx, /*opc=*/2, /*V=*/0, /*mode=*/0x01, /*imm7=*/2, LR, SP_REG, FP);
+	encode_branch_reg(ctx, /*RET*/2, LR);
+}
+
+// Emit hl_null_access stub: ADRP/LDR the C function pointer and BLR (it never
+// returns; we still emit a BRK afterward to mirror x86).
+static void emit_null_access_stub( code_ctx *ctx, void *target ) {
+	emit_helper_prologue(ctx);
+	emit_const_load(ctx, ARM_TMP1, (uint64_t)(uintptr_t)target);
+	encode_branch_reg(ctx, /*BLR*/1, ARM_TMP1);
+	EMIT32(ctx, 0xD4200000); // BRK #0
+}
+
+// Emit hl_jit_null_field_access stub. The caller passes the field hash in W0.
+// The C function takes one int argument (the hash), so our trampoline doesn't
+// need to marshal — just forward.
+// The emitted sequence is the same as emit_null_access_stub; only `target`
+// differs.
+static void emit_null_field_stub( code_ctx *ctx, void *target ) {
+	emit_helper_prologue(ctx);
+	emit_const_load(ctx, ARM_TMP1, (uint64_t)(uintptr_t)target);
+	encode_branch_reg(ctx, /*BLR*/1, ARM_TMP1);
+	EMIT32(ctx, 0xD4200000); // BRK #0
+}
+
+// Emit the c2hl trampoline.
+//
+// Called from C with: X0 = JIT-compiled fn ptr, X1 = &vargs (struct{regs[16];
+// stack[16]}), X2 = stack-arg count.
+//
+// The C side (jit.c:callback_c2hl) populates vargs.regs[0..7] with int reg
+// args, vargs.regs[8..15] with FP reg args, and vargs.stack[16-N..15] with the
+// N stack args (leftmost stack arg at vargs.stack[15]). We:
+//   1. Load X0..X7 from [vargs+0..56] and D0..D7 from [vargs+64..120].
+//   2. Push the stack args in reverse order so the leftmost ends up at SP+0.
+//   3. BLR fn ; restore frame ; RET.
+//
+// X16/X17 hold the fn pointer and vargs through the call (they survive any
+// data-load up to BLR; the dynamic linker only clobbers them at the BLR itself,
+// at which point we're done with them).
+//
+// NOTE(review): the copy loop below walks source and destination upwards
+// together, so the word at the lowest source address lands at SP+0 — check
+// that against step 2 above and the layout produced by callback_c2hl.
+static void emit_c2hl_trampoline( code_ctx *ctx ) {
+	emit_helper_prologue(ctx);
+	emit_mov_gpr(ctx, ARM_TMP1, X0, 1); // X16 = fn
+	emit_mov_gpr(ctx, ARM_TMP2, X1, 1); // X17 = vargs
+	emit_mov_gpr(ctx, X9, X2, 1); // X9 = stack count
+
+	// Load int arg regs from vargs.regs[0..7].
+	encode_ldp_stp(ctx, 0x02, 0, 0x02, 0, X1, ARM_TMP2, X0); // LDP X0,X1, [X17, #0]
+	encode_ldp_stp(ctx, 0x02, 0, 0x02, 2, X3, ARM_TMP2, X2); // LDP X2,X3, [X17, #16]
+	encode_ldp_stp(ctx, 0x02, 0, 0x02, 4, X5, ARM_TMP2, X4); // LDP X4,X5, [X17, #32]
+	encode_ldp_stp(ctx, 0x02, 0, 0x02, 6, X7, ARM_TMP2, X6); // LDP X6,X7, [X17, #48]
+	// Load FP arg regs from vargs.regs[8..15] (= byte offsets 64..120).
+	encode_ldp_stp(ctx, 0x01, 1, 0x02, 8, (Arm64Reg)1, ARM_TMP2, (Arm64Reg)0); // LDP D0,D1, [X17, #64]
+	encode_ldp_stp(ctx, 0x01, 1, 0x02, 10, (Arm64Reg)3, ARM_TMP2, (Arm64Reg)2); // LDP D2,D3, [X17, #80]
+	encode_ldp_stp(ctx, 0x01, 1, 0x02, 12, (Arm64Reg)5, ARM_TMP2, (Arm64Reg)4); // LDP D4,D5, [X17, #96]
+	encode_ldp_stp(ctx, 0x01, 1, 0x02, 14, (Arm64Reg)7, ARM_TMP2, (Arm64Reg)6); // LDP D6,D7, [X17, #112]
+
+	// Push stack args, padding SP to 16 bytes if N is odd.
+	// total bytes = N*8 + (N&1)*8 — always a multiple of 16.
+
+	// CBZ X9, no_stack — skip everything if no stack args.
+	int cbz_skip_pos = byte_count(ctx->code);
+	encode_cbz_cbnz(ctx, /*sf=*/1, /*op=*/0, 0, X9);
+
+	// X10 = X9 * 8 (size in bytes; LSL #3 via UBFM).
+	emit_bitfield(ctx, /*sf=*/1, /*opc=UBFM*/0x02, /*immr=*/(64 - 3) & 0x3F, /*imms=*/63 - 3, X9, X10);
+
+	// Pad: if X9 is odd, allocate +8. X10 += (X9 & 1) << 3
+	// AND X11, X9, #1 ; LSL X11, X11, #3 ; ADD X10, X10, X11.
+	encode_logical_imm(ctx, 1, 0x00, 1, 0, 0, X9, X11); // AND X11, X9, #1 (immr=0,imms=0,N=1 → 1)
+	emit_bitfield(ctx, 1, 0x02, (64 - 3) & 0x3F, 63 - 3, X11, X11);
+	encode_add_sub_reg(ctx, 1, 0, 0, 0, X11, 0, X10, X10);
+
+	// SUB SP, SP, X10 — must use ADD/SUB (extended register); the shifted-reg
+	// form treats register 31 as XZR, not SP, so this would silently NOP out.
+	encode_add_sub_ext(ctx, 1, 1, 0, X10, /*UXTX*/3, 0, SP_REG, SP_REG);
+
+	// Source pointer X12 = vargs + (32 - N) * 8 = vargs + 256 - X10
+	// Compute via X12 = vargs + 256, then X12 -= X10.
+	// NOTE(review): X10 includes the 8 padding bytes when N is odd, so in that
+	// case X12 starts one slot below vargs + (32 - N) * 8 and the N-word copy
+	// stops one slot short of vargs.stack[15] — confirm the C side accounts
+	// for this (or subtract N*8 instead of X10 here).
+	encode_add_sub_imm(ctx, 1, 0, 0, 0, 256, ARM_TMP2, X12); // ADD X12, X17, #256
+	encode_add_sub_reg(ctx, 1, 1, 0, 0, X10, 0, X12, X12); // SUB X12, X12, X10
+
+	// Destination pointer X13 = SP
+	emit_mov_gpr(ctx, X13, SP_REG, 1);
+
+	// Counter X14 = X9
+	emit_mov_gpr(ctx, X14, X9, 1);
+
+	// Copy loop: while X14 != 0: *X13++ = *X12++ ; X14--.
+	int loop_top = byte_count(ctx->code);
+	encode_ldr_str_imm(ctx, 3, 0, 1, 0, X12, X15); // LDR X15, [X12, #0]
+	encode_add_sub_imm(ctx, 1, 0, 0, 0, 8, X12, X12); // ADD X12, X12, #8
+	encode_ldr_str_imm(ctx, 3, 0, 0, 0, X13, X15); // STR X15, [X13, #0]
+	encode_add_sub_imm(ctx, 1, 0, 0, 0, 8, X13, X13); // ADD X13, X13, #8
+	encode_add_sub_imm(ctx, 1, 1, 1, 0, 1, X14, X14); // SUBS X14, X14, #1
+	int loop_branch_pos = byte_count(ctx->code);
+	encode_branch_cond(ctx, 0, COND_NE); // B.NE loop_top
+	patch_helper_branch(ctx, loop_branch_pos, loop_top);
+
+	// Patch the CBZ skip target = end of stack-push block.
+	int after_stack = byte_count(ctx->code);
+	patch_helper_branch(ctx, cbz_skip_pos, after_stack);
+	// --- END STACK PUSH ---
+
+	// BLR fn (X16).
+	encode_branch_reg(ctx, /*BLR*/1, ARM_TMP1);
+
+	// The epilogue resets SP from X29, which also releases the pushed args.
+	emit_helper_epilogue(ctx);
+}
+
+// Emit the hl2c trampoline. Called from JIT-compiled HL code; X0 holds the
+// closure (vclosure_wrapper*), X1..X7,V0..V7 hold call args. We:
+//   1. Spill X0..X7 and V0..V7 into a 128-byte buffer beneath the saved frame.
+//   2. Inspect cl->t->fun->ret->kind to decide between hl_jit_wrapper_ptr
+//      (default) and hl_jit_wrapper_d (HF32/HF64 return).
+//   3. Call wrapper(closure, &caller_stack_args, &spilled_regs).
+static void emit_hl2c_trampoline( code_ctx *ctx ) {
+	// Never dereferenced: only used to compute the offset of `ret` below.
+	hl_type_fun *ft = NULL;
+
+	emit_helper_prologue(ctx);
+	emit_sp_offs(ctx, -128); // SUB SP, SP, #128
+
+	// Spill X0..X7 → [SP+0..56]. mode 0x12 = signed-offset STORE.
+	encode_ldp_stp(ctx, 0x02, 0, 0x12, 0, X1, SP_REG, X0); // STP X0,X1, [SP, #0]
+	encode_ldp_stp(ctx, 0x02, 0, 0x12, 2, X3, SP_REG, X2); // STP X2,X3, [SP, #16]
+	encode_ldp_stp(ctx, 0x02, 0, 0x12, 4, X5, SP_REG, X4); // STP X4,X5, [SP, #32]
+	encode_ldp_stp(ctx, 0x02, 0, 0x12, 6, X7, SP_REG, X6); // STP X6,X7, [SP, #48]
+	// Spill V0..V7 → [SP+64..120] (V0 at lowest, matching wrapper expectations).
+	encode_ldp_stp(ctx, 0x01, 1, 0x12, 8, (Arm64Reg)1, SP_REG, (Arm64Reg)0); // STP D0,D1, [SP, #64]
+	encode_ldp_stp(ctx, 0x01, 1, 0x12, 10, (Arm64Reg)3, SP_REG, (Arm64Reg)2); // STP D2,D3, [SP, #80]
+	encode_ldp_stp(ctx, 0x01, 1, 0x12, 12, (Arm64Reg)5, SP_REG, (Arm64Reg)4); // STP D4,D5, [SP, #96]
+	encode_ldp_stp(ctx, 0x01, 1, 0x12, 14, (Arm64Reg)7, SP_REG, (Arm64Reg)6); // STP D6,D7, [SP, #112]
+
+	// X9 = closure (still in X0 — copy to keep X0 alive across loads).
+	emit_mov_gpr(ctx, X9, X0, 1);
+	// X9 = X9->t ; LDR X9, [X9, #0]
+	encode_ldr_str_imm(ctx, 3, 0, 1, 0, X9, X9);
+	// X9 = X9->fun ; LDR X9, [X9, #8]
+	encode_ldr_str_imm(ctx, 3, 0, 1, 1, X9, X9);
+	// X9 = X9->ret ; LDR X9, [X9, #offsetof(hl_type_fun, ret)]
+	int ret_offset = (int)(int_val)&ft->ret;
+	if( (ret_offset & 7) == 0 && (unsigned)ret_offset < 0x8000 )
+		// Offset fits the 8-byte-scaled unsigned imm12 of LDR.
+		encode_ldr_str_imm(ctx, 3, 0, 1, ret_offset / 8, X9, X9);
+	else {
+		load_immediate(ctx, ret_offset, X10, true);
+		encode_ldr_str_reg(ctx, 3, 0, 1, X10, /*option=*/3, /*S=*/0, X9, X9);
+	}
+	// W9 = W9->kind ; LDR W9, [X9, #0]
+	encode_ldr_str_imm(ctx, 2, 0, 1, 0, X9, X9);
+
+	// Branch on return-type kind. HF64 / HF32 → wrapper_d; default → wrapper_ptr.
+	encode_add_sub_imm(ctx, 0, 1, 1, 0, HF64, X9, XZR); // CMP W9, #HF64
+	int jeq_f64 = byte_count(ctx->code);
+	encode_branch_cond(ctx, 0, COND_EQ);
+	encode_add_sub_imm(ctx, 0, 1, 1, 0, HF32, X9, XZR); // CMP W9, #HF32
+	int jeq_f32 = byte_count(ctx->code);
+	encode_branch_cond(ctx, 0, COND_EQ);
+
+	// Default path: load wrapper_ptr.
+	emit_const_load(ctx, ARM_TMP1, (uint64_t)(uintptr_t)hl_jit_wrapper_ptr);
+	int jdone_default = byte_count(ctx->code);
+	encode_branch_uncond(ctx, 0);
+
+	// Float path.
+	int float_path = byte_count(ctx->code);
+	patch_helper_branch(ctx, jeq_f64, float_path);
+	patch_helper_branch(ctx, jeq_f32, float_path);
+	emit_const_load(ctx, ARM_TMP1, (uint64_t)(uintptr_t)hl_jit_wrapper_d);
+
+	int after_select = byte_count(ctx->code);
+	patch_helper_branch(ctx, jdone_default, after_select);
+
+	// Set up wrapper args:
+	// X0 (closure) — already in X0 across the type-walk because the LDR chain
+	// above used X9 only. ✓
+	// X1 = caller stack args = X29 + 16 (skip saved fp+lr).
+	encode_add_sub_imm(ctx, 1, 0, 0, 0, 16, FP, X1);
+	// X2 = &spilled regs = SP.
+	emit_mov_gpr(ctx, X2, SP_REG, 1);
+
+	// Call wrapper.
+	encode_branch_reg(ctx, /*BLR*/1, ARM_TMP1);
+
+	// The epilogue resets SP from X29, releasing the 128-byte spill buffer.
+	emit_helper_epilogue(ctx);
+}
+
+// Emit the module-level helper stubs at the start of the code buffer, in this
+// order: hl_null_access stub, hl_jit_null_field_access stub, c2hl trampoline,
+// hl2c trampoline. Each stub's absolute start position is recorded
+// (ctx->null_access_pos, ctx->null_field_pos, jit->code_funs.c2hl/.hl2c) and
+// each is finalized with flush_helper before the buffer is flushed.
+void hl_codegen_init( jit_ctx *jit ) {
+	code_ctx *ctx = jit->code;
+	// Ensure 4096 bytes of capacity without consuming them.
+	byte_reserve(ctx->code, 4096);
+	ctx->code.cur -= 4096;
+
+	// hl_null_access stub.
+	ctx->null_access_pos = jit->out_pos + byte_count(ctx->code);
+	emit_null_access_stub(ctx, (void*)hl_null_access);
+	flush_helper(ctx, ctx->null_access_pos);
+
+	// hl_jit_null_field_access stub.
+	ctx->null_field_pos = jit->out_pos + byte_count(ctx->code);
+	emit_null_field_stub(ctx, (void*)hl_jit_null_field_access);
+	flush_helper(ctx, ctx->null_field_pos);
+
+	// c2hl + hl2c trampolines.
+	jit->code_funs.c2hl = jit->out_pos + byte_count(ctx->code);
+	emit_c2hl_trampoline(ctx);
+	flush_helper(ctx, jit->code_funs.c2hl);
+
+	jit->code_funs.hl2c = jit->out_pos + byte_count(ctx->code);
+	emit_hl2c_trampoline(ctx);
+	flush_helper(ctx, jit->code_funs.hl2c);
+
+	hl_codegen_flush(jit);
+}
+
+// ---------------------------------------------------------------------------
+// hl_codegen_flush_consts: patch BL/ADRP/LDR/ADD references against absolute
+// positions, then append the constant table to the output stream.
+// ---------------------------------------------------------------------------
+
+// Patch ADRP imm21 split (immlo at bits 30:29, immhi at bits 23:5) given a
+// target byte address `target_abs` and the address `pc_abs` of the ADRP insn.
+// Both are absolute byte offsets within `jit->output` (page-aligned arithmetic
+// is preserved when the buffer is later mmap'd to a page-aligned VA).
+// The immediate is the difference of the two 4 KiB page indexes.
+static void patch_adrp_imm21( unsigned char *out, int pc_abs, int target_abs ) {
+	int imm21 = (target_abs >> 12) - (pc_abs >> 12);
+	unsigned int *insn = (unsigned int*)(out + pc_abs);
+	unsigned int immlo = (unsigned)(imm21 & 0x3);
+	unsigned int immhi = (unsigned)((imm21 >> 2) & 0x7FFFF);
+	*insn = (*insn & ~((0x3u << 29) | (0x7FFFFu << 5)))
+		| (immlo << 29) | (immhi << 5);
+}
+
+// Patch ADD/LDR imm12 (bits 21:10). `scale` is the instruction's natural
+// immediate scale (1 for ADD, 8 for 64-bit LDR, etc.). Caller guarantees the
+// low bits of the target are aligned to `scale`.
+// `pos` is the byte offset of the instruction inside `out`.
+static void patch_imm12( unsigned char *out, int pos, int target_lo12, int scale ) {
+	unsigned int *insn = (unsigned int*)(out + pos);
+	unsigned int imm12 = (unsigned)((target_lo12 / scale) & 0xFFF);
+	*insn = (*insn & ~(0xFFFu << 10)) | (imm12 << 10);
+}
+
+// Resolve every deferred reference now that all functions have been emitted:
+//   1. patch the cross-function BL / ADRP+ADD sites queued in ctx->funs,
+//   2. pad jit->out_pos to 8 bytes and expose the constant table through
+//      jit->code_size / jit->code_instrs,
+//   3. patch the ADRP+(LDR|ADD) constant-pool references in ctx->const_refs,
+//   4. reset the queues and release the constant table bookkeeping.
+void hl_codegen_flush_consts( jit_ctx *jit ) {
+	code_ctx *ctx = jit->code;
+
+	// Patch cross-function call sites recorded in `funs`.
+	// Entries are (position, function id, kind) triples.
+	for( int i = 0; i < int_arr_count(ctx->funs); i += 3 ) {
+		int pos = int_arr_get(ctx->funs, i);
+		int fid = int_arr_get(ctx->funs, i + 1);
+		int kind = int_arr_get(ctx->funs, i + 2);
+		intptr_t target_offs = (intptr_t)jit->mod->functions_ptrs[fid];
+		if( kind == 0 ) {
+			// BL imm26.
+			// NOTE(review): the displacement is masked to 26 bits without a
+			// range check — assumes the output never exceeds the BL reach.
+			intptr_t delta = target_offs - (intptr_t)pos;
+			int imm26 = (int)(delta >> 2);
+			unsigned int *insn = (unsigned int*)(jit->output + pos);
+			*insn = (*insn & ~0x03FFFFFFu) | ((unsigned)imm26 & 0x03FFFFFF);
+		} else {
+			// ADRP+ADD pair: pos = ADRP, pos+4 = ADD.
+			patch_adrp_imm21(jit->output, pos, (int)target_offs);
+			int lo12 = (int)target_offs & 0xFFF;
+			patch_imm12(jit->output, pos + 4, lo12, /*scale=*/1);
+		}
+	}
+	int_arr_reset(&ctx->funs);
+
+	// Pad jit->out_pos to an 8-byte boundary so that constants at offset 0
+	// (and every multiple of 8) within the table are reachable through LDR's
+	// 8-byte-scaled imm12 field with no precision loss.
+	while( jit->out_pos & 7 ) {
+		if( jit->out_pos < jit->out_max ) jit->output[jit->out_pos] = 0;
+		jit->out_pos++;
+	}
+
+	// Append the constant table to the output stream.
+	jit->code_size = byte_count(ctx->const_table);
+	jit->code_instrs = ctx->const_table.values;
+	ctx->const_table_pos = jit->out_pos;
+
+	// Patch ADRP+(LDR|ADD) const-pool refs.
+	// Entries are (absolute ADRP position, constant-table offset) pairs.
+	for( int i = 0; i < int_arr_count(ctx->const_refs); i += 2 ) {
+		int adrp_pos = int_arr_get(ctx->const_refs, i);
+		int coffs = int_arr_get(ctx->const_refs, i + 1);
+		int target = ctx->const_table_pos + coffs;
+		patch_adrp_imm21(jit->output, adrp_pos, target);
+		// Detect whether the second insn is LDR (Xt|Dt|St) or ADD by inspecting
+		// the top 10 bits (31:22). LDR (unsigned-imm) encoding is
+		// `size 111 V 01 01 imm12 Rn Rt`; the 8-byte-scaled imm12 lives in
+		// bits 21:10. ADD-imm leaves the imm12 unscaled.
+		// Top10 bits (>>22) of canonical encodings:
+		// LDR Xt (size=11,V=0,opc=01): 0b1111100101 = 0x3E5 (scale=8)
+		// LDR Dt (size=11,V=1,opc=01): 0b1111110101 = 0x3F5 (scale=8)
+		// LDR St (size=10,V=1,opc=01): 0b1011110101 = 0x2F5 (scale=4)
+		// ADD-imm always falls into the else.
+		unsigned int second = *(unsigned int*)(jit->output + adrp_pos + 4);
+		int lo12 = target & 0xFFF;
+		switch( (second >> 22) & 0x3FF ) {
+		case 0x3E5: // LDR Xt
+		case 0x3F5: // LDR Dt
+			patch_imm12(jit->output, adrp_pos + 4, lo12, /*scale=*/8);
+			break;
+		case 0x2F5: // LDR St
+			patch_imm12(jit->output, adrp_pos + 4, lo12, /*scale=*/4);
+			break;
+		default:
+			// ADD (imm), unscaled.
+			patch_imm12(jit->output, adrp_pos + 4, lo12, /*scale=*/1);
+			break;
+		}
+	}
+	int_arr_reset(&ctx->const_refs);
+
+	// NOTE(review): jit->code_instrs still points at const_table.values here —
+	// presumably byte_free does not invalidate that storage before the caller
+	// copies it; confirm against the allocator.
+	byte_free(&ctx->const_table);
+	value_map_free(&ctx->const_table_lookup);
+}
+
+// Final fix-up once the code lives at its final address: write into each
+// jump-table slot the absolute address jit->final_code + target offset.
+// ctx->const_addr holds (table slot offset, target byte offset) pairs and is
+// released afterwards.
+void hl_codegen_final( jit_ctx *jit ) {
+	code_ctx *ctx = jit->code;
+	// Fill jump-table entries with absolute addresses inside final_code.
+	for( int i = 0; i < int_arr_count(ctx->const_addr); i += 2 ) {
+		int table_offs = int_arr_get(ctx->const_addr, i);
+		int target_offs = int_arr_get(ctx->const_addr, i + 1);
+		*(void**)(jit->final_code + ctx->const_table_pos + table_offs) =
+			jit->final_code + target_offs;
+	}
+	int_arr_free(&ctx->const_addr);
+}
diff --git a/src/jit_aarch64_emit.c b/src/jit_aarch64_emit.c
new file mode 100644
index 000000000..dbef3be37
--- /dev/null
+++ b/src/jit_aarch64_emit.c
@@ -0,0 +1,864 @@
+/*
+ * Copyright (C)2015-2016 Haxe Foundation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */ + +/* + * AArch64 Instruction Encoding + * + * This file provides low-level instruction encoding functions for the AArch64 + * architecture. All instructions are 32-bit fixed width. + * + * References: + * - ARM Architecture Reference Manual ARMv8 (ARM ARM) + * - AArch64 Instruction Set Architecture + */ + +#if !defined(__aarch64__) && !defined(_M_ARM64) +# error "This file is for AArch64 architecture only." +#endif + +#include "jit_aarch64_emit.h" + +/* + * Helper macros for bit field manipulation + */ +#define BITS(val, start, len) (((unsigned int)(val) & ((1u << (len)) - 1)) << (start)) +#define BIT(val, pos) (((unsigned int)(val) & 1) << (pos)) + +// EMIT32 is defined in jit_common.h + +// ============================================================================ +// ADD/SUB Instructions +// ============================================================================ + +/** + * Encode ADD/SUB (immediate) instruction + * Format: ADD/SUB Xd, Xn, #imm12 [, LSL #shift] + * + * @param sf 1=64-bit, 0=32-bit + * @param op 0=ADD, 1=SUB + * @param S 1=set flags (ADDS/SUBS), 0=don't set flags + * @param shift 0=LSL #0, 1=LSL #12 + * @param imm12 12-bit unsigned immediate + * @param Rn Source register (0-31, 31=SP) + * @param Rd Destination register (0-31, 31=SP) + */ +void encode_add_sub_imm(code_ctx *ctx, int sf, int op, int S, int shift, int imm12, Arm64Reg Rn, Arm64Reg Rd) { + // ADD/SUB (immediate) encoding: + // [31] = sf, [30] = op (0=ADD, 1=SUB), [29] = S, [28:23] = 100010, [22] = sh + // [21:10] = imm12, [9:5] = Rn, [4:0] = Rd + unsigned int insn = BIT(sf, 31) | // [31] = sf + BIT(op, 30) | // [30] = op + BIT(S, 29) | // [29] = S + BITS(0x22, 23, 6) | // [28:23] = 100010 + BIT(shift, 22) | // [22] = sh + BITS(imm12, 10, 12) | // [21:10] = imm12 + BITS(Rn, 5, 5) | // [9:5] = Rn + BITS(Rd, 0, 5); // [4:0] = Rd + EMIT32(ctx, insn); +} + +/** + * Encode ADD/SUB (shifted register) instruction + * Format: ADD/SUB Xd, Xn, Xm [, shift #amount] + * + * @param sf 
1=64-bit, 0=32-bit + * @param op 0=ADD, 1=SUB + * @param S 1=set flags, 0=don't set flags + * @param shift 00=LSL, 01=LSR, 10=ASR + * @param Rm Second source register + * @param imm6 Shift amount (0-63) + * @param Rn First source register + * @param Rd Destination register + */ +void encode_add_sub_reg(code_ctx *ctx, int sf, int op, int S, int shift, Arm64Reg Rm, + int imm6, Arm64Reg Rn, Arm64Reg Rd) { + unsigned int insn = BIT(sf, 31) | BITS(op, 30, 1) | BIT(S, 29) | BITS(0x0B, 24, 5) | + BITS(shift, 22, 2) | BITS(Rm, 16, 5) | BITS(imm6, 10, 6) | + BITS(Rn, 5, 5) | BITS(Rd, 0, 5); + EMIT32(ctx, insn); +} + +/** + * Encode ADD/SUB (extended register) instruction + * Format: ADD/SUB Xd, Xn, Wm, extend [#amount] + * + * @param sf 1=64-bit, 0=32-bit + * @param op 0=ADD, 1=SUB + * @param S 1=set flags, 0=don't set flags + * @param Rm Second source register + * @param option Extend type (UXTB=000, UXTH=001, UXTW=010, UXTX=011, SXTB=100, SXTH=101, SXTW=110, SXTX=111) + * @param imm3 Shift amount (0-4) + * @param Rn First source register + * @param Rd Destination register + */ +void encode_add_sub_ext(code_ctx *ctx, int sf, int op, int S, Arm64Reg Rm, + int option, int imm3, Arm64Reg Rn, Arm64Reg Rd) { + unsigned int insn = BIT(sf, 31) | BITS(op, 30, 1) | BIT(S, 29) | BITS(0x0B, 24, 5) | + BITS(1, 21, 2) | BITS(Rm, 16, 5) | BITS(option, 13, 3) | + BITS(imm3, 10, 3) | BITS(Rn, 5, 5) | BITS(Rd, 0, 5); + EMIT32(ctx, insn); +} + +// ============================================================================ +// Logical Instructions +// ============================================================================ + +/** + * Encode Logical (immediate) instruction + * Format: AND/ORR/EOR/ANDS Xd, Xn, #imm + * + * @param sf 1=64-bit, 0=32-bit + * @param opc 00=AND, 01=ORR, 10=EOR, 11=ANDS + * @param N Immediate encoding parameter + * @param immr Immediate encoding parameter (rotation) + * @param imms Immediate encoding parameter (size) + * @param Rn Source register + * @param Rd 
Destination register + */ +void encode_logical_imm(code_ctx *ctx, int sf, int opc, int N, int immr, int imms, Arm64Reg Rn, Arm64Reg Rd) { + unsigned int insn = BIT(sf, 31) | BITS(opc, 29, 2) | BITS(0x24, 23, 6) | BIT(N, 22) | + BITS(immr, 16, 6) | BITS(imms, 10, 6) | BITS(Rn, 5, 5) | BITS(Rd, 0, 5); + EMIT32(ctx, insn); +} + +/** + * Encode Logical (shifted register) instruction + * Format: AND/ORR/EOR/ANDS Xd, Xn, Xm [, shift #amount] + * + * @param sf 1=64-bit, 0=32-bit + * @param opc 00=AND, 01=ORR, 10=EOR, 11=ANDS + * @param shift 00=LSL, 01=LSR, 10=ASR, 11=ROR + * @param N Must be 0 for regular logical ops + * @param Rm Second source register + * @param imm6 Shift amount + * @param Rn First source register + * @param Rd Destination register + */ +void encode_logical_reg(code_ctx *ctx, int sf, int opc, int shift, int N, Arm64Reg Rm, + int imm6, Arm64Reg Rn, Arm64Reg Rd) { + unsigned int insn = BIT(sf, 31) | BITS(opc, 29, 2) | BITS(0x0A, 24, 5) | BITS(shift, 22, 2) | + BIT(N, 21) | BITS(Rm, 16, 5) | BITS(imm6, 10, 6) | + BITS(Rn, 5, 5) | BITS(Rd, 0, 5); + EMIT32(ctx, insn); +} + +// ============================================================================ +// Move Wide (immediate) Instructions +// ============================================================================ + +/** + * Encode MOVZ/MOVN/MOVK instruction + * Format: MOVZ/MOVN/MOVK Xd, #imm16 [, LSL #shift] + * + * @param sf 1=64-bit, 0=32-bit + * @param opc 10=MOVZ, 00=MOVN, 11=MOVK + * @param hw Hardware position (0-3 for 64-bit, 0-1 for 32-bit) - selects 16-bit field + * @param imm16 16-bit immediate value + * @param Rd Destination register + */ +void encode_mov_wide_imm(code_ctx *ctx, int sf, int opc, int hw, int imm16, Arm64Reg Rd) { + unsigned int insn = BIT(sf, 31) | BITS(opc, 29, 2) | BITS(0x25, 23, 6) | + BITS(hw, 21, 2) | BITS(imm16, 5, 16) | BITS(Rd, 0, 5); + EMIT32(ctx, insn); +} + +// ============================================================================ +// Multiply 
Instructions +// ============================================================================ + +/** + * Encode MADD/MSUB instruction (multiply-add/subtract) + * Format: MADD Xd, Xn, Xm, Xa (Xd = Xa + Xn*Xm) + * MSUB Xd, Xn, Xm, Xa (Xd = Xa - Xn*Xm) + * + * @param sf 1=64-bit, 0=32-bit + * @param op 0=MADD, 1=MSUB + * @param Rm Second multiplicand + * @param Ra Addend/subtrahend (use XZR for simple MUL) + * @param Rn First multiplicand + * @param Rd Destination + */ +void encode_madd_msub(code_ctx *ctx, int sf, int op, Arm64Reg Rm, Arm64Reg Ra, Arm64Reg Rn, Arm64Reg Rd) { + // MADD/MSUB encoding: [31]=sf, [30:29]=00, [28:24]=11011, [23:21]=000, [20:16]=Rm + // [15]=op (0=MADD, 1=MSUB), [14:10]=Ra, [9:5]=Rn, [4:0]=Rd + unsigned int insn = BIT(sf, 31) | BITS(0xD8, 21, 8) | BITS(Rm, 16, 5) | + BIT(op, 15) | BITS(Ra, 10, 5) | BITS(Rn, 5, 5) | BITS(Rd, 0, 5); + EMIT32(ctx, insn); +} + +/** + * Encode SDIV/UDIV instruction + * Format: SDIV/UDIV Xd, Xn, Xm + * + * @param sf 1=64-bit, 0=32-bit + * @param U 0=UDIV (unsigned), 1=SDIV (signed) — this matches the + * ARM ARM bit-10 encoding: 0=UDIV, 1=SDIV. (Earlier comment + * had this inverted.) 
+ * @param Rm Divisor + * @param Rn Dividend + * @param Rd Destination (quotient) + */ +void encode_div(code_ctx *ctx, int sf, int U, Arm64Reg Rm, Arm64Reg Rn, Arm64Reg Rd) { + // SDIV/UDIV encoding: [31]=sf, [30:29]=00, [28:21]=11010110, [20:16]=Rm + // [15:11]=00001, [10]=U (1=SDIV, 0=UDIV), [9:5]=Rn, [4:0]=Rd + unsigned int insn = BIT(sf, 31) | BITS(0xD6, 21, 8) | BITS(Rm, 16, 5) | + BITS(0x2 | U, 10, 6) | BITS(Rn, 5, 5) | BITS(Rd, 0, 5); + EMIT32(ctx, insn); +} + +// ============================================================================ +// Shift Instructions +// ============================================================================ + +/** + * Encode variable shift (LSLV/LSRV/ASRV/RORV) + * Format: LSL/LSR/ASR/ROR Xd, Xn, Xm + * + * @param sf 1=64-bit, 0=32-bit + * @param op2 00=LSLV, 01=LSRV, 10=ASRV, 11=RORV + * @param Rm Shift amount register + * @param Rn Source register + * @param Rd Destination register + */ +void encode_shift_reg(code_ctx *ctx, int sf, int op2, Arm64Reg Rm, Arm64Reg Rn, Arm64Reg Rd) { + // LSLV/LSRV/ASRV/RORV encoding: [31]=sf, [30:29]=00, [28:21]=11010110, [20:16]=Rm + // [15:12]=0010, [11:10]=op2, [9:5]=Rn, [4:0]=Rd + unsigned int insn = BIT(sf, 31) | BITS(0xD6, 21, 8) | BITS(Rm, 16, 5) | + BITS(0x08 | op2, 10, 6) | BITS(Rn, 5, 5) | BITS(Rd, 0, 5); + EMIT32(ctx, insn); +} + +// ============================================================================ +// Load/Store Instructions +// ============================================================================ + +/** + * Encode LDR/STR (unsigned immediate offset) + * Format: LDR/STR Xt, [Xn, #imm] + * + * @param size 00=8-bit, 01=16-bit, 10=32-bit, 11=64-bit + * @param V 0=GPR, 1=FP/SIMD + * @param opc For V=0: 01=LDR, 00=STR, 10=LDRSW, 11=prfm + * @param imm12 Unsigned 12-bit offset (scaled by size) + * @param Rn Base register + * @param Rt Source/destination register + */ +void encode_ldr_str_imm(code_ctx *ctx, int size, int V, int opc, int imm12, Arm64Reg Rn, Arm64Reg 
Rt) { + // LDR/STR (unsigned offset) encoding: + // [31:30] = size, [29:27] = 111, [26] = V, [25:24] = 01, [23:22] = opc + // [21:10] = imm12, [9:5] = Rn, [4:0] = Rt + unsigned int insn = BITS(size, 30, 2) | // [31:30] = size + BITS(7, 27, 3) | // [29:27] = 111 + BIT(V, 26) | // [26] = V + BITS(1, 24, 2) | // [25:24] = 01 (unsigned offset) + BITS(opc, 22, 2) | // [23:22] = opc + BITS(imm12, 10, 12) | // [21:10] = imm12 + BITS(Rn, 5, 5) | // [9:5] = Rn + BITS(Rt, 0, 5); // [4:0] = Rt + EMIT32(ctx, insn); +} + +/** + * Encode LDR/STR (register offset) + * Format: LDR/STR Xt, [Xn, Xm{, extend {#amount}}] + * + * @param size 00=8-bit, 01=16-bit, 10=32-bit, 11=64-bit + * @param V 0=GPR, 1=FP/SIMD + * @param opc For V=0: 01=LDR, 00=STR + * @param Rm Offset register + * @param option Extend type (010=UXTW, 011=LSL, 110=SXTW, 111=SXTX) + * @param S 1=scale offset by size, 0=no scaling + * @param Rn Base register + * @param Rt Source/destination register + */ +void encode_ldr_str_reg(code_ctx *ctx, int size, int V, int opc, Arm64Reg Rm, + int option, int S, Arm64Reg Rn, Arm64Reg Rt) { + // LDR/STR (register offset) encoding: + // [31:30] = size, [29:27] = 111, [26] = V, [25:24] = 00, [23:22] = opc + // [21] = 1, [20:16] = Rm, [15:13] = option, [12] = S, [11:10] = 10 + // [9:5] = Rn, [4:0] = Rt + unsigned int insn = BITS(size, 30, 2) | // [31:30] = size + BITS(7, 27, 3) | // [29:27] = 111 + BIT(V, 26) | // [26] = V + BITS(0, 24, 2) | // [25:24] = 00 (register offset) + BITS(opc, 22, 2) | // [23:22] = opc + BIT(1, 21) | // [21] = 1 + BITS(Rm, 16, 5) | // [20:16] = Rm + BITS(option, 13, 3) | // [15:13] = option + BIT(S, 12) | // [12] = S + BITS(2, 10, 2) | // [11:10] = 10 + BITS(Rn, 5, 5) | // [9:5] = Rn + BITS(Rt, 0, 5); // [4:0] = Rt + EMIT32(ctx, insn); +} + +/** + * Encode LDUR/STUR (unscaled signed offset) + * Format: LDUR/STUR Rt, [Xn, #simm9] + * + * This instruction uses a signed 9-bit immediate offset (-256 to +255) that is + * NOT scaled by the access size. 
This is ideal for accessing stack locals at
+ * negative offsets from the frame pointer.
+ *
+ * @param size 00=8-bit, 01=16-bit, 10=32-bit, 11=64-bit
+ * @param V 0=GPR, 1=FP/SIMD
+ * @param opc 00=STUR, 01=LDUR
+ * @param imm9 Signed 9-bit offset (-256 to +255), unscaled
+ *             (only the low 9 bits are encoded; out-of-range values wrap)
+ * @param Rn Base register
+ * @param Rt Source/destination register
+ */
+void encode_ldur_stur(code_ctx *ctx, int size, int V, int opc, int imm9, Arm64Reg Rn, Arm64Reg Rt) {
+    // LDUR/STUR (unscaled offset) encoding:
+    // [31:30] = size, [29:27] = 111, [26] = V, [25:24] = 00, [23:22] = opc
+    // [21] = 0, [20:12] = imm9, [11:10] = 00, [9:5] = Rn, [4:0] = Rt
+    unsigned int insn = BITS(size, 30, 2) |         // [31:30] = size
+                        BITS(7, 27, 3) |            // [29:27] = 111
+                        BIT(V, 26) |                // [26] = V
+                        BITS(0, 24, 2) |            // [25:24] = 00 (unscaled offset)
+                        BITS(opc, 22, 2) |          // [23:22] = opc
+                        BIT(0, 21) |                // [21] = 0
+                        BITS(imm9 & 0x1FF, 12, 9) | // [20:12] = imm9 (masked to 9 bits)
+                        BITS(0, 10, 2) |            // [11:10] = 00
+                        BITS(Rn, 5, 5) |            // [9:5] = Rn
+                        BITS(Rt, 0, 5);             // [4:0] = Rt
+    EMIT32(ctx, insn);
+}
+
+/**
+ * Encode LDP/STP (Load/Store Pair)
+ * Format: LDP/STP Xt1, Xt2, [Xn, #imm] (various addressing modes)
+ *
+ * @param opc Size: 00=32-bit, 10=64-bit
+ *            NOTE(review): these values are the GPR (V=0) sizes. Per the
+ *            ARM ARM the SIMD&FP pair forms use 00=S, 01=D, 10=Q, so confirm
+ *            what V=1 callers pass here.
+ * @param V 0=GPR, 1=FP/SIMD registers
+ * @param mode Addressing mode + load/store selector. The low two bits are
+ *             the addressing mode (1=post-index, 2=signed offset,
+ *             3=pre-index); bit 4 (0x10) forces a store:
+ *             0x01 = post-indexed load   (LDP Xt1, Xt2, [Xn], #imm)
+ *             0x02 = signed-offset load  (LDP Xt1, Xt2, [Xn, #imm])
+ *             0x03 = pre-indexed store   (STP Xt1, Xt2, [Xn, #imm]!)  [legacy value]
+ *             0x11 = post-indexed store  (STP Xt1, Xt2, [Xn], #imm)
+ *             0x12 = signed-offset store (STP Xt1, Xt2, [Xn, #imm])
+ *             0x13 = pre-indexed store   (STP Xt1, Xt2, [Xn, #imm]!)  [same as 0x03]
+ *             A pre-indexed LOAD cannot be requested through this
+ *             parameter: 0x03 is reserved for the legacy store mapping and
+ *             every value with bit 4 set is a store.
+ * @param imm7 Signed 7-bit offset (scaled by register size: *4 for 32-bit, *8 for 64-bit).
+ *             The caller passes the already-scaled field value; only the
+ *             low 7 bits are encoded.
+ * @param Rt2 Second register
+ * @param Rn Base register
+ * @param Rt First register
+ *
+ * ARM64 encoding (as emitted below):
+ * [31:30] = opc (size)
+ * [29:27] = 101 (fixed)
+ * [26] = V
+ * [25] = 0 (never set by this encoder)
+ * [24:23] = addressing mode (01=post, 10=offset, 11=pre)
+ * [22] = L (0=store, 1=load)
+ * [21:15] = imm7
+ * [14:10] = Rt2
+ * [9:5] = Rn
+ * [4:0] = Rt
+ */
+void encode_ldp_stp(code_ctx *ctx, int opc, int V, int mode, int imm7,
+                    Arm64Reg Rt2, Arm64Reg Rn, Arm64Reg Rt) {
+    int addr_mode, L;
+
+    // Decode mode parameter to get addressing mode and load/store bit.
+    // Bit 4 (0x10) of mode forces store; otherwise the legacy mappings apply.
+    if (mode & 0x10) {
+        // 0x11 / 0x12 / 0x13: store with the addressing mode in the low bits.
+        addr_mode = mode & 3;
+        L = 0;
+    } else if (mode == 0x03) {
+        // Pre-indexed store: STP Xt1, Xt2, [Xn, #imm]!
+        // (legacy value: the only store selectable without bit 4)
+        addr_mode = 3;
+        L = 0;
+    } else if (mode == 0x01) {
+        // Post-indexed load: LDP Xt1, Xt2, [Xn], #imm
+        addr_mode = 1;
+        L = 1;
+    } else {
+        // Default: use mode as addressing mode, assume load
+        addr_mode = mode & 3;
+        L = 1;
+    }
+
+    // addr_mode is placed at bit 23 with a 2-bit width, so it occupies
+    // [24:23]; bit 25 is left clear.
+    unsigned int insn = BITS(opc, 30, 2) |       // [31:30] = opc
+                        BITS(5, 27, 3) |         // [29:27] = 101
+                        BIT(V, 26) |             // [26] = V
+                        BITS(addr_mode, 23, 2) | // [24:23] = addressing mode
+                        BIT(L, 22) |             // [22] = L
+                        BITS(imm7, 15, 7) |      // [21:15] = imm7
+                        BITS(Rt2, 10, 5) |       // [14:10] = Rt2
+                        BITS(Rn, 5, 5) |         // [9:5] = Rn
+                        BITS(Rt, 0, 5);          // [4:0] = Rt
+    EMIT32(ctx, insn);
+}
+
+// ============================================================================
+// PC-Relative Addressing
+// ============================================================================
+
+/**
+ * Encode ADRP instruction
+ * Format: ADRP Xd, label (load PC-relative page address)
+ *
+ * @param immlo Low 2 bits of 21-bit offset (bits 0-1)
+ * @param immhi High 19 bits of 21-bit offset (bits 2-20)
+ * @param Rd Destination register
+ *
+ * Note: offset is
in pages (4KB), so actual byte offset = imm21 << 12 + */ +void encode_adrp(code_ctx *ctx, int immlo, int immhi, Arm64Reg Rd) { + unsigned int insn = BITS(1, 31, 1) | BITS(immlo, 29, 2) | BITS(0x10, 24, 5) | + BITS(immhi, 5, 19) | BITS(Rd, 0, 5); + EMIT32(ctx, insn); +} + +/** + * Encode ADR instruction + * Format: ADR Xd, label (load PC-relative address) + * + * @param immlo Low 2 bits of 21-bit offset + * @param immhi High 19 bits of 21-bit offset + * @param Rd Destination register + */ +void encode_adr(code_ctx *ctx, int immlo, int immhi, Arm64Reg Rd) { + unsigned int insn = BITS(0, 31, 1) | BITS(immlo, 29, 2) | BITS(0x10, 24, 5) | + BITS(immhi, 5, 19) | BITS(Rd, 0, 5); + EMIT32(ctx, insn); +} + +// ============================================================================ +// Branch Instructions +// ============================================================================ + +/** + * Encode conditional branch + * Format: B.cond label + * + * @param imm19 Signed 19-bit offset (in instructions, i.e., offset/4) + * @param cond Condition code (0000=EQ, 0001=NE, 1010=GE, 1011=LT, etc.) 
+ */ +void encode_branch_cond(code_ctx *ctx, int imm19, ArmCondition cond) { + unsigned int insn = BITS(0x54, 24, 8) | BITS(imm19, 5, 19) | BITS(cond, 0, 4); + EMIT32(ctx, insn); +} + +/** + * Encode unconditional branch + * Format: B label + * + * @param imm26 Signed 26-bit offset (in instructions, i.e., offset/4) + */ +void encode_branch_uncond(code_ctx *ctx, int imm26) { + unsigned int insn = BITS(0x05, 26, 6) | BITS(imm26, 0, 26); + EMIT32(ctx, insn); +} + +/** + * Encode branch with link + * Format: BL label + * + * @param imm26 Signed 26-bit offset (in instructions) + */ +void encode_branch_link(code_ctx *ctx, int imm26) { + unsigned int insn = BITS(0x25, 26, 6) | BITS(imm26, 0, 26); + EMIT32(ctx, insn); +} + +/** + * Encode register branch instructions + * Format: BR/BLR/RET Xn + * + * @param opc 00=BR, 01=BLR, 10=RET + * @param Rn Register containing target address (X30/LR for RET) + */ +void encode_branch_reg(code_ctx *ctx, int opc, Arm64Reg Rn) { + unsigned int insn = BITS(0x6B0, 21, 11) | BITS(opc, 21, 2) | + BITS(0x1F, 16, 5) | BITS(Rn, 5, 5); + EMIT32(ctx, insn); +} + +/** + * Encode CBZ/CBNZ (compare and branch if zero/non-zero) + * Format: CBZ/CBNZ Xt, label + * + * @param sf 1=64-bit, 0=32-bit + * @param op 0=CBZ, 1=CBNZ + * @param imm19 Signed 19-bit offset (in instructions) + * @param Rt Register to test + */ +void encode_cbz_cbnz(code_ctx *ctx, int sf, int op, int imm19, Arm64Reg Rt) { + unsigned int insn = BIT(sf, 31) | BITS(0x1A, 25, 6) | BIT(op, 24) | + BITS(imm19, 5, 19) | BITS(Rt, 0, 5); + EMIT32(ctx, insn); +} + +/** + * Encode TBZ/TBNZ (test bit and branch if zero/non-zero) + * Format: TBZ/TBNZ Xt, #bit, label + * + * @param b5 Bit 5 of bit position (0-63) + * @param op 0=TBZ, 1=TBNZ + * @param b40 Bits 4-0 of bit position + * @param imm14 Signed 14-bit offset (in instructions) + * @param Rt Register to test + */ +void encode_tbz_tbnz(code_ctx *ctx, int b5, int op, int b40, int imm14, Arm64Reg Rt) { + unsigned int insn = BIT(b5, 31) | 
BITS(0x1B, 25, 6) | BIT(op, 24) |
+                        BITS(b40, 19, 5) | BITS(imm14, 5, 14) | BITS(Rt, 0, 5);
+    EMIT32(ctx, insn);
+}
+
+// ============================================================================
+// Floating-Point Instructions
+// ============================================================================
+
+/**
+ * Encode floating-point arithmetic (2-source)
+ * Format: FADD/FSUB/FMUL/FDIV/FMAX/FMIN Vd, Vn, Vm
+ *
+ * Precision is selected by `type`; `M` and `S` are copied verbatim into
+ * bits 31 and 29.
+ *
+ * @param M Bit 31. NOTE(review): the ARM ARM defines this bit as 0 for the
+ *          scalar FP data-processing class; it does not select a vector
+ *          form. Pass 0 unless confirmed otherwise.
+ * @param S Bit 29. NOTE(review): the ARM ARM defines this bit as 0 for this
+ *          class; it is not a single/double selector (that is `type`).
+ *          Pass 0 unless confirmed otherwise.
+ * @param type 00=single, 01=double
+ * @param Rm Second source register
+ * @param opcode 0000=FMUL, 0001=FDIV, 0010=FADD, 0011=FSUB, 0100=FMAX, 0101=FMIN
+ * @param Rn First source register
+ * @param Rd Destination register
+ */
+void encode_fp_arith(code_ctx *ctx, int M, int S, int type, Arm64FpReg Rm,
+                     int opcode, Arm64FpReg Rn, Arm64FpReg Rd) {
+    // [31]=M, [29]=S, [28:24]=11110, [23:22]=type, [21]=1, [20:16]=Rm,
+    // [15:12]=opcode, [11:10]=10, [9:5]=Rn, [4:0]=Rd
+    unsigned int insn = BIT(M, 31) | BIT(S, 29) | BITS(0x1E, 24, 5) |
+                        BITS(type, 22, 2) | BITS(1, 21, 1) | BITS(Rm, 16, 5) |
+                        BITS(opcode, 12, 4) | BITS(2, 10, 2) |
+                        BITS(Rn, 5, 5) | BITS(Rd, 0, 5);
+    EMIT32(ctx, insn);
+}
+
+/**
+ * Encode floating-point negate/abs/sqrt (1-source)
+ * Format: FNEG/FABS/FSQRT Vd, Vn
+ *
+ * Precision is selected by `type`; `M` and `S` are copied verbatim into
+ * bits 31 and 29.
+ *
+ * @param M Bit 31. NOTE(review): defined as 0 for this class in the ARM ARM;
+ *          pass 0 unless confirmed otherwise.
+ * @param S Bit 29. NOTE(review): defined as 0 for this class in the ARM ARM;
+ *          not a precision selector. Pass 0 unless confirmed otherwise.
+ * @param type 00=single, 01=double
+ * @param opcode 000000=FMOV, 000001=FABS, 000010=FNEG, 000011=FSQRT
+ * @param Rn Source register
+ * @param Rd Destination register
+ */
+void encode_fp_1src(code_ctx *ctx, int M, int S, int type, int opcode, Arm64FpReg Rn, Arm64FpReg Rd) {
+    // [31]=M, [29]=S, [28:24]=11110, [23:22]=type, [21]=1,
+    // [20:15]=opcode, [14:10]=10000, [9:5]=Rn, [4:0]=Rd
+    unsigned int insn = BIT(M, 31) | BIT(S, 29) | BITS(0x1E, 24, 5) |
+                        BITS(type, 22, 2) | BITS(1, 21, 1) |
+                        BITS(opcode, 15, 6) | BITS(0x10, 10, 5) |
+                        BITS(Rn, 5, 5) | BITS(Rd, 0, 5);
+    EMIT32(ctx, insn);
+}
+
+/**
+ * Encode floating-point compare
+ * Format: FCMP Vn, Vm
+ *
+ * Bits [4:0] of the emitted word are always zero. NOTE(review): in the
+ * ARM ARM that field (opcode2) is what distinguishes FCMP (00000) from
+ * FCMP-with-zero (01000) and FCMPE (10000), so as written this encoder can
+ * only produce the register-register FCMP form.
+ *
+ * @param M Bit 31; pass 0 (scalar)
+ * @param S Bit 29. NOTE(review): defined as 0 for this class in the ARM ARM;
+ *          not a precision selector (that is `type`).
+ * @param type 00=single, 01=double
+ * @param Rm Second source register. NOTE(review): passing 0 compares
+ *           against register V0; it does not select the compare-with-#0.0
+ *           form, which needs opcode2 bit 3.
+ * @param op Written to bits [15:14]. NOTE(review): the ARM ARM requires 00
+ *           here; this field does not select FCMPE. Pass 0.
+ * @param Rn First source register
+ */
+void encode_fp_compare(code_ctx *ctx, int M, int S, int type, Arm64FpReg Rm, int op, Arm64FpReg Rn) {
+    // [31]=M, [29]=S, [28:24]=11110, [23:22]=type, [21]=1, [20:16]=Rm,
+    // [15:14]=op, [13:10]=1000, [9:5]=Rn, [4:0]=00000
+    unsigned int insn = BIT(M, 31) | BIT(S, 29) | BITS(0x1E, 24, 5) |
+                        BITS(type, 22, 2) | BITS(1, 21, 1) | BITS(Rm, 16, 5) |
+                        BITS(op, 14, 2) | BITS(8, 10, 4) | BITS(Rn, 5, 5);
+    EMIT32(ctx, insn);
+}
+
+/**
+ * Encode floating-point conversion to integer
+ * Format: FCVTZS/FCVTZU Xd, Vn
+ *
+ * @param sf 1=64-bit int, 0=32-bit int
+ * @param S Bit 29. NOTE(review): defined as 0 for this class in the ARM ARM;
+ *          the FP precision is selected by `type`. Pass 0.
+ * @param type 00=single, 01=double, 10/11=half
+ *             (NOTE(review): the ARM ARM lists 11 as half precision; confirm 10)
+ * @param rmode 00=round to nearest, 01=round towards +inf, 10=round towards -inf, 11=round towards zero
+ * @param opc 000=FCVT*S (signed), 001=FCVT*U (unsigned) with the rounding
+ *            chosen by rmode (rmode=00 gives FCVTNS/FCVTNU, rmode=11 gives
+ *            FCVTZS/FCVTZU); 010=SCVTF, 011=UCVTF, 110=FMOV, 111=FMOV
+ * @param Rn Source FP register
+ * @param Rd Destination integer register
+ */
+void encode_fcvt_int(code_ctx *ctx, int sf, int S, int type, int rmode, int opc, Arm64FpReg Rn, Arm64Reg Rd) {
+    // [31]=sf, [29]=S, [28:24]=11110, [23:22]=type, [21]=1,
+    // [20:19]=rmode, [18:16]=opc, [15:10]=000000, [9:5]=Rn, [4:0]=Rd
+    unsigned int insn = BIT(sf, 31) | BIT(S, 29) | BITS(0x1E, 24, 5) |
+                        BITS(type, 22, 2) | BITS(1, 21, 1) |
+                        BITS(rmode, 19, 2) | BITS(opc, 16, 3) |
+                        BITS(Rn, 5, 5) | BITS(Rd, 0, 5);
+    EMIT32(ctx, insn);
+}
+
+/**
+ * Encode integer conversion to floating-point
+ * Format: SCVTF/UCVTF Vd, Xn
+ *
+ * Emits the same bit layout as encode_fcvt_int; only the register
+ * parameter types differ.
+ *
+ * @param sf 1=64-bit int, 0=32-bit int
+ * @param S Bit 29. NOTE(review): defined as 0 for this class in the ARM ARM;
+ *          the FP precision is selected by `type`. Pass 0.
+ * @param type 00=single, 01=double
+ * @param rmode 00 for conversions
+ * @param opc 010=SCVTF, 011=UCVTF
+ * @param Rn Source integer register
+ * @param Rd Destination FP register
+ */
+void encode_int_fcvt(code_ctx *ctx, int sf, int S, int type, int rmode, int opc, Arm64Reg Rn, Arm64FpReg Rd) {
+    unsigned int insn = BIT(sf, 31) | BIT(S, 29) | BITS(0x1E, 24, 5) |
+                        BITS(type, 22, 2) | BITS(1, 21, 1) |
+                        BITS(rmode, 19, 2) | BITS(opc, 16, 3) |
+                        BITS(Rn, 5, 5) | BITS(Rd, 0, 5);
+    EMIT32(ctx, insn);
+}
+
+//
============================================================================ +// Conditional Select +// ============================================================================ + +/** + * Encode CSEL/CSINC/CSINV/CSNEG + * Format: CSEL Xd, Xn, Xm, cond + * + * @param sf 1=64-bit, 0=32-bit + * @param op 0=CSEL, 1=CSINC/CSINV/CSNEG (depends on op2) + * @param Rm Second source register + * @param cond Condition code + * @param op2 00=CSEL, 01=CSINC, 10=CSINV, 11=CSNEG + * @param Rn First source register + * @param Rd Destination register + */ +void encode_cond_select(code_ctx *ctx, int sf, int op, Arm64Reg Rm, ArmCondition cond, + int op2, Arm64Reg Rn, Arm64Reg Rd) { + // CSEL/CSINC/CSINV/CSNEG encoding: [31]=sf, [30]=op, [29]=S=0, [28:21]=11010100 + // [20:16]=Rm, [15:12]=cond, [11:10]=op2, [9:5]=Rn, [4:0]=Rd + unsigned int insn = BIT(sf, 31) | BIT(op, 30) | BITS(0xD4, 21, 8) | + BITS(Rm, 16, 5) | BITS(cond, 12, 4) | + BITS(op2, 10, 2) | BITS(Rn, 5, 5) | BITS(Rd, 0, 5); + EMIT32(ctx, insn); +} + +// ============================================================================ +// High-Level Helper Functions +// ============================================================================ + +// ---------------------------------------------------------------------------- +// Logical Immediate Encoding Helpers +// ---------------------------------------------------------------------------- + +/** + * Rotate a 64-bit value right by the specified amount + */ +static inline uint64_t rotate_right_64(uint64_t val, int rotation) { + return (val >> (rotation & 63)) | (val << ((-rotation) & 63)); +} + +/** + * Check if a 64-bit value can be encoded as a logical immediate + * and compute the N, immr, imms fields if so. 
+ * + * Based on the optimized algorithm from dougallj: + * https://dougallj.wordpress.com/2021/10/30/ + * + * AArch64 logical immediates can represent bitmask patterns consisting of + * a single run of 1-bits, optionally rotated, and replicated across element + * sizes of 2, 4, 8, 16, 32, or 64 bits. + * + * @param val The 64-bit value to check + * @param N Output: N field (1 for 64-bit element, 0 otherwise) + * @param immr Output: rotation amount field (6 bits) + * @param imms Output: element size/ones encoding field (6 bits) + * @return true if value is encodable, false otherwise + */ +static bool is_logical_immediate_64(uint64_t val, int *N, int *immr, int *imms) { + // All-zeros and all-ones cannot be encoded + if (val == 0 || ~val == 0) + return false; + + // Find rotation to normalize the pattern + // val & (val + 1) clears trailing ones; ctz gives rotation amount + // Handle the case where val is all trailing ones (ctzll(0) is undefined) + uint64_t tmp = val & (val + 1); + int rotation = (tmp == 0) ? 0 : __builtin_ctzll(tmp); + uint64_t normalized = rotate_right_64(val, rotation); + + // Count leading zeros and trailing ones in normalized form + int zeroes = __builtin_clzll(normalized); + int ones = __builtin_ctzll(~normalized); + int size = zeroes + ones; + + // Validate: pattern must repeat when rotated by size + // This also implicitly checks that size is a power of 2 + if (rotate_right_64(val, size) != val) + return false; + + // Encode the fields + *immr = (-rotation) & (size - 1); + *imms = ((-(size << 1)) | (ones - 1)) & 0x3f; + *N = (size >> 6); + + return true; +} + +/** + * Check if a 32-bit value can be encoded as a logical immediate + * for 32-bit operations (where N must be 0). 
+ *
+ * @param val The 32-bit value to check
+ * @param N Output: N field (must be 0 for 32-bit)
+ * @param immr Output: rotation amount field
+ * @param imms Output: element size/ones encoding field
+ * @return true if value is encodable, false otherwise
+ */
+static bool is_logical_immediate_32(uint32_t val, int *N, int *immr, int *imms) {
+    // All-zeros and all-ones cannot be encoded
+    if (val == 0 || val == 0xFFFFFFFF)
+        return false;
+
+    // Replicate 32-bit pattern to 64-bit for encoding calculation
+    uint64_t val64 = ((uint64_t)val << 32) | val;
+
+    if (!is_logical_immediate_64(val64, N, immr, imms))
+        return false;
+
+    // For 32-bit operations, N must be 0 (element size <= 32)
+    if (*N != 0)
+        return false;
+
+    return true;
+}
+
+// ----------------------------------------------------------------------------
+
+/**
+ * Load an immediate value into a register.
+ *
+ * Picks the shortest of the following, in order:
+ *   1. MOV  dst, XZR                 (value is zero)
+ *   2. MOVN dst, #imm16              (value in [-65536, -1])
+ *   3. MOVZ dst, #imm16              (value in [1, 65535])
+ *   4. ORR  dst, XZR, #bitmask       (value is a valid logical immediate)
+ *   5. MOVZ or MOVN followed by MOVKs, one instruction per halfword that
+ *      differs from the fill pattern (0x0000 for MOVZ, 0xFFFF for MOVN).
+ *
+ * The number of instructions emitted therefore depends on the value (1 to 4
+ * for 64-bit, 1 to 2 for 32-bit); callers that need a fixed-size, patchable
+ * sequence must not use this function.
+ *
+ * For 32-bit loads only the low 32 bits of `val` are significant: any upper
+ * bits (zero- or sign-extension supplied by the caller) are ignored.
+ *
+ * @param val 64-bit immediate value
+ * @param dst Destination register
+ * @param is_64bit true=64-bit register, false=32-bit register
+ */
+void load_immediate(code_ctx *ctx, int64_t val, Arm64Reg dst, bool is_64bit) {
+    int sf = is_64bit ? 1 : 0;
+
+    // A W-register write only observes the low 32 bits. Canonicalise to the
+    // sign-extended low word so that 0xFFFFFFFF and -1 (and, likewise, a
+    // value whose low word is zero) take the same single-instruction paths
+    // below instead of falling through to a halfword loop that could emit
+    // nothing.
+    if (!is_64bit)
+        val = (int64_t)(int32_t)(uint32_t)val;
+
+    // Special case: zero
+    if (val == 0) {
+        // MOV Xd, XZR (using ORR with XZR)
+        encode_logical_reg(ctx, sf, 0x01, 0, 0, XZR, 0, XZR, dst);
+        return;
+    }
+
+    // Special case: values in [-65536, -1] fit in a single MOVN.
+    // MOVN Xd, #imm16 produces ~imm16, which equals -(imm16+1); this also
+    // covers all-ones (MOVN Xd, #0). For 32-bit mode the W form yields the
+    // same low 32 bits.
+    if (val < 0 && val >= -65536) {
+        // e.g., for val=-8: ~(-8) = 7, and MOVN Xd, #7 produces ~7 = -8
+        encode_mov_wide_imm(ctx, sf, 0x00, 0, (int)(~val & 0xFFFF), dst);
+        return;
+    }
+
+    // Special case: small positive values that fit in a single MOVZ instruction
+    if (val > 0 && val <= 65535) {
+        encode_mov_wide_imm(ctx, sf, 0x02, 0, (int)val, dst);
+        return;
+    }
+
+    // Try logical immediate encoding: ORR Xd, XZR, #imm
+    // This can load many bitmask patterns with a single instruction
+    {
+        int N, immr, imms;
+        bool can_encode = is_64bit
+            ? is_logical_immediate_64((uint64_t)val, &N, &immr, &imms)
+            : is_logical_immediate_32((uint32_t)val, &N, &immr, &imms);
+
+        if (can_encode) {
+            // ORR Xd, XZR, #imm (opc=0x01 for ORR)
+            encode_logical_imm(ctx, sf, 0x01, N, immr, imms, XZR, dst);
+            return;
+        }
+    }
+
+    // Classify the halfwords. MOVZ gives every 0x0000 halfword for free,
+    // MOVN gives every 0xFFFF halfword for free, so start from whichever
+    // fill pattern covers more halfwords.
+    uint64_t uval = (uint64_t)val;
+    int total_hw = is_64bit ? 4 : 2;
+    int zero_count = 0;
+    int ones_count = 0;
+    for (int i = 0; i < total_hw; i++) {
+        int hw_val = (int)((uval >> (i * 16)) & 0xFFFF);
+        if (hw_val == 0)
+            zero_count++;
+        else if (hw_val == 0xFFFF)
+            ones_count++;
+    }
+
+    bool use_movn = (ones_count > zero_count);
+    int fill_hw = use_movn ? 0xFFFF : 0x0000;   // halfword value needing no instruction
+    int first_opc = use_movn ? 0x00 : 0x02;     // MOVN : MOVZ
+
+    bool first = true;
+    for (int i = 0; i < total_hw; i++) {
+        int hw_val = (int)((uval >> (i * 16)) & 0xFFFF);
+        if (hw_val == fill_hw)
+            continue;
+        if (first) {
+            // MOVZ Xd, #hw_val, LSL #(i*16)
+            // or MOVN Xd, #(~hw_val & 0xFFFF), LSL #(i*16)
+            int imm16 = use_movn ? ((~hw_val) & 0xFFFF) : hw_val;
+            encode_mov_wide_imm(ctx, sf, first_opc, i, imm16, dst);
+            first = false;
+        } else {
+            // MOVK Xd, #hw_val, LSL #(i*16)
+            encode_mov_wide_imm(ctx, sf, 0x03, i, hw_val, dst);
+        }
+    }
+
+    // Every halfword matched the fill pattern. The special cases above make
+    // this unreachable, but never leave dst unwritten: MOVZ #0 yields zero
+    // and MOVN #0 yields all ones, which is exactly the fill value.
+    if (first)
+        encode_mov_wide_imm(ctx, sf, first_opc, 0, 0, dst);
+}
diff --git a/src/jit_aarch64_emit.h b/src/jit_aarch64_emit.h
new file mode 100644
index 000000000..0371af69c
--- /dev/null
+++ b/src/jit_aarch64_emit.h
@@ -0,0 +1,240 @@
+/*
+ * Copyright (C)2015-2026 Haxe
Foundation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ +#ifndef JIT_AARCH64_EMIT_H +#define JIT_AARCH64_EMIT_H + +#include +#include +#include +#include +#include "data_struct.h" + +// Per-TU instantiation of byte_arr (the code buffer type). +// Helpers are static-inline so two TUs may include this header without ODR conflict. +#define S_TYPE byte_arr +#define S_NAME(name) byte_##name +#define S_VALUE unsigned char +#include "data_struct.c" +#define byte_reserve(set,count) byte_reserve_impl(DEF_ALLOC,&set,count) + +// value_map: dedup uint64 constants in the literal pool (Phase 4+). +#define S_SORTED +#define S_MAP +#define S_TYPE value_map +#define S_NAME(name) value_map_##name +#define S_KEY uint64 +#define S_VALUE int +#define S_DEFVAL -1 +#include "data_struct.c" +#undef S_MAP +#undef S_SORTED + +// Backend codegen context (each backend defines its own _code_ctx layout). 
+// Phase 2: function shell + branch fixups + per-IR-op pos_map. +// Phase 4: constant pool, function-call relocations, jump-table absolutes. +struct _code_ctx { + jit_ctx *jit; + byte_arr code; + // Each pending branch is a triple (code_byte_pos, target_ir_op, is_cond) + // patched after the function's pos_map is finalized. + int_arr branch_fixups; + int *pos_map; + int cur_op; + bool flushed; + // Phase 4: cross-function call relocations (BL imm26 or ADRP+ADD). + // Triples (code_byte_pos, fid, kind) where kind=0:BL, kind=1:ADRP+ADD pair. + int_arr funs; + // Constant pool. Each constant ref is (adrp_pos, const_offset); patched in + // hl_codegen_flush_consts to ADRP imm21 + LDR/ADD imm12 split. + value_map const_table_lookup; + byte_arr const_table; + int_arr const_refs; + // Jump-table absolute fills: pairs (table_offs, target_byte_pos_in_output). + // In hl_codegen_final each entry becomes `final_code + target` written into + // `final_code + const_table_pos + table_offs`. + int_arr const_addr; + int const_table_pos; + // Direct-call shortcuts for null-access stubs (BL within ±128 MB). + int null_access_pos; + int null_field_pos; +}; + +// Write a 32-bit instruction to ctx->code. Caller is responsible for byte_reserve. 
+#define EMIT32(ctx, val) do { \ + *(unsigned int*)&(ctx)->code.values[(ctx)->code.cur] = (unsigned int)(val); \ + (ctx)->code.cur += 4; \ +} while(0) + +/* + * AArch64 Register Definitions + */ + +// General Purpose Registers (64-bit: X0-X30, 32-bit: W0-W30) +typedef enum { + X0 = 0, X1 = 1, X2 = 2, X3 = 3, + X4 = 4, X5 = 5, X6 = 6, X7 = 7, + X8 = 8, X9 = 9, X10 = 10, X11 = 11, + X12 = 12, X13 = 13, X14 = 14, X15 = 15, + X16 = 16, X17 = 17, X18 = 18, X19 = 19, + X20 = 20, X21 = 21, X22 = 22, X23 = 23, + X24 = 24, X25 = 25, X26 = 26, X27 = 27, + X28 = 28, X29 = 29, X30 = 30, + + // Special register names + FP = 29, // Frame Pointer (X29) + LR = 30, // Link Register (X30) + SP_REG = 31, // Stack Pointer (encoding value, context-dependent) + XZR = 31 // Zero Register (encoding value, context-dependent) +} Arm64Reg; + +// 32-bit register names (W registers) +typedef enum { + W0 = 0, W1 = 1, W2 = 2, W3 = 3, + W4 = 4, W5 = 5, W6 = 6, W7 = 7, + W8 = 8, W9 = 9, W10 = 10, W11 = 11, + W12 = 12, W13 = 13, W14 = 14, W15 = 15, + W16 = 16, W17 = 17, W18 = 18, W19 = 19, + W20 = 20, W21 = 21, W22 = 22, W23 = 23, + W24 = 24, W25 = 25, W26 = 26, W27 = 27, + W28 = 28, W29 = 29, W30 = 30, + WZR = 31 // 32-bit zero register +} Arm64Reg32; + +// Floating-Point/SIMD Registers +typedef enum { + V0 = 0, V1 = 1, V2 = 2, V3 = 3, + V4 = 4, V5 = 5, V6 = 6, V7 = 7, + V8 = 8, V9 = 9, V10 = 10, V11 = 11, + V12 = 12, V13 = 13, V14 = 14, V15 = 15, + V16 = 16, V17 = 17, V18 = 18, V19 = 19, + V20 = 20, V21 = 21, V22 = 22, V23 = 23, + V24 = 24, V25 = 25, V26 = 26, V27 = 27, + V28 = 28, V29 = 29, V30 = 30, V31 = 31 +} Arm64FpReg; + +// Aliases for specific precision +// D0-D31 = 64-bit (double precision) - same encoding as V0-V31 +// S0-S31 = 32-bit (single precision) - same encoding as V0-V31 +// H0-H31 = 16-bit (half precision) - same encoding as V0-V31 + +/* + * Condition Codes for Conditional Branches and Selects + */ +typedef enum { + COND_EQ = 0x0, // Equal (Z == 1) + COND_NE = 0x1, // Not equal 
(Z == 0) + COND_CS = 0x2, // Carry set (C == 1), also HS (unsigned higher or same) + COND_CC = 0x3, // Carry clear (C == 0), also LO (unsigned lower) + COND_MI = 0x4, // Minus/negative (N == 1) + COND_PL = 0x5, // Plus/positive or zero (N == 0) + COND_VS = 0x6, // Overflow set (V == 1) + COND_VC = 0x7, // Overflow clear (V == 0) + COND_HI = 0x8, // Unsigned higher (C == 1 && Z == 0) + COND_LS = 0x9, // Unsigned lower or same (C == 0 || Z == 1) + COND_GE = 0xA, // Signed greater than or equal (N == V) + COND_LT = 0xB, // Signed less than (N != V) + COND_GT = 0xC, // Signed greater than (Z == 0 && N == V) + COND_LE = 0xD, // Signed less than or equal (Z == 1 || N != V) + COND_AL = 0xE, // Always (unconditional) + COND_NV = 0xF // Never (reserved, don't use) +} ArmCondition; + +// Aliases +#define COND_HS COND_CS // Unsigned higher or same +#define COND_LO COND_CC // Unsigned lower + +/* + * Extend/Shift Types + */ +typedef enum { + EXTEND_UXTB = 0, // Unsigned extend byte + EXTEND_UXTH = 1, // Unsigned extend halfword + EXTEND_UXTW = 2, // Unsigned extend word + EXTEND_UXTX = 3, // Unsigned extend doubleword (64-bit, same as LSL) + EXTEND_SXTB = 4, // Signed extend byte + EXTEND_SXTH = 5, // Signed extend halfword + EXTEND_SXTW = 6, // Signed extend word + EXTEND_SXTX = 7 // Signed extend doubleword +} ArmExtend; + +typedef enum { + SHIFT_LSL = 0, // Logical shift left + SHIFT_LSR = 1, // Logical shift right + SHIFT_ASR = 2, // Arithmetic shift right + SHIFT_ROR = 3 // Rotate right +} ArmShift; + +/* + * Function Declarations + */ + +// ADD/SUB instructions +void encode_add_sub_imm(code_ctx *ctx, int sf, int op, int S, int shift, int imm12, Arm64Reg Rn, Arm64Reg Rd); +void encode_add_sub_reg(code_ctx *ctx, int sf, int op, int S, int shift, Arm64Reg Rm, int imm6, Arm64Reg Rn, Arm64Reg Rd); +void encode_add_sub_ext(code_ctx *ctx, int sf, int op, int S, Arm64Reg Rm, int option, int imm3, Arm64Reg Rn, Arm64Reg Rd); + +// Logical instructions +void 
encode_logical_imm(code_ctx *ctx, int sf, int opc, int N, int immr, int imms, Arm64Reg Rn, Arm64Reg Rd); +void encode_logical_reg(code_ctx *ctx, int sf, int opc, int shift, int N, Arm64Reg Rm, int imm6, Arm64Reg Rn, Arm64Reg Rd); + +// Move wide immediate +void encode_mov_wide_imm(code_ctx *ctx, int sf, int opc, int hw, int imm16, Arm64Reg Rd); + +// Multiply/divide +void encode_madd_msub(code_ctx *ctx, int sf, int op, Arm64Reg Rm, Arm64Reg Ra, Arm64Reg Rn, Arm64Reg Rd); +void encode_div(code_ctx *ctx, int sf, int U, Arm64Reg Rm, Arm64Reg Rn, Arm64Reg Rd); + +// Shift instructions +void encode_shift_reg(code_ctx *ctx, int sf, int op2, Arm64Reg Rm, Arm64Reg Rn, Arm64Reg Rd); + +// Load/store instructions +void encode_ldr_str_imm(code_ctx *ctx, int size, int V, int opc, int imm12, Arm64Reg Rn, Arm64Reg Rt); +void encode_ldr_str_reg(code_ctx *ctx, int size, int V, int opc, Arm64Reg Rm, int option, int S, Arm64Reg Rn, Arm64Reg Rt); +void encode_ldur_stur(code_ctx *ctx, int size, int V, int opc, int imm9, Arm64Reg Rn, Arm64Reg Rt); +void encode_ldp_stp(code_ctx *ctx, int opc, int V, int mode, int imm7, Arm64Reg Rt2, Arm64Reg Rn, Arm64Reg Rt); + +// PC-relative addressing +void encode_adrp(code_ctx *ctx, int immlo, int immhi, Arm64Reg Rd); +void encode_adr(code_ctx *ctx, int immlo, int immhi, Arm64Reg Rd); + +// Branch instructions +void encode_branch_cond(code_ctx *ctx, int imm19, ArmCondition cond); +void encode_branch_uncond(code_ctx *ctx, int imm26); +void encode_branch_link(code_ctx *ctx, int imm26); +void encode_branch_reg(code_ctx *ctx, int opc, Arm64Reg Rn); +void encode_cbz_cbnz(code_ctx *ctx, int sf, int op, int imm19, Arm64Reg Rt); +void encode_tbz_tbnz(code_ctx *ctx, int b5, int op, int b40, int imm14, Arm64Reg Rt); + +// Floating-point instructions +void encode_fp_arith(code_ctx *ctx, int M, int S, int type, Arm64FpReg Rm, int opcode, Arm64FpReg Rn, Arm64FpReg Rd); +void encode_fp_1src(code_ctx *ctx, int M, int S, int type, int opcode, Arm64FpReg Rn, 
Arm64FpReg Rd); +void encode_fp_compare(code_ctx *ctx, int M, int S, int type, Arm64FpReg Rm, int op, Arm64FpReg Rn); +void encode_fcvt_int(code_ctx *ctx, int sf, int S, int type, int rmode, int opc, Arm64FpReg Rn, Arm64Reg Rd); +void encode_int_fcvt(code_ctx *ctx, int sf, int S, int type, int rmode, int opc, Arm64Reg Rn, Arm64FpReg Rd); + +// Conditional select +void encode_cond_select(code_ctx *ctx, int sf, int op, Arm64Reg Rm, ArmCondition cond, int op2, Arm64Reg Rn, Arm64Reg Rd); + +// High-level helpers +void load_immediate(code_ctx *ctx, int64_t val, Arm64Reg dst, bool is_64bit); + +#endif // JIT_AARCH64_EMIT_H diff --git a/src/jit_dump.c b/src/jit_dump.c new file mode 100644 index 000000000..c1b16a073 --- /dev/null +++ b/src/jit_dump.c @@ -0,0 +1,584 @@ +/* + * Copyright (C)2015-2016 Haxe Foundation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */
+#include
+
+// Printable names of the emitter ops, indexed by einstr op (used by dump_instr).
+static const char *op_names[] = {
+	"load-addr",
+	"load-const",
+	"load-arg",
+	"load-fun",
+	"store",
+	"lea",
+	"test",
+	"cmp",
+	"jcond",
+	"jump",
+	"jump-table",
+	"binop",
+	"unop",
+	"conv",
+	"conv-unsigned",
+	"ret",
+	"call",
+	"call",
+	"call",
+	"mov",
+	"cmov",
+	"xchg",
+	"cxhg",
+	"push-const",
+	"push",
+	"pop",
+	"alloc-stack",
+	"prefetch",
+	"debug-break",
+	"block",
+	"enter",
+	"stack",
+	"catch",
+	"address",
+	"nop"
+};
+
+bool hl_jit_dump_bin = false;
+
+const char *hl_natreg_str( int reg, emit_mode m );
+
+// Formats an emitter register reference for the dump output.
+//   v : the reference to print; its kind (REG_KIND) selects the notation:
+//       V<n> value, P<n> phi, plain number for a constant, native register name
+//       with an optional +/- hex offset, or the same inside [] for a memory
+//       operand ("ST" stands for STACK_REG).
+//   m : mode forwarded to hl_natreg_str for R_REG (R_REG_PTR always uses M_PTR).
+// Returns a pointer into one of four static buffers used in rotation, so at most
+// four results can be alive at once (e.g. as arguments of a single printf).
+// Output is truncated rather than overflowing when it does not fit a buffer.
+const char *hl_emit_regstr( ereg v, emit_mode m ) {
+	// 32 bytes per slot: "[%s-%Xh]" alone needs 4 + 8 hex digits + the register
+	// name, which did not fit the previous 10-byte slots
+	static char fmts[4][32];
+	static int flip = 0;
+	const int size = (int)sizeof(fmts[0]);
+	// allow up to four concurrent val_str
+	char *fmt = fmts[flip];
+	flip = (flip + 1) & 3;
+	if( IS_NULL(v) ) {
+		snprintf(fmt,size,"NULL???");
+		return fmt;
+	}
+	int val = REG_VALUE(v);
+	switch( REG_KIND(v) ) {
+	case R_VALUE:
+		snprintf(fmt,size,"V%d",v);
+		break;
+	case R_PHI:
+		snprintf(fmt,size,"P%d",-v);
+		break;
+	case R_CONST:
+		snprintf(fmt,size,"%d",val);
+		break;
+	case R_REG:
+		if( val == 0 )
+			snprintf(fmt,size,"%s",hl_natreg_str(v,m));
+		else if( val > 0 )
+			snprintf(fmt,size,"%s+%Xh",hl_natreg_str(v,m),val);
+		else
+			snprintf(fmt,size,"%s-%Xh",hl_natreg_str(v,m),-val);
+		break;
+	case R_REG_PTR:
+		if( val == 0 )
+			snprintf(fmt,size,"[%s]",REG_REG(v) == STACK_REG ? "ST" : hl_natreg_str(v,M_PTR));
+		else if( val > 0 )
+			snprintf(fmt,size,"[%s+%Xh]",REG_REG(v) == STACK_REG ? "ST" : hl_natreg_str(v,M_PTR),val);
+		else
+			snprintf(fmt,size,"[%s-%Xh]",REG_REG(v) == STACK_REG ? "ST" : hl_natreg_str(v,M_PTR),-val);
+		break;
+	default:
+		jit_assert();
+		break;
+	}
+	return fmt;
+}
+
+// Prints one bytecode operand preceded by `sep`, according to its format code:
+// 0 prints nothing, 1/2 a register (flagged with '?' when outside fun->nregs),
+// 3 a plain integer, 4 a bracketed integer, 5/6 a jump target relative to the
+// opcode following `pos`. Unknown codes are printed as "?#<code>".
+static void hl_dump_arg( hl_function *fun, int fmt, int val, char sep, int pos ) {
+	if( fmt == 0 ) return;
+	printf("%c", sep);
+	switch( fmt ) {
+	case 1:
+	case 2:
+		printf("R%d", val);
+		if( val < 0 || val >= fun->nregs ) printf("?");
+		break;
+	case 3:
+		printf("%d", val);
+		break;
+	case 4:
+		printf("[%d]", val);
+		break;
+	case 5:
+	case 6:
+		printf("@%X", val + pos + 1);
+		break;
+	default:
+		printf("?#%d", fmt);
+		break;
+	}
+}
+
+// hl_op_fmt packs the three operand format codes of each opcode, one per byte
+#define OP(_,_a,_b,_c) ((_a) | (((_b)&0xFF) << 8) | (((_c)&0xFF) << 16)),
+#define OP_BEGIN static int hl_op_fmt[] = {
+#define OP_END };
+#undef R
+#include "opcodes.h"
+
+// Prints a bytecode opcode (its name without the first character) followed by
+// its operands, decoded with the hl_op_fmt table.
+static void hl_dump_op( hl_function *fun, hl_opcode *op ) {
+	printf("%s", hl_op_name(op->op) + 1);
+	int fmt = hl_op_fmt[op->op];
+	int pos = (int)(op - fun->ops);
+	hl_dump_arg(fun, fmt & 0xFF, op->p1, ' ', pos);
+	if( ((fmt >> 8) & 0xFF) == 5 ) {
+		int count = (fmt >> 16) & 0xFF;
+		printf(" [");
+		if( count == 4 ) {
+			printf("%d", op->p2);
+			printf(",%d", op->p3);
+			printf(",%d", (int)(int_val)op->extra);
+		} else if( op->op == OSwitch ) {
+			for(int i=0;ip2;i++) {
+				if( i != 0 ) printf(",");
+				printf("@%X", (op->extra[i] + pos + 1));
+			}
+			printf(",def=@%X", op->p3 + pos + 1);
+		} else {
+			if( count == 0xFF )
+				count = op->p3;
+			else {
+				printf("%d,%d,",op->p2,op->p3);
+				count -= 3;
+			}
+			for(int i=0;iextra[i]);
+			}
+		}
+		printf("]");
+	} else {
+		hl_dump_arg(fun, (fmt >> 8) & 0xFF, op->p2,',', pos);
+		hl_dump_arg(fun, fmt >> 16, op->p3,',', pos);
+	}
+}
+
+// Returns the suffix printed after an op name for the given emit mode ("" for
+// M_PTR). Unknown modes are formatted as "?<n>" into a static buffer, so that
+// particular result is only valid until the next call hitting the default case.
+static const char *emit_mode_str( emit_mode mode ) {
+	// declared at function scope: before C23 a label (default:) cannot be
+	// immediately followed by a declaration
+	static char buf[50];
+	switch( mode ) {
+	case M_UI8: return "-ui8";
+	case M_UI16: return "-ui16";
+	case M_I32: return "-i32";
+	case M_F32: return "-f32";
+	case M_F64: return "-f64";
+	case M_PTR: return "";
+	case M_VOID: return "-void";
+	case M_NORET: return "-noret";
+	default:
+		sprintf(buf,"?%d",mode);
+		return buf;
+	}
+}
+
+static
void dump_value( jit_ctx *ctx, uint64 value, emit_mode mode ) { + union { + uint64 v; + double d; + float f; + } tmp; + hl_module *mod = ctx->mod; + hl_code *code = ctx->mod->code; + switch( mode ) { + case M_NONE: + printf("?0x%llX",value); + break; + case M_UI8: + case M_UI16: + case M_I32: + if( (int)value >= -0x10000 && (int)value <= 0x10000 ) + printf("%d",(int)value); + else + printf("0x%X",(int)value); + break; + case M_F32: + tmp.v = value; + printf("%f",tmp.f); + break; + case M_F64: + tmp.v = value; + printf("%g",tmp.d); + break; + default: + if( value == 0 ) + printf("NULL"); + else if( mode == M_PTR && value >= (uint64)code->types && value < (uint64)(code->types + code->ntypes) ) + uprintf(USTR("<%s>"),hl_type_str((hl_type*)value)); + else if( mode == M_PTR && value == (uint64)mod->globals_data ) + printf(""); + else if( value == (uint64)&hlt_void ) + printf(""); + else + printf("0x%llX",value); + break; + } +} + +static void hl_dump_fun_name( hl_function *f ) { + if( f->obj ) { + uprintf(USTR("%s."),f->obj->name); + uprintf(USTR("%s"),f->field.name); + } + else if( f->field.ref ) { + uprintf(USTR("%s."),f->field.ref->obj->name); + uprintf(USTR("~%s"),f->field.ref->field.name); + printf(".%d",f->ref); + } + printf("[%X]", f->findex); +} + +static void hl_dump_args( jit_ctx *ctx, einstr *e ) { + if( e->nargs == 0xFF ) + return; + ereg *v = hl_emit_get_args(ctx->emit, e); + printf("("); + for(int i=0;inargs;i++) { + if( i != 0 ) printf(","); + printf("%s", val_str(v[i],M_NONE)); + } + printf(")"); +} + +typedef struct { const char *name; void *ptr; } named_ptr; +static void hl_dump_ptr_name( jit_ctx *ctx, void *ptr ) { +# define N(v) ptr_names[i].name = #v; ptr_names[i].ptr = v; i++ +# define N2(n,v) ptr_names[i].name = n; ptr_names[i].ptr = v; i++ +# define DYN(p) N2("dyn_get" #p, hl_dyn_get##p); N2("dyn_set" #p, hl_dyn_set##p); N2("dyn_cast" #p, hl_dyn_cast##p) + static named_ptr ptr_names[256] = { NULL }; + int i = 0; + if( !ptr_names[0].ptr ) { + 
N(hl_alloc_dynbool); + N(hl_alloc_dynamic); + N(hl_alloc_obj); + N(hl_alloc_dynobj); + N(hl_alloc_virtual); + N(hl_alloc_closure_ptr); + N(hl_dyn_call); + N(hl_dyn_call_obj); + N(hl_throw); + N(hl_rethrow); + N(hl_to_virtual); + N(hl_alloc_enum); + N(hl_dyn_compare); + N(hl_same_type); + DYN(f); + DYN(d); + DYN(i64); + DYN(i); + DYN(p); + N2("null_field",hl_jit_null_field_access); + N2("null_access",hl_null_access); + N(hl_get_thread); + N(setjmp); + N(_setjmp); + N2("assert",hl_jit_assert); + N(fmod); + N(fmodf); + i = 0; + } +# undef N +# undef N2 + while( true ) { + named_ptr p = ptr_names[i++]; + if( !p.ptr ) break; + if( p.ptr == ptr ) { + printf("<%s>",p.name); + return; + } + } + for(i=0;imod->code->nnatives;i++) { + hl_native *n = ctx->mod->code->natives + i; + if( ctx->mod->functions_ptrs[n->findex] == ptr ) { + printf("<%s.%s>",n->lib[0] == '?' ? n->lib + 1 : n->lib,n->name); + return; + } + } + printf("",(uint64)ptr); +} + +void hl_emit_flush( jit_ctx *ctx ); +void hl_regs_flush( jit_ctx *ctx ); +void hl_codegen_flush( jit_ctx *ctx ); + +#define reg_str(r) val_str(r,e->mode) + +static void dump_instr( jit_ctx *ctx, einstr *e, int cur_pos ) { + printf("%s", op_names[e->op]); + bool show_size = true; + switch( e->op ) { + case TEST: + case CMP: + printf("-%s", hl_op_name(e->size_offs)+2); + show_size = false; + break; + case BINOP: + case UNOP: + printf("-%s", hl_op_name(e->size_offs)+1); + show_size = false; + break; + default: + break; + } + if( e->mode ) + printf("%s", emit_mode_str(e->mode)); + switch( e->op ) { + case CALL_FUN: + printf(" "); + { + int fid = ctx->mod->functions_indexes[e->a]; + hl_code *code = ctx->mod->code; + if( fid < code->nfunctions ) { + hl_dump_fun_name(&code->functions[fid]); + } else { + printf("???"); + } + } + hl_dump_args(ctx,e); + break; + case CALL_REG: + printf(" %s", val_str(e->a,M_PTR)); + hl_dump_args(ctx,e); + break; + case CALL_PTR: + printf(" "); + hl_dump_ptr_name(ctx, (void*)e->value); + hl_dump_args(ctx,e); + 
break; + case JUMP: + case JCOND: + printf(" @%X", cur_pos + 1 + e->size_offs); + break; + case JUMP_TABLE: + { + int *offsets = hl_emit_get_args(ctx->emit, e); + printf(" %s (", reg_str(e->a)); + for(int k=0;knargs;k++) { + if( k > 0 ) printf(","); + printf("@%X", cur_pos + 1 + offsets[k]); + } + printf(")"); + } + break; + case BLOCK: + printf(" #%d", e->size_offs); + if( e->size_offs && ctx->blocks[e->size_offs].pred_count == 0 ) + printf(" ???DEAD"); + break; + case STACK_OFFS: + if( e->size_offs >= 0 ) + printf(" +%Xh", e->size_offs); + else + printf(" -%Xh", -e->size_offs); + break; + case LOAD_CONST: + case PUSH_CONST: + printf(" "); + dump_value(ctx, e->value, e->mode); + break; + case LOAD_ADDR: + if( e->nargs != e->mode ) { + if( e->mode == M_PTR ) printf("-ptr"); + printf("%s", e->nargs == M_PTR ? "-ptr" : emit_mode_str(e->nargs)); + } + printf(" %s[%Xh]", val_str(e->a,M_PTR), e->size_offs); + break; + case STORE: + { + int offs = e->size_offs; + if( offs == 0 ) + printf(" [%s]", val_str(e->a,M_PTR)); + else + printf(" %s[%Xh]", val_str(e->a,M_PTR), offs); + printf(" = %s", reg_str(e->b)); + } + break; + case CONV: + case CONV_UNSIGNED: + if( e->mode == M_PTR ) printf("-i64"); + printf("%s %s", e->size_offs == M_PTR ? 
"-i64" : emit_mode_str(e->size_offs), val_str(e->a,(emit_mode)e->size_offs)); + break; + case LEA: + printf(" [%s", reg_str(e->a)); + if( !IS_NULL(e->b) ) printf("+%s", reg_str(e->b)); + if( (e->size_offs&0xFF) > 1 ) printf("*%d",e->size_offs&0xFF); + if( e->size_offs >> 8 ) printf("+%Xh", e->size_offs>>8); + printf("]"); + break; + default: + if( !IS_NULL(e->a) ) { + printf(" %s", reg_str(e->a)); + if( !IS_NULL(e->b) ) printf(", %s", reg_str(e->b)); + } + if( show_size && e->size_offs != 0 ) + printf(" %d", e->size_offs); + break; + } +} + +void hl_emit_dump( jit_ctx *ctx ) { + hl_function *f = ctx->fun; + int nargs = f->type->fun->nargs; + // if it not was not before (in case of dump during process) + hl_emit_flush(ctx); + hl_regs_flush(ctx); + hl_codegen_flush(ctx); + printf("function "); + hl_dump_fun_name(f); + printf("("); + for(int i=0;i 0 ) printf(","); + printf("R%d", i); + } + printf(")\n"); + for(int i=0;inregs;i++) { + printf("\tR%d : ",i); + uprintf(USTR("%s\n"), hl_type_str(f->regs[i])); + } + // check blocks intervals + int cur = 0; + for(int i=0;iblock_count;i++) { + eblock *b = ctx->blocks + i; + if( b->start_pos != cur ) printf(" ??? BLOCK %d START AT %X != %X\n", i, b->start_pos, cur); + if( b->end_pos < b->start_pos ) printf(" ??? BLOCK %d RANGE [%X,%X]\n", i, b->start_pos, b->end_pos); + cur = b->end_pos; + } + if( cur != ctx->instr_count ) + printf(" ??? 
MISSING BLOCK FOR RANGE %X-%X\n", cur, ctx->instr_count); + // print instrs + int vpos = 1; + int rpos = 0; + int cpos = 0; + int cur_op = 0; + bool new_op = false; + eblock *cur_block = NULL; + for(int icount=0;icountinstr_count;icount++) { + while( ctx->emit_pos_map[cur_op] == icount ) { + printf("@%X ", cur_op); + hl_dump_op(ctx->fun, f->ops + cur_op); + printf("\n"); + new_op = true; + cur_op++; + } + einstr *e = ctx->instrs + icount; + printf("\t\t@%X ", icount); + if( vpos < ctx->value_count && ctx->values_writes[vpos] == icount ) + printf("V%d = ", vpos++); + dump_instr(ctx, e, icount); + if( e->op == JCOND || e->op == JUMP ) { + int target = icount + 1 + e->size_offs; + bool bad = false; + if( icount + 1 >= ctx->instr_count || target < 0 || target >= ctx->instr_count ) + bad = true; + else if( ctx->instrs[target].op != BLOCK || (e->op == JCOND && ctx->instrs[icount+1].op != BLOCK) ) + bad = true; + else { + bool found = false; + for(int k=0;knext_count;k++) { + if( cur_block->nexts[k] == ctx->instrs[target].size_offs ) + found = true; + if( (e->op == JUMP || e->op == JUMP_TABLE) && ctx->instrs[icount+1].op == BLOCK && ctx->instrs[icount+1].size_offs == cur_block->nexts[k] ) + printf(" ???LEAK"); + } + if( !found ) printf(" ???NEXT"); + } + if( bad ) + printf(" ???"); + } + if( e->op == BLOCK ) { + eblock *b = &ctx->blocks[e->size_offs]; + for(int k=0;kpred_count;k++) { + eblock *p = &ctx->blocks[b->preds[k]]; + einstr *pe = &ctx->instrs[p->end_pos-1]; + if( p->end_pos == icount ) + continue; + bool bad = false; + if( (pe->op == JUMP || pe->op == JCOND) && pe->size_offs == icount - p->end_pos ) + bad = false; + else if( pe->op != JUMP_TABLE ) + bad = true; + if( bad ) + printf(" ???PREV#%d",b->preds[k]); + } + for(int k=0;kphi_count;k++) { + ephi *p = b->phis + k; + printf("\n\t\t@%X %s = phi%s(",icount,val_str(p->value,p->mode),emit_mode_str(p->mode)); + for(int n=0;nnvalues;n++) { + if( n > 0 ) printf(","); + 
printf("%s:%d",val_str(p->values[n],p->mode),p->blocks[n]);
+				}
+				if( p->nvalues == 0 )
+					printf("unwritten");
+				printf(")");
+				if( p->nvalues == 1 )
+					printf(" ???");
+			}
+			cur_block = b;
+		}
+		while( rpos < ctx->reg_instr_count && rpos < ctx->reg_pos_map[icount+1] ) {
+			ereg out = ctx->reg_writes[rpos];
+			e = ctx->reg_instrs + rpos;
+			printf("\n\t\t\t\t@%X ",rpos);
+			if( !IS_NULL(out) ) printf("%s = ",reg_str(out));
+			dump_instr(ctx,e,rpos);
+			bool first = true;
+			while( cpos < ctx->code_size && cpos < ctx->code_pos_map[rpos+1] ) {
+				if( first ) {
+					if( hl_jit_dump_bin )
+						printf("\t\t\t");
+					else
+						printf("\033[80G");
+					first = false;
+					if( new_op ) {
+						new_op = false;
+						cpos += ctx->cfg.debug_prefix_size;
+						if( cpos == ctx->code_pos_map[rpos+1] ) break;
+					}
+				}
+				printf("%.2X",ctx->code_instrs[cpos++]);
+			}
+			rpos++;
+		}
+		printf("\n");
+	}
+	// invalid ?
+	// vpos is advanced by the loop header, not inside the printf arguments:
+	// reading vpos and incrementing it in the same argument list is unsequenced
+	// (undefined behavior) and could print a mismatched index/position pair
+	for( ; vpos < ctx->value_count; vpos++ )
+		printf(" ??? UNWRITTEN VALUE V%d @%X\n", vpos, ctx->values_writes[vpos]);
+	// interrupted
+	if( cur_op < f->nops ) {
+		printf("@%X ", cur_op);
+		hl_dump_op(ctx->fun, f->ops + cur_op);
+		printf("\n\t\t...\n");
+	}
+	if( cpos == ctx->code_size && cpos > 0 ) {
+		int n = 1;
+		for(int i=0;icode_pos_map[n] == i ) {
+				if( (n & 15) == 0 ) printf("\n"); else printf(" ");
+				n++;
+			}
+			printf("%.2X", ctx->code_instrs[i]);
+		}
+	}
+	printf("\n\n");
+	fflush(stdout);
+}
diff --git a/src/jit_emit.c b/src/jit_emit.c
new file mode 100644
index 000000000..7524c5483
--- /dev/null
+++ b/src/jit_emit.c
@@ -0,0 +1,2214 @@
+/*
+ * Copyright (C)2015-2016 Haxe Foundation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject
to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ +#include +#include +#include +#include "data_struct.h" + +//#define EMIT_DEBUG + +#ifdef EMIT_DEBUG +# define emit_debug jit_debug +#else +# define emit_debug(...) +#endif + +int hl_emit_mode_sizes[] = {0,1,2,4,HL_WSIZE,8,4,0,0}; + +typedef struct { + hl_type *t; + int id; + ereg stored; +} vreg; + +#define MAX_TMP_ARGS 32 +#define MAX_TRAPS 32 + +typedef struct _linked_inf linked_inf; +typedef struct _emit_block emit_block; +typedef struct _tmp_phi tmp_phi; + +#define S_TYPE blocks +#define S_NAME(name) blocks_##name +#define S_VALUE emit_block* +#include "data_struct.c" +#define blocks_add(set,v) blocks_add_impl(DEF_ALLOC,&(set),v) + +#define S_TYPE phi_arr +#define S_NAME(name) phi_##name +#define S_VALUE tmp_phi* +#include "data_struct.c" +#define phi_add(set,v) phi_add_impl(DEF_ALLOC,&(set),v) + +#define S_SORTED + +#define S_MAP +#define S_TYPE ereg_map +#define S_NAME(name) ereg_##name +#define S_KEY ereg +#define S_VALUE emit_block* +#include "data_struct.c" +#define ereg_add(set,k,v) ereg_add_pair_impl(DEF_ALLOC,&(set),k,v) + +#define S_MAP + +#define S_TYPE vreg_map +#define S_NAME(name) vreg_##name +#define S_KEY int +#define S_VALUE ereg +#include "data_struct.c" +#define vreg_replace(set,k,v) vreg_replace_impl(DEF_ALLOC,&(set),k,v) + +struct _linked_inf { + int id; + void *ptr; + linked_inf *next; 
+}; + +struct _emit_block { + int id; + int start_pos; + int end_pos; + int wait_nexts; + bool sealed; + blocks nexts; + blocks preds; + vreg_map written_vars; + phi_arr phis; + emit_block *wait_seal_next; +}; + +struct _tmp_phi { + ereg value; + vreg *r; + ereg target; + int final_id; + bool locked; + bool opt; + emit_mode mode; + emit_block *b; + ereg_map vals; + phi_arr ref_phis; + linked_inf *ref_blocks; +}; + +typedef struct { + ereg stack; + int target; +} trap_inf; + +struct _emit_ctx { + hl_module *mod; + hl_function *fun; + jit_ctx *jit; + + einstr *instrs; + vreg *vregs; + tmp_phi **phis; + int max_instrs; + int max_regs; + int max_phis; + int emit_pos; + int op_pos; + int phi_count; + int phi_depth; + bool flushed; + + ereg tmp_args[MAX_TMP_ARGS]; + trap_inf traps[MAX_TRAPS]; + int *pos_map; + int pos_map_size; + int trap_count; + + int_arr args_data; + int_arr jump_regs; + int_arr values; + + blocks blocks; + emit_block *current_block; + emit_block *wait_seal; + linked_inf *arrival_points; + vclosure *closure_list; +}; + +#define R(i) (ctx->vregs + (i)) + +#define LOAD(r) emit_load_reg(ctx, r) +#define STORE(r, v) emit_store_reg(ctx, r, v) +#define LOAD_CONST(v, t) emit_load_const(ctx, (uint64)(v), t) +#define LOAD_CONST_PTR(v) LOAD_CONST(v,&hlt_bytes) +#define LOAD_MEM(v, offs, t) emit_load_mem(ctx, v, offs, t, t) +#define LOAD_MEM_PTR(v, offs) LOAD_MEM(v, offs, &hlt_bytes) +#define STORE_MEM(to, offs, v) emit_store_mem(ctx, to, offs, v) +#define LOAD_OBJ_METHOD(obj,id) LOAD_MEM_PTR(LOAD_MEM_PTR(LOAD_MEM_PTR(obj,0),HL_WSIZE*2),HL_WSIZE*(id)) +#define OFFSET(base,index,mult,offset) emit_gen_ext(ctx, LEA, base, index, M_PTR, (mult) | ((offset) << 8)) +#define BREAK() emit_gen(ctx, DEBUG_BREAK, UNUSED, UNUSED, 0) +#define GET_MODE(r) emit_get_mode(ctx,r) +#define GET_PHI(r) ctx->phis[-(r)-1] +#define HDYN_VALUE 8 + +static hl_type hlt_ui8 = { HUI8, 0 }; +static hl_type hlt_ui16 = { HUI16, 0 }; + +static linked_inf *link_add( emit_ctx *ctx, int id, void 
*ptr, linked_inf *head ) { + linked_inf *l = hl_malloc(&ctx->jit->falloc,sizeof(linked_inf)); + l->id = id; + l->ptr = ptr; + l->next = head; + return l; +} + +static linked_inf *link_add_sort_unique( emit_ctx *ctx, int id, void *ptr, linked_inf *head ) { + linked_inf *prev = NULL; + linked_inf *cur = head; + while( cur && cur->id < id ) { + prev = cur; + cur = cur->next; + } + // check duplicate + while( cur && cur->id == id ) { + if( cur->ptr == ptr ) + return head; + cur = cur->next; + } + // insert + linked_inf *l = hl_malloc(&ctx->jit->falloc,sizeof(linked_inf)); + l->id = id; + l->ptr = ptr; + if( !prev ) { + l->next = head; + return l; + } else { + l->next = prev->next; + prev->next = l; + return head; + } +} + +static linked_inf *link_add_sort_replace( emit_ctx *ctx, int id, void *ptr, linked_inf *head ) { + linked_inf *prev = NULL; + linked_inf *cur = head; + while( cur && cur->id < id ) { + prev = cur; + cur = cur->next; + } + // replace duplicate + if( cur && cur->id == id ) { + cur->ptr = ptr; + return head; + } + // insert + linked_inf *l = hl_malloc(&ctx->jit->falloc,sizeof(linked_inf)); + l->id = id; + l->ptr = ptr; + if( !prev ) { + l->next = head; + return l; + } else { + l->next = prev->next; + prev->next = l; + return head; + } +} + +static void *link_sort_lookup( linked_inf *head, int id ) { + while( head && head->id < id ) + head = head->next; + if( head && head->id == id ) + return head->ptr; + return NULL; +} + +static linked_inf *link_sort_remove( linked_inf *head, int id ) { + linked_inf *prev = NULL; + linked_inf *cur = head; + while( cur && cur->id < id ) { + prev = cur; + cur = cur->next; + } + if( cur && cur->id == id ) { + if( !prev ) + return cur->next; + prev->next = cur->next; + return head; + } + return head; +} + +static emit_mode hl_type_mode( hl_type *t ) { + static emit_mode CONV[] = { + M_VOID, + M_UI8, + M_UI16, + M_I32, + M_PTR, + M_F32, + M_F64, + sizeof(bool) == 1 ? 
M_UI8 : M_I32,
+	};
+	if( t->kind <= HBOOL )
+		return CONV[t->kind];
+	return M_PTR;
+}
+
+// Registers the last emitted instruction (emit_pos-1) as the definition of a
+// new value and returns the id of that value (its index in ctx->values).
+static ereg new_value( emit_ctx *ctx ) {
+	ereg r = int_arr_count(ctx->values);
+	int_arr_add(ctx->values, ctx->emit_pos-1);
+	return r;
+}
+
+// Returns the shared scratch array used to build argument lists; errors out
+// when more than MAX_TMP_ARGS entries are requested. The array is owned by the
+// context, so its content is only valid until the next call.
+static ereg *get_tmp_args( emit_ctx *ctx, int count ) {
+	if( count > MAX_TMP_ARGS ) jit_error("Too many arguments");
+	return ctx->tmp_args;
+}
+
+// Returns the emit mode of a value: the phi's mode for negative ids, otherwise
+// the mode of the instruction that produced it. Asserts on a null reference.
+static emit_mode emit_get_mode( emit_ctx *ctx, ereg v ) {
+	if( IS_NULL(v) ) jit_assert();
+	if( v < 0 )
+		return GET_PHI(v)->mode;
+	return ctx->instrs[int_arr_get(ctx->values,v)].mode;
+}
+
+// Returns an indentation string of 3 + 2*phi_depth spaces (capped at 19) held in
+// a static buffer; used to indent debug traces.
+static const char *phi_prefix( emit_ctx *ctx ) {
+	static char tmp[20];
+	int sp = 3 + ctx->phi_depth * 2;
+	if( sp > 19 ) sp = 19;
+	memset(tmp,0x20,sp);
+	tmp[sp] = 0;
+	return tmp;
+}
+
+// Appends a zero-initialized instruction with the given op and returns it.
+// The buffer doubles when full (256 entries initially); the returned pointer is
+// therefore invalidated by any later emit_instr call.
+static einstr *emit_instr( emit_ctx *ctx, emit_op op ) {
+	if( ctx->emit_pos == ctx->max_instrs ) {
+		int pos = ctx->emit_pos;
+		int next_size = ctx->max_instrs ? (ctx->max_instrs << 1) : 256;
+		einstr *instrs = (einstr*)malloc(sizeof(einstr) * next_size);
+		if( instrs == NULL ) jit_error("Out of memory");
+		// nothing to copy on the first growth; also avoids handing memcpy a
+		// source pointer that has not been allocated yet
+		if( pos > 0 )
+			memcpy(instrs, ctx->instrs, pos * sizeof(einstr));
+		memset(instrs + pos, 0, (next_size - pos) * sizeof(einstr));
+		free(ctx->instrs);
+		ctx->instrs = instrs;
+		ctx->max_instrs = next_size;
+	} else if( (ctx->emit_pos & 0xFF) == 0 )
+		// entering a new 256-entry page of an existing buffer: clear it first
+		// NOTE(review): presumably because the buffer is reused between functions - confirm
+		memset(ctx->instrs + ctx->emit_pos, 0, 256 * sizeof(einstr));
+	einstr *e = ctx->instrs + ctx->emit_pos++;
+	e->op = op;
+	return e;
+}
+
+// Emits a STORE of `from` at address `to` + offs; the access mode is the mode
+// of the stored value.
+static void emit_store_mem( emit_ctx *ctx, ereg to, int offs, ereg from ) {
+	einstr *e = emit_instr(ctx, STORE);
+	e->mode = GET_MODE(from);
+	e->size_offs = offs;
+	e->a = to;
+	e->b = from;
+}
+
+#define store_args hl_emit_store_args
+// Attaches an argument list to instruction `e`.
+//   count == 0 : nothing stored
+//   count == 1 : the argument is stored inline in e->size_offs
+//   otherwise  : the arguments are copied to ctx->args_data and e->size_offs
+//                holds their start index
+// e->nargs is an unsigned char, so at most 255 arguments can be represented:
+// a count of 256 would be truncated to 0 and the arguments silently dropped.
+// NOTE(review): hl_dump_args treats nargs == 0xFF as "no argument list"; confirm
+// whether 255 should be rejected here as well.
+void hl_emit_store_args( emit_ctx *ctx, einstr *e, ereg *args, int count ) {
+	if( count < 0 ) jit_assert();
+	if( count > 255 ) jit_error("Too many arguments");
+	e->nargs = (unsigned char)count;
+	if( count == 0 ) return;
+	if( count == 1 ) {
+		e->size_offs = args[0];
+		return;
+	}
+	int *args_data = int_arr_reserve(ctx->args_data, count);
+	e->size_offs = (int)(args_data - ctx->args_data.values);
+	// copies `count` ints: relies on ereg having the size of an int
+	memcpy(args_data, args, sizeof(int) * count);
+}
+
+// Returns the argument list stored by hl_emit_store_args: NULL when there is
+// none, a pointer to the inline slot for a single argument, otherwise a pointer
+// into ctx->args_data (invalidated when that array grows).
+ereg *hl_emit_get_args( emit_ctx *ctx, einstr *e ) {
+	if( e->nargs == 0 )
+		return NULL;
+	if( e->nargs == 1 )
+		return (ereg*)&e->size_offs;
+	return (ereg*)(ctx->args_data.values + e->size_offs);
+}
+
+// Emits an instruction with operands a/b, a mode and a size/offset payload.
+// Asserts that the mode fits the unsigned char field. Returns the new value
+// defined by the instruction, or UNUSED when mode is 0 or M_NORET.
+static ereg emit_gen_ext( emit_ctx *ctx, emit_op op, ereg a, ereg b, int mode, int size_offs ) {
+	einstr *e = emit_instr(ctx, op);
+	if( (unsigned char)mode != mode ) jit_assert();
+	e->mode = (unsigned char)mode;
+	e->size_offs = size_offs;
+	e->a = a;
+	e->b = b;
+	return mode == 0 || mode == M_NORET ? UNUSED : new_value(ctx);
+}
+
+// Same as emit_gen_ext with a zero size/offset payload.
+static ereg emit_gen( emit_ctx *ctx, emit_op op, ereg a, ereg b, int mode ) {
+	return emit_gen_ext(ctx,op,a,b,mode,0);
+}
+
+// Emits an operand-less instruction carrying only a size/offset payload.
+// ALLOC_STACK produces a pointer value; every other op produces no value.
+static ereg emit_gen_size( emit_ctx *ctx, emit_op op, int size_offs ) {
+	return emit_gen_ext(ctx,op,UNUSED,UNUSED,op==ALLOC_STACK ? M_PTR : 0,size_offs);
+}
+
+// Overwrites the mode of the most recently emitted instruction.
+static void patch_instr_mode( emit_ctx *ctx, int mode ) {
+	ctx->instrs[ctx->emit_pos-1].mode = (unsigned char)mode;
+}
+
+static tmp_phi *alloc_phi( emit_ctx *ctx, emit_block *b, vreg *r ) {
+	if( ctx->phi_count == ctx->max_phis ) {
+		int new_size = ctx->max_phis ?
ctx->max_phis << 1 : 64; + tmp_phi **phis = (tmp_phi**)malloc(sizeof(tmp_phi*) * new_size); + if( phis == NULL ) jit_error("Out of memory"); + memcpy(phis, ctx->phis, sizeof(tmp_phi*) * ctx->phi_count); + free(ctx->phis); + ctx->phis = phis; + ctx->max_phis = new_size; + } + tmp_phi *p = (tmp_phi*)hl_zalloc(&ctx->jit->falloc, sizeof(tmp_phi)); + p->b = b; + p->r = r; + if( r ) p->mode = hl_type_mode(r->t); + p->value = -(++ctx->phi_count); + phi_add(b->phis,p); + GET_PHI(p->value) = p; + return p; +} + +static emit_block *alloc_block( emit_ctx *ctx ) { + emit_block *b = hl_zalloc(&ctx->jit->falloc, sizeof(emit_block)); + b->id = blocks_count(ctx->blocks); + b->start_pos = ctx->emit_pos; + blocks_add(ctx->blocks, b); + if( b->id > 0 ) emit_gen_size(ctx, BLOCK, b->id); + return b; +} + +static void block_add_pred( emit_ctx *ctx, emit_block *b, emit_block *p ) { + for_iter(blocks,p2,b->preds) + if( p2 == p ) + return; + blocks_add(b->preds,p); + blocks_add(p->nexts,b); + emit_debug(" PRED #%d\n",p->id); +} + +static void store_block_var( emit_ctx *ctx, emit_block *b, vreg *r, ereg v ) { + if( IS_NULL(v) ) jit_assert(); + vreg_replace(b->written_vars,r->id,v); + if( v < 0 ) { + tmp_phi *p = GET_PHI(v); + p->ref_blocks = link_add_sort_unique(ctx,b->id,b,p->ref_blocks); + } +} + +static bool split_block( emit_ctx *ctx ) { + if( ctx->current_block->start_pos == ctx->emit_pos-1 ) + return false; + emit_block *b = alloc_block(ctx); + b->sealed = true; + emit_debug("BLOCK #%d@%X[%X]\n",b->id,b->start_pos,ctx->op_pos); + while( ctx->arrival_points && ctx->arrival_points->id == ctx->op_pos ) { + block_add_pred(ctx, b, (emit_block*)ctx->arrival_points->ptr); + ctx->arrival_points = ctx->arrival_points->next; + } + einstr *eprev = &ctx->instrs[b->start_pos-1]; + if( eprev->op != JUMP && eprev->op != JUMP_TABLE && eprev->op != RET && eprev->mode != M_NORET ) + block_add_pred(ctx, b, ctx->current_block); + ctx->current_block->end_pos = b->start_pos; + ctx->current_block = b; + 
return true; +} + +static void add_jump_target( emit_ctx *ctx, int offs ) { + if( offs == 0 && ctx->current_block->start_pos == ctx->emit_pos-1 ) + return; + int target = offs + ctx->op_pos + 1; + ctx->arrival_points = link_add_sort_unique(ctx, target, ctx->current_block, ctx->arrival_points); +} + +static int emit_jump( emit_ctx *ctx, bool cond ) { + int p = ctx->emit_pos; + emit_gen(ctx, cond ? JCOND : JUMP, UNUSED, UNUSED, 0); + if( !cond ) add_jump_target(ctx, 0); + split_block(ctx); + return p; +} + +static void patch_jump( emit_ctx *ctx, int jpos ) { + emit_block *b = NULL; + // find the block or initial jump was + for_iter_back(blocks,b2,ctx->blocks) { + if( b2->start_pos <= jpos ) { + b = b2; + break; + } + } + if( !b || b == ctx->current_block ) jit_assert(); + // patch opcode + bool after_block = ctx->current_block->start_pos == ctx->emit_pos-1; + ctx->instrs[jpos].size_offs = ctx->emit_pos - (after_block?1:0) - (jpos + 1); + if( after_block ) { + block_add_pred(ctx, ctx->current_block, b); + } else { + if( !split_block(ctx) ) jit_assert(); + } +} + +static void register_jump( emit_ctx *ctx, int jpos, int offs ) { + int target = offs + ctx->op_pos + 1; + int_arr_add(ctx->jump_regs, jpos); + int_arr_add(ctx->jump_regs, target); + if( offs > 0 ) add_jump_target(ctx, offs); +} + +static ereg emit_load_const( emit_ctx *ctx, uint64 value, hl_type *size_t ) { + einstr *e = emit_instr(ctx, LOAD_CONST); + e->mode = hl_type_mode(size_t); + e->value = value; + return new_value(ctx); +} + +static ereg emit_load_mem( emit_ctx *ctx, ereg v, int offset, hl_type *size_t, hl_type *to_t ) { + einstr *e = emit_instr(ctx, LOAD_ADDR); + e->mode = hl_type_mode(to_t); + e->a = v; + e->nargs = hl_type_mode(size_t); + e->size_offs = offset; + return new_value(ctx); +} + +static void emit_store_reg( emit_ctx *ctx, vreg *to, ereg v ) { + if( to->t->kind == HVOID ) return; + if( IS_NULL(v) ) jit_assert(); + store_block_var(ctx,ctx->current_block,to,v); + if( ctx->trap_count > 0 ) { 
+ // if the value was written before the trap, let's update it + if( !IS_NULL(to->stored) ) + STORE_MEM(emit_gen(ctx,ADDRESS,to->stored,UNUSED,M_PTR), 0, v); + } else { + to->stored = v; + } +} + +static ereg emit_native_call( emit_ctx *ctx, void *native_ptr, ereg args[], int nargs, hl_type *ret ) { + einstr *e = emit_instr(ctx, CALL_PTR); + e->mode = (unsigned char)(ret ? hl_type_mode(ret) : M_NORET); + e->value = (int_val)native_ptr; + store_args(ctx, e, args, nargs); + return ret == NULL || e->mode == M_VOID ? UNUSED : new_value(ctx); +} + +static ereg emit_dyn_call( emit_ctx *ctx, ereg f, ereg args[], int nargs, hl_type *ret ) { + einstr *e = emit_instr(ctx, CALL_REG); + e->mode = hl_type_mode(ret); + e->a = f; + store_args(ctx, e, args, nargs); + return e->mode == M_VOID ? UNUSED : new_value(ctx); +} + +static void emit_test( emit_ctx *ctx, ereg v, hl_op o ) { + emit_gen_ext(ctx, TEST, v, UNUSED, 0, o); + patch_instr_mode(ctx, GET_MODE(v)); +} + +static void emit_cmp( emit_ctx *ctx, ereg a, ereg b, hl_op o ) { + emit_gen_ext(ctx, CMP, a, b, 0, o); + patch_instr_mode(ctx, GET_MODE(a)); +} + +static void phi_remove_val( emit_ctx *ctx, tmp_phi *p, ereg v ) { + ereg_remove(&p->vals,v); + emit_debug("%sPHI-REM-DEP %s = %s\n", phi_prefix(ctx), val_str(p->value,p->mode), val_str(v,p->mode)); +} + +static void phi_add_val( emit_ctx *ctx, tmp_phi *p, ereg v, emit_block *from ) { + if( !p->b ) jit_assert(); + if( IS_NULL(v) ) jit_assert(); + if( p->value == v ) + return; + if( !ereg_add(p->vals,v,from) ) + return; + emit_debug("%sPHI-DEP %s:#%d = %s\n", phi_prefix(ctx), val_str(p->value,p->mode), from->id, val_str(v,p->mode)); + if( v < 0 ) { + tmp_phi *p2 = GET_PHI(v); + phi_add(p2->ref_phis,p); + } +} + +static ereg optimize_phi_rec( emit_ctx *ctx, tmp_phi *p ) { + + if( p->locked ) jit_assert(); + ereg same = UNUSED; + for_iter_key(ereg,v,p->vals) { + if( v == same || v == p->value ) + continue; + if( !IS_NULL(same) ) + return p->value; + same = v; + } + if( 
IS_NULL(same) ) + return p->value; // sealed (no dep yet) + + if( !phi_count(p->ref_phis) && !p->ref_blocks ) + return same; + + if( p->locked || p->opt ) jit_assert(); + + emit_debug("%sPHI-OPT %s = %s\n", phi_prefix(ctx), val_str(p->value,p->mode), val_str(same,p->mode)); + p->opt = true; + ctx->phi_depth++; + linked_inf *l = p->ref_blocks; + while( l ) { + emit_block *b = (emit_block*)l->ptr; + if( vreg_find(b->written_vars,p->r->id) == p->value ) + store_block_var(ctx,b,p->r,same); + l = l->next; + } + for_iter(phi,p2,p->ref_phis) { + emit_block *bsame = ereg_find(p2->vals,p->value); + phi_remove_val(ctx,p2,p->value); + phi_add_val(ctx,p2,same,bsame); + } + p->ref_blocks = NULL; + int count = phi_count(p->ref_phis); + tmp_phi **phis = phi_free(&p->ref_phis); + for(int i=0;iphi_depth--; + emit_debug("%sPHI-OPT-DONE %s = %s\n", phi_prefix(ctx), val_str(p->value,p->mode), val_str(same,p->mode)); + return optimize_phi_rec(ctx,p); +} + +static ereg emit_load_reg_block( emit_ctx *ctx, emit_block *b, vreg *r ); + +static ereg gather_phis( emit_ctx *ctx, tmp_phi *p ) { + p->locked = true; + for_iter(blocks,b,p->b->preds) { + ereg r = p->r ? 
emit_load_reg_block(ctx, b, p->r) : p->value; + phi_add_val(ctx, p, r, b); + } + p->locked = false; + return optimize_phi_rec(ctx, p); +} + +static ereg emit_load_reg_block( emit_ctx *ctx, emit_block *b, vreg *r ) { + ereg v = vreg_find(b->written_vars,r->id); + if( !IS_NULL(v) ) + return v; + if( !b->sealed ) { + tmp_phi *p = alloc_phi(ctx,b,r); + emit_debug("%sPHI-SEALED %s = R%d\n",phi_prefix(ctx),val_str(p->value,p->mode),r->id); + v = p->value; + } else if( blocks_count(b->preds) == 1 ) + v = emit_load_reg_block(ctx, blocks_get(b->preds,0), r); + else { + tmp_phi *p = alloc_phi(ctx,b,r); + store_block_var(ctx,b,r,p->value); + v = gather_phis(ctx, p); + } + store_block_var(ctx,b,r,v); + return v; +} + +static ereg emit_load_reg( emit_ctx *ctx, vreg *r ) { + return emit_load_reg_block(ctx, ctx->current_block, r); +} + +static void seal_block( emit_ctx *ctx, emit_block *b ) { + emit_debug(" SEAL #%d\n",b->id); + for_iter(phi,p,b->phis) + gather_phis(ctx, p); + b->sealed = true; +} + +static ereg emit_call_fid( emit_ctx *ctx, int findex, ereg *args, int nargs, emit_mode mode ) { + einstr *e = emit_instr(ctx, CALL_FUN); + e->mode = mode; + e->a = findex; + store_args(ctx, e, args, nargs); + return mode == M_VOID ? 
UNUSED : new_value(ctx); +} + +static void emit_call_fun( emit_ctx *ctx, vreg *dst, int findex, int count, int *args_regs ) { + hl_module *m = ctx->mod; + int fid = m->functions_indexes[findex]; + bool isNative = fid >= m->code->nfunctions; + ereg *args = get_tmp_args(ctx, count); + for(int i=0;ifunctions_ptrs[findex], args, count, dst->t)); + else { + ereg out = emit_call_fid(ctx,findex,args,count,hl_type_mode(dst->t)); + if( out ) STORE(dst, out); + } +} + +static vclosure *alloc_static_closure( emit_ctx *ctx, int fid ) { + hl_module *m = ctx->mod; + vclosure *c = hl_malloc(&m->ctx.alloc,sizeof(vclosure)); + int fidx = m->functions_indexes[fid]; + c->hasValue = 0; + if( fidx >= m->code->nfunctions ) { + // native + c->t = m->code->natives[fidx - m->code->nfunctions].t; + c->fun = m->functions_ptrs[fid]; + c->value = NULL; + } else { + c->t = m->code->functions[fidx].type; + c->fun = (void*)(int_val)fid; + c->value = ctx->closure_list; + ctx->closure_list = c; + } + return c; +} + +static void *get_dynget( hl_type *t ) { + switch( t->kind ) { + case HF32: + return hl_dyn_getf; + case HF64: + return hl_dyn_getd; + case HI64: + case HGUID: + return hl_dyn_geti64; + case HI32: + case HUI16: + case HUI8: + case HBOOL: + return hl_dyn_geti; + default: + return hl_dyn_getp; + } +} + +static void *get_dynset( hl_type *t ) { + switch( t->kind ) { + case HF32: + return hl_dyn_setf; + case HF64: + return hl_dyn_setd; + case HI64: + case HGUID: + return hl_dyn_seti64; + case HI32: + case HUI16: + case HUI8: + case HBOOL: + return hl_dyn_seti; + default: + return hl_dyn_setp; + } +} + +static void *get_dyncast( hl_type *t ) { + switch( t->kind ) { + case HF32: + return hl_dyn_castf; + case HF64: + return hl_dyn_castd; + case HI64: + case HGUID: + return hl_dyn_casti64; + case HI32: + case HUI16: + case HUI8: + case HBOOL: + return hl_dyn_casti; + default: + return hl_dyn_castp; + } +} + +static void emit_store_size( emit_ctx *ctx, ereg dst, int dst_offset, ereg src, int 
src_offset, int total_size ) { + int offset = 0; + while( offset < total_size) { + int remain = total_size - offset; + hl_type *ct = remain >= HL_WSIZE ? &hlt_bytes : (remain >= 4 ? &hlt_i32 : &hlt_ui8); + STORE_MEM(dst, dst_offset+offset, LOAD_MEM(src,src_offset+offset,ct)); + offset += hl_type_size(ct); + } +} + + +static ereg emit_conv( emit_ctx *ctx, ereg v, emit_mode from, emit_mode to, bool _unsigned ) { + if( from == to && !_unsigned ) + return emit_gen(ctx,MOV,v,UNUSED,to); + if( IS_FLOAT(from) != IS_FLOAT(to) ) + return emit_gen_ext(ctx, _unsigned ? CONV_UNSIGNED : CONV, v, UNUSED, to, from); + return emit_gen_ext(ctx, CONV, v, UNUSED, to, from); +} + +static bool dyn_need_type( hl_type *t ) { + return !(t->kind == HF32 || t->kind == HF64 || t->kind == HI64 || t->kind == HGUID); +} + +static void emit_dyn_cast( emit_ctx *ctx, ereg v, hl_type *t, vreg *dst ) { + hl_type *dt = dst->t; + if( t->kind == HNULL && t->tparam->kind == dt->kind ) { + emit_test(ctx, v, OJNotNull); + int jnot = emit_jump(ctx, true); + ereg v1 = LOAD_CONST(0,dt); + STORE(dst, v1); + int jend = emit_jump(ctx, false); + patch_jump(ctx, jnot); + ereg v2 = LOAD_MEM(v,HDYN_VALUE,dt); + STORE(dst, v2); + patch_jump(ctx, jend); + return; + } + bool need_dyn = dyn_need_type(dt); + ereg st = emit_gen_size(ctx, ALLOC_STACK, HL_WSIZE); + STORE_MEM(st, 0, v); + ereg args[3]; + args[0] = st; + args[1] = LOAD_CONST_PTR(t); + if( need_dyn ) args[2] = LOAD_CONST_PTR(dt); + ereg r = emit_native_call(ctx, get_dyncast(dt), args, need_dyn ? 
3 : 2, dt); + STORE(dst, r); +} + +static void emit_opcode( emit_ctx *ctx, hl_opcode *o ); + +static void remap_phi_reg( emit_ctx *ctx, ereg *r ) { + if( *r >= 0 || IS_NULL(*r) ) + return; + tmp_phi *p = GET_PHI(*r); + while( p->final_id < 0 ) { + if( p->target >= 0 ) { + *r = p->target; + return; + } + p = GET_PHI(p->target); + } + if( p->final_id == 0 ) + return; + *r = -p->final_id; // new phis +} + +static void emit_write_block( emit_ctx *ctx, emit_block *b ) { + jit_ctx *jit = ctx->jit; + eblock *bl = jit->blocks + b->id; + bl->start_pos = b->id == 0 ? 0 : b->start_pos; + bl->end_pos = b->end_pos; + bl->pred_count = blocks_count(b->preds); + bl->next_count = blocks_count(b->nexts); + bl->preds = (int*)hl_malloc(&jit->falloc,sizeof(int)*bl->pred_count); + bl->nexts = (int*)hl_malloc(&jit->falloc,sizeof(int)*bl->next_count); + for(int i=0;ipred_count;i++) + bl->preds[i] = blocks_get(b->preds,i)->id; + for(int i=0;inext_count;i++) + bl->nexts[i] = blocks_get(b->nexts,i)->id; + // write phis + { + for_iter(phi,p,b->phis) + if( p->final_id >= 0 ) + bl->phi_count++; + } + bl->phis = (ephi*)hl_zalloc(&jit->falloc,sizeof(ephi)*bl->phi_count); + jit->phi_count += bl->phi_count; + int i = 0; + for_iter(phi,p,b->phis) { + if( p->final_id < 0 ) + continue; + ephi *p2 = bl->phis + i++; + if( p->final_id == 0 ) + p2->value = p->value; + else + p2->value = -p->final_id; + p2->mode = p->mode; + p2->nvalues = ereg_count(p->vals); + p2->values = (ereg*)hl_malloc(&jit->falloc,sizeof(ereg)*p2->nvalues); + p2->blocks = (ereg*)hl_malloc(&jit->falloc,sizeof(int)*p2->nvalues); + int k = 0; + for_iter_key(ereg,v,p->vals) { + remap_phi_reg(ctx, &v); + p2->values[k++] = v; + } + k = 0; + for_iter(ereg,bfrom,p->vals) + p2->blocks[k++] = bfrom->id; + } +} + +void hl_emit_remap_jumps( emit_ctx *ctx, void *_jumps, einstr *instrs, int *pos_map ) { + int_arr jumps = *(int_arr*)_jumps; + int i = 0; + while( i < int_arr_count(jumps) ) { + int pos = int_arr_get(jumps,i++); + int target = 
int_arr_get(jumps,i++); + einstr *e = instrs + pos; + if( e->op == JUMP_TABLE ) { + int *args = (int*)hl_emit_get_args(ctx, e); + for(int k=0;knargs;k++) + args[k] = pos_map[target + args[k]] - (pos + 1); + } else + e->size_offs = pos_map[target] - (pos + 1); + } + int_arr_reset((int_arr*)_jumps); +} + +void hl_emit_flush( jit_ctx *jit ) { + emit_ctx *ctx = jit->emit; + if( ctx->flushed ) return; + ctx->flushed = true; + ctx->pos_map[ctx->fun->nops] = ctx->emit_pos; + ctx->current_block->end_pos = ctx->emit_pos; + hl_emit_remap_jumps(ctx,&ctx->jump_regs, ctx->instrs, ctx->pos_map); + jit->instrs = ctx->instrs; + jit->instr_count = ctx->emit_pos; + jit->emit_pos_map = ctx->pos_map; + jit->phi_count = 0; + jit->block_count = ctx->current_block->id + 1; + jit->blocks = hl_zalloc(&jit->falloc,sizeof(eblock) * jit->block_count); + jit->value_count = int_arr_count(ctx->values); + jit->values_writes = ctx->values.values; + for_iter(blocks,b,ctx->blocks) + emit_write_block(ctx,b); +} + +void hl_emit_reg_iter( jit_ctx *jit, einstr *e, void *ctx, void (*iter_reg)( void *, ereg * ) ) { + switch( e->op ) { + case CALL_REG: + iter_reg(ctx,&e->a); + case CALL_FUN: + case CALL_PTR: + { + int i; + ereg *args = hl_emit_get_args(jit->emit, e); + for(i=0;inargs;i++) + iter_reg(ctx, args + i); + } + break; + case LOAD_CONST: + case PUSH_CONST: + // skip + break; + default: + if( !IS_NULL(e->a) ) { + iter_reg(ctx,&e->a); + if( !IS_NULL(e->b) ) + iter_reg(ctx,&e->b); + } + break; + } +} + +ereg **hl_emit_get_regs( einstr *e, int *count ) { + static ereg *tmp[2]; + int k = 0; + switch( e->op ) { + case CALL_REG: + case CALL_FUN: + case CALL_PTR: + jit_assert(); + break; + case LOAD_CONST: + case PUSH_CONST: + // skip + break; + default: + if( !IS_NULL(e->a) ) { + tmp[k++] = &e->a; + if( !IS_NULL(e->b) ) + tmp[k++] = &e->b; + } + break; + } + *count = k; + return tmp; +} + +static void hl_emit_clean_phis( emit_ctx *ctx ) { + for(int i=0;iphi_count;i++) { + tmp_phi *p = ctx->phis[i]; + 
tmp_phi *cur = p; + ereg r; + while( true ) { + cur->opt = false; + r = optimize_phi_rec(ctx,cur); + if( r >= 0 || r == cur->value ) break; + cur = GET_PHI(r); + } + p->target = r; + } + int new_phis = 0; + for(int i=0;iphi_count;i++) { + tmp_phi *p = ctx->phis[i]; + if( p->target == p->value ) + p->final_id = ++new_phis; + else + p->final_id = -1; + } + for(int i=0;iemit_pos;i++) + hl_emit_reg_iter(ctx->jit, ctx->instrs + i, ctx, (void*)remap_phi_reg); +} + +void hl_emit_function( jit_ctx *jit ) { + emit_ctx *ctx = jit->emit; + hl_function *f = jit->fun; + int i; + ctx->mod = jit->mod; + ctx->fun = f; + ctx->emit_pos = 0; + ctx->trap_count = 0; + ctx->phi_count = 0; + ctx->flushed = false; + int_arr_free(&ctx->args_data); + int_arr_free(&ctx->jump_regs); + int_arr_free(&ctx->values); + blocks_free(&ctx->blocks); + int_arr_add(ctx->values,-1); + ctx->current_block = alloc_block(ctx); + ctx->current_block->sealed = true; + ctx->arrival_points = NULL; + emit_debug("---- begin [%X] ----\n",f->findex); + if( f->nregs > ctx->max_regs ) { + free(ctx->vregs); + ctx->vregs = (vreg*)malloc(sizeof(vreg) * (f->nregs + 1)); + if( ctx->vregs == NULL ) jit_assert(); + for(i=0;inregs;i++) + R(i)->id = i; + ctx->max_regs = f->nregs; + } + + if( f->nops >= ctx->pos_map_size ) { + free(ctx->pos_map); + ctx->pos_map = (int*)malloc(sizeof(int) * (f->nops+1)); + if( ctx->pos_map == NULL ) jit_assert(); + ctx->pos_map_size = f->nops; + } + + for(i=0;inregs;i++) { + vreg *r = R(i); + r->t = f->regs[i]; + r->stored = UNUSED; + } + + emit_gen_size(ctx, BLOCK, 0); + emit_gen(ctx,ENTER,UNUSED,UNUSED,M_NONE); + for(i=0;itype->fun->nargs;i++) { + hl_type *t = f->type->fun->args[i]; + STORE(R(i), emit_gen(ctx, LOAD_ARG, UNUSED, UNUSED, hl_type_mode(t))); + } + + for(int op_pos=0;op_posnops;op_pos++) { + ctx->op_pos = op_pos; + if( ctx->emit_pos > 0 && ctx->instrs[ctx->emit_pos-1].op == BLOCK ) + ctx->pos_map[op_pos] = ctx->emit_pos-1; + else + ctx->pos_map[op_pos] = ctx->emit_pos; + if( 
ctx->arrival_points ) { + if( ctx->arrival_points->id < op_pos ) + jit_assert(); + while( ctx->arrival_points && ctx->arrival_points->id == op_pos && !split_block(ctx) ) { + emit_block *b = ctx->arrival_points->ptr; + for_iter(blocks,bp,ctx->current_block->preds) { + if( b == bp ) { b = NULL; break; } + } + if( b ) block_add_pred(ctx, ctx->current_block, b); + ctx->arrival_points = ctx->arrival_points->next; + } + if( ctx->trap_count && ctx->traps[ctx->trap_count-1].target == ctx->op_pos ) + ctx->trap_count--; + } + emit_opcode(ctx,f->ops + op_pos); + } + // emit a break if we're not supposed to reach here : will fix RtlUnwind on windows too. + if( f->nops == 0 || f->ops[f->nops-1].op != ORet ) + BREAK(); + if( ctx->arrival_points ) + jit_assert(); + + hl_emit_clean_phis(ctx); + hl_emit_flush(ctx->jit); + if( ctx->wait_seal ) jit_assert(); +} + +void hl_emit_alloc( jit_ctx *jit ) { + emit_ctx *ctx = (emit_ctx*)malloc(sizeof(emit_ctx)); + if( ctx == NULL ) jit_assert(); + memset(ctx,0,sizeof(emit_ctx)); + ctx->jit = jit; + jit->emit = ctx; + if( sizeof(einstr) != 16 ) jit_assert(); +} + +void hl_emit_free( jit_ctx *jit ) { + emit_ctx *ctx = jit->emit; + free(ctx->vregs); + free(ctx->instrs); + free(ctx->pos_map); + free(ctx); + jit->emit = NULL; +} + +void hl_emit_final( jit_ctx *jit ) { + emit_ctx *ctx = jit->emit; + vclosure *l = ctx->closure_list; + while( l ) { + vclosure *n = (vclosure*)l->value; + l->value = NULL; + l->fun = jit->final_code + (int_val)jit->mod->functions_ptrs[(int_val)l->fun]; + l = n; + } + ctx->closure_list = NULL; +} + +static bool seal_block_rec( emit_ctx *ctx, emit_block *b, int target ) { + if( b->start_pos < target ) + return false; + if( b->start_pos == target ) { + b->wait_nexts--; + block_add_pred(ctx, b, ctx->current_block); + while( b && b->wait_nexts == 0 && ctx->wait_seal == b ) { + seal_block(ctx,b); + b = b->wait_seal_next; + ctx->wait_seal = b; + } + return true; + } + for_iter(blocks,p,b->preds) + if( p->start_pos < 
b->start_pos && seal_block_rec(ctx,p,target) ) + return true; + return false; +} + +static void register_block_jump( emit_ctx *ctx, int offs, bool cond ) { + int jidx = ctx->emit_pos; + emit_gen(ctx, cond ? JCOND : JUMP, UNUSED, UNUSED, 0); + register_jump(ctx, jidx, offs); + if( offs < 0 ) { + int target = ctx->pos_map[ctx->op_pos + 1 + offs]; + emit_block *b = ctx->current_block; + if( !seal_block_rec(ctx, b, target) ) jit_assert(); + } +} + +static void prepare_loop_block( emit_ctx *ctx ) { + emit_block *b = ctx->current_block; + // gather all backward jumps to know when the block will be finished + for(int i=ctx->op_pos+1;ifun->nops;i++) { + hl_opcode *op = &ctx->fun->ops[i]; + int offs = 0; + switch( op->op ) { + case OJFalse: + case OJTrue: + case OJNotNull: + case OJNull: + offs = op->p2; + break; + case OJAlways: + offs = op->p1; + break; + case OJEq: + case OJNotEq: + case OJSLt: + case OJSGte: + case OJSLte: + case OJSGt: + case OJULt: + case OJUGte: + case OJNotLt: + case OJNotGte: + offs = op->p3; + break; + default: + break; + } + if( offs < 0 && i + 1 + offs == ctx->op_pos ) { + emit_debug(" WAIT @%X\n",i); + b->wait_nexts++; + if( b->sealed ) { + b->sealed = false; + b->wait_seal_next = ctx->wait_seal; + ctx->wait_seal = b; + } + } + } +} + +static void emit_jump_dyn( emit_ctx *ctx, hl_op op, hl_type *at, ereg a, hl_type *bt, ereg b, int offset ) { + if( at->kind == HDYN || bt->kind == HDYN || at->kind == HFUN || bt->kind == HFUN ) { + ereg args[2] = { a, b }; + ereg ret = emit_native_call(ctx,hl_dyn_compare,args,2,&hlt_i32); + if( op == OJSGt || op == OJSGte ) { + emit_cmp(ctx, ret, LOAD_CONST(hl_invalid_comparison,&hlt_i32), OJEq); + int jinvalid = emit_jump(ctx, true); + emit_test(ctx, ret, op); + register_block_jump(ctx, offset, true); + patch_jump(ctx, jinvalid); + return; + } + emit_test(ctx, ret, op); + // continue + } else switch( at->kind ) { + case HTYPE: + { + ereg args[2] = { a, b }; + ereg ret = 
emit_native_call(ctx,hl_same_type,args,2,&hlt_bool); + emit_test(ctx, emit_gen_ext(ctx,UNOP,ret,UNUSED,M_I32,ONot), op); + } + break; + case HNULL: + { + if( op == OJEq ) { + // if( a == b || (a && b && a->v == b->v) ) goto + emit_cmp(ctx,a,b,OJEq); + register_block_jump(ctx,offset,true); + emit_test(ctx,a,OJNull); + int ja = emit_jump(ctx,true); + emit_test(ctx,b,OJNull); + int jb = emit_jump(ctx,true); + hl_type *vt = at->tparam; + emit_cmp(ctx, LOAD_MEM(a,HDYN_VALUE,vt), LOAD_MEM(b,HDYN_VALUE,vt), OJEq); + register_block_jump(ctx,offset,true); + patch_jump(ctx,ja); + patch_jump(ctx,jb); + } else if( op == OJNotEq ) { + // if( a != b && (!a || !b || a->v != b->v) ) goto + emit_cmp(ctx,a,b,OJEq); + int jeq = emit_jump(ctx,true); + emit_test(ctx,a,OJEq); + register_block_jump(ctx,offset,true); + split_block(ctx); + emit_test(ctx,b,OJEq); + register_block_jump(ctx,offset,true); + split_block(ctx); + hl_type *vt = at->tparam; + emit_cmp(ctx, LOAD_MEM(a,HDYN_VALUE,vt), LOAD_MEM(b,HDYN_VALUE,vt), OJNull); + add_jump_target(ctx, 0); + int jcmp = emit_jump(ctx,true); + register_block_jump(ctx,offset,true); + patch_jump(ctx,jcmp); + patch_jump(ctx,jeq); + } else + jit_assert(); + } + return; + case HVIRTUAL: + if( bt->kind == HOBJ ) { + if( op == OJEq ) { + // if( a == b || (a && a->value == b) ) goto + emit_cmp(ctx, a, b, OJEq); + register_block_jump(ctx,offset,true); + split_block(ctx); + emit_test(ctx, a, OJNull); + int jnot = emit_jump(ctx, true); + emit_cmp(ctx, LOAD_MEM_PTR(a,HL_WSIZE), b, OJEq); + register_block_jump(ctx,offset,true); + split_block(ctx); + patch_jump(ctx, jnot); + } else if( op == OJNotEq ) { + // if( a != b && (!a || a->value != b) ) goto + emit_cmp(ctx, a, b, OJEq); + int jsame = emit_jump(ctx, true); + emit_test(ctx, a, OJNull); + register_block_jump(ctx,offset,true); + split_block(ctx); + emit_cmp(ctx, LOAD_MEM_PTR(a,HL_WSIZE), b, OJNotEq); + register_block_jump(ctx,offset,true); + split_block(ctx); + patch_jump(ctx,jsame); + } else + 
jit_assert(); + } else { + if( op == OJEq ) { + // if( a == b || (a && b && a->value && a->value == b->value) ) goto + emit_cmp(ctx, a, b, OJEq); + register_block_jump(ctx,offset,true); + split_block(ctx); + emit_test(ctx, a, OJNull); + int ja = emit_jump(ctx, true); + emit_test(ctx, b, OJNull); + int jb = emit_jump(ctx, true); + ereg va = LOAD_MEM_PTR(a,HL_WSIZE); + emit_test(ctx, va, OJNull); + int jva = emit_jump(ctx, true); + ereg vb = LOAD_MEM_PTR(b,HL_WSIZE); + emit_cmp(ctx, va, vb, OJEq); + register_block_jump(ctx,offset,true); + split_block(ctx); + patch_jump(ctx,ja); + patch_jump(ctx,jb); + patch_jump(ctx,jva); + } else if( op == OJNotEq ) { + // if( a != b && (!a || !b || !a->value || a->value != b->value) ) goto + emit_cmp(ctx, a, b, OJEq); + int jeq1 = emit_jump(ctx, true); + emit_test(ctx, a, OJNull); + int ja = emit_jump(ctx, true); + emit_test(ctx, b, OJNull); + int jb = emit_jump(ctx, true); + ereg va = LOAD_MEM_PTR(a,HL_WSIZE); + emit_test(ctx, va, OJNull); + int jva = emit_jump(ctx, true); + ereg vb = LOAD_MEM_PTR(b,HL_WSIZE); + emit_cmp(ctx, va, vb, OJEq); + int jeq2 = emit_jump(ctx, true); + split_block(ctx); + patch_jump(ctx,ja); + patch_jump(ctx,jb); + patch_jump(ctx,jva); + register_block_jump(ctx,offset,false); + split_block(ctx); + patch_jump(ctx,jeq1); + patch_jump(ctx,jeq2); + } else + jit_assert(); + } + return; + case HOBJ: + case HSTRUCT: + if( bt->kind == HVIRTUAL ) { + emit_jump_dyn(ctx,op,bt,b,at,a,offset); // inverse + return; + } + if( hl_get_obj_rt(at)->compareFun ) { + ereg args[] = {a,b}; + switch( op ) { + case OJEq: + { + // if( a == b || (a && b && cmp(a,b) == 0) ) goto + emit_cmp(ctx,a,b,OJEq); + int jeq = emit_jump(ctx, true); + emit_test(ctx,a,OJNull); + int ja = emit_jump(ctx, true); + emit_test(ctx,b,OJNull); + int jb = emit_jump(ctx, true); + emit_test(ctx, emit_call_fid(ctx,(int)(int_val)at->obj->rt->compareFun,args,2,M_I32),OJNotNull); + int jcmp = emit_jump(ctx, true); + patch_jump(ctx, jeq); + 
register_block_jump(ctx, offset, false); + split_block(ctx); + patch_jump(ctx, ja); + patch_jump(ctx, jb); + patch_jump(ctx, jcmp); + } + break; + case OJNotEq: + { + // if( a != b && (!a || !b || cmp(a,b) != 0) ) goto + emit_cmp(ctx,a,b,OJEq); + add_jump_target(ctx, 0); + int jeq = emit_jump(ctx, true); + emit_test(ctx,a,OJEq); + register_block_jump(ctx,offset,true); + split_block(ctx); + emit_test(ctx,b,OJEq); + register_block_jump(ctx,offset,true); + split_block(ctx); + emit_test(ctx, emit_call_fid(ctx,(int)(int_val)at->obj->rt->compareFun,args,2,M_I32),OJNotNull); + register_block_jump(ctx,offset,true); + patch_jump(ctx,jeq); + } + break; + default: + { + // if( a && b && cmp(a,b) ~op~ 0 ) goto + emit_test(ctx,a,OJNull); + int ja = emit_jump(ctx, true); + emit_test(ctx,b,OJNull); + int jb = emit_jump(ctx, true); + emit_cmp(ctx, emit_call_fid(ctx,(int)(int_val)at->obj->rt->compareFun,args,2,M_I32), LOAD_CONST(0,&hlt_i32),op); + register_block_jump(ctx,offset,true); + patch_jump(ctx,ja); + patch_jump(ctx,jb); + } + break; + } + return; + } + // fallthrough + default: + emit_cmp(ctx, a, b, op); + break; + } + register_block_jump(ctx, offset, true); +} + +static void emit_opcode( emit_ctx *ctx, hl_opcode *o ) { + vreg *dst = R(o->p1); + vreg *ra = R(o->p2); + vreg *rb = R(o->p3); + hl_module *m = ctx->mod; +#ifdef HL_DEBUG + int uid = (ctx->fun->findex << 16) | ctx->op_pos; + __ignore(&uid); +#endif + switch( o->op ) { + case OMov: + case OUnsafeCast: + STORE(dst, emit_gen(ctx,MOV,LOAD(ra),UNUSED,hl_type_mode(ra->t))); + break; + case OInt: + STORE(dst, LOAD_CONST(m->code->ints[o->p2], dst->t)); + break; + case OBool: + STORE(dst, LOAD_CONST(o->p2, &hlt_bool)); + break; + case ONull: + STORE(dst, LOAD_CONST(0, dst->t)); + break; + case OFloat: + { + union { + float f; + double d; + uint64 i; + } v; + if( dst->t->kind == HF32 ) { + v.i = 0; + v.f = (float)m->code->floats[o->p2]; + } else + v.d = m->code->floats[o->p2]; + STORE(dst, LOAD_CONST(v.i, dst->t)); + } + 
break; + case OString: + STORE(dst, LOAD_CONST_PTR(hl_get_ustring(m->code,o->p2))); + break; + case OBytes: + { + char *b = m->code->version >= 5 ? m->code->bytes + m->code->bytes_pos[o->p2] : m->code->strings[o->p2]; + STORE(dst,LOAD_CONST_PTR(b)); + } + break; + case OGetGlobal: + { + int offs = m->globals_indexes[o->p2]; + STORE(dst, LOAD_MEM_PTR(LOAD_CONST_PTR(m->globals_data),offs)); + } + break; + case OSetGlobal: + { + int offs = m->globals_indexes[o->p1]; + STORE_MEM(LOAD_CONST_PTR(m->globals_data),offs,LOAD(ra)); + } + break; + case OCall0: + emit_call_fun(ctx, dst, o->p2, 0, NULL); + break; + case OCall1: + emit_call_fun(ctx, dst, o->p2, 1, &o->p3); + break; + case OCall2: + { + int args[2] = { o->p3, (int)(int_val)o->extra }; + emit_call_fun(ctx, dst, o->p2, 2, args); + } + break; + case OCall3: + { + int args[3] = { o->p3, o->extra[0], o->extra[1] }; + emit_call_fun(ctx, dst, o->p2, 3, args); + } + break; + case OCall4: + { + int args[4] = { o->p3, o->extra[0], o->extra[1], o->extra[2] }; + emit_call_fun(ctx, dst, o->p2, 4, args); + } + break; + case OCallN: + emit_call_fun(ctx, dst, o->p2, o->p3, o->extra); + break; + case OSub: + case OAdd: + case OMul: + case OSDiv: + case OUDiv: + case OShl: + case OSShr: + case OUShr: + case OAnd: + case OOr: + case OXor: + case OSMod: + case OUMod: + { + ereg va = LOAD(ra); + ereg vb = LOAD(rb); + ereg r; + if( (dst->t->kind == HF32 || dst->t->kind == HF64) && o->op == OSMod ) { + ereg args[] = {va,vb}; + r = emit_native_call(ctx, dst->t->kind == HF32 ? 
(void*)fmodf : (void*)fmod, args, 2, dst->t); + } else { + r = emit_gen_ext(ctx, BINOP, va, vb, hl_type_mode(dst->t), o->op); + } + STORE(dst, r); + } + break; + case ONeg: + STORE(dst, emit_gen_ext(ctx, UNOP, LOAD(ra), UNUSED, hl_type_mode(dst->t), o->op)); + break; + case ONot: + STORE(dst, emit_gen_ext(ctx, UNOP, LOAD(ra), LOAD_CONST(1,&hlt_i32), hl_type_mode(dst->t), OXor)); + break; + case OJFalse: + case OJTrue: + case OJNotNull: + case OJNull: + { + emit_test(ctx, LOAD(dst), o->op); + register_block_jump(ctx, o->p2, true); + add_jump_target(ctx, 0); + } + break; + case OJEq: + case OJNotEq: + case OJSLt: + case OJSGte: + case OJSLte: + case OJSGt: + case OJULt: + case OJUGte: + case OJNotLt: + case OJNotGte: + emit_jump_dyn(ctx,o->op,dst->t,LOAD(dst),ra->t,LOAD(ra),o->p3); + add_jump_target(ctx, 0); + break; + case OJAlways: + register_block_jump(ctx, o->p1, false); + break; + case OToDyn: + if( ra->t->kind == HBOOL ) { + ereg arg = LOAD(ra); + STORE(dst, emit_native_call(ctx,hl_alloc_dynbool,&arg,1,&hlt_dyn)); + } else { + ereg arg = LOAD_CONST_PTR(ra->t); + ereg ret = emit_native_call(ctx,hl_alloc_dynamic,&arg,1,&hlt_dyn); + STORE_MEM(ret,HDYN_VALUE,LOAD(ra)); + STORE(dst, ret); + } + break; + case OToSFloat: + case OToInt: + case OToUFloat: + STORE(dst, emit_conv(ctx,LOAD(ra),hl_type_mode(ra->t),hl_type_mode(dst->t), o->op == OToUFloat)); + break; + case ORet: + emit_gen(ctx, RET, dst->t->kind == HVOID ? 
UNUSED : LOAD(dst), 0, M_NORET); + patch_instr_mode(ctx, hl_type_mode(dst->t)); + break; + case OIncr: + case ODecr: + STORE(dst, emit_gen_ext(ctx,UNOP,LOAD(dst),UNUSED,hl_type_mode(dst->t),o->op)); + break; + case ONew: + { + ereg arg = UNUSED; + void *allocFun = NULL; + int nargs = 1; + switch( dst->t->kind ) { + case HOBJ: + case HSTRUCT: + allocFun = hl_alloc_obj; + break; + case HDYNOBJ: + allocFun = hl_alloc_dynobj; + nargs = 0; + break; + case HVIRTUAL: + allocFun = hl_alloc_virtual; + break; + default: + jit_assert(); + } + if( nargs ) arg = LOAD_CONST_PTR(dst->t); + STORE(dst, emit_native_call(ctx,allocFun,&arg,nargs,dst->t)); + } + break; + case OInstanceClosure: + { + ereg args[3]; + args[0] = LOAD_CONST_PTR(m->code->functions[m->functions_indexes[o->p2]].type); + einstr *e = emit_instr(ctx, LOAD_FUN); + e->mode = M_PTR; + e->size_offs = o->p2; + args[1] = new_value(ctx); + args[2] = LOAD(rb); + STORE(dst, emit_native_call(ctx,hl_alloc_closure_ptr,args,3,dst->t)); + } + break; + case OVirtualClosure: + { + hl_type *t = NULL; + hl_type *ot = ra->t; + while( t == NULL ) { + int i; + for(i=0;iobj->nproto;i++) { + hl_obj_proto *pp = ot->obj->proto + i; + if( pp->pindex == o->p3 ) { + t = m->code->functions[m->functions_indexes[pp->findex]].type; + break; + } + } + ot = ot->obj->super; + } + ereg args[3]; + ereg obj = LOAD(ra); + args[0] = LOAD_CONST_PTR(t); + args[1] = LOAD_OBJ_METHOD(obj,o->p3); + args[2] = obj; + STORE(dst, emit_native_call(ctx,hl_alloc_closure_ptr,args,3,dst->t)); + } + break; + case OCallClosure: + if( ra->t->kind == HDYN ) { + int i; + ereg st = emit_gen_size(ctx, ALLOC_STACK, o->p3 * HL_WSIZE); + for(i=0;ip3;i++) { + vreg *r = R(o->extra[i]); + if( !hl_is_dynamic(r->t) ) jit_assert(); + STORE_MEM(st,i*HL_WSIZE,LOAD(r)); + } + ereg args[3]; + args[0] = LOAD(ra); + args[1] = st; + args[2] = LOAD_CONST(o->p3,&hlt_i32); + emit_dyn_cast(ctx,emit_native_call(ctx,hl_dyn_call,args,3,dst->t),ra->t,dst); + } else { + ereg r = LOAD(ra); + ereg 
*args = get_tmp_args(ctx,o->p3+1); + // Code for if( c->hasValue ) c->fun(c->value,args) else c->fun(args) + ereg has = LOAD_MEM(r,HL_WSIZE*2,&hlt_i32); + emit_test(ctx, has, OJNull); + int jidx = emit_jump(ctx, true); + int i; + args[0] = LOAD_MEM_PTR(r,HL_WSIZE * 3); + for(i=0;ip3;i++) + args[i+1] = LOAD(R(o->extra[i])); + ereg v1 = emit_dyn_call(ctx,LOAD_MEM_PTR(r,HL_WSIZE),args,o->p3 + 1,dst->t); + STORE(dst, v1); + int jend = emit_jump(ctx, false); + patch_jump(ctx, jidx); + for(i=0;ip3;i++) + args[i] = LOAD(R(o->extra[i])); + ereg v2 = emit_dyn_call(ctx,LOAD_MEM_PTR(r,HL_WSIZE),args,o->p3,dst->t); + STORE(dst, v2); + patch_jump(ctx, jend); + } + break; + case OStaticClosure: + { + vclosure *c = alloc_static_closure(ctx,o->p2); + STORE(dst, LOAD_CONST_PTR(c)); + } + break; + case OField: + { + switch( ra->t->kind ) { + case HOBJ: + case HSTRUCT: + { + hl_runtime_obj *rt = hl_get_obj_rt(ra->t); + ereg r = LOAD(ra); + if( dst->t->kind == HSTRUCT ) { + hl_type *ft = hl_obj_field_fetch(ra->t,o->p3)->t; + if( ft->kind == HPACKED ) { + STORE(dst,OFFSET(r, UNUSED, 0, rt->fields_indexes[o->p3])); + break; + } + } + STORE(dst, LOAD_MEM(r,rt->fields_indexes[o->p3],dst->t)); + } + break; + case HVIRTUAL: + // code for : if( hl_vfields(o)[f] ) r = *hl_vfields(o)[f]; else r = hl_dyn_get(o,hash(field),vt) + { + ereg obj = LOAD(ra); + ereg field = LOAD_MEM_PTR(obj,sizeof(vvirtual)+HL_WSIZE*o->p3); + emit_test(ctx, field, OJNull); + int jidx = emit_jump(ctx, true); + ereg v1 = LOAD_MEM(field,0,dst->t); + STORE(dst, v1); + int jend = emit_jump(ctx, false); + patch_jump(ctx, jidx); + bool need_type = dyn_need_type(dst->t); + ereg args[3]; + args[0] = obj; + args[1] = LOAD_CONST(ra->t->virt->fields[o->p3].hashed_name,&hlt_i32); + if( need_type ) args[2] = LOAD_CONST_PTR(dst->t); + ereg v2 = emit_native_call(ctx,get_dynget(dst->t),args,need_type?3:2,dst->t); + STORE(dst, v2); + patch_jump(ctx, jend); + } + break; + default: + jit_assert(); + break; + } + } + break; + case 
OSetField: + { + switch( dst->t->kind ) { + case HOBJ: + case HSTRUCT: + { + ereg obj = LOAD(dst); + ereg val = LOAD(rb); + hl_runtime_obj *rt = hl_get_obj_rt(dst->t); + int field_pos = rt->fields_indexes[o->p2]; + if( rb->t->kind == HSTRUCT ) { + hl_type *ft = hl_obj_field_fetch(dst->t,o->p2)->t; + if( ft->kind == HPACKED ) { + emit_store_size(ctx,obj,field_pos,val,0,hl_get_obj_rt(ft->tparam)->size); + break; + } + } + STORE_MEM(obj,field_pos, val); + } + break; + case HVIRTUAL: + // code for : if( hl_vfields(o)[f] ) *hl_vfields(o)[f] = v; else hl_dyn_set(o,hash(field),vt,v) + { + ereg obj = LOAD(dst); + ereg val = LOAD(rb); + ereg field = LOAD_MEM_PTR(obj,sizeof(vvirtual)+HL_WSIZE*o->p2); + emit_test(ctx, field, OJNull); + int jidx = emit_jump(ctx, true); + STORE_MEM(field, 0, val); + int jend = emit_jump(ctx, false); + patch_jump(ctx, jidx); + bool need_type = dyn_need_type(dst->t); + ereg args[4]; + args[0] = obj; + args[1] = LOAD_CONST(dst->t->virt->fields[o->p2].hashed_name,&hlt_i32); + if( need_type ) { + args[2] = LOAD_CONST_PTR(rb->t); + args[3] = val; + } else { + args[2] = val; + } + emit_native_call(ctx,get_dynset(dst->t),args,need_type?4:3,dst->t); + patch_jump(ctx, jend); + } + break; + default: + jit_assert(); + break; + } + } + break; + case OGetThis: + { + vreg *r = R(0); + ereg obj = LOAD(r); + hl_runtime_obj *rt = hl_get_obj_rt(r->t); + int field_pos = rt->fields_indexes[o->p2]; + if( dst->t->kind == HSTRUCT ) { + hl_type *ft = hl_obj_field_fetch(r->t,o->p2)->t; + if( ft->kind == HPACKED ) { + STORE(dst, OFFSET(obj, UNUSED, 0, field_pos)); + break; + } + } + STORE(dst, LOAD_MEM(obj, field_pos, dst->t)); + } + break; + case OSetThis: + { + vreg *r = R(0); + ereg obj = LOAD(r); + ereg val = LOAD(ra); + hl_runtime_obj *rt = hl_get_obj_rt(r->t); + int field_pos = rt->fields_indexes[o->p1]; + if( ra->t->kind == HSTRUCT ) { + hl_type *ft = hl_obj_field_fetch(r->t,o->p1)->t; + if( ft->kind == HPACKED ) { + emit_store_size(ctx, obj, field_pos, val, 0, 
hl_get_obj_rt(ft->tparam)->size); + break; + } + } + STORE_MEM(obj,field_pos,val); + } + break; + case OCallThis: + { + int i; + int nargs = o->p3 + 1; + ereg obj = LOAD(R(0)); + ereg *args = get_tmp_args(ctx, nargs); + args[0] = obj; + for(i=1;iextra[i-1])); + ereg fun = LOAD_OBJ_METHOD(obj, o->p2); + STORE(dst, emit_dyn_call(ctx,fun,args,nargs,dst->t)); + } + break; + case OCallMethod: + { + vreg *r = R(o->extra[0]); + ereg obj = LOAD(r); + switch( r->t->kind ) { + case HOBJ: + { + int i; + int nargs = o->p3; + ereg *args = get_tmp_args(ctx, nargs); + for(i=0;iextra[i])); + ereg fun = LOAD_OBJ_METHOD(obj, o->p2); + STORE(dst, emit_dyn_call(ctx,fun,args,nargs,dst->t)); + } + break; + case HVIRTUAL: + // code for : if( (fun=hl_vfields(o)[f]) ) dst = fun(o->value,args...); else dst = hl_dyn_call_obj(o->value,ft,field,args,&ret) + { + vreg *_o = R(o->extra[0]); + ereg obj = LOAD(_o); + ereg fun = LOAD_MEM_PTR(obj,sizeof(vvirtual)+HL_WSIZE*o->p2); + emit_test(ctx, fun, OJNull); + int jidx = emit_jump(ctx, true); + + int nargs = o->p3; + ereg *args = get_tmp_args(ctx, nargs); + int i; + args[0] = LOAD_MEM_PTR(obj,HL_WSIZE); + for(i=1;iextra[i])); + ereg v1 = emit_dyn_call(ctx,fun,args,nargs,dst->t); + STORE(dst, v1); + + int jend = emit_jump(ctx, false); + patch_jump(ctx, jidx); + + nargs = o->p3 - 1; + ereg eargs = nargs == 0 ? LOAD_CONST_PTR(NULL) : emit_gen_size(ctx, ALLOC_STACK, nargs * HL_WSIZE); + for(i=0;iextra[i+1]); + if( hl_is_ptr(r->t) ) + STORE_MEM(eargs,i*HL_WSIZE,LOAD(r)); + else + STORE_MEM(eargs,i*HL_WSIZE,emit_gen(ctx, ADDRESS, LOAD(r), UNUSED, M_PTR)); + } + bool need_dyn = !hl_is_ptr(dst->t) && dst->t->kind != HVOID; + ereg edyn = need_dyn ? 
emit_gen_size(ctx, ALLOC_STACK, sizeof(vdynamic)) : LOAD_CONST_PTR(NULL); + + args = get_tmp_args(ctx, 5); + args[0] = LOAD_MEM_PTR(obj,HL_WSIZE); + args[1] = LOAD_CONST_PTR(_o->t->virt->fields[o->p2].t); + args[2] = LOAD_CONST(_o->t->virt->fields[o->p2].hashed_name,&hlt_i32); + args[3] = eargs; + args[4] = edyn; + + ereg v2 = emit_native_call(ctx, hl_dyn_call_obj, args, 5, &hlt_bytes); + if( need_dyn ) + STORE(dst, LOAD_MEM(edyn,HDYN_VALUE,dst->t)); + else + STORE(dst, v2); + patch_jump(ctx, jend); + } + break; + default: + jit_assert(); + break; + } + } + break; + case OThrow: + case ORethrow: + { + ereg arg = LOAD(dst); + emit_native_call(ctx, o->op == OThrow ? hl_throw : hl_rethrow, &arg, 1, NULL); + } + break; + case OLabel: + split_block(ctx); + prepare_loop_block(ctx); + break; + case OGetI8: + case OGetI16: + case OGetMem: + { + hl_type *size_t = o->op == OGetI8 ? &hlt_ui8 : o->op == OGetI16 ? &hlt_ui16 : dst->t; + ereg offs = OFFSET(LOAD(ra),LOAD(rb),1,0); + ereg val = emit_load_mem(ctx, offs, 0, size_t, dst->t); + STORE(dst, val); + } + break; + case OSetI8: + case OSetI16: + case OSetMem: + { + ereg offs = OFFSET(LOAD(dst), LOAD(ra),1,0); + ereg val = LOAD(rb); + STORE_MEM(offs, 0, val); + if( o->op != OSetMem ) patch_instr_mode(ctx, o->op == OSetI8 ? 
M_UI8 : M_UI16); + } + break; + case OType: + STORE(dst, LOAD_CONST_PTR(m->code->types + o->p2)); + break; + case OGetType: + { + ereg r = LOAD(ra); + emit_test(ctx, r, OJNotNull); + int jidx = emit_jump(ctx, true); + ereg v1 = LOAD_CONST_PTR(&hlt_void); + STORE(dst,v1); + int jend = emit_jump(ctx, false); + patch_jump(ctx, jidx); + ereg v2 = LOAD_MEM_PTR(r,0); + STORE(dst,v2); + patch_jump(ctx, jend); + } + break; + case OGetArray: + { + if( ra->t->kind == HABSTRACT ) { + int osize; + bool isPtr = dst->t->kind != HOBJ && dst->t->kind != HSTRUCT; + if( isPtr ) + osize = HL_WSIZE; // a pointer into the carray + else { + hl_runtime_obj *rt = hl_get_obj_rt(dst->t); + osize = rt->size; // a mem offset into it + } + ereg pos = (osize <= 8 && ((osize - 1) & osize) == 0) ? OFFSET(LOAD(ra), LOAD(rb), osize, 0) : OFFSET(LOAD(ra), emit_gen_ext(ctx,BINOP,LOAD(rb),MK_CONST(osize),M_I32,OMul),1,0); + ereg val = isPtr ? LOAD_MEM_PTR(pos,0) : pos; + STORE(dst, val); + } else { + ereg pos = OFFSET(LOAD(ra), LOAD(rb), hl_type_size(dst->t), sizeof(varray)); + STORE(dst, LOAD_MEM(pos,0,dst->t)); + } + } + break; + case OSetArray: + { + if( dst->t->kind == HABSTRACT ) { + int osize; + bool isPtr = rb->t->kind != HOBJ && rb->t->kind != HSTRUCT; + if( isPtr) { + osize = HL_WSIZE; + } else { + hl_runtime_obj *rt = hl_get_obj_rt(rb->t); + osize = rt->size; + } + ereg pos = (osize <= 8 && ((osize - 1) & osize) == 0) ? 
OFFSET(LOAD(dst), LOAD(ra), osize, 0) : OFFSET(LOAD(dst), emit_gen_ext(ctx,BINOP,LOAD(ra),MK_CONST(osize),M_I32,OMul),1,0); + emit_store_size(ctx, pos, 0, LOAD(rb), 0, osize); + } else { + ereg pos = OFFSET(LOAD(dst), LOAD(ra), hl_type_size(dst->t), sizeof(varray)); + STORE_MEM(pos, 0, LOAD(rb)); + } + } + break; + case OArraySize: + STORE(dst, LOAD_MEM(LOAD(ra),HL_WSIZE*2,&hlt_i32)); + break; + case ORef: + STORE(dst, emit_gen(ctx, ADDRESS, LOAD(ra), UNUSED, M_PTR)); + break; + case OUnref: + STORE(dst, LOAD_MEM(LOAD(ra),0,dst->t)); + break; + case OSetref: + STORE_MEM(LOAD(dst),0,LOAD(ra)); + break; + case ORefData: + switch( ra->t->kind ) { + case HARRAY: + STORE(dst, OFFSET(LOAD(ra),UNUSED,0,sizeof(varray))); + break; + default: + jit_assert(); + } + break; + case ORefOffset: + STORE(dst, OFFSET(LOAD(ra),LOAD(rb), hl_type_size(dst->t->tparam),0)); + break; + case OToVirtual: + { + ereg args[2]; + args[0] = LOAD_CONST_PTR(dst->t); + args[1] = LOAD(ra); + STORE(dst, emit_native_call(ctx,hl_to_virtual,args,2, dst->t)); + } + break; + case OMakeEnum: + { + ereg args[2]; + args[0] = LOAD_CONST_PTR(dst->t); + args[1] = LOAD_CONST(o->p2,&hlt_i32); + ereg en = emit_native_call(ctx, hl_alloc_enum, args, 2, dst->t); + hl_enum_construct *c = &dst->t->tenum->constructs[o->p2]; + for(int i=0;inparams;i++) + STORE_MEM(en, c->offsets[i], LOAD(R(o->extra[i]))); + STORE(dst, en); + } + break; + case OEnumAlloc: + { + ereg args[2]; + args[0] = LOAD_CONST_PTR(dst->t); + args[1] = LOAD_CONST(o->p2,&hlt_i32); + STORE(dst, emit_native_call(ctx, hl_alloc_enum, args, 2, dst->t)); + } + break; + case OEnumField: + { + hl_enum_construct *c = &ra->t->tenum->constructs[o->p3]; + int slot = (int)(int_val)o->extra; + STORE(dst, LOAD_MEM(LOAD(ra),c->offsets[slot], dst->t)); + } + break; + case OEnumIndex: + STORE(dst, LOAD_MEM(LOAD(ra),HL_WSIZE,dst->t)); + break; + case OSetEnumField: + { + hl_enum_construct *c = &dst->t->tenum->constructs[0]; + STORE_MEM(LOAD(dst), c->offsets[o->p2], 
LOAD(rb)); + } + break; + case ONullCheck: + { + emit_test(ctx, LOAD(dst), OJNotNull); + add_jump_target(ctx, 0); + int jok = emit_jump(ctx, true); + + // ----- DETECT FIELD ACCESS ---------------- + hl_function *f = ctx->fun; + hl_opcode *next = f->ops + ctx->op_pos + 1; + bool null_field_access = false; + int hashed_name = 0; + // skip const and operation between nullcheck and access + while( (next < f->ops + f->nops - 1) && (next->op >= OInt && next->op <= ODecr) ) { + next++; + } + if( (next->op == OField && next->p2 == o->p1) || (next->op == OSetField && next->p1 == o->p1) ) { + int fid = next->op == OField ? next->p3 : next->p2; + hl_obj_field *f = NULL; + if( dst->t->kind == HOBJ || dst->t->kind == HSTRUCT ) + f = hl_obj_field_fetch(dst->t, fid); + else if( dst->t->kind == HVIRTUAL ) + f = dst->t->virt->fields + fid; + if( f == NULL ) jit_assert(); + null_field_access = true; + hashed_name = f->hashed_name; + } else if( (next->op >= OCall1 && next->op <= OCallN) && next->p3 == o->p1 ) { + int fid = next->p2 < 0 ? -1 : m->functions_indexes[next->p2]; + hl_function *cf = m->code->functions + fid; + const uchar *name = fun_field_name(cf); + null_field_access = true; + hashed_name = hl_hash_gen(name, true); + } + // ----------------------------------------- + if( null_field_access ) { + einstr *e = emit_instr(ctx, PUSH_CONST); + e->mode = M_PTR; + e->value = hashed_name; + } + emit_native_call(ctx, null_field_access ? (void*)hl_jit_null_field_access : (void*)hl_null_access, NULL, 0, NULL); + patch_jump(ctx, jok); + } + break; + case OSafeCast: + emit_dyn_cast(ctx, LOAD(ra), ra->t, dst); + break; + case ODynGet: + { + bool need_type = dyn_need_type(dst->t); + ereg args[3]; + args[0] = LOAD(ra); + args[1] = LOAD_CONST(hl_hash_utf8(m->code->strings[o->p3]),&hlt_i32); + if( need_type ) args[2] = LOAD_CONST_PTR(dst->t); + STORE(dst, emit_native_call(ctx, get_dynget(dst->t), args, need_type ? 
3 : 2, dst->t)); + } + break; + case ODynSet: + { + bool need_type = dyn_need_type(dst->t); + ereg args[4]; + args[0] = LOAD(dst); + args[1] = LOAD_CONST(hl_hash_utf8(m->code->strings[o->p2]),&hlt_i32); + if( need_type ) { + args[2] = LOAD_CONST_PTR(rb->t); + args[3] = LOAD(rb); + } else + args[2] = LOAD(rb); + emit_native_call(ctx, get_dynset(rb->t), args, need_type ? 4 : 3, &hlt_void); + } + break; + case OTrap: + { + ereg st = emit_gen_size(ctx, ALLOC_STACK, sizeof(hl_trap_ctx)); + + ereg thread, current_addr; + static hl_thread_info *tinf = NULL; + static hl_trap_ctx *trap = NULL; +# ifndef HL_THREADS + if( tinf == NULL ) tinf = hl_get_thread(); + current_addr = LOAD_CONST_PTR(&tinf->trap_current); +# else + thread = emit_native_call(ctx, hl_get_thread, NULL, 0, &hlt_bytes); + current_addr = OFFSET(thread, UNUSED, 0, (int)(int_val)&tinf->trap_current); +# endif + STORE_MEM(st, (int)(int_val)&trap->prev, LOAD_MEM_PTR(current_addr,0)); + STORE_MEM(current_addr, 0, st); + + + /* + trap E,@catch + catch g + catch g2 + ... + @:catch + + // Before haxe 5 + This is a bit hackshish : we want to detect the type of exception filtered by the catch so we check the following + sequence of HL opcodes: + + trap E,@catch + ... + @catch: + global R, _ + call _, ???(R,E) + + ??? is expected to be hl.BaseType.check + */ + hl_function *f = ctx->fun; + hl_opcode *cat = f->ops + ctx->op_pos + 1; + hl_opcode *next = f->ops + ctx->op_pos + 1 + o->p2; + hl_opcode *next2 = f->ops + ctx->op_pos + 2 + o->p2; + void *addr = NULL; + int offs = 0; + if( cat->op == OCatch || (next->op == OGetGlobal && next2->op == OCall2 && next2->p3 == next->p1 && dst->id == (int)(int_val)next2->extra) ) { + int gindex = cat->op == OCatch ? 
cat->p1 : next->p2; + hl_type *gt = m->code->globals[gindex]; + while( gt->kind == HOBJ && gt->obj->super ) gt = gt->obj->super; + if( gt->kind == HOBJ && gt->obj->nfields && gt->obj->fields[0].t->kind == HTYPE ) { + addr = m->globals_data; + offs = m->globals_indexes[gindex]; + } + } + STORE_MEM(st, (int)(int_val)&trap->tcheck, addr ? LOAD_MEM_PTR(LOAD_CONST_PTR(addr),offs) : LOAD_CONST_PTR(NULL)); + + void *fun = setjmp; + ereg args[2]; + int nargs = 1; + args[0] = st; +#if defined(HL_WIN) && defined(HL_64) + // On Win64 setjmp actually takes two arguments + // the jump buffer and the frame pointer (or the stack pointer if there is no FP) + nargs = 2; + args[1] = emit_gen(ctx,LEA,MK_STACK_REG(0),UNUSED,M_PTR); +#endif +#ifdef HL_MINGW + fun = _setjmp; +#endif + ereg ret = emit_native_call(ctx, fun, args, nargs, &hlt_i32); + emit_test(ctx, ret, OJNull); + int jskip = emit_jump(ctx, true); + STORE(dst, tinf ? LOAD_CONST_PTR(&tinf->exc_value) : LOAD_MEM_PTR(thread,(int)(int_val)&tinf->exc_value)); + + int jtrap = ctx->emit_pos; + emit_gen(ctx, JUMP, UNUSED, UNUSED, 0); + register_jump(ctx, jtrap, o->p2); + split_block(ctx); + patch_jump(ctx, jskip); + + if( ctx->trap_count == MAX_TRAPS ) jit_error("Too many try/catch depth"); + trap_inf *inf = &ctx->traps[ctx->trap_count++]; + inf->stack = st; + inf->target = o->p2 + 1 + ctx->op_pos; + } + break; + case OEndTrap: + { + if( ctx->trap_count == 0 ) jit_assert(); + ereg st = ctx->traps[ctx->trap_count - 1].stack; + + ereg thread, current_addr; + static hl_thread_info *tinf = NULL; + static hl_trap_ctx *trap = NULL; +# ifndef HL_THREADS + if( tinf == NULL ) tinf = hl_get_thread(); + current_addr = LOAD_CONST_PTR(&tinf->trap_current); +# else + thread = emit_native_call(ctx, hl_get_thread, NULL, 0, &hlt_bytes); + current_addr = OFFSET(thread, UNUSED, 0, (int)(int_val)&tinf->trap_current); +# endif + + STORE_MEM(current_addr, 0, LOAD_MEM_PTR(st,(int)(int_val)&trap->prev)); + + emit_instr(ctx, CATCH); + } + break; + case 
OSwitch: + { + ereg v = LOAD(dst); + int count = o->p2; + emit_cmp(ctx,v,LOAD_CONST(count,&hlt_i32),OJUGte); + add_jump_target(ctx, 0); + int jdefault = emit_jump(ctx, true); + int pos = ctx->emit_pos; + einstr *e = emit_instr(ctx, JUMP_TABLE); + e->a = v; + patch_instr_mode(ctx, M_NORET); + store_args(ctx,e,(ereg*)o->extra,count); + register_jump(ctx, pos, 0); + for(int k=0;kextra[k]; + if( offs < 0 ) jit_assert(); + if( offs == 0 ) continue; + add_jump_target(ctx, offs); + } + patch_jump(ctx, jdefault); + } + break; + case OGetTID: + STORE(dst, LOAD_MEM(LOAD(ra),0,&hlt_i32)); + break; + case OAssert: + emit_native_call(ctx, hl_jit_assert, NULL, 0, NULL); + break; + case ONop: + break; + case OPrefetch: + { + ereg r = LOAD(dst); + if( o->p2 > 0 ) { + switch( dst->t->kind ) { + case HOBJ: + case HSTRUCT: + { + hl_runtime_obj *rt = hl_get_obj_rt(dst->t); + r = OFFSET(r, UNUSED, 0, rt->fields_indexes[o->p2-1]); + } + break; + default: + jit_assert(); + break; + } + } + emit_gen_ext(ctx, PREFETCH, r, UNUSED, M_NONE, o->p3); + } + break; + case OAsm: + jit_assert(); + break; + case OCatch: + // Only used by OTrap typing + break; + default: + jit_error(hl_op_name(o->op)); + break; + } +} diff --git a/src/jit_old.c b/src/jit_old.c new file mode 100644 index 000000000..7e4e6e88b --- /dev/null +++ b/src/jit_old.c @@ -0,0 +1,4730 @@ +/* + * Copyright (C)2015-2016 Haxe Foundation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ +#ifdef _MSC_VER +#pragma warning(disable:4820) +#endif +#include +#include +#include "hlsystem.h" + +#ifdef __arm__ +# error "JIT does not support ARM processors, only x86 and x86-64 are supported, please use HashLink/C native compilation instead" +#endif + +#ifdef HL_DEBUG +# define JIT_DEBUG +#endif + +typedef enum { + Eax = 0, + Ecx = 1, + Edx = 2, + Ebx = 3, + Esp = 4, + Ebp = 5, + Esi = 6, + Edi = 7, +#ifdef HL_64 + R8 = 8, + R9 = 9, + R10 = 10, + R11 = 11, + R12 = 12, + R13 = 13, + R14 = 14, + R15 = 15, +#endif + _LAST = 0xFF +} CpuReg; + +typedef enum { + MOV, + LEA, + PUSH, + ADD, + SUB, + IMUL, // only overflow flag changes compared to MUL + DIV, + IDIV, + CDQ, + CDQE, + POP, + RET, + CALL, + AND, + OR, + XOR, + CMP, + TEST, + NOP, + SHL, + SHR, + SAR, + INC, + DEC, + JMP, + // FPU + FSTP, + FSTP32, + FLD, + FLD32, + FLDCW, + // SSE + MOVSD, + MOVSS, + COMISD, + COMISS, + ADDSD, + SUBSD, + MULSD, + DIVSD, + ADDSS, + SUBSS, + MULSS, + DIVSS, + XORPD, + CVTSI2SD, + CVTSI2SS, + CVTSD2SI, + CVTSD2SS, + CVTSS2SD, + CVTSS2SI, + STMXCSR, + LDMXCSR, + // 8-16 bits + MOV8, + CMP8, + TEST8, + PUSH8, + MOV16, + CMP16, + TEST16, + // prefetchs + PREFETCHT0, + PREFETCHT1, + PREFETCHT2, + PREFETCHNTA, + PREFETCHW, + // -- + _CPU_LAST +} CpuOp; + +#define JAlways 0 +#define JOverflow 0x80 +#define JULt 0x82 +#define JUGte 0x83 +#define JEq 0x84 +#define JNeq 0x85 +#define JULte 0x86 +#define JUGt 0x87 +#define JParity 0x8A +#define JNParity 0x8B +#define JSLt 0x8C +#define JSGte 
0x8D
+#define JSLte 0x8E
+#define JSGt 0x8F
+
+// Flag-named aliases for the condition codes above.
+// JCarry maps to JULt (0x82): on x86 "below" and "carry set" are the same condition.
+// It used to expand to JLt, which is not one of the condition codes defined above.
+#define JCarry JULt
+#define JZero JEq
+#define JNotZero JNeq
+
+// Store one byte / one unsigned int at the current output cursor and advance it
+#define B(bv) *ctx->buf.b++ = (unsigned char)(bv)
+#define W(wv) *ctx->buf.w++ = wv
+
+// Store one 64-bit word on HL_64 builds; otherwise identical to W()
+#ifdef HL_64
+# define W64(wv) *ctx->buf.w64++ = wv
+#else
+# define W64(wv) W(wv)
+#endif
+
+// Indexed by scale factor (1, 2, 4, 8); the value is what SIB() shifts into bits 6-7.
+// -1 marks factors that have no encoding.
+static const int SIB_MULT[] = {-1, 0, 1, -1, 2, -1, -1, -1, 3};
+
+// Emit one byte laid out as mod:2 | reg:3 | rm:3 (register ids are masked to 3 bits)
+#define MOD_RM(mod,reg,rm) B(((mod) << 6) | (((reg)&7) << 3) | ((rm)&7))
+// Emit one byte laid out as scale:2 | index:3 | base:3, scale looked up in SIB_MULT
+#define SIB(mult,rmult,rbase) B((SIB_MULT[mult]<<6) | (((rmult)&7)<<3) | ((rbase)&7))
+// True when c fits in a signed 8-bit value
+#define IS_SBYTE(c) ( (c) >= -128 && (c) < 128 )
+
+// Emit a jump followed by a W(0) placeholder: 0xE9 for JAlways, otherwise 0x0F <how>.
+// local receives the buffer position of the placeholder.
+#define AddJump(how,local) { if( (how) == JAlways ) { B(0xE9); } else { B(0x0F); B(how); }; local = BUF_POS(); W(0); }
+// Emit a jump followed by a one-byte placeholder: 0xEB for JAlways, otherwise <how> - 0x10.
+// The position stored in local is tagged with 0x40000000
+// (presumably so the code that patches it can tell the two forms apart - TODO confirm).
+#define AddJump_small(how,local) { if( (how) == JAlways ) { B(0xEB); } else B(how - 0x10); local = BUF_POS() | 0x40000000; B(0); }
+#define XJump(how,local) AddJump(how,local)
+#define XJump_small(how,local) AddJump_small(how,local)
+
+#define MAX_OP_SIZE 256
+
+// Number of bytes emitted so far
+#define BUF_POS() ((int)(ctx->buf.b - ctx->startBuf))
+#define RTYPE(r) r->t->kind
+
+#ifdef HL_64
+# define RESERVE_ADDRESS 0x8000000000000000
+#else
+# define RESERVE_ADDRESS 0x80000000
+#endif
+
+#if defined(HL_WIN_CALL) && defined(HL_64)
+# define IS_WINCALL64 1
+#else
+# define IS_WINCALL64 0
+#endif
+
+// Singly linked list of (pos, target) pairs
+typedef struct jlist jlist;
+struct jlist {
+	int pos;
+	int target;
+	jlist *next;
+};
+
+typedef struct vreg vreg;
+
+// Discriminator stored in preg.kind
+typedef enum {
+	RCPU = 0,
+	RFPU = 1,
+	RSTACK = 2,
+	RCONST = 3,
+	RADDR = 4,
+	RMEM = 5,
+	RUNUSED = 6,
+	RCPU_CALL = 1 | 8,
+	RCPU_8BITS = 1 | 16
+} preg_kind;
+
+// An operand: a kind plus an id whose meaning depends on the kind
+typedef struct {
+	preg_kind kind;
+	int id;
+	int lock;
+	vreg *holds;
+} preg;
+
+// A virtual register: its type, its stack slot and the preg currently bound to it (if any)
+struct vreg {
+	int stackPos;
+	int size;
+	hl_type *t;
+	preg *current;
+	preg stack;
+};
+
+#define REG_AT(i) (ctx->pregs + (i))
+
+// Register counts and scratch/call register tables, selected by word size and calling convention
+#ifdef HL_64
+# define RCPU_COUNT 16
+# define RFPU_COUNT 16
+# ifdef HL_WIN_CALL
+# define CALL_NREGS 4
+# define RCPU_SCRATCH_COUNT 7
+# define RFPU_SCRATCH_COUNT 6
+static const int RCPU_SCRATCH_REGS[] = { Eax, Ecx, Edx, R8,
R9, R10, R11 }; +static const CpuReg CALL_REGS[] = { Ecx, Edx, R8, R9 }; +# else +# define CALL_NREGS 6 // TODO : XMM6+XMM7 are FPU reg parameters +# define RCPU_SCRATCH_COUNT 9 +# define RFPU_SCRATCH_COUNT 16 +static const int RCPU_SCRATCH_REGS[] = { Eax, Ecx, Edx, Esi, Edi, R8, R9, R10, R11 }; +static const CpuReg CALL_REGS[] = { Edi, Esi, Edx, Ecx, R8, R9 }; +# endif +#else +# define CALL_NREGS 0 +# define RCPU_COUNT 8 +# define RFPU_COUNT 8 +# define RCPU_SCRATCH_COUNT 3 +# define RFPU_SCRATCH_COUNT 8 +static const int RCPU_SCRATCH_REGS[] = { Eax, Ecx, Edx }; +#endif + +#define XMM(i) ((i) + RCPU_COUNT) +#define PXMM(i) REG_AT(XMM(i)) +#define REG_IS_FPU(i) ((i) >= RCPU_COUNT) + +#define PEAX REG_AT(Eax) +#define PESP REG_AT(Esp) +#define PEBP REG_AT(Ebp) + +#define REG_COUNT (RCPU_COUNT + RFPU_COUNT) + +#define ID2(a,b) ((a) | ((b)<<8)) +#define R(id) (ctx->vregs + (id)) +#define ASSERT(i) { printf("JIT ERROR %d (jit.c line %d)\n",i,(int)__LINE__); jit_exit(); } +#define IS_FLOAT(r) ((r)->t->kind == HF64 || (r)->t->kind == HF32) +#define RLOCK(r) if( (r)->lock < ctx->currentPos ) (r)->lock = ctx->currentPos +#define RUNLOCK(r) if( (r)->lock == ctx->currentPos ) (r)->lock = 0 + +#define BREAK() B(0xCC) + +static preg _unused = { RUNUSED, 0, 0, NULL }; +static preg *UNUSED = &_unused; + +struct _jit_ctx { + union { + unsigned char *b; + unsigned int *w; + unsigned long long *w64; + int *i; + double *d; + } buf; + vreg *vregs; + preg pregs[REG_COUNT]; + vreg *savedRegs[REG_COUNT]; + int savedLocks[REG_COUNT]; + int *opsPos; + int maxRegs; + int maxOps; + int bufSize; + int totalRegsSize; + int functionPos; + int allocOffset; + int currentPos; + int nativeArgsCount; + unsigned char *startBuf; + hl_module *m; + hl_function *f; + jlist *jumps; + jlist *calls; + jlist *switchs; + hl_alloc falloc; // cleared per-function + hl_alloc galloc; + vclosure *closure_list; + hl_debug_infos *debug; + int c2hl; + int hl2c; + void *static_functions[8]; + bool 
static_function_offset; +#ifdef WIN64_UNWIND_TABLES + int unwind_offset; + int nunwind; + PRUNTIME_FUNCTION unwind_table; +#endif +}; + +#ifdef WIN64_UNWIND_TABLES + +typedef enum _UNWIND_OP_CODES +{ + UWOP_PUSH_NONVOL = 0, /* info == register number */ + UWOP_ALLOC_LARGE, /* no info, alloc size in next 2 slots */ + UWOP_ALLOC_SMALL, /* info == size of allocation / 8 - 1 */ + UWOP_SET_FPREG, /* no info, FP = RSP + UNWIND_INFO.FPRegOffset*16 */ + UWOP_SAVE_NONVOL, /* info == register number, offset in next slot */ + UWOP_SAVE_NONVOL_FAR, /* info == register number, offset in next 2 slots */ + UWOP_SAVE_XMM128 = 8, /* info == XMM reg number, offset in next slot */ + UWOP_SAVE_XMM128_FAR, /* info == XMM reg number, offset in next 2 slots */ + UWOP_PUSH_MACHFRAME /* info == 0: no error-code, 1: error-code */ +} UNWIND_CODE_OPS; + +void write_uwcode(jit_ctx *ctx, unsigned char offset, UNWIND_CODE_OPS code, unsigned char info) +{ + B(offset); + B((code) | (info) << 4); +} + +void write_unwind_data(jit_ctx *ctx) +{ + // All generated functions use a frame pointer, so the same unwind info can be used for all of them + unsigned char version = 1; + unsigned char flags = 0; + unsigned char CountOfCodes = 2; + unsigned char SizeOfProlog = 4; + unsigned char FrameRegister = 5; // RBP + unsigned char FrameOffset = 0; + B((version) | (flags) << 3); + B(SizeOfProlog); + B(CountOfCodes); + B((FrameRegister) | (FrameOffset) << 4); + write_uwcode(ctx, 4, UWOP_SET_FPREG, 0); + write_uwcode(ctx, 1, UWOP_PUSH_NONVOL, 5); +} +#endif + +#define jit_exit() { hl_debug_break(); exit(-1); } +#define jit_error(msg) _jit_error(ctx,msg,__LINE__) + +#ifndef HL_64 +# ifdef HL_DEBUG +# define error_i64() jit_error("i64-32") +# else +void error_i64() { + printf("The module you are loading is using 64 bit ints that are not supported by the HL32.\nPlease run using HL64 or compile with -D hl-legacy32"); + jit_exit(); +} +# endif +#endif + +static void _jit_error( jit_ctx *ctx, const char *msg, int line 
);
+static void on_jit_error( const char *msg, int_val line );
+
+// Fill r as an RMEM operand made of a register and an offset.
+// id layout: low nibble 0, reg in bits 4-7, offset from bit 8 up.
+static preg *pmem( preg *r, CpuReg reg, int offset ) {
+	r->kind = RMEM;
+	r->id = 0 | (reg << 4) | (offset << 8);
+	return r;
+}
+
+// Fill r as an RMEM operand made of two registers, a multiplier and an offset.
+// id layout: mult in the low nibble, reg in bits 4-7, reg2 from bit 8 up;
+// the offset does not fit in id and is carried in r->holds instead.
+static preg *pmem2( preg *r, CpuReg reg, CpuReg reg2, int mult, int offset ) {
+	r->kind = RMEM;
+	r->id = mult | (reg << 4) | (reg2 << 8);
+	r->holds = (void*)(int_val)offset;
+	return r;
+}
+
+#ifdef HL_64
+// Fill r as an RMEM operand whose low nibble is the special value 15, with offset from bit 4 up
+static preg *pcodeaddr( preg *r, int offset ) {
+	r->kind = RMEM;
+	r->id = 15 | (offset << 4);
+	return r;
+}
+#endif
+
+// Fill r as a constant operand holding the int c
+static preg *pconst( preg *r, int c ) {
+	r->kind = RCONST;
+	r->holds = NULL;
+	r->id = c;
+	return r;
+}
+
+// Fill r as a constant operand holding the word-sized value c.
+// On HL_64, a value that does not survive a round trip through int is stored in r->holds,
+// and id is set to the marker 0xC064C064; everything else goes through pconst().
+static preg *pconst64( preg *r, int_val c ) {
+#ifdef HL_64
+	if( ((int)c) == c )
+		return pconst(r,(int)c);
+	r->kind = RCONST;
+	r->id = 0xC064C064;
+	r->holds = (vreg*)c;
+	return r;
+#else
+	return pconst(r,(int)c);
+#endif
+}
+
+#ifndef HL_64
+// it is not possible to access direct 64 bit address in x86-64
+static preg *paddr( preg *r, void *p ) {
+	r->kind = RADDR;
+	r->holds = (vreg*)p;
+	return r;
+}
+#endif
+
+// Snapshot, for every physical register, which vreg it holds and its lock value.
+// savedRegs/savedLocks are declared with REG_COUNT entries, which is the loop bound restored here.
+static void save_regs( jit_ctx *ctx ) {
+	int i;
+	for(i=0;i<REG_COUNT;i++) {
+		ctx->savedRegs[i] = ctx->pregs[i].holds;
+		ctx->savedLocks[i] = ctx->pregs[i].lock;
+	}
+}
+
+// Undo everything since the last save_regs(): first detach every vreg from its physical
+// register, then re-establish the saved holds/lock values and the vreg->current back links.
+static void restore_regs( jit_ctx *ctx ) {
+	int i;
+	for(i=0;i<ctx->maxRegs;i++)
+		ctx->vregs[i].current = NULL;
+	for(i=0;i<REG_COUNT;i++) {
+		vreg *r = ctx->savedRegs[i];
+		preg *p = ctx->pregs + i;
+		p->holds = r;
+		p->lock = ctx->savedLocks[i];
+		if( r ) r->current = p;
+	}
+}
+
+// Make sure at least MAX_OP_SIZE bytes are available after the output cursor.
+// When they are not, a larger buffer is malloc'ed and the bytes emitted so far are copied over.
+static void jit_buf( jit_ctx *ctx ) {
+	if( BUF_POS() > ctx->bufSize - MAX_OP_SIZE ) {
+		// grow by one third
+		int nsize = ctx->bufSize * 4 / 3;
+		unsigned char *nbuf;
+		int curpos;
+		if( nsize == 0 ) {
+			// first allocation: 4 bytes per opcode of every function in the module
+			int i;
+			for(i=0;i<ctx->m->code->nfunctions;i++)
+				nsize += ctx->m->code->functions[i].nops;
+			nsize *= 4;
+		}
+		// always gain at least 4 * MAX_OP_SIZE bytes
+		if( nsize < ctx->bufSize + MAX_OP_SIZE * 4 ) nsize = ctx->bufSize + MAX_OP_SIZE * 4;
+		curpos = BUF_POS();
+		nbuf = (unsigned char*)malloc(nsize);
+		if( nbuf == NULL ) ASSERT(nsize);
+		if( ctx->startBuf ) {
+			memcpy(nbuf,ctx->startBuf,curpos);
+			
free(ctx->startBuf); + } + ctx->startBuf = nbuf; + ctx->buf.b = nbuf + curpos; + ctx->bufSize = nsize; + } +} + +static const char *KNAMES[] = { "cpu","fpu","stack","const","addr","mem","unused" }; +#define ERRIF(c) if( c ) { printf("%s(%s,%s)\n",f?f->name:"???",KNAMES[a->kind], KNAMES[b->kind]); ASSERT(0); } + +typedef struct { + const char *name; // single operand + int r_mem; // r32 / r/m32 r32 + int mem_r; // r/m32 / r32 r/m32 + int r_const; // r32 / imm32 imm32 + int r_i8; // r32 / imm8 imm8 + int mem_const; // r/m32 / imm32 N/A +} opform; + +#define FLAG_LONGOP 0x80000000 +#define FLAG_16B 0x40000000 +#define FLAG_8B 0x20000000 +#define FLAG_DUAL 0x10000000 + +#define RM(op,id) ((op) | (((id)+1)<<8)) +#define GET_RM(op) (((op) >> ((op) < 0 ? 24 : 8)) & 15) +#define SBYTE(op) ((op) << 16) +#define LONG_OP(op) ((op) | FLAG_LONGOP) +#define OP16(op) LONG_OP((op) | FLAG_16B) +#define LONG_RM(op,id) LONG_OP(op | (((id) + 1) << 24)) + +static opform OP_FORMS[_CPU_LAST] = { + { "MOV", 0x8B, 0x89, 0xB8, 0, RM(0xC7,0) }, + { "LEA", 0x8D }, + { "PUSH", 0x50, RM(0xFF,6), 0x68, 0x6A }, + { "ADD", 0x03, 0x01, RM(0x81,0), RM(0x83,0) }, + { "SUB", 0x2B, 0x29, RM(0x81,5), RM(0x83,5) }, + { "IMUL", LONG_OP(0x0FAF), 0, 0x69 | FLAG_DUAL, 0x6B | FLAG_DUAL }, + { "DIV", RM(0xF7,6), RM(0xF7,6) }, + { "IDIV", RM(0xF7,7), RM(0xF7,7) }, + { "CDQ", 0x99 }, + { "CDQE", 0x98 }, + { "POP", 0x58, RM(0x8F,0) }, + { "RET", 0xC3 }, + { "CALL", RM(0xFF,2), RM(0xFF,2), 0xE8 }, + { "AND", 0x23, 0x21, RM(0x81,4), RM(0x83,4) }, + { "OR", 0x0B, 0x09, RM(0x81,1), RM(0x83,1) }, + { "XOR", 0x33, 0x31, RM(0x81,6), RM(0x83,6) }, + { "CMP", 0x3B, 0x39, RM(0x81,7), RM(0x83,7) }, + { "TEST", 0x85, 0x85/*SWP?*/, RM(0xF7,0) }, + { "NOP", 0x90 }, + { "SHL", RM(0xD3,4), 0, 0, RM(0xC1,4) }, + { "SHR", RM(0xD3,5), 0, 0, RM(0xC1,5) }, + { "SAR", RM(0xD3,7), 0, 0, RM(0xC1,7) }, + { "INC", IS_64 ? RM(0xFF,0) : 0x40, RM(0xFF,0) }, + { "DEC", IS_64 ? 
RM(0xFF,1) : 0x48, RM(0xFF,1) }, + { "JMP", RM(0xFF,4) }, + // FPU + { "FSTP", 0, RM(0xDD,3) }, + { "FSTP32", 0, RM(0xD9,3) }, + { "FLD", 0, RM(0xDD,0) }, + { "FLD32", 0, RM(0xD9,0) }, + { "FLDCW", 0, RM(0xD9, 5) }, + // SSE + { "MOVSD", 0xF20F10, 0xF20F11 }, + { "MOVSS", 0xF30F10, 0xF30F11 }, + { "COMISD", 0x660F2F }, + { "COMISS", LONG_OP(0x0F2F) }, + { "ADDSD", 0xF20F58 }, + { "SUBSD", 0xF20F5C }, + { "MULSD", 0xF20F59 }, + { "DIVSD", 0xF20F5E }, + { "ADDSS", 0xF30F58 }, + { "SUBSS", 0xF30F5C }, + { "MULSS", 0xF30F59 }, + { "DIVSS", 0xF30F5E }, + { "XORPD", 0x660F57 }, + { "CVTSI2SD", 0xF20F2A }, + { "CVTSI2SS", 0xF30F2A }, + { "CVTSD2SI", 0xF20F2D }, + { "CVTSD2SS", 0xF20F5A }, + { "CVTSS2SD", 0xF30F5A }, + { "CVTSS2SI", 0xF30F2D }, + { "STMXCSR", 0, LONG_RM(0x0FAE,3) }, + { "LDMXCSR", 0, LONG_RM(0x0FAE,2) }, + // 8 bits, + { "MOV8", 0x8A, 0x88, 0, 0xB0, RM(0xC6,0) }, + { "CMP8", 0x3A, 0x38, 0, RM(0x80,7) }, + { "TEST8", 0x84, 0x84, RM(0xF6,0) }, + { "PUSH8", 0, 0, 0x6A | FLAG_8B }, + { "MOV16", OP16(0x8B), OP16(0x89), OP16(0xB8) }, + { "CMP16", OP16(0x3B), OP16(0x39) }, + { "TEST16", OP16(0x85) }, + // prefetchs + { "PREFETCHT0", 0, LONG_RM(0x0F18,1) }, + { "PREFETCHT1", 0, LONG_RM(0x0F18,2) }, + { "PREFETCHT2", 0, LONG_RM(0x0F18,3) }, + { "PREFETCHNTA", 0, LONG_RM(0x0F18,0) }, + { "PREFETCHW", 0, LONG_RM(0x0F0D,1) }, +}; + +#ifdef HL_64 +# define REX() if( r64 ) B(r64 | 0x40) +#else +# define REX() +#endif + +#define OP(b) \ + if( (b) & 0xFF0000 ) { \ + B((b)>>16); \ + if( r64 ) B(r64 | 0x40); /* also in 32 bits mode */ \ + B((b)>>8); \ + B(b); \ + } else { \ + if( (b) & FLAG_16B ) { \ + B(0x66); \ + REX(); \ + } else {\ + REX(); \ + if( (b) & FLAG_LONGOP ) B((b)>>8); \ + }\ + B(b); \ + } + +static bool is_reg8( preg *a ) { + return a->kind == RSTACK || a->kind == RMEM || a->kind == RCONST || (a->kind == RCPU && a->id != Esi && a->id != Edi); +} + +static void op( jit_ctx *ctx, CpuOp o, preg *a, preg *b, bool mode64 ) { + opform *f = &OP_FORMS[o]; + int r64 = 
mode64 && (o != PUSH && o != POP && o != CALL && o != PUSH8 && o < PREFETCHT0) ? 8 : 0; + switch( o ) { + case CMP8: + case TEST8: + case MOV8: + if( !is_reg8(a) || !is_reg8(b) ) + ASSERT(0); + break; + default: + break; + } + switch( ID2(a->kind,b->kind) ) { + case ID2(RUNUSED,RUNUSED): + ERRIF(f->r_mem == 0); + OP(f->r_mem); + break; + case ID2(RCPU,RCPU): + case ID2(RFPU,RFPU): + ERRIF( f->r_mem == 0 ); + if( a->id > 7 ) r64 |= 4; + if( b->id > 7 ) r64 |= 1; + OP(f->r_mem); + MOD_RM(3,a->id,b->id); + break; + case ID2(RCPU,RFPU): + case ID2(RFPU,RCPU): + ERRIF( (f->r_mem>>16) == 0 ); + if( a->id > 7 ) r64 |= 4; + if( b->id > 7 ) r64 |= 1; + OP(f->r_mem); + MOD_RM(3,a->id,b->id); + break; + case ID2(RCPU,RUNUSED): + ERRIF( f->r_mem == 0 ); + if( a->id > 7 ) r64 |= 1; + if( GET_RM(f->r_mem) > 0 ) { + OP(f->r_mem); + MOD_RM(3, GET_RM(f->r_mem)-1, a->id); + } else + OP(f->r_mem + (a->id&7)); + break; + case ID2(RSTACK,RUNUSED): + ERRIF( f->mem_r == 0 || GET_RM(f->mem_r) == 0 ); + { + int stackPos = R(a->id)->stackPos; + OP(f->mem_r); + if( IS_SBYTE(stackPos) ) { + MOD_RM(1,GET_RM(f->mem_r)-1,Ebp); + B(stackPos); + } else { + MOD_RM(2,GET_RM(f->mem_r)-1,Ebp); + W(stackPos); + } + } + break; + case ID2(RCPU,RCONST): + ERRIF( f->r_const == 0 && f->r_i8 == 0 ); + if( a->id > 7 ) r64 |= 1; + { + int_val cval = b->holds ? 
(int_val)b->holds : b->id; + // short byte form + if( f->r_i8 && IS_SBYTE(cval) ) { + if( (f->r_i8&FLAG_DUAL) && a->id > 7 ) r64 |= 4; + OP(f->r_i8); + if( (f->r_i8&FLAG_DUAL) ) MOD_RM(3,a->id,a->id); else MOD_RM(3,GET_RM(f->r_i8)-1,a->id); + B((int)cval); + } else if( GET_RM(f->r_const) > 0 || (f->r_const&FLAG_DUAL) ) { + if( (f->r_i8&FLAG_DUAL) && a->id > 7 ) r64 |= 4; + OP(f->r_const&0xFF); + if( (f->r_i8&FLAG_DUAL) ) MOD_RM(3,a->id,a->id); else MOD_RM(3,GET_RM(f->r_const)-1,a->id); + if( mode64 && IS_64 && o == MOV ) W64(cval); else W((int)cval); + } else { + ERRIF( f->r_const == 0); + OP((f->r_const&0xFF) + (a->id&7)); + if( mode64 && IS_64 && o == MOV ) W64(cval); else W((int)cval); + } + } + break; + case ID2(RSTACK,RCPU): + case ID2(RSTACK,RFPU): + ERRIF( f->mem_r == 0 ); + if( b->id > 7 ) r64 |= 4; + { + int stackPos = R(a->id)->stackPos; + OP(f->mem_r); + if( IS_SBYTE(stackPos) ) { + MOD_RM(1,b->id,Ebp); + B(stackPos); + } else { + MOD_RM(2,b->id,Ebp); + W(stackPos); + } + } + break; + case ID2(RCPU,RSTACK): + case ID2(RFPU,RSTACK): + ERRIF( f->r_mem == 0 ); + if( a->id > 7 ) r64 |= 4; + { + int stackPos = R(b->id)->stackPos; + OP(f->r_mem); + if( IS_SBYTE(stackPos) ) { + MOD_RM(1,a->id,Ebp); + B(stackPos); + } else { + MOD_RM(2,a->id,Ebp); + W(stackPos); + } + } + break; + case ID2(RCONST,RUNUSED): + ERRIF( f->r_const == 0 ); + { + int_val cval = a->holds ? (int_val)a->holds : a->id; + OP(f->r_const); + if( f->r_const & FLAG_8B ) B((int)cval); else W((int)cval); + } + break; + case ID2(RMEM,RUNUSED): + ERRIF( f->mem_r == 0 ); + { + int mult = a->id & 0xF; + int regOrOffs = mult == 15 ? 
a->id >> 4 : a->id >> 8; + CpuReg reg = (a->id >> 4) & 0xF; + if( mult == 15 ) { + ERRIF(1); + } else if( mult == 0 ) { + if( reg > 7 ) r64 |= 1; + OP(f->mem_r); + if( regOrOffs == 0 && (reg&7) != Ebp ) { + MOD_RM(0,GET_RM(f->mem_r)-1,reg); + if( (reg&7) == Esp ) B(0x24); + } else if( IS_SBYTE(regOrOffs) ) { + MOD_RM(1,GET_RM(f->mem_r)-1,reg); + if( (reg&7) == Esp ) B(0x24); + B(regOrOffs); + } else { + MOD_RM(2,GET_RM(f->mem_r)-1,reg); + if( (reg&7) == Esp ) B(0x24); + W(regOrOffs); + } + } else { + // [eax + ebx * M] + ERRIF(1); + } + } + break; + case ID2(RCPU, RMEM): + case ID2(RFPU, RMEM): + ERRIF( f->r_mem == 0 ); + { + int mult = b->id & 0xF; + int regOrOffs = mult == 15 ? b->id >> 4 : b->id >> 8; + CpuReg reg = (b->id >> 4) & 0xF; + if( mult == 15 ) { + int pos; + if( a->id > 7 ) r64 |= 4; + OP(f->r_mem); + MOD_RM(0,a->id,5); + if( IS_64 ) { + // offset wrt current code + pos = BUF_POS() + 4; + W(regOrOffs - pos); + } else { + ERRIF(1); + } + } else if( mult == 0 ) { + if( a->id > 7 ) r64 |= 4; + if( reg > 7 ) r64 |= 1; + OP(f->r_mem); + if( regOrOffs == 0 && (reg&7) != Ebp ) { + MOD_RM(0,a->id,reg); + if( (reg&7) == Esp ) B(0x24); + } else if( IS_SBYTE(regOrOffs) ) { + MOD_RM(1,a->id,reg); + if( (reg&7) == Esp ) B(0x24); + B(regOrOffs); + } else { + MOD_RM(2,a->id,reg); + if( (reg&7) == Esp ) B(0x24); + W(regOrOffs); + } + } else { + int offset = (int)(int_val)b->holds; + if( a->id > 7 ) r64 |= 4; + if( reg > 7 ) r64 |= 1; + if( regOrOffs > 7 ) r64 |= 2; + OP(f->r_mem); + MOD_RM(offset == 0 ? 0 : IS_SBYTE(offset) ? 
1 : 2,a->id,4); + SIB(mult,regOrOffs,reg); + if( offset ) { + if( IS_SBYTE(offset) ) B(offset); else W(offset); + } + } + } + break; +# ifndef HL_64 + case ID2(RFPU,RADDR): +# endif + case ID2(RCPU,RADDR): + ERRIF( f->r_mem == 0 ); + if( a->id > 7 ) r64 |= 4; + OP(f->r_mem); + MOD_RM(0,a->id,5); + if( IS_64 ) + W64((int_val)b->holds); + else + W((int)(int_val)b->holds); + break; +# ifndef HL_64 + case ID2(RADDR,RFPU): +# endif + case ID2(RADDR,RCPU): + ERRIF( f->mem_r == 0 ); + if( b->id > 7 ) r64 |= 4; + OP(f->mem_r); + MOD_RM(0,b->id,5); + if( IS_64 ) + W64((int_val)a->holds); + else + W((int)(int_val)a->holds); + break; + case ID2(RMEM, RCPU): + case ID2(RMEM, RFPU): + ERRIF( f->mem_r == 0 ); + { + int mult = a->id & 0xF; + int regOrOffs = mult == 15 ? a->id >> 4 : a->id >> 8; + CpuReg reg = (a->id >> 4) & 0xF; + if( mult == 15 ) { + int pos; + if( b->id > 7 ) r64 |= 4; + OP(f->mem_r); + MOD_RM(0,b->id,5); + if( IS_64 ) { + // offset wrt current code + pos = BUF_POS() + 4; + W(regOrOffs - pos); + } else { + ERRIF(1); + } + } else if( mult == 0 ) { + if( b->id > 7 ) r64 |= 4; + if( reg > 7 ) r64 |= 1; + OP(f->mem_r); + if( regOrOffs == 0 && (reg&7) != Ebp ) { + MOD_RM(0,b->id,reg); + if( (reg&7) == Esp ) B(0x24); + } else if( IS_SBYTE(regOrOffs) ) { + MOD_RM(1,b->id,reg); + if( (reg&7) == Esp ) B(0x24); + B(regOrOffs); + } else { + MOD_RM(2,b->id,reg); + if( (reg&7) == Esp ) B(0x24); + W(regOrOffs); + } + } else { + int offset = (int)(int_val)a->holds; + if( b->id > 7 ) r64 |= 4; + if( reg > 7 ) r64 |= 1; + if( regOrOffs > 7 ) r64 |= 2; + OP(f->mem_r); + MOD_RM(offset == 0 ? 0 : IS_SBYTE(offset) ? 
1 : 2,b->id,4); + SIB(mult,regOrOffs,reg); + if( offset ) { + if( IS_SBYTE(offset) ) B(offset); else W(offset); + } + } + } + break; + default: + ERRIF(1); + } + if( ctx->debug && ctx->f && o == CALL ) { + preg p; + op(ctx,MOV,pmem(&p,Esp,-HL_WSIZE),PEBP,true); // erase EIP (clean stack report) + } +} + +static void op32( jit_ctx *ctx, CpuOp o, preg *a, preg *b ) { + op(ctx,o,a,b,false); +} + +static void op64( jit_ctx *ctx, CpuOp o, preg *a, preg *b ) { +#ifndef HL_64 + op(ctx,o,a,b,false); +#else + op(ctx,o,a,b,true); +#endif +} + +static void patch_jump( jit_ctx *ctx, int p ) { + if( p == 0 ) return; + if( p & 0x40000000 ) { + int d; + p &= 0x3FFFFFFF; + d = BUF_POS() - (p + 1); + if( d < -128 || d >= 128 ) ASSERT(d); + *(char*)(ctx->startBuf + p) = (char)d; + } else { + *(int*)(ctx->startBuf + p) = BUF_POS() - (p + 4); + } +} + +static void patch_jump_to( jit_ctx *ctx, int p, int target ) { + if( p == 0 ) return; + if( p & 0x40000000 ) { + int d; + p &= 0x3FFFFFFF; + d = target - (p + 1); + if( d < -128 || d >= 128 ) ASSERT(d); + *(char*)(ctx->startBuf + p) = (char)d; + } else { + *(int*)(ctx->startBuf + p) = target - (p + 4); + } +} + +static int stack_size( hl_type *t ) { + switch( t->kind ) { + case HUI8: + case HUI16: + case HBOOL: +# ifdef HL_64 + case HI32: + case HF32: +# endif + return sizeof(int_val); + case HI64: + default: + return hl_type_size(t); + } +} + +static int call_reg_index( int reg ) { +# ifdef HL_64 + int i; + for(i=0;ikind == RFPU ) + return p->id < CALL_NREGS; + for(i=0;ikind == RCPU && p->id == CALL_REGS[i] ) + return true; + return false; +# else + return false; +# endif +} + +static preg *alloc_reg( jit_ctx *ctx, preg_kind k ) { + int i; + preg *p; + switch( k ) { + case RCPU: + case RCPU_CALL: + case RCPU_8BITS: + { + int off = ctx->allocOffset++; + const int count = RCPU_SCRATCH_COUNT; + for(i=0;ipregs + r; + if( p->lock >= ctx->currentPos ) continue; + if( k == RCPU_CALL && is_call_reg(p) ) continue; + if( k == RCPU_8BITS && 
!is_reg8(p) ) continue; + if( p->holds == NULL ) { + RLOCK(p); + return p; + } + } + for(i=0;ipregs + RCPU_SCRATCH_REGS[(i + off)%count]; + if( p->lock >= ctx->currentPos ) continue; + if( k == RCPU_CALL && is_call_reg(p) ) continue; + if( k == RCPU_8BITS && !is_reg8(p) ) continue; + if( p->holds ) { + RLOCK(p); + p->holds->current = NULL; + p->holds = NULL; + return p; + } + } + } + break; + case RFPU: + { + int off = ctx->allocOffset++; + const int count = RFPU_SCRATCH_COUNT; + for(i=0;ilock >= ctx->currentPos ) continue; + if( p->holds == NULL ) { + RLOCK(p); + return p; + } + } + for(i=0;ilock >= ctx->currentPos ) continue; + if( p->holds ) { + RLOCK(p); + p->holds->current = NULL; + p->holds = NULL; + return p; + } + } + } + break; + default: + ASSERT(k); + } + ASSERT(0); // out of registers ! + return NULL; +} + +static preg *fetch( vreg *r ) { + if( r->current ) + return r->current; + return &r->stack; +} + +static void scratch( preg *r ) { + if( r && r->holds ) { + r->holds->current = NULL; + r->holds = NULL; + r->lock = 0; + } +} + +static preg *copy( jit_ctx *ctx, preg *to, preg *from, int size ); + +static void load( jit_ctx *ctx, preg *r, vreg *v ) { + preg *from = fetch(v); + if( from == r || v->size == 0 ) return; + if( r->holds ) r->holds->current = NULL; + if( v->current ) { + v->current->holds = NULL; + from = r; + } + r->holds = v; + v->current = r; + copy(ctx,r,from,v->size); +} + +static preg *alloc_fpu( jit_ctx *ctx, vreg *r, bool andLoad ) { + preg *p = fetch(r); + if( p->kind != RFPU ) { + if( !IS_FLOAT(r) && (IS_64 || r->t->kind != HI64) ) ASSERT(r->t->kind); + p = alloc_reg(ctx, RFPU); + if( andLoad ) + load(ctx,p,r); + else { + if( r->current ) + r->current->holds = NULL; + r->current = p; + p->holds = r; + } + } else + RLOCK(p); + return p; +} + +static void reg_bind( vreg *r, preg *p ) { + if( r->current ) + r->current->holds = NULL; + r->current = p; + p->holds = r; +} + +static preg *alloc_cpu( jit_ctx *ctx, vreg *r, bool andLoad ) { + 
preg *p = fetch(r); + if( p->kind != RCPU ) { +# ifndef HL_64 + if( r->t->kind == HI64 ) return alloc_fpu(ctx,r,andLoad); + if( r->size > 4 ) ASSERT(r->size); +# endif + p = alloc_reg(ctx, RCPU); + if( andLoad ) + load(ctx,p,r); + else + reg_bind(r,p); + } else + RLOCK(p); + return p; +} + +// allocate a register that is not a call parameter +static preg *alloc_cpu_call( jit_ctx *ctx, vreg *r ) { + preg *p = fetch(r); + if( p->kind != RCPU ) { +# ifndef HL_64 + if( r->t->kind == HI64 ) return alloc_fpu(ctx,r,true); + if( r->size > 4 ) ASSERT(r->size); +# endif + p = alloc_reg(ctx, RCPU_CALL); + load(ctx,p,r); + } else if( is_call_reg(p) ) { + preg *p2 = alloc_reg(ctx, RCPU_CALL); + op64(ctx,MOV,p2,p); + scratch(p); + reg_bind(r,p2); + return p2; + } else + RLOCK(p); + return p; +} + +static preg *fetch32( jit_ctx *ctx, vreg *r ) { + if( r->current ) + return r->current; + // make sure that the register is correctly erased + if( r->size < 4 ) { + preg *p = alloc_cpu(ctx, r, true); + RUNLOCK(p); + return p; + } + return fetch(r); +} + +// make sure higher bits are zeroes +static preg *alloc_cpu64( jit_ctx *ctx, vreg *r, bool andLoad ) { +# ifndef HL_64 + return alloc_cpu(ctx,r,andLoad); +# else + preg *p = fetch(r); + if( !andLoad ) ASSERT(0); + if( p->kind != RCPU ) { + p = alloc_reg(ctx, RCPU); + op64(ctx,XOR,p,p); + load(ctx,p,r); + } else { + // remove higher bits + preg tmp; + op64(ctx,SHL,p,pconst(&tmp,32)); + op64(ctx,SHR,p,pconst(&tmp,32)); + RLOCK(p); + } + return p; +# endif +} + +// make sure the register can be used with 8 bits access +static preg *alloc_cpu8( jit_ctx *ctx, vreg *r, bool andLoad ) { + preg *p = fetch(r); + if( p->kind != RCPU ) { + p = alloc_reg(ctx, RCPU_8BITS); + load(ctx,p,r); + } else if( !is_reg8(p) ) { + preg *p2 = alloc_reg(ctx, RCPU_8BITS); + op64(ctx,MOV,p2,p); + scratch(p); + reg_bind(r,p2); + return p2; + } else + RLOCK(p); + return p; +} + +static preg *copy( jit_ctx *ctx, preg *to, preg *from, int size ) { + if( size == 0 || 
to == from ) return to; + switch( ID2(to->kind,from->kind) ) { + case ID2(RMEM,RCPU): + case ID2(RSTACK,RCPU): + case ID2(RCPU,RSTACK): + case ID2(RCPU,RMEM): + case ID2(RCPU,RCPU): +# ifndef HL_64 + case ID2(RCPU,RADDR): + case ID2(RADDR,RCPU): +# endif + switch( size ) { + case 1: + if( to->kind == RCPU ) { + op64(ctx,XOR,to,to); + if( !is_reg8(to) ) { + preg p; + op32(ctx,MOV16,to,from); + op32(ctx,SHL,to,pconst(&p,24)); + op32(ctx,SHR,to,pconst(&p,24)); + break; + } + } + if( !is_reg8(from) ) { + preg *r = alloc_reg(ctx, RCPU_CALL); + op32(ctx, MOV, r, from); + RUNLOCK(r); + op32(ctx,MOV8,to,r); + return from; + } + op32(ctx,MOV8,to,from); + break; + case 2: + if( to->kind == RCPU ) + op64(ctx,XOR,to,to); + op32(ctx,MOV16,to,from); + break; + case 4: + op32(ctx,MOV,to,from); + break; + case 8: + if( IS_64 ) { + op64(ctx,MOV,to,from); + break; + } + default: + ASSERT(size); + } + return to->kind == RCPU ? to : from; + case ID2(RFPU,RFPU): + case ID2(RMEM,RFPU): + case ID2(RSTACK,RFPU): + case ID2(RFPU,RMEM): + case ID2(RFPU,RSTACK): + switch( size ) { + case 8: + op64(ctx,MOVSD,to,from); + break; + case 4: + op32(ctx,MOVSS,to,from); + break; + default: + ASSERT(size); + } + return to->kind == RFPU ? to : from; + case ID2(RMEM,RSTACK): + { + vreg *rfrom = R(from->id); + if( IS_FLOAT(rfrom) ) + return copy(ctx,to,alloc_fpu(ctx,rfrom,true),size); + return copy(ctx,to,alloc_cpu(ctx,rfrom,true),size); + } + case ID2(RMEM,RMEM): + case ID2(RSTACK,RMEM): + case ID2(RSTACK,RSTACK): +# ifndef HL_64 + case ID2(RMEM,RADDR): + case ID2(RSTACK,RADDR): + case ID2(RADDR,RSTACK): +# endif + { + preg *tmp; + if( (!IS_64 && size == 8) || (to->kind == RSTACK && IS_FLOAT(R(to->id))) || (from->kind == RSTACK && IS_FLOAT(R(from->id))) ) { + tmp = alloc_reg(ctx, RFPU); + op64(ctx,size == 8 ? 
MOVSD : MOVSS,tmp,from); + } else { + tmp = alloc_reg(ctx, RCPU); + copy(ctx,tmp,from,size); + } + return copy(ctx,to,tmp,size); + } +# ifdef HL_64 + case ID2(RCPU,RADDR): + case ID2(RMEM,RADDR): + case ID2(RSTACK,RADDR): + { + preg p; + preg *tmp = alloc_reg(ctx, RCPU); + op64(ctx,MOV,tmp,pconst64(&p,(int_val)from->holds)); + return copy(ctx,to,pmem(&p,tmp->id,0),size); + } + case ID2(RADDR,RCPU): + case ID2(RADDR,RMEM): + case ID2(RADDR,RSTACK): + { + preg p; + preg *tmp = alloc_reg(ctx, RCPU); + op64(ctx,MOV,tmp,pconst64(&p,(int_val)to->holds)); + return copy(ctx,pmem(&p,tmp->id,0),from,size); + } +# endif + default: + break; + } + printf("copy(%s,%s)\n",KNAMES[to->kind], KNAMES[from->kind]); + ASSERT(0); + return NULL; +} + +static void store( jit_ctx *ctx, vreg *r, preg *v, bool bind ) { + if( r->current && r->current != v ) { + r->current->holds = NULL; + r->current = NULL; + } + v = copy(ctx,&r->stack,v,r->size); + if( IS_FLOAT(r) != (v->kind == RFPU) ) + ASSERT(0); + if( bind && r->current != v && (v->kind == RCPU || v->kind == RFPU) ) { + scratch(v); + r->current = v; + v->holds = r; + } +} + +static void store_result( jit_ctx *ctx, vreg *r ) { +# ifndef HL_64 + switch( r->t->kind ) { + case HF64: + scratch(r->current); + op64(ctx,FSTP,&r->stack,UNUSED); + break; + case HF32: + scratch(r->current); + op64(ctx,FSTP32,&r->stack,UNUSED); + break; + case HI64: + scratch(r->current); + error_i64(); + break; + default: +# endif + store(ctx,r,IS_FLOAT(r) ? 
REG_AT(XMM(0)) : PEAX,true); +# ifndef HL_64 + break; + } +# endif +} + +static void op_mov( jit_ctx *ctx, vreg *to, vreg *from ) { + preg *r = fetch(from); +# ifndef HL_64 + if( to->t->kind == HI64 ) { + error_i64(); + return; + } +# endif + if( from->t->kind == HF32 && r->kind != RFPU ) + r = alloc_fpu(ctx,from,true); + store(ctx, to, r, true); +} + +static void copy_to( jit_ctx *ctx, vreg *to, preg *from ) { + store(ctx,to,from,true); +} + +static void copy_from( jit_ctx *ctx, preg *to, vreg *from ) { + copy(ctx,to,fetch(from),from->size); +} + +static void store_const( jit_ctx *ctx, vreg *r, int c ) { + preg p; + if( c == 0 ) + op(ctx,XOR,alloc_cpu(ctx,r,false),alloc_cpu(ctx,r,false),r->size == 8); + else if( r->size == 8 ) + op64(ctx,MOV,alloc_cpu(ctx,r,false),pconst64(&p,c)); + else + op32(ctx,MOV,alloc_cpu(ctx,r,false),pconst(&p,c)); + store(ctx,r,r->current,false); +} + +static void discard_regs( jit_ctx *ctx, bool native_call ) { + int i; + for(i=0;ipregs + RCPU_SCRATCH_REGS[i]; + if( r->holds ) { + r->holds->current = NULL; + r->holds = NULL; + } + } + for(i=0;ipregs + XMM(i); + if( r->holds ) { + r->holds->current = NULL; + r->holds = NULL; + } + } +} + +static int pad_before_call( jit_ctx *ctx, int size ) { + int total = size + ctx->totalRegsSize + HL_WSIZE * 2; // EIP+EBP + if( total & 15 ) { + int pad = 16 - (total & 15); + preg p; + if( pad ) op64(ctx,SUB,PESP,pconst(&p,pad)); + size += pad; + } + return size; +} + +static void push_reg( jit_ctx *ctx, vreg *r ) { + preg p; + switch( stack_size(r->t) ) { + case 1: + op64(ctx,SUB,PESP,pconst(&p,1)); + op32(ctx,MOV8,pmem(&p,Esp,0),alloc_cpu8(ctx,r,true)); + break; + case 2: + op64(ctx,SUB,PESP,pconst(&p,2)); + op32(ctx,MOV16,pmem(&p,Esp,0),alloc_cpu(ctx,r,true)); + break; + case 4: + if( r->size < 4 ) + alloc_cpu(ctx,r,true); // force fetch (higher bits set to 0) + if( !IS_64 ) { + if( r->current != NULL && r->current->kind == RFPU ) scratch(r->current); + op32(ctx,PUSH,fetch(r),UNUSED); + } else { + // 
pseudo push32 (not available) + op64(ctx,SUB,PESP,pconst(&p,4)); + op32(ctx,MOV,pmem(&p,Esp,0),alloc_cpu(ctx,r,true)); + } + break; + case 8: + if( fetch(r)->kind == RFPU ) { + op64(ctx,SUB,PESP,pconst(&p,8)); + op64(ctx,MOVSD,pmem(&p,Esp,0),fetch(r)); + } else if( IS_64 ) + op64(ctx,PUSH,fetch(r),UNUSED); + else if( r->stack.kind == RSTACK ) { + scratch(r->current); + r->stackPos += 4; + op32(ctx,PUSH,&r->stack,UNUSED); + r->stackPos -= 4; + op32(ctx,PUSH,&r->stack,UNUSED); + } else + ASSERT(0); + break; + default: + ASSERT(r->size); + } +} + +static int begin_native_call( jit_ctx *ctx, int nargs ) { + ctx->nativeArgsCount = nargs; + return pad_before_call(ctx, nargs > CALL_NREGS ? (nargs - CALL_NREGS) * HL_WSIZE : 0); +} + +static preg *alloc_native_arg( jit_ctx *ctx ) { +# ifdef HL_64 + int rid = ctx->nativeArgsCount - 1; + preg *r = rid < CALL_NREGS ? REG_AT(CALL_REGS[rid]) : alloc_reg(ctx,RCPU_CALL); + scratch(r); + return r; +# else + return alloc_reg(ctx, RCPU); +# endif +} + +static void set_native_arg( jit_ctx *ctx, preg *r ) { + if( r->kind == RSTACK ) { + vreg *v = ctx->vregs + r->id; + if( v->size < 4 ) + r = fetch32(ctx, v); + } +# ifdef HL_64 + if( r->kind == RFPU ) ASSERT(0); + int rid = --ctx->nativeArgsCount; + preg *target; + if( rid >= CALL_NREGS ) { + op64(ctx,PUSH,r,UNUSED); + return; + } + target = REG_AT(CALL_REGS[rid]); + if( target != r ) { + op64(ctx, MOV, target, r); + scratch(target); + } +# else + op32(ctx,PUSH,r,UNUSED); +# endif +} + +static void set_native_arg_fpu( jit_ctx *ctx, preg *r, bool isf32 ) { +# ifdef HL_64 + if( r->kind == RCPU ) ASSERT(0); + // can only be used if last argument !! + ctx->nativeArgsCount--; + preg *target = REG_AT(XMM(IS_WINCALL64 ? ctx->nativeArgsCount : 0)); + if( target != r ) { + op64(ctx, isf32 ? 
MOVSS : MOVSD, target, r); + scratch(target); + } +# else + op32(ctx,PUSH,r,UNUSED); +# endif +} + +typedef struct { + int nextCpu; + int nextFpu; + int mapped[REG_COUNT]; +} call_regs; + +static int select_call_reg( call_regs *regs, hl_type *t, int id ) { +# ifndef HL_64 + return -1; +#else + bool isFloat = t->kind == HF32 || t->kind == HF64; +# ifdef HL_WIN_CALL + int index = regs->nextCpu++; +# else + int index = isFloat ? regs->nextFpu++ : regs->nextCpu++; +# endif + if( index >= CALL_NREGS ) + return -1; + int reg = isFloat ? XMM(index) : CALL_REGS[index]; + regs->mapped[reg] = id + 1; + return reg; +#endif +} + +static int mapped_reg( call_regs *regs, int id ) { +# ifndef HL_64 + return -1; +#else + int i; + for(i=0;imapped[r] == id + 1 ) return r; + r = XMM(i); + if( regs->mapped[r] == id + 1 ) return r; + } + return -1; +#endif +} + +static int prepare_call_args( jit_ctx *ctx, int count, int *args, vreg *vregs, int extraSize ) { + int i; + int size = extraSize, paddedSize; + call_regs ctmp = {0}; + for(i=0;it, i); + if( cr >= 0 ) { + preg *c = REG_AT(cr); + preg *cur = fetch(r); + if( cur != c ) { + copy(ctx,c,cur,r->size); + scratch(c); + } + RLOCK(c); + continue; + } + size += stack_size(r->t); + } + paddedSize = pad_before_call(ctx,size); + for(i=0;i= 0 ) continue; + push_reg(ctx,r); + if( r->current ) RUNLOCK(r->current); + } + return paddedSize; +} + +static void op_call( jit_ctx *ctx, preg *r, int size ) { + preg p; +# ifdef JIT_DEBUG + if( IS_64 && size >= 0 ) { + int jchk; + op32(ctx,TEST,PESP,pconst(&p,15)); + XJump(JZero,jchk); + BREAK(); // unaligned ESP + patch_jump(ctx, jchk); + } +# endif + if( IS_WINCALL64 ) { + // MSVC requires 32bytes of free space here + op64(ctx,SUB,PESP,pconst(&p,32)); + if( size >= 0 ) size += 32; + } + op32(ctx, CALL, r, UNUSED); + if( size > 0 ) op64(ctx,ADD,PESP,pconst(&p,size)); +} + +static void call_native( jit_ctx *ctx, void *nativeFun, int size ) { + bool isExc = nativeFun == hl_assert || nativeFun == hl_throw 
|| nativeFun == on_jit_error; + preg p; + // native function, already resolved + op64(ctx,MOV,PEAX,pconst64(&p,(int_val)nativeFun)); + op_call(ctx,PEAX, isExc ? -1 : size); + if( isExc ) + return; + discard_regs(ctx, true); +} + +static void op_call_fun( jit_ctx *ctx, vreg *dst, int findex, int count, int *args ) { + int fid = findex < 0 ? -1 : ctx->m->functions_indexes[findex]; + bool isNative = fid >= ctx->m->code->nfunctions; + int size = prepare_call_args(ctx,count,args,ctx->vregs,0); + preg p; + if( fid < 0 ) { + ASSERT(fid); + } else if( isNative ) { + call_native(ctx,ctx->m->functions_ptrs[findex],size); + } else { + int cpos = BUF_POS() + (IS_WINCALL64 ? 4 : 0); +# ifdef JIT_DEBUG + if( IS_64 ) cpos += 13; // ESP CHECK +# endif + if( ctx->m->functions_ptrs[findex] ) { + // already compiled + op_call(ctx,pconst(&p,(int)(int_val)ctx->m->functions_ptrs[findex] - (cpos + 5)), size); + } else if( ctx->m->code->functions + fid == ctx->f ) { + // our current function + op_call(ctx,pconst(&p, ctx->functionPos - (cpos + 5)), size); + } else { + // stage for later + jlist *j = (jlist*)hl_malloc(&ctx->galloc,sizeof(jlist)); + j->pos = cpos; + j->target = findex; + j->next = ctx->calls; + ctx->calls = j; + op_call(ctx,pconst(&p,0), size); + } + discard_regs(ctx, false); + } + if( dst ) + store_result(ctx,dst); +} + +static void op_enter( jit_ctx *ctx ) { + preg p; + op64(ctx, PUSH, PEBP, UNUSED); + op64(ctx, MOV, PEBP, PESP); + if( ctx->totalRegsSize ) op64(ctx, SUB, PESP, pconst(&p,ctx->totalRegsSize)); +} + +static void op_ret( jit_ctx *ctx, vreg *r ) { + preg p; + switch( r->t->kind ) { + case HF32: +# ifdef HL_64 + op64(ctx, MOVSS, PXMM(0), fetch(r)); +# else + op64(ctx,FLD32,&r->stack,UNUSED); +# endif + break; + case HF64: +# ifdef HL_64 + op64(ctx, MOVSD, PXMM(0), fetch(r)); +# else + op64(ctx,FLD,&r->stack,UNUSED); +# endif + break; + default: + if( r->size < 4 && !r->current ) + fetch32(ctx, r); + if( r->current != PEAX ) + op64(ctx,MOV,PEAX,fetch(r)); + 
break; + } + if( ctx->totalRegsSize ) op64(ctx, ADD, PESP, pconst(&p, ctx->totalRegsSize)); +# ifdef JIT_DEBUG + { + int jeq; + op64(ctx, CMP, PESP, PEBP); + XJump_small(JEq,jeq); + jit_error("invalid ESP"); + patch_jump(ctx,jeq); + } +# endif + op64(ctx, POP, PEBP, UNUSED); + op64(ctx, RET, UNUSED, UNUSED); +} + +static void call_native_consts( jit_ctx *ctx, void *nativeFun, int_val *args, int nargs ) { + int size = pad_before_call(ctx, IS_64 ? 0 : HL_WSIZE*nargs); + preg p; + int i; +# ifdef HL_64 + for(i=0;i=0;i--) + op32(ctx, PUSH, pconst64(&p, args[i]), UNUSED); +# endif + call_native(ctx, nativeFun, size); +} + +static void on_jit_error( const char *msg, int_val line ) { + char buf[256]; + int iline = (int)line; + sprintf(buf,"%s (line %d)",msg,iline); +#ifdef HL_WIN_DESKTOP + MessageBoxA(NULL,buf,"JIT ERROR",MB_OK); +#else + printf("JIT ERROR : %s\n",buf); +#endif + hl_debug_break(); + hl_throw(NULL); +} + +static void _jit_error( jit_ctx *ctx, const char *msg, int line ) { + int_val args[2] = { (int_val)msg, (int_val)line }; + call_native_consts(ctx,on_jit_error,args,2); +} + + +static preg *op_binop( jit_ctx *ctx, vreg *dst, vreg *a, vreg *b, hl_op bop ) { + preg *pa = fetch(a), *pb = fetch(b), *out = NULL; + CpuOp o; + if( IS_FLOAT(a) ) { + bool isf32 = a->t->kind == HF32; + switch( bop ) { + case OAdd: o = isf32 ? ADDSS : ADDSD; break; + case OSub: o = isf32 ? SUBSS : SUBSD; break; + case OMul: o = isf32 ? MULSS : MULSD; break; + case OSDiv: o = isf32 ? DIVSS : DIVSD; break; + case OJSLt: + case OJSGte: + case OJSLte: + case OJSGt: + case OJEq: + case OJNotEq: + case OJNotLt: + case OJNotGte: + o = isf32 ? 
COMISS : COMISD; + break; + case OSMod: + { + int args[] = { a->stack.id, b->stack.id }; + int size = prepare_call_args(ctx,2,args,ctx->vregs,0); + void *mod_fun; + if( isf32 ) mod_fun = fmodf; else mod_fun = fmod; + call_native(ctx,mod_fun,size); + store_result(ctx,dst); + return fetch(dst); + } + default: + printf("%s\n", hl_op_name(bop)); + ASSERT(bop); + } + } else { + bool is64 = a->t->kind == HI64; +# ifndef HL_64 + if( is64 ) { + error_i64(); + return fetch(a); + } +# endif + switch( bop ) { + case OAdd: o = ADD; break; + case OSub: o = SUB; break; + case OMul: o = IMUL; break; + case OAnd: o = AND; break; + case OOr: o = OR; break; + case OXor: o = XOR; break; + case OShl: + case OUShr: + case OSShr: + if( !b->current || b->current->kind != RCPU || b->current->id != Ecx ) { + scratch(REG_AT(Ecx)); + op(ctx,MOV,REG_AT(Ecx),pb,is64); + RLOCK(REG_AT(Ecx)); + pa = fetch(a); + } else + RLOCK(b->current); + if( pa->kind != RCPU ) { + pa = alloc_reg(ctx, RCPU); + op(ctx,MOV,pa,fetch(a), is64); + } + op(ctx,bop == OShl ? SHL : (bop == OUShr ? SHR : SAR), pa, UNUSED,is64); + if( dst ) store(ctx, dst, pa, true); + return pa; + case OSDiv: + case OUDiv: + case OSMod: + case OUMod: + { + preg *out = bop == OSMod || bop == OUMod ? REG_AT(Edx) : PEAX; + preg *r = pb; + preg p; + int jz, jz1 = 0, jend; + if( pa->kind == RCPU && pa->id == Eax ) RLOCK(pa); + // ensure b in CPU reg and not in Eax/Edx (for UI8/UI16) + if( pb->kind != RCPU || (pb->id == Eax || pb->id == Edx) ) { + scratch(REG_AT(Ecx)); + scratch(pb); + load(ctx,REG_AT(Ecx),b); + r = REG_AT(Ecx); + } + // integer div 0 => 0 + op(ctx,TEST,r,r,is64); + XJump_small(JZero, jz); + // Prevent MIN/-1 overflow exception + // OSMod: r = (b == 0 || b == -1) ? 0 : a % b + // OSDiv: r = (b == 0 || b == -1) ? 
a * b : a / b + if( bop == OSMod || bop == OSDiv ) { + op(ctx, CMP, r, pconst(&p,-1), is64); + XJump_small(JEq, jz1); + } + pa = fetch(a); + if( pa->kind != RCPU || pa->id != Eax ) { + scratch(PEAX); + scratch(pa); + load(ctx,PEAX,a); + } + scratch(REG_AT(Edx)); + scratch(REG_AT(Eax)); + if( bop == OUDiv || bop == OUMod ) + op(ctx, XOR, REG_AT(Edx), REG_AT(Edx), is64); + else + op(ctx, CDQ, UNUSED, UNUSED, is64); // sign-extend Eax into Eax:Edx + op(ctx, bop == OUDiv || bop == OUMod ? DIV : IDIV, r, UNUSED, is64); + XJump_small(JAlways, jend); + patch_jump(ctx, jz); + patch_jump(ctx, jz1); + if( bop != OSDiv ) { + op(ctx, XOR, out, out, is64); + } else { + load(ctx, out, a); + op(ctx, IMUL, out, r, is64); + } + patch_jump(ctx, jend); + if( dst ) store(ctx, dst, out, true); + return out; + } + case OJSLt: + case OJSGte: + case OJSLte: + case OJSGt: + case OJULt: + case OJUGte: + case OJEq: + case OJNotEq: + switch( a->t->kind ) { + case HUI8: + case HBOOL: + o = CMP8; + break; + case HUI16: + o = CMP16; + break; + default: + o = CMP; + break; + } + break; + default: + printf("%s\n", hl_op_name(bop)); + ASSERT(bop); + } + } + switch( RTYPE(a) ) { + case HI32: + case HUI8: + case HUI16: + case HBOOL: +# ifndef HL_64 + case HDYNOBJ: + case HVIRTUAL: + case HOBJ: + case HSTRUCT: + case HFUN: + case HMETHOD: + case HBYTES: + case HNULL: + case HENUM: + case HDYN: + case HTYPE: + case HABSTRACT: + case HARRAY: +# endif + switch( ID2(pa->kind, pb->kind) ) { + case ID2(RCPU,RCPU): + case ID2(RCPU,RSTACK): + op32(ctx, o, pa, pb); + scratch(pa); + out = pa; + break; + case ID2(RSTACK,RCPU): + if( dst == a && o != IMUL ) { + op32(ctx, o, pa, pb); + dst = NULL; + out = pa; + } else { + alloc_cpu(ctx,a, true); + return op_binop(ctx,dst,a,b,bop); + } + break; + case ID2(RSTACK,RSTACK): + alloc_cpu(ctx, a, true); + return op_binop(ctx, dst, a, b, bop); + default: + printf("%s(%d,%d)\n", hl_op_name(bop), pa->kind, pb->kind); + ASSERT(ID2(pa->kind, pb->kind)); + } + if( dst ) 
store(ctx, dst, out, true); + return out; +# ifdef HL_64 + case HOBJ: + case HSTRUCT: + case HDYNOBJ: + case HVIRTUAL: + case HFUN: + case HMETHOD: + case HBYTES: + case HNULL: + case HENUM: + case HDYN: + case HTYPE: + case HABSTRACT: + case HARRAY: + case HI64: + case HGUID: + switch( ID2(pa->kind, pb->kind) ) { + case ID2(RCPU,RCPU): + case ID2(RCPU,RSTACK): + op64(ctx, o, pa, pb); + scratch(pa); + out = pa; + break; + case ID2(RSTACK,RCPU): + if( dst == a && OP_FORMS[o].mem_r ) { + op64(ctx, o, pa, pb); + dst = NULL; + out = pa; + } else { + alloc_cpu(ctx,a, true); + return op_binop(ctx,dst,a,b,bop); + } + break; + case ID2(RSTACK,RSTACK): + alloc_cpu(ctx, a, true); + return op_binop(ctx, dst, a, b, bop); + default: + printf("%s(%d,%d)\n", hl_op_name(bop), pa->kind, pb->kind); + ASSERT(ID2(pa->kind, pb->kind)); + } + if( dst ) store(ctx, dst, out, true); + return out; +# endif + case HF64: + case HF32: + pa = alloc_fpu(ctx, a, true); + pb = alloc_fpu(ctx, b, true); + switch( ID2(pa->kind, pb->kind) ) { + case ID2(RFPU,RFPU): + op64(ctx,o,pa,pb); + if( (o == COMISD || o == COMISS) && bop != OJSGt ) { + int jnotnan; + XJump_small(JNParity,jnotnan); + switch( bop ) { + case OJSLt: + case OJNotLt: + { + preg *r = alloc_reg(ctx,RCPU); + // set CF=0, ZF=1 + op64(ctx,XOR,r,r); + RUNLOCK(r); + break; + } + case OJSGte: + case OJNotGte: + { + preg *r = alloc_reg(ctx,RCPU); + // set ZF=0, CF=1 + op64(ctx,XOR,r,r); + op64(ctx,CMP,r,PESP); + RUNLOCK(r); + break; + } + break; + case OJNotEq: + case OJEq: + // set ZF=0, CF=? 
+				case OJSLte:
+					// set ZF=0, CF=0
+					op64(ctx,TEST,PESP,PESP);
+					break;
+				default:
+					ASSERT(bop);
+				}
+				patch_jump(ctx,jnotnan);
+			}
+			scratch(pa);
+			out = pa;
+			break;
+		default:
+			printf("%s(%d,%d)\n", hl_op_name(bop), pa->kind, pb->kind);
+			ASSERT(ID2(pa->kind, pb->kind));
+		}
+		if( dst ) store(ctx, dst, out, true);
+		return out;
+	default:
+		ASSERT(RTYPE(a));
+	}
+	return NULL;
+}
+
+/*
+	Emit the jump instruction matching the HL jump opcode `op` and return the
+	value XJump stored in `j`. Callers pass that value to register_jump() as
+	the jump position (presumably the buffer offset to patch later - confirm
+	against the XJump macro).
+	When isFloat is set, the signed opcodes (OJSGte/OJSGt/OJSLt/OJSLte) are
+	emitted with the unsigned-style conditions (JUGte/JUGt/JULt/JULte).
+	An opcode that is not a jump is reported with printf and yields 0.
+*/
+static int do_jump( jit_ctx *ctx, hl_op op, bool isFloat ) {
+	int j;
+	switch( op ) {
+	case OJAlways:
+		XJump(JAlways,j);
+		break;
+	case OJSGte:
+		XJump(isFloat ? JUGte : JSGte,j);
+		break;
+	case OJSGt:
+		XJump(isFloat ? JUGt : JSGt,j);
+		break;
+	case OJUGte:
+		XJump(JUGte,j);
+		break;
+	case OJSLt:
+		XJump(isFloat ? JULt : JSLt,j);
+		break;
+	case OJSLte:
+		XJump(isFloat ? JULte : JSLte,j);
+		break;
+	case OJULt:
+		XJump(JULt,j);
+		break;
+	case OJEq:
+		XJump(JEq,j);
+		break;
+	case OJNotEq:
+		XJump(JNeq,j);
+		break;
+	case OJNotLt:
+		XJump(JUGte,j);
+		break;
+	case OJNotGte:
+		XJump(JULt,j);
+		break;
+	default:
+		j = 0;
+		printf("Unknown JUMP %d\n",op);
+		break;
+	}
+	return j;
+}
+
+/*
+	Record a pending jump: a jlist node {pos,target} is allocated from
+	ctx->falloc and pushed at the head of ctx->jumps.
+	If `target` is non-zero and ctx->opsPos[target] is still 0, the entry is
+	set to -1 (presumably to flag that opcode as a jump target before its
+	position is known - confirm against the code that consumes opsPos).
+*/
+static void register_jump( jit_ctx *ctx, int pos, int target ) {
+	jlist *j = (jlist*)hl_malloc(&ctx->falloc, sizeof(jlist));
+	j->pos = pos;
+	j->target = target;
+	j->next = ctx->jumps;
+	ctx->jumps = j;
+	if( target != 0 && ctx->opsPos[target] == 0 )
+		ctx->opsPos[target] = -1;
+}
+
+// byte offset at which the value is read from the objects addressed below
+#define HDYN_VALUE 8
+
+/*
+	Emit a comparison between the values stored at offset HDYN_VALUE inside
+	the two objects whose addresses are held in registers `a` and `b`.
+	The access width and compare instruction are chosen from t->kind.
+	- integer and pointer kinds: the loaded values overwrite `a` and `b`
+	- HF32 / HF64: two RFPU registers are allocated and `a` / `b` are left untouched
+	Nothing is returned; callers branch right after the emitted compare.
+*/
+static void dyn_value_compare( jit_ctx *ctx, preg *a, preg *b, hl_type *t ) {
+	preg p;
+	switch( t->kind ) {
+	case HUI8:
+	case HBOOL:
+		op32(ctx,MOV8,a,pmem(&p,a->id,HDYN_VALUE));
+		op32(ctx,MOV8,b,pmem(&p,b->id,HDYN_VALUE));
+		op64(ctx,CMP8,a,b);
+		break;
+	case HUI16:
+		op32(ctx,MOV16,a,pmem(&p,a->id,HDYN_VALUE));
+		op32(ctx,MOV16,b,pmem(&p,b->id,HDYN_VALUE));
+		op64(ctx,CMP16,a,b);
+		break;
+	case HI32:
+		op32(ctx,MOV,a,pmem(&p,a->id,HDYN_VALUE));
+		op32(ctx,MOV,b,pmem(&p,b->id,HDYN_VALUE));
+		op64(ctx,CMP,a,b);
+		break;
+	case HF32:
+		{
+			preg *fa = alloc_reg(ctx, RFPU);
+			preg *fb = alloc_reg(ctx, RFPU);
+			op64(ctx,MOVSS,fa,pmem(&p,a->id,HDYN_VALUE));
+			op64(ctx,MOVSS,fb,pmem(&p,b->id,HDYN_VALUE));
+			// single precision compare: the operands were loaded with MOVSS, so
+			// COMISD would interpret the raw float bits as the low half of a double
+			// (NaN would compare equal to itself, +0.0f would differ from -0.0f)
+			op64(ctx,COMISS,fa,fb);
+		}
+		break;
+	case HF64:
+		{
+			preg *fa = alloc_reg(ctx, RFPU);
+			preg *fb = alloc_reg(ctx, RFPU);
+			op64(ctx,MOVSD,fa,pmem(&p,a->id,HDYN_VALUE));
+			op64(ctx,MOVSD,fb,pmem(&p,b->id,HDYN_VALUE));
+			op64(ctx,COMISD,fa,fb);
+		}
+		break;
+	case HI64:
+	default:
+		// ptr comparison
+		op64(ctx,MOV,a,pmem(&p,a->id,HDYN_VALUE));
+		op64(ctx,MOV,b,pmem(&p,b->id,HDYN_VALUE));
+		op64(ctx,CMP,a,b);
+		break;
+	}
+}
+
+/*
+	Emit the compare-and-branch sequence for the two-operand jump opcode `op`
+	applied to vregs `a` and `b`; every emitted branch towards the destination
+	is recorded with register_jump(...,targetPos).
+	The strategy is selected from the operand types, starting with the
+	HDYN / HFUN case which calls hl_dyn_compare.
+*/
+static void op_jump( jit_ctx *ctx, vreg *a, vreg *b, hl_opcode *op, int targetPos ) {
+	if( a->t->kind == HDYN || b->t->kind == HDYN || a->t->kind == HFUN || b->t->kind == HFUN ) {
+		int args[] = { a->stack.id, b->stack.id };
+		int size = prepare_call_args(ctx,2,args,ctx->vregs,0);
+		call_native(ctx,hl_dyn_compare,size);
+		if( op->op == OJSGt || op->op == OJSGte ) {
+			// for > and >= the hl_invalid_comparison result must never take the jump
+			preg p;
+			int jinvalid;
+			op32(ctx,CMP,PEAX,pconst(&p,hl_invalid_comparison));
+			XJump_small(JEq,jinvalid);
+			op32(ctx,TEST,PEAX,PEAX);
+			register_jump(ctx,do_jump(ctx,op->op, IS_FLOAT(a)),targetPos);
+			patch_jump(ctx,jinvalid);
+			return;
+		}
+		op32(ctx,TEST,PEAX,PEAX);
+	} else switch( a->t->kind ) {
+	case HTYPE:
+		{
+			int args[] = { a->stack.id, b->stack.id };
+			int size = prepare_call_args(ctx,2,args,ctx->vregs,0);
+			preg p;
+			call_native(ctx,hl_same_type,size);
+			op64(ctx,CMP8,PEAX,pconst(&p,1));
+		}
+		break;
+	case HNULL:
+		{
+			preg *pa = hl_type_size(a->t->tparam) == 1 ? alloc_cpu8(ctx,a,true) : alloc_cpu(ctx,a,true);
+			preg *pb = hl_type_size(b->t->tparam) == 1 ?
alloc_cpu8(ctx,b,true) : alloc_cpu(ctx,b,true); + if( op->op == OJEq ) { + // if( a == b || (a && b && a->v == b->v) ) goto + int ja, jb; + // if( a != b && (!a || !b || a->v != b->v) ) goto + op64(ctx,CMP,pa,pb); + register_jump(ctx,do_jump(ctx,OJEq,false),targetPos); + op64(ctx,TEST,pa,pa); + XJump_small(JZero,ja); + op64(ctx,TEST,pb,pb); + XJump_small(JZero,jb); + dyn_value_compare(ctx,pa,pb,a->t->tparam); + register_jump(ctx,do_jump(ctx,OJEq,false),targetPos); + scratch(pa); + scratch(pb); + patch_jump(ctx,ja); + patch_jump(ctx,jb); + } else if( op->op == OJNotEq ) { + int jeq, jcmp; + // if( a != b && (!a || !b || a->v != b->v) ) goto + op64(ctx,CMP,pa,pb); + XJump_small(JEq,jeq); + op64(ctx,TEST,pa,pa); + register_jump(ctx,do_jump(ctx,OJEq,false),targetPos); + op64(ctx,TEST,pb,pb); + register_jump(ctx,do_jump(ctx,OJEq,false),targetPos); + dyn_value_compare(ctx,pa,pb,a->t->tparam); + XJump_small(JZero,jcmp); + scratch(pa); + scratch(pb); + register_jump(ctx,do_jump(ctx,OJNotEq,false),targetPos); + patch_jump(ctx,jcmp); + patch_jump(ctx,jeq); + } else + ASSERT(op->op); + return; + } + case HVIRTUAL: + { + preg p; + preg *pa = alloc_cpu(ctx,a,true); + preg *pb = alloc_cpu(ctx,b,true); + int ja,jb,jav,jbv,jvalue; + if( b->t->kind == HOBJ ) { + if( op->op == OJEq ) { + // if( a ? (b && a->value == b) : (b == NULL) ) goto + op64(ctx,TEST,pa,pa); + XJump_small(JZero,ja); + op64(ctx,TEST,pb,pb); + XJump_small(JZero,jb); + op64(ctx,MOV,pa,pmem(&p,pa->id,HL_WSIZE)); + op64(ctx,CMP,pa,pb); + XJump_small(JAlways,jvalue); + patch_jump(ctx,ja); + op64(ctx,TEST,pb,pb); + patch_jump(ctx,jvalue); + register_jump(ctx,do_jump(ctx,OJEq,false),targetPos); + patch_jump(ctx,jb); + } else if( op->op == OJNotEq ) { + // if( a ? 
(b == NULL || a->value != b) : (b != NULL) ) goto + op64(ctx,TEST,pa,pa); + XJump_small(JZero,ja); + op64(ctx,TEST,pb,pb); + register_jump(ctx,do_jump(ctx,OJEq,false),targetPos); + op64(ctx,MOV,pa,pmem(&p,pa->id,HL_WSIZE)); + op64(ctx,CMP,pa,pb); + XJump_small(JAlways,jvalue); + patch_jump(ctx,ja); + op64(ctx,TEST,pb,pb); + patch_jump(ctx,jvalue); + register_jump(ctx,do_jump(ctx,OJNotEq,false),targetPos); + } else + ASSERT(op->op); + scratch(pa); + return; + } + op64(ctx,CMP,pa,pb); + if( op->op == OJEq ) { + // if( a == b || (a && b && a->value && b->value && a->value == b->value) ) goto + register_jump(ctx,do_jump(ctx,OJEq, false),targetPos); + op64(ctx,TEST,pa,pa); + XJump_small(JZero,ja); + op64(ctx,TEST,pb,pb); + XJump_small(JZero,jb); + op64(ctx,MOV,pa,pmem(&p,pa->id,HL_WSIZE)); + op64(ctx,TEST,pa,pa); + XJump_small(JZero,jav); + op64(ctx,MOV,pb,pmem(&p,pb->id,HL_WSIZE)); + op64(ctx,TEST,pb,pb); + XJump_small(JZero,jbv); + op64(ctx,CMP,pa,pb); + XJump_small(JNeq,jvalue); + register_jump(ctx,do_jump(ctx,OJEq, false),targetPos); + patch_jump(ctx,ja); + patch_jump(ctx,jb); + patch_jump(ctx,jav); + patch_jump(ctx,jbv); + patch_jump(ctx,jvalue); + } else if( op->op == OJNotEq ) { + int jnext; + // if( a != b && (!a || !b || !a->value || !b->value || a->value != b->value) ) goto + XJump_small(JEq,jnext); + op64(ctx,TEST,pa,pa); + XJump_small(JZero,ja); + op64(ctx,TEST,pb,pb); + XJump_small(JZero,jb); + op64(ctx,MOV,pa,pmem(&p,pa->id,HL_WSIZE)); + op64(ctx,TEST,pa,pa); + XJump_small(JZero,jav); + op64(ctx,MOV,pb,pmem(&p,pb->id,HL_WSIZE)); + op64(ctx,TEST,pb,pb); + XJump_small(JZero,jbv); + op64(ctx,CMP,pa,pb); + XJump_small(JEq,jvalue); + patch_jump(ctx,ja); + patch_jump(ctx,jb); + patch_jump(ctx,jav); + patch_jump(ctx,jbv); + register_jump(ctx,do_jump(ctx,OJAlways, false),targetPos); + patch_jump(ctx,jnext); + patch_jump(ctx,jvalue); + } else + ASSERT(op->op); + scratch(pa); + scratch(pb); + return; + } + break; + case HOBJ: + case HSTRUCT: + if( b->t->kind == 
HVIRTUAL ) { + op_jump(ctx,b,a,op,targetPos); // inverse + return; + } + if( hl_get_obj_rt(a->t)->compareFun ) { + preg *pa = alloc_cpu(ctx,a,true); + preg *pb = alloc_cpu(ctx,b,true); + preg p; + int jeq, ja, jb, jcmp; + int args[] = { a->stack.id, b->stack.id }; + switch( op->op ) { + case OJEq: + // if( a == b || (a && b && cmp(a,b) == 0) ) goto + op64(ctx,CMP,pa,pb); + XJump_small(JEq,jeq); + op64(ctx,TEST,pa,pa); + XJump_small(JZero,ja); + op64(ctx,TEST,pb,pb); + XJump_small(JZero,jb); + op_call_fun(ctx,NULL,(int)(int_val)a->t->obj->rt->compareFun,2,args); + op32(ctx,TEST,PEAX,PEAX); + XJump_small(JNotZero,jcmp); + patch_jump(ctx,jeq); + register_jump(ctx,do_jump(ctx,OJAlways,false),targetPos); + patch_jump(ctx,ja); + patch_jump(ctx,jb); + patch_jump(ctx,jcmp); + break; + case OJNotEq: + // if( a != b && (!a || !b || cmp(a,b) != 0) ) goto + op64(ctx,CMP,pa,pb); + XJump_small(JEq,jeq); + op64(ctx,TEST,pa,pa); + register_jump(ctx,do_jump(ctx,OJEq,false),targetPos); + op64(ctx,TEST,pb,pb); + register_jump(ctx,do_jump(ctx,OJEq,false),targetPos); + + op_call_fun(ctx,NULL,(int)(int_val)a->t->obj->rt->compareFun,2,args); + op32(ctx,TEST,PEAX,PEAX); + XJump_small(JZero,jcmp); + + register_jump(ctx,do_jump(ctx,OJNotEq,false),targetPos); + patch_jump(ctx,jcmp); + patch_jump(ctx,jeq); + break; + default: + // if( a && b && cmp(a,b) ?? 
0 ) goto + op64(ctx,TEST,pa,pa); + XJump_small(JZero,ja); + op64(ctx,TEST,pb,pb); + XJump_small(JZero,jb); + op_call_fun(ctx,NULL,(int)(int_val)a->t->obj->rt->compareFun,2,args); + op32(ctx,CMP,PEAX,pconst(&p,0)); + register_jump(ctx,do_jump(ctx,op->op,false),targetPos); + patch_jump(ctx,ja); + patch_jump(ctx,jb); + break; + } + return; + } + // fallthrough + default: + // make sure we have valid 8 bits registers + if( a->size == 1 ) alloc_cpu8(ctx,a,true); + if( b->size == 1 ) alloc_cpu8(ctx,b,true); + op_binop(ctx,NULL,a,b,op->op); + break; + } + register_jump(ctx,do_jump(ctx,op->op, IS_FLOAT(a)),targetPos); +} + +jit_ctx *hl_jit_alloc() { + int i; + jit_ctx *ctx = (jit_ctx*)malloc(sizeof(jit_ctx)); + if( ctx == NULL ) return NULL; + memset(ctx,0,sizeof(jit_ctx)); + hl_alloc_init(&ctx->falloc); + hl_alloc_init(&ctx->galloc); + for(i=0;iid = i; + r->kind = RCPU; + } + for(i=0;iid = i; + r->kind = RFPU; + } + return ctx; +} + +void hl_jit_free( jit_ctx *ctx, h_bool can_reset ) { + free(ctx->vregs); + free(ctx->opsPos); + free(ctx->startBuf); + ctx->maxRegs = 0; + ctx->vregs = NULL; + ctx->maxOps = 0; + ctx->opsPos = NULL; + ctx->startBuf = NULL; + ctx->bufSize = 0; + ctx->buf.b = NULL; + ctx->calls = NULL; + ctx->switchs = NULL; + ctx->closure_list = NULL; + hl_free(&ctx->falloc); + hl_free(&ctx->galloc); + if( !can_reset ) free(ctx); +} + +static void jit_nops( jit_ctx *ctx ) { + while( BUF_POS() & 15 ) + op32(ctx, NOP, UNUSED, UNUSED); +} + +#define MAX_ARGS 16 + +static void *call_jit_c2hl = NULL; +static void *call_jit_hl2c = NULL; + +static void *callback_c2hl( void *_f, hl_type *t, void **args, vdynamic *ret ) { + /* + prepare stack and regs according to prepare_call_args, but by reading runtime type information + from the function type. The stack and regs will be setup by the trampoline function. 
+ */ + void **f = (void**)_f; + unsigned char stack[MAX_ARGS * 8]; + call_regs cregs = {0}; + if( t->fun->nargs > MAX_ARGS ) + hl_error("Too many arguments for dynamic call"); + int i, size = 0, pad = 0, pos = 0; + for(i=0;ifun->nargs;i++) { + hl_type *at = t->fun->args[i]; + int creg = select_call_reg(&cregs,at,i); + if( creg >= 0 ) + continue; + size += stack_size(at); + } + pad = (-size) & 15; + size += pad; + pos = 0; + for(i=0;ifun->nargs;i++) { + // RTL + hl_type *at = t->fun->args[i]; + void *v = args[i]; + int creg = mapped_reg(&cregs,i); + void *store; + if( creg >= 0 ) { + if( REG_IS_FPU(creg) ) { + store = stack + size + CALL_NREGS * HL_WSIZE + (creg - XMM(0)) * sizeof(double); + } else { + store = stack + size + call_reg_index(creg) * HL_WSIZE; + } + switch( at->kind ) { + case HBOOL: + case HUI8: + *(int_val*)store = *(unsigned char*)v; + break; + case HUI16: + *(int_val*)store = *(unsigned short*)v; + break; + case HI32: + *(int_val*)store = *(int*)v; + break; + case HF32: + *(void**)store = 0; + *(float*)store = *(float*)v; + break; + case HF64: + *(double*)store = *(double*)v; + break; + case HI64: + case HGUID: + *(int64*)store = *(int64*)v; + break; + default: + *(void**)store = v; + break; + } + } else { + int tsize = stack_size(at); + store = stack + pos; + pos += tsize; + switch( at->kind ) { + case HBOOL: + case HUI8: + *(int*)store = *(unsigned char*)v; + break; + case HUI16: + *(int*)store = *(unsigned short*)v; + break; + case HI32: + case HF32: + *(int*)store = *(int*)v; + break; + case HF64: + *(double*)store = *(double*)v; + break; + case HI64: + case HGUID: + *(int64*)store = *(int64*)v; + break; + default: + *(void**)store = v; + break; + } + } + } + pos += pad; + pos >>= IS_64 ? 
3 : 2; + switch( t->fun->ret->kind ) { + case HUI8: + case HUI16: + case HI32: + case HBOOL: + ret->v.i = ((int (*)(void *, void *, void *))call_jit_c2hl)(*f, (void**)&stack + pos, &stack); + return &ret->v.i; + case HI64: + case HGUID: + ret->v.i64 = ((int64 (*)(void *, void *, void *))call_jit_c2hl)(*f, (void**)&stack + pos, &stack); + return &ret->v.i64; + case HF32: + ret->v.f = ((float (*)(void *, void *, void *))call_jit_c2hl)(*f, (void**)&stack + pos, &stack); + return &ret->v.f; + case HF64: + ret->v.d = ((double (*)(void *, void *, void *))call_jit_c2hl)(*f, (void**)&stack + pos, &stack); + return &ret->v.d; + default: + return ((void *(*)(void *, void *, void *))call_jit_c2hl)(*f, (void**)&stack + pos, &stack); + } +} + +static void jit_c2hl( jit_ctx *ctx ) { + // create the function that will be called by callback_c2hl + // it will make sure to prepare the stack/regs according to native calling conventions + int jeq, jloop, jstart; + preg *fptr, *stack, *stend; + preg p; + + op64(ctx,PUSH,PEBP,UNUSED); + op64(ctx,MOV,PEBP,PESP); + +# ifdef HL_64 + + fptr = REG_AT(R10); + stack = PEAX; + stend = REG_AT(R11); + op64(ctx, MOV, fptr, REG_AT(CALL_REGS[0])); + op64(ctx, MOV, stack, REG_AT(CALL_REGS[1])); + op64(ctx, MOV, stend, REG_AT(CALL_REGS[2])); + + // set native call regs + int i; + for(i=0;iid,i*HL_WSIZE)); + for(i=0;iid,(i+CALL_NREGS)*HL_WSIZE)); + +# else + + // make sure the stack is aligned on 16 bytes + // the amount of push we will do afterwards is guaranteed to be a multiple of 16bytes by hl_callback +# ifdef HL_VCC + // VCC does not guarantee us an aligned stack... 
+ op64(ctx,MOV,PEAX,PESP); + op64(ctx,AND,PEAX,pconst(&p,15)); + op64(ctx,SUB,PESP,PEAX); +# else + op64(ctx,SUB,PESP,pconst(&p,8)); +# endif + + // mov arguments to regs + fptr = REG_AT(Eax); + stack = REG_AT(Edx); + stend = REG_AT(Ecx); + op64(ctx,MOV,fptr,pmem(&p,Ebp,HL_WSIZE*2)); + op64(ctx,MOV,stack,pmem(&p,Ebp,HL_WSIZE*3)); + op64(ctx,MOV,stend,pmem(&p,Ebp,HL_WSIZE*4)); + +# endif + + // push stack args + jstart = BUF_POS(); + op64(ctx,CMP,stack,stend); + XJump(JEq,jeq); + op64(ctx,SUB,stack,pconst(&p,HL_WSIZE)); + op64(ctx,PUSH,pmem(&p,stack->id,0),UNUSED); + XJump(JAlways,jloop); + patch_jump(ctx,jeq); + patch_jump_to(ctx, jloop, jstart); + + op_call(ctx,fptr,0); + + // cleanup and ret + op64(ctx,MOV,PESP,PEBP); + op64(ctx,POP,PEBP, UNUSED); + op64(ctx,RET,UNUSED,UNUSED); +} + +static vdynamic *jit_wrapper_call( vclosure_wrapper *c, char *stack_args, void **regs ) { + vdynamic *args[MAX_ARGS]; + int i; + int nargs = c->cl.t->fun->nargs; + call_regs cregs = {0}; + if( nargs > MAX_ARGS ) + hl_error("Too many arguments for wrapped call"); + cregs.nextCpu++; // skip fptr in HL64 - was passed as arg0 + for(i=0;icl.t->fun->args[i]; + int creg = select_call_reg(&cregs,t,i); + if( creg < 0 ) { + args[i] = hl_is_dynamic(t) ? 
*(vdynamic**)stack_args : hl_make_dyn(stack_args,t); + stack_args += stack_size(t); + } else if( hl_is_dynamic(t) ) { + args[i] = *(vdynamic**)(regs + call_reg_index(creg)); + } else if( t->kind == HF32 || t->kind == HF64 ) { + args[i] = hl_make_dyn(regs + CALL_NREGS + creg - XMM(0),&hlt_f64); + } else { + args[i] = hl_make_dyn(regs + call_reg_index(creg),t); + } + } + return hl_dyn_call(c->wrappedFun,args,nargs); +} + +static void *jit_wrapper_ptr( vclosure_wrapper *c, char *stack_args, void **regs ) { + vdynamic *ret = jit_wrapper_call(c, stack_args, regs); + hl_type *tret = c->cl.t->fun->ret; + switch( tret->kind ) { + case HVOID: + return NULL; + case HUI8: + case HUI16: + case HI32: + case HBOOL: + return (void*)(int_val)hl_dyn_casti(&ret,&hlt_dyn,tret); + case HI64: + case HGUID: + return (void*)(int_val)hl_dyn_casti64(&ret,&hlt_dyn); + default: + return hl_dyn_castp(&ret,&hlt_dyn,tret); + } +} + +static double jit_wrapper_d( vclosure_wrapper *c, char *stack_args, void **regs ) { + vdynamic *ret = jit_wrapper_call(c, stack_args, regs); + return hl_dyn_castd(&ret,&hlt_dyn); +} + +static void jit_hl2c( jit_ctx *ctx ) { + // create a function that is called with a vclosure_wrapper* and native args + // and pack and pass the args to callback_hl2c + preg p; + int jfloat1, jfloat2, jexit; + hl_type_fun *ft = NULL; + int size; +# ifdef HL_64 + preg *cl = REG_AT(CALL_REGS[0]); + preg *tmp = REG_AT(CALL_REGS[1]); +# else + preg *cl = REG_AT(Ecx); + preg *tmp = REG_AT(Edx); +# endif + + op64(ctx,PUSH,PEBP,UNUSED); + op64(ctx,MOV,PEBP,PESP); + +# ifdef HL_64 + // push registers + int i; + op64(ctx,SUB,PESP,pconst(&p,CALL_NREGS*8)); + for(i=0;it->fun->ret->kind ) { + // case HF32: case HF64: return jit_wrapper_d(arg0,&args); + // default: return jit_wrapper_ptr(arg0,&args); + // } + if( !IS_64 ) + op64(ctx,MOV,cl,pmem(&p,Ebp,HL_WSIZE*2)); // load arg0 + op64(ctx,MOV,tmp,pmem(&p,cl->id,0)); // ->t + op64(ctx,MOV,tmp,pmem(&p,tmp->id,HL_WSIZE)); // ->fun + 
+	op64(ctx,MOV,tmp,pmem(&p,tmp->id,(int)(int_val)&ft->ret)); // ->ret
+	op32(ctx,MOV,tmp,pmem(&p,tmp->id,0)); // -> kind
+
+	op32(ctx,CMP,tmp,pconst(&p,HF64));
+	XJump_small(JEq,jfloat1);
+	op32(ctx,CMP,tmp,pconst(&p,HF32));
+	XJump_small(JEq,jfloat2);
+	// any other return kind falls through to the jit_wrapper_ptr call below
+	// 64 bits : ESP + EIP (+WIN64PAD)
+	// 32 bits : ESP + EIP + PARAM0
+	int args_pos = IS_64 ? ((IS_WINCALL64 ? 32 : 0) + HL_WSIZE * 2) : (HL_WSIZE*3);
+
+	size = begin_native_call(ctx,3);
+	op64(ctx, LEA, tmp, pmem(&p,Ebp,-HL_WSIZE*CALL_NREGS*2)); // address below the frame pointer, handed to the wrapper
+	set_native_arg(ctx, tmp);
+	op64(ctx, LEA, tmp, pmem(&p,Ebp,args_pos)); // address above the frame pointer, at args_pos
+	set_native_arg(ctx, tmp);
+	set_native_arg(ctx, cl);
+	call_native(ctx, jit_wrapper_ptr, size);
+	XJump_small(JAlways, jexit);
+	// HF64 and HF32 return kinds both land here and go through jit_wrapper_d
+	patch_jump(ctx,jfloat1);
+	patch_jump(ctx,jfloat2);
+	size = begin_native_call(ctx,3);
+	op64(ctx, LEA, tmp, pmem(&p,Ebp,-HL_WSIZE*CALL_NREGS*2));
+	set_native_arg(ctx, tmp);
+	op64(ctx, LEA, tmp, pmem(&p,Ebp,args_pos));
+	set_native_arg(ctx, tmp);
+	set_native_arg(ctx, cl);
+	call_native(ctx, jit_wrapper_d, size);
+	// common epilogue: restore the stack pointer and frame pointer, then return
+	patch_jump(ctx,jexit);
+	op64(ctx,MOV,PESP,PEBP);
+	op64(ctx,POP,PEBP, UNUSED);
+	op64(ctx,RET,UNUSED,UNUSED);
+}
+/*
+	Runtime helper called from generated stubs: throws `msg` wrapped in a
+	dynamic allocated with type hlt_bytes.
+	A NULL `msg` first calls hl_debug_break() and is then replaced by the
+	string "assert".
+*/
+static void jit_fail( uchar *msg ) {
+	if( msg == NULL ) {
+		hl_debug_break();
+		msg = USTR("assert");
+	}
+	vdynamic *d = hl_alloc_dynamic(&hlt_bytes);
+	d->v.ptr = msg;
+	hl_throw(d);
+}
+/*
+	Emit a stub that sets up a frame (push + mov of the frame pointer) and
+	calls jit_fail with the constant string "Null access".
+	No epilogue is emitted after the call.
+*/
+static void jit_null_access( jit_ctx *ctx ) {
+	op64(ctx,PUSH,PEBP,UNUSED);
+	op64(ctx,MOV,PEBP,PESP);
+	int_val arg = (int_val)USTR("Null access");
+	call_native_consts(ctx, jit_fail, &arg, 1);
+}
+/*
+	Runtime helper: throws a dynamic of type hlt_bytes whose message is
+	"Null access ." followed by the name returned by hl_field_name(fhash).
+*/
+static void jit_null_fail( int fhash ) {
+	vbyte *field = hl_field_name(fhash);
+	hl_buffer *b = hl_alloc_buffer();
+	hl_buffer_str(b, USTR("Null access ."));
+	hl_buffer_str(b, (uchar*)field);
+	vdynamic *d = hl_alloc_dynamic(&hlt_bytes);
+	d->v.ptr = hl_buffer_content(b,NULL);
+	hl_throw(d);
+}
+/*
+	Emit a stub that sets up a frame and performs a one-argument native call.
+*/
+static void jit_null_field_access( jit_ctx *ctx ) {
+	preg p;
+	op64(ctx,PUSH,PEBP,UNUSED);
+	op64(ctx,MOV,PEBP,PESP);
+	int size = begin_native_call(ctx,
1); + int args_pos = (IS_WINCALL64 ? 32 : 0) + HL_WSIZE*2; + set_native_arg(ctx, pmem(&p,Ebp,args_pos)); + call_native(ctx,jit_null_fail,size); +} + +static void jit_assert( jit_ctx *ctx ) { + op64(ctx,PUSH,PEBP,UNUSED); + op64(ctx,MOV,PEBP,PESP); + int_val arg = 0; + call_native_consts(ctx, jit_fail, &arg, 1); +} + +static int jit_build( jit_ctx *ctx, void (*fbuild)( jit_ctx *) ) { + int pos; + jit_buf(ctx); + jit_nops(ctx); + pos = BUF_POS(); + fbuild(ctx); + int endPos = BUF_POS(); + jit_nops(ctx); +#ifdef WIN64_UNWIND_TABLES + int fid = ctx->nunwind++; + ctx->unwind_table[fid].BeginAddress = pos; + ctx->unwind_table[fid].EndAddress = endPos; + ctx->unwind_table[fid].UnwindData = ctx->unwind_offset; +#endif + return pos; +} + +static void hl_jit_init_module( jit_ctx *ctx, hl_module *m ) { + int i; + ctx->m = m; + if( m->code->hasdebug ) { + ctx->debug = (hl_debug_infos*)malloc(sizeof(hl_debug_infos) * m->code->nfunctions); + memset(ctx->debug, -1, sizeof(hl_debug_infos) * m->code->nfunctions); + } + for(i=0;icode->nfloats;i++) { + jit_buf(ctx); + *ctx->buf.d++ = m->code->floats[i]; + } +#ifdef WIN64_UNWIND_TABLES + jit_buf(ctx); + ctx->unwind_offset = BUF_POS(); + write_unwind_data(ctx); + + ctx->unwind_table = malloc(sizeof(RUNTIME_FUNCTION) * (m->code->nfunctions + 10)); + memset(ctx->unwind_table, 0, sizeof(RUNTIME_FUNCTION) * (m->code->nfunctions + 10)); +#endif +} + +void hl_jit_init( jit_ctx *ctx, hl_module *m ) { + hl_jit_init_module(ctx,m); + ctx->c2hl = jit_build(ctx, jit_c2hl); + ctx->hl2c = jit_build(ctx, jit_hl2c); + ctx->static_functions[0] = (void*)(int_val)jit_build(ctx,jit_null_access); + ctx->static_functions[1] = (void*)(int_val)jit_build(ctx,jit_assert); + ctx->static_functions[2] = (void*)(int_val)jit_build(ctx,jit_null_field_access); +} + +void hl_jit_reset( jit_ctx *ctx, hl_module *m ) { + ctx->debug = NULL; + hl_jit_init_module(ctx,m); +} + +static void *get_dyncast( hl_type *t ) { + switch( t->kind ) { + case HF32: + return hl_dyn_castf; 
+	case HF64:
+		return hl_dyn_castd;
+	case HI64:
+	case HGUID:
+		return hl_dyn_casti64;
+	case HI32:
+	case HUI16:
+	case HUI8:
+	case HBOOL:
+		return hl_dyn_casti;
+	default:
+		return hl_dyn_castp;
+	}
+}
+/*
+	Select the hl_dyn_set* runtime function for a field of type `t`:
+	HF32 -> hl_dyn_setf, HF64 -> hl_dyn_setd, HI64/HGUID -> hl_dyn_seti64,
+	HI32/HUI16/HUI8/HBOOL -> hl_dyn_seti, any other kind -> hl_dyn_setp.
+	The function is returned as an untyped pointer.
+*/
+static void *get_dynset( hl_type *t ) {
+	switch( t->kind ) {
+	case HF32:
+		return hl_dyn_setf;
+	case HF64:
+		return hl_dyn_setd;
+	case HI64:
+	case HGUID:
+		return hl_dyn_seti64;
+	case HI32:
+	case HUI16:
+	case HUI8:
+	case HBOOL:
+		return hl_dyn_seti;
+	default:
+		return hl_dyn_setp;
+	}
+}
+/*
+	Select the hl_dyn_get* runtime function for a field of type `t`:
+	HF32 -> hl_dyn_getf, HF64 -> hl_dyn_getd, HI64/HGUID -> hl_dyn_geti64,
+	HI32/HUI16/HUI8/HBOOL -> hl_dyn_geti, any other kind -> hl_dyn_getp.
+	The function is returned as an untyped pointer.
+*/
+static void *get_dynget( hl_type *t ) {
+	switch( t->kind ) {
+	case HF32:
+		return hl_dyn_getf;
+	case HF64:
+		return hl_dyn_getd;
+	case HI64:
+	case HGUID:
+		return hl_dyn_geti64;
+	case HI32:
+	case HUI16:
+	case HUI8:
+	case HBOOL:
+		return hl_dyn_geti;
+	default:
+		return hl_dyn_getp;
+	}
+}
+/*
+	Convert an unsigned int to double through the C implicit conversion.
+	Exists as a function so that generated code can call it as a native.
+*/
+static double uint_to_double( unsigned int v ) {
+	return v;
+}
+/*
+	Allocate (from the module allocator m->ctx.alloc) a closure without bound
+	value for function index `fid`.
+	- native function: type comes from the natives table and `fun` is the
+	  pointer already stored in m->functions_ptrs[fid]
+	- HL function: type comes from the functions table, `fun` temporarily
+	  holds `fid` itself and `value` links the closure into ctx->closure_list
+	  (presumably so the real address can be written once known - confirm
+	  against the code that walks closure_list)
+*/
+static vclosure *alloc_static_closure( jit_ctx *ctx, int fid ) {
+	hl_module *m = ctx->m;
+	vclosure *c = hl_malloc(&m->ctx.alloc,sizeof(vclosure));
+	int fidx = m->functions_indexes[fid];
+	c->hasValue = 0;
+	if( fidx >= m->code->nfunctions ) {
+		// native
+		c->t = m->code->natives[fidx - m->code->nfunctions].t;
+		c->fun = m->functions_ptrs[fid];
+		c->value = NULL;
+	} else {
+		c->t = m->code->functions[fidx].type;
+		c->fun = (void*)(int_val)fid;
+		c->value = ctx->closure_list;
+		ctx->closure_list = c;
+	}
+	return c;
+}
+/*
+	Emit the code that casts the value of vreg `v` into vreg `dst`.
+	Starts with an inline path used when `v` is an HNULL whose parameter kind
+	equals dst's kind: the value is read at offset 8 of the pointer, or
+	zeroed when the pointer is null.
+*/
+static void make_dyn_cast( jit_ctx *ctx, vreg *dst, vreg *v ) {
+	int size;
+	preg p;
+	preg *tmp;
+	if( v->t->kind == HNULL && v->t->tparam->kind == dst->t->kind ) {
+		int jnull, jend;
+		preg *out;
+		switch( dst->t->kind ) {
+		case HUI8:
+		case HUI16:
+		case HI32:
+		case HBOOL:
+		case HI64:
+		case HGUID:
+			tmp = alloc_cpu(ctx, v, true);
+			op64(ctx, TEST, tmp, tmp);
+			XJump_small(JZero, jnull);
+			op64(ctx, MOV, tmp, pmem(&p,tmp->id,8)); // non-null: load the value stored at offset 8
+			XJump_small(JAlways, jend);
+			patch_jump(ctx, jnull);
+			op64(ctx, XOR, tmp, tmp); // null: result is 0
+			patch_jump(ctx, jend);
+			store(ctx, dst, tmp, true);
return; + case HF32: + case HF64: + tmp = alloc_cpu(ctx, v, true); + out = alloc_fpu(ctx, dst, false); + op64(ctx, TEST, tmp, tmp); + XJump_small(JZero, jnull); + op64(ctx, dst->t->kind == HF32 ? MOVSS : MOVSD, out, pmem(&p,tmp->id,8)); + XJump_small(JAlways, jend); + patch_jump(ctx, jnull); + op64(ctx, XORPD, out, out); + patch_jump(ctx, jend); + store(ctx, dst, out, true); + return; + default: + break; + } + } + switch( dst->t->kind ) { + case HF32: + case HF64: + case HI64: + case HGUID: + size = begin_native_call(ctx, 2); + set_native_arg(ctx, pconst64(&p,(int_val)v->t)); + break; + default: + size = begin_native_call(ctx, 3); + set_native_arg(ctx, pconst64(&p,(int_val)dst->t)); + set_native_arg(ctx, pconst64(&p,(int_val)v->t)); + break; + } + tmp = alloc_native_arg(ctx); + op64(ctx,MOV,tmp,REG_AT(Ebp)); + if( v->stackPos >= 0 ) + op64(ctx,ADD,tmp,pconst(&p,v->stackPos)); + else + op64(ctx,SUB,tmp,pconst(&p,-v->stackPos)); + set_native_arg(ctx,tmp); + call_native(ctx,get_dyncast(dst->t),size); + store_result(ctx, dst); +} + +int hl_jit_function( jit_ctx *ctx, hl_module *m, hl_function *f ) { + int i, size = 0, opCount; + int codePos = BUF_POS(); + int nargs = f->type->fun->nargs; + unsigned short *debug16 = NULL; + int *debug32 = NULL; + call_regs cregs = {0}; + hl_thread_info *tinf = NULL; + preg p; + ctx->f = f; + ctx->allocOffset = 0; + if( f->nregs > ctx->maxRegs ) { + free(ctx->vregs); + ctx->vregs = (vreg*)malloc(sizeof(vreg) * (f->nregs + 1)); + if( ctx->vregs == NULL ) { + ctx->maxRegs = 0; + return -1; + } + ctx->maxRegs = f->nregs; + } + if( f->nops > ctx->maxOps ) { + free(ctx->opsPos); + ctx->opsPos = (int*)malloc(sizeof(int) * (f->nops + 1)); + if( ctx->opsPos == NULL ) { + ctx->maxOps = 0; + return -1; + } + ctx->maxOps = f->nops; + } + memset(ctx->opsPos,0,(f->nops+1)*sizeof(int)); + for(i=0;inregs;i++) { + vreg *r = R(i); + r->t = f->regs[i]; + r->size = hl_type_size(r->t); + r->current = NULL; + r->stack.holds = NULL; + r->stack.id = i; + 
r->stack.kind = RSTACK; + } + size = 0; + int argsSize = 0; + for(i=0;it,i); + if( creg < 0 || IS_WINCALL64 ) { + // use existing stack storage + r->stackPos = argsSize + HL_WSIZE * 2; + argsSize += stack_size(r->t); + } else { + // make room in local vars + size += r->size; + size += hl_pad_size(size,r->t); + r->stackPos = -size; + } + } + for(i=nargs;inregs;i++) { + vreg *r = R(i); + size += r->size; + size += hl_pad_size(size,r->t); // align local vars + r->stackPos = -size; + } +# ifdef HL_64 + size += (-size) & 15; // align on 16 bytes +# else + size += hl_pad_size(size,&hlt_dyn); // align on word size +# endif + ctx->totalRegsSize = size; + jit_buf(ctx); + ctx->functionPos = BUF_POS(); + // make sure currentPos is > 0 before any reg allocations happen + // otherwise `alloc_reg` thinks that all registers are locked + ctx->currentPos = 1; + op_enter(ctx); +# ifdef HL_64 + { + // store in local var + for(i=0;isize); + p->holds = r; + r->current = p; + } + } +# endif + if( ctx->m->code->hasdebug ) { + debug16 = (unsigned short*)malloc(sizeof(unsigned short) * (f->nops + 1)); + debug16[0] = (unsigned short)(BUF_POS() - codePos); + } + ctx->opsPos[0] = BUF_POS(); + + for(opCount=0;opCountnops;opCount++) { + int jump; + hl_opcode *o = f->ops + opCount; + vreg *dst = R(o->p1); + vreg *ra = R(o->p2); + vreg *rb = R(o->p3); + ctx->currentPos = opCount + 1; + jit_buf(ctx); +# ifdef JIT_DEBUG + if( opCount == 0 || f->ops[opCount-1].op != OAsm ) { + int uid = opCount + (f->findex<<16); + op32(ctx, PUSH, pconst(&p,uid), UNUSED); + op64(ctx, ADD, PESP, pconst(&p,HL_WSIZE)); + } +# endif + // emit code + switch( o->op ) { + case OMov: + case OUnsafeCast: + op_mov(ctx, dst, ra); + break; + case OInt: + store_const(ctx, dst, m->code->ints[o->p2]); + break; + case OBool: + store_const(ctx, dst, o->p2); + break; + case OGetGlobal: + { + void *addr = m->globals_data + m->globals_indexes[o->p2]; +# ifdef HL_64 + preg *tmp = alloc_reg(ctx, RCPU); + op64(ctx, MOV, tmp, 
pconst64(&p,(int_val)addr)); + copy_to(ctx, dst, pmem(&p,tmp->id,0)); +# else + copy_to(ctx, dst, paddr(&p,addr)); +# endif + } + break; + case OSetGlobal: + { + void *addr = m->globals_data + m->globals_indexes[o->p1]; +# ifdef HL_64 + preg *tmp = alloc_reg(ctx, RCPU); + op64(ctx, MOV, tmp, pconst64(&p,(int_val)addr)); + copy_from(ctx, pmem(&p,tmp->id,0), ra); +# else + copy_from(ctx, paddr(&p,addr), ra); +# endif + } + break; + case OCall3: + { + int args[3] = { o->p3, o->extra[0], o->extra[1] }; + op_call_fun(ctx, dst, o->p2, 3, args); + } + break; + case OCall4: + { + int args[4] = { o->p3, o->extra[0], o->extra[1], o->extra[2] }; + op_call_fun(ctx, dst, o->p2, 4, args); + } + break; + case OCallN: + op_call_fun(ctx, dst, o->p2, o->p3, o->extra); + break; + case OCall0: + op_call_fun(ctx, dst, o->p2, 0, NULL); + break; + case OCall1: + op_call_fun(ctx, dst, o->p2, 1, &o->p3); + break; + case OCall2: + { + int args[2] = { o->p3, (int)(int_val)o->extra }; + op_call_fun(ctx, dst, o->p2, 2, args); + } + break; + case OSub: + case OAdd: + case OMul: + case OSDiv: + case OUDiv: + case OShl: + case OSShr: + case OUShr: + case OAnd: + case OOr: + case OXor: + case OSMod: + case OUMod: + op_binop(ctx, dst, ra, rb, o->op); + break; + case ONeg: + { + if( IS_FLOAT(ra) ) { + preg *pa = alloc_reg(ctx,RFPU); + preg *pb = alloc_fpu(ctx,ra,true); + op64(ctx,XORPD,pa,pa); + op64(ctx,ra->t->kind == HF32 ? 
SUBSS : SUBSD,pa,pb); + store(ctx,dst,pa,true); + } else if( ra->t->kind == HI64 ) { +# ifdef HL_64 + preg *pa = alloc_reg(ctx,RCPU); + preg *pb = alloc_cpu(ctx,ra,true); + op64(ctx,XOR,pa,pa); + op64(ctx,SUB,pa,pb); + store(ctx,dst,pa,true); +# else + error_i64(); +# endif + } else { + preg *pa = alloc_reg(ctx,RCPU); + preg *pb = alloc_cpu(ctx,ra,true); + op32(ctx,XOR,pa,pa); + op32(ctx,SUB,pa,pb); + store(ctx,dst,pa,true); + } + } + break; + case ONot: + { + preg *v = alloc_cpu(ctx,ra,true); + op32(ctx,XOR,v,pconst(&p,1)); + store(ctx,dst,v,true); + } + break; + case OJFalse: + case OJTrue: + case OJNotNull: + case OJNull: + { + preg *r = dst->t->kind == HBOOL ? alloc_cpu8(ctx, dst, true) : alloc_cpu(ctx, dst, true); + op64(ctx, dst->t->kind == HBOOL ? TEST8 : TEST, r, r); + XJump( o->op == OJFalse || o->op == OJNull ? JZero : JNotZero,jump); + register_jump(ctx,jump,(opCount + 1) + o->p2); + } + break; + case OJEq: + case OJNotEq: + case OJSLt: + case OJSGte: + case OJSLte: + case OJSGt: + case OJULt: + case OJUGte: + case OJNotLt: + case OJNotGte: + op_jump(ctx,dst,ra,o,(opCount + 1) + o->p3); + break; + case OJAlways: + jump = do_jump(ctx,o->op,false); + register_jump(ctx,jump,(opCount + 1) + o->p1); + break; + case OToDyn: + if( ra->t->kind == HBOOL ) { + int size = begin_native_call(ctx, 1); + set_native_arg(ctx, fetch(ra)); + call_native(ctx, hl_alloc_dynbool, size); + store(ctx, dst, PEAX, true); + } else { + int_val rt = (int_val)ra->t; + int jskip = 0; + if( hl_is_ptr(ra->t) ) { + int jnz; + preg *a = alloc_cpu(ctx,ra,true); + op64(ctx,TEST,a,a); + XJump_small(JNotZero,jnz); + op64(ctx,XOR,PEAX,PEAX); // will replace the result of alloc_dynamic at jump land + XJump_small(JAlways,jskip); + patch_jump(ctx,jnz); + } + call_native_consts(ctx, hl_alloc_dynamic, &rt, 1); + // copy value to dynamic + if( (IS_FLOAT(ra) || ra->size == 8) && !IS_64 ) { + preg *tmp = REG_AT(RCPU_SCRATCH_REGS[1]); + op64(ctx,MOV,tmp,&ra->stack); + 
op32(ctx,MOV,pmem(&p,Eax,HDYN_VALUE),tmp); + if( ra->t->kind == HF64 ) { + ra->stackPos += 4; + op64(ctx,MOV,tmp,&ra->stack); + op32(ctx,MOV,pmem(&p,Eax,HDYN_VALUE+4),tmp); + ra->stackPos -= 4; + } + } else { + preg *tmp = REG_AT(RCPU_SCRATCH_REGS[1]); + copy_from(ctx,tmp,ra); + op64(ctx,MOV,pmem(&p,Eax,HDYN_VALUE),tmp); + } + if( hl_is_ptr(ra->t) ) patch_jump(ctx,jskip); + store(ctx, dst, PEAX, true); + } + break; + case OToSFloat: + if( ra == dst ) break; + if (ra->t->kind == HI32 || ra->t->kind == HUI16 || ra->t->kind == HUI8) { + preg* r = alloc_cpu(ctx, ra, true); + preg* w = alloc_fpu(ctx, dst, false); + op32(ctx, dst->t->kind == HF64 ? CVTSI2SD : CVTSI2SS, w, r); + store(ctx, dst, w, true); + } else if (ra->t->kind == HI64 ) { + preg* r = alloc_cpu(ctx, ra, true); + preg* w = alloc_fpu(ctx, dst, false); + op64(ctx, dst->t->kind == HF64 ? CVTSI2SD : CVTSI2SS, w, r); + store(ctx, dst, w, true); + } else if( ra->t->kind == HF64 && dst->t->kind == HF32 ) { + preg *r = alloc_fpu(ctx,ra,true); + preg *w = alloc_fpu(ctx,dst,false); + op32(ctx,CVTSD2SS,w,r); + store(ctx, dst, w, true); + } else if( ra->t->kind == HF32 && dst->t->kind == HF64 ) { + preg *r = alloc_fpu(ctx,ra,true); + preg *w = alloc_fpu(ctx,dst,false); + op32(ctx,CVTSS2SD,w,r); + store(ctx, dst, w, true); + } else + ASSERT(0); + break; + case OToUFloat: + { + int size; + size = prepare_call_args(ctx,1,&o->p2,ctx->vregs,0); + call_native(ctx,uint_to_double,size); + store_result(ctx,dst); + } + break; + case OToInt: + if( ra == dst ) break; + if( ra->t->kind == HF64 ) { + preg *r = alloc_fpu(ctx,ra,true); + preg *w = alloc_cpu(ctx,dst,false); + preg *tmp = alloc_reg(ctx,RCPU); + op32(ctx,STMXCSR,pmem(&p,Esp,-4),UNUSED); + op32(ctx,MOV,tmp,&p); + op32(ctx,OR,tmp,pconst(&p,0x6000)); // set round towards 0 + op32(ctx,MOV,pmem(&p,Esp,-8),tmp); + op32(ctx,LDMXCSR,&p,UNUSED); + op32(ctx,CVTSD2SI,w,r); + op32(ctx,LDMXCSR,pmem(&p,Esp,-4),UNUSED); + store(ctx, dst, w, true); + } else if (ra->t->kind == HF32) { 
+ preg *r = alloc_fpu(ctx, ra, true); + preg *w = alloc_cpu(ctx, dst, false); + preg *tmp = alloc_reg(ctx, RCPU); + op32(ctx, STMXCSR, pmem(&p, Esp, -4), UNUSED); + op32(ctx, MOV, tmp, &p); + op32(ctx, OR, tmp, pconst(&p, 0x6000)); // set round towards 0 + op32(ctx, MOV, pmem(&p, Esp, -8), tmp); + op32(ctx, LDMXCSR, &p, UNUSED); + op32(ctx, CVTSS2SI, w, r); + op32(ctx, LDMXCSR, pmem(&p, Esp, -4), UNUSED); + store(ctx, dst, w, true); + } else if( (dst->t->kind == HI64 || dst->t->kind == HGUID) && ra->t->kind == HI32 ) { + if( ra->current != PEAX ) { + op32(ctx, MOV, PEAX, fetch(ra)); + scratch(PEAX); + } +# ifdef HL_64 + op64(ctx, CDQE, UNUSED, UNUSED); // sign-extend Eax into Rax + store(ctx, dst, PEAX, true); +# else + op32(ctx, CDQ, UNUSED, UNUSED); // sign-extend Eax into Eax:Edx + scratch(REG_AT(Edx)); + op32(ctx, MOV, fetch(dst), PEAX); + dst->stackPos += 4; + op32(ctx, MOV, fetch(dst), REG_AT(Edx)); + dst->stackPos -= 4; + } else if( dst->t->kind == HI32 && ra->t->kind == HI64 ) { + error_i64(); +# endif + } else { + preg *r = alloc_cpu(ctx,dst,false); + copy_from(ctx, r, ra); + store(ctx, dst, r, true); + } + break; + case ORet: + op_ret(ctx, dst); + break; + case OIncr: + { + if( IS_FLOAT(dst) ) { + ASSERT(0); + } else { + preg *v = fetch32(ctx,dst); + op32(ctx,INC,v,UNUSED); + if( v->kind != RSTACK ) store(ctx, dst, v, false); + } + } + break; + case ODecr: + { + if( IS_FLOAT(dst) ) { + ASSERT(0); + } else { + preg *v = fetch32(ctx,dst); + op32(ctx,DEC,v,UNUSED); + if( v->kind != RSTACK ) store(ctx, dst, v, false); + } + } + break; + case OFloat: + { + if( m->code->floats[o->p2] == 0 ) { + preg *f = alloc_fpu(ctx,dst,false); + op64(ctx,XORPD,f,f); + } else switch( dst->t->kind ) { + case HF64: + case HF32: +# ifdef HL_64 + op64(ctx,dst->t->kind == HF32 ? CVTSD2SS : MOVSD,alloc_fpu(ctx,dst,false),pcodeaddr(&p,o->p2 * 8)); +# else + op64(ctx,dst->t->kind == HF32 ? 
MOVSS : MOVSD,alloc_fpu(ctx,dst,false),paddr(&p,m->code->floats + o->p2)); +# endif + break; + default: + ASSERT(dst->t->kind); + } + store(ctx,dst,dst->current,false); + } + break; + case OString: + op64(ctx,MOV,alloc_cpu(ctx, dst, false),pconst64(&p,(int_val)hl_get_ustring(m->code,o->p2))); + store(ctx,dst,dst->current,false); + break; + case OBytes: + { + char *b = m->code->version >= 5 ? m->code->bytes + m->code->bytes_pos[o->p2] : m->code->strings[o->p2]; + op64(ctx,MOV,alloc_cpu(ctx,dst,false),pconst64(&p,(int_val)b)); + store(ctx,dst,dst->current,false); + } + break; + case ONull: + { + op64(ctx,XOR,alloc_cpu(ctx, dst, false),alloc_cpu(ctx, dst, false)); + store(ctx,dst,dst->current,false); + } + break; + case ONew: + { + int_val args[] = { (int_val)dst->t }; + void *allocFun; + int nargs = 1; + switch( dst->t->kind ) { + case HOBJ: + case HSTRUCT: + allocFun = hl_alloc_obj; + break; + case HDYNOBJ: + allocFun = hl_alloc_dynobj; + nargs = 0; + break; + case HVIRTUAL: + allocFun = hl_alloc_virtual; + break; + default: + ASSERT(dst->t->kind); + } + call_native_consts(ctx, allocFun, args, nargs); + store(ctx, dst, PEAX, true); + } + break; + case OInstanceClosure: + { + preg *r = alloc_cpu(ctx, rb, true); + jlist *j = (jlist*)hl_malloc(&ctx->galloc,sizeof(jlist)); + int size = begin_native_call(ctx,3); + set_native_arg(ctx,r); + + j->pos = BUF_POS(); + j->target = o->p2; + j->next = ctx->calls; + ctx->calls = j; + + set_native_arg(ctx,pconst64(&p,RESERVE_ADDRESS)); + set_native_arg(ctx,pconst64(&p,(int_val)m->code->functions[m->functions_indexes[o->p2]].type)); + call_native(ctx,hl_alloc_closure_ptr,size); + store(ctx,dst,PEAX,true); + } + break; + case OVirtualClosure: + { + int size, i; + preg *r = alloc_cpu_call(ctx, ra); + hl_type *t = NULL; + hl_type *ot = ra->t; + while( t == NULL ) { + for(i=0;iobj->nproto;i++) { + hl_obj_proto *pp = ot->obj->proto + i; + if( pp->pindex == o->p3 ) { + t = m->code->functions[m->functions_indexes[pp->findex]].type; + 
break; + } + } + ot = ot->obj->super; + } + size = begin_native_call(ctx,3); + set_native_arg(ctx,r); + // read r->type->vobj_proto[i] for function address + op64(ctx,MOV,r,pmem(&p,r->id,0)); + op64(ctx,MOV,r,pmem(&p,r->id,HL_WSIZE*2)); + op64(ctx,MOV,r,pmem(&p,r->id,HL_WSIZE*o->p3)); + set_native_arg(ctx,r); + op64(ctx,MOV,r,pconst64(&p,(int_val)t)); + set_native_arg(ctx,r); + call_native(ctx,hl_alloc_closure_ptr,size); + store(ctx,dst,PEAX,true); + } + break; + case OCallClosure: + if( ra->t->kind == HDYN ) { + // ASM for { + // vdynamic *args[] = {args}; + // vdynamic *ret = hl_dyn_call(closure,args,nargs); + // dst = hl_dyncast(ret,t_dynamic,t_dst); + // } + int offset = o->p3 * HL_WSIZE; + preg *r = alloc_reg(ctx, RCPU_CALL); + if( offset & 15 ) offset += 16 - (offset & 15); + op64(ctx,SUB,PESP,pconst(&p,offset)); + op64(ctx,MOV,r,PESP); + for(i=0;ip3;i++) { + vreg *a = R(o->extra[i]); + if( !hl_is_dynamic(a->t) ) ASSERT(0); + preg *v = alloc_cpu(ctx,a,true); + op64(ctx,MOV,pmem(&p,r->id,i * HL_WSIZE),v); + RUNLOCK(v); + } +# ifdef HL_64 + int size = begin_native_call(ctx, 3) + offset; + set_native_arg(ctx, pconst(&p,o->p3)); + set_native_arg(ctx, r); + set_native_arg(ctx, fetch(ra)); +# else + int size = pad_before_call(ctx,HL_WSIZE*2 + sizeof(int) + offset); + op64(ctx,PUSH,pconst(&p,o->p3),UNUSED); + op64(ctx,PUSH,r,UNUSED); + op64(ctx,PUSH,alloc_cpu(ctx,ra,true),UNUSED); +# endif + call_native(ctx,hl_dyn_call,size); + if( dst->t->kind != HVOID ) { + store(ctx,dst,PEAX,true); + make_dyn_cast(ctx,dst,dst); + } + } else { + int jhasvalue, jend, size; + // ASM for if( c->hasValue ) c->fun(value,args) else c->fun(args) + preg *r = alloc_cpu(ctx,ra,true); + preg *tmp = alloc_reg(ctx, RCPU); + op32(ctx,MOV,tmp,pmem(&p,r->id,HL_WSIZE*2)); + op32(ctx,TEST,tmp,tmp); + scratch(tmp); + XJump_small(JNotZero,jhasvalue); + save_regs(ctx); + size = prepare_call_args(ctx,o->p3,o->extra,ctx->vregs,0); + preg *rr = r; + if( rr->holds != ra ) rr = alloc_cpu(ctx, ra, true); + 
op_call(ctx, pmem(&p,rr->id,HL_WSIZE), size); + XJump_small(JAlways,jend); + patch_jump(ctx,jhasvalue); + restore_regs(ctx); +# ifdef HL_64 + { + int regids[64]; + preg *pc = REG_AT(CALL_REGS[0]); + vreg *sc = R(f->nregs); // scratch register that we temporary rebind + if( o->p3 >= 63 ) jit_error("assert"); + memcpy(regids + 1, o->extra, o->p3 * sizeof(int)); + regids[0] = f->nregs; + sc->size = HL_WSIZE; + sc->t = &hlt_dyn; + op64(ctx, MOV, pc, pmem(&p,r->id,HL_WSIZE*3)); + scratch(pc); + sc->current = pc; + pc->holds = sc; + size = prepare_call_args(ctx,o->p3 + 1,regids,ctx->vregs,0); + if( r->holds != ra ) r = alloc_cpu(ctx, ra, true); + } +# else + size = prepare_call_args(ctx,o->p3,o->extra,ctx->vregs,HL_WSIZE); + if( r->holds != ra ) r = alloc_cpu(ctx, ra, true); + op64(ctx, PUSH,pmem(&p,r->id,HL_WSIZE*3),UNUSED); // push closure value +# endif + op_call(ctx, pmem(&p,r->id,HL_WSIZE), size); + discard_regs(ctx,false); + patch_jump(ctx,jend); + store_result(ctx, dst); + } + break; + case OStaticClosure: + { + vclosure *c = alloc_static_closure(ctx,o->p2); + preg *r = alloc_reg(ctx, RCPU); + op64(ctx, MOV, r, pconst64(&p,(int_val)c)); + store(ctx,dst,r,true); + } + break; + case OField: + { +# ifndef HL_64 + if( dst->t->kind == HI64 ) { + error_i64(); + break; + } +# endif + switch( ra->t->kind ) { + case HOBJ: + case HSTRUCT: + { + hl_runtime_obj *rt = hl_get_obj_rt(ra->t); + preg *rr = alloc_cpu(ctx,ra, true); + if( dst->t->kind == HSTRUCT ) { + hl_type *ft = hl_obj_field_fetch(ra->t,o->p3)->t; + if( ft->kind == HPACKED ) { + preg *r = alloc_reg(ctx,RCPU); + op64(ctx,LEA,r,pmem(&p,(CpuReg)rr->id,rt->fields_indexes[o->p3])); + store(ctx,dst,r,true); + break; + } + } + copy_to(ctx,dst,pmem(&p, (CpuReg)rr->id, rt->fields_indexes[o->p3])); + } + break; + case HVIRTUAL: + // ASM for --> if( hl_vfields(o)[f] ) r = *hl_vfields(o)[f]; else r = hl_dyn_get(o,hash(field),vt) + { + int jhasfield, jend, size; + bool need_type = !(IS_FLOAT(dst) || dst->t->kind == HI64); + 
preg *v = alloc_cpu_call(ctx,ra); + preg *r = alloc_reg(ctx,RCPU); + op64(ctx,MOV,r,pmem(&p,v->id,sizeof(vvirtual)+HL_WSIZE*o->p3)); + op64(ctx,TEST,r,r); + XJump_small(JNotZero,jhasfield); + size = begin_native_call(ctx, need_type ? 3 : 2); + if( need_type ) set_native_arg(ctx,pconst64(&p,(int_val)dst->t)); + set_native_arg(ctx,pconst64(&p,(int_val)ra->t->virt->fields[o->p3].hashed_name)); + set_native_arg(ctx,v); + call_native(ctx,get_dynget(dst->t),size); + store_result(ctx,dst); + XJump_small(JAlways,jend); + patch_jump(ctx,jhasfield); + copy_to(ctx, dst, pmem(&p,(CpuReg)r->id,0)); + patch_jump(ctx,jend); + scratch(dst->current); + } + break; + default: + ASSERT(ra->t->kind); + break; + } + } + break; + case OSetField: + { + switch( dst->t->kind ) { + case HOBJ: + case HSTRUCT: + { + hl_runtime_obj *rt = hl_get_obj_rt(dst->t); + preg *rr = alloc_cpu(ctx, dst, true); + if( rb->t->kind == HSTRUCT ) { + hl_type *ft = hl_obj_field_fetch(dst->t,o->p2)->t; + if( ft->kind == HPACKED ) { + hl_runtime_obj *frt = hl_get_obj_rt(ft->tparam); + preg *prb = alloc_cpu(ctx, rb, true); + preg *tmp = alloc_reg(ctx, RCPU_CALL); + int offset = 0; + while( offset < frt->size ) { + int remain = frt->size - offset; + int copy_size = remain >= HL_WSIZE ? HL_WSIZE : (remain >= 4 ? 4 : (remain >= 2 ? 
2 : 1)); + copy(ctx, tmp, pmem(&p, (CpuReg)prb->id, offset), copy_size); + copy(ctx, pmem(&p, (CpuReg)rr->id, rt->fields_indexes[o->p2]+offset), tmp, copy_size); + offset += copy_size; + } + break; + } + } + copy_from(ctx, pmem(&p, (CpuReg)rr->id, rt->fields_indexes[o->p2]), rb); + } + break; + case HVIRTUAL: + // ASM for --> if( hl_vfields(o)[f] ) *hl_vfields(o)[f] = v; else hl_dyn_set(o,hash(field),vt,v) + { + int jhasfield, jend; + preg *obj = alloc_cpu_call(ctx,dst); + preg *r = alloc_reg(ctx,RCPU); + op64(ctx,MOV,r,pmem(&p,obj->id,sizeof(vvirtual)+HL_WSIZE*o->p2)); + op64(ctx,TEST,r,r); + XJump_small(JNotZero,jhasfield); +# ifdef HL_64 + switch( rb->t->kind ) { + case HF64: + case HF32: + size = begin_native_call(ctx,3); + set_native_arg_fpu(ctx, fetch(rb), rb->t->kind == HF32); + break; + case HI64: + case HGUID: + size = begin_native_call(ctx,3); + set_native_arg(ctx, fetch(rb)); + break; + default: + size = begin_native_call(ctx, 4); + set_native_arg(ctx, fetch(rb)); + set_native_arg(ctx, pconst64(&p,(int_val)rb->t)); + break; + } + set_native_arg(ctx,pconst(&p,dst->t->virt->fields[o->p2].hashed_name)); + set_native_arg(ctx,obj); +# else + switch( rb->t->kind ) { + case HF64: + case HI64: + case HGUID: + size = pad_before_call(ctx,HL_WSIZE*2 + sizeof(double)); + push_reg(ctx,rb); + break; + case HF32: + size = pad_before_call(ctx,HL_WSIZE*2 + sizeof(float)); + push_reg(ctx,rb); + break; + default: + size = pad_before_call(ctx,HL_WSIZE*4); + op64(ctx,PUSH,fetch32(ctx,rb),UNUSED); + op64(ctx,MOV,r,pconst64(&p,(int_val)rb->t)); + op64(ctx,PUSH,r,UNUSED); + break; + } + op32(ctx,MOV,r,pconst(&p,dst->t->virt->fields[o->p2].hashed_name)); + op64(ctx,PUSH,r,UNUSED); + op64(ctx,PUSH,obj,UNUSED); +# endif + call_native(ctx,get_dynset(rb->t),size); + XJump_small(JAlways,jend); + patch_jump(ctx,jhasfield); + copy_from(ctx, pmem(&p,(CpuReg)r->id,0), rb); + patch_jump(ctx,jend); + scratch(rb->current); + } + break; + default: + ASSERT(dst->t->kind); + break; + } + } + 
break; + case OGetThis: + { + vreg *r = R(0); + hl_runtime_obj *rt = hl_get_obj_rt(r->t); + preg *rr = alloc_cpu(ctx,r, true); + if( dst->t->kind == HSTRUCT ) { + hl_type *ft = hl_obj_field_fetch(r->t,o->p2)->t; + if( ft->kind == HPACKED ) { + preg *r = alloc_reg(ctx,RCPU); + op64(ctx,LEA,r,pmem(&p,(CpuReg)rr->id,rt->fields_indexes[o->p2])); + store(ctx,dst,r,true); + break; + } + } + copy_to(ctx,dst,pmem(&p, (CpuReg)rr->id, rt->fields_indexes[o->p2])); + } + break; + case OSetThis: + { + vreg *r = R(0); + hl_runtime_obj *rt = hl_get_obj_rt(r->t); + preg *rr = alloc_cpu(ctx, r, true); + if( ra->t->kind == HSTRUCT ) { + hl_type *ft = hl_obj_field_fetch(r->t,o->p1)->t; + if( ft->kind == HPACKED ) { + hl_runtime_obj *frt = hl_get_obj_rt(ft->tparam); + preg *pra = alloc_cpu(ctx, ra, true); + preg *tmp = alloc_reg(ctx, RCPU_CALL); + int offset = 0; + while( offset < frt->size ) { + int remain = frt->size - offset; + int copy_size = remain >= HL_WSIZE ? HL_WSIZE : (remain >= 4 ? 4 : (remain >= 2 ? 
2 : 1)); + copy(ctx, tmp, pmem(&p, (CpuReg)pra->id, offset), copy_size); + copy(ctx, pmem(&p, (CpuReg)rr->id, rt->fields_indexes[o->p1]+offset), tmp, copy_size); + offset += copy_size; + } + break; + } + } + copy_from(ctx, pmem(&p, (CpuReg)rr->id, rt->fields_indexes[o->p1]), ra); + } + break; + case OCallThis: + { + int nargs = o->p3 + 1; + int *args = (int*)hl_malloc(&ctx->falloc,sizeof(int) * nargs); + int size; + preg *r = alloc_cpu(ctx, R(0), true); + preg *tmp; + tmp = alloc_reg(ctx, RCPU_CALL); + op64(ctx,MOV,tmp,pmem(&p,r->id,0)); // read type + op64(ctx,MOV,tmp,pmem(&p,tmp->id,HL_WSIZE*2)); // read proto + args[0] = 0; + for(i=1;iextra[i-1]; + size = prepare_call_args(ctx,nargs,args,ctx->vregs,0); + op_call(ctx,pmem(&p,tmp->id,o->p2*HL_WSIZE),size); + discard_regs(ctx, false); + store_result(ctx, dst); + } + break; + case OCallMethod: + switch( R(o->extra[0])->t->kind ) { + case HOBJ: { + int size; + preg *r = alloc_cpu(ctx, R(o->extra[0]), true); + preg *tmp; + tmp = alloc_reg(ctx, RCPU_CALL); + op64(ctx,MOV,tmp,pmem(&p,r->id,0)); // read type + op64(ctx,MOV,tmp,pmem(&p,tmp->id,HL_WSIZE*2)); // read proto + size = prepare_call_args(ctx,o->p3,o->extra,ctx->vregs,0); + op_call(ctx,pmem(&p,tmp->id,o->p2*HL_WSIZE),size); + discard_regs(ctx, false); + store_result(ctx, dst); + break; + } + case HVIRTUAL: + // ASM for --> if( hl_vfields(o)[f] ) dst = *hl_vfields(o)[f](o->value,args...); else dst = hl_dyn_call_obj(o->value,field,args,&ret) + { + int size; + int paramsSize; + int jhasfield, jend; + bool need_dyn; + bool obj_in_args = false; + vreg *obj = R(o->extra[0]); + preg *v = alloc_cpu_call(ctx,obj); + preg *r = alloc_reg(ctx,RCPU_CALL); + op64(ctx,MOV,r,pmem(&p,v->id,sizeof(vvirtual)+HL_WSIZE*o->p2)); + op64(ctx,TEST,r,r); + save_regs(ctx); + + if( o->p3 < 6 ) { + XJump_small(JNotZero,jhasfield); + } else { + XJump(JNotZero,jhasfield); + } + + need_dyn = !hl_is_ptr(dst->t) && dst->t->kind != HVOID; + paramsSize = (o->p3 - 1) * HL_WSIZE; + if( need_dyn ) 
paramsSize += sizeof(vdynamic); + if( paramsSize & 15 ) paramsSize += 16 - (paramsSize&15); + op64(ctx,SUB,PESP,pconst(&p,paramsSize)); + op64(ctx,MOV,r,PESP); + + for(i=0;ip3-1;i++) { + vreg *a = R(o->extra[i+1]); + if( hl_is_ptr(a->t) ) { + op64(ctx,MOV,pmem(&p,r->id,i*HL_WSIZE),alloc_cpu(ctx,a,true)); + if( a->current != v ) { + RUNLOCK(a->current); + } else + obj_in_args = true; + } else { + preg *r2 = alloc_reg(ctx,RCPU); + op64(ctx,LEA,r2,&a->stack); + op64(ctx,MOV,pmem(&p,r->id,i*HL_WSIZE),r2); + if( r2 != v ) RUNLOCK(r2); + } + } + + jit_buf(ctx); + + if( !need_dyn ) { + size = begin_native_call(ctx, 5); + set_native_arg(ctx, pconst(&p,0)); + } else { + preg *rtmp = alloc_reg(ctx,RCPU); + op64(ctx,LEA,rtmp,pmem(&p,Esp,paramsSize - sizeof(vdynamic))); + size = begin_native_call(ctx, 5); + set_native_arg(ctx,rtmp); + if( !IS_64 ) RUNLOCK(rtmp); + } + set_native_arg(ctx,r); + set_native_arg(ctx,pconst(&p,obj->t->virt->fields[o->p2].hashed_name)); // fid + set_native_arg(ctx,pconst64(&p,(int_val)obj->t->virt->fields[o->p2].t)); // ftype + set_native_arg(ctx,pmem(&p,v->id,HL_WSIZE)); // o->value + call_native(ctx,hl_dyn_call_obj,size + paramsSize); + if( need_dyn ) { + preg *r = IS_FLOAT(dst) ? 
REG_AT(XMM(0)) : PEAX; + copy(ctx,r,pmem(&p,Esp,HDYN_VALUE - (int)sizeof(vdynamic)),dst->size); + store(ctx, dst, r, false); + } else + store(ctx, dst, PEAX, false); + + XJump_small(JAlways,jend); + patch_jump(ctx,jhasfield); + restore_regs(ctx); + + if( !obj_in_args ) { + // o = o->value hack + if( v->holds ) v->holds->current = NULL; + obj->current = v; + v->holds = obj; + op64(ctx,MOV,v,pmem(&p,v->id,HL_WSIZE)); + size = prepare_call_args(ctx,o->p3,o->extra,ctx->vregs,0); + } else { + // keep o->value in R(f->nregs) + int regids[64]; + preg *pc = alloc_reg(ctx,RCPU_CALL); + vreg *sc = R(f->nregs); // scratch register that we temporary rebind + if( o->p3 >= 63 ) jit_error("assert"); + memcpy(regids, o->extra, o->p3 * sizeof(int)); + regids[0] = f->nregs; + sc->size = HL_WSIZE; + sc->t = &hlt_dyn; + op64(ctx, MOV, pc, pmem(&p,v->id,HL_WSIZE)); + scratch(pc); + sc->current = pc; + pc->holds = sc; + size = prepare_call_args(ctx,o->p3,regids,ctx->vregs,0); + } + + op_call(ctx,r,size); + discard_regs(ctx, false); + store_result(ctx, dst); + patch_jump(ctx,jend); + } + break; + default: + ASSERT(0); + break; + } + break; + case ORethrow: + { + int size = prepare_call_args(ctx,1,&o->p1,ctx->vregs,0); + call_native(ctx,hl_rethrow,size); + } + break; + case OThrow: + { + int size = prepare_call_args(ctx,1,&o->p1,ctx->vregs,0); + call_native(ctx,hl_throw,size); + } + break; + case OLabel: + // NOP for now + discard_regs(ctx,false); + break; + case OGetI8: + case OGetI16: + { + preg *base = alloc_cpu(ctx, ra, true); + preg *offset = alloc_cpu64(ctx, rb, true); + preg *r = alloc_reg(ctx,o->op == OGetI8 ? RCPU_8BITS : RCPU); + op64(ctx,XOR,r,r); + op32(ctx, o->op == OGetI8 ? 
MOV8 : MOV16,r,pmem2(&p,base->id,offset->id,1,0)); + store(ctx, dst, r, true); + } + break; + case OGetMem: + { + #ifndef HL_64 + if (dst->t->kind == HI64) { + error_i64(); + } + #endif + preg *base = alloc_cpu(ctx, ra, true); + preg *offset = alloc_cpu64(ctx, rb, true); + store(ctx, dst, pmem2(&p,base->id,offset->id,1,0), false); + } + break; + case OSetI8: + { + preg *base = alloc_cpu(ctx, dst, true); + preg *offset = alloc_cpu64(ctx, ra, true); + preg *value = alloc_cpu8(ctx, rb, true); + op32(ctx,MOV8,pmem2(&p,base->id,offset->id,1,0),value); + } + break; + case OSetI16: + { + preg *base = alloc_cpu(ctx, dst, true); + preg *offset = alloc_cpu64(ctx, ra, true); + preg *value = alloc_cpu(ctx, rb, true); + op32(ctx,MOV16,pmem2(&p,base->id,offset->id,1,0),value); + } + break; + case OSetMem: + { + preg *base = alloc_cpu(ctx, dst, true); + preg *offset = alloc_cpu64(ctx, ra, true); + preg *value; + switch( rb->t->kind ) { + case HI32: + value = alloc_cpu(ctx, rb, true); + op32(ctx,MOV,pmem2(&p,base->id,offset->id,1,0),value); + break; + case HF32: + value = alloc_fpu(ctx, rb, true); + op32(ctx,MOVSS,pmem2(&p,base->id,offset->id,1,0),value); + break; + case HF64: + value = alloc_fpu(ctx, rb, true); + op32(ctx,MOVSD,pmem2(&p,base->id,offset->id,1,0),value); + break; + case HI64: + case HGUID: + value = alloc_cpu(ctx, rb, true); + op64(ctx,MOV,pmem2(&p,base->id,offset->id,1,0),value); + break; + default: + ASSERT(rb->t->kind); + break; + } + } + break; + case OType: + { + op64(ctx,MOV,alloc_cpu(ctx, dst, false),pconst64(&p,(int_val)(m->code->types + o->p2))); + store(ctx,dst,dst->current,false); + } + break; + case OGetType: + { + int jnext, jend; + preg *r = alloc_cpu(ctx, ra, true); + preg *tmp = alloc_reg(ctx, RCPU); + op64(ctx,TEST,r,r); + XJump_small(JNotZero,jnext); + op64(ctx,MOV, tmp, pconst64(&p,(int_val)&hlt_void)); + XJump_small(JAlways,jend); + patch_jump(ctx,jnext); + op64(ctx, MOV, tmp, pmem(&p,r->id,0)); + patch_jump(ctx,jend); + store(ctx,dst,tmp,true); 
+ } + break; + case OGetArray: + { + preg *rdst = IS_FLOAT(dst) ? alloc_fpu(ctx,dst,false) : alloc_cpu(ctx,dst,false); + if( ra->t->kind == HABSTRACT ) { + int osize; + bool isRead = dst->t->kind != HOBJ && dst->t->kind != HSTRUCT; + if( isRead ) + osize = sizeof(void*); + else { + hl_runtime_obj *rt = hl_get_obj_rt(dst->t); + osize = rt->size; + } + preg *idx = alloc_cpu64(ctx, rb, true); + op64(ctx, IMUL, idx, pconst(&p,osize)); + op64(ctx, isRead?MOV:LEA, rdst, pmem2(&p,alloc_cpu(ctx,ra, true)->id,idx->id,1,0)); + store(ctx,dst,dst->current,false); + scratch(idx); + } else { + copy(ctx, rdst, pmem2(&p,alloc_cpu(ctx,ra,true)->id,alloc_cpu64(ctx,rb,true)->id,hl_type_size(dst->t),sizeof(varray)), dst->size); + store(ctx,dst,dst->current,false); + } + } + break; + case OSetArray: + { + if( dst->t->kind == HABSTRACT ) { + int osize; + bool isWrite = rb->t->kind != HOBJ && rb->t->kind != HSTRUCT; + if( isWrite ) { + osize = sizeof(void*); + } else { + hl_runtime_obj *rt = hl_get_obj_rt(rb->t); + osize = rt->size; + } + preg *pdst = alloc_cpu(ctx,dst,true); + preg *pra = alloc_cpu64(ctx,ra,true); + op64(ctx, IMUL, pra, pconst(&p,osize)); + op64(ctx, ADD, pdst, pra); + scratch(pra); + preg *prb = alloc_cpu(ctx,rb,true); + preg *tmp = alloc_reg(ctx, RCPU_CALL); + int offset = 0; + while( offset < osize ) { + int remain = osize - offset; + int copy_size = remain >= HL_WSIZE ? HL_WSIZE : (remain >= 4 ? 4 : (remain >= 2 ? 2 : 1)); + copy(ctx, tmp, pmem(&p, prb->id, offset), copy_size); + copy(ctx, pmem(&p, pdst->id, offset), tmp, copy_size); + offset += copy_size; + } + scratch(pdst); + } else { + preg *rrb = IS_FLOAT(rb) ? alloc_fpu(ctx,rb,true) : alloc_cpu(ctx,rb,true); + copy(ctx, pmem2(&p,alloc_cpu(ctx,dst,true)->id,alloc_cpu64(ctx,ra,true)->id,hl_type_size(rb->t),sizeof(varray)), rrb, rb->size); + } + } + break; + case OArraySize: + { + op32(ctx,MOV,alloc_cpu(ctx,dst,false),pmem(&p,alloc_cpu(ctx,ra,true)->id,ra->t->kind == HABSTRACT ? 
HL_WSIZE + 4 : HL_WSIZE*2)); + store(ctx,dst,dst->current,false); + } + break; + case ORef: + { + scratch(ra->current); + op64(ctx,MOV,alloc_cpu(ctx,dst,false),REG_AT(Ebp)); + if( ra->stackPos < 0 ) + op64(ctx,SUB,dst->current,pconst(&p,-ra->stackPos)); + else + op64(ctx,ADD,dst->current,pconst(&p,ra->stackPos)); + store(ctx,dst,dst->current,false); + } + break; + case OUnref: + copy_to(ctx,dst,pmem(&p,alloc_cpu(ctx,ra,true)->id,0)); + break; + case OSetref: + copy_from(ctx,pmem(&p,alloc_cpu(ctx,dst,true)->id,0),ra); + break; + case ORefData: + switch( ra->t->kind ) { + case HARRAY: + { + preg *r = fetch(ra); + preg *d = alloc_cpu(ctx,dst,false); + op64(ctx,MOV,d,r); + op64(ctx,ADD,d,pconst(&p,sizeof(varray))); + store(ctx,dst,dst->current,false); + } + break; + default: + ASSERT(ra->t->kind); + } + break; + case ORefOffset: + { + preg *d = alloc_cpu(ctx,rb,true); + preg *r2 = alloc_cpu(ctx,dst,false); + preg *r = fetch(ra); + int size = hl_type_size(dst->t->tparam); + op64(ctx,MOV,r2,r); + switch( size ) { + case 1: + break; + case 2: + op64(ctx,SHL,d,pconst(&p,1)); + break; + case 4: + op64(ctx,SHL,d,pconst(&p,2)); + break; + case 8: + op64(ctx,SHL,d,pconst(&p,3)); + break; + default: + op64(ctx,IMUL,d,pconst(&p,size)); + break; + } + op64(ctx,ADD,r2,d); + scratch(d); + store(ctx,dst,dst->current,false); + } + break; + case OToVirtual: + { +# ifdef HL_64 + int size = pad_before_call(ctx, 0); + op64(ctx,MOV,REG_AT(CALL_REGS[1]),fetch(ra)); + op64(ctx,MOV,REG_AT(CALL_REGS[0]),pconst64(&p,(int_val)dst->t)); +# else + int size = pad_before_call(ctx, HL_WSIZE*2); + op32(ctx,PUSH,fetch(ra),UNUSED); + op32(ctx,PUSH,pconst(&p,(int)(int_val)dst->t),UNUSED); +# endif + if( ra->t->kind == HOBJ ) hl_get_obj_rt(ra->t); // ensure it's initialized + call_native(ctx,hl_to_virtual,size); + store(ctx,dst,PEAX,true); + } + break; + case OMakeEnum: + { + hl_enum_construct *c = &dst->t->tenum->constructs[o->p2]; + int_val args[] = { (int_val)dst->t, o->p2 }; + int i; + 
call_native_consts(ctx, hl_alloc_enum, args, 2); + RLOCK(PEAX); + for(i=0;inparams;i++) { + preg *r = fetch(R(o->extra[i])); + copy(ctx, pmem(&p,Eax,c->offsets[i]),r, R(o->extra[i])->size); + RUNLOCK(fetch(R(o->extra[i]))); + if ((i & 15) == 0) jit_buf(ctx); + } + store(ctx, dst, PEAX, true); + } + break; + case OEnumAlloc: + { + int_val args[] = { (int_val)dst->t, o->p2 }; + call_native_consts(ctx, hl_alloc_enum, args, 2); + store(ctx, dst, PEAX, true); + } + break; + case OEnumField: + { + hl_enum_construct *c = &ra->t->tenum->constructs[o->p3]; + preg *r = alloc_cpu(ctx,ra,true); + copy_to(ctx,dst,pmem(&p,r->id,c->offsets[(int)(int_val)o->extra])); + } + break; + case OSetEnumField: + { + hl_enum_construct *c = &dst->t->tenum->constructs[0]; + preg *r = alloc_cpu(ctx,dst,true); + switch( rb->t->kind ) { + case HF64: + { + preg *d = alloc_fpu(ctx,rb,true); + copy(ctx,pmem(&p,r->id,c->offsets[o->p2]),d,8); + break; + } + default: + copy(ctx,pmem(&p,r->id,c->offsets[o->p2]),alloc_cpu(ctx,rb,true),hl_type_size(c->params[o->p2])); + break; + } + } + break; + case ONullCheck: + { + int jz; + preg *r = alloc_cpu(ctx,dst,true); + op64(ctx,TEST,r,r); + XJump_small(JNotZero,jz); + + hl_opcode *next = f->ops + opCount + 1; + bool null_field_access = false; + int hashed_name = 0; + // skip const and operation between nullcheck and access + while( (next < f->ops + f->nops - 1) && (next->op >= OInt && next->op <= ODecr) ) { + next++; + } + if( (next->op == OField && next->p2 == o->p1) || (next->op == OSetField && next->p1 == o->p1) ) { + int fid = next->op == OField ? 
next->p3 : next->p2; + hl_obj_field *f = NULL; + if( dst->t->kind == HOBJ || dst->t->kind == HSTRUCT ) + f = hl_obj_field_fetch(dst->t, fid); + else if( dst->t->kind == HVIRTUAL ) + f = dst->t->virt->fields + fid; + if( f == NULL ) ASSERT(dst->t->kind); + null_field_access = true; + hashed_name = f->hashed_name; + } else if( (next->op >= OCall1 && next->op <= OCallN) && next->p3 == o->p1 ) { + int fid = next->p2 < 0 ? -1 : ctx->m->functions_indexes[next->p2]; + hl_function *cf = ctx->m->code->functions + fid; + const uchar *name = fun_field_name(cf); + null_field_access = true; + hashed_name = hl_hash_gen(name, true); + } + + if( null_field_access ) { + pad_before_call(ctx, HL_WSIZE); + if( hashed_name >= 0 && hashed_name < 256 ) + op64(ctx,PUSH8,pconst(&p,hashed_name),UNUSED); + else + op32(ctx,PUSH,pconst(&p,hashed_name),UNUSED); + } else { + pad_before_call(ctx, 0); + } + + jlist *j = (jlist*)hl_malloc(&ctx->galloc,sizeof(jlist)); + j->pos = BUF_POS(); + j->target = null_field_access ? 
-3 : -1; + j->next = ctx->calls; + ctx->calls = j; + + op64(ctx,MOV,PEAX,pconst64(&p,RESERVE_ADDRESS)); + op_call(ctx,PEAX,-1); + patch_jump(ctx,jz); + } + break; + case OSafeCast: + make_dyn_cast(ctx, dst, ra); + break; + case ODynGet: + { + int size; +# ifdef HL_64 + if( IS_FLOAT(dst) || dst->t->kind == HI64 ) { + size = begin_native_call(ctx,2); + } else { + size = begin_native_call(ctx,3); + set_native_arg(ctx,pconst64(&p,(int_val)dst->t)); + } + set_native_arg(ctx,pconst64(&p,(int_val)hl_hash_utf8(m->code->strings[o->p3]))); + set_native_arg(ctx,fetch(ra)); +# else + preg *r; + r = alloc_reg(ctx,RCPU); + if( IS_FLOAT(dst) || dst->t->kind == HI64 ) { + size = pad_before_call(ctx,HL_WSIZE*2); + } else { + size = pad_before_call(ctx,HL_WSIZE*3); + op64(ctx,MOV,r,pconst64(&p,(int_val)dst->t)); + op64(ctx,PUSH,r,UNUSED); + } + op64(ctx,MOV,r,pconst64(&p,(int_val)hl_hash_utf8(m->code->strings[o->p3]))); + op64(ctx,PUSH,r,UNUSED); + op64(ctx,PUSH,fetch(ra),UNUSED); +# endif + call_native(ctx,get_dynget(dst->t),size); + store_result(ctx,dst); + } + break; + case ODynSet: + { + int size; +# ifdef HL_64 + switch( rb->t->kind ) { + case HF32: + case HF64: + size = begin_native_call(ctx, 3); + set_native_arg_fpu(ctx,fetch(rb),rb->t->kind == HF32); + set_native_arg(ctx,pconst64(&p,hl_hash_gen(hl_get_ustring(m->code,o->p2),true))); + set_native_arg(ctx,fetch(dst)); + call_native(ctx,get_dynset(rb->t),size); + break; + case HI64: + case HGUID: + size = begin_native_call(ctx, 3); + set_native_arg(ctx,fetch(rb)); + set_native_arg(ctx,pconst64(&p,hl_hash_gen(hl_get_ustring(m->code,o->p2),true))); + set_native_arg(ctx,fetch(dst)); + call_native(ctx,get_dynset(rb->t),size); + break; + default: + size = begin_native_call(ctx,4); + set_native_arg(ctx,fetch(rb)); + set_native_arg(ctx,pconst64(&p,(int_val)rb->t)); + set_native_arg(ctx,pconst64(&p,hl_hash_gen(hl_get_ustring(m->code,o->p2),true))); + set_native_arg(ctx,fetch(dst)); + call_native(ctx,get_dynset(rb->t),size); + break; + 
} +# else + switch( rb->t->kind ) { + case HF32: + size = pad_before_call(ctx, HL_WSIZE*2 + sizeof(float)); + push_reg(ctx,rb); + op32(ctx,PUSH,pconst64(&p,hl_hash_gen(hl_get_ustring(m->code,o->p2),true)),UNUSED); + op32(ctx,PUSH,fetch(dst),UNUSED); + call_native(ctx,get_dynset(rb->t),size); + break; + case HF64: + case HI64: + case HGUID: + size = pad_before_call(ctx, HL_WSIZE*2 + sizeof(double)); + push_reg(ctx,rb); + op32(ctx,PUSH,pconst64(&p,hl_hash_gen(hl_get_ustring(m->code,o->p2),true)),UNUSED); + op32(ctx,PUSH,fetch(dst),UNUSED); + call_native(ctx,get_dynset(rb->t),size); + break; + default: + size = pad_before_call(ctx, HL_WSIZE*4); + op32(ctx,PUSH,fetch32(ctx,rb),UNUSED); + op32(ctx,PUSH,pconst64(&p,(int_val)rb->t),UNUSED); + op32(ctx,PUSH,pconst64(&p,hl_hash_gen(hl_get_ustring(m->code,o->p2),true)),UNUSED); + op32(ctx,PUSH,fetch(dst),UNUSED); + call_native(ctx,get_dynset(rb->t),size); + break; + } +# endif + } + break; + case OTrap: + { + int size, jenter, jtrap; + int offset = 0; + int trap_size = (sizeof(hl_trap_ctx) + 15) & 0xFFF0; + hl_trap_ctx *t = NULL; +# ifndef HL_THREADS + if( tinf == NULL ) tinf = hl_get_thread(); // single thread +# endif + +# ifdef HL_64 + preg *trap = REG_AT(CALL_REGS[0]); +# else + preg *trap = PEAX; +# endif + RLOCK(trap); + + preg *treg = alloc_reg(ctx, RCPU); + if( !tinf ) { + call_native(ctx, hl_get_thread, 0); + op64(ctx,MOV,treg,PEAX); + offset = (int)(int_val)&tinf->trap_current; + } else { + offset = 0; + op64(ctx,MOV,treg,pconst64(&p,(int_val)&tinf->trap_current)); + } + op64(ctx,MOV,trap,pmem(&p,treg->id,offset)); + op64(ctx,SUB,PESP,pconst(&p,trap_size)); + op64(ctx,MOV,pmem(&p,Esp,(int)(int_val)&t->prev),trap); + op64(ctx,MOV,trap,PESP); + op64(ctx,MOV,pmem(&p,treg->id,offset),trap); + + /* + trap E,@catch + catch g + catch g2 + ... 
+ @:catch + + // Before haxe 5 + This is a bit hackshish : we want to detect the type of exception filtered by the catch so we check the following + sequence of HL opcodes: + + trap E,@catch + ... + @catch: + global R, _ + call _, ???(R,E) + + ??? is expected to be hl.BaseType.check + */ + hl_opcode *cat = f->ops + opCount + 1; + hl_opcode *next = f->ops + opCount + 1 + o->p2; + hl_opcode *next2 = f->ops + opCount + 2 + o->p2; + if( cat->op == OCatch || (next->op == OGetGlobal && next2->op == OCall2 && next2->p3 == next->p1 && dst->stack.id == (int)(int_val)next2->extra) ) { + int gindex = cat->op == OCatch ? cat->p1 : next->p2; + hl_type *gt = m->code->globals[gindex]; + while( gt->kind == HOBJ && gt->obj->super ) gt = gt->obj->super; + if( gt->kind == HOBJ && gt->obj->nfields && gt->obj->fields[0].t->kind == HTYPE ) { + void *addr = m->globals_data + m->globals_indexes[gindex]; +# ifdef HL_64 + op64(ctx,MOV,treg,pconst64(&p,(int_val)addr)); + op64(ctx,MOV,treg,pmem(&p,treg->id,0)); +# else + op64(ctx,MOV,treg,paddr(&p,addr)); +# endif + } else + op64(ctx,MOV,treg,pconst(&p,0)); + } else { + op64(ctx,MOV,treg,pconst(&p,0)); + } + op64(ctx,MOV,pmem(&p,Esp,(int)(int_val)&t->tcheck),treg); + + // On Win64 setjmp actually takes two arguments + // the jump buffer and the frame pointer (or the stack pointer if there is no FP) +#if defined(HL_WIN) && defined(HL_64) + size = begin_native_call(ctx, 2); + set_native_arg(ctx, REG_AT(Ebp)); +#else + size = begin_native_call(ctx, 1); +#endif + set_native_arg(ctx,trap); +#ifdef HL_MINGW + call_native(ctx,_setjmp,size); +#else + call_native(ctx,setjmp,size); +#endif + op64(ctx,TEST,PEAX,PEAX); + XJump_small(JZero,jenter); + op64(ctx,ADD,PESP,pconst(&p,trap_size)); + if( !tinf ) { + call_native(ctx, hl_get_thread, 0); + op64(ctx,MOV,PEAX,pmem(&p, Eax, (int)(int_val)&tinf->exc_value)); + } else { + op64(ctx,MOV,PEAX,pconst64(&p,(int_val)&tinf->exc_value)); + op64(ctx,MOV,PEAX,pmem(&p, Eax, 0)); + } + store(ctx,dst,PEAX,false); + + 
jtrap = do_jump(ctx,OJAlways,false); + register_jump(ctx,jtrap,(opCount + 1) + o->p2); + patch_jump(ctx,jenter); + } + break; + case OEndTrap: + { + int trap_size = (sizeof(hl_trap_ctx) + 15) & 0xFFF0; + hl_trap_ctx *tmp = NULL; + preg *addr,*r; + int offset; + if (!tinf) { + call_native(ctx, hl_get_thread, 0); + addr = PEAX; + RLOCK(addr); + offset = (int)(int_val)&tinf->trap_current; + } else { + offset = 0; + addr = alloc_reg(ctx, RCPU); + op64(ctx, MOV, addr, pconst64(&p, (int_val)&tinf->trap_current)); + } + r = alloc_reg(ctx, RCPU); + op64(ctx, MOV, r, pmem(&p,addr->id,offset)); + op64(ctx, MOV, r, pmem(&p,r->id,(int)(int_val)&tmp->prev)); + op64(ctx, MOV, pmem(&p,addr->id, offset), r); +# ifdef HL_WIN + // erase eip (prevent false positive) + { + _JUMP_BUFFER *b = NULL; +# ifdef HL_64 + op64(ctx,MOV,pmem(&p,Esp,(int)(int_val)&(b->Rip)),PEAX); +# else + op64(ctx,MOV,pmem(&p,Esp,(int)&(b->Eip)),PEAX); +# endif + } +# endif + op64(ctx,ADD,PESP,pconst(&p,trap_size)); + } + break; + case OEnumIndex: + { + preg *r = alloc_reg(ctx,RCPU); + op64(ctx,MOV,r,pmem(&p,alloc_cpu(ctx,ra,true)->id,HL_WSIZE)); + store(ctx,dst,r,true); + break; + } + break; + case OSwitch: + { + int jdefault; + int i; + preg *r = alloc_cpu(ctx, dst, true); + preg *r2 = alloc_reg(ctx, RCPU); + op32(ctx, CMP, r, pconst(&p,o->p2)); + XJump(JUGte,jdefault); + // r2 = r * 5 + eip +# ifdef HL_64 + op64(ctx, XOR, r2, r2); +# endif + op32(ctx, MOV, r2, r); + op32(ctx, SHL, r2, pconst(&p,2)); + op32(ctx, ADD, r2, r); +# ifdef HL_64 + preg *tmp = alloc_reg(ctx, RCPU); + op64(ctx, MOV, tmp, pconst64(&p,RESERVE_ADDRESS)); +# else + op64(ctx, ADD, r2, pconst64(&p,RESERVE_ADDRESS)); +# endif + { + jlist *s = (jlist*)hl_malloc(&ctx->galloc, sizeof(jlist)); + s->pos = BUF_POS() - sizeof(void*); + s->next = ctx->switchs; + ctx->switchs = s; + } +# ifdef HL_64 + op64(ctx, ADD, r2, tmp); +# endif + op64(ctx, JMP, r2, UNUSED); + for(i=0;ip2;i++) { + int j = do_jump(ctx,OJAlways,false); + 
register_jump(ctx,j,(opCount + 1) + o->extra[i]); + if( (i & 15) == 0 ) jit_buf(ctx); + } + patch_jump(ctx, jdefault); + } + break; + case OGetTID: + op32(ctx, MOV, alloc_cpu(ctx,dst,false), pmem(&p,alloc_cpu(ctx,ra,true)->id,0)); + store(ctx,dst,dst->current,false); + break; + case OAssert: + { + pad_before_call(ctx, 0); + jlist *j = (jlist*)hl_malloc(&ctx->galloc,sizeof(jlist)); + j->pos = BUF_POS(); + j->target = -2; + j->next = ctx->calls; + ctx->calls = j; + + op64(ctx,MOV,PEAX,pconst64(&p,RESERVE_ADDRESS)); + op_call(ctx,PEAX,-1); + } + break; + case ONop: + break; + case OPrefetch: + { + preg *r = alloc_cpu(ctx, dst, true); + if( o->p2 > 0 ) { + switch( dst->t->kind ) { + case HOBJ: + case HSTRUCT: + { + hl_runtime_obj *rt = hl_get_obj_rt(dst->t); + preg *r2 = alloc_reg(ctx, RCPU); + op64(ctx, LEA, r2, pmem(&p, r->id, rt->fields_indexes[o->p2-1])); + r = r2; + } + break; + default: + ASSERT(dst->t->kind); + break; + } + } + switch( o->p3 ) { + case 0: + op64(ctx, PREFETCHT0, pmem(&p,r->id,0), UNUSED); + break; + case 1: + op64(ctx, PREFETCHT1, pmem(&p,r->id,0), UNUSED); + break; + case 2: + op64(ctx, PREFETCHT2, pmem(&p,r->id,0), UNUSED); + break; + case 3: + op64(ctx, PREFETCHNTA, pmem(&p,r->id,0), UNUSED); + break; + case 4: + op64(ctx, PREFETCHW, pmem(&p,r->id,0), UNUSED); + break; + default: + ASSERT(o->p3); + break; + } + } + break; + case OAsm: + { + switch( o->p1 ) { + case 0: // byte output + B(o->p2); + break; + case 1: // scratch cpu reg + scratch(REG_AT(o->p2)); + break; + case 2: // read vm reg + rb--; + copy(ctx, REG_AT(o->p2), &rb->stack, rb->size); + scratch(REG_AT(o->p2)); + break; + case 3: // write vm reg + rb--; + copy(ctx, &rb->stack, REG_AT(o->p2), rb->size); + scratch(rb->current); + break; + case 4: + if( ctx->totalRegsSize != 0 ) + hl_fatal("Asm naked function should not have local variables"); + if( opCount != 0 ) + hl_fatal("Asm naked function should be on first opcode"); + ctx->buf.b -= BUF_POS() - ctx->functionPos; // reset to our 
function start + break; + default: + ASSERT(o->p1); + break; + } + } + break; + case OCatch: + // Only used by OTrap typing + break; + default: + jit_error(hl_op_name(o->op)); + break; + } + // we are landing at this position, assume we have lost our registers + if( ctx->opsPos[opCount+1] == -1 ) + discard_regs(ctx,true); + ctx->opsPos[opCount+1] = BUF_POS(); + + // write debug infos + size = BUF_POS() - codePos; + if( debug16 && size > 0xFF00 ) { + debug32 = malloc(sizeof(int) * (f->nops + 1)); + for(i=0;icurrentPos;i++) + debug32[i] = debug16[i]; + free(debug16); + debug16 = NULL; + } + if( debug16 ) debug16[ctx->currentPos] = (unsigned short)size; else if( debug32 ) debug32[ctx->currentPos] = size; + + } + // patch jumps + { + jlist *j = ctx->jumps; + while( j ) { + *(int*)(ctx->startBuf + j->pos) = ctx->opsPos[j->target] - (j->pos + 4); + j = j->next; + } + ctx->jumps = NULL; + } + int codeEndPos = BUF_POS(); + // add nops padding + jit_nops(ctx); + // clear regs + for(i=0;iholds = NULL; + r->lock = 0; + } + // save debug infos + if( ctx->debug ) { + int fid = (int)(f - m->code->functions); + ctx->debug[fid].start = codePos; + ctx->debug[fid].offsets = debug32 ? 
(void*)debug32 : (void*)debug16; + ctx->debug[fid].large = debug32 != NULL; + } + // unwind info +#ifdef WIN64_UNWIND_TABLES + int uw_idx = ctx->nunwind++; + ctx->unwind_table[uw_idx].BeginAddress = codePos; + ctx->unwind_table[uw_idx].EndAddress = codeEndPos; + ctx->unwind_table[uw_idx].UnwindData = ctx->unwind_offset; +#endif + // reset tmp allocator + hl_free(&ctx->falloc); + return codePos; +} + +static void *get_wrapper( hl_type *t ) { + return call_jit_hl2c; +} + +void hl_jit_patch_method( void *old_fun, void **new_fun_table ) { + // mov eax, addr + // jmp [eax] + unsigned char *b = (unsigned char*)old_fun; + unsigned long long addr = (unsigned long long)(int_val)new_fun_table; +# ifdef HL_64 + *b++ = 0x48; + *b++ = 0xB8; + *b++ = (unsigned char)addr; + *b++ = (unsigned char)(addr>>8); + *b++ = (unsigned char)(addr>>16); + *b++ = (unsigned char)(addr>>24); + *b++ = (unsigned char)(addr>>32); + *b++ = (unsigned char)(addr>>40); + *b++ = (unsigned char)(addr>>48); + *b++ = (unsigned char)(addr>>56); +# else + *b++ = 0xB8; + *b++ = (unsigned char)addr; + *b++ = (unsigned char)(addr>>8); + *b++ = (unsigned char)(addr>>16); + *b++ = (unsigned char)(addr>>24); +# endif + *b++ = 0xFF; + *b++ = 0x20; +} + +static void missing_closure() { + hl_error("Missing static closure"); +} + +void *hl_jit_code( jit_ctx *ctx, hl_module *m, int *codesize, hl_debug_infos **debug, hl_module *previous ) { + jlist *c; + int size = BUF_POS(); + unsigned char *code; + if( size & 4095 ) size += 4096 - (size&4095); + code = (unsigned char*)hl_alloc_executable_memory(size); + if( code == NULL ) return NULL; + memcpy(code,ctx->startBuf,BUF_POS()); + *codesize = size; + *debug = ctx->debug; + if( !call_jit_c2hl ) { + call_jit_c2hl = code + ctx->c2hl; + call_jit_hl2c = code + ctx->hl2c; + hl_setup.get_wrapper = get_wrapper; + hl_setup.static_call = callback_c2hl; + hl_setup.static_call_ref = true; + } +#ifdef WIN64_UNWIND_TABLES + m->unwind_table = ctx->unwind_table; + 
RtlAddFunctionTable(m->unwind_table, ctx->nunwind, (DWORD64)code); +#endif + if( !ctx->static_function_offset ) { + int i; + ctx->static_function_offset = true; + for(i=0;i<(int)(sizeof(ctx->static_functions)/sizeof(void*));i++) + ctx->static_functions[i] = (void*)(code + (int)(int_val)ctx->static_functions[i]); + } + // patch calls + c = ctx->calls; + while( c ) { + void *fabs; + if( c->target < 0 ) + fabs = ctx->static_functions[-c->target-1]; + else { + fabs = m->functions_ptrs[c->target]; + if( fabs == NULL ) { + // read absolute address from previous module + int old_idx = m->hash->functions_hashes[m->functions_indexes[c->target]]; + if( old_idx < 0 ) + return NULL; + fabs = previous->functions_ptrs[(previous->code->functions + old_idx)->findex]; + } else { + // relative + fabs = (unsigned char*)code + (int)(int_val)fabs; + } + } + if( (code[c->pos]&~3) == (IS_64?0x48:0xB8) || code[c->pos] == 0x68 ) // MOV : absolute | PUSH + *(void**)(code + c->pos + (IS_64?2:1)) = fabs; + else { + int_val delta = (int_val)fabs - (int_val)code - (c->pos + 5); + int rpos = (int)delta; + if( (int_val)rpos != delta ) { + printf("Target code too far too rebase\n"); + return NULL; + } + *(int*)(code + c->pos + 1) = rpos; + } + c = c->next; + } + // patch switchs + c = ctx->switchs; + while( c ) { + *(void**)(code + c->pos) = code + c->pos + (IS_64 ? 
14 : 6); + c = c->next; + } + // patch closures + { + vclosure *c = ctx->closure_list; + while( c ) { + vclosure *next; + int fidx = (int)(int_val)c->fun; + void *fabs = m->functions_ptrs[fidx]; + if( fabs == NULL ) { + // read absolute address from previous module + int old_idx = m->hash->functions_hashes[m->functions_indexes[fidx]]; + if( old_idx < 0 ) + fabs = missing_closure; + else + fabs = previous->functions_ptrs[(previous->code->functions + old_idx)->findex]; + } else { + // relative + fabs = (unsigned char*)code + (int)(int_val)fabs; + } + c->fun = fabs; + next = (vclosure*)c->value; + c->value = NULL; + c = next; + } + } + return code; +} + diff --git a/src/jit_regs.c b/src/jit_regs.c new file mode 100644 index 000000000..50f151f06 --- /dev/null +++ b/src/jit_regs.c @@ -0,0 +1,813 @@ +/* + * Copyright (C)2015-2016 Haxe Foundation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ +#include +#include +#include "data_struct.h" + +#define VAL(k) (ctx->values + (k)) + +//#define REGS_DEBUG + +#ifdef REGS_DEBUG +# define regs_debug jit_debug +#else +# define regs_debug(...) +#endif + +#define INVALID 0x80000000 + +#define VIDX(e) (((e) < 0) ? ctx->jit->value_count + (-(e)-1) : (e)) +#define VAL_REG(e) VAL(VIDX(e)) +#define REG_MODE(m) (IS_FLOAT(m) ? 1 :0) +#define REG_CFG(m) (m ? &ctx->jit->cfg.floats : &ctx->jit->cfg.regs) + +#define EMIT(r,a,b,m) regs_emit(ctx,UNUSED,r,a,b,m,0) +#define BREAK() EMIT(DEBUG_BREAK,UNUSED,UNUSED,0) + +typedef struct { + int id; + int stack_pos; + int last_read; + int tot_reads; + emit_mode mode; + ereg pref_reg; + ereg reg; +} value_info; + +#define S_TYPE values +#define S_NAME(name) values_##name +#define S_VALUE value_info* +#include "data_struct.c" +#define values_add(set,v) values_add_impl(DEF_ALLOC,&(set),v) + +struct _regs_ctx { + jit_ctx *jit; + value_info *values; + values scratch; + int_arr jump_regs; + int_arr pack_movs; + int_arr *blocks_phis; + int max_instrs; + int cur_op; + int emit_pos; + int stack_size; + int stack_offset; + int loop_start; + int loop_end; + einstr *instrs; + ereg *out_write; + int *pos_map; + bool flushed; + bool has_direct_call; + int persists_uses[2]; +}; + +typedef int call_regs[2]; + +static ereg get_call_reg( regs_ctx *ctx, call_regs regs, emit_mode m ) { + ereg r; + int mode = REG_MODE(m); + reg_config *cfg = REG_CFG(mode); + int idx = IS_WINCALL64 ? 0 : mode; + if( regs[idx] < cfg->nargs ) + r = cfg->arg[regs[idx]++]; + else + r = UNUSED; + return r; +} + +static int get_stack_size( regs_ctx *ctx, emit_mode m ) { + int size = hl_emit_mode_sizes[m]; + if( size < HL_WSIZE ) size = HL_WSIZE; + int min = ctx->jit->cfg.stack_arg_size; + if( min && size < min ) size = min; + return size; +} + +static void regs_write_instr( regs_ctx *ctx, einstr *e, ereg out ) { + if( ctx->emit_pos == ctx->max_instrs ) { + int pos = ctx->emit_pos; + int next_size = ctx->max_instrs ? 
(ctx->max_instrs << 1) : 256; + einstr *instrs = (einstr*)malloc(sizeof(einstr) * next_size); + ereg *out = (ereg*)malloc(sizeof(ereg) * next_size); + if( instrs == NULL || out == NULL ) jit_error("Out of memory"); + memcpy(instrs, ctx->instrs, pos * sizeof(einstr)); + memcpy(out, ctx->out_write, pos * sizeof(ereg)); + memset(instrs + pos, 0, (next_size - pos) * sizeof(einstr)); + free(ctx->instrs); + free(ctx->out_write); + ctx->instrs = instrs; + ctx->out_write = out; + ctx->max_instrs = next_size; + } else if( (ctx->emit_pos & 0xFF) == 0 ) + memset(ctx->instrs + ctx->emit_pos, 0, 256 * sizeof(einstr)); + ctx->out_write[ctx->emit_pos] = out; + ctx->instrs[ctx->emit_pos++] = *e; +} + +static void regs_emit( regs_ctx *ctx, ereg out, emit_op op, ereg a, ereg b, emit_mode m, int size_offs ) { + einstr e; + e.header = op; + e.mode = m; + e.a = a; + e.b = b; + e.size_offs = size_offs; + regs_write_instr(ctx, &e, out); +} + +static void regs_emit_mov( regs_ctx *ctx, ereg to, ereg from, emit_mode m ) { + if( to == from ) return; + regs_emit(ctx,to,MOV,from,UNUSED,m,0); +} + +static int regs_alloc_stack( regs_ctx *ctx, int size ) { + ctx->stack_size += size; + ctx->stack_size += jit_pad_size(ctx->stack_size,size); + return -ctx->stack_size; +} + +#define value_str(v) value_to_str(ctx,v) + +static const char *value_to_str( regs_ctx *ctx, value_info *v ) { + static char out[20]; + sprintf(out,"%s:%s", val_str(v->id,v->mode), val_str(v->reg,v->mode)); + return out; +} + +static void spill( regs_ctx *ctx, value_info *v ) { + if( v->stack_pos == INVALID ) v->stack_pos = regs_alloc_stack(ctx, hl_emit_mode_sizes[v->mode]); + v->reg = MK_STACK_REG(v->stack_pos); + values_remove(&ctx->scratch,v); + regs_debug("REG SPILL %s @%X\n",value_str(v),ctx->cur_op); +} + +static bool regs_alloc_reg( regs_ctx *ctx, value_info *v ) { + // lookup available reg + int mode = REG_MODE(v->mode); + reg_config *cfg = REG_CFG(mode); + if( !IS_NULL(v->pref_reg) ) { + bool free = true; + 
for_iter(values,v2,ctx->scratch) { + if( v2->reg == v->pref_reg ) { + free = false; + break; + } + } + if( free ) { + for(int i=0;ipersists_uses[mode];i++) + if( cfg->persist[i] == v->pref_reg ) { + free = false; + break; + } + } + if( free ) { + v->reg = v->pref_reg; + return true; + } + } + value_info *first = NULL; + for(int i=0;inscratchs;i++) { + ereg r = cfg->scratch[i]; + for_iter(values,v2,ctx->scratch) { + if( v2->reg == r ) { + if( first == NULL ) first = v2; + r = UNUSED; + break; + } + } + if( !IS_NULL(r) ) { + v->reg = r; + return true; + } + } + if( ctx->persists_uses[mode] < cfg->npersists ) { + v->reg = cfg->persist[ctx->persists_uses[mode]++]; + return false; + } + // free the oldest scratch reg + if( !first ) jit_assert(); + v->reg = first->reg; + spill(ctx, first); + return true; +} + +static void regs_assign( regs_ctx *ctx, value_info *v ) { + if( v->reg != UNUSED ) jit_assert(); + if( regs_alloc_reg(ctx, v) ) + values_add(ctx->scratch, v); + regs_debug("REG ASSIGN %s @%X-@%X\n",value_str(v),ctx->cur_op,v->last_read); +} + +static void regs_write_live( regs_ctx *ctx, ereg *r ) { + if( IS_NULL(*r) ) jit_assert(); + if( !REG_IS_VAL(*r) ) return; // some are injections of native regs at emit + value_info *v = VAL_REG(*r); + int write = v->id >= 0 ? ctx->jit->values_writes[v->id] : -1; + v->last_read = ctx->loop_end && write < ctx->loop_start ? ctx->loop_end : ctx->cur_op; + v->tot_reads++; +} + +static value_info *regs_current( regs_ctx *ctx, ereg r ) { + for_iter(values,v,ctx->scratch) { + if( v->reg == r ) + return v; + } + return NULL; +} + +static void regs_compute_liveness( regs_ctx *ctx ) { +# define MAX_LOOP_DEPTH 256 + int loop_saves[MAX_LOOP_DEPTH]; + int loop_count = 0; + int write_index = 1; + jit_ctx *jit = ctx->jit; + hl_type *tret = ctx->jit->fun->type->fun->ret; + emit_mode mret = tret->kind == HF32 || tret->kind == HF64 ? 
M_F64 : M_PTR; + ereg ret = REG_CFG(REG_MODE(mret))->ret; + for(int cur_op=0;cur_opinstr_count;cur_op++) { + einstr *e = jit->instrs + cur_op; + value_info *write = NULL; + + while( ctx->loop_end == cur_op && cur_op ) { + ctx->loop_end = loop_saves[--loop_count]; + ctx->loop_start = loop_saves[--loop_count]; + } + + if( write_index < jit->value_count && jit->values_writes[write_index] == cur_op ) + write = VAL(write_index++); + + ctx->cur_op = cur_op; + hl_emit_reg_iter(jit,e,ctx,(void*)regs_write_live); + if( IS_CALL(e->op) ) { + // anticipate register usage in call so we can previlege this assign + ereg *r = hl_emit_get_args(jit->emit, e); + call_regs regs = {0}; + bool needs_push = false; + for(int k=0;knargs;k++) { + ereg arg = r[k]; + value_info *v = REG_IS_VAL(arg) ? VAL_REG(r[k]) : NULL; + ereg r = get_call_reg(ctx, regs, v ? v->mode : M_I32); + if( IS_NULL(r) ) { + needs_push = true; + continue; + } + if( v && IS_NULL(v->pref_reg) ) + v->pref_reg = r; + } + if( !needs_push && e->mode != M_NORET ) ctx->has_direct_call = true; + if( write && IS_NULL(write->pref_reg) ) + write->pref_reg = REG_CFG(REG_MODE(e->mode))->ret; + } else switch( e->op ) { + case RET: + if( e->a ) { + value_info *v = VAL_REG(e->a); + if( v->pref_reg == UNUSED ) v->pref_reg = ret; + } + break; + case BINOP: + switch( e->size_offs ) { + case OSShr: + case OUShr: + case OShl: + if( jit->cfg.req_bit_shifts ) VAL_REG(e->b)->pref_reg = jit->cfg.req_bit_shifts; + break; + case OSDiv: + case OUDiv: + case OSMod: + case OUMod: + if( !IS_FLOAT(e->mode) ) { + if( jit->cfg.req_div_a ) VAL_REG(e->a)->pref_reg = jit->cfg.req_div_a; + if( jit->cfg.req_div_b ) VAL_REG(e->b)->pref_reg = jit->cfg.req_div_b; + } + break; + } + break; + case BLOCK: + { + // are we in loop ? 
+ eblock *bl = jit->blocks + e->size_offs; + int loop_end = -1; + for(int k=0;kpred_count;k++) { + eblock *b2 = jit->blocks + bl->preds[k]; + if( b2->start_pos > bl->start_pos && b2->end_pos >= loop_end ) + loop_end = b2->end_pos - 1; + } + if( loop_end > 0 ) { + loop_saves[loop_count++] = ctx->loop_start; + loop_saves[loop_count++] = ctx->loop_end; + ctx->loop_start = cur_op; + ctx->loop_end = loop_end; + } + } + break; + default: + break; + } + } + if( loop_count != 0 ) jit_assert(); + // compute reverse phis + for(int b=0;bblock_count;b++) { + eblock *bl = jit->blocks + b; + for(int p=0;pphi_count;p++) { + ephi *ph = bl->phis + p; + VAL_REG(ph->value)->mode = ph->mode; + for(int k=0;knvalues;k++) { + ereg v = ph->values[k]; + eblock *b2 = jit->blocks + ph->blocks[k]; + value_info *val = VAL_REG(v); + int_arr *arr = &ctx->blocks_phis[b2 - jit->blocks]; + regs_debug("ADD PHI %s:=%s to #%d@%X\n",val_str(ph->value,ph->mode),val_str(v,ph->mode),(int)(b2 - jit->blocks),b2->end_pos-1); + int_arr_add(*arr,v); + int_arr_add(*arr,ph->value); + int_arr_add(*arr,(bl - b2) == 1); + val->tot_reads++; + if( val->last_read < b2->end_pos ) + val->last_read = b2->end_pos; + } + } + } +} + +static void regs_assign_regs( regs_ctx *ctx ) { + jit_ctx *jit = ctx->jit; + // assign args + call_regs regs = {0}; + int args_count = 0; + for(int i=1;i<=ctx->jit->fun->type->fun->nargs;i++) { + value_info *v = VAL(i); + einstr *e = ctx->jit->instrs + ctx->jit->values_writes[i]; + int size = hl_emit_mode_sizes[e->mode]; + if( size <= 0 && e->mode != M_VOID ) jit_assert(); + ereg r = get_call_reg(ctx,regs,e->mode); + if( !IS_NULL(r) ) { + v->reg = r; + values_add(ctx->scratch,v); + } + if( IS_NULL(r) || IS_WINCALL64 ) { + // use existing stack storage + v->stack_pos = (args_count++ + 2) * HL_WSIZE; + if( IS_NULL(r) ) v->reg = MK_STACK_REG(v->stack_pos); + } + } + // assign registers + int write_index = 1; + for(int cur_op=0;cur_opinstr_count;cur_op++) { + einstr e = jit->instrs[cur_op]; + 
value_info *write = NULL; +# ifdef HL_DEBUG + int eid = (jit->fun->findex << 16) | cur_op; + __ignore(&eid); +# endif + ctx->cur_op = cur_op; + + + if( write_index < jit->value_count && jit->values_writes[write_index] == cur_op ) { + write = VAL(write_index++); + // try to preserve ops in the from A = A op B + if( (e.op == UNOP || e.op == BINOP) && write->pref_reg == UNUSED ) { + value_info *v = VAL_REG(e.a); + if( IS_REG(v->reg) ) write->pref_reg = v->reg; + } + } + + for_iter_back(values,v,ctx->scratch) { + if( v->last_read <= cur_op ) + values_remove(&ctx->scratch,v); + } + + if( IS_CALL(e.op) ) { + ereg *args = hl_emit_get_args(ctx->jit->emit,&e); + call_regs regs = {0}; + bool will_scratch = e.mode != M_NORET; + value_info *vcall = e.op == CALL_REG ? VAL_REG(e.a) : NULL; + if( will_scratch ) { + for_iter_back(values,v2,ctx->scratch) { + if( v2->last_read > cur_op ) + spill(ctx,v2); + } + } + for(int k=0;kmode); + if( !IS_NULL(r) ) { + value_info *cur = regs_current(ctx,r); + if( cur && cur != v ) + spill(ctx,cur); + if( vcall && vcall->reg == r ) + spill(ctx,vcall); + } + } + if( will_scratch ) values_reset(&ctx->scratch); + } + switch( e.op ) { + case BLOCK: + for_iter_back(values,v,ctx->scratch) { + if( v->last_read == cur_op ) + values_remove(&ctx->scratch,v); + } + eblock *bl = jit->blocks + e.size_offs; + for(int k=0;kphi_count;k++) { + ephi *p = bl->phis + k; + value_info *v = VAL_REG(p->value); + for(int n=0;nnvalues;n++) { + value_info *vn = VAL_REG(p->values[n]); + // ignore previously set pref_reg (minimize moves) + if( IS_REG(vn->reg) && !regs_current(ctx,vn->reg) ) { + v->pref_reg = vn->reg; + break; + } + } + regs_assign(ctx, v); + } + break; + case CATCH: + { + for_iter_back(values,v2,ctx->scratch) + spill(ctx,v2); + } + break; + case ALLOC_STACK: + write->reg = MK_STACK_OFFS(regs_alloc_stack(ctx, e.size_offs)); + continue; + case LOAD_ARG: + if( write->reg == UNUSED ) + regs_assign(ctx, write); // assign for stack reg + continue; + case ADDRESS: 
+ { + if( REG_KIND(e.a) == R_CONST ) jit_assert(); + value_info *v = VAL_REG(e.a); + spill(ctx, v); + break; + } + default: + break; + } + if( write ) regs_assign(ctx, write); + } + // assign stack regs + int nvalues = jit->value_count + jit->phi_count; + ctx->stack_offset = (ctx->persists_uses[0] + ctx->persists_uses[1]) * 8; + for(int i=0;ivalues + i; + if( v->reg == UNUSED ) v->reg = MK_STACK_REG(v->stack_pos); + } +} + +static void flush_movs( regs_ctx *ctx, bool cond ) { + int_arr movs = ctx->pack_movs; + while( true ) { + int size = int_arr_count(movs); + if( !size ) break; + bool cycle = true; + for(int k=0;kpack_movs = movs; + int_arr_reset(&ctx->pack_movs); +} + +static void flush_phis( regs_ctx *ctx, eblock *b, bool cond, bool after ) { + if( !b ) return; + jit_ctx *jit = ctx->jit; + int bid = (int)(b - jit->blocks); + int_arr arr = ctx->blocks_phis[bid]; + int idx = 0; + int_arr movs = ctx->pack_movs; + + while( idx < int_arr_count(arr) ) { + ereg a = int_arr_get(arr,idx++); + ereg b = int_arr_get(arr,idx++); + int bcount = int_arr_get(arr,idx++); + if( after != (bcount == 1) ) + continue; + value_info *from = VAL_REG(a); + value_info *to = VAL_REG(b); + if( from->reg == to->reg ) continue; + int size = int_arr_count(movs); + bool dup = false; + for(int k=0;kreg && int_arr_get(movs,k+1) == from->reg ) { + dup = true; + break; + } + } + if( !dup ) { + int_arr_add(movs, to->reg); + int_arr_add(movs, from->reg); + int_arr_add(movs, from->mode); + } + } + ctx->pack_movs = movs; + if( !cond ) + int_arr_free(&ctx->blocks_phis[bid]); + flush_movs(ctx, cond); +} + +static void regs_emit_instrs( regs_ctx *ctx ) { + jit_ctx *jit = ctx->jit; + eblock *cur_block = NULL; + call_regs regs = {0}; + int write_index = 1; + ctx->pos_map[0] = 0; + + int stack_offset = ctx->stack_size; + int push_size = HL_WSIZE * 2 + ctx->stack_offset; // RIP + RBP save + if( jit->cfg.stack_align ) { + int align = (stack_offset + push_size) % jit->cfg.stack_align; + if( align ) 
stack_offset += jit->cfg.stack_align - align; + } + + for(int cur_op=0;cur_opinstr_count;cur_op++) { + einstr e = jit->instrs[cur_op]; + ereg *ret_val = NULL; + int nread; + int instr_stack_offset = 0; + ctx->cur_op = cur_op; + + value_info *vout = NULL; + ereg out = UNUSED; + if( write_index < jit->value_count && jit->values_writes[write_index] == cur_op ) { + vout = VAL(write_index++); + out = vout->reg; + } + + if( IS_CALL(e.op) ) { + ereg *args = hl_emit_get_args(ctx->jit->emit,&e); + call_regs regs = {0}; + int stack_args = 0; + int stack_bits = 0; + for(int k=0;kmode : M_I32; + ereg r = get_call_reg(ctx,regs,mode); + if( IS_NULL(r) ) { + stack_args += get_stack_size(ctx, mode); + stack_bits |= 1 << k; + } else if( !v || r != v->reg ) { + int_arr_add(ctx->pack_movs,r); + int_arr_add(ctx->pack_movs,v ? v->reg : args[k]); + int_arr_add(ctx->pack_movs,mode); + } + } + if( stack_args > 0 ) { + int offset = 0; + if( jit->cfg.stack_align ) { + int align = stack_args % jit->cfg.stack_align; + if( align ) offset = jit->cfg.stack_align - align; + } + if( offset ) + regs_emit(ctx,UNUSED,STACK_OFFS,UNUSED,UNUSED,0,-offset); + for(int k=e.nargs-1;k>=0;k--) { + if( stack_bits & (1 << k) ) { + value_info *v = REG_IS_VAL(args[k]) ? VAL_REG(args[k]) : NULL; + EMIT(PUSH,VAL_REG(args[k])->reg,UNUSED,v && IS_FLOAT(v->mode) ? 
v->mode : M_PTR); + } + } + if( IS_WINCALL64 ) { + regs_emit(ctx,UNUSED,STACK_OFFS,UNUSED,UNUSED,0,-0x20); + offset += 0x20; + } + instr_stack_offset = stack_args+offset; + } + flush_movs(ctx,0); + e.nargs = 0xFF; + if( vout && vout->last_read > cur_op ) + ret_val = ®_CFG(REG_MODE(e.mode))->ret; + else if( e.mode != M_NORET ) { + e.mode = M_VOID; // ignore output + out = UNUSED; + } + if( e.op == CALL_REG ) + e.a = VAL_REG(e.a)->reg; + } else { + ereg **regs = hl_emit_get_regs(&e,&nread); + for(int k=0;kreg; + } + } + switch( e.op ) { + case ALLOC_STACK: + case CATCH: + break; + case BLOCK: + cur_block = jit->blocks + e.size_offs; + break; + case LOAD_ARG: + { + ereg def = get_call_reg(ctx,regs,e.mode); + if( def && out != def ) + regs_emit_mov(ctx,out,def,e.mode); + else + regs_write_instr(ctx, &e, out); + } + break; + case ENTER: + { + EMIT(PUSH,jit->cfg.stack_pos,UNUSED,M_PTR); + regs_emit_mov(ctx,jit->cfg.stack_pos,jit->cfg.stack_reg,M_PTR); + if( stack_offset ) + regs_emit(ctx,UNUSED,STACK_OFFS,UNUSED,UNUSED,M_PTR,-stack_offset); + for(int i=0;ipersists_uses[0];i++) + EMIT(PUSH,ctx->jit->cfg.regs.persist[i],UNUSED,M_PTR); + for(int i=0;ipersists_uses[1];i++) + EMIT(PUSH,ctx->jit->cfg.floats.persist[i],UNUSED,M_F64); + if( IS_WINCALL64 && ctx->has_direct_call ) + regs_emit(ctx,UNUSED,STACK_OFFS,UNUSED,UNUSED,M_PTR,-0x20); + } + break; + case JCOND: + case JUMP: + case JUMP_TABLE: + flush_phis(ctx,cur_block, e.op == JCOND, false); + if( e.op == JUMP_TABLE ) { + // copy args (remap later) + hl_emit_store_args(jit->emit,&e,hl_emit_get_args(jit->emit,&e),e.nargs); + } + regs_write_instr(ctx, &e, out); + int_arr_add(ctx->jump_regs, ctx->emit_pos - 1); + int_arr_add(ctx->jump_regs, cur_op + 1 + (e.op == JUMP_TABLE ? 
0 : e.size_offs)); + if( e.op == JCOND ) flush_phis(ctx,cur_block, false, true); + break; + case RET: + if( e.a ) { + ereg ret = REG_CFG(REG_MODE(e.mode))->ret; + if( e.a != ret ) + regs_emit_mov(ctx, ret, e.a, e.mode); + } +# ifdef WIN64_UNWIND_TABLES + // if we have our stack offset just after a call, the unwind algorithm + // will subtract and create invalid stack frame. this is because we do + // not register the stack offset in our unwind table so all functions + // can share the same definition + if( cur_op && IS_CALL(jit->instrs[cur_op-1].op) ) + EMIT(NOP,UNUSED,UNUSED,M_NONE); +# endif + if( IS_WINCALL64 && ctx->has_direct_call ) + regs_emit(ctx,UNUSED,STACK_OFFS,UNUSED,UNUSED,M_PTR,0x20); + for(int i=ctx->persists_uses[1]-1;i>=0;i--) + EMIT(POP,ctx->jit->cfg.floats.persist[i],UNUSED,M_F64); + for(int i=ctx->persists_uses[0]-1;i>=0;i--) + EMIT(POP,ctx->jit->cfg.regs.persist[i],UNUSED,M_PTR); + if( stack_offset ) { + regs_emit(ctx,UNUSED,STACK_OFFS,UNUSED,UNUSED,M_PTR,stack_offset); + } + EMIT(POP,jit->cfg.stack_pos,UNUSED,M_PTR); + EMIT(RET,UNUSED,UNUSED,M_NONE); + break; + case MOV: + if( out == e.a ) break; + // fallthrough + default: + if( e.op == ADDRESS ) { + e.op = LEA; + if( REG_KIND(e.a) != R_REG_PTR ) jit_assert(); + e.a = (e.a & ~R_REG_PTR) | R_REG; + } + if( ret_val && out ) { + regs_write_instr(ctx, &e, *ret_val); + regs_emit_mov(ctx, out, *ret_val, e.mode); + } else + regs_write_instr(ctx, &e, out); + break; + } + if( instr_stack_offset ) + regs_emit(ctx,UNUSED,STACK_OFFS,UNUSED,UNUSED,M_PTR,instr_stack_offset); + if( cur_block && cur_block->end_pos == cur_op+1 ) + flush_phis(ctx,cur_block,false,true); + ctx->pos_map[cur_op+1] = ctx->emit_pos; + } +} + +void hl_regs_flush( jit_ctx *jit ) { + regs_ctx *ctx = jit->regs; + if( ctx->flushed ) return; + ctx->flushed = true; + jit->reg_instr_count = ctx->emit_pos; + jit->reg_instrs = ctx->instrs; + jit->reg_writes = ctx->out_write; + jit->reg_pos_map = ctx->pos_map; + if( ctx->pos_map ) 
ctx->pos_map[ctx->cur_op+1] = ctx->emit_pos; + hl_emit_remap_jumps(jit->emit, &ctx->jump_regs, ctx->instrs, ctx->pos_map); +} + +void hl_regs_function( jit_ctx *jit ) { + regs_ctx *ctx = jit->regs; + int nvalues = jit->value_count + jit->phi_count; + memset(ctx->persists_uses,0,sizeof(ctx->persists_uses)); + free(ctx->pos_map); + ctx->flushed = false; + ctx->has_direct_call = false; + ctx->pos_map = (int*)malloc((jit->instr_count + 1) * sizeof(int)); + ctx->emit_pos = 0; + ctx->cur_op = 0; + ctx->stack_size = 0; + jit->reg_instrs = NULL; + values_free(&ctx->scratch); + int_arr_free(&ctx->jump_regs); + int_arr_free(&ctx->pack_movs); + ctx->blocks_phis = (int_arr*)hl_zalloc(&jit->falloc,sizeof(int_arr) * jit->block_count); + ctx->values = (value_info*)hl_zalloc(&jit->falloc,sizeof(value_info) * nvalues); + for(int i=1;ireg = UNUSED; + v->pref_reg = UNUSED; + v->stack_pos = INVALID; + v->last_read = -1; + if( i < jit->value_count ) { + v->id = i; + v->mode = jit->instrs[jit->values_writes[i]].mode; + } else { + v->id = -(i-jit->value_count) - 1; + v->mode = M_NONE; + } + } + regs_compute_liveness(ctx); + regs_assign_regs(ctx); + regs_emit_instrs(ctx); + hl_regs_flush(ctx->jit); +} + + +void hl_regs_alloc( jit_ctx *jit ) { + regs_ctx *ctx = malloc(sizeof(regs_ctx)); + memset(ctx,0,sizeof(regs_ctx)); + ctx->jit = jit; + jit->regs = ctx; +} + +void hl_regs_free( jit_ctx *jit ) { + regs_ctx *ctx = jit->regs; + free(ctx->pos_map); + free(ctx->instrs); + free(ctx->out_write); + free(ctx); +} + diff --git a/src/jit_x86_64.c b/src/jit_x86_64.c new file mode 100644 index 000000000..a2b6185c3 --- /dev/null +++ b/src/jit_x86_64.c @@ -0,0 +1,1722 @@ +/* + * Copyright (C)2015-2016 Haxe Foundation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, 
distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ +#include +#include +#include "data_struct.h" + +#ifdef HL_DEBUG +# define GEN_DEBUG +#endif + +#define S_TYPE byte_arr +#define S_NAME(name) byte_##name +#define S_VALUE unsigned char +#include "data_struct.c" +#define byte_reserve(set,count) byte_reserve_impl(DEF_ALLOC,&set,count) +#define VAL_CONST 0x80000000 +#define VAL_MEM(reg) (FL_MEMPTR | (reg)) + +#define S_TYPE value_arr +#define S_NAME(name) value_arr_##name +#define S_VALUE uint64 +#include "data_struct.c" + +#define S_SORTED +#define S_MAP +#define S_TYPE value_map +#define S_NAME(name) value_map_##name +#define S_KEY uint64 +#define S_VALUE int +#define S_DEFVAL -1 +#include "data_struct.c" + +typedef enum { + RAX = 0, + RCX = 1, + RDX = 2, + RBX = 3, + RSP = 4, + RBP = 5, + RSI = 6, + RDI = 7, +#ifdef HL_64 + R8 = 8, + R9 = 9, + R10 = 10, + R11 = 11, + R12 = 12, + R13 = 13, + R14 = 14, + R15 = 15, +#endif + _UNUSED = 0xFF +} CpuReg; + +#define R(id) MK_REG(id,R_REG) +#define MMX(id) MK_REG((id)+64,R_REG) + +typedef enum { + _MOV, + _LEA, + _PUSH, + ADD, + SUB, + IMUL, // only overflow flag changes compared to MUL + DIV, + IDIV, + NEG, + CDQ, + CDQE, + _POP, + _RET, + _CALL, + AND, + OR, + XOR, + _CMP, + 
_TEST, + SHL, + SHR, + SAR, + INC, + DEC, + JMP, + MOVSXD, + // FPU + FSTP, + FSTP32, + FLD, + FLD32, + FLDCW, + // SSE + MOVSD, + MOVSS, + COMISD, + COMISS, + ADDSD, + SUBSD, + MULSD, + DIVSD, + ADDSS, + SUBSS, + MULSS, + DIVSS, + XORPS, + XORPD, + CVTSI2SD, + CVTSI2SS, + CVTTSD2SI, + CVTSD2SS, + CVTSS2SD, + CVTTSS2SI, + STMXCSR, + LDMXCSR, + STC, + CLC, + // 8-16 bits + ADD8, + SUB8, + MOV8, + MOVZX8, + MOVSX8, + CMP8, + TEST8, + PUSH8, + ADD16, + SUB16, + IMUL16, + MOV16, + MOVZX16, + MOVSX16, + CMP16, + TEST16, + // prefetchs + PREFETCHT0, + PREFETCHT1, + PREFETCHT2, + PREFETCHNTA, + PREFETCHW, + // -- + _CPU_LAST +} CpuOp; + +#define JAlways 0xE9 +#define JAlways_short 0xEB +#define JOverflow 0x80 +#define JULt 0x82 +#define JUGte 0x83 +#define JEq 0x84 +#define JNeq 0x85 +#define JULte 0x86 +#define JUGt 0x87 +#define JParity 0x8A +#define JNParity 0x8B +#define JSLt 0x8C +#define JSGte 0x8D +#define JSLte 0x8E +#define JSGt 0x8F + +#define JCarry JLt +#define JZero JEq +#define JNotZero JNeq + +#define FLAG_LONGOP 0x80000000 +#define FLAG_16B 0x40000000 +#define FLAG_8B 0x20000000 +#define FLAG_DUAL 0x10000000 +#define FLAG_DEF64 0x08000000 + +#define RM(op,id) ((op) | (((id)+1)<<8)) +#define GET_RM(op) (((op) >> ((op) < 0 ? 
24 : 8)) & 15) +#define SBYTE(op) ((op) << 16) +#define LONG_OP(op) ((op) | FLAG_LONGOP) +#define OP16(op) ((op) | FLAG_16B) +#define LONG_RM(op,id) LONG_OP(op | (((id) + 1) << 24)) + +typedef struct { + const char *name; // single operand + int r_mem; // r32 / r/m32 r32 + int mem_r; // r/m32 / r32 r/m32 + int r_const; // r32 / imm32 imm32 + int r_i8; // r32 / imm8 imm8 +} opform; + +static opform OP_FORMS[] = { + { "MOV", 0x8B, 0x89, 0xB8, 0 }, + { "LEA", 0x8D }, + { "PUSH", 0x50 | FLAG_DEF64, RM(0xFF,6), 0x68, 0x6A }, + { "ADD", 0x03, 0x01, RM(0x81,0), RM(0x83,0) }, + { "SUB", 0x2B, 0x29, RM(0x81,5), RM(0x83,5) }, + { "IMUL", LONG_OP(0x0FAF), 0, 0x69 | FLAG_DUAL, 0x6B | FLAG_DUAL }, + { "DIV", RM(0xF7,6), RM(0xF7,6) }, + { "IDIV", RM(0xF7,7), RM(0xF7,7) }, + { "NEG", RM(0xF7,3) }, + { "CDQ", 0x99 }, + { "CDQE", 0x98 }, + { "POP", 0x58 | FLAG_DEF64, RM(0x8F,0) }, + { "RET", 0xC3 }, + { "CALL", RM(0xFF,2) | FLAG_DEF64, RM(0xFF,2), 0xE8 }, + { "AND", 0x23, 0x21, RM(0x81,4), RM(0x83,4) }, + { "OR", 0x0B, 0x09, RM(0x81,1), RM(0x83,1) }, + { "XOR", 0x33, 0x31, RM(0x81,6), RM(0x83,6) }, + { "CMP", 0x3B, 0x39, RM(0x81,7), RM(0x83,7) }, + { "TEST", 0x85, 0x85/*SWP?*/, RM(0xF7,0) }, + { "SHL", RM(0xD3,4), 0, 0, RM(0xC1,4) }, + { "SHR", RM(0xD3,5), 0, 0, RM(0xC1,5) }, + { "SAR", RM(0xD3,7), 0, 0, RM(0xC1,7) }, + { "INC", IS_64 ? RM(0xFF,0) : 0x40, RM(0xFF,0) }, + { "DEC", IS_64 ? 
RM(0xFF,1) : 0x48, RM(0xFF,1) }, + { "JMP", RM(0xFF,4) }, + { "MOVSXD", 0x63 }, + // FPU + { "FSTP", 0, RM(0xDD,3) }, + { "FSTP32", 0, RM(0xD9,3) }, + { "FLD", 0, RM(0xDD,0) }, + { "FLD32", 0, RM(0xD9,0) }, + { "FLDCW", 0, RM(0xD9, 5) }, + // SSE + { "MOVSD", 0xF20F10, 0xF20F11 }, + { "MOVSS", 0xF30F10, 0xF30F11 }, + { "COMISD", LONG_RM(0x660F2F,1) }, + { "COMISS", LONG_RM(0x0F2F,1) }, + { "ADDSD", 0xF20F58 }, + { "SUBSD", 0xF20F5C }, + { "MULSD", 0xF20F59 }, + { "DIVSD", 0xF20F5E }, + { "ADDSS", 0xF30F58 }, + { "SUBSS", 0xF30F5C }, + { "MULSS", 0xF30F59 }, + { "DIVSS", 0xF30F5E }, + { "XORPS", LONG_OP(0x0F57) }, + { "XORPD", 0x660F57 }, + { "CVTSI2SD", 0xF20F2A }, + { "CVTSI2SS", 0xF30F2A }, + { "CVTTSD2SI", 0xF20F2C }, + { "CVTSD2SS", 0xF20F5A }, + { "CVTSS2SD", 0xF30F5A }, + { "CVTTSS2SI", 0xF30F2C }, + { "STMXCSR", 0, LONG_RM(0x0FAE,3) }, + { "LDMXCSR", 0, LONG_RM(0x0FAE,2) }, + { "STC", 0xF9 }, + { "CLC", 0xF8 }, + // 8 bits, + { "ADD8", 0, RM(0x00,3) }, + { "SUB8", 0, 0x28 }, + { "MOV8", 0x8A, 0x88, 0, RM(0xC6,0) }, + { "MOVZX8", LONG_OP(0x0FB6) }, + { "MOVSX8", LONG_OP(0x0FBE) }, + { "CMP8", 0x3A, 0x38, 0, RM(0x80,7) }, + { "TEST8", 0x84, 0x84, RM(0xF6,0) }, + { "PUSH8", FLAG_DEF64, 0, 0x6A | FLAG_8B }, + { "ADD16", 0, OP16(0x01) }, + { "SUB16", 0, OP16(0x29) }, + { "IMUL16", OP16(LONG_OP(0x0FAF)) }, + { "MOV16", OP16(0x8B), OP16(0x89), OP16(0xB8) }, + { "MOVZX16", LONG_OP(0x0FB7) }, + { "MOVSX16", LONG_OP(0x0FBF) }, + { "CMP16", OP16(0x3B), OP16(0x39) }, + { "TEST16", OP16(0x85) }, + // prefetchs + { "PREFETCHT0", FLAG_DEF64, LONG_RM(0x0F18,1) }, + { "PREFETCHT1", FLAG_DEF64, LONG_RM(0x0F18,2) }, + { "PREFETCHT2", FLAG_DEF64, LONG_RM(0x0F18,3) }, + { "PREFETCHNTA", FLAG_DEF64, LONG_RM(0x0F18,0) }, + { "PREFETCHW", FLAG_DEF64, LONG_RM(0x0F0D,1) }, +}; + +#ifdef HL_64 +# define REX() if( r64 ) B(r64 | 0x40) +#else +# define REX() +#endif + +static const int SIB_MULT[] = {-1, 0, 1, -1, 2, -1, -1, -1, 3}; + +#define B(v) ctx->code.values[ctx->code.cur++] = 
(unsigned char)(v) +#define W(wv) *(int*)&ctx->code.values[_incr(&ctx->code.cur,4)] = wv +#define W64(v64) *(int_val*)&ctx->code.values[_incr(&ctx->code.cur,8)] = v64 + +#define MOD_RM(mod,reg,rm) B(((mod) << 6) | (((reg)&7) << 3) | ((rm)&7)) +#define SIB(mult,rmult,rbase) B((SIB_MULT[mult]<<6) | (((rmult)&7)<<3) | ((rbase)&7)) +#define IS_SBYTE(c) ( (c) >= -128 && (c) < 128 ) + +#define BREAK() B(0xCC) + +#define OP(b) \ + if( (b) & 0xFF0000 ) { \ + B((b)>>16); \ + if( r64 ) B(r64 | 0x40); /* also in 32 bits mode */ \ + B((b)>>8); \ + B(b); \ + } else { \ + if( (b) & FLAG_16B ) { \ + B(0x66); \ + REX(); \ + } else {\ + REX(); \ + }\ + if( (b) & FLAG_LONGOP ) B((b)>>8); \ + B(b); \ + } + +struct _code_ctx { + jit_ctx *jit; + byte_arr code; + int_arr funs; + int_arr short_jumps; + int_arr near_jumps; + value_map const_table_lookup; + byte_arr const_table; + int_arr const_refs; + int_arr const_addr; + int *pos_map; + int cur_op; + bool flushed; + int const_table_pos; + int null_access_pos; + int null_field_pos; +}; + +static int _incr( int*v, int n ) { + int k = *v; + *v += n; + return k; +} + +const char *hl_natreg_str( int reg, emit_mode m ) { + static char out[16]; + static const char *regs_str[] = { "AX", "CX", "DX", "BX", "SP", "BP", "SI", "DI" }; + static const char *regs_str8[] = { "AL", "CL", "DL", "BL", "SPL", "BPL", "SIL", "DIL" }; + CpuReg r = REG_REG(reg); + switch( m ) { + case M_I32: + if( r < 8 ) + sprintf(out,"E%s",regs_str[r]); + else + sprintf(out,"R%dD%s",r,r<16?"":"???"); + break; + case M_UI16: + if( r < 8 ) + sprintf(out,"%s",regs_str[r]); + else + sprintf(out,"R%dW%s",r,r<16?"":"???"); + break; + case M_UI8: + if( r < 8 ) + sprintf(out,"%s",regs_str8[r]); + else + sprintf(out,"R%dB%s",r,r<16?"":"???"); + break; + case M_F32: + r -= 64; + sprintf(out,"XMM%df%s",r,r >= 0 && r < 16 ? "" : "???"); + break; + case M_F64: + r -= 64; + sprintf(out,"XMM%d%s",r,r >= 0 && r < 16 ? 
"" : "???"); + break; + default: + if( r < 8 ) + sprintf(out,"R%s",regs_str[r]); + else + sprintf(out,"R%d%s",r,r<16?"":"???"); + break; + } + return out; +} + +static int scratch_float_reg = -1; + +static ereg scratch_not_param[] = { R(RAX), R(R10), R(R11) }; + +void hl_jit_init_regs( regs_config *cfg ) { + // exclude R11 at it's use as temporary for various ops +# ifdef HL_WIN_CALL + static int scratch_regs[] = { R(RAX), R(RCX), R(RDX), R(R8), R(R9), R(R10), /*R(R11)*/ }; + static int free_regs[] = { R(RSI), R(RDI), R(RBX), R(R12), R(R13), R(R14), R(R15) }; + static int call_regs[] = { R(RCX), R(RDX), R(R8), R(R9) }; +# else + static int scratch_regs[] = { R(RAX), R(RCX), R(RDX), R(RSI), R(RDI), R(R8), R(R9), R(R10), /*R(R11)*/ }; + static int free_regs[] = { R(RBX), R(R12), R(R13), R(R14), R(R15) }; + static int call_regs[] = { R(RDI), R(RSI), R(RDX), R(RCX), R(R8), R(R9) }; +# endif + cfg->regs.ret = scratch_regs[0]; + cfg->regs.nscratchs = sizeof(scratch_regs) / sizeof(int); + cfg->regs.npersists = sizeof(free_regs) / sizeof(int); + cfg->regs.nargs = sizeof(call_regs) / sizeof(int); + cfg->regs.scratch = (ereg*)scratch_regs; + cfg->regs.persist = (ereg*)free_regs; + cfg->regs.arg = (ereg*)call_regs; + // floats + static int floats[] = { + MMX(0), MMX(1), MMX(2), MMX(3), + MMX(4), MMX(5), MMX(6), MMX(7), + MMX(8), MMX(9), MMX(10), MMX(11), + MMX(12), MMX(13), MMX(14), MMX(15) + }; +# ifdef HL_WIN_CALL + cfg->floats.nargs = 4; + cfg->floats.nscratchs = 6; +# else + cfg->floats.nargs = 8; + cfg->floats.nscratchs = 16; +# endif + scratch_float_reg = cfg->floats.nscratchs - 1; + cfg->floats.nscratchs--; + cfg->floats.ret = floats[0]; + cfg->floats.scratch = (ereg*)floats; + cfg->floats.arg = (ereg*)floats; + cfg->floats.persist = (ereg*)floats + cfg->floats.nscratchs + 1; + cfg->floats.npersists = 15 - cfg->floats.nscratchs; + // extra + cfg->req_bit_shifts = R(RCX); + cfg->req_div_a = R(RAX); + cfg->req_div_b = R(RCX); + cfg->stack_reg = R(RSP); + cfg->stack_pos = 
R(RBP); + cfg->stack_align = 16; +# ifdef GEN_DEBUG + cfg->debug_prefix_size = 6; +# endif +} + +#define EMIT(op,a,b,mode) emit_ext(ctx,op,a,b,mode,0) +#define ID2(a,b) ((a) | ((b)<<8)) + +typedef enum { + RCPU = 0, + RFPU = 1, + RSTACK = 2, + RCONST = 3, + RMEM = 4, + RUNUSED = 5, +} preg_kind; + +typedef struct { + preg_kind kind; + CpuReg reg; + int64 value; +} preg; + +#define ERRIF(v) if( v ) jit_assert() + +static preg make_reg( ereg r, uint64 value ) { + preg p; + if( IS_NULL(r) ) { + p.kind = RUNUSED; + return p; + } + if( r == VAL_CONST ) { + p.kind = RCONST; + p.value = value; + return p; + } + p.reg = REG_REG(r); + p.value = REG_VALUE(r); + switch( REG_KIND(r) ) { + case R_REG: + if( p.reg >= 64 ) { + p.kind = RFPU; + p.reg -= 64; + } else + p.kind = RCPU; + break; + case R_REG_PTR: + if( p.reg == RBP ) + p.kind = RSTACK; + else + p.kind = RMEM; + break; + case R_CONST: + p.kind = RCONST; + break; + default: + jit_assert(); + break; + } + if( p.reg < 0 || p.reg > 15 ) jit_assert(); + return p; +} + +static void emit_ext( code_ctx *ctx, CpuOp op, ereg _a, ereg _b, emit_mode mode, int_val _value ) { + opform *f = &OP_FORMS[op]; + int mode64 = mode == M_PTR && (f->r_mem&FLAG_DEF64) == 0 ? 
8 : 0; + int r64 = mode64; + preg a = make_reg(_a,_value), b = make_reg(_b,_value); + switch( ID2(a.kind,b.kind) ) { + case ID2(RUNUSED,RUNUSED): + ERRIF(f->r_mem == 0); + OP(f->r_mem); + break; + case ID2(RCPU,RCPU): + case ID2(RFPU,RFPU): + if( f->mem_r ) { + // canonical form + if( a.reg & 8 ) r64 |= 1; + if( b.reg & 8 ) r64 |= 4; + OP(f->mem_r); + MOD_RM(3,b.reg,a.reg); + } else { + ERRIF( f->r_mem == 0 ); + if( a.reg & 8 ) r64 |= 4; + if( b.reg & 8 ) r64 |= 1; + OP(f->r_mem); + MOD_RM(3,a.reg,b.reg); + } + break; + case ID2(RCPU,RFPU): + case ID2(RFPU,RCPU): + ERRIF( (f->r_mem>>16) == 0 ); + if( a.reg & 8 ) r64 |= 4; + if( b.reg & 8 ) r64 |= 1; + OP(f->r_mem); + MOD_RM(3,a.reg,b.reg); + break; + case ID2(RCPU,RUNUSED): + ERRIF( f->r_mem == 0 ); + if( a.reg & 8 ) r64 |= 1; + if( GET_RM(f->r_mem) > 0 ) { + OP(f->r_mem); + MOD_RM(3, GET_RM(f->r_mem)-1, a.reg); + } else + OP(f->r_mem + (a.reg&7)); + break; + case ID2(RSTACK,RUNUSED): + ERRIF( f->mem_r == 0 || GET_RM(f->mem_r) == 0 ); + OP(f->mem_r); + if( IS_SBYTE(a.value) ) { + MOD_RM(1,GET_RM(f->mem_r)-1,RBP); + B(a.value); + } else { + MOD_RM(2,GET_RM(f->mem_r)-1,RBP); + W((int)a.value); + } + break; + case ID2(RCPU,RCONST): + ERRIF( f->r_const == 0 && f->r_i8 == 0 ); + if( a.reg & 8 ) r64 |= 1; + if( f->r_i8 && IS_SBYTE(b.value) ) { + if( (f->r_i8&FLAG_DUAL) && (a.reg & 8) ) r64 |= 4; + OP(f->r_i8); + if( (f->r_i8&FLAG_DUAL) ) MOD_RM(3,a.reg,a.reg); else MOD_RM(3,GET_RM(f->r_i8)-1,a.reg); + B(b.value); + } else if( GET_RM(f->r_const) > 0 || (f->r_const&FLAG_DUAL) ) { + if( (f->r_i8&FLAG_DUAL) && (a.reg & 8) ) r64 |= 4; + OP(f->r_const&0xFF); + if( (f->r_i8&FLAG_DUAL) ) MOD_RM(3,a.reg,a.reg); else MOD_RM(3,GET_RM(f->r_const)-1,a.reg); + if( mode64 && IS_64 && op == _MOV ) W64(b.value); else W((int)b.value); + } else { + ERRIF( f->r_const == 0); + OP((f->r_const&0xFF) + (a.reg&7)); + if( mode64 && IS_64 && op == _MOV ) W64(b.value); else W((int)b.value); + } + break; + case ID2(RSTACK,RCPU): + case 
ID2(RSTACK,RFPU): + ERRIF( f->mem_r == 0 ); + if( b.reg & 8 ) r64 |= 4; + OP(f->mem_r); + if( IS_SBYTE(a.value) ) { + MOD_RM(1,b.reg,RBP); + B(a.value); + } else { + MOD_RM(2,b.reg,RBP); + W((int)a.value); + } + break; + case ID2(RCPU,RSTACK): + case ID2(RFPU,RSTACK): + ERRIF( f->r_mem == 0 ); + if( a.reg & 8 ) r64 |= 4; + OP(f->r_mem); + if( IS_SBYTE(b.value) ) { + MOD_RM(1,a.reg,RBP); + B(b.value); + } else { + MOD_RM(2,a.reg,RBP); + W((int)b.value); + } + break; + case ID2(RCONST,RUNUSED): + ERRIF( f->r_const == 0 ); + OP(f->r_const); + if( f->r_const & FLAG_8B ) B(a.value); else W((int)a.value); + break; + case ID2(RMEM,RUNUSED): + ERRIF( f->mem_r == 0 ); + if( a.reg & 8 ) r64 |= 1; + OP(f->mem_r); + if( a.value == 0 && (a.reg&7) != RBP ) { + MOD_RM(0,GET_RM(f->mem_r)-1,a.reg); + if( (a.reg&7) == RSP ) B(0x24); + } else if( IS_SBYTE(a.value) ) { + MOD_RM(1,GET_RM(f->mem_r)-1,a.reg); + if( (a.reg&7) == RSP ) B(0x24); + B(a.value); + } else { + MOD_RM(2,GET_RM(f->mem_r)-1,a.reg); + if( (a.reg&7) == RSP ) B(0x24); + W((int)a.value); + } + break; + case ID2(RCPU, RMEM): + case ID2(RFPU, RMEM): + ERRIF( f->r_mem == 0 ); + if( a.reg & 8 ) r64 |= 4; + if( b.reg & 8 ) r64 |= 1; + OP(f->r_mem); + if( b.value == 0 && (b.reg&7) != RBP ) { + MOD_RM(0,a.reg,b.reg); + if( (b.reg&7) == RSP ) B(0x24); + } else if( IS_SBYTE(b.value) ) { + MOD_RM(1,a.reg,b.reg); + if( (b.reg&7) == RSP ) B(0x24); + B(b.value); + } else { + MOD_RM(2,a.reg,b.reg); + if( (b.reg&7) == RSP ) B(0x24); + W((int)b.value); + } + break; + case ID2(RMEM, RCPU): + case ID2(RMEM, RFPU): + ERRIF( f->mem_r == 0 ); + if( a.reg & 8 ) r64 |= 1; + if( b.reg & 8 ) r64 |= 4; + OP(f->mem_r); + if( a.value == 0 && (a.reg&7) != RBP ) { + MOD_RM(0,b.reg,a.reg); + if( (a.reg&7) == RSP ) B(0x24); + } else if( IS_SBYTE(a.value) ) { + MOD_RM(1,b.reg,a.reg); + if( (a.reg&7) == RSP ) B(0x24); + B(a.value); + } else { + MOD_RM(2,b.reg,a.reg); + if( (a.reg&7) == RSP ) B(0x24); + W((int)a.value); + } + break; + default: + 
ERRIF(1); + } +} + +static void emit_jump( code_ctx *ctx, int mode, int offset ) { + int op_mult = 16; +# ifdef GEN_DEBUG + op_mult += 6; // additional debug info per op +# endif + if( IS_SBYTE(offset*op_mult) ) { + // assume it's ok to use short jump + B(mode == JAlways ? JAlways_short : mode - 0x10); + int_arr_add(ctx->short_jumps, byte_count(ctx->code)); + int_arr_add(ctx->short_jumps, ctx->cur_op + offset + 1); + B(-2); + } else { + if( mode != JAlways ) B(0x0F); + B(mode); + int_arr_add(ctx->near_jumps, byte_count(ctx->code)); + int_arr_add(ctx->near_jumps, ctx->cur_op + offset + 1); + W(-5); + } +} + +#define RTMP R(R11) +static ereg get_tmp( emit_mode mode ) { + if( IS_FLOAT(mode) ) + return MMX(scratch_float_reg); + return RTMP; +} + +static void emit_mov( code_ctx *ctx, ereg out, ereg val, emit_mode mode ) { + if( out == val ) + return; + if( !IS_REG(out) && (!IS_REG(val) || REG_VALUE(val) != 0) ) { + ereg tmp = get_tmp(mode); + emit_mov(ctx, tmp, val, mode); + emit_mov(ctx, out, tmp, mode); + } else if( IS_REG(val) && REG_VALUE(val) != 0 ) { + emit_ext(ctx,_LEA,out,REG_PTR(val),M_PTR,0); + } else { + static CpuOp MOV_OP[] = {_MOV,MOV8,MOV16,_MOV,_MOV,MOVSD,MOVSS,_MOV,_MOV}; + CpuOp op = MOV_OP[mode]; + if( (mode == M_UI8 || mode == M_UI16) && IS_REG(out) ) { + op++; // MOVZX + mode = M_PTR; + } + emit_ext(ctx,op,out,val,mode,0); + } +} + +static int jump_near( code_ctx *ctx, int mode ) { + int pos = byte_count(ctx->code); + if( mode < 0 ) { + // backwards + int target = -mode; + B(JAlways_short); + B(target - (pos + 2)); + } else { + B(mode == JAlways ? 
JAlways_short : mode - 0x10); + B(0); + } + return pos; +} + +static void patch_jump_near( code_ctx *ctx, int jpos ) { + if( !jpos ) return; + ctx->code.values[jpos + 1] = (unsigned char)(byte_count(ctx->code) - (jpos + 2)); +} + +static void emit_div_mod( code_ctx *ctx, hl_op op, ereg out, ereg a, ereg b, emit_mode mode ) { + if( IS_FLOAT(mode) ) { + BREAK(); + return; + } + ereg bas = R(RAX), div = R(RDX); + if( out != bas ) EMIT(_PUSH,bas,UNUSED,M_PTR); + if( out != div ) EMIT(_PUSH,div,UNUSED,M_PTR); + if( b == bas || b == div || !IS_REG(b) ) { + EMIT(_MOV,RTMP,b,mode); + b = RTMP; + } + if( a != bas ) EMIT(_MOV,bas,a,mode); + + // check for div = 0 + EMIT(_TEST,b,b,mode); + int jz = jump_near(ctx,JZero); + int jz1 = 0; + // Prevent MIN/-1 overflow exception + // OSMod: r = (b == 0 || b == -1) ? 0 : a % b + // OSDiv: r = (b == 0 || b == -1) ? a * b : a / b + if( op == OSMod || op == OSDiv ) { + EMIT(_CMP,b,MK_CONST(-1),mode); + jz1 = jump_near(ctx,JZero); + } + bool unsign = op == OUDiv || op == OUMod; + if( unsign ) + EMIT(XOR,div,div,mode); + else + EMIT(CDQ, UNUSED, UNUSED, mode); + EMIT(unsign ? DIV : IDIV, b, UNUSED, mode); + ereg res = (op == OUDiv || op == OSDiv) ? bas : div; + int jn = jump_near(ctx,JAlways); + patch_jump_near(ctx,jz); + patch_jump_near(ctx,jz1); + if( op != OSDiv ) { + EMIT(XOR, res, res, mode); + } else { + if( res != bas ) EMIT(_MOV,res,bas,mode); + EMIT(IMUL,res,b,mode); + } + patch_jump_near(ctx,jn); + if( out != res ) EMIT(_MOV,out,res,mode); + if( out != div ) EMIT(_POP,div,UNUSED,M_PTR); + if( out != bas ) EMIT(_POP,bas,UNUSED,M_PTR); +} + +static void emit_anyop( code_ctx *ctx, hl_op op, ereg out, ereg a, ereg b, emit_mode mode ) { + CpuOp cop; + int mask = 0; +# define F_OP(iop,f32,f64) cop = mode == M_F32 ? f32 : (mode == M_F64 ? 
f64 : iop); +# define DECL_OP(i8,i16,iop,f32,f64) static CpuOp ops_##iop[] = {-1,i8,i16,iop,iop,f64,f32,-1,-1}; cop = ops_##iop[mode] + switch( op ) { + case OAdd: + DECL_OP(ADD8,ADD16,ADD,ADDSS,ADDSD); + break; + case OSub: + DECL_OP(SUB8,SUB16,SUB,SUBSS,SUBSD); + break; + case OMul: + DECL_OP(IMUL16/*NO IMUL8*/,IMUL16,IMUL,MULSS,MULSD); + if( mode == M_UI8 ) mask = 0xFF; + break; + case OIncr: + cop = INC; + break; + case ODecr: + cop = DEC; + break; + case OAnd: + cop = AND; + break; + case OOr: + cop = OR; + break; + case OXor: + cop = XOR; + break; + case OShl: + case OSShr: + case OUShr: + { + ereg f = R(RCX); + if( b != f ) { + if( a == f || out == f ) { + EMIT(_MOV,RTMP,a,mode); + a = RTMP; + } + if( out == f ) { + EMIT(_MOV,f,b,mode); + emit_anyop(ctx, op, RTMP, RTMP, f, mode); + EMIT(_MOV,f,RTMP,mode); + } else { + EMIT(_PUSH,f,UNUSED,M_PTR); + EMIT(_MOV,f,b,mode); + emit_anyop(ctx, op, out, a, f, mode); + EMIT(_POP,f,UNUSED,M_PTR); + } + return; + } + } + if( out == b ) { + ereg r = get_tmp(mode); + emit_anyop(ctx,op,r,a,b,mode); + emit_mov(ctx,out,r,mode); + return; + } + b = UNUSED; + cop = (op == OShl ? SHL : (op == OSShr ? SAR : SHR)); + break; + case OSDiv: + F_OP(0,DIVSS,DIVSD); + if( IS_FLOAT(mode) ) break; + case OSMod: + case OUMod: + case OUDiv: + emit_div_mod(ctx,op,out,a,b,mode); + return; + case ONot: + if( IS_REG(a) ) { + EMIT(XOR,a,MK_CONST(1),M_I32); + } else { + BREAK(); + } + return; + case ONeg: + if( IS_FLOAT(mode) ) { + if( out != a && IS_REG(out) ) { + EMIT(mode == M_F32 ? XORPS : XORPD, out, out, mode); + EMIT(mode == M_F32 ? SUBSS : SUBSD, out, a, mode); + } else { + ereg tmp = get_tmp(mode); + EMIT(mode == M_F32 ? XORPS : XORPD, tmp, tmp, mode); + EMIT(mode == M_F32 ? SUBSS : SUBSD, tmp, a, mode); + EMIT(mode == M_F32 ? 
MOVSS : MOVSD, out, tmp, mode); + } + return; + } + cop = NEG; + break; + default: + jit_assert(); + break; + } + + if( out == a && IS_REG(a) ) { + EMIT(cop,out,b,mode); + } else if( !IS_REG(out) || out == b ) { + ereg tmp = get_tmp(mode); + emit_mov(ctx, tmp, a, mode); + EMIT(cop,tmp,b,mode); + if( mask ) { + EMIT(AND,tmp,MK_CONST(mask),M_I32); + mask = 0; + } + emit_mov(ctx, out, tmp, mode); + } else { + emit_mov(ctx, out, a, mode); + EMIT(cop,out,b,mode); + } + if( mask ) EMIT(AND,out,MK_CONST(mask),M_I32); +} + +void hl_codegen_flush( jit_ctx *jit ) { + code_ctx *ctx = jit->code; + if( ctx->flushed ) return; + ctx->flushed = true; + jit->code_size = ctx->code.cur; + jit->code_instrs = ctx->code.values; + jit->code_pos_map = ctx->pos_map; + if( ctx->pos_map ) ctx->pos_map[ctx->cur_op+1] = ctx->code.cur; +} + +static void emit_nop( code_ctx *ctx, int size ) { + byte_reserve(ctx->code,size); + ctx->code.cur -= size; + if( size >= 8 ) { + W(0x841F0F); + W(0); + return; + } + if( size >= 5 ) { + W(0x441F0F); + B(0); + return; + } + if( size >= 4 ) { + W(0x401F0F); + return; + } + if( size >= 3 ) { + B(0x0F); + B(0x1F); + B(0x00); + return; + } + if( size >= 2 ) { + B(0x66); + B(0x90); + return; + } + B(0x90); +} + +#define CALC_REX(w,a,b) (((w)&8) ? 4 : 0) | (((b)&8) ? 2 : 0) | (((a) & 8) ? 1 : 0) + +#define REX64(out,a,b) B(0x48 | CALC_REX(out,a,b)) +#define REX32(out,a,b) { int v = CALC_REX(out,a,b); if( v ) B(v|0x40); } + +static void emit_lea( code_ctx *ctx, ereg out, einstr *_e ) { + einstr e = *_e; + + int mult = e.size_offs & 0xFF; + int offs = e.size_offs >> 8; + if( mult != 0 && (mult < 0 || mult > 8 || (mult & (mult - 1)) != 0) ) jit_assert(); + + if( IS_REG(e.a) ) + offs += REG_VALUE(e.a); + + if( !IS_REG(e.a) ) { + // a is always a mem address ! 
+ emit_mov(ctx, RTMP, e.a, M_PTR); + e.a = RTMP; + if( e.b && !IS_REG(e.b) ) { + if( !IS_REG(out) ) jit_assert(); + emit_mov(ctx, out, e.b, M_I32); + e.b = out; + } + } else if( e.b && !IS_REG(e.b) ) { + // b is always an int index ! + emit_mov(ctx, RTMP, e.b, M_I32); + e.b = RTMP; + } + + if( mult == 0 ) { + if( REG_KIND(e.a) != R_REG ) jit_assert(); + // no index + emit_ext(ctx,_LEA,out,MK_ADDR(e.a,offs),M_PTR,0); + return; + } + + bool use_offs = offs != 0 || (e.a&7) == RBP; + REX64(out,e.a,e.b); + B(0x8D); + MOD_RM(use_offs ? 1 : 0,out,4); + SIB(mult,e.b,e.a); + if( use_offs ) { + if( !IS_SBYTE(offs) ) jit_assert(); + B(offs); + } +} + +static void align_function( code_ctx *ctx ) { + while( byte_count(ctx->code) & 15 ) + emit_nop(ctx,16 - (byte_count(ctx->code) & 15)); +} + +static int reserve_const_segment( code_ctx *ctx, int size, int align ) { + int pos = byte_count(ctx->const_table); + if( align ) { + int k = pos & (align-1); + if( k ) { + byte_reserve_impl(&ctx->jit->galloc,&ctx->const_table,align - k); + pos = byte_count(ctx->const_table); + } + } + byte_reserve_impl(&ctx->jit->galloc,&ctx->const_table,size); + return pos; +} + +static void alloc_const( code_ctx *ctx, uint64 value ) { + int pos = value_map_find(ctx->const_table_lookup, value); + if( pos < 0 ) { + pos = reserve_const_segment(ctx,8,8); + *(uint64*)byte_addr(ctx->const_table,pos) = value; + value_map_add_impl(&ctx->jit->galloc,&ctx->const_table_lookup,value,pos); + } + int_arr_add_impl(&ctx->jit->galloc,&ctx->const_refs,ctx->jit->out_pos + byte_count(ctx->code) - 4); + int_arr_add_impl(&ctx->jit->galloc,&ctx->const_refs,pos); +} + +static int emit_lea_rel( code_ctx *ctx, ereg out ) { + B(0x48 + ((out & 8) ? 
4 : 0)); + B(0x8D); + MOD_RM(0,out&7,5); + int pos = ctx->jit->out_pos + byte_count(ctx->code); + W(0); + return pos; +} + +static int get_cond_jump( code_ctx *ctx ) { + int prev = 0; + einstr *p; + do { + p = ctx->jit->reg_instrs + ctx->cur_op - (++prev); + } while( p->op == MOV || p->op == JCOND || p->op == CMOV || p->op == XCHG || p->op == CXCHG ); + int op; + switch( p->size_offs ) { + case OJFalse: + case OJNull: + op = JZero; + break; + case OJTrue: + case OJNotNull: + op = JNotZero; + break; + case OJSGte: + op = IS_FLOAT(p->mode) ? JUGte : JSGte; + break; + case OJSGt: + op = IS_FLOAT(p->mode) ? JUGt : JSGt; + break; + case OJUGte: + op = JUGte; + break; + case OJSLt: + op = IS_FLOAT(p->mode) ? JULt : JSLt; + break; + case OJSLte: + op = IS_FLOAT(p->mode) ? JULte : JSLte; + break; + case OJULt: + op = JULt; + break; + case OJEq: + op = JEq; + break; + case OJNotEq: + op = JNeq; + break; + case OJNotLt: + op = JUGte; + break; + case OJNotGte: + op = JULt; + break; + case 0: + if( p->op == DEBUG_BREAK ) { + // found a debug break ! 
+			BREAK();
+			op = JZero;
+			break;
+		}
+		// fallback
+	default:
+		jit_assert();
+		break;
+	}
+	return op;
+}
+
+// Emits a conditional move of r into out.
+// cond is one of the J* jump codes; subtracting 0x40 turns it into the second
+// opcode byte written after the 0x0F escape. Float modes are rejected.
+// A REX.W prefix is emitted when the mode is 8 bytes wide, otherwise a REX
+// prefix is only emitted if an extended register is involved.
+static void emit_cmov( code_ctx *ctx, ereg out, ereg r, int cond, emit_mode m ) {
+	if( IS_FLOAT(m) ) jit_assert();
+	if( hl_emit_mode_sizes[m] == 8 )
+		REX64(out,r,UNUSED);
+	else
+		REX32(out,r,UNUSED);
+	B(0x0F);
+	B(cond - 0x40);
+	MOD_RM(3,out,r);
+}
+
+// Generates machine code for jit->reg_instrs[0..reg_instr_count).
+// Per-function buffers (code bytes, jump lists, pos_map) are reset first;
+// pos_map[i] records the code offset at which instruction i starts so jumps
+// can be patched once every instruction has been emitted.
+void hl_codegen_function( jit_ctx *jit ) {
+	code_ctx *ctx = jit->code;
+	ctx->flushed = false;
+	byte_free(&ctx->code);
+	int_arr_free(&ctx->near_jumps);
+	int_arr_free(&ctx->short_jumps);
+	free(ctx->pos_map);
+	// one extra slot: hl_codegen_flush stores the end offset at [cur_op+1]
+	ctx->pos_map = (int*)malloc((jit->reg_instr_count + 1) * sizeof(int));
+	ctx->pos_map[0] = 0;
+	// entries appended to const_addr from here on belong to this function
+	int const_addr_prev = int_arr_count(ctx->const_addr);
+	byte_reserve(ctx->code,64);
+	ctx->code.cur -= 64;
+#	ifdef GEN_DEBUG
+	int reg_index = 0;
+	int emit_index = 0;
+#	endif
+	for(int cur_pos=0;cur_pos<jit->reg_instr_count;cur_pos++) {
+		einstr *e = jit->reg_instrs + cur_pos;
+		ereg out = jit->reg_writes[cur_pos];
+		// make sure at least 64 bytes are available without moving the cursor
+		byte_reserve(ctx->code,64);
+		ctx->code.cur -= 64;
+		ctx->cur_op = cur_pos;
+		if( cur_pos > 0 ) ctx->pos_map[cur_pos] = ctx->code.cur;
+#		ifdef GEN_DEBUG
+		int rid = cur_pos | (jit->fun->findex << 16);
+		while( reg_index < jit->instr_count && jit->reg_pos_map[reg_index] <= cur_pos ) reg_index++;
+		int uid;
+		while( emit_index < jit->fun->nops && jit->emit_pos_map[emit_index] < reg_index ) {
+			uid = emit_index | (jit->fun->findex << 16);
+			__ignore(&uid);
+			__ignore(&rid);
+			emit_index++;
+			if( emit_index >= jit->fun->nops || jit->emit_pos_map[emit_index] >= reg_index )
+				emit_ext(ctx,_MOV,RTMP,VAL_CONST,M_I32,uid);
+		}
+#		endif
+		switch( e->op ) {
+		case LOAD_ARG:
+			continue; // nop
+		case MOV:
+			emit_mov(ctx, out, e->a, e->mode);
+			break;
+		case XCHG:
+			{
+				// swap through a temporary; at least one side must be a register
+				ereg tmp = get_tmp(e->mode);
+				if( !IS_REG(e->a) && !IS_REG(e->b) )
+					jit_assert();
+				emit_mov(ctx, tmp, e->a, M_PTR);
+				emit_mov(ctx, e->a, e->b, M_PTR);
+				emit_mov(ctx, e->b, tmp, M_PTR);
+			}
+			break;
+		case STORE:
+			if( 
!IS_REG(e->a) && !IS_REG(e->b) ) {
+				// both the address and the value live in memory
+				if( e->mode != M_PTR ) {
+					// no push/pop 32 bit
+					ereg tmp2 = R(RAX);
+					// floats are copied through an integer register of the same width
+					emit_mode mode = e->mode == M_F64 ? M_PTR : e->mode == M_F32 ? M_I32 : e->mode;
+					EMIT(_PUSH,tmp2,UNUSED,M_PTR);
+					emit_mov(ctx, RTMP, e->a, M_PTR);
+					emit_mov(ctx, tmp2, e->b, mode);
+					emit_mov(ctx, MK_ADDR(RTMP,e->size_offs), tmp2, mode);
+					EMIT(_POP,tmp2,UNUSED,M_PTR);
+				} else {
+					if( IS_FLOAT(e->mode) ) BREAK();
+					EMIT(_PUSH,e->b,UNUSED,e->mode);
+					emit_mov(ctx, RTMP, e->a, M_PTR);
+					emit_ext(ctx, _POP,REG_ADD_OFFSET(REG_PTR(RTMP),e->size_offs), UNUSED, e->mode, 0);
+				}
+			} else if( !IS_REG(e->a) ) {
+				emit_mov(ctx, RTMP, e->a, M_PTR);
+				emit_mov(ctx, MK_ADDR(RTMP,e->size_offs), e->b, e->mode);
+			} else
+				emit_mov(ctx, REG_ADD_OFFSET(REG_PTR(e->a),e->size_offs), e->b, e->mode);
+			break;
+		case PUSH:
+			if( IS_FLOAT(e->mode) ) {
+				if( !IS_REG(e->a) )
+					EMIT(_PUSH,e->a,UNUSED,M_PTR);
+				else {
+					// no PUSH for XMM registers: make room, then store to [RSP]
+					EMIT(SUB,R(RSP),MK_CONST(8),M_PTR);
+					EMIT(e->mode == M_F32 ? MOVSS : MOVSD,REG_PTR(R(RSP)),e->a,e->mode);
+				}
+			} else if( IS_REG(e->a) && REG_VALUE(e->a) != 0 ) {
+				emit_mov(ctx, RTMP, e->a, e->mode);
+				EMIT(_PUSH, RTMP, UNUSED, M_PTR);
+			} else
+				EMIT(_PUSH, e->a, UNUSED, M_PTR);
+			break;
+		case POP:
+			if( IS_FLOAT(e->mode) ) {
+				// mirror of the float PUSH above: LOAD the register from [RSP]
+				// (register first selects the load form), then release the slot
+				EMIT(e->mode == M_F32 ? MOVSS : MOVSD,e->a,REG_PTR(R(RSP)),e->mode);
+				EMIT(ADD,R(RSP),MK_CONST(8),M_PTR);
+			} else {
+				EMIT(_POP, e->a, UNUSED, M_PTR);
+			}
+			break;
+		case PUSH_CONST:
+			if( e->mode != M_PTR ) jit_assert();
+			{
+				// PUSH imm8 / imm32 sign-extend their immediate to 64 bits, so the
+				// short forms are only valid for values inside the signed range
+				int64 v = (int64)e->value;
+				if( v >= -128 && v < 128 )
+					emit_ext(ctx,PUSH8, VAL_CONST, UNUSED, M_PTR, e->value);
+				else if( v >= -0x80000000LL && v <= 0x7FFFFFFFLL )
+					emit_ext(ctx,_PUSH, VAL_CONST, UNUSED, M_I32, e->value); // will push 64bits
+				else {
+					// there is no PUSH imm64: go through the reserved temporary
+					emit_ext(ctx,_MOV, RTMP, VAL_CONST, M_PTR, e->value);
+					EMIT(_PUSH, RTMP, UNUSED, M_PTR);
+				}
+			}
+			break;
+		case DEBUG_BREAK:
+			BREAK();
+			break;
+		case RET:
+			if( !IS_NULL(e->a) ) {
+				ereg ret = IS_FLOAT(e->mode) ? 
MMX(0) : R(RAX); + if( e->a != ret ) emit_mov(ctx, ret, e->a, e->mode); + } + EMIT(_RET, UNUSED, UNUSED, M_NONE); + break; + case LOAD_CONST: + { + emit_mode mode = e->mode; + if( !IS_REG(out) ) + mode = (mode == M_F32 ? M_I32 : mode == M_F64 ? M_PTR : mode); // don't use FP for stack ops + ereg w = IS_REG(out) ? out : get_tmp(mode); + if( e->value == 0 ) + EMIT(mode == M_F32 ? XORPS : mode == M_F64 ? XORPD : XOR, w, w, mode); + else if( IS_FLOAT(mode) ) { + // MOVSS / MOVSD with data relative + B(e->mode == M_F32 ? 0xF3 : 0xF2); + if( out&8 ) B(0x44); + B(0x0F); + B(0x10); + MOD_RM(0,out&7,5); + W(0); + alloc_const(ctx, e->value); + } else if( mode == M_PTR && (e->value&0xFFFFFFFF) == e->value ) + emit_ext(ctx, _MOV, w, VAL_CONST, M_I32, e->value); + else + emit_ext(ctx, _MOV, w, VAL_CONST, mode, e->value); + if( w != out ) + emit_mov(ctx, out, w, mode); + } + break; + case LOAD_ADDR: + if( IS_REG(e->a) && e->nargs == e->mode ) { + emit_mov(ctx, out, REG_ADD_OFFSET(REG_PTR(e->a),e->size_offs), e->nargs); + } else { + ereg tmp = IS_REG(out) || (e->nargs == e->mode) ? out : RTMP; + emit_mov(ctx, RTMP, e->a, M_PTR); + emit_mov(ctx, tmp, MK_ADDR(RTMP,e->size_offs), e->nargs); + if( out != tmp ) + emit_mov(ctx, out, tmp, e->mode); + } + break; + case LOAD_FUN: + { + ereg w = IS_REG(out) ? out : RTMP; + int pos = emit_lea_rel(ctx,w); + int fid = e->size_offs; + int_arr_add_impl(&ctx->jit->galloc,&ctx->funs,pos); + int_arr_add_impl(&ctx->jit->galloc,&ctx->funs,fid); + if( w != out ) + emit_mov(ctx, out, w, M_PTR); + } + break; + case CALL_FUN: + B(0xE8); + { + int pos = jit->out_pos + byte_count(ctx->code); + int fid = e->a; + int_arr_add_impl(&ctx->jit->galloc,&ctx->funs,pos); + int_arr_add_impl(&ctx->jit->galloc,&ctx->funs,fid); + W(0); + } + break; + case CALL_PTR: + if( e->value == (uint64)hl_null_access || e->value == (uint64)hl_jit_null_field_access ) { + // call near + int target = e->value == (uint64)hl_null_access ? 
ctx->null_access_pos : ctx->null_field_pos; + B(0xE8); + W(target - (jit->out_pos + byte_count(ctx->code) + 4)); + } else { + // call near indirect + B(0xFF); + B(0x15); + W(0); + alloc_const(ctx, (uint64)e->value); + if( e->mode == M_UI8 || e->mode == M_UI16 ) { + // clear value upper bits + EMIT(e->mode == M_UI8 ? MOVZX8 : MOVZX16,R(RAX),R(RAX),M_PTR); + } + } + break; + case CALL_REG: + EMIT(_CALL, e->a, UNUSED, M_NONE); + break; + case TEST: + if( IS_FLOAT(e->mode) ) + jit_assert(); + if( !IS_REG(e->a) ) { + ereg tmp = get_tmp(e->mode); + emit_mov(ctx, tmp, e->a, e->mode); + EMIT(_TEST,tmp,tmp,e->mode); + } else + EMIT(_TEST,e->a,e->a,e->mode); + break; + case CMP: + { + CpuOp op; + switch( e->mode ) { + case M_UI8: op = CMP8; break; + case M_UI16: op = CMP16; break; + case M_F32: op = COMISS; break; + case M_F64: op = COMISD; break; + default: op = _CMP; break; + } + ereg a = e->a; + if( !IS_REG(e->a) && (IS_FLOAT(e->mode) || !IS_REG(e->b)) ) { + ereg tmp = get_tmp(e->mode); + emit_mov(ctx, tmp, e->a, e->mode); + a = tmp; + } + EMIT(op,a,e->b,e->mode); + if( IS_FLOAT(e->mode) && e->size_offs != OJSGt && e->size_offs != OJNull && e->size_offs != OJNotNull ) { + // handle NaNs + int jnotnan = jump_near(ctx,JNParity); + switch( e->size_offs ) { + case OJSLt: + case OJNotLt: + // set CF=0, ZF=1 + EMIT(XOR,RTMP,RTMP,M_I32); + break; + case OJSGte: + case OJNotGte: + // set ZF=0, CF=1 + EMIT(XOR,RTMP,RTMP,M_I32); + EMIT(STC,UNUSED,UNUSED,0); + break; + case OJNotEq: + case OJEq: + // set ZF=0, CF=? 
+					case OJSLte:
+						// set ZF=0, CF=0
+						// (_TEST is the CpuOp; plain TEST is the einstr opcode handled by this switch)
+						EMIT(_TEST,R(RSP),R(RSP),M_PTR);
+						break;
+					default:
+						jit_assert();
+					}
+					patch_jump_near(ctx,jnotnan);
+				}
+			}
+			break;
+		case JCOND:
+			{
+				int jump = get_cond_jump(ctx);
+				emit_jump(ctx, jump, e->size_offs);
+			}
+			break;
+		case JUMP:
+			emit_jump(ctx, JAlways, e->size_offs);
+			break;
+		case JUMP_TABLE:
+			{
+				// reserve one pointer-sized slot per target in the constant table and
+				// load the table address (patched later through const_refs) into RTMP
+				int start = reserve_const_segment(ctx,HL_WSIZE * e->nargs,16);
+				int pos = emit_lea_rel(ctx, RTMP);
+				int_arr_add_impl(&ctx->jit->galloc,&ctx->const_refs,pos);
+				int_arr_add_impl(&ctx->jit->galloc,&ctx->const_refs,start);
+				ereg a = RTMP;
+				ereg b = e->a;
+				if( IS_REG(b) ) {
+					// jump [a+b*8]
+					B(0x40 | ((a&8)?1:0) | ((b&8)?2:0));
+					B(0xFF);
+					B(0x24);
+					// SIB takes the multiplier itself (1/2/4/8), not the encoded scale
+					SIB(8,(b&7),(a&7));
+				} else {
+					ereg save = R(RAX);
+					EMIT(_PUSH,save,UNUSED,M_PTR);
+					EMIT(_MOV,save,b,M_I32);
+					// lea tmp, [tmp+save*8]
+					einstr etmp;
+					etmp.a = a;
+					etmp.b = save;
+					etmp.size_offs = 8;
+					emit_lea(ctx, RTMP, &etmp);
+					EMIT(_POP,save,UNUSED,M_PTR);
+					// jump [tmp]
+					B(0x40 | ((RTMP&8)?1:0));
+					B(0xFF);
+					MOD_RM(0,4,RTMP&7);
+				}
+				// record (table slot, target instruction index) pairs; the index is
+				// converted to a code offset at the end of hl_codegen_function
+				ereg *args = hl_emit_get_args(jit->emit,e);
+				for(int k=0;k<e->nargs;k++) {
+					int_arr_add_impl(&jit->galloc,&ctx->const_addr,start + k * HL_WSIZE);
+					int_arr_add_impl(&jit->galloc,&ctx->const_addr,ctx->cur_op + (int)args[k] + 1);
+				}
+			}
+			break;
+		case CONV_UNSIGNED:
+		case CONV:
+			{
+				emit_mode in_mode = e->size_offs;
+				ereg r = IS_REG(e->a) ? 
e->a : get_tmp(in_mode); + if( r != e->a ) emit_mov(ctx, r, e->a, in_mode); + CpuOp op = -1; + switch( ID2(e->mode,in_mode) ) { + case ID2(M_F32,M_UI8): + case ID2(M_F32,M_UI16): + case ID2(M_F32,M_I32): + case ID2(M_F32,M_PTR): + op = CVTSI2SS; + break; + case ID2(M_F64,M_UI8): + case ID2(M_F64,M_UI16): + case ID2(M_F64,M_I32): + case ID2(M_F64,M_PTR): + op = CVTSI2SD; + break; + case ID2(M_UI8,M_F32): + case ID2(M_UI16,M_F32): + case ID2(M_I32,M_F32): + case ID2(M_PTR,M_F32): + op = CVTTSS2SI; + break; + case ID2(M_UI8,M_F64): + case ID2(M_UI16,M_F64): + case ID2(M_I32,M_F64): + case ID2(M_PTR,M_F64): + op = CVTTSD2SI; + break; + case ID2(M_F32,M_F64): + op = CVTSD2SS; + break; + case ID2(M_F64,M_F32): + op = CVTSS2SD; + break; + case ID2(M_PTR,M_I32): + // sign extend 32-64 bit conv + op = MOVSXD; + break; + case ID2(M_UI16,M_UI8): + case ID2(M_I32,M_UI8): + case ID2(M_PTR,M_UI8): + case ID2(M_UI8, M_UI16): + case ID2(M_UI8, M_I32): + case ID2(M_UI8, M_PTR): + op = MOVZX8; + break; + case ID2(M_I32,M_UI16): + case ID2(M_PTR,M_UI16): + case ID2(M_UI16, M_I32): + case ID2(M_UI16, M_PTR): + op = MOVZX16; + break; + case ID2(M_I32,M_PTR): + op = _MOV; + break; + default: + jit_assert(); + break; + } + if( IS_REG(out) || op == _MOV ) + EMIT(op,out,r,e->op == CONV_UNSIGNED ? M_PTR : e->mode); + else { + ereg r2 = get_tmp(e->mode); + EMIT(op,r2,r,e->op == CONV_UNSIGNED ? 
M_PTR : e->mode);
+					emit_mov(ctx,out,r2,e->mode);
+				}
+			}
+			break;
+		case BINOP:
+		case UNOP:
+			emit_anyop(ctx, e->size_offs, out, e->a, e->b, e->mode);
+			break;
+		case LEA:
+			if( !IS_REG(out) ) {
+				// compute into a temporary, then spill to the memory destination
+				ereg tmp = get_tmp(e->mode);
+				emit_lea(ctx,tmp,e);
+				emit_mov(ctx,out,tmp,e->mode);
+			} else
+				emit_lea(ctx,out,e);
+			break;
+		case STACK_OFFS:
+			if( e->size_offs >= 0 )
+				EMIT(ADD,R(RSP),MK_CONST(e->size_offs),M_PTR);
+			else
+				EMIT(SUB,R(RSP),MK_CONST(-e->size_offs),M_PTR);
+			break;
+		case PREFETCH:
+			{
+				CpuOp op;
+				switch( e->size_offs ) {
+				case 0: op = PREFETCHT0; break;
+				case 1: op = PREFETCHT1; break;
+				case 2: op = PREFETCHT2; break;
+				case 3: op = PREFETCHNTA; break;
+				case 4: op = PREFETCHW; break;
+				default: jit_assert();
+				}
+				ereg a = e->a;
+				if( !IS_REG(e->a) ) {
+					emit_mov(ctx,RTMP,e->a,M_PTR);
+					a = RTMP;
+				}
+				EMIT(op,REG_PTR(a),UNUSED,M_PTR);
+			}
+			break;
+		case CMOV:
+			{
+				// condition comes from the preceding compare, as for JCOND
+				int cond = get_cond_jump(ctx);
+				if( !IS_REG(out) ) jit_assert();
+				if( IS_REG(e->a) ) {
+					emit_cmov(ctx,out,e->a,cond,M_PTR);
+				} else {
+					emit_mov(ctx,RTMP,e->a,e->mode);
+					emit_cmov(ctx,out,RTMP,cond,M_PTR);
+				}
+			}
+			break;
+		case CXCHG:
+			BREAK();
+			break;
+		case NOP:
+			emit_nop(ctx,1);
+			break;
+		default:
+			jit_assert();
+			break;
+		}
+		if( ctx->code.cur > ctx->code.max ) jit_assert();
+	}
+	align_function(ctx);
+	hl_codegen_flush(jit);
+	// short jumps: pos is the offset of the rel8 byte, the displacement is
+	// relative to the byte that follows it
+	for(int i=0;i<int_arr_count(ctx->short_jumps);i+=2) {
+		int pos = int_arr_get(ctx->short_jumps,i);
+		int target = int_arr_get(ctx->short_jumps,i+1);
+		int offset = ctx->pos_map[target] - (pos + 1);
+		if( !IS_SBYTE(offset) ) jit_assert();
+		*(char*)&ctx->code.values[pos] = (char)offset;
+	}
+	// near jumps: pos is the offset of the rel32 field, the displacement is
+	// relative to the end of that field
+	for(int i=0;i<int_arr_count(ctx->near_jumps);i+=2) {
+		int pos = int_arr_get(ctx->near_jumps,i);
+		int target = int_arr_get(ctx->near_jumps,i+1);
+		int offset = ctx->pos_map[target] - (pos + 4);
+		*(int*)&ctx->code.values[pos] = offset;
+	}
+	// jump table entries added by this function: replace the target
+	// instruction index by its offset in the output
+	for(int i=const_addr_prev;i<int_arr_count(ctx->const_addr);i+=2) {
+		int target = int_arr_get(ctx->const_addr,i+1);
+		int offs = jit->out_pos + ctx->pos_map[target];
+		
ctx->const_addr.values[i+1] = offs;
+	}
+}
+
+// Allocates a zero-initialised code generation context and links it with jit.
+void hl_codegen_alloc( jit_ctx *jit ) {
+	code_ctx *ctx = (code_ctx*)malloc(sizeof(code_ctx));
+	memset(ctx,0,sizeof(code_ctx));
+	jit->code = ctx;
+	ctx->jit = jit;
+}
+
+// Declares the code emitted since start (an output position) as one function,
+// then pads the buffer up to the next function alignment boundary.
+static void flush_function( code_ctx *ctx, int start ) {
+	hl_jit_define_function(ctx->jit, start, ctx->jit->out_pos + byte_count(ctx->code) - start);
+	align_function(ctx);
+	if( byte_count(ctx->code) > ctx->code.max ) jit_assert();
+}
+
+// Emits the fixed stubs shared by all generated code: the two null access
+// trampolines and the c2hl / hl2c calling convention bridges.
+void hl_codegen_init( jit_ctx *jit ) {
+	code_ctx *ctx = jit->code;
+	byte_reserve(ctx->code,1024);
+	ctx->code.cur -= 1024;
+
+	// generate hl_null_access stub
+	ctx->null_access_pos = jit->out_pos + byte_count(ctx->code);
+	EMIT(_PUSH,R(RBP),UNUSED,M_PTR);
+	EMIT(_MOV,R(RBP),R(RSP),M_PTR);
+	EMIT(SUB,R(RSP),MK_CONST(0x20),M_PTR);
+	emit_ext(ctx,_MOV,R(RAX),VAL_CONST,M_PTR,(int_val)hl_null_access);
+	EMIT(_CALL,R(RAX),UNUSED,M_PTR);
+	BREAK();
+	flush_function(ctx, ctx->null_access_pos);
+
+	// generate hl_null_field access stub
+	ctx->null_field_pos = jit->out_pos + byte_count(ctx->code);
+	EMIT(_PUSH,R(RBP),UNUSED,M_PTR);
+	EMIT(_MOV,R(RBP),R(RSP),M_PTR);
+	EMIT(SUB,R(RSP),MK_CONST(0x28),M_PTR);
+	EMIT(_MOV,jit->cfg.regs.arg[0],MK_ADDR(RBP,HL_WSIZE*2),M_I32);
+	emit_ext(ctx,_MOV,R(RAX),VAL_CONST,M_PTR,(int_val)hl_jit_null_field_access);
+	EMIT(_CALL,R(RAX),UNUSED,M_PTR);
+	BREAK();
+	flush_function(ctx, ctx->null_field_pos);
+
+	// generate c2hl stub
+	jit->code_funs.c2hl = jit->out_pos + byte_count(ctx->code);
+	regs_config *cfg = &jit->cfg;
+	EMIT(_PUSH,R(RBP),UNUSED,M_PTR);
+	EMIT(_MOV,R(RBP),R(RSP),M_PTR);
+
+	// move the three incoming arguments out of the argument registers,
+	// which are overwritten just below
+	ereg fptr = scratch_not_param[0];
+	ereg vargs = scratch_not_param[1];
+	ereg nargs = scratch_not_param[2];
+	EMIT(_MOV,fptr,cfg->regs.arg[0],M_PTR);
+	EMIT(_MOV,vargs,cfg->regs.arg[1],M_PTR);
+	EMIT(_MOV,nargs,cfg->regs.arg[2],M_I32);
+
+	// load every integer argument register, then every float one, from vargs
+	for(int i=0;i<cfg->regs.nargs;i++)
+		EMIT(_MOV, cfg->regs.arg[i], MK_ADDR(vargs,i*8), M_PTR);
+	for(int i=0;i<cfg->floats.nargs;i++)
+		EMIT(MOVSD, cfg->floats.arg[i]-64, 
MK_ADDR(vargs,(i + cfg->regs.nargs) * 8), M_PTR); + + EMIT(ADD,vargs,MK_CONST((MAX_ARGS - 1) * HL_WSIZE),M_PTR); + int begin = byte_count(ctx->code); + EMIT(_TEST,nargs,nargs,M_I32); + int pos = jump_near(ctx,JZero); + EMIT(_PUSH,MK_ADDR(vargs,0),UNUSED,M_PTR); + EMIT(SUB,vargs,MK_CONST(HL_WSIZE),M_PTR); + EMIT(DEC,nargs,UNUSED,M_I32); + jump_near(ctx,-begin); + patch_jump_near(ctx,pos); + + if( IS_WINCALL64 ) EMIT(SUB,R(RSP),MK_CONST(0x20),M_PTR); + EMIT(_CALL, fptr, UNUSED, M_NONE); + + EMIT(_MOV,R(RSP),R(RBP),M_PTR); + EMIT(_POP,R(RBP),UNUSED,M_PTR); + EMIT(_RET,UNUSED,UNUSED,M_NONE); + + flush_function(ctx, jit->code_funs.c2hl); + + // generate hl2c stub + jit->code_funs.hl2c = jit->out_pos + byte_count(ctx->code); + ereg cl = cfg->regs.arg[0]; + ereg tmp = cfg->regs.arg[1]; + EMIT(_PUSH,R(RBP),UNUSED,M_PTR); + EMIT(_MOV,R(RBP),R(RSP),M_PTR); + EMIT(SUB,R(RSP),MK_CONST(cfg->floats.nargs*8),M_PTR); + + // push all possible call registers + for(int i=0;ifloats.nargs;i++) + EMIT(MOVSD,MK_ADDR(RSP,i*8),cfg->floats.arg[cfg->floats.nargs - 1 - i],M_F64); + for(int i=0;iregs.nargs;i++) + EMIT(_PUSH,cfg->regs.arg[cfg->regs.nargs - 1 - i],UNUSED,M_PTR); + + // opcodes for: + // switch( arg0->t->fun->ret->kind ) { + // case HF32: case HF64: return jit_wrapper_d(arg0,&args); + // default: return jit_wrapper_ptr(arg0,&args); + // } + hl_type_fun *ft = NULL; + ereg fun_ptr = scratch_not_param[0]; + + EMIT(_MOV,tmp,MK_ADDR(cl,0),M_PTR); // ->t + EMIT(_MOV,tmp,MK_ADDR(tmp,HL_WSIZE),M_PTR); // ->fun + EMIT(_MOV,tmp,MK_ADDR(tmp,(int)(int_val)&ft->ret),M_PTR); // ->rets + EMIT(_MOV,tmp,MK_ADDR(tmp,0),M_I32); // ->kind + + EMIT(_CMP,tmp,MK_CONST(HF64),M_I32); + int float1 = jump_near(ctx,JEq); + EMIT(_CMP,tmp,MK_CONST(HF32),M_I32); + int float2 = jump_near(ctx,JEq); + emit_ext(ctx,_MOV,fun_ptr,VAL_CONST,M_PTR,(int_val)hl_jit_wrapper_ptr); + + int jexit = jump_near(ctx, JAlways); + patch_jump_near(ctx, float1); + patch_jump_near(ctx, float2); + 
emit_ext(ctx,_MOV,fun_ptr,VAL_CONST,M_PTR,(int_val)hl_jit_wrapper_d); + patch_jump_near(ctx, jexit); + + int stack_args_pos = HL_WSIZE * (IS_64?2:3); + if( IS_WINCALL64 ) { + stack_args_pos += 0x20; + EMIT(SUB,R(RSP),MK_CONST(0x20),M_PTR); + } + EMIT(_LEA,cfg->regs.arg[1],MK_ADDR(R(RBP),stack_args_pos),M_PTR); + EMIT(_LEA,cfg->regs.arg[2],MK_ADDR(R(RBP),-(cfg->floats.nargs * 8 + cfg->regs.nargs * HL_WSIZE)),M_PTR); + EMIT(_CALL,fun_ptr,UNUSED,M_PTR); + + if( IS_WINCALL64 ) + EMIT(ADD,R(RSP),MK_CONST(0x20),M_PTR); + + EMIT(_MOV,R(RSP),R(RBP),M_PTR); + EMIT(_POP,R(RBP),UNUSED,M_PTR); + EMIT(_RET,UNUSED,UNUSED,M_NONE); + + flush_function(ctx, jit->code_funs.hl2c); + + + hl_codegen_flush(jit); +} + +void hl_codegen_free( jit_ctx *jit ) { + code_ctx *ctx = jit->code; + free(ctx->pos_map); + free(ctx); +} + +void hl_codegen_flush_consts( jit_ctx *jit ) { + code_ctx *ctx = jit->code; + // patch function offsets + for(int i=0;ifuns);i+=2) { + int pos = int_arr_get(ctx->funs,i); + int fid = int_arr_get(ctx->funs,i+1); + int offset = (int)(int_val)jit->mod->functions_ptrs[fid] - (pos + 4); + *(int*)(jit->output + pos) = offset; + } + int_arr_reset(&ctx->funs); + // emit constant table + jit->code_size = byte_count(ctx->const_table); + jit->code_instrs = ctx->const_table.values; + ctx->const_table_pos = jit->out_pos; + // patch constant offsets + for(int i=0;iconst_refs);i+=2) { + int pos = int_arr_get(ctx->const_refs,i); + int coffs = int_arr_get(ctx->const_refs,i+1); + int offset = (ctx->const_table_pos + coffs) - (pos + 4); + *(int*)(jit->output + pos) = offset; + } + int_arr_reset(&ctx->const_refs); + // cleanup + byte_free(&ctx->const_table); + value_map_free(&ctx->const_table_lookup); +} + +void hl_codegen_final( jit_ctx *jit ) { + code_ctx *ctx = jit->code; + // patch absolute addresses + for(int i=0;iconst_addr);i+=2) { + int pos = int_arr_get(ctx->const_addr,i); + int offs = int_arr_get(ctx->const_addr,i+1); + *(void**)(jit->final_code + ctx->const_table_pos + pos) = 
jit->final_code + offs; + } + int_arr_free(&ctx->const_addr); +} diff --git a/src/main.c b/src/main.c index 6054060d0..5ad605e36 100644 --- a/src/main.c +++ b/src/main.c @@ -20,7 +20,7 @@ * DEALINGS IN THE SOFTWARE. */ #include -#include +#include #include "hlsystem.h" #ifdef HL_WIN @@ -259,7 +259,7 @@ int main(int argc, pchar *argv[]) { file = PSTR("hlboot.dat"); fchk = pfopen(file,"rb"); if( fchk == NULL ) { - printf("HL/JIT %d.%d.%d (c)2015-2025 Haxe Foundation\n Usage : hl [--debug ] [--debug-wait] \n",HL_VERSION>>16,(HL_VERSION>>8)&0xFF,HL_VERSION&0xFF); + printf("HL/JIT %d.%d.%d (c)2015-2026 Haxe Foundation\n Usage : hl [--debug ] [--debug-wait] \n",HL_VERSION>>16,(HL_VERSION>>8)&0xFF,HL_VERSION&0xFF); return 1; } fclose(fchk); diff --git a/src/module.c b/src/module.c index e668b1064..b6d7a4a97 100644 --- a/src/module.c +++ b/src/module.c @@ -21,6 +21,7 @@ */ #include #include +#include #ifdef HL_WIN # undef _GUID @@ -34,6 +35,10 @@ EXTERN_C IMAGE_DOS_HEADER __ImageBase; #define HOT_RELOAD_EXTRA_GLOBALS 4096 +#ifdef HL_DEBUG +# define ALLOW_DUMP +#endif + HL_API void hl_prim_not_loaded( const uchar *err ); static hl_module **cur_modules = NULL; @@ -72,7 +77,7 @@ static bool module_resolve_pos( hl_module *m, void *addr, int *fidx, int *fpos ) while( min < max ) { int mid = (min + max) >> 1; int offset = dbg->large ? 
((int*)dbg->offsets)[mid] : ((unsigned short*)dbg->offsets)[mid]; - if( offset <= code_pos ) + if( offset < code_pos ) min = mid + 1; else max = mid; @@ -224,10 +229,8 @@ static int module_capture_stack( void **stack, int size ) { unsigned char *code = m->jit_code; int code_size = m->codesize; if( module_addr >= (void*)code && module_addr < (void*)(code + code_size) ) { - if( stack && count == size ) { + if( stack && count == size ) break; - } - if( stack ) stack[count++] = module_addr; else @@ -248,6 +251,41 @@ static int module_capture_stack( void **stack, int size ) { } } return count; +#elif defined(__aarch64__) || defined(_M_ARM64) + // On AArch64, walk the frame pointer (X29) chain instead of scanning the stack. + // The heuristic scanner produces false positives from callee-saved register spills + // (STP X19,X20 etc.) that look like (stack_addr, code_addr) pairs. + void *stack_top = hl_get_thread()->stack_top; + void **fp = (void **)__builtin_frame_address(0); + int count = 0; + while( fp && (void *)fp < stack_top ) { + void *lr = fp[1]; + void *next_fp = fp[0]; + int i; + for(i=0;ijit_code; + int code_size = m->codesize; + if( lr >= (void*)code && lr < (void*)(code + code_size) ) { + if( m->jit_debug ) { + int s = m->jit_debug[0].start; + code += s; + code_size -= s; + if( lr < (void*)code || lr >= (void*)(code + code_size) ) continue; + } + if( stack ) { + if( count == size ) return count; + stack[count] = lr; + } + count++; + break; + } + } + if( next_fp == NULL || next_fp <= (void *)fp || next_fp >= stack_top ) + break; + fp = (void **)next_fp; + } + return count; #else return hl_module_capture_stack_range(hl_get_thread()->stack_top, (void**)&stack, stack, size); #endif @@ -705,21 +743,57 @@ int hl_module_init( hl_module *m, h_bool hot_reload ) { if( hot_reload ) m->hash = hl_code_hash_alloc(m->code); hl_module_init_natives(m); hl_module_init_indexes(m); +# ifdef WIN64_UNWIND_TABLES + m->unwind_table_size = m->code->nfunctions + 10; // extra space for 
jit internals + m->unwind_table = malloc(sizeof(RUNTIME_FUNCTION) * m->unwind_table_size); + memset(m->unwind_table, 0, sizeof(RUNTIME_FUNCTION) * m->unwind_table_size); +# endif // JIT ctx = hl_jit_alloc(); if( ctx == NULL ) return 0; hl_jit_init(ctx, m); +# ifdef ALLOW_DUMP + bool dump = false; + int filter = -1; + for(i=0;i= '0' && arg[pos] <= '9' ) + filter |= arg[pos] - '0'; + else + filter |= arg[pos] - 'A' + 10; + pos++; + } + } + } +# endif for(i=0;icode->nfunctions;i++) { hl_function *f = m->code->functions + i; +# ifdef ALLOW_DUMP + if( filter >= 0 && filter != f->findex ) continue; +# endif int fpos = hl_jit_function(ctx, m, f); if( fpos < 0 ) { hl_jit_free(ctx, false); return 0; } m->functions_ptrs[f->findex] = (void*)(int_val)fpos; +# ifdef ALLOW_DUMP + if( dump ) hl_emit_dump(ctx); +# endif } m->jit_code = hl_jit_code(ctx, m, &m->codesize, &m->jit_debug, NULL); +# ifdef ALLOW_DUMP + if( filter >= 0 ) exit(0); +# endif for(i=0;icode->nfunctions;i++) { hl_function *f = m->code->functions + i; m->functions_ptrs[f->findex] = ((unsigned char*)m->jit_code) + ((int_val)m->functions_ptrs[f->findex]); @@ -735,6 +809,9 @@ int hl_module_init( hl_module *m, h_bool hot_reload ) { hl_gc_set_dump_types(hl_module_types_dump); # ifdef HL_VTUNE hl_setup.vtune_init = modules_init_vtune; +# endif +# ifdef WIN64_UNWIND_TABLES + RtlAddFunctionTable(m->unwind_table, m->unwind_table_size, (DWORD64)m->jit_code); # endif hl_jit_free(ctx, hot_reload); if( hot_reload ) { diff --git a/src/opcodes.h b/src/opcodes.h index ab9b1fa51..9e4df7f60 100644 --- a/src/opcodes.h +++ b/src/opcodes.h @@ -67,8 +67,8 @@ OP_BEGIN OP(OIncr,R,X,X) OP(ODecr,R,X,X) - OP(OCall0,R,R,X) - OP(OCall1,R,R,R) + OP(OCall0,R,C,X) + OP(OCall1,R,C,R) OP(OCall2,R,AR,4) OP(OCall3,R,AR,5) OP(OCall4,R,AR,6) @@ -78,17 +78,17 @@ OP_BEGIN OP(OCallClosure,R,AR,VAR_ARGS) OP(OStaticClosure,R,G,X) - OP(OInstanceClosure,R,R,G) + OP(OInstanceClosure,R,C,R) OP(OVirtualClosure,R,R,G) OP(OGetGlobal,R,G,X) - 
OP(OSetGlobal,R_NW,G,X) - OP(OField,R,R,C) - OP(OSetField,R_NW,R,C) - OP(OGetThis,R,C,X) - OP(OSetThis,R_NW,R,X) + OP(OSetGlobal,G,R,X) + OP(OField,R,R,G) + OP(OSetField,R_NW,G,R) + OP(OGetThis,R,G,X) + OP(OSetThis,G,R,X) OP(ODynGet,R,R,C) - OP(ODynSet,R_NW,R,C) + OP(ODynSet,R_NW,C,R) OP(OJTrue,R_NW,J,X) OP(OJFalse,R_NW,J,X) @@ -134,7 +134,7 @@ OP_BEGIN OP(ONew,R,X,X) OP(OArraySize,R,R,X) - OP(OType,R,R,X) + OP(OType,R,G,X) OP(OGetType,R,R,X) OP(OGetTID,R,R,X) diff --git a/src/profile.c b/src/profile.c index e0df0efc3..09ba265ed 100644 --- a/src/profile.c +++ b/src/profile.c @@ -146,13 +146,23 @@ static void *get_thread_stackptr( thread_handle *t, void **eip ) { return (void*)c.Esp; # endif #elif defined(HL_LINUX) -# ifdef HL_64 +# if defined(__aarch64__) || defined(_M_ARM64) + *eip = (void*)shared_context.context.uc_mcontext.pc; + return (void*)shared_context.context.uc_mcontext.sp; +# elif defined(HL_64) *eip = (void*)shared_context.context.uc_mcontext.gregs[REG_RIP]; return (void*)shared_context.context.uc_mcontext.gregs[REG_RSP]; # else *eip = (void*)shared_context.context.uc_mcontext.gregs[REG_EIP]; return (void*)shared_context.context.uc_mcontext.gregs[REG_ESP]; # endif +#elif defined(HL_MAC) && defined(__aarch64__) + struct __darwin_mcontext64 *mcontext = shared_context.context.uc_mcontext; + if (mcontext != NULL) { + *eip = (void*)mcontext->__ss.__pc; + return (void*)mcontext->__ss.__sp; + } + return NULL; #elif defined(HL_MAC) && defined(__x86_64__) struct __darwin_mcontext64 *mcontext = shared_context.context.uc_mcontext; if (mcontext != NULL) { diff --git a/src/std/types.c b/src/std/types.c index eaf228db6..8db708185 100644 --- a/src/std/types.c +++ b/src/std/types.c @@ -35,7 +35,7 @@ HL_PRIM hl_type hlt_bool = { HBOOL }; HL_PRIM hl_type hlt_abstract = { HABSTRACT, {USTR("")} }; static const uchar *TSTR[] = { - USTR("void"), USTR("i8"), USTR("i16"), USTR("i32"), USTR("i64"), USTR("f32"), USTR("f64"), + USTR("void"), USTR("ui8"), USTR("ui16"), 
USTR("i32"), USTR("i64"), USTR("f32"), USTR("f64"), USTR("bool"), USTR("bytes"), USTR("dynamic"), NULL, NULL, USTR("array"), USTR("type"), NULL, NULL, USTR("dynobj"), NULL, NULL, NULL, NULL, NULL, NULL, USTR("guid") @@ -43,8 +43,8 @@ static const uchar *TSTR[] = { static int T_SIZES[] = { 0, // VOID - 1, // I8 - 2, // I16 + 1, // UI8 + 2, // UI16 4, // I32 8, // I64 4, // F32 @@ -160,8 +160,8 @@ HL_PRIM bool hl_same_type( hl_type *a, hl_type *b ) { HL_PRIM bool hl_is_dynamic( hl_type *t ) { static bool T_IS_DYNAMIC[] = { false, // HVOID, - false, // HI8 - false, // HI16 + false, // HUI8 + false, // HUI16 false, // HI32 false, // HI64 false, // HF32 @@ -190,8 +190,8 @@ HL_PRIM bool hl_is_dynamic( hl_type *t ) { HL_PRIM bool hl_is_ptr( hl_type *t ) { static bool T_IS_PTR[] = { false, // HVOID, - false, // HI8 - false, // HI16 + false, // HUI8 + false, // HUI16 false, // HI32 false, // HI64 false, // HF32