diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 0528289f3..60fff3b23 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -22,7 +22,7 @@ jobs:
       fail-fast: false
       matrix:
         target: [linux, darwin, windows]
-        architecture: [32, 64, arm64]
+        architecture: [64, arm64]
         build_system: [make, cmake, cmake-mingw, cmake-clang-cl, vs2019, makegcc14]
 
         include:
@@ -429,7 +429,7 @@ jobs:
       fail-fast: false
       matrix:
         os: [darwin, linux, windows]
-        architecture: [x86_32, x86_64, arm64]
+        architecture: [x86_64, arm64]
         include:
         - architecture: arm64
           test-flags: --skip-hl-jit # not yet supported
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ddd2fd260..5cbb7277a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,7 +1,7 @@
 cmake_minimum_required(VERSION 3.13)
 
-set(HL_VERSION_MAJOR 1)
-set(HL_VERSION_MINOR 16)
+set(HL_VERSION_MAJOR 2)
+set(HL_VERSION_MINOR 0)
 set(HL_VERSION_PATCH 0)
 set(HL_VERSION ${HL_VERSION_MAJOR}.${HL_VERSION_MINOR}.${HL_VERSION_PATCH})
 
@@ -20,7 +20,8 @@ include(FindPkgConfig)
 include(CTest)
 
 set(WITH_VM_DEFAULT ON)
-if(CMAKE_SYSTEM_PROCESSOR MATCHES "arm|aarch64" AND (NOT CMAKE_OSX_ARCHITECTURES MATCHES "x86_64"))
+# No JIT backend exists for 32-bit ARM (arm, armv6l, armv7l, armv8l, ARM, ...); 64-bit aarch64/arm64/ARM64 uses src/jit_aarch64.c.
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm|ARM)" AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm64|ARM64)" AND (NOT CMAKE_OSX_ARCHITECTURES MATCHES "x86_64"))
     set(WITH_VM_DEFAULT OFF)
 endif()
 
@@ -225,9 +226,18 @@ else()
 endif()
 
 if (WITH_VM)
+    if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64|ARM64" AND (NOT CMAKE_OSX_ARCHITECTURES MATCHES "x86_64")) # an arm64 host cross-building x86_64 must get the x86_64 backend
+        set(HL_JIT_BACKEND src/jit_aarch64.c src/jit_aarch64_emit.c)
+    else()
+        set(HL_JIT_BACKEND src/jit_x86_64.c)
+    endif()
     add_executable(hl
         src/code.c
         src/jit.c
+        src/jit_emit.c
+        src/jit_regs.c
+        ${HL_JIT_BACKEND}
+        src/jit_dump.c
         src/main.c
         src/module.c
         src/debugger.c
diff --git a/Makefile b/Makefile
index aded6c272..e1ffa169b 100644
--- a/Makefile
+++ b/Makefile
@@ -41,7 +41,13 @@ STD = src/std/array.o src/std/buffer.o src/std/bytes.o src/std/cast.o src/std/da
 	src/std/socket.o src/std/string.o src/std/sys.o src/std/types.o src/std/ucs2.o src/std/thread.o src/std/process.o \
 	src/std/track.o
 
-HL_OBJ = src/code.o src/jit.o src/main.o src/module.o src/debugger.o src/profile.o
+ifneq ($(filter arm64 aarch64,$(ARCH)),) # uname -m gives aarch64 on Linux and arm64 on macOS; accept both in case ARCH is not normalized
+HL_JIT_BACKEND_OBJ = src/jit_aarch64.o src/jit_aarch64_emit.o
+else
+HL_JIT_BACKEND_OBJ = src/jit_x86_64.o
+endif
+
+HL_OBJ = src/code.o src/jit.o src/jit_emit.o src/jit_regs.o $(HL_JIT_BACKEND_OBJ) src/jit_dump.o src/main.o src/module.o src/debugger.o src/profile.o
 
 FMT_CPPFLAGS = -I include/mikktspace -I include/minimp3
 
@@ -240,19 +246,12 @@ LIBHL = libhl.$(LIBEXT)
 HL = hl$(EXE_SUFFIX)
 HLC = hlc$(EXE_SUFFIX)
 
-all: $(LIBHL) libs
-ifeq ($(ARCH),arm64)
-	$(warning HashLink vm is not supported on arm64, skipping)
-else
-all: $(HL)
-endif
+all: $(LIBHL) libs $(HL)
 
 install:
 	$(UNAME)==Darwin && ${MAKE} uninstall
-ifneq ($(ARCH),arm64)
 	mkdir -p $(INSTALL_BIN_DIR)
 	cp $(HL) $(INSTALL_BIN_DIR)
-endif
 	mkdir -p $(INSTALL_LIB_DIR)
 	cp *.hdll $(INSTALL_LIB_DIR)
 	cp $(LIBHL) $(INSTALL_LIB_DIR)
@@ -365,11 +364,7 @@ release_win:
 	rm -rf $(PACKAGE_NAME)
 
 release_linux release_osx:
-ifeq ($(ARCH),arm64)
-	cp $(LIBHL) *.hdll $(PACKAGE_NAME)
-else
 	cp $(HL) $(LIBHL) *.hdll $(PACKAGE_NAME)
-endif
 	tar -cvzf $(PACKAGE_NAME).tar.gz $(PACKAGE_NAME)
 	rm -rf $(PACKAGE_NAME)
 
diff --git a/hl.vcxproj b/hl.vcxproj
index 88e95b28b..fef4a909e 100644
--- a/hl.vcxproj
+++ b/hl.vcxproj
@@ -45,55 +45,55 @@
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>true</UseDebugLibraries>
     <CharacterSet>Unicode</CharacterSet>
-    <PlatformToolset>v142</PlatformToolset>
+    <PlatformToolset>v143</PlatformToolset>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>true</UseDebugLibraries>
     <CharacterSet>Unicode</CharacterSet>
-    <PlatformToolset>v142</PlatformToolset>
+    <PlatformToolset>v143</PlatformToolset>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
     <WholeProgramOptimization>true</WholeProgramOptimization>
     <CharacterSet>Unicode</CharacterSet>
-    <PlatformToolset>v142</PlatformToolset>
+    <PlatformToolset>v143</PlatformToolset>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='ReleaseDX12Agility|Win32'" Label="Configuration">
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
     <WholeProgramOptimization>true</WholeProgramOptimization>
     <CharacterSet>Unicode</CharacterSet>
-    <PlatformToolset>v142</PlatformToolset>
+    <PlatformToolset>v143</PlatformToolset>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='ReleaseVS2013|Win32'" Label="Configuration">
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
     <WholeProgramOptimization>true</WholeProgramOptimization>
     <CharacterSet>Unicode</CharacterSet>
-    <PlatformToolset>v120</PlatformToolset>
+    <PlatformToolset>v143</PlatformToolset>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
     <WholeProgramOptimization>true</WholeProgramOptimization>
     <CharacterSet>Unicode</CharacterSet>
-    <PlatformToolset>v142</PlatformToolset>
+    <PlatformToolset>v143</PlatformToolset>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='ReleaseDX12Agility|x64'" Label="Configuration">
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
     <WholeProgramOptimization>true</WholeProgramOptimization>
     <CharacterSet>Unicode</CharacterSet>
-    <PlatformToolset>v142</PlatformToolset>
+    <PlatformToolset>v143</PlatformToolset>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='ReleaseVS2013|x64'" Label="Configuration">
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
     <WholeProgramOptimization>true</WholeProgramOptimization>
     <CharacterSet>Unicode</CharacterSet>
-    <PlatformToolset>v120</PlatformToolset>
+    <PlatformToolset>v143</PlatformToolset>
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
   <ImportGroup Label="ExtensionSettings">
@@ -186,7 +186,7 @@
       <WarningLevel>EnableAllWarnings</WarningLevel>
       <Optimization>Disabled</Optimization>
       <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;HL_VTUNE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalOptions>/wd4456 /wd4100 /wd4204 /wd4702 /wd4457 %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions>/wd4456 /wd4100 /wd4204 /wd4702 /wd4457 /we4013 %(AdditionalOptions)</AdditionalOptions>
       <ConformanceMode>true</ConformanceMode>
       <LanguageStandard_C>stdc11</LanguageStandard_C>
     </ClCompile>
@@ -196,6 +196,7 @@
       <AdditionalDependencies>libhl.lib;user32.lib;include/vtune/jitprofiling.lib</AdditionalDependencies>
       <RandomizedBaseAddress>false</RandomizedBaseAddress>
       <DataExecutionPrevention>false</DataExecutionPrevention>
+      <StackReserveSize>4194304</StackReserveSize>
     </Link>
     <Manifest>
       <EnableDpiAwareness>PerMonitorHighDPIAware</EnableDpiAwareness>
@@ -361,14 +362,20 @@
     <ClCompile Include="src\code.c" />
     <ClCompile Include="src\debugger.c" />
     <ClCompile Include="src\jit.c" />
+    <ClCompile Include="src\jit_dump.c" />
+    <ClCompile Include="src\jit_emit.c" />
+    <ClCompile Include="src\jit_regs.c" />
+    <ClCompile Include="src\jit_x86_64.c" />
     <ClCompile Include="src\main.c" />
     <ClCompile Include="src\module.c" />
     <ClCompile Include="src\profile.c" />
   </ItemGroup>
   <ItemGroup>
+    <ClInclude Include="src\data_struct.h" />
     <ClInclude Include="src\hl.h" />
     <ClInclude Include="src\hlmodule.h" />
     <ClInclude Include="src\hlsystem.h" />
+    <ClInclude Include="src\jit.h" />
     <ClInclude Include="src\opcodes.h" />
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
diff --git a/hl.vcxproj.filters b/hl.vcxproj.filters
index f86723996..8a8395f72 100644
--- a/hl.vcxproj.filters
+++ b/hl.vcxproj.filters
@@ -4,14 +4,20 @@
     <ClCompile Include="src\main.c" />
     <ClCompile Include="src\code.c" />
     <ClCompile Include="src\module.c" />
-    <ClCompile Include="src\jit.c" />
     <ClCompile Include="src\debugger.c" />
     <ClCompile Include="src\profile.c" />
+    <ClCompile Include="src\jit_dump.c" />
+    <ClCompile Include="src\jit_emit.c" />
+    <ClCompile Include="src\jit.c" />
+    <ClCompile Include="src\jit_regs.c" />
+    <ClCompile Include="src\jit_x86_64.c" />
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="src\hlmodule.h" />
     <ClInclude Include="src\opcodes.h" />
     <ClInclude Include="src\hl.h" />
     <ClInclude Include="src\hlsystem.h" />
+    <ClInclude Include="src\jit.h" />
+    <ClInclude Include="src\data_struct.h" />
   </ItemGroup>
 </Project>
\ No newline at end of file
diff --git a/libhl.vcxproj b/libhl.vcxproj
index 40f1a2eff..1f86fe1a7 100644
--- a/libhl.vcxproj
+++ b/libhl.vcxproj
@@ -36,40 +36,40 @@
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
     <ConfigurationType>DynamicLibrary</ConfigurationType>
     <UseDebugLibraries>true</UseDebugLibraries>
-    <PlatformToolset>v142</PlatformToolset>
+    <PlatformToolset>v143</PlatformToolset>
     <CharacterSet>Unicode</CharacterSet>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
     <ConfigurationType>DynamicLibrary</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
-    <PlatformToolset>v142</PlatformToolset>
+    <PlatformToolset>v143</PlatformToolset>
     <WholeProgramOptimization>true</WholeProgramOptimization>
     <CharacterSet>Unicode</CharacterSet>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='ReleaseVS2013|Win32'" Label="Configuration">
     <ConfigurationType>DynamicLibrary</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
-    <PlatformToolset>v120</PlatformToolset>
+    <PlatformToolset>v143</PlatformToolset>
     <WholeProgramOptimization>true</WholeProgramOptimization>
     <CharacterSet>Unicode</CharacterSet>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
     <ConfigurationType>DynamicLibrary</ConfigurationType>
     <UseDebugLibraries>true</UseDebugLibraries>
-    <PlatformToolset>v142</PlatformToolset>
+    <PlatformToolset>v143</PlatformToolset>
     <CharacterSet>Unicode</CharacterSet>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
     <ConfigurationType>DynamicLibrary</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
-    <PlatformToolset>v142</PlatformToolset>
+    <PlatformToolset>v143</PlatformToolset>
     <WholeProgramOptimization>true</WholeProgramOptimization>
     <CharacterSet>Unicode</CharacterSet>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='ReleaseVS2013|x64'" Label="Configuration">
     <ConfigurationType>DynamicLibrary</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
-    <PlatformToolset>v120</PlatformToolset>
+    <PlatformToolset>v143</PlatformToolset>
     <WholeProgramOptimization>true</WholeProgramOptimization>
     <CharacterSet>Unicode</CharacterSet>
   </PropertyGroup>
diff --git a/src/allocator.c b/src/allocator.c
index 47dfc8f41..f9bd63420 100644
--- a/src/allocator.c
+++ b/src/allocator.c
@@ -313,6 +313,8 @@ static void *gc_alloc_fixed( int part, int kind ) {
 		for(i=0;i<p->block_size;i++)
 			if( ptr[i] != 0xDD )
 				hl_fatal("assert");
+			else
+				ptr[i] = 0xCD;
 	}
 #	endif
 	gc_free_pages[pid] = ph;
@@ -367,6 +369,8 @@ static void *gc_alloc_var( int part, int size, int kind ) {
 		for(i=0;i<size;i++)
 			if( ptr[i] != 0xDD )
 				hl_fatal("assert");
+			else
+				ptr[i] = 0xCD;
 	}
 #	endif
 	if( ph->bmp ) {
diff --git a/src/data_struct.c b/src/data_struct.c
new file mode 100644
index 000000000..ed417770e
--- /dev/null
+++ b/src/data_struct.c
@@ -0,0 +1,308 @@
+/*
+ * Copyright (C)2015-2026 Haxe Foundation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+#ifdef S_TYPE
+
+// is included by data_struct.h
+
+#ifdef S_MAP // map flavour: the add-style functions take a key and a value
+#	define S_ARGS	S_KEY k, S_VALUE v
+#else // set/array flavour: the stored value doubles as the key
+#	define S_ARGS	S_VALUE k
+#	define S_KEY	S_VALUE
+#	define keys		values // the single storage array is reachable under both field names
+#endif
+
+#ifndef S_DEFVAL // fallback returned by lookups that have nothing to return
+#	define S_DEFVAL	(S_VALUE)0
+#endif
+
+#ifndef S_CMP // strict "a orders after b" test used by the sorted containers; the includer may override it
+#	define S_CMP(a,b) ((a) > (b))
+#endif
+
+typedef struct {
+	int cur; // number of elements currently stored
+	int max; // allocated capacity, in elements
+	S_KEY *keys; // without S_MAP this name is macro-replaced by "values"
+#	ifdef S_MAP
+	S_VALUE *values; // values[i] is paired with keys[i]
+#	endif
+} S_TYPE;
+
+typedef S_VALUE	S_NAME(_value); // <prefix>__value, the element type named by for_iter/for_iter_back
+#ifdef S_MAP
+typedef S_KEY	S_NAME(_key); // <prefix>__key, the key type named by for_iter_key
+#endif
+
+INLINE static void S_NAME(check_size)( hl_alloc *alloc, S_TYPE *st ) { // grows the storage when it is full so one more element fits
+	if( st->cur == st->max ) {
+		int n = st->max ? (st->max << 1) : STRUCT_DEF_SIZE; // double the capacity, or start at the default size
+		S_KEY *keys = (S_KEY*)hl_malloc(alloc,sizeof(S_KEY) * n);
+		if( st->cur ) memcpy(keys,st->keys,sizeof(S_KEY) * st->cur); // st->keys may still be NULL on first growth: never hand NULL to memcpy
+		st->keys = keys; // the previous array is not released here
+#		ifdef S_MAP
+		S_VALUE *vals = (S_VALUE*)hl_malloc(alloc,sizeof(S_VALUE) * n);
+		if( st->cur ) memcpy(vals,st->values,sizeof(S_VALUE) * st->cur); // same NULL-source guard as for the keys
+		st->values = vals;
+#		endif
+		st->max = n;
+	}
+}
+
+#ifndef S_SORTED
+
+INLINE static void S_NAME(add_impl)( hl_alloc *alloc, S_TYPE *st, S_ARGS ) { // unsorted flavour: appends at the end, no duplicate check
+	S_NAME(check_size)(alloc,st); // make sure slot [cur] exists
+	st->keys[st->cur] = k;
+#	ifdef S_MAP
+	st->values[st->cur] = v;
+#	endif
+	st->cur++;
+}
+
+INLINE static bool S_NAME(exists)( S_TYPE st, S_KEY k ) { // unsorted flavour: linear membership test
+	int left = st.cur;
+	while( left-- > 0 ) // walk every stored slot, last to first
+		if( st.keys[left] == k ) return true;
+	return false;
+}
+
+INLINE static bool S_NAME(remove)( S_TYPE *st, S_KEY k ) { // unsorted flavour: removes the first element equal to k; false if none
+	for(int i=0;i<st->cur;i++)
+		if( st->keys[i] == k ) {
+			int pos = i;
+			memmove(st->keys + pos, st->keys + pos + 1, (st->cur - pos - 1) * sizeof(S_KEY)); // shift the tail down, keeping order
+#			ifdef S_MAP
+			memmove(st->values + pos, st->values + pos + 1, (st->cur - pos - 1) * sizeof(S_VALUE));
+#			endif
+			st->cur--;
+			return true;
+		}
+	return false;
+}
+
+INLINE static void S_NAME(remove_range)( S_TYPE *st, int pos, int count ) { // drops count elements starting at pos; pos/count are not bounds-checked here
+	memmove(st->keys + pos, st->keys + pos + count, (st->cur - pos - count) * sizeof(S_KEY)); // shift the tail down over the removed range
+#	ifdef S_MAP
+	memmove(st->values + pos, st->values + pos + count, (st->cur - pos - count) * sizeof(S_VALUE));
+#	endif
+	st->cur -= count;
+}
+
+#ifdef S_MAP
+static S_VALUE S_NAME(find)( S_TYPE st, S_KEY k ) { // unsorted map: linear lookup, value of the first entry whose key equals k
+	for(int i=0;i<st.cur;i++)
+		if( st.keys[i] == k )
+			return st.values[i];
+	return S_DEFVAL; // not found: honour an overridden S_DEFVAL instead of a hard-coded zero
+}
+#else
+static S_VALUE *S_NAME(reserve_impl)( hl_alloc *alloc, S_TYPE *st, int count ) { // appends count slots (not initialized by this function) and returns the first one
+	if( st->cur + count > st->max ) {
+		int n = st->max ? (st->max << 1) : STRUCT_DEF_SIZE;
+		while( n < st->cur + count ) n <<= 1; // keep doubling until the request fits
+		S_KEY *keys = (S_KEY*)hl_malloc(alloc,sizeof(S_KEY) * n);
+		if( st->cur ) memcpy(keys,st->keys,sizeof(S_KEY) * st->cur); // st->keys may still be NULL when nothing is stored: never hand NULL to memcpy
+		st->keys = keys;
+		st->max = n;
+	}
+	S_VALUE *ptr = st->keys + st->cur; // first of the newly reserved slots
+	st->cur += count;
+	return ptr;
+}
+#endif
+
+
+#else
+
+INLINE static bool S_NAME(add_impl)( hl_alloc *alloc, S_TYPE *st, S_ARGS ) { // sorted flavour: inserts k in order; returns false (no change) if k is already present
+	int min = 0;
+	int max = st->cur;
+	int pos;
+	while( min < max ) { // binary search over [min,max)
+		int mid = (min + max) >> 1;
+		S_KEY k2 = st->keys[mid];
+		if( S_CMP(k,k2) ) min = mid + 1; else if( S_CMP(k2,k) ) max = mid; else return false;
+	}
+	S_NAME(check_size)(alloc,st);
+	pos = (min + max) >> 1; // the loop ended with min == max, so this is the insertion index
+	memmove(st->keys + pos + 1, st->keys + pos, (st->cur - pos) * sizeof(S_KEY)); // open a gap at pos
+#	ifdef S_MAP
+	memmove(st->values + pos + 1, st->values + pos, (st->cur - pos) * sizeof(S_VALUE));
+#	endif
+	st->keys[pos] = k;
+#	ifdef S_MAP
+	st->values[pos] = v;
+#	endif
+	st->cur++;
+	return true;
+}
+
+#ifdef S_MAP
+INLINE static void S_NAME(replace_impl)( hl_alloc *alloc, S_TYPE *st, S_ARGS ) { // sorted map: overwrites the value stored for k, or inserts (k,v) when k is absent
+	int min = 0;
+	int max = st->cur;
+	int pos;
+	while( min < max ) { // binary search over [min,max)
+		int mid = (min + max) >> 1;
+		S_KEY k2 = st->keys[mid];
+		if( k2 < k ) min = mid + 1; else if( k2 > k ) max = mid; else { // NOTE(review): raw </> here, S_CMP in add_impl - confirm they are meant to agree
+			st->values[mid] = v;
+			return;
+		}
+	}
+	S_NAME(check_size)(alloc,st);
+	pos = (min + max) >> 1; // min == max here: insertion index
+	memmove(st->keys + pos + 1, st->keys + pos, (st->cur - pos) * sizeof(S_KEY));
+	memmove(st->values + pos + 1, st->values + pos, (st->cur - pos) * sizeof(S_VALUE));
+	st->keys[pos] = k;
+	st->values[pos] = v;
+	st->cur++;
+}
+
+INLINE static bool S_NAME(add_pair_impl)( hl_alloc *alloc, S_TYPE *st, S_ARGS ) { // sorted map: inserts the (k,v) pair unless that exact pair is found; one key may carry several values
+	int min = 0;
+	int max = st->cur;
+	int pos;
+	while( min < max ) { // binary search ordered by key first, then by value
+		int mid = (min + max) >> 1;
+		S_KEY k2 = st->keys[mid];
+		if( k2 < k ) min = mid + 1; else if( k2 > k ) max = mid; else {
+			S_VALUE v2 = st->values[mid]; // same key: order by value using S_CMP
+			if( S_CMP(v,v2) ) min = mid+1; else if( S_CMP(v2,v) ) max = mid; else return false;
+		}
+	}
+	S_NAME(check_size)(alloc,st);
+	pos = (min + max) >> 1; // min == max here: insertion index
+	memmove(st->keys + pos + 1, st->keys + pos, (st->cur - pos) * sizeof(S_KEY));
+	memmove(st->values + pos + 1, st->values + pos, (st->cur - pos) * sizeof(S_VALUE));
+	st->keys[pos] = k;
+	st->values[pos] = v;
+	st->cur++;
+	return true;
+}
+#endif
+
+INLINE static bool S_NAME(exists)( S_TYPE st, S_KEY k ) { // sorted flavour: binary-search membership test
+	int min = 0;
+	int max = st.cur;
+	while( min < max ) {
+		int mid = (min + max) >> 1;
+		S_KEY k2 = st.keys[mid];
+		if( S_CMP(k,k2) ) min = mid + 1; else if( S_CMP(k2,k) ) max = mid; else return true; // neither orders after the other: treated as equal
+	}
+	return false;
+}
+
+#ifdef S_MAP
+INLINE static S_VALUE S_NAME(find)( S_TYPE st, S_KEY k ) { // sorted map: binary-search lookup; returns S_DEFVAL when k is absent
+	int min = 0;
+	int max = st.cur;
+	while( min < max ) {
+		int mid = (min + max) >> 1;
+		S_KEY k2 = st.keys[mid];
+		if( k2 < k ) min = mid + 1; else if( k2 > k ) max = mid; else return st.values[mid]; // NOTE(review): raw </> while exists/remove use S_CMP - confirm
+	}
+	return S_DEFVAL;
+}
+#endif
+
+INLINE static bool S_NAME(remove)( S_TYPE *st, S_KEY k ) { // sorted flavour: binary-searches k and removes it; false if not found
+	int min = 0;
+	int max = st->cur;
+	while( min < max ) {
+		int mid = (min + max) >> 1;
+		S_KEY k2 = st->keys[mid];
+		if( S_CMP(k,k2) ) min = mid + 1; else if( S_CMP(k2,k) ) max = mid; else {
+			int pos = mid;
+			memmove(st->keys + pos, st->keys + pos + 1, (st->cur - pos - 1) * sizeof(S_KEY)); // shift the tail down, keeping the order
+#			ifdef S_MAP
+			memmove(st->values + pos, st->values + pos + 1, (st->cur - pos - 1) * sizeof(S_VALUE));
+#			endif
+			st->cur--;
+			return true;
+		}
+	}
+	return false;
+}
+
+#endif
+
+INLINE static void S_NAME(reset)( S_TYPE *st ) { // empties the container but keeps its storage and capacity
+	st->cur = 0;
+}
+
+INLINE static S_VALUE *S_NAME(free)( S_TYPE *st ) { // detaches the storage and returns the former values array; nothing is deallocated here
+	st->cur = 0;
+	st->max = 0;
+	S_VALUE *vals = st->values;
+#	ifdef S_MAP
+	st->keys = NULL; // the keys array is simply dropped
+#	endif
+	st->values = NULL;
+	return vals;
+}
+
+INLINE static int S_NAME(count)( S_TYPE st ) { // number of stored elements
+	return st.cur;
+}
+
+INLINE static S_VALUE S_NAME(get)( S_TYPE st, int idx ) { // value at position idx; idx is not bounds-checked
+	return st.values[idx];
+}
+
+INLINE static S_VALUE *S_NAME(addr)( S_TYPE st, int idx ) { // address of slot idx inside the values array (st is a copy, the array is not); no bounds check
+	return &st.values[idx];
+}
+
+INLINE static S_VALUE S_NAME(first)( S_TYPE st ) { // value at index 0, or S_DEFVAL when the container is empty
+	return st.cur == 0 ? S_DEFVAL : st.values[0];
+}
+
+INLINE static bool S_NAME(iter_next)( S_TYPE st, S_VALUE *val, int idx ) { // forward iteration step behind for_iter
+	if( idx < st.cur ) *val = st.values[idx]; // *val is left untouched once idx is past the end
+	return idx < st.cur;
+}
+
+#ifdef S_MAP
+INLINE static bool S_NAME(iter_next_key)( S_TYPE st, S_KEY *key, int idx ) { // forward iteration step over keys, behind for_iter_key
+	if( idx < st.cur ) *key = st.keys[idx]; // *key is left untouched once idx is past the end
+	return idx < st.cur;
+}
+#endif
+
+INLINE static bool S_NAME(iter_prev)( S_TYPE st, S_VALUE *val, int idx ) { // backward iteration step behind for_iter_back
+	if( idx >= 0 ) *val = st.values[idx]; // only the lower bound is checked; the caller starts at cur-1
+	return idx >= 0;
+}
+
+#undef S_NAME // per-instantiation parameters are cleared so the next include can redefine them
+#undef S_TYPE
+#undef S_VALUE
+#undef S_KEY
+#undef S_ARGS
+#undef STRUCT_NAME // NOTE(review): no definition of STRUCT_NAME is visible in this file or data_struct.h
+#undef S_CMP
+#undef S_DEFVAL
+#undef keys // S_MAP and S_SORTED are deliberately not cleared here; data_struct.h manages them
+
+#endif
diff --git a/src/data_struct.h b/src/data_struct.h
new file mode 100644
index 000000000..5c5b9fe4e
--- /dev/null
+++ b/src/data_struct.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (C)2015-2026 Haxe Foundation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+#ifndef HL_DATA_STRUCT_H
+#define HL_DATA_STRUCT_H
+
+#include <hl.h>
+
+#if defined(__GNUC__) || defined(__clang__) // force inlining where the compiler provides a way to do so
+#define INLINE inline __attribute__((always_inline))
+#elif defined(_MSC_VER)
+#define INLINE __forceinline
+#else // any other compiler: plain inline hint
+#define INLINE inline
+#endif
+
+#define STRUCT_DEF_SIZE 2 // capacity used the first time a container grows
+#define for_iter(name,var,set) name##__value var; for(int __idx=0;name##_iter_next(set,&var,__idx);__idx++) // declares var in the enclosing scope, then loops over the values
+#define for_iter_key(name,var,set) name##__key var; for(int __idx=0;name##_iter_next_key(set,&var,__idx);__idx++) // same, over the keys of a map
+#define for_iter_back(name,var,set) name##__value var; for(int __idx=(set).cur-1;name##_iter_prev(set,&var,__idx);__idx--) // same as for_iter, from last to first
+
+#define S_TYPE			ptr_set
+#define S_NAME(name)	ptr_set_##name
+#define S_VALUE			void*
+#include "data_struct.c" // unsorted set of pointers
+#define ptr_set_add(set,v)		ptr_set_add_impl(DEF_ALLOC,&(set),v) // DEF_ALLOC is not defined in this header: the includer must provide it
+
+#define S_TYPE			int_arr
+#define S_NAME(name)	int_arr_##name
+#define S_VALUE			int
+#include "data_struct.c" // unsorted, growable array of ints
+#define int_arr_add(set,v)		int_arr_add_impl(DEF_ALLOC,&(set),v)
+#define int_arr_reserve(set,v)	int_arr_reserve_impl(DEF_ALLOC,&(set),v)
+
+#define S_SORTED // every instantiation from here on is kept sorted and searched by bisection
+
+#define S_TYPE			int_set
+#define S_NAME(name)	int_set_##name
+#define S_VALUE			int
+#include "data_struct.c" // sorted set of ints
+#define int_set_add(set,v)		int_set_add_impl(DEF_ALLOC,&(set),v)
+
+#define S_MAP // every instantiation from here on stores key/value pairs
+
+#define S_TYPE			int_map
+#define S_NAME(name)	int_map_##name
+#define S_KEY			int
+#define S_VALUE			int
+#include "data_struct.c" // sorted int -> int map
+#define int_map_add(map,k,v)		int_map_add_impl(DEF_ALLOC,&(map),k,v)
+#define int_map_replace(map,k,v)	int_map_replace_impl(DEF_ALLOC,&(map),k,v)
+
+#define S_TYPE			ptr_map
+#define S_NAME(name)	ptr_map_##name
+#define S_KEY			int
+#define S_VALUE			void*
+#include "data_struct.c" // sorted int -> pointer map
+#define ptr_map_add(map,k,v)		ptr_map_add_impl(DEF_ALLOC,&(map),k,v)
+#define ptr_map_replace(map,k,v)	ptr_map_replace_impl(DEF_ALLOC,&(map),k,v)
+
+#undef S_MAP // the mode switches are cleared here because data_struct.c leaves them defined
+#undef S_SORTED
+
+#endif
diff --git a/src/hl.h b/src/hl.h
index 6220eb369..e21be7f92 100644
--- a/src/hl.h
+++ b/src/hl.h
@@ -27,7 +27,7 @@
 	https://github.com/HaxeFoundation/hashlink/wiki/
 **/
 
-#define HL_VERSION	0x011000
+#define HL_VERSION	0x020000
 
 #if defined(_WIN32)
 #	define HL_WIN
diff --git a/src/hlmodule.h b/src/hlmodule.h
index b2619f932..adf29f9bd 100644
--- a/src/hlmodule.h
+++ b/src/hlmodule.h
@@ -19,6 +19,9 @@
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  * DEALINGS IN THE SOFTWARE.
  */
+#ifndef HL_MODULE_H
+#define HL_MODULE_H
+
 #include <hl.h>
 #include <hlsystem.h>
 #include "opcodes.h"
@@ -104,9 +107,6 @@ typedef struct {
 	bool large;
 } hl_debug_infos;
 
-typedef struct _jit_ctx jit_ctx;
-
-
 typedef struct {
 	hl_code *code;
 	int *types_hashes;
@@ -124,6 +124,8 @@ typedef struct {
 #endif
 #endif
 
+typedef struct _jit_ctx jit_ctx;
+
 typedef struct {
 	hl_code *code;
 	int codesize;
@@ -138,6 +140,7 @@ typedef struct {
 	jit_ctx *jit_ctx;
 	hl_module_context ctx;
 #ifdef WIN64_UNWIND_TABLES
+	int unwind_table_size;
 	PRUNTIME_FUNCTION unwind_table;
 #endif
 } hl_module;
@@ -165,10 +168,4 @@ hl_type *hl_module_resolve_type( hl_module *m, hl_type *t, bool err );
 void hl_profile_setup( int sample_count );
 void hl_profile_end();
 
-jit_ctx *hl_jit_alloc();
-void hl_jit_free( jit_ctx *ctx, h_bool can_reset );
-void hl_jit_reset( jit_ctx *ctx, hl_module *m );
-void hl_jit_init( jit_ctx *ctx, hl_module *m );
-int hl_jit_function( jit_ctx *ctx, hl_module *m, hl_function *f );
-void *hl_jit_code( jit_ctx *ctx, hl_module *m, int *codesize, hl_debug_infos **debug, hl_module *previous );
-void hl_jit_patch_method( void *old_fun, void **new_fun_table );
+#endif
diff --git a/src/jit.c b/src/jit.c
index b1f82b0fa..ddf9a187d 100644
--- a/src/jit.c
+++ b/src/jit.c
@@ -19,4753 +19,330 @@
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  * DEALINGS IN THE SOFTWARE.
  */
-#ifdef _MSC_VER
-#pragma warning(disable:4820)
-#endif
-#include <math.h>
-#include <hlmodule.h>
-#include "hlsystem.h"
-
-#ifdef __arm__
-#	error "JIT does not support ARM processors, only x86 and x86-64 are supported, please use HashLink/C native compilation instead"
-#endif
-
-#ifdef HL_DEBUG
-#	define JIT_DEBUG
-#endif
-
-typedef enum {
-	Eax = 0,
-	Ecx = 1,
-	Edx = 2,
-	Ebx = 3,
-	Esp = 4,
-	Ebp = 5,
-	Esi = 6,
-	Edi = 7,
-#ifdef HL_64
-	R8 = 8,
-	R9 = 9,
-	R10	= 10,
-	R11	= 11,
-	R12	= 12,
-	R13	= 13,
-	R14	= 14,
-	R15	= 15,
-#endif
-	_LAST = 0xFF
-} CpuReg;
-
-typedef enum {
-	MOV,
-	LEA,
-	PUSH,
-	ADD,
-	SUB,
-	IMUL,	// only overflow flag changes compared to MUL
-	DIV,
-	IDIV,
-	CDQ,
-	CDQE,
-	POP,
-	RET,
-	CALL,
-	AND,
-	OR,
-	XOR,
-	CMP,
-	TEST,
-	NOP,
-	SHL,
-	SHR,
-	SAR,
-	INC,
-	DEC,
-	JMP,
-	// FPU
-	FSTP,
-	FSTP32,
-	FLD,
-	FLD32,
-	FLDCW,
-	// SSE
-	MOVSD,
-	MOVSS,
-	COMISD,
-	COMISS,
-	ADDSD,
-	SUBSD,
-	MULSD,
-	DIVSD,
-	ADDSS,
-	SUBSS,
-	MULSS,
-	DIVSS,
-	XORPD,
-	CVTSI2SD,
-	CVTSI2SS,
-	CVTSD2SI,
-	CVTSD2SS,
-	CVTSS2SD,
-	CVTSS2SI,
-	STMXCSR,
-	LDMXCSR,
-	// 8-16 bits
-	MOV8,
-	CMP8,
-	TEST8,
-	PUSH8,
-	MOV16,
-	CMP16,
-	TEST16,
-	// prefetchs
-	PREFETCHT0,
-	PREFETCHT1,
-	PREFETCHT2,
-	PREFETCHNTA,
-	PREFETCHW,
-	// --
-	_CPU_LAST
-} CpuOp;
-
-#define JAlways		0
-#define JOverflow	0x80
-#define JULt		0x82
-#define JUGte		0x83
-#define JEq			0x84
-#define JNeq		0x85
-#define JULte		0x86
-#define JUGt		0x87
-#define JParity		0x8A
-#define JNParity	0x8B
-#define JSLt		0x8C
-#define JSGte		0x8D
-#define JSLte		0x8E
-#define JSGt		0x8F
-
-#define JCarry		JLt
-#define JZero		JEq
-#define JNotZero	JNeq
-
-#define B(bv)	*ctx->buf.b++ = (unsigned char)(bv)
-#define W(wv)	*ctx->buf.w++ = wv
-
-#ifdef HL_64
-#	define W64(wv)	*ctx->buf.w64++ = wv
-#else
-#	define W64(wv)	W(wv)
-#endif
-
-static const int SIB_MULT[] = {-1, 0, 1, -1, 2, -1, -1, -1, 3};
-
-#define MOD_RM(mod,reg,rm)		B(((mod) << 6) | (((reg)&7) << 3) | ((rm)&7))
-#define SIB(mult,rmult,rbase)	B((SIB_MULT[mult]<<6) | (((rmult)&7)<<3) | ((rbase)&7))
-#define IS_SBYTE(c)				( (c) >= -128 && (c) < 128 )
-
-#define AddJump(how,local)		{ if( (how) == JAlways ) { B(0xE9); } else { B(0x0F); B(how); }; local = BUF_POS(); W(0); }
-#define AddJump_small(how,local) { if( (how) == JAlways ) { B(0xEB); } else B(how - 0x10); local = BUF_POS() | 0x40000000; B(0); }
-#define XJump(how,local)		AddJump(how,local)
-#define XJump_small(how,local)		AddJump_small(how,local)
+#include <jit.h>
 
-#define MAX_OP_SIZE				256
+static jit_ctx *current_ctx = NULL;
 
-#define BUF_POS()				((int)(ctx->buf.b - ctx->startBuf))
-#define RTYPE(r)				r->t->kind
-
-#ifdef HL_64
-#	define RESERVE_ADDRESS	0x8000000000000000
-#else
-#	define RESERVE_ADDRESS	0x80000000
-#endif
-
-#if defined(HL_WIN_CALL) && defined(HL_64)
-#	define IS_WINCALL64 1
-#else
-#	define IS_WINCALL64 0
-#endif
-
-typedef struct jlist jlist;
-struct jlist {
-	int pos;
-	int target;
-	jlist *next;
-};
-
-typedef struct vreg vreg;
-
-typedef enum {
-	RCPU = 0,
-	RFPU = 1,
-	RSTACK = 2,
-	RCONST = 3,
-	RADDR = 4,
-	RMEM = 5,
-	RUNUSED = 6,
-	RCPU_CALL = 1 | 8,
-	RCPU_8BITS = 1 | 16
-} preg_kind;
-
-typedef struct {
-	preg_kind kind;
-	int id;
-	int lock;
-	vreg *holds;
-} preg;
-
-struct vreg {
-	int stackPos;
-	int size;
-	hl_type *t;
-	preg *current;
-	preg stack;
-};
-
-#define REG_AT(i)		(ctx->pregs + (i))
+void hl_jit_error( const char *msg, const char *func, int line ) { // prints a JIT error banner to stdout; this function itself does not abort
+	printf("*** JIT ERROR %s:%d (%s)****\n", func, line, msg);
+	if( current_ctx  ) {
+		jit_ctx *ctx = current_ctx;
+		current_ctx = NULL; // cleared before dumping - presumably so an error raised during the dump does not dump again
+		hl_emit_dump(ctx);
+	}
+	fflush(stdout);
+}
 
-#ifdef HL_64
-#	define RCPU_COUNT	16
-#	define RFPU_COUNT	16
-#	ifdef HL_WIN_CALL
-#		define CALL_NREGS			4
-#		define RCPU_SCRATCH_COUNT	7
-#		define RFPU_SCRATCH_COUNT	6
-static const int RCPU_SCRATCH_REGS[] = { Eax, Ecx, Edx, R8, R9, R10, R11 };
-static const CpuReg CALL_REGS[] = { Ecx, Edx, R8, R9 };
-#	else
-#		define CALL_NREGS			6 // TODO : XMM6+XMM7 are FPU reg parameters
-#		define RCPU_SCRATCH_COUNT	9
-#		define RFPU_SCRATCH_COUNT	16
-static const int RCPU_SCRATCH_REGS[] = { Eax, Ecx, Edx, Esi, Edi, R8, R9, R10, R11 };
-static const CpuReg CALL_REGS[] = { Edi, Esi, Edx, Ecx, R8, R9 };
-#	endif
-#else
-#	define CALL_NREGS	0
-#	define RCPU_COUNT	8
-#	define RFPU_COUNT	8
-#	define RCPU_SCRATCH_COUNT	3
-#	define RFPU_SCRATCH_COUNT	8
-static const int RCPU_SCRATCH_REGS[] = { Eax, Ecx, Edx };
-#endif
+void hl_jit_null_field_access( int fhash ) { // throws a bytes-typed dynamic holding "Null access ." followed by the name looked up from fhash
+	uchar *name = (uchar*)hl_field_name(fhash);
+	hl_buffer *msg = hl_alloc_buffer();
+	hl_buffer_str(msg, USTR("Null access ."));
+	hl_buffer_str(msg, name);
+	vdynamic *exc = hl_alloc_dynamic(&hlt_bytes);
+	exc->v.ptr = hl_buffer_content(msg,NULL);
+	hl_throw(exc);
+}
 
-#define XMM(i)			((i) + RCPU_COUNT)
-#define PXMM(i)			REG_AT(XMM(i))
-#define REG_IS_FPU(i)	((i) >= RCPU_COUNT)
+void hl_jit_assert() { // throw a bytes dynamic carrying the string "Assert"
+	vdynamic *d = hl_alloc_dynamic(&hlt_bytes);
+	d->v.ptr = USTR("Assert");
+	hl_throw(d);
+}
 
-#define PEAX			REG_AT(Eax)
-#define PESP			REG_AT(Esp)
-#define PEBP			REG_AT(Ebp)
+void hl_emit_alloc( jit_ctx *jit );
+void hl_emit_free( jit_ctx *jit );
+void hl_emit_function( jit_ctx *jit );
+void hl_emit_final( jit_ctx *jit );
 
-#define REG_COUNT	(RCPU_COUNT + RFPU_COUNT)
+void hl_regs_alloc( jit_ctx *jit );
+void hl_regs_free( jit_ctx *jit );
+void hl_regs_function( jit_ctx *jit );
 
-#define ID2(a,b)	((a) | ((b)<<8))
-#define R(id)		(ctx->vregs + (id))
-#define ASSERT(i)	{ printf("JIT ERROR %d (jit.c line %d)\n",i,(int)__LINE__); jit_exit(); }
-#define IS_FLOAT(r)	((r)->t->kind == HF64 || (r)->t->kind == HF32)
-#define RLOCK(r)		if( (r)->lock < ctx->currentPos ) (r)->lock = ctx->currentPos
-#define RUNLOCK(r)		if( (r)->lock == ctx->currentPos ) (r)->lock = 0
+void hl_codegen_alloc( jit_ctx *jit );
+void hl_codegen_init( jit_ctx *jit );
+void hl_codegen_free( jit_ctx *jit );
+void hl_codegen_flush_consts( jit_ctx *jit );
+void hl_codegen_function( jit_ctx *jit );
+void hl_codegen_final( jit_ctx *jit );
 
-#define BREAK()		B(0xCC)
+void hl_jit_init_regs( regs_config *cfg );
 
-static preg _unused = { RUNUSED, 0, 0, NULL };
-static preg *UNUSED = &_unused;
+jit_ctx *hl_jit_alloc() { // allocate a zero-filled JIT context and run the regs-config, emit, regs and codegen alloc hooks; NULL if out of memory
+	jit_ctx *ctx = (jit_ctx*)calloc(1,sizeof(jit_ctx)); // calloc zero-fills, replacing the former malloc + memset pair
+	if( ctx == NULL ) return NULL; // the previous implementation also reported allocation failure this way
+	hl_jit_init_regs(&ctx->cfg);
+	hl_alloc_init(&ctx->falloc);
+	hl_emit_alloc(ctx);
+	hl_regs_alloc(ctx);
+	hl_codegen_alloc(ctx);
+	return ctx;
+}
 
-struct _jit_ctx {
-	union {
-		unsigned char *b;
-		unsigned int *w;
-		unsigned long long *w64;
-		int *i;
-		double *d;
-	} buf;
-	vreg *vregs;
-	preg pregs[REG_COUNT];
-	vreg *savedRegs[REG_COUNT];
-	int savedLocks[REG_COUNT];
-	int *opsPos;
-	int maxRegs;
-	int maxOps;
-	int bufSize;
-	int totalRegsSize;
-	int functionPos;
-	int allocOffset;
-	int currentPos;
-	int nativeArgsCount;
-	unsigned char *startBuf;
-	hl_module *m;
-	hl_function *f;
-	jlist *jumps;
-	jlist *calls;
-	jlist *switchs;
-	hl_alloc falloc; // cleared per-function
-	hl_alloc galloc;
-	vclosure *closure_list;
-	hl_debug_infos *debug;
-	int c2hl;
-	int hl2c;
-#ifdef JIT_CUSTOM_LONGJUMP
-	int longjump;
-#endif
-	void *static_functions[8];
-	bool static_function_offset;
+void hl_jit_define_function( jit_ctx *ctx, int start, int size ) { // with WIN64_UNWIND_TABLES: record [start, start+size) in the module unwind table; otherwise empty
 #ifdef WIN64_UNWIND_TABLES
-	int unwind_offset;
-	int nunwind;
-	PRUNTIME_FUNCTION unwind_table;
+	int fid = ctx->fdef_index++;
+	if( fid >= ctx->mod->unwind_table_size ) jit_assert();
+	ctx->mod->unwind_table[fid].BeginAddress = start;
+	ctx->mod->unwind_table[fid].EndAddress = start + size;
 #endif
-};
-
-#ifdef WIN64_UNWIND_TABLES
+}
 
-typedef enum _UNWIND_OP_CODES
-{
-	UWOP_PUSH_NONVOL = 0, /* info == register number */
-	UWOP_ALLOC_LARGE,	  /* no info, alloc size in next 2 slots */
-	UWOP_ALLOC_SMALL,	  /* info == size of allocation / 8 - 1 */
-	UWOP_SET_FPREG,		  /* no info, FP = RSP + UNWIND_INFO.FPRegOffset*16 */
-	UWOP_SAVE_NONVOL,	  /* info == register number, offset in next slot */
-	UWOP_SAVE_NONVOL_FAR, /* info == register number, offset in next 2 slots */
-	UWOP_SAVE_XMM128 = 8, /* info == XMM reg number, offset in next slot */
-	UWOP_SAVE_XMM128_FAR, /* info == XMM reg number, offset in next 2 slots */
-	UWOP_PUSH_MACHFRAME	  /* info == 0: no error-code, 1: error-code */
-} UNWIND_CODE_OPS;
+static bool jit_code_reserve( jit_ctx *ctx, int size ) { // make sure `size` more bytes fit after out_pos in ctx->output; false if out of memory
+	int pos = ctx->out_pos;
+	if( pos + size > ctx->out_max ) {
+		int nsize = ctx->out_max ? ctx->out_max * 3 : 4096;
+		while( pos + size > nsize ) nsize *= 3; // grow for the requested size, not for ctx->code_size
+		unsigned char *nout = malloc(nsize);
+		if( !nout ) return false;
+		if( pos ) memcpy(nout,ctx->output,pos); // nothing to copy while the buffer is still empty (output may be NULL)
+		free(ctx->output);
+		ctx->output = nout;
+		ctx->out_max = nsize;
+	}
+	return true;
+}
 
-void write_uwcode(jit_ctx *ctx, unsigned char offset, UNWIND_CODE_OPS code, unsigned char info)
-{
-	B(offset);
-	B((code) | (info) << 4);
+static bool jit_code_append( jit_ctx *ctx ) { // copy code_size bytes from ctx->code_instrs to the end of ctx->output; false if the buffer cannot grow
+	if( !jit_code_reserve(ctx,ctx->code_size) )
+		return false;
+	int pos = ctx->out_pos;
+	memcpy(ctx->output + pos, ctx->code_instrs, ctx->code_size);
+	ctx->out_pos += ctx->code_size; // the next append starts right after this code
+	return true;
 }
 
-void write_unwind_data(jit_ctx *ctx)
-{
-	// All generated functions use a frame pointer, so the same unwind info can be used for all of them
+void hl_jit_init( jit_ctx *ctx, hl_module *m ) {
+	ctx->mod = m;
+#ifdef WIN64_UNWIND_TABLES
 	unsigned char version = 1;
 	unsigned char flags = 0;
 	unsigned char CountOfCodes = 2;
 	unsigned char SizeOfProlog = 4;
 	unsigned char FrameRegister = 5; // RBP
 	unsigned char FrameOffset = 0;
+	jit_code_reserve(ctx,64);
+#	define B(v)	ctx->output[ctx->out_pos++] = v
+#	define UW(offs,code,inf)	B(offs); B((code) | (inf) << 4)
 	B((version) | (flags) << 3);
 	B(SizeOfProlog);
 	B(CountOfCodes);
 	B((FrameRegister) | (FrameOffset) << 4);
-	write_uwcode(ctx, 4, UWOP_SET_FPREG, 0);
-	write_uwcode(ctx, 1, UWOP_PUSH_NONVOL, 5);
-}
-#endif
-
-#define jit_exit() { hl_debug_break(); exit(-1); }
-#define jit_error(msg)	_jit_error(ctx,msg,__LINE__)
-
-#ifndef HL_64
-#	ifdef HL_DEBUG
-#		define error_i64() jit_error("i64-32")
-#	else
-void error_i64() {
-	printf("The module you are loading is using 64 bit ints that are not supported by the HL32.\nPlease run using HL64 or compile with -D hl-legacy32");
-	jit_exit();
-}
-#	endif
-#endif
-
-static void _jit_error( jit_ctx *ctx, const char *msg, int line );
-static void on_jit_error( const char *msg, int_val line );
-
-static preg *pmem( preg *r, CpuReg reg, int offset ) {
-	r->kind = RMEM;
-	r->id = 0 | (reg << 4) | (offset << 8);
-	return r;
-}
-
-static preg *pmem2( preg *r, CpuReg reg, CpuReg reg2, int mult, int offset ) {
-	r->kind = RMEM;
-	r->id = mult | (reg << 4) | (reg2 << 8);
-	r->holds = (void*)(int_val)offset;
-	return r;
-}
-
-#ifdef HL_64
-static preg *pcodeaddr( preg *r, int offset ) {
-	r->kind = RMEM;
-	r->id = 15 | (offset << 4);
-	return r;
-}
-#endif
-
-static preg *pconst( preg *r, int c ) {
-	r->kind = RCONST;
-	r->holds = NULL;
-	r->id = c;
-	return r;
-}
-
-static preg *pconst64( preg *r, int_val c ) {
-#ifdef HL_64
-	if( ((int)c) == c )
-		return pconst(r,(int)c);
-	r->kind = RCONST;
-	r->id = 0xC064C064;
-	r->holds = (vreg*)c;
-	return r;
-#else
-	return pconst(r,(int)c);
-#endif
-}
-
-#ifndef HL_64
-// it is not possible to access direct 64 bit address in x86-64
-static preg *paddr( preg *r, void *p ) {
-	r->kind = RADDR;
-	r->holds = (vreg*)p;
-	return r;
-}
-#endif
-
-static void save_regs( jit_ctx *ctx ) {
-	int i;
-	for(i=0;i<REG_COUNT;i++) {
-		ctx->savedRegs[i] = ctx->pregs[i].holds;
-		ctx->savedLocks[i] = ctx->pregs[i].lock;
-	}
-}
-
-static void restore_regs( jit_ctx *ctx ) {
-	int i;
-	for(i=0;i<ctx->maxRegs;i++)
-		ctx->vregs[i].current = NULL;
-	for(i=0;i<REG_COUNT;i++) {
-		vreg *r = ctx->savedRegs[i];
-		preg *p = ctx->pregs + i;
-		p->holds = r;
-		p->lock = ctx->savedLocks[i];
-		if( r ) r->current = p;
-	}
-}
-
-static void jit_buf( jit_ctx *ctx ) {
-	if( BUF_POS() > ctx->bufSize - MAX_OP_SIZE ) {
-		int nsize = ctx->bufSize * 4 / 3;
-		unsigned char *nbuf;
-		int curpos;
-		if( nsize == 0 ) {
-			int i;
-			for(i=0;i<ctx->m->code->nfunctions;i++)
-				nsize += ctx->m->code->functions[i].nops;
-			nsize *= 4;
-		}
-		if( nsize < ctx->bufSize + MAX_OP_SIZE * 4 ) nsize = ctx->bufSize + MAX_OP_SIZE * 4;
-		curpos = BUF_POS();
-		nbuf = (unsigned char*)malloc(nsize);
-		if( nbuf == NULL ) ASSERT(nsize);
-		if( ctx->startBuf ) {
-			memcpy(nbuf,ctx->startBuf,curpos);
-			free(ctx->startBuf);
-		}
-		ctx->startBuf = nbuf;
-		ctx->buf.b = nbuf + curpos;
-		ctx->bufSize = nsize;
-	}
-}
-
-static const char *KNAMES[] = { "cpu","fpu","stack","const","addr","mem","unused" };
-#define ERRIF(c)	if( c ) { printf("%s(%s,%s)\n",f?f->name:"???",KNAMES[a->kind], KNAMES[b->kind]); ASSERT(0); }
-
-typedef struct {
-	const char *name;						// single operand
-	int r_mem;		// r32 / r/m32				r32
-	int mem_r;		// r/m32 / r32				r/m32
-	int r_const;	// r32 / imm32				imm32
-	int r_i8;		// r32 / imm8				imm8
-	int mem_const;	// r/m32 / imm32			N/A
-} opform;
-
-#define FLAG_LONGOP	0x80000000
-#define FLAG_16B	0x40000000
-#define FLAG_8B		0x20000000
-#define FLAG_DUAL   0x10000000
-
-#define RM(op,id) ((op) | (((id)+1)<<8))
-#define GET_RM(op)	(((op) >> ((op) < 0 ? 24 : 8)) & 15)
-#define SBYTE(op) ((op) << 16)
-#define LONG_OP(op)	((op) | FLAG_LONGOP)
-#define OP16(op)	LONG_OP((op) | FLAG_16B)
-#define LONG_RM(op,id)	LONG_OP(op | (((id) + 1) << 24))
-
-static opform OP_FORMS[_CPU_LAST] = {
-	{ "MOV", 0x8B, 0x89, 0xB8, 0, RM(0xC7,0) },
-	{ "LEA", 0x8D },
-	{ "PUSH", 0x50, RM(0xFF,6), 0x68, 0x6A },
-	{ "ADD", 0x03, 0x01, RM(0x81,0), RM(0x83,0) },
-	{ "SUB", 0x2B, 0x29, RM(0x81,5), RM(0x83,5) },
-	{ "IMUL", LONG_OP(0x0FAF), 0, 0x69 | FLAG_DUAL, 0x6B | FLAG_DUAL },
-	{ "DIV", RM(0xF7,6), RM(0xF7,6) },
-	{ "IDIV", RM(0xF7,7), RM(0xF7,7) },
-	{ "CDQ", 0x99 },
-	{ "CDQE", 0x98 },
-	{ "POP", 0x58, RM(0x8F,0) },
-	{ "RET", 0xC3 },
-	{ "CALL", RM(0xFF,2), RM(0xFF,2), 0xE8 },
-	{ "AND", 0x23, 0x21, RM(0x81,4), RM(0x83,4) },
-	{ "OR", 0x0B, 0x09, RM(0x81,1), RM(0x83,1) },
-	{ "XOR", 0x33, 0x31, RM(0x81,6), RM(0x83,6) },
-	{ "CMP", 0x3B, 0x39, RM(0x81,7), RM(0x83,7) },
-	{ "TEST", 0x85, 0x85/*SWP?*/, RM(0xF7,0) },
-	{ "NOP", 0x90 },
-	{ "SHL", RM(0xD3,4), 0, 0, RM(0xC1,4) },
-	{ "SHR", RM(0xD3,5), 0, 0, RM(0xC1,5) },
-	{ "SAR", RM(0xD3,7), 0, 0, RM(0xC1,7) },
-	{ "INC", IS_64 ? RM(0xFF,0) : 0x40, RM(0xFF,0) },
-	{ "DEC", IS_64 ? RM(0xFF,1) : 0x48, RM(0xFF,1) },
-	{ "JMP", RM(0xFF,4) },
-	// FPU
-	{ "FSTP", 0, RM(0xDD,3) },
-	{ "FSTP32", 0, RM(0xD9,3) },
-	{ "FLD", 0, RM(0xDD,0) },
-	{ "FLD32", 0, RM(0xD9,0) },
-	{ "FLDCW", 0, RM(0xD9, 5) },
-	// SSE
-	{ "MOVSD", 0xF20F10, 0xF20F11  },
-	{ "MOVSS", 0xF30F10, 0xF30F11  },
-	{ "COMISD", 0x660F2F },
-	{ "COMISS", LONG_OP(0x0F2F) },
-	{ "ADDSD", 0xF20F58 },
-	{ "SUBSD", 0xF20F5C },
-	{ "MULSD", 0xF20F59 },
-	{ "DIVSD", 0xF20F5E },
-	{ "ADDSS", 0xF30F58 },
-	{ "SUBSS", 0xF30F5C },
-	{ "MULSS", 0xF30F59 },
-	{ "DIVSS", 0xF30F5E },
-	{ "XORPD", 0x660F57 },
-	{ "CVTSI2SD", 0xF20F2A },
-	{ "CVTSI2SS", 0xF30F2A },
-	{ "CVTSD2SI", 0xF20F2D },
-	{ "CVTSD2SS", 0xF20F5A },
-	{ "CVTSS2SD", 0xF30F5A },
-	{ "CVTSS2SI", 0xF30F2D },
-	{ "STMXCSR", 0, LONG_RM(0x0FAE,3) },
-	{ "LDMXCSR", 0, LONG_RM(0x0FAE,2) },
-	// 8 bits,
-	{ "MOV8", 0x8A, 0x88, 0, 0xB0, RM(0xC6,0) },
-	{ "CMP8", 0x3A, 0x38, 0, RM(0x80,7) },
-	{ "TEST8", 0x84, 0x84, RM(0xF6,0) },
-	{ "PUSH8", 0, 0, 0x6A | FLAG_8B },
-	{ "MOV16", OP16(0x8B), OP16(0x89), OP16(0xB8) },
-	{ "CMP16", OP16(0x3B), OP16(0x39) },
-	{ "TEST16", OP16(0x85) },
-	// prefetchs
-	{ "PREFETCHT0", 0, LONG_RM(0x0F18,1) },
-	{ "PREFETCHT1", 0, LONG_RM(0x0F18,2) },
-	{ "PREFETCHT2", 0, LONG_RM(0x0F18,3) },
-	{ "PREFETCHNTA", 0, LONG_RM(0x0F18,0) },
-	{ "PREFETCHW", 0, LONG_RM(0x0F0D,1) },
-};
-
-#ifdef HL_64
-#	define REX()	if( r64 ) B(r64 | 0x40)
-#else
-#	define REX()
-#endif
-
-#define	OP(b)	\
-	if( (b) & 0xFF0000 ) { \
-		B((b)>>16); \
-		if( r64 ) B(r64 | 0x40); /* also in 32 bits mode */ \
-		B((b)>>8); \
-		B(b); \
-	} else { \
-		if( (b) & FLAG_16B ) { \
-			B(0x66); \
-			REX(); \
-		} else {\
-			REX(); \
-			if( (b) & FLAG_LONGOP ) B((b)>>8); \
-		}\
-		B(b); \
-	}
-
-static bool is_reg8( preg *a ) {
-	return a->kind == RSTACK || a->kind == RMEM || a->kind == RCONST || (a->kind == RCPU && a->id != Esi && a->id != Edi);
-}
-
-static void op( jit_ctx *ctx, CpuOp o, preg *a, preg *b, bool mode64 ) {
-	opform *f = &OP_FORMS[o];
-	int r64 = mode64 && (o != PUSH && o != POP && o != CALL && o != PUSH8 && o < PREFETCHT0) ? 8 : 0;
-	switch( o ) {
-	case CMP8:
-	case TEST8:
-	case MOV8:
-		if( !is_reg8(a) || !is_reg8(b) )
-			ASSERT(0);
-		break;
-	default:
-		break;
-	}
-	switch( ID2(a->kind,b->kind) ) {
-	case ID2(RUNUSED,RUNUSED):
-		ERRIF(f->r_mem == 0);
-		OP(f->r_mem);
-		break;
-	case ID2(RCPU,RCPU):
-	case ID2(RFPU,RFPU):
-		ERRIF( f->r_mem == 0 );
-		if( a->id > 7 ) r64 |= 4;
-		if( b->id > 7 ) r64 |= 1;
-		OP(f->r_mem);
-		MOD_RM(3,a->id,b->id);
-		break;
-	case ID2(RCPU,RFPU):
-	case ID2(RFPU,RCPU):
-		ERRIF( (f->r_mem>>16) == 0 );
-		if( a->id > 7 ) r64 |= 4;
-		if( b->id > 7 ) r64 |= 1;
-		OP(f->r_mem);
-		MOD_RM(3,a->id,b->id);
-		break;
-	case ID2(RCPU,RUNUSED):
-		ERRIF( f->r_mem == 0 );
-		if( a->id > 7 ) r64 |= 1;
-		if( GET_RM(f->r_mem) > 0 ) {
-			OP(f->r_mem);
-			MOD_RM(3, GET_RM(f->r_mem)-1, a->id);
-		} else
-			OP(f->r_mem + (a->id&7));
-		break;
-	case ID2(RSTACK,RUNUSED):
-		ERRIF( f->mem_r == 0 || GET_RM(f->mem_r) == 0 );
-		{
-			int stackPos = R(a->id)->stackPos;
-			OP(f->mem_r);
-			if( IS_SBYTE(stackPos) ) {
-				MOD_RM(1,GET_RM(f->mem_r)-1,Ebp);
-				B(stackPos);
-			} else {
-				MOD_RM(2,GET_RM(f->mem_r)-1,Ebp);
-				W(stackPos);
-			}
-		}
-		break;
-	case ID2(RCPU,RCONST):
-		ERRIF( f->r_const == 0 && f->r_i8 == 0 );
-		if( a->id > 7 ) r64 |= 1;
-		{
-			int_val cval = b->holds ? (int_val)b->holds : b->id;
-			// short byte form
-			if( f->r_i8 && IS_SBYTE(cval) ) {
-				if( (f->r_i8&FLAG_DUAL) && a->id > 7 ) r64 |= 4;
-				OP(f->r_i8);
-				if( (f->r_i8&FLAG_DUAL) ) MOD_RM(3,a->id,a->id); else MOD_RM(3,GET_RM(f->r_i8)-1,a->id);
-				B((int)cval);
-			} else if( GET_RM(f->r_const) > 0 || (f->r_const&FLAG_DUAL) ) {
-				if( (f->r_i8&FLAG_DUAL) && a->id > 7 ) r64 |= 4;
-				OP(f->r_const&0xFF);
-				if( (f->r_i8&FLAG_DUAL) ) MOD_RM(3,a->id,a->id); else MOD_RM(3,GET_RM(f->r_const)-1,a->id);
-				if( mode64 && IS_64 && o == MOV ) W64(cval); else W((int)cval);
-			} else {
-				ERRIF( f->r_const == 0);
-				OP((f->r_const&0xFF) + (a->id&7));
-				if( mode64 && IS_64 && o == MOV ) W64(cval); else W((int)cval);
-			}
-		}
-		break;
-	case ID2(RSTACK,RCPU):
-	case ID2(RSTACK,RFPU):
-		ERRIF( f->mem_r == 0 );
-		if( b->id > 7 ) r64 |= 4;
-		{
-			int stackPos = R(a->id)->stackPos;
-			OP(f->mem_r);
-			if( IS_SBYTE(stackPos) ) {
-				MOD_RM(1,b->id,Ebp);
-				B(stackPos);
-			} else {
-				MOD_RM(2,b->id,Ebp);
-				W(stackPos);
-			}
-		}
-		break;
-	case ID2(RCPU,RSTACK):
-	case ID2(RFPU,RSTACK):
-		ERRIF( f->r_mem == 0 );
-		if( a->id > 7 ) r64 |= 4;
-		{
-			int stackPos = R(b->id)->stackPos;
-			OP(f->r_mem);
-			if( IS_SBYTE(stackPos) ) {
-				MOD_RM(1,a->id,Ebp);
-				B(stackPos);
-			} else {
-				MOD_RM(2,a->id,Ebp);
-				W(stackPos);
-			}
-		}
-		break;
-	case ID2(RCONST,RUNUSED):
-		ERRIF( f->r_const == 0 );
-		{
-			int_val cval = a->holds ? (int_val)a->holds : a->id;
-			OP(f->r_const);
-			if( f->r_const & FLAG_8B ) B((int)cval); else W((int)cval);
-		}
-		break;
-	case ID2(RMEM,RUNUSED):
-		ERRIF( f->mem_r == 0 );
-		{
-			int mult = a->id & 0xF;
-			int regOrOffs = mult == 15 ? a->id >> 4 : a->id >> 8;
-			CpuReg reg = (a->id >> 4) & 0xF;
-			if( mult == 15 ) {
-				ERRIF(1);
-			} else if( mult == 0 ) {
-				if( reg > 7 ) r64 |= 1;
-				OP(f->mem_r);
-				if( regOrOffs == 0 && (reg&7) != Ebp ) {
-					MOD_RM(0,GET_RM(f->mem_r)-1,reg);
-					if( (reg&7) == Esp ) B(0x24);
-				} else if( IS_SBYTE(regOrOffs) ) {
-					MOD_RM(1,GET_RM(f->mem_r)-1,reg);
-					if( (reg&7) == Esp ) B(0x24);
-					B(regOrOffs);
-				} else {
-					MOD_RM(2,GET_RM(f->mem_r)-1,reg);
-					if( (reg&7) == Esp ) B(0x24);
-					W(regOrOffs);
-				}
-			} else {
-				// [eax + ebx * M]
-				ERRIF(1);
-			}
-		}
-		break;
-	case ID2(RCPU, RMEM):
-	case ID2(RFPU, RMEM):
-		ERRIF( f->r_mem == 0 );
-		{
-			int mult = b->id & 0xF;
-			int regOrOffs = mult == 15 ? b->id >> 4 : b->id >> 8;
-			CpuReg reg = (b->id >> 4) & 0xF;
-			if( mult == 15 ) {
-				int pos;
-				if( a->id > 7 ) r64 |= 4;
-				OP(f->r_mem);
-				MOD_RM(0,a->id,5);
-				if( IS_64 ) {
-					// offset wrt current code
-					pos = BUF_POS() + 4;
-					W(regOrOffs - pos);
-				} else {
-					ERRIF(1);
-				}
-			} else if( mult == 0 ) {
-				if( a->id > 7 ) r64 |= 4;
-				if( reg > 7 ) r64 |= 1;
-				OP(f->r_mem);
-				if( regOrOffs == 0 && (reg&7) != Ebp ) {
-					MOD_RM(0,a->id,reg);
-					if( (reg&7) == Esp ) B(0x24);
-				} else if( IS_SBYTE(regOrOffs) ) {
-					MOD_RM(1,a->id,reg);
-					if( (reg&7) == Esp ) B(0x24);
-					B(regOrOffs);
-				} else {
-					MOD_RM(2,a->id,reg);
-					if( (reg&7) == Esp ) B(0x24);
-					W(regOrOffs);
-				}
-			} else {
-				int offset = (int)(int_val)b->holds;
-				if( a->id > 7 ) r64 |= 4;
-				if( reg > 7 ) r64 |= 1;
-				if( regOrOffs > 7 ) r64 |= 2;
-				OP(f->r_mem);
-				MOD_RM(offset == 0 ? 0 : IS_SBYTE(offset) ? 1 : 2,a->id,4);
-				SIB(mult,regOrOffs,reg);
-				if( offset ) {
-					if( IS_SBYTE(offset) ) B(offset); else W(offset);
-				}
-			}
-		}
-		break;
-#	ifndef HL_64
-	case ID2(RFPU,RADDR):
-#	endif
-	case ID2(RCPU,RADDR):
-		ERRIF( f->r_mem == 0 );
-		if( a->id > 7 ) r64 |= 4;
-		OP(f->r_mem);
-		MOD_RM(0,a->id,5);
-		if( IS_64 )
-			W64((int_val)b->holds);
-		else
-			W((int)(int_val)b->holds);
-		break;
-#	ifndef HL_64
-	case ID2(RADDR,RFPU):
-#	endif
-	case ID2(RADDR,RCPU):
-		ERRIF( f->mem_r == 0 );
-		if( b->id > 7 ) r64 |= 4;
-		OP(f->mem_r);
-		MOD_RM(0,b->id,5);
-		if( IS_64 )
-			W64((int_val)a->holds);
-		else
-			W((int)(int_val)a->holds);
-		break;
-	case ID2(RMEM, RCPU):
-	case ID2(RMEM, RFPU):
-		ERRIF( f->mem_r == 0 );
-		{
-			int mult = a->id & 0xF;
-			int regOrOffs = mult == 15 ? a->id >> 4 : a->id >> 8;
-			CpuReg reg = (a->id >> 4) & 0xF;
-			if( mult == 15 ) {
-				int pos;
-				if( b->id > 7 ) r64 |= 4;
-				OP(f->mem_r);
-				MOD_RM(0,b->id,5);
-				if( IS_64 ) {
-					// offset wrt current code
-					pos = BUF_POS() + 4;
-					W(regOrOffs - pos);
-				} else {
-					ERRIF(1);
-				}
-			} else if( mult == 0 ) {
-				if( b->id > 7 ) r64 |= 4;
-				if( reg > 7 ) r64 |= 1;
-				OP(f->mem_r);
-				if( regOrOffs == 0 && (reg&7) != Ebp ) {
-					MOD_RM(0,b->id,reg);
-					if( (reg&7) == Esp ) B(0x24);
-				} else if( IS_SBYTE(regOrOffs) ) {
-					MOD_RM(1,b->id,reg);
-					if( (reg&7) == Esp ) B(0x24);
-					B(regOrOffs);
-				} else {
-					MOD_RM(2,b->id,reg);
-					if( (reg&7) == Esp ) B(0x24);
-					W(regOrOffs);
-				}
-			} else {
-				int offset = (int)(int_val)a->holds;
-				if( b->id > 7 ) r64 |= 4;
-				if( reg > 7 ) r64 |= 1;
-				if( regOrOffs > 7 ) r64 |= 2;
-				OP(f->mem_r);
-				MOD_RM(offset == 0 ? 0 : IS_SBYTE(offset) ? 1 : 2,b->id,4);
-				SIB(mult,regOrOffs,reg);
-				if( offset ) {
-					if( IS_SBYTE(offset) ) B(offset); else W(offset);
-				}
-			}
-		}
-		break;
-	default:
-		ERRIF(1);
-	}
-	if( ctx->debug && ctx->f && o == CALL ) {
-		preg p;
-		op(ctx,MOV,pmem(&p,Esp,-HL_WSIZE),PEBP,true); // erase EIP (clean stack report)
-	}
-}
-
-static void op32( jit_ctx *ctx, CpuOp o, preg *a, preg *b ) {
-	op(ctx,o,a,b,false);
-}
-
-static void op64( jit_ctx *ctx, CpuOp o, preg *a, preg *b ) {
-#ifndef HL_64
-	op(ctx,o,a,b,false);
-#else
-	op(ctx,o,a,b,true);
+	UW(4, 3 /*UWOP_SET_FPREG*/, 0);
+	UW(1, 0 /*UWOP_PUSH_NONVOL*/, 5);
+	while( ctx->out_pos & 15 ) B(0);
 #endif
-}
-
-static void patch_jump( jit_ctx *ctx, int p ) {
-	if( p == 0 ) return;
-	if( p & 0x40000000 ) {
-		int d;
-		p &= 0x3FFFFFFF;
-		d = BUF_POS() - (p + 1);
-		if( d < -128 || d >= 128 ) ASSERT(d);
-		*(char*)(ctx->startBuf + p) = (char)d;
-	} else {
-		*(int*)(ctx->startBuf + p) = BUF_POS() - (p + 4);
-	}
-}
-
-static void patch_jump_to( jit_ctx *ctx, int p, int target ) {
-	if( p == 0 ) return;
-	if( p & 0x40000000 ) {
-		int d;
-		p &= 0x3FFFFFFF;
-		d = target - (p + 1);
-		if( d < -128 || d >= 128 ) ASSERT(d);
-		*(char*)(ctx->startBuf + p) = (char)d;
-	} else {
-		*(int*)(ctx->startBuf + p) = target - (p + 4);
-	}
-}
-
-static int stack_size( hl_type *t ) {
-	switch( t->kind ) {
-	case HUI8:
-	case HUI16:
-	case HBOOL:
-#	ifdef HL_64
-	case HI32:
-	case HF32:
-#	endif
-		return sizeof(int_val);
-	case HI64:
-	default:
-		return hl_type_size(t);
+	hl_codegen_init(ctx);
+	jit_code_append(ctx);
+	if( m->code->hasdebug ) { // one debug-info slot per function, every byte preset to 0xFF
+		m->jit_debug = (hl_debug_infos*)malloc(sizeof(hl_debug_infos) * m->code->nfunctions);
+		if( m->jit_debug ) memset(m->jit_debug, -1, sizeof(hl_debug_infos) * m->code->nfunctions); // on allocation failure jit_debug stays NULL and hl_jit_function skips debug info
 	}
 }
 
-static int call_reg_index( int reg ) {
-#	ifdef HL_64
-	int i;
-	for(i=0;i<CALL_NREGS;i++)
-		if( CALL_REGS[i] == reg )
-			return i;
-#	endif
-	return -1;
+void hl_jit_free( jit_ctx *ctx, h_bool can_reset ) { // release codegen, regs and emit state, then falloc and the context itself; can_reset is not used here
+	hl_codegen_free(ctx);
+	hl_regs_free(ctx);
+	hl_emit_free(ctx);
+	hl_free(&ctx->falloc);
+	free(ctx);
 }
 
-static bool is_call_reg( preg *p ) {
-#	ifdef HL_64
-	int i;
-	if( p->kind == RFPU )
-		return p->id < CALL_NREGS;
-	for(i=0;i<CALL_NREGS;i++)
-		if( p->kind == RCPU && p->id == CALL_REGS[i] )
-			return true;
-	return false;
-#	else
-	return false;
-#	endif
+void hl_jit_reset( jit_ctx *ctx, hl_module *m ) { // currently a no-op: the body is empty
 }
 
-static preg *alloc_reg( jit_ctx *ctx, preg_kind k ) {
-	int i;
-	preg *p;
-	switch( k ) {
-	case RCPU:
-	case RCPU_CALL:
-	case RCPU_8BITS:
-		{
-			int off = ctx->allocOffset++;
-			const int count = RCPU_SCRATCH_COUNT;
-			for(i=0;i<count;i++) {
-				int r = RCPU_SCRATCH_REGS[(i + off)%count];
-				p = ctx->pregs + r;
-				if( p->lock >= ctx->currentPos ) continue;
-				if( k == RCPU_CALL && is_call_reg(p) ) continue;
-				if( k == RCPU_8BITS && !is_reg8(p) ) continue;
-				if( p->holds == NULL ) {
-					RLOCK(p);
-					return p;
-				}
-			}
-			for(i=0;i<count;i++) {
-				preg *p = ctx->pregs + RCPU_SCRATCH_REGS[(i + off)%count];
-				if( p->lock >= ctx->currentPos ) continue;
-				if( k == RCPU_CALL && is_call_reg(p) ) continue;
-				if( k == RCPU_8BITS && !is_reg8(p) ) continue;
-				if( p->holds ) {
-					RLOCK(p);
-					p->holds->current = NULL;
-					p->holds = NULL;
-					return p;
-				}
-			}
-		}
-		break;
-	case RFPU:
-		{
-			int off = ctx->allocOffset++;
-			const int count = RFPU_SCRATCH_COUNT;
-			for(i=0;i<count;i++) {
-				preg *p = PXMM((i + off)%count);
-				if( p->lock >= ctx->currentPos ) continue;
-				if( p->holds == NULL ) {
-					RLOCK(p);
-					return p;
-				}
-			}
-			for(i=0;i<count;i++) {
-				preg *p = PXMM((i + off)%count);
-				if( p->lock >= ctx->currentPos ) continue;
-				if( p->holds ) {
-					RLOCK(p);
-					p->holds->current = NULL;
-					p->holds = NULL;
-					return p;
-				}
-			}
+int hl_jit_function( jit_ctx *ctx, hl_module *m, hl_function *f ) { // compile f via the emit, regs and codegen passes and append it to the output; returns its start offset, or -1 if the append fails
+	hl_free(&ctx->falloc); // release what the previous function allocated from falloc
+	ctx->mod = m;
+	ctx->fun = f;
+	ctx->reg_instr_count = 0;
+	ctx->code_size = 0;
+	current_ctx = ctx; // lets hl_jit_error dump this context while the passes run
+	hl_emit_function(ctx);
+	hl_regs_function(ctx);
+	hl_codegen_function(ctx);
+	int pos = ctx->out_pos; // offset in the output where this function's code will start
+	hl_jit_define_function(ctx, pos, ctx->code_size);
+	if( m->jit_debug && ctx->code_pos_map ) {
+		bool compact = ctx->code_size < 0xFFFF; // 16-bit offsets are enough for small functions
+		void *debug = malloc((compact ? sizeof(unsigned short) : sizeof(int)) * (f->nops + 1)); // NOTE(review): result is not checked for NULL
+		for(int i=0;i<=f->nops;i++) {
+			int ipos = ctx->emit_pos_map[i]; // opcode index -> emit position
+			int rpos = ctx->reg_pos_map[ipos]; // emit position -> regs position
+			int cpos = ctx->code_pos_map[rpos]; // regs position -> code offset
+			if( compact )
+				((unsigned short*)debug)[i] = (unsigned short)cpos;
+			else
+				((int*)debug)[i] = cpos;
 		}
-		break;
-	default:
-		ASSERT(k);
-	}
-	ASSERT(0); // out of registers !
-	return NULL;
-}
-
-static preg *fetch( vreg *r ) {
-	if( r->current )
-		return r->current;
-	return &r->stack;
-}
-
-static void scratch( preg *r ) {
-	if( r && r->holds ) {
-		r->holds->current = NULL;
-		r->holds = NULL;
-		r->lock = 0;
+		int fid = (int)(f - m->code->functions);
+		m->jit_debug[fid].start = pos;
+		m->jit_debug[fid].offsets = debug;
+		m->jit_debug[fid].large = !compact;
 	}
+	bool appended = jit_code_append(ctx); // copy the generated code into the output buffer
+	current_ctx = NULL; // cleared on failure as well, so hl_jit_error cannot later dump a stale (possibly freed) context
+	if( !appended ) return -1; // out of memory while growing the output buffer
+	return pos;
 }
 
-static preg *copy( jit_ctx *ctx, preg *to, preg *from, int size );
-
-static void load( jit_ctx *ctx, preg *r, vreg *v ) {
-	preg *from = fetch(v);
-	if( from == r || v->size == 0 ) return;
-	if( r->holds ) r->holds->current = NULL;
-	if( v->current ) {
-		v->current->holds = NULL;
-		from = r;
-	}
-	r->holds = v;
-	v->current = r;
-	copy(ctx,r,from,v->size);
-}
+static void *call_jit_c2hl = hl_jit_assert;
+static void *call_jit_hl2c = hl_jit_assert;
+static int arg_reg_count = 0;
+static int arg_fp_count = 0;
 
-static preg *alloc_fpu( jit_ctx *ctx, vreg *r, bool andLoad ) {
-	preg *p = fetch(r);
-	if( p->kind != RFPU ) {
-		if( !IS_FLOAT(r) && (IS_64 || r->t->kind != HI64) ) ASSERT(r->t->kind);
-		p = alloc_reg(ctx, RFPU);
-		if( andLoad )
-			load(ctx,p,r);
-		else {
-			if( r->current )
-				r->current->holds = NULL;
-			r->current = p;
-			p->holds = r;
+static int get_next_reg( hl_type *t, int *rp, int *fp ) { // next argument register index for type t, or -1 when none is left; HF32/HF64 count in *fp, other kinds in *rp
+	if( t->kind == HF32 || t->kind == HF64 ) {
+		if( *fp < arg_fp_count ) {
+			int r = (*fp)++;
+			if( IS_WINCALL64 ) (*rp)++; // under IS_WINCALL64 both counters advance together
+			return r;
 		}
-	} else
-		RLOCK(p);
-	return p;
-}
-
-static void reg_bind( vreg *r, preg *p ) {
-	if( r->current )
-		r->current->holds = NULL;
-	r->current = p;
-	p->holds = r;
-}
-
-static preg *alloc_cpu( jit_ctx *ctx, vreg *r, bool andLoad ) {
-	preg *p = fetch(r);
-	if( p->kind != RCPU ) {
-#		ifndef HL_64
-		if( r->t->kind == HI64 ) return alloc_fpu(ctx,r,andLoad);
-		if( r->size > 4 ) ASSERT(r->size);
-#		endif
-		p = alloc_reg(ctx, RCPU);
-		if( andLoad )
-			load(ctx,p,r);
-		else
-			reg_bind(r,p);
-	} else
-		RLOCK(p);
-	return p;
-}
-
-// allocate a register that is not a call parameter
-static preg *alloc_cpu_call( jit_ctx *ctx, vreg *r ) {
-	preg *p = fetch(r);
-	if( p->kind != RCPU ) {
-#		ifndef HL_64
-		if( r->t->kind == HI64 ) return alloc_fpu(ctx,r,true);
-		if( r->size > 4 ) ASSERT(r->size);
-#		endif
-		p = alloc_reg(ctx, RCPU_CALL);
-		load(ctx,p,r);
-	} else if( is_call_reg(p) ) {
-		preg *p2 = alloc_reg(ctx, RCPU_CALL);
-		op64(ctx,MOV,p2,p);
-		scratch(p);
-		reg_bind(r,p2);
-		return p2;
-	} else
-		RLOCK(p);
-	return p;
-}
-
-static preg *fetch32( jit_ctx *ctx, vreg *r ) {
-	if( r->current )
-		return r->current;
-	// make sure that the register is correctly erased
-	if( r->size < 4 ) {
-		preg *p = alloc_cpu(ctx, r, true);
-		RUNLOCK(p);
-		return p;
+		return -1;
 	}
-	return fetch(r);
-}
-
-// make sure higher bits are zeroes
-static preg *alloc_cpu64( jit_ctx *ctx, vreg *r, bool andLoad ) {
-#	ifndef HL_64
-	return alloc_cpu(ctx,r,andLoad);
-#	else
-	preg *p = fetch(r);
-	if( !andLoad ) ASSERT(0);
-	if( p->kind != RCPU ) {
-		p = alloc_reg(ctx, RCPU);
-		op64(ctx,XOR,p,p);
-		load(ctx,p,r);
-	} else {
-		// remove higher bits
-		preg tmp;
-		op64(ctx,SHL,p,pconst(&tmp,32));
-		op64(ctx,SHR,p,pconst(&tmp,32));
-		RLOCK(p);
+	if( *rp < arg_reg_count ) { // non-float arguments are limited by the integer register count, not the float one
+		int r = (*rp)++;
+		if( IS_WINCALL64 ) (*fp)++; // under IS_WINCALL64 both counters advance together
+		return r;
 	}
-	return p;
-#	endif
+	return -1;
 }
 
-// make sure the register can be used with 8 bits access
-static preg *alloc_cpu8( jit_ctx *ctx, vreg *r, bool andLoad ) {
-	preg *p = fetch(r);
-	if( p->kind != RCPU ) {
-		p = alloc_reg(ctx, RCPU_8BITS);
-		load(ctx,p,r);
-	} else if( !is_reg8(p) ) {
-		preg *p2 = alloc_reg(ctx, RCPU_8BITS);
-		op64(ctx,MOV,p2,p);
-		scratch(p);
-		reg_bind(r,p2);
-		return p2;
-	} else
-		RLOCK(p);
-	return p;
+static void *default_wrapper( hl_type *ft ) { // returns the shared hl2c entry point; ft is not used
+	return call_jit_hl2c;
 }
 
-static preg *copy( jit_ctx *ctx, preg *to, preg *from, int size ) {
-	if( size == 0 || to == from ) return to;
-	switch( ID2(to->kind,from->kind) ) {
-	case ID2(RMEM,RCPU):
-	case ID2(RSTACK,RCPU):
-	case ID2(RCPU,RSTACK):
-	case ID2(RCPU,RMEM):
-	case ID2(RCPU,RCPU):
-#	ifndef HL_64
-	case ID2(RCPU,RADDR):
-	case ID2(RADDR,RCPU):
-#	endif
-		switch( size ) {
-		case 1:
-			if( to->kind == RCPU ) {
-				op64(ctx,XOR,to,to);
-				if( !is_reg8(to) ) {
-					preg p;
-					op32(ctx,MOV16,to,from);
-					op32(ctx,SHL,to,pconst(&p,24));
-					op32(ctx,SHR,to,pconst(&p,24));
-					break;
-				}
-			}
-			if( !is_reg8(from) ) {
-				preg *r = alloc_reg(ctx, RCPU_CALL);
-				op32(ctx, MOV, r, from);
-				RUNLOCK(r);
-				op32(ctx,MOV8,to,r);
-				return from;
-			}
-			op32(ctx,MOV8,to,from);
-			break;
-		case 2:
-			if( to->kind == RCPU )
-				op64(ctx,XOR,to,to);
-			op32(ctx,MOV16,to,from);
+static void *callback_c2hl( void *f, hl_type *t, void **args, vdynamic *ret ) { // pack args into register and stack slots, then call f through call_jit_c2hl
+	int nargs = t->fun->nargs;
+	if( nargs > MAX_ARGS )
+		hl_error("Too many arguments for dynamic call");
+	struct {
+		void *regs[MAX_ARGS];
+		void *stack[MAX_ARGS];
+	} vargs;
+	int rp = 0, fp = 0, sp = MAX_ARGS; // sp counts down: stack slots are filled from the end of vargs.stack
+	for(int i=0;i<t->fun->nargs;i++) {
+		hl_type *at = t->fun->args[i];
+		void *v = args[i];
+		int r = get_next_reg(at,&rp,&fp); // negative means the argument goes on the stack
+		int_val iv;
+		switch( at->kind ) {
+		case HBOOL:
+		case HUI8:
+		case HUI16:
+		case HI32:
+		case HF32:
+			iv = *(int*)v; // these kinds are all read through an int pointer
 			break;
-		case 4:
-			op32(ctx,MOV,to,from);
+		case HI64:
+		case HGUID:
+		case HF64:
+			iv = *(int_val*)v;
 			break;
-		case 8:
-			if( IS_64 ) {
-				op64(ctx,MOV,to,from);
-				break;
-			}
 		default:
-			ASSERT(size);
-		}
-		return to->kind == RCPU ? to : from;
-	case ID2(RFPU,RFPU):
-	case ID2(RMEM,RFPU):
-	case ID2(RSTACK,RFPU):
-	case ID2(RFPU,RMEM):
-	case ID2(RFPU,RSTACK):
-		switch( size ) {
-		case 8:
-			op64(ctx,MOVSD,to,from);
+			iv = (int_val)v;
 			break;
-		case 4:
-			op32(ctx,MOVSS,to,from);
-			break;
-		default:
-			ASSERT(size);
 		}
-		return to->kind == RFPU ? to : from;
-	case ID2(RMEM,RSTACK):
-		{
-			vreg *rfrom = R(from->id);
-			if( IS_FLOAT(rfrom) )
-				return copy(ctx,to,alloc_fpu(ctx,rfrom,true),size);
-			return copy(ctx,to,alloc_cpu(ctx,rfrom,true),size);
-		}
-	case ID2(RMEM,RMEM):
-	case ID2(RSTACK,RMEM):
-	case ID2(RSTACK,RSTACK):
-#	ifndef HL_64
-	case ID2(RMEM,RADDR):
-	case ID2(RSTACK,RADDR):
-	case ID2(RADDR,RSTACK):
-#	endif
-		{
-			preg *tmp;
-			if( (!IS_64 && size == 8) || (to->kind == RSTACK && IS_FLOAT(R(to->id))) || (from->kind == RSTACK && IS_FLOAT(R(from->id))) ) {
-				tmp = alloc_reg(ctx, RFPU);
-				op64(ctx,size == 8 ? MOVSD : MOVSS,tmp,from);
-			} else {
-				tmp = alloc_reg(ctx, RCPU);
-				copy(ctx,tmp,from,size);
-			}
-			return copy(ctx,to,tmp,size);
-		}
-#	ifdef HL_64
-	case ID2(RCPU,RADDR):
-	case ID2(RMEM,RADDR):
-	case ID2(RSTACK,RADDR):
-		{
-			preg p;
-			preg *tmp = alloc_reg(ctx, RCPU);
-			op64(ctx,MOV,tmp,pconst64(&p,(int_val)from->holds));
-			return copy(ctx,to,pmem(&p,tmp->id,0),size);
-		}
-	case ID2(RADDR,RCPU):
-	case ID2(RADDR,RMEM):
-	case ID2(RADDR,RSTACK):
-		{
-			preg p;
-			preg *tmp = alloc_reg(ctx, RCPU);
-			op64(ctx,MOV,tmp,pconst64(&p,(int_val)to->holds));
-			return copy(ctx,pmem(&p,tmp->id,0),from,size);
-		}
-#	endif
+		if( r >= 0 )
+			vargs.regs[r + (at->kind == HF32 || at->kind == HF64 ? arg_reg_count : 0)] = (void*)iv;
+		else
+			vargs.stack[--sp] = (void*)iv;
+	}
+	switch( t->fun->ret->kind ) {
+	case HUI8:
+	case HUI16:
+	case HI32:
+	case HBOOL:
+		ret->v.i = ((int (*)(void *, void *, int))call_jit_c2hl)(f, &vargs, MAX_ARGS - sp);
+		return &ret->v.i;
+	case HI64:
+	case HGUID:
+		ret->v.i64 = ((int64 (*)(void *, void *, int))call_jit_c2hl)(f, &vargs, MAX_ARGS - sp);
+		return &ret->v.i64;
+	case HF32:
+		ret->v.f = ((float (*)(void *, void *, int))call_jit_c2hl)(f, &vargs, MAX_ARGS - sp);
+		return &ret->v.f;
+	case HF64:
+		ret->v.d = ((double (*)(void *, void *, int))call_jit_c2hl)(f, &vargs, MAX_ARGS - sp);
+		return &ret->v.d;
 	default:
-		break;
+		return ((void *(*)(void *, void *, int))call_jit_c2hl)(f, &vargs, MAX_ARGS - sp);
 	}
-	printf("copy(%s,%s)\n",KNAMES[to->kind], KNAMES[from->kind]);
-	ASSERT(0);
-	return NULL;
 }
 
-static void store( jit_ctx *ctx, vreg *r, preg *v, bool bind ) {
-	if( r->current && r->current != v ) {
-		r->current->holds = NULL;
-		r->current = NULL;
-	}
-	v = copy(ctx,&r->stack,v,r->size);
-	if( IS_FLOAT(r) != (v->kind == RFPU) )
-		ASSERT(0);
-	if( bind && r->current != v && (v->kind == RCPU || v->kind == RFPU) ) {
-		scratch(v);
-		r->current = v;
-		v->holds = r;
+static vdynamic *callback_hl2c( vclosure_wrapper *c, char *stack_args, void **regs ) { // converts each incoming argument to a vdynamic* and calls c->wrappedFun through hl_dyn_call
+	vdynamic *args[MAX_ARGS];
+	int nargs = c->cl.t->fun->nargs;
+	if( nargs > MAX_ARGS )
+		hl_error("Too many arguments for wrapped call");
+	int rp = 0, fp = 0; // cursors passed by address to get_next_reg below
+	rp++; // skip fptr in HL64 - was passed as arg0
+	if( IS_WINCALL64 ) fp++; // NOTE(review): presumably Win64 argument slots are positional so arg0 also uses up float slot 0 - confirm
+	for(int i=0;i<nargs;i++) {
+		hl_type *t = c->cl.t->fun->args[i];
+		int creg = get_next_reg(t,&rp,&fp);
+		if( creg < 0 ) { // negative index: the argument is read from stack_args instead of regs
+			args[i] = hl_is_dynamic(t) ? *(vdynamic**)stack_args : hl_make_dyn(stack_args,t);
+			stack_args += (t->kind == HF64 ? 8 : HL_WSIZE); // step over this argument's stack slot
+		} else if( hl_is_dynamic(t) ) {
+			args[i] = *(vdynamic**)(regs + creg); // already a vdynamic pointer, taken as-is
+		} else if( t->kind == HF32 || t->kind == HF64 ) {
+			args[i] = hl_make_dyn(regs + arg_reg_count + creg,&hlt_f64); // float slots sit arg_reg_count entries into regs; HF32 is also read as hlt_f64 - NOTE(review): confirm the caller widens f32
+		} else {
+			args[i] = hl_make_dyn(regs + creg,t);
+		}
 	}
+	return hl_dyn_call(c->wrappedFun,args,nargs);
 }
 
-static void store_result( jit_ctx *ctx, vreg *r ) {
-#	ifndef HL_64
-	switch( r->t->kind ) {
-	case HF64:
-		scratch(r->current);
-		op64(ctx,FSTP,&r->stack,UNUSED);
-		break;
-	case HF32:
-		scratch(r->current);
-		op64(ctx,FSTP32,&r->stack,UNUSED);
-		break;
+void *hl_jit_wrapper_ptr( vclosure_wrapper *c, char *stack_args, void **regs ) { // runs the wrapped closure via callback_hl2c and casts the dynamic result to the declared return type
+	vdynamic *ret = callback_hl2c(c, stack_args, regs);
+	hl_type *tret = c->cl.t->fun->ret;
+	switch( tret->kind ) {
+	case HVOID:
+		return NULL;
+	case HUI8:
+	case HUI16:
+	case HI32:
+	case HBOOL:
+		return (void*)(int_val)hl_dyn_casti(&ret,&hlt_dyn,tret); // integer result is carried in the pointer-sized return value
 	case HI64:
-		scratch(r->current);
-		error_i64();
-		break;
+	case HGUID:
+		return (void*)(int_val)hl_dyn_casti64(&ret,&hlt_dyn);
 	default:
-#	endif
-		store(ctx,r,IS_FLOAT(r) ? REG_AT(XMM(0)) : PEAX,true);
-#	ifndef HL_64
-		break;
-	}
-#	endif
-}
-
-static void op_mov( jit_ctx *ctx, vreg *to, vreg *from ) {
-	preg *r = fetch(from);
-#	ifndef HL_64
-	if( to->t->kind == HI64 ) {
-		error_i64();
-		return;
+		return hl_dyn_castp(&ret,&hlt_dyn,tret); // every other kind goes through the pointer cast
 	}
-#	endif
-	if( from->t->kind == HF32 && r->kind != RFPU )
-		r = alloc_fpu(ctx,from,true);
-	store(ctx, to, r, true);
 }
 
-static void copy_to( jit_ctx *ctx, vreg *to, preg *from ) {
-	store(ctx,to,from,true);
+double hl_jit_wrapper_d( vclosure_wrapper *c, char *stack_args, void **regs ) { // runs the wrapped closure via callback_hl2c and returns the result cast to double by hl_dyn_castd
+	vdynamic *ret = callback_hl2c(c, stack_args, regs);
+	return hl_dyn_castd(&ret,&hlt_dyn);
 }
 
-static void copy_from( jit_ctx *ctx, preg *to, vreg *from ) {
-	copy(ctx,to,fetch(from),from->size);
+void *hl_jit_code( jit_ctx *ctx, hl_module *m, int *codesize, hl_debug_infos **debug, hl_module *previous ) { // copies the emitted code into executable memory and installs the call hooks; returns NULL if allocation fails (previous is not referenced here)
+	hl_codegen_flush_consts(ctx);
+	jit_code_append(ctx);
+	int size = ctx->out_pos;
+	if( size & 4095 ) size += 4096 - (size&4095); // round the allocation up to a multiple of 4096
+	unsigned char *code = (unsigned char*)hl_alloc_executable_memory(size);
+	if( code == NULL ) return NULL;
+	memcpy(code,ctx->output,ctx->out_pos); // copy only the emitted bytes: size is rounded up and may exceed what ctx->output holds
+	*codesize = size; // callers receive the rounded allocation size, not the emitted length
+	*debug = m->jit_debug;
+	ctx->final_code = code;
+	hl_emit_final(ctx);
+	hl_codegen_final(ctx);
+	arg_reg_count = ctx->cfg.regs.nargs; // cached in file-level variables for the C<->HL callback helpers
+	arg_fp_count = ctx->cfg.floats.nargs;
+	call_jit_c2hl = ctx->final_code + ctx->code_funs.c2hl; // entry points = final code base + offsets recorded in code_funs
+	call_jit_hl2c = ctx->final_code + ctx->code_funs.hl2c;
+#	ifdef WIN64_UNWIND_TABLES
+	ctx->mod->unwind_table_size = ctx->fdef_index;
+#	endif
+	hl_setup.get_wrapper = default_wrapper;
+	hl_setup.static_call = callback_c2hl;
+	return code;
 }
 
-static void store_const( jit_ctx *ctx, vreg *r, int c ) {
-	preg p;
-	if( c == 0 )
-		op(ctx,XOR,alloc_cpu(ctx,r,false),alloc_cpu(ctx,r,false),r->size == 8);
-	else if( r->size == 8 )
-		op64(ctx,MOV,alloc_cpu(ctx,r,false),pconst64(&p,c));
-	else
-		op32(ctx,MOV,alloc_cpu(ctx,r,false),pconst(&p,c));
-	store(ctx,r,r->current,false);
+void hl_jit_patch_method( void*fun, void**newt ) { // neither parameter is used: the body only invokes jit_assert()
+	jit_assert();
 }
-
-static void discard_regs( jit_ctx *ctx, bool native_call ) {
-	int i;
-	for(i=0;i<RCPU_SCRATCH_COUNT;i++) {
-		preg *r = ctx->pregs + RCPU_SCRATCH_REGS[i];
-		if( r->holds ) {
-			r->holds->current = NULL;
-			r->holds = NULL;
-		}
-	}
-	for(i=0;i<RFPU_COUNT;i++) {
-		preg *r = ctx->pregs + XMM(i);
-		if( r->holds ) {
-			r->holds->current = NULL;
-			r->holds = NULL;
-		}
-	}
-}
-
-static int pad_before_call( jit_ctx *ctx, int size ) {
-	int total = size + ctx->totalRegsSize + HL_WSIZE * 2; // EIP+EBP
-	if( total & 15 ) {
-		int pad = 16 - (total & 15);
-		preg p;
-		if( pad ) op64(ctx,SUB,PESP,pconst(&p,pad));
-		size += pad;
-	}
-	return size;
-}
-
-static void push_reg( jit_ctx *ctx, vreg *r ) {
-	preg p;
-	switch( stack_size(r->t) ) {
-	case 1:
-		op64(ctx,SUB,PESP,pconst(&p,1));
-		op32(ctx,MOV8,pmem(&p,Esp,0),alloc_cpu8(ctx,r,true));
-		break;
-	case 2:
-		op64(ctx,SUB,PESP,pconst(&p,2));
-		op32(ctx,MOV16,pmem(&p,Esp,0),alloc_cpu(ctx,r,true));
-		break;
-	case 4:
-		if( r->size < 4 )
-			alloc_cpu(ctx,r,true); // force fetch (higher bits set to 0)
-		if( !IS_64 ) {
-			if( r->current != NULL && r->current->kind == RFPU ) scratch(r->current);
-			op32(ctx,PUSH,fetch(r),UNUSED);
-		} else {
-			// pseudo push32 (not available)
-			op64(ctx,SUB,PESP,pconst(&p,4));
-			op32(ctx,MOV,pmem(&p,Esp,0),alloc_cpu(ctx,r,true));
-		}
-		break;
-	case 8:
-		if( fetch(r)->kind == RFPU ) {
-			op64(ctx,SUB,PESP,pconst(&p,8));
-			op64(ctx,MOVSD,pmem(&p,Esp,0),fetch(r));
-		} else if( IS_64 )
-			op64(ctx,PUSH,fetch(r),UNUSED);
-		else if( r->stack.kind == RSTACK ) {
-			scratch(r->current);
-			r->stackPos += 4;
-			op32(ctx,PUSH,&r->stack,UNUSED);
-			r->stackPos -= 4;
-			op32(ctx,PUSH,&r->stack,UNUSED);
-		} else
-			ASSERT(0);
-		break;
-	default:
-		ASSERT(r->size);
-	}
-}
-
-static int begin_native_call( jit_ctx *ctx, int nargs ) {
-	ctx->nativeArgsCount = nargs;
-	return pad_before_call(ctx, nargs > CALL_NREGS ? (nargs - CALL_NREGS) * HL_WSIZE : 0);
-}
-
-static preg *alloc_native_arg( jit_ctx *ctx ) {
-#	ifdef HL_64
-	int rid = ctx->nativeArgsCount - 1;
-	preg *r = rid < CALL_NREGS ? REG_AT(CALL_REGS[rid]) : alloc_reg(ctx,RCPU_CALL);
-	scratch(r);
-	return r;
-#	else
-	return alloc_reg(ctx, RCPU);
-#	endif
-}
-
-static void set_native_arg( jit_ctx *ctx, preg *r ) {
-	if( r->kind == RSTACK ) {
-		vreg *v = ctx->vregs + r->id;
-		if( v->size < 4 )
-			r = fetch32(ctx, v);
-	}
-#	ifdef HL_64
-	if( r->kind == RFPU ) ASSERT(0);
-	int rid = --ctx->nativeArgsCount;
-	preg *target;
-	if( rid >= CALL_NREGS ) {
-		op64(ctx,PUSH,r,UNUSED);
-		return;
-	}
-	target = REG_AT(CALL_REGS[rid]);
-	if( target != r ) {
-		op64(ctx, MOV, target, r);
-		scratch(target);
-	}
-#	else
-	op32(ctx,PUSH,r,UNUSED);
-#	endif
-}
-
-static void set_native_arg_fpu( jit_ctx *ctx, preg *r, bool isf32 ) {
-#	ifdef HL_64
-	if( r->kind == RCPU ) ASSERT(0);
-	// can only be used if last argument !!
-	ctx->nativeArgsCount--;
-	preg *target = REG_AT(XMM(IS_WINCALL64 ? ctx->nativeArgsCount : 0));
-	if( target != r ) {
-		op64(ctx, isf32 ? MOVSS : MOVSD, target, r);
-		scratch(target);
-	}
-#	else
-	op32(ctx,PUSH,r,UNUSED);
-#	endif
-}
-
-typedef struct {
-	int nextCpu;
-	int nextFpu;
-	int mapped[REG_COUNT];
-} call_regs;
-
-static int select_call_reg( call_regs *regs, hl_type *t, int id ) {
-#	ifndef HL_64
-	return -1;
-#else
-	bool isFloat = t->kind == HF32 || t->kind == HF64;
-#	ifdef HL_WIN_CALL
-	int index = regs->nextCpu++;
-#	else
-	int index = isFloat ? regs->nextFpu++ : regs->nextCpu++;
-#	endif
-	if( index >= CALL_NREGS )
-		return -1;
-	int reg = isFloat ? XMM(index) : CALL_REGS[index];
-	regs->mapped[reg] = id + 1;
-	return reg;
-#endif
-}
-
-static int mapped_reg( call_regs *regs, int id ) {
-#	ifndef HL_64
-	return -1;
-#else
-	int i;
-	for(i=0;i<CALL_NREGS;i++) {
-		int r = CALL_REGS[i];
-		if( regs->mapped[r] == id + 1 ) return r;
-		r = XMM(i);
-		if( regs->mapped[r] == id + 1 ) return r;
-	}
-	return -1;
-#endif
-}
-
-static int prepare_call_args( jit_ctx *ctx, int count, int *args, vreg *vregs, int extraSize ) {
-	int i;
-	int size = extraSize, paddedSize;
-	call_regs ctmp = {0};
-	for(i=0;i<count;i++) {
-		vreg *r = vregs + args[i];
-		int cr = select_call_reg(&ctmp, r->t, i);
-		if( cr >= 0 ) {
-			preg *c = REG_AT(cr);
-			preg *cur = fetch(r);
-			if( cur != c ) {
-				copy(ctx,c,cur,r->size);
-				scratch(c);
-			}
-			RLOCK(c);
-			continue;
-		}
-		size += stack_size(r->t);
-	}
-	paddedSize = pad_before_call(ctx,size);
-	for(i=0;i<count;i++) {
-		// RTL
-		int j = count - (i + 1);
-		vreg *r = vregs + args[j];
-		if( (i & 7) == 0 ) jit_buf(ctx);
-		if( mapped_reg(&ctmp,j) >= 0 ) continue;
-		push_reg(ctx,r);
-		if( r->current ) RUNLOCK(r->current);
-	}
-	return paddedSize;
-}
-
-static void op_call( jit_ctx *ctx, preg *r, int size ) {
-	preg p;
-#	ifdef JIT_DEBUG
-	if( IS_64 && size >= 0 ) {
-		int jchk;
-		op32(ctx,TEST,PESP,pconst(&p,15));
-		XJump(JZero,jchk);
-		BREAK(); // unaligned ESP
-		patch_jump(ctx, jchk);
-	}
-#	endif
-	if( IS_WINCALL64 ) {
-		// MSVC requires 32bytes of free space here
-		op64(ctx,SUB,PESP,pconst(&p,32));
-		if( size >= 0 ) size += 32;
-	}
-	op32(ctx, CALL, r, UNUSED);
-	if( size > 0 ) op64(ctx,ADD,PESP,pconst(&p,size));
-}
-
-static void call_native( jit_ctx *ctx, void *nativeFun, int size ) {
-	bool isExc = nativeFun == hl_assert || nativeFun == hl_throw || nativeFun == on_jit_error;
-	preg p;
-	// native function, already resolved
-	op64(ctx,MOV,PEAX,pconst64(&p,(int_val)nativeFun));
-	op_call(ctx,PEAX, isExc ? -1 : size);
-	if( isExc )
-		return;
-	discard_regs(ctx, true);
-}
-
-static void op_call_fun( jit_ctx *ctx, vreg *dst, int findex, int count, int *args ) {
-	int fid = findex < 0 ? -1 : ctx->m->functions_indexes[findex];
-	bool isNative = fid >= ctx->m->code->nfunctions;
-	int size = prepare_call_args(ctx,count,args,ctx->vregs,0);
-	preg p;
-	if( fid < 0 ) {
-		ASSERT(fid);
-	} else if( isNative ) {
-		call_native(ctx,ctx->m->functions_ptrs[findex],size);
-	} else {
-		int cpos = BUF_POS() + (IS_WINCALL64 ? 4 : 0);
-#		ifdef JIT_DEBUG
-		if( IS_64 ) cpos += 13; // ESP CHECK
-#		endif
-		if( ctx->m->functions_ptrs[findex] ) {
-			// already compiled
-			op_call(ctx,pconst(&p,(int)(int_val)ctx->m->functions_ptrs[findex] - (cpos + 5)), size);
-		} else if( ctx->m->code->functions + fid == ctx->f ) {
-			// our current function
-			op_call(ctx,pconst(&p, ctx->functionPos - (cpos + 5)), size);
-		} else {
-			// stage for later
-			jlist *j = (jlist*)hl_malloc(&ctx->galloc,sizeof(jlist));
-			j->pos = cpos;
-			j->target = findex;
-			j->next = ctx->calls;
-			ctx->calls = j;
-			op_call(ctx,pconst(&p,0), size);
-		}
-		discard_regs(ctx, false);
-	}
-	if( dst )
-		store_result(ctx,dst);
-}
-
-static void op_enter( jit_ctx *ctx ) {
-	preg p;
-	op64(ctx, PUSH, PEBP, UNUSED);
-	op64(ctx, MOV, PEBP, PESP);
-	if( ctx->totalRegsSize ) op64(ctx, SUB, PESP, pconst(&p,ctx->totalRegsSize));
-}
-
-static void op_ret( jit_ctx *ctx, vreg *r ) {
-	preg p;
-	switch( r->t->kind ) {
-	case HF32:
-#		ifdef HL_64
-		op64(ctx, MOVSS, PXMM(0), fetch(r));
-#		else
-		op64(ctx,FLD32,&r->stack,UNUSED);
-#		endif
-		break;
-	case HF64:
-#		ifdef HL_64
-		op64(ctx, MOVSD, PXMM(0), fetch(r));
-#		else
-		op64(ctx,FLD,&r->stack,UNUSED);
-#		endif
-		break;
-	default:
-		if( r->size < 4 && !r->current )
-			fetch32(ctx, r);
-		if( r->current != PEAX )
-			op64(ctx,MOV,PEAX,fetch(r));
-		break;
-	}
-	if( ctx->totalRegsSize ) op64(ctx, ADD, PESP, pconst(&p, ctx->totalRegsSize));
-#	ifdef JIT_DEBUG
-	{
-		int jeq;
-		op64(ctx, CMP, PESP, PEBP);
-		XJump_small(JEq,jeq);
-		jit_error("invalid ESP");
-		patch_jump(ctx,jeq);
-	}
-#	endif
-	op64(ctx, POP, PEBP, UNUSED);
-	op64(ctx, RET, UNUSED, UNUSED);
-}
-
-static void call_native_consts( jit_ctx *ctx, void *nativeFun, int_val *args, int nargs ) {
-	int size = pad_before_call(ctx, IS_64 ? 0 : HL_WSIZE*nargs);
-	preg p;
-	int i;
-#	ifdef HL_64
-	for(i=0;i<nargs;i++)
-		op64(ctx, MOV, REG_AT(CALL_REGS[i]), pconst64(&p, args[i]));
-#	else
-	for(i=nargs-1;i>=0;i--)
-		op32(ctx, PUSH, pconst64(&p, args[i]), UNUSED);
-#	endif
-	call_native(ctx, nativeFun, size);
-}
-
-static void on_jit_error( const char *msg, int_val line ) {
-	char buf[256];
-	int iline = (int)line;
-	sprintf(buf,"%s (line %d)",msg,iline);
-#ifdef HL_WIN_DESKTOP
-	MessageBoxA(NULL,buf,"JIT ERROR",MB_OK);
-#else
-	printf("JIT ERROR : %s\n",buf);
-#endif
-	hl_debug_break();
-	hl_throw(NULL);
-}
-
-static void _jit_error( jit_ctx *ctx, const char *msg, int line ) {
-	int_val args[2] = { (int_val)msg, (int_val)line };
-	call_native_consts(ctx,on_jit_error,args,2);
-}
-
-
-static preg *op_binop( jit_ctx *ctx, vreg *dst, vreg *a, vreg *b, hl_op bop ) {
-	preg *pa = fetch(a), *pb = fetch(b), *out = NULL;
-	CpuOp o;
-	if( IS_FLOAT(a) ) {
-		bool isf32 = a->t->kind == HF32;
-		switch( bop ) {
-		case OAdd: o = isf32 ? ADDSS : ADDSD; break;
-		case OSub: o = isf32 ? SUBSS : SUBSD; break;
-		case OMul: o = isf32 ? MULSS : MULSD; break;
-		case OSDiv: o = isf32 ? DIVSS : DIVSD; break;
-		case OJSLt:
-		case OJSGte:
-		case OJSLte:
-		case OJSGt:
-		case OJEq:
-		case OJNotEq:
-		case OJNotLt:
-		case OJNotGte:
-			o = isf32 ? COMISS : COMISD;
-			break;
-		case OSMod:
-			{
-				int args[] = { a->stack.id, b->stack.id };
-				int size = prepare_call_args(ctx,2,args,ctx->vregs,0);
-				void *mod_fun;
-				if( isf32 ) mod_fun = fmodf; else mod_fun = fmod;
-				call_native(ctx,mod_fun,size);
-				store_result(ctx,dst);
-				return fetch(dst);
-			}
-		default:
-			printf("%s\n", hl_op_name(bop));
-			ASSERT(bop);
-		}
-	} else {
-		bool is64 =	a->t->kind == HI64;
-#	ifndef HL_64
-		if( is64 ) {
-			error_i64();
-			return fetch(a);
-		}
-#	endif
-		switch( bop ) {
-		case OAdd: o = ADD; break;
-		case OSub: o = SUB; break;
-		case OMul: o = IMUL; break;
-		case OAnd: o = AND; break;
-		case OOr: o = OR; break;
-		case OXor: o = XOR; break;
-		case OShl:
-		case OUShr:
-		case OSShr:
-			if( !b->current || b->current->kind != RCPU || b->current->id != Ecx ) {
-				scratch(REG_AT(Ecx));
-				op(ctx,MOV,REG_AT(Ecx),pb,is64);
-				RLOCK(REG_AT(Ecx));
-				pa = fetch(a);
-			} else
-				RLOCK(b->current);
-			if( pa->kind != RCPU ) {
-				pa = alloc_reg(ctx, RCPU);
-				op(ctx,MOV,pa,fetch(a), is64);
-			}
-			op(ctx,bop == OShl ? SHL : (bop == OUShr ? SHR : SAR), pa, UNUSED,is64);
-			if( dst ) store(ctx, dst, pa, true);
-			return pa;
-		case OSDiv:
-		case OUDiv:
-		case OSMod:
-		case OUMod:
-			{
-				preg *out = bop == OSMod || bop == OUMod ? REG_AT(Edx) : PEAX;
-				preg *r = pb;
-				preg p;
-				int jz, jz1 = 0, jend;
-				if( pa->kind == RCPU && pa->id == Eax ) RLOCK(pa);
-				// ensure b in CPU reg and not in Eax/Edx (for UI8/UI16)
-				if( pb->kind != RCPU || (pb->id == Eax || pb->id == Edx) ) {
-					scratch(REG_AT(Ecx));
-					scratch(pb);
-					load(ctx,REG_AT(Ecx),b);
-					r = REG_AT(Ecx);
-				}
-				// integer div 0 => 0
-				op(ctx,TEST,r,r,is64);
-				XJump_small(JZero, jz);
-				// Prevent MIN/-1 overflow exception
-				// OSMod: r = (b == 0 || b == -1) ? 0 : a % b
-				// OSDiv: r = (b == 0 || b == -1) ? a * b : a / b
-				if( bop == OSMod || bop == OSDiv ) {
-					op(ctx, CMP, r, pconst(&p,-1), is64);
-					XJump_small(JEq, jz1);
-				}
-				pa = fetch(a);
-				if( pa->kind != RCPU || pa->id != Eax ) {
-					scratch(PEAX);
-					scratch(pa);
-					load(ctx,PEAX,a);
-				}
-				scratch(REG_AT(Edx));
-				scratch(REG_AT(Eax));
-				if( bop == OUDiv || bop == OUMod )
-					op(ctx, XOR, REG_AT(Edx), REG_AT(Edx), is64);
-				else
-					op(ctx, CDQ, UNUSED, UNUSED, is64); // sign-extend Eax into Eax:Edx
-				op(ctx, bop == OUDiv || bop == OUMod ? DIV : IDIV, r, UNUSED, is64);
-				XJump_small(JAlways, jend);
-				patch_jump(ctx, jz);
-				patch_jump(ctx, jz1);
-				if( bop != OSDiv ) {
-					op(ctx, XOR, out, out, is64);
-				} else {
-					load(ctx, out, a);
-					op(ctx, IMUL, out, r, is64);
-				}
-				patch_jump(ctx, jend);
-				if( dst ) store(ctx, dst, out, true);
-				return out;
-			}
-		case OJSLt:
-		case OJSGte:
-		case OJSLte:
-		case OJSGt:
-		case OJULt:
-		case OJUGte:
-		case OJEq:
-		case OJNotEq:
-			switch( a->t->kind ) {
-			case HUI8:
-			case HBOOL:
-				o = CMP8;
-				break;
-			case HUI16:
-				o = CMP16;
-				break;
-			default:
-				o = CMP;
-				break;
-			}
-			break;
-		default:
-			printf("%s\n", hl_op_name(bop));
-			ASSERT(bop);
-		}
-	}
-	switch( RTYPE(a) ) {
-	case HI32:
-	case HUI8:
-	case HUI16:
-	case HBOOL:
-#	ifndef HL_64
-	case HDYNOBJ:
-	case HVIRTUAL:
-	case HOBJ:
-	case HSTRUCT:
-	case HFUN:
-	case HMETHOD:
-	case HBYTES:
-	case HNULL:
-	case HENUM:
-	case HDYN:
-	case HTYPE:
-	case HABSTRACT:
-	case HARRAY:
-#	endif
-		switch( ID2(pa->kind, pb->kind) ) {
-		case ID2(RCPU,RCPU):
-		case ID2(RCPU,RSTACK):
-			op32(ctx, o, pa, pb);
-			scratch(pa);
-			out = pa;
-			break;
-		case ID2(RSTACK,RCPU):
-			if( dst == a && o != IMUL ) {
-				op32(ctx, o, pa, pb);
-				dst = NULL;
-				out = pa;
-			} else {
-				alloc_cpu(ctx,a, true);
-				return op_binop(ctx,dst,a,b,bop);
-			}
-			break;
-		case ID2(RSTACK,RSTACK):
-			alloc_cpu(ctx, a, true);
-			return op_binop(ctx, dst, a, b, bop);
-		default:
-			printf("%s(%d,%d)\n", hl_op_name(bop), pa->kind, pb->kind);
-			ASSERT(ID2(pa->kind, pb->kind));
-		}
-		if( dst ) store(ctx, dst, out, true);
-		return out;
-#	ifdef HL_64
-	case HOBJ:
-	case HSTRUCT:
-	case HDYNOBJ:
-	case HVIRTUAL:
-	case HFUN:
-	case HMETHOD:
-	case HBYTES:
-	case HNULL:
-	case HENUM:
-	case HDYN:
-	case HTYPE:
-	case HABSTRACT:
-	case HARRAY:
-	case HI64:
-	case HGUID:
-		switch( ID2(pa->kind, pb->kind) ) {
-		case ID2(RCPU,RCPU):
-		case ID2(RCPU,RSTACK):
-			op64(ctx, o, pa, pb);
-			scratch(pa);
-			out = pa;
-			break;
-		case ID2(RSTACK,RCPU):
-			if( dst == a && OP_FORMS[o].mem_r ) {
-				op64(ctx, o, pa, pb);
-				dst = NULL;
-				out = pa;
-			} else {
-				alloc_cpu(ctx,a, true);
-				return op_binop(ctx,dst,a,b,bop);
-			}
-			break;
-		case ID2(RSTACK,RSTACK):
-			alloc_cpu(ctx, a, true);
-			return op_binop(ctx, dst, a, b, bop);
-		default:
-			printf("%s(%d,%d)\n", hl_op_name(bop), pa->kind, pb->kind);
-			ASSERT(ID2(pa->kind, pb->kind));
-		}
-		if( dst ) store(ctx, dst, out, true);
-		return out;
-#	endif
-	case HF64:
-	case HF32:
-		pa = alloc_fpu(ctx, a, true);
-		pb = alloc_fpu(ctx, b, true);
-		switch( ID2(pa->kind, pb->kind) ) {
-		case ID2(RFPU,RFPU):
-			op64(ctx,o,pa,pb);
-			if( (o == COMISD || o == COMISS) && bop != OJSGt ) {
-				int jnotnan;
-				XJump_small(JNParity,jnotnan);
-				switch( bop ) {
-				case OJSLt:
-				case OJNotLt:
-					{
-						preg *r = alloc_reg(ctx,RCPU);
-						// set CF=0, ZF=1
-						op64(ctx,XOR,r,r);
-						RUNLOCK(r);
-						break;
-					}
-				case OJSGte:
-				case OJNotGte:
-					{
-						preg *r = alloc_reg(ctx,RCPU);
-						// set ZF=0, CF=1
-						op64(ctx,XOR,r,r);
-						op64(ctx,CMP,r,PESP);
-						RUNLOCK(r);
-						break;
-					}
-					break;
-				case OJNotEq:
-				case OJEq:
-					// set ZF=0, CF=?
-				case OJSLte:
-					// set ZF=0, CF=0
-					op64(ctx,TEST,PESP,PESP);
-					break;
-				default:
-					ASSERT(bop);
-				}
-				patch_jump(ctx,jnotnan);
-			}
-			scratch(pa);
-			out = pa;
-			break;
-		default:
-			printf("%s(%d,%d)\n", hl_op_name(bop), pa->kind, pb->kind);
-			ASSERT(ID2(pa->kind, pb->kind));
-		}
-		if( dst ) store(ctx, dst, out, true);
-		return out;
-	default:
-		ASSERT(RTYPE(a));
-	}
-	return NULL;
-}
-
-static int do_jump( jit_ctx *ctx, hl_op op, bool isFloat ) {
-	int j;
-	switch( op ) {
-	case OJAlways:
-		XJump(JAlways,j);
-		break;
-	case OJSGte:
-		XJump(isFloat ? JUGte : JSGte,j);
-		break;
-	case OJSGt:
-		XJump(isFloat ? JUGt : JSGt,j);
-		break;
-	case OJUGte:
-		XJump(JUGte,j);
-		break;
-	case OJSLt:
-		XJump(isFloat ? JULt : JSLt,j);
-		break;
-	case OJSLte:
-		XJump(isFloat ? JULte : JSLte,j);
-		break;
-	case OJULt:
-		XJump(JULt,j);
-		break;
-	case OJEq:
-		XJump(JEq,j);
-		break;
-	case OJNotEq:
-		XJump(JNeq,j);
-		break;
-	case OJNotLt:
-		XJump(JUGte,j);
-		break;
-	case OJNotGte:
-		XJump(JULt,j);
-		break;
-	default:
-		j = 0;
-		printf("Unknown JUMP %d\n",op);
-		break;
-	}
-	return j;
-}
-
-static void register_jump( jit_ctx *ctx, int pos, int target ) {
-	jlist *j = (jlist*)hl_malloc(&ctx->falloc, sizeof(jlist));
-	j->pos = pos;
-	j->target = target;
-	j->next = ctx->jumps;
-	ctx->jumps = j;
-	if( target != 0 && ctx->opsPos[target] == 0 )
-		ctx->opsPos[target] = -1;
-}
-
-#define HDYN_VALUE 8
-
-static void dyn_value_compare( jit_ctx *ctx, preg *a, preg *b, hl_type *t ) {
-	preg p;
-	switch( t->kind ) {
-	case HUI8:
-	case HBOOL:
-		op32(ctx,MOV8,a,pmem(&p,a->id,HDYN_VALUE));
-		op32(ctx,MOV8,b,pmem(&p,b->id,HDYN_VALUE));
-		op64(ctx,CMP8,a,b);
-		break;
-	case HUI16:
-		op32(ctx,MOV16,a,pmem(&p,a->id,HDYN_VALUE));
-		op32(ctx,MOV16,b,pmem(&p,b->id,HDYN_VALUE));
-		op64(ctx,CMP16,a,b);
-		break;
-	case HI32:
-		op32(ctx,MOV,a,pmem(&p,a->id,HDYN_VALUE));
-		op32(ctx,MOV,b,pmem(&p,b->id,HDYN_VALUE));
-		op64(ctx,CMP,a,b);
-		break;
-	case HF32:
-		{
-			preg *fa = alloc_reg(ctx, RFPU);
-			preg *fb = alloc_reg(ctx, RFPU);
-			op64(ctx,MOVSS,fa,pmem(&p,a->id,HDYN_VALUE));
-			op64(ctx,MOVSS,fb,pmem(&p,b->id,HDYN_VALUE));
-			op64(ctx,COMISD,fa,fb);
-		}
-		break;
-	case HF64:
-		{
-			preg *fa = alloc_reg(ctx, RFPU);
-			preg *fb = alloc_reg(ctx, RFPU);
-			op64(ctx,MOVSD,fa,pmem(&p,a->id,HDYN_VALUE));
-			op64(ctx,MOVSD,fb,pmem(&p,b->id,HDYN_VALUE));
-			op64(ctx,COMISD,fa,fb);
-		}
-		break;
-	case HI64:
-	default:
-		// ptr comparison
-		op64(ctx,MOV,a,pmem(&p,a->id,HDYN_VALUE));
-		op64(ctx,MOV,b,pmem(&p,b->id,HDYN_VALUE));
-		op64(ctx,CMP,a,b);
-		break;
-	}
-}
-
-static void op_jump( jit_ctx *ctx, vreg *a, vreg *b, hl_opcode *op, int targetPos ) {
-	if( a->t->kind == HDYN || b->t->kind == HDYN || a->t->kind == HFUN || b->t->kind == HFUN ) {
-		int args[] = { a->stack.id, b->stack.id };
-		int size = prepare_call_args(ctx,2,args,ctx->vregs,0);
-		call_native(ctx,hl_dyn_compare,size);
-		if( op->op == OJSGt || op->op == OJSGte ) {
-			preg p;
-			int jinvalid;
-			op32(ctx,CMP,PEAX,pconst(&p,hl_invalid_comparison));
-			XJump_small(JEq,jinvalid);
-			op32(ctx,TEST,PEAX,PEAX);
-			register_jump(ctx,do_jump(ctx,op->op, IS_FLOAT(a)),targetPos);
-			patch_jump(ctx,jinvalid);
-			return;
-		}
-		op32(ctx,TEST,PEAX,PEAX);
-	} else switch( a->t->kind ) {
-	case HTYPE:
-		{
-			int args[] = { a->stack.id, b->stack.id };
-			int size = prepare_call_args(ctx,2,args,ctx->vregs,0);
-			preg p;
-			call_native(ctx,hl_same_type,size);
-			op64(ctx,CMP8,PEAX,pconst(&p,1));
-		}
-		break;
-	case HNULL:
-		{
-			preg *pa = hl_type_size(a->t->tparam) == 1 ? alloc_cpu8(ctx,a,true) : alloc_cpu(ctx,a,true);
-			preg *pb = hl_type_size(b->t->tparam) == 1 ? alloc_cpu8(ctx,b,true) : alloc_cpu(ctx,b,true);
-			if( op->op == OJEq ) {
-				// if( a == b || (a && b && a->v == b->v) ) goto
-				int ja, jb;
-				// if( a != b && (!a || !b || a->v != b->v) ) goto
-				op64(ctx,CMP,pa,pb);
-				register_jump(ctx,do_jump(ctx,OJEq,false),targetPos);
-				op64(ctx,TEST,pa,pa);
-				XJump_small(JZero,ja);
-				op64(ctx,TEST,pb,pb);
-				XJump_small(JZero,jb);
-				dyn_value_compare(ctx,pa,pb,a->t->tparam);
-				register_jump(ctx,do_jump(ctx,OJEq,false),targetPos);
-				scratch(pa);
-				scratch(pb);
-				patch_jump(ctx,ja);
-				patch_jump(ctx,jb);
-			} else if( op->op == OJNotEq ) {
-				int jeq, jcmp;
-				// if( a != b && (!a || !b || a->v != b->v) ) goto
-				op64(ctx,CMP,pa,pb);
-				XJump_small(JEq,jeq);
-				op64(ctx,TEST,pa,pa);
-				register_jump(ctx,do_jump(ctx,OJEq,false),targetPos);
-				op64(ctx,TEST,pb,pb);
-				register_jump(ctx,do_jump(ctx,OJEq,false),targetPos);
-				dyn_value_compare(ctx,pa,pb,a->t->tparam);
-				XJump_small(JZero,jcmp);
-				scratch(pa);
-				scratch(pb);
-				register_jump(ctx,do_jump(ctx,OJNotEq,false),targetPos);
-				patch_jump(ctx,jcmp);
-				patch_jump(ctx,jeq);
-			} else
-				ASSERT(op->op);
-			return;
-		}
-	case HVIRTUAL:
-		{
-			preg p;
-			preg *pa = alloc_cpu(ctx,a,true);
-			preg *pb = alloc_cpu(ctx,b,true);
-			int ja,jb,jav,jbv,jvalue;
-			if( b->t->kind == HOBJ ) {
-				if( op->op == OJEq ) {
-					// if( a ? (b && a->value == b) : (b == NULL) ) goto
-					op64(ctx,TEST,pa,pa);
-					XJump_small(JZero,ja);
-					op64(ctx,TEST,pb,pb);
-					XJump_small(JZero,jb);
-					op64(ctx,MOV,pa,pmem(&p,pa->id,HL_WSIZE));
-					op64(ctx,CMP,pa,pb);
-					XJump_small(JAlways,jvalue);
-					patch_jump(ctx,ja);
-					op64(ctx,TEST,pb,pb);
-					patch_jump(ctx,jvalue);
-					register_jump(ctx,do_jump(ctx,OJEq,false),targetPos);
-					patch_jump(ctx,jb);
-				} else if( op->op == OJNotEq ) {
-					// if( a ? (b == NULL || a->value != b) : (b != NULL) ) goto
-					op64(ctx,TEST,pa,pa);
-					XJump_small(JZero,ja);
-					op64(ctx,TEST,pb,pb);
-					register_jump(ctx,do_jump(ctx,OJEq,false),targetPos);
-					op64(ctx,MOV,pa,pmem(&p,pa->id,HL_WSIZE));
-					op64(ctx,CMP,pa,pb);
-					XJump_small(JAlways,jvalue);
-					patch_jump(ctx,ja);
-					op64(ctx,TEST,pb,pb);
-					patch_jump(ctx,jvalue);
-					register_jump(ctx,do_jump(ctx,OJNotEq,false),targetPos);
-				} else
-					ASSERT(op->op);
-				scratch(pa);
-				return;
-			}
-			op64(ctx,CMP,pa,pb);
-			if( op->op == OJEq ) {
-				// if( a == b || (a && b && a->value && b->value && a->value == b->value) ) goto
-				register_jump(ctx,do_jump(ctx,OJEq, false),targetPos);
-				op64(ctx,TEST,pa,pa);
-				XJump_small(JZero,ja);
-				op64(ctx,TEST,pb,pb);
-				XJump_small(JZero,jb);
-				op64(ctx,MOV,pa,pmem(&p,pa->id,HL_WSIZE));
-				op64(ctx,TEST,pa,pa);
-				XJump_small(JZero,jav);
-				op64(ctx,MOV,pb,pmem(&p,pb->id,HL_WSIZE));
-				op64(ctx,TEST,pb,pb);
-				XJump_small(JZero,jbv);
-				op64(ctx,CMP,pa,pb);
-				XJump_small(JNeq,jvalue);
-				register_jump(ctx,do_jump(ctx,OJEq, false),targetPos);
-				patch_jump(ctx,ja);
-				patch_jump(ctx,jb);
-				patch_jump(ctx,jav);
-				patch_jump(ctx,jbv);
-				patch_jump(ctx,jvalue);
-			} else if( op->op == OJNotEq ) {
-				int jnext;
-				// if( a != b && (!a || !b || !a->value || !b->value || a->value != b->value) ) goto
-				XJump_small(JEq,jnext);
-				op64(ctx,TEST,pa,pa);
-				XJump_small(JZero,ja);
-				op64(ctx,TEST,pb,pb);
-				XJump_small(JZero,jb);
-				op64(ctx,MOV,pa,pmem(&p,pa->id,HL_WSIZE));
-				op64(ctx,TEST,pa,pa);
-				XJump_small(JZero,jav);
-				op64(ctx,MOV,pb,pmem(&p,pb->id,HL_WSIZE));
-				op64(ctx,TEST,pb,pb);
-				XJump_small(JZero,jbv);
-				op64(ctx,CMP,pa,pb);
-				XJump_small(JEq,jvalue);
-				patch_jump(ctx,ja);
-				patch_jump(ctx,jb);
-				patch_jump(ctx,jav);
-				patch_jump(ctx,jbv);
-				register_jump(ctx,do_jump(ctx,OJAlways, false),targetPos);
-				patch_jump(ctx,jnext);
-				patch_jump(ctx,jvalue);
-			} else
-				ASSERT(op->op);
-			scratch(pa);
-			scratch(pb);
-			return;
-		}
-		break;
-	case HOBJ:
-	case HSTRUCT:
-		if( b->t->kind == HVIRTUAL ) {
-			op_jump(ctx,b,a,op,targetPos); // inverse
-			return;
-		}
-		if( hl_get_obj_rt(a->t)->compareFun ) {
-			preg *pa = alloc_cpu(ctx,a,true);
-			preg *pb = alloc_cpu(ctx,b,true);
-			preg p;
-			int jeq, ja, jb, jcmp;
-			int args[] = { a->stack.id, b->stack.id };
-			switch( op->op ) {
-			case OJEq:
-				// if( a == b || (a && b && cmp(a,b) == 0) ) goto
-				op64(ctx,CMP,pa,pb);
-				XJump_small(JEq,jeq);
-				op64(ctx,TEST,pa,pa);
-				XJump_small(JZero,ja);
-				op64(ctx,TEST,pb,pb);
-				XJump_small(JZero,jb);
-				op_call_fun(ctx,NULL,(int)(int_val)a->t->obj->rt->compareFun,2,args);
-				op32(ctx,TEST,PEAX,PEAX);
-				XJump_small(JNotZero,jcmp);
-				patch_jump(ctx,jeq);
-				register_jump(ctx,do_jump(ctx,OJAlways,false),targetPos);
-				patch_jump(ctx,ja);
-				patch_jump(ctx,jb);
-				patch_jump(ctx,jcmp);
-				break;
-			case OJNotEq:
-				// if( a != b && (!a || !b || cmp(a,b) != 0) ) goto
-				op64(ctx,CMP,pa,pb);
-				XJump_small(JEq,jeq);
-				op64(ctx,TEST,pa,pa);
-				register_jump(ctx,do_jump(ctx,OJEq,false),targetPos);
-				op64(ctx,TEST,pb,pb);
-				register_jump(ctx,do_jump(ctx,OJEq,false),targetPos);
-
-				op_call_fun(ctx,NULL,(int)(int_val)a->t->obj->rt->compareFun,2,args);
-				op32(ctx,TEST,PEAX,PEAX);
-				XJump_small(JZero,jcmp);
-
-				register_jump(ctx,do_jump(ctx,OJNotEq,false),targetPos);
-				patch_jump(ctx,jcmp);
-				patch_jump(ctx,jeq);
-				break;
-			default:
-				// if( a && b && cmp(a,b) ?? 0 ) goto
-				op64(ctx,TEST,pa,pa);
-				XJump_small(JZero,ja);
-				op64(ctx,TEST,pb,pb);
-				XJump_small(JZero,jb);
-				op_call_fun(ctx,NULL,(int)(int_val)a->t->obj->rt->compareFun,2,args);
-				op32(ctx,CMP,PEAX,pconst(&p,0));
-				register_jump(ctx,do_jump(ctx,op->op,false),targetPos);
-				patch_jump(ctx,ja);
-				patch_jump(ctx,jb);
-				break;
-			}
-			return;
-		}
-		// fallthrough
-	default:
-		// make sure we have valid 8 bits registers
-		if( a->size == 1 ) alloc_cpu8(ctx,a,true);
-		if( b->size == 1 ) alloc_cpu8(ctx,b,true);
-		op_binop(ctx,NULL,a,b,op->op);
-		break;
-	}
-	register_jump(ctx,do_jump(ctx,op->op, IS_FLOAT(a)),targetPos);
-}
-
-jit_ctx *hl_jit_alloc() {
-	int i;
-	jit_ctx *ctx = (jit_ctx*)malloc(sizeof(jit_ctx));
-	if( ctx == NULL ) return NULL;
-	memset(ctx,0,sizeof(jit_ctx));
-	hl_alloc_init(&ctx->falloc);
-	hl_alloc_init(&ctx->galloc);
-	for(i=0;i<RCPU_COUNT;i++) {
-		preg *r = REG_AT(i);
-		r->id = i;
-		r->kind = RCPU;
-	}
-	for(i=0;i<RFPU_COUNT;i++) {
-		preg *r = REG_AT(XMM(i));
-		r->id = i;
-		r->kind = RFPU;
-	}
-	return ctx;
-}
-
-void hl_jit_free( jit_ctx *ctx, h_bool can_reset ) {
-	free(ctx->vregs);
-	free(ctx->opsPos);
-	free(ctx->startBuf);
-	ctx->maxRegs = 0;
-	ctx->vregs = NULL;
-	ctx->maxOps = 0;
-	ctx->opsPos = NULL;
-	ctx->startBuf = NULL;
-	ctx->bufSize = 0;
-	ctx->buf.b = NULL;
-	ctx->calls = NULL;
-	ctx->switchs = NULL;
-	ctx->closure_list = NULL;
-	hl_free(&ctx->falloc);
-	hl_free(&ctx->galloc);
-	if( !can_reset ) free(ctx);
-}
-
-static void jit_nops( jit_ctx *ctx ) {
-	while( BUF_POS() & 15 )
-		op32(ctx, NOP, UNUSED, UNUSED);
-}
-
-#define MAX_ARGS 16
-
-static void *call_jit_c2hl = NULL;
-static void *call_jit_hl2c = NULL;
-
-static void *callback_c2hl( void *_f, hl_type *t, void **args, vdynamic *ret ) {
-	/*
-		prepare stack and regs according to prepare_call_args, but by reading runtime type information
-		from the function type. The stack and regs will be setup by the trampoline function.
-	*/
-	void **f = (void**)_f;
-	unsigned char stack[MAX_ARGS * 8];
-	call_regs cregs = {0};
-	if( t->fun->nargs > MAX_ARGS )
-		hl_error("Too many arguments for dynamic call");
-	int i, size = 0, pad = 0, pos = 0;
-	for(i=0;i<t->fun->nargs;i++) {
-		hl_type *at = t->fun->args[i];
-		int creg = select_call_reg(&cregs,at,i);
-		if( creg >= 0 )
-			continue;
-		size += stack_size(at);
-	}
-	pad = (-size) & 15;
-	size += pad;
-	pos = 0;
-	for(i=0;i<t->fun->nargs;i++) {
-		// RTL
-		hl_type *at = t->fun->args[i];
-		void *v = args[i];
-		int creg = mapped_reg(&cregs,i);
-		void *store;
-		if( creg >= 0 ) {
-			if( REG_IS_FPU(creg) ) {
-				store = stack + size + CALL_NREGS * HL_WSIZE + (creg - XMM(0)) * sizeof(double);
-			} else {
-				store = stack + size + call_reg_index(creg) * HL_WSIZE;
-			}
-			switch( at->kind ) {
-			case HBOOL:
-			case HUI8:
-				*(int_val*)store = *(unsigned char*)v;
-				break;
-			case HUI16:
-				*(int_val*)store = *(unsigned short*)v;
-				break;
-			case HI32:
-				*(int_val*)store = *(int*)v;
-				break;
-			case HF32:
-				*(void**)store = 0;
-				*(float*)store = *(float*)v;
-				break;
-			case HF64:
-				*(double*)store = *(double*)v;
-				break;
-			case HI64:
-			case HGUID:
-				*(int64*)store = *(int64*)v;
-				break;
-			default:
-				*(void**)store = v;
-				break;
-			}
-		} else {
-			int tsize = stack_size(at);
-			store = stack + pos;
-			pos += tsize;
-			switch( at->kind ) {
-			case HBOOL:
-			case HUI8:
-				*(int*)store = *(unsigned char*)v;
-				break;
-			case HUI16:
-				*(int*)store = *(unsigned short*)v;
-				break;
-			case HI32:
-			case HF32:
-				*(int*)store = *(int*)v;
-				break;
-			case HF64:
-				*(double*)store = *(double*)v;
-				break;
-			case HI64:
-			case HGUID:
-				*(int64*)store = *(int64*)v;
-				break;
-			default:
-				*(void**)store = v;
-				break;
-			}
-		}
-	}
-	pos += pad;
-	pos >>= IS_64 ? 3 : 2;
-	switch( t->fun->ret->kind ) {
-	case HUI8:
-	case HUI16:
-	case HI32:
-	case HBOOL:
-		ret->v.i = ((int (*)(void *, void *, void *))call_jit_c2hl)(*f, (void**)&stack + pos, &stack);
-		return &ret->v.i;
-	case HI64:
-	case HGUID:
-		ret->v.i64 = ((int64 (*)(void *, void *, void *))call_jit_c2hl)(*f, (void**)&stack + pos, &stack);
-		return &ret->v.i64;
-	case HF32:
-		ret->v.f = ((float (*)(void *, void *, void *))call_jit_c2hl)(*f, (void**)&stack + pos, &stack);
-		return &ret->v.f;
-	case HF64:
-		ret->v.d = ((double (*)(void *, void *, void *))call_jit_c2hl)(*f, (void**)&stack + pos, &stack);
-		return &ret->v.d;
-	default:
-		return ((void *(*)(void *, void *, void *))call_jit_c2hl)(*f, (void**)&stack + pos, &stack);
-	}
-}
-
-static void jit_c2hl( jit_ctx *ctx ) {
-	//	create the function that will be called by callback_c2hl
-	//	it will make sure to prepare the stack/regs according to native calling conventions
-	int jeq, jloop, jstart;
-	preg *fptr, *stack, *stend;
-	preg p;
-
-	op64(ctx,PUSH,PEBP,UNUSED);
-	op64(ctx,MOV,PEBP,PESP);
-
-#	ifdef HL_64
-
-	fptr = REG_AT(R10);
-	stack = PEAX;
-	stend = REG_AT(R11);
-	op64(ctx, MOV, fptr, REG_AT(CALL_REGS[0]));
-	op64(ctx, MOV, stack, REG_AT(CALL_REGS[1]));
-	op64(ctx, MOV, stend, REG_AT(CALL_REGS[2]));
-
-	// set native call regs
-	int i;
-	for(i=0;i<CALL_NREGS;i++)
-		op64(ctx,MOV,REG_AT(CALL_REGS[i]),pmem(&p,stack->id,i*HL_WSIZE));
-	for(i=0;i<CALL_NREGS;i++)
-		op64(ctx,MOVSD,REG_AT(XMM(i)),pmem(&p,stack->id,(i+CALL_NREGS)*HL_WSIZE));
-
-#	else
-
-	// make sure the stack is aligned on 16 bytes
-	// the amount of push we will do afterwards is guaranteed to be a multiple of 16bytes by hl_callback
-#	ifdef HL_VCC
-	// VCC does not guarantee us an aligned stack...
-	op64(ctx,MOV,PEAX,PESP);
-	op64(ctx,AND,PEAX,pconst(&p,15));
-	op64(ctx,SUB,PESP,PEAX);
-#	else
-	op64(ctx,SUB,PESP,pconst(&p,8));
-#	endif
-
-	// mov arguments to regs
-	fptr = REG_AT(Eax);
-	stack = REG_AT(Edx);
-	stend = REG_AT(Ecx);
-	op64(ctx,MOV,fptr,pmem(&p,Ebp,HL_WSIZE*2));
-	op64(ctx,MOV,stack,pmem(&p,Ebp,HL_WSIZE*3));
-	op64(ctx,MOV,stend,pmem(&p,Ebp,HL_WSIZE*4));
-
-#	endif
-
-	// push stack args
-	jstart = BUF_POS();
-	op64(ctx,CMP,stack,stend);
-	XJump(JEq,jeq);
-	op64(ctx,SUB,stack,pconst(&p,HL_WSIZE));
-	op64(ctx,PUSH,pmem(&p,stack->id,0),UNUSED);
-	XJump(JAlways,jloop);
-	patch_jump(ctx,jeq);
-	patch_jump_to(ctx, jloop, jstart);
-
-	op_call(ctx,fptr,0);
-
-	// cleanup and ret
-	op64(ctx,MOV,PESP,PEBP);
-	op64(ctx,POP,PEBP, UNUSED);
-	op64(ctx,RET,UNUSED,UNUSED);
-}
-
-static vdynamic *jit_wrapper_call( vclosure_wrapper *c, char *stack_args, void **regs ) {
-	vdynamic *args[MAX_ARGS];
-	int i;
-	int nargs = c->cl.t->fun->nargs;
-	call_regs cregs = {0};
-	if( nargs > MAX_ARGS )
-		hl_error("Too many arguments for wrapped call");
-	cregs.nextCpu++; // skip fptr in HL64 - was passed as arg0
-	for(i=0;i<nargs;i++) {
-		hl_type *t = c->cl.t->fun->args[i];
-		int creg = select_call_reg(&cregs,t,i);
-		if( creg < 0 ) {
-			args[i] = hl_is_dynamic(t) ? *(vdynamic**)stack_args : hl_make_dyn(stack_args,t);
-			stack_args += stack_size(t);
-		} else if( hl_is_dynamic(t) ) {
-			args[i] = *(vdynamic**)(regs + call_reg_index(creg));
-		} else if( t->kind == HF32 || t->kind == HF64 ) {
-			args[i] = hl_make_dyn(regs + CALL_NREGS + creg - XMM(0),&hlt_f64);
-		} else {
-			args[i] = hl_make_dyn(regs + call_reg_index(creg),t);
-		}
-	}
-	return hl_dyn_call(c->wrappedFun,args,nargs);
-}
-
-static void *jit_wrapper_ptr( vclosure_wrapper *c, char *stack_args, void **regs ) {
-	vdynamic *ret = jit_wrapper_call(c, stack_args, regs);
-	hl_type *tret = c->cl.t->fun->ret;
-	switch( tret->kind ) {
-	case HVOID:
-		return NULL;
-	case HUI8:
-	case HUI16:
-	case HI32:
-	case HBOOL:
-		return (void*)(int_val)hl_dyn_casti(&ret,&hlt_dyn,tret);
-	case HI64:
-	case HGUID:
-		return (void*)(int_val)hl_dyn_casti64(&ret,&hlt_dyn);
-	default:
-		return hl_dyn_castp(&ret,&hlt_dyn,tret);
-	}
-}
-
-static double jit_wrapper_d( vclosure_wrapper *c, char *stack_args, void **regs ) {
-	vdynamic *ret = jit_wrapper_call(c, stack_args, regs);
-	return hl_dyn_castd(&ret,&hlt_dyn);
-}
-
-static void jit_hl2c( jit_ctx *ctx ) {
-	// create a function that is called with a vclosure_wrapper* and native args
-	// and pack and pass the args to callback_hl2c
-	preg p;
-	int jfloat1, jfloat2, jexit;
-	hl_type_fun *ft = NULL;
-	int size;
-#	ifdef HL_64
-	preg *cl = REG_AT(CALL_REGS[0]);
-	preg *tmp = REG_AT(CALL_REGS[1]);
-#	else
-	preg *cl = REG_AT(Ecx);
-	preg *tmp = REG_AT(Edx);
-#	endif
-
-	op64(ctx,PUSH,PEBP,UNUSED);
-	op64(ctx,MOV,PEBP,PESP);
-
-#	ifdef HL_64
-	// push registers
-	int i;
-	op64(ctx,SUB,PESP,pconst(&p,CALL_NREGS*8));
-	for(i=0;i<CALL_NREGS;i++)
-		op64(ctx,MOVSD,pmem(&p,Esp,i*8),REG_AT(XMM(i)));
-	for(i=0;i<CALL_NREGS;i++)
-		op64(ctx,PUSH,REG_AT(CALL_REGS[CALL_NREGS - 1 - i]),UNUSED);
-#	endif
-
-	// opcodes for:
-	//		switch( arg0->t->fun->ret->kind ) {
-	//		case HF32: case HF64: return jit_wrapper_d(arg0,&args);
-	//		default: return jit_wrapper_ptr(arg0,&args);
-	//		}
-	if( !IS_64 )
-		op64(ctx,MOV,cl,pmem(&p,Ebp,HL_WSIZE*2)); // load arg0
-	op64(ctx,MOV,tmp,pmem(&p,cl->id,0)); // ->t
-	op64(ctx,MOV,tmp,pmem(&p,tmp->id,HL_WSIZE)); // ->fun
-	op64(ctx,MOV,tmp,pmem(&p,tmp->id,(int)(int_val)&ft->ret)); // ->ret
-	op32(ctx,MOV,tmp,pmem(&p,tmp->id,0)); // -> kind
-
-	op32(ctx,CMP,tmp,pconst(&p,HF64));
-	XJump_small(JEq,jfloat1);
-	op32(ctx,CMP,tmp,pconst(&p,HF32));
-	XJump_small(JEq,jfloat2);
-
-	// 64 bits : ESP + EIP (+WIN64PAD)
-	// 32 bits : ESP + EIP + PARAM0
-	int args_pos = IS_64 ? ((IS_WINCALL64 ? 32 : 0) + HL_WSIZE * 2) : (HL_WSIZE*3);
-
-	size = begin_native_call(ctx,3);
-	op64(ctx, LEA, tmp, pmem(&p,Ebp,-HL_WSIZE*CALL_NREGS*2));
-	set_native_arg(ctx, tmp);
-	op64(ctx, LEA, tmp, pmem(&p,Ebp,args_pos));
-	set_native_arg(ctx, tmp);
-	set_native_arg(ctx, cl);
-	call_native(ctx, jit_wrapper_ptr, size);
-	XJump_small(JAlways, jexit);
-
-	patch_jump(ctx,jfloat1);
-	patch_jump(ctx,jfloat2);
-	size = begin_native_call(ctx,3);
-	op64(ctx, LEA, tmp, pmem(&p,Ebp,-HL_WSIZE*CALL_NREGS*2));
-	set_native_arg(ctx, tmp);
-	op64(ctx, LEA, tmp, pmem(&p,Ebp,args_pos));
-	set_native_arg(ctx, tmp);
-	set_native_arg(ctx, cl);
-	call_native(ctx, jit_wrapper_d, size);
-
-	patch_jump(ctx,jexit);
-	op64(ctx,MOV,PESP,PEBP);
-	op64(ctx,POP,PEBP, UNUSED);
-	op64(ctx,RET,UNUSED,UNUSED);
-}
-
-#ifdef JIT_CUSTOM_LONGJUMP
-// Win64 debug CRT performs a Rtl stack check in debug mode, preventing from
-// using longjump. This in an alternate implementation that follows the native
-// setjump storage.
-//
-// Another more reliable way of handling this would be to use RtlAddFunctionTable
-// but some platform does not have it.
-static void jit_longjump( jit_ctx *ctx ) {
-	preg *buf = REG_AT(CALL_REGS[0]);
-	preg *ret = REG_AT(CALL_REGS[1]);
-	preg p;
-	int i;
-	op64(ctx,MOV,PEAX,ret); // return value
-	op64(ctx,MOV,REG_AT(Edx),pmem(&p,buf->id,0x0));
-	op64(ctx,MOV,REG_AT(Ebx),pmem(&p,buf->id,0x8));
-	op64(ctx,MOV,REG_AT(Esp),pmem(&p,buf->id,0x10));
-	op64(ctx,MOV,REG_AT(Ebp),pmem(&p,buf->id,0x18));
-	op64(ctx,MOV,REG_AT(Esi),pmem(&p,buf->id,0x20));
-	op64(ctx,MOV,REG_AT(Edi),pmem(&p,buf->id,0x28));
-	op64(ctx,MOV,REG_AT(R12),pmem(&p,buf->id,0x30));
-	op64(ctx,MOV,REG_AT(R13),pmem(&p,buf->id,0x38));
-	op64(ctx,MOV,REG_AT(R14),pmem(&p,buf->id,0x40));
-	op64(ctx,MOV,REG_AT(R15),pmem(&p,buf->id,0x48));
-	op64(ctx,LDMXCSR,pmem(&p,buf->id,0x58), UNUSED);
-	op64(ctx,FLDCW,pmem(&p,buf->id,0x5C), UNUSED);
-	for(i=0;i<10;i++)
-		op64(ctx,MOVSD,REG_AT(XMM(i+6)),pmem(&p,buf->id,0x60 + i * 16));
-	op64(ctx,PUSH,pmem(&p,buf->id,0x50),UNUSED);
-	op64(ctx,RET,UNUSED,UNUSED);
-}
-#endif
-
-static void jit_fail( uchar *msg ) {
-	if( msg == NULL ) {
-		hl_debug_break();
-		msg = USTR("assert");
-	}
-	vdynamic *d = hl_alloc_dynamic(&hlt_bytes);
-	d->v.ptr = msg;
-	hl_throw(d);
-}
-
-static void jit_null_access( jit_ctx *ctx ) {
-	op64(ctx,PUSH,PEBP,UNUSED);
-	op64(ctx,MOV,PEBP,PESP);
-	int_val arg = (int_val)USTR("Null access");
-	call_native_consts(ctx, jit_fail, &arg, 1);
-}
-
-static void jit_null_fail( int fhash ) {
-	vbyte *field = hl_field_name(fhash);
-	hl_buffer *b = hl_alloc_buffer();
-	hl_buffer_str(b, USTR("Null access ."));
-	hl_buffer_str(b, (uchar*)field);
-	vdynamic *d = hl_alloc_dynamic(&hlt_bytes);
-	d->v.ptr = hl_buffer_content(b,NULL);
-	hl_throw(d);
-}
-
-static void jit_null_field_access( jit_ctx *ctx ) {
-	preg p;
-	op64(ctx,PUSH,PEBP,UNUSED);
-	op64(ctx,MOV,PEBP,PESP);
-	int size = begin_native_call(ctx, 1);
-	int args_pos = (IS_WINCALL64 ? 32 : 0) + HL_WSIZE*2;
-	set_native_arg(ctx, pmem(&p,Ebp,args_pos));
-	call_native(ctx,jit_null_fail,size);
-}
-
-static void jit_assert( jit_ctx *ctx ) {
-	op64(ctx,PUSH,PEBP,UNUSED);
-	op64(ctx,MOV,PEBP,PESP);
-	int_val arg = 0;
-	call_native_consts(ctx, jit_fail, &arg, 1);
-}
-
-static int jit_build( jit_ctx *ctx, void (*fbuild)( jit_ctx *) ) {
-	int pos;
-	jit_buf(ctx);
-	jit_nops(ctx);
-	pos = BUF_POS();
-	fbuild(ctx);
-	int endPos = BUF_POS();
-	jit_nops(ctx);
-#ifdef WIN64_UNWIND_TABLES
-	int fid = ctx->nunwind++;
-	ctx->unwind_table[fid].BeginAddress = pos;
-	ctx->unwind_table[fid].EndAddress = endPos;
-	ctx->unwind_table[fid].UnwindData = ctx->unwind_offset;
-#endif
-	return pos;
-}
-
-static void hl_jit_init_module( jit_ctx *ctx, hl_module *m ) {
-	int i;
-	ctx->m = m;
-	if( m->code->hasdebug ) {
-		ctx->debug = (hl_debug_infos*)malloc(sizeof(hl_debug_infos) * m->code->nfunctions);
-		memset(ctx->debug, -1, sizeof(hl_debug_infos) * m->code->nfunctions);
-	}
-	for(i=0;i<m->code->nfloats;i++) {
-		jit_buf(ctx);
-		*ctx->buf.d++ = m->code->floats[i];
-	}
-#ifdef WIN64_UNWIND_TABLES
-	jit_buf(ctx);
-	ctx->unwind_offset = BUF_POS();
-	write_unwind_data(ctx);
-
-	ctx->unwind_table = malloc(sizeof(RUNTIME_FUNCTION) * (m->code->nfunctions + 10));
-	memset(ctx->unwind_table, 0, sizeof(RUNTIME_FUNCTION) * (m->code->nfunctions + 10));
-#endif
-}
-
-void hl_jit_init( jit_ctx *ctx, hl_module *m ) {
-	hl_jit_init_module(ctx,m);
-	ctx->c2hl = jit_build(ctx, jit_c2hl);
-	ctx->hl2c = jit_build(ctx, jit_hl2c);
-#	ifdef JIT_CUSTOM_LONGJUMP
-	ctx->longjump = jit_build(ctx, jit_longjump);
-#	endif
-	ctx->static_functions[0] = (void*)(int_val)jit_build(ctx,jit_null_access);
-	ctx->static_functions[1] = (void*)(int_val)jit_build(ctx,jit_assert);
-	ctx->static_functions[2] = (void*)(int_val)jit_build(ctx,jit_null_field_access);
-}
-
-void hl_jit_reset( jit_ctx *ctx, hl_module *m ) {
-	ctx->debug = NULL;
-	hl_jit_init_module(ctx,m);
-}
-
-static void *get_dyncast( hl_type *t ) {
-	switch( t->kind ) {
-	case HF32:
-		return hl_dyn_castf;
-	case HF64:
-		return hl_dyn_castd;
-	case HI64:
-	case HGUID:
-		return hl_dyn_casti64;
-	case HI32:
-	case HUI16:
-	case HUI8:
-	case HBOOL:
-		return hl_dyn_casti;
-	default:
-		return hl_dyn_castp;
-	}
-}
-
-static void *get_dynset( hl_type *t ) {
-	switch( t->kind ) {
-	case HF32:
-		return hl_dyn_setf;
-	case HF64:
-		return hl_dyn_setd;
-	case HI64:
-	case HGUID:
-		return hl_dyn_seti64;
-	case HI32:
-	case HUI16:
-	case HUI8:
-	case HBOOL:
-		return hl_dyn_seti;
-	default:
-		return hl_dyn_setp;
-	}
-}
-
-static void *get_dynget( hl_type *t ) {
-	switch( t->kind ) {
-	case HF32:
-		return hl_dyn_getf;
-	case HF64:
-		return hl_dyn_getd;
-	case HI64:
-	case HGUID:
-		return hl_dyn_geti64;
-	case HI32:
-	case HUI16:
-	case HUI8:
-	case HBOOL:
-		return hl_dyn_geti;
-	default:
-		return hl_dyn_getp;
-	}
-}
-
-static double uint_to_double( unsigned int v ) {
-	return v;
-}
-
-static vclosure *alloc_static_closure( jit_ctx *ctx, int fid ) {
-	hl_module *m = ctx->m;
-	vclosure *c = hl_malloc(&m->ctx.alloc,sizeof(vclosure));
-	int fidx = m->functions_indexes[fid];
-	c->hasValue = 0;
-	if( fidx >= m->code->nfunctions ) {
-		// native
-		c->t = m->code->natives[fidx - m->code->nfunctions].t;
-		c->fun = m->functions_ptrs[fid];
-		c->value = NULL;
-	} else {
-		c->t = m->code->functions[fidx].type;
-		c->fun = (void*)(int_val)fid;
-		c->value = ctx->closure_list;
-		ctx->closure_list = c;
-	}
-	return c;
-}
-
-static void make_dyn_cast( jit_ctx *ctx, vreg *dst, vreg *v ) {
-	int size;
-	preg p;
-	preg *tmp;
-	if( v->t->kind == HNULL && v->t->tparam->kind == dst->t->kind ) {
-		int jnull, jend;
-		preg *out;
-		switch( dst->t->kind ) {
-		case HUI8:
-		case HUI16:
-		case HI32:
-		case HBOOL:
-		case HI64:
-		case HGUID:
-			tmp = alloc_cpu(ctx, v, true);
-			op64(ctx, TEST, tmp, tmp);
-			XJump_small(JZero, jnull);
-			op64(ctx, MOV, tmp, pmem(&p,tmp->id,8));
-			XJump_small(JAlways, jend);
-			patch_jump(ctx, jnull);
-			op64(ctx, XOR, tmp, tmp);
-			patch_jump(ctx, jend);
-			store(ctx, dst, tmp, true);
-			return;
-		case HF32:
-		case HF64:
-			tmp = alloc_cpu(ctx, v, true);
-			out = alloc_fpu(ctx, dst, false);
-			op64(ctx, TEST, tmp, tmp);
-			XJump_small(JZero, jnull);
-			op64(ctx, dst->t->kind == HF32 ? MOVSS : MOVSD, out, pmem(&p,tmp->id,8));
-			XJump_small(JAlways, jend);
-			patch_jump(ctx, jnull);
-			op64(ctx, XORPD, out, out);
-			patch_jump(ctx, jend);
-			store(ctx, dst, out, true);
-			return;
-		default:
-			break;
-		}
-	}
-	switch( dst->t->kind ) {
-	case HF32:
-	case HF64:
-	case HI64:
-	case HGUID:
-		size = begin_native_call(ctx, 2);
-		set_native_arg(ctx, pconst64(&p,(int_val)v->t));
-		break;
-	default:
-		size = begin_native_call(ctx, 3);
-		set_native_arg(ctx, pconst64(&p,(int_val)dst->t));
-		set_native_arg(ctx, pconst64(&p,(int_val)v->t));
-		break;
-	}
-	tmp = alloc_native_arg(ctx);
-	op64(ctx,MOV,tmp,REG_AT(Ebp));
-	if( v->stackPos >= 0 )
-		op64(ctx,ADD,tmp,pconst(&p,v->stackPos));
-	else
-		op64(ctx,SUB,tmp,pconst(&p,-v->stackPos));
-	set_native_arg(ctx,tmp);
-	call_native(ctx,get_dyncast(dst->t),size);
-	store_result(ctx, dst);
-}
-
-int hl_jit_function( jit_ctx *ctx, hl_module *m, hl_function *f ) {
-	int i, size = 0, opCount;
-	int codePos = BUF_POS();
-	int nargs = f->type->fun->nargs;
-	unsigned short *debug16 = NULL;
-	int *debug32 = NULL;
-	call_regs cregs = {0};
-	hl_thread_info *tinf = NULL;
-	preg p;
-	ctx->f = f;
-	ctx->allocOffset = 0;
-	if( f->nregs > ctx->maxRegs ) {
-		free(ctx->vregs);
-		ctx->vregs = (vreg*)malloc(sizeof(vreg) * (f->nregs + 1));
-		if( ctx->vregs == NULL ) {
-			ctx->maxRegs = 0;
-			return -1;
-		}
-		ctx->maxRegs = f->nregs;
-	}
-	if( f->nops > ctx->maxOps ) {
-		free(ctx->opsPos);
-		ctx->opsPos = (int*)malloc(sizeof(int) * (f->nops + 1));
-		if( ctx->opsPos == NULL ) {
-			ctx->maxOps = 0;
-			return -1;
-		}
-		ctx->maxOps = f->nops;
-	}
-	memset(ctx->opsPos,0,(f->nops+1)*sizeof(int));
-	for(i=0;i<f->nregs;i++) {
-		vreg *r = R(i);
-		r->t = f->regs[i];
-		r->size = hl_type_size(r->t);
-		r->current = NULL;
-		r->stack.holds = NULL;
-		r->stack.id = i;
-		r->stack.kind = RSTACK;
-	}
-	size = 0;
-	int argsSize = 0;
-	for(i=0;i<nargs;i++) {
-		vreg *r = R(i);
-		int creg = select_call_reg(&cregs,r->t,i);
-		if( creg < 0 || IS_WINCALL64 ) {
-			// use existing stack storage
-			r->stackPos = argsSize + HL_WSIZE * 2;
-			argsSize += stack_size(r->t);
-		} else {
-			// make room in local vars
-			size += r->size;
-			size += hl_pad_size(size,r->t);
-			r->stackPos = -size;
-		}
-	}
-	for(i=nargs;i<f->nregs;i++) {
-		vreg *r = R(i);
-		size += r->size;
-		size += hl_pad_size(size,r->t); // align local vars
-		r->stackPos = -size;
-	}
-#	ifdef HL_64
-	size += (-size) & 15; // align on 16 bytes
-#	else
-	size += hl_pad_size(size,&hlt_dyn); // align on word size
-#	endif
-	ctx->totalRegsSize = size;
-	jit_buf(ctx);
-	ctx->functionPos = BUF_POS();
-	// make sure currentPos is > 0 before any reg allocations happen
-	// otherwise `alloc_reg` thinks that all registers are locked
-	ctx->currentPos = 1;
-	op_enter(ctx);
-#	ifdef HL_64
-	{
-		// store in local var
-		for(i=0;i<nargs;i++) {
-			vreg *r = R(i);
-			preg *p;
-			int reg = mapped_reg(&cregs, i);
-			if( reg < 0 ) continue;
-			p = REG_AT(reg);
-			copy(ctx,fetch(r),p,r->size);
-			p->holds = r;
-			r->current = p;
-		}
-	}
-#	endif
-	if( ctx->m->code->hasdebug ) {
-		debug16 = (unsigned short*)malloc(sizeof(unsigned short) * (f->nops + 1));
-		debug16[0] = (unsigned short)(BUF_POS() - codePos);
-	}
-	ctx->opsPos[0] = BUF_POS();
-
-	for(opCount=0;opCount<f->nops;opCount++) {
-		int jump;
-		hl_opcode *o = f->ops + opCount;
-		vreg *dst = R(o->p1);
-		vreg *ra = R(o->p2);
-		vreg *rb = R(o->p3);
-		ctx->currentPos = opCount + 1;
-		jit_buf(ctx);
-#		ifdef JIT_DEBUG
-		if( opCount == 0 || f->ops[opCount-1].op != OAsm ) {
-			int uid = opCount + (f->findex<<16);
-			op32(ctx, PUSH, pconst(&p,uid), UNUSED);
-			op64(ctx, ADD, PESP, pconst(&p,HL_WSIZE));
-		}
-#		endif
-		// emit code
-		switch( o->op ) {
-		case OMov:
-		case OUnsafeCast:
-			op_mov(ctx, dst, ra);
-			break;
-		case OInt:
-			store_const(ctx, dst, m->code->ints[o->p2]);
-			break;
-		case OBool:
-			store_const(ctx, dst, o->p2);
-			break;
-		case OGetGlobal:
-			{
-				void *addr = m->globals_data + m->globals_indexes[o->p2];
-#				ifdef HL_64
-				preg *tmp = alloc_reg(ctx, RCPU);
-				op64(ctx, MOV, tmp, pconst64(&p,(int_val)addr));
-				copy_to(ctx, dst, pmem(&p,tmp->id,0));
-#				else
-				copy_to(ctx, dst, paddr(&p,addr));
-#				endif
-			}
-			break;
-		case OSetGlobal:
-			{
-				void *addr = m->globals_data + m->globals_indexes[o->p1];
-#				ifdef HL_64
-				preg *tmp = alloc_reg(ctx, RCPU);
-				op64(ctx, MOV, tmp, pconst64(&p,(int_val)addr));
-				copy_from(ctx, pmem(&p,tmp->id,0), ra);
-#				else
-				copy_from(ctx, paddr(&p,addr), ra);
-#				endif
-			}
-			break;
-		case OCall3:
-			{
-				int args[3] = { o->p3, o->extra[0], o->extra[1] };
-				op_call_fun(ctx, dst, o->p2, 3, args);
-			}
-			break;
-		case OCall4:
-			{
-				int args[4] = { o->p3, o->extra[0], o->extra[1], o->extra[2] };
-				op_call_fun(ctx, dst, o->p2, 4, args);
-			}
-			break;
-		case OCallN:
-			op_call_fun(ctx, dst, o->p2, o->p3, o->extra);
-			break;
-		case OCall0:
-			op_call_fun(ctx, dst, o->p2, 0, NULL);
-			break;
-		case OCall1:
-			op_call_fun(ctx, dst, o->p2, 1, &o->p3);
-			break;
-		case OCall2:
-			{
-				int args[2] = { o->p3, (int)(int_val)o->extra };
-				op_call_fun(ctx, dst, o->p2, 2, args);
-			}
-			break;
-		case OSub:
-		case OAdd:
-		case OMul:
-		case OSDiv:
-		case OUDiv:
-		case OShl:
-		case OSShr:
-		case OUShr:
-		case OAnd:
-		case OOr:
-		case OXor:
-		case OSMod:
-		case OUMod:
-			op_binop(ctx, dst, ra, rb, o->op);
-			break;
-		case ONeg:
-			{
-				if( IS_FLOAT(ra) ) {
-					preg *pa = alloc_reg(ctx,RFPU);
-					preg *pb = alloc_fpu(ctx,ra,true);
-					op64(ctx,XORPD,pa,pa);
-					op64(ctx,ra->t->kind == HF32 ? SUBSS : SUBSD,pa,pb);
-					store(ctx,dst,pa,true);
-				} else if( ra->t->kind == HI64 ) {
-#					ifdef HL_64
-					preg *pa = alloc_reg(ctx,RCPU);
-					preg *pb = alloc_cpu(ctx,ra,true);
-					op64(ctx,XOR,pa,pa);
-					op64(ctx,SUB,pa,pb);
-					store(ctx,dst,pa,true);
-#					else
-					error_i64();
-#					endif
-				} else {
-					preg *pa = alloc_reg(ctx,RCPU);
-					preg *pb = alloc_cpu(ctx,ra,true);
-					op32(ctx,XOR,pa,pa);
-					op32(ctx,SUB,pa,pb);
-					store(ctx,dst,pa,true);
-				}
-			}
-			break;
-		case ONot:
-			{
-				preg *v = alloc_cpu(ctx,ra,true);
-				op32(ctx,XOR,v,pconst(&p,1));
-				store(ctx,dst,v,true);
-			}
-			break;
-		case OJFalse:
-		case OJTrue:
-		case OJNotNull:
-		case OJNull:
-			{
-				preg *r = dst->t->kind == HBOOL ? alloc_cpu8(ctx, dst, true) : alloc_cpu(ctx, dst, true);
-				op64(ctx, dst->t->kind == HBOOL ? TEST8 : TEST, r, r);
-				XJump( o->op == OJFalse || o->op == OJNull ? JZero : JNotZero,jump);
-				register_jump(ctx,jump,(opCount + 1) + o->p2);
-			}
-			break;
-		case OJEq:
-		case OJNotEq:
-		case OJSLt:
-		case OJSGte:
-		case OJSLte:
-		case OJSGt:
-		case OJULt:
-		case OJUGte:
-		case OJNotLt:
-		case OJNotGte:
-			op_jump(ctx,dst,ra,o,(opCount + 1) + o->p3);
-			break;
-		case OJAlways:
-			jump = do_jump(ctx,o->op,false);
-			register_jump(ctx,jump,(opCount + 1) + o->p1);
-			break;
-		case OToDyn:
-			if( ra->t->kind == HBOOL ) {
-				int size = begin_native_call(ctx, 1);
-				set_native_arg(ctx, fetch(ra));
-				call_native(ctx, hl_alloc_dynbool, size);
-				store(ctx, dst, PEAX, true);
-			} else {
-				int_val rt = (int_val)ra->t;
-				int jskip = 0;
-				if( hl_is_ptr(ra->t) ) {
-					int jnz;
-					preg *a = alloc_cpu(ctx,ra,true);
-					op64(ctx,TEST,a,a);
-					XJump_small(JNotZero,jnz);
-					op64(ctx,XOR,PEAX,PEAX); // will replace the result of alloc_dynamic at jump land
-					XJump_small(JAlways,jskip);
-					patch_jump(ctx,jnz);
-				}
-				call_native_consts(ctx, hl_alloc_dynamic, &rt, 1);
-				// copy value to dynamic
-				if( (IS_FLOAT(ra) || ra->size == 8) && !IS_64 ) {
-					preg *tmp = REG_AT(RCPU_SCRATCH_REGS[1]);
-					op64(ctx,MOV,tmp,&ra->stack);
-					op32(ctx,MOV,pmem(&p,Eax,HDYN_VALUE),tmp);
-					if( ra->t->kind == HF64 ) {
-						ra->stackPos += 4;
-						op64(ctx,MOV,tmp,&ra->stack);
-						op32(ctx,MOV,pmem(&p,Eax,HDYN_VALUE+4),tmp);
-						ra->stackPos -= 4;
-					}
-				} else {
-					preg *tmp = REG_AT(RCPU_SCRATCH_REGS[1]);
-					copy_from(ctx,tmp,ra);
-					op64(ctx,MOV,pmem(&p,Eax,HDYN_VALUE),tmp);
-				}
-				if( hl_is_ptr(ra->t) ) patch_jump(ctx,jskip);
-				store(ctx, dst, PEAX, true);
-			}
-			break;
-		case OToSFloat:
-			if( ra == dst ) break;
-			if (ra->t->kind == HI32 || ra->t->kind == HUI16 || ra->t->kind == HUI8) {
-				preg* r = alloc_cpu(ctx, ra, true);
-				preg* w = alloc_fpu(ctx, dst, false);
-				op32(ctx, dst->t->kind == HF64 ? CVTSI2SD : CVTSI2SS, w, r);
-				store(ctx, dst, w, true);
-			} else if (ra->t->kind == HI64 ) {
-				preg* r = alloc_cpu(ctx, ra, true);
-				preg* w = alloc_fpu(ctx, dst, false);
-				op64(ctx, dst->t->kind == HF64 ? CVTSI2SD : CVTSI2SS, w, r);
-				store(ctx, dst, w, true);
-			} else if( ra->t->kind == HF64 && dst->t->kind == HF32 ) {
-				preg *r = alloc_fpu(ctx,ra,true);
-				preg *w = alloc_fpu(ctx,dst,false);
-				op32(ctx,CVTSD2SS,w,r);
-				store(ctx, dst, w, true);
-			} else if( ra->t->kind == HF32 && dst->t->kind == HF64 ) {
-				preg *r = alloc_fpu(ctx,ra,true);
-				preg *w = alloc_fpu(ctx,dst,false);
-				op32(ctx,CVTSS2SD,w,r);
-				store(ctx, dst, w, true);
-			} else
-				ASSERT(0);
-			break;
-		case OToUFloat:
-			{
-				int size;
-				size = prepare_call_args(ctx,1,&o->p2,ctx->vregs,0);
-				call_native(ctx,uint_to_double,size);
-				store_result(ctx,dst);
-			}
-			break;
-		case OToInt:
-			if( ra == dst ) break;
-			if( ra->t->kind == HF64 ) {
-				preg *r = alloc_fpu(ctx,ra,true);
-				preg *w = alloc_cpu(ctx,dst,false);
-				preg *tmp = alloc_reg(ctx,RCPU);
-				op32(ctx,STMXCSR,pmem(&p,Esp,-4),UNUSED);
-				op32(ctx,MOV,tmp,&p);
-				op32(ctx,OR,tmp,pconst(&p,0x6000)); // set round towards 0
-				op32(ctx,MOV,pmem(&p,Esp,-8),tmp);
-				op32(ctx,LDMXCSR,&p,UNUSED);
-				op32(ctx,CVTSD2SI,w,r);
-				op32(ctx,LDMXCSR,pmem(&p,Esp,-4),UNUSED);
-				store(ctx, dst, w, true);
-			} else if (ra->t->kind == HF32) {
-				preg *r = alloc_fpu(ctx, ra, true);
-				preg *w = alloc_cpu(ctx, dst, false);
-				preg *tmp = alloc_reg(ctx, RCPU);
-				op32(ctx, STMXCSR, pmem(&p, Esp, -4), UNUSED);
-				op32(ctx, MOV, tmp, &p);
-				op32(ctx, OR, tmp, pconst(&p, 0x6000)); // set round towards 0
-				op32(ctx, MOV, pmem(&p, Esp, -8), tmp);
-				op32(ctx, LDMXCSR, &p, UNUSED);
-				op32(ctx, CVTSS2SI, w, r);
-				op32(ctx, LDMXCSR, pmem(&p, Esp, -4), UNUSED);
-				store(ctx, dst, w, true);
-			} else if( (dst->t->kind == HI64 || dst->t->kind == HGUID) && ra->t->kind == HI32 ) {
-				if( ra->current != PEAX ) {
-					op32(ctx, MOV, PEAX, fetch(ra));
-					scratch(PEAX);
-				}
-#				ifdef HL_64
-				op64(ctx, CDQE, UNUSED, UNUSED); // sign-extend Eax into Rax
-				store(ctx, dst, PEAX, true);
-#				else
-				op32(ctx, CDQ, UNUSED, UNUSED); // sign-extend Eax into Eax:Edx
-				scratch(REG_AT(Edx));
-				op32(ctx, MOV, fetch(dst), PEAX);
-				dst->stackPos += 4;
-				op32(ctx, MOV, fetch(dst), REG_AT(Edx));
-				dst->stackPos -= 4;
-			} else if( dst->t->kind == HI32 && ra->t->kind == HI64 ) {
-				error_i64();
-#				endif
-			} else {
-				preg *r = alloc_cpu(ctx,dst,false);
-				copy_from(ctx, r, ra);
-				store(ctx, dst, r, true);
-			}
-			break;
-		case ORet:
-			op_ret(ctx, dst);
-			break;
-		case OIncr:
-			{
-				if( IS_FLOAT(dst) ) {
-					ASSERT(0);
-				} else {
-					preg *v = fetch32(ctx,dst);
-					op32(ctx,INC,v,UNUSED);
-					if( v->kind != RSTACK ) store(ctx, dst, v, false);
-				}
-			}
-			break;
-		case ODecr:
-			{
-				if( IS_FLOAT(dst) ) {
-					ASSERT(0);
-				} else {
-					preg *v = fetch32(ctx,dst);
-					op32(ctx,DEC,v,UNUSED);
-					if( v->kind != RSTACK ) store(ctx, dst, v, false);
-				}
-			}
-			break;
-		case OFloat:
-			{
-				if( m->code->floats[o->p2] == 0 ) {
-					preg *f = alloc_fpu(ctx,dst,false);
-					op64(ctx,XORPD,f,f);
-				} else switch( dst->t->kind ) {
-				case HF64:
-				case HF32:
-#					ifdef HL_64
-					op64(ctx,dst->t->kind == HF32 ? CVTSD2SS : MOVSD,alloc_fpu(ctx,dst,false),pcodeaddr(&p,o->p2 * 8));
-#					else
-					op64(ctx,dst->t->kind == HF32 ? MOVSS : MOVSD,alloc_fpu(ctx,dst,false),paddr(&p,m->code->floats + o->p2));
-#					endif
-					break;
-				default:
-					ASSERT(dst->t->kind);
-				}
-				store(ctx,dst,dst->current,false);
-			}
-			break;
-		case OString:
-			op64(ctx,MOV,alloc_cpu(ctx, dst, false),pconst64(&p,(int_val)hl_get_ustring(m->code,o->p2)));
-			store(ctx,dst,dst->current,false);
-			break;
-		case OBytes:
-			{
-				char *b = m->code->version >= 5 ? m->code->bytes + m->code->bytes_pos[o->p2] : m->code->strings[o->p2];
-				op64(ctx,MOV,alloc_cpu(ctx,dst,false),pconst64(&p,(int_val)b));
-				store(ctx,dst,dst->current,false);
-			}
-			break;
-		case ONull:
-			{
-				op64(ctx,XOR,alloc_cpu(ctx, dst, false),alloc_cpu(ctx, dst, false));
-				store(ctx,dst,dst->current,false);
-			}
-			break;
-		case ONew:
-			{
-				int_val args[] = { (int_val)dst->t };
-				void *allocFun;
-				int nargs = 1;
-				switch( dst->t->kind ) {
-				case HOBJ:
-				case HSTRUCT:
-					allocFun = hl_alloc_obj;
-					break;
-				case HDYNOBJ:
-					allocFun = hl_alloc_dynobj;
-					nargs = 0;
-					break;
-				case HVIRTUAL:
-					allocFun = hl_alloc_virtual;
-					break;
-				default:
-					ASSERT(dst->t->kind);
-				}
-				call_native_consts(ctx, allocFun, args, nargs);
-				store(ctx, dst, PEAX, true);
-			}
-			break;
-		case OInstanceClosure:
-			{
-				preg *r = alloc_cpu(ctx, rb, true);
-				jlist *j = (jlist*)hl_malloc(&ctx->galloc,sizeof(jlist));
-				int size = begin_native_call(ctx,3);
-				set_native_arg(ctx,r);
-
-				j->pos = BUF_POS();
-				j->target = o->p2;
-				j->next = ctx->calls;
-				ctx->calls = j;
-
-				set_native_arg(ctx,pconst64(&p,RESERVE_ADDRESS));
-				set_native_arg(ctx,pconst64(&p,(int_val)m->code->functions[m->functions_indexes[o->p2]].type));
-				call_native(ctx,hl_alloc_closure_ptr,size);
-				store(ctx,dst,PEAX,true);
-			}
-			break;
-		case OVirtualClosure:
-			{
-				int size, i;
-				preg *r = alloc_cpu_call(ctx, ra);
-				hl_type *t = NULL;
-				hl_type *ot = ra->t;
-				while( t == NULL ) {
-					for(i=0;i<ot->obj->nproto;i++) {
-						hl_obj_proto *pp = ot->obj->proto + i;
-						if( pp->pindex == o->p3 ) {
-							t = m->code->functions[m->functions_indexes[pp->findex]].type;
-							break;
-						}
-					}
-					ot = ot->obj->super;
-				}
-				size = begin_native_call(ctx,3);
-				set_native_arg(ctx,r);
-				// read r->type->vobj_proto[i] for function address
-				op64(ctx,MOV,r,pmem(&p,r->id,0));
-				op64(ctx,MOV,r,pmem(&p,r->id,HL_WSIZE*2));
-				op64(ctx,MOV,r,pmem(&p,r->id,HL_WSIZE*o->p3));
-				set_native_arg(ctx,r);
-				op64(ctx,MOV,r,pconst64(&p,(int_val)t));
-				set_native_arg(ctx,r);
-				call_native(ctx,hl_alloc_closure_ptr,size);
-				store(ctx,dst,PEAX,true);
-			}
-			break;
-		case OCallClosure:
-			if( ra->t->kind == HDYN ) {
-				// ASM for {
-				//	vdynamic *args[] = {args};
-				//  vdynamic *ret = hl_dyn_call(closure,args,nargs);
-				//  dst = hl_dyncast(ret,t_dynamic,t_dst);
-				// }
-				int offset = o->p3 * HL_WSIZE;
-				preg *r = alloc_reg(ctx, RCPU_CALL);
-				if( offset & 15 ) offset += 16 - (offset & 15);
-				op64(ctx,SUB,PESP,pconst(&p,offset));
-				op64(ctx,MOV,r,PESP);
-				for(i=0;i<o->p3;i++) {
-					vreg *a = R(o->extra[i]);
-					if( !hl_is_dynamic(a->t) ) ASSERT(0);
-					preg *v = alloc_cpu(ctx,a,true);
-					op64(ctx,MOV,pmem(&p,r->id,i * HL_WSIZE),v);
-					RUNLOCK(v);
-				}
-#				ifdef HL_64
-				int size = begin_native_call(ctx, 3) + offset;
-				set_native_arg(ctx, pconst(&p,o->p3));
-				set_native_arg(ctx, r);
-				set_native_arg(ctx, fetch(ra));
-#				else
-				int size = pad_before_call(ctx,HL_WSIZE*2 + sizeof(int) + offset);
-				op64(ctx,PUSH,pconst(&p,o->p3),UNUSED);
-				op64(ctx,PUSH,r,UNUSED);
-				op64(ctx,PUSH,alloc_cpu(ctx,ra,true),UNUSED);
-#				endif
-				call_native(ctx,hl_dyn_call,size);
-				if( dst->t->kind != HVOID ) {
-					store(ctx,dst,PEAX,true);
-					make_dyn_cast(ctx,dst,dst);
-				}
-			} else {
-				int jhasvalue, jend, size;
-				// ASM for  if( c->hasValue ) c->fun(value,args) else c->fun(args)
-				preg *r = alloc_cpu(ctx,ra,true);
-				preg *tmp = alloc_reg(ctx, RCPU);
-				op32(ctx,MOV,tmp,pmem(&p,r->id,HL_WSIZE*2));
-				op32(ctx,TEST,tmp,tmp);
-				scratch(tmp);
-				XJump_small(JNotZero,jhasvalue);
-				save_regs(ctx);
-				size = prepare_call_args(ctx,o->p3,o->extra,ctx->vregs,0);
-				preg *rr = r;
-				if( rr->holds != ra ) rr = alloc_cpu(ctx, ra, true);
-				op_call(ctx, pmem(&p,rr->id,HL_WSIZE), size);
-				XJump_small(JAlways,jend);
-				patch_jump(ctx,jhasvalue);
-				restore_regs(ctx);
-#				ifdef HL_64
-				{
-					int regids[64];
-					preg *pc = REG_AT(CALL_REGS[0]);
-					vreg *sc = R(f->nregs); // scratch register that we temporary rebind
-					if( o->p3 >= 63 ) jit_error("assert");
-					memcpy(regids + 1, o->extra, o->p3 * sizeof(int));
-					regids[0] = f->nregs;
-					sc->size = HL_WSIZE;
-					sc->t = &hlt_dyn;
-					op64(ctx, MOV, pc, pmem(&p,r->id,HL_WSIZE*3));
-					scratch(pc);
-					sc->current = pc;
-					pc->holds = sc;
-					size = prepare_call_args(ctx,o->p3 + 1,regids,ctx->vregs,0);
-					if( r->holds != ra ) r = alloc_cpu(ctx, ra, true);
-				}
-#				else
-				size = prepare_call_args(ctx,o->p3,o->extra,ctx->vregs,HL_WSIZE);
-				if( r->holds != ra ) r = alloc_cpu(ctx, ra, true);
-				op64(ctx, PUSH,pmem(&p,r->id,HL_WSIZE*3),UNUSED); // push closure value
-#				endif
-				op_call(ctx, pmem(&p,r->id,HL_WSIZE), size);
-				discard_regs(ctx,false);
-				patch_jump(ctx,jend);
-				store_result(ctx, dst);
-			}
-			break;
-		case OStaticClosure:
-			{
-				vclosure *c = alloc_static_closure(ctx,o->p2);
-				preg *r = alloc_reg(ctx, RCPU);
-				op64(ctx, MOV, r, pconst64(&p,(int_val)c));
-				store(ctx,dst,r,true);
-			}
-			break;
-		case OField:
-			{
-#				ifndef HL_64
-				if( dst->t->kind == HI64 ) {
-					error_i64();
-					break;
-				}
-#				endif
-				switch( ra->t->kind ) {
-				case HOBJ:
-				case HSTRUCT:
-					{
-						hl_runtime_obj *rt = hl_get_obj_rt(ra->t);
-						preg *rr = alloc_cpu(ctx,ra, true);
-						if( dst->t->kind == HSTRUCT ) {
-							hl_type *ft = hl_obj_field_fetch(ra->t,o->p3)->t;
-							if( ft->kind == HPACKED ) {
-								preg *r = alloc_reg(ctx,RCPU);
-								op64(ctx,LEA,r,pmem(&p,(CpuReg)rr->id,rt->fields_indexes[o->p3]));
-								store(ctx,dst,r,true);
-								break;
-							}
-						}
-						copy_to(ctx,dst,pmem(&p, (CpuReg)rr->id, rt->fields_indexes[o->p3]));
-					}
-					break;
-				case HVIRTUAL:
-					// ASM for --> if( hl_vfields(o)[f] ) r = *hl_vfields(o)[f]; else r = hl_dyn_get(o,hash(field),vt)
-					{
-						int jhasfield, jend, size;
-						bool need_type = !(IS_FLOAT(dst) || dst->t->kind == HI64);
-						preg *v = alloc_cpu_call(ctx,ra);
-						preg *r = alloc_reg(ctx,RCPU);
-						op64(ctx,MOV,r,pmem(&p,v->id,sizeof(vvirtual)+HL_WSIZE*o->p3));
-						op64(ctx,TEST,r,r);
-						XJump_small(JNotZero,jhasfield);
-						size = begin_native_call(ctx, need_type ? 3 : 2);
-						if( need_type ) set_native_arg(ctx,pconst64(&p,(int_val)dst->t));
-						set_native_arg(ctx,pconst64(&p,(int_val)ra->t->virt->fields[o->p3].hashed_name));
-						set_native_arg(ctx,v);
-						call_native(ctx,get_dynget(dst->t),size);
-						store_result(ctx,dst);
-						XJump_small(JAlways,jend);
-						patch_jump(ctx,jhasfield);
-						copy_to(ctx, dst, pmem(&p,(CpuReg)r->id,0));
-						patch_jump(ctx,jend);
-						scratch(dst->current);
-					}
-					break;
-				default:
-					ASSERT(ra->t->kind);
-					break;
-				}
-			}
-			break;
-		case OSetField:
-			{
-				switch( dst->t->kind ) {
-				case HOBJ:
-				case HSTRUCT:
-					{
-						hl_runtime_obj *rt = hl_get_obj_rt(dst->t);
-						preg *rr = alloc_cpu(ctx, dst, true);
-						if( rb->t->kind == HSTRUCT ) {
-							hl_type *ft = hl_obj_field_fetch(dst->t,o->p2)->t;
-							if( ft->kind == HPACKED ) {
-								hl_runtime_obj *frt = hl_get_obj_rt(ft->tparam);
-								preg *prb = alloc_cpu(ctx, rb, true);
-								preg *tmp = alloc_reg(ctx, RCPU_CALL);
-								int offset = 0;
-								while( offset < frt->size ) {
-									int remain = frt->size - offset;
-									int copy_size = remain >= HL_WSIZE ? HL_WSIZE : (remain >= 4 ? 4 : (remain >= 2 ? 2 : 1));
-									copy(ctx, tmp, pmem(&p, (CpuReg)prb->id, offset), copy_size);
-									copy(ctx, pmem(&p, (CpuReg)rr->id, rt->fields_indexes[o->p2]+offset), tmp, copy_size);
-									offset += copy_size;
-								}
-								break;
-							}
-						}
-						copy_from(ctx, pmem(&p, (CpuReg)rr->id, rt->fields_indexes[o->p2]), rb);
-					}
-					break;
-				case HVIRTUAL:
-					// ASM for --> if( hl_vfields(o)[f] ) *hl_vfields(o)[f] = v; else hl_dyn_set(o,hash(field),vt,v)
-					{
-						int jhasfield, jend;
-						preg *obj = alloc_cpu_call(ctx,dst);
-						preg *r = alloc_reg(ctx,RCPU);
-						op64(ctx,MOV,r,pmem(&p,obj->id,sizeof(vvirtual)+HL_WSIZE*o->p2));
-						op64(ctx,TEST,r,r);
-						XJump_small(JNotZero,jhasfield);
-#						ifdef HL_64
-						switch( rb->t->kind ) {
-						case HF64:
-						case HF32:
-							size = begin_native_call(ctx,3);
-							set_native_arg_fpu(ctx, fetch(rb), rb->t->kind == HF32);
-							break;
-						case HI64:
-						case HGUID:
-							size = begin_native_call(ctx,3);
-							set_native_arg(ctx, fetch(rb));
-							break;
-						default:
-							size = begin_native_call(ctx, 4);
-							set_native_arg(ctx, fetch(rb));
-							set_native_arg(ctx, pconst64(&p,(int_val)rb->t));
-							break;
-						}
-						set_native_arg(ctx,pconst(&p,dst->t->virt->fields[o->p2].hashed_name));
-						set_native_arg(ctx,obj);
-#						else
-						switch( rb->t->kind ) {
-						case HF64:
-						case HI64:
-						case HGUID:
-							size = pad_before_call(ctx,HL_WSIZE*2 + sizeof(double));
-							push_reg(ctx,rb);
-							break;
-						case HF32:
-							size = pad_before_call(ctx,HL_WSIZE*2 + sizeof(float));
-							push_reg(ctx,rb);
-							break;
-						default:
-							size = pad_before_call(ctx,HL_WSIZE*4);
-							op64(ctx,PUSH,fetch32(ctx,rb),UNUSED);
-							op64(ctx,MOV,r,pconst64(&p,(int_val)rb->t));
-							op64(ctx,PUSH,r,UNUSED);
-							break;
-						}
-						op32(ctx,MOV,r,pconst(&p,dst->t->virt->fields[o->p2].hashed_name));
-						op64(ctx,PUSH,r,UNUSED);
-						op64(ctx,PUSH,obj,UNUSED);
-#						endif
-						call_native(ctx,get_dynset(rb->t),size);
-						XJump_small(JAlways,jend);
-						patch_jump(ctx,jhasfield);
-						copy_from(ctx, pmem(&p,(CpuReg)r->id,0), rb);
-						patch_jump(ctx,jend);
-						scratch(rb->current);
-					}
-					break;
-				default:
-					ASSERT(dst->t->kind);
-					break;
-				}
-			}
-			break;
-		case OGetThis:
-			{
-				vreg *r = R(0);
-				hl_runtime_obj *rt = hl_get_obj_rt(r->t);
-				preg *rr = alloc_cpu(ctx,r, true);
-				if( dst->t->kind == HSTRUCT ) {
-					hl_type *ft = hl_obj_field_fetch(r->t,o->p2)->t;
-					if( ft->kind == HPACKED ) {
-						preg *r = alloc_reg(ctx,RCPU);
-						op64(ctx,LEA,r,pmem(&p,(CpuReg)rr->id,rt->fields_indexes[o->p2]));
-						store(ctx,dst,r,true);
-						break;
-					}
-				}
-				copy_to(ctx,dst,pmem(&p, (CpuReg)rr->id, rt->fields_indexes[o->p2]));
-			}
-			break;
-		case OSetThis:
-			{
-				vreg *r = R(0);
-				hl_runtime_obj *rt = hl_get_obj_rt(r->t);
-				preg *rr = alloc_cpu(ctx, r, true);
-				if( ra->t->kind == HSTRUCT ) {
-					hl_type *ft = hl_obj_field_fetch(r->t,o->p1)->t;
-					if( ft->kind == HPACKED ) {
-						hl_runtime_obj *frt = hl_get_obj_rt(ft->tparam);
-						preg *pra = alloc_cpu(ctx, ra, true);
-						preg *tmp = alloc_reg(ctx, RCPU_CALL);
-						int offset = 0;
-						while( offset < frt->size ) {
-							int remain = frt->size - offset;
-							int copy_size = remain >= HL_WSIZE ? HL_WSIZE : (remain >= 4 ? 4 : (remain >= 2 ? 2 : 1));
-							copy(ctx, tmp, pmem(&p, (CpuReg)pra->id, offset), copy_size);
-							copy(ctx, pmem(&p, (CpuReg)rr->id, rt->fields_indexes[o->p1]+offset), tmp, copy_size);
-							offset += copy_size;
-						}
-						break;
-					}
-				}
-				copy_from(ctx, pmem(&p, (CpuReg)rr->id, rt->fields_indexes[o->p1]), ra);
-			}
-			break;
-		case OCallThis:
-			{
-				int nargs = o->p3 + 1;
-				int *args = (int*)hl_malloc(&ctx->falloc,sizeof(int) * nargs);
-				int size;
-				preg *r = alloc_cpu(ctx, R(0), true);
-				preg *tmp;
-				tmp = alloc_reg(ctx, RCPU_CALL);
-				op64(ctx,MOV,tmp,pmem(&p,r->id,0)); // read type
-				op64(ctx,MOV,tmp,pmem(&p,tmp->id,HL_WSIZE*2)); // read proto
-				args[0] = 0;
-				for(i=1;i<nargs;i++)
-					args[i] = o->extra[i-1];
-				size = prepare_call_args(ctx,nargs,args,ctx->vregs,0);
-				op_call(ctx,pmem(&p,tmp->id,o->p2*HL_WSIZE),size);
-				discard_regs(ctx, false);
-				store_result(ctx, dst);
-			}
-			break;
-		case OCallMethod:
-			switch( R(o->extra[0])->t->kind ) {
-			case HOBJ: {
-				int size;
-				preg *r = alloc_cpu(ctx, R(o->extra[0]), true);
-				preg *tmp;
-				tmp = alloc_reg(ctx, RCPU_CALL);
-				op64(ctx,MOV,tmp,pmem(&p,r->id,0)); // read type
-				op64(ctx,MOV,tmp,pmem(&p,tmp->id,HL_WSIZE*2)); // read proto
-				size = prepare_call_args(ctx,o->p3,o->extra,ctx->vregs,0);
-				op_call(ctx,pmem(&p,tmp->id,o->p2*HL_WSIZE),size);
-				discard_regs(ctx, false);
-				store_result(ctx, dst);
-				break;
-			}
-			case HVIRTUAL:
-				// ASM for --> if( hl_vfields(o)[f] ) dst = *hl_vfields(o)[f](o->value,args...); else dst = hl_dyn_call_obj(o->value,field,args,&ret)
-				{
-					int size;
-					int paramsSize;
-					int jhasfield, jend;
-					bool need_dyn;
-					bool obj_in_args = false;
-					vreg *obj = R(o->extra[0]);
-					preg *v = alloc_cpu_call(ctx,obj);
-					preg *r = alloc_reg(ctx,RCPU_CALL);
-					op64(ctx,MOV,r,pmem(&p,v->id,sizeof(vvirtual)+HL_WSIZE*o->p2));
-					op64(ctx,TEST,r,r);
-					save_regs(ctx);
-
-					if( o->p3 < 6 ) {
-						XJump_small(JNotZero,jhasfield);
-					} else {
-						XJump(JNotZero,jhasfield);
-					}
-
-					need_dyn = !hl_is_ptr(dst->t) && dst->t->kind != HVOID;
-					paramsSize = (o->p3 - 1) * HL_WSIZE;
-					if( need_dyn ) paramsSize += sizeof(vdynamic);
-					if( paramsSize & 15 ) paramsSize += 16 - (paramsSize&15);
-					op64(ctx,SUB,PESP,pconst(&p,paramsSize));
-					op64(ctx,MOV,r,PESP);
-
-					for(i=0;i<o->p3-1;i++) {
-						vreg *a = R(o->extra[i+1]);
-						if( hl_is_ptr(a->t) ) {
-							op64(ctx,MOV,pmem(&p,r->id,i*HL_WSIZE),alloc_cpu(ctx,a,true));
-							if( a->current != v ) {
-								RUNLOCK(a->current);
-							} else
-								obj_in_args = true;
-						} else {
-							preg *r2 = alloc_reg(ctx,RCPU);
-							op64(ctx,LEA,r2,&a->stack);
-							op64(ctx,MOV,pmem(&p,r->id,i*HL_WSIZE),r2);
-							if( r2 != v ) RUNLOCK(r2);
-						}
-					}
-
-					jit_buf(ctx);
-
-					if( !need_dyn ) {
-						size = begin_native_call(ctx, 5);
-						set_native_arg(ctx, pconst(&p,0));
-					} else {
-						preg *rtmp = alloc_reg(ctx,RCPU);
-						op64(ctx,LEA,rtmp,pmem(&p,Esp,paramsSize - sizeof(vdynamic)));
-						size = begin_native_call(ctx, 5);
-						set_native_arg(ctx,rtmp);
-						if( !IS_64 ) RUNLOCK(rtmp);
-					}
-					set_native_arg(ctx,r);
-					set_native_arg(ctx,pconst(&p,obj->t->virt->fields[o->p2].hashed_name)); // fid
-					set_native_arg(ctx,pconst64(&p,(int_val)obj->t->virt->fields[o->p2].t)); // ftype
-					set_native_arg(ctx,pmem(&p,v->id,HL_WSIZE)); // o->value
-					call_native(ctx,hl_dyn_call_obj,size + paramsSize);
-					if( need_dyn ) {
-						preg *r = IS_FLOAT(dst) ? REG_AT(XMM(0)) : PEAX;
-						copy(ctx,r,pmem(&p,Esp,HDYN_VALUE - (int)sizeof(vdynamic)),dst->size);
-						store(ctx, dst, r, false);
-					} else
-						store(ctx, dst, PEAX, false);
-
-					XJump_small(JAlways,jend);
-					patch_jump(ctx,jhasfield);
-					restore_regs(ctx);
-
-					if( !obj_in_args ) {
-						// o = o->value hack
-						if( v->holds ) v->holds->current = NULL;
-						obj->current = v;
-						v->holds = obj;
-						op64(ctx,MOV,v,pmem(&p,v->id,HL_WSIZE));
-						size = prepare_call_args(ctx,o->p3,o->extra,ctx->vregs,0);
-					} else {
-						// keep o->value in R(f->nregs)
-						int regids[64];
-						preg *pc = alloc_reg(ctx,RCPU_CALL);
-						vreg *sc = R(f->nregs); // scratch register that we temporary rebind
-						if( o->p3 >= 63 ) jit_error("assert");
-						memcpy(regids, o->extra, o->p3 * sizeof(int));
-						regids[0] = f->nregs;
-						sc->size = HL_WSIZE;
-						sc->t = &hlt_dyn;
-						op64(ctx, MOV, pc, pmem(&p,v->id,HL_WSIZE));
-						scratch(pc);
-						sc->current = pc;
-						pc->holds = sc;
-						size = prepare_call_args(ctx,o->p3,regids,ctx->vregs,0);
-					}
-
-					op_call(ctx,r,size);
-					discard_regs(ctx, false);
-					store_result(ctx, dst);
-					patch_jump(ctx,jend);
-				}
-				break;
-			default:
-				ASSERT(0);
-				break;
-			}
-			break;
-		case ORethrow:
-			{
-				int size = prepare_call_args(ctx,1,&o->p1,ctx->vregs,0);
-				call_native(ctx,hl_rethrow,size);
-			}
-			break;
-		case OThrow:
-			{
-				int size = prepare_call_args(ctx,1,&o->p1,ctx->vregs,0);
-				call_native(ctx,hl_throw,size);
-			}
-			break;
-		case OLabel:
-			// NOP for now
-			discard_regs(ctx,false);
-			break;
-		case OGetI8:
-		case OGetI16:
-			{
-				preg *base = alloc_cpu(ctx, ra, true);
-				preg *offset = alloc_cpu64(ctx, rb, true);
-				preg *r = alloc_reg(ctx,o->op == OGetI8 ? RCPU_8BITS : RCPU);
-				op64(ctx,XOR,r,r);
-				op32(ctx, o->op == OGetI8 ? MOV8 : MOV16,r,pmem2(&p,base->id,offset->id,1,0));
-				store(ctx, dst, r, true);
-			}
-			break;
-		case OGetMem:
-			{
-				#ifndef HL_64
-				if (dst->t->kind == HI64) {
-					error_i64();
-				}
-				#endif
-				preg *base = alloc_cpu(ctx, ra, true);
-				preg *offset = alloc_cpu64(ctx, rb, true);
-				store(ctx, dst, pmem2(&p,base->id,offset->id,1,0), false);
-			}
-			break;
-		case OSetI8:
-			{
-				preg *base = alloc_cpu(ctx, dst, true);
-				preg *offset = alloc_cpu64(ctx, ra, true);
-				preg *value = alloc_cpu8(ctx, rb, true);
-				op32(ctx,MOV8,pmem2(&p,base->id,offset->id,1,0),value);
-			}
-			break;
-		case OSetI16:
-			{
-				preg *base = alloc_cpu(ctx, dst, true);
-				preg *offset = alloc_cpu64(ctx, ra, true);
-				preg *value = alloc_cpu(ctx, rb, true);
-				op32(ctx,MOV16,pmem2(&p,base->id,offset->id,1,0),value);
-			}
-			break;
-		case OSetMem:
-			{
-				preg *base = alloc_cpu(ctx, dst, true);
-				preg *offset = alloc_cpu64(ctx, ra, true);
-				preg *value;
-				switch( rb->t->kind ) {
-				case HI32:
-					value = alloc_cpu(ctx, rb, true);
-					op32(ctx,MOV,pmem2(&p,base->id,offset->id,1,0),value);
-					break;
-				case HF32:
-					value = alloc_fpu(ctx, rb, true);
-					op32(ctx,MOVSS,pmem2(&p,base->id,offset->id,1,0),value);
-					break;
-				case HF64:
-					value = alloc_fpu(ctx, rb, true);
-					op32(ctx,MOVSD,pmem2(&p,base->id,offset->id,1,0),value);
-					break;
-				case HI64:
-				case HGUID:
-					value = alloc_cpu(ctx, rb, true);
-					op64(ctx,MOV,pmem2(&p,base->id,offset->id,1,0),value);
-					break;
-				default:
-					ASSERT(rb->t->kind);
-					break;
-				}
-			}
-			break;
-		case OType:
-			{
-				op64(ctx,MOV,alloc_cpu(ctx, dst, false),pconst64(&p,(int_val)(m->code->types + o->p2)));
-				store(ctx,dst,dst->current,false);
-			}
-			break;
-		case OGetType:
-			{
-				int jnext, jend;
-				preg *r = alloc_cpu(ctx, ra, true);
-				preg *tmp = alloc_reg(ctx, RCPU);
-				op64(ctx,TEST,r,r);
-				XJump_small(JNotZero,jnext);
-				op64(ctx,MOV, tmp, pconst64(&p,(int_val)&hlt_void));
-				XJump_small(JAlways,jend);
-				patch_jump(ctx,jnext);
-				op64(ctx, MOV, tmp, pmem(&p,r->id,0));
-				patch_jump(ctx,jend);
-				store(ctx,dst,tmp,true);
-			}
-			break;
-		case OGetArray:
-			{
-				preg *rdst = IS_FLOAT(dst) ? alloc_fpu(ctx,dst,false) : alloc_cpu(ctx,dst,false);
-				if( ra->t->kind == HABSTRACT ) {
-					int osize;
-					bool isRead = dst->t->kind != HOBJ && dst->t->kind != HSTRUCT;
-					if( isRead )
-						osize = sizeof(void*);
-					else {
-						hl_runtime_obj *rt = hl_get_obj_rt(dst->t);
-						osize = rt->size;
-					}
-					preg *idx = alloc_cpu64(ctx, rb, true);
-					op64(ctx, IMUL, idx, pconst(&p,osize));
-					op64(ctx, isRead?MOV:LEA, rdst, pmem2(&p,alloc_cpu(ctx,ra, true)->id,idx->id,1,0));
-					store(ctx,dst,dst->current,false);
-					scratch(idx);
-				} else {
-					copy(ctx, rdst, pmem2(&p,alloc_cpu(ctx,ra,true)->id,alloc_cpu64(ctx,rb,true)->id,hl_type_size(dst->t),sizeof(varray)), dst->size);
-					store(ctx,dst,dst->current,false);
-				}
-			}
-			break;
-		case OSetArray:
-			{
-				if( dst->t->kind == HABSTRACT ) {
-					int osize;
-					bool isWrite = rb->t->kind != HOBJ && rb->t->kind != HSTRUCT;
-					if( isWrite ) {
-						osize = sizeof(void*);
-					} else {
-						hl_runtime_obj *rt = hl_get_obj_rt(rb->t);
-						osize = rt->size;
-					}
-					preg *pdst = alloc_cpu(ctx,dst,true);
-					preg *pra = alloc_cpu64(ctx,ra,true);
-					op64(ctx, IMUL, pra, pconst(&p,osize));
-					op64(ctx, ADD, pdst, pra);
-					scratch(pra);
-					preg *prb = alloc_cpu(ctx,rb,true);
-					preg *tmp = alloc_reg(ctx, RCPU_CALL);
-					int offset = 0;
-					while( offset < osize ) {
-						int remain = osize - offset;
-						int copy_size = remain >= HL_WSIZE ? HL_WSIZE : (remain >= 4 ? 4 : (remain >= 2 ? 2 : 1));
-						copy(ctx, tmp, pmem(&p, prb->id, offset), copy_size);
-						copy(ctx, pmem(&p, pdst->id, offset), tmp, copy_size);
-						offset += copy_size;
-					}
-					scratch(pdst);
-				} else  {
-					preg *rrb = IS_FLOAT(rb) ? alloc_fpu(ctx,rb,true) : alloc_cpu(ctx,rb,true);
-					copy(ctx, pmem2(&p,alloc_cpu(ctx,dst,true)->id,alloc_cpu64(ctx,ra,true)->id,hl_type_size(rb->t),sizeof(varray)), rrb, rb->size);
-				}
-			}
-			break;
-		case OArraySize:
-			{
-				op32(ctx,MOV,alloc_cpu(ctx,dst,false),pmem(&p,alloc_cpu(ctx,ra,true)->id,ra->t->kind == HABSTRACT ? HL_WSIZE + 4 : HL_WSIZE*2));
-				store(ctx,dst,dst->current,false);
-			}
-			break;
-		case ORef:
-			{
-				scratch(ra->current);
-				op64(ctx,MOV,alloc_cpu(ctx,dst,false),REG_AT(Ebp));
-				if( ra->stackPos < 0 )
-					op64(ctx,SUB,dst->current,pconst(&p,-ra->stackPos));
-				else
-					op64(ctx,ADD,dst->current,pconst(&p,ra->stackPos));
-				store(ctx,dst,dst->current,false);
-			}
-			break;
-		case OUnref:
-			copy_to(ctx,dst,pmem(&p,alloc_cpu(ctx,ra,true)->id,0));
-			break;
-		case OSetref:
-			copy_from(ctx,pmem(&p,alloc_cpu(ctx,dst,true)->id,0),ra);
-			break;
-		case ORefData:
-			switch( ra->t->kind ) {
-			case HARRAY:
-				{
-					preg *r = fetch(ra);
-					preg *d = alloc_cpu(ctx,dst,false);
-					op64(ctx,MOV,d,r);
-					op64(ctx,ADD,d,pconst(&p,sizeof(varray)));
-					store(ctx,dst,dst->current,false);
-				}
-				break;
-			default:
-				ASSERT(ra->t->kind);
-			}
-			break;
-		case ORefOffset:
-			{
-				preg *d = alloc_cpu(ctx,rb,true);
-				preg *r2 = alloc_cpu(ctx,dst,false);
-				preg *r = fetch(ra);
-				int size = hl_type_size(dst->t->tparam);
-				op64(ctx,MOV,r2,r);
-				switch( size ) {
-				case 1:
-					break;
-				case 2:
-					op64(ctx,SHL,d,pconst(&p,1));
-					break;
-				case 4:
-					op64(ctx,SHL,d,pconst(&p,2));
-					break;
-				case 8:
-					op64(ctx,SHL,d,pconst(&p,3));
-					break;
-				default:
-					op64(ctx,IMUL,d,pconst(&p,size));
-					break;
-				}
-				op64(ctx,ADD,r2,d);
-				scratch(d);
-				store(ctx,dst,dst->current,false);
-			}
-			break;
-		case OToVirtual:
-			{
-#				ifdef HL_64
-				int size = pad_before_call(ctx, 0);
-				op64(ctx,MOV,REG_AT(CALL_REGS[1]),fetch(ra));
-				op64(ctx,MOV,REG_AT(CALL_REGS[0]),pconst64(&p,(int_val)dst->t));
-#				else
-				int size = pad_before_call(ctx, HL_WSIZE*2);
-				op32(ctx,PUSH,fetch(ra),UNUSED);
-				op32(ctx,PUSH,pconst(&p,(int)(int_val)dst->t),UNUSED);
-#				endif
-				if( ra->t->kind == HOBJ ) hl_get_obj_rt(ra->t); // ensure it's initialized
-				call_native(ctx,hl_to_virtual,size);
-				store(ctx,dst,PEAX,true);
-			}
-			break;
-		case OMakeEnum:
-			{
-				hl_enum_construct *c = &dst->t->tenum->constructs[o->p2];
-				int_val args[] = { (int_val)dst->t, o->p2 };
-				int i;
-				call_native_consts(ctx, hl_alloc_enum, args, 2);
-				RLOCK(PEAX);
-				for(i=0;i<c->nparams;i++) {
-					preg *r = fetch(R(o->extra[i]));
-					copy(ctx, pmem(&p,Eax,c->offsets[i]),r, R(o->extra[i])->size);
-					RUNLOCK(fetch(R(o->extra[i])));
-					if ((i & 15) == 0) jit_buf(ctx);
-				}
-				store(ctx, dst, PEAX, true);
-			}
-			break;
-		case OEnumAlloc:
-			{
-				int_val args[] = { (int_val)dst->t, o->p2 };
-				call_native_consts(ctx, hl_alloc_enum, args, 2);
-				store(ctx, dst, PEAX, true);
-			}
-			break;
-		case OEnumField:
-			{
-				hl_enum_construct *c = &ra->t->tenum->constructs[o->p3];
-				preg *r = alloc_cpu(ctx,ra,true);
-				copy_to(ctx,dst,pmem(&p,r->id,c->offsets[(int)(int_val)o->extra]));
-			}
-			break;
-		case OSetEnumField:
-			{
-				hl_enum_construct *c = &dst->t->tenum->constructs[0];
-				preg *r = alloc_cpu(ctx,dst,true);
-				switch( rb->t->kind ) {
-				case HF64:
-					{
-						preg *d = alloc_fpu(ctx,rb,true);
-						copy(ctx,pmem(&p,r->id,c->offsets[o->p2]),d,8);
-						break;
-					}
-				default:
-					copy(ctx,pmem(&p,r->id,c->offsets[o->p2]),alloc_cpu(ctx,rb,true),hl_type_size(c->params[o->p2]));
-					break;
-				}
-			}
-			break;
-		case ONullCheck:
-			{
-				int jz;
-				preg *r = alloc_cpu(ctx,dst,true);
-				op64(ctx,TEST,r,r);
-				XJump_small(JNotZero,jz);
-
-				hl_opcode *next = f->ops + opCount + 1;
-				bool null_field_access = false;
-				int hashed_name = 0;
-				// skip const and operation between nullcheck and access
-				while( (next < f->ops + f->nops - 1) && (next->op >= OInt && next->op <= ODecr) ) {
-					next++;
-				}
-				if( (next->op == OField && next->p2 == o->p1) || (next->op == OSetField && next->p1 == o->p1) ) {
-					int fid = next->op == OField ? next->p3 : next->p2;
-					hl_obj_field *f = NULL;
-					if( dst->t->kind == HOBJ || dst->t->kind == HSTRUCT )
-						f = hl_obj_field_fetch(dst->t, fid);
-					else if( dst->t->kind == HVIRTUAL )
-						f = dst->t->virt->fields + fid;
-					if( f == NULL ) ASSERT(dst->t->kind);
-					null_field_access = true;
-					hashed_name = f->hashed_name;
-				} else if( (next->op >= OCall1 && next->op <= OCallN) && next->p3 == o->p1 ) {
-					int fid = next->p2 < 0 ? -1 : ctx->m->functions_indexes[next->p2];
-					hl_function *cf = ctx->m->code->functions + fid;
-					const uchar *name = fun_field_name(cf);
-					null_field_access = true;
-					hashed_name = hl_hash_gen(name, true);
-				}
-
-				if( null_field_access ) {
-					pad_before_call(ctx, HL_WSIZE);
-					if( hashed_name >= 0 && hashed_name < 256 )
-						op64(ctx,PUSH8,pconst(&p,hashed_name),UNUSED);
-					else
-						op32(ctx,PUSH,pconst(&p,hashed_name),UNUSED);
-				} else {
-					pad_before_call(ctx, 0);
-				}
-
-				jlist *j = (jlist*)hl_malloc(&ctx->galloc,sizeof(jlist));
-				j->pos = BUF_POS();
-				j->target = null_field_access ? -3 : -1;
-				j->next = ctx->calls;
-				ctx->calls = j;
-
-				op64(ctx,MOV,PEAX,pconst64(&p,RESERVE_ADDRESS));
-				op_call(ctx,PEAX,-1);
-				patch_jump(ctx,jz);
-			}
-			break;
-		case OSafeCast:
-			make_dyn_cast(ctx, dst, ra);
-			break;
-		case ODynGet:
-			{
-				int size;
-#				ifdef HL_64
-				if( IS_FLOAT(dst) || dst->t->kind == HI64 ) {
-					size = begin_native_call(ctx,2);
-				} else {
-					size = begin_native_call(ctx,3);
-					set_native_arg(ctx,pconst64(&p,(int_val)dst->t));
-				}
-				set_native_arg(ctx,pconst64(&p,(int_val)hl_hash_utf8(m->code->strings[o->p3])));
-				set_native_arg(ctx,fetch(ra));
-#				else
-				preg *r;
-				r = alloc_reg(ctx,RCPU);
-				if( IS_FLOAT(dst) || dst->t->kind == HI64 ) {
-					size = pad_before_call(ctx,HL_WSIZE*2);
-				} else {
-					size = pad_before_call(ctx,HL_WSIZE*3);
-					op64(ctx,MOV,r,pconst64(&p,(int_val)dst->t));
-					op64(ctx,PUSH,r,UNUSED);
-				}
-				op64(ctx,MOV,r,pconst64(&p,(int_val)hl_hash_utf8(m->code->strings[o->p3])));
-				op64(ctx,PUSH,r,UNUSED);
-				op64(ctx,PUSH,fetch(ra),UNUSED);
-#				endif
-				call_native(ctx,get_dynget(dst->t),size);
-				store_result(ctx,dst);
-			}
-			break;
-		case ODynSet:
-			{
-				int size;
-#				ifdef HL_64
-				switch( rb->t->kind ) {
-				case HF32:
-				case HF64:
-					size = begin_native_call(ctx, 3);
-					set_native_arg_fpu(ctx,fetch(rb),rb->t->kind == HF32);
-					set_native_arg(ctx,pconst64(&p,hl_hash_gen(hl_get_ustring(m->code,o->p2),true)));
-					set_native_arg(ctx,fetch(dst));
-					call_native(ctx,get_dynset(rb->t),size);
-					break;
-				case HI64:
-				case HGUID:
-					size = begin_native_call(ctx, 3);
-					set_native_arg(ctx,fetch(rb));
-					set_native_arg(ctx,pconst64(&p,hl_hash_gen(hl_get_ustring(m->code,o->p2),true)));
-					set_native_arg(ctx,fetch(dst));
-					call_native(ctx,get_dynset(rb->t),size);
-					break;
-				default:
-					size = begin_native_call(ctx,4);
-					set_native_arg(ctx,fetch(rb));
-					set_native_arg(ctx,pconst64(&p,(int_val)rb->t));
-					set_native_arg(ctx,pconst64(&p,hl_hash_gen(hl_get_ustring(m->code,o->p2),true)));
-					set_native_arg(ctx,fetch(dst));
-					call_native(ctx,get_dynset(rb->t),size);
-					break;
-				}
-#				else
-				switch( rb->t->kind ) {
-				case HF32:
-					size = pad_before_call(ctx, HL_WSIZE*2 + sizeof(float));
-					push_reg(ctx,rb);
-					op32(ctx,PUSH,pconst64(&p,hl_hash_gen(hl_get_ustring(m->code,o->p2),true)),UNUSED);
-					op32(ctx,PUSH,fetch(dst),UNUSED);
-					call_native(ctx,get_dynset(rb->t),size);
-					break;
-				case HF64:
-				case HI64:
-				case HGUID:
-					size = pad_before_call(ctx, HL_WSIZE*2 + sizeof(double));
-					push_reg(ctx,rb);
-					op32(ctx,PUSH,pconst64(&p,hl_hash_gen(hl_get_ustring(m->code,o->p2),true)),UNUSED);
-					op32(ctx,PUSH,fetch(dst),UNUSED);
-					call_native(ctx,get_dynset(rb->t),size);
-					break;
-				default:
-					size = pad_before_call(ctx, HL_WSIZE*4);
-					op32(ctx,PUSH,fetch32(ctx,rb),UNUSED);
-					op32(ctx,PUSH,pconst64(&p,(int_val)rb->t),UNUSED);
-					op32(ctx,PUSH,pconst64(&p,hl_hash_gen(hl_get_ustring(m->code,o->p2),true)),UNUSED);
-					op32(ctx,PUSH,fetch(dst),UNUSED);
-					call_native(ctx,get_dynset(rb->t),size);
-					break;
-				}
-#				endif
-			}
-			break;
-		case OTrap:
-			{
-				int size, jenter, jtrap;
-				int offset = 0;
-				int trap_size = (sizeof(hl_trap_ctx) + 15) & 0xFFF0;
-				hl_trap_ctx *t = NULL;
-#				ifndef HL_THREADS
-				if( tinf == NULL ) tinf = hl_get_thread(); // single thread
-#				endif
-
-#				ifdef HL_64
-				preg *trap = REG_AT(CALL_REGS[0]);
-#				else
-				preg *trap = PEAX;
-#				endif
-				RLOCK(trap);
-
-				preg *treg = alloc_reg(ctx, RCPU);
-				if( !tinf ) {
-					call_native(ctx, hl_get_thread, 0);
-					op64(ctx,MOV,treg,PEAX);
-					offset = (int)(int_val)&tinf->trap_current;
-				} else {
-					offset = 0;
-					op64(ctx,MOV,treg,pconst64(&p,(int_val)&tinf->trap_current));
-				}
-				op64(ctx,MOV,trap,pmem(&p,treg->id,offset));
-				op64(ctx,SUB,PESP,pconst(&p,trap_size));
-				op64(ctx,MOV,pmem(&p,Esp,(int)(int_val)&t->prev),trap);
-				op64(ctx,MOV,trap,PESP);
-				op64(ctx,MOV,pmem(&p,treg->id,offset),trap);
-
-				/*
-					trap E,@catch
-					catch g
-					catch g2
-					...
-					@:catch
-
-					// Before haxe 5
-					This is a bit hackshish : we want to detect the type of exception filtered by the catch so we check the following
-					sequence of HL opcodes:
-
-					trap E,@catch
-					...
-					@catch:
-					global R, _
-					call _, ???(R,E)
-
-					??? is expected to be hl.BaseType.check
-				*/
-				hl_opcode *cat = f->ops + opCount + 1;
-				hl_opcode *next = f->ops + opCount + 1 + o->p2;
-				hl_opcode *next2 = f->ops + opCount + 2 + o->p2;
-				if( cat->op == OCatch || (next->op == OGetGlobal && next2->op == OCall2 && next2->p3 == next->p1 && dst->stack.id == (int)(int_val)next2->extra) ) {
-					int gindex = cat->op == OCatch ? cat->p1 : next->p2;
-					hl_type *gt = m->code->globals[gindex];
-					while( gt->kind == HOBJ && gt->obj->super ) gt = gt->obj->super;
-					if( gt->kind == HOBJ && gt->obj->nfields && gt->obj->fields[0].t->kind == HTYPE ) {
-						void *addr = m->globals_data + m->globals_indexes[gindex];
-#						ifdef HL_64
-						op64(ctx,MOV,treg,pconst64(&p,(int_val)addr));
-						op64(ctx,MOV,treg,pmem(&p,treg->id,0));
-#						else
-						op64(ctx,MOV,treg,paddr(&p,addr));
-#						endif
-					} else
-						op64(ctx,MOV,treg,pconst(&p,0));
-				} else {
-					op64(ctx,MOV,treg,pconst(&p,0));
-				}
-				op64(ctx,MOV,pmem(&p,Esp,(int)(int_val)&t->tcheck),treg);
-
-				// On Win64 setjmp actually takes two arguments
-				// the jump buffer and the frame pointer (or the stack pointer if there is no FP)
-#if defined(HL_WIN) && defined(HL_64)
-				size = begin_native_call(ctx, 2);
-				set_native_arg(ctx, REG_AT(Ebp));
-#else
-				size = begin_native_call(ctx, 1);
-#endif
-				set_native_arg(ctx,trap);
-#ifdef HL_MINGW
-				call_native(ctx,_setjmp,size);
-#else
-				call_native(ctx,setjmp,size);
-#endif
-				op64(ctx,TEST,PEAX,PEAX);
-				XJump_small(JZero,jenter);
-				op64(ctx,ADD,PESP,pconst(&p,trap_size));
-				if( !tinf ) {
-					call_native(ctx, hl_get_thread, 0);
-					op64(ctx,MOV,PEAX,pmem(&p, Eax, (int)(int_val)&tinf->exc_value));
-				} else {
-					op64(ctx,MOV,PEAX,pconst64(&p,(int_val)&tinf->exc_value));
-					op64(ctx,MOV,PEAX,pmem(&p, Eax, 0));
-				}
-				store(ctx,dst,PEAX,false);
-
-				jtrap = do_jump(ctx,OJAlways,false);
-				register_jump(ctx,jtrap,(opCount + 1) + o->p2);
-				patch_jump(ctx,jenter);
-			}
-			break;
-		case OEndTrap:
-			{
-				int trap_size = (sizeof(hl_trap_ctx) + 15) & 0xFFF0;
-				hl_trap_ctx *tmp = NULL;
-				preg *addr,*r;
-				int offset;
-				if (!tinf) {
-					call_native(ctx, hl_get_thread, 0);
-					addr = PEAX;
-					RLOCK(addr);
-					offset = (int)(int_val)&tinf->trap_current;
-				} else {
-					offset = 0;
-					addr = alloc_reg(ctx, RCPU);
-					op64(ctx, MOV, addr, pconst64(&p, (int_val)&tinf->trap_current));
-				}
-				r = alloc_reg(ctx, RCPU);
-				op64(ctx, MOV, r, pmem(&p,addr->id,offset));
-				op64(ctx, MOV, r, pmem(&p,r->id,(int)(int_val)&tmp->prev));
-				op64(ctx, MOV, pmem(&p,addr->id, offset), r);
-#				ifdef HL_WIN
-				// erase eip (prevent false positive)
-				{
-					_JUMP_BUFFER *b = NULL;
-#					ifdef HL_64
-					op64(ctx,MOV,pmem(&p,Esp,(int)(int_val)&(b->Rip)),PEAX);
-#					else
-					op64(ctx,MOV,pmem(&p,Esp,(int)&(b->Eip)),PEAX);
-#					endif
-				}
-#				endif
-				op64(ctx,ADD,PESP,pconst(&p,trap_size));
-			}
-			break;
-		case OEnumIndex:
-			{
-				preg *r = alloc_reg(ctx,RCPU);
-				op64(ctx,MOV,r,pmem(&p,alloc_cpu(ctx,ra,true)->id,HL_WSIZE));
-				store(ctx,dst,r,true);
-				break;
-			}
-			break;
-		case OSwitch:
-			{
-				int jdefault;
-				int i;
-				preg *r = alloc_cpu(ctx, dst, true);
-				preg *r2 = alloc_reg(ctx, RCPU);
-				op32(ctx, CMP, r, pconst(&p,o->p2));
-				XJump(JUGte,jdefault);
-				// r2 = r * 5 + eip
-#				ifdef HL_64
-				op64(ctx, XOR, r2, r2);
-#				endif
-				op32(ctx, MOV, r2, r);
-				op32(ctx, SHL, r2, pconst(&p,2));
-				op32(ctx, ADD, r2, r);
-#				ifdef HL_64
-				preg *tmp = alloc_reg(ctx, RCPU);
-				op64(ctx, MOV, tmp, pconst64(&p,RESERVE_ADDRESS));
-#				else
-				op64(ctx, ADD, r2, pconst64(&p,RESERVE_ADDRESS));
-#				endif
-				{
-					jlist *s = (jlist*)hl_malloc(&ctx->galloc, sizeof(jlist));
-					s->pos = BUF_POS() - sizeof(void*);
-					s->next = ctx->switchs;
-					ctx->switchs = s;
-				}
-#				ifdef HL_64
-				op64(ctx, ADD, r2, tmp);
-#				endif
-				op64(ctx, JMP, r2, UNUSED);
-				for(i=0;i<o->p2;i++) {
-					int j = do_jump(ctx,OJAlways,false);
-					register_jump(ctx,j,(opCount + 1) + o->extra[i]);
-					if( (i & 15) == 0 ) jit_buf(ctx);
-				}
-				patch_jump(ctx, jdefault);
-			}
-			break;
-		case OGetTID:
-			op32(ctx, MOV, alloc_cpu(ctx,dst,false), pmem(&p,alloc_cpu(ctx,ra,true)->id,0));
-			store(ctx,dst,dst->current,false);
-			break;
-		case OAssert:
-			{
-				pad_before_call(ctx, 0);
-				jlist *j = (jlist*)hl_malloc(&ctx->galloc,sizeof(jlist));
-				j->pos = BUF_POS();
-				j->target = -2;
-				j->next = ctx->calls;
-				ctx->calls = j;
-
-				op64(ctx,MOV,PEAX,pconst64(&p,RESERVE_ADDRESS));
-				op_call(ctx,PEAX,-1);
-			}
-			break;
-		case ONop:
-			break;
-		case OPrefetch:
-			{
-				preg *r = alloc_cpu(ctx, dst, true);
-				if( o->p2 > 0 ) {
-					switch( dst->t->kind ) {
-					case HOBJ:
-					case HSTRUCT:
-						{
-							hl_runtime_obj *rt = hl_get_obj_rt(dst->t);
-							preg *r2 = alloc_reg(ctx, RCPU);
-							op64(ctx, LEA, r2, pmem(&p, r->id, rt->fields_indexes[o->p2-1]));
-							r = r2;
-						}
-						break;
-					default:
-						ASSERT(dst->t->kind);
-						break;
-					}
-				}
-				switch( o->p3 ) {
-				case 0:
-					op64(ctx, PREFETCHT0, pmem(&p,r->id,0), UNUSED);
-					break;
-				case 1:
-					op64(ctx, PREFETCHT1, pmem(&p,r->id,0), UNUSED);
-					break;
-				case 2:
-					op64(ctx, PREFETCHT2, pmem(&p,r->id,0), UNUSED);
-					break;
-				case 3:
-					op64(ctx, PREFETCHNTA, pmem(&p,r->id,0), UNUSED);
-					break;
-				case 4:
-					op64(ctx, PREFETCHW, pmem(&p,r->id,0), UNUSED);
-					break;
-				default:
-					ASSERT(o->p3);
-					break;
-				}
-			}
-			break;
-		case OAsm:
-			{
-				switch( o->p1 ) {
-				case 0: // byte output
-					B(o->p2);
-					break;
-				case 1: // scratch cpu reg
-					scratch(REG_AT(o->p2));
-					break;
-				case 2: // read vm reg
-					rb--;
-					copy(ctx, REG_AT(o->p2), &rb->stack, rb->size);
-					scratch(REG_AT(o->p2));
-					break;
-				case 3: // write vm reg
-					rb--;
-					copy(ctx, &rb->stack, REG_AT(o->p2), rb->size);
-					scratch(rb->current);
-					break;
-				case 4:
-					if( ctx->totalRegsSize != 0 )
-						hl_fatal("Asm naked function should not have local variables");
-					if( opCount != 0 )
-						hl_fatal("Asm naked function should be on first opcode");
-					ctx->buf.b -= BUF_POS() - ctx->functionPos; // reset to our function start
-					break;
-				default:
-					ASSERT(o->p1);
-					break;
-				}
-			}
-			break;
-		case OCatch:
-			// Only used by OTrap typing
-			break;
-		default:
-			jit_error(hl_op_name(o->op));
-			break;
-		}
-		// we are landing at this position, assume we have lost our registers
-		if( ctx->opsPos[opCount+1] == -1 )
-			discard_regs(ctx,true);
-		ctx->opsPos[opCount+1] = BUF_POS();
-
-		// write debug infos
-		size = BUF_POS() - codePos;
-		if( debug16 && size > 0xFF00 ) {
-			debug32 = malloc(sizeof(int) * (f->nops + 1));
-			for(i=0;i<ctx->currentPos;i++)
-				debug32[i] = debug16[i];
-			free(debug16);
-			debug16 = NULL;
-		}
-		if( debug16 ) debug16[ctx->currentPos] = (unsigned short)size; else if( debug32 ) debug32[ctx->currentPos] = size;
-
-	}
-	// patch jumps
-	{
-		jlist *j = ctx->jumps;
-		while( j ) {
-			*(int*)(ctx->startBuf + j->pos) = ctx->opsPos[j->target] - (j->pos + 4);
-			j = j->next;
-		}
-		ctx->jumps = NULL;
-	}
-	int codeEndPos = BUF_POS();
-	// add nops padding
-	jit_nops(ctx);
-	// clear regs
-	for(i=0;i<REG_COUNT;i++) {
-		preg *r = REG_AT(i);
-		r->holds = NULL;
-		r->lock = 0;
-	}
-	// save debug infos
-	if( ctx->debug ) {
-		int fid = (int)(f - m->code->functions);
-		ctx->debug[fid].start = codePos;
-		ctx->debug[fid].offsets = debug32 ? (void*)debug32 : (void*)debug16;
-		ctx->debug[fid].large = debug32 != NULL;
-	}
-	// unwind info
-#ifdef WIN64_UNWIND_TABLES
-	int uw_idx = ctx->nunwind++;
-	ctx->unwind_table[uw_idx].BeginAddress = codePos;
-	ctx->unwind_table[uw_idx].EndAddress = codeEndPos;
-	ctx->unwind_table[uw_idx].UnwindData = ctx->unwind_offset;
-#endif
-	// reset tmp allocator
-	hl_free(&ctx->falloc);
-	return codePos;
-}
-
-static void *get_wrapper( hl_type *t ) {
-	return call_jit_hl2c;
-}
-
-void hl_jit_patch_method( void *old_fun, void **new_fun_table ) {
-	// mov eax, addr
-	// jmp [eax]
-	unsigned char *b = (unsigned char*)old_fun;
-	unsigned long long addr = (unsigned long long)(int_val)new_fun_table;
-#	ifdef HL_64
-	*b++ = 0x48;
-	*b++ = 0xB8;
-	*b++ = (unsigned char)addr;
-	*b++ = (unsigned char)(addr>>8);
-	*b++ = (unsigned char)(addr>>16);
-	*b++ = (unsigned char)(addr>>24);
-	*b++ = (unsigned char)(addr>>32);
-	*b++ = (unsigned char)(addr>>40);
-	*b++ = (unsigned char)(addr>>48);
-	*b++ = (unsigned char)(addr>>56);
-#	else
-	*b++ = 0xB8;
-	*b++ = (unsigned char)addr;
-	*b++ = (unsigned char)(addr>>8);
-	*b++ = (unsigned char)(addr>>16);
-	*b++ = (unsigned char)(addr>>24);
-#	endif
-	*b++ = 0xFF;
-	*b++ = 0x20;
-}
-
-static void missing_closure() {
-	hl_error("Missing static closure");
-}
-
-void *hl_jit_code( jit_ctx *ctx, hl_module *m, int *codesize, hl_debug_infos **debug, hl_module *previous ) {
-	jlist *c;
-	int size = BUF_POS();
-	unsigned char *code;
-	if( size & 4095 ) size += 4096 - (size&4095);
-	code = (unsigned char*)hl_alloc_executable_memory(size);
-	if( code == NULL ) return NULL;
-	memcpy(code,ctx->startBuf,BUF_POS());
-	*codesize = size;
-	*debug = ctx->debug;
-	if( !call_jit_c2hl ) {
-		call_jit_c2hl = code + ctx->c2hl;
-		call_jit_hl2c = code + ctx->hl2c;
-		hl_setup.get_wrapper = get_wrapper;
-		hl_setup.static_call = callback_c2hl;
-		hl_setup.static_call_ref = true;
-#		ifdef JIT_CUSTOM_LONGJUMP
-		hl_setup.throw_jump = (void(*)(jmp_buf, int))(code + ctx->longjump);
-#		endif
-	}
-#ifdef WIN64_UNWIND_TABLES
-	m->unwind_table = ctx->unwind_table;
-	RtlAddFunctionTable(m->unwind_table, ctx->nunwind, (DWORD64)code);
-#endif
-	if( !ctx->static_function_offset ) {
-		int i;
-		ctx->static_function_offset = true;
-		for(i=0;i<(int)(sizeof(ctx->static_functions)/sizeof(void*));i++)
-			ctx->static_functions[i] = (void*)(code + (int)(int_val)ctx->static_functions[i]);
-	}
-	// patch calls
-	c = ctx->calls;
-	while( c ) {
-		void *fabs;
-		if( c->target < 0 )
-			fabs = ctx->static_functions[-c->target-1];
-		else {
-			fabs = m->functions_ptrs[c->target];
-			if( fabs == NULL ) {
-				// read absolute address from previous module
-				int old_idx = m->hash->functions_hashes[m->functions_indexes[c->target]];
-				if( old_idx < 0 )
-					return NULL;
-				fabs = previous->functions_ptrs[(previous->code->functions + old_idx)->findex];
-			} else {
-				// relative
-				fabs = (unsigned char*)code + (int)(int_val)fabs;
-			}
-		}
-		if( (code[c->pos]&~3) == (IS_64?0x48:0xB8) || code[c->pos] == 0x68 ) // MOV : absolute | PUSH
-			*(void**)(code + c->pos + (IS_64?2:1)) = fabs;
-		else {
-			int_val delta = (int_val)fabs - (int_val)code - (c->pos + 5);
-			int rpos = (int)delta;
-			if( (int_val)rpos != delta ) {
-				printf("Target code too far too rebase\n");
-				return NULL;
-			}
-			*(int*)(code + c->pos + 1) = rpos;
-		}
-		c = c->next;
-	}
-	// patch switchs
-	c = ctx->switchs;
-	while( c ) {
-		*(void**)(code + c->pos) = code + c->pos + (IS_64 ? 14 : 6);
-		c = c->next;
-	}
-	// patch closures
-	{
-		vclosure *c = ctx->closure_list;
-		while( c ) {
-			vclosure *next;
-			int fidx = (int)(int_val)c->fun;
-			void *fabs = m->functions_ptrs[fidx];
-			if( fabs == NULL ) {
-				// read absolute address from previous module
-				int old_idx = m->hash->functions_hashes[m->functions_indexes[fidx]];
-				if( old_idx < 0 )
-					fabs = missing_closure;
-				else
-					fabs = previous->functions_ptrs[(previous->code->functions + old_idx)->findex];
-			} else {
-				// relative
-				fabs = (unsigned char*)code + (int)(int_val)fabs;
-			}
-			c->fun = fabs;
-			next = (vclosure*)c->value;
-			c->value = NULL;
-			c = next;
-		}
-	}
-	return code;
-}
-
diff --git a/src/jit.h b/src/jit.h
new file mode 100644
index 000000000..69c609547
--- /dev/null
+++ b/src/jit.h
@@ -0,0 +1,302 @@
+/*
+ * Copyright (C)2005-2016 Haxe Foundation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+#ifndef JIT_H
+#define JIT_H
+
+#include <hlmodule.h>
+#include <math.h>
+
+typedef enum {
+	LOAD_ADDR,
+	LOAD_CONST,
+	LOAD_ARG,
+	LOAD_FUN,
+	STORE,
+	LEA,
+	TEST,
+	CMP,
+	JCOND,
+	JUMP,
+	JUMP_TABLE,
+	BINOP,
+	UNOP,
+	CONV,
+	CONV_UNSIGNED,
+	RET,
+	CALL_PTR,
+	CALL_REG,
+	CALL_FUN,
+	MOV,
+	CMOV,
+	XCHG,
+	CXCHG,
+	PUSH_CONST,
+	PUSH,
+	POP,
+	ALLOC_STACK,
+	PREFETCH,
+	DEBUG_BREAK,
+	BLOCK,
+	ENTER,
+	STACK_OFFS,
+	CATCH,
+	ADDRESS,
+	NOP,
+} emit_op;
+
+typedef enum {
+	M_NONE,
+	M_UI8,
+	M_UI16,
+	M_I32,
+	M_PTR,
+	M_F64,
+	M_F32,
+	M_VOID,
+	M_NORET,
+} emit_mode;
+
+typedef int ereg;
+
+typedef struct {
+	union {
+		struct {
+			unsigned char op;
+			unsigned char mode;
+			unsigned char nargs;
+			unsigned char _unused;
+		};
+		int header;
+	};
+	int size_offs;
+	union {
+		struct {
+			ereg a;
+			ereg b;
+		};
+		uint64 value;
+	};
+} einstr;
+
+typedef enum {
+	R_VALUE			= 0,
+	R_REG			= 0x40000000,
+	R_REG_PTR		= 0x50000000,
+	R_CONST			= 0x60000000,
+	R_PHI			= 0x70000000,
+} rkind;
+
+// reg representation is :
+// higher bits
+// 0000 = positive value (for IR only VXXX)
+// X100 = native register, lower 7 bits is the register, bits 7-27 (counting from 0) hold the signed offset (21 bits)
+// X101 = same as above, but indirect address
+// X110 = small constant value stored in offset
+// 1111 = negative value (for IR phi  PXXX)
+// 10XX = unused
+
+#define STACK_REG	5
+
+#define UNUSED						((ereg)0)
+#define MK_REG(v,kind)				(((v)&0x7F) | (kind))
+#define MK_REG_VAL(v,kind,val)		(MK_REG(v,kind) | (((val) << 7)&0x8FFFFF80))
+
+#define REG_KIND(r)		((r)&0x70000000)
+#define REG_REG(r)		((r)&0x7F)
+#define REG_VALUE(r)	(((int)(((r) & 0x8000000) ? ((r) | 0xF0000000) : ((r)&0x0FFFFFFF)))>>7)
+#define REG_PTR(r)		_reg_chk(r,R_REG,(r)|R_REG_PTR)
+#define REG_ADD_OFFSET(r,offs) _reg_chk(r,R_REG_PTR,MK_REG_VAL(r,REG_KIND(r),REG_VALUE(r)+(offs)))
+#define REG_IS_VAL(r)		(REG_KIND(r) == R_VALUE || REG_KIND(r) == R_PHI)
+
+#define IS_NULL(r)			((r) == 0)
+#define IS_REG(r)			(REG_KIND(r) == R_REG)
+#define MK_STACK_REG(v)		MK_REG_VAL(STACK_REG,R_REG_PTR,v)
+#define MK_STACK_OFFS(v)	MK_REG_VAL(STACK_REG,R_REG,v)
+#define MK_CONST(v)			MK_REG_VAL(0,R_CONST,v)
+#define MK_ADDR(reg,offs)	MK_REG_VAL(reg,R_REG_PTR,offs)
+
+#define IS_CALL(op)	((op) == CALL_PTR || (op) == CALL_REG || (op) == CALL_FUN)
+#define IS_FLOAT(mode)	((mode) == M_F64 || (mode) == M_F32)
+
+#define MAX_ARGS	16
+
+#if defined(HL_WIN_CALL) && defined(HL_64)
+#	define IS_WINCALL64 1
+#else
+#	define IS_WINCALL64 0
+#endif
+
+typedef struct {
+	int *data;
+	int max;
+	int cur;
+} int_alloc;
+
+typedef struct _ephi ephi;
+
+struct _ephi {
+	ereg value;
+	int nvalues;
+	emit_mode mode;
+	ereg *values;
+	int *blocks;
+};
+
+typedef struct _eblock {
+	int start_pos;
+	int end_pos;
+	int next_count;
+	int pred_count;
+	int phi_count;
+	int *nexts;
+	int *preds;
+	ephi *phis;
+} eblock;
+
+typedef struct _emit_ctx emit_ctx;
+typedef struct _regs_ctx regs_ctx;
+typedef struct _code_ctx code_ctx;
+typedef struct _jit_ctx jit_ctx;
+
+typedef struct {
+	int nscratchs;
+	int npersists;
+	int nargs;
+	ereg ret;
+	ereg *scratch;
+	ereg *persist;
+	ereg *arg;
+} reg_config;
+
+typedef struct {
+	reg_config regs;
+	reg_config floats;
+	ereg stack_reg;
+	ereg stack_pos;
+	int stack_align;
+	// Minimum bytes consumed by each stack argument. Defaults to HL_WSIZE
+	// when 0. Backends like AArch64 set this to 16 because each PUSH must
+	// move SP by 16 bytes to keep SP 16-byte aligned (any [SP, ...] access
+	// with a misaligned SP traps under EL0).
+	int stack_arg_size;
+	int debug_prefix_size;
+	ereg req_bit_shifts;
+	ereg req_div_a;
+	ereg req_div_b;
+} regs_config;
+
+typedef struct {
+	int c2hl;
+	int hl2c;
+} jit_special_funs;
+
+struct _jit_ctx {
+	hl_module *mod;
+	hl_function *fun;
+	hl_alloc falloc;
+	hl_alloc galloc;
+	emit_ctx *emit;
+	regs_ctx *regs;
+	code_ctx *code;
+	regs_config cfg;
+	// emit output
+	int instr_count;
+	int block_count;
+	int value_count;
+	int phi_count;
+	einstr *instrs;
+	eblock *blocks;
+	int *values_writes;
+	int *emit_pos_map;
+	// regs output
+	int reg_instr_count;
+	einstr *reg_instrs;
+	ereg *reg_writes;
+	int *reg_pos_map;
+	// codegen output
+	int code_size;
+	unsigned char *code_instrs;
+	int *code_pos_map;
+	jit_special_funs code_funs;
+	// accum output
+	int fdef_index;
+	int out_pos;
+	int out_max;
+	unsigned char *output;
+	unsigned char *final_code;
+};
+
+jit_ctx *hl_jit_alloc();
+void hl_jit_free( jit_ctx *ctx, h_bool can_reset );
+void hl_jit_reset( jit_ctx *ctx, hl_module *m );
+void hl_jit_init( jit_ctx *ctx, hl_module *m );
+int hl_jit_function( jit_ctx *ctx, hl_module *m, hl_function *f );
+void hl_jit_define_function( jit_ctx *ctx, int start, int size );
+
+void hl_jit_null_field_access( int fhash );
+void hl_jit_assert();
+void *hl_jit_wrapper_ptr( vclosure_wrapper *c, char *stack_args, void **regs );
+double hl_jit_wrapper_d( vclosure_wrapper *c, char *stack_args, void **regs );
+
+// emit & dump
+void hl_emit_dump( jit_ctx *ctx );
+const char *hl_emit_regstr( ereg v, emit_mode m );
+void hl_emit_store_args( emit_ctx *ctx, einstr *e, ereg *args, int count );
+void hl_emit_remap_jumps( emit_ctx *ctx, void *jumps, einstr *instrs, int *pos_map );
+ereg *hl_emit_get_args( emit_ctx *ctx, einstr *e );
+ereg **hl_emit_get_regs( einstr *e, int *count );
+void hl_emit_reg_iter( jit_ctx *jit, einstr *e, void *ctx, void (*iter_reg)( void *, ereg * ) );
+extern int hl_emit_mode_sizes[];
+extern bool hl_jit_dump_bin;
+#define val_str(v,m) hl_emit_regstr(v,m)
+
+#ifdef HL_DEBUG
+#	define JIT_DEBUG
+#endif
+
+#define jit_error(msg)	{ hl_jit_error(msg,__func__,__LINE__); hl_debug_break(); exit(-1); }
+#define jit_assert()	jit_error("")
+
+#if defined(JIT_DEBUG)
+#	define jit_debug(...)	printf(__VA_ARGS__)
+#else
+#	define jit_debug(...)
+#endif
+
+#define DEF_ALLOC &ctx->jit->falloc
+
+#define jit_pad_size(size,k)	(((k) == 0) ? 0 : ((-(size)) & ((k) - 1))) // bytes to add to size to reach a multiple of k (k: 0 or a power of two)
+
+static void __ignore( void *value ) {}
+
+void hl_jit_error( const char *msg, const char *func, int line );
+
+void *hl_jit_code( jit_ctx *ctx, hl_module *m, int *codesize, hl_debug_infos **debug, hl_module *previous );
+void hl_jit_patch_method( void *old_fun, void **new_fun_table );
+
+static ereg _reg_chk( ereg r, rkind k, ereg ret ) {
+	if( REG_KIND(r) != k ) jit_assert();
+	return ret;
+}
+
+
+#endif
diff --git a/src/jit_aarch64.c b/src/jit_aarch64.c
new file mode 100644
index 000000000..397f67104
--- /dev/null
+++ b/src/jit_aarch64.c
@@ -0,0 +1,1999 @@
+/*
+ * Copyright (C)2015-2026 Haxe Foundation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * AArch64 JIT backend for the HL2 IR JIT.
+ *
+ * Phase 2 + 3: function shell + simple ops + arithmetic + memory + conversions.
+ * Calls/trampolines and the constant pool are still phase 4.
+ */
+
+#if !defined(__aarch64__) && !defined(_M_ARM64)
+#  error "This file is for AArch64 architecture only."
+#endif
+
+#include <hlmodule.h>
+#include <jit.h>
+#include "jit_aarch64_emit.h"
+#include <string.h>
+#include <stdio.h>
+
+#ifdef HL_DEBUG
+#	define GEN_DEBUG
+#endif
+
+// IR ereg encoding 5 is reserved (`STACK_REG` in jit.h) — the regs phase uses
+// it to label stack-bound vregs.  ARM hardware register X5 happens to use the
+// same hardware encoding, which would create a fatal aliasing if we exposed
+// X5 through the regs configuration as encoding 5.  Re-encode X5 as the
+// otherwise-unused IR slot 32; gpr_id maps it back to hardware X5 at emit
+// time.  (FP regs encode in the 64..127 range and have no such conflict.)
+#define X5_LOGICAL	32
+
+#define R(id)		MK_REG(id, R_REG)
+#define V(id)		MK_REG((id) + 64, R_REG)
+
+// ============================================================================
+// Register class declaration (AAPCS64, Linux + Apple)
+// ============================================================================
+
+void hl_jit_init_regs( regs_config *cfg ) {
+	// Fills `cfg` with the AArch64 register classes.  Integer registers first.
+	// X15/X16/X17 reserved as backend-private temporaries. X16/X17 are the
+	// linker IP0/IP1 (the Apple dynamic linker may clobber them at indirect
+	// branches). X15 (ARM_TMP3) is reserved as a third scratch for op
+	// handlers that need three independent temps at once — notably emit_store
+	// when both base and data are spilled (TMP1 holds base, TMP2 holds data,
+	// and emit_ld_st still needs a temp for the offset-register encoding).
+	// X18 reserved on Apple/Windows as platform register; conservatively skipped on Linux too.
+	// X29 = FP, X30 = LR, X31 = SP/XZR — special-purpose.
+	static int scratch_regs[] = {
+		R(X0), R(X1), R(X2), R(X3), R(X4), R(X5_LOGICAL), R(X6), R(X7),
+		R(X8), R(X9), R(X10), R(X11), R(X12), R(X13), R(X14)
+	};
+	static int persist_regs[] = {
+		R(X19), R(X20), R(X21), R(X22), R(X23),
+		R(X24), R(X25), R(X26), R(X27), R(X28)
+	};
+	static int arg_regs[] = {
+		R(X0), R(X1), R(X2), R(X3), R(X4), R(X5_LOGICAL), R(X6), R(X7)
+	};
+	cfg->regs.ret = scratch_regs[0];
+	cfg->regs.nscratchs = sizeof(scratch_regs) / sizeof(int);
+	cfg->regs.npersists = sizeof(persist_regs) / sizeof(int);
+	cfg->regs.nargs = sizeof(arg_regs) / sizeof(int);
+	cfg->regs.scratch = (ereg*)scratch_regs;
+	cfg->regs.persist = (ereg*)persist_regs;
+	cfg->regs.arg = (ereg*)arg_regs;
+
+	// Float registers (V0-V31; lower 64 bits of V8-V15 are callee-saved per AAPCS64).
+	static int float_scratch[] = {
+		V(0), V(1), V(2), V(3), V(4), V(5), V(6), V(7),
+		V(16), V(17), V(18), V(19), V(20), V(21), V(22), V(23),
+		V(24), V(25), V(26), V(27), V(28), V(29), V(30), V(31)
+	};
+	static int float_persist[] = {
+		V(8), V(9), V(10), V(11), V(12), V(13), V(14), V(15)
+	};
+	static int float_args[] = {
+		V(0), V(1), V(2), V(3), V(4), V(5), V(6), V(7)
+	};
+	cfg->floats.ret = float_scratch[0];
+	cfg->floats.nscratchs = sizeof(float_scratch) / sizeof(int);
+	cfg->floats.npersists = sizeof(float_persist) / sizeof(int);
+	cfg->floats.nargs = sizeof(float_args) / sizeof(int);
+	cfg->floats.scratch = (ereg*)float_scratch;
+	cfg->floats.persist = (ereg*)float_persist;
+	cfg->floats.arg = (ereg*)float_args;
+
+	// ARM has no register pinning constraints for shifts (LSLV/LSRV/ASRV accept
+	// any source) or division (SDIV/UDIV write any destination), so the three
+	// req_* fields are left at 0.
+	cfg->req_bit_shifts = 0;
+	cfg->req_div_a = 0;
+	cfg->req_div_b = 0;
+
+	cfg->stack_reg = R(SP_REG); // X31 (SP)
+	cfg->stack_pos = R(FP);     // X29
+	cfg->stack_align = 16;      // AAPCS64 mandates 16-byte alignment
+	// Each stack-passed arg consumes 16 bytes to keep SP 16-byte aligned —
+	// any [SP, ...] memory access with misaligned SP traps under EL0
+	// alignment enforcement on Linux/macOS. emit_push correspondingly moves
+	// SP by 16 per arg, so the IR's call-arg accounting matches.
+	cfg->stack_arg_size = 16;
+
+#ifdef GEN_DEBUG
+	cfg->debug_prefix_size = 4; // ARM instructions are fixed 4 bytes
+#endif
+}
+
+// ============================================================================
+// Disassembly helper
+// ============================================================================
+
+const char *hl_natreg_str( int reg, emit_mode m ) {
+	// The name is formatted into a static buffer, so the returned pointer
+	// is only meaningful until the next call.
+	static char out[16];
+	const int slot = REG_REG(reg);
+	if( m == M_F32 || m == M_F64 ) {
+		// FP eregs sit 64 above their V-register number; indexes outside
+		// 0..31 are printed with a "???" marker.
+		const int vn = slot - 64;
+		const char *flag = (vn >= 0 && vn < 32) ? "" : "???";
+		sprintf(out, m == M_F32 ? "S%d%s" : "D%d%s", vn, flag);
+		return out;
+	}
+	// Reverse the remappings used in gpr_id so debug output reflects the
+	// hardware register actually emitted: X5_LOGICAL prints as 5 and
+	// STACK_REG as 29.
+	int hw = slot;
+	if( slot == X5_LOGICAL )
+		hw = 5;
+	else if( slot == STACK_REG )
+		hw = 29;
+	if( m == M_I32 || m == M_UI16 || m == M_UI8 ) {
+		// 32-bit view: W0..W30, with 31 shown as WZR.
+		if( hw == 31 )
+			sprintf(out, "WZR");
+		else
+			sprintf(out, hw < 31 ? "W%d" : "W%d???", hw);
+		return out;
+	}
+	// Every other mode gets the 64-bit names, with aliases for 29/30/31.
+	switch( hw ) {
+	case 31: sprintf(out, "SP"); break;
+	case 29: sprintf(out, "FP"); break;
+	case 30: sprintf(out, "LR"); break;
+	default:
+		sprintf(out, hw < 31 ? "X%d" : "X%d???", hw);
+		break;
+	}
+	return out;
+}
+
+// ============================================================================
+// Backend lifecycle
+// ============================================================================
+
+void hl_codegen_alloc( jit_ctx *jit ) {
+	code_ctx *ctx = (code_ctx*)calloc(1, sizeof(code_ctx)); // zero-initialised backend state
+	if( ctx == NULL ) jit_error("aarch64 codegen: out of memory"); // report instead of writing through NULL
+	jit->code = ctx;
+	ctx->jit = jit;
+}
+
+void hl_codegen_free( jit_ctx *jit ) {
+	// Release the backend context.  free(NULL) is a no-op, so a jit whose
+	// code context was never allocated needs no separate check.
+	free(jit->code);
+}
+
+// ============================================================================
+// Helpers
+// ============================================================================
+
+#define ARM_TMP1	X16    // backend-private scratch (IP0)
+#define ARM_TMP2	X17    // backend-private scratch (IP1)
+#define ARM_TMP3	X15    // backend-private scratch (excluded from regalloc)
+
+// Map an IR ereg to a physical AArch64 GPR encoding (0..31).
+//   IR encoding 5 (STACK_REG) → ARM FP (X29): the regs phase uses 5 as a
+//     stack-bound-vreg marker, and this backend keeps that base in X29.
+//   IR encoding X5_LOGICAL (32) → ARM X5, which was moved out of slot 5 so
+//     that the STACK_REG sentinel does not alias it.
+static Arm64Reg gpr_id( ereg r ) {
+	switch( REG_REG(r) ) {
+	case STACK_REG:  return FP;
+	case X5_LOGICAL: return X5;
+	default:         return (Arm64Reg)REG_REG(r);
+	}
+}
+
+static Arm64FpReg fpr_id( ereg r ) {
+	return (Arm64FpReg)(REG_REG(r) - 64); // FP eregs are built by V(id) as id + 64
+}
+
+// `size` field of the LDR/STR encodings for a mode:
+// 0 = 8-bit, 1 = 16-bit, 2 = 32-bit, 3 = 64-bit.
+static int ls_size_for( emit_mode m ) {
+	if( m == M_UI8 )
+		return 0;
+	if( m == M_UI16 )
+		return 1;
+	if( m == M_I32 || m == M_F32 )
+		return 2;
+	// M_PTR, M_F64 and every remaining mode use the 64-bit size.
+	return 3;
+}
+
+static int sf_for( emit_mode m ) {
+	// `sf` selector: 1 (64-bit) for M_PTR / M_F64, 0 (32-bit) for every other mode.
+	return (m != M_PTR && m != M_F64) ? 0 : 1;
+}
+
+static bool is_fp_mode( emit_mode m ) { return IS_FLOAT(m); } // true for M_F32 / M_F64 (IS_FLOAT from jit.h)
+
+// ----------------------------------------------------------------------------
+// Adjust SP by a signed byte count: a positive `delta` adds to SP, a negative
+// one subtracts |delta|.  Immediate ADD/SUB forms (imm12, optionally LSL #12)
+// are preferred; larger magnitudes go through ARM_TMP1.
+// ----------------------------------------------------------------------------
+static void emit_sp_offs( code_ctx *ctx, int delta ) {
+	if( delta == 0 ) return;
+	const int is_sub = (delta < 0) ? 1 : 0; // ADD/SUB selector: 0 = ADD, 1 = SUB
+	const uint32_t amount = (uint32_t)(is_sub ? -delta : delta);
+	const uint32_t low12 = amount & 0xFFF;
+	const uint32_t high = amount >> 12;
+
+	if( high == 0 ) {
+		// Fits a plain imm12.
+		encode_add_sub_imm(ctx, 1, is_sub, 0, 0, (int)low12, SP_REG, SP_REG);
+		return;
+	}
+	if( high <= 0xFFF ) {
+		// Up to 24 bits: the high part goes in with LSL #12, followed by the
+		// low part when it is non-zero (a multiple of 4096 therefore takes a
+		// single instruction).
+		encode_add_sub_imm(ctx, 1, is_sub, 0, 1, (int)high, SP_REG, SP_REG);
+		if( low12 != 0 )
+			encode_add_sub_imm(ctx, 1, is_sub, 0, 0, (int)low12, SP_REG, SP_REG);
+		return;
+	}
+	// Fall back to register form.  Must use ADD/SUB (extended register): the
+	// shifted-register form reads register 31 as XZR rather than SP, so
+	// `SUB SP, SP, X16` would silently become `SUB XZR, XZR, X16` and leave
+	// SP untouched.  With option=UXTX(011) and imm3=0 the extended-register
+	// form treats Rd/Rn=31 as SP, which is what is needed here.
+	load_immediate(ctx, (int64_t)amount, ARM_TMP1, true);
+	encode_add_sub_ext(ctx, 1, is_sub, 0, ARM_TMP1, /*option=UXTX*/3, /*imm3=*/0, SP_REG, SP_REG);
+}
+
+// ----------------------------------------------------------------------------
+// Rd = Rn + mag (op = 0) or Rd = Rn - mag (op = 1) using immediate forms only.
+// Magnitudes below 2^24 take one or two instructions and yield true; larger
+// ones emit nothing and yield false so the caller can use another strategy.
+// ----------------------------------------------------------------------------
+static bool emit_addsub_imm_2step( code_ctx *ctx, int op, Arm64Reg Rd, Arm64Reg Rn, uint32_t mag ) {
+	uint32_t upper = mag >> 12;
+	uint32_t lower = mag & 0xFFF;
+	if( upper > 0xFFF ) return false;
+	if( upper == 0 ) {
+		encode_add_sub_imm(ctx, 1, op, 0, 0, (int)lower, Rn, Rd);
+		return true;
+	}
+	// The shifted upper part lands in Rd; the lower part is then applied to Rd.
+	encode_add_sub_imm(ctx, 1, op, 0, 1, (int)upper, Rn, Rd);
+	if( lower != 0 )
+		encode_add_sub_imm(ctx, 1, op, 0, 0, (int)lower, Rd, Rd);
+	return true;
+}
+
+// ----------------------------------------------------------------------------
+// Load/store with FP-relative or arbitrary base+offs.
+// Picks LDR/STR(unsigned imm scaled) when offset fits, else LDUR/STUR (signed,
+// unscaled, ±256), else falls back to a register-offset form.
+//
+// Register-form offset requires a scratch register that must NOT collide with
+// reg_t on a GPR store (STR Xt,[base,Xt] would store the offset value at the
+// offset location), with `base`, or with the caller's `avoid` register.  The
+// temp is the first of ARM_TMP1, ARM_TMP2, ARM_TMP3 that satisfies all of
+// these; if none does, jit_error aborts.  Pass (Arm64Reg)-1 as `avoid` when
+// there is nothing to protect.
+// ----------------------------------------------------------------------------
+static void emit_ld_st_ex( code_ctx *ctx, bool is_load, emit_mode mode, int reg_t, Arm64Reg base, int offs, Arm64Reg avoid ) {
+	int size = ls_size_for(mode);
+	int V = is_fp_mode(mode) ? 1 : 0;
+	int opc = is_load ? 1 : 0; // 0=STR, 1=LDR (for V=0 GPR; same for V=1 FP)
+	int scale = 1 << size;
+	if( offs >= 0 && (offs & (scale - 1)) == 0 && (offs / scale) < 0x1000 ) {
+		encode_ldr_str_imm(ctx, size, V, opc, offs / scale, base, (Arm64Reg)reg_t);
+		return;
+	}
+	if( offs >= -256 && offs < 256 ) {
+		encode_ldur_stur(ctx, size, V, opc, offs, base, (Arm64Reg)reg_t);
+		return;
+	}
+	// Pick an offset temp.  Constraints:
+	//   - For GPR stores, off_tmp must not equal reg_t (else STR Xt,[base,Xt]
+	//     writes the offset value instead of the data).  Loads are immune
+	//     since LDR reads the offset register before writing reg_t, and FP
+	//     transfers are immune because reg_t then names a V-register.
+	//   - off_tmp must not equal base (the load/store needs base intact).
+	//   - off_tmp must not equal `avoid` (a caller-supplied register the
+	//     caller has parked a live value in — typically the OUTER base in
+	//     emit_store/emit_load_addr while loading the data argument).
+	// base and avoid are always X-registers, so those two checks apply to FP
+	// modes as well; only the reg_t check is GPR-specific.
+	const Arm64Reg temps[3] = { ARM_TMP1, ARM_TMP2, ARM_TMP3 };
+	bool gpr_store = (V == 0 && !is_load);
+	int pick = 0;
+	while( pick < 3 && ((gpr_store && reg_t == (int)temps[pick]) || base == temps[pick] || avoid == temps[pick]) )
+		pick++;
+	if( pick == 3 ) jit_error("aarch64 emit_ld_st: no free offset temp");
+	Arm64Reg off_tmp = temps[pick];
+	load_immediate(ctx, offs, off_tmp, true);
+	encode_ldr_str_reg(ctx, size, V, opc, off_tmp, /*option=*/3 /*LSL*/, /*S=*/0, base, (Arm64Reg)reg_t);
+}
+
+static void emit_ld_st( code_ctx *ctx, bool is_load, emit_mode mode, int reg_t, Arm64Reg base, int offs ) {
+	emit_ld_st_ex(ctx, is_load, mode, reg_t, base, offs, (Arm64Reg)-1 /*no avoid*/);
+}
+
+// MOV between two GPRs; a self-move emits nothing.  SP needs ADD #0 because ORR cannot name it.
+static void emit_mov_gpr( code_ctx *ctx, Arm64Reg dst, Arm64Reg src, int sf ) {
+	if( dst == src ) return;
+	if( dst != SP_REG && src != SP_REG ) {
+		// ORR <dst>, XZR, <src>
+		encode_logical_reg(ctx, sf, 0x01, 0, 0, src, 0, XZR, dst);
+		return;
+	}
+	// ADD <dst>, <src>, #0  (only form that accepts SP).
+	encode_add_sub_imm(ctx, sf, 0, 0, 0, 0, src, dst);
+}
+
+// MOV between two FP regs (preserves the lane size used by the mode).
+// Emitted as a scalar FMOV (register) of the matching precision; a vector
+// ORR.16B could also move the value, but FMOV is used here for clarity.
+static void emit_mov_fpr( code_ctx *ctx, Arm64FpReg dst, Arm64FpReg src, emit_mode mode ) {
+	if( dst == src ) return;
+	int type = (mode == M_F64) ? 1 : 0; // 1=double, 0=single
+	// FMOV (register) opcode = 0
+	encode_fp_1src(ctx, /*M=*/0, /*S=*/0, type, /*opcode=*/0, src, dst);
+}
+
+// Forward declaration: emit_mov (defined below) routes constant-to-register
+// moves through emit_load_const, which is defined after it.
+static void emit_load_const( code_ctx *ctx, ereg out, uint64_t value, emit_mode mode );
+
+// Phase 4 forward declarations (defined later in this file).
+static int  reserve_const_segment( code_ctx *ctx, int size, int align );
+static int  alloc_const( code_ctx *ctx, uint64_t value, int adrp_pos );
+static void emit_const_load( code_ctx *ctx, Arm64Reg dst, uint64_t value );
+static void emit_const_addr( code_ctx *ctx, Arm64Reg dst, uint64_t value );
+static void emit_pool_offset_addr( code_ctx *ctx, Arm64Reg dst, int const_offset );
+// Operand materialization helpers: return the register that holds `src`,
+// loading it into `tmp` when it is not already in a register.
+static Arm64FpReg materialize_fpr( code_ctx *ctx, ereg src, emit_mode mode, Arm64FpReg tmp );
+static Arm64Reg   materialize_gpr( code_ctx *ctx, ereg src, emit_mode mode, Arm64Reg tmp );
+static Arm64Reg   materialize_gpr_ex( code_ctx *ctx, ereg src, emit_mode mode, Arm64Reg tmp, Arm64Reg avoid );
+
+// LEA-like: out = base + offs.  Used when an operand encodes an address as
+// (R_REG, value=offs) — e.g. MK_STACK_OFFS, or the LEA-rewritten ADDRESS op.
+//
+// Offsets whose magnitude fits the 12-bit ADD/SUB immediate take a single
+// instruction.  Larger offsets are first loaded into a scratch register:
+//   - `out` itself when it differs from `base` (and is not SP), so no other
+//     register is disturbed.  Callers such as emit_cmp / emit_binop_int keep
+//     their first operand in ARM_TMP1 while materializing the second into
+//     ARM_TMP2; using ARM_TMP1 unconditionally here would clobber it.
+//   - otherwise ARM_TMP1, or ARM_TMP2 when `base` already is ARM_TMP1.
+static void emit_lea_imm( code_ctx *ctx, Arm64Reg out, Arm64Reg base, int offs ) {
+	if( offs == 0 ) {
+		emit_mov_gpr(ctx, out, base, 1);
+	} else if( offs > 0 && offs <= 0xFFF ) {
+		encode_add_sub_imm(ctx, 1, 0, 0, 0, offs, base, out);
+	} else if( offs < 0 && offs >= -0xFFF ) {
+		// Range test written without negating first: -offs overflows for INT_MIN.
+		encode_add_sub_imm(ctx, 1, 1, 0, 0, -offs, base, out);
+	} else {
+		Arm64Reg scratch;
+		if( out != base && out != SP_REG )
+			scratch = out;
+		else
+			scratch = (base == ARM_TMP1) ? ARM_TMP2 : ARM_TMP1;
+		load_immediate(ctx, offs, scratch, true);
+		encode_add_sub_reg(ctx, 1, 0, 0, 0, scratch, 0, base, out);
+	}
+}
+
+// Generic move between two operands, dispatching on (dst kind, src kind):
+//   reg   <- reg     register copy, or address computation when the source
+//                    carries a non-zero offset in a non-FP mode
+//   reg   <- [mem]   load
+//   [mem] <- reg     store
+//   reg   <- const   emit_load_const
+//   [mem] <- [mem]   load + store through a backend scratch register
+// Any other combination raises a jit_error.
+static void emit_mov( code_ctx *ctx, ereg dst, ereg src, emit_mode mode ) {
+	const int dk = REG_KIND(dst);
+	const int sk = REG_KIND(src);
+	const bool fp = is_fp_mode(mode);
+
+	if( dk == R_REG && sk == R_REG ) {
+		if( fp ) {
+			emit_mov_fpr(ctx, fpr_id(dst), fpr_id(src), mode);
+		} else if( REG_VALUE(src) != 0 ) {
+			// MK_STACK_OFFS / LEA-rewritten ADDRESS: src encodes (reg, offs),
+			// so this is really dst = src_reg + offs.
+			emit_lea_imm(ctx, gpr_id(dst), gpr_id(src), REG_VALUE(src));
+		} else {
+			emit_mov_gpr(ctx, gpr_id(dst), gpr_id(src), sf_for(mode));
+		}
+		return;
+	}
+
+	if( dk == R_REG && sk == R_REG_PTR ) {
+		// LOAD: dst <- [base + offs]
+		Arm64Reg addr_reg = gpr_id(src);
+		int disp = REG_VALUE(src);
+		int target = fp ? fpr_id(dst) : gpr_id(dst);
+		emit_ld_st(ctx, /*is_load=*/true, mode, target, addr_reg, disp);
+		return;
+	}
+
+	if( dk == R_REG_PTR && sk == R_REG ) {
+		// STORE: [base + offs] <- src
+		Arm64Reg addr_reg = gpr_id(dst);
+		int disp = REG_VALUE(dst);
+		int value_reg = fp ? fpr_id(src) : gpr_id(src);
+		emit_ld_st(ctx, /*is_load=*/false, mode, value_reg, addr_reg, disp);
+		return;
+	}
+
+	if( dk == R_REG && sk == R_CONST ) {
+		emit_load_const(ctx, dst, (uint64_t)REG_VALUE(src), mode);
+		return;
+	}
+
+	if( dk == R_REG_PTR && sk == R_REG_PTR ) {
+		// Memory-to-memory: bounce through a scratch register.  V31 serves FP
+		// modes and ARM_TMP1 serves integer/pointer modes — both are reserved
+		// as backend-private scratch.
+		Arm64Reg from_base = gpr_id(src);
+		int from_off = REG_VALUE(src);
+		Arm64Reg to_base = gpr_id(dst);
+		int to_off = REG_VALUE(dst);
+		if( !fp ) {
+			emit_ld_st(ctx, /*is_load=*/true,  mode, ARM_TMP1, from_base, from_off);
+			emit_ld_st(ctx, /*is_load=*/false, mode, ARM_TMP1, to_base, to_off);
+		} else {
+			emit_ld_st(ctx, /*is_load=*/true,  mode, (Arm64FpReg)31, from_base, from_off);
+			emit_ld_st(ctx, /*is_load=*/false, mode, (Arm64FpReg)31, to_base, to_off);
+		}
+		return;
+	}
+
+	jit_error("aarch64 emit_mov: unhandled operand kinds");
+}
+
+// ----------------------------------------------------------------------------
+// LOAD_CONST: integer immediate or floating constant.
+// Integer constants are built with load_immediate; float constants destined
+// for an FP register are fetched from the literal pool (ADRP + LDR).
+// ----------------------------------------------------------------------------
+static void emit_load_const( code_ctx *ctx, ereg out, uint64_t value, emit_mode mode ) {
+	const bool fp = is_fp_mode(mode);
+
+	if( REG_KIND(out) == R_REG ) {
+		if( !fp ) {
+			load_immediate(ctx, (int64_t)value, gpr_id(out), sf_for(mode) == 1);
+			return;
+		}
+		// Float constants live in the literal pool: ADRP+LDR into the FP reg.
+		// jit_emit.c packs F32 constants into the low 32 bits of `value` with
+		// the upper 32 bits zeroed, so the load width must match the mode
+		// (size=2 → LDR Sd, ...).  An 8-byte load would drag the zero high
+		// half into D and read back as a subnormal double when viewed as F64.
+		Arm64FpReg fp_dst = fpr_id(out);
+		int adrp_pos = byte_count(ctx->code);
+		int ld_size = (mode == M_F32) ? 2 : 3;
+		encode_adrp(ctx, 0, 0, ARM_TMP1);                              // ADRP X16, page
+		// LDR Sd|Dd, [X16, #lo12]   V=1, opc=01, imm12=placeholder
+		encode_ldr_str_imm(ctx, ld_size, 1, 1, 0, ARM_TMP1, (Arm64Reg)fp_dst);
+		alloc_const(ctx, value, adrp_pos);
+		return;
+	}
+
+	// Destination is memory: put the bit pattern in ARM_TMP1 and store it at
+	// the requested width.  For floats the constant's bit pattern is stored as
+	// an integer of the same size — the STR writes the same bytes an FP STR
+	// would.
+	emit_mode store_mode;
+	if( !fp )
+		store_mode = mode;
+	else
+		store_mode = (mode == M_F32) ? M_I32 : M_PTR;
+	load_immediate(ctx, (int64_t)value, ARM_TMP1, sf_for(store_mode) == 1);
+	emit_ld_st(ctx, /*is_load=*/false, store_mode, ARM_TMP1, gpr_id(out), REG_VALUE(out));
+}
+
+// ----------------------------------------------------------------------------
+// PUSH / POP. ARM has no explicit push/pop; we use STR/LDR with pre/post-index
+// on SP. To match the x86 stack-offset accounting (which assumes 16 bytes are
+// already consumed by RIP+RBP), PUSH X29 emits STP X29, X30, [SP, #-16]! so
+// LR is implicitly saved as part of FP-save. POP X29 mirrors with LDP.
+// All other PUSH/POPs use 16-byte SP movement (8 bytes wasted) to keep SP
+// 16-byte aligned per AAPCS64.
+// ----------------------------------------------------------------------------
+static void emit_push( code_ctx *ctx, ereg r, emit_mode mode ) {
+	if( !is_fp_mode(mode) ) {
+		// Go through materialize_gpr rather than gpr_id: it folds a
+		// MK_STACK_OFFS offset into the value, whereas gpr_id alone would map
+		// STACK_REG->FP and drop REG_VALUE.
+		Arm64Reg src = materialize_gpr(ctx, r, mode, ARM_TMP1);
+		bool plain_fp = (src == FP) && (REG_KIND(r) == R_REG) && (REG_VALUE(r) == 0);
+		if( plain_fp ) {
+			// True PUSH FP (prologue) — STP x29,x30,[sp,#-16]! so LR is saved too.
+			encode_ldp_stp(ctx, /*opc=*/2, /*V=*/0, /*mode=*/0x03, /*imm7=*/-2 & 0x7F, LR, SP_REG, FP);
+		} else {
+			// SUB SP, SP, #16; STR Xn, [SP]
+			emit_sp_offs(ctx, -16);
+			encode_ldr_str_imm(ctx, /*size=*/3, /*V=*/0, /*opc=*/0, 0, SP_REG, src);
+		}
+		return;
+	}
+
+	// FP push: SUB SP, SP, #16; STR Dn, [SP].  A non-register operand is first
+	// brought into V31.
+	Arm64FpReg fsrc;
+	if( REG_KIND(r) == R_REG )
+		fsrc = fpr_id(r);
+	else
+		fsrc = materialize_fpr(ctx, r, mode, (Arm64FpReg)31);
+	emit_sp_offs(ctx, -16);
+	encode_ldr_str_imm(ctx, /*size=*/3, /*V=*/1, /*opc=*/0 /*STR*/, 0, SP_REG, (Arm64Reg)fsrc);
+}
+
+// POP into a register operand: reload from [SP] and release the 16-byte slot.
+// Popping FP uses LDP so that LR is restored alongside it.  Non-register
+// destinations are reported through jit_error.
+static void emit_pop( code_ctx *ctx, ereg r, emit_mode mode ) {
+	if( REG_KIND(r) != R_REG )
+		jit_error("aarch64 POP non-reg not implemented");
+
+	if( is_fp_mode(mode) ) {
+		// LDR Dn, [SP]; ADD SP, SP, #16
+		encode_ldr_str_imm(ctx, /*size=*/3, /*V=*/1, /*opc=*/1 /*LDR*/, 0, SP_REG, (Arm64Reg)fpr_id(r));
+		emit_sp_offs(ctx, 16);
+		return;
+	}
+
+	Arm64Reg target = gpr_id(r);
+	if( target != FP ) {
+		// LDR Xn, [SP]; ADD SP, SP, #16
+		encode_ldr_str_imm(ctx, /*size=*/3, /*V=*/0, /*opc=*/1, 0, SP_REG, target);
+		emit_sp_offs(ctx, 16);
+		return;
+	}
+	// LDP X29, X30, [SP], #16   opc=10, V=0, mode=01 (post-index load), imm7=2
+	encode_ldp_stp(ctx, /*opc=*/2, /*V=*/0, /*mode=*/0x01, /*imm7=*/2, LR, SP_REG, FP);
+}
+
+// ----------------------------------------------------------------------------
+// CMP / TEST.  e->mode tells us int width / float; e->size_offs holds the
+// upstream OJxxx opcode (consumed later by the JCOND/CMOV that follows).
+// ----------------------------------------------------------------------------
+static void emit_cmp( code_ctx *ctx, einstr *e ) {
+	if( is_fp_mode(e->mode) ) {
+		// Bare FCMP; no extra NaN handling is emitted here.  Operands not
+		// already in registers are staged in V29 / V30.
+		Arm64FpReg lhs = materialize_fpr(ctx, e->a, e->mode, (Arm64FpReg)29);
+		Arm64FpReg rhs = materialize_fpr(ctx, e->b, e->mode, (Arm64FpReg)30);
+		int fp_type = (e->mode == M_F64) ? 1 : 0;
+		encode_fp_compare(ctx, /*M=*/0, /*S=*/0, fp_type, rhs, /*op=*/0, lhs);
+		return;
+	}
+
+	// Integer compare: SUBS XZR, Xa, Xb  (or an immediate form).
+	// materialize_gpr copes with R_REG (including MK_STACK_OFFS, through
+	// emit_lea_imm), R_CONST and R_REG_PTR; using gpr_id directly would lose
+	// the FP+N offset of stack-allocated addresses.
+	int sf = sf_for(e->mode);
+	Arm64Reg a = materialize_gpr(ctx, e->a, e->mode, ARM_TMP1);
+
+	if( REG_KIND(e->b) != R_CONST ) {
+		Arm64Reg b = materialize_gpr_ex(ctx, e->b, e->mode, ARM_TMP2, a);
+		encode_add_sub_reg(ctx, sf, 1, 1, 0, b, 0, a, XZR);
+		return;
+	}
+
+	int64_t imm = (int64_t)REG_VALUE(e->b);
+	if( imm >= 0 && imm <= 0xFFF ) {
+		// CMP Xa, #imm  (SUBS XZR, Xa, #imm; sf, op=1, S=1)
+		encode_add_sub_imm(ctx, sf, 1, 1, 0, (int)imm, a, XZR);
+	} else if( imm < 0 && -imm <= 0xFFF ) {
+		// CMN Xa, #imm  (ADDS XZR, Xa, #imm)
+		encode_add_sub_imm(ctx, sf, 0, 1, 0, (int)-imm, a, XZR);
+	} else {
+		// Immediate does not fit: build it in ARM_TMP2 and compare registers.
+		load_immediate(ctx, imm, ARM_TMP2, sf == 1);
+		encode_add_sub_reg(ctx, sf, 1, 1, 0, ARM_TMP2, 0, a, XZR);
+	}
+}
+
+// TEST: set flags from `a AND a` for an integer operand.  Float modes are
+// rejected through jit_error.
+static void emit_test( code_ctx *ctx, einstr *e ) {
+	if( is_fp_mode(e->mode) )
+		jit_error("aarch64 TEST float not supported");
+	int width = sf_for(e->mode);
+	// materialize_gpr folds MK_STACK_OFFS (R_REG kind + non-zero REG_VALUE)
+	// into FP+N, so a stack-allocated address operand never tests raw FP.
+	Arm64Reg operand = materialize_gpr(ctx, e->a, e->mode, ARM_TMP1);
+	// TST Xa, Xa  (ANDS XZR, Xa, Xa); opc=11 (ANDS), shift=0, N=0
+	encode_logical_reg(ctx, width, 0x03, 0, 0, operand, 0, operand, XZR);
+}
+
+// ----------------------------------------------------------------------------
+// JCOND / JUMP — branch fixups patched after function emit.
+// ----------------------------------------------------------------------------
+// Appends one fixup record to ctx->branch_fixups as three consecutive ints:
+// code position, target op index, conditional flag.
+static void add_branch_fixup( code_ctx *ctx, int code_pos, int target_op, int is_cond ) {
+	const int record[3] = { code_pos, target_op, is_cond };
+	for( int i = 0; i < 3; i++ )
+		int_arr_add_impl(&ctx->jit->galloc, &ctx->branch_fixups, record[i]);
+}
+
+// Emit an unconditional branch with a zero displacement and record a fixup
+// for it.  target_op_offset is the IR-relative displacement:
+// target = cur_op + 1 + offset.
+static void emit_jump( code_ctx *ctx, int target_op_offset ) {
+	int insn_pos = byte_count(ctx->code);
+	int dest_op = ctx->cur_op + 1 + target_op_offset;
+	encode_branch_uncond(ctx, 0); // placeholder displacement
+	add_branch_fixup(ctx, insn_pos, dest_op, 0);
+}
+
+// Conditional counterpart of emit_jump: emit B.<cond> with a zero
+// displacement and record a conditional fixup targeting
+// cur_op + 1 + target_op_offset.
+static void emit_jump_cond( code_ctx *ctx, ArmCondition cond, int target_op_offset ) {
+	int insn_pos = byte_count(ctx->code);
+	int dest_op = ctx->cur_op + 1 + target_op_offset;
+	encode_branch_cond(ctx, 0, cond); // placeholder displacement
+	add_branch_fixup(ctx, insn_pos, dest_op, 1);
+}
+
+// Mirror x86 get_cond_jump: step backwards from the current op over any
+// MOV/JCOND/CMOV/XCHG/CXCHG instructions to reach the comparison whose flags
+// this JCOND/CMOV consumes, then map the OJxxx opcode stored in its size_offs
+// to an ARM condition code.
+//
+// FP notes.  After FCMP with a NaN operand the flags are N=0, Z=0, C=1, V=1,
+// and IEEE 754 ordered predicates must evaluate FALSE for NaN:
+//   - GE (N==V) and GT (Z==0 && N==V) reject NaN; HS (C==1) and HI would
+//     fire on it, so >= / > use GE / GT.
+//   - LO (C==0) and LS (C==0 || Z==1) reject NaN, so FP < / <= use them
+//     instead of LT / LE.
+//   - The x86 backend can use JUGte/JUGt for FP only because UCOMISS sets
+//     CF=1 on NaN, making JAE/JA reject it; ARM's carry convention is the
+//     inverse of x86's.
+static ArmCondition get_cond_jump( code_ctx *ctx ) {
+	einstr *cmp = ctx->jit->reg_instrs + ctx->cur_op;
+	for( ;; ) {
+		cmp--;
+		if( cmp->op != MOV && cmp->op != JCOND && cmp->op != CMOV && cmp->op != XCHG && cmp->op != CXCHG )
+			break;
+	}
+
+	const bool fp = is_fp_mode(cmp->mode);
+	switch( cmp->size_offs ) {
+	case OJFalse:
+	case OJNull:
+	case OJEq:
+		return COND_EQ;
+	case OJTrue:
+	case OJNotNull:
+	case OJNotEq:
+		return COND_NE;
+	case OJSGte:
+		return COND_GE;
+	case OJSGt:
+		return COND_GT;
+	case OJSLt:
+		return fp ? COND_LO : COND_LT;
+	case OJSLte:
+		return fp ? COND_LS : COND_LE;
+	case OJULt:
+		return COND_LO;
+	case OJUGte:
+		return COND_HS;
+	case OJNotLt:
+		// "Not less than" must fire on NaN: HS (C==1) does, and also covers
+		// ordered >=.  GE (N==V) would reject NaN (V=1, N=0).
+		return COND_HS;
+	case OJNotGte:
+		// LT (N!=V) is signed less-than for INT and, for FP, also fires on
+		// NaN (V=1) — the semantics wanted for "not >=".  LO (C==0) would
+		// miss NaN (C=1).
+		return COND_LT;
+	case 0:
+		if( cmp->op == DEBUG_BREAK ) return COND_EQ;
+		// fallthrough
+	default:
+		jit_error("aarch64 get_cond_jump: unknown OJ opcode");
+		return COND_AL;
+	}
+}
+
+// Rewrite the displacement field of the branch instruction at byte offset
+// `pos` so that it targets `target_byte_pos`.  Conditional branches carry an
+// imm19 in bits [23:5]; unconditional ones an imm26 in bits [25:0].
+// Misaligned or out-of-range displacements are reported through jit_error.
+static void patch_branch( code_ctx *ctx, int pos, int target_byte_pos, int is_cond ) {
+	const int delta = target_byte_pos - pos;
+	if( (delta & 3) != 0 )
+		jit_error("aarch64 branch target not 4-byte aligned");
+	const int words = delta >> 2;
+	unsigned int *slot = (unsigned int*)&ctx->code.values[pos];
+
+	if( !is_cond ) {
+		// imm26 lives in bits [25:0]; opcode 000101.
+		if( words < -(1 << 25) || words >= (1 << 25) )
+			jit_error("aarch64 B out of range");
+		*slot = (*slot & ~0x03FFFFFF) | (words & 0x03FFFFFF);
+		return;
+	}
+
+	// imm19 lives in bits [23:5]; cond + 0x54000000 prefix retained.
+	if( words < -(1 << 18) || words >= (1 << 18) )
+		jit_error("aarch64 B.cond out of range (Phase 2 limit)");
+	*slot = (*slot & ~(0x7FFFF << 5)) | ((words & 0x7FFFF) << 5);
+}
+
+// ----------------------------------------------------------------------------
+// Operand materialization: make sure `src` is available in a GPR, loading it
+// through `tmp` when it is a constant, a memory operand, or a register with
+// an attached offset.  Returns the GPR encoding to use.  `avoid` is forwarded
+// to emit_ld_st_ex for memory operands.
+// ----------------------------------------------------------------------------
+static Arm64Reg materialize_gpr_ex( code_ctx *ctx, ereg src, emit_mode mode, Arm64Reg tmp, Arm64Reg avoid ) {
+	const int kind = REG_KIND(src);
+
+	if( kind == R_REG ) {
+		Arm64Reg reg = gpr_id(src);
+		int disp = REG_VALUE(src);
+		if( disp != 0 ) {
+			// Register plus offset: compute the address into tmp.
+			emit_lea_imm(ctx, tmp, reg, disp);
+			return tmp;
+		}
+		return reg;
+	}
+
+	if( kind == R_CONST ) {
+		load_immediate(ctx, (int64_t)REG_VALUE(src), tmp, sf_for(mode) == 1);
+	} else if( kind == R_REG_PTR ) {
+		// Load directly via emit_ld_st_ex so the offset-temp picker can avoid
+		// `avoid` (typically the caller's outer base register).
+		emit_ld_st_ex(ctx, true, mode, tmp, gpr_id(src), REG_VALUE(src), avoid);
+	} else {
+		emit_mov(ctx, R(tmp), src, mode);
+	}
+	return tmp;
+}
+
+// materialize_gpr_ex with no register to avoid.
+static Arm64Reg materialize_gpr( code_ctx *ctx, ereg src, emit_mode mode, Arm64Reg tmp ) {
+	const Arm64Reg no_avoid = (Arm64Reg)-1;
+	return materialize_gpr_ex(ctx, src, mode, tmp, no_avoid);
+}
+
+// FP counterpart of materialize_gpr: returns the FP register holding `src`,
+// loading memory operands and constants into `tmp`.  Constants are fetched
+// from the literal pool through ARM_TMP1 with an 8-byte LDR regardless of
+// `mode`.  Other operand kinds are reported through jit_error.
+static Arm64FpReg materialize_fpr( code_ctx *ctx, ereg src, emit_mode mode, Arm64FpReg tmp ) {
+	const int kind = REG_KIND(src);
+
+	if( kind == R_REG )
+		return fpr_id(src);
+
+	if( kind == R_CONST ) {
+		// FP constants always live in the literal pool.
+		int adrp_pos = byte_count(ctx->code);
+		encode_adrp(ctx, 0, 0, ARM_TMP1);
+		encode_ldr_str_imm(ctx, 3, 1, 1, 0, ARM_TMP1, (Arm64Reg)tmp);
+		alloc_const(ctx, (uint64_t)REG_VALUE(src), adrp_pos);
+		return tmp;
+	}
+
+	if( kind == R_REG_PTR ) {
+		Arm64Reg addr_reg = gpr_id(src);
+		int disp = REG_VALUE(src);
+		emit_ld_st(ctx, /*is_load=*/true, mode, tmp, addr_reg, disp);
+		return tmp;
+	}
+
+	jit_error("aarch64 materialize_fpr: unsupported operand kind");
+	return (Arm64FpReg)0;
+}
+
+// ----------------------------------------------------------------------------
+// Bitfield helpers (SBFM / UBFM raw encoding) for sign/zero-extension.
+// ----------------------------------------------------------------------------
+// Field layout:
+//   [31]=sf, [30:29]=opc (00=SBFM, 01=BFM, 10=UBFM), [28:23]=100110,
+//   [22]=N(=sf), [21:16]=immr, [15:10]=imms, [9:5]=Rn, [4:0]=Rd
+static void emit_bitfield( code_ctx *ctx, int sf, int opc, int immr, int imms, Arm64Reg Rn, Arm64Reg Rd ) {
+	unsigned int word = 0x26u << 23;
+	word |= (unsigned)sf << 31;
+	word |= (unsigned)opc << 29;
+	word |= (unsigned)sf << 22;
+	word |= (unsigned)((immr & 0x3F) << 16);
+	word |= (unsigned)((imms & 0x3F) << 10);
+	word |= (unsigned)((Rn & 0x1F) << 5);
+	word |= (unsigned)(Rd & 0x1F);
+	EMIT32(ctx, word);
+}
+
+// SXTB Wd, Wn / SXTH Wd, Wn — sign-extend a byte or halfword into a 32-bit
+// result (SBFM Wd, Wn, #0, #top_bit).  Other input modes raise a jit_error.
+static void emit_sxt_to_int( code_ctx *ctx, emit_mode in_mode, Arm64Reg Rn, Arm64Reg Rd ) {
+	int top_bit;
+	switch( in_mode ) {
+	case M_UI8:  top_bit = 7;  break;
+	case M_UI16: top_bit = 15; break;
+	default:
+		jit_error("emit_sxt_to_int unsupported in_mode");
+		return;
+	}
+	emit_bitfield(ctx, 0, 0x00, 0, top_bit, Rn, Rd);
+}
+
+// SBFM Xd, Xn, #0, #top_bit — sign-extend a byte, halfword or word to
+// 64 bits.  Other input modes raise a jit_error.
+static void emit_sxt_to_ptr( code_ctx *ctx, emit_mode in_mode, Arm64Reg Rn, Arm64Reg Rd ) {
+	int top_bit;
+	switch( in_mode ) {
+	case M_UI8:  top_bit = 7;  break;
+	case M_UI16: top_bit = 15; break;
+	case M_I32:  top_bit = 31; break;
+	default:
+		jit_error("emit_sxt_to_ptr unsupported in_mode");
+		return;
+	}
+	emit_bitfield(ctx, 1, 0x00, 0, top_bit, Rn, Rd);
+}
+
+// UXTB Wd, Wn / UXTH Wd, Wn — zero-extend by masking:
+// AND Wd, Wn, #0xFF (M_UI8) or AND Wd, Wn, #0xFFFF (M_UI16).
+// Other input modes raise a jit_error.
+static void emit_uxt_to_w( code_ctx *ctx, emit_mode in_mode, Arm64Reg Rn, Arm64Reg Rd ) {
+	int imms; // imms field passed to the logical-immediate encoder
+	switch( in_mode ) {
+	case M_UI8:  imms = 7;  break;  // mask 0xFF
+	case M_UI16: imms = 15; break;  // mask 0xFFFF
+	default:
+		jit_error("emit_uxt_to_w unsupported in_mode");
+		return;
+	}
+	encode_logical_imm(ctx, 0, 0x00, 0, 0, imms, Rn, Rd);
+}
+
+// ----------------------------------------------------------------------------
+// BINOP / UNOP integer.  e->size_offs encodes the upstream Haxe op (OAdd, ...).
+// ARM has 3-operand ALU so we can write directly to `out` from `a, b`.
+// ----------------------------------------------------------------------------
+static void emit_div_mod( code_ctx *ctx, hl_op op, Arm64Reg out, Arm64Reg a, Arm64Reg b, int sf );
+
+// Emit `out_e = a_e <op> b_e` for an integer mode.
+//   - `a_e` is materialized through ARM_TMP1, `b_e` through ARM_TMP2.
+//   - When `out_e` is not a register the result is computed in ARM_TMP1 and
+//     then stored with emit_mov.
+//   - ADD/SUB with a constant whose magnitude fits 12 bits use the immediate
+//     encoding and skip materializing `b_e`.
+//   - For M_UI8 / M_UI16 the result is masked to the mode's width on every
+//     path, including the immediate fast path.
+// Unsupported ops raise a jit_error.
+static void emit_binop_int( code_ctx *ctx, hl_op op, ereg out_e, ereg a_e, ereg b_e, emit_mode mode ) {
+	int sf = sf_for(mode);
+	Arm64Reg out = (REG_KIND(out_e) == R_REG) ? gpr_id(out_e) : ARM_TMP1;
+	Arm64Reg a = materialize_gpr(ctx, a_e, mode, ARM_TMP1);
+	bool emitted = false;
+
+	// Constant-imm fast path (ADD/SUB with small immediates).  A negative
+	// constant flips the operation so the encoded immediate stays positive.
+	if( REG_KIND(b_e) == R_CONST && (op == OAdd || op == OSub) ) {
+		int64_t v = (int64_t)REG_VALUE(b_e);
+		if( v >= 0 && v <= 0xFFF ) {
+			encode_add_sub_imm(ctx, sf, op == OSub ? 1 : 0, 0, 0, (int)v, a, out);
+			emitted = true;
+		} else if( v < 0 && v >= -0xFFF ) {
+			encode_add_sub_imm(ctx, sf, op == OSub ? 0 : 1, 0, 0, (int)-v, a, out);
+			emitted = true;
+		}
+	}
+
+	if( !emitted ) {
+		Arm64Reg b = materialize_gpr_ex(ctx, b_e, mode, ARM_TMP2, a);
+
+		switch( op ) {
+		case OAdd: encode_add_sub_reg(ctx, sf, 0, 0, 0, b, 0, a, out); break;
+		case OSub: encode_add_sub_reg(ctx, sf, 1, 0, 0, b, 0, a, out); break;
+		case OMul: encode_madd_msub(ctx, sf, 0, b, XZR, a, out); break;
+		case OAnd: encode_logical_reg(ctx, sf, 0x00, 0, 0, b, 0, a, out); break;
+		case OOr:  encode_logical_reg(ctx, sf, 0x01, 0, 0, b, 0, a, out); break;
+		case OXor: encode_logical_reg(ctx, sf, 0x02, 0, 0, b, 0, a, out); break;
+		case OShl: encode_shift_reg(ctx, sf, 0x00, b, a, out); break;  // LSLV
+		case OUShr: encode_shift_reg(ctx, sf, 0x01, b, a, out); break; // LSRV
+		case OSShr: encode_shift_reg(ctx, sf, 0x02, b, a, out); break; // ASRV
+		case OSDiv:
+		case OUDiv:
+		case OSMod:
+		case OUMod:
+			emit_div_mod(ctx, op, out, a, b, sf);
+			break;
+		default:
+			jit_error("aarch64 emit_binop_int: unsupported op");
+		}
+	}
+
+	// Sub-word result truncation.  A 32-bit ALU op already leaves the upper
+	// half of the X register zero, so only 8/16-bit modes need a mask.  This
+	// runs for the immediate fast path too: e.g. a UI8 255 + 1 must wrap to 0
+	// in the register, not stay 256.
+	if( mode == M_UI8 ) {
+		encode_logical_imm(ctx, 0, 0x00, 0, 0, 7, out, out);   // AND Wd, Wd, #0xFF
+	} else if( mode == M_UI16 ) {
+		encode_logical_imm(ctx, 0, 0x00, 0, 0, 15, out, out);  // AND Wd, Wd, #0xFFFF
+	}
+
+	if( REG_KIND(out_e) != R_REG ) {
+		emit_mov(ctx, out_e, R(ARM_TMP1), mode);
+	}
+}
+
+// ----------------------------------------------------------------------------
+// Integer divide / modulo with Haxe semantics:
+//   OUDiv:  b == 0       => 0
+//   OUMod:  b == 0       => 0
+//   OSDiv:  b == 0 || -1 => a*b   (matches x86; avoids INT_MIN/-1 overflow trap)
+//   OSMod:  b == 0 || -1 => 0
+// ARM SDIV/UDIV give 0 for div/0, but mod via MSUB needs explicit guarding.
+//
+// Emitted layout:
+//     TST b, b ; B.EQ special
+//     [signed only] CMN b, #1 ; B.EQ special
+//     mainline (xDIV, plus MSUB for modulo)
+//     B done
+//   special:
+//     out = a*b (OSDiv) or out = 0 (everything else)
+//   done:
+// The three local branches are resolved with patch_branch once the layout is
+// known.
+// ----------------------------------------------------------------------------
+static void emit_div_mod( code_ctx *ctx, hl_op op, Arm64Reg out, Arm64Reg a, Arm64Reg b, int sf ) {
+	bool unsign = (op == OUDiv || op == OUMod);
+	bool is_div = (op == OSDiv || op == OUDiv);
+
+	// Test b for 0; signed ops also test for -1.
+	encode_logical_reg(ctx, sf, 0x03, 0, 0, b, 0, b, XZR);  // TST b, b
+	int jz_pos = byte_count(ctx->code);
+	encode_branch_cond(ctx, 0, COND_EQ);  // patched later
+
+	int jneg_pos = -1;
+	if( !unsign ) {
+		// CMN b, #1  (= b + 1; sets Z if b == -1)
+		encode_add_sub_imm(ctx, sf, 0, 1, 0, 1, b, XZR);
+		jneg_pos = byte_count(ctx->code);
+		encode_branch_cond(ctx, 0, COND_EQ);
+	}
+
+	// Mainline.  encode_div's U bit is 0=UDIV, 1=SDIV (per the ARM ARM
+	// bit-10 encoding) — pass `unsign ? 0 : 1`, NOT the inverse.
+	if( is_div ) {
+		// SDIV/UDIV out, a, b
+		encode_div(ctx, sf, unsign ? 0 : 1, b, a, out);
+	} else {
+		// MSUB needs the ORIGINAL `a` and `b` after the divide; SDIV writes
+		// `out`, so any of {out==a, out==b} would clobber a source.  Spill
+		// the aliased operand(s) to backend temps first.  ARM_TMP3 is
+		// reserved precisely for cases like this where we need a third
+		// independent register.
+		Arm64Reg a_safe = a, b_safe = b;
+		if( out == a ) {
+			emit_mov_gpr(ctx, ARM_TMP3, a, sf);
+			a_safe = ARM_TMP3;
+			if( b == a ) b_safe = ARM_TMP3; // a==b too: same value in TMP3
+		}
+		if( out == b && b_safe == b ) {
+			// Need a different temp from a_safe (which may be ARM_TMP3 already).
+			Arm64Reg t = (a_safe == ARM_TMP1) ? ARM_TMP2 : ARM_TMP1;
+			emit_mov_gpr(ctx, t, b, sf);
+			b_safe = t;
+		}
+		encode_div(ctx, sf, unsign ? 0 : 1, b_safe, a_safe, out);
+		// MSUB out, out, b_safe, a_safe  =>  out = a_safe - out * b_safe
+		encode_madd_msub(ctx, sf, 1, b_safe, a_safe, out, out);
+	}
+	int jdone_pos = byte_count(ctx->code);
+	encode_branch_uncond(ctx, 0);
+
+	// Special case path: result = 0 (mod or unsigned div) or a*b (signed div).
+	// It is reached before any mainline instruction ran, so `a` and `b` are
+	// still intact here.
+	int special_pos = byte_count(ctx->code);
+	if( op == OSDiv ) {
+		// out = a * b
+		encode_madd_msub(ctx, sf, 0, b, XZR, a, out);
+	} else {
+		// out = 0
+		encode_logical_reg(ctx, sf, 0x01, 0, 0, XZR, 0, XZR, out);  // ORR out, XZR, XZR
+	}
+
+	int after = byte_count(ctx->code);
+
+	// Patch branches through the shared helper so they get the same
+	// alignment and range checks as every other branch in this backend.
+	patch_branch(ctx, jz_pos, special_pos, 1);
+	if( jneg_pos >= 0 )
+		patch_branch(ctx, jneg_pos, special_pos, 1);
+	patch_branch(ctx, jdone_pos, after, 0);
+}
+
+// ----------------------------------------------------------------------------
+// BINOP / UNOP float.
+// ----------------------------------------------------------------------------
+// Emit `out_e = a_e <op> b_e` for F32/F64.  Supported ops: OAdd, OSub, OMul,
+// OSDiv (FADD/FSUB/FMUL/FDIV); anything else raises a jit_error and emits no
+// arithmetic instruction.  When `out_e` is not a register the result is
+// computed in V31 and then stored to [base + offs].
+static void emit_binop_fp( code_ctx *ctx, hl_op op, ereg out_e, ereg a_e, ereg b_e, emit_mode mode ) {
+	bool out_to_mem = (REG_KIND(out_e) != R_REG);
+	Arm64FpReg out = out_to_mem ? (Arm64FpReg)31 : fpr_id(out_e);
+	// Use V29/V30 as scratch FP regs (in our scratch list, won't collide with `out`=V31).
+	Arm64FpReg a = materialize_fpr(ctx, a_e, mode, (Arm64FpReg)29);
+	Arm64FpReg b = materialize_fpr(ctx, b_e, mode, (Arm64FpReg)30);
+	int type = (mode == M_F64) ? 1 : 0;
+	int opcode = 0;
+	switch( op ) {
+	case OAdd:  opcode = 0x02; break; // FADD
+	case OSub:  opcode = 0x03; break; // FSUB
+	case OMul:  opcode = 0x00; break; // FMUL
+	case OSDiv: opcode = 0x01; break; // FDIV
+	default:
+		jit_error("aarch64 emit_binop_fp: unsupported op");
+		// If jit_error returns, stop here rather than encode an instruction
+		// from an opcode that was never selected.
+		return;
+	}
+	encode_fp_arith(ctx, /*M=*/0, /*S=*/0, type, b, opcode, a, out);
+	if( out_to_mem ) {
+		Arm64Reg base = gpr_id(out_e);
+		int offs = REG_VALUE(out_e);
+		emit_ld_st(ctx, false, mode, out, base, offs);
+	}
+}
+
+// Emit `out_e = <op> a_e`.
+//   FP modes:  ONeg only (FNEG); a non-register result goes through V31.
+//   Int modes: ONeg, ONot (boolean toggle), OIncr, ODecr; a non-register
+//              result goes through ARM_TMP1.  M_UI8 / M_UI16 results are
+//              masked to the mode's width.
+// Unsupported ops raise a jit_error.
+static void emit_unop( code_ctx *ctx, hl_op op, ereg out_e, ereg a_e, emit_mode mode ) {
+	const bool spill = (REG_KIND(out_e) != R_REG);
+
+	if( !is_fp_mode(mode) ) {
+		int sf = sf_for(mode);
+		Arm64Reg out = spill ? ARM_TMP1 : gpr_id(out_e);
+		Arm64Reg a = materialize_gpr(ctx, a_e, mode, ARM_TMP1);
+
+		if( op == ONeg ) {
+			// SUB out, XZR, a  (NEG alias)
+			encode_add_sub_reg(ctx, sf, 1, 0, 0, a, 0, XZR, out);
+		} else if( op == ONot ) {
+			// EOR out, a, #1  (boolean toggle).  N must equal sf for value 1.
+			encode_logical_imm(ctx, sf, 0x02, sf, 0, 0, a, out);
+		} else if( op == OIncr ) {
+			encode_add_sub_imm(ctx, sf, 0, 0, 0, 1, a, out);
+		} else if( op == ODecr ) {
+			encode_add_sub_imm(ctx, sf, 1, 0, 0, 1, a, out);
+		} else {
+			jit_error("aarch64 emit_unop: unsupported op");
+		}
+
+		// Keep sub-word results within their width.
+		if( mode == M_UI8 )
+			encode_logical_imm(ctx, 0, 0x00, 0, 0, 7, out, out);
+		else if( mode == M_UI16 )
+			encode_logical_imm(ctx, 0, 0x00, 0, 0, 15, out, out);
+
+		if( spill )
+			emit_mov(ctx, out_e, R(ARM_TMP1), mode);
+		return;
+	}
+
+	Arm64FpReg fout = spill ? (Arm64FpReg)31 : fpr_id(out_e);
+	Arm64FpReg fa = materialize_fpr(ctx, a_e, mode, (Arm64FpReg)29);
+	int fp_type = (mode == M_F64) ? 1 : 0;
+	if( op == ONeg )
+		encode_fp_1src(ctx, 0, 0, fp_type, /*FNEG*/2, fa, fout);
+	else
+		jit_error("aarch64 emit_unop float: unsupported op");
+	if( spill ) {
+		Arm64Reg base = gpr_id(out_e);
+		int offs = REG_VALUE(out_e);
+		emit_ld_st(ctx, false, mode, fout, base, offs);
+	}
+}
+
+// ----------------------------------------------------------------------------
+// CONV / CONV_UNSIGNED.  e->mode = output mode, e->size_offs = input mode.
+//
+// Four cases, selected by whether the input / output modes are FP:
+//   FP  -> FP    FCVT between F32 and F64
+//   FP  -> int   FCVTZS / FCVTZU (round toward zero)
+//   int -> FP    SCVTF / UCVTF (UI8/UI16 inputs are zero-extended first)
+//   int -> int   zero/sign extension, truncation or plain move
+// `unsign` selects the unsigned variant where one exists.  When `out_e` is
+// not a register the value is produced in a backend temporary (ARM_TMP2 for
+// integers, V31 for FP) and stored afterwards.
+// ----------------------------------------------------------------------------
+static void emit_conv( code_ctx *ctx, einstr *e, ereg out_e, bool unsign ) {
+	emit_mode out_mode = e->mode;
+	emit_mode in_mode = (emit_mode)e->size_offs;
+	bool out_fp = is_fp_mode(out_mode);
+	bool in_fp = is_fp_mode(in_mode);
+
+	// Materialize source.
+	Arm64Reg a_gpr = 0;
+	Arm64FpReg a_fpr = (Arm64FpReg)0;
+	if( in_fp ) {
+		a_fpr = materialize_fpr(ctx, e->a, in_mode, (Arm64FpReg)29);
+	} else {
+		a_gpr = materialize_gpr(ctx, e->a, in_mode, ARM_TMP1);
+	}
+
+	// Pick output register encoding.  When the result lives in memory we route
+	// the value through a backend-private temporary in the appropriate class.
+	bool out_to_mem = REG_KIND(out_e) != R_REG;
+	Arm64Reg dst_gpr = (!out_fp && !out_to_mem) ? gpr_id(out_e)
+	                  : (!out_fp ? ARM_TMP2 : 0);
+	// V31 is in our scratch list and serves as an FP temp; we still need to
+	// emit a follow-up STR if the output is memory.
+	Arm64FpReg dst_fpr = (out_fp && !out_to_mem) ? fpr_id(out_e)
+	                    : (out_fp ? (Arm64FpReg)31 : (Arm64FpReg)0);
+
+	if( in_fp && out_fp ) {
+		// FCVT between F32/F64
+		int type = (in_mode == M_F64) ? 1 : 0;       // input type
+		int opcode = (in_mode == M_F32) ? 0x05 : 0x04; // F32->F64 = 0x05, F64->F32 = 0x04
+		encode_fp_1src(ctx, 0, 0, type, opcode, a_fpr, dst_fpr);
+	} else if( in_fp && !out_fp ) {
+		// FP -> int.  FCVTZS / FCVTZU (round toward zero).
+		int sf = sf_for(out_mode);
+		int type = (in_mode == M_F64) ? 1 : 0;
+		int rmode = 3;       // round toward zero
+		int opc = unsign ? 1 : 0;  // 0=FCVTZS, 1=FCVTZU
+		encode_fcvt_int(ctx, sf, 0, type, rmode, opc, a_fpr, dst_gpr);
+	} else if( !in_fp && out_fp ) {
+		// int -> FP. SCVTF / UCVTF.
+		int sf = sf_for(in_mode);
+		int type = (out_mode == M_F64) ? 1 : 0;
+		int rmode = 0;
+		int opc = unsign ? 3 : 2;  // 2=SCVTF, 3=UCVTF
+		// First, widen sub-word inputs to full width.  UI8/UI16 are
+		// unsigned regardless of the `unsign` flag (which here selects
+		// SCVTF vs UCVTF), so always zero-extend the byte/half before
+		// the FP conversion.
+		Arm64Reg src = a_gpr;
+		if( in_mode == M_UI8 || in_mode == M_UI16 ) {
+			emit_uxt_to_w(ctx, in_mode, src, ARM_TMP1);
+			src = ARM_TMP1;
+		}
+		encode_int_fcvt(ctx, sf, 0, type, rmode, opc, src, dst_fpr);
+	} else {
+		// int -> int.
+		switch( in_mode ) {
+		case M_UI8:
+		case M_UI16:
+			// UI8/UI16 are inherently unsigned in HL — widening to a larger
+			// integer must always zero-extend, matching x86's MOVZX.  The
+			// `unsign` flag is only meaningful for FP conversions.
+			if( out_mode == M_PTR || out_mode == M_I32 ) {
+				emit_uxt_to_w(ctx, in_mode, a_gpr, dst_gpr);
+			} else if( out_mode == M_UI16 || out_mode == M_UI8 ) {
+				emit_uxt_to_w(ctx, out_mode, a_gpr, dst_gpr);
+			}
+			break;
+		case M_I32:
+			if( out_mode == M_PTR ) {
+				if( unsign ) {
+					// ORR Wd, WZR, Wn — a 32-bit register write clears bits
+					// [63:32].  Encoded directly instead of going through
+					// emit_mov_gpr: that helper emits nothing when dst == src,
+					// which would leave whatever the upper half held (e.g. an
+					// incoming 32-bit argument) in place.
+					encode_logical_reg(ctx, 0, 0x01, 0, 0, a_gpr, 0, XZR, dst_gpr);
+				} else {
+					emit_sxt_to_ptr(ctx, M_I32, a_gpr, dst_gpr);
+				}
+			} else {
+				emit_mov_gpr(ctx, dst_gpr, a_gpr, sf_for(out_mode));
+				if( out_mode == M_UI8 || out_mode == M_UI16 )
+					emit_uxt_to_w(ctx, out_mode, dst_gpr, dst_gpr);
+			}
+			break;
+		case M_PTR:
+			if( out_mode == M_I32 ) {
+				emit_mov_gpr(ctx, dst_gpr, a_gpr, 0);  // truncate
+			} else if( out_mode == M_UI8 || out_mode == M_UI16 ) {
+				emit_uxt_to_w(ctx, out_mode, a_gpr, dst_gpr);
+			} else {
+				emit_mov_gpr(ctx, dst_gpr, a_gpr, 1);
+			}
+			break;
+		default:
+			jit_error("aarch64 emit_conv: unsupported int conversion");
+		}
+	}
+
+	if( out_to_mem ) {
+		if( out_fp ) {
+			// STR D31/S31, [base+offs] — base might be inside a register operand
+			// of `out_e`; use emit_ld_st with the FP class.
+			Arm64Reg base = gpr_id(out_e);
+			int offs = REG_VALUE(out_e);
+			emit_ld_st(ctx, false, out_mode, dst_fpr, base, offs);
+		} else {
+			emit_mov(ctx, out_e, R(ARM_TMP2), out_mode);
+		}
+	}
+}
+
+// ----------------------------------------------------------------------------
+// STORE / LOAD_ADDR / LEA.
+// ----------------------------------------------------------------------------
+// STORE: write operand e->b to memory at [e->a + e->size_offs] using the
+// width / class given by e->mode.
+//   - e->a supplies the base address.  A register operand is used directly
+//     (its REG_VALUE is added to the offset); anything else is first moved
+//     into ARM_TMP1 as a pointer.
+//   - e->b supplies the value.  Operands that are not plain registers are
+//     staged in whichever of ARM_TMP1 / ARM_TMP2 is not holding the base.
+//   - FP values that are not in an FP register are stored as the integer bit
+//     pattern of the same size.
+static void emit_store( code_ctx *ctx, einstr *e ) {
+	int offs = e->size_offs;
+	Arm64Reg base;
+	if( REG_KIND(e->a) == R_REG ) {
+		base = gpr_id(e->a);
+		// MK_STACK_OFFS(v) and MK_ADDR-like values encode the offset in the
+		// register's value field; combine it with size_offs.  For regular
+		// register operands REG_VALUE is 0, so this is a no-op.
+		offs += REG_VALUE(e->a);
+	} else {
+		emit_mov(ctx, R(ARM_TMP1), e->a, M_PTR);
+		base = ARM_TMP1;
+	}
+	if( is_fp_mode(e->mode) ) {
+		if( REG_KIND(e->b) == R_REG ) {
+			// Value already sits in an FP register: plain FP store.
+			emit_ld_st(ctx, false, e->mode, fpr_id(e->b), base, offs);
+		} else {
+			// Route the bit pattern through a GPR.  STR writes the same bytes
+			// regardless of FP vs. INT class.
+			Arm64Reg tmp = (base == ARM_TMP1) ? ARM_TMP2 : ARM_TMP1;
+			emit_mode int_mode = (e->mode == M_F32) ? M_I32 : M_PTR;
+			if( REG_KIND(e->b) == R_CONST ) {
+				load_immediate(ctx, (int64_t)REG_VALUE(e->b), tmp, sf_for(int_mode) == 1);
+			} else if( REG_KIND(e->b) == R_REG_PTR ) {
+				// Spilled FP vreg: load via emit_ld_st_ex so the offset-temp picker
+				// can avoid clobbering `base` (parked in ARM_TMP1 when e->a was spilled).
+				emit_ld_st_ex(ctx, true, int_mode, tmp, gpr_id(e->b), REG_VALUE(e->b), base);
+			} else {
+				emit_mov(ctx, R(tmp), e->b, int_mode);
+			}
+			emit_ld_st_ex(ctx, false, int_mode, tmp, base, offs, (Arm64Reg)-1);
+		}
+		return;
+	}
+	// Integer / pointer store: pick the register that holds the value.
+	int reg_t;
+	if( REG_KIND(e->b) == R_REG && REG_VALUE(e->b) == 0 ) {
+		reg_t = gpr_id(e->b);
+	} else {
+		// Stage the value in the temp that is not currently holding `base`.
+		Arm64Reg tmp = (base == ARM_TMP1) ? ARM_TMP2 : ARM_TMP1;
+		if( REG_KIND(e->b) == R_REG ) {
+			// MK_STACK_OFFS / LEA-rewritten ADDRESS: source encodes (reg, offs).
+			// Materialize the effective address into tmp.
+			emit_lea_imm(ctx, tmp, gpr_id(e->b), REG_VALUE(e->b));
+		} else if( REG_KIND(e->b) == R_CONST ) {
+			load_immediate(ctx, (int64_t)REG_VALUE(e->b), tmp, sf_for(e->mode) == 1);
+		} else if( REG_KIND(e->b) == R_REG_PTR ) {
+			// Load directly via emit_ld_st_ex so we can tell it to avoid
+			// clobbering `base` (which lives in ARM_TMP1 when e->a was spilled).
+			emit_ld_st_ex(ctx, true, e->mode, tmp, gpr_id(e->b), REG_VALUE(e->b), base);
+		} else {
+			emit_mov(ctx, R(tmp), e->b, e->mode);
+		}
+		reg_t = tmp;
+	}
+	emit_ld_st_ex(ctx, false, e->mode, reg_t, base, offs, (Arm64Reg)-1);
+}
+
+// LOAD_ADDR: load a value from memory at [e->a + e->size_offs] into `out_e`.
+// The load width/class comes from e->nargs (reinterpreted as an emit_mode).
+// When e->a is a register operand its REG_VALUE is folded into the
+// displacement; otherwise the address is first moved into ARM_TMP1.
+// A non-register destination is written through a scratch register
+// (V31 for FP loads, ARM_TMP2 for integer loads).
+static void emit_load_addr( code_ctx *ctx, einstr *e, ereg out_e ) {
+	const emit_mode load_mode = (emit_mode)e->nargs;
+	const bool out_in_reg = REG_KIND(out_e) == R_REG;
+	int disp = e->size_offs;
+	Arm64Reg addr_reg = ARM_TMP1;
+
+	if( REG_KIND(e->a) != R_REG ) {
+		// Address operand is not in a register: bring it into ARM_TMP1.
+		emit_mov(ctx, R(ARM_TMP1), e->a, M_PTR);
+	} else {
+		addr_reg = gpr_id(e->a);
+		disp += REG_VALUE(e->a);
+	}
+
+	if( !is_fp_mode(load_mode) ) {
+		Arm64Reg target = out_in_reg ? gpr_id(out_e) : ARM_TMP2;
+		emit_ld_st(ctx, true, load_mode, target, addr_reg, disp);
+		// NOTE(review): the write-back below uses e->mode while the load above
+		// uses e->nargs as its mode — confirm the two are meant to differ.
+		if( !out_in_reg )
+			emit_mov(ctx, out_e, R(ARM_TMP2), e->mode);
+		return;
+	}
+
+	if( out_in_reg ) {
+		emit_ld_st(ctx, true, load_mode, fpr_id(out_e), addr_reg, disp);
+		return;
+	}
+
+	// FP destination lives in memory: load into V31, then store it to the
+	// (register, offset) pair encoded in out_e.
+	emit_ld_st(ctx, true, load_mode, (Arm64FpReg)31, addr_reg, disp);
+	emit_ld_st(ctx, false, load_mode, (Arm64FpReg)31, gpr_id(out_e), REG_VALUE(out_e));
+}
+
+// Emit `dst = src + offs` for a non-zero constant displacement.
+// Displacements within +/-0xFFF use ADD/SUB (immediate).  Anything larger is
+// materialized in ARM_TMP2 and added with ADD (extended register, UXTX):
+// unlike the shifted-register form, the extended form interprets register 31
+// in the Rn position as SP rather than XZR, so an SP base is not silently
+// dropped.  For every other register the two forms compute the same result.
+// Clobbers ARM_TMP2 on the large-displacement path.
+static void lea_add_const_offs( code_ctx *ctx, Arm64Reg dst, Arm64Reg src, int offs ) {
+	if( offs > 0 && offs <= 0xFFF ) {
+		encode_add_sub_imm(ctx, 1, 0, 0, 0, offs, src, dst);
+	} else if( offs < 0 && offs >= -0xFFF ) {
+		// Range test written without negating `offs` so INT_MIN cannot overflow.
+		encode_add_sub_imm(ctx, 1, 1, 0, 0, -offs, src, dst);
+	} else {
+		load_immediate(ctx, offs, ARM_TMP2, true);
+		encode_add_sub_ext(ctx, /*sf=*/1, /*op=*/0, /*S=*/0, ARM_TMP2, /*option=UXTX*/3, 0, src, dst);
+	}
+}
+
+// LEA: out = a + offs, or out = a + (UXTW(b) << log2(mult)) + offs.
+// e->size_offs packs the scale in its low 8 bits and the displacement in the
+// remaining bits; when e->a is a register operand its REG_VALUE is added to
+// the displacement.  A non-register `a` is moved into ARM_TMP1 first, and a
+// non-register destination is computed in ARM_TMP1 and then moved to out_e.
+static void emit_lea( code_ctx *ctx, einstr *e, ereg out_e ) {
+	int mult = e->size_offs & 0xFF;
+	int offs = e->size_offs >> 8;
+	if( REG_KIND(e->a) == R_REG ) offs += REG_VALUE(e->a);
+
+	Arm64Reg out = (REG_KIND(out_e) == R_REG) ? gpr_id(out_e) : ARM_TMP1;
+	Arm64Reg a;
+	if( REG_KIND(e->a) == R_REG ) {
+		a = gpr_id(e->a);
+	} else {
+		emit_mov(ctx, R(ARM_TMP1), e->a, M_PTR);
+		a = ARM_TMP1;
+	}
+
+	if( mult == 0 || IS_NULL(e->b) ) {
+		// out = a + offs
+		if( offs == 0 )
+			emit_mov_gpr(ctx, out, a, 1);
+		else
+			lea_add_const_offs(ctx, out, a, offs);
+	} else {
+		if( mult != 1 && mult != 2 && mult != 4 && mult != 8 )
+			jit_error("aarch64 LEA: unsupported scale");
+		int shift = (mult == 1) ? 0 : (mult == 2) ? 1 : (mult == 4) ? 2 : 3;
+		// The index is materialized as a 32-bit value (M_I32) and fed through
+		// the extended-register ADD with UXTW so only its low 32 bits take
+		// part in the address calculation.  The `a` argument tells the
+		// materializer which register must not be clobbered.
+		Arm64Reg b = materialize_gpr_ex(ctx, e->b, M_I32, ARM_TMP2, a);
+		// out = a + UXTW(b) << shift
+		encode_add_sub_ext(ctx, /*sf=*/1, /*op=*/0, /*S=*/0, b, /*option=UXTW*/2, shift, a, out);
+		// The index register has been consumed, so ARM_TMP2 is free again for
+		// a large displacement.
+		if( offs != 0 )
+			lea_add_const_offs(ctx, out, out, offs);
+	}
+
+	if( REG_KIND(out_e) != R_REG ) emit_mov(ctx, out_e, R(ARM_TMP1), M_PTR);
+}
+
+// ----------------------------------------------------------------------------
+// CMOV / XCHG / PUSH_CONST / PREFETCH.
+// ----------------------------------------------------------------------------
+// CMOV: if `cond` holds, out = a; otherwise out keeps its value.
+// The destination must be a register operand; the source is materialized as
+// a pointer-width value, using ARM_TMP1 as the scratch register.
+static void emit_cmov_arm( code_ctx *ctx, ereg out_e, ereg a_e, ArmCondition cond ) {
+	const bool dest_is_reg = REG_KIND(out_e) == R_REG;
+	if( !dest_is_reg )
+		jit_error("aarch64 CMOV non-reg out");
+	Arm64Reg dest = gpr_id(out_e);
+	Arm64Reg taken = materialize_gpr(ctx, a_e, M_PTR, ARM_TMP1);
+	// CSEL dest, taken, dest, cond
+	encode_cond_select(ctx, 1, 0, dest, cond, 0, taken, dest);
+}
+
+// XCHG: swap two general-purpose registers through ARM_TMP1.
+// Both operands must be register operands.
+static void emit_xchg( code_ctx *ctx, einstr *e ) {
+	if( !(REG_KIND(e->a) == R_REG && REG_KIND(e->b) == R_REG) )
+		jit_error("aarch64 XCHG with non-reg operand");
+	Arm64Reg first = gpr_id(e->a);
+	Arm64Reg second = gpr_id(e->b);
+	emit_mov_gpr(ctx, ARM_TMP1, first, 1);   // tmp    = first
+	emit_mov_gpr(ctx, first, second, 1);     // first  = second
+	emit_mov_gpr(ctx, second, ARM_TMP1, 1);  // second = tmp
+}
+
+// PUSH_CONST: push the 64-bit constant e->value.  Only M_PTR is accepted.
+// SP is moved down by 16 bytes and the constant is stored at [SP].
+static void emit_push_const( code_ctx *ctx, einstr *e ) {
+	if( e->mode != M_PTR )
+		jit_error("aarch64 PUSH_CONST non-ptr mode");
+	const int64_t imm = (int64_t)e->value;
+	load_immediate(ctx, imm, ARM_TMP1, true);
+	emit_sp_offs(ctx, -16);
+	// STR ARM_TMP1, [SP]
+	encode_ldr_str_imm(ctx, 3, 0, 0, 0, SP_REG, ARM_TMP1);
+}
+
+// ----------------------------------------------------------------------------
+// Phase 4: constant-pool helpers.
+// ----------------------------------------------------------------------------
+
+// Reserve `size` bytes in the constant table and return the byte offset of
+// the reserved segment.  When `align` is non-zero the table is first padded
+// so the segment starts on a multiple of `align`; the mask arithmetic
+// requires `align` to be a power of two.
+static int reserve_const_segment( code_ctx *ctx, int size, int align ) {
+	if( align ) {
+		int misalign = byte_count(ctx->const_table) & (align - 1);
+		if( misalign )
+			byte_reserve_impl(&ctx->jit->galloc, &ctx->const_table, align - misalign);
+	}
+	int start = byte_count(ctx->const_table);
+	byte_reserve_impl(&ctx->jit->galloc, &ctx->const_table, size);
+	return start;
+}
+
+// Look up `value` in the constant table, appending a new 8-byte aligned slot
+// when it is not present yet, and record a relocation for the instruction
+// pair that starts at `adrp_pos` (a byte offset inside the current function's
+// code buffer).  The relocation is stored in ctx->const_refs as the pair
+// (absolute output position of the ADRP, table offset of the value).
+// Returns the byte offset of the value inside ctx->const_table.
+static int alloc_const( code_ctx *ctx, uint64_t value, int adrp_pos ) {
+	int slot = value_map_find(ctx->const_table_lookup, value);
+	if( slot < 0 ) {
+		// First use of this value: create the slot and remember it for reuse.
+		slot = reserve_const_segment(ctx, 8, 8);
+		uint64_t *cell = (uint64_t*)byte_addr(ctx->const_table, slot);
+		*cell = value;
+		value_map_add_impl(&ctx->jit->galloc, &ctx->const_table_lookup, value, slot);
+	}
+	const int adrp_abs = ctx->jit->out_pos + adrp_pos;
+	int_arr_add_impl(&ctx->jit->galloc, &ctx->const_refs, adrp_abs);
+	int_arr_add_impl(&ctx->jit->galloc, &ctx->const_refs, slot);
+	return slot;
+}
+
+// Load the pooled 64-bit constant `value` into `dst`:
+//   ADRP dst, <page> ; LDR dst, [dst, #lo12]
+// Both immediates are emitted as zero and patched later through the
+// relocation recorded by alloc_const.
+static void emit_const_load( code_ctx *ctx, Arm64Reg dst, uint64_t value ) {
+	const int pair_start = byte_count(ctx->code);
+	encode_adrp(ctx, 0, 0, dst);                   // ADRP, imm21 placeholder
+	encode_ldr_str_imm(ctx, 3, 0, 1, 0, dst, dst); // LDR Xd, [Xd, #0]
+	alloc_const(ctx, value, pair_start);
+}
+
+// Load the ADDRESS of the pool entry holding `value` into `dst`:
+//   ADRP dst, <page> ; ADD dst, dst, #lo12
+// Both immediates are emitted as zero and patched later through the
+// relocation recorded by alloc_const.
+static void emit_const_addr( code_ctx *ctx, Arm64Reg dst, uint64_t value ) {
+	const int pair_start = byte_count(ctx->code);
+	encode_adrp(ctx, 0, 0, dst);                      // ADRP, imm21 placeholder
+	encode_add_sub_imm(ctx, 1, 0, 0, 0, 0, dst, dst); // ADD Xd, Xd, #0
+	alloc_const(ctx, value, pair_start);
+}
+
+// Emit an ADRP+ADD pair whose result is the address of byte offset
+// `const_offset` inside the constant table (used for the jump-table base).
+// Unlike emit_const_addr, the table offset is recorded directly in
+// ctx->const_refs instead of being looked up from a value.
+static void emit_pool_offset_addr( code_ctx *ctx, Arm64Reg dst, int const_offset ) {
+	const int adrp_abs = ctx->jit->out_pos + byte_count(ctx->code);
+	encode_adrp(ctx, 0, 0, dst);                      // ADRP, imm21 placeholder
+	encode_add_sub_imm(ctx, 1, 0, 0, 0, 0, dst, dst); // ADD Xd, Xd, #0
+	int_arr_add_impl(&ctx->jit->galloc, &ctx->const_refs, adrp_abs);
+	int_arr_add_impl(&ctx->jit->galloc, &ctx->const_refs, const_offset);
+}
+
+// ----------------------------------------------------------------------------
+// Phase 4: call ops, LOAD_FUN, JUMP_TABLE.
+// ----------------------------------------------------------------------------
+
+// CALL_FUN: emit a BL whose imm26 is left at zero, and queue a relocation in
+// ctx->funs as the triple (absolute BL position, function id, kind=0).
+// hl_codegen_flush_consts resolves it from jit->mod->functions_ptrs[fid].
+static void emit_call_fun( code_ctx *ctx, einstr *e ) {
+	const int bl_abs = ctx->jit->out_pos + byte_count(ctx->code);
+	encode_branch_link(ctx, 0); // imm26 placeholder
+	int_arr_add_impl(&ctx->jit->galloc, &ctx->funs, bl_abs);
+	int_arr_add_impl(&ctx->jit->galloc, &ctx->funs, (int)e->a);
+	int_arr_add_impl(&ctx->jit->galloc, &ctx->funs, /*kind=BL*/0);
+}
+
+// LOAD_FUN: produce the address of JIT function `fid` in `out_e` using an
+// ADRP+ADD pair with zero immediates.  The pair is queued in ctx->funs as
+// (absolute ADRP position, fid, kind=1) and patched in
+// hl_codegen_flush_consts.  A non-register destination is built in ARM_TMP1
+// and then moved to out_e.
+static void emit_load_fun( code_ctx *ctx, ereg out_e, int fid ) {
+	const bool dest_is_reg = REG_KIND(out_e) == R_REG;
+	Arm64Reg dest = dest_is_reg ? gpr_id(out_e) : ARM_TMP1;
+	const int adrp_abs = ctx->jit->out_pos + byte_count(ctx->code);
+	encode_adrp(ctx, 0, 0, dest);
+	encode_add_sub_imm(ctx, 1, 0, 0, 0, 0, dest, dest);
+	int_arr_add_impl(&ctx->jit->galloc, &ctx->funs, adrp_abs);
+	int_arr_add_impl(&ctx->jit->galloc, &ctx->funs, fid);
+	int_arr_add_impl(&ctx->jit->galloc, &ctx->funs, /*kind=ADRP+ADD*/1);
+	if( !dest_is_reg )
+		emit_mov(ctx, out_e, R(ARM_TMP1), M_PTR);
+}
+
+// CALL_PTR: call the native function whose address is e->value.
+// Calls to hl_null_access and hl_jit_null_field_access are emitted as a
+// direct BL to the stubs whose positions were recorded in hl_codegen_init;
+// every other target is loaded from the constant pool into ARM_TMP1 and
+// called with BLR.  For M_UI8 / M_UI16 results, W0 is masked to the low
+// 8 / 16 bits after the call.
+static void emit_call_ptr( code_ctx *ctx, einstr *e ) {
+	const uint64_t callee = (uint64_t)e->value;
+	int stub_pos;
+	if( callee == (uint64_t)(uintptr_t)hl_null_access )
+		stub_pos = ctx->null_access_pos;
+	else if( callee == (uint64_t)(uintptr_t)hl_jit_null_field_access )
+		stub_pos = ctx->null_field_pos;
+	else
+		stub_pos = -1;
+
+	if( stub_pos < 0 ) {
+		emit_const_load(ctx, ARM_TMP1, callee);
+		encode_branch_reg(ctx, /*BLR*/1, ARM_TMP1);
+	} else {
+		// Call site and stub are both positions in the same output buffer,
+		// so the BL displacement is known right away.
+		int here = ctx->jit->out_pos + byte_count(ctx->code);
+		intptr_t dist = (intptr_t)stub_pos - (intptr_t)here;
+		encode_branch_link(ctx, (int)(dist >> 2));
+	}
+
+	// Sub-word return masking to match x86's MOVZX behavior.
+	switch( e->mode ) {
+	case M_UI8:
+		encode_logical_imm(ctx, 0, 0x00, 0, 0, 7, X0, X0);  // AND W0, W0, #0xFF
+		break;
+	case M_UI16:
+		encode_logical_imm(ctx, 0, 0x00, 0, 0, 15, X0, X0); // AND W0, W0, #0xFFFF
+		break;
+	default:
+		break;
+	}
+}
+
+// CALL_REG: indirect call through the pointer held in e->a.  The operand is
+// materialized as a pointer-width value (ARM_TMP1 is the scratch register)
+// and called with BLR.
+static void emit_call_reg( code_ctx *ctx, einstr *e ) {
+	Arm64Reg callee = materialize_gpr(ctx, e->a, M_PTR, ARM_TMP1);
+	encode_branch_reg(ctx, /*BLR*/1, callee);
+}
+
+// JUMP_TABLE: branch through a table of e->nargs 8-byte entries reserved in
+// the constant table (16-byte aligned).  The entries are filled with absolute
+// target addresses in hl_codegen_final; here each entry is queued in
+// ctx->const_addr as (table byte offset, target op index), where the target
+// op index is cur_op + args[k] + 1.  The index operand e->a is treated as a
+// 32-bit value and zero-extended into ARM_TMP2.  The sequence ends with BR,
+// so control never falls through.
+static void emit_jump_table( code_ctx *ctx, einstr *e ) {
+	const int count = e->nargs;
+	const int table_start = reserve_const_segment(ctx, 8 * count, 16);
+
+	// Pick the register currently holding the index; a non-register operand
+	// is first moved into ARM_TMP2 as a 32-bit value.
+	Arm64Reg idx_src = ARM_TMP2;
+	if( REG_KIND(e->a) == R_REG )
+		idx_src = gpr_id(e->a);
+	else
+		emit_mov(ctx, R(ARM_TMP2), e->a, M_I32);
+	// ORR W17, WZR, W<idx_src> — a 32-bit move that clears the upper 32 bits.
+	encode_logical_reg(ctx, 0, 0x01, 0, 0, idx_src, 0, XZR, ARM_TMP2);
+
+	emit_pool_offset_addr(ctx, ARM_TMP1, table_start);
+	// LDR X16, [X16, X17, LSL #3] ; BR X16
+	encode_ldr_str_reg(ctx, 3, 0, 1, ARM_TMP2, /*option=*/3, /*S=*/1, ARM_TMP1, ARM_TMP1);
+	encode_branch_reg(ctx, /*BR*/0, ARM_TMP1);
+
+	ereg *targets = hl_emit_get_args(ctx->jit->emit, e);
+	for( int k = 0, slot = table_start; k < count; k++, slot += 8 ) {
+		int_arr_add_impl(&ctx->jit->galloc, &ctx->const_addr, slot);
+		int_arr_add_impl(&ctx->jit->galloc, &ctx->const_addr, ctx->cur_op + (int)targets[k] + 1);
+	}
+}
+
+// PREFETCH: emit PRFM <prfop>, [base] for the address in e->a.
+// e->size_offs selects the prefetch operation:
+//   0 -> PLDL1KEEP, 1 -> PLDL2KEEP, 2 -> PLDL3KEEP, 3 -> PLDL1STRM,
+//   4 -> PSTL1KEEP; any other value is reported through jit_error.
+// A non-register address operand is first moved into ARM_TMP1.
+static void emit_prefetch( code_ctx *ctx, einstr *e ) {
+	// Initialized so the encoder never reads an indeterminate value should
+	// jit_error return for an unknown selector.
+	int prfop = 0;
+	switch( e->size_offs ) {
+	case 0: prfop = 0; break;   // PLDL1KEEP
+	case 1: prfop = 2; break;   // PLDL2KEEP
+	case 2: prfop = 4; break;   // PLDL3KEEP
+	case 3: prfop = 1; break;   // PLDL1STRM
+	case 4: prfop = 16; break;  // PSTL1KEEP
+	default: jit_error("aarch64 PREFETCH: bad size_offs"); break;
+	}
+	Arm64Reg base;
+	if( REG_KIND(e->a) == R_REG ) {
+		base = gpr_id(e->a);
+	} else {
+		emit_mov(ctx, R(ARM_TMP1), e->a, M_PTR);
+		base = ARM_TMP1;
+	}
+	// PRFM: size=11, V=0, opc=10, imm12=0, Rn=base; the Rt field carries prfop.
+	encode_ldr_str_imm(ctx, 3, 0, 2, 0, base, (Arm64Reg)prfop);
+}
+
+// ============================================================================
+// hl_codegen_flush
+// ============================================================================
+
+// Publish the current code buffer to the jit context (size, bytes and the
+// op-index -> byte-offset map) and record the end offset in the map slot
+// after the current op.  Does nothing if the buffer was already flushed
+// since the last hl_codegen_function call.
+void hl_codegen_flush( jit_ctx *jit ) {
+	code_ctx *ctx = jit->code;
+	if( !ctx->flushed ) {
+		ctx->flushed = true;
+		jit->code_size = ctx->code.cur;
+		jit->code_instrs = ctx->code.values;
+		jit->code_pos_map = ctx->pos_map;
+		// pos_map is NULL until the first hl_codegen_function call allocates it.
+		if( ctx->pos_map != NULL )
+			ctx->pos_map[ctx->cur_op + 1] = ctx->code.cur;
+	}
+}
+
+// ============================================================================
+// hl_codegen_function — the main per-IR-op switch
+// ============================================================================
+
+// Generate machine code for the function currently held in jit->reg_instrs
+// (jit->reg_instr_count entries, destination operands in jit->reg_writes).
+//
+// For each IR op the byte offset at which its code starts is stored in
+// ctx->pos_map; hl_codegen_flush then stores the end offset in the slot after
+// the last op.  Once everything is emitted, the in-function branches queued
+// in ctx->branch_fixups are patched, and the jump-table entries queued by
+// emit_jump_table during this call are converted from target op indexes to
+// absolute output offsets.
+void hl_codegen_function( jit_ctx *jit ) {
+	code_ctx *ctx = jit->code;
+	ctx->flushed = false;
+	byte_free(&ctx->code);
+	int_arr_free(&ctx->branch_fixups);
+	free(ctx->pos_map);
+	ctx->pos_map = (int*)malloc((jit->reg_instr_count + 1) * sizeof(int));
+	if( ctx->pos_map == NULL ) jit_error("aarch64: out of memory allocating pos_map");
+	ctx->pos_map[0] = 0;
+	// Reset the op cursor so that, for a function with no instructions,
+	// hl_codegen_flush writes pos_map[0] instead of indexing the fresh map
+	// with the previous function's last op.
+	ctx->cur_op = -1;
+	// Make sure at least 64 bytes are available, then rewind the cursor.
+	byte_reserve(ctx->code, 64);
+	ctx->code.cur -= 64;
+
+	// Jump-table entries added from here on belong to this function.
+	int const_addr_prev = int_arr_count(ctx->const_addr);
+
+	for( int cur_pos = 0; cur_pos < jit->reg_instr_count; cur_pos++ ) {
+		einstr *e = jit->reg_instrs + cur_pos;
+		ereg out = jit->reg_writes[cur_pos];
+		// Guarantee 64 bytes of room for this op before emitting it.
+		byte_reserve(ctx->code, 64);
+		ctx->code.cur -= 64;
+		ctx->cur_op = cur_pos;
+		if( cur_pos > 0 ) ctx->pos_map[cur_pos] = ctx->code.cur;
+
+		switch( e->op ) {
+		case LOAD_ARG:
+			// nop — argument lives in its allocated register already
+			continue;
+		case NOP:
+			// HINT #0  (NOP)
+			EMIT32(ctx, 0xD503201F);
+			break;
+		case MOV:
+			emit_mov(ctx, out, e->a, e->mode);
+			break;
+		case LOAD_CONST:
+			emit_load_const(ctx, out, e->value, e->mode);
+			break;
+		case RET:
+			// Result placement was handled by upstream regs phase via a preceding
+			// regs_emit_mov(out, e->a). Here we just emit the actual return.
+			encode_branch_reg(ctx, /*opc=*/2 /*RET*/, LR);
+			break;
+		case PUSH:
+			emit_push(ctx, e->a, e->mode);
+			break;
+		case POP:
+			emit_pop(ctx, e->a, e->mode);
+			break;
+		case STACK_OFFS:
+			emit_sp_offs(ctx, e->size_offs);
+			break;
+		case CMP:
+			emit_cmp(ctx, e);
+			break;
+		case TEST:
+			emit_test(ctx, e);
+			break;
+		case JCOND:
+			emit_jump_cond(ctx, get_cond_jump(ctx), e->size_offs);
+			break;
+		case JUMP:
+			emit_jump(ctx, e->size_offs);
+			break;
+		case DEBUG_BREAK:
+			// BRK #0  — encoded as 0xD4200000
+			EMIT32(ctx, 0xD4200000);
+			break;
+		case BINOP:
+			if( is_fp_mode(e->mode) )
+				emit_binop_fp(ctx, (hl_op)e->size_offs, out, e->a, e->b, e->mode);
+			else
+				emit_binop_int(ctx, (hl_op)e->size_offs, out, e->a, e->b, e->mode);
+			break;
+		case UNOP:
+			// jit_emit.c lowers `not b` and similar boolean toggles as a UNOP
+			// with two operands (a, b=immediate, op=OXor).  Dispatch the
+			// two-operand form through the regular binop handler so OXor/OAnd/OOr
+			// don't need a second copy of the encoding.
+			if( !IS_NULL(e->b) ) {
+				if( is_fp_mode(e->mode) )
+					emit_binop_fp(ctx, (hl_op)e->size_offs, out, e->a, e->b, e->mode);
+				else
+					emit_binop_int(ctx, (hl_op)e->size_offs, out, e->a, e->b, e->mode);
+			} else {
+				emit_unop(ctx, (hl_op)e->size_offs, out, e->a, e->mode);
+			}
+			break;
+		case CONV:
+			emit_conv(ctx, e, out, /*unsign=*/false);
+			break;
+		case CONV_UNSIGNED:
+			emit_conv(ctx, e, out, /*unsign=*/true);
+			break;
+		case STORE:
+			emit_store(ctx, e);
+			break;
+		case LOAD_ADDR:
+			emit_load_addr(ctx, e, out);
+			break;
+		case LEA:
+			emit_lea(ctx, e, out);
+			break;
+		case CMOV:
+			emit_cmov_arm(ctx, out, e->a, get_cond_jump(ctx));
+			break;
+		case XCHG:
+			emit_xchg(ctx, e);
+			break;
+		case CXCHG:
+			// x86 emits BREAK() here too — atomic compare-exchange unimplemented.
+			EMIT32(ctx, 0xD4200000);
+			break;
+		case PUSH_CONST:
+			emit_push_const(ctx, e);
+			break;
+		case PREFETCH:
+			emit_prefetch(ctx, e);
+			break;
+		case CALL_FUN:
+			emit_call_fun(ctx, e);
+			break;
+		case CALL_PTR:
+			emit_call_ptr(ctx, e);
+			break;
+		case CALL_REG:
+			emit_call_reg(ctx, e);
+			break;
+		case LOAD_FUN:
+			emit_load_fun(ctx, out, e->size_offs);
+			break;
+		case JUMP_TABLE:
+			emit_jump_table(ctx, e);
+			break;
+		case ADDRESS:
+			// Rewritten to LEA in the regs phase; should never reach here.
+			jit_error("aarch64: ADDRESS reached backend (regs phase should rewrite)");
+			break;
+		case CATCH:
+			// IR marker only (mirrors x86) — no code emitted.
+			break;
+		default:
+			{
+				// NOTE(review): this table is assumed to follow the declaration
+				// order of the IR op enum — confirm whenever ops are added.
+				static const char *op_names[] = {
+					"LOAD_ADDR", "LOAD_CONST", "LOAD_ARG", "LOAD_FUN", "STORE",
+					"LEA", "TEST", "CMP", "JCOND", "JUMP", "JUMP_TABLE",
+					"BINOP", "UNOP", "CONV", "CONV_UNSIGNED", "RET",
+					"CALL_PTR", "CALL_REG", "CALL_FUN", "MOV", "CMOV",
+					"XCHG", "CXCHG", "PUSH_CONST", "PUSH", "POP",
+					"ALLOC_STACK", "PREFETCH", "DEBUG_BREAK", "BLOCK",
+					"ENTER", "STACK_OFFS", "CATCH", "ADDRESS", "NOP"
+				};
+				static char errbuf[128];
+				// Unsigned comparison also rejects a negative op value.
+				const char *name = ((unsigned)e->op < (unsigned)(sizeof(op_names)/sizeof(*op_names)))
+					? op_names[e->op] : "?";
+				snprintf(errbuf, sizeof(errbuf), "aarch64: unhandled IR op %s (%d) at cur_op=%d",
+					name, e->op, cur_pos);
+				jit_error(errbuf);
+			}
+			break;
+		}
+
+		if( ctx->code.cur > ctx->code.max ) jit_error("aarch64 code buffer overrun");
+	}
+
+	// Functions are 4-byte aligned naturally on ARM; no padding needed for now.
+	hl_codegen_flush(jit);
+
+	// Patch all in-function branches.  Each fixup is a triple
+	// (branch byte position, target op index, is-conditional flag).
+	for( int i = 0; i < int_arr_count(ctx->branch_fixups); i += 3 ) {
+		int pos = int_arr_get(ctx->branch_fixups, i);
+		int target_op = int_arr_get(ctx->branch_fixups, i + 1);
+		int is_cond = int_arr_get(ctx->branch_fixups, i + 2);
+		int target_byte_pos = ctx->pos_map[target_op];
+		patch_branch(ctx, pos, target_byte_pos, is_cond);
+	}
+
+	// Convert any jump-table target_op_index entries recorded by emit_jump_table
+	// into absolute byte offsets in the output buffer.
+	for( int i = const_addr_prev; i < int_arr_count(ctx->const_addr); i += 2 ) {
+		int target_op = int_arr_get(ctx->const_addr, i + 1);
+		int offs = jit->out_pos + ctx->pos_map[target_op];
+		ctx->const_addr.values[i + 1] = offs;
+	}
+}
+
+// ============================================================================
+// Phase 4: module-level emission.
+// ============================================================================
+
+// Finalize a helper stub that begins at absolute output position `start`
+// (null-access stubs, c2hl, hl2c): report its start and current size through
+// hl_jit_define_function, then pad the code buffer with NOPs up to the next
+// 16-byte boundary.  The size is taken before the padding is added.
+static void flush_helper( code_ctx *ctx, int start ) {
+	const int stub_size = ctx->jit->out_pos + byte_count(ctx->code) - start;
+	hl_jit_define_function(ctx->jit, start, stub_size);
+	for( ; byte_count(ctx->code) & 15; ) {
+		EMIT32(ctx, 0xD503201F); // NOP
+	}
+	if( byte_count(ctx->code) > ctx->code.max )
+		jit_error("aarch64 trampoline overrun");
+}
+
+// Patch the placeholder branch at byte position `pos` of the code buffer so
+// it targets byte position `target` in the same buffer.  The top six opcode
+// bits select the field: B / BL get an imm26 in bits [25:0]; every other
+// instruction is patched as an imm19 in bits [23:5], which is the layout
+// shared by B.cond and CBZ/CBNZ (the c2hl trampoline patches a CBZ this
+// way).  The displacement is truncated to the field width without a range
+// check.
+static void patch_helper_branch( code_ctx *ctx, int pos, int target ) {
+	unsigned int *word = (unsigned int*)&ctx->code.values[pos];
+	const unsigned int disp = (unsigned)((target - pos) >> 2);
+	unsigned int enc = *word;
+	switch( (enc >> 26) & 0x3F ) {
+	case 0x05: // B
+	case 0x25: // BL
+		enc = (enc & ~0x03FFFFFFu) | (disp & 0x03FFFFFF);
+		break;
+	default:   // imm19 forms
+		enc = (enc & ~(0x7FFFFu << 5)) | ((disp & 0x7FFFF) << 5);
+		break;
+	}
+	*word = enc;
+}
+
+// Emit the frame-setup prologue shared by the helper stubs; intended as
+//   STP X29, X30, [SP, #-16]! ; MOV X29, SP
+// NOTE(review): other call sites in this file pass mode 0x12 for a
+// signed-offset store and 0x02 for a signed-offset load; confirm that
+// encode_ldp_stp treats mode 0x03 as the pre-indexed STORE form.
+static void emit_helper_prologue( code_ctx *ctx ) {
+	const int frame_imm7 = -2 & 0x7F; // -2 * 8 bytes = -16, as a 7-bit field
+	encode_ldp_stp(ctx, /*opc=*/2, /*V=*/0, /*mode=*/0x03, frame_imm7, LR, SP_REG, FP);
+	emit_mov_gpr(ctx, FP, SP_REG, 1);
+}
+
+// Emit the frame-teardown epilogue shared by the helpers/trampolines:
+//   MOV SP, X29 ; LDP X29, X30, [SP], #16 ; RET
+// Resetting SP from X29 first discards whatever the helper allocated below
+// its frame record.
+static void emit_helper_epilogue( code_ctx *ctx ) {
+	const int frame_imm7 = 2; // 2 * 8 bytes = 16
+	emit_mov_gpr(ctx, SP_REG, FP, 1);
+	encode_ldp_stp(ctx, /*opc=*/2, /*V=*/0, /*mode=*/0x01, frame_imm7, LR, SP_REG, FP);
+	encode_branch_reg(ctx, /*RET*/2, LR);
+}
+
+// Emit the hl_null_access stub: set up a frame, load the C function pointer
+// `target` from the constant pool into ARM_TMP1 and BLR to it.  The callee
+// is not expected to return; a BRK follows the call to mirror x86.
+static void emit_null_access_stub( code_ctx *ctx, void *target ) {
+	const uint64_t callee = (uint64_t)(uintptr_t)target;
+	emit_helper_prologue(ctx);
+	emit_const_load(ctx, ARM_TMP1, callee);
+	encode_branch_reg(ctx, /*BLR*/1, ARM_TMP1);
+	EMIT32(ctx, 0xD4200000); // BRK #0
+}
+
+// Emit the hl_jit_null_field_access stub.  The caller passes the field hash
+// in W0 and the C function takes that single int argument, so the stub does
+// no marshalling: it sets up a frame, loads `target` from the constant pool
+// into ARM_TMP1 and BLRs to it.  A BRK follows the call.
+static void emit_null_field_stub( code_ctx *ctx, void *target ) {
+	const uint64_t callee = (uint64_t)(uintptr_t)target;
+	emit_helper_prologue(ctx);
+	emit_const_load(ctx, ARM_TMP1, callee);
+	encode_branch_reg(ctx, /*BLR*/1, ARM_TMP1);
+	EMIT32(ctx, 0xD4200000); // BRK #0
+}
+
+// Emit the c2hl trampoline (C code calling into JIT-compiled HL code).
+//
+// Called from C with: X0 = JIT-compiled fn ptr, X1 = &vargs (struct{regs[16];
+// stack[16]}), X2 = stack-arg count N.
+//
+// The C side (jit.c:callback_c2hl) populates vargs.regs[0..7] with int reg
+// args, vargs.regs[8..15] with FP reg args, and vargs.stack[16-N..15] with the
+// N stack args.  We:
+//   1. Load X0..X7 from [vargs+0..56] and D0..D7 from [vargs+64..120].
+//   2. Reserve N*8 bytes (+8 when N is odd, keeping SP 16-byte aligned) and
+//      copy N 8-byte slots, in ascending address order, from the source
+//      pointer to SP+0 upward.
+//   3. BLR fn ; restore frame ; RET.
+//
+// NOTE(review): an earlier version of this comment said the leftmost stack
+// arg sits at vargs.stack[15] and is pushed "in reverse order" so that it
+// lands at SP+0.  The loop below is a forward copy, so the slot at the source
+// pointer lands at SP+0 — confirm the ordering against callback_c2hl.
+//
+// X16/X17 hold the fn pointer and vargs through the call (they survive any
+// data-load up to BLR; the dynamic linker only clobbers them at the BLR itself,
+// at which point we're done with them).
+static void emit_c2hl_trampoline( code_ctx *ctx ) {
+	emit_helper_prologue(ctx);
+	emit_mov_gpr(ctx, ARM_TMP1, X0, 1);  // X16 = fn
+	emit_mov_gpr(ctx, ARM_TMP2, X1, 1);  // X17 = vargs
+	emit_mov_gpr(ctx, X9, X2, 1);        // X9  = stack count
+
+	// Load int arg regs from vargs.regs[0..7].
+	encode_ldp_stp(ctx, 0x02, 0, 0x02, 0, X1, ARM_TMP2, X0); // LDP X0,X1, [X17, #0]
+	encode_ldp_stp(ctx, 0x02, 0, 0x02, 2, X3, ARM_TMP2, X2); // LDP X2,X3, [X17, #16]
+	encode_ldp_stp(ctx, 0x02, 0, 0x02, 4, X5, ARM_TMP2, X4); // LDP X4,X5, [X17, #32]
+	encode_ldp_stp(ctx, 0x02, 0, 0x02, 6, X7, ARM_TMP2, X6); // LDP X6,X7, [X17, #48]
+	// Load FP arg regs from vargs.regs[8..15] (= byte offsets 64..120).
+	encode_ldp_stp(ctx, 0x01, 1, 0x02, 8,  (Arm64Reg)1, ARM_TMP2, (Arm64Reg)0);  // LDP D0,D1, [X17, #64]
+	encode_ldp_stp(ctx, 0x01, 1, 0x02, 10, (Arm64Reg)3, ARM_TMP2, (Arm64Reg)2);  // LDP D2,D3, [X17, #80]
+	encode_ldp_stp(ctx, 0x01, 1, 0x02, 12, (Arm64Reg)5, ARM_TMP2, (Arm64Reg)4);  // LDP D4,D5, [X17, #96]
+	encode_ldp_stp(ctx, 0x01, 1, 0x02, 14, (Arm64Reg)7, ARM_TMP2, (Arm64Reg)6);  // LDP D6,D7, [X17, #112]
+
+	// Push stack args, padding SP to 16 bytes if N is odd.
+	// total bytes = N*8 + (N&1)*8 — always a multiple of 16.
+
+	// CBZ X9, no_stack — skip everything if no stack args.  The branch offset
+	// is emitted as 0 and patched once the end of the block is known.
+	int cbz_skip_pos = byte_count(ctx->code);
+	encode_cbz_cbnz(ctx, /*sf=*/1, /*op=*/0, 0, X9);
+
+	// X10 = X9 * 8        (size in bytes; LSL #3 via UBFM).
+	emit_bitfield(ctx, /*sf=*/1, /*opc=UBFM*/0x02, /*immr=*/(64 - 3) & 0x3F, /*imms=*/63 - 3, X9, X10);
+
+	// Pad: if X9 is odd, allocate +8.  X10 += (X9 & 1) << 3
+	// AND X11, X9, #1 ; LSL X11, X11, #3 ; ADD X10, X10, X11.
+	encode_logical_imm(ctx, 1, 0x00, 1, 0, 0, X9, X11);  // AND X11, X9, #1 (immr=0,imms=0,N=1 → 1)
+	emit_bitfield(ctx, 1, 0x02, (64 - 3) & 0x3F, 63 - 3, X11, X11);
+	encode_add_sub_reg(ctx, 1, 0, 0, 0, X11, 0, X10, X10);
+
+	// SUB SP, SP, X10  — must use ADD/SUB (extended register); the shifted-reg
+	// form treats register 31 as XZR, not SP, so this would silently NOP out.
+	encode_add_sub_ext(ctx, 1, 1, 0, X10, /*UXTX*/3, 0, SP_REG, SP_REG);
+
+	// Source pointer X12 = vargs + 256 - X10, computed as X12 = vargs + 256
+	// followed by X12 -= X10.
+	// NOTE(review): X10 already includes the 8-byte pad added for odd N, so
+	// X12 equals vargs + (32 - N) * 8 (i.e. &vargs.stack[16-N]) only when N is
+	// even; for odd N it points one slot lower.  Confirm against the layout
+	// produced by callback_c2hl.
+	encode_add_sub_imm(ctx, 1, 0, 0, 0, 256, ARM_TMP2, X12);            // ADD X12, X17, #256
+	encode_add_sub_reg(ctx, 1, 1, 0, 0, X10, 0, X12, X12);              // SUB X12, X12, X10
+
+	// Destination pointer X13 = SP
+	emit_mov_gpr(ctx, X13, SP_REG, 1);
+
+	// Counter X14 = X9
+	emit_mov_gpr(ctx, X14, X9, 1);
+
+	// Copy loop: do { *X13++ = *X12++ ; } while( --X14 != 0 ).  X14 is known
+	// to be non-zero on entry because of the CBZ above.
+	int loop_top = byte_count(ctx->code);
+	encode_ldr_str_imm(ctx, 3, 0, 1, 0, X12, X15);                      // LDR X15, [X12, #0]
+	encode_add_sub_imm(ctx, 1, 0, 0, 0, 8, X12, X12);                   // ADD X12, X12, #8
+	encode_ldr_str_imm(ctx, 3, 0, 0, 0, X13, X15);                      // STR X15, [X13, #0]
+	encode_add_sub_imm(ctx, 1, 0, 0, 0, 8, X13, X13);                   // ADD X13, X13, #8
+	encode_add_sub_imm(ctx, 1, 1, 1, 0, 1, X14, X14);                   // SUBS X14, X14, #1
+	int loop_branch_pos = byte_count(ctx->code);
+	encode_branch_cond(ctx, 0, COND_NE);                                 // B.NE loop_top
+	patch_helper_branch(ctx, loop_branch_pos, loop_top);
+
+	// Patch the CBZ skip target = end of stack-push block.
+	int after_stack = byte_count(ctx->code);
+	patch_helper_branch(ctx, cbz_skip_pos, after_stack);
+	// --- END STACK PUSH ---
+
+	// BLR fn (X16).
+	encode_branch_reg(ctx, /*BLR*/1, ARM_TMP1);
+
+	// The epilogue resets SP from X29, which also releases the stack-arg area.
+	emit_helper_epilogue(ctx);
+}
+
+// Emit the hl2c trampoline (JIT-compiled HL code calling a C wrapper).
+// Called from JIT-compiled HL code; X0 holds the closure
+// (vclosure_wrapper*), X1..X7,V0..V7 hold call args.  We:
+//   1. Spill X0..X7 and D0..D7 into a 128-byte buffer allocated below the
+//      saved frame record (X regs at SP+0..56, D regs at SP+64..120).
+//   2. Inspect cl->t->fun->ret->kind to decide between hl_jit_wrapper_ptr
+//      (default) and hl_jit_wrapper_d (HF32/HF64 return).
+//   3. Call wrapper(closure, &caller_stack_args, &spilled_regs), then tear
+//      the frame down and return to the caller.
+static void emit_hl2c_trampoline( code_ctx *ctx ) {
+	// Never dereferenced: only used below to compute the byte offset of the
+	// `ret` field inside hl_type_fun.
+	hl_type_fun *ft = NULL;
+
+	emit_helper_prologue(ctx);
+	emit_sp_offs(ctx, -128); // SUB SP, SP, #128
+
+	// Spill X0..X7 → [SP+0..56].  mode 0x12 = signed-offset STORE.
+	encode_ldp_stp(ctx, 0x02, 0, 0x12, 0, X1, SP_REG, X0); // STP X0,X1, [SP, #0]
+	encode_ldp_stp(ctx, 0x02, 0, 0x12, 2, X3, SP_REG, X2); // STP X2,X3, [SP, #16]
+	encode_ldp_stp(ctx, 0x02, 0, 0x12, 4, X5, SP_REG, X4); // STP X4,X5, [SP, #32]
+	encode_ldp_stp(ctx, 0x02, 0, 0x12, 6, X7, SP_REG, X6); // STP X6,X7, [SP, #48]
+	// Spill V0..V7 → [SP+64..120] (V0 at lowest, matching wrapper expectations).
+	encode_ldp_stp(ctx, 0x01, 1, 0x12, 8,  (Arm64Reg)1, SP_REG, (Arm64Reg)0); // STP D0,D1, [SP, #64]
+	encode_ldp_stp(ctx, 0x01, 1, 0x12, 10, (Arm64Reg)3, SP_REG, (Arm64Reg)2); // STP D2,D3, [SP, #80]
+	encode_ldp_stp(ctx, 0x01, 1, 0x12, 12, (Arm64Reg)5, SP_REG, (Arm64Reg)4); // STP D4,D5, [SP, #96]
+	encode_ldp_stp(ctx, 0x01, 1, 0x12, 14, (Arm64Reg)7, SP_REG, (Arm64Reg)6); // STP D6,D7, [SP, #112]
+
+	// X9 = closure (still in X0 — copy to keep X0 alive across loads).
+	emit_mov_gpr(ctx, X9, X0, 1);
+	// X9 = X9->t            ; LDR X9, [X9, #0]
+	encode_ldr_str_imm(ctx, 3, 0, 1, 0, X9, X9);
+	// X9 = X9->fun          ; LDR X9, [X9, #8]
+	encode_ldr_str_imm(ctx, 3, 0, 1, 1, X9, X9);
+	// X9 = X9->ret          ; LDR X9, [X9, #offsetof(hl_type_fun, ret)]
+	// The offset is derived from the NULL-based `ft` pointer.  The scaled
+	// immediate form is used when the offset is 8-byte aligned and below
+	// 0x8000; otherwise the offset goes through X10 and a register-offset LDR.
+	int ret_offset = (int)(int_val)&ft->ret;
+	if( (ret_offset & 7) == 0 && (unsigned)ret_offset < 0x8000 )
+		encode_ldr_str_imm(ctx, 3, 0, 1, ret_offset / 8, X9, X9);
+	else {
+		load_immediate(ctx, ret_offset, X10, true);
+		encode_ldr_str_reg(ctx, 3, 0, 1, X10, /*option=*/3, /*S=*/0, X9, X9);
+	}
+	// W9 = W9->kind         ; LDR W9, [X9, #0]
+	encode_ldr_str_imm(ctx, 2, 0, 1, 0, X9, X9);
+
+	// Branch on return-type kind.  HF64 / HF32 → wrapper_d; default → wrapper_ptr.
+	// Both conditional branches are emitted with a zero offset and patched
+	// below once the float path's position is known.
+	encode_add_sub_imm(ctx, 0, 1, 1, 0, HF64, X9, XZR);   // CMP W9, #HF64
+	int jeq_f64 = byte_count(ctx->code);
+	encode_branch_cond(ctx, 0, COND_EQ);
+	encode_add_sub_imm(ctx, 0, 1, 1, 0, HF32, X9, XZR);   // CMP W9, #HF32
+	int jeq_f32 = byte_count(ctx->code);
+	encode_branch_cond(ctx, 0, COND_EQ);
+
+	// Default path: load wrapper_ptr, then jump over the float path.
+	emit_const_load(ctx, ARM_TMP1, (uint64_t)(uintptr_t)hl_jit_wrapper_ptr);
+	int jdone_default = byte_count(ctx->code);
+	encode_branch_uncond(ctx, 0);
+
+	// Float path.
+	int float_path = byte_count(ctx->code);
+	patch_helper_branch(ctx, jeq_f64, float_path);
+	patch_helper_branch(ctx, jeq_f32, float_path);
+	emit_const_load(ctx, ARM_TMP1, (uint64_t)(uintptr_t)hl_jit_wrapper_d);
+
+	// Both paths join here with the selected wrapper address in ARM_TMP1.
+	int after_select = byte_count(ctx->code);
+	patch_helper_branch(ctx, jdone_default, after_select);
+
+	// Set up wrapper args:
+	// X0 (closure)  — still intact: the type walk above only wrote X9 (and
+	//                 X10 on the large-offset path).
+	// X1 = caller stack args = X29 + 16 (skip saved fp+lr).
+	encode_add_sub_imm(ctx, 1, 0, 0, 0, 16, FP, X1);
+	// X2 = &spilled regs = SP.
+	emit_mov_gpr(ctx, X2, SP_REG, 1);
+
+	// Call wrapper.
+	encode_branch_reg(ctx, /*BLR*/1, ARM_TMP1);
+
+	// The epilogue resets SP from X29, which also releases the spill buffer.
+	emit_helper_epilogue(ctx);
+}
+
+// Emit the module-level helper stubs into a fresh code buffer, in this
+// order: hl_null_access stub, hl_jit_null_field_access stub, c2hl
+// trampoline, hl2c trampoline.  Each stub's absolute start position is
+// recorded (ctx->null_access_pos, ctx->null_field_pos, jit->code_funs.c2hl,
+// jit->code_funs.hl2c) and each stub is finalized with flush_helper before
+// the whole buffer is flushed.
+void hl_codegen_init( jit_ctx *jit ) {
+	code_ctx *ctx = jit->code;
+	int stub_start;
+
+	// Make sure 4096 bytes are available for the stubs, then rewind.
+	byte_reserve(ctx->code, 4096);
+	ctx->code.cur -= 4096;
+
+	// hl_null_access stub.
+	stub_start = jit->out_pos + byte_count(ctx->code);
+	ctx->null_access_pos = stub_start;
+	emit_null_access_stub(ctx, (void*)hl_null_access);
+	flush_helper(ctx, ctx->null_access_pos);
+
+	// hl_jit_null_field_access stub.
+	stub_start = jit->out_pos + byte_count(ctx->code);
+	ctx->null_field_pos = stub_start;
+	emit_null_field_stub(ctx, (void*)hl_jit_null_field_access);
+	flush_helper(ctx, ctx->null_field_pos);
+
+	// c2hl trampoline.
+	stub_start = jit->out_pos + byte_count(ctx->code);
+	jit->code_funs.c2hl = stub_start;
+	emit_c2hl_trampoline(ctx);
+	flush_helper(ctx, jit->code_funs.c2hl);
+
+	// hl2c trampoline.
+	stub_start = jit->out_pos + byte_count(ctx->code);
+	jit->code_funs.hl2c = stub_start;
+	emit_hl2c_trampoline(ctx);
+	flush_helper(ctx, jit->code_funs.hl2c);
+
+	hl_codegen_flush(jit);
+}
+
+// ---------------------------------------------------------------------------
+// hl_codegen_flush_consts: patch BL/ADRP/LDR/ADD references against absolute
+// positions, then append the constant table to the output stream.
+// ---------------------------------------------------------------------------
+
+// Rewrite the 21-bit page displacement of the ADRP instruction located at
+// byte offset `pc_abs` inside `out` so that it addresses the 4 KiB page
+// containing byte offset `target_abs`.  The displacement is the difference
+// of the two page numbers; its low 2 bits go to immlo (bits 30:29) and the
+// remaining 19 bits to immhi (bits 23:5).  Both offsets are relative to the
+// start of `out`, so the result is only valid when that buffer ends up at a
+// page-aligned address.
+static void patch_adrp_imm21( unsigned char *out, int pc_abs, int target_abs ) {
+	const unsigned int field_mask = (0x3u << 29) | (0x7FFFFu << 5);
+	int page_delta = (target_abs >> 12) - (pc_abs >> 12);
+	unsigned int lo2 = (unsigned)page_delta & 0x3u;
+	unsigned int hi19 = (unsigned)(page_delta >> 2) & 0x7FFFFu;
+	unsigned int *word = (unsigned int*)(out + pc_abs);
+	*word = (*word & ~field_mask) | (lo2 << 29) | (hi19 << 5);
+}
+
+// Rewrite the imm12 field (bits 21:10) of the instruction at byte offset
+// `pos` inside `out` with `target_lo12 / scale`.  `scale` is the natural
+// immediate scale of the instruction (1 for ADD, 8 for a 64-bit LDR, ...);
+// the caller is responsible for `target_lo12` being a multiple of it.
+static void patch_imm12( unsigned char *out, int pos, int target_lo12, int scale ) {
+	const unsigned int field_mask = 0xFFFu << 10;
+	unsigned int field = (unsigned)((target_lo12 / scale) & 0xFFF);
+	unsigned int *word = (unsigned int*)(out + pos);
+	unsigned int cleared = *word & ~field_mask;
+	*word = cleared | (field << 10);
+}
+
+// Resolve every deferred relocation and hand the constant table over for
+// output:
+//   1. Patch the cross-function references queued in ctx->funs (BL imm26, or
+//      an ADRP+ADD pair) from jit->mod->functions_ptrs.
+//   2. Pad jit->out_pos up to an 8-byte boundary and publish the constant
+//      table through jit->code_size / jit->code_instrs; the table will start
+//      at ctx->const_table_pos.
+//   3. Patch the ADRP+(LDR|ADD) constant-pool references queued in
+//      ctx->const_refs.
+//   4. Reset the relocation lists and release the table and its lookup map.
+void hl_codegen_flush_consts( jit_ctx *jit ) {
+	code_ctx *ctx = jit->code;
+
+	// Cross-function call sites: triples (position, function id, kind).
+	const int fun_entries = int_arr_count(ctx->funs);
+	for( int i = 0; i < fun_entries; i += 3 ) {
+		int site = int_arr_get(ctx->funs, i);
+		int fid = int_arr_get(ctx->funs, i + 1);
+		int kind = int_arr_get(ctx->funs, i + 2);
+		intptr_t callee = (intptr_t)jit->mod->functions_ptrs[fid];
+		if( kind != 0 ) {
+			// ADRP+ADD pair: ADRP at `site`, ADD right after it.
+			patch_adrp_imm21(jit->output, site, (int)callee);
+			patch_imm12(jit->output, site + 4, (int)callee & 0xFFF, /*scale=*/1);
+			continue;
+		}
+		// BL imm26: word displacement from the call site to the callee.
+		unsigned int *bl = (unsigned int*)(jit->output + site);
+		int imm26 = (int)((callee - (intptr_t)site) >> 2);
+		*bl = (*bl & ~0x03FFFFFFu) | ((unsigned)imm26 & 0x03FFFFFF);
+	}
+	int_arr_reset(&ctx->funs);
+
+	// Pad jit->out_pos to an 8-byte boundary so that every 8-byte aligned
+	// table offset stays 8-byte aligned in the output, as LDR's 8-byte-scaled
+	// imm12 requires.  Padding bytes are zeroed while they fit in the buffer.
+	for( ; jit->out_pos & 7; jit->out_pos++ ) {
+		if( jit->out_pos < jit->out_max )
+			jit->output[jit->out_pos] = 0;
+	}
+
+	// Publish the constant table as the next chunk of the output stream.
+	ctx->const_table_pos = jit->out_pos;
+	jit->code_instrs = ctx->const_table.values;
+	jit->code_size = byte_count(ctx->const_table);
+
+	// Constant-pool references: pairs (ADRP position, table offset).
+	const int ref_entries = int_arr_count(ctx->const_refs);
+	for( int i = 0; i < ref_entries; i += 2 ) {
+		int adrp_at = int_arr_get(ctx->const_refs, i);
+		int table_offs = int_arr_get(ctx->const_refs, i + 1);
+		int dest = ctx->const_table_pos + table_offs;
+		patch_adrp_imm21(jit->output, adrp_at, dest);
+
+		// The instruction after the ADRP is either an unsigned-offset LDR,
+		// whose imm12 is scaled by the access size, or an ADD (immediate),
+		// whose imm12 is unscaled.  Bits 31:22 tell them apart:
+		//   LDR Xt (size=11,V=0,opc=01): 0b1111100101 = 0x3E5  (scale=8)
+		//   LDR Dt (size=11,V=1,opc=01): 0b1111110101 = 0x3F5  (scale=8)
+		//   LDR St (size=10,V=1,opc=01): 0b1011110101 = 0x2F5  (scale=4)
+		// Anything else is patched as an ADD.
+		unsigned int follower = *(unsigned int*)(jit->output + adrp_at + 4);
+		int scale;
+		switch( (follower >> 22) & 0x3FF ) {
+		case 0x3E5: // LDR Xt
+		case 0x3F5: // LDR Dt
+			scale = 8;
+			break;
+		case 0x2F5: // LDR St
+			scale = 4;
+			break;
+		default:    // ADD (imm)
+			scale = 1;
+			break;
+		}
+		patch_imm12(jit->output, adrp_at + 4, dest & 0xFFF, scale);
+	}
+	int_arr_reset(&ctx->const_refs);
+
+	byte_free(&ctx->const_table);
+	value_map_free(&ctx->const_table_lookup);
+}
+
+// Fill the jump-table slots inside jit->final_code with absolute addresses.
+// ctx->const_addr holds pairs (byte offset inside the constant table, byte
+// offset of the branch target inside the output); each slot receives
+// final_code + target offset.  The list is freed afterwards.
+void hl_codegen_final( jit_ctx *jit ) {
+	code_ctx *ctx = jit->code;
+	const int entries = int_arr_count(ctx->const_addr);
+	for( int i = 0; i < entries; i += 2 ) {
+		int slot_offs = int_arr_get(ctx->const_addr, i);
+		int dest_offs = int_arr_get(ctx->const_addr, i + 1);
+		void **slot = (void**)(jit->final_code + ctx->const_table_pos + slot_offs);
+		*slot = jit->final_code + dest_offs;
+	}
+	int_arr_free(&ctx->const_addr);
+}
diff --git a/src/jit_aarch64_emit.c b/src/jit_aarch64_emit.c
new file mode 100644
index 000000000..dbef3be37
--- /dev/null
+++ b/src/jit_aarch64_emit.c
@@ -0,0 +1,864 @@
+/*
+ * Copyright (C)2015-2016 Haxe Foundation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * AArch64 Instruction Encoding
+ *
+ * This file provides low-level instruction encoding functions for the AArch64
+ * architecture. All instructions are 32-bit fixed width.
+ *
+ * References:
+ * - ARM Architecture Reference Manual ARMv8 (ARM ARM)
+ * - AArch64 Instruction Set Architecture
+ */
+
+#if !defined(__aarch64__) && !defined(_M_ARM64)
+#  error "This file is for AArch64 architecture only."
+#endif
+
+#include "jit_aarch64_emit.h"
+
+/*
+ * Field packers: BITS puts the low `len` bits of `val` at bit `start` (len < 32); BIT puts bit 0 of `val` at `pos`.
+ */
+#define BITS(val, start, len) (((unsigned int)(val) & ~(~0u << (len))) << (start))
+#define BIT(val, pos) BITS(val, pos, 1)
+
+// EMIT32 is defined in jit_common.h
+
+// ============================================================================
+// ADD/SUB Instructions
+// ============================================================================
+
+/**
+ * Emit one ADD/SUB (immediate) instruction word.
+ * Assembly form: ADD/SUB Xd, Xn, #imm12 [, LSL #shift]
+ *
+ * @param sf     operand width: 1 selects 64-bit, 0 selects 32-bit
+ * @param op     0 for ADD, 1 for SUB
+ * @param S      1 for the flag-setting form (ADDS/SUBS), 0 otherwise
+ * @param shift  0 uses imm12 as is, 1 applies LSL #12 to it
+ * @param imm12  unsigned immediate; only the low 12 bits are encoded
+ * @param Rn     source register number (0-31, 31=SP)
+ * @param Rd     destination register number (0-31, 31=SP)
+ */
+void encode_add_sub_imm(code_ctx *ctx, int sf, int op, int S, int shift, int imm12, Arm64Reg Rn, Arm64Reg Rd) {
+	// Layout, most significant field first:
+	//   sf | op | S | 100010 | sh | imm12 | Rn | Rd
+	//   31   30  29   28:23    22   21:10  9:5  4:0
+	unsigned int opword = BITS(0x22, 23, 6);  // fixed class bits 100010
+	opword |= BIT(sf, 31);                    // operand width
+	opword |= BIT(op, 30);                    // add or subtract
+	opword |= BIT(S, 29);                     // flag-setting variant
+	opword |= BIT(shift, 22);                 // optional LSL #12
+	opword |= BITS(imm12, 10, 12);            // immediate payload
+	opword |= BITS(Rn, 5, 5);                 // source
+	opword |= BITS(Rd, 0, 5);                 // destination
+	EMIT32(ctx, opword);
+}
+
+/**
+ * Emit one ADD/SUB (shifted register) instruction word.
+ * Assembly form: ADD/SUB Xd, Xn, Xm [, shift #amount]
+ *
+ * @param sf     operand width: 1 selects 64-bit, 0 selects 32-bit
+ * @param op     0 for ADD, 1 for SUB
+ * @param S      1 for the flag-setting form, 0 otherwise
+ * @param shift  shift applied to Rm: 00=LSL, 01=LSR, 10=ASR
+ * @param Rm     second source register (the shifted operand)
+ * @param imm6   shift amount; only the low 6 bits are encoded
+ * @param Rn     first source register
+ * @param Rd     destination register
+ */
+void encode_add_sub_reg(code_ctx *ctx, int sf, int op, int S, int shift, Arm64Reg Rm,
+                        int imm6, Arm64Reg Rn, Arm64Reg Rd) {
+	unsigned int opword = BITS(0x0B, 24, 5);                // [28:24] = 01011
+	opword |= BIT(sf, 31) | BIT(op, 30) | BIT(S, 29);       // width, ADD/SUB, flags
+	opword |= BITS(shift, 22, 2) | BITS(Rm, 16, 5) | BITS(imm6, 10, 6) | BITS(Rn, 5, 5) | BITS(Rd, 0, 5);
+	EMIT32(ctx, opword);
+}
+
+/**
+ * Emit one ADD/SUB (extended register) instruction word.
+ * Assembly form: ADD/SUB Xd, Xn, Wm, extend [#amount]
+ *
+ * @param sf      operand width: 1 selects 64-bit, 0 selects 32-bit
+ * @param op      0 for ADD, 1 for SUB
+ * @param S       1 for the flag-setting form, 0 otherwise
+ * @param Rm      second source register (the extended operand)
+ * @param option  extend kind: UXTB=000, UXTH=001, UXTW=010, UXTX=011, SXTB=100, SXTH=101, SXTW=110, SXTX=111
+ * @param imm3    left shift applied after extension (0-4); only the low 3 bits are encoded
+ * @param Rn      first source register
+ * @param Rd      destination register
+ */
+void encode_add_sub_ext(code_ctx *ctx, int sf, int op, int S, Arm64Reg Rm,
+                        int option, int imm3, Arm64Reg Rn, Arm64Reg Rd) {
+	unsigned int opword = BITS(0x0B, 24, 5) | BITS(1, 21, 2);  // [28:24] = 01011, [22:21] = 01
+	opword |= BIT(sf, 31) | BIT(op, 30) | BIT(S, 29);          // width, ADD/SUB, flags
+	opword |= BITS(Rm, 16, 5) | BITS(option, 13, 3) | BITS(imm3, 10, 3) | BITS(Rn, 5, 5) | BITS(Rd, 0, 5);
+	EMIT32(ctx, opword);
+}
+
+// ============================================================================
+// Logical Instructions
+// ============================================================================
+
+/**
+ * Emit one logical (immediate) instruction word.
+ * Assembly form: AND/ORR/EOR/ANDS Xd, Xn, #imm
+ *
+ * @param sf    operand width: 1 selects 64-bit, 0 selects 32-bit
+ * @param opc   operation: 00=AND, 01=ORR, 10=EOR, 11=ANDS
+ * @param N     bitmask-immediate N field (bit 22)
+ * @param immr  bitmask-immediate rotation field (6 bits)
+ * @param imms  bitmask-immediate size/length field (6 bits)
+ * @param Rn    source register
+ * @param Rd    destination register
+ */
+void encode_logical_imm(code_ctx *ctx, int sf, int opc, int N, int immr, int imms, Arm64Reg Rn, Arm64Reg Rd) {
+	unsigned int opword = BITS(0x24, 23, 6) | BIT(sf, 31) | BITS(opc, 29, 2);  // [28:23] = 100100
+	opword |= BIT(N, 22) | BITS(immr, 16, 6) | BITS(imms, 10, 6) | BITS(Rn, 5, 5) | BITS(Rd, 0, 5);
+	EMIT32(ctx, opword);
+}
+
+/**
+ * Emit one logical (shifted register) instruction word.
+ * Assembly form: AND/ORR/EOR/ANDS Xd, Xn, Xm [, shift #amount]
+ *
+ * @param sf     operand width: 1 selects 64-bit, 0 selects 32-bit
+ * @param opc    operation: 00=AND, 01=ORR, 10=EOR, 11=ANDS
+ * @param shift  shift applied to Rm: 00=LSL, 01=LSR, 10=ASR, 11=ROR
+ * @param N      bit 21 of the word; pass 0 for the plain logical operations
+ * @param Rm     second source register (the shifted operand)
+ * @param imm6   shift amount; only the low 6 bits are encoded
+ * @param Rn     first source register
+ * @param Rd     destination register
+ */
+void encode_logical_reg(code_ctx *ctx, int sf, int opc, int shift, int N, Arm64Reg Rm,
+                        int imm6, Arm64Reg Rn, Arm64Reg Rd) {
+	unsigned int opword = BITS(0x0A, 24, 5);                          // [28:24] = 01010
+	opword |= BIT(sf, 31) | BITS(opc, 29, 2) | BITS(shift, 22, 2) | BIT(N, 21);
+	opword |= BITS(Rm, 16, 5) | BITS(imm6, 10, 6) | BITS(Rn, 5, 5) | BITS(Rd, 0, 5);
+	EMIT32(ctx, opword);
+}
+
+// ============================================================================
+// Move Wide (immediate) Instructions
+// ============================================================================
+
+/**
+ * Emit one move-wide instruction word (MOVZ, MOVN or MOVK).
+ * Assembly form: MOVZ/MOVN/MOVK Xd, #imm16 [, LSL #shift]
+ *
+ * @param sf    operand width: 1 selects 64-bit, 0 selects 32-bit
+ * @param opc   operation: 10=MOVZ, 00=MOVN, 11=MOVK
+ * @param hw    index of the 16-bit lane that receives imm16 (0-3 for 64-bit, 0-1 for 32-bit)
+ * @param imm16 immediate; only the low 16 bits are encoded
+ * @param Rd    destination register
+ */
+void encode_mov_wide_imm(code_ctx *ctx, int sf, int opc, int hw, int imm16, Arm64Reg Rd) {
+	unsigned int opword = BITS(0x25, 23, 6) | BIT(sf, 31) | BITS(opc, 29, 2);  // [28:23] = 100101
+	opword |= BITS(hw, 21, 2) | BITS(imm16, 5, 16) | BITS(Rd, 0, 5);
+	EMIT32(ctx, opword);
+}
+
+// ============================================================================
+// Multiply Instructions
+// ============================================================================
+
+/**
+ * Emit one multiply-accumulate instruction word (MADD or MSUB).
+ * Assembly form: MADD Xd, Xn, Xm, Xa  (Xd = Xa + Xn*Xm)
+ *                MSUB Xd, Xn, Xm, Xa  (Xd = Xa - Xn*Xm)
+ *
+ * @param sf  operand width: 1 selects 64-bit, 0 selects 32-bit
+ * @param op  0 for MADD, 1 for MSUB
+ * @param Rm  second multiplicand register
+ * @param Ra  accumulator register (XZR turns MADD into a plain MUL)
+ * @param Rn  first multiplicand register
+ * @param Rd  destination register
+ */
+void encode_madd_msub(code_ctx *ctx, int sf, int op, Arm64Reg Rm, Arm64Reg Ra, Arm64Reg Rn, Arm64Reg Rd) {
+	// Layout: sf[31] 00[30:29] 11011[28:24] 000[23:21] Rm[20:16] op[15] Ra[14:10] Rn[9:5] Rd[4:0]
+	// 0xD8 placed at bit 21 supplies the constant bits [28:21] = 11011000.
+	unsigned int opword = BITS(0xD8, 21, 8) | BIT(sf, 31) | BIT(op, 15);
+	opword |= BITS(Rm, 16, 5) | BITS(Ra, 10, 5) | BITS(Rn, 5, 5) | BITS(Rd, 0, 5);
+	EMIT32(ctx, opword);
+}
+
+/**
+ * Emit one integer divide instruction word.
+ * Assembly form: SDIV/UDIV Xd, Xn, Xm
+ *
+ * @param sf  operand width: 1 selects 64-bit, 0 selects 32-bit
+ * @param U   selects the signedness through bit 10 of the word:
+ *            0 produces UDIV (unsigned), 1 produces SDIV (signed).
+ *            It is OR-ed into a 6-bit field unmasked, so pass only 0 or 1.
+ * @param Rm  divisor register
+ * @param Rn  dividend register
+ * @param Rd  destination register (receives the quotient)
+ */
+void encode_div(code_ctx *ctx, int sf, int U, Arm64Reg Rm, Arm64Reg Rn, Arm64Reg Rd) {
+	// Layout: sf[31] 00[30:29] 11010110[28:21] Rm[20:16] 00001[15:11] U[10] Rn[9:5] Rd[4:0]
+	// The six bits [15:10] are written together as 00001U.
+	unsigned int opword = BITS(0xD6, 21, 8) | BIT(sf, 31) | BITS(0x2 | U, 10, 6);
+	opword |= BITS(Rm, 16, 5) | BITS(Rn, 5, 5) | BITS(Rd, 0, 5);
+	EMIT32(ctx, opword);
+}
+
+// ============================================================================
+// Shift Instructions
+// ============================================================================
+
+/**
+ * Emit one variable-shift instruction word (LSLV/LSRV/ASRV/RORV).
+ * Assembly form: LSL/LSR/ASR/ROR Xd, Xn, Xm
+ *
+ * @param sf   operand width: 1 selects 64-bit, 0 selects 32-bit
+ * @param op2  shift kind: 00=LSLV, 01=LSRV, 10=ASRV, 11=RORV (OR-ed in unmasked; pass 0-3)
+ * @param Rm   register holding the shift amount
+ * @param Rn   register holding the value to shift
+ * @param Rd   destination register
+ */
+void encode_shift_reg(code_ctx *ctx, int sf, int op2, Arm64Reg Rm, Arm64Reg Rn, Arm64Reg Rd) {
+	// Layout: sf[31] 00[30:29] 11010110[28:21] Rm[20:16] 0010[15:12] op2[11:10] Rn[9:5] Rd[4:0]
+	// The six bits [15:10] are written together as 0010:op2.
+	unsigned int opword = BITS(0xD6, 21, 8) | BIT(sf, 31) | BITS(0x08 | op2, 10, 6);
+	opword |= BITS(Rm, 16, 5) | BITS(Rn, 5, 5) | BITS(Rd, 0, 5);
+	EMIT32(ctx, opword);
+}
+
+// ============================================================================
+// Load/Store Instructions
+// ============================================================================
+
+/**
+ * Emit one LDR/STR (unsigned immediate offset) instruction word.
+ * Assembly form: LDR/STR Xt, [Xn, #imm]
+ *
+ * @param size  access width: 00=8-bit, 01=16-bit, 10=32-bit, 11=64-bit
+ * @param V     register file: 0=general purpose, 1=FP/SIMD
+ * @param opc   00=STR, 01=LDR; the remaining values select other load forms that depend on size
+ * @param imm12 unsigned offset in units of the access size; only the low 12 bits are encoded
+ * @param Rn    base register
+ * @param Rt    register that is loaded or stored
+ */
+void encode_ldr_str_imm(code_ctx *ctx, int size, int V, int opc, int imm12, Arm64Reg Rn, Arm64Reg Rt) {
+	// Layout, most significant field first:
+	//   size | 111 | V | 01 | opc | imm12 | Rn | Rt
+	//   31:30 29:27 26 25:24 23:22 21:10  9:5  4:0
+	unsigned int opword = BITS(7, 27, 3);     // fixed 111
+	opword |= BITS(1, 24, 2);                 // 01 = unsigned-offset form
+	opword |= BITS(size, 30, 2);              // access width
+	opword |= BIT(V, 26);                     // register file
+	opword |= BITS(opc, 22, 2);               // load/store selector
+	opword |= BITS(imm12, 10, 12);            // scaled offset
+	opword |= BITS(Rn, 5, 5);                 // base
+	opword |= BITS(Rt, 0, 5);                 // transfer register
+	EMIT32(ctx, opword);
+}
+
+/**
+ * Emit one LDR/STR (register offset) instruction word.
+ * Assembly form: LDR/STR Xt, [Xn, Xm{, extend {#amount}}]
+ *
+ * @param size    access width: 00=8-bit, 01=16-bit, 10=32-bit, 11=64-bit
+ * @param V       register file: 0=general purpose, 1=FP/SIMD
+ * @param opc     for V=0: 00=STR, 01=LDR
+ * @param Rm      offset register
+ * @param option  how Rm is extended: 010=UXTW, 011=LSL, 110=SXTW, 111=SXTX
+ * @param S       1 scales the offset by the access size, 0 leaves it unscaled
+ * @param Rn      base register
+ * @param Rt      register that is loaded or stored
+ */
+void encode_ldr_str_reg(code_ctx *ctx, int size, int V, int opc, Arm64Reg Rm,
+                        int option, int S, Arm64Reg Rn, Arm64Reg Rt) {
+	// Layout, most significant field first:
+	//   size[31:30] 111[29:27] V[26] 00[25:24] opc[23:22] 1[21]
+	//   Rm[20:16] option[15:13] S[12] 10[11:10] Rn[9:5] Rt[4:0]
+	// Constant bits first ([25:24] = 00 needs nothing OR-ed in).
+	unsigned int opword = BITS(7, 27, 3);
+	opword |= BIT(1, 21);
+	opword |= BITS(2, 10, 2);
+	// Operand fields.
+	opword |= BITS(size, 30, 2);
+	opword |= BIT(V, 26);
+	opword |= BITS(opc, 22, 2);
+	opword |= BITS(Rm, 16, 5);
+	opword |= BITS(option, 13, 3);
+	opword |= BIT(S, 12);
+	opword |= BITS(Rn, 5, 5);
+	opword |= BITS(Rt, 0, 5);
+	EMIT32(ctx, opword);
+}
+
+/**
+ * Emit one LDUR/STUR (unscaled signed offset) instruction word.
+ * Assembly form: LDUR/STUR Rt, [Xn, #simm9]
+ *
+ * The offset is a signed 9-bit byte count (-256 to +255) and is NOT
+ * multiplied by the access size, unlike the unsigned-offset LDR/STR form.
+ * That makes it a fit for small negative offsets from a base register.
+ *
+ * @param size  access width: 00=8-bit, 01=16-bit, 10=32-bit, 11=64-bit
+ * @param V     register file: 0=general purpose, 1=FP/SIMD
+ * @param opc   00=STUR, 01=LDUR
+ * @param imm9  signed byte offset (-256 to +255); only the low 9 bits are encoded
+ * @param Rn    base register
+ * @param Rt    register that is loaded or stored
+ */
+void encode_ldur_stur(code_ctx *ctx, int size, int V, int opc, int imm9, Arm64Reg Rn, Arm64Reg Rt) {
+	// Layout, most significant field first:
+	//   size[31:30] 111[29:27] V[26] 00[25:24] opc[23:22] 0[21]
+	//   imm9[20:12] 00[11:10] Rn[9:5] Rt[4:0]
+	// The constant fields other than 111 are all zero, so only 111 is OR-ed in.
+	unsigned int opword = BITS(7, 27, 3);
+	opword |= BITS(size, 30, 2);
+	opword |= BIT(V, 26);
+	opword |= BITS(opc, 22, 2);
+	// BITS truncates to 9 bits, which is the two's-complement form of a
+	// negative offset; no separate mask is needed.
+	opword |= BITS(imm9, 12, 9);
+	opword |= BITS(Rn, 5, 5);
+	opword |= BITS(Rt, 0, 5);
+	EMIT32(ctx, opword);
+}
+
+/**
+ * Encode LDP/STP (Load/Store Pair)
+ * Format: LDP/STP Xt1, Xt2, [Xn, #imm]  (various addressing modes)
+ *
+ * @param opc   Size: 00=32-bit, 10=64-bit
+ * @param V     0=GPR, 1=FP/SIMD registers
+ * @param mode  Index mode in bits 1:0 (1=post, 2=offset, 3=pre) plus direction:
+ *              0x01 = post-indexed load   (LDP Xt1, Xt2, [Xn], #imm)
+ *              0x02 = signed-offset load  (LDP Xt1, Xt2, [Xn, #imm])
+ *              0x03 = pre-indexed store   (STP Xt1, Xt2, [Xn, #imm]!)
+ *              0x11 = post-indexed store  (STP Xt1, Xt2, [Xn], #imm)
+ *              0x12 = signed-offset store (STP Xt1, Xt2, [Xn, #imm])
+ *              0x13 = pre-indexed store   (same word as 0x03; no value selects a pre-indexed load)
+ * @param imm7  Signed 7-bit offset (scaled by register size: *4 for 32-bit, *8 for 64-bit)
+ * @param Rt2   Second register
+ * @param Rn    Base register
+ * @param Rt    First register
+ *
+ * ARM64 encoding:
+ *   [31:30] = opc (size)
+ *   [29:27] = 101 (fixed)
+ *   [26]    = V
+ *   [25]    = 0 (never set by this function)
+ *   [24:23] = addressing mode (01=post, 10=offset, 11=pre)
+ *   [22]    = L (0=store, 1=load)
+ *   [21:15] = imm7
+ *   [14:10] = Rt2
+ *   [9:5]   = Rn
+ *   [4:0]   = Rt
+ */
+void encode_ldp_stp(code_ctx *ctx, int opc, int V, int mode, int imm7,
+                    Arm64Reg Rt2, Arm64Reg Rn, Arm64Reg Rt) {
+	// Every mode value keeps its index mode in the two low bits
+	// (0x01 -> 1, 0x02/0x12 -> 2, 0x03/0x13 -> 3), so a mask is enough.
+	int addr_mode = mode & 3;
+
+	// Direction bit. A store is requested either through the 0x10 flag or
+	// through the legacy value 0x03; any other value encodes a load.
+	int L;
+	if ((mode & 0x10) != 0)
+		L = 0;
+	else if (mode == 0x03)
+		L = 0;
+	else
+		L = 1;
+
+	// Layout, most significant field first:
+	//   opc | 101 | V | 0 | addr_mode | L | imm7 | Rt2 | Rn | Rt
+	//  31:30 29:27 26  25    24:23     22  21:15 14:10 9:5  4:0
+	unsigned int opword = BITS(5, 27, 3);
+	opword |= BITS(opc, 30, 2);
+	opword |= BIT(V, 26);
+	opword |= BITS(addr_mode, 23, 2);
+	opword |= BIT(L, 22);
+
+	// BITS keeps the low 7 bits of imm7, i.e. its two's-complement form.
+	opword |= BITS(imm7, 15, 7);
+
+	// Register numbers.
+	opword |= BITS(Rt2, 10, 5);
+	opword |= BITS(Rn, 5, 5);
+	opword |= BITS(Rt, 0, 5);
+	EMIT32(ctx, opword);
+}
+
+// ============================================================================
+// PC-Relative Addressing
+// ============================================================================
+
+/**
+ * Emit one ADRP instruction word.
+ * Assembly form: ADRP Xd, label  (PC-relative page address)
+ *
+ * @param immlo  low 2 bits of the 21-bit page offset, stored at [30:29]
+ * @param immhi  high 19 bits of the 21-bit page offset, stored at [23:5]
+ * @param Rd     destination register
+ *
+ * Note: the offset counts 4KB pages, so the byte offset is imm21 << 12
+ */
+void encode_adrp(code_ctx *ctx, int immlo, int immhi, Arm64Reg Rd) {
+	unsigned int opword = BIT(1, 31) | BITS(0x10, 24, 5);  // bit 31 = 1 (ADRP), [28:24] = 10000
+	opword |= BITS(immlo, 29, 2) | BITS(immhi, 5, 19) | BITS(Rd, 0, 5);
+	EMIT32(ctx, opword);
+}
+
+/**
+ * Emit one ADR instruction word.
+ * Assembly form: ADR Xd, label  (PC-relative byte address)
+ *
+ * @param immlo  low 2 bits of the 21-bit offset, stored at [30:29]
+ * @param immhi  high 19 bits of the 21-bit offset, stored at [23:5]
+ * @param Rd     destination register
+ */
+void encode_adr(code_ctx *ctx, int immlo, int immhi, Arm64Reg Rd) {
+	unsigned int opword = BITS(0x10, 24, 5);  // [28:24] = 10000; bit 31 stays 0 for ADR
+	opword |= BITS(immlo, 29, 2) | BITS(immhi, 5, 19) | BITS(Rd, 0, 5);
+	EMIT32(ctx, opword);
+}
+
+// ============================================================================
+// Branch Instructions
+// ============================================================================
+
+/**
+ * Emit one conditional branch instruction word.
+ * Assembly form: B.cond label
+ *
+ * @param imm19  signed offset in instructions (byte offset / 4); only the low 19 bits are encoded
+ * @param cond   condition code (0000=EQ, 0001=NE, 1010=GE, 1011=LT, etc.)
+ */
+void encode_branch_cond(code_ctx *ctx, int imm19, ArmCondition cond) {
+	unsigned int opword = BITS(cond, 0, 4) | BITS(imm19, 5, 19) | BITS(0x54, 24, 8);  // [31:24] = 01010100
+	EMIT32(ctx, opword);
+}
+
+/**
+ * Emit one unconditional branch instruction word.
+ * Assembly form: B label
+ *
+ * @param imm26  signed offset in instructions (byte offset / 4); only the low 26 bits are encoded
+ */
+void encode_branch_uncond(code_ctx *ctx, int imm26) {
+	unsigned int opword = BITS(imm26, 0, 26) | BITS(0x05, 26, 6);  // [31:26] = 000101
+	EMIT32(ctx, opword);
+}
+
+/**
+ * Emit one branch-with-link instruction word.
+ * Assembly form: BL label
+ *
+ * @param imm26  signed offset in instructions (byte offset / 4); only the low 26 bits are encoded
+ */
+void encode_branch_link(code_ctx *ctx, int imm26) {
+	unsigned int opword = BITS(imm26, 0, 26) | BITS(0x25, 26, 6);  // [31:26] = 100101
+	EMIT32(ctx, opword);
+}
+
+/**
+ * Emit one register-indirect branch instruction word.
+ * Assembly form: BR/BLR/RET Xn
+ *
+ * @param opc  00=BR, 01=BLR, 10=RET
+ * @param Rn   register holding the target address (X30/LR for RET)
+ */
+void encode_branch_reg(code_ctx *ctx, int opc, Arm64Reg Rn) {
+	unsigned int opword = BITS(0x6B0, 21, 11) | BITS(0x1F, 16, 5);  // [31:21] = 11010110000, [20:16] = 11111
+	opword |= BITS(opc, 21, 2) | BITS(Rn, 5, 5);                    // opc lands on [22:21]
+	EMIT32(ctx, opword);
+}
+
+/**
+ * Emit one compare-and-branch instruction word (CBZ or CBNZ).
+ * Assembly form: CBZ/CBNZ Xt, label
+ *
+ * @param sf     operand width: 1 selects 64-bit, 0 selects 32-bit
+ * @param op     0 for CBZ, 1 for CBNZ
+ * @param imm19  signed offset in instructions; only the low 19 bits are encoded
+ * @param Rt     register that is compared against zero
+ */
+void encode_cbz_cbnz(code_ctx *ctx, int sf, int op, int imm19, Arm64Reg Rt) {
+	unsigned int opword = BITS(0x1A, 25, 6) | BIT(sf, 31) | BIT(op, 24);  // [30:25] = 011010
+	opword |= BITS(imm19, 5, 19) | BITS(Rt, 0, 5);
+	EMIT32(ctx, opword);
+}
+
+/**
+ * Emit one test-bit-and-branch instruction word (TBZ or TBNZ).
+ * Assembly form: TBZ/TBNZ Xt, #bit, label
+ *
+ * @param b5     bit 5 of the tested bit position (position range 0-63)
+ * @param op     0 for TBZ, 1 for TBNZ
+ * @param b40    bits 4-0 of the tested bit position
+ * @param imm14  signed offset in instructions; only the low 14 bits are encoded
+ * @param Rt     register whose bit is tested
+ */
+void encode_tbz_tbnz(code_ctx *ctx, int b5, int op, int b40, int imm14, Arm64Reg Rt) {
+	unsigned int opword = BITS(0x1B, 25, 6) | BIT(b5, 31) | BIT(op, 24);  // [30:25] = 011011
+	opword |= BITS(b40, 19, 5) | BITS(imm14, 5, 14) | BITS(Rt, 0, 5);
+	EMIT32(ctx, opword);
+}
+
+// ============================================================================
+// Floating-Point Instructions
+// ============================================================================
+
+/**
+ * Encode floating-point arithmetic (2-source)
+ * Format: FADD/FSUB/FMUL/FDIV/FMAX/FMIN Vd, Vn, Vm
+ *
+ * @param M       bit 31 of the word; 0 in the scalar encodings (it is not a scalar/vector switch)
+ * @param S       bit 29 of the word; 0 in the scalar encodings (precision is chosen by type, not by S)
+ * @param type    precision: 00=single, 01=double
+ * @param Rm      Second source register
+ * @param opcode  0000=FMUL, 0001=FDIV, 0010=FADD, 0011=FSUB, 0100=FMAX, 0101=FMIN
+ * @param Rn      First source register
+ * @param Rd      Destination register
+ */
+void encode_fp_arith(code_ctx *ctx, int M, int S, int type, Arm64FpReg Rm,
+                     int opcode, Arm64FpReg Rn, Arm64FpReg Rd) {
+	// Layout: M[31] 0[30] S[29] 11110[28:24] type[23:22] 1[21] Rm[20:16] opcode[15:12] 10[11:10] Rn[9:5] Rd[4:0]
+	unsigned int opword = BITS(0x1E, 24, 5) | BIT(1, 21) | BITS(2, 10, 2);        // constant bits
+	opword |= BIT(M, 31) | BIT(S, 29) | BITS(type, 22, 2) | BITS(opcode, 12, 4);  // selectors
+	opword |= BITS(Rm, 16, 5) | BITS(Rn, 5, 5) | BITS(Rd, 0, 5);                  // registers
+	EMIT32(ctx, opword);
+}
+
+/**
+ * Encode floating-point negate/abs/sqrt (1-source)
+ * Format: FNEG/FABS/FSQRT Vd, Vn
+ *
+ * @param M       bit 31 of the word; 0 in the scalar encodings (it is not a scalar/vector switch)
+ * @param S       bit 29 of the word; 0 in the scalar encodings (precision is chosen by type, not by S)
+ * @param type    precision: 00=single, 01=double
+ * @param opcode  000000=FMOV, 000001=FABS, 000010=FNEG, 000011=FSQRT
+ * @param Rn      Source register
+ * @param Rd      Destination register
+ */
+void encode_fp_1src(code_ctx *ctx, int M, int S, int type, int opcode, Arm64FpReg Rn, Arm64FpReg Rd) {
+	// Layout: M[31] 0[30] S[29] 11110[28:24] type[23:22] 1[21] opcode[20:15] 10000[14:10] Rn[9:5] Rd[4:0]
+	unsigned int opword = BITS(0x1E, 24, 5) | BIT(1, 21) | BITS(0x10, 10, 5);     // constant bits
+	opword |= BIT(M, 31) | BIT(S, 29) | BITS(type, 22, 2) | BITS(opcode, 15, 6);  // selectors
+	opword |= BITS(Rn, 5, 5) | BITS(Rd, 0, 5);                                    // registers
+	EMIT32(ctx, opword);
+}
+
+/**
+ * Encode floating-point compare
+ * Format: FCMP/FCMPE Vn, Vm
+ *
+ * @param M     bit 31 of the word; 0 for the scalar compare encodings
+ * @param S     bit 29 of the word; 0 for the scalar compare encodings (precision is chosen by type)
+ * @param type  precision: 00=single, 01=double
+ * @param Rm    Second source register (pass 0 for the compare-with-zero forms)
+ * @param op    00=FCMP, 10=FCMPE (signalling); 01/11 are the same two compared against #0.0
+ * @param Rn    First source register
+ */
+void encode_fp_compare(code_ctx *ctx, int M, int S, int type, Arm64FpReg Rm, int op, Arm64FpReg Rn) {
+	// [15:14] must stay 00 and [13:10] = 1000. op is the opc field at [4:3] ([2:0] = 000), not [15:14].
+	unsigned int opword = BITS(0x1E, 24, 5) | BIT(1, 21) | BITS(8, 10, 4) | BIT(M, 31) | BIT(S, 29);
+	opword |= BITS(type, 22, 2) | BITS(Rm, 16, 5) | BITS(Rn, 5, 5) | BITS(op, 3, 2);
+	EMIT32(ctx, opword);
+}
+
+/**
+ * Encode a conversion/move in the FP<->integer class with an FP source and integer destination
+ * Format: FCVTZS/FCVTZU Xd, Vn (rmode=11 with opc=000/001); other rmode/opc pairs give the other forms
+ *
+ * @param sf    1=64-bit int, 0=32-bit int
+ * @param S     bit 29 of the word; 0 in the allocated encodings (precision is chosen by type, not by S)
+ * @param type  precision: 00=single, 01=double, 11=half
+ * @param rmode 00=round to nearest, 01=round towards +inf, 10=round towards -inf, 11=round towards zero
+ * @param opc   000=signed convert (FCVT*S), 001=unsigned convert (FCVT*U), 010=SCVTF, 011=UCVTF, 110/111=FMOV
+ * @param Rn    Source FP register
+ * @param Rd    Destination integer register
+ */
+void encode_fcvt_int(code_ctx *ctx, int sf, int S, int type, int rmode, int opc, Arm64FpReg Rn, Arm64Reg Rd) {
+	// Layout: sf[31] 0[30] S[29] 11110[28:24] type[23:22] 1[21] rmode[20:19] opc[18:16] 000000[15:10] Rn[9:5] Rd[4:0]
+	unsigned int opword = BITS(0x1E, 24, 5) | BIT(1, 21) | BIT(sf, 31) | BIT(S, 29);
+	opword |= BITS(type, 22, 2) | BITS(rmode, 19, 2) | BITS(opc, 16, 3);
+	opword |= BITS(Rn, 5, 5) | BITS(Rd, 0, 5);
+	EMIT32(ctx, opword);
+}
+
+/**
+ * Encode integer conversion to floating-point
+ * Format: SCVTF/UCVTF Vd, Xn
+ *
+ * @param sf    1=64-bit int, 0=32-bit int
+ * @param S     bit 29 of the word; 0 in the allocated encodings (precision is chosen by type, not by S)
+ * @param type  precision: 00=single, 01=double
+ * @param rmode 00 for conversions
+ * @param opc   010=SCVTF, 011=UCVTF
+ * @param Rn    Source integer register
+ * @param Rd    Destination FP register
+ */
+void encode_int_fcvt(code_ctx *ctx, int sf, int S, int type, int rmode, int opc, Arm64Reg Rn, Arm64FpReg Rd) {
+	// Layout: sf[31] 0[30] S[29] 11110[28:24] type[23:22] 1[21] rmode[20:19] opc[18:16] 000000[15:10] Rn[9:5] Rd[4:0]
+	unsigned int opword = BITS(0x1E, 24, 5) | BIT(1, 21) | BIT(sf, 31) | BIT(S, 29);
+	opword |= BITS(type, 22, 2) | BITS(rmode, 19, 2) | BITS(opc, 16, 3);
+	opword |= BITS(Rn, 5, 5) | BITS(Rd, 0, 5);
+	EMIT32(ctx, opword);
+}
+
+// ============================================================================
+// Conditional Select
+// ============================================================================
+
+/**
+ * Encode CSEL/CSINC/CSINV/CSNEG
+ * Format: CSEL Xd, Xn, Xm, cond
+ *
+ * @param sf    1=64-bit, 0=32-bit
+ * @param op    bit 30 of the word; together with op2 it selects the instruction (see op2)
+ * @param Rm    Second source register
+ * @param cond  Condition code
+ * @param op2   bits [11:10]; (op,op2): (0,00)=CSEL, (0,01)=CSINC, (1,00)=CSINV, (1,01)=CSNEG
+ * @param Rn    First source register
+ * @param Rd    Destination register
+ */
+void encode_cond_select(code_ctx *ctx, int sf, int op, Arm64Reg Rm, ArmCondition cond,
+                        int op2, Arm64Reg Rn, Arm64Reg Rd) {
+	// Layout: sf[31] op[30] S=0[29] 11010100[28:21] Rm[20:16] cond[15:12] op2[11:10] Rn[9:5] Rd[4:0]
+	// 0xD4 placed at bit 21 supplies the constant bits [28:21].
+	unsigned int opword = BITS(0xD4, 21, 8) | BIT(sf, 31) | BIT(op, 30);
+	opword |= BITS(Rm, 16, 5) | BITS(cond, 12, 4) | BITS(op2, 10, 2);
+	opword |= BITS(Rn, 5, 5) | BITS(Rd, 0, 5);
+	EMIT32(ctx, opword);
+}
+
+// ============================================================================
+// High-Level Helper Functions
+// ============================================================================
+
+// ----------------------------------------------------------------------------
+// Logical Immediate Encoding Helpers
+// ----------------------------------------------------------------------------
+
+/**
+ * Circular right shift of a 64-bit word; only `rotation` modulo 64 matters.
+ */
+static inline uint64_t rotate_right_64(uint64_t val, int rotation) {
+	return (val << ((64 - (rotation & 63)) & 63)) | (val >> (rotation & 63));
+}
+
+/**
+ * Decide whether a 64-bit constant is an AArch64 bitmask ("logical")
+ * immediate and, when it is, produce its N/immr/imms fields.
+ *
+ * The approach follows dougallj's write-up:
+ * https://dougallj.wordpress.com/2021/10/30/
+ *
+ * Encodable values are one contiguous run of 1-bits inside an element of
+ * 2, 4, 8, 16, 32 or 64 bits, optionally rotated, with that element
+ * repeated to fill the register.
+ *
+ * @param val   candidate constant
+ * @param N     out: N field (1 only for a 64-bit element); written on success only
+ * @param immr  out: rotation field (6 bits); written on success only
+ * @param imms  out: element-size/run-length field (6 bits); written on success only
+ * @return      true when val is encodable, false otherwise
+ */
+static bool is_logical_immediate_64(uint64_t val, int *N, int *immr, int *imms) {
+	// 0 and ~0 have no bitmask-immediate encoding.
+	if (val == 0 || val == ~(uint64_t)0)
+		return false;
+
+	// val & (val + 1) drops any run of ones anchored at bit 0; the lowest bit
+	// still set marks where a run begins, and rotating right by that index
+	// anchors a run at bit 0. If nothing is left, no rotation is needed (and ctz(0) is undefined).
+	uint64_t above_low_run = val & (val + 1);
+	int rot = above_low_run ? __builtin_ctzll(above_low_run) : 0;
+	uint64_t aligned = rotate_right_64(val, rot);
+
+	// Candidate element width = leading zeros + trailing ones of the aligned value.
+	int run_len = __builtin_ctzll(~aligned);
+	int gap_len = __builtin_clzll(aligned);
+	int elem = gap_len + run_len;
+
+	// An encodable value is unchanged when rotated by its element width; per
+	// the reference this also rejects widths that are not powers of two.
+	if (val != rotate_right_64(val, elem))
+		return false;
+
+	// Field packing, same arithmetic as the reference algorithm.
+	*N = elem >> 6;
+	*immr = (-rot) & (elem - 1);
+	*imms = ((-(elem << 1)) | (run_len - 1)) & 0x3f;
+
+	return true;
+}
+
+/**
+ * 32-bit counterpart of is_logical_immediate_64: decides whether a 32-bit
+ * constant is a bitmask immediate for a 32-bit operation (N must be 0).
+ *
+ * @param val   candidate constant
+ * @param N     out: N field as produced by the 64-bit routine (0 when this returns true)
+ * @param immr  out: rotation field
+ * @param imms  out: element-size/run-length field
+ * @return      true when val is encodable, false otherwise
+ */
+static bool is_logical_immediate_32(uint32_t val, int *N, int *immr, int *imms) {
+	// 0 and 0xFFFFFFFF have no bitmask-immediate encoding.
+	if (val == 0 || (uint32_t)~val == 0)
+		return false;
+
+	// Mirror the word into both halves so the 64-bit routine sees a pattern
+	// whose repeating element is at most 32 bits wide.
+	uint64_t doubled = (uint64_t)val | ((uint64_t)val << 32);
+
+	// *N is only inspected after the 64-bit check succeeded (it is written
+	// on success only). A result that needs N=1 is reported as not
+	// encodable, because 32-bit operations require N=0.
+	bool encodable = is_logical_immediate_64(doubled, N, immr, imms);
+	if (encodable && *N != 0)
+		encodable = false;
+	return encodable;
+}
+
+// ----------------------------------------------------------------------------
+
+/**
+ * Load an immediate value into a register
+ * Uses MOV/MOVN/MOVZ or ORR (logical immediate) when one instruction suffices, else MOVZ/MOVN + MOVK
+ *
+ * @param val       immediate value; only the low 32 bits are used when is_64bit is false
+ * @param dst       Destination register
+ * @param is_64bit  true=64-bit register, false=32-bit register
+ */
+void load_immediate(code_ctx *ctx, int64_t val, Arm64Reg dst, bool is_64bit) {
+	int sf = is_64bit ? 1 : 0;
+
+	// A W-register write only observes the low 32 bits, so a 32-bit request is
+	// reduced to the sign-extended form of those bits. Otherwise a value such
+	// as 0x100000000 (low word zero, upper bits set) matches no case below and
+	// no instruction is emitted, leaving dst unwritten.
+	if (!is_64bit)
+		val = (int64_t)(int32_t)(uint32_t)val;
+
+	// Special case: zero
+	if (val == 0) {
+		// MOV Xd, XZR (using ORR with XZR)
+		encode_logical_reg(ctx, sf, 0x01, 0, 0, XZR, 0, XZR, dst);
+		return;
+	}
+
+	// Special case: all ones. After the reduction above this is -1 in both
+	// widths (0xFFFFFFFF for 32-bit, 0xFFFFFFFFFFFFFFFF for 64-bit).
+	if (val == -1) {
+		// MOVN Xd, #0
+		encode_mov_wide_imm(ctx, sf, 0x00, 0, 0, dst);
+		return;
+	}
+
+	// Special case: small negative values that fit in a single MOVN instruction
+	// MOVN Xd, #imm16 produces ~imm16, which equals -(imm16+1)
+	// So for values in range [-65536, -1], we can use a single MOVN
+	if (val < 0 && val >= -65536) {
+		// ~val gives us the immediate to use with MOVN
+		// e.g., for val=-8: ~(-8) = 7, and MOVN Xd, #7 produces ~7 = -8
+		encode_mov_wide_imm(ctx, sf, 0x00, 0, (int)(~val) & 0xFFFF, dst);
+		return;
+	}
+
+	// Special case: small positive values that fit in a single MOVZ instruction
+	if (val > 0 && val <= 65535) {
+		encode_mov_wide_imm(ctx, sf, 0x02, 0, (int)val, dst);
+		return;
+	}
+
+	// Try logical immediate encoding: ORR Xd, XZR, #imm
+	// This can load many bitmask patterns with a single instruction
+	{
+		int N, immr, imms;
+		bool can_encode = is_64bit
+			? is_logical_immediate_64((uint64_t)val, &N, &immr, &imms)
+			: is_logical_immediate_32((uint32_t)val, &N, &immr, &imms);
+
+		if (can_encode) {
+			// ORR Xd, XZR, #imm  (opc=0x01 for ORR)
+			encode_logical_imm(ctx, sf, 0x01, N, immr, imms, XZR, dst);
+			return;
+		}
+	}
+
+	// Multi-instruction fallback: build the value 16 bits at a time, looking
+	// only at the halfwords that belong to the destination width.
+	uint64_t uval = (uint64_t)val;
+	int total_hw = is_64bit ? 4 : 2;
+
+	int zero_count = 0;  // halfwords equal to 0x0000: free after a MOVZ
+	int ones_count = 0;  // halfwords equal to 0xFFFF: free after a MOVN
+	for (int i = 0; i < total_hw; i++) {
+		int hw_val = (int)((uval >> (i * 16)) & 0xFFFF);
+		if (hw_val == 0) zero_count++;
+		else if (hw_val == 0xFFFF) ones_count++;
+	}
+
+	// A MOVZ-based sequence costs one instruction per non-zero halfword and a
+	// MOVN-based one costs one per halfword that is not 0xFFFF, so MOVN wins
+	// exactly when 0xFFFF halfwords outnumber zero halfwords. (A 0xFFFF
+	// halfword is itself non-zero, so comparing against the non-zero count
+	// could never select MOVN.)
+	bool use_movn = (ones_count > zero_count);
+
+	if (use_movn) {
+		// Use MOVN (inverted) + MOVK
+		int first = 1;
+		for (int i = 0; i < total_hw; i++) {
+			int hw_val = (uval >> (i * 16)) & 0xFFFF;
+			if (hw_val != 0xFFFF) {
+				if (first) {
+					// MOVN Xd, #(~hw_val & 0xFFFF), LSL #(i*16)
+					encode_mov_wide_imm(ctx, sf, 0x00, i, (~hw_val) & 0xFFFF, dst);
+					first = 0;
+				} else {
+					// MOVK Xd, #hw_val, LSL #(i*16)
+					encode_mov_wide_imm(ctx, sf, 0x03, i, hw_val, dst);
+				}
+			}
+		}
+	} else {
+		// Use MOVZ + MOVK
+		int first = 1;
+		for (int i = 0; i < total_hw; i++) {
+			int hw_val = (uval >> (i * 16)) & 0xFFFF;
+			if (hw_val != 0) {
+				if (first) {
+					// MOVZ Xd, #hw_val, LSL #(i*16)
+					encode_mov_wide_imm(ctx, sf, 0x02, i, hw_val, dst);
+					first = 0;
+				} else {
+					// MOVK Xd, #hw_val, LSL #(i*16)
+					encode_mov_wide_imm(ctx, sf, 0x03, i, hw_val, dst);
+				}
+			}
+		}
+	}
+}
diff --git a/src/jit_aarch64_emit.h b/src/jit_aarch64_emit.h
new file mode 100644
index 000000000..0371af69c
--- /dev/null
+++ b/src/jit_aarch64_emit.h
@@ -0,0 +1,240 @@
+/*
+ * Copyright (C)2015-2026 Haxe Foundation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+#ifndef JIT_AARCH64_EMIT_H
+#define JIT_AARCH64_EMIT_H
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <hlmodule.h>
+#include <jit.h>
+#include "data_struct.h"
+
+// Per-TU instantiation of byte_arr (the code buffer type).
+// Helpers are static-inline so two TUs may include this header without ODR conflict.
+#define S_TYPE			byte_arr
+#define S_NAME(name)	byte_##name
+#define S_VALUE			unsigned char
+#include "data_struct.c"
+#define byte_reserve(set,count)	byte_reserve_impl(DEF_ALLOC,&set,count)
+
+// value_map: dedup uint64 constants in the literal pool (Phase 4+).
+#define S_SORTED
+#define S_MAP
+#define S_TYPE			value_map
+#define S_NAME(name)	value_map_##name
+#define S_KEY			uint64
+#define S_VALUE			int
+#define S_DEFVAL		-1
+#include "data_struct.c"
+#undef S_MAP
+#undef S_SORTED
+
+// Backend codegen context (each backend defines its own _code_ctx layout).
+// Phase 2: function shell + branch fixups + per-IR-op pos_map.
+// Phase 4: constant pool, function-call relocations, jump-table absolutes.
+struct _code_ctx {
+	jit_ctx *jit;
+	byte_arr code;
+	// Each pending branch is a triple (code_byte_pos, target_ir_op, is_cond)
+	// patched after the function's pos_map is finalized.
+	int_arr branch_fixups;
+	int *pos_map;
+	int cur_op;
+	bool flushed;
+	// Phase 4: cross-function call relocations (BL imm26 or ADRP+ADD).
+	// Triples (code_byte_pos, fid, kind) where kind=0:BL, kind=1:ADRP+ADD pair.
+	int_arr funs;
+	// Constant pool. Each constant ref is (adrp_pos, const_offset); patched in
+	// hl_codegen_flush_consts to ADRP imm21 + LDR/ADD imm12 split.
+	value_map const_table_lookup;
+	byte_arr const_table;
+	int_arr const_refs;
+	// Jump-table absolute fills: pairs (table_offs, target_byte_pos_in_output).
+	// In hl_codegen_final each entry becomes `final_code + target` written into
+	// `final_code + const_table_pos + table_offs`.
+	int_arr const_addr;
+	int const_table_pos;
+	// Direct-call shortcuts for null-access stubs (BL within ±128 MB).
+	int null_access_pos;
+	int null_field_pos;
+};
+
+// Write a 32-bit instruction to ctx->code. Caller is responsible for byte_reserve.
+#define EMIT32(ctx, val) do { \
+	*(unsigned int*)&(ctx)->code.values[(ctx)->code.cur] = (unsigned int)(val); \
+	(ctx)->code.cur += 4; \
+} while(0)
+
+/*
+ * AArch64 Register Definitions
+ */
+
+// General Purpose Registers (64-bit: X0-X30, 32-bit: W0-W30)
+typedef enum {
+	X0  = 0,  X1  = 1,  X2  = 2,  X3  = 3,
+	X4  = 4,  X5  = 5,  X6  = 6,  X7  = 7,
+	X8  = 8,  X9  = 9,  X10 = 10, X11 = 11,
+	X12 = 12, X13 = 13, X14 = 14, X15 = 15,
+	X16 = 16, X17 = 17, X18 = 18, X19 = 19,
+	X20 = 20, X21 = 21, X22 = 22, X23 = 23,
+	X24 = 24, X25 = 25, X26 = 26, X27 = 27,
+	X28 = 28, X29 = 29, X30 = 30,
+
+	// Special register names
+	FP = 29,      // Frame Pointer (X29)
+	LR = 30,      // Link Register (X30)
+	SP_REG = 31,  // Stack Pointer (encoding value, context-dependent)
+	XZR = 31      // Zero Register (encoding value, context-dependent)
+} Arm64Reg;
+
+// 32-bit register names (W registers)
+typedef enum {
+	W0  = 0,  W1  = 1,  W2  = 2,  W3  = 3,
+	W4  = 4,  W5  = 5,  W6  = 6,  W7  = 7,
+	W8  = 8,  W9  = 9,  W10 = 10, W11 = 11,
+	W12 = 12, W13 = 13, W14 = 14, W15 = 15,
+	W16 = 16, W17 = 17, W18 = 18, W19 = 19,
+	W20 = 20, W21 = 21, W22 = 22, W23 = 23,
+	W24 = 24, W25 = 25, W26 = 26, W27 = 27,
+	W28 = 28, W29 = 29, W30 = 30,
+	WZR = 31  // 32-bit zero register
+} Arm64Reg32;
+
+// Floating-Point/SIMD Registers
+typedef enum {
+	V0  = 0,  V1  = 1,  V2  = 2,  V3  = 3,
+	V4  = 4,  V5  = 5,  V6  = 6,  V7  = 7,
+	V8  = 8,  V9  = 9,  V10 = 10, V11 = 11,
+	V12 = 12, V13 = 13, V14 = 14, V15 = 15,
+	V16 = 16, V17 = 17, V18 = 18, V19 = 19,
+	V20 = 20, V21 = 21, V22 = 22, V23 = 23,
+	V24 = 24, V25 = 25, V26 = 26, V27 = 27,
+	V28 = 28, V29 = 29, V30 = 30, V31 = 31
+} Arm64FpReg;
+
+// Aliases for specific precision
+// D0-D31 = 64-bit (double precision) - same encoding as V0-V31
+// S0-S31 = 32-bit (single precision) - same encoding as V0-V31
+// H0-H31 = 16-bit (half precision) - same encoding as V0-V31
+
+/*
+ * Condition Codes for Conditional Branches and Selects
+ */
+typedef enum {
+	COND_EQ = 0x0,  // Equal (Z == 1)
+	COND_NE = 0x1,  // Not equal (Z == 0)
+	COND_CS = 0x2,  // Carry set (C == 1), also HS (unsigned higher or same)
+	COND_CC = 0x3,  // Carry clear (C == 0), also LO (unsigned lower)
+	COND_MI = 0x4,  // Minus/negative (N == 1)
+	COND_PL = 0x5,  // Plus/positive or zero (N == 0)
+	COND_VS = 0x6,  // Overflow set (V == 1)
+	COND_VC = 0x7,  // Overflow clear (V == 0)
+	COND_HI = 0x8,  // Unsigned higher (C == 1 && Z == 0)
+	COND_LS = 0x9,  // Unsigned lower or same (C == 0 || Z == 1)
+	COND_GE = 0xA,  // Signed greater than or equal (N == V)
+	COND_LT = 0xB,  // Signed less than (N != V)
+	COND_GT = 0xC,  // Signed greater than (Z == 0 && N == V)
+	COND_LE = 0xD,  // Signed less than or equal (Z == 1 || N != V)
+	COND_AL = 0xE,  // Always (unconditional)
+	COND_NV = 0xF   // Never (reserved, don't use)
+} ArmCondition;
+
+// Aliases
+#define COND_HS COND_CS  // Unsigned higher or same
+#define COND_LO COND_CC  // Unsigned lower
+
+/*
+ * Extend/Shift Types
+ */
+typedef enum {
+	EXTEND_UXTB = 0,  // Unsigned extend byte
+	EXTEND_UXTH = 1,  // Unsigned extend halfword
+	EXTEND_UXTW = 2,  // Unsigned extend word
+	EXTEND_UXTX = 3,  // Unsigned extend doubleword (64-bit, same as LSL)
+	EXTEND_SXTB = 4,  // Signed extend byte
+	EXTEND_SXTH = 5,  // Signed extend halfword
+	EXTEND_SXTW = 6,  // Signed extend word
+	EXTEND_SXTX = 7   // Signed extend doubleword
+} ArmExtend;
+
+typedef enum {
+	SHIFT_LSL = 0,  // Logical shift left
+	SHIFT_LSR = 1,  // Logical shift right
+	SHIFT_ASR = 2,  // Arithmetic shift right
+	SHIFT_ROR = 3   // Rotate right
+} ArmShift;
+
+/*
+ * Function Declarations
+ */
+
+// ADD/SUB instructions
+void encode_add_sub_imm(code_ctx *ctx, int sf, int op, int S, int shift, int imm12, Arm64Reg Rn, Arm64Reg Rd);
+void encode_add_sub_reg(code_ctx *ctx, int sf, int op, int S, int shift, Arm64Reg Rm, int imm6, Arm64Reg Rn, Arm64Reg Rd);
+void encode_add_sub_ext(code_ctx *ctx, int sf, int op, int S, Arm64Reg Rm, int option, int imm3, Arm64Reg Rn, Arm64Reg Rd);
+
+// Logical instructions
+void encode_logical_imm(code_ctx *ctx, int sf, int opc, int N, int immr, int imms, Arm64Reg Rn, Arm64Reg Rd);
+void encode_logical_reg(code_ctx *ctx, int sf, int opc, int shift, int N, Arm64Reg Rm, int imm6, Arm64Reg Rn, Arm64Reg Rd);
+
+// Move wide immediate
+void encode_mov_wide_imm(code_ctx *ctx, int sf, int opc, int hw, int imm16, Arm64Reg Rd);
+
+// Multiply/divide
+void encode_madd_msub(code_ctx *ctx, int sf, int op, Arm64Reg Rm, Arm64Reg Ra, Arm64Reg Rn, Arm64Reg Rd);
+void encode_div(code_ctx *ctx, int sf, int U, Arm64Reg Rm, Arm64Reg Rn, Arm64Reg Rd);
+
+// Shift instructions
+void encode_shift_reg(code_ctx *ctx, int sf, int op2, Arm64Reg Rm, Arm64Reg Rn, Arm64Reg Rd);
+
+// Load/store instructions
+void encode_ldr_str_imm(code_ctx *ctx, int size, int V, int opc, int imm12, Arm64Reg Rn, Arm64Reg Rt);
+void encode_ldr_str_reg(code_ctx *ctx, int size, int V, int opc, Arm64Reg Rm, int option, int S, Arm64Reg Rn, Arm64Reg Rt);
+void encode_ldur_stur(code_ctx *ctx, int size, int V, int opc, int imm9, Arm64Reg Rn, Arm64Reg Rt);
+void encode_ldp_stp(code_ctx *ctx, int opc, int V, int mode, int imm7, Arm64Reg Rt2, Arm64Reg Rn, Arm64Reg Rt);
+
+// PC-relative addressing
+void encode_adrp(code_ctx *ctx, int immlo, int immhi, Arm64Reg Rd);
+void encode_adr(code_ctx *ctx, int immlo, int immhi, Arm64Reg Rd);
+
+// Branch instructions
+void encode_branch_cond(code_ctx *ctx, int imm19, ArmCondition cond);
+void encode_branch_uncond(code_ctx *ctx, int imm26);
+void encode_branch_link(code_ctx *ctx, int imm26);
+void encode_branch_reg(code_ctx *ctx, int opc, Arm64Reg Rn);
+void encode_cbz_cbnz(code_ctx *ctx, int sf, int op, int imm19, Arm64Reg Rt);
+void encode_tbz_tbnz(code_ctx *ctx, int b5, int op, int b40, int imm14, Arm64Reg Rt);
+
+// Floating-point instructions
+void encode_fp_arith(code_ctx *ctx, int M, int S, int type, Arm64FpReg Rm, int opcode, Arm64FpReg Rn, Arm64FpReg Rd);
+void encode_fp_1src(code_ctx *ctx, int M, int S, int type, int opcode, Arm64FpReg Rn, Arm64FpReg Rd);
+void encode_fp_compare(code_ctx *ctx, int M, int S, int type, Arm64FpReg Rm, int op, Arm64FpReg Rn);
+void encode_fcvt_int(code_ctx *ctx, int sf, int S, int type, int rmode, int opc, Arm64FpReg Rn, Arm64Reg Rd);
+void encode_int_fcvt(code_ctx *ctx, int sf, int S, int type, int rmode, int opc, Arm64Reg Rn, Arm64FpReg Rd);
+
+// Conditional select
+void encode_cond_select(code_ctx *ctx, int sf, int op, Arm64Reg Rm, ArmCondition cond, int op2, Arm64Reg Rn, Arm64Reg Rd);
+
+// High-level helpers
+void load_immediate(code_ctx *ctx, int64_t val, Arm64Reg dst, bool is_64bit);
+
+#endif // JIT_AARCH64_EMIT_H
diff --git a/src/jit_dump.c b/src/jit_dump.c
new file mode 100644
index 000000000..c1b16a073
--- /dev/null
+++ b/src/jit_dump.c
@@ -0,0 +1,584 @@
+/*
+ * Copyright (C)2015-2016 Haxe Foundation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+#include <jit.h>
+
+static const char *op_names[] = { // display names for IR instructions; dump_instr indexes this table with einstr->op
+	"load-addr",
+	"load-const",
+	"load-arg",
+	"load-fun",
+	"store",
+	"lea",
+	"test",
+	"cmp",
+	"jcond",
+	"jump",
+	"jump-table",
+	"binop",
+	"unop",
+	"conv",
+	"conv-unsigned",
+	"ret",
+	"call", // NOTE(review): three consecutive entries print as "call" — presumably the CALL_PTR/CALL_REG/CALL_FUN variants; confirm order against emit_op
+	"call",
+	"call",
+	"mov",
+	"cmov",
+	"xchg",
+	"cxhg", // NOTE(review): looks like a typo (maybe "cxchg"?) — this text appears verbatim in dumps, confirm before changing
+	"push-const",
+	"push",
+	"pop",
+	"alloc-stack",
+	"prefetch",
+	"debug-break",
+	"block",
+	"enter",
+	"stack",
+	"catch",
+	"address",
+	"nop"
+};
+
+bool hl_jit_dump_bin = false;
+
+const char *hl_natreg_str( int reg, emit_mode m );
+
+// Format an IR register reference for dumps: "V<n>", "P<n>", a constant, or a name from hl_natreg_str with an
+// optional +/- hex offset ("[...]" for R_REG_PTR). Returns one of four rotating static slots (valid for 4 calls).
+const char *hl_emit_regstr( ereg v, emit_mode m ) {
+	static char fmts[4][48]; // was 10 bytes: "[reg+7FFFFFFFh]" or "V1234567890" overflowed it
+	static int flip = 0;
+	char *fmt = fmts[flip];
+	const size_t len = sizeof(fmts[0]);
+	flip = (flip + 1) & 3; // wrap explicitly instead of letting a signed counter grow without bound
+	if( IS_NULL(v) ) {
+		snprintf(fmt,len,"NULL???");
+		return fmt;
+	}
+	int val = REG_VALUE(v);
+	const char *name;
+	switch( REG_KIND(v) ) {
+	case R_VALUE:
+		snprintf(fmt,len,"V%d",v);
+		break;
+	case R_PHI:
+		snprintf(fmt,len,"P%d",-v);
+		break;
+	case R_CONST:
+		snprintf(fmt,len,"%d",val);
+		break;
+	case R_REG:
+		name = hl_natreg_str(v,m);
+		if( val == 0 ) snprintf(fmt,len,"%s",name);
+		else if( val > 0 ) snprintf(fmt,len,"%s+%Xh",name,val);
+		else snprintf(fmt,len,"%s-%Xh",name,-val);
+		break;
+	case R_REG_PTR:
+		name = REG_REG(v) == STACK_REG ? "ST" : hl_natreg_str(v,M_PTR); // STACK_REG-based operands print as "ST"
+		if( val == 0 ) snprintf(fmt,len,"[%s]",name);
+		else if( val > 0 ) snprintf(fmt,len,"[%s+%Xh]",name,val);
+		else snprintf(fmt,len,"[%s-%Xh]",name,-val);
+		break;
+	default:
+		jit_assert();
+		break;
+	}
+	return fmt;
+}
+
+// Print one operand of an HL opcode, preceded by `sep`. `fmt` selects the rendering:
+// 0 prints nothing at all, 1-2 a register index ("R<n>", with "?" appended when the index
+// is outside [0,fun->nregs)), 3 a plain integer, 4 a bracketed integer, 5-6 an offset
+// resolved against pos+1 and printed as "@<hex>"; anything else is shown as "?#<fmt>".
+static void hl_dump_arg( hl_function *fun, int fmt, int val, char sep, int pos ) {
+	if( fmt == 0 )
+		return;
+	printf("%c", sep);
+	if( fmt == 1 || fmt == 2 ) {
+		bool in_range = val >= 0 && val < fun->nregs;
+		printf("R%d", val);
+		if( !in_range )
+			printf("?");
+	} else if( fmt == 3 ) {
+		printf("%d", val);
+	} else if( fmt == 4 ) {
+		printf("[%d]", val);
+	} else if( fmt == 5 || fmt == 6 ) {
+		int target = val + pos + 1;
+		printf("@%X", target);
+	} else {
+		printf("?#%d", fmt);
+	}
+}
+
+#define OP(_,_a,_b,_c) ((_a) | (((_b)&0xFF) << 8) | (((_c)&0xFF) << 16)),
+#define OP_BEGIN static int hl_op_fmt[] = {
+#define OP_END };
+#undef R
+#include "opcodes.h"
+
+static void hl_dump_op( hl_function *fun, hl_opcode *op ) { // print one HL opcode and its operands, laid out per hl_op_fmt
+	printf("%s", hl_op_name(op->op) + 1); // the first character of the opcode name is skipped
+	int fmt = hl_op_fmt[op->op]; // three operand formats packed one per byte by the OP() macro
+	int pos = (int)(op - fun->ops); // index of this opcode inside the function
+	hl_dump_arg(fun, fmt & 0xFF, op->p1, ' ', pos);
+	if( ((fmt >> 8) & 0xFF) == 5 ) { // second format 5: the remaining operands are printed as a bracketed list
+		int count = (fmt >> 16) & 0xFF;
+		printf(" [");
+		if( count == 4 ) {
+			printf("%d", op->p2);
+			printf(",%d", op->p3);
+			printf(",%d", (int)(int_val)op->extra); // here `extra` carries an integer, not an array
+		} else if( op->op == OSwitch ) {
+			for(int i=0;i<op->p2;i++) { // p2 case offsets, each resolved against pos+1
+				if( i != 0 ) printf(",");
+				printf("@%X", (op->extra[i] + pos + 1));
+			}
+			printf(",def=@%X", op->p3 + pos + 1);
+		} else {
+			if( count == 0xFF )
+				count = op->p3; // variable length: p3 gives the number of extra entries
+			else {
+				printf("%d,%d,",op->p2,op->p3);
+				count -= 3; // p1..p3 are already accounted for
+			}
+			for(int i=0;i<count;i++) {
+				if( i != 0 ) printf(",");
+				printf("%d", op->extra[i]);
+			}
+		}
+		printf("]");
+	} else {
+		hl_dump_arg(fun, (fmt >> 8) & 0xFF, op->p2,',', pos);
+		hl_dump_arg(fun, fmt >> 16, op->p3,',', pos);
+	}
+}
+
+// Suffix printed after an instruction name for its operand mode ("" for M_PTR). Unknown modes
+// come back as "?<n>" in a shared static buffer that the next unknown mode overwrites.
+static const char *emit_mode_str( emit_mode mode ) {
+	static char buf[50]; // hoisted: a declaration right after a `default:` label is invalid before C23
+	switch( mode ) {
+	case M_UI8: return "-ui8";
+	case M_UI16: return "-ui16";
+	case M_I32: return "-i32";
+	case M_F32: return "-f32";
+	case M_F64: return "-f64";
+	case M_PTR: return "";
+	case M_VOID: return "-void";
+	case M_NORET: return "-noret";
+	default: sprintf(buf,"?%d",mode); return buf;
+	}
+}
+
+static void dump_value( jit_ctx *ctx, uint64 value, emit_mode mode ) { // print a raw 64-bit constant in the notation suited to `mode`
+	union { // reinterprets the payload bits as double/float without a numeric conversion
+		uint64 v;
+		double d;
+		float f;
+	} tmp;
+	hl_module *mod = ctx->mod;
+	hl_code *code = ctx->mod->code;
+	switch( mode ) {
+	case M_NONE:
+		printf("?0x%llX",value);
+		break;
+	case M_UI8:
+	case M_UI16:
+	case M_I32:
+		if( (int)value >= -0x10000 && (int)value <= 0x10000 ) // small magnitudes are printed in decimal, others in hex
+			printf("%d",(int)value);
+		else
+			printf("0x%X",(int)value);
+		break;
+	case M_F32:
+		tmp.v = value;
+		printf("%f",tmp.f); // NOTE(review): tmp.f overlays the first 4 bytes of v, i.e. the low 32 bits only on little-endian hosts
+		break;
+	case M_F64:
+		tmp.v = value;
+		printf("%g",tmp.d);
+		break;
+	default: // every other mode: try to print a symbolic name for the value
+		if( value == 0 )
+			printf("NULL");
+		else if( mode == M_PTR && value >= (uint64)code->types && value < (uint64)(code->types + code->ntypes) ) // address inside the module's type table
+			uprintf(USTR("<%s>"),hl_type_str((hl_type*)value));
+		else if( mode == M_PTR && value == (uint64)mod->globals_data )
+			printf("<globals>");
+		else if( value == (uint64)&hlt_void )
+			printf("<void>");
+		else
+			printf("0x%llX",value);
+		break;
+	}
+}
+
+static void hl_dump_fun_name( hl_function *f ) { // print the function's qualified name when one is known, always followed by "[<findex in hex>]"
+	if( f->obj ) { // owned by an object: "<obj>.<field>"
+		uprintf(USTR("%s."),f->obj->name);
+		uprintf(USTR("%s"),f->field.name);
+	}
+	else if( f->field.ref ) { // named after another function: "<obj>.~<field>.<ref>"
+		uprintf(USTR("%s."),f->field.ref->obj->name);
+		uprintf(USTR("~%s"),f->field.ref->field.name);
+		printf(".%d",f->ref);
+	}
+	printf("[%X]", f->findex);
+}
+
+// Print the arguments attached to instruction `e` as "(a,b,...)" using val_str with M_NONE.
+// Nothing is printed, not even the parentheses, when e->nargs holds the 0xFF marker.
+static void hl_dump_args( jit_ctx *ctx, einstr *e ) {
+	if( e->nargs == 0xFF ) return;
+	ereg *args = hl_emit_get_args(ctx->emit, e);
+	int count = e->nargs;
+	printf("(");
+	for(int k=0;k<count;k++)
+		printf(k ? ",%s" : "%s", val_str(args[k],M_NONE));
+	printf(")");
+}
+
+typedef struct { const char *name; void *ptr; } named_ptr;
+// Print a symbolic name for a code pointer: first a lazily filled table of runtime helpers
+// ("<name>"), then the module's natives ("<lib.name>", a leading '?' in lib is dropped);
+// when nothing matches, the raw address is printed as "<?0x...>".
+static void hl_dump_ptr_name( jit_ctx *ctx, void *ptr ) {
+#	define N(v)	ptr_names[i].name = #v; ptr_names[i].ptr = v; i++
+#	define N2(n,v)	ptr_names[i].name = n; ptr_names[i].ptr = v; i++
+#	define DYN(p) N2("dyn_get" #p, hl_dyn_get##p); N2("dyn_set" #p, hl_dyn_set##p); N2("dyn_cast" #p, hl_dyn_cast##p)
+	static named_ptr ptr_names[256] = { NULL };
+	int i = 0;
+	if( !ptr_names[0].ptr ) {
+		N(hl_alloc_dynbool);
+		N(hl_alloc_dynamic);
+		N(hl_alloc_obj);
+		N(hl_alloc_dynobj);
+		N(hl_alloc_virtual);
+		N(hl_alloc_closure_ptr);
+		N(hl_dyn_call);
+		N(hl_dyn_call_obj);
+		N(hl_throw);
+		N(hl_rethrow);
+		N(hl_to_virtual);
+		N(hl_alloc_enum);
+		N(hl_dyn_compare);
+		N(hl_same_type);
+		DYN(f);
+		DYN(d);
+		DYN(i64);
+		DYN(i);
+		DYN(p);
+		N2("null_field",hl_jit_null_field_access);
+		N2("null_access",hl_null_access);
+		N(hl_get_thread);
+		N(setjmp);
+		N(_setjmp);
+		N2("assert",hl_jit_assert);
+		N(fmod);
+		N(fmodf);
+	}
+#	undef N
+#	undef N2
+#	undef DYN
+	for( i = 0; ptr_names[i].ptr; i++ ) {
+		if( ptr_names[i].ptr != ptr ) continue;
+		printf("<%s>",ptr_names[i].name);
+		return;
+	}
+	for(i=0;i<ctx->mod->code->nnatives;i++) {
+		hl_native *n = ctx->mod->code->natives + i;
+		if( ctx->mod->functions_ptrs[n->findex] == ptr ) {
+			printf("<%s.%s>",n->lib[0] == '?' ? n->lib + 1 : n->lib,n->name);
+			return;
+		}
+	}
+	printf("<?0x%llX>",(uint64)ptr);
+}
+
+void hl_emit_flush( jit_ctx *ctx );
+void hl_regs_flush( jit_ctx *ctx );
+void hl_codegen_flush( jit_ctx *ctx );
+
+#define reg_str(r) val_str(r,e->mode)
+
+static void dump_instr( jit_ctx *ctx, einstr *e, int cur_pos ) { // print one IR instruction: name, optional sub-op and mode suffixes, then op-specific operands
+	printf("%s", op_names[e->op]);
+	bool show_size = true;
+	switch( e->op ) {
+	case TEST:
+	case CMP:
+		printf("-%s", hl_op_name(e->size_offs)+2); // size_offs holds an HL opcode here; its name is printed without the first two characters
+		show_size = false;
+		break;
+	case BINOP:
+	case UNOP:
+		printf("-%s", hl_op_name(e->size_offs)+1); // same, skipping a single character
+		show_size = false;
+		break;
+	default:
+		break;
+	}
+	if( e->mode )
+		printf("%s", emit_mode_str(e->mode));
+	switch( e->op ) {
+	case CALL_FUN:
+		printf(" ");
+		{
+			int fid = ctx->mod->functions_indexes[e->a]; // e->a is used as an index into functions_indexes
+			hl_code *code = ctx->mod->code;
+			if( fid < code->nfunctions ) {
+				hl_dump_fun_name(&code->functions[fid]);
+			} else {
+				printf("???");
+			}
+		}
+		hl_dump_args(ctx,e);
+		break;
+	case CALL_REG:
+		printf(" %s", val_str(e->a,M_PTR));
+		hl_dump_args(ctx,e);
+		break;
+	case CALL_PTR:
+		printf(" ");
+		hl_dump_ptr_name(ctx, (void*)e->value); // e->value carries the callee address
+		hl_dump_args(ctx,e);
+		break;
+	case JUMP:
+	case JCOND:
+		printf(" @%X", cur_pos + 1 + e->size_offs); // size_offs is relative to the following instruction
+		break;
+	case JUMP_TABLE:
+		{
+			int *offsets = hl_emit_get_args(ctx->emit, e); // the argument slots hold relative offsets for this op
+			printf(" %s (", reg_str(e->a));
+			for(int k=0;k<e->nargs;k++) {
+				if( k > 0 ) printf(",");
+				printf("@%X", cur_pos + 1 + offsets[k]);
+			}
+			printf(")");
+		}	
+		break;
+	case BLOCK:
+		printf(" #%d", e->size_offs); // size_offs is the block index
+		if( e->size_offs && ctx->blocks[e->size_offs].pred_count == 0 ) // a block other than #0 without predecessors is flagged
+			printf(" ???DEAD");
+		break;
+	case STACK_OFFS:
+		if( e->size_offs >= 0 )
+			printf(" +%Xh", e->size_offs);
+		else
+			printf(" -%Xh", -e->size_offs);
+		break;
+	case LOAD_CONST:
+	case PUSH_CONST:
+		printf(" ");
+		dump_value(ctx, e->value, e->mode);
+		break;
+	case LOAD_ADDR:
+		if( e->nargs != e->mode ) { // NOTE(review): nargs appears to hold a second emit mode for this op — confirm against the emitter
+			if( e->mode == M_PTR ) printf("-ptr");
+			printf("%s", e->nargs == M_PTR ? "-ptr" : emit_mode_str(e->nargs));
+		}
+		printf(" %s[%Xh]", val_str(e->a,M_PTR), e->size_offs);
+		break;
+	case STORE:
+		{
+			int offs = e->size_offs;
+			if( offs == 0 )
+				printf(" [%s]", val_str(e->a,M_PTR));
+			else
+				printf(" %s[%Xh]", val_str(e->a,M_PTR), offs);
+			printf(" = %s", reg_str(e->b));
+		}
+		break;
+	case CONV:
+	case CONV_UNSIGNED:
+		if( e->mode == M_PTR ) printf("-i64"); // pointer mode is displayed as i64 for conversions
+		printf("%s %s", e->size_offs == M_PTR ? "-i64" : emit_mode_str(e->size_offs), val_str(e->a,(emit_mode)e->size_offs)); // size_offs is the source mode
+		break;
+	case LEA:
+		printf(" [%s", reg_str(e->a));
+		if( !IS_NULL(e->b) ) printf("+%s", reg_str(e->b));
+		if( (e->size_offs&0xFF) > 1 ) printf("*%d",e->size_offs&0xFF); // low byte: index multiplier
+		if( e->size_offs >> 8 ) printf("+%Xh", e->size_offs>>8); // remaining bits: displacement
+		printf("]");
+		break;	
+	default:
+		if( !IS_NULL(e->a) ) {
+			printf(" %s", reg_str(e->a));
+			if( !IS_NULL(e->b) ) printf(", %s", reg_str(e->b));
+		}
+		if( show_size && e->size_offs != 0 )
+			printf(" %d", e->size_offs);
+		break;
+	}
+}
+
+// Dump the current function to stdout: its name and HL registers, then each IR instruction
+// interleaved with the HL opcode that produced it, the matching ctx->reg_instrs entries and
+// the emitted code bytes. Inconsistencies found along the way are flagged inline with "???".
+void hl_emit_dump( jit_ctx *ctx ) {
+	hl_function *f = ctx->fun;
+	int nargs = f->type->fun->nargs;
+	// flush every stage first, in case the dump is requested in the middle of compilation
+	hl_emit_flush(ctx);
+	hl_regs_flush(ctx);
+	hl_codegen_flush(ctx);
+	printf("function ");
+	hl_dump_fun_name(f);
+	printf("(");
+	for(int i=0;i<nargs;i++) {
+		if( i > 0 ) printf(",");
+		printf("R%d", i);
+	}
+	printf(")\n");
+	for(int i=0;i<f->nregs;i++) {
+		printf("\tR%d : ",i);
+		uprintf(USTR("%s\n"), hl_type_str(f->regs[i]));
+	}
+	// check that block intervals are contiguous and cover every instruction
+	int cur = 0;
+	for(int i=0;i<ctx->block_count;i++) {
+		eblock *b = ctx->blocks + i;
+		if( b->start_pos != cur ) printf("  ??? BLOCK %d START AT %X != %X\n", i, b->start_pos, cur);
+		if( b->end_pos < b->start_pos ) printf("  ??? BLOCK %d RANGE [%X,%X]\n", i, b->start_pos, b->end_pos);
+		cur = b->end_pos;
+	}
+	if( cur != ctx->instr_count )
+		printf("  ??? MISSING BLOCK FOR RANGE %X-%X\n", cur, ctx->instr_count);
+	// print instrs
+	int vpos = 1, rpos = 0, cpos = 0, cur_op = 0; // cursors into values_writes, reg_instrs, code_instrs and f->ops
+	bool new_op = false;
+	eblock *cur_block = NULL;
+	for(int icount=0;icount<ctx->instr_count;icount++) {
+		while( ctx->emit_pos_map[cur_op] == icount ) {
+			printf("@%X ", cur_op);
+			hl_dump_op(ctx->fun, f->ops + cur_op);
+			printf("\n");
+			new_op = true;
+			cur_op++;
+		}
+		einstr *e = ctx->instrs + icount;
+		printf("\t\t@%X ", icount);
+		if( vpos < ctx->value_count && ctx->values_writes[vpos] == icount )
+			printf("V%d = ", vpos++);
+		dump_instr(ctx, e, icount);
+		if( e->op == JCOND || e->op == JUMP ) {
+			int target = icount + 1 + e->size_offs;
+			bool bad = false;
+			if( !cur_block || icount + 1 >= ctx->instr_count || target < 0 || target >= ctx->instr_count ) // a jump before any BLOCK has no successor list to check
+				bad = true;
+			else if( ctx->instrs[target].op != BLOCK || (e->op == JCOND && ctx->instrs[icount+1].op != BLOCK) )
+				bad = true;
+			else {
+				bool found = false;
+				for(int k=0;k<cur_block->next_count;k++) {
+					if( cur_block->nexts[k] == ctx->instrs[target].size_offs )
+						found = true;
+					if( (e->op == JUMP || e->op == JUMP_TABLE) && ctx->instrs[icount+1].op == BLOCK && ctx->instrs[icount+1].size_offs == cur_block->nexts[k] )
+						printf(" ???LEAK");
+				}
+				if( !found ) printf(" ???NEXT");
+			}
+			if( bad )
+				printf(" ???");
+		}
+		if( e->op == BLOCK ) {
+			eblock *b = &ctx->blocks[e->size_offs];
+			for(int k=0;k<b->pred_count;k++) {
+				eblock *p = &ctx->blocks[b->preds[k]];
+				einstr *pe = &ctx->instrs[p->end_pos-1];
+				if( p->end_pos == icount ) // predecessor falls through into this block
+					continue;
+				bool bad = false;
+				if( (pe->op == JUMP || pe->op == JCOND) && pe->size_offs == icount - p->end_pos )
+					bad = false;
+				else if( pe->op != JUMP_TABLE )
+					bad = true;
+				if( bad )
+					printf(" ???PREV#%d",b->preds[k]);
+			}
+			for(int k=0;k<b->phi_count;k++) {
+				ephi *p = b->phis + k;
+				printf("\n\t\t@%X %s = phi%s(",icount,val_str(p->value,p->mode),emit_mode_str(p->mode));
+				for(int n=0;n<p->nvalues;n++) {
+					if( n > 0 ) printf(",");
+					printf("%s:%d",val_str(p->values[n],p->mode),p->blocks[n]);
+				}
+				if( p->nvalues == 0 )
+					printf("unwritten");
+				printf(")");
+				if( p->nvalues == 1 )
+					printf(" ???");
+			}
+			cur_block = b;
+		}
+		while( rpos < ctx->reg_instr_count && rpos < ctx->reg_pos_map[icount+1] ) {
+			ereg out = ctx->reg_writes[rpos];
+			e = ctx->reg_instrs + rpos; // reg_str() below picks up the mode of this instruction
+			printf("\n\t\t\t\t@%X ",rpos);
+			if( !IS_NULL(out) ) printf("%s = ",reg_str(out));
+			dump_instr(ctx,e,rpos);
+			bool first = true;
+			while( cpos < ctx->code_size && cpos < ctx->code_pos_map[rpos+1] ) {
+				if( first ) {
+					if( hl_jit_dump_bin )
+						printf("\t\t\t");
+					else
+						printf("\033[80G");
+					first = false;
+					if( new_op ) {
+						new_op = false;
+						cpos += ctx->cfg.debug_prefix_size; // the bytes of the per-opcode debug prefix are not printed
+						if( cpos == ctx->code_pos_map[rpos+1] ) break;
+					}
+				}
+				printf("%.2X",ctx->code_instrs[cpos++]);
+			}
+			rpos++;
+		}
+		printf("\n");
+	}
+	// leftover values; vpos advances in the loop header because `vpos, values_writes[vpos++]` in one call is unsequenced (UB)
+	for( ; vpos < ctx->value_count; vpos++ )
+		printf("  ??? UNWRITTEN VALUE V%d @%X\n", vpos, ctx->values_writes[vpos]);
+	// interrupted
+	if( cur_op < f->nops ) {
+		printf("@%X ", cur_op);
+		hl_dump_op(ctx->fun, f->ops + cur_op);
+		printf("\n\t\t...\n");
+	}
+	if( cpos == ctx->code_size && cpos > 0 ) {
+		int n = 1;
+		for(int i=0;i<cpos;i++) {
+			while( ctx->code_pos_map[n] == i ) {
+				if( (n & 15) == 0 ) printf("\n"); else printf(" ");
+				n++;
+			}
+			printf("%.2X", ctx->code_instrs[i]);
+		}
+	}
+	printf("\n\n");
+	fflush(stdout);
+}
diff --git a/src/jit_emit.c b/src/jit_emit.c
new file mode 100644
index 000000000..7524c5483
--- /dev/null
+++ b/src/jit_emit.c
@@ -0,0 +1,2214 @@
+/*
+ * Copyright (C)2015-2016 Haxe Foundation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+#include <hlmodule.h>
+#include <jit.h>
+#include <setjmp.h>
+#include "data_struct.h"
+
+//#define EMIT_DEBUG
+
+#ifdef EMIT_DEBUG
+#	define emit_debug	jit_debug
+#else
+#	define emit_debug(...)
+#endif
+
+int hl_emit_mode_sizes[] = {0,1,2,4,HL_WSIZE,8,4,0,0};
+
+typedef struct {
+	hl_type *t;
+	int id;
+	ereg stored;
+} vreg;
+
+#define MAX_TMP_ARGS	32
+#define MAX_TRAPS		32
+
+typedef struct _linked_inf linked_inf;
+typedef struct _emit_block emit_block;
+typedef struct _tmp_phi tmp_phi;
+
+#define S_TYPE			blocks
+#define S_NAME(name)	blocks_##name
+#define S_VALUE			emit_block*
+#include "data_struct.c"
+#define blocks_add(set,v)		blocks_add_impl(DEF_ALLOC,&(set),v)
+
+#define S_TYPE			phi_arr
+#define S_NAME(name)	phi_##name
+#define S_VALUE			tmp_phi*
+#include "data_struct.c"
+#define phi_add(set,v)		phi_add_impl(DEF_ALLOC,&(set),v)
+
+#define S_SORTED
+
+#define S_MAP
+#define S_TYPE			ereg_map
+#define S_NAME(name)	ereg_##name
+#define S_KEY			ereg
+#define S_VALUE			emit_block*
+#include "data_struct.c"
+#define ereg_add(set,k,v)		ereg_add_pair_impl(DEF_ALLOC,&(set),k,v)
+
+#define S_MAP
+
+#define S_TYPE			vreg_map
+#define S_NAME(name)	vreg_##name
+#define S_KEY			int
+#define S_VALUE			ereg
+#include "data_struct.c"
+#define vreg_replace(set,k,v) vreg_replace_impl(DEF_ALLOC,&(set),k,v)
+
+struct _linked_inf {
+	int id;
+	void *ptr;
+	linked_inf *next;
+};
+
+struct _emit_block {
+	int id;
+	int start_pos;
+	int end_pos;
+	int wait_nexts;
+	bool sealed;
+	blocks nexts;
+	blocks preds;
+	vreg_map written_vars;
+	phi_arr phis;
+	emit_block *wait_seal_next;
+};
+
+struct _tmp_phi {
+	ereg value;
+	vreg *r;
+	ereg target;
+	int final_id;
+	bool locked;
+	bool opt;
+	emit_mode mode;
+	emit_block *b;
+	ereg_map vals;
+	phi_arr ref_phis;
+	linked_inf *ref_blocks;
+};
+
+typedef struct {
+	ereg stack;
+	int target;
+} trap_inf;
+
+struct _emit_ctx {
+	hl_module *mod;
+	hl_function *fun;
+	jit_ctx *jit;
+
+	einstr *instrs;
+	vreg *vregs;
+	tmp_phi **phis;
+	int max_instrs;
+	int max_regs;
+	int max_phis;
+	int emit_pos;
+	int op_pos;
+	int phi_count;
+	int phi_depth;
+	bool flushed;
+
+	ereg tmp_args[MAX_TMP_ARGS];
+	trap_inf traps[MAX_TRAPS];
+	int *pos_map;
+	int pos_map_size;
+	int trap_count;
+
+	int_arr args_data;
+	int_arr jump_regs;
+	int_arr values;
+
+	blocks blocks;
+	emit_block *current_block;
+	emit_block *wait_seal;
+	linked_inf *arrival_points;
+	vclosure *closure_list;
+};
+
+#define R(i)	(ctx->vregs + (i))
+
+#define LOAD(r) emit_load_reg(ctx, r)
+#define STORE(r, v) emit_store_reg(ctx, r, v)
+#define LOAD_CONST(v, t) emit_load_const(ctx, (uint64)(v), t)
+#define LOAD_CONST_PTR(v) LOAD_CONST(v,&hlt_bytes)
+#define LOAD_MEM(v, offs, t) emit_load_mem(ctx, v, offs, t, t)
+#define LOAD_MEM_PTR(v, offs) LOAD_MEM(v, offs, &hlt_bytes)
+#define STORE_MEM(to, offs, v) emit_store_mem(ctx, to, offs, v)
+#define LOAD_OBJ_METHOD(obj,id) LOAD_MEM_PTR(LOAD_MEM_PTR(LOAD_MEM_PTR(obj,0),HL_WSIZE*2),HL_WSIZE*(id))
+#define OFFSET(base,index,mult,offset) emit_gen_ext(ctx, LEA, base, index, M_PTR, (mult) | ((offset) << 8))
+#define BREAK() emit_gen(ctx, DEBUG_BREAK, UNUSED, UNUSED, 0)
+#define GET_MODE(r) emit_get_mode(ctx,r)
+#define GET_PHI(r) ctx->phis[-(r)-1]
+#define HDYN_VALUE 8
+
+static hl_type hlt_ui8 = { HUI8, 0 };
+static hl_type hlt_ui16 = { HUI16, 0 };
+
+static linked_inf *link_add( emit_ctx *ctx, int id, void *ptr, linked_inf *head ) { // prepend a new (id,ptr) node to `head` with no ordering and return the new head
+	linked_inf *l = hl_malloc(&ctx->jit->falloc,sizeof(linked_inf)); // allocated from ctx->jit->falloc; not freed individually here
+	l->id = id;
+	l->ptr = ptr;
+	l->next = head;
+	return l;
+}
+
+// Insert (id,ptr) into a list kept sorted by ascending id, unless that exact pair is already
+// present. Several entries may share an id as long as their ptr differs; a new entry goes in
+// front of the existing entries with the same id. Returns the head of the updated list.
+// Nodes are allocated from ctx->jit->falloc.
+static linked_inf *link_add_sort_unique( emit_ctx *ctx, int id, void *ptr, linked_inf *head ) {
+	// `slot` designates the link that must point at the new node: the local `head`
+	// variable at first, then the `next` field of the last node whose id is smaller.
+	linked_inf **slot = &head;
+	while( *slot != NULL && (*slot)->id < id )
+		slot = &(*slot)->next;
+
+	// walk the run of entries carrying this id, looking for the same pointer
+	for( linked_inf *it = *slot; it != NULL && it->id == id; it = it->next ) {
+		if( it->ptr == ptr )
+			return head;
+	}
+
+	// no duplicate: build the node and splice it in at `slot`
+	linked_inf *node = hl_malloc(&ctx->jit->falloc,sizeof(linked_inf));
+	node->id = id;
+	node->ptr = ptr;
+	node->next = *slot;
+	// writing through `slot` updates the local `head` when the node becomes the first entry
+	*slot = node;
+	return head;
+}
+
+// Insert (id,ptr) into a list kept sorted by ascending id. When an entry with this id already
+// exists, its pointer is overwritten in place and no node is allocated.
+// Returns the head of the updated list.
+static linked_inf *link_add_sort_replace( emit_ctx *ctx, int id, void *ptr, linked_inf *head ) {
+	// advance to the link in front of the first entry whose id is not smaller than `id`
+	linked_inf **slot = &head;
+	while( *slot != NULL && (*slot)->id < id )
+		slot = &(*slot)->next;
+
+	linked_inf *hit = *slot;
+	if( hit != NULL && hit->id == id ) {
+		// same key: only the payload changes
+		hit->ptr = ptr;
+		return head;
+	}
+
+	// new key: splice a fresh node in front of `hit` (which is NULL at the tail)
+	linked_inf *node = hl_malloc(&ctx->jit->falloc,sizeof(linked_inf));
+	node->id = id;
+	node->ptr = ptr;
+	node->next = hit;
+	// writing through `slot` updates the local `head` when the node becomes the first entry
+	*slot = node;
+	return head;
+}
+
+// Looks for `id` in a list sorted by increasing id.
+// Returns the ptr of the first matching cell, or NULL if there is none.
+static void *link_sort_lookup( linked_inf *head, int id ) {
+	linked_inf *it;
+	for( it = head; it; it = it->next ) {
+		if( it->id < id )
+			continue;
+		// first cell that is not smaller : either the one we want or a proof it is missing
+		return it->id == id ? it->ptr : NULL;
+	}
+	return NULL;
+}
+
+// Unlinks the first cell having `id` from a list sorted by increasing id.
+// The cell itself is not released. Returns the (possibly new) head of the list.
+static linked_inf *link_sort_remove( linked_inf *head, int id ) {
+	linked_inf *before = NULL;
+	linked_inf *it;
+	for( it = head; it && it->id < id; it = it->next )
+		before = it;
+	if( !it || it->id != id )
+		return head; // not found
+	if( before == NULL )
+		return it->next; // the head was removed
+	before->next = it->next;
+	return head;
+}
+
+// Gives the emit mode used to hold a value of type `t`.
+// Kinds up to HBOOL go through a table indexed by kind, every other kind is M_PTR.
+static emit_mode hl_type_mode( hl_type *t ) {
+	static emit_mode CONV[] = {
+		M_VOID,
+		M_UI8,
+		M_UI16,
+		M_I32,
+		M_PTR,
+		M_F32,
+		M_F64,
+		// the last entry depends on how wide the C bool is
+		sizeof(bool) == 1 ? M_UI8 : M_I32,
+	};
+	if( t->kind > HBOOL )
+		return M_PTR;
+	return CONV[t->kind];
+}
+
+// Creates a new value bound to the instruction that was just emitted :
+// ctx->values records the position of that instruction (emit_pos-1) and the
+// index of the new entry is the value identifier.
+static ereg new_value( emit_ctx *ctx ) {
+	ereg id = int_arr_count(ctx->values);
+	int_arr_add(ctx->values, ctx->emit_pos-1);
+	return id;
+}
+
+// Returns the scratch buffer shared by all calls to prepare an argument list.
+// Reports an error when more than MAX_TMP_ARGS slots are requested.
+static ereg *get_tmp_args( emit_ctx *ctx, int count ) {
+	if( count <= MAX_TMP_ARGS )
+		return ctx->tmp_args;
+	jit_error("Too many arguments");
+	return ctx->tmp_args;
+}
+
+// Gives the mode of a value : negative identifiers designate a phi, the others
+// the instruction recorded in ctx->values. A null value triggers jit_assert().
+static emit_mode emit_get_mode( emit_ctx *ctx, ereg v ) {
+	if( IS_NULL(v) ) jit_assert();
+	if( v >= 0 ) {
+		int pos = int_arr_get(ctx->values,v);
+		return ctx->instrs[pos].mode;
+	}
+	return GET_PHI(v)->mode;
+}
+
+// Builds the indentation used by the phi debug traces : 3 spaces plus 2 per
+// level of ctx->phi_depth, capped to 19 characters.
+// The result lives in a static buffer overwritten by the next call.
+static const char *phi_prefix( emit_ctx *ctx ) {
+	static char buf[20];
+	int width = 3 + ctx->phi_depth * 2;
+	if( width > 19 )
+		width = 19;
+	memset(buf,' ',width);
+	buf[width] = '\0';
+	return buf;
+}
+
+// Appends a new instruction with opcode `op` and returns it for the caller to fill.
+// The instruction buffer doubles (starting at 256 entries) when it is full.
+// Entries are handed out zeroed : the new part is cleared on growth, and a buffer
+// that is being reused is cleared again by chunks of 256 entries.
+// The returned pointer is only valid until the next call, which might move the buffer.
+static einstr *emit_instr( emit_ctx *ctx, emit_op op ) {
+	if( ctx->emit_pos == ctx->max_instrs ) {
+		int pos = ctx->emit_pos;
+		int next_size = ctx->max_instrs ? (ctx->max_instrs << 1) : 256;
+		einstr *instrs = (einstr*)malloc(sizeof(einstr) * next_size);
+		if( instrs == NULL ) jit_error("Out of memory");
+		// on the very first growth ctx->instrs is still NULL : memcpy must not
+		// be given a NULL pointer, even to copy 0 bytes
+		if( pos > 0 )
+			memcpy(instrs, ctx->instrs, pos * sizeof(einstr));
+		memset(instrs + pos, 0, (next_size - pos) * sizeof(einstr));
+		free(ctx->instrs);
+		ctx->instrs = instrs;
+		ctx->max_instrs = next_size;
+	} else if( (ctx->emit_pos & 0xFF) == 0 )
+		memset(ctx->instrs + ctx->emit_pos, 0, 256 * sizeof(einstr));
+	einstr *e = ctx->instrs + ctx->emit_pos++;
+	e->op = op;
+	return e;
+}
+
+// Emits a STORE of the value `from` at address `to` + `offs`.
+// The instruction mode is the one of the stored value.
+static void emit_store_mem( emit_ctx *ctx, ereg to, int offs, ereg from ) {
+	einstr *st = emit_instr(ctx, STORE);
+	st->a = to;
+	st->b = from;
+	st->size_offs = offs;
+	st->mode = GET_MODE(from);
+}
+
+#define store_args hl_emit_store_args
+// Attaches `count` arguments to the instruction `e`.
+// - no argument : only nargs is set
+// - one argument : it is stored inline in e->size_offs
+// - more : they are copied into ctx->args_data and e->size_offs is their index there
+// hl_emit_get_args is the matching reader.
+void hl_emit_store_args( emit_ctx *ctx, einstr *e, ereg *args, int count ) {
+	if( count < 0 ) jit_assert();
+	// the count is narrowed to an unsigned char below : 255 is the largest
+	// value that survives, 256 would silently become 0
+	if( count > 255 ) jit_error("Too many arguments");
+	e->nargs = (unsigned char)count;
+	if( count == 0 ) return;
+	if( count == 1 ) {
+		e->size_offs = args[0];
+		return;
+	}
+	int *args_data = int_arr_reserve(ctx->args_data, count);
+	e->size_offs = (int)(args_data - ctx->args_data.values);
+	memcpy(args_data, args, sizeof(int) * count);
+}
+
+// Gives access to the arguments attached to `e` by hl_emit_store_args :
+// NULL when there is none, the inline slot (e->size_offs) when there is exactly
+// one, else the position in ctx->args_data designated by e->size_offs.
+ereg *hl_emit_get_args( emit_ctx *ctx, einstr *e ) {
+	switch( e->nargs ) {
+	case 0:
+		return NULL;
+	case 1:
+		return (ereg*)&e->size_offs;
+	default:
+		return (ereg*)(ctx->args_data.values + e->size_offs);
+	}
+}
+
+// Generic emitter : appends an instruction `op` with operands `a` and `b`, the
+// given mode (which must fit in an unsigned char) and size_offs payload.
+// A mode of 0 or M_NORET produces no value (UNUSED is returned), any other mode
+// allocates and returns a new value.
+static ereg emit_gen_ext( emit_ctx *ctx, emit_op op, ereg a, ereg b, int mode, int size_offs ) {
+	einstr *ins = emit_instr(ctx, op);
+	if( (unsigned char)mode != mode ) jit_assert();
+	ins->a = a;
+	ins->b = b;
+	ins->size_offs = size_offs;
+	ins->mode = (unsigned char)mode;
+	if( mode == 0 || mode == M_NORET )
+		return UNUSED;
+	return new_value(ctx);
+}
+
+// Same as emit_gen_ext with a size_offs payload of 0.
+static ereg emit_gen( emit_ctx *ctx, emit_op op, ereg a, ereg b, int mode ) {
+	const int no_size_offs = 0;
+	return emit_gen_ext(ctx,op,a,b,mode,no_size_offs);
+}
+
+// Emits an instruction that has no operand but a size_offs payload.
+// Only ALLOC_STACK produces a value (a pointer), other opcodes return UNUSED.
+static ereg emit_gen_size( emit_ctx *ctx, emit_op op, int size_offs ) {
+	int mode = 0;
+	if( op == ALLOC_STACK )
+		mode = M_PTR;
+	return emit_gen_ext(ctx,op,UNUSED,UNUSED,mode,size_offs);
+}
+
+// Overwrites the mode of the instruction that was emitted last.
+static void patch_instr_mode( emit_ctx *ctx, int mode ) {
+	einstr *last = ctx->instrs + (ctx->emit_pos - 1);
+	last->mode = (unsigned char)mode;
+}
+
+// Creates a phi attached to block `b` for the virtual register `r` (which can be NULL).
+// The phi gets the next negative identifier (-1, -2, ...), is added to the phis
+// of the block and registered in ctx->phis so GET_PHI can find it.
+// The ctx->phis table doubles (starting at 64 entries) when it is full.
+static tmp_phi *alloc_phi( emit_ctx *ctx, emit_block *b, vreg *r ) {
+	if( ctx->phi_count == ctx->max_phis ) {
+		int new_size = ctx->max_phis ? ctx->max_phis << 1 : 64;
+		tmp_phi **phis = (tmp_phi**)malloc(sizeof(tmp_phi*) * new_size);
+		if( phis == NULL ) jit_error("Out of memory");
+		// on the very first growth ctx->phis is still NULL : memcpy must not
+		// be given a NULL pointer, even to copy 0 bytes
+		if( ctx->phi_count > 0 )
+			memcpy(phis, ctx->phis, sizeof(tmp_phi*) * ctx->phi_count);
+		free(ctx->phis);
+		ctx->phis = phis;
+		ctx->max_phis = new_size;
+	}
+	tmp_phi *p = (tmp_phi*)hl_zalloc(&ctx->jit->falloc, sizeof(tmp_phi));
+	p->b = b;
+	p->r = r;
+	if( r ) p->mode = hl_type_mode(r->t);
+	p->value = -(++ctx->phi_count);
+	phi_add(b->phis,p);
+	GET_PHI(p->value) = p;
+	return p;
+}
+
+// Creates a new block starting at the current emit position and appends it to
+// ctx->blocks, its id being its index there. Every block but the first one
+// (id 0) also gets a BLOCK instruction carrying its id.
+static emit_block *alloc_block( emit_ctx *ctx ) {
+	emit_block *blk = hl_zalloc(&ctx->jit->falloc, sizeof(emit_block));
+	blk->start_pos = ctx->emit_pos;
+	blk->id = blocks_count(ctx->blocks);
+	blocks_add(ctx->blocks, blk);
+	if( blk->id > 0 )
+		emit_gen_size(ctx, BLOCK, blk->id);
+	return blk;
+}
+
+// Records the edge p -> b : `p` is added to the predecessors of `b` and `b` to
+// the successors of `p`. Nothing is done if `p` is already a predecessor of `b`.
+static void block_add_pred( emit_ctx *ctx, emit_block *b, emit_block *p ) {
+	for_iter(blocks,known,b->preds) {
+		if( known == p )
+			return;
+	}
+	blocks_add(b->preds,p);
+	blocks_add(p->nexts,b);
+	emit_debug("  PRED #%d\n",p->id);
+}
+
+// Declares that, in block `b`, the virtual register `r` now holds the value `v`.
+// When `v` is a phi (negative), the block is also added to the list of blocks
+// referencing that phi.
+static void store_block_var( emit_ctx *ctx, emit_block *b, vreg *r, ereg v ) {
+	if( IS_NULL(v) ) jit_assert();
+	vreg_replace(b->written_vars,r->id,v);
+	if( v >= 0 )
+		return;
+	tmp_phi *phi = GET_PHI(v);
+	phi->ref_blocks = link_add_sort_unique(ctx,b->id,b,phi->ref_blocks);
+}
+
+// Ends the current block at the current emit position and makes a new, already
+// sealed, block the current one.
+// Returns false without doing anything when the current block starts right at the
+// previously emitted instruction (start_pos == emit_pos-1), true otherwise.
+static bool split_block( emit_ctx *ctx ) {
+	if( ctx->current_block->start_pos == ctx->emit_pos-1 )
+		return false;
+	emit_block *b = alloc_block(ctx);
+	b->sealed = true;
+	emit_debug("BLOCK #%d@%X[%X]\n",b->id,b->start_pos,ctx->op_pos);
+	// consume the arrival points registered for the current opcode : the blocks
+	// they were registered from become predecessors of the new block
+	while( ctx->arrival_points && ctx->arrival_points->id == ctx->op_pos ) {
+		block_add_pred(ctx, b, (emit_block*)ctx->arrival_points->ptr);
+		ctx->arrival_points = ctx->arrival_points->next;
+	}
+	// look at the last instruction before the new block : unless it always leaves
+	// (JUMP, JUMP_TABLE, RET or a M_NORET instruction) the previous block falls through
+	einstr *eprev = &ctx->instrs[b->start_pos-1];
+	if( eprev->op != JUMP && eprev->op != JUMP_TABLE && eprev->op != RET && eprev->mode != M_NORET )
+		block_add_pred(ctx, b, ctx->current_block);
+	ctx->current_block->end_pos = b->start_pos;
+	ctx->current_block = b;
+	return true;
+}
+
+// Registers the current block as arriving at opcode op_pos + 1 + offs.
+// A zero offset is ignored when the current block starts right at the previously
+// emitted instruction.
+static void add_jump_target( emit_ctx *ctx, int offs ) {
+	bool block_just_opened = ctx->current_block->start_pos == ctx->emit_pos-1;
+	if( offs == 0 && block_just_opened )
+		return;
+	ctx->arrival_points = link_add_sort_unique(ctx, offs + ctx->op_pos + 1, ctx->current_block, ctx->arrival_points);
+}
+
+// Emits a forward jump (JCOND when `cond` is set, JUMP otherwise) whose target
+// is not known yet, then splits the current block.
+// Returns the position of the jump instruction, to be given to patch_jump.
+static int emit_jump( emit_ctx *ctx, bool cond ) {
+	int jpos = ctx->emit_pos;
+	emit_op op = cond ? JCOND : JUMP;
+	emit_gen(ctx, op, UNUSED, UNUSED, 0);
+	if( !cond )
+		add_jump_target(ctx, 0);
+	split_block(ctx);
+	return jpos;
+}
+
+// Makes the jump emitted at `jpos` (see emit_jump) land at the current emit position
+// and links the block that contains the jump to the block it lands in.
+static void patch_jump( emit_ctx *ctx, int jpos ) {
+	emit_block *b = NULL;
+	// find the block where the initial jump was : the last one starting at or before jpos
+	for_iter_back(blocks,b2,ctx->blocks) {
+		if( b2->start_pos <= jpos ) {
+			b = b2;
+			break;
+		}
+	}
+	// the jump is expected to belong to an earlier block than the current one
+	if( !b || b == ctx->current_block ) jit_assert();
+	// patch opcode
+	// when the current block was just opened, the jump lands on the instruction
+	// at emit_pos-1 (the one that opened the block) instead of after it
+	bool after_block = ctx->current_block->start_pos == ctx->emit_pos-1;
+	// the offset is relative to the instruction following the jump
+	ctx->instrs[jpos].size_offs = ctx->emit_pos - (after_block?1:0) - (jpos + 1);
+	if( after_block ) {
+		block_add_pred(ctx, ctx->current_block, b);
+	} else {
+		// a jump needs to land at the start of a block
+		if( !split_block(ctx) ) jit_assert();
+	}
+}
+
+// Remembers that the jump instruction at `jpos` targets the opcode
+// op_pos + 1 + offs : the (jpos,target) pair is appended to ctx->jump_regs and
+// resolved later by hl_emit_remap_jumps. Forward jumps also register an arrival point.
+static void register_jump( emit_ctx *ctx, int jpos, int offs ) {
+	int_arr_add(ctx->jump_regs, jpos);
+	int_arr_add(ctx->jump_regs, offs + ctx->op_pos + 1);
+	if( offs > 0 )
+		add_jump_target(ctx, offs);
+}
+
+// Emits a LOAD_CONST of `value`, the mode being derived from the type `size_t`.
+// Returns the new value.
+static ereg emit_load_const( emit_ctx *ctx, uint64 value, hl_type *size_t ) {
+	einstr *ins = emit_instr(ctx, LOAD_CONST);
+	ins->mode = hl_type_mode(size_t);
+	ins->value = value;
+	return new_value(ctx);
+}
+
+// Emits a LOAD_ADDR reading at address `v` + `offset`.
+// The mode of the result comes from `to_t`, the mode of the memory access comes
+// from `size_t` and is kept in the nargs field. Returns the new value.
+static ereg emit_load_mem( emit_ctx *ctx, ereg v, int offset, hl_type *size_t, hl_type *to_t ) {
+	einstr *ld = emit_instr(ctx, LOAD_ADDR);
+	ld->a = v;
+	ld->size_offs = offset;
+	ld->nargs = hl_type_mode(size_t);
+	ld->mode = hl_type_mode(to_t);
+	return new_value(ctx);
+}
+
+// Binds the value `v` to the virtual register `to` in the current block.
+// Registers of type void are ignored.
+// Outside of any trap the value is also remembered in to->stored. Inside a trap,
+// if the register already has a stored value, the new value is written to the
+// address of that stored value instead.
+static void emit_store_reg( emit_ctx *ctx, vreg *to, ereg v ) {
+	if( to->t->kind == HVOID )
+		return;
+	if( IS_NULL(v) ) jit_assert();
+	store_block_var(ctx,ctx->current_block,to,v);
+	if( ctx->trap_count <= 0 ) {
+		to->stored = v;
+		return;
+	}
+	// the register was never written before the trap : nothing to update
+	if( IS_NULL(to->stored) )
+		return;
+	ereg addr = emit_gen(ctx,ADDRESS,to->stored,UNUSED,M_PTR);
+	STORE_MEM(addr, 0, v);
+}
+
+// Emits a CALL_PTR to the native address `native_ptr` with the given arguments.
+// A NULL `ret` marks the call as M_NORET. Returns the new value holding the
+// result, or UNUSED when `ret` is NULL or void.
+static ereg emit_native_call( emit_ctx *ctx, void *native_ptr, ereg args[], int nargs, hl_type *ret ) {
+	einstr *call = emit_instr(ctx, CALL_PTR);
+	call->mode = (unsigned char)(ret ? hl_type_mode(ret) : M_NORET);
+	call->value = (int_val)native_ptr;
+	store_args(ctx, call, args, nargs);
+	if( ret == NULL || call->mode == M_VOID )
+		return UNUSED;
+	return new_value(ctx);
+}
+
+// Emits a CALL_REG, a call whose target is the value `f`, with the given arguments.
+// Returns the new value holding the result, or UNUSED for a void return type.
+static ereg emit_dyn_call( emit_ctx *ctx, ereg f, ereg args[], int nargs, hl_type *ret ) {
+	einstr *call = emit_instr(ctx, CALL_REG);
+	call->a = f;
+	call->mode = hl_type_mode(ret);
+	store_args(ctx, call, args, nargs);
+	if( call->mode == M_VOID )
+		return UNUSED;
+	return new_value(ctx);
+}
+
+// Emits a TEST of the value `v` for the condition `o` (kept in size_offs).
+// The instruction produces no value, its mode is then set to the mode of `v`.
+static void emit_test( emit_ctx *ctx, ereg v, hl_op o ) {
+	emit_gen_ext(ctx, TEST, v, UNUSED, 0, o);
+	emit_mode m = GET_MODE(v);
+	patch_instr_mode(ctx, m);
+}
+
+// Emits a CMP of `a` against `b` for the condition `o` (kept in size_offs).
+// The instruction produces no value, its mode is then set to the mode of `a`.
+static void emit_cmp( emit_ctx *ctx, ereg a, ereg b, hl_op o ) {
+	emit_gen_ext(ctx, CMP, a, b, 0, o);
+	emit_mode m = GET_MODE(a);
+	patch_instr_mode(ctx, m);
+}
+
+// Removes `v` from the incoming values of the phi `p` and traces it.
+static void phi_remove_val( emit_ctx *ctx, tmp_phi *p, ereg v ) {
+	ereg_remove(&p->vals,v);
+	emit_debug("%sPHI-REM-DEP %s = %s\n",
+		phi_prefix(ctx),
+		val_str(p->value,p->mode),
+		val_str(v,p->mode));
+}
+
+// Adds `v`, coming from block `from`, to the incoming values of the phi `p`.
+// A phi never lists itself, and nothing more is done when ereg_add reports
+// that it did not add the value. When `v` is itself a phi, `p` is recorded
+// among the phis referencing it.
+static void phi_add_val( emit_ctx *ctx, tmp_phi *p, ereg v, emit_block *from ) {
+	if( !p->b ) jit_assert();
+	if( IS_NULL(v) ) jit_assert();
+	if( p->value == v || !ereg_add(p->vals,v,from) )
+		return;
+	emit_debug("%sPHI-DEP %s:#%d = %s\n", phi_prefix(ctx), val_str(p->value,p->mode), from->id, val_str(v,p->mode));
+	if( v >= 0 )
+		return;
+	tmp_phi *dep = GET_PHI(v);
+	phi_add(dep->ref_phis,p);
+}
+
+// Tries to get rid of the phi `p` when it is trivial, that is when all its
+// incoming values, itself excluded, are one single value.
+// Returns p->value when the phi has to be kept (several distinct incoming values,
+// or none yet), else the value that replaces it.
+// When the phi is replaced, the blocks and phis referencing it are updated and
+// the referencing phis are optimized in turn, since they might have become trivial.
+static ereg optimize_phi_rec( emit_ctx *ctx, tmp_phi *p ) {
+
+	if( p->locked ) jit_assert();
+	// look for the unique incoming value that is not the phi itself
+	ereg same = UNUSED;
+	for_iter_key(ereg,v,p->vals) {
+		if( v == same || v == p->value )
+			continue;
+		if( !IS_NULL(same) )
+			return p->value;
+		same = v;
+	}
+	if( IS_NULL(same) )
+		return p->value; // sealed (no dep yet)
+
+	// nobody references the phi : there is nothing to rewrite
+	if( !phi_count(p->ref_phis) && !p->ref_blocks )
+		return same;
+
+	if( p->locked || p->opt ) jit_assert();
+
+	emit_debug("%sPHI-OPT %s = %s\n", phi_prefix(ctx), val_str(p->value,p->mode), val_str(same,p->mode));
+	p->opt = true;
+	ctx->phi_depth++;
+	// blocks in which the register still maps to this phi now map to `same`
+	linked_inf *l = p->ref_blocks;
+	while( l ) {
+		emit_block *b = (emit_block*)l->ptr;
+		if( vreg_find(b->written_vars,p->r->id) == p->value )
+			store_block_var(ctx,b,p->r,same);
+		l = l->next;
+	}
+	// phis having this phi as incoming value get `same` instead, for the same source block
+	for_iter(phi,p2,p->ref_phis) {
+		emit_block *bsame = ereg_find(p2->vals,p->value);
+		phi_remove_val(ctx,p2,p->value);
+		phi_add_val(ctx,p2,same,bsame);
+	}
+	p->ref_blocks = NULL;
+	// the list of referencing phis is detached from `p` before recursing on them
+	int count = phi_count(p->ref_phis);
+	tmp_phi **phis = phi_free(&p->ref_phis);
+	for(int i=0;i<count;i++)
+		optimize_phi_rec(ctx, phis[i]);
+	ctx->phi_depth--;
+	emit_debug("%sPHI-OPT-DONE %s = %s\n", phi_prefix(ctx), val_str(p->value,p->mode), val_str(same,p->mode));
+	// run again now that the references were rewritten
+	return optimize_phi_rec(ctx,p);
+}
+
+static ereg emit_load_reg_block( emit_ctx *ctx, emit_block *b, vreg *r );
+
+// Fills the incoming values of the phi `p` : one per predecessor of its block,
+// being the value of the phi register in that predecessor (or the phi itself
+// when it has no register). The phi is locked while doing so, then optimized.
+// Returns the result of optimize_phi_rec.
+static ereg gather_phis( emit_ctx *ctx, tmp_phi *p ) {
+	p->locked = true;
+	for_iter(blocks,pred,p->b->preds) {
+		ereg incoming;
+		if( p->r )
+			incoming = emit_load_reg_block(ctx, pred, p->r);
+		else
+			incoming = p->value;
+		phi_add_val(ctx, p, incoming, pred);
+	}
+	p->locked = false;
+	return optimize_phi_rec(ctx, p);
+}
+
+// Gives the value held by the virtual register `r` when block `b` is executing.
+// If the block did not write the register, the value is searched in its
+// predecessors, creating a phi where several paths meet. The result is
+// cached in the block so the search is done once.
+static ereg emit_load_reg_block( emit_ctx *ctx, emit_block *b, vreg *r ) {
+	ereg v = vreg_find(b->written_vars,r->id);
+	if( !IS_NULL(v) )
+		return v;
+	if( !b->sealed ) {
+		// the block is not sealed yet : use a phi with no incoming value for now,
+		// seal_block will fill it through gather_phis
+		tmp_phi *p = alloc_phi(ctx,b,r);
+		emit_debug("%sPHI-SEALED %s = R%d\n",phi_prefix(ctx),val_str(p->value,p->mode),r->id);
+		v = p->value;
+	} else if( blocks_count(b->preds) == 1 )
+		// single predecessor : no phi needed, the value is the one of the predecessor
+		v = emit_load_reg_block(ctx, blocks_get(b->preds,0), r);
+	else {
+		tmp_phi *p = alloc_phi(ctx,b,r);
+		// the phi is stored before looking at the predecessors so a lookup coming
+		// back to this block finds it instead of recursing again
+		store_block_var(ctx,b,r,p->value);
+		v = gather_phis(ctx, p);
+	}
+	store_block_var(ctx,b,r,v);
+	return v;
+}
+
+// Gives the value held by the virtual register `r` in the current block.
+static ereg emit_load_reg( emit_ctx *ctx, vreg *r ) {
+	emit_block *cur = ctx->current_block;
+	return emit_load_reg_block(ctx, cur, r);
+}
+
+// Seals the block `b` : the incoming values of each of its phis are gathered,
+// then the block is flagged as sealed.
+static void seal_block( emit_ctx *ctx, emit_block *b ) {
+	emit_debug("  SEAL #%d\n",b->id);
+	for_iter(phi,pending,b->phis) {
+		gather_phis(ctx, pending);
+	}
+	b->sealed = true;
+}
+
+// Emits a CALL_FUN to the function designated by the index `findex` with the
+// given arguments. Returns the new value holding the result, or UNUSED when
+// `mode` is M_VOID.
+static ereg emit_call_fid( emit_ctx *ctx, int findex, ereg *args, int nargs, emit_mode mode ) {
+	einstr *call = emit_instr(ctx, CALL_FUN);
+	call->a = findex;
+	call->mode = mode;
+	store_args(ctx, call, args, nargs);
+	if( mode == M_VOID )
+		return UNUSED;
+	return new_value(ctx);
+}
+
+// Emits the call of the function `findex` with the `count` registers listed in
+// `args_regs` as arguments, and stores the result in `dst`.
+// Functions whose index in the module is past the bytecode functions are natives
+// and are called through their pointer, the others through CALL_FUN.
+static void emit_call_fun( emit_ctx *ctx, vreg *dst, int findex, int count, int *args_regs ) {
+	hl_module *m = ctx->mod;
+	ereg *args = get_tmp_args(ctx, count);
+	int i;
+	for( i = 0; i < count; i++ )
+		args[i] = LOAD(R(args_regs[i]));
+	if( m->functions_indexes[findex] < m->code->nfunctions ) {
+		ereg out = emit_call_fid(ctx,findex,args,count,hl_type_mode(dst->t));
+		if( out ) STORE(dst, out);
+		return;
+	}
+	STORE(dst, emit_native_call(ctx, m->functions_ptrs[findex], args, count, dst->t));
+}
+
+// Allocates, in the module allocator, a closure without bound value for the function `fid`.
+// - native : the closure directly gets the native pointer
+// - bytecode function : the closure temporarily keeps `fid` as its `fun` and is
+//   chained through its `value` field into ctx->closure_list ; hl_emit_final
+//   walks that list to set the final pointers.
+static vclosure *alloc_static_closure( emit_ctx *ctx, int fid ) {
+	hl_module *m = ctx->mod;
+	vclosure *c = hl_malloc(&m->ctx.alloc,sizeof(vclosure));
+	int fidx = m->functions_indexes[fid];
+	bool is_native = fidx >= m->code->nfunctions;
+	c->hasValue = 0;
+	if( !is_native ) {
+		c->t = m->code->functions[fidx].type;
+		c->fun = (void*)(int_val)fid;
+		c->value = ctx->closure_list;
+		ctx->closure_list = c;
+		return c;
+	}
+	c->t = m->code->natives[fidx - m->code->nfunctions].t;
+	c->fun = m->functions_ptrs[fid];
+	c->value = NULL;
+	return c;
+}
+
+// Selects the runtime helper reading a dynamic field for a value of type `t`.
+static void *get_dynget( hl_type *t ) {
+	void *getter;
+	switch( t->kind ) {
+	case HBOOL:
+	case HUI8:
+	case HUI16:
+	case HI32:
+		getter = hl_dyn_geti;
+		break;
+	case HGUID:
+	case HI64:
+		getter = hl_dyn_geti64;
+		break;
+	case HF64:
+		getter = hl_dyn_getd;
+		break;
+	case HF32:
+		getter = hl_dyn_getf;
+		break;
+	default:
+		getter = hl_dyn_getp;
+		break;
+	}
+	return getter;
+}
+
+// Selects the runtime helper writing a dynamic field for a value of type `t`.
+static void *get_dynset( hl_type *t ) {
+	void *setter;
+	switch( t->kind ) {
+	case HBOOL:
+	case HUI8:
+	case HUI16:
+	case HI32:
+		setter = hl_dyn_seti;
+		break;
+	case HGUID:
+	case HI64:
+		setter = hl_dyn_seti64;
+		break;
+	case HF64:
+		setter = hl_dyn_setd;
+		break;
+	case HF32:
+		setter = hl_dyn_setf;
+		break;
+	default:
+		setter = hl_dyn_setp;
+		break;
+	}
+	return setter;
+}
+
+// Selects the runtime helper casting a dynamic value to the type `t`.
+static void *get_dyncast( hl_type *t ) {
+	void *caster;
+	switch( t->kind ) {
+	case HBOOL:
+	case HUI8:
+	case HUI16:
+	case HI32:
+		caster = hl_dyn_casti;
+		break;
+	case HGUID:
+	case HI64:
+		caster = hl_dyn_casti64;
+		break;
+	case HF64:
+		caster = hl_dyn_castd;
+		break;
+	case HF32:
+		caster = hl_dyn_castf;
+		break;
+	default:
+		caster = hl_dyn_castp;
+		break;
+	}
+	return caster;
+}
+
+// Emits the copy of `total_size` bytes from src+src_offset to dst+dst_offset as
+// a sequence of load/store pairs : word sized while at least HL_WSIZE bytes
+// remain, then 32 bits while at least 4 bytes remain, then byte per byte.
+static void emit_store_size( emit_ctx *ctx, ereg dst, int dst_offset, ereg src, int src_offset, int total_size ) {
+	int offset;
+	for( offset = 0; offset < total_size; ) {
+		int remain = total_size - offset;
+		hl_type *ct;
+		if( remain >= HL_WSIZE )
+			ct = &hlt_bytes;
+		else if( remain >= 4 )
+			ct = &hlt_i32;
+		else
+			ct = &hlt_ui8;
+		ereg chunk = LOAD_MEM(src,src_offset+offset,ct);
+		STORE_MEM(dst, dst_offset+offset, chunk);
+		offset += hl_type_size(ct);
+	}
+}
+
+
+// Emits the conversion of `v` from mode `from` to mode `to` and returns the new value.
+// Identical modes (signed case) give a MOV. CONV_UNSIGNED is only used for an
+// unsigned conversion between an integer and a float mode, CONV in all other
+// cases. The source mode is kept in size_offs.
+static ereg emit_conv( emit_ctx *ctx, ereg v, emit_mode from, emit_mode to, bool _unsigned ) {
+	if( from == to && !_unsigned )
+		return emit_gen(ctx,MOV,v,UNUSED,to);
+	emit_op op = CONV;
+	if( _unsigned && IS_FLOAT(from) != IS_FLOAT(to) )
+		op = CONV_UNSIGNED;
+	return emit_gen_ext(ctx, op, v, UNUSED, to, from);
+}
+
+// Tells if the dynamic helper selected for the type `t` takes the type as an
+// extra argument (see emit_dyn_cast) : true for every kind but f32, f64, i64 and guid.
+static bool dyn_need_type( hl_type *t ) {
+	switch( t->kind ) {
+	case HF32:
+	case HF64:
+	case HI64:
+	case HGUID:
+		return false;
+	default:
+		return true;
+	}
+}
+
+// Emits the cast of the value `v` of type `t` into the register `dst`.
+// A null<T> casted to a type of the same kind as T is handled inline : the
+// result is the constant 0 when `v` is null, else the payload read at HDYN_VALUE.
+// Every other cast calls the runtime helper given by get_dyncast with the
+// address of a stack slot holding `v`, the source type, and for some kinds
+// (see dyn_need_type) the destination type.
+static void emit_dyn_cast( emit_ctx *ctx, ereg v, hl_type *t, vreg *dst ) {
+	hl_type *dt = dst->t;
+	if( t->kind == HNULL && t->tparam->kind == dt->kind ) {
+		emit_test(ctx, v, OJNotNull);
+		int jnot = emit_jump(ctx, true);
+		// null : default value
+		STORE(dst, LOAD_CONST(0,dt));
+		int jend = emit_jump(ctx, false);
+		patch_jump(ctx, jnot);
+		// not null : unwrap
+		STORE(dst, LOAD_MEM(v,HDYN_VALUE,dt));
+		patch_jump(ctx, jend);
+		return;
+	}
+	ereg st = emit_gen_size(ctx, ALLOC_STACK, HL_WSIZE);
+	STORE_MEM(st, 0, v);
+	ereg args[3];
+	int nargs = 2;
+	args[0] = st;
+	args[1] = LOAD_CONST_PTR(t);
+	if( dyn_need_type(dt) ) {
+		args[2] = LOAD_CONST_PTR(dt);
+		nargs = 3;
+	}
+	STORE(dst, emit_native_call(ctx, get_dyncast(dt), args, nargs, dt));
+}
+
+static void emit_opcode( emit_ctx *ctx, hl_opcode *o );
+
+// Rewrites `*r` when it designates a phi, following the result of hl_emit_clean_phis :
+// - a phi that was dropped (final_id < 0) is replaced by its target, following
+//   the chain of targets while they are dropped phis too
+// - a phi that was kept and renumbered (final_id > 0) becomes -final_id
+// - a phi whose final_id is 0 is left as it is
+// Non phi values (positive or null) are not touched.
+static void remap_phi_reg( emit_ctx *ctx, ereg *r ) {
+	ereg cur = *r;
+	if( cur >= 0 || IS_NULL(cur) )
+		return;
+	tmp_phi *p;
+	for( p = GET_PHI(cur); p->final_id < 0; p = GET_PHI(p->target) ) {
+		if( p->target >= 0 ) {
+			*r = p->target;
+			return;
+		}
+	}
+	if( p->final_id != 0 )
+		*r = -p->final_id; // new phis
+}
+
+// Converts the temporary block `b` into its final form jit->blocks[b->id] :
+// positions, ids of the predecessors and successors, and the phis that were kept
+// (final_id >= 0) with their incoming values and source blocks.
+// All the arrays are taken from the jit `falloc` allocator.
+static void emit_write_block( emit_ctx *ctx, emit_block *b ) {
+	jit_ctx *jit = ctx->jit;
+	eblock *bl = jit->blocks + b->id;
+	bl->start_pos = b->id == 0 ? 0 : b->start_pos;
+	bl->end_pos = b->end_pos;
+	bl->pred_count = blocks_count(b->preds);
+	bl->next_count = blocks_count(b->nexts);
+	bl->preds = (int*)hl_malloc(&jit->falloc,sizeof(int)*bl->pred_count);
+	bl->nexts = (int*)hl_malloc(&jit->falloc,sizeof(int)*bl->next_count);
+	for(int i=0;i<bl->pred_count;i++)
+		bl->preds[i] = blocks_get(b->preds,i)->id;
+	for(int i=0;i<bl->next_count;i++)
+		bl->nexts[i] = blocks_get(b->nexts,i)->id;
+	// write phis
+	// first pass : count the phis that are kept (bl->phi_count is incremented
+	// from its current value ; hl_emit_flush allocates jit->blocks zeroed)
+	{
+		for_iter(phi,p,b->phis)
+			if( p->final_id >= 0 )
+				bl->phi_count++;
+	}
+	bl->phis = (ephi*)hl_zalloc(&jit->falloc,sizeof(ephi)*bl->phi_count);
+	jit->phi_count += bl->phi_count;
+	// second pass : fill them
+	int i = 0;
+	for_iter(phi,p,b->phis) {
+		if( p->final_id < 0 )
+			continue;
+		ephi *p2 = bl->phis + i++;
+		// a phi that was renumbered is designated by -final_id, else it keeps its value
+		if( p->final_id == 0 )
+			p2->value = p->value;
+		else
+			p2->value = -p->final_id;
+		p2->mode = p->mode;
+		p2->nvalues = ereg_count(p->vals);
+		p2->values = (ereg*)hl_malloc(&jit->falloc,sizeof(ereg)*p2->nvalues);
+		p2->blocks = (ereg*)hl_malloc(&jit->falloc,sizeof(int)*p2->nvalues);
+		// incoming values, remapped since they can designate dropped or renumbered phis
+		int k = 0;
+		for_iter_key(ereg,v,p->vals) {
+			remap_phi_reg(ctx, &v);
+			p2->values[k++] = v;
+		}
+		// id of the block each incoming value comes from, in the same order
+		k = 0;
+		for_iter(ereg,bfrom,p->vals)
+			p2->blocks[k++] = bfrom->id;
+	}
+}
+
+// Resolves the jumps registered in `_jumps` (an int_arr of (instruction position,
+// target opcode) pairs) now that `pos_map` gives the instruction position of every opcode.
+// The offset written is relative to the instruction following the jump.
+// For a JUMP_TABLE each argument is an offset relative to the registered target
+// and is replaced in place. The array is reset once done.
+void hl_emit_remap_jumps( emit_ctx *ctx, void *_jumps, einstr *instrs, int *pos_map ) {
+	int_arr jumps = *(int_arr*)_jumps;
+	int total = int_arr_count(jumps);
+	for( int i = 0; i < total; i += 2 ) {
+		int pos = int_arr_get(jumps,i);
+		int target = int_arr_get(jumps,i + 1);
+		einstr *e = instrs + pos;
+		if( e->op != JUMP_TABLE ) {
+			e->size_offs = pos_map[target] - (pos + 1);
+			continue;
+		}
+		int *args = (int*)hl_emit_get_args(ctx, e);
+		for(int k=0;k<e->nargs;k++)
+			args[k] = pos_map[target + args[k]] - (pos + 1);
+	}
+	int_arr_reset((int_arr*)_jumps);
+}
+
+// Finishes the emission of the current function, once : the position map and the
+// last block are closed, the jumps are resolved, then the instructions, values
+// and blocks are published in the jit context.
+void hl_emit_flush( jit_ctx *jit ) {
+	emit_ctx *ctx = jit->emit;
+	if( ctx->flushed )
+		return;
+	ctx->flushed = true;
+	// close what is still open and resolve the jumps
+	ctx->current_block->end_pos = ctx->emit_pos;
+	ctx->pos_map[ctx->fun->nops] = ctx->emit_pos;
+	hl_emit_remap_jumps(ctx,&ctx->jump_regs, ctx->instrs, ctx->pos_map);
+	// instructions and values
+	jit->emit_pos_map = ctx->pos_map;
+	jit->instrs = ctx->instrs;
+	jit->instr_count = ctx->emit_pos;
+	jit->values_writes = ctx->values.values;
+	jit->value_count = int_arr_count(ctx->values);
+	// blocks, emit_write_block accumulates the phis into jit->phi_count
+	jit->phi_count = 0;
+	jit->block_count = ctx->current_block->id + 1;
+	jit->blocks = hl_zalloc(&jit->falloc,sizeof(eblock) * jit->block_count);
+	for_iter(blocks,b,ctx->blocks)
+		emit_write_block(ctx,b);
+}
+
+// Calls `iter_reg(ctx, &reg)` on every value read by the instruction `e`, the
+// callback being allowed to rewrite it in place.
+// - calls : the target for CALL_REG, then every argument
+// - LOAD_CONST / PUSH_CONST : nothing
+// - others : `a` if it is set, then `b` if it is set too
+void hl_emit_reg_iter( jit_ctx *jit, einstr *e, void *ctx, void (*iter_reg)( void *, ereg * ) ) {
+	switch( e->op ) {
+	case LOAD_CONST:
+	case PUSH_CONST:
+		return;
+	case CALL_REG:
+		iter_reg(ctx,&e->a);
+		// fallthrough
+	case CALL_FUN:
+	case CALL_PTR:
+		{
+			ereg *args = hl_emit_get_args(jit->emit, e);
+			int k;
+			for( k = 0; k < e->nargs; k++ )
+				iter_reg(ctx, args + k);
+		}
+		return;
+	default:
+		break;
+	}
+	if( IS_NULL(e->a) )
+		return;
+	iter_reg(ctx,&e->a);
+	if( !IS_NULL(e->b) )
+		iter_reg(ctx,&e->b);
+}
+
+// Gives pointers to the `a` / `b` operands read by the instruction `e` (`b` being
+// only looked at when `a` is set), their number being written to `*count`.
+// Constants have none, and this function must not be used on calls (jit_assert).
+// The result is a static array overwritten by the next call.
+ereg **hl_emit_get_regs( einstr *e, int *count ) {
+	static ereg *tmp[2];
+	int n = 0;
+	switch( e->op ) {
+	case LOAD_CONST:
+	case PUSH_CONST:
+		break;
+	case CALL_REG:
+	case CALL_FUN:
+	case CALL_PTR:
+		jit_assert();
+		break;
+	default:
+		if( IS_NULL(e->a) )
+			break;
+		tmp[n++] = &e->a;
+		if( !IS_NULL(e->b) )
+			tmp[n++] = &e->b;
+		break;
+	}
+	*count = n;
+	return tmp;
+}
+
+// Adapter giving remap_phi_reg the exact callback signature expected by
+// hl_emit_reg_iter, so the function is never called through an incompatible
+// function pointer type.
+static void remap_phi_reg_iter( void *ctx, ereg *r ) {
+	remap_phi_reg((emit_ctx*)ctx, r);
+}
+
+// Final pass over the phis of the function :
+// 1. every phi is optimized again and its `target` is set to what it resolves
+//    to, following the chain while the result is another phi
+// 2. phis that resolve to themselves are kept and renumbered from 1 (final_id),
+//    the others are flagged as dropped (final_id = -1)
+// 3. the operands of every instruction are remapped accordingly
+static void hl_emit_clean_phis( emit_ctx *ctx ) {
+	for(int i=0;i<ctx->phi_count;i++) {
+		tmp_phi *p = ctx->phis[i];
+		tmp_phi *cur = p;
+		ereg r;
+		while( true ) {
+			// allow the phi to be optimized again
+			cur->opt = false;
+			r = optimize_phi_rec(ctx,cur);
+			// stop on a non phi value or on a phi that has to be kept
+			if( r >= 0 || r == cur->value ) break;
+			cur = GET_PHI(r);
+		}
+		p->target = r;
+	}
+	int new_phis = 0;
+	for(int i=0;i<ctx->phi_count;i++) {
+		tmp_phi *p = ctx->phis[i];
+		if( p->target == p->value )
+			p->final_id = ++new_phis;
+		else
+			p->final_id = -1;
+	}
+	for(int i=0;i<ctx->emit_pos;i++)
+		hl_emit_reg_iter(ctx->jit, ctx->instrs + i, ctx, remap_phi_reg_iter);
+}
+
+// Emits the instructions for the function jit->fun :
+// the per function state of the emitter is reset, the arguments are loaded in the
+// first registers, every opcode is translated by emit_opcode (blocks being split
+// where jumps arrive), then the phis are cleaned and the result is flushed to
+// the jit context.
+void hl_emit_function( jit_ctx *jit ) {
+	emit_ctx *ctx = jit->emit;
+	hl_function *f = jit->fun;
+	int i;
+	// reset the per function state
+	ctx->mod = jit->mod;
+	ctx->fun = f;
+	ctx->emit_pos = 0;
+	ctx->trap_count = 0;
+	ctx->phi_count = 0;
+	ctx->flushed = false;
+	int_arr_free(&ctx->args_data);
+	int_arr_free(&ctx->jump_regs);
+	int_arr_free(&ctx->values);
+	blocks_free(&ctx->blocks);
+	// the first entry of the values is a placeholder, so real values start at 1
+	int_arr_add(ctx->values,-1);
+	ctx->current_block = alloc_block(ctx);
+	ctx->current_block->sealed = true;
+	ctx->arrival_points = NULL;
+	emit_debug("---- begin [%X] ----\n",f->findex);
+	// grow the virtual registers table if this function has more registers than
+	// any previous one, the ids being set once at allocation
+	if( f->nregs > ctx->max_regs ) {
+		free(ctx->vregs);
+		ctx->vregs = (vreg*)malloc(sizeof(vreg) * (f->nregs + 1));
+		if( ctx->vregs == NULL ) jit_assert();
+		for(i=0;i<f->nregs;i++)
+			R(i)->id = i;
+		ctx->max_regs = f->nregs;
+	}
+
+	// the position map has one entry per opcode plus a final one (written by hl_emit_flush)
+	if( f->nops >= ctx->pos_map_size ) {
+		free(ctx->pos_map);
+		ctx->pos_map = (int*)malloc(sizeof(int) * (f->nops+1));
+		if( ctx->pos_map == NULL ) jit_assert();
+		ctx->pos_map_size = f->nops;
+	}
+
+	for(i=0;i<f->nregs;i++) {
+		vreg *r = R(i);
+		r->t = f->regs[i];
+		r->stored = UNUSED;
+	}
+
+	// prologue : the first block (which got no BLOCK from alloc_block), ENTER,
+	// then one LOAD_ARG per argument stored in the register of the same index
+	emit_gen_size(ctx, BLOCK, 0);
+	emit_gen(ctx,ENTER,UNUSED,UNUSED,M_NONE);
+	for(i=0;i<f->type->fun->nargs;i++) {
+		hl_type *t = f->type->fun->args[i];
+		STORE(R(i), emit_gen(ctx, LOAD_ARG, UNUSED, UNUSED, hl_type_mode(t)));
+	}
+
+	for(int op_pos=0;op_pos<f->nops;op_pos++) {
+		ctx->op_pos = op_pos;
+		// an opcode starting right after a BLOCK marker is mapped to the marker itself
+		if( ctx->emit_pos > 0 && ctx->instrs[ctx->emit_pos-1].op == BLOCK )
+			ctx->pos_map[op_pos] = ctx->emit_pos-1;
+		else
+			ctx->pos_map[op_pos] = ctx->emit_pos;
+		if( ctx->arrival_points ) {
+			// arrival points are sorted : one that is behind us was missed
+			if( ctx->arrival_points->id < op_pos )
+				jit_assert();
+			// jumps arrive at this opcode : either split_block succeeds and consumes
+			// the arrival points itself, or the current block was just opened and
+			// the arrival points are added to its predecessors here
+			while( ctx->arrival_points && ctx->arrival_points->id == op_pos && !split_block(ctx) ) {
+				emit_block *b = ctx->arrival_points->ptr;
+				for_iter(blocks,bp,ctx->current_block->preds) {
+					if( b == bp ) { b = NULL; break; }
+				}
+				if( b ) block_add_pred(ctx, ctx->current_block, b);
+				ctx->arrival_points = ctx->arrival_points->next;
+			}
+			// leave the innermost trap when its target is reached
+			if( ctx->trap_count && ctx->traps[ctx->trap_count-1].target == ctx->op_pos )
+				ctx->trap_count--;
+		}
+		emit_opcode(ctx,f->ops + op_pos);
+	}
+	// emit a break if we're not supposed to reach here : will fix RtlUnwind on windows too.
+	if( f->nops == 0 || f->ops[f->nops-1].op != ORet )
+		BREAK();
+	// every arrival point must have been consumed
+	if( ctx->arrival_points )
+		jit_assert();
+
+	hl_emit_clean_phis(ctx);
+	hl_emit_flush(ctx->jit);
+	// no block should still be waiting to be sealed
+	if( ctx->wait_seal ) jit_assert();
+}
+
+// Creates the zero initialized emitter context of `jit` and links both together.
+// Also checks that an instruction is exactly 16 bytes.
+void hl_emit_alloc( jit_ctx *jit ) {
+	emit_ctx *emit = (emit_ctx*)malloc(sizeof(emit_ctx));
+	if( emit == NULL ) jit_assert();
+	memset(emit,0,sizeof(emit_ctx));
+	emit->jit = jit;
+	jit->emit = emit;
+	if( sizeof(einstr) != 16 ) jit_assert();
+}
+
+// Releases the emitter context of `jit` together with the buffers it allocated
+// with malloc : the virtual registers, the instructions, the position map and
+// the phis table (allocated by alloc_phi). Calling it again once the context
+// is released does nothing.
+void hl_emit_free( jit_ctx *jit ) {
+	emit_ctx *ctx = jit->emit;
+	if( ctx == NULL )
+		return;
+	free(ctx->vregs);
+	free(ctx->instrs);
+	free(ctx->pos_map);
+	free(ctx->phis);
+	free(ctx);
+	jit->emit = NULL;
+}
+
+// Fixes the closures created by alloc_static_closure for bytecode functions, once
+// the final code address is known : each closure of ctx->closure_list temporarily
+// holds a function id in `fun` and the next closure in `value`. The id is replaced
+// by jit->final_code plus the entry of functions_ptrs for that function, and
+// `value` is cleared. The list is emptied.
+void hl_emit_final( jit_ctx *jit ) {
+	emit_ctx *ctx = jit->emit;
+	vclosure *c = ctx->closure_list;
+	ctx->closure_list = NULL;
+	while( c != NULL ) {
+		vclosure *next = (vclosure*)c->value;
+		int_val fid = (int_val)c->fun;
+		c->value = NULL;
+		c->fun = jit->final_code + (int_val)jit->mod->functions_ptrs[fid];
+		c = next;
+	}
+}
+
+// Looks for the block starting at the instruction position `target`, walking back
+// from `b` through the predecessors that start before the block they precede.
+// When it is found, the current block is added to its predecessors (this is
+// called for a backward jump, see register_block_jump), one pending jump is
+// accounted for, and the blocks at the head of the ctx->wait_seal list are sealed
+// as long as they have no pending jump left.
+// Returns true if the target block was found.
+static bool seal_block_rec( emit_ctx *ctx, emit_block *b, int target ) {
+	// blocks starting before the target cannot lead to it
+	if( b->start_pos < target )
+		return false;
+	if( b->start_pos == target ) {
+		b->wait_nexts--;
+		block_add_pred(ctx, b, ctx->current_block);
+		// only the head of the wait list can be sealed
+		while( b && b->wait_nexts == 0 && ctx->wait_seal == b ) {
+			seal_block(ctx,b);
+			b = b->wait_seal_next;
+			ctx->wait_seal = b;
+		}
+		return true;
+	}
+	for_iter(blocks,p,b->preds)
+		if( p->start_pos < b->start_pos && seal_block_rec(ctx,p,target) )
+			return true;
+	return false;
+}
+
+// Emits a jump (JCOND when `cond` is set, JUMP otherwise) to the opcode at the
+// relative offset `offs` and registers it to be resolved later.
+// A backward jump also links the current block to the block it targets, which
+// must exist, and gives that block a chance to be sealed (see seal_block_rec).
+static void register_block_jump( emit_ctx *ctx, int offs, bool cond ) {
+	int jidx = ctx->emit_pos;
+	emit_gen(ctx, cond ? JCOND : JUMP, UNUSED, UNUSED, 0);
+	register_jump(ctx, jidx, offs);
+	if( offs >= 0 )
+		return;
+	int target = ctx->pos_map[ctx->op_pos + 1 + offs];
+	if( !seal_block_rec(ctx, ctx->current_block, target) ) jit_assert();
+}
+
+// Scans the opcodes following the current one for the jumps going back to the
+// current opcode. Each of them is counted in wait_nexts of the current block, and
+// the block, if it was sealed, is unsealed and pushed on the ctx->wait_seal
+// list : it gets sealed again by seal_block_rec once all these jumps were emitted.
+static void prepare_loop_block( emit_ctx *ctx ) {
+	emit_block *b = ctx->current_block;
+	for(int i=ctx->op_pos+1;i<ctx->fun->nops;i++) {
+		hl_opcode *op = ctx->fun->ops + i;
+		int offs;
+		// the jump offset is not stored in the same parameter for all the jumps
+		switch( op->op ) {
+		case OJAlways:
+			offs = op->p1;
+			break;
+		case OJTrue:
+		case OJFalse:
+		case OJNull:
+		case OJNotNull:
+			offs = op->p2;
+			break;
+		case OJEq:
+		case OJNotEq:
+		case OJSLt:
+		case OJSGte:
+		case OJSLte:
+		case OJSGt:
+		case OJULt:
+		case OJUGte:
+		case OJNotLt:
+		case OJNotGte:
+			offs = op->p3;
+			break;
+		default:
+			continue;
+		}
+		// only the backward jumps landing exactly here matter
+		if( offs >= 0 || i + 1 + offs != ctx->op_pos )
+			continue;
+		emit_debug("  WAIT @%X\n",i);
+		b->wait_nexts++;
+		if( b->sealed ) {
+			b->sealed = false;
+			b->wait_seal_next = ctx->wait_seal;
+			ctx->wait_seal = b;
+		}
+	}
+}
+
+// Emits the comparison of `a` (of type `at`) and `b` (of type `bt`) for the jump
+// opcode `op`, followed by the jump to the relative opcode offset `offset`.
+// The way to compare depends on the types :
+// - dynamic or function on either side : call to hl_dyn_compare
+// - type values : call to hl_same_type
+// - null<T> and virtuals : inline sequences, for OJEq and OJNotEq only
+// - objects and structs having a compareFun : call to that function
+// - everything else : a plain CMP
+// Operands that emit instructions are always emitted in a separate statement before
+// the compare : C leaves the evaluation order of function arguments unspecified,
+// so emitting them inside the argument list would make the generated instruction
+// order depend on the compiler.
+static void emit_jump_dyn( emit_ctx *ctx, hl_op op, hl_type *at, ereg a, hl_type *bt, ereg b, int offset ) {
+	if( at->kind == HDYN || bt->kind == HDYN || at->kind == HFUN || bt->kind == HFUN ) {
+		ereg args[2] = { a, b };
+		ereg ret = emit_native_call(ctx,hl_dyn_compare,args,2,&hlt_i32);
+		if( op == OJSGt || op == OJSGte ) {
+			// for these two operators, skip the jump when the comparison result is hl_invalid_comparison
+			emit_cmp(ctx, ret, LOAD_CONST(hl_invalid_comparison,&hlt_i32), OJEq);
+			int jinvalid = emit_jump(ctx, true);
+			emit_test(ctx, ret, op);
+			register_block_jump(ctx, offset, true);
+			patch_jump(ctx, jinvalid);
+			return;
+		}
+		emit_test(ctx, ret, op);
+		// continue
+	} else switch( at->kind ) {
+	case HTYPE:
+		{
+			ereg args[2] = { a, b };
+			ereg ret = emit_native_call(ctx,hl_same_type,args,2,&hlt_bool);
+			// the result of hl_same_type is negated before being tested
+			emit_test(ctx, emit_gen_ext(ctx,UNOP,ret,UNUSED,M_I32,ONot), op);
+		}
+		break;
+	case HNULL:
+		{
+			if( op == OJEq ) {
+				// if( a == b || (a && b && a->v == b->v) ) goto
+				emit_cmp(ctx,a,b,OJEq);
+				register_block_jump(ctx,offset,true);
+				emit_test(ctx,a,OJNull);
+				int ja = emit_jump(ctx,true);
+				emit_test(ctx,b,OJNull);
+				int jb = emit_jump(ctx,true);
+				hl_type *vt = at->tparam;
+				ereg va = LOAD_MEM(a,HDYN_VALUE,vt);
+				ereg vb = LOAD_MEM(b,HDYN_VALUE,vt);
+				emit_cmp(ctx, va, vb, OJEq);
+				register_block_jump(ctx,offset,true);
+				patch_jump(ctx,ja);
+				patch_jump(ctx,jb);
+			} else if( op == OJNotEq ) {
+				// if( a != b && (!a || !b || a->v != b->v) ) goto
+				emit_cmp(ctx,a,b,OJEq);
+				int jeq = emit_jump(ctx,true);
+				emit_test(ctx,a,OJEq);
+				register_block_jump(ctx,offset,true);
+				split_block(ctx);
+				emit_test(ctx,b,OJEq);
+				register_block_jump(ctx,offset,true);
+				split_block(ctx);
+				hl_type *vt = at->tparam;
+				ereg va = LOAD_MEM(a,HDYN_VALUE,vt);
+				ereg vb = LOAD_MEM(b,HDYN_VALUE,vt);
+				emit_cmp(ctx, va, vb, OJNull);
+				add_jump_target(ctx, 0);
+				int jcmp = emit_jump(ctx,true);
+				register_block_jump(ctx,offset,true);
+				patch_jump(ctx,jcmp);
+				patch_jump(ctx,jeq);
+			} else
+				jit_assert();
+		}
+		return;
+	case HVIRTUAL:
+		if( bt->kind == HOBJ ) {
+			if( op == OJEq ) {
+				// if( a == b || (a && a->value == b) ) goto
+				emit_cmp(ctx, a, b, OJEq);
+				register_block_jump(ctx,offset,true);
+				split_block(ctx);
+				emit_test(ctx, a, OJNull);
+				int jnot = emit_jump(ctx, true);
+				emit_cmp(ctx, LOAD_MEM_PTR(a,HL_WSIZE), b, OJEq);
+				register_block_jump(ctx,offset,true);
+				split_block(ctx);
+				patch_jump(ctx, jnot);
+			} else if( op == OJNotEq ) {
+				// if( a != b && (!a || a->value != b) ) goto
+				emit_cmp(ctx, a, b, OJEq);
+				int jsame = emit_jump(ctx, true);
+				emit_test(ctx, a, OJNull);
+				register_block_jump(ctx,offset,true);
+				split_block(ctx);
+				emit_cmp(ctx, LOAD_MEM_PTR(a,HL_WSIZE), b, OJNotEq);
+				register_block_jump(ctx,offset,true);
+				split_block(ctx);
+				patch_jump(ctx,jsame);
+			} else
+				jit_assert();
+		} else {
+			if( op == OJEq ) {
+				// if( a == b || (a && b && a->value && a->value == b->value) ) goto
+				emit_cmp(ctx, a, b, OJEq);
+				register_block_jump(ctx,offset,true);
+				split_block(ctx);
+				emit_test(ctx, a, OJNull);
+				int ja = emit_jump(ctx, true);
+				emit_test(ctx, b, OJNull);
+				int jb = emit_jump(ctx, true);
+				ereg va = LOAD_MEM_PTR(a,HL_WSIZE);
+				emit_test(ctx, va, OJNull);
+				int jva = emit_jump(ctx, true);
+				ereg vb = LOAD_MEM_PTR(b,HL_WSIZE);
+				emit_cmp(ctx, va, vb, OJEq);
+				register_block_jump(ctx,offset,true);
+				split_block(ctx);
+				patch_jump(ctx,ja);
+				patch_jump(ctx,jb);
+				patch_jump(ctx,jva);
+			} else if( op == OJNotEq ) {
+				// if( a != b && (!a || !b || !a->value || a->value != b->value) ) goto
+				emit_cmp(ctx, a, b, OJEq);
+				int jeq1 = emit_jump(ctx, true);
+				emit_test(ctx, a, OJNull);
+				int ja = emit_jump(ctx, true);
+				emit_test(ctx, b, OJNull);
+				int jb = emit_jump(ctx, true);
+				ereg va = LOAD_MEM_PTR(a,HL_WSIZE);
+				emit_test(ctx, va, OJNull);
+				int jva = emit_jump(ctx, true);
+				ereg vb = LOAD_MEM_PTR(b,HL_WSIZE);
+				emit_cmp(ctx, va, vb, OJEq);
+				int jeq2 = emit_jump(ctx, true);
+				split_block(ctx);
+				patch_jump(ctx,ja);
+				patch_jump(ctx,jb);
+				patch_jump(ctx,jva);
+				register_block_jump(ctx,offset,false);
+				split_block(ctx);
+				patch_jump(ctx,jeq1);
+				patch_jump(ctx,jeq2);
+			} else
+				jit_assert();
+		}
+		return;
+	case HOBJ:
+	case HSTRUCT:
+		if( bt->kind == HVIRTUAL ) {
+			emit_jump_dyn(ctx,op,bt,b,at,a,offset); // inverse
+			return;
+		}
+		if( hl_get_obj_rt(at)->compareFun ) {
+			ereg args[] = {a,b};
+			switch( op ) {
+			case OJEq:
+				{
+					// if( a == b || (a && b && cmp(a,b) == 0) ) goto
+					emit_cmp(ctx,a,b,OJEq);
+					int jeq = emit_jump(ctx, true);
+					emit_test(ctx,a,OJNull);
+					int ja = emit_jump(ctx, true);
+					emit_test(ctx,b,OJNull);
+					int jb = emit_jump(ctx, true);
+					emit_test(ctx, emit_call_fid(ctx,(int)(int_val)at->obj->rt->compareFun,args,2,M_I32),OJNotNull);
+					int jcmp = emit_jump(ctx, true);
+					patch_jump(ctx, jeq);
+					register_block_jump(ctx, offset, false);
+					split_block(ctx);
+					patch_jump(ctx, ja);
+					patch_jump(ctx, jb);
+					patch_jump(ctx, jcmp);
+				}
+				break;
+			case OJNotEq:
+				{
+					// if( a != b && (!a || !b || cmp(a,b) != 0) ) goto
+					emit_cmp(ctx,a,b,OJEq);
+					add_jump_target(ctx, 0);
+					int jeq = emit_jump(ctx, true);
+					emit_test(ctx,a,OJEq);
+					register_block_jump(ctx,offset,true);
+					split_block(ctx);
+					emit_test(ctx,b,OJEq);
+					register_block_jump(ctx,offset,true);
+					split_block(ctx);
+					emit_test(ctx, emit_call_fid(ctx,(int)(int_val)at->obj->rt->compareFun,args,2,M_I32),OJNotNull);
+					register_block_jump(ctx,offset,true);
+					patch_jump(ctx,jeq);
+				}
+				break;
+			default:
+				{
+					// if( a && b && cmp(a,b) ~op~ 0 ) goto
+					emit_test(ctx,a,OJNull);
+					int ja = emit_jump(ctx, true);
+					emit_test(ctx,b,OJNull);
+					int jb = emit_jump(ctx, true);
+					ereg cmp = emit_call_fid(ctx,(int)(int_val)at->obj->rt->compareFun,args,2,M_I32);
+					ereg zero = LOAD_CONST(0,&hlt_i32);
+					emit_cmp(ctx, cmp, zero, op);
+					register_block_jump(ctx,offset,true);
+					patch_jump(ctx,ja);
+					patch_jump(ctx,jb);
+				}
+				break;
+			}
+			return;
+		}
+		// fallthrough
+	default:
+		emit_cmp(ctx, a, b, op);
+		break;
+	}
+	// shared by all the paths that did not return : conditional jump on the last test / compare
+	register_block_jump(ctx, offset, true);
+}
+
+static void emit_opcode( emit_ctx *ctx, hl_opcode *o ) {
+	vreg *dst = R(o->p1);
+	vreg *ra = R(o->p2);
+	vreg *rb = R(o->p3);
+	hl_module *m = ctx->mod;
+#ifdef HL_DEBUG
+	int uid = (ctx->fun->findex << 16) | ctx->op_pos;
+	__ignore(&uid);
+#endif
+	switch( o->op ) {
+	case OMov:
+	case OUnsafeCast:
+		STORE(dst, emit_gen(ctx,MOV,LOAD(ra),UNUSED,hl_type_mode(ra->t)));
+		break;
+	case OInt:
+		STORE(dst, LOAD_CONST(m->code->ints[o->p2], dst->t));
+		break;
+	case OBool:
+		STORE(dst, LOAD_CONST(o->p2, &hlt_bool));
+		break;
+	case ONull:
+		STORE(dst, LOAD_CONST(0, dst->t));
+		break;
+	case OFloat:
+		{
+			union {
+				float f;
+				double d;
+				uint64 i;
+			} v;
+			if( dst->t->kind == HF32 ) {
+				v.i = 0;
+				v.f = (float)m->code->floats[o->p2];
+			} else
+				v.d = m->code->floats[o->p2];
+			STORE(dst, LOAD_CONST(v.i, dst->t));
+		}
+		break;
+	case OString:
+		STORE(dst, LOAD_CONST_PTR(hl_get_ustring(m->code,o->p2)));
+		break;
+	case OBytes:
+		{
+			char *b = m->code->version >= 5 ? m->code->bytes + m->code->bytes_pos[o->p2] : m->code->strings[o->p2];
+			STORE(dst,LOAD_CONST_PTR(b));
+		}
+		break;
+	case OGetGlobal:
+		{
+			int offs = m->globals_indexes[o->p2];
+			STORE(dst, LOAD_MEM_PTR(LOAD_CONST_PTR(m->globals_data),offs));
+		}
+		break;
+	case OSetGlobal:
+		{
+			int offs = m->globals_indexes[o->p1];
+			STORE_MEM(LOAD_CONST_PTR(m->globals_data),offs,LOAD(ra));
+		}
+		break;
+	case OCall0:
+		emit_call_fun(ctx, dst, o->p2, 0, NULL);
+		break;
+	case OCall1:
+		emit_call_fun(ctx, dst, o->p2, 1, &o->p3);
+		break;
+	case OCall2:
+		{
+			int args[2] = { o->p3, (int)(int_val)o->extra };
+			emit_call_fun(ctx, dst, o->p2, 2, args);
+		}
+		break;
+	case OCall3:
+		{
+			int args[3] = { o->p3, o->extra[0], o->extra[1] };
+			emit_call_fun(ctx, dst, o->p2, 3, args);
+		}
+		break;
+	case OCall4:
+		{
+			int args[4] = { o->p3, o->extra[0], o->extra[1], o->extra[2] };
+			emit_call_fun(ctx, dst, o->p2, 4, args);
+		}
+		break;
+	case OCallN:
+		emit_call_fun(ctx, dst, o->p2, o->p3, o->extra);
+		break;
+	case OSub:
+	case OAdd:
+	case OMul:
+	case OSDiv:
+	case OUDiv:
+	case OShl:
+	case OSShr:
+	case OUShr:
+	case OAnd:
+	case OOr:
+	case OXor:
+	case OSMod:
+	case OUMod:
+		{
+			ereg va = LOAD(ra);
+			ereg vb = LOAD(rb);
+			ereg r;
+			if( (dst->t->kind == HF32 || dst->t->kind == HF64) && o->op == OSMod ) {
+				ereg args[] = {va,vb};
+				r = emit_native_call(ctx, dst->t->kind == HF32 ? (void*)fmodf : (void*)fmod, args, 2, dst->t);
+			} else {
+				r = emit_gen_ext(ctx, BINOP, va, vb, hl_type_mode(dst->t), o->op);
+			}
+			STORE(dst, r);
+		}
+		break;
+	case ONeg:
+		STORE(dst, emit_gen_ext(ctx, UNOP, LOAD(ra), UNUSED, hl_type_mode(dst->t), o->op));
+		break;
+	case ONot:
+		STORE(dst, emit_gen_ext(ctx, UNOP, LOAD(ra), LOAD_CONST(1,&hlt_i32), hl_type_mode(dst->t), OXor));
+		break;
+	case OJFalse:
+	case OJTrue:
+	case OJNotNull:
+	case OJNull:
+		{
+			emit_test(ctx, LOAD(dst), o->op);
+			register_block_jump(ctx, o->p2, true);
+			add_jump_target(ctx, 0);
+		}
+		break;
+	case OJEq:
+	case OJNotEq:
+	case OJSLt:
+	case OJSGte:
+	case OJSLte:
+	case OJSGt:
+	case OJULt:
+	case OJUGte:
+	case OJNotLt:
+	case OJNotGte:
+		emit_jump_dyn(ctx,o->op,dst->t,LOAD(dst),ra->t,LOAD(ra),o->p3);
+		add_jump_target(ctx, 0);
+		break;
+	case OJAlways:
+		register_block_jump(ctx, o->p1, false);
+		break;
+	case OToDyn:
+		if( ra->t->kind == HBOOL ) {
+			ereg arg = LOAD(ra);
+			STORE(dst, emit_native_call(ctx,hl_alloc_dynbool,&arg,1,&hlt_dyn));
+		} else {
+			ereg arg = LOAD_CONST_PTR(ra->t);
+			ereg ret = emit_native_call(ctx,hl_alloc_dynamic,&arg,1,&hlt_dyn);
+			STORE_MEM(ret,HDYN_VALUE,LOAD(ra));
+			STORE(dst, ret);
+		}
+		break;
+	case OToSFloat:
+	case OToInt:
+	case OToUFloat:
+		STORE(dst, emit_conv(ctx,LOAD(ra),hl_type_mode(ra->t),hl_type_mode(dst->t), o->op == OToUFloat));
+		break;
+	case ORet:
+		emit_gen(ctx, RET, dst->t->kind == HVOID ? UNUSED : LOAD(dst), 0, M_NORET);
+		patch_instr_mode(ctx, hl_type_mode(dst->t));
+		break;
+	case OIncr:
+	case ODecr:
+		STORE(dst, emit_gen_ext(ctx,UNOP,LOAD(dst),UNUSED,hl_type_mode(dst->t),o->op));
+		break;
+	case ONew:
+		{
+			ereg arg = UNUSED;
+			void *allocFun = NULL;
+			int nargs = 1;
+			switch( dst->t->kind ) {
+			case HOBJ:
+			case HSTRUCT:
+				allocFun = hl_alloc_obj;
+				break;
+			case HDYNOBJ:
+				allocFun = hl_alloc_dynobj;
+				nargs = 0;
+				break;
+			case HVIRTUAL:
+				allocFun = hl_alloc_virtual;
+				break;
+			default:
+				jit_assert();
+			}
+			if( nargs ) arg = LOAD_CONST_PTR(dst->t);
+			STORE(dst, emit_native_call(ctx,allocFun,&arg,nargs,dst->t));
+		}
+		break;
+	case OInstanceClosure:
+		{
+			ereg args[3];
+			args[0] = LOAD_CONST_PTR(m->code->functions[m->functions_indexes[o->p2]].type);
+			einstr *e = emit_instr(ctx, LOAD_FUN);
+			e->mode = M_PTR;
+			e->size_offs = o->p2;
+			args[1] = new_value(ctx);
+			args[2] = LOAD(rb);
+			STORE(dst, emit_native_call(ctx,hl_alloc_closure_ptr,args,3,dst->t));
+		}
+		break;
+	case OVirtualClosure:
+		{
+			hl_type *t = NULL;
+			hl_type *ot = ra->t;
+			while( t == NULL ) {
+				int i;
+				for(i=0;i<ot->obj->nproto;i++) {
+					hl_obj_proto *pp = ot->obj->proto + i;
+					if( pp->pindex == o->p3 ) {
+						t = m->code->functions[m->functions_indexes[pp->findex]].type;
+						break;
+					}
+				}
+				ot = ot->obj->super;
+			}
+			ereg args[3];
+			ereg obj = LOAD(ra);
+			args[0] = LOAD_CONST_PTR(t);
+			args[1] = LOAD_OBJ_METHOD(obj,o->p3);
+			args[2] = obj;
+			STORE(dst, emit_native_call(ctx,hl_alloc_closure_ptr,args,3,dst->t));
+		}
+		break;
+	case OCallClosure:
+		if( ra->t->kind == HDYN ) {
+			int i;
+			ereg st = emit_gen_size(ctx, ALLOC_STACK, o->p3 * HL_WSIZE);
+			for(i=0;i<o->p3;i++) {
+				vreg *r = R(o->extra[i]);
+				if( !hl_is_dynamic(r->t) ) jit_assert();
+				STORE_MEM(st,i*HL_WSIZE,LOAD(r));
+			}
+			ereg args[3];
+			args[0] = LOAD(ra);
+			args[1] = st;
+			args[2] = LOAD_CONST(o->p3,&hlt_i32);
+			emit_dyn_cast(ctx,emit_native_call(ctx,hl_dyn_call,args,3,dst->t),ra->t,dst);
+		} else {
+			ereg r = LOAD(ra);
+			ereg *args = get_tmp_args(ctx,o->p3+1);
+			// Code for if( c->hasValue ) c->fun(c->value,args) else c->fun(args)
+			ereg has = LOAD_MEM(r,HL_WSIZE*2,&hlt_i32);
+			emit_test(ctx, has, OJNull);
+			int jidx = emit_jump(ctx, true);
+			int i;
+			args[0] = LOAD_MEM_PTR(r,HL_WSIZE * 3);
+			for(i=0;i<o->p3;i++)
+				args[i+1] = LOAD(R(o->extra[i]));
+			ereg v1 = emit_dyn_call(ctx,LOAD_MEM_PTR(r,HL_WSIZE),args,o->p3 + 1,dst->t);
+			STORE(dst, v1);
+			int jend = emit_jump(ctx, false);
+			patch_jump(ctx, jidx);
+			for(i=0;i<o->p3;i++)
+				args[i] = LOAD(R(o->extra[i]));
+			ereg v2 = emit_dyn_call(ctx,LOAD_MEM_PTR(r,HL_WSIZE),args,o->p3,dst->t);
+			STORE(dst, v2);
+			patch_jump(ctx, jend);
+		}
+		break;
+	case OStaticClosure:
+		{
+			vclosure *c = alloc_static_closure(ctx,o->p2);
+			STORE(dst, LOAD_CONST_PTR(c));
+		}
+		break;
+	case OField:
+		{
+			switch( ra->t->kind ) {
+			case HOBJ:
+			case HSTRUCT:
+				{
+					hl_runtime_obj *rt = hl_get_obj_rt(ra->t);
+					ereg r = LOAD(ra);
+					if( dst->t->kind == HSTRUCT ) {
+						hl_type *ft = hl_obj_field_fetch(ra->t,o->p3)->t;
+						if( ft->kind == HPACKED ) {
+							STORE(dst,OFFSET(r, UNUSED, 0, rt->fields_indexes[o->p3]));
+							break;
+						}
+					}
+					STORE(dst, LOAD_MEM(r,rt->fields_indexes[o->p3],dst->t));
+				}
+				break;
+			case HVIRTUAL:
+				// code for : if( hl_vfields(o)[f] ) r = *hl_vfields(o)[f]; else r = hl_dyn_get(o,hash(field),vt)
+				{
+					ereg obj = LOAD(ra);
+					ereg field = LOAD_MEM_PTR(obj,sizeof(vvirtual)+HL_WSIZE*o->p3);
+					emit_test(ctx, field, OJNull);
+					int jidx = emit_jump(ctx, true);
+					ereg v1 = LOAD_MEM(field,0,dst->t);
+					STORE(dst, v1);
+					int jend = emit_jump(ctx, false);
+					patch_jump(ctx, jidx);
+					bool need_type = dyn_need_type(dst->t);
+					ereg args[3];
+					args[0] = obj;
+					args[1] = LOAD_CONST(ra->t->virt->fields[o->p3].hashed_name,&hlt_i32);
+					if( need_type ) args[2] = LOAD_CONST_PTR(dst->t);
+					ereg v2 = emit_native_call(ctx,get_dynget(dst->t),args,need_type?3:2,dst->t);
+					STORE(dst, v2);
+					patch_jump(ctx, jend);
+				}
+				break;
+			default:
+				jit_assert();
+				break;
+			}
+		}
+		break;
+	case OSetField: // dst.field[p2] = rb
+		{
+			switch( dst->t->kind ) {
+			case HOBJ:
+			case HSTRUCT:
+				{
+					ereg obj = LOAD(dst);
+					ereg val = LOAD(rb);
+					hl_runtime_obj *rt = hl_get_obj_rt(dst->t);
+					int field_pos = rt->fields_indexes[o->p2];
+					if( rb->t->kind == HSTRUCT ) {
+						hl_type *ft = hl_obj_field_fetch(dst->t,o->p2)->t;
+						if( ft->kind == HPACKED ) { // packed field: copy the struct bytes inline instead of storing the pointer
+							emit_store_size(ctx,obj,field_pos,val,0,hl_get_obj_rt(ft->tparam)->size);
+							break;
+						}
+					}
+					STORE_MEM(obj,field_pos, val);
+				}
+				break;
+			case HVIRTUAL:
+				// code for : if( hl_vfields(o)[f] ) *hl_vfields(o)[f] = v; else hl_dyn_set(o,hash(field),vt,v)
+				{
+					ereg obj = LOAD(dst);
+					ereg val = LOAD(rb);
+					ereg field = LOAD_MEM_PTR(obj,sizeof(vvirtual)+HL_WSIZE*o->p2);
+					emit_test(ctx, field, OJNull);
+					int jidx = emit_jump(ctx, true); // taken when the field slot is null -> dynamic fallback below
+					STORE_MEM(field, 0, val);
+					int jend = emit_jump(ctx, false);
+					patch_jump(ctx, jidx);
+					bool need_type = dyn_need_type(rb->t); // keyed on the stored VALUE's type (rb), not on the object (dst)
+					ereg args[4];
+					args[0] = obj;
+					args[1] = LOAD_CONST(dst->t->virt->fields[o->p2].hashed_name,&hlt_i32);
+					if( need_type ) {
+						args[2] = LOAD_CONST_PTR(rb->t);
+						args[3] = val;
+					} else {
+						args[2] = val;
+					}
+					emit_native_call(ctx,get_dynset(rb->t),args,need_type?4:3,&hlt_void); // setter chosen from the value type, result unused (same as ODynSet)
+					patch_jump(ctx, jend);
+				}
+				break;
+			default:
+				jit_assert();
+				break;
+			}
+		}
+		break;
+	case OGetThis:
+		{
+			vreg *r = R(0);
+			ereg obj = LOAD(r);
+			hl_runtime_obj *rt = hl_get_obj_rt(r->t);
+			int field_pos = rt->fields_indexes[o->p2];
+			if( dst->t->kind == HSTRUCT ) {
+				hl_type *ft = hl_obj_field_fetch(r->t,o->p2)->t;
+				if( ft->kind == HPACKED ) {
+					STORE(dst, OFFSET(obj, UNUSED, 0, field_pos));
+					break;
+				}
+			}
+			STORE(dst, LOAD_MEM(obj, field_pos, dst->t));
+		}
+		break;
+	case OSetThis:
+		{
+			vreg *r = R(0);
+			ereg obj = LOAD(r);
+			ereg val = LOAD(ra);
+			hl_runtime_obj *rt = hl_get_obj_rt(r->t);
+			int field_pos = rt->fields_indexes[o->p1];
+			if( ra->t->kind == HSTRUCT ) {
+				hl_type *ft = hl_obj_field_fetch(r->t,o->p1)->t;
+				if( ft->kind == HPACKED ) {
+					emit_store_size(ctx, obj, field_pos, val, 0, hl_get_obj_rt(ft->tparam)->size);
+					break;
+				}
+			}
+			STORE_MEM(obj,field_pos,val);
+		}
+		break;
+	case OCallThis:
+		{
+			int i;
+			int nargs = o->p3 + 1;
+			ereg obj = LOAD(R(0));
+			ereg *args = get_tmp_args(ctx, nargs);
+			args[0] = obj;
+			for(i=1;i<nargs;i++)
+				args[i] = LOAD(R(o->extra[i-1]));
+			ereg fun = LOAD_OBJ_METHOD(obj, o->p2);
+			STORE(dst, emit_dyn_call(ctx,fun,args,nargs,dst->t));
+		}
+		break;
+	case OCallMethod:
+		{
+			vreg *r = R(o->extra[0]);
+			ereg obj = LOAD(r);
+			switch( r->t->kind ) {
+			case HOBJ:
+				{
+					int i;
+					int nargs = o->p3;
+					ereg *args = get_tmp_args(ctx, nargs);
+					for(i=0;i<nargs;i++)
+						args[i] = LOAD(R(o->extra[i]));
+					ereg fun = LOAD_OBJ_METHOD(obj, o->p2);
+					STORE(dst, emit_dyn_call(ctx,fun,args,nargs,dst->t));
+				}
+				break;
+			case HVIRTUAL:
+				// code for : if( (fun=hl_vfields(o)[f]) ) dst = fun(o->value,args...); else dst = hl_dyn_call_obj(o->value,ft,field,args,&ret)
+				{
+					vreg *_o = R(o->extra[0]);
+					ereg obj = LOAD(_o);
+					ereg fun = LOAD_MEM_PTR(obj,sizeof(vvirtual)+HL_WSIZE*o->p2);
+					emit_test(ctx, fun, OJNull);
+					int jidx = emit_jump(ctx, true);
+
+					int nargs = o->p3;
+					ereg *args = get_tmp_args(ctx, nargs);
+					int i;
+					args[0] = LOAD_MEM_PTR(obj,HL_WSIZE);
+					for(i=1;i<nargs;i++)
+						args[i] = LOAD(R(o->extra[i]));
+					ereg v1 = emit_dyn_call(ctx,fun,args,nargs,dst->t);
+					STORE(dst, v1);
+
+					int jend = emit_jump(ctx, false);
+					patch_jump(ctx, jidx);
+
+					nargs = o->p3 - 1;
+					ereg eargs = nargs == 0 ? LOAD_CONST_PTR(NULL) : emit_gen_size(ctx, ALLOC_STACK, nargs * HL_WSIZE);
+					for(i=0;i<nargs;i++) {
+						vreg *r = R(o->extra[i+1]);
+						if( hl_is_ptr(r->t) )
+							STORE_MEM(eargs,i*HL_WSIZE,LOAD(r));
+						else
+							STORE_MEM(eargs,i*HL_WSIZE,emit_gen(ctx, ADDRESS, LOAD(r), UNUSED, M_PTR));
+					}
+					bool need_dyn = !hl_is_ptr(dst->t) && dst->t->kind != HVOID;
+					ereg edyn = need_dyn ? emit_gen_size(ctx, ALLOC_STACK, sizeof(vdynamic)) : LOAD_CONST_PTR(NULL);
+
+					args = get_tmp_args(ctx, 5);
+					args[0] = LOAD_MEM_PTR(obj,HL_WSIZE);
+					args[1] = LOAD_CONST_PTR(_o->t->virt->fields[o->p2].t);
+					args[2] = LOAD_CONST(_o->t->virt->fields[o->p2].hashed_name,&hlt_i32);
+					args[3] = eargs;
+					args[4] = edyn;
+
+					ereg v2 = emit_native_call(ctx, hl_dyn_call_obj, args, 5, &hlt_bytes);
+					if( need_dyn )
+						STORE(dst, LOAD_MEM(edyn,HDYN_VALUE,dst->t));
+					else
+						STORE(dst, v2);
+					patch_jump(ctx, jend);
+				}
+				break;
+			default:
+				jit_assert();
+				break;
+			}
+		}
+		break;
+	case OThrow:
+	case ORethrow:
+		{
+			ereg arg = LOAD(dst);
+			emit_native_call(ctx, o->op == OThrow ? hl_throw : hl_rethrow, &arg, 1, NULL);
+		}
+		break;
+	case OLabel:
+		split_block(ctx);
+		prepare_loop_block(ctx);
+		break;
+	case OGetI8:
+	case OGetI16:
+	case OGetMem:
+		{
+			hl_type *size_t = o->op == OGetI8 ? &hlt_ui8 : o->op == OGetI16 ? &hlt_ui16 : dst->t;
+			ereg offs = OFFSET(LOAD(ra),LOAD(rb),1,0);
+			ereg val = emit_load_mem(ctx, offs, 0, size_t, dst->t);
+			STORE(dst, val);
+		}
+		break;
+	case OSetI8:
+	case OSetI16:
+	case OSetMem:
+		{
+			ereg offs = OFFSET(LOAD(dst), LOAD(ra),1,0);
+			ereg val = LOAD(rb);
+			STORE_MEM(offs, 0, val);
+			if( o->op != OSetMem ) patch_instr_mode(ctx, o->op == OSetI8 ? M_UI8 : M_UI16);
+		}
+		break;
+	case OType:
+		STORE(dst, LOAD_CONST_PTR(m->code->types + o->p2));
+		break;
+	case OGetType:
+		{
+			ereg r = LOAD(ra);
+			emit_test(ctx, r, OJNotNull);
+			int jidx = emit_jump(ctx, true);
+			ereg v1 = LOAD_CONST_PTR(&hlt_void);
+			STORE(dst,v1);
+			int jend = emit_jump(ctx, false);
+			patch_jump(ctx, jidx);
+			ereg v2 = LOAD_MEM_PTR(r,0);
+			STORE(dst,v2);
+			patch_jump(ctx, jend);
+		}
+		break;
+	case OGetArray:
+		{
+			if( ra->t->kind == HABSTRACT ) {
+				int osize;
+				bool isPtr = dst->t->kind != HOBJ && dst->t->kind != HSTRUCT;
+				if( isPtr )
+					osize = HL_WSIZE; // a pointer into the carray
+				else {
+					hl_runtime_obj *rt = hl_get_obj_rt(dst->t);
+					osize = rt->size; // a mem offset into it
+				}
+				ereg pos = (osize <= 8 && ((osize - 1) & osize) == 0) ? OFFSET(LOAD(ra), LOAD(rb), osize, 0) : OFFSET(LOAD(ra), emit_gen_ext(ctx,BINOP,LOAD(rb),MK_CONST(osize),M_I32,OMul),1,0);
+				ereg val = isPtr ? LOAD_MEM_PTR(pos,0) : pos;
+				STORE(dst, val);
+			} else {
+				ereg pos = OFFSET(LOAD(ra), LOAD(rb), hl_type_size(dst->t), sizeof(varray));
+				STORE(dst, LOAD_MEM(pos,0,dst->t));
+			}
+		}
+		break;
+	case OSetArray: // dst[ra] = rb
+		{
+			if( dst->t->kind == HABSTRACT ) {
+				int osize;
+				bool isPtr = rb->t->kind != HOBJ && rb->t->kind != HSTRUCT;
+				if( isPtr) {
+					osize = HL_WSIZE; // slot holds a pointer-sized value (mirrors OGetArray)
+				} else {
+					hl_runtime_obj *rt = hl_get_obj_rt(rb->t);
+					osize = rt->size; // slot holds the object bytes inline
+				}
+				ereg pos = (osize <= 8 && ((osize - 1) & osize) == 0) ? OFFSET(LOAD(dst), LOAD(ra), osize, 0) : OFFSET(LOAD(dst), emit_gen_ext(ctx,BINOP,LOAD(ra),MK_CONST(osize),M_I32,OMul),1,0);
+				if( isPtr ) STORE_MEM(pos, 0, LOAD(rb)); else emit_store_size(ctx, pos, 0, LOAD(rb), 0, osize); // store the value itself for pointer slots, copy osize bytes for inline objects
+			} else  {
+				ereg pos = OFFSET(LOAD(dst), LOAD(ra), hl_type_size(rb->t), sizeof(varray)); // stride is the ELEMENT size (rb), not the size of the array reference (dst)
+				STORE_MEM(pos, 0, LOAD(rb));
+			}
+		}
+		break;
+	case OArraySize:
+		STORE(dst, LOAD_MEM(LOAD(ra),HL_WSIZE*2,&hlt_i32));
+		break;
+	case ORef:
+		STORE(dst, emit_gen(ctx, ADDRESS, LOAD(ra), UNUSED, M_PTR));
+		break;
+	case OUnref:
+		STORE(dst, LOAD_MEM(LOAD(ra),0,dst->t));
+		break;
+	case OSetref:
+		STORE_MEM(LOAD(dst),0,LOAD(ra));
+		break;
+	case ORefData:
+		switch( ra->t->kind ) {
+		case HARRAY:
+			STORE(dst, OFFSET(LOAD(ra),UNUSED,0,sizeof(varray)));
+			break;
+		default:
+			jit_assert();
+		}
+		break;
+	case ORefOffset:
+		STORE(dst, OFFSET(LOAD(ra),LOAD(rb), hl_type_size(dst->t->tparam),0));
+		break;
+	case OToVirtual:
+		{
+			ereg args[2];
+			args[0] = LOAD_CONST_PTR(dst->t);
+			args[1] = LOAD(ra);
+			STORE(dst, emit_native_call(ctx,hl_to_virtual,args,2, dst->t));
+		}
+		break;
+	case OMakeEnum:
+		{
+			ereg args[2];
+			args[0] = LOAD_CONST_PTR(dst->t);
+			args[1] = LOAD_CONST(o->p2,&hlt_i32);
+			ereg en = emit_native_call(ctx, hl_alloc_enum, args, 2, dst->t);
+			hl_enum_construct *c = &dst->t->tenum->constructs[o->p2];
+			for(int i=0;i<c->nparams;i++)
+				STORE_MEM(en, c->offsets[i], LOAD(R(o->extra[i])));
+			STORE(dst, en);
+		}
+		break;
+	case OEnumAlloc:
+		{
+			ereg args[2];
+			args[0] = LOAD_CONST_PTR(dst->t);
+			args[1] = LOAD_CONST(o->p2,&hlt_i32);
+			STORE(dst, emit_native_call(ctx, hl_alloc_enum, args, 2, dst->t));
+		}
+		break;
+	case OEnumField:
+		{
+			hl_enum_construct *c = &ra->t->tenum->constructs[o->p3];
+			int slot = (int)(int_val)o->extra;
+			STORE(dst, LOAD_MEM(LOAD(ra),c->offsets[slot], dst->t));
+		}
+		break;
+	case OEnumIndex:
+		STORE(dst, LOAD_MEM(LOAD(ra),HL_WSIZE,dst->t));
+		break;
+	case OSetEnumField:
+		{
+			hl_enum_construct *c = &dst->t->tenum->constructs[0];
+			STORE_MEM(LOAD(dst), c->offsets[o->p2], LOAD(rb));
+		}
+		break;
+	case ONullCheck: // if dst is null, call the runtime null-access handler (optionally naming the field/method being accessed)
+		{
+			emit_test(ctx, LOAD(dst), OJNotNull);
+			add_jump_target(ctx, 0);
+			int jok = emit_jump(ctx, true);
+
+			// ----- DETECT FIELD ACCESS ----------------
+			hl_function *f = ctx->fun;
+			hl_opcode *next = f->ops + ctx->op_pos + 1;
+			bool null_field_access = false;
+			int hashed_name = 0;
+			// skip const and operation between nullcheck and access
+			while( (next < f->ops + f->nops - 1) && (next->op >= OInt && next->op <= ODecr) ) {
+				next++;
+			}
+			if( (next->op == OField && next->p2 == o->p1) || (next->op == OSetField && next->p1 == o->p1) ) {
+				int fid = next->op == OField ? next->p3 : next->p2;
+				hl_obj_field *f = NULL;
+				if( dst->t->kind == HOBJ || dst->t->kind == HSTRUCT )
+					f = hl_obj_field_fetch(dst->t, fid);
+				else if( dst->t->kind == HVIRTUAL )
+					f = dst->t->virt->fields + fid;
+				if( f == NULL ) jit_assert();
+				null_field_access = true;
+				hashed_name = f->hashed_name;
+			} else if( (next->op >= OCall1 && next->op <= OCallN) && next->p3 == o->p1 && next->p2 >= 0 && m->functions_indexes[next->p2] < m->code->nfunctions ) {
+				// the guard keeps the index inside code->functions: a negative or out-of-range index has no hl_function to take a name from
+				hl_function *cf = m->code->functions + m->functions_indexes[next->p2];
+				const uchar *name = fun_field_name(cf);
+				null_field_access = true;
+				hashed_name = hl_hash_gen(name, true);
+			}
+			// -----------------------------------------
+			if( null_field_access ) {
+				einstr *e = emit_instr(ctx, PUSH_CONST);
+				e->mode = M_PTR;
+				e->value = hashed_name;
+			}
+			emit_native_call(ctx, null_field_access ? (void*)hl_jit_null_field_access : (void*)hl_null_access, NULL, 0, NULL);
+			patch_jump(ctx, jok);
+		}
+		break;
+	case OSafeCast:
+		emit_dyn_cast(ctx, LOAD(ra), ra->t, dst);
+		break;
+	case ODynGet:
+		{
+			bool need_type = dyn_need_type(dst->t);
+			ereg args[3];
+			args[0] = LOAD(ra);
+			args[1] = LOAD_CONST(hl_hash_utf8(m->code->strings[o->p3]),&hlt_i32);
+			if( need_type ) args[2] = LOAD_CONST_PTR(dst->t);
+			STORE(dst, emit_native_call(ctx, get_dynget(dst->t), args, need_type ? 3 : 2, dst->t));
+		}
+		break;
+	case ODynSet: // dst.<strings[p2]> = rb through the dynamic setter matching rb's type
+		{
+			bool need_type = dyn_need_type(rb->t); // must follow the same type as get_dynset() below so the argument count matches the setter
+			ereg args[4];
+			args[0] = LOAD(dst);
+			args[1] = LOAD_CONST(hl_hash_utf8(m->code->strings[o->p2]),&hlt_i32);
+			if( need_type ) {
+				args[2] = LOAD_CONST_PTR(rb->t);
+				args[3] = LOAD(rb);
+			} else
+				args[2] = LOAD(rb);
+			emit_native_call(ctx, get_dynset(rb->t), args, need_type ? 4 : 3, &hlt_void);
+		}
+		break;
+	case OTrap:
+		{
+			ereg st = emit_gen_size(ctx, ALLOC_STACK, sizeof(hl_trap_ctx));
+
+			ereg thread, current_addr;
+			static hl_thread_info *tinf = NULL;
+			static hl_trap_ctx *trap = NULL;
+#			ifndef HL_THREADS
+			if( tinf == NULL ) tinf = hl_get_thread();
+			current_addr = LOAD_CONST_PTR(&tinf->trap_current);
+#			else
+			thread = emit_native_call(ctx, hl_get_thread, NULL, 0, &hlt_bytes);
+			current_addr = OFFSET(thread, UNUSED, 0, (int)(int_val)&tinf->trap_current);
+#			endif
+			STORE_MEM(st, (int)(int_val)&trap->prev, LOAD_MEM_PTR(current_addr,0));
+			STORE_MEM(current_addr, 0, st);
+
+
+			/*
+				trap E,@catch
+				catch g
+				catch g2
+				...
+				@:catch
+
+				// Before haxe 5
+				This is a bit hackshish : we want to detect the type of exception filtered by the catch so we check the following
+				sequence of HL opcodes:
+
+				trap E,@catch
+				...
+				@catch:
+				global R, _
+				call _, ???(R,E)
+
+				??? is expected to be hl.BaseType.check
+			*/
+			hl_function *f = ctx->fun;
+			hl_opcode *cat = f->ops + ctx->op_pos + 1;
+			hl_opcode *next = f->ops + ctx->op_pos + 1 + o->p2;
+			hl_opcode *next2 = f->ops + ctx->op_pos + 2 + o->p2;
+			void *addr = NULL;
+			int offs = 0;
+			if( cat->op == OCatch || (next->op == OGetGlobal && next2->op == OCall2 && next2->p3 == next->p1 && dst->id == (int)(int_val)next2->extra) ) {
+				int gindex = cat->op == OCatch ? cat->p1 : next->p2;
+				hl_type *gt = m->code->globals[gindex];
+				while( gt->kind == HOBJ && gt->obj->super ) gt = gt->obj->super;
+				if( gt->kind == HOBJ && gt->obj->nfields && gt->obj->fields[0].t->kind == HTYPE ) {
+					addr = m->globals_data;
+					offs = m->globals_indexes[gindex];
+				}
+			}
+			STORE_MEM(st, (int)(int_val)&trap->tcheck, addr ? LOAD_MEM_PTR(LOAD_CONST_PTR(addr),offs) : LOAD_CONST_PTR(NULL));
+
+			void *fun = setjmp;
+			ereg args[2];
+			int nargs = 1;
+			args[0] = st;
+#if defined(HL_WIN) && defined(HL_64)
+			// On Win64 setjmp actually takes two arguments
+			// the jump buffer and the frame pointer (or the stack pointer if there is no FP)
+			nargs = 2;
+			args[1] = emit_gen(ctx,LEA,MK_STACK_REG(0),UNUSED,M_PTR);
+#endif
+#ifdef HL_MINGW
+			fun = _setjmp;
+#endif
+			ereg ret = emit_native_call(ctx, fun, args, nargs, &hlt_i32);
+			emit_test(ctx, ret, OJNull);
+			int jskip = emit_jump(ctx, true);
+			STORE(dst, tinf ? LOAD_CONST_PTR(&tinf->exc_value) : LOAD_MEM_PTR(thread,(int)(int_val)&tinf->exc_value));
+
+			int jtrap = ctx->emit_pos;
+			emit_gen(ctx, JUMP, UNUSED, UNUSED, 0);
+			register_jump(ctx, jtrap, o->p2);
+			split_block(ctx);
+			patch_jump(ctx, jskip);
+
+			if( ctx->trap_count == MAX_TRAPS ) jit_error("Too many try/catch depth");
+			trap_inf *inf = &ctx->traps[ctx->trap_count++];
+			inf->stack = st;
+			inf->target = o->p2 + 1 + ctx->op_pos;
+		}
+		break;
+	case OEndTrap:
+		{
+			if( ctx->trap_count == 0 ) jit_assert();
+			ereg st = ctx->traps[ctx->trap_count - 1].stack;
+
+			ereg thread, current_addr;
+			static hl_thread_info *tinf = NULL;
+			static hl_trap_ctx *trap = NULL;
+#			ifndef HL_THREADS
+			if( tinf == NULL ) tinf = hl_get_thread();
+			current_addr = LOAD_CONST_PTR(&tinf->trap_current);
+#			else
+			thread = emit_native_call(ctx, hl_get_thread, NULL, 0, &hlt_bytes);
+			current_addr = OFFSET(thread, UNUSED, 0, (int)(int_val)&tinf->trap_current);
+#			endif
+
+			STORE_MEM(current_addr, 0, LOAD_MEM_PTR(st,(int)(int_val)&trap->prev));
+
+			emit_instr(ctx, CATCH);
+		}
+		break;
+	case OSwitch:
+		{
+			ereg v = LOAD(dst);
+			int count = o->p2;
+			emit_cmp(ctx,v,LOAD_CONST(count,&hlt_i32),OJUGte);
+			add_jump_target(ctx, 0);
+			int jdefault = emit_jump(ctx, true);
+			int pos = ctx->emit_pos;
+			einstr *e = emit_instr(ctx, JUMP_TABLE);
+			e->a = v;
+			patch_instr_mode(ctx, M_NORET);
+			store_args(ctx,e,(ereg*)o->extra,count);
+			register_jump(ctx, pos, 0);
+			for(int k=0;k<count;k++) {
+				int offs = o->extra[k];
+				if( offs < 0 ) jit_assert();
+				if( offs == 0 ) continue;
+				add_jump_target(ctx, offs);
+			}
+			patch_jump(ctx, jdefault);
+		}
+		break;
+	case OGetTID:
+		STORE(dst, LOAD_MEM(LOAD(ra),0,&hlt_i32));
+		break;
+	case OAssert:
+		emit_native_call(ctx, hl_jit_assert, NULL, 0, NULL);
+		break;
+	case ONop:
+		break;
+	case OPrefetch:
+		{
+			ereg r = LOAD(dst);
+			if( o->p2 > 0 ) {
+				switch( dst->t->kind ) {
+				case HOBJ:
+				case HSTRUCT:
+					{
+						hl_runtime_obj *rt = hl_get_obj_rt(dst->t);
+						r = OFFSET(r, UNUSED, 0, rt->fields_indexes[o->p2-1]);
+					}
+					break;
+				default:
+					jit_assert();
+					break;
+				}
+			}
+			emit_gen_ext(ctx, PREFETCH, r, UNUSED, M_NONE, o->p3);
+		}
+		break;
+	case OAsm:
+		jit_assert();
+		break;
+	case OCatch:
+		// Only used by OTrap typing
+		break;
+	default:
+		jit_error(hl_op_name(o->op));
+		break;
+	}
+}
diff --git a/src/jit_old.c b/src/jit_old.c
new file mode 100644
index 000000000..7e4e6e88b
--- /dev/null
+++ b/src/jit_old.c
@@ -0,0 +1,4730 @@
+/*
+ * Copyright (C)2015-2016 Haxe Foundation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+#ifdef _MSC_VER
+#pragma warning(disable:4820)
+#endif
+#include <math.h>
+#include <hlmodule.h>
+#include "hlsystem.h"
+
+#ifdef __arm__
+#	error "JIT does not support ARM processors, only x86 and x86-64 are supported, please use HashLink/C native compilation instead"
+#endif
+
+#ifdef HL_DEBUG
+#	define JIT_DEBUG
+#endif
+
+typedef enum { // general-purpose register ids; MOD_RM/SIB keep only the low 3 bits of these values
+	Eax = 0,
+	Ecx = 1,
+	Edx = 2,
+	Ebx = 3,
+	Esp = 4,
+	Ebp = 5,
+	Esi = 6,
+	Edi = 7,
+#ifdef HL_64 // R8-R15 only exist in 64-bit builds
+	R8 = 8,
+	R9 = 9,
+	R10	= 10,
+	R11	= 11,
+	R12	= 12,
+	R13	= 13,
+	R14	= 14,
+	R15	= 15,
+#endif
+	_LAST = 0xFF // sentinel value, not a register
+} CpuReg;
+
+typedef enum { // x86 instruction mnemonics used by this JIT; enumerator order defines their numeric ids
+	MOV,
+	LEA,
+	PUSH,
+	ADD,
+	SUB,
+	IMUL,	// only overflow flag changes compared to MUL
+	DIV,
+	IDIV,
+	CDQ,
+	CDQE,
+	POP,
+	RET,
+	CALL,
+	AND,
+	OR,
+	XOR,
+	CMP,
+	TEST,
+	NOP,
+	SHL,
+	SHR,
+	SAR,
+	INC,
+	DEC,
+	JMP,
+	// FPU
+	FSTP,
+	FSTP32,
+	FLD,
+	FLD32,
+	FLDCW,
+	// SSE
+	MOVSD,
+	MOVSS,
+	COMISD,
+	COMISS,
+	ADDSD,
+	SUBSD,
+	MULSD,
+	DIVSD,
+	ADDSS,
+	SUBSS,
+	MULSS,
+	DIVSS,
+	XORPD,
+	CVTSI2SD,
+	CVTSI2SS,
+	CVTSD2SI,
+	CVTSD2SS,
+	CVTSS2SD,
+	CVTSS2SI,
+	STMXCSR,
+	LDMXCSR,
+	// 8-16 bits
+	MOV8,
+	CMP8,
+	TEST8,
+	PUSH8,
+	MOV16,
+	CMP16,
+	TEST16,
+	// prefetches
+	PREFETCHT0,
+	PREFETCHT1,
+	PREFETCHT2,
+	PREFETCHNTA,
+	PREFETCHW,
+	// --
+	_CPU_LAST // number of opcodes above, not an instruction
+} CpuOp;
+
+#define JAlways		0 // unconditional: AddJump emits 0xE9, AddJump_small emits 0xEB
+#define JOverflow	0x80 // values below are the 2nd byte of the near form (0x0F,how); the short form is how - 0x10
+#define JULt		0x82
+#define JUGte		0x83
+#define JEq			0x84
+#define JNeq		0x85
+#define JULte		0x86
+#define JUGt		0x87
+#define JParity		0x8A
+#define JNParity	0x8B
+#define JSLt		0x8C
+#define JSGte		0x8D
+#define JSLte		0x8E
+#define JSGt		0x8F
+
+#define JCarry		JULt // was JLt, which is not defined anywhere; 0x82 is the carry-set (unsigned below) condition
+#define JZero		JEq
+#define JNotZero	JNeq
+
+#define B(bv)	*ctx->buf.b++ = (unsigned char)(bv)
+#define W(wv)	*ctx->buf.w++ = wv
+
+#ifdef HL_64
+#	define W64(wv)	*ctx->buf.w64++ = wv
+#else
+#	define W64(wv)	W(wv)
+#endif
+
+static const int SIB_MULT[] = {-1, 0, 1, -1, 2, -1, -1, -1, 3};
+
+#define MOD_RM(mod,reg,rm)		B(((mod) << 6) | (((reg)&7) << 3) | ((rm)&7))
+#define SIB(mult,rmult,rbase)	B((SIB_MULT[mult]<<6) | (((rmult)&7)<<3) | ((rbase)&7))
+#define IS_SBYTE(c)				( (c) >= -128 && (c) < 128 )
+
+#define AddJump(how,local)		{ if( (how) == JAlways ) { B(0xE9); } else { B(0x0F); B(how); }; local = BUF_POS(); W(0); }
+#define AddJump_small(how,local) { if( (how) == JAlways ) { B(0xEB); } else B(how - 0x10); local = BUF_POS() | 0x40000000; B(0); }
+#define XJump(how,local)		AddJump(how,local)
+#define XJump_small(how,local)		AddJump_small(how,local)
+
+#define MAX_OP_SIZE				256
+
+#define BUF_POS()				((int)(ctx->buf.b - ctx->startBuf))
+#define RTYPE(r)				r->t->kind
+
+#ifdef HL_64
+#	define RESERVE_ADDRESS	0x8000000000000000
+#else
+#	define RESERVE_ADDRESS	0x80000000
+#endif
+
+#if defined(HL_WIN_CALL) && defined(HL_64)
+#	define IS_WINCALL64 1
+#else
+#	define IS_WINCALL64 0
+#endif
+
+typedef struct jlist jlist;
+struct jlist {
+	int pos;
+	int target;
+	jlist *next;
+};
+
+typedef struct vreg vreg;
+
+typedef enum { // kind tag stored in preg.kind
+	RCPU = 0,
+	RFPU = 1,
+	RSTACK = 2,
+	RCONST = 3,
+	RADDR = 4,
+	RMEM = 5,
+	RUNUSED = 6, // kind of the shared `_unused` placeholder
+	RCPU_CALL = 1 | 8, // NOTE(review): low bits equal RFPU (1), not RCPU (0) - confirm how users of this value mask it
+	RCPU_8BITS = 1 | 16 // NOTE(review): same remark as RCPU_CALL
+} preg_kind;
+
+typedef struct {
+	preg_kind kind;
+	int id;
+	int lock;
+	vreg *holds;
+} preg;
+
+struct vreg {
+	int stackPos;
+	int size;
+	hl_type *t;
+	preg *current;
+	preg stack;
+};
+
+// pointer to the physical register descriptor at index i of ctx->pregs
+#define REG_AT(i)		(ctx->pregs + (i))
+
+// Per-target register configuration:
+// - RCPU_COUNT / RFPU_COUNT : number of CPU / XMM register descriptors
+// - CALL_NREGS / CALL_REGS : CPU registers used for passing arguments, in argument order
+// - RCPU_SCRATCH_REGS : CPU registers that alloc_reg() may hand out and that discard_regs() unbinds
+// - RFPU_SCRATCH_COUNT : alloc_reg() only hands out the first RFPU_SCRATCH_COUNT XMM registers
+#ifdef HL_64
+#	define RCPU_COUNT	16
+#	define RFPU_COUNT	16
+#	ifdef HL_WIN_CALL
+#		define CALL_NREGS			4
+#		define RCPU_SCRATCH_COUNT	7
+#		define RFPU_SCRATCH_COUNT	6
+static const int RCPU_SCRATCH_REGS[] = { Eax, Ecx, Edx, R8, R9, R10, R11 };
+static const CpuReg CALL_REGS[] = { Ecx, Edx, R8, R9 };
+#	else
+#		define CALL_NREGS			6 // TODO : XMM6+XMM7 are FPU reg parameters
+#		define RCPU_SCRATCH_COUNT	9
+#		define RFPU_SCRATCH_COUNT	16
+static const int RCPU_SCRATCH_REGS[] = { Eax, Ecx, Edx, Esi, Edi, R8, R9, R10, R11 };
+static const CpuReg CALL_REGS[] = { Edi, Esi, Edx, Ecx, R8, R9 };
+#	endif
+#else
+#	define CALL_NREGS	0
+#	define RCPU_COUNT	8
+#	define RFPU_COUNT	8
+#	define RCPU_SCRATCH_COUNT	3
+#	define RFPU_SCRATCH_COUNT	8
+static const int RCPU_SCRATCH_REGS[] = { Eax, Ecx, Edx };
+#endif
+
+// XMM register descriptors are stored right after the CPU ones in ctx->pregs
+#define XMM(i)			((i) + RCPU_COUNT)
+#define PXMM(i)			REG_AT(XMM(i))
+#define REG_IS_FPU(i)	((i) >= RCPU_COUNT)
+
+// shortcuts for frequently used CPU register descriptors
+#define PEAX			REG_AT(Eax)
+#define PESP			REG_AT(Esp)
+#define PEBP			REG_AT(Ebp)
+
+// total number of physical register descriptors
+#define REG_COUNT	(RCPU_COUNT + RFPU_COUNT)
+
+// packs two operand kinds into one value, used as `case` label when dispatching on (kind,kind)
+#define ID2(a,b)	((a) | ((b)<<8))
+// vreg with the given index in the current function
+#define R(id)		(ctx->vregs + (id))
+// prints a diagnostic with the value i and the source line, then terminates through jit_exit()
+#define ASSERT(i)	{ printf("JIT ERROR %d (jit.c line %d)\n",i,(int)__LINE__); jit_exit(); }
+#define IS_FLOAT(r)	((r)->t->kind == HF64 || (r)->t->kind == HF32)
+// RLOCK protects a register from alloc_reg() for the current position, RUNLOCK releases a lock taken at the current position.
+// NOTE(review): both expand to a bare `if` statement, so an `else` written right after them would bind to the macro's `if`.
+#define RLOCK(r)		if( (r)->lock < ctx->currentPos ) (r)->lock = ctx->currentPos
+#define RUNLOCK(r)		if( (r)->lock == ctx->currentPos ) (r)->lock = 0
+
+// emits an INT3 breakpoint instruction
+#define BREAK()		B(0xCC)
+
+// shared operand of kind RUNUSED, passed when an instruction has fewer than two operands
+static preg _unused = { RUNUSED, 0, 0, NULL };
+static preg *UNUSED = &_unused;
+
+// State of the JIT compiler. Only the fields whose role is visible in the surrounding code are described.
+struct _jit_ctx {
+	// write cursor into the code buffer, viewed with the different widths used by B() / W() / W64()
+	union {
+		unsigned char *b;
+		unsigned int *w;
+		unsigned long long *w64;
+		int *i;
+		double *d;
+	} buf;
+	vreg *vregs;					// virtual registers, indexed through R()
+	preg pregs[REG_COUNT];			// physical register descriptors, CPU registers first then XMM (see XMM())
+	vreg *savedRegs[REG_COUNT];		// bindings snapshot taken by save_regs()
+	int savedLocks[REG_COUNT];		// locks snapshot taken by save_regs()
+	int *opsPos;
+	int maxRegs;					// number of vregs reset by restore_regs()
+	int maxOps;
+	int bufSize;					// allocated size in bytes of startBuf (see jit_buf())
+	int totalRegsSize;				// taken into account by pad_before_call() when aligning the stack
+	int functionPos;
+	int allocOffset;				// rotating start index used by alloc_reg()
+	int currentPos;					// compared against preg.lock (see RLOCK / RUNLOCK / alloc_reg())
+	int nativeArgsCount;			// set by begin_native_call(), decremented as arguments are set
+	unsigned char *startBuf;		// start of the code buffer
+	hl_module *m;
+	hl_function *f;
+	jlist *jumps;
+	jlist *calls;
+	jlist *switchs;
+	hl_alloc falloc; // cleared per-function
+	hl_alloc galloc;
+	vclosure *closure_list;
+	hl_debug_infos *debug;			// when set (together with f), op() emits an extra MOV after each CALL
+	int c2hl;
+	int hl2c;
+	void *static_functions[8];
+	bool static_function_offset;
+#ifdef WIN64_UNWIND_TABLES
+	int unwind_offset;
+	int nunwind;
+	PRUNTIME_FUNCTION unwind_table;
+#endif
+};
+
+#ifdef WIN64_UNWIND_TABLES
+
+// Operation codes stored in the low nibble of the second byte written by write_uwcode();
+// the per-value comments describe what the accompanying `info` nibble means.
+typedef enum _UNWIND_OP_CODES
+{
+	UWOP_PUSH_NONVOL = 0, /* info == register number */
+	UWOP_ALLOC_LARGE,	  /* no info, alloc size in next 2 slots */
+	UWOP_ALLOC_SMALL,	  /* info == size of allocation / 8 - 1 */
+	UWOP_SET_FPREG,		  /* no info, FP = RSP + UNWIND_INFO.FPRegOffset*16 */
+	UWOP_SAVE_NONVOL,	  /* info == register number, offset in next slot */
+	UWOP_SAVE_NONVOL_FAR, /* info == register number, offset in next 2 slots */
+	UWOP_SAVE_XMM128 = 8, /* info == XMM reg number, offset in next slot */
+	UWOP_SAVE_XMM128_FAR, /* info == XMM reg number, offset in next 2 slots */
+	UWOP_PUSH_MACHFRAME	  /* info == 0: no error-code, 1: error-code */
+} UNWIND_CODE_OPS;
+
+// Emits one two-byte unwind code: the prolog offset, then the operation code (low nibble)
+// combined with its info value (high nibble).
+void write_uwcode(jit_ctx *ctx, unsigned char offset, UNWIND_CODE_OPS code, unsigned char info)
+{
+	B(offset);
+	B((info) << 4 | (code));
+}
+
+// Emits the unwind information header followed by its two unwind codes.
+// Every generated function uses a frame pointer, hence a single copy of this data fits all of them.
+void write_unwind_data(jit_ctx *ctx)
+{
+	enum {
+		UW_VERSION = 1,
+		UW_FLAGS = 0,
+		UW_PROLOG_SIZE = 4,
+		UW_CODE_COUNT = 2,
+		UW_FRAME_REG = 5, // RBP
+		UW_FRAME_OFFSET = 0
+	};
+	// header: version | flags << 3, prolog size, number of codes, frame register | frame offset << 4
+	B(UW_VERSION | (UW_FLAGS << 3));
+	B(UW_PROLOG_SIZE);
+	B(UW_CODE_COUNT);
+	B(UW_FRAME_REG | (UW_FRAME_OFFSET << 4));
+	// codes are listed from the highest prolog offset to the lowest
+	write_uwcode(ctx, 4, UWOP_SET_FPREG, 0);
+	write_uwcode(ctx, 1, UWOP_PUSH_NONVOL, UW_FRAME_REG);
+}
+#endif
+
+// triggers a debugger break then terminates the process with exit code -1
+#define jit_exit() { hl_debug_break(); exit(-1); }
+// reports an error through _jit_error() with the current source line; requires `ctx` in scope
+#define jit_error(msg)	_jit_error(ctx,msg,__LINE__)
+
+// error_i64() is only available without HL_64: it reports the use of 64-bit integers.
+// With HL_DEBUG it goes through jit_error(), otherwise it prints a user-oriented message and exits.
+#ifndef HL_64
+#	ifdef HL_DEBUG
+#		define error_i64() jit_error("i64-32")
+#	else
+void error_i64() {
+	printf("The module you are loading is using 64 bit ints that are not supported by the HL32.\nPlease run using HL64 or compile with -D hl-legacy32");
+	jit_exit();
+}
+#	endif
+#endif
+
+// forward declarations of the error handlers defined elsewhere in the file
+static void _jit_error( jit_ctx *ctx, const char *msg, int line );
+static void on_jit_error( const char *msg, int_val line );
+
+// Fills `r` as the memory operand [reg + offset] and returns it.
+// id layout (decoded by op()): bits 0-3 = index multiplier (0: no index register),
+// bits 4-7 = base register, bits 8 and above = displacement.
+static preg *pmem( preg *r, CpuReg reg, int offset ) {
+	const int no_index = 0;
+	r->id = no_index | (reg << 4) | (offset << 8);
+	r->kind = RMEM;
+	return r;
+}
+
+// Fills `r` as the memory operand [reg + reg2 * mult + offset] and returns it.
+// id layout (decoded by op()): bits 0-3 = multiplier, bits 4-7 = base register, bits 8 and above = index register.
+// The displacement does not fit in id anymore, so it is stored in the `holds` field.
+static preg *pmem2( preg *r, CpuReg reg, CpuReg reg2, int mult, int offset ) {
+	r->holds = (void*)(int_val)offset;
+	r->id = mult | (reg << 4) | (reg2 << 8);
+	r->kind = RMEM;
+	return r;
+}
+
+#ifdef HL_64
+// Fills `r` as a memory operand designating the code buffer position `offset` and returns it.
+// The multiplier value 15 is the marker recognized by op(), which then emits a 32-bit
+// displacement computed relative to the current code position.
+static preg *pcodeaddr( preg *r, int offset ) {
+	r->kind = RMEM;
+	r->id = 15 | (offset << 4);
+	return r;
+}
+#endif
+
+// Fills `r` as the 32-bit immediate operand `c` and returns it.
+// `holds` is cleared because op() treats a non-NULL `holds` as a 64-bit immediate value.
+static preg *pconst( preg *r, int c ) {
+	r->id = c;
+	r->holds = NULL;
+	r->kind = RCONST;
+	return r;
+}
+
+// Fills `r` as an immediate operand holding the word-sized value `c` and returns it.
+// With HL_64, a value that does not fit in an int is stored in `holds` (id only receives a
+// marker value); every other case is delegated to pconst().
+static preg *pconst64( preg *r, int_val c ) {
+#ifdef HL_64
+	if( ((int)c) == c )
+		return pconst(r,(int)c);
+	r->kind = RCONST;
+	r->id = 0xC064C064;
+	r->holds = (vreg*)c;
+	return r;
+#else
+	return pconst(r,(int)c);
+#endif
+}
+
+#ifndef HL_64
+// Fills `r` as an absolute address operand (the address is kept in `holds`) and returns it.
+// Only compiled without HL_64: it is not possible to access direct 64 bit address in x86-64
+static preg *paddr( preg *r, void *p ) {
+	r->kind = RADDR;
+	r->holds = (vreg*)p;
+	return r;
+}
+#endif
+
+// Takes a snapshot of which vreg each physical register holds, and of its lock,
+// into ctx->savedRegs / ctx->savedLocks. restore_regs() puts that state back.
+static void save_regs( jit_ctx *ctx ) {
+	int k = REG_COUNT;
+	while( k-- > 0 ) {
+		preg *p = REG_AT(k);
+		ctx->savedLocks[k] = p->lock;
+		ctx->savedRegs[k] = p->holds;
+	}
+}
+
+// Puts back the register bindings and locks recorded by save_regs().
+// All vregs are first unbound, then each physical register is re-attached to the vreg it was holding.
+static void restore_regs( jit_ctx *ctx ) {
+	int k;
+	for(k=0;k<ctx->maxRegs;k++) {
+		vreg *v = R(k);
+		v->current = NULL;
+	}
+	for(k=0;k<REG_COUNT;k++) {
+		preg *p = REG_AT(k);
+		vreg *held = ctx->savedRegs[k];
+		p->lock = ctx->savedLocks[k];
+		p->holds = held;
+		if( held != NULL )
+			held->current = p;
+	}
+}
+
+// Makes sure that at least MAX_OP_SIZE bytes are available past the current write position,
+// growing the code buffer when needed.
+// Growing allocates a new buffer, copies the emitted code and frees the old one: ctx->startBuf
+// and ctx->buf.b can change, so callers must only keep buffer offsets (BUF_POS) across this call.
+// Terminates through ASSERT if the allocation fails.
+static void jit_buf( jit_ctx *ctx ) {
+	if( BUF_POS() > ctx->bufSize - MAX_OP_SIZE ) {
+		// grow by one third: b + b/3 has the same value as b*4/3 for b >= 0, but does not
+		// build the intermediate product that overflows int on large buffers
+		int nsize = ctx->bufSize + ctx->bufSize / 3;
+		unsigned char *nbuf;
+		int curpos;
+		if( nsize == 0 ) {
+			// first allocation: estimate 4 bytes per opcode of the whole module
+			int i;
+			for(i=0;i<ctx->m->code->nfunctions;i++)
+				nsize += ctx->m->code->functions[i].nops;
+			nsize *= 4;
+		}
+		// always gain at least a few operations worth of room
+		if( nsize < ctx->bufSize + MAX_OP_SIZE * 4 ) nsize = ctx->bufSize + MAX_OP_SIZE * 4;
+		curpos = BUF_POS();
+		nbuf = (unsigned char*)malloc(nsize);
+		if( nbuf == NULL ) ASSERT(nsize);
+		if( ctx->startBuf ) {
+			memcpy(nbuf,ctx->startBuf,curpos);
+			free(ctx->startBuf);
+		}
+		ctx->startBuf = nbuf;
+		ctx->buf.b = nbuf + curpos;
+		ctx->bufSize = nsize;
+	}
+}
+
+// printable names of the operand kinds RCPU..RUNUSED, indexed by preg_kind value
+static const char *KNAMES[] = { "cpu","fpu","stack","const","addr","mem","unused" };
+// When `c` is true, prints the instruction name and both operand kinds, then aborts through ASSERT.
+// Expects `f` (opform*), `a` and `b` (preg*) to be in scope at the expansion site, as in op().
+#define ERRIF(c)	if( c ) { printf("%s(%s,%s)\n",f?f->name:"???",KNAMES[a->kind], KNAMES[b->kind]); ASSERT(0); }
+
+// Encodings available for one instruction, one field per operand combination; 0 means the
+// combination is not available. Values are opcodes optionally combined with RM()/LONG_RM()
+// and the FLAG_* bits. In the trailing comments, the first column is the two-operand form and
+// the second column the meaning of the same field for single operand instructions.
+typedef struct {
+	const char *name;						// single operand
+	int r_mem;		// r32 / r/m32				r32
+	int mem_r;		// r/m32 / r32				r/m32
+	int r_const;	// r32 / imm32				imm32
+	int r_i8;		// r32 / imm8				imm8
+	int mem_const;	// r/m32 / imm32			N/A
+} opform;
+
+// Flags stored in the high bits of an opform entry:
+// - FLAG_LONGOP : two-byte opcode, OP() emits the high byte before the low one
+// - FLAG_16B    : OP() emits the 0x66 prefix first
+// - FLAG_8B     : the immediate is emitted as a single byte (see the RCONST,RUNUSED case of op())
+// - FLAG_DUAL   : the register is encoded in both the reg and rm fields of ModR/M (see the RCPU,RCONST case of op())
+#define FLAG_LONGOP	0x80000000
+#define FLAG_16B	0x40000000
+#define FLAG_8B		0x20000000
+#define FLAG_DUAL   0x10000000
+
+// RM(op,id) attaches the ModR/M reg field value `id` to an opcode, stored as id+1 in bits 8-11
+// so that 0 means "none". GET_RM reads it back (still as id+1); for entries having FLAG_LONGOP,
+// which are negative as int, it is stored in bits 24-27 instead (see LONG_RM).
+#define RM(op,id) ((op) | (((id)+1)<<8))
+#define GET_RM(op)	(((op) >> ((op) < 0 ? 24 : 8)) & 15)
+#define SBYTE(op) ((op) << 16)
+#define LONG_OP(op)	((op) | FLAG_LONGOP)
+#define OP16(op)	LONG_OP((op) | FLAG_16B)
+#define LONG_RM(op,id)	LONG_OP(op | (((id) + 1) << 24))
+
+// Encoding table indexed by CpuOp (op() reads OP_FORMS[o]); each row lists
+// name, r_mem, mem_r, r_const, r_i8, mem_const — omitted trailing fields are 0 (not available).
+// NOTE(review): the rows must stay in the same order as the CpuOp enum, which is declared outside this part of the file.
+static opform OP_FORMS[_CPU_LAST] = {
+	{ "MOV", 0x8B, 0x89, 0xB8, 0, RM(0xC7,0) },
+	{ "LEA", 0x8D },
+	{ "PUSH", 0x50, RM(0xFF,6), 0x68, 0x6A },
+	{ "ADD", 0x03, 0x01, RM(0x81,0), RM(0x83,0) },
+	{ "SUB", 0x2B, 0x29, RM(0x81,5), RM(0x83,5) },
+	{ "IMUL", LONG_OP(0x0FAF), 0, 0x69 | FLAG_DUAL, 0x6B | FLAG_DUAL },
+	{ "DIV", RM(0xF7,6), RM(0xF7,6) },
+	{ "IDIV", RM(0xF7,7), RM(0xF7,7) },
+	{ "CDQ", 0x99 },
+	{ "CDQE", 0x98 },
+	{ "POP", 0x58, RM(0x8F,0) },
+	{ "RET", 0xC3 },
+	{ "CALL", RM(0xFF,2), RM(0xFF,2), 0xE8 },
+	{ "AND", 0x23, 0x21, RM(0x81,4), RM(0x83,4) },
+	{ "OR", 0x0B, 0x09, RM(0x81,1), RM(0x83,1) },
+	{ "XOR", 0x33, 0x31, RM(0x81,6), RM(0x83,6) },
+	{ "CMP", 0x3B, 0x39, RM(0x81,7), RM(0x83,7) },
+	{ "TEST", 0x85, 0x85/*SWP?*/, RM(0xF7,0) },
+	{ "NOP", 0x90 },
+	{ "SHL", RM(0xD3,4), 0, 0, RM(0xC1,4) },
+	{ "SHR", RM(0xD3,5), 0, 0, RM(0xC1,5) },
+	{ "SAR", RM(0xD3,7), 0, 0, RM(0xC1,7) },
+	{ "INC", IS_64 ? RM(0xFF,0) : 0x40, RM(0xFF,0) },
+	{ "DEC", IS_64 ? RM(0xFF,1) : 0x48, RM(0xFF,1) },
+	{ "JMP", RM(0xFF,4) },
+	// FPU
+	{ "FSTP", 0, RM(0xDD,3) },
+	{ "FSTP32", 0, RM(0xD9,3) },
+	{ "FLD", 0, RM(0xDD,0) },
+	{ "FLD32", 0, RM(0xD9,0) },
+	{ "FLDCW", 0, RM(0xD9, 5) },
+	// SSE
+	{ "MOVSD", 0xF20F10, 0xF20F11  },
+	{ "MOVSS", 0xF30F10, 0xF30F11  },
+	{ "COMISD", 0x660F2F },
+	{ "COMISS", LONG_OP(0x0F2F) },
+	{ "ADDSD", 0xF20F58 },
+	{ "SUBSD", 0xF20F5C },
+	{ "MULSD", 0xF20F59 },
+	{ "DIVSD", 0xF20F5E },
+	{ "ADDSS", 0xF30F58 },
+	{ "SUBSS", 0xF30F5C },
+	{ "MULSS", 0xF30F59 },
+	{ "DIVSS", 0xF30F5E },
+	{ "XORPD", 0x660F57 },
+	{ "CVTSI2SD", 0xF20F2A },
+	{ "CVTSI2SS", 0xF30F2A },
+	{ "CVTSD2SI", 0xF20F2D },
+	{ "CVTSD2SS", 0xF20F5A },
+	{ "CVTSS2SD", 0xF30F5A },
+	{ "CVTSS2SI", 0xF30F2D },
+	{ "STMXCSR", 0, LONG_RM(0x0FAE,3) },
+	{ "LDMXCSR", 0, LONG_RM(0x0FAE,2) },
+	// 8 bits,
+	{ "MOV8", 0x8A, 0x88, 0, 0xB0, RM(0xC6,0) },
+	{ "CMP8", 0x3A, 0x38, 0, RM(0x80,7) },
+	{ "TEST8", 0x84, 0x84, RM(0xF6,0) },
+	{ "PUSH8", 0, 0, 0x6A | FLAG_8B },
+	{ "MOV16", OP16(0x8B), OP16(0x89), OP16(0xB8) },
+	{ "CMP16", OP16(0x3B), OP16(0x39) },
+	{ "TEST16", OP16(0x85) },
+	// prefetchs
+	{ "PREFETCHT0", 0, LONG_RM(0x0F18,1) },
+	{ "PREFETCHT1", 0, LONG_RM(0x0F18,2) },
+	{ "PREFETCHT2", 0, LONG_RM(0x0F18,3) },
+	{ "PREFETCHNTA", 0, LONG_RM(0x0F18,0) },
+	{ "PREFETCHW", 0, LONG_RM(0x0F0D,1) },
+};
+
+// REX() emits the REX prefix (0x40 | r64) when any of its bits is required; it expects the
+// local `r64` to be in scope and expands to nothing without HL_64.
+#ifdef HL_64
+#	define REX()	if( r64 ) B(r64 | 0x40)
+#else
+#	define REX()
+#endif
+
+// OP(b) emits the prefix and opcode bytes of the opform entry `b` (the ModR/M byte and the
+// immediates are emitted by the caller):
+// - when bits 16-23 are set (three-byte entries such as 0xF20F10) : mandatory prefix byte,
+//   optional REX, then the two opcode bytes
+// - otherwise : 0x66 prefix if FLAG_16B, optional REX, the high opcode byte if FLAG_LONGOP
+//   (and not FLAG_16B), then the low opcode byte
+#define	OP(b)	\
+	if( (b) & 0xFF0000 ) { \
+		B((b)>>16); \
+		if( r64 ) B(r64 | 0x40); /* also in 32 bits mode */ \
+		B((b)>>8); \
+		B(b); \
+	} else { \
+		if( (b) & FLAG_16B ) { \
+			B(0x66); \
+			REX(); \
+		} else {\
+			REX(); \
+			if( (b) & FLAG_LONGOP ) B((b)>>8); \
+		}\
+		B(b); \
+	}
+
+// Tells if the operand is accepted by the 8-bit instructions (MOV8, CMP8, TEST8):
+// stack, memory and constant operands always are, CPU registers are except Esi and Edi.
+static bool is_reg8( preg *a ) {
+	switch( a->kind ) {
+	case RSTACK:
+	case RMEM:
+	case RCONST:
+		return true;
+	case RCPU:
+		return a->id != Esi && a->id != Edi;
+	default:
+		return false;
+	}
+}
+
+// Emits the machine code of instruction `o` applied to operands `a` and `b` (UNUSED for a missing operand).
+// The encoding is taken from OP_FORMS[o] and selected by the pair of operand kinds; a combination
+// that the instruction does not support prints a diagnostic and aborts (ERRIF).
+// `mode64` requests the 64-bit operand size (REX bit 8), except for PUSH, POP, CALL, PUSH8 and the prefetch ops.
+// `r64` accumulates the low bits of the REX prefix: 8 = 64-bit operand, 4 = extended ModR/M reg field,
+// 2 = extended SIB index, 1 = extended ModR/M rm / SIB base.
+// When debug infos are enabled and a function is being compiled, every CALL is followed by a MOV
+// that overwrites the word just below ESP with EBP.
+static void op( jit_ctx *ctx, CpuOp o, preg *a, preg *b, bool mode64 ) {
+	opform *f = &OP_FORMS[o];
+	int r64 = mode64 && (o != PUSH && o != POP && o != CALL && o != PUSH8 && o < PREFETCHT0) ? 8 : 0;
+	// 8-bit instructions only accept operands validated by is_reg8()
+	switch( o ) {
+	case CMP8:
+	case TEST8:
+	case MOV8:
+		if( !is_reg8(a) || !is_reg8(b) )
+			ASSERT(0);
+		break;
+	default:
+		break;
+	}
+	switch( ID2(a->kind,b->kind) ) {
+	// no operand
+	case ID2(RUNUSED,RUNUSED):
+		ERRIF(f->r_mem == 0);
+		OP(f->r_mem);
+		break;
+	// register, register
+	case ID2(RCPU,RCPU):
+	case ID2(RFPU,RFPU):
+		ERRIF( f->r_mem == 0 );
+		if( a->id > 7 ) r64 |= 4;
+		if( b->id > 7 ) r64 |= 1;
+		OP(f->r_mem);
+		MOD_RM(3,a->id,b->id);
+		break;
+	// mixed CPU/XMM registers : only for three-byte (SSE) entries
+	case ID2(RCPU,RFPU):
+	case ID2(RFPU,RCPU):
+		ERRIF( (f->r_mem>>16) == 0 );
+		if( a->id > 7 ) r64 |= 4;
+		if( b->id > 7 ) r64 |= 1;
+		OP(f->r_mem);
+		MOD_RM(3,a->id,b->id);
+		break;
+	// single register : either through ModR/M, or with the register number added to the opcode
+	case ID2(RCPU,RUNUSED):
+		ERRIF( f->r_mem == 0 );
+		if( a->id > 7 ) r64 |= 1;
+		if( GET_RM(f->r_mem) > 0 ) {
+			OP(f->r_mem);
+			MOD_RM(3, GET_RM(f->r_mem)-1, a->id);
+		} else
+			OP(f->r_mem + (a->id&7));
+		break;
+	// single stack slot : [ebp + stackPos] with an 8-bit or 32-bit displacement
+	case ID2(RSTACK,RUNUSED):
+		ERRIF( f->mem_r == 0 || GET_RM(f->mem_r) == 0 );
+		{
+			int stackPos = R(a->id)->stackPos;
+			OP(f->mem_r);
+			if( IS_SBYTE(stackPos) ) {
+				MOD_RM(1,GET_RM(f->mem_r)-1,Ebp);
+				B(stackPos);
+			} else {
+				MOD_RM(2,GET_RM(f->mem_r)-1,Ebp);
+				W(stackPos);
+			}
+		}
+		break;
+	// register, immediate
+	case ID2(RCPU,RCONST):
+		ERRIF( f->r_const == 0 && f->r_i8 == 0 );
+		if( a->id > 7 ) r64 |= 1;
+		{
+			// a non-NULL holds carries a 64-bit immediate (see pconst64)
+			int_val cval = b->holds ? (int_val)b->holds : b->id;
+			// short byte form
+			if( f->r_i8 && IS_SBYTE(cval) ) {
+				if( (f->r_i8&FLAG_DUAL) && a->id > 7 ) r64 |= 4;
+				OP(f->r_i8);
+				if( (f->r_i8&FLAG_DUAL) ) MOD_RM(3,a->id,a->id); else MOD_RM(3,GET_RM(f->r_i8)-1,a->id);
+				B((int)cval);
+			} else if( GET_RM(f->r_const) > 0 || (f->r_const&FLAG_DUAL) ) {
+				if( (f->r_i8&FLAG_DUAL) && a->id > 7 ) r64 |= 4;
+				OP(f->r_const&0xFF);
+				if( (f->r_i8&FLAG_DUAL) ) MOD_RM(3,a->id,a->id); else MOD_RM(3,GET_RM(f->r_const)-1,a->id);
+				if( mode64 && IS_64 && o == MOV ) W64(cval); else W((int)cval);
+			} else {
+				// register number added to the opcode; only a 64-bit MOV takes a 64-bit immediate
+				ERRIF( f->r_const == 0);
+				OP((f->r_const&0xFF) + (a->id&7));
+				if( mode64 && IS_64 && o == MOV ) W64(cval); else W((int)cval);
+			}
+		}
+		break;
+	// stack slot <- register
+	case ID2(RSTACK,RCPU):
+	case ID2(RSTACK,RFPU):
+		ERRIF( f->mem_r == 0 );
+		if( b->id > 7 ) r64 |= 4;
+		{
+			int stackPos = R(a->id)->stackPos;
+			OP(f->mem_r);
+			if( IS_SBYTE(stackPos) ) {
+				MOD_RM(1,b->id,Ebp);
+				B(stackPos);
+			} else {
+				MOD_RM(2,b->id,Ebp);
+				W(stackPos);
+			}
+		}
+		break;
+	// register <- stack slot
+	case ID2(RCPU,RSTACK):
+	case ID2(RFPU,RSTACK):
+		ERRIF( f->r_mem == 0 );
+		if( a->id > 7 ) r64 |= 4;
+		{
+			int stackPos = R(b->id)->stackPos;
+			OP(f->r_mem);
+			if( IS_SBYTE(stackPos) ) {
+				MOD_RM(1,a->id,Ebp);
+				B(stackPos);
+			} else {
+				MOD_RM(2,a->id,Ebp);
+				W(stackPos);
+			}
+		}
+		break;
+	// single immediate
+	case ID2(RCONST,RUNUSED):
+		ERRIF( f->r_const == 0 );
+		{
+			int_val cval = a->holds ? (int_val)a->holds : a->id;
+			OP(f->r_const);
+			if( f->r_const & FLAG_8B ) B((int)cval); else W((int)cval);
+		}
+		break;
+	// single memory operand : only [reg + offset] is supported
+	case ID2(RMEM,RUNUSED):
+		ERRIF( f->mem_r == 0 );
+		{
+			// decode the operand packed by pmem / pmem2 / pcodeaddr
+			int mult = a->id & 0xF;
+			int regOrOffs = mult == 15 ? a->id >> 4 : a->id >> 8;
+			CpuReg reg = (a->id >> 4) & 0xF;
+			if( mult == 15 ) {
+				ERRIF(1);
+			} else if( mult == 0 ) {
+				if( reg > 7 ) r64 |= 1;
+				OP(f->mem_r);
+				// a base whose low bits equal Ebp always gets a displacement, a base whose low bits equal Esp always gets a SIB byte (0x24)
+				if( regOrOffs == 0 && (reg&7) != Ebp ) {
+					MOD_RM(0,GET_RM(f->mem_r)-1,reg);
+					if( (reg&7) == Esp ) B(0x24);
+				} else if( IS_SBYTE(regOrOffs) ) {
+					MOD_RM(1,GET_RM(f->mem_r)-1,reg);
+					if( (reg&7) == Esp ) B(0x24);
+					B(regOrOffs);
+				} else {
+					MOD_RM(2,GET_RM(f->mem_r)-1,reg);
+					if( (reg&7) == Esp ) B(0x24);
+					W(regOrOffs);
+				}
+			} else {
+				// [eax + ebx * M]
+				ERRIF(1);
+			}
+		}
+		break;
+	// register <- memory
+	case ID2(RCPU, RMEM):
+	case ID2(RFPU, RMEM):
+		ERRIF( f->r_mem == 0 );
+		{
+			int mult = b->id & 0xF;
+			int regOrOffs = mult == 15 ? b->id >> 4 : b->id >> 8;
+			CpuReg reg = (b->id >> 4) & 0xF;
+			if( mult == 15 ) {
+				// code position operand (see pcodeaddr), 64-bit only
+				int pos;
+				if( a->id > 7 ) r64 |= 4;
+				OP(f->r_mem);
+				MOD_RM(0,a->id,5);
+				if( IS_64 ) {
+					// offset wrt current code
+					pos = BUF_POS() + 4;
+					W(regOrOffs - pos);
+				} else {
+					ERRIF(1);
+				}
+			} else if( mult == 0 ) {
+				// [reg + offset]
+				if( a->id > 7 ) r64 |= 4;
+				if( reg > 7 ) r64 |= 1;
+				OP(f->r_mem);
+				if( regOrOffs == 0 && (reg&7) != Ebp ) {
+					MOD_RM(0,a->id,reg);
+					if( (reg&7) == Esp ) B(0x24);
+				} else if( IS_SBYTE(regOrOffs) ) {
+					MOD_RM(1,a->id,reg);
+					if( (reg&7) == Esp ) B(0x24);
+					B(regOrOffs);
+				} else {
+					MOD_RM(2,a->id,reg);
+					if( (reg&7) == Esp ) B(0x24);
+					W(regOrOffs);
+				}
+			} else {
+				// [reg + index * mult + offset] (see pmem2) : regOrOffs is the index register, the offset is in holds
+				int offset = (int)(int_val)b->holds;
+				if( a->id > 7 ) r64 |= 4;
+				if( reg > 7 ) r64 |= 1;
+				if( regOrOffs > 7 ) r64 |= 2;
+				OP(f->r_mem);
+				MOD_RM(offset == 0 ? 0 : IS_SBYTE(offset) ? 1 : 2,a->id,4);
+				SIB(mult,regOrOffs,reg);
+				if( offset ) {
+					if( IS_SBYTE(offset) ) B(offset); else W(offset);
+				}
+			}
+		}
+		break;
+	// register <- absolute address
+#	ifndef HL_64
+	case ID2(RFPU,RADDR):
+#	endif
+	case ID2(RCPU,RADDR):
+		ERRIF( f->r_mem == 0 );
+		if( a->id > 7 ) r64 |= 4;
+		OP(f->r_mem);
+		MOD_RM(0,a->id,5);
+		if( IS_64 )
+			W64((int_val)b->holds);
+		else
+			W((int)(int_val)b->holds);
+		break;
+	// absolute address <- register
+#	ifndef HL_64
+	case ID2(RADDR,RFPU):
+#	endif
+	case ID2(RADDR,RCPU):
+		ERRIF( f->mem_r == 0 );
+		if( b->id > 7 ) r64 |= 4;
+		OP(f->mem_r);
+		MOD_RM(0,b->id,5);
+		if( IS_64 )
+			W64((int_val)a->holds);
+		else
+			W((int)(int_val)a->holds);
+		break;
+	// memory <- register : same addressing forms as the register <- memory case
+	case ID2(RMEM, RCPU):
+	case ID2(RMEM, RFPU):
+		ERRIF( f->mem_r == 0 );
+		{
+			int mult = a->id & 0xF;
+			int regOrOffs = mult == 15 ? a->id >> 4 : a->id >> 8;
+			CpuReg reg = (a->id >> 4) & 0xF;
+			if( mult == 15 ) {
+				int pos;
+				if( b->id > 7 ) r64 |= 4;
+				OP(f->mem_r);
+				MOD_RM(0,b->id,5);
+				if( IS_64 ) {
+					// offset wrt current code
+					pos = BUF_POS() + 4;
+					W(regOrOffs - pos);
+				} else {
+					ERRIF(1);
+				}
+			} else if( mult == 0 ) {
+				if( b->id > 7 ) r64 |= 4;
+				if( reg > 7 ) r64 |= 1;
+				OP(f->mem_r);
+				if( regOrOffs == 0 && (reg&7) != Ebp ) {
+					MOD_RM(0,b->id,reg);
+					if( (reg&7) == Esp ) B(0x24);
+				} else if( IS_SBYTE(regOrOffs) ) {
+					MOD_RM(1,b->id,reg);
+					if( (reg&7) == Esp ) B(0x24);
+					B(regOrOffs);
+				} else {
+					MOD_RM(2,b->id,reg);
+					if( (reg&7) == Esp ) B(0x24);
+					W(regOrOffs);
+				}
+			} else {
+				int offset = (int)(int_val)a->holds;
+				if( b->id > 7 ) r64 |= 4;
+				if( reg > 7 ) r64 |= 1;
+				if( regOrOffs > 7 ) r64 |= 2;
+				OP(f->mem_r);
+				MOD_RM(offset == 0 ? 0 : IS_SBYTE(offset) ? 1 : 2,b->id,4);
+				SIB(mult,regOrOffs,reg);
+				if( offset ) {
+					if( IS_SBYTE(offset) ) B(offset); else W(offset);
+				}
+			}
+		}
+		break;
+	default:
+		ERRIF(1);
+	}
+	if( ctx->debug && ctx->f && o == CALL ) {
+		preg p;
+		op(ctx,MOV,pmem(&p,Esp,-HL_WSIZE),PEBP,true); // erase EIP (clean stack report)
+	}
+}
+
+// Emits instruction `o` without requesting the 64-bit operand size.
+static void op32( jit_ctx *ctx, CpuOp o, preg *a, preg *b ) {
+	const bool mode64 = false;
+	op(ctx,o,a,b,mode64);
+}
+
+// Emits instruction `o` with word-sized operands: the 64-bit operand size is requested
+// when compiled with HL_64, otherwise this behaves exactly like op32().
+static void op64( jit_ctx *ctx, CpuOp o, preg *a, preg *b ) {
+#ifdef HL_64
+	const bool mode64 = true;
+#else
+	const bool mode64 = false;
+#endif
+	op(ctx,o,a,b,mode64);
+}
+
+// Resolves a pending jump so that it lands on the current write position.
+// `p` is the value produced by AddJump / AddJump_small: the buffer position of the displacement,
+// tagged with 0x40000000 for 8-bit displacements. A value of 0 means "no jump" and is ignored.
+// A short jump whose target is out of the 8-bit range aborts through ASSERT.
+static void patch_jump( jit_ctx *ctx, int p ) {
+	if( p == 0 ) return;
+	if( (p & 0x40000000) == 0 ) {
+		// 32-bit displacement, relative to the end of the 4-byte field
+		*(int*)(ctx->startBuf + p) = BUF_POS() - (p + 4);
+		return;
+	}
+	p &= 0x3FFFFFFF;
+	{
+		// 8-bit displacement, relative to the end of the 1-byte field
+		int delta = BUF_POS() - (p + 1);
+		if( delta < -128 || delta >= 128 ) ASSERT(delta);
+		*(char*)(ctx->startBuf + p) = (char)delta;
+	}
+}
+
+// Resolves a pending jump so that it lands on the buffer position `target`.
+// `p` is the value produced by AddJump / AddJump_small: the buffer position of the displacement,
+// tagged with 0x40000000 for 8-bit displacements. A value of 0 means "no jump" and is ignored.
+// A short jump whose target is out of the 8-bit range aborts through ASSERT.
+static void patch_jump_to( jit_ctx *ctx, int p, int target ) {
+	if( p == 0 ) return;
+	if( (p & 0x40000000) == 0 ) {
+		// 32-bit displacement, relative to the end of the 4-byte field
+		*(int*)(ctx->startBuf + p) = target - (p + 4);
+		return;
+	}
+	p &= 0x3FFFFFFF;
+	{
+		// 8-bit displacement, relative to the end of the 1-byte field
+		int delta = target - (p + 1);
+		if( delta < -128 || delta >= 128 ) ASSERT(delta);
+		*(char*)(ctx->startBuf + p) = (char)delta;
+	}
+}
+
+// Number of bytes a value of type `t` takes when pushed on the stack:
+// the small types (and with HL_64 also HI32 and HF32) are widened to a full word,
+// every other type uses hl_type_size().
+static int stack_size( hl_type *t ) {
+	switch( t->kind ) {
+	case HUI8: case HUI16: case HBOOL:
+#	ifdef HL_64
+	case HI32: case HF32:
+#	endif
+		return sizeof(int_val);
+	default:
+		break;
+	}
+	return hl_type_size(t);
+}
+
+// Position of CPU register `reg` in CALL_REGS, or -1 if it is not used for passing arguments
+// (always -1 without HL_64).
+static int call_reg_index( int reg ) {
+#	ifdef HL_64
+	int k = 0;
+	while( k < CALL_NREGS ) {
+		if( CALL_REGS[k] == reg )
+			return k;
+		k++;
+	}
+#	endif
+	return -1;
+}
+
+// Tells if the physical register is used for passing arguments: an XMM register whose id is
+// below CALL_NREGS, or a CPU register listed in CALL_REGS. Always false without HL_64.
+static bool is_call_reg( preg *p ) {
+#	ifdef HL_64
+	if( p->kind == RFPU )
+		return p->id < CALL_NREGS;
+	if( p->kind == RCPU ) {
+		int k;
+		for(k=0;k<CALL_NREGS;k++)
+			if( CALL_REGS[k] == p->id )
+				return true;
+	}
+	return false;
+#	else
+	return false;
+#	endif
+}
+
+// Returns a locked physical register of the requested kind:
+// - RCPU : any scratch CPU register
+// - RCPU_CALL : a scratch CPU register that is not a call parameter register
+// - RCPU_8BITS : a scratch CPU register accepted by is_reg8()
+// - RFPU : one of the first RFPU_SCRATCH_COUNT XMM registers
+// Registers locked for the current position are never returned. A first pass looks for a register
+// holding no vreg; if there is none, a second pass takes a register away from its vreg by clearing
+// the binding (no instruction is emitted here). The scan start rotates with ctx->allocOffset.
+// Aborts through ASSERT for any other kind or when no register is available.
+static preg *alloc_reg( jit_ctx *ctx, preg_kind k ) {
+	int i;
+	preg *p;
+	switch( k ) {
+	case RCPU:
+	case RCPU_CALL:
+	case RCPU_8BITS:
+		{
+			int off = ctx->allocOffset++;
+			const int count = RCPU_SCRATCH_COUNT;
+			// first pass : free register
+			for(i=0;i<count;i++) {
+				int r = RCPU_SCRATCH_REGS[(i + off)%count];
+				p = ctx->pregs + r;
+				if( p->lock >= ctx->currentPos ) continue;
+				if( k == RCPU_CALL && is_call_reg(p) ) continue;
+				if( k == RCPU_8BITS && !is_reg8(p) ) continue;
+				if( p->holds == NULL ) {
+					RLOCK(p);
+					return p;
+				}
+			}
+			// second pass : unbind a register in use
+			for(i=0;i<count;i++) {
+				preg *p = ctx->pregs + RCPU_SCRATCH_REGS[(i + off)%count];
+				if( p->lock >= ctx->currentPos ) continue;
+				if( k == RCPU_CALL && is_call_reg(p) ) continue;
+				if( k == RCPU_8BITS && !is_reg8(p) ) continue;
+				if( p->holds ) {
+					RLOCK(p);
+					p->holds->current = NULL;
+					p->holds = NULL;
+					return p;
+				}
+			}
+		}
+		break;
+	case RFPU:
+		{
+			int off = ctx->allocOffset++;
+			const int count = RFPU_SCRATCH_COUNT;
+			// first pass : free register
+			for(i=0;i<count;i++) {
+				preg *p = PXMM((i + off)%count);
+				if( p->lock >= ctx->currentPos ) continue;
+				if( p->holds == NULL ) {
+					RLOCK(p);
+					return p;
+				}
+			}
+			// second pass : unbind a register in use
+			for(i=0;i<count;i++) {
+				preg *p = PXMM((i + off)%count);
+				if( p->lock >= ctx->currentPos ) continue;
+				if( p->holds ) {
+					RLOCK(p);
+					p->holds->current = NULL;
+					p->holds = NULL;
+					return p;
+				}
+			}
+		}
+		break;
+	default:
+		ASSERT(k);
+	}
+	ASSERT(0); // out of registers !
+	return NULL;
+}
+
+// Operand through which the vreg can currently be accessed: the register bound to it if any,
+// its stack slot otherwise.
+static preg *fetch( vreg *r ) {
+	return r->current ? r->current : &r->stack;
+}
+
+// Breaks the binding between a physical register and the vreg it holds, and clears its lock.
+// Does nothing when `r` is NULL or holds no vreg.
+static void scratch( preg *r ) {
+	vreg *held;
+	if( r == NULL ) return;
+	held = r->holds;
+	if( held == NULL ) return;
+	held->current = NULL;
+	r->holds = NULL;
+	r->lock = 0;
+}
+
+static preg *copy( jit_ctx *ctx, preg *to, preg *from, int size );
+
+// Binds vreg `v` to physical register `r` and emits the copy of its value into `r`.
+// Nothing is done when `v` is already in `r` or has a size of 0.
+// Whatever `r` was holding gets unbound, as well as the register previously bound to `v`.
+static void load( jit_ctx *ctx, preg *r, vreg *v ) {
+	preg *from = fetch(v);
+	if( from == r || v->size == 0 ) return;
+	if( r->holds ) r->holds->current = NULL;
+	if( v->current ) {
+		v->current->holds = NULL;
+		// NOTE(review): `from` becomes `r`, which makes the copy() below return without emitting anything
+		// when `v` was bound to another register — confirm this is the intended behavior
+		from = r;
+	}
+	r->holds = v;
+	v->current = r;
+	copy(ctx,r,from,v->size);
+}
+
+// Returns a locked XMM register bound to vreg `r`.
+// If `r` is already in an XMM register it is only locked. Otherwise a register is allocated and
+// either loaded with the value (`andLoad`) or just bound to `r` without emitting any code.
+// Aborts through ASSERT when `r` is not a float (without HL_64, HI64 values are accepted too).
+static preg *alloc_fpu( jit_ctx *ctx, vreg *r, bool andLoad ) {
+	preg *p = fetch(r);
+	if( p->kind != RFPU ) {
+		if( !IS_FLOAT(r) && (IS_64 || r->t->kind != HI64) ) ASSERT(r->t->kind);
+		p = alloc_reg(ctx, RFPU);
+		if( andLoad )
+			load(ctx,p,r);
+		else {
+			if( r->current )
+				r->current->holds = NULL;
+			r->current = p;
+			p->holds = r;
+		}
+	} else
+		RLOCK(p);
+	return p;
+}
+
+// Makes physical register `p` the current location of vreg `r`, detaching `r` from the
+// register it was bound to before. No code is emitted.
+static void reg_bind( vreg *r, preg *p ) {
+	preg *previous = r->current;
+	if( previous != NULL )
+		previous->holds = NULL;
+	p->holds = r;
+	r->current = p;
+}
+
+// Returns a locked CPU register bound to vreg `r`.
+// If `r` is already in a CPU register it is only locked. Otherwise a register is allocated and
+// either loaded with the value (`andLoad`) or just bound to `r` without emitting any code.
+// Without HL_64, HI64 values are redirected to alloc_fpu() and other values larger than 4 bytes abort.
+static preg *alloc_cpu( jit_ctx *ctx, vreg *r, bool andLoad ) {
+	preg *p = fetch(r);
+	if( p->kind != RCPU ) {
+#		ifndef HL_64
+		if( r->t->kind == HI64 ) return alloc_fpu(ctx,r,andLoad);
+		if( r->size > 4 ) ASSERT(r->size);
+#		endif
+		p = alloc_reg(ctx, RCPU);
+		if( andLoad )
+			load(ctx,p,r);
+		else
+			reg_bind(r,p);
+	} else
+		RLOCK(p);
+	return p;
+}
+
+// allocate a register that is not a call parameter
+// The value of `r` is always loaded. When `r` currently lives in a call parameter register,
+// it is moved to another register and the parameter register is released.
+// Without HL_64, HI64 values are redirected to alloc_fpu() and other values larger than 4 bytes abort.
+static preg *alloc_cpu_call( jit_ctx *ctx, vreg *r ) {
+	preg *p = fetch(r);
+	if( p->kind != RCPU ) {
+#		ifndef HL_64
+		if( r->t->kind == HI64 ) return alloc_fpu(ctx,r,true);
+		if( r->size > 4 ) ASSERT(r->size);
+#		endif
+		p = alloc_reg(ctx, RCPU_CALL);
+		load(ctx,p,r);
+	} else if( is_call_reg(p) ) {
+		preg *p2 = alloc_reg(ctx, RCPU_CALL);
+		op64(ctx,MOV,p2,p);
+		scratch(p);
+		reg_bind(r,p2);
+		return p2;
+	} else
+		RLOCK(p);
+	return p;
+}
+
+// Same as fetch(), except that a value smaller than 4 bytes which is not in a register gets
+// loaded into a CPU register first; that register is unlocked again before being returned.
+static preg *fetch32( jit_ctx *ctx, vreg *r ) {
+	if( r->current )
+		return r->current;
+	// make sure that the register is correctly erased
+	if( r->size < 4 ) {
+		preg *p = alloc_cpu(ctx, r, true);
+		RUNLOCK(p);
+		return p;
+	}
+	return fetch(r);
+}
+
+// make sure higher bits are zeroes
+// Returns a locked CPU register holding the value of `r`. With HL_64, a newly allocated register
+// is cleared before the load, and a register already holding `r` gets its upper 32 bits cleared
+// with a shift left / shift right pair. `andLoad` must be true in that configuration (ASSERT otherwise).
+// Without HL_64 this is alloc_cpu().
+static preg *alloc_cpu64( jit_ctx *ctx, vreg *r, bool andLoad ) {
+#	ifndef HL_64
+	return alloc_cpu(ctx,r,andLoad);
+#	else
+	preg *p = fetch(r);
+	if( !andLoad ) ASSERT(0);
+	if( p->kind != RCPU ) {
+		p = alloc_reg(ctx, RCPU);
+		op64(ctx,XOR,p,p);
+		load(ctx,p,r);
+	} else {
+		// remove higher bits
+		preg tmp;
+		op64(ctx,SHL,p,pconst(&tmp,32));
+		op64(ctx,SHR,p,pconst(&tmp,32));
+		RLOCK(p);
+	}
+	return p;
+#	endif
+}
+
+// make sure the register can be used with 8 bits access
+// Returns a locked CPU register accepted by is_reg8() and holding the value of `r`; when `r`
+// lives in a register that is not, it is moved to a suitable one and the former is released.
+// NOTE(review): `andLoad` is never read, the value is always loaded.
+static preg *alloc_cpu8( jit_ctx *ctx, vreg *r, bool andLoad ) {
+	preg *p = fetch(r);
+	if( p->kind != RCPU ) {
+		p = alloc_reg(ctx, RCPU_8BITS);
+		load(ctx,p,r);
+	} else if( !is_reg8(p) ) {
+		preg *p2 = alloc_reg(ctx, RCPU_8BITS);
+		op64(ctx,MOV,p2,p);
+		scratch(p);
+		reg_bind(r,p2);
+		return p2;
+	} else
+		RLOCK(p);
+	return p;
+}
+
+// Emits the code copying `size` bytes from operand `from` to operand `to`.
+// The instructions are selected from the pair of operand kinds:
+// - CPU register / memory : MOV8, MOV16 or MOV depending on the size
+// - XMM register / memory : MOVSS (4 bytes) or MOVSD (8 bytes)
+// - memory to memory : goes through a temporary register
+// - with HL_64, absolute addresses are first loaded into a temporary register
+// Returns the register operand involved in the final move (`to` when it is a register, else `from`),
+// or `to` right away when there is nothing to copy. Unsupported sizes or operand pairs abort through ASSERT.
+static preg *copy( jit_ctx *ctx, preg *to, preg *from, int size ) {
+	if( size == 0 || to == from ) return to;
+	switch( ID2(to->kind,from->kind) ) {
+	case ID2(RMEM,RCPU):
+	case ID2(RSTACK,RCPU):
+	case ID2(RCPU,RSTACK):
+	case ID2(RCPU,RMEM):
+	case ID2(RCPU,RCPU):
+#	ifndef HL_64
+	case ID2(RCPU,RADDR):
+	case ID2(RADDR,RCPU):
+#	endif
+		switch( size ) {
+		case 1:
+			if( to->kind == RCPU ) {
+				// clear the destination so that the upper bits are zero after the partial write
+				op64(ctx,XOR,to,to);
+				if( !is_reg8(to) ) {
+					// no 8-bit access on this register : 16-bit move, then keep the low byte only
+					preg p;
+					op32(ctx,MOV16,to,from);
+					op32(ctx,SHL,to,pconst(&p,24));
+					op32(ctx,SHR,to,pconst(&p,24));
+					break;
+				}
+			}
+			if( !is_reg8(from) ) {
+				// the source has no 8-bit access : go through a temporary register
+				preg *r = alloc_reg(ctx, RCPU_CALL);
+				op32(ctx, MOV, r, from);
+				RUNLOCK(r);
+				op32(ctx,MOV8,to,r);
+				return from;
+			}
+			op32(ctx,MOV8,to,from);
+			break;
+		case 2:
+			if( to->kind == RCPU )
+				op64(ctx,XOR,to,to);
+			op32(ctx,MOV16,to,from);
+			break;
+		case 4:
+			op32(ctx,MOV,to,from);
+			break;
+		case 8:
+			if( IS_64 ) {
+				op64(ctx,MOV,to,from);
+				break;
+			}
+			// without 64-bit support, falls through to the error case
+		default:
+			ASSERT(size);
+		}
+		return to->kind == RCPU ? to : from;
+	case ID2(RFPU,RFPU):
+	case ID2(RMEM,RFPU):
+	case ID2(RSTACK,RFPU):
+	case ID2(RFPU,RMEM):
+	case ID2(RFPU,RSTACK):
+		switch( size ) {
+		case 8:
+			op64(ctx,MOVSD,to,from);
+			break;
+		case 4:
+			op32(ctx,MOVSS,to,from);
+			break;
+		default:
+			ASSERT(size);
+		}
+		return to->kind == RFPU ? to : from;
+	case ID2(RMEM,RSTACK):
+		{
+			// load the vreg in a register of the matching class, then store that register
+			vreg *rfrom = R(from->id);
+			if( IS_FLOAT(rfrom) )
+				return copy(ctx,to,alloc_fpu(ctx,rfrom,true),size);
+			return copy(ctx,to,alloc_cpu(ctx,rfrom,true),size);
+		}
+	case ID2(RMEM,RMEM):
+	case ID2(RSTACK,RMEM):
+	case ID2(RSTACK,RSTACK):
+#	ifndef HL_64
+	case ID2(RMEM,RADDR):
+	case ID2(RSTACK,RADDR):
+	case ID2(RADDR,RSTACK):
+#	endif
+		{
+			// memory to memory : an XMM temporary is used for floats (and for 8 bytes without 64-bit support), a CPU one otherwise
+			preg *tmp;
+			if( (!IS_64 && size == 8) || (to->kind == RSTACK && IS_FLOAT(R(to->id))) || (from->kind == RSTACK && IS_FLOAT(R(from->id))) ) {
+				tmp = alloc_reg(ctx, RFPU);
+				op64(ctx,size == 8 ? MOVSD : MOVSS,tmp,from);
+			} else {
+				tmp = alloc_reg(ctx, RCPU);
+				copy(ctx,tmp,from,size);
+			}
+			return copy(ctx,to,tmp,size);
+		}
+#	ifdef HL_64
+	case ID2(RCPU,RADDR):
+	case ID2(RMEM,RADDR):
+	case ID2(RSTACK,RADDR):
+		{
+			// load the source address in a temporary register and read through it
+			preg p;
+			preg *tmp = alloc_reg(ctx, RCPU);
+			op64(ctx,MOV,tmp,pconst64(&p,(int_val)from->holds));
+			return copy(ctx,to,pmem(&p,tmp->id,0),size);
+		}
+	case ID2(RADDR,RCPU):
+	case ID2(RADDR,RMEM):
+	case ID2(RADDR,RSTACK):
+		{
+			// load the destination address in a temporary register and write through it
+			preg p;
+			preg *tmp = alloc_reg(ctx, RCPU);
+			op64(ctx,MOV,tmp,pconst64(&p,(int_val)to->holds));
+			return copy(ctx,pmem(&p,tmp->id,0),from,size);
+		}
+#	endif
+	default:
+		break;
+	}
+	printf("copy(%s,%s)\n",KNAMES[to->kind], KNAMES[from->kind]);
+	ASSERT(0);
+	return NULL;
+}
+
+// Writes the value of operand `v` into the stack slot of vreg `r`.
+// `r` is first detached from the register it was bound to when that register is not `v`.
+// Aborts through ASSERT when the register class returned by copy() does not match the type of `r`
+// (XMM for floats, anything else otherwise).
+// With `bind`, the register that carried the value becomes the current location of `r`,
+// after being released from the vreg it was holding.
+static void store( jit_ctx *ctx, vreg *r, preg *v, bool bind ) {
+	if( r->current && r->current != v ) {
+		r->current->holds = NULL;
+		r->current = NULL;
+	}
+	v = copy(ctx,&r->stack,v,r->size);
+	if( IS_FLOAT(r) != (v->kind == RFPU) )
+		ASSERT(0);
+	if( bind && r->current != v && (v->kind == RCPU || v->kind == RFPU) ) {
+		scratch(v);
+		r->current = v;
+		v->holds = r;
+	}
+}
+
+// Stores a call result into vreg `r`.
+// With HL_64 (and for non-float, non-i64 values without it) the value is taken from XMM0 for
+// floats and from EAX otherwise, and that register becomes the current location of `r`.
+// Without HL_64, floats are written to the stack slot with FSTP / FSTP32 and HI64 is reported as an error.
+static void store_result( jit_ctx *ctx, vreg *r ) {
+#	ifndef HL_64
+	switch( r->t->kind ) {
+	case HF64:
+		scratch(r->current);
+		op64(ctx,FSTP,&r->stack,UNUSED);
+		break;
+	case HF32:
+		scratch(r->current);
+		op64(ctx,FSTP32,&r->stack,UNUSED);
+		break;
+	case HI64:
+		scratch(r->current);
+		error_i64();
+		break;
+	default:
+#	endif
+		store(ctx,r,IS_FLOAT(r) ? REG_AT(XMM(0)) : PEAX,true);
+#	ifndef HL_64
+		break;
+	}
+#	endif
+}
+
+// Copies the value of vreg `from` into vreg `to`; the register carrying the value ends up bound to `to`.
+// An HF32 source that is not already in an XMM register is loaded into one first.
+// Without HL_64, an HI64 destination is reported as an error.
+static void op_mov( jit_ctx *ctx, vreg *to, vreg *from ) {
+	preg *r = fetch(from);
+#	ifndef HL_64
+	if( to->t->kind == HI64 ) {
+		error_i64();
+		return;
+	}
+#	endif
+	if( from->t->kind == HF32 && r->kind != RFPU )
+		r = alloc_fpu(ctx,from,true);
+	store(ctx, to, r, true);
+}
+
+// Stores operand `from` into vreg `to` and lets `to` take over the register that carried the value.
+static void copy_to( jit_ctx *ctx, vreg *to, preg *from ) {
+	const bool bind = true;
+	store(ctx,to,from,bind);
+}
+
+// Copies the value of vreg `from`, read from its current location, into operand `to`.
+static void copy_from( jit_ctx *ctx, preg *to, vreg *from ) {
+	preg *source = fetch(from);
+	copy(ctx,to,source,from->size);
+}
+
+// Sets vreg `r` to the integer constant `c`: the value is built in a CPU register bound to `r`
+// (XOR for zero, MOV otherwise, using the 64-bit form when `r` is 8 bytes wide), then written to its stack slot.
+static void store_const( jit_ctx *ctx, vreg *r, int c ) {
+	preg p;
+	if( c == 0 )
+		op(ctx,XOR,alloc_cpu(ctx,r,false),alloc_cpu(ctx,r,false),r->size == 8);
+	else if( r->size == 8 )
+		op64(ctx,MOV,alloc_cpu(ctx,r,false),pconst64(&p,c));
+	else
+		op32(ctx,MOV,alloc_cpu(ctx,r,false),pconst(&p,c));
+	store(ctx,r,r->current,false);
+}
+
+// Forgets every binding between a vreg and a scratch CPU register or any XMM register.
+// No code is emitted and the locks are left untouched. `native_call` is not consulted.
+static void discard_regs( jit_ctx *ctx, bool native_call ) {
+	int k;
+	for(k=0;k<RCPU_SCRATCH_COUNT;k++) {
+		preg *p = REG_AT(RCPU_SCRATCH_REGS[k]);
+		vreg *v = p->holds;
+		if( v == NULL ) continue;
+		v->current = NULL;
+		p->holds = NULL;
+	}
+	for(k=0;k<RFPU_COUNT;k++) {
+		preg *p = PXMM(k);
+		vreg *v = p->holds;
+		if( v == NULL ) continue;
+		v->current = NULL;
+		p->holds = NULL;
+	}
+}
+
+// Emits the stack adjustment required so that `size` bytes of arguments, added to the
+// function registers area and the two words EIP+EBP, make a multiple of 16 bytes.
+// Returns `size` increased by the padding that was emitted.
+static int pad_before_call( jit_ctx *ctx, int size ) {
+	int misalign = (size + ctx->totalRegsSize + HL_WSIZE * 2) & 15;
+	if( misalign != 0 ) {
+		preg imm;
+		int pad = 16 - misalign;
+		op64(ctx,SUB,PESP,pconst(&imm,pad));
+		size += pad;
+	}
+	return size;
+}
+
+// Emits the code pushing the value of vreg `r` on the machine stack, using stack_size(r->t) bytes.
+// Sizes for which no PUSH is used are written with a SUB on ESP followed by a store at [ESP].
+// Aborts through ASSERT for unsupported sizes.
+static void push_reg( jit_ctx *ctx, vreg *r ) {
+	preg p;
+	switch( stack_size(r->t) ) {
+	case 1:
+		op64(ctx,SUB,PESP,pconst(&p,1));
+		op32(ctx,MOV8,pmem(&p,Esp,0),alloc_cpu8(ctx,r,true));
+		break;
+	case 2:
+		op64(ctx,SUB,PESP,pconst(&p,2));
+		op32(ctx,MOV16,pmem(&p,Esp,0),alloc_cpu(ctx,r,true));
+		break;
+	case 4:
+		if( r->size < 4 )
+			alloc_cpu(ctx,r,true); // force fetch (higher bits set to 0)
+		if( !IS_64 ) {
+			// an XMM register cannot be pushed : release it so that the stack slot is pushed instead
+			if( r->current != NULL && r->current->kind == RFPU ) scratch(r->current);
+			op32(ctx,PUSH,fetch(r),UNUSED);
+		} else {
+			// pseudo push32 (not available)
+			op64(ctx,SUB,PESP,pconst(&p,4));
+			op32(ctx,MOV,pmem(&p,Esp,0),alloc_cpu(ctx,r,true));
+		}
+		break;
+	case 8:
+		if( fetch(r)->kind == RFPU ) {
+			op64(ctx,SUB,PESP,pconst(&p,8));
+			op64(ctx,MOVSD,pmem(&p,Esp,0),fetch(r));
+		} else if( IS_64 )
+			op64(ctx,PUSH,fetch(r),UNUSED);
+		else if( r->stack.kind == RSTACK ) {
+			// two 32-bit pushes from the stack slot : the upper half first (stackPos + 4), then the lower half
+			scratch(r->current);
+			r->stackPos += 4;
+			op32(ctx,PUSH,&r->stack,UNUSED);
+			r->stackPos -= 4;
+			op32(ctx,PUSH,&r->stack,UNUSED);
+		} else
+			ASSERT(0);
+		break;
+	default:
+		ASSERT(r->size);
+	}
+}
+
+// Starts the setup of a native call taking `nargs` arguments: records the argument count and
+// emits the stack padding for the arguments that do not fit in the call registers.
+// Returns the padded size in bytes of the stack arguments.
+static int begin_native_call( jit_ctx *ctx, int nargs ) {
+	int onStack = nargs - CALL_NREGS;
+	ctx->nativeArgsCount = nargs;
+	return pad_before_call(ctx, onStack > 0 ? onStack * HL_WSIZE : 0);
+}
+
+// Returns a register in which the next native call argument can be prepared.
+// With HL_64 this is the call register matching the pending argument (index nativeArgsCount - 1),
+// or a register that is not a call parameter when the argument goes on the stack; the register
+// is released from the vreg it was holding. Without HL_64 any CPU register is allocated.
+static preg *alloc_native_arg( jit_ctx *ctx ) {
+#	ifdef HL_64
+	int rid = ctx->nativeArgsCount - 1;
+	preg *r = rid < CALL_NREGS ? REG_AT(CALL_REGS[rid]) : alloc_reg(ctx,RCPU_CALL);
+	scratch(r);
+	return r;
+#	else
+	return alloc_reg(ctx, RCPU);
+#	endif
+}
+
+// Passes operand `r` as the next native call argument; arguments are set from the last one to the first.
+// A stack operand whose vreg is smaller than 4 bytes is replaced by the operand returned by fetch32().
+// With HL_64, the argument index is taken from nativeArgsCount (decremented here): the value is
+// pushed when there is no call register left for that index, else moved into CALL_REGS[index].
+// XMM operands are rejected (ASSERT). Without HL_64 the operand is always pushed.
+static void set_native_arg( jit_ctx *ctx, preg *r ) {
+	if( r->kind == RSTACK ) {
+		vreg *v = ctx->vregs + r->id;
+		if( v->size < 4 )
+			r = fetch32(ctx, v);
+	}
+#	ifdef HL_64
+	if( r->kind == RFPU ) ASSERT(0);
+	int rid = --ctx->nativeArgsCount;
+	preg *target;
+	if( rid >= CALL_NREGS ) {
+		op64(ctx,PUSH,r,UNUSED);
+		return;
+	}
+	target = REG_AT(CALL_REGS[rid]);
+	if( target != r ) {
+		op64(ctx, MOV, target, r);
+		scratch(target);
+	}
+#	else
+	op32(ctx,PUSH,r,UNUSED);
+#	endif
+}
+
+// Passes the float operand `r` as the next native call argument (`isf32` selects MOVSS over MOVSD).
+// With HL_64 the value is moved into an XMM register: the one at the argument index when
+// IS_WINCALL64, XMM0 otherwise. CPU register operands are rejected (ASSERT).
+// Without HL_64 the operand is pushed.
+static void set_native_arg_fpu( jit_ctx *ctx, preg *r, bool isf32 ) {
+#	ifdef HL_64
+	if( r->kind == RCPU ) ASSERT(0);
+	// can only be used if last argument !!
+	ctx->nativeArgsCount--;
+	preg *target = REG_AT(XMM(IS_WINCALL64 ? ctx->nativeArgsCount : 0));
+	if( target != r ) {
+		op64(ctx, isf32 ? MOVSS : MOVSD, target, r);
+		scratch(target);
+	}
+#	else
+	op32(ctx,PUSH,r,UNUSED);
+#	endif
+}
+
+typedef struct { // tracks which call registers have been handed out to which argument (see select_call_reg / mapped_reg)
+	int nextCpu; // next index into CALL_REGS; under HL_WIN_CALL it is the single counter used for XMM slots too
+	int nextFpu; // next XMM index (only advanced when HL_WIN_CALL is not defined)
+	int mapped[REG_COUNT]; // indexed by register: argument index + 1, or 0 when the register is unassigned
+} call_regs;
+
+static int select_call_reg( call_regs *regs, hl_type *t, int id ) { // register assigned to argument #id of type t, or -1 when it must go on the stack
+#	ifndef HL_64
+	return -1;
+#else
+	bool wantsFpu = t->kind == HF64 || t->kind == HF32;
+#	ifdef HL_WIN_CALL
+	int slot = regs->nextCpu++; // one counter shared by CPU and XMM slots
+#	else
+	int slot = wantsFpu ? regs->nextFpu++ : regs->nextCpu++; // independent counters per register class
+#	endif
+	if( slot >= CALL_NREGS )
+		return -1;
+	int chosen = wantsFpu ? XMM(slot) : CALL_REGS[slot];
+	regs->mapped[chosen] = id + 1; // stored +1 so that 0 keeps meaning "unassigned"
+	return chosen;
+#endif
+}
+
+static int mapped_reg( call_regs *regs, int id ) { // reverse lookup: register previously selected for argument #id, or -1
+#	ifndef HL_64
+	return -1;
+#else
+	int slot, tag = id + 1; // mapped[] holds argument index + 1
+	for(slot=0;slot<CALL_NREGS;slot++) {
+		int cpu = CALL_REGS[slot];
+		int fpu = XMM(slot);
+		if( regs->mapped[cpu] == tag ) return cpu;
+		if( regs->mapped[fpu] == tag ) return fpu;
+	}
+	return -1;
+#endif
+}
+
+static int prepare_call_args( jit_ctx *ctx, int count, int *args, vreg *vregs, int extraSize ) { // load/push the `count` arguments vregs[args[i]]; returns the value of pad_before_call for the stack part
+	int i;
+	int size = extraSize, paddedSize; // size: bytes of stack arguments, starting from the caller-supplied extraSize
+	call_regs ctmp = {0};
+	for(i=0;i<count;i++) { // pass 1: fill call registers and measure the stack part
+		vreg *r = vregs + args[i];
+		int cr = select_call_reg(&ctmp, r->t, i);
+		if( cr >= 0 ) {
+			preg *c = REG_AT(cr);
+			preg *cur = fetch(r);
+			if( cur != c ) {
+				copy(ctx,c,cur,r->size);
+				scratch(c);
+			}
+			RLOCK(c); // locked so the register is not picked again while the other arguments are set up
+			continue;
+		}
+		size += stack_size(r->t);
+	}
+	paddedSize = pad_before_call(ctx,size);
+	for(i=0;i<count;i++) { // pass 2: push what did not get a register
+		// RTL: arguments are visited from the last one to the first
+		int j = count - (i + 1);
+		vreg *r = vregs + args[j];
+		if( (i & 7) == 0 ) jit_buf(ctx); // every 8 arguments; presumably makes sure the output buffer has room - confirm in jit_buf
+		if( mapped_reg(&ctmp,j) >= 0 ) continue; // already in a call register
+		push_reg(ctx,r);
+		if( r->current ) RUNLOCK(r->current);
+	}
+	return paddedSize;
+}
+
+static void op_call( jit_ctx *ctx, preg *r, int size ) { // emit CALL r, then an ADD that releases `size` bytes of stack; a negative size emits no release
+	preg p;
+#	ifdef JIT_DEBUG
+	if( IS_64 && size >= 0 ) { // debug-only: trap when the stack pointer is not 16-byte aligned at the call site
+		int jchk;
+		op32(ctx,TEST,PESP,pconst(&p,15));
+		XJump(JZero,jchk);
+		BREAK(); // unaligned ESP
+		patch_jump(ctx, jchk);
+	}
+#	endif
+	if( IS_WINCALL64 ) {
+		// MSVC requires 32bytes of free space here
+		op64(ctx,SUB,PESP,pconst(&p,32));
+		if( size >= 0 ) size += 32; // released together with the arguments after the call
+	}
+	op32(ctx, CALL, r, UNUSED);
+	if( size > 0 ) op64(ctx,ADD,PESP,pconst(&p,size));
+}
+
+static void call_native( jit_ctx *ctx, void *nativeFun, int size ) {
+	// calls to these three targets get no stack release and no register discard afterwards
+	bool noReturn = nativeFun == hl_assert || nativeFun == hl_throw || nativeFun == on_jit_error;
+	preg imm;
+	// the address is already resolved: load it as a 64-bit constant and call through EAX
+	op64(ctx,MOV,PEAX,pconst64(&imm,(int_val)nativeFun));
+	op_call(ctx,PEAX, noReturn ? -1 : size);
+	if( !noReturn )
+		discard_regs(ctx, true);
+}
+
+static void op_call_fun( jit_ctx *ctx, vreg *dst, int findex, int count, int *args ) { // emit a call to function #findex with the given argument vregs; the result goes to dst when dst != NULL
+	int fid = findex < 0 ? -1 : ctx->m->functions_indexes[findex];
+	bool isNative = fid >= ctx->m->code->nfunctions; // indexes at or past nfunctions are treated as natives
+	int size = prepare_call_args(ctx,count,args,ctx->vregs,0);
+	preg p;
+	if( fid < 0 ) {
+		ASSERT(fid);
+	} else if( isNative ) {
+		call_native(ctx,ctx->m->functions_ptrs[findex],size);
+	} else {
+		int cpos = BUF_POS() + (IS_WINCALL64 ? 4 : 0); // position of the CALL itself: op_call emits a SUB first under IS_WINCALL64 (counted as 4 bytes here)
+#		ifdef JIT_DEBUG
+		if( IS_64 ) cpos += 13; // ESP CHECK
+#		endif
+		if( ctx->m->functions_ptrs[findex] ) {
+			// already compiled (operand is relative to cpos + 5, i.e. the byte following the CALL)
+			op_call(ctx,pconst(&p,(int)(int_val)ctx->m->functions_ptrs[findex] - (cpos + 5)), size);
+		} else if( ctx->m->code->functions + fid == ctx->f ) {
+			// our current function
+			op_call(ctx,pconst(&p, ctx->functionPos - (cpos + 5)), size);
+		} else {
+			// stage for later: record the call position in ctx->calls and emit a zero displacement for now
+			jlist *j = (jlist*)hl_malloc(&ctx->galloc,sizeof(jlist));
+			j->pos = cpos;
+			j->target = findex;
+			j->next = ctx->calls;
+			ctx->calls = j;
+			op_call(ctx,pconst(&p,0), size);
+		}
+		discard_regs(ctx, false);
+	}
+	if( dst )
+		store_result(ctx,dst);
+}
+
+static void op_enter( jit_ctx *ctx ) { // function prologue, mirrored by the epilogue in op_ret
+	preg frameSize;
+	op64(ctx, PUSH, PEBP, UNUSED); // save the caller's EBP
+	op64(ctx, MOV, PEBP, PESP); // EBP now marks the frame base
+	if( ctx->totalRegsSize != 0 ) op64(ctx, SUB, PESP, pconst(&frameSize,ctx->totalRegsSize)); // reserve totalRegsSize bytes of stack
+}
+
+static void op_ret( jit_ctx *ctx, vreg *r ) { // move r into the return location for its type, then emit the epilogue and RET
+	preg p;
+	switch( r->t->kind ) {
+	case HF32:
+#		ifdef HL_64
+		op64(ctx, MOVSS, PXMM(0), fetch(r)); // HL_64: float result in XMM0
+#		else
+		op64(ctx,FLD32,&r->stack,UNUSED); // otherwise loaded with FLD32 from the vreg's stack slot
+#		endif
+		break;
+	case HF64:
+#		ifdef HL_64
+		op64(ctx, MOVSD, PXMM(0), fetch(r)); // HL_64: double result in XMM0
+#		else
+		op64(ctx,FLD,&r->stack,UNUSED); // otherwise loaded with FLD from the vreg's stack slot
+#		endif
+		break;
+	default:
+		if( r->size < 4 && !r->current )
+			fetch32(ctx, r); // narrow value not yet in a register: fetched through fetch32 first
+		if( r->current != PEAX )
+			op64(ctx,MOV,PEAX,fetch(r)); // everything else is returned in EAX
+		break;
+	}
+	if( ctx->totalRegsSize ) op64(ctx, ADD, PESP, pconst(&p, ctx->totalRegsSize)); // undo the SUB emitted by op_enter
+#	ifdef JIT_DEBUG
+	{ // debug-only: ESP must be back at EBP here, otherwise report "invalid ESP"
+		int jeq;
+		op64(ctx, CMP, PESP, PEBP);
+		XJump_small(JEq,jeq);
+		jit_error("invalid ESP");
+		patch_jump(ctx,jeq);
+	}
+#	endif
+	op64(ctx, POP, PEBP, UNUSED);
+	op64(ctx, RET, UNUSED, UNUSED);
+}
+
+static void call_native_consts( jit_ctx *ctx, void *nativeFun, int_val *args, int nargs ) { // emit a call to nativeFun whose nargs arguments are the constants in args[]
+	int size = pad_before_call(ctx, IS_64 ? 0 : HL_WSIZE*nargs); // stack space is only needed when arguments are pushed (non-64-bit)
+	preg p;
+	int i;
+#	ifdef HL_64
+	for(i=0;i<nargs;i++) // NOTE(review): nargs is not checked against CALL_NREGS here - callers appear to pass at most 2
+		op64(ctx, MOV, REG_AT(CALL_REGS[i]), pconst64(&p, args[i]));
+#	else
+	for(i=nargs-1;i>=0;i--) // pushed from the last argument to the first
+		op32(ctx, PUSH, pconst64(&p, args[i]), UNUSED);
+#	endif
+	call_native(ctx, nativeFun, size);
+}
+
+static void on_jit_error( const char *msg, int_val line ) { // runtime target of _jit_error: report "msg (line N)", break into the debugger, then throw
+	char buf[256];
+	int iline = (int)line; // line arrives word-sized because it is passed as a call constant
+	snprintf(buf,sizeof(buf),"%s (line %d)",msg,iline); // bounded: an overlong msg is truncated instead of overflowing buf
+#ifdef HL_WIN_DESKTOP
+	MessageBoxA(NULL,buf,"JIT ERROR",MB_OK);
+#else
+	printf("JIT ERROR : %s\n",buf);
+#endif
+	hl_debug_break();
+	hl_throw(NULL);
+}
+
+static void _jit_error( jit_ctx *ctx, const char *msg, int line ) { // emit code that calls on_jit_error(msg,line) when executed
+	int_val packed[] = { (int_val)msg, (int_val)line }; // both arguments are embedded as word-sized constants
+	call_native_consts(ctx,on_jit_error,packed,sizeof(packed)/sizeof(packed[0]));
+}
+
+
+static preg *op_binop( jit_ctx *ctx, vreg *dst, vreg *a, vreg *b, hl_op bop ) { // emit `a bop b` (arithmetic, or the compare for OJ* opcodes); stores into dst when non-NULL and returns the register holding the result
+	preg *pa = fetch(a), *pb = fetch(b), *out = NULL;
+	CpuOp o;
+	if( IS_FLOAT(a) ) { // step 1: choose the CPU opcode `o` (some opcodes are fully handled and return from inside the switch)
+		bool isf32 = a->t->kind == HF32;
+		switch( bop ) {
+		case OAdd: o = isf32 ? ADDSS : ADDSD; break;
+		case OSub: o = isf32 ? SUBSS : SUBSD; break;
+		case OMul: o = isf32 ? MULSS : MULSD; break;
+		case OSDiv: o = isf32 ? DIVSS : DIVSD; break;
+		case OJSLt:
+		case OJSGte:
+		case OJSLte:
+		case OJSGt:
+		case OJEq:
+		case OJNotEq:
+		case OJNotLt:
+		case OJNotGte:
+			o = isf32 ? COMISS : COMISD;
+			break;
+		case OSMod:
+			{ // float modulo is delegated to the C library (fmodf / fmod)
+				int args[] = { a->stack.id, b->stack.id };
+				int size = prepare_call_args(ctx,2,args,ctx->vregs,0);
+				void *mod_fun;
+				if( isf32 ) mod_fun = fmodf; else mod_fun = fmod;
+				call_native(ctx,mod_fun,size);
+				store_result(ctx,dst);
+				return fetch(dst);
+			}
+		default:
+			printf("%s\n", hl_op_name(bop));
+			ASSERT(bop);
+		}
+	} else {
+		bool is64 =	a->t->kind == HI64;
+#	ifndef HL_64
+		if( is64 ) { // 64-bit integers are reported through error_i64 on non-64-bit builds
+			error_i64();
+			return fetch(a);
+		}
+#	endif
+		switch( bop ) {
+		case OAdd: o = ADD; break;
+		case OSub: o = SUB; break;
+		case OMul: o = IMUL; break;
+		case OAnd: o = AND; break;
+		case OOr: o = OR; break;
+		case OXor: o = XOR; break;
+		case OShl:
+		case OUShr:
+		case OSShr:
+			if( !b->current || b->current->kind != RCPU || b->current->id != Ecx ) { // the shift count is moved into ECX unless it is already there
+				scratch(REG_AT(Ecx));
+				op(ctx,MOV,REG_AT(Ecx),pb,is64);
+				RLOCK(REG_AT(Ecx));
+				pa = fetch(a); // fetched again after ECX was scratched
+			} else
+				RLOCK(b->current);
+			if( pa->kind != RCPU ) {
+				pa = alloc_reg(ctx, RCPU);
+				op(ctx,MOV,pa,fetch(a), is64);
+			}
+			op(ctx,bop == OShl ? SHL : (bop == OUShr ? SHR : SAR), pa, UNUSED,is64);
+			if( dst ) store(ctx, dst, pa, true);
+			return pa;
+		case OSDiv:
+		case OUDiv:
+		case OSMod:
+		case OUMod:
+			{
+				preg *out = bop == OSMod || bop == OUMod ? REG_AT(Edx) : PEAX; // remainder is taken from EDX, quotient from EAX (this `out` shadows the outer one)
+				preg *r = pb;
+				preg p;
+				int jz, jz1 = 0, jend;
+				if( pa->kind == RCPU && pa->id == Eax ) RLOCK(pa);
+				// ensure b in CPU reg and not in Eax/Edx (for UI8/UI16)
+				if( pb->kind != RCPU || (pb->id == Eax || pb->id == Edx) ) {
+					scratch(REG_AT(Ecx));
+					scratch(pb);
+					load(ctx,REG_AT(Ecx),b);
+					r = REG_AT(Ecx);
+				}
+				// integer div 0 => 0
+				op(ctx,TEST,r,r,is64);
+				XJump_small(JZero, jz);
+				// Prevent MIN/-1 overflow exception
+				// OSMod: r = (b == 0 || b == -1) ? 0 : a % b
+				// OSDiv: r = (b == 0 || b == -1) ? a * b : a / b
+				if( bop == OSMod || bop == OSDiv ) {
+					op(ctx, CMP, r, pconst(&p,-1), is64);
+					XJump_small(JEq, jz1);
+				}
+				pa = fetch(a);
+				if( pa->kind != RCPU || pa->id != Eax ) { // the dividend has to be in EAX
+					scratch(PEAX);
+					scratch(pa);
+					load(ctx,PEAX,a);
+				}
+				scratch(REG_AT(Edx));
+				scratch(REG_AT(Eax));
+				if( bop == OUDiv || bop == OUMod )
+					op(ctx, XOR, REG_AT(Edx), REG_AT(Edx), is64); // unsigned: high half of the dividend is zero
+				else
+					op(ctx, CDQ, UNUSED, UNUSED, is64); // sign-extend Eax into Eax:Edx
+				op(ctx, bop == OUDiv || bop == OUMod ? DIV : IDIV, r, UNUSED, is64);
+				XJump_small(JAlways, jend);
+				patch_jump(ctx, jz); // special-case path (b == 0, or b == -1 for the signed ops) starts here
+				patch_jump(ctx, jz1);
+				if( bop != OSDiv ) {
+					op(ctx, XOR, out, out, is64); // result is 0
+				} else {
+					load(ctx, out, a); // OSDiv: result is a * b (0 when b == 0, -a when b == -1)
+					op(ctx, IMUL, out, r, is64);
+				}
+				patch_jump(ctx, jend);
+				if( dst ) store(ctx, dst, out, true);
+				return out;
+			}
+		case OJSLt:
+		case OJSGte:
+		case OJSLte:
+		case OJSGt:
+		case OJULt:
+		case OJUGte:
+		case OJEq:
+		case OJNotEq:
+			switch( a->t->kind ) { // compare width follows the type of a
+			case HUI8:
+			case HBOOL:
+				o = CMP8;
+				break;
+			case HUI16:
+				o = CMP16;
+				break;
+			default:
+				o = CMP;
+				break;
+			}
+			break;
+		default:
+			printf("%s\n", hl_op_name(bop));
+			ASSERT(bop);
+		}
+	}
+	switch( RTYPE(a) ) { // step 2: emit `o` according to where the operands currently live
+	case HI32:
+	case HUI8:
+	case HUI16:
+	case HBOOL:
+#	ifndef HL_64
+	case HDYNOBJ:
+	case HVIRTUAL:
+	case HOBJ:
+	case HSTRUCT:
+	case HFUN:
+	case HMETHOD:
+	case HBYTES:
+	case HNULL:
+	case HENUM:
+	case HDYN:
+	case HTYPE:
+	case HABSTRACT:
+	case HARRAY:
+#	endif
+		switch( ID2(pa->kind, pb->kind) ) { // 32-bit operand forms
+		case ID2(RCPU,RCPU):
+		case ID2(RCPU,RSTACK):
+			op32(ctx, o, pa, pb);
+			scratch(pa);
+			out = pa;
+			break;
+		case ID2(RSTACK,RCPU):
+			if( dst == a && o != IMUL ) { // in-place update of a's stack slot, no store needed afterwards
+				op32(ctx, o, pa, pb);
+				dst = NULL;
+				out = pa;
+			} else {
+				alloc_cpu(ctx,a, true); // bring a into a register and retry
+				return op_binop(ctx,dst,a,b,bop);
+			}
+			break;
+		case ID2(RSTACK,RSTACK):
+			alloc_cpu(ctx, a, true); // bring a into a register and retry
+			return op_binop(ctx, dst, a, b, bop);
+		default:
+			printf("%s(%d,%d)\n", hl_op_name(bop), pa->kind, pb->kind);
+			ASSERT(ID2(pa->kind, pb->kind));
+		}
+		if( dst ) store(ctx, dst, out, true);
+		return out;
+#	ifdef HL_64
+	case HOBJ:
+	case HSTRUCT:
+	case HDYNOBJ:
+	case HVIRTUAL:
+	case HFUN:
+	case HMETHOD:
+	case HBYTES:
+	case HNULL:
+	case HENUM:
+	case HDYN:
+	case HTYPE:
+	case HABSTRACT:
+	case HARRAY:
+	case HI64:
+	case HGUID:
+		switch( ID2(pa->kind, pb->kind) ) { // same scheme as above with 64-bit operands
+		case ID2(RCPU,RCPU):
+		case ID2(RCPU,RSTACK):
+			op64(ctx, o, pa, pb);
+			scratch(pa);
+			out = pa;
+			break;
+		case ID2(RSTACK,RCPU):
+			if( dst == a && OP_FORMS[o].mem_r ) { // in-place only when the opcode has a memory,register form
+				op64(ctx, o, pa, pb);
+				dst = NULL;
+				out = pa;
+			} else {
+				alloc_cpu(ctx,a, true);
+				return op_binop(ctx,dst,a,b,bop);
+			}
+			break;
+		case ID2(RSTACK,RSTACK):
+			alloc_cpu(ctx, a, true);
+			return op_binop(ctx, dst, a, b, bop);
+		default:
+			printf("%s(%d,%d)\n", hl_op_name(bop), pa->kind, pb->kind);
+			ASSERT(ID2(pa->kind, pb->kind));
+		}
+		if( dst ) store(ctx, dst, out, true);
+		return out;
+#	endif
+	case HF64:
+	case HF32:
+		pa = alloc_fpu(ctx, a, true);
+		pb = alloc_fpu(ctx, b, true);
+		switch( ID2(pa->kind, pb->kind) ) {
+		case ID2(RFPU,RFPU):
+			op64(ctx,o,pa,pb);
+			if( (o == COMISD || o == COMISS) && bop != OJSGt ) { // flag fix-up for compares involving NaN; OJSGt is excluded, presumably because its jump is already not taken on an unordered result
+				int jnotnan;
+				XJump_small(JNParity,jnotnan); // skip the fix-up when PF is clear (COMISS/COMISD set PF only for an unordered result)
+				switch( bop ) {
+				case OJSLt:
+				case OJNotLt:
+					{
+						preg *r = alloc_reg(ctx,RCPU);
+						// set CF=0, ZF=1
+						op64(ctx,XOR,r,r);
+						RUNLOCK(r);
+						break;
+					}
+				case OJSGte:
+				case OJNotGte:
+					{
+						preg *r = alloc_reg(ctx,RCPU);
+						// set ZF=0, CF=1
+						op64(ctx,XOR,r,r);
+						op64(ctx,CMP,r,PESP);
+						RUNLOCK(r);
+						break;
+					}
+					break; // never reached: both blocks above end with their own break
+				case OJNotEq:
+				case OJEq:
+					// set ZF=0, CF=?
+				case OJSLte:
+					// set ZF=0, CF=0
+					op64(ctx,TEST,PESP,PESP);
+					break;
+				default:
+					ASSERT(bop);
+				}
+				patch_jump(ctx,jnotnan);
+			}
+			scratch(pa);
+			out = pa;
+			break;
+		default:
+			printf("%s(%d,%d)\n", hl_op_name(bop), pa->kind, pb->kind);
+			ASSERT(ID2(pa->kind, pb->kind));
+		}
+		if( dst ) store(ctx, dst, out, true);
+		return out;
+	default:
+		ASSERT(RTYPE(a));
+	}
+	return NULL;
+}
+
+static int do_jump( jit_ctx *ctx, hl_op op, bool isFloat ) { // emit the conditional jump matching op and return the value XJump stored in j (0 for an unknown op)
+	int j;
+	switch( op ) {
+	case OJAlways:
+		XJump(JAlways,j);
+		break;
+	case OJSGte:
+		XJump(isFloat ? JUGte : JSGte,j); // float compares use the unsigned condition codes (COMISS/COMISD report through CF/ZF)
+		break;
+	case OJSGt:
+		XJump(isFloat ? JUGt : JSGt,j);
+		break;
+	case OJUGte:
+		XJump(JUGte,j);
+		break;
+	case OJSLt:
+		XJump(isFloat ? JULt : JSLt,j);
+		break;
+	case OJSLte:
+		XJump(isFloat ? JULte : JSLte,j);
+		break;
+	case OJULt:
+		XJump(JULt,j);
+		break;
+	case OJEq:
+		XJump(JEq,j);
+		break;
+	case OJNotEq:
+		XJump(JNeq,j);
+		break;
+	case OJNotLt:
+		XJump(JUGte,j); // "not less than" uses the same condition as unsigned >=
+		break;
+	case OJNotGte:
+		XJump(JULt,j); // "not greater or equal" uses the same condition as unsigned <
+		break;
+	default:
+		j = 0;
+		printf("Unknown JUMP %d\n",op);
+		break;
+	}
+	return j;
+}
+
+static void register_jump( jit_ctx *ctx, int pos, int target ) { // record a jump located at pos whose destination is opcode index target
+	jlist *entry = (jlist*)hl_malloc(&ctx->falloc, sizeof(jlist));
+	entry->target = target;
+	entry->pos = pos;
+	entry->next = ctx->jumps; // prepend to the list of recorded jumps
+	ctx->jumps = entry;
+	if( target == 0 || ctx->opsPos[target] != 0 ) return;
+	ctx->opsPos[target] = -1; // an opsPos entry that is still 0 is flagged with -1
+}
+
+#define HDYN_VALUE 8 // byte offset at which the boxed value is read below; presumably the offset of `v` inside vdynamic - confirm
+
+static void dyn_value_compare( jit_ctx *ctx, preg *a, preg *b, hl_type *t ) { // a and b hold pointers to boxed values of type t: load both values and emit the compare
+	preg p;
+	switch( t->kind ) {
+	case HUI8:
+	case HBOOL:
+		op32(ctx,MOV8,a,pmem(&p,a->id,HDYN_VALUE)); // each pointer register is overwritten with the value it pointed to
+		op32(ctx,MOV8,b,pmem(&p,b->id,HDYN_VALUE));
+		op64(ctx,CMP8,a,b);
+		break;
+	case HUI16:
+		op32(ctx,MOV16,a,pmem(&p,a->id,HDYN_VALUE));
+		op32(ctx,MOV16,b,pmem(&p,b->id,HDYN_VALUE));
+		op64(ctx,CMP16,a,b);
+		break;
+	case HI32:
+		op32(ctx,MOV,a,pmem(&p,a->id,HDYN_VALUE));
+		op32(ctx,MOV,b,pmem(&p,b->id,HDYN_VALUE));
+		op64(ctx,CMP,a,b);
+		break;
+	case HF32:
+		{
+			preg *fa = alloc_reg(ctx, RFPU);
+			preg *fb = alloc_reg(ctx, RFPU);
+			op64(ctx,MOVSS,fa,pmem(&p,a->id,HDYN_VALUE));
+			op64(ctx,MOVSS,fb,pmem(&p,b->id,HDYN_VALUE));
+			op64(ctx,COMISS,fa,fb); // single-precision compare to match the MOVSS loads (COMISD would compare the 64-bit lanes: wrong for NaN and -0.0)
+		}
+		break;
+	case HF64:
+		{
+			preg *fa = alloc_reg(ctx, RFPU);
+			preg *fb = alloc_reg(ctx, RFPU);
+			op64(ctx,MOVSD,fa,pmem(&p,a->id,HDYN_VALUE));
+			op64(ctx,MOVSD,fb,pmem(&p,b->id,HDYN_VALUE));
+			op64(ctx,COMISD,fa,fb);
+		}
+		break;
+	case HI64:
+	default:
+		// ptr comparison (also used for HI64): full-width load and compare
+		op64(ctx,MOV,a,pmem(&p,a->id,HDYN_VALUE));
+		op64(ctx,MOV,b,pmem(&p,b->id,HDYN_VALUE));
+		op64(ctx,CMP,a,b);
+		break;
+	}
+}
+
+static void op_jump( jit_ctx *ctx, vreg *a, vreg *b, hl_opcode *op, int targetPos ) { // emit the comparison of a and b required by op->op and register the jump(s) to opcode targetPos
+	if( a->t->kind == HDYN || b->t->kind == HDYN || a->t->kind == HFUN || b->t->kind == HFUN ) { // dynamic or function operands: compared at runtime by hl_dyn_compare
+		int args[] = { a->stack.id, b->stack.id };
+		int size = prepare_call_args(ctx,2,args,ctx->vregs,0);
+		call_native(ctx,hl_dyn_compare,size);
+		if( op->op == OJSGt || op->op == OJSGte ) {
+			preg p;
+			int jinvalid;
+			op32(ctx,CMP,PEAX,pconst(&p,hl_invalid_comparison)); // an hl_invalid_comparison result must not take the jump
+			XJump_small(JEq,jinvalid);
+			op32(ctx,TEST,PEAX,PEAX);
+			register_jump(ctx,do_jump(ctx,op->op, IS_FLOAT(a)),targetPos);
+			patch_jump(ctx,jinvalid);
+			return;
+		}
+		op32(ctx,TEST,PEAX,PEAX); // sets the flags from the compare result for the common jump emitted at the end
+	} else switch( a->t->kind ) {
+	case HTYPE:
+		{
+			int args[] = { a->stack.id, b->stack.id };
+			int size = prepare_call_args(ctx,2,args,ctx->vregs,0);
+			preg p;
+			call_native(ctx,hl_same_type,size);
+			op64(ctx,CMP8,PEAX,pconst(&p,1)); // compare the low byte of the result with 1
+		}
+		break;
+	case HNULL:
+		{
+			preg *pa = hl_type_size(a->t->tparam) == 1 ? alloc_cpu8(ctx,a,true) : alloc_cpu(ctx,a,true);
+			preg *pb = hl_type_size(b->t->tparam) == 1 ? alloc_cpu8(ctx,b,true) : alloc_cpu(ctx,b,true);
+			if( op->op == OJEq ) {
+				// if( a == b || (a && b && a->v == b->v) ) goto
+				int ja, jb;
+				// first test: identical pointers (including both null) jump right away
+				op64(ctx,CMP,pa,pb);
+				register_jump(ctx,do_jump(ctx,OJEq,false),targetPos);
+				op64(ctx,TEST,pa,pa);
+				XJump_small(JZero,ja);
+				op64(ctx,TEST,pb,pb);
+				XJump_small(JZero,jb);
+				dyn_value_compare(ctx,pa,pb,a->t->tparam);
+				register_jump(ctx,do_jump(ctx,OJEq,false),targetPos);
+				scratch(pa);
+				scratch(pb);
+				patch_jump(ctx,ja);
+				patch_jump(ctx,jb);
+			} else if( op->op == OJNotEq ) {
+				int jeq, jcmp;
+				// if( a != b && (!a || !b || a->v != b->v) ) goto
+				op64(ctx,CMP,pa,pb);
+				XJump_small(JEq,jeq);
+				op64(ctx,TEST,pa,pa);
+				register_jump(ctx,do_jump(ctx,OJEq,false),targetPos); // a is null while b differs: jump
+				op64(ctx,TEST,pb,pb);
+				register_jump(ctx,do_jump(ctx,OJEq,false),targetPos); // b is null while a differs: jump
+				dyn_value_compare(ctx,pa,pb,a->t->tparam);
+				XJump_small(JZero,jcmp);
+				scratch(pa);
+				scratch(pb);
+				register_jump(ctx,do_jump(ctx,OJNotEq,false),targetPos);
+				patch_jump(ctx,jcmp);
+				patch_jump(ctx,jeq);
+			} else
+				ASSERT(op->op); // only OJEq / OJNotEq are handled for HNULL
+			return;
+		}
+	case HVIRTUAL:
+		{
+			preg p;
+			preg *pa = alloc_cpu(ctx,a,true);
+			preg *pb = alloc_cpu(ctx,b,true);
+			int ja,jb,jav,jbv,jvalue;
+			if( b->t->kind == HOBJ ) { // virtual against object: the word at offset HL_WSIZE of a (a->value) is compared with b
+				if( op->op == OJEq ) {
+					// if( a ? (b && a->value == b) : (b == NULL) ) goto
+					op64(ctx,TEST,pa,pa);
+					XJump_small(JZero,ja);
+					op64(ctx,TEST,pb,pb);
+					XJump_small(JZero,jb);
+					op64(ctx,MOV,pa,pmem(&p,pa->id,HL_WSIZE));
+					op64(ctx,CMP,pa,pb);
+					XJump_small(JAlways,jvalue);
+					patch_jump(ctx,ja);
+					op64(ctx,TEST,pb,pb);
+					patch_jump(ctx,jvalue);
+					register_jump(ctx,do_jump(ctx,OJEq,false),targetPos);
+					patch_jump(ctx,jb);
+				} else if( op->op == OJNotEq ) {
+					// if( a ? (b == NULL || a->value != b) : (b != NULL) ) goto
+					op64(ctx,TEST,pa,pa);
+					XJump_small(JZero,ja);
+					op64(ctx,TEST,pb,pb);
+					register_jump(ctx,do_jump(ctx,OJEq,false),targetPos);
+					op64(ctx,MOV,pa,pmem(&p,pa->id,HL_WSIZE));
+					op64(ctx,CMP,pa,pb);
+					XJump_small(JAlways,jvalue);
+					patch_jump(ctx,ja);
+					op64(ctx,TEST,pb,pb);
+					patch_jump(ctx,jvalue);
+					register_jump(ctx,do_jump(ctx,OJNotEq,false),targetPos);
+				} else
+					ASSERT(op->op);
+				scratch(pa); // pa was overwritten with a->value above
+				return;
+			}
+			op64(ctx,CMP,pa,pb);
+			if( op->op == OJEq ) {
+				// if( a == b || (a && b && a->value && b->value && a->value == b->value) ) goto
+				register_jump(ctx,do_jump(ctx,OJEq, false),targetPos);
+				op64(ctx,TEST,pa,pa);
+				XJump_small(JZero,ja);
+				op64(ctx,TEST,pb,pb);
+				XJump_small(JZero,jb);
+				op64(ctx,MOV,pa,pmem(&p,pa->id,HL_WSIZE));
+				op64(ctx,TEST,pa,pa);
+				XJump_small(JZero,jav);
+				op64(ctx,MOV,pb,pmem(&p,pb->id,HL_WSIZE));
+				op64(ctx,TEST,pb,pb);
+				XJump_small(JZero,jbv);
+				op64(ctx,CMP,pa,pb);
+				XJump_small(JNeq,jvalue);
+				register_jump(ctx,do_jump(ctx,OJEq, false),targetPos);
+				patch_jump(ctx,ja);
+				patch_jump(ctx,jb);
+				patch_jump(ctx,jav);
+				patch_jump(ctx,jbv);
+				patch_jump(ctx,jvalue);
+			} else if( op->op == OJNotEq ) {
+				int jnext;
+				// if( a != b && (!a || !b || !a->value || !b->value || a->value != b->value) ) goto
+				XJump_small(JEq,jnext);
+				op64(ctx,TEST,pa,pa);
+				XJump_small(JZero,ja);
+				op64(ctx,TEST,pb,pb);
+				XJump_small(JZero,jb);
+				op64(ctx,MOV,pa,pmem(&p,pa->id,HL_WSIZE));
+				op64(ctx,TEST,pa,pa);
+				XJump_small(JZero,jav);
+				op64(ctx,MOV,pb,pmem(&p,pb->id,HL_WSIZE));
+				op64(ctx,TEST,pb,pb);
+				XJump_small(JZero,jbv);
+				op64(ctx,CMP,pa,pb);
+				XJump_small(JEq,jvalue);
+				patch_jump(ctx,ja);
+				patch_jump(ctx,jb);
+				patch_jump(ctx,jav);
+				patch_jump(ctx,jbv);
+				register_jump(ctx,do_jump(ctx,OJAlways, false),targetPos);
+				patch_jump(ctx,jnext);
+				patch_jump(ctx,jvalue);
+			} else
+				ASSERT(op->op);
+			scratch(pa); // both registers were overwritten with the ->value words
+			scratch(pb);
+			return;
+		}
+		break;
+	case HOBJ:
+	case HSTRUCT:
+		if( b->t->kind == HVIRTUAL ) {
+			op_jump(ctx,b,a,op,targetPos); // inverse
+			return;
+		}
+		if( hl_get_obj_rt(a->t)->compareFun ) { // the type provides its own compare function: call it instead of comparing pointers
+			preg *pa = alloc_cpu(ctx,a,true);
+			preg *pb = alloc_cpu(ctx,b,true);
+			preg p;
+			int jeq, ja, jb, jcmp;
+			int args[] = { a->stack.id, b->stack.id };
+			switch( op->op ) {
+			case OJEq:
+				// if( a == b || (a && b && cmp(a,b) == 0) ) goto
+				op64(ctx,CMP,pa,pb);
+				XJump_small(JEq,jeq);
+				op64(ctx,TEST,pa,pa);
+				XJump_small(JZero,ja);
+				op64(ctx,TEST,pb,pb);
+				XJump_small(JZero,jb);
+				op_call_fun(ctx,NULL,(int)(int_val)a->t->obj->rt->compareFun,2,args); // compareFun is passed to op_call_fun as a function index
+				op32(ctx,TEST,PEAX,PEAX);
+				XJump_small(JNotZero,jcmp);
+				patch_jump(ctx,jeq);
+				register_jump(ctx,do_jump(ctx,OJAlways,false),targetPos);
+				patch_jump(ctx,ja);
+				patch_jump(ctx,jb);
+				patch_jump(ctx,jcmp);
+				break;
+			case OJNotEq:
+				// if( a != b && (!a || !b || cmp(a,b) != 0) ) goto
+				op64(ctx,CMP,pa,pb);
+				XJump_small(JEq,jeq);
+				op64(ctx,TEST,pa,pa);
+				register_jump(ctx,do_jump(ctx,OJEq,false),targetPos);
+				op64(ctx,TEST,pb,pb);
+				register_jump(ctx,do_jump(ctx,OJEq,false),targetPos);
+
+				op_call_fun(ctx,NULL,(int)(int_val)a->t->obj->rt->compareFun,2,args);
+				op32(ctx,TEST,PEAX,PEAX);
+				XJump_small(JZero,jcmp);
+
+				register_jump(ctx,do_jump(ctx,OJNotEq,false),targetPos);
+				patch_jump(ctx,jcmp);
+				patch_jump(ctx,jeq);
+				break;
+			default:
+				// if( a && b && cmp(a,b) ?? 0 ) goto
+				op64(ctx,TEST,pa,pa);
+				XJump_small(JZero,ja);
+				op64(ctx,TEST,pb,pb);
+				XJump_small(JZero,jb);
+				op_call_fun(ctx,NULL,(int)(int_val)a->t->obj->rt->compareFun,2,args);
+				op32(ctx,CMP,PEAX,pconst(&p,0));
+				register_jump(ctx,do_jump(ctx,op->op,false),targetPos);
+				patch_jump(ctx,ja);
+				patch_jump(ctx,jb);
+				break;
+			}
+			return;
+		}
+		// fallthrough
+	default:
+		// make sure we have valid 8 bits registers
+		if( a->size == 1 ) alloc_cpu8(ctx,a,true);
+		if( b->size == 1 ) alloc_cpu8(ctx,b,true);
+		op_binop(ctx,NULL,a,b,op->op); // dst == NULL: only the compare is emitted
+		break;
+	}
+	register_jump(ctx,do_jump(ctx,op->op, IS_FLOAT(a)),targetPos); // common tail for every path that did not return above
+}
+
+jit_ctx *hl_jit_alloc() { // allocate a zeroed JIT context with its register table set up; NULL when malloc fails
+	jit_ctx *ctx = (jit_ctx*)malloc(sizeof(jit_ctx));
+	int cpu, fpu;
+	if( !ctx ) return NULL;
+	memset(ctx,0,sizeof(jit_ctx));
+	hl_alloc_init(&ctx->galloc);
+	hl_alloc_init(&ctx->falloc);
+	for(cpu=0;cpu<RCPU_COUNT;cpu++) {
+		preg *reg = REG_AT(cpu);
+		reg->kind = RCPU;
+		reg->id = cpu;
+	}
+	for(fpu=0;fpu<RFPU_COUNT;fpu++) {
+		preg *reg = REG_AT(XMM(fpu)); // FPU registers sit at the XMM(n) slots of the same table
+		reg->kind = RFPU;
+		reg->id = fpu;
+	}
+	return ctx;
+}
+
+void hl_jit_free( jit_ctx *ctx, h_bool can_reset ) { // release what ctx owns; ctx itself is freed only when can_reset is false
+	free(ctx->vregs);
+	ctx->vregs = NULL;
+	ctx->maxRegs = 0;
+	free(ctx->opsPos);
+	ctx->opsPos = NULL;
+	ctx->maxOps = 0;
+	free(ctx->startBuf);
+	ctx->startBuf = NULL;
+	ctx->buf.b = NULL;
+	ctx->bufSize = 0;
+	ctx->closure_list = NULL; // list heads are simply dropped here
+	ctx->switchs = NULL;
+	ctx->calls = NULL;
+	hl_free(&ctx->falloc);
+	hl_free(&ctx->galloc);
+	if( can_reset ) return;
+	free(ctx); }
+
+static void jit_nops( jit_ctx *ctx ) { // emit NOPs until the output position is a multiple of 16
+	for( ; (BUF_POS() & 15) != 0; )
+		op32(ctx, NOP, UNUSED, UNUSED);
+}
+
+#define MAX_ARGS 16 // upper bound on arguments accepted by callback_c2hl and jit_wrapper_call
+
+static void *call_jit_c2hl = NULL; // trampoline called at the end of callback_c2hl (assigned outside this chunk)
+static void *call_jit_hl2c = NULL;
+
+static void *callback_c2hl( void *_f, hl_type *t, void **args, vdynamic *ret ) {
+	/*
+		prepare stack and regs according to prepare_call_args, but by reading runtime type information
+		from the function type. The stack and regs will be setup by the trampoline function.
+		Buffer layout: [stack arguments][padding to 16 bytes][CALL_NREGS cpu slots][CALL_NREGS double slots].
+	*/
+	void **f = (void**)_f;
+	unsigned char stack[MAX_ARGS * 8 + 16 + CALL_NREGS * (HL_WSIZE + sizeof(double))]; // stack args + padding + register save area (the area was not accounted for before and could be written/read past the end)
+	call_regs cregs = {0};
+	if( t->fun->nargs > MAX_ARGS )
+		hl_error("Too many arguments for dynamic call");
+	int i, size = 0, pad = 0, pos = 0;
+	for(i=0;i<t->fun->nargs;i++) { // pass 1: assign call registers and measure the stack part
+		hl_type *at = t->fun->args[i];
+		int creg = select_call_reg(&cregs,at,i);
+		if( creg >= 0 )
+			continue;
+		size += stack_size(at);
+	}
+	pad = (-size) & 15; // round the stack part up to a multiple of 16 bytes
+	size += pad;
+	pos = 0;
+	for(i=0;i<t->fun->nargs;i++) { // pass 2: write every argument into its slot
+		// arguments are laid out in order; the trampoline pushes them starting from the end of the stack part
+		hl_type *at = t->fun->args[i];
+		void *v = args[i];
+		int creg = mapped_reg(&cregs,i);
+		void *store;
+		if( creg >= 0 ) {
+			if( REG_IS_FPU(creg) ) {
+				store = stack + size + CALL_NREGS * HL_WSIZE + (creg - XMM(0)) * sizeof(double); // XMM slots follow the cpu slots
+			} else {
+				store = stack + size + call_reg_index(creg) * HL_WSIZE;
+			}
+			switch( at->kind ) { // register slots are written at full width
+			case HBOOL:
+			case HUI8:
+				*(int_val*)store = *(unsigned char*)v;
+				break;
+			case HUI16:
+				*(int_val*)store = *(unsigned short*)v;
+				break;
+			case HI32:
+				*(int_val*)store = *(int*)v;
+				break;
+			case HF32:
+				*(void**)store = 0; // clear the whole slot before writing the 4-byte float
+				*(float*)store = *(float*)v;
+				break;
+			case HF64:
+				*(double*)store = *(double*)v;
+				break;
+			case HI64:
+			case HGUID:
+				*(int64*)store = *(int64*)v;
+				break;
+			default:
+				*(void**)store = v; // other kinds: the argument pointer itself is the value
+				break;
+			}
+		} else {
+			int tsize = stack_size(at);
+			store = stack + pos;
+			pos += tsize;
+			switch( at->kind ) {
+			case HBOOL:
+			case HUI8:
+				*(int*)store = *(unsigned char*)v;
+				break;
+			case HUI16:
+				*(int*)store = *(unsigned short*)v;
+				break;
+			case HI32:
+			case HF32:
+				*(int*)store = *(int*)v; // HF32 is copied as its raw 32 bits
+				break;
+			case HF64:
+				*(double*)store = *(double*)v;
+				break;
+			case HI64:
+			case HGUID:
+				*(int64*)store = *(int64*)v;
+				break;
+			default:
+				*(void**)store = v;
+				break;
+			}
+		}
+	}
+	pos += pad;
+	pos >>= IS_64 ? 3 : 2; // bytes to machine words, since the pointer arithmetic below is on void**
+	switch( t->fun->ret->kind ) { // call through the trampoline with a function pointer type matching the return kind
+	case HUI8:
+	case HUI16:
+	case HI32:
+	case HBOOL:
+		ret->v.i = ((int (*)(void *, void *, void *))call_jit_c2hl)(*f, (void**)&stack + pos, &stack);
+		return &ret->v.i;
+	case HI64:
+	case HGUID:
+		ret->v.i64 = ((int64 (*)(void *, void *, void *))call_jit_c2hl)(*f, (void**)&stack + pos, &stack);
+		return &ret->v.i64;
+	case HF32:
+		ret->v.f = ((float (*)(void *, void *, void *))call_jit_c2hl)(*f, (void**)&stack + pos, &stack);
+		return &ret->v.f;
+	case HF64:
+		ret->v.d = ((double (*)(void *, void *, void *))call_jit_c2hl)(*f, (void**)&stack + pos, &stack);
+		return &ret->v.d;
+	default:
+		return ((void *(*)(void *, void *, void *))call_jit_c2hl)(*f, (void**)&stack + pos, &stack);
+	}
+}
+
+static void jit_c2hl( jit_ctx *ctx ) { // arguments of the generated code: (function pointer, end of the stack-argument area, start of that area)
+	//	create the function that will be called by callback_c2hl
+	//	it will make sure to prepare the stack/regs according to native calling conventions
+	int jeq, jloop, jstart;
+	preg *fptr, *stack, *stend;
+	preg p;
+
+	op64(ctx,PUSH,PEBP,UNUSED);
+	op64(ctx,MOV,PEBP,PESP);
+
+#	ifdef HL_64
+
+	fptr = REG_AT(R10);
+	stack = PEAX;
+	stend = REG_AT(R11);
+	op64(ctx, MOV, fptr, REG_AT(CALL_REGS[0])); // move the three incoming arguments out of the call registers before these get overwritten
+	op64(ctx, MOV, stack, REG_AT(CALL_REGS[1]));
+	op64(ctx, MOV, stend, REG_AT(CALL_REGS[2]));
+
+	// set native call regs (cpu slots first, then the XMM slots, both read starting at `stack`)
+	int i;
+	for(i=0;i<CALL_NREGS;i++)
+		op64(ctx,MOV,REG_AT(CALL_REGS[i]),pmem(&p,stack->id,i*HL_WSIZE));
+	for(i=0;i<CALL_NREGS;i++)
+		op64(ctx,MOVSD,REG_AT(XMM(i)),pmem(&p,stack->id,(i+CALL_NREGS)*HL_WSIZE));
+
+#	else
+
+	// make sure the stack is aligned on 16 bytes
+	// the amount of push we will do afterwards is guaranteed to be a multiple of 16bytes by hl_callback
+#	ifdef HL_VCC
+	// VCC does not guarantee us an aligned stack...
+	op64(ctx,MOV,PEAX,PESP);
+	op64(ctx,AND,PEAX,pconst(&p,15));
+	op64(ctx,SUB,PESP,PEAX);
+#	else
+	op64(ctx,SUB,PESP,pconst(&p,8));
+#	endif
+
+	// mov arguments to regs
+	fptr = REG_AT(Eax);
+	stack = REG_AT(Edx);
+	stend = REG_AT(Ecx);
+	op64(ctx,MOV,fptr,pmem(&p,Ebp,HL_WSIZE*2));
+	op64(ctx,MOV,stack,pmem(&p,Ebp,HL_WSIZE*3));
+	op64(ctx,MOV,stend,pmem(&p,Ebp,HL_WSIZE*4));
+
+#	endif
+
+	// push stack args: walk `stack` down to `stend`, pushing one word per iteration
+	jstart = BUF_POS();
+	op64(ctx,CMP,stack,stend);
+	XJump(JEq,jeq);
+	op64(ctx,SUB,stack,pconst(&p,HL_WSIZE));
+	op64(ctx,PUSH,pmem(&p,stack->id,0),UNUSED);
+	XJump(JAlways,jloop);
+	patch_jump(ctx,jeq);
+	patch_jump_to(ctx, jloop, jstart); // close the loop: jloop jumps back to the compare at jstart
+
+	op_call(ctx,fptr,0);
+
+	// cleanup and ret (restoring ESP from EBP also drops the pushed arguments)
+	op64(ctx,MOV,PESP,PEBP);
+	op64(ctx,POP,PEBP, UNUSED);
+	op64(ctx,RET,UNUSED,UNUSED);
+}
+
+static vdynamic *jit_wrapper_call( vclosure_wrapper *c, char *stack_args, void **regs ) { // box the native arguments (regs = saved call registers, stack_args = stack part) and call c->wrappedFun dynamically
+	vdynamic *args[MAX_ARGS];
+	int i;
+	int nargs = c->cl.t->fun->nargs;
+	call_regs cregs = {0};
+	if( nargs > MAX_ARGS )
+		hl_error("Too many arguments for wrapped call");
+	cregs.nextCpu++; // skip fptr in HL64 - was passed as arg0
+	for(i=0;i<nargs;i++) {
+		hl_type *t = c->cl.t->fun->args[i];
+		int creg = select_call_reg(&cregs,t,i); // replay the register assignment used for the native call
+		if( creg < 0 ) {
+			args[i] = hl_is_dynamic(t) ? *(vdynamic**)stack_args : hl_make_dyn(stack_args,t);
+			stack_args += stack_size(t);
+		} else if( hl_is_dynamic(t) ) {
+			args[i] = *(vdynamic**)(regs + call_reg_index(creg)); // already a dynamic: taken as is
+		} else if( t->kind == HF32 || t->kind == HF64 ) {
+			args[i] = hl_make_dyn(regs + CALL_NREGS + creg - XMM(0),t); // box with the real type: an HF32 slot holds a single-precision value and must not be read as a double
+		} else {
+			args[i] = hl_make_dyn(regs + call_reg_index(creg),t);
+		}
+	}
+	return hl_dyn_call(c->wrappedFun,args,nargs);
+}
+
+static void *jit_wrapper_ptr( vclosure_wrapper *c, char *stack_args, void **regs ) { // run the wrapped call and cast its result to the closure's integer/pointer return type
+	vdynamic *result = jit_wrapper_call(c, stack_args, regs);
+	hl_type *want = c->cl.t->fun->ret;
+	switch( want->kind ) {
+	case HI64:
+	case HGUID:
+		return (void*)(int_val)hl_dyn_casti64(&result,&hlt_dyn);
+	case HBOOL:
+	case HUI8:
+	case HUI16:
+	case HI32:
+		return (void*)(int_val)hl_dyn_casti(&result,&hlt_dyn,want);
+	case HVOID:
+		return NULL;
+	default:
+		return hl_dyn_castp(&result,&hlt_dyn,want);
+	}
+}
+
+static double jit_wrapper_d( vclosure_wrapper *c, char *stack_args, void **regs ) { // variant of jit_wrapper_ptr returning the result as a double
+	vdynamic *result = jit_wrapper_call(c, stack_args, regs);
+	return hl_dyn_castd(&result,&hlt_dyn);
+}
+
+static void jit_hl2c( jit_ctx *ctx ) {
+	// create a function that is called with a vclosure_wrapper* and native args
+	// and pack and pass the args to jit_wrapper_ptr / jit_wrapper_d
+	preg p;
+	int jfloat1, jfloat2, jexit;
+	hl_type_fun *ft = NULL; // only used to compute the offset of `ret` inside hl_type_fun
+	int size;
+#	ifdef HL_64
+	preg *cl = REG_AT(CALL_REGS[0]);
+	preg *tmp = REG_AT(CALL_REGS[1]);
+#	else
+	preg *cl = REG_AT(Ecx);
+	preg *tmp = REG_AT(Edx);
+#	endif
+
+	op64(ctx,PUSH,PEBP,UNUSED);
+	op64(ctx,MOV,PEBP,PESP);
+
+#	ifdef HL_64
+	// push registers: XMM values are saved first, then the call registers are pushed below them in reverse order
+	int i;
+	op64(ctx,SUB,PESP,pconst(&p,CALL_NREGS*8));
+	for(i=0;i<CALL_NREGS;i++)
+		op64(ctx,MOVSD,pmem(&p,Esp,i*8),REG_AT(XMM(i)));
+	for(i=0;i<CALL_NREGS;i++)
+		op64(ctx,PUSH,REG_AT(CALL_REGS[CALL_NREGS - 1 - i]),UNUSED);
+#	endif
+
+	// opcodes for:
+	//		switch( arg0->t->fun->ret->kind ) {
+	//		case HF32: case HF64: return jit_wrapper_d(arg0,&args);
+	//		default: return jit_wrapper_ptr(arg0,&args);
+	//		}
+	if( !IS_64 )
+		op64(ctx,MOV,cl,pmem(&p,Ebp,HL_WSIZE*2)); // load arg0
+	op64(ctx,MOV,tmp,pmem(&p,cl->id,0)); // ->t
+	op64(ctx,MOV,tmp,pmem(&p,tmp->id,HL_WSIZE)); // ->fun
+	op64(ctx,MOV,tmp,pmem(&p,tmp->id,(int)(int_val)&ft->ret)); // ->ret
+	op32(ctx,MOV,tmp,pmem(&p,tmp->id,0)); // -> kind
+
+	op32(ctx,CMP,tmp,pconst(&p,HF64));
+	XJump_small(JEq,jfloat1);
+	op32(ctx,CMP,tmp,pconst(&p,HF32)); // NOTE(review): HF32 shares the jit_wrapper_d path, which returns a double - confirm callers expecting a float result
+	XJump_small(JEq,jfloat2);
+
+	// 64 bits : ESP + EIP (+WIN64PAD)
+	// 32 bits : ESP + EIP + PARAM0
+	int args_pos = IS_64 ? ((IS_WINCALL64 ? 32 : 0) + HL_WSIZE * 2) : (HL_WSIZE*3);
+
+	size = begin_native_call(ctx,3);
+	op64(ctx, LEA, tmp, pmem(&p,Ebp,-HL_WSIZE*CALL_NREGS*2)); // third argument: address of the saved registers
+	set_native_arg(ctx, tmp);
+	op64(ctx, LEA, tmp, pmem(&p,Ebp,args_pos)); // second argument: address of the caller's stack arguments
+	set_native_arg(ctx, tmp);
+	set_native_arg(ctx, cl); // first argument: the closure wrapper
+	call_native(ctx, jit_wrapper_ptr, size);
+	XJump_small(JAlways, jexit);
+
+	patch_jump(ctx,jfloat1);
+	patch_jump(ctx,jfloat2);
+	size = begin_native_call(ctx,3); // same call sequence, targeting jit_wrapper_d
+	op64(ctx, LEA, tmp, pmem(&p,Ebp,-HL_WSIZE*CALL_NREGS*2));
+	set_native_arg(ctx, tmp);
+	op64(ctx, LEA, tmp, pmem(&p,Ebp,args_pos));
+	set_native_arg(ctx, tmp);
+	set_native_arg(ctx, cl);
+	call_native(ctx, jit_wrapper_d, size);
+
+	patch_jump(ctx,jexit);
+	op64(ctx,MOV,PESP,PEBP);
+	op64(ctx,POP,PEBP, UNUSED);
+	op64(ctx,RET,UNUSED,UNUSED);
+}
+
+// Runtime helper reached from the generated failure stubs: wraps `msg` in a
+// dynamic of type hlt_bytes and throws it. A NULL message first triggers
+// hl_debug_break() and is then replaced by "assert".
+static void jit_fail( uchar *msg ) {
+	uchar *text = msg;
+	vdynamic *err;
+	if( !text ) {
+		hl_debug_break();
+		text = USTR("assert");
+	}
+	err = hl_alloc_dynamic(&hlt_bytes);
+	err->v.ptr = text;
+	hl_throw(err);
+}
+
+// Emits a stub that sets up a frame and calls jit_fail("Null access").
+// No epilogue is emitted after the call (jit_fail ends in hl_throw,
+// presumably never returning here).
+static void jit_null_access( jit_ctx *ctx ) {
+	int_val message = (int_val)USTR("Null access");
+	op64(ctx,PUSH,PEBP,UNUSED);
+	op64(ctx,MOV,PEBP,PESP);
+	call_native_consts(ctx, jit_fail, &message, 1);
+}
+
+// Runtime helper for null field accesses: builds the message
+// "Null access ." followed by the name of the field identified by `fhash`,
+// then throws it as an hlt_bytes dynamic.
+static void jit_null_fail( int fhash ) {
+	vbyte *name = hl_field_name(fhash);
+	hl_buffer *text = hl_alloc_buffer();
+	vdynamic *err;
+	hl_buffer_str(text, USTR("Null access ."));
+	hl_buffer_str(text, (uchar*)name);
+	err = hl_alloc_dynamic(&hlt_bytes);
+	err->v.ptr = hl_buffer_content(text,NULL);
+	hl_throw(err);
+}
+
+// Emits a stub that sets up a frame and calls jit_null_fail with the value
+// found in the caller's first stack argument slot.
+static void jit_null_field_access( jit_ctx *ctx ) {
+	preg p;
+	// offset from EBP of the first stack argument: saved EBP + return address,
+	// plus 32 extra bytes when IS_WINCALL64
+	int args_pos = (IS_WINCALL64 ? 32 : 0) + HL_WSIZE*2;
+	int size;
+	op64(ctx,PUSH,PEBP,UNUSED);
+	op64(ctx,MOV,PEBP,PESP);
+	size = begin_native_call(ctx, 1);
+	set_native_arg(ctx, pmem(&p,Ebp,args_pos));
+	call_native(ctx,jit_null_fail,size);
+}
+
+// Emits a stub that sets up a frame and calls jit_fail(NULL), i.e. the
+// debug-break + "assert" path of jit_fail.
+static void jit_assert( jit_ctx *ctx ) {
+	int_val no_message = 0;
+	op64(ctx,PUSH,PEBP,UNUSED);
+	op64(ctx,MOV,PEBP,PESP);
+	call_native_consts(ctx, jit_fail, &no_message, 1);
+}
+
+// Emits one helper stub by running `fbuild`, with a jit_nops() call before
+// and after it, and returns the buffer position at which the stub starts.
+// With WIN64_UNWIND_TABLES an unwind-table entry covering the stub is
+// recorded as well.
+static int jit_build( jit_ctx *ctx, void (*fbuild)( jit_ctx *) ) {
+	int start, finish;
+	jit_buf(ctx);
+	jit_nops(ctx);
+	start = BUF_POS();
+	fbuild(ctx);
+	finish = BUF_POS();
+	jit_nops(ctx);
+#ifdef WIN64_UNWIND_TABLES
+	{
+		int slot = ctx->nunwind++;
+		ctx->unwind_table[slot].BeginAddress = start;
+		ctx->unwind_table[slot].EndAddress = finish;
+		ctx->unwind_table[slot].UnwindData = ctx->unwind_offset;
+	}
+#endif
+	return start;
+}
+
+// Binds `m` to the JIT context and writes the per-module data that precedes
+// the generated code:
+//  - when the module has debug info, a debug table with one entry per
+//    function, every byte set to 0xFF (memset -1)
+//  - every float constant of the module, copied into the output buffer
+//  - with WIN64_UNWIND_TABLES, the unwind data and a zeroed unwind table
+//    with room for nfunctions + 10 entries
+static void hl_jit_init_module( jit_ctx *ctx, hl_module *m ) {
+	int fidx = 0;
+	ctx->m = m;
+	if( m->code->hasdebug ) {
+		ctx->debug = (hl_debug_infos*)malloc(sizeof(hl_debug_infos) * m->code->nfunctions);
+		memset(ctx->debug, -1, sizeof(hl_debug_infos) * m->code->nfunctions);
+	}
+	while( fidx < m->code->nfloats ) {
+		jit_buf(ctx);
+		*ctx->buf.d++ = m->code->floats[fidx];
+		fidx++;
+	}
+#ifdef WIN64_UNWIND_TABLES
+	jit_buf(ctx);
+	ctx->unwind_offset = BUF_POS();
+	write_unwind_data(ctx);
+
+	ctx->unwind_table = malloc(sizeof(RUNTIME_FUNCTION) * (m->code->nfunctions + 10));
+	memset(ctx->unwind_table, 0, sizeof(RUNTIME_FUNCTION) * (m->code->nfunctions + 10));
+#endif
+}
+
+// Initializes the JIT context for module `m`, then emits the shared stubs in
+// a fixed order: c2hl, hl2c, null access, assert, null field access.
+// The positions returned by jit_build are kept in ctx->c2hl / ctx->hl2c and,
+// cast to pointers, in ctx->static_functions[0..2].
+void hl_jit_init( jit_ctx *ctx, hl_module *m ) {
+	int null_access_pos, assert_pos, null_field_pos;
+	hl_jit_init_module(ctx,m);
+	ctx->c2hl = jit_build(ctx, jit_c2hl);
+	ctx->hl2c = jit_build(ctx, jit_hl2c);
+	null_access_pos = jit_build(ctx,jit_null_access);
+	ctx->static_functions[0] = (void*)(int_val)null_access_pos;
+	assert_pos = jit_build(ctx,jit_assert);
+	ctx->static_functions[1] = (void*)(int_val)assert_pos;
+	null_field_pos = jit_build(ctx,jit_null_field_access);
+	ctx->static_functions[2] = (void*)(int_val)null_field_pos;
+}
+
+// Re-runs the per-module initialization for `m` on an existing context.
+// Unlike hl_jit_init, the shared stubs (c2hl, hl2c, failure stubs) are not
+// emitted again.
+void hl_jit_reset( jit_ctx *ctx, hl_module *m ) {
+	// NOTE(review): the previous debug table is dropped without being freed;
+	// presumably it is owned elsewhere by this point - confirm against callers.
+	ctx->debug = NULL;
+	hl_jit_init_module(ctx,m);
+}
+
+// Selects the hl_dyn_cast* runtime function matching the kind of `t`:
+// f32 -> castf, f64 -> castd, i64/guid -> casti64,
+// i32/ui16/ui8/bool -> casti, anything else -> castp.
+static void *get_dyncast( hl_type *t ) {
+	if( t->kind == HF32 )
+		return hl_dyn_castf;
+	if( t->kind == HF64 )
+		return hl_dyn_castd;
+	if( t->kind == HI64 || t->kind == HGUID )
+		return hl_dyn_casti64;
+	if( t->kind == HI32 || t->kind == HUI16 || t->kind == HUI8 || t->kind == HBOOL )
+		return hl_dyn_casti;
+	return hl_dyn_castp;
+}
+
+// Selects the hl_dyn_set* runtime function matching the kind of `t`:
+// f32 -> setf, f64 -> setd, i64/guid -> seti64,
+// i32/ui16/ui8/bool -> seti, anything else -> setp.
+static void *get_dynset( hl_type *t ) {
+	if( t->kind == HF32 )
+		return hl_dyn_setf;
+	if( t->kind == HF64 )
+		return hl_dyn_setd;
+	if( t->kind == HI64 || t->kind == HGUID )
+		return hl_dyn_seti64;
+	if( t->kind == HI32 || t->kind == HUI16 || t->kind == HUI8 || t->kind == HBOOL )
+		return hl_dyn_seti;
+	return hl_dyn_setp;
+}
+
+// Selects the hl_dyn_get* runtime function matching the kind of `t`:
+// f32 -> getf, f64 -> getd, i64/guid -> geti64,
+// i32/ui16/ui8/bool -> geti, anything else -> getp.
+static void *get_dynget( hl_type *t ) {
+	if( t->kind == HF32 )
+		return hl_dyn_getf;
+	if( t->kind == HF64 )
+		return hl_dyn_getd;
+	if( t->kind == HI64 || t->kind == HGUID )
+		return hl_dyn_geti64;
+	if( t->kind == HI32 || t->kind == HUI16 || t->kind == HUI8 || t->kind == HBOOL )
+		return hl_dyn_geti;
+	return hl_dyn_getp;
+}
+
+// Native helper called from generated code (OToUFloat): converts an unsigned
+// 32-bit value to double.
+static double uint_to_double( unsigned int v ) {
+	return (double)v;
+}
+
+// Allocates, from the module allocator, a closure without bound value for
+// function index `fid`.
+//  - HL function: `fun` temporarily holds `fid` itself and the closure is
+//    chained into ctx->closure_list through its `value` field (presumably so
+//    it can be patched once the function address is known - confirm).
+//  - Native: `fun` is taken directly from m->functions_ptrs[fid].
+static vclosure *alloc_static_closure( jit_ctx *ctx, int fid ) {
+	hl_module *mod = ctx->m;
+	vclosure *clo = hl_malloc(&mod->ctx.alloc,sizeof(vclosure));
+	int fidx = mod->functions_indexes[fid];
+	int nfuns = mod->code->nfunctions;
+	clo->hasValue = 0;
+	if( fidx < nfuns ) {
+		// function defined in the module's bytecode
+		clo->t = mod->code->functions[fidx].type;
+		clo->fun = (void*)(int_val)fid;
+		clo->value = ctx->closure_list;
+		ctx->closure_list = clo;
+		return clo;
+	}
+	// native
+	clo->t = mod->code->natives[fidx - nfuns].t;
+	clo->fun = mod->functions_ptrs[fid];
+	clo->value = NULL;
+	return clo;
+}
+
+// Emits code that converts the value held in `v` to the type of `dst` and
+// stores the result in `dst`.
+//  - Fast path: `v` is an HNULL whose parameter kind equals dst's kind and dst
+//    is an integer/bool/i64/guid or float kind. The value is loaded inline
+//    from the object, or zero is produced when the pointer is null.
+//  - Otherwise: calls the hl_dyn_cast* function selected by get_dyncast(dst->t)
+//    with the address of v's stack slot and the relevant type pointer(s).
+static void make_dyn_cast( jit_ctx *ctx, vreg *dst, vreg *v ) {
+	int size;
+	preg p; // scratch operand, (re)filled by each pconst()/pconst64()/pmem() call
+	preg *tmp;
+	if( v->t->kind == HNULL && v->t->tparam->kind == dst->t->kind ) {
+		int jnull, jend;
+		preg *out;
+		switch( dst->t->kind ) {
+		case HUI8:
+		case HUI16:
+		case HI32:
+		case HBOOL:
+		case HI64:
+		case HGUID:
+			// tmp = v ? *(v + 8) : 0, computed in place in the register holding v
+			// (offset 8 is presumably the dynamic's value slot, cf. HDYN_VALUE - confirm)
+			tmp = alloc_cpu(ctx, v, true);
+			op64(ctx, TEST, tmp, tmp);
+			XJump_small(JZero, jnull);
+			op64(ctx, MOV, tmp, pmem(&p,tmp->id,8));
+			XJump_small(JAlways, jend);
+			patch_jump(ctx, jnull);
+			op64(ctx, XOR, tmp, tmp);
+			patch_jump(ctx, jend);
+			store(ctx, dst, tmp, true);
+			return;
+		case HF32:
+		case HF64:
+			// same null test, but the value is loaded into an FPU register
+			// (MOVSS for f32, MOVSD for f64) and null yields 0.0 via XORPD
+			tmp = alloc_cpu(ctx, v, true);
+			out = alloc_fpu(ctx, dst, false);
+			op64(ctx, TEST, tmp, tmp);
+			XJump_small(JZero, jnull);
+			op64(ctx, dst->t->kind == HF32 ? MOVSS : MOVSD, out, pmem(&p,tmp->id,8));
+			XJump_small(JAlways, jend);
+			patch_jump(ctx, jnull);
+			op64(ctx, XORPD, out, out);
+			patch_jump(ctx, jend);
+			store(ctx, dst, out, true);
+			return;
+		default:
+			// other kinds fall through to the generic runtime call
+			break;
+		}
+	}
+	// Generic path. The arguments are set last-to-first: type pointer(s) here,
+	// then the address of v's stack slot below.
+	switch( dst->t->kind ) {
+	case HF32:
+	case HF64:
+	case HI64:
+	case HGUID:
+		// these casts take two arguments: (&value, source type)
+		size = begin_native_call(ctx, 2);
+		set_native_arg(ctx, pconst64(&p,(int_val)v->t));
+		break;
+	default:
+		// the remaining casts take three: (&value, source type, destination type)
+		size = begin_native_call(ctx, 3);
+		set_native_arg(ctx, pconst64(&p,(int_val)dst->t));
+		set_native_arg(ctx, pconst64(&p,(int_val)v->t));
+		break;
+	}
+	// first argument: EBP + v->stackPos, i.e. the address of v's stack slot;
+	// ADD or SUB is chosen so the emitted constant is never negative
+	tmp = alloc_native_arg(ctx);
+	op64(ctx,MOV,tmp,REG_AT(Ebp));
+	if( v->stackPos >= 0 )
+		op64(ctx,ADD,tmp,pconst(&p,v->stackPos));
+	else
+		op64(ctx,SUB,tmp,pconst(&p,-v->stackPos));
+	set_native_arg(ctx,tmp);
+	call_native(ctx,get_dyncast(dst->t),size);
+	store_result(ctx, dst);
+}
+
+int hl_jit_function( jit_ctx *ctx, hl_module *m, hl_function *f ) {
+	int i, size = 0, opCount;
+	int codePos = BUF_POS();
+	int nargs = f->type->fun->nargs;
+	unsigned short *debug16 = NULL;
+	int *debug32 = NULL;
+	call_regs cregs = {0};
+	hl_thread_info *tinf = NULL;
+	preg p;
+	ctx->f = f;
+	ctx->allocOffset = 0;
+	if( f->nregs > ctx->maxRegs ) {
+		free(ctx->vregs);
+		ctx->vregs = (vreg*)malloc(sizeof(vreg) * (f->nregs + 1));
+		if( ctx->vregs == NULL ) {
+			ctx->maxRegs = 0;
+			return -1;
+		}
+		ctx->maxRegs = f->nregs;
+	}
+	if( f->nops > ctx->maxOps ) {
+		free(ctx->opsPos);
+		ctx->opsPos = (int*)malloc(sizeof(int) * (f->nops + 1));
+		if( ctx->opsPos == NULL ) {
+			ctx->maxOps = 0;
+			return -1;
+		}
+		ctx->maxOps = f->nops;
+	}
+	memset(ctx->opsPos,0,(f->nops+1)*sizeof(int));
+	for(i=0;i<f->nregs;i++) {
+		vreg *r = R(i);
+		r->t = f->regs[i];
+		r->size = hl_type_size(r->t);
+		r->current = NULL;
+		r->stack.holds = NULL;
+		r->stack.id = i;
+		r->stack.kind = RSTACK;
+	}
+	size = 0;
+	int argsSize = 0;
+	for(i=0;i<nargs;i++) {
+		vreg *r = R(i);
+		int creg = select_call_reg(&cregs,r->t,i);
+		if( creg < 0 || IS_WINCALL64 ) {
+			// use existing stack storage
+			r->stackPos = argsSize + HL_WSIZE * 2;
+			argsSize += stack_size(r->t);
+		} else {
+			// make room in local vars
+			size += r->size;
+			size += hl_pad_size(size,r->t);
+			r->stackPos = -size;
+		}
+	}
+	for(i=nargs;i<f->nregs;i++) {
+		vreg *r = R(i);
+		size += r->size;
+		size += hl_pad_size(size,r->t); // align local vars
+		r->stackPos = -size;
+	}
+#	ifdef HL_64
+	size += (-size) & 15; // align on 16 bytes
+#	else
+	size += hl_pad_size(size,&hlt_dyn); // align on word size
+#	endif
+	ctx->totalRegsSize = size;
+	jit_buf(ctx);
+	ctx->functionPos = BUF_POS();
+	// make sure currentPos is > 0 before any reg allocations happen
+	// otherwise `alloc_reg` thinks that all registers are locked
+	ctx->currentPos = 1;
+	op_enter(ctx);
+#	ifdef HL_64
+	{
+		// store in local var
+		for(i=0;i<nargs;i++) {
+			vreg *r = R(i);
+			preg *p;
+			int reg = mapped_reg(&cregs, i);
+			if( reg < 0 ) continue;
+			p = REG_AT(reg);
+			copy(ctx,fetch(r),p,r->size);
+			p->holds = r;
+			r->current = p;
+		}
+	}
+#	endif
+	if( ctx->m->code->hasdebug ) {
+		debug16 = (unsigned short*)malloc(sizeof(unsigned short) * (f->nops + 1));
+		debug16[0] = (unsigned short)(BUF_POS() - codePos);
+	}
+	ctx->opsPos[0] = BUF_POS();
+
+	for(opCount=0;opCount<f->nops;opCount++) {
+		int jump;
+		hl_opcode *o = f->ops + opCount;
+		vreg *dst = R(o->p1);
+		vreg *ra = R(o->p2);
+		vreg *rb = R(o->p3);
+		ctx->currentPos = opCount + 1;
+		jit_buf(ctx);
+#		ifdef JIT_DEBUG
+		if( opCount == 0 || f->ops[opCount-1].op != OAsm ) {
+			int uid = opCount + (f->findex<<16);
+			op32(ctx, PUSH, pconst(&p,uid), UNUSED);
+			op64(ctx, ADD, PESP, pconst(&p,HL_WSIZE));
+		}
+#		endif
+		// emit code
+		switch( o->op ) {
+		case OMov:
+		case OUnsafeCast:
+			op_mov(ctx, dst, ra);
+			break;
+		case OInt:
+			store_const(ctx, dst, m->code->ints[o->p2]);
+			break;
+		case OBool:
+			store_const(ctx, dst, o->p2);
+			break;
+		case OGetGlobal:
+			{
+				void *addr = m->globals_data + m->globals_indexes[o->p2];
+#				ifdef HL_64
+				preg *tmp = alloc_reg(ctx, RCPU);
+				op64(ctx, MOV, tmp, pconst64(&p,(int_val)addr));
+				copy_to(ctx, dst, pmem(&p,tmp->id,0));
+#				else
+				copy_to(ctx, dst, paddr(&p,addr));
+#				endif
+			}
+			break;
+		case OSetGlobal:
+			{
+				void *addr = m->globals_data + m->globals_indexes[o->p1];
+#				ifdef HL_64
+				preg *tmp = alloc_reg(ctx, RCPU);
+				op64(ctx, MOV, tmp, pconst64(&p,(int_val)addr));
+				copy_from(ctx, pmem(&p,tmp->id,0), ra);
+#				else
+				copy_from(ctx, paddr(&p,addr), ra);
+#				endif
+			}
+			break;
+		case OCall3:
+			{
+				int args[3] = { o->p3, o->extra[0], o->extra[1] };
+				op_call_fun(ctx, dst, o->p2, 3, args);
+			}
+			break;
+		case OCall4:
+			{
+				int args[4] = { o->p3, o->extra[0], o->extra[1], o->extra[2] };
+				op_call_fun(ctx, dst, o->p2, 4, args);
+			}
+			break;
+		case OCallN:
+			op_call_fun(ctx, dst, o->p2, o->p3, o->extra);
+			break;
+		case OCall0:
+			op_call_fun(ctx, dst, o->p2, 0, NULL);
+			break;
+		case OCall1:
+			op_call_fun(ctx, dst, o->p2, 1, &o->p3);
+			break;
+		case OCall2:
+			{
+				int args[2] = { o->p3, (int)(int_val)o->extra };
+				op_call_fun(ctx, dst, o->p2, 2, args);
+			}
+			break;
+		case OSub:
+		case OAdd:
+		case OMul:
+		case OSDiv:
+		case OUDiv:
+		case OShl:
+		case OSShr:
+		case OUShr:
+		case OAnd:
+		case OOr:
+		case OXor:
+		case OSMod:
+		case OUMod:
+			op_binop(ctx, dst, ra, rb, o->op);
+			break;
+		case ONeg:
+			{
+				if( IS_FLOAT(ra) ) {
+					preg *pa = alloc_reg(ctx,RFPU);
+					preg *pb = alloc_fpu(ctx,ra,true);
+					op64(ctx,XORPD,pa,pa);
+					op64(ctx,ra->t->kind == HF32 ? SUBSS : SUBSD,pa,pb);
+					store(ctx,dst,pa,true);
+				} else if( ra->t->kind == HI64 ) {
+#					ifdef HL_64
+					preg *pa = alloc_reg(ctx,RCPU);
+					preg *pb = alloc_cpu(ctx,ra,true);
+					op64(ctx,XOR,pa,pa);
+					op64(ctx,SUB,pa,pb);
+					store(ctx,dst,pa,true);
+#					else
+					error_i64();
+#					endif
+				} else {
+					preg *pa = alloc_reg(ctx,RCPU);
+					preg *pb = alloc_cpu(ctx,ra,true);
+					op32(ctx,XOR,pa,pa);
+					op32(ctx,SUB,pa,pb);
+					store(ctx,dst,pa,true);
+				}
+			}
+			break;
+		case ONot:
+			{
+				preg *v = alloc_cpu(ctx,ra,true);
+				op32(ctx,XOR,v,pconst(&p,1));
+				store(ctx,dst,v,true);
+			}
+			break;
+		case OJFalse:
+		case OJTrue:
+		case OJNotNull:
+		case OJNull:
+			{
+				preg *r = dst->t->kind == HBOOL ? alloc_cpu8(ctx, dst, true) : alloc_cpu(ctx, dst, true);
+				op64(ctx, dst->t->kind == HBOOL ? TEST8 : TEST, r, r);
+				XJump( o->op == OJFalse || o->op == OJNull ? JZero : JNotZero,jump);
+				register_jump(ctx,jump,(opCount + 1) + o->p2);
+			}
+			break;
+		case OJEq:
+		case OJNotEq:
+		case OJSLt:
+		case OJSGte:
+		case OJSLte:
+		case OJSGt:
+		case OJULt:
+		case OJUGte:
+		case OJNotLt:
+		case OJNotGte:
+			op_jump(ctx,dst,ra,o,(opCount + 1) + o->p3);
+			break;
+		case OJAlways:
+			jump = do_jump(ctx,o->op,false);
+			register_jump(ctx,jump,(opCount + 1) + o->p1);
+			break;
+		case OToDyn:
+			if( ra->t->kind == HBOOL ) {
+				int size = begin_native_call(ctx, 1);
+				set_native_arg(ctx, fetch(ra));
+				call_native(ctx, hl_alloc_dynbool, size);
+				store(ctx, dst, PEAX, true);
+			} else {
+				int_val rt = (int_val)ra->t;
+				int jskip = 0;
+				if( hl_is_ptr(ra->t) ) {
+					int jnz;
+					preg *a = alloc_cpu(ctx,ra,true);
+					op64(ctx,TEST,a,a);
+					XJump_small(JNotZero,jnz);
+					op64(ctx,XOR,PEAX,PEAX); // will replace the result of alloc_dynamic at jump land
+					XJump_small(JAlways,jskip);
+					patch_jump(ctx,jnz);
+				}
+				call_native_consts(ctx, hl_alloc_dynamic, &rt, 1);
+				// copy value to dynamic
+				if( (IS_FLOAT(ra) || ra->size == 8) && !IS_64 ) {
+					preg *tmp = REG_AT(RCPU_SCRATCH_REGS[1]);
+					op64(ctx,MOV,tmp,&ra->stack);
+					op32(ctx,MOV,pmem(&p,Eax,HDYN_VALUE),tmp);
+					if( ra->t->kind == HF64 ) {
+						ra->stackPos += 4;
+						op64(ctx,MOV,tmp,&ra->stack);
+						op32(ctx,MOV,pmem(&p,Eax,HDYN_VALUE+4),tmp);
+						ra->stackPos -= 4;
+					}
+				} else {
+					preg *tmp = REG_AT(RCPU_SCRATCH_REGS[1]);
+					copy_from(ctx,tmp,ra);
+					op64(ctx,MOV,pmem(&p,Eax,HDYN_VALUE),tmp);
+				}
+				if( hl_is_ptr(ra->t) ) patch_jump(ctx,jskip);
+				store(ctx, dst, PEAX, true);
+			}
+			break;
+		case OToSFloat:
+			if( ra == dst ) break;
+			if (ra->t->kind == HI32 || ra->t->kind == HUI16 || ra->t->kind == HUI8) {
+				preg* r = alloc_cpu(ctx, ra, true);
+				preg* w = alloc_fpu(ctx, dst, false);
+				op32(ctx, dst->t->kind == HF64 ? CVTSI2SD : CVTSI2SS, w, r);
+				store(ctx, dst, w, true);
+			} else if (ra->t->kind == HI64 ) {
+				preg* r = alloc_cpu(ctx, ra, true);
+				preg* w = alloc_fpu(ctx, dst, false);
+				op64(ctx, dst->t->kind == HF64 ? CVTSI2SD : CVTSI2SS, w, r);
+				store(ctx, dst, w, true);
+			} else if( ra->t->kind == HF64 && dst->t->kind == HF32 ) {
+				preg *r = alloc_fpu(ctx,ra,true);
+				preg *w = alloc_fpu(ctx,dst,false);
+				op32(ctx,CVTSD2SS,w,r);
+				store(ctx, dst, w, true);
+			} else if( ra->t->kind == HF32 && dst->t->kind == HF64 ) {
+				preg *r = alloc_fpu(ctx,ra,true);
+				preg *w = alloc_fpu(ctx,dst,false);
+				op32(ctx,CVTSS2SD,w,r);
+				store(ctx, dst, w, true);
+			} else
+				ASSERT(0);
+			break;
+		case OToUFloat:
+			{
+				int size;
+				size = prepare_call_args(ctx,1,&o->p2,ctx->vregs,0);
+				call_native(ctx,uint_to_double,size);
+				store_result(ctx,dst);
+			}
+			break;
+		case OToInt:
+			if( ra == dst ) break;
+			if( ra->t->kind == HF64 ) {
+				preg *r = alloc_fpu(ctx,ra,true);
+				preg *w = alloc_cpu(ctx,dst,false);
+				preg *tmp = alloc_reg(ctx,RCPU);
+				op32(ctx,STMXCSR,pmem(&p,Esp,-4),UNUSED);
+				op32(ctx,MOV,tmp,&p);
+				op32(ctx,OR,tmp,pconst(&p,0x6000)); // set round towards 0
+				op32(ctx,MOV,pmem(&p,Esp,-8),tmp);
+				op32(ctx,LDMXCSR,&p,UNUSED);
+				op32(ctx,CVTSD2SI,w,r);
+				op32(ctx,LDMXCSR,pmem(&p,Esp,-4),UNUSED);
+				store(ctx, dst, w, true);
+			} else if (ra->t->kind == HF32) {
+				preg *r = alloc_fpu(ctx, ra, true);
+				preg *w = alloc_cpu(ctx, dst, false);
+				preg *tmp = alloc_reg(ctx, RCPU);
+				op32(ctx, STMXCSR, pmem(&p, Esp, -4), UNUSED);
+				op32(ctx, MOV, tmp, &p);
+				op32(ctx, OR, tmp, pconst(&p, 0x6000)); // set round towards 0
+				op32(ctx, MOV, pmem(&p, Esp, -8), tmp);
+				op32(ctx, LDMXCSR, &p, UNUSED);
+				op32(ctx, CVTSS2SI, w, r);
+				op32(ctx, LDMXCSR, pmem(&p, Esp, -4), UNUSED);
+				store(ctx, dst, w, true);
+			} else if( (dst->t->kind == HI64 || dst->t->kind == HGUID) && ra->t->kind == HI32 ) {
+				if( ra->current != PEAX ) {
+					op32(ctx, MOV, PEAX, fetch(ra));
+					scratch(PEAX);
+				}
+#				ifdef HL_64
+				op64(ctx, CDQE, UNUSED, UNUSED); // sign-extend Eax into Rax
+				store(ctx, dst, PEAX, true);
+#				else
+				op32(ctx, CDQ, UNUSED, UNUSED); // sign-extend Eax into Eax:Edx
+				scratch(REG_AT(Edx));
+				op32(ctx, MOV, fetch(dst), PEAX);
+				dst->stackPos += 4;
+				op32(ctx, MOV, fetch(dst), REG_AT(Edx));
+				dst->stackPos -= 4;
+			} else if( dst->t->kind == HI32 && ra->t->kind == HI64 ) {
+				error_i64();
+#				endif
+			} else {
+				preg *r = alloc_cpu(ctx,dst,false);
+				copy_from(ctx, r, ra);
+				store(ctx, dst, r, true);
+			}
+			break;
+		case ORet:
+			op_ret(ctx, dst);
+			break;
+		case OIncr:
+			{
+				if( IS_FLOAT(dst) ) {
+					ASSERT(0);
+				} else {
+					preg *v = fetch32(ctx,dst);
+					op32(ctx,INC,v,UNUSED);
+					if( v->kind != RSTACK ) store(ctx, dst, v, false);
+				}
+			}
+			break;
+		case ODecr:
+			{
+				if( IS_FLOAT(dst) ) {
+					ASSERT(0);
+				} else {
+					preg *v = fetch32(ctx,dst);
+					op32(ctx,DEC,v,UNUSED);
+					if( v->kind != RSTACK ) store(ctx, dst, v, false);
+				}
+			}
+			break;
+		case OFloat:
+			{
+				if( m->code->floats[o->p2] == 0 ) {
+					preg *f = alloc_fpu(ctx,dst,false);
+					op64(ctx,XORPD,f,f);
+				} else switch( dst->t->kind ) {
+				case HF64:
+				case HF32:
+#					ifdef HL_64
+					op64(ctx,dst->t->kind == HF32 ? CVTSD2SS : MOVSD,alloc_fpu(ctx,dst,false),pcodeaddr(&p,o->p2 * 8));
+#					else
+					op64(ctx,dst->t->kind == HF32 ? MOVSS : MOVSD,alloc_fpu(ctx,dst,false),paddr(&p,m->code->floats + o->p2));
+#					endif
+					break;
+				default:
+					ASSERT(dst->t->kind);
+				}
+				store(ctx,dst,dst->current,false);
+			}
+			break;
+		case OString:
+			op64(ctx,MOV,alloc_cpu(ctx, dst, false),pconst64(&p,(int_val)hl_get_ustring(m->code,o->p2)));
+			store(ctx,dst,dst->current,false);
+			break;
+		case OBytes:
+			{
+				char *b = m->code->version >= 5 ? m->code->bytes + m->code->bytes_pos[o->p2] : m->code->strings[o->p2];
+				op64(ctx,MOV,alloc_cpu(ctx,dst,false),pconst64(&p,(int_val)b));
+				store(ctx,dst,dst->current,false);
+			}
+			break;
+		case ONull:
+			{
+				op64(ctx,XOR,alloc_cpu(ctx, dst, false),alloc_cpu(ctx, dst, false));
+				store(ctx,dst,dst->current,false);
+			}
+			break;
+		case ONew:
+			{
+				int_val args[] = { (int_val)dst->t };
+				void *allocFun;
+				int nargs = 1;
+				switch( dst->t->kind ) {
+				case HOBJ:
+				case HSTRUCT:
+					allocFun = hl_alloc_obj;
+					break;
+				case HDYNOBJ:
+					allocFun = hl_alloc_dynobj;
+					nargs = 0;
+					break;
+				case HVIRTUAL:
+					allocFun = hl_alloc_virtual;
+					break;
+				default:
+					ASSERT(dst->t->kind);
+				}
+				call_native_consts(ctx, allocFun, args, nargs);
+				store(ctx, dst, PEAX, true);
+			}
+			break;
+		case OInstanceClosure:
+			{
+				preg *r = alloc_cpu(ctx, rb, true);
+				jlist *j = (jlist*)hl_malloc(&ctx->galloc,sizeof(jlist));
+				int size = begin_native_call(ctx,3);
+				set_native_arg(ctx,r);
+
+				j->pos = BUF_POS();
+				j->target = o->p2;
+				j->next = ctx->calls;
+				ctx->calls = j;
+
+				set_native_arg(ctx,pconst64(&p,RESERVE_ADDRESS));
+				set_native_arg(ctx,pconst64(&p,(int_val)m->code->functions[m->functions_indexes[o->p2]].type));
+				call_native(ctx,hl_alloc_closure_ptr,size);
+				store(ctx,dst,PEAX,true);
+			}
+			break;
+		case OVirtualClosure:
+			{
+				int size, i;
+				preg *r = alloc_cpu_call(ctx, ra);
+				hl_type *t = NULL;
+				hl_type *ot = ra->t;
+				while( t == NULL ) {
+					for(i=0;i<ot->obj->nproto;i++) {
+						hl_obj_proto *pp = ot->obj->proto + i;
+						if( pp->pindex == o->p3 ) {
+							t = m->code->functions[m->functions_indexes[pp->findex]].type;
+							break;
+						}
+					}
+					ot = ot->obj->super;
+				}
+				size = begin_native_call(ctx,3);
+				set_native_arg(ctx,r);
+				// read r->type->vobj_proto[i] for function address
+				op64(ctx,MOV,r,pmem(&p,r->id,0));
+				op64(ctx,MOV,r,pmem(&p,r->id,HL_WSIZE*2));
+				op64(ctx,MOV,r,pmem(&p,r->id,HL_WSIZE*o->p3));
+				set_native_arg(ctx,r);
+				op64(ctx,MOV,r,pconst64(&p,(int_val)t));
+				set_native_arg(ctx,r);
+				call_native(ctx,hl_alloc_closure_ptr,size);
+				store(ctx,dst,PEAX,true);
+			}
+			break;
+		case OCallClosure:
+			if( ra->t->kind == HDYN ) {
+				// ASM for {
+				//	vdynamic *args[] = {args};
+				//  vdynamic *ret = hl_dyn_call(closure,args,nargs);
+				//  dst = hl_dyncast(ret,t_dynamic,t_dst);
+				// }
+				int offset = o->p3 * HL_WSIZE;
+				preg *r = alloc_reg(ctx, RCPU_CALL);
+				if( offset & 15 ) offset += 16 - (offset & 15);
+				op64(ctx,SUB,PESP,pconst(&p,offset));
+				op64(ctx,MOV,r,PESP);
+				for(i=0;i<o->p3;i++) {
+					vreg *a = R(o->extra[i]);
+					if( !hl_is_dynamic(a->t) ) ASSERT(0);
+					preg *v = alloc_cpu(ctx,a,true);
+					op64(ctx,MOV,pmem(&p,r->id,i * HL_WSIZE),v);
+					RUNLOCK(v);
+				}
+#				ifdef HL_64
+				int size = begin_native_call(ctx, 3) + offset;
+				set_native_arg(ctx, pconst(&p,o->p3));
+				set_native_arg(ctx, r);
+				set_native_arg(ctx, fetch(ra));
+#				else
+				int size = pad_before_call(ctx,HL_WSIZE*2 + sizeof(int) + offset);
+				op64(ctx,PUSH,pconst(&p,o->p3),UNUSED);
+				op64(ctx,PUSH,r,UNUSED);
+				op64(ctx,PUSH,alloc_cpu(ctx,ra,true),UNUSED);
+#				endif
+				call_native(ctx,hl_dyn_call,size);
+				if( dst->t->kind != HVOID ) {
+					store(ctx,dst,PEAX,true);
+					make_dyn_cast(ctx,dst,dst);
+				}
+			} else {
+				int jhasvalue, jend, size;
+				// ASM for  if( c->hasValue ) c->fun(value,args) else c->fun(args)
+				preg *r = alloc_cpu(ctx,ra,true);
+				preg *tmp = alloc_reg(ctx, RCPU);
+				op32(ctx,MOV,tmp,pmem(&p,r->id,HL_WSIZE*2));
+				op32(ctx,TEST,tmp,tmp);
+				scratch(tmp);
+				XJump_small(JNotZero,jhasvalue);
+				save_regs(ctx);
+				size = prepare_call_args(ctx,o->p3,o->extra,ctx->vregs,0);
+				preg *rr = r;
+				if( rr->holds != ra ) rr = alloc_cpu(ctx, ra, true);
+				op_call(ctx, pmem(&p,rr->id,HL_WSIZE), size);
+				XJump_small(JAlways,jend);
+				patch_jump(ctx,jhasvalue);
+				restore_regs(ctx);
+#				ifdef HL_64
+				{
+					int regids[64];
+					preg *pc = REG_AT(CALL_REGS[0]);
+					vreg *sc = R(f->nregs); // scratch register that we temporary rebind
+					if( o->p3 >= 63 ) jit_error("assert");
+					memcpy(regids + 1, o->extra, o->p3 * sizeof(int));
+					regids[0] = f->nregs;
+					sc->size = HL_WSIZE;
+					sc->t = &hlt_dyn;
+					op64(ctx, MOV, pc, pmem(&p,r->id,HL_WSIZE*3));
+					scratch(pc);
+					sc->current = pc;
+					pc->holds = sc;
+					size = prepare_call_args(ctx,o->p3 + 1,regids,ctx->vregs,0);
+					if( r->holds != ra ) r = alloc_cpu(ctx, ra, true);
+				}
+#				else
+				size = prepare_call_args(ctx,o->p3,o->extra,ctx->vregs,HL_WSIZE);
+				if( r->holds != ra ) r = alloc_cpu(ctx, ra, true);
+				op64(ctx, PUSH,pmem(&p,r->id,HL_WSIZE*3),UNUSED); // push closure value
+#				endif
+				op_call(ctx, pmem(&p,r->id,HL_WSIZE), size);
+				discard_regs(ctx,false);
+				patch_jump(ctx,jend);
+				store_result(ctx, dst);
+			}
+			break;
+		case OStaticClosure:
+			{
+				vclosure *c = alloc_static_closure(ctx,o->p2);
+				preg *r = alloc_reg(ctx, RCPU);
+				op64(ctx, MOV, r, pconst64(&p,(int_val)c));
+				store(ctx,dst,r,true);
+			}
+			break;
+		case OField:
+			{
+#				ifndef HL_64
+				if( dst->t->kind == HI64 ) {
+					error_i64();
+					break;
+				}
+#				endif
+				switch( ra->t->kind ) {
+				case HOBJ:
+				case HSTRUCT:
+					{
+						hl_runtime_obj *rt = hl_get_obj_rt(ra->t);
+						preg *rr = alloc_cpu(ctx,ra, true);
+						if( dst->t->kind == HSTRUCT ) {
+							hl_type *ft = hl_obj_field_fetch(ra->t,o->p3)->t;
+							if( ft->kind == HPACKED ) {
+								preg *r = alloc_reg(ctx,RCPU);
+								op64(ctx,LEA,r,pmem(&p,(CpuReg)rr->id,rt->fields_indexes[o->p3]));
+								store(ctx,dst,r,true);
+								break;
+							}
+						}
+						copy_to(ctx,dst,pmem(&p, (CpuReg)rr->id, rt->fields_indexes[o->p3]));
+					}
+					break;
+				case HVIRTUAL:
+					// ASM for --> if( hl_vfields(o)[f] ) r = *hl_vfields(o)[f]; else r = hl_dyn_get(o,hash(field),vt)
+					{
+						int jhasfield, jend, size;
+						bool need_type = !(IS_FLOAT(dst) || dst->t->kind == HI64);
+						preg *v = alloc_cpu_call(ctx,ra);
+						preg *r = alloc_reg(ctx,RCPU);
+						op64(ctx,MOV,r,pmem(&p,v->id,sizeof(vvirtual)+HL_WSIZE*o->p3));
+						op64(ctx,TEST,r,r);
+						XJump_small(JNotZero,jhasfield);
+						size = begin_native_call(ctx, need_type ? 3 : 2);
+						if( need_type ) set_native_arg(ctx,pconst64(&p,(int_val)dst->t));
+						set_native_arg(ctx,pconst64(&p,(int_val)ra->t->virt->fields[o->p3].hashed_name));
+						set_native_arg(ctx,v);
+						call_native(ctx,get_dynget(dst->t),size);
+						store_result(ctx,dst);
+						XJump_small(JAlways,jend);
+						patch_jump(ctx,jhasfield);
+						copy_to(ctx, dst, pmem(&p,(CpuReg)r->id,0));
+						patch_jump(ctx,jend);
+						scratch(dst->current);
+					}
+					break;
+				default:
+					ASSERT(ra->t->kind);
+					break;
+				}
+			}
+			break;
+		case OSetField:
+			{
+				switch( dst->t->kind ) {
+				case HOBJ:
+				case HSTRUCT:
+					{
+						hl_runtime_obj *rt = hl_get_obj_rt(dst->t);
+						preg *rr = alloc_cpu(ctx, dst, true);
+						if( rb->t->kind == HSTRUCT ) {
+							hl_type *ft = hl_obj_field_fetch(dst->t,o->p2)->t;
+							if( ft->kind == HPACKED ) {
+								hl_runtime_obj *frt = hl_get_obj_rt(ft->tparam);
+								preg *prb = alloc_cpu(ctx, rb, true);
+								preg *tmp = alloc_reg(ctx, RCPU_CALL);
+								int offset = 0;
+								while( offset < frt->size ) {
+									int remain = frt->size - offset;
+									int copy_size = remain >= HL_WSIZE ? HL_WSIZE : (remain >= 4 ? 4 : (remain >= 2 ? 2 : 1));
+									copy(ctx, tmp, pmem(&p, (CpuReg)prb->id, offset), copy_size);
+									copy(ctx, pmem(&p, (CpuReg)rr->id, rt->fields_indexes[o->p2]+offset), tmp, copy_size);
+									offset += copy_size;
+								}
+								break;
+							}
+						}
+						copy_from(ctx, pmem(&p, (CpuReg)rr->id, rt->fields_indexes[o->p2]), rb);
+					}
+					break;
+				case HVIRTUAL:
+					// ASM for --> if( hl_vfields(o)[f] ) *hl_vfields(o)[f] = v; else hl_dyn_set(o,hash(field),vt,v)
+					{
+						int jhasfield, jend;
+						preg *obj = alloc_cpu_call(ctx,dst);
+						preg *r = alloc_reg(ctx,RCPU);
+						op64(ctx,MOV,r,pmem(&p,obj->id,sizeof(vvirtual)+HL_WSIZE*o->p2));
+						op64(ctx,TEST,r,r);
+						XJump_small(JNotZero,jhasfield);
+#						ifdef HL_64
+						switch( rb->t->kind ) {
+						case HF64:
+						case HF32:
+							size = begin_native_call(ctx,3);
+							set_native_arg_fpu(ctx, fetch(rb), rb->t->kind == HF32);
+							break;
+						case HI64:
+						case HGUID:
+							size = begin_native_call(ctx,3);
+							set_native_arg(ctx, fetch(rb));
+							break;
+						default:
+							size = begin_native_call(ctx, 4);
+							set_native_arg(ctx, fetch(rb));
+							set_native_arg(ctx, pconst64(&p,(int_val)rb->t));
+							break;
+						}
+						set_native_arg(ctx,pconst(&p,dst->t->virt->fields[o->p2].hashed_name));
+						set_native_arg(ctx,obj);
+#						else
+						switch( rb->t->kind ) {
+						case HF64:
+						case HI64:
+						case HGUID:
+							size = pad_before_call(ctx,HL_WSIZE*2 + sizeof(double));
+							push_reg(ctx,rb);
+							break;
+						case HF32:
+							size = pad_before_call(ctx,HL_WSIZE*2 + sizeof(float));
+							push_reg(ctx,rb);
+							break;
+						default:
+							size = pad_before_call(ctx,HL_WSIZE*4);
+							op64(ctx,PUSH,fetch32(ctx,rb),UNUSED);
+							op64(ctx,MOV,r,pconst64(&p,(int_val)rb->t));
+							op64(ctx,PUSH,r,UNUSED);
+							break;
+						}
+						op32(ctx,MOV,r,pconst(&p,dst->t->virt->fields[o->p2].hashed_name));
+						op64(ctx,PUSH,r,UNUSED);
+						op64(ctx,PUSH,obj,UNUSED);
+#						endif
+						call_native(ctx,get_dynset(rb->t),size);
+						XJump_small(JAlways,jend);
+						patch_jump(ctx,jhasfield);
+						copy_from(ctx, pmem(&p,(CpuReg)r->id,0), rb);
+						patch_jump(ctx,jend);
+						scratch(rb->current);
+					}
+					break;
+				default:
+					ASSERT(dst->t->kind);
+					break;
+				}
+			}
+			break;
+		case OGetThis:
+			{
+				vreg *r = R(0);
+				hl_runtime_obj *rt = hl_get_obj_rt(r->t);
+				preg *rr = alloc_cpu(ctx,r, true);
+				if( dst->t->kind == HSTRUCT ) {
+					hl_type *ft = hl_obj_field_fetch(r->t,o->p2)->t;
+					if( ft->kind == HPACKED ) {
+						preg *r = alloc_reg(ctx,RCPU);
+						op64(ctx,LEA,r,pmem(&p,(CpuReg)rr->id,rt->fields_indexes[o->p2]));
+						store(ctx,dst,r,true);
+						break;
+					}
+				}
+				copy_to(ctx,dst,pmem(&p, (CpuReg)rr->id, rt->fields_indexes[o->p2]));
+			}
+			break;
+		case OSetThis:
+			{
+				vreg *r = R(0);
+				hl_runtime_obj *rt = hl_get_obj_rt(r->t);
+				preg *rr = alloc_cpu(ctx, r, true);
+				if( ra->t->kind == HSTRUCT ) {
+					hl_type *ft = hl_obj_field_fetch(r->t,o->p1)->t;
+					if( ft->kind == HPACKED ) {
+						hl_runtime_obj *frt = hl_get_obj_rt(ft->tparam);
+						preg *pra = alloc_cpu(ctx, ra, true);
+						preg *tmp = alloc_reg(ctx, RCPU_CALL);
+						int offset = 0;
+						while( offset < frt->size ) {
+							int remain = frt->size - offset;
+							int copy_size = remain >= HL_WSIZE ? HL_WSIZE : (remain >= 4 ? 4 : (remain >= 2 ? 2 : 1));
+							copy(ctx, tmp, pmem(&p, (CpuReg)pra->id, offset), copy_size);
+							copy(ctx, pmem(&p, (CpuReg)rr->id, rt->fields_indexes[o->p1]+offset), tmp, copy_size);
+							offset += copy_size;
+						}
+						break;
+					}
+				}
+				copy_from(ctx, pmem(&p, (CpuReg)rr->id, rt->fields_indexes[o->p1]), ra);
+			}
+			break;
+		case OCallThis:
+			{
+				int nargs = o->p3 + 1;
+				int *args = (int*)hl_malloc(&ctx->falloc,sizeof(int) * nargs);
+				int size;
+				preg *r = alloc_cpu(ctx, R(0), true);
+				preg *tmp;
+				tmp = alloc_reg(ctx, RCPU_CALL);
+				op64(ctx,MOV,tmp,pmem(&p,r->id,0)); // read type
+				op64(ctx,MOV,tmp,pmem(&p,tmp->id,HL_WSIZE*2)); // read proto
+				args[0] = 0;
+				for(i=1;i<nargs;i++)
+					args[i] = o->extra[i-1];
+				size = prepare_call_args(ctx,nargs,args,ctx->vregs,0);
+				op_call(ctx,pmem(&p,tmp->id,o->p2*HL_WSIZE),size);
+				discard_regs(ctx, false);
+				store_result(ctx, dst);
+			}
+			break;
+		case OCallMethod:
+			switch( R(o->extra[0])->t->kind ) {
+			case HOBJ: {
+				int size;
+				preg *r = alloc_cpu(ctx, R(o->extra[0]), true);
+				preg *tmp;
+				tmp = alloc_reg(ctx, RCPU_CALL);
+				op64(ctx,MOV,tmp,pmem(&p,r->id,0)); // read type
+				op64(ctx,MOV,tmp,pmem(&p,tmp->id,HL_WSIZE*2)); // read proto
+				size = prepare_call_args(ctx,o->p3,o->extra,ctx->vregs,0);
+				op_call(ctx,pmem(&p,tmp->id,o->p2*HL_WSIZE),size);
+				discard_regs(ctx, false);
+				store_result(ctx, dst);
+				break;
+			}
+			case HVIRTUAL:
+				// ASM for --> if( hl_vfields(o)[f] ) dst = *hl_vfields(o)[f](o->value,args...); else dst = hl_dyn_call_obj(o->value,field,args,&ret)
+				{
+					int size;
+					int paramsSize;
+					int jhasfield, jend;
+					bool need_dyn;
+					bool obj_in_args = false;
+					vreg *obj = R(o->extra[0]);
+					preg *v = alloc_cpu_call(ctx,obj);
+					preg *r = alloc_reg(ctx,RCPU_CALL);
+					op64(ctx,MOV,r,pmem(&p,v->id,sizeof(vvirtual)+HL_WSIZE*o->p2));
+					op64(ctx,TEST,r,r);
+					save_regs(ctx);
+
+					if( o->p3 < 6 ) {
+						XJump_small(JNotZero,jhasfield);
+					} else {
+						XJump(JNotZero,jhasfield);
+					}
+
+					need_dyn = !hl_is_ptr(dst->t) && dst->t->kind != HVOID;
+					paramsSize = (o->p3 - 1) * HL_WSIZE;
+					if( need_dyn ) paramsSize += sizeof(vdynamic);
+					if( paramsSize & 15 ) paramsSize += 16 - (paramsSize&15);
+					op64(ctx,SUB,PESP,pconst(&p,paramsSize));
+					op64(ctx,MOV,r,PESP);
+
+					for(i=0;i<o->p3-1;i++) {
+						vreg *a = R(o->extra[i+1]);
+						if( hl_is_ptr(a->t) ) {
+							op64(ctx,MOV,pmem(&p,r->id,i*HL_WSIZE),alloc_cpu(ctx,a,true));
+							if( a->current != v ) {
+								RUNLOCK(a->current);
+							} else
+								obj_in_args = true;
+						} else {
+							preg *r2 = alloc_reg(ctx,RCPU);
+							op64(ctx,LEA,r2,&a->stack);
+							op64(ctx,MOV,pmem(&p,r->id,i*HL_WSIZE),r2);
+							if( r2 != v ) RUNLOCK(r2);
+						}
+					}
+
+					jit_buf(ctx);
+
+					if( !need_dyn ) {
+						size = begin_native_call(ctx, 5);
+						set_native_arg(ctx, pconst(&p,0));
+					} else {
+						preg *rtmp = alloc_reg(ctx,RCPU);
+						op64(ctx,LEA,rtmp,pmem(&p,Esp,paramsSize - sizeof(vdynamic)));
+						size = begin_native_call(ctx, 5);
+						set_native_arg(ctx,rtmp);
+						if( !IS_64 ) RUNLOCK(rtmp);
+					}
+					set_native_arg(ctx,r);
+					set_native_arg(ctx,pconst(&p,obj->t->virt->fields[o->p2].hashed_name)); // fid
+					set_native_arg(ctx,pconst64(&p,(int_val)obj->t->virt->fields[o->p2].t)); // ftype
+					set_native_arg(ctx,pmem(&p,v->id,HL_WSIZE)); // o->value
+					call_native(ctx,hl_dyn_call_obj,size + paramsSize);
+					if( need_dyn ) {
+						preg *r = IS_FLOAT(dst) ? REG_AT(XMM(0)) : PEAX;
+						copy(ctx,r,pmem(&p,Esp,HDYN_VALUE - (int)sizeof(vdynamic)),dst->size);
+						store(ctx, dst, r, false);
+					} else
+						store(ctx, dst, PEAX, false);
+
+					XJump_small(JAlways,jend);
+					patch_jump(ctx,jhasfield);
+					restore_regs(ctx);
+
+					if( !obj_in_args ) {
+						// o = o->value hack
+						if( v->holds ) v->holds->current = NULL;
+						obj->current = v;
+						v->holds = obj;
+						op64(ctx,MOV,v,pmem(&p,v->id,HL_WSIZE));
+						size = prepare_call_args(ctx,o->p3,o->extra,ctx->vregs,0);
+					} else {
+						// keep o->value in R(f->nregs)
+						int regids[64];
+						preg *pc = alloc_reg(ctx,RCPU_CALL);
+						vreg *sc = R(f->nregs); // scratch register that we temporary rebind
+						if( o->p3 >= 63 ) jit_error("assert");
+						memcpy(regids, o->extra, o->p3 * sizeof(int));
+						regids[0] = f->nregs;
+						sc->size = HL_WSIZE;
+						sc->t = &hlt_dyn;
+						op64(ctx, MOV, pc, pmem(&p,v->id,HL_WSIZE));
+						scratch(pc);
+						sc->current = pc;
+						pc->holds = sc;
+						size = prepare_call_args(ctx,o->p3,regids,ctx->vregs,0);
+					}
+
+					op_call(ctx,r,size);
+					discard_regs(ctx, false);
+					store_result(ctx, dst);
+					patch_jump(ctx,jend);
+				}
+				break;
+			default:
+				ASSERT(0);
+				break;
+			}
+			break;
+		case ORethrow:
+			{
+				int size = prepare_call_args(ctx,1,&o->p1,ctx->vregs,0);
+				call_native(ctx,hl_rethrow,size);
+			}
+			break;
+		case OThrow:
+			{
+				int size = prepare_call_args(ctx,1,&o->p1,ctx->vregs,0);
+				call_native(ctx,hl_throw,size);
+			}
+			break;
+		case OLabel:
+			// NOP for now
+			discard_regs(ctx,false);
+			break;
+		case OGetI8:
+		case OGetI16:
+			{
+				preg *base = alloc_cpu(ctx, ra, true);
+				preg *offset = alloc_cpu64(ctx, rb, true);
+				preg *r = alloc_reg(ctx,o->op == OGetI8 ? RCPU_8BITS : RCPU);
+				op64(ctx,XOR,r,r);
+				op32(ctx, o->op == OGetI8 ? MOV8 : MOV16,r,pmem2(&p,base->id,offset->id,1,0));
+				store(ctx, dst, r, true);
+			}
+			break;
+		case OGetMem:
+			{
+				#ifndef HL_64
+				if (dst->t->kind == HI64) {
+					error_i64();
+				}
+				#endif
+				preg *base = alloc_cpu(ctx, ra, true);
+				preg *offset = alloc_cpu64(ctx, rb, true);
+				store(ctx, dst, pmem2(&p,base->id,offset->id,1,0), false);
+			}
+			break;
+		case OSetI8:
+			{
+				preg *base = alloc_cpu(ctx, dst, true);
+				preg *offset = alloc_cpu64(ctx, ra, true);
+				preg *value = alloc_cpu8(ctx, rb, true);
+				op32(ctx,MOV8,pmem2(&p,base->id,offset->id,1,0),value);
+			}
+			break;
+		case OSetI16:
+			{
+				preg *base = alloc_cpu(ctx, dst, true);
+				preg *offset = alloc_cpu64(ctx, ra, true);
+				preg *value = alloc_cpu(ctx, rb, true);
+				op32(ctx,MOV16,pmem2(&p,base->id,offset->id,1,0),value);
+			}
+			break;
+		case OSetMem:
+			{
+				preg *base = alloc_cpu(ctx, dst, true);
+				preg *offset = alloc_cpu64(ctx, ra, true);
+				preg *value;
+				switch( rb->t->kind ) {
+				case HI32:
+					value = alloc_cpu(ctx, rb, true);
+					op32(ctx,MOV,pmem2(&p,base->id,offset->id,1,0),value);
+					break;
+				case HF32:
+					value = alloc_fpu(ctx, rb, true);
+					op32(ctx,MOVSS,pmem2(&p,base->id,offset->id,1,0),value);
+					break;
+				case HF64:
+					value = alloc_fpu(ctx, rb, true);
+					op32(ctx,MOVSD,pmem2(&p,base->id,offset->id,1,0),value);
+					break;
+				case HI64:
+				case HGUID:
+					value = alloc_cpu(ctx, rb, true);
+					op64(ctx,MOV,pmem2(&p,base->id,offset->id,1,0),value);
+					break;
+				default:
+					ASSERT(rb->t->kind);
+					break;
+				}
+			}
+			break;
+		case OType:
+			{
+				op64(ctx,MOV,alloc_cpu(ctx, dst, false),pconst64(&p,(int_val)(m->code->types + o->p2)));
+				store(ctx,dst,dst->current,false);
+			}
+			break;
+		case OGetType:
+			{
+				int jnext, jend;
+				preg *r = alloc_cpu(ctx, ra, true);
+				preg *tmp = alloc_reg(ctx, RCPU);
+				op64(ctx,TEST,r,r);
+				XJump_small(JNotZero,jnext);
+				op64(ctx,MOV, tmp, pconst64(&p,(int_val)&hlt_void));
+				XJump_small(JAlways,jend);
+				patch_jump(ctx,jnext);
+				op64(ctx, MOV, tmp, pmem(&p,r->id,0));
+				patch_jump(ctx,jend);
+				store(ctx,dst,tmp,true);
+			}
+			break;
+		case OGetArray:
+			{
+				preg *rdst = IS_FLOAT(dst) ? alloc_fpu(ctx,dst,false) : alloc_cpu(ctx,dst,false);
+				if( ra->t->kind == HABSTRACT ) {
+					int osize;
+					bool isRead = dst->t->kind != HOBJ && dst->t->kind != HSTRUCT;
+					if( isRead )
+						osize = sizeof(void*);
+					else {
+						hl_runtime_obj *rt = hl_get_obj_rt(dst->t);
+						osize = rt->size;
+					}
+					preg *idx = alloc_cpu64(ctx, rb, true);
+					op64(ctx, IMUL, idx, pconst(&p,osize));
+					op64(ctx, isRead?MOV:LEA, rdst, pmem2(&p,alloc_cpu(ctx,ra, true)->id,idx->id,1,0));
+					store(ctx,dst,dst->current,false);
+					scratch(idx);
+				} else {
+					copy(ctx, rdst, pmem2(&p,alloc_cpu(ctx,ra,true)->id,alloc_cpu64(ctx,rb,true)->id,hl_type_size(dst->t),sizeof(varray)), dst->size);
+					store(ctx,dst,dst->current,false);
+				}
+			}
+			break;
+		case OSetArray:
+			{
+				if( dst->t->kind == HABSTRACT ) {
+					int osize;
+					bool isWrite = rb->t->kind != HOBJ && rb->t->kind != HSTRUCT;
+					if( isWrite ) {
+						osize = sizeof(void*);
+					} else {
+						hl_runtime_obj *rt = hl_get_obj_rt(rb->t);
+						osize = rt->size;
+					}
+					preg *pdst = alloc_cpu(ctx,dst,true);
+					preg *pra = alloc_cpu64(ctx,ra,true);
+					op64(ctx, IMUL, pra, pconst(&p,osize));
+					op64(ctx, ADD, pdst, pra);
+					scratch(pra);
+					preg *prb = alloc_cpu(ctx,rb,true);
+					preg *tmp = alloc_reg(ctx, RCPU_CALL);
+					int offset = 0;
+					while( offset < osize ) {
+						int remain = osize - offset;
+						int copy_size = remain >= HL_WSIZE ? HL_WSIZE : (remain >= 4 ? 4 : (remain >= 2 ? 2 : 1));
+						copy(ctx, tmp, pmem(&p, prb->id, offset), copy_size);
+						copy(ctx, pmem(&p, pdst->id, offset), tmp, copy_size);
+						offset += copy_size;
+					}
+					scratch(pdst);
+				} else  {
+					preg *rrb = IS_FLOAT(rb) ? alloc_fpu(ctx,rb,true) : alloc_cpu(ctx,rb,true);
+					copy(ctx, pmem2(&p,alloc_cpu(ctx,dst,true)->id,alloc_cpu64(ctx,ra,true)->id,hl_type_size(rb->t),sizeof(varray)), rrb, rb->size);
+				}
+			}
+			break;
+		case OArraySize:
+			{
+				op32(ctx,MOV,alloc_cpu(ctx,dst,false),pmem(&p,alloc_cpu(ctx,ra,true)->id,ra->t->kind == HABSTRACT ? HL_WSIZE + 4 : HL_WSIZE*2));
+				store(ctx,dst,dst->current,false);
+			}
+			break;
+		case ORef:
+			{
+				scratch(ra->current);
+				op64(ctx,MOV,alloc_cpu(ctx,dst,false),REG_AT(Ebp));
+				if( ra->stackPos < 0 )
+					op64(ctx,SUB,dst->current,pconst(&p,-ra->stackPos));
+				else
+					op64(ctx,ADD,dst->current,pconst(&p,ra->stackPos));
+				store(ctx,dst,dst->current,false);
+			}
+			break;
+		case OUnref:
+			copy_to(ctx,dst,pmem(&p,alloc_cpu(ctx,ra,true)->id,0));
+			break;
+		case OSetref:
+			copy_from(ctx,pmem(&p,alloc_cpu(ctx,dst,true)->id,0),ra);
+			break;
+		case ORefData:
+			switch( ra->t->kind ) {
+			case HARRAY:
+				{
+					preg *r = fetch(ra);
+					preg *d = alloc_cpu(ctx,dst,false);
+					op64(ctx,MOV,d,r);
+					op64(ctx,ADD,d,pconst(&p,sizeof(varray)));
+					store(ctx,dst,dst->current,false);
+				}
+				break;
+			default:
+				ASSERT(ra->t->kind);
+			}
+			break;
+		case ORefOffset:
+			{
+				preg *d = alloc_cpu(ctx,rb,true);
+				preg *r2 = alloc_cpu(ctx,dst,false);
+				preg *r = fetch(ra);
+				int size = hl_type_size(dst->t->tparam);
+				op64(ctx,MOV,r2,r);
+				switch( size ) {
+				case 1:
+					break;
+				case 2:
+					op64(ctx,SHL,d,pconst(&p,1));
+					break;
+				case 4:
+					op64(ctx,SHL,d,pconst(&p,2));
+					break;
+				case 8:
+					op64(ctx,SHL,d,pconst(&p,3));
+					break;
+				default:
+					op64(ctx,IMUL,d,pconst(&p,size));
+					break;
+				}
+				op64(ctx,ADD,r2,d);
+				scratch(d);
+				store(ctx,dst,dst->current,false);
+			}
+			break;
+		case OToVirtual:
+			{
+#				ifdef HL_64
+				int size = pad_before_call(ctx, 0);
+				op64(ctx,MOV,REG_AT(CALL_REGS[1]),fetch(ra));
+				op64(ctx,MOV,REG_AT(CALL_REGS[0]),pconst64(&p,(int_val)dst->t));
+#				else
+				int size = pad_before_call(ctx, HL_WSIZE*2);
+				op32(ctx,PUSH,fetch(ra),UNUSED);
+				op32(ctx,PUSH,pconst(&p,(int)(int_val)dst->t),UNUSED);
+#				endif
+				if( ra->t->kind == HOBJ ) hl_get_obj_rt(ra->t); // ensure it's initialized
+				call_native(ctx,hl_to_virtual,size);
+				store(ctx,dst,PEAX,true);
+			}
+			break;
+		case OMakeEnum:
+			{
+				hl_enum_construct *c = &dst->t->tenum->constructs[o->p2];
+				int_val args[] = { (int_val)dst->t, o->p2 };
+				int i;
+				call_native_consts(ctx, hl_alloc_enum, args, 2);
+				RLOCK(PEAX);
+				for(i=0;i<c->nparams;i++) {
+					preg *r = fetch(R(o->extra[i]));
+					copy(ctx, pmem(&p,Eax,c->offsets[i]),r, R(o->extra[i])->size);
+					RUNLOCK(fetch(R(o->extra[i])));
+					if ((i & 15) == 0) jit_buf(ctx);
+				}
+				store(ctx, dst, PEAX, true);
+			}
+			break;
+		case OEnumAlloc:
+			{
+				int_val args[] = { (int_val)dst->t, o->p2 };
+				call_native_consts(ctx, hl_alloc_enum, args, 2);
+				store(ctx, dst, PEAX, true);
+			}
+			break;
+		case OEnumField:
+			{
+				hl_enum_construct *c = &ra->t->tenum->constructs[o->p3];
+				preg *r = alloc_cpu(ctx,ra,true);
+				copy_to(ctx,dst,pmem(&p,r->id,c->offsets[(int)(int_val)o->extra]));
+			}
+			break;
+		case OSetEnumField:
+			{
+				hl_enum_construct *c = &dst->t->tenum->constructs[0];
+				preg *r = alloc_cpu(ctx,dst,true);
+				switch( rb->t->kind ) {
+				case HF64:
+					{
+						preg *d = alloc_fpu(ctx,rb,true);
+						copy(ctx,pmem(&p,r->id,c->offsets[o->p2]),d,8);
+						break;
+					}
+				default:
+					copy(ctx,pmem(&p,r->id,c->offsets[o->p2]),alloc_cpu(ctx,rb,true),hl_type_size(c->params[o->p2]));
+					break;
+				}
+			}
+			break;
+		case ONullCheck:
+			{
+				int jz;
+				preg *r = alloc_cpu(ctx,dst,true);
+				op64(ctx,TEST,r,r);
+				XJump_small(JNotZero,jz);
+
+				hl_opcode *next = f->ops + opCount + 1;
+				bool null_field_access = false;
+				int hashed_name = 0;
+				// skip const and operation between nullcheck and access
+				while( (next < f->ops + f->nops - 1) && (next->op >= OInt && next->op <= ODecr) ) {
+					next++;
+				}
+				if( (next->op == OField && next->p2 == o->p1) || (next->op == OSetField && next->p1 == o->p1) ) {
+					int fid = next->op == OField ? next->p3 : next->p2;
+					hl_obj_field *f = NULL;
+					if( dst->t->kind == HOBJ || dst->t->kind == HSTRUCT )
+						f = hl_obj_field_fetch(dst->t, fid);
+					else if( dst->t->kind == HVIRTUAL )
+						f = dst->t->virt->fields + fid;
+					if( f == NULL ) ASSERT(dst->t->kind);
+					null_field_access = true;
+					hashed_name = f->hashed_name;
+				} else if( (next->op >= OCall1 && next->op <= OCallN) && next->p3 == o->p1 ) {
+					int fid = next->p2 < 0 ? -1 : ctx->m->functions_indexes[next->p2];
+					hl_function *cf = ctx->m->code->functions + fid;
+					const uchar *name = fun_field_name(cf);
+					null_field_access = true;
+					hashed_name = hl_hash_gen(name, true);
+				}
+
+				if( null_field_access ) {
+					pad_before_call(ctx, HL_WSIZE);
+					if( hashed_name >= 0 && hashed_name < 256 )
+						op64(ctx,PUSH8,pconst(&p,hashed_name),UNUSED);
+					else
+						op32(ctx,PUSH,pconst(&p,hashed_name),UNUSED);
+				} else {
+					pad_before_call(ctx, 0);
+				}
+
+				jlist *j = (jlist*)hl_malloc(&ctx->galloc,sizeof(jlist));
+				j->pos = BUF_POS();
+				j->target = null_field_access ? -3 : -1;
+				j->next = ctx->calls;
+				ctx->calls = j;
+
+				op64(ctx,MOV,PEAX,pconst64(&p,RESERVE_ADDRESS));
+				op_call(ctx,PEAX,-1);
+				patch_jump(ctx,jz);
+			}
+			break;
+		case OSafeCast:
+			make_dyn_cast(ctx, dst, ra);
+			break;
+		case ODynGet:
+			{
+				int size;
+#				ifdef HL_64
+				if( IS_FLOAT(dst) || dst->t->kind == HI64 ) {
+					size = begin_native_call(ctx,2);
+				} else {
+					size = begin_native_call(ctx,3);
+					set_native_arg(ctx,pconst64(&p,(int_val)dst->t));
+				}
+				set_native_arg(ctx,pconst64(&p,(int_val)hl_hash_utf8(m->code->strings[o->p3])));
+				set_native_arg(ctx,fetch(ra));
+#				else
+				preg *r;
+				r = alloc_reg(ctx,RCPU);
+				if( IS_FLOAT(dst) || dst->t->kind == HI64 ) {
+					size = pad_before_call(ctx,HL_WSIZE*2);
+				} else {
+					size = pad_before_call(ctx,HL_WSIZE*3);
+					op64(ctx,MOV,r,pconst64(&p,(int_val)dst->t));
+					op64(ctx,PUSH,r,UNUSED);
+				}
+				op64(ctx,MOV,r,pconst64(&p,(int_val)hl_hash_utf8(m->code->strings[o->p3])));
+				op64(ctx,PUSH,r,UNUSED);
+				op64(ctx,PUSH,fetch(ra),UNUSED);
+#				endif
+				call_native(ctx,get_dynget(dst->t),size);
+				store_result(ctx,dst);
+			}
+			break;
+		case ODynSet:
+			{
+				int size;
+#				ifdef HL_64
+				switch( rb->t->kind ) {
+				case HF32:
+				case HF64:
+					size = begin_native_call(ctx, 3);
+					set_native_arg_fpu(ctx,fetch(rb),rb->t->kind == HF32);
+					set_native_arg(ctx,pconst64(&p,hl_hash_gen(hl_get_ustring(m->code,o->p2),true)));
+					set_native_arg(ctx,fetch(dst));
+					call_native(ctx,get_dynset(rb->t),size);
+					break;
+				case HI64:
+				case HGUID:
+					size = begin_native_call(ctx, 3);
+					set_native_arg(ctx,fetch(rb));
+					set_native_arg(ctx,pconst64(&p,hl_hash_gen(hl_get_ustring(m->code,o->p2),true)));
+					set_native_arg(ctx,fetch(dst));
+					call_native(ctx,get_dynset(rb->t),size);
+					break;
+				default:
+					size = begin_native_call(ctx,4);
+					set_native_arg(ctx,fetch(rb));
+					set_native_arg(ctx,pconst64(&p,(int_val)rb->t));
+					set_native_arg(ctx,pconst64(&p,hl_hash_gen(hl_get_ustring(m->code,o->p2),true)));
+					set_native_arg(ctx,fetch(dst));
+					call_native(ctx,get_dynset(rb->t),size);
+					break;
+				}
+#				else
+				switch( rb->t->kind ) {
+				case HF32:
+					size = pad_before_call(ctx, HL_WSIZE*2 + sizeof(float));
+					push_reg(ctx,rb);
+					op32(ctx,PUSH,pconst64(&p,hl_hash_gen(hl_get_ustring(m->code,o->p2),true)),UNUSED);
+					op32(ctx,PUSH,fetch(dst),UNUSED);
+					call_native(ctx,get_dynset(rb->t),size);
+					break;
+				case HF64:
+				case HI64:
+				case HGUID:
+					size = pad_before_call(ctx, HL_WSIZE*2 + sizeof(double));
+					push_reg(ctx,rb);
+					op32(ctx,PUSH,pconst64(&p,hl_hash_gen(hl_get_ustring(m->code,o->p2),true)),UNUSED);
+					op32(ctx,PUSH,fetch(dst),UNUSED);
+					call_native(ctx,get_dynset(rb->t),size);
+					break;
+				default:
+					size = pad_before_call(ctx, HL_WSIZE*4);
+					op32(ctx,PUSH,fetch32(ctx,rb),UNUSED);
+					op32(ctx,PUSH,pconst64(&p,(int_val)rb->t),UNUSED);
+					op32(ctx,PUSH,pconst64(&p,hl_hash_gen(hl_get_ustring(m->code,o->p2),true)),UNUSED);
+					op32(ctx,PUSH,fetch(dst),UNUSED);
+					call_native(ctx,get_dynset(rb->t),size);
+					break;
+				}
+#				endif
+			}
+			break;
+		case OTrap:
+			{
+				int size, jenter, jtrap;
+				int offset = 0;
+				int trap_size = (sizeof(hl_trap_ctx) + 15) & 0xFFF0;
+				hl_trap_ctx *t = NULL;
+#				ifndef HL_THREADS
+				if( tinf == NULL ) tinf = hl_get_thread(); // single thread
+#				endif
+
+#				ifdef HL_64
+				preg *trap = REG_AT(CALL_REGS[0]);
+#				else
+				preg *trap = PEAX;
+#				endif
+				RLOCK(trap);
+
+				preg *treg = alloc_reg(ctx, RCPU);
+				if( !tinf ) {
+					call_native(ctx, hl_get_thread, 0);
+					op64(ctx,MOV,treg,PEAX);
+					offset = (int)(int_val)&tinf->trap_current;
+				} else {
+					offset = 0;
+					op64(ctx,MOV,treg,pconst64(&p,(int_val)&tinf->trap_current));
+				}
+				op64(ctx,MOV,trap,pmem(&p,treg->id,offset));
+				op64(ctx,SUB,PESP,pconst(&p,trap_size));
+				op64(ctx,MOV,pmem(&p,Esp,(int)(int_val)&t->prev),trap);
+				op64(ctx,MOV,trap,PESP);
+				op64(ctx,MOV,pmem(&p,treg->id,offset),trap);
+
+				/*
+					trap E,@catch
+					catch g
+					catch g2
+					...
+					@:catch
+
+					// Before haxe 5
+					This is a bit hackish: we want to detect the type of exception filtered by the catch, so we check the following
+					sequence of HL opcodes:
+
+					trap E,@catch
+					...
+					@catch:
+					global R, _
+					call _, ???(R,E)
+
+					??? is expected to be hl.BaseType.check
+				*/
+				hl_opcode *cat = f->ops + opCount + 1;
+				hl_opcode *next = f->ops + opCount + 1 + o->p2;
+				hl_opcode *next2 = f->ops + opCount + 2 + o->p2;
+				if( cat->op == OCatch || (next->op == OGetGlobal && next2->op == OCall2 && next2->p3 == next->p1 && dst->stack.id == (int)(int_val)next2->extra) ) {
+					int gindex = cat->op == OCatch ? cat->p1 : next->p2;
+					hl_type *gt = m->code->globals[gindex];
+					while( gt->kind == HOBJ && gt->obj->super ) gt = gt->obj->super;
+					if( gt->kind == HOBJ && gt->obj->nfields && gt->obj->fields[0].t->kind == HTYPE ) {
+						void *addr = m->globals_data + m->globals_indexes[gindex];
+#						ifdef HL_64
+						op64(ctx,MOV,treg,pconst64(&p,(int_val)addr));
+						op64(ctx,MOV,treg,pmem(&p,treg->id,0));
+#						else
+						op64(ctx,MOV,treg,paddr(&p,addr));
+#						endif
+					} else
+						op64(ctx,MOV,treg,pconst(&p,0));
+				} else {
+					op64(ctx,MOV,treg,pconst(&p,0));
+				}
+				op64(ctx,MOV,pmem(&p,Esp,(int)(int_val)&t->tcheck),treg);
+
+				// On Win64 setjmp actually takes two arguments
+				// the jump buffer and the frame pointer (or the stack pointer if there is no FP)
+#if defined(HL_WIN) && defined(HL_64)
+				size = begin_native_call(ctx, 2);
+				set_native_arg(ctx, REG_AT(Ebp));
+#else
+				size = begin_native_call(ctx, 1);
+#endif
+				set_native_arg(ctx,trap);
+#ifdef HL_MINGW
+				call_native(ctx,_setjmp,size);
+#else
+				call_native(ctx,setjmp,size);
+#endif
+				op64(ctx,TEST,PEAX,PEAX);
+				XJump_small(JZero,jenter);
+				op64(ctx,ADD,PESP,pconst(&p,trap_size));
+				if( !tinf ) {
+					call_native(ctx, hl_get_thread, 0);
+					op64(ctx,MOV,PEAX,pmem(&p, Eax, (int)(int_val)&tinf->exc_value));
+				} else {
+					op64(ctx,MOV,PEAX,pconst64(&p,(int_val)&tinf->exc_value));
+					op64(ctx,MOV,PEAX,pmem(&p, Eax, 0));
+				}
+				store(ctx,dst,PEAX,false);
+
+				jtrap = do_jump(ctx,OJAlways,false);
+				register_jump(ctx,jtrap,(opCount + 1) + o->p2);
+				patch_jump(ctx,jenter);
+			}
+			break;
+		case OEndTrap:
+			{
+				int trap_size = (sizeof(hl_trap_ctx) + 15) & 0xFFF0;
+				hl_trap_ctx *tmp = NULL;
+				preg *addr,*r;
+				int offset;
+				if (!tinf) {
+					call_native(ctx, hl_get_thread, 0);
+					addr = PEAX;
+					RLOCK(addr);
+					offset = (int)(int_val)&tinf->trap_current;
+				} else {
+					offset = 0;
+					addr = alloc_reg(ctx, RCPU);
+					op64(ctx, MOV, addr, pconst64(&p, (int_val)&tinf->trap_current));
+				}
+				r = alloc_reg(ctx, RCPU);
+				op64(ctx, MOV, r, pmem(&p,addr->id,offset));
+				op64(ctx, MOV, r, pmem(&p,r->id,(int)(int_val)&tmp->prev));
+				op64(ctx, MOV, pmem(&p,addr->id, offset), r);
+#				ifdef HL_WIN
+				// erase eip (prevent false positive)
+				{
+					_JUMP_BUFFER *b = NULL;
+#					ifdef HL_64
+					op64(ctx,MOV,pmem(&p,Esp,(int)(int_val)&(b->Rip)),PEAX);
+#					else
+					op64(ctx,MOV,pmem(&p,Esp,(int)&(b->Eip)),PEAX);
+#					endif
+				}
+#				endif
+				op64(ctx,ADD,PESP,pconst(&p,trap_size));
+			}
+			break;
+		case OEnumIndex:
+			{
+				preg *r = alloc_reg(ctx,RCPU);
+				op64(ctx,MOV,r,pmem(&p,alloc_cpu(ctx,ra,true)->id,HL_WSIZE));
+				store(ctx,dst,r,true);
+				break;
+			}
+			break;
+		case OSwitch:
+			{
+				int jdefault;
+				int i;
+				preg *r = alloc_cpu(ctx, dst, true);
+				preg *r2 = alloc_reg(ctx, RCPU);
+				op32(ctx, CMP, r, pconst(&p,o->p2));
+				XJump(JUGte,jdefault);
+				// r2 = r * 5 + eip
+#				ifdef HL_64
+				op64(ctx, XOR, r2, r2);
+#				endif
+				op32(ctx, MOV, r2, r);
+				op32(ctx, SHL, r2, pconst(&p,2));
+				op32(ctx, ADD, r2, r);
+#				ifdef HL_64
+				preg *tmp = alloc_reg(ctx, RCPU);
+				op64(ctx, MOV, tmp, pconst64(&p,RESERVE_ADDRESS));
+#				else
+				op64(ctx, ADD, r2, pconst64(&p,RESERVE_ADDRESS));
+#				endif
+				{
+					jlist *s = (jlist*)hl_malloc(&ctx->galloc, sizeof(jlist));
+					s->pos = BUF_POS() - sizeof(void*);
+					s->next = ctx->switchs;
+					ctx->switchs = s;
+				}
+#				ifdef HL_64
+				op64(ctx, ADD, r2, tmp);
+#				endif
+				op64(ctx, JMP, r2, UNUSED);
+				for(i=0;i<o->p2;i++) {
+					int j = do_jump(ctx,OJAlways,false);
+					register_jump(ctx,j,(opCount + 1) + o->extra[i]);
+					if( (i & 15) == 0 ) jit_buf(ctx);
+				}
+				patch_jump(ctx, jdefault);
+			}
+			break;
+		case OGetTID:
+			op32(ctx, MOV, alloc_cpu(ctx,dst,false), pmem(&p,alloc_cpu(ctx,ra,true)->id,0));
+			store(ctx,dst,dst->current,false);
+			break;
+		case OAssert:
+			{
+				pad_before_call(ctx, 0);
+				jlist *j = (jlist*)hl_malloc(&ctx->galloc,sizeof(jlist));
+				j->pos = BUF_POS();
+				j->target = -2;
+				j->next = ctx->calls;
+				ctx->calls = j;
+
+				op64(ctx,MOV,PEAX,pconst64(&p,RESERVE_ADDRESS));
+				op_call(ctx,PEAX,-1);
+			}
+			break;
+		case ONop:
+			break;
+		case OPrefetch:
+			{
+				preg *r = alloc_cpu(ctx, dst, true);
+				if( o->p2 > 0 ) {
+					switch( dst->t->kind ) {
+					case HOBJ:
+					case HSTRUCT:
+						{
+							hl_runtime_obj *rt = hl_get_obj_rt(dst->t);
+							preg *r2 = alloc_reg(ctx, RCPU);
+							op64(ctx, LEA, r2, pmem(&p, r->id, rt->fields_indexes[o->p2-1]));
+							r = r2;
+						}
+						break;
+					default:
+						ASSERT(dst->t->kind);
+						break;
+					}
+				}
+				switch( o->p3 ) {
+				case 0:
+					op64(ctx, PREFETCHT0, pmem(&p,r->id,0), UNUSED);
+					break;
+				case 1:
+					op64(ctx, PREFETCHT1, pmem(&p,r->id,0), UNUSED);
+					break;
+				case 2:
+					op64(ctx, PREFETCHT2, pmem(&p,r->id,0), UNUSED);
+					break;
+				case 3:
+					op64(ctx, PREFETCHNTA, pmem(&p,r->id,0), UNUSED);
+					break;
+				case 4:
+					op64(ctx, PREFETCHW, pmem(&p,r->id,0), UNUSED);
+					break;
+				default:
+					ASSERT(o->p3);
+					break;
+				}
+			}
+			break;
+		case OAsm:
+			{
+				switch( o->p1 ) {
+				case 0: // byte output
+					B(o->p2);
+					break;
+				case 1: // scratch cpu reg
+					scratch(REG_AT(o->p2));
+					break;
+				case 2: // read vm reg
+					rb--;
+					copy(ctx, REG_AT(o->p2), &rb->stack, rb->size);
+					scratch(REG_AT(o->p2));
+					break;
+				case 3: // write vm reg
+					rb--;
+					copy(ctx, &rb->stack, REG_AT(o->p2), rb->size);
+					scratch(rb->current);
+					break;
+				case 4:
+					if( ctx->totalRegsSize != 0 )
+						hl_fatal("Asm naked function should not have local variables");
+					if( opCount != 0 )
+						hl_fatal("Asm naked function should be on first opcode");
+					ctx->buf.b -= BUF_POS() - ctx->functionPos; // reset to our function start
+					break;
+				default:
+					ASSERT(o->p1);
+					break;
+				}
+			}
+			break;
+		case OCatch:
+			// Only used by OTrap typing
+			break;
+		default:
+			jit_error(hl_op_name(o->op));
+			break;
+		}
+		// we are landing at this position, assume we have lost our registers
+		if( ctx->opsPos[opCount+1] == -1 )
+			discard_regs(ctx,true);
+		ctx->opsPos[opCount+1] = BUF_POS();
+
+		// write debug infos
+		size = BUF_POS() - codePos;
+		if( debug16 && size > 0xFF00 ) {
+			debug32 = malloc(sizeof(int) * (f->nops + 1));
+			for(i=0;i<ctx->currentPos;i++)
+				debug32[i] = debug16[i];
+			free(debug16);
+			debug16 = NULL;
+		}
+		if( debug16 ) debug16[ctx->currentPos] = (unsigned short)size; else if( debug32 ) debug32[ctx->currentPos] = size;
+
+	}
+	// patch jumps
+	{
+		jlist *j = ctx->jumps;
+		while( j ) {
+			*(int*)(ctx->startBuf + j->pos) = ctx->opsPos[j->target] - (j->pos + 4);
+			j = j->next;
+		}
+		ctx->jumps = NULL;
+	}
+	int codeEndPos = BUF_POS();
+	// add nops padding
+	jit_nops(ctx);
+	// clear regs
+	for(i=0;i<REG_COUNT;i++) {
+		preg *r = REG_AT(i);
+		r->holds = NULL;
+		r->lock = 0;
+	}
+	// save debug infos
+	if( ctx->debug ) {
+		int fid = (int)(f - m->code->functions);
+		ctx->debug[fid].start = codePos;
+		ctx->debug[fid].offsets = debug32 ? (void*)debug32 : (void*)debug16;
+		ctx->debug[fid].large = debug32 != NULL;
+	}
+	// unwind info
+#ifdef WIN64_UNWIND_TABLES
+	int uw_idx = ctx->nunwind++;
+	ctx->unwind_table[uw_idx].BeginAddress = codePos;
+	ctx->unwind_table[uw_idx].EndAddress = codeEndPos;
+	ctx->unwind_table[uw_idx].UnwindData = ctx->unwind_offset;
+#endif
+	// reset tmp allocator
+	hl_free(&ctx->falloc);
+	return codePos;
+}
+
+static void *get_wrapper( hl_type *t ) { // installed as hl_setup.get_wrapper by hl_jit_code
+	return call_jit_hl2c; // t is not consulted: the same hl2c entry point is returned for every type
+}
+
+void hl_jit_patch_method( void *old_fun, void **new_fun_table ) {
+	// Overwrites the first bytes of old_fun with:
+	//   mov eax, new_fun_table   (rax with a 64-bit immediate under HL_64)
+	//   jmp [eax]
+	// so that entering old_fun jumps to the address stored at *new_fun_table.
+	// 12 bytes are written under HL_64 (2 + 8 + 2), 7 otherwise (1 + 4 + 2).
+	unsigned char *out = (unsigned char*)old_fun;
+	unsigned long long target = (unsigned long long)(int_val)new_fun_table;
+	int shift;
+#	ifdef HL_64
+	const int imm_bits = 64;
+	*out++ = 0x48; // REX.W prefix: 64-bit operand
+#	else
+	const int imm_bits = 32;
+#	endif
+
+	// mov eax, imm
+	*out++ = 0xB8;
+	// immediate operand, least significant byte first
+	for( shift = 0; shift < imm_bits; shift += 8 )
+		*out++ = (unsigned char)(target >> shift);
+
+	// jmp [eax]
+	*out++ = 0xFF;
+	*out++ = 0x20;
+}
+
+static void missing_closure(void) { // installed by hl_jit_code as the target of a closure whose function is not found in the previous module
+	hl_error("Missing static closure");
+}
+
+void *hl_jit_code( jit_ctx *ctx, hl_module *m, int *codesize, hl_debug_infos **debug, hl_module *previous ) {
+	// Copies the code generated in ctx into freshly allocated executable memory and patches every address that depends on its final location.
+	// Returns the code base and fills *codesize (size rounded up to a multiple of 4096) and *debug (ctx->debug); returns NULL if allocation or call patching fails.
+	jlist *c;
+	int size = BUF_POS();
+	unsigned char *code;
+	if( size & 4095 ) size += 4096 - (size&4095); // round up to a multiple of 4096
+	code = (unsigned char*)hl_alloc_executable_memory(size);
+	if( code == NULL ) return NULL;
+	memcpy(code,ctx->startBuf,BUF_POS()); // only the emitted bytes, not the rounding slack
+	*codesize = size;
+	*debug = ctx->debug;
+	if( !call_jit_c2hl ) { // first call only: publish the c2hl/hl2c entry points and the runtime hooks
+		call_jit_c2hl = code + ctx->c2hl;
+		call_jit_hl2c = code + ctx->hl2c;
+		hl_setup.get_wrapper = get_wrapper;
+		hl_setup.static_call = callback_c2hl;
+		hl_setup.static_call_ref = true;
+	}
+#ifdef WIN64_UNWIND_TABLES
+	m->unwind_table = ctx->unwind_table;
+	RtlAddFunctionTable(m->unwind_table, ctx->nunwind, (DWORD64)code);
+#endif
+	if( !ctx->static_function_offset ) { // done once: static_functions holds offsets until rebased onto code
+		int i;
+		ctx->static_function_offset = true;
+		for(i=0;i<(int)(sizeof(ctx->static_functions)/sizeof(void*));i++)
+			ctx->static_functions[i] = (void*)(code + (int)(int_val)ctx->static_functions[i]);
+	}
+	// patch calls: a negative target selects ctx->static_functions[-target-1], otherwise it indexes m->functions_ptrs
+	c = ctx->calls;
+	while( c ) {
+		void *fabs;
+		if( c->target < 0 )
+			fabs = ctx->static_functions[-c->target-1];
+		else {
+			fabs = m->functions_ptrs[c->target];
+			if( fabs == NULL ) {
+				// read absolute address from previous module
+				int old_idx = m->hash->functions_hashes[m->functions_indexes[c->target]];
+				if( old_idx < 0 ) return NULL; // NOTE(review): code is not released on this path
+				fabs = previous->functions_ptrs[(previous->code->functions + old_idx)->findex];
+			} else {
+				// relative
+				fabs = (unsigned char*)code + (int)(int_val)fabs;
+			}
+		}
+		if( (code[c->pos]&~3) == (IS_64?0x48:0xB8) || code[c->pos] == 0x68 ) // MOV : absolute | PUSH
+			*(void**)(code + c->pos + (IS_64?2:1)) = fabs;
+		else {
+			int_val delta = (int_val)fabs - (int_val)code - (c->pos + 5); // displacement is relative to pos+5, past the opcode byte and the 4-byte field
+			int rpos = (int)delta;
+			if( (int_val)rpos != delta ) {
+				printf("Target code too far to rebase\n");
+				return NULL; // NOTE(review): code is not released on this path either
+			}
+			*(int*)(code + c->pos + 1) = rpos;
+		}
+		c = c->next;
+	}
+	// patch switchs: each reserved slot receives the address 14 (64-bit) or 6 (32-bit) bytes past itself
+	c = ctx->switchs;
+	while( c ) {
+		*(void**)(code + c->pos) = code + c->pos + (IS_64 ? 14 : 6);
+		c = c->next;
+	}
+	// patch closures: c->fun holds a function index until it is replaced by the resolved address
+	{
+		vclosure *c = ctx->closure_list;
+		while( c ) {
+			int fidx = (int)(int_val)c->fun;
+			void *fabs = m->functions_ptrs[fidx];
+			if( fabs == NULL ) {
+				// read absolute address from previous module
+				int old_idx = m->hash->functions_hashes[m->functions_indexes[fidx]];
+				if( old_idx < 0 )
+					fabs = missing_closure;
+				else
+					fabs = previous->functions_ptrs[(previous->code->functions + old_idx)->findex];
+			} else {
+				// relative
+				fabs = (unsigned char*)code + (int)(int_val)fabs;
+			}
+			c->fun = fabs;
+			vclosure *next = (vclosure*)c->value; // the pending list is chained through value
+			c->value = NULL;
+			c = next;
+		}
+	}
+	return code;
+}
+
diff --git a/src/jit_regs.c b/src/jit_regs.c
new file mode 100644
index 000000000..50f151f06
--- /dev/null
+++ b/src/jit_regs.c
@@ -0,0 +1,813 @@
+/*
+ * Copyright (C)2015-2016 Haxe Foundation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+#include <hlmodule.h>
+#include <jit.h>
+#include "data_struct.h"
+
+#define VAL(k)		(ctx->values + (k))
+// uncomment to route the regs_debug() traces of this file to jit_debug
+//#define REGS_DEBUG
+
+#ifdef REGS_DEBUG
+#	define regs_debug	jit_debug
+#else
+#	define regs_debug(...)
+#endif
+// value of value_info.stack_pos while no stack slot has been reserved for the value
+#define INVALID	0x80000000
+// ids >= 0 index ctx->values directly; negative ids (-1,-2,...) map to the entries that follow the first value_count ones
+#define VIDX(e)	(((e) < 0) ? ctx->jit->value_count + (-(e)-1) : (e))
+#define VAL_REG(e) VAL(VIDX(e))
+#define REG_MODE(m)	(IS_FLOAT(m) ? 1 :0)
+#define REG_CFG(m)	(m ? &ctx->jit->cfg.floats : &ctx->jit->cfg.regs)
+// EMIT's first parameter is the opcode: the instruction is appended with no output register and size_offs 0
+#define EMIT(r,a,b,m)	regs_emit(ctx,UNUSED,r,a,b,m,0)
+#define BREAK()	EMIT(DEBUG_BREAK,UNUSED,UNUSED,0)
+
+typedef struct {
+	int id; // own identifier: the index in ctx->values for instruction results, negative for phi values
+	int stack_pos; // frame offset of the value's stack slot, INVALID while none is reserved
+	int last_read; // position up to which the value must stay available (-1 until a read is seen)
+	int tot_reads; // number of reads counted by the liveness pass
+	emit_mode mode; // mode of the defining instruction, or of the phi
+	ereg pref_reg; // register hint, UNUSED when there is none
+	ereg reg; // chosen location (register or stack reference), UNUSED until assigned
+} value_info;
+// instantiate the generic container of data_struct.c as `values`, holding value_info pointers
+#define S_TYPE			values
+#define S_NAME(name)	values_##name
+#define S_VALUE			value_info*
+#include "data_struct.c"
+#define values_add(set,v)		values_add_impl(DEF_ALLOC,&(set),v)
+
+struct _regs_ctx {
+	jit_ctx *jit; // owning jit context
+	value_info *values; // one entry per value and per phi of the current function
+	values scratch; // values that currently occupy a register tracked by the allocator
+	int_arr jump_regs; // (emitted position, source target) pairs recorded for every jump
+	int_arr pack_movs; // pending parallel moves as (to, from, mode) triples, consumed by flush_movs
+	int_arr *blocks_phis; // per predecessor block: (incoming value, phi value, after-flag) triples
+	int max_instrs; // capacity of instrs and out_write
+	int cur_op; // index of the source instruction being processed
+	int emit_pos; // number of instructions written to instrs so far
+	int stack_size; // bytes of frame space reserved through regs_alloc_stack
+	int stack_offset; // bytes taken by the persistent registers saved in the prologue
+	int loop_start; // start of the innermost loop being scanned by the liveness pass
+	int loop_end; // last instruction of that loop, 0 when outside any loop
+	einstr *instrs; // output instruction stream
+	ereg *out_write; // output location of each emitted instruction, parallel to instrs
+	int *pos_map; // source instruction index -> position in instrs
+	bool flushed; // set once hl_regs_flush has published the result
+	bool has_direct_call; // a returning call passing all its arguments in registers was seen
+	int persists_uses[2]; // persistent registers in use: [0] integer, [1] float
+};
+
+typedef int call_regs[2]; // argument registers handed out so far by get_call_reg: [0] integer, [1] float
+
+static ereg get_call_reg( regs_ctx *ctx, call_regs regs, emit_mode m ) {
+	// Hands out the next argument register for mode m and advances the matching counter:
+	// regs[0] for integers, regs[1] for floats; under IS_WINCALL64 both kinds share regs[0].
+	int is_float = REG_MODE(m);
+	reg_config *conf = REG_CFG(is_float);
+	int *used = &regs[IS_WINCALL64 ? 0 : is_float];
+	// every argument register of this kind is already taken: the caller gets UNUSED
+	if( *used >= conf->nargs )
+		return UNUSED;
+	return conf->arg[(*used)++];
+}
+
+static int get_stack_size( regs_ctx *ctx, emit_mode m ) {
+	// bytes taken by one stack argument: never below HL_WSIZE, nor below a non-zero cfg.stack_arg_size
+	int lower = ctx->jit->cfg.stack_arg_size;
+	int bytes = hl_emit_mode_sizes[m];
+	if( lower < HL_WSIZE ) lower = HL_WSIZE;
+	return bytes < lower ? lower : bytes;
+}
+
+static void regs_write_instr( regs_ctx *ctx, einstr *e, ereg out ) {
+	if( ctx->emit_pos == ctx->max_instrs ) { // both output buffers are full: double them (256 entries the first time)
+		int pos = ctx->emit_pos;
+		int next_size = ctx->max_instrs ? (ctx->max_instrs << 1) : 256;
+		einstr *instrs = (einstr*)malloc(sizeof(einstr) * next_size);
+		ereg *writes = (ereg*)malloc(sizeof(ereg) * next_size); // named so it does not shadow the `out` parameter
+		if( instrs == NULL || writes == NULL ) jit_error("Out of memory");
+		if( pos ) { // on the first growth the old buffers are NULL, which memcpy must never be given
+			memcpy(instrs, ctx->instrs, pos * sizeof(einstr));
+			memcpy(writes, ctx->out_write, pos * sizeof(ereg));
+		}
+		memset(instrs + pos, 0, (next_size - pos) * sizeof(einstr));
+		free(ctx->instrs); free(ctx->out_write);
+		ctx->instrs = instrs; ctx->out_write = writes;
+		ctx->max_instrs = next_size;
+	} else if( (ctx->emit_pos & 0xFF) == 0 )
+		memset(ctx->instrs + ctx->emit_pos, 0, 256 * sizeof(einstr)); // buffers are reused across functions: clear the next 256 entries
+	ctx->out_write[ctx->emit_pos] = out;
+	ctx->instrs[ctx->emit_pos++] = *e;
+}
+
+static void regs_emit( regs_ctx *ctx, ereg out, emit_op op, ereg a, ereg b, emit_mode m, int size_offs ) {
+	einstr e; // assembled field by field, then appended to the output with `out` as the location it writes
+	e.header = op; // NOTE(review): header is set before mode, presumably because it overlays it — confirm in the einstr declaration
+	e.mode = m;
+	e.a = a;
+	e.b = b;
+	e.size_offs = size_offs;
+	regs_write_instr(ctx, &e, out);
+}
+
+static void regs_emit_mov( regs_ctx *ctx, ereg to, ereg from, emit_mode m ) {
+	// moving a location onto itself needs no instruction
+	if( to != from ) regs_emit(ctx,to,MOV,from,UNUSED,m,0);
+}
+
+static int regs_alloc_stack( regs_ctx *ctx, int size ) {
+	int top = ctx->stack_size + size; // frame size once the new slot is included
+	ctx->stack_size = top + jit_pad_size(top,size); // plus the padding jit_pad_size asks for
+	return -ctx->stack_size; // the slot is identified by the negated frame size
+}
+
+#define value_str(v)	value_to_str(ctx,v)
+
+static const char *value_to_str( regs_ctx *ctx, value_info *v ) {
+	static char out[20]; // shared static buffer: the returned text is overwritten by the next call
+	snprintf(out,sizeof(out),"%s:%s", val_str(v->id,v->mode), val_str(v->reg,v->mode)); // bounded: long names are truncated instead of overrunning out[]
+	return out;
+}
+
+static void spill( regs_ctx *ctx, value_info *v ) { // moves v's location to the stack; only bookkeeping, no instruction is emitted here
+	if( v->stack_pos == INVALID ) v->stack_pos = regs_alloc_stack(ctx, hl_emit_mode_sizes[v->mode]); // first time: reserve a slot sized for the value's mode
+	v->reg = MK_STACK_REG(v->stack_pos); // the value is now designated by its stack slot
+	values_remove(&ctx->scratch,v); // it no longer occupies a tracked register
+	regs_debug("REG SPILL %s @%X\n",value_str(v),ctx->cur_op);
+}
+
+static bool regs_alloc_reg( regs_ctx *ctx, value_info *v ) {
+	// Choose a register for v. Returns true when the caller must track v in ctx->scratch, false when v received a persistent register.
+	int mode = REG_MODE(v->mode);
+	reg_config *cfg = REG_CFG(mode);
+	if( !IS_NULL(v->pref_reg) ) { // 1) honor the hint when nothing currently holds that register
+		bool free = true;
+		for_iter(values,v2,ctx->scratch) {
+			if( v2->reg == v->pref_reg ) {
+				free = false;
+				break;
+			}
+		}
+		if( free ) {
+			for(int i=0;i<ctx->persists_uses[mode];i++) // persistent registers already handed out are never given to another value
+				if( cfg->persist[i] == v->pref_reg ) {
+					free = false;
+					break;
+				}
+		}
+		if( free ) {
+			v->reg = v->pref_reg;
+			return true;
+		}
+	}
+	value_info *first = NULL; // holder of the first occupied scratch register met below, kept as eviction candidate
+	for(int i=0;i<cfg->nscratchs;i++) { // 2) first scratch register of the config that no tracked value occupies
+		ereg r = cfg->scratch[i];
+		for_iter(values,v2,ctx->scratch) {
+			if( v2->reg == r ) {
+				if( first == NULL ) first = v2;
+				r = UNUSED;
+				break;
+			}
+		}
+		if( !IS_NULL(r) ) {
+			v->reg = r;
+			return true;
+		}
+	}
+	if( ctx->persists_uses[mode] < cfg->npersists ) { // 3) otherwise start using one more persistent register
+		v->reg = cfg->persist[ctx->persists_uses[mode]++];
+		return false;
+	}
+	// 4) evict: every scratch register is occupied here, so `first` is the holder of cfg->scratch[0] (not necessarily the oldest value)
+	if( !first ) jit_assert();
+	v->reg = first->reg;
+	spill(ctx, first);
+	return true;
+}
+
+static void regs_assign( regs_ctx *ctx, value_info *v ) {
+	if( v->reg != UNUSED ) jit_assert(); // a value receives its location only once through this path
+	if( regs_alloc_reg(ctx, v) ) // false means a persistent register was taken; those are not tracked in scratch
+		values_add(ctx->scratch, v);
+	regs_debug("REG ASSIGN %s @%X-@%X\n",value_str(v),ctx->cur_op,v->last_read);
+}
+
+static void regs_write_live( regs_ctx *ctx, ereg *r ) { // callback given to hl_emit_reg_iter by regs_compute_liveness, run per visited operand
+	if( IS_NULL(*r) ) jit_assert();
+	if( !REG_IS_VAL(*r) ) return; // some are injections of native regs at emit
+	value_info *v = VAL_REG(*r);
+	int write = v->id >= 0 ? ctx->jit->values_writes[v->id] : -1; // negative ids have no values_writes entry and count as written before any loop
+	v->last_read = ctx->loop_end && write < ctx->loop_start ? ctx->loop_end : ctx->cur_op; // inside a loop, a value written before the loop stays live until the loop ends
+	v->tot_reads++; // NOTE(review): only the innermost loop range is known here — confirm a value written before an outer loop but read only in a nested one stays live long enough
+}
+
+static value_info *regs_current( regs_ctx *ctx, ereg r ) {
+	for_iter(values,v,ctx->scratch) { // only the values tracked in ctx->scratch are searched
+		if( v->reg == r )
+			return v;
+	}
+	return NULL; // no tracked value is located in r
+}
+
+static void regs_compute_liveness( regs_ctx *ctx ) {
+	// First pass over the source instructions: records each value's last read (extended to the end of an enclosing loop), seeds register hints, then builds the per-block phi move lists.
+#	define MAX_LOOP_DEPTH 256
+	int loop_saves[MAX_LOOP_DEPTH]; // (loop_start, loop_end) of the enclosing loops, two ints per nesting level
+	int loop_count = 0;
+	int write_index = 1;
+	jit_ctx *jit = ctx->jit;
+	hl_type *tret = ctx->jit->fun->type->fun->ret;
+	emit_mode mret = tret->kind == HF32 || tret->kind == HF64 ? M_F64 : M_PTR;
+	ereg ret = REG_CFG(REG_MODE(mret))->ret;
+	for(int cur_op=0;cur_op<jit->instr_count;cur_op++) {
+		einstr *e = jit->instrs + cur_op;
+		value_info *write = NULL;
+		// leaving every loop that ends on this instruction: restore the enclosing loop's range
+		while( ctx->loop_end == cur_op && cur_op ) {
+			ctx->loop_end = loop_saves[--loop_count];
+			ctx->loop_start = loop_saves[--loop_count];
+		}
+		if( write_index < jit->value_count && jit->values_writes[write_index] == cur_op )
+			write = VAL(write_index++);
+
+		ctx->cur_op = cur_op;
+		hl_emit_reg_iter(jit,e,ctx,(void*)regs_write_live);
+		if( IS_CALL(e->op) ) {
+			// anticipate register usage in call so we can privilege this assign
+			ereg *args = hl_emit_get_args(jit->emit, e);
+			call_regs regs = {0};
+			bool needs_push = false;
+			for(int k=0;k<e->nargs;k++) {
+				value_info *v = REG_IS_VAL(args[k]) ? VAL_REG(args[k]) : NULL;
+				ereg r = get_call_reg(ctx, regs, v ? v->mode : M_I32);
+				if( IS_NULL(r) ) {
+					needs_push = true;
+					continue;
+				}
+				if( v && IS_NULL(v->pref_reg) )
+					v->pref_reg = r;
+			}
+			if( !needs_push && e->mode != M_NORET ) ctx->has_direct_call = true;
+			if( write && IS_NULL(write->pref_reg) )
+				write->pref_reg = REG_CFG(REG_MODE(e->mode))->ret;
+		} else switch( e->op ) {
+		case RET:
+			if( e->a ) {
+				value_info *v = VAL_REG(e->a);
+				if( v->pref_reg == UNUSED ) v->pref_reg = ret;
+			}
+			break;
+		case BINOP:
+			switch( e->size_offs ) {
+			case OSShr:
+			case OUShr:
+			case OShl:
+				if( jit->cfg.req_bit_shifts ) VAL_REG(e->b)->pref_reg = jit->cfg.req_bit_shifts;
+				break;
+			case OSDiv:
+			case OUDiv:
+			case OSMod:
+			case OUMod:
+				if( !IS_FLOAT(e->mode) ) {
+					if( jit->cfg.req_div_a ) VAL_REG(e->a)->pref_reg = jit->cfg.req_div_a;
+					if( jit->cfg.req_div_b ) VAL_REG(e->b)->pref_reg = jit->cfg.req_div_b;
+				}
+				break;
+			}
+			break;
+		case BLOCK:
+			{
+				// are we in loop ?
+				eblock *bl = jit->blocks + e->size_offs;
+				int loop_end = -1;
+				for(int k=0;k<bl->pred_count;k++) {
+					eblock *b2 = jit->blocks + bl->preds[k];
+					if( b2->start_pos > bl->start_pos && b2->end_pos >= loop_end )
+						loop_end = b2->end_pos - 1;
+				}
+				if( loop_end > 0 ) {
+					// the save stack is a fixed-size local: refuse deeper nesting instead of writing past its end
+					if( loop_count + 2 > MAX_LOOP_DEPTH ) jit_error("Too many nested loops");
+					loop_saves[loop_count++] = ctx->loop_start;
+					loop_saves[loop_count++] = ctx->loop_end;
+					ctx->loop_start = cur_op;
+					ctx->loop_end = loop_end;
+				}
+			}
+			break;
+		default: break;
+		}
+	}
+	if( loop_count != 0 ) jit_assert();
+	// compute reverse phis
+	for(int b=0;b<jit->block_count;b++) {
+		eblock *bl = jit->blocks + b;
+		for(int p=0;p<bl->phi_count;p++) {
+			ephi *ph = bl->phis + p;
+			VAL_REG(ph->value)->mode = ph->mode;
+			for(int k=0;k<ph->nvalues;k++) {
+				ereg v = ph->values[k];
+				eblock *b2 = jit->blocks + ph->blocks[k];
+				value_info *val = VAL_REG(v);
+				int_arr *arr = &ctx->blocks_phis[b2 - jit->blocks];
+				regs_debug("ADD PHI %s:=%s to #%d@%X\n",val_str(ph->value,ph->mode),val_str(v,ph->mode),(int)(b2 - jit->blocks),b2->end_pos-1);
+				int_arr_add(*arr,v);
+				int_arr_add(*arr,ph->value);
+				int_arr_add(*arr,(bl - b2) == 1);
+				val->tot_reads++;
+				if( val->last_read < b2->end_pos )
+					val->last_read = b2->end_pos;
+			}
+		}
+	}
+}
+
+static void regs_assign_regs( regs_ctx *ctx ) {
+	jit_ctx *jit = ctx->jit; // second pass: decides the location (register or stack slot) of every value, in instruction order
+	// assign args: values 1..nargs get the next argument register of their kind, or their incoming stack slot when none is left
+	call_regs regs = {0};
+	int args_count = 0;
+	for(int i=1;i<=ctx->jit->fun->type->fun->nargs;i++) {
+		value_info *v = VAL(i);
+		einstr *e = ctx->jit->instrs + ctx->jit->values_writes[i];
+		int size = hl_emit_mode_sizes[e->mode];
+		if( size <= 0 && e->mode != M_VOID ) jit_assert();
+		ereg r = get_call_reg(ctx,regs,e->mode);
+		if( !IS_NULL(r) ) {
+			v->reg = r;
+			values_add(ctx->scratch,v);
+		}
+		if( IS_NULL(r) || IS_WINCALL64 ) {
+			// use existing stack storage (the 2 extra words presumably skip the saved frame pointer and the return address)
+			v->stack_pos = (args_count++ + 2) * HL_WSIZE;
+			if( IS_NULL(r) ) v->reg = MK_STACK_REG(v->stack_pos);
+		}
+	}
+	// assign registers
+	int write_index = 1;
+	for(int cur_op=0;cur_op<jit->instr_count;cur_op++) {
+		einstr e = jit->instrs[cur_op];
+		value_info *write = NULL;
+#		ifdef HL_DEBUG
+		int eid = (jit->fun->findex << 16) | cur_op;
+		__ignore(&eid);
+#		endif
+		ctx->cur_op = cur_op;
+
+		// `write` is the value defined by this instruction, if it defines one
+		if( write_index < jit->value_count && jit->values_writes[write_index] == cur_op ) {
+			write = VAL(write_index++);
+			// try to preserve ops in the form  A = A op B
+			if( (e.op == UNOP || e.op == BINOP) && write->pref_reg == UNUSED ) {
+				value_info *v = VAL_REG(e.a);
+				if( IS_REG(v->reg) ) write->pref_reg = v->reg;
+			}
+		}
+
+		for_iter_back(values,v,ctx->scratch) { // values not needed past this instruction give their register back first
+			if( v->last_read <= cur_op )
+				values_remove(&ctx->scratch,v);
+		}
+
+		if( IS_CALL(e.op) ) {
+			ereg *args = hl_emit_get_args(ctx->jit->emit,&e);
+			call_regs regs = {0};
+			bool will_scratch = e.mode != M_NORET; // a call that returns is treated as overwriting every tracked register
+			value_info *vcall = e.op == CALL_REG ? VAL_REG(e.a) : NULL;
+			if( will_scratch ) {
+				for_iter_back(values,v2,ctx->scratch) {
+					if( v2->last_read > cur_op )
+						spill(ctx,v2);
+				}
+			}
+			for(int k=0;k<e.nargs;k++) { // clear each argument register: another value held there, or the call target, goes to the stack
+				if( !REG_IS_VAL(args[k]) ) continue;
+				value_info *v = VAL_REG(args[k]);
+				ereg r = get_call_reg(ctx,regs,v->mode);
+				if( !IS_NULL(r) ) {
+					value_info *cur = regs_current(ctx,r);
+					if( cur && cur != v )
+						spill(ctx,cur);
+					if( vcall && vcall->reg == r )
+						spill(ctx,vcall);
+				}
+			}
+			if( will_scratch ) values_reset(&ctx->scratch);
+		}
+		switch( e.op ) {
+		case BLOCK:
+			for_iter_back(values,v,ctx->scratch) { // NOTE(review): looks already covered by the `last_read <= cur_op` sweep above
+				if( v->last_read == cur_op )
+					values_remove(&ctx->scratch,v);
+			}
+			eblock *bl = jit->blocks + e.size_offs;
+			for(int k=0;k<bl->phi_count;k++) {
+				ephi *p = bl->phis + k;
+				value_info *v = VAL_REG(p->value);
+				for(int n=0;n<p->nvalues;n++) {
+					value_info *vn = VAL_REG(p->values[n]);
+					// ignore previously set pref_reg (minimize moves)
+					if( IS_REG(vn->reg) && !regs_current(ctx,vn->reg) ) {
+						v->pref_reg = vn->reg;
+						break;
+					}
+				}
+				regs_assign(ctx, v);
+			}
+			break;
+		case CATCH:
+			{
+				for_iter_back(values,v2,ctx->scratch) // every value still tracked in a register goes to the stack
+					spill(ctx,v2);
+			}
+			break;
+		case ALLOC_STACK:
+			write->reg = MK_STACK_OFFS(regs_alloc_stack(ctx, e.size_offs)); // the result designates freshly reserved frame space, not a register
+			continue;
+		case LOAD_ARG:
+			if( write->reg == UNUSED )
+				regs_assign(ctx, write); // assign for stack reg
+			continue;
+		case ADDRESS:
+			{
+				if( REG_KIND(e.a) == R_CONST ) jit_assert();
+				value_info *v = VAL_REG(e.a);
+				spill(ctx, v); // a value whose address is taken is given a stack location
+				break;
+			}
+		default:
+			break;
+		}
+		if( write ) regs_assign(ctx, write);
+	}
+	// assign stack regs
+	int nvalues = jit->value_count + jit->phi_count;
+	ctx->stack_offset = (ctx->persists_uses[0] + ctx->persists_uses[1]) * 8; // 8 bytes per persistent register saved by the prologue
+	for(int i=0;i<nvalues;i++) {
+		value_info *v = ctx->values + i;
+		if( v->reg == UNUSED ) v->reg = MK_STACK_REG(v->stack_pos); // whatever got no location so far falls back to its stack_pos
+	}
+}
+
+static void flush_movs( regs_ctx *ctx, bool cond ) {
+	// Emits the pending (to, from, mode) triples of ctx->pack_movs as one parallel move: a move is issued once no other pending
+	// move still reads its destination, and cycles are broken with XCHG. With cond set, CMOV/CXCHG are chosen when a register is involved.
+	int_arr movs = ctx->pack_movs;
+	while( true ) {
+		int size = int_arr_count(movs);
+		if( !size ) break;
+		bool cycle = true;
+		for(int k=0;k<size;k+=3) {
+			ereg to = int_arr_get(movs,k);
+			ereg from = int_arr_get(movs,k+1);
+			if( to == from ) {
+				int_arr_remove_range(&movs,k,3);
+				cycle = false;
+				break; // `size` and `k` are stale after the removal: rescan rather than index past the shortened array
+			}
+			bool read = false;
+			for(int k2=1;k2<size;k2+=3) {
+				if( int_arr_get(movs,k2) == to ) {
+					read = true;
+					break;
+				}
+			}
+			if( !read ) {
+				int mode = int_arr_get(movs,k+2);
+				bool cmov = cond && IS_REG(to);
+				regs_emit(ctx,to,cmov?CMOV:MOV,from,UNUSED,mode,0);
+				int_arr_remove_range(&movs,k,3);
+				cycle = false;
+				break;
+			}
+		}
+		if( cycle ) {
+			ereg to = int_arr_get(movs,0);
+			ereg from = int_arr_get(movs,1);
+			int mode = int_arr_get(movs,2);
+			bool cmov = cond && (IS_REG(to) || IS_REG(from));
+			regs_emit(ctx,UNUSED,cmov?CXCHG:XCHG,to,from,mode,0);
+			int_arr_remove_range(&movs,0,3);
+			size -= 3;
+			// After XCHG(to,from) the data that was in `to` is now in `from`
+			// and vice versa.  Rename the FROM-slot of each remaining mov so
+			// it reads from the physical register where the value actually
+			// lives.  (The TO-slot is the desired destination physical reg
+			// and is unaffected by the XCHG.)
+			for(int k=0;k<size;k+=3) {
+				if( int_arr_get(movs,k+1) == to )
+					movs.values[k+1] = from;
+				else if( int_arr_get(movs,k+1) == from )
+					movs.values[k+1] = to;
+			}
+		}
+	}
+	ctx->pack_movs = movs;
+	int_arr_reset(&ctx->pack_movs);
+}
+
+static void flush_phis( regs_ctx *ctx, eblock *b, bool cond, bool after ) {
+	if( !b ) return;
+	jit_ctx *jit = ctx->jit;
+	int bid = (int)(b - jit->blocks);
+	int_arr arr = ctx->blocks_phis[bid];
+	int idx = 0;
+	int_arr movs = ctx->pack_movs;
+	// queue the phi moves recorded for block b whose after-flag equals `after`, then emit them through flush_movs
+	while( idx < int_arr_count(arr) ) {
+		ereg a = int_arr_get(arr,idx++); // incoming value
+		ereg b = int_arr_get(arr,idx++); // phi value; this local hides the block parameter, which is not used past this point
+		int bcount = int_arr_get(arr,idx++); // 1 when the phi's block directly follows this one
+		if( after != (bcount == 1) )
+			continue;
+		value_info *from = VAL_REG(a);
+		value_info *to = VAL_REG(b);
+		if( from->reg == to->reg ) continue; // both already share a location: nothing to move
+		int size = int_arr_count(movs);
+		bool dup = false;
+		for(int k=0;k<size;k+=3) { // do not queue the same (to, from) pair twice
+			if( int_arr_get(movs,k) == to->reg && int_arr_get(movs,k+1) == from->reg ) {
+				dup = true;
+				break;
+			}
+		}
+		if( !dup ) {
+			int_arr_add(movs, to->reg);
+			int_arr_add(movs, from->reg);
+			int_arr_add(movs, from->mode);
+		}
+	}
+	ctx->pack_movs = movs;
+	if( !cond ) // NOTE(review): the list is dropped even if entries were skipped by the after-flag test — confirm no caller needs them later
+		int_arr_free(&ctx->blocks_phis[bid]);
+	flush_movs(ctx, cond);
+}
+
+static void regs_emit_instrs( regs_ctx *ctx ) { // final pass: rewrites every source instruction with the chosen locations, adding call setup, prologue/epilogue and phi moves
+	jit_ctx *jit = ctx->jit;
+	eblock *cur_block = NULL;
+	call_regs regs = {0};
+	int write_index = 1;
+	ctx->pos_map[0] = 0;
+	// frame size: the locals, padded so that locals + return address + saved frame pointer + saved persistent registers honour cfg.stack_align
+	int stack_offset = ctx->stack_size;
+	int push_size = HL_WSIZE * 2 + ctx->stack_offset; // RIP + RBP save
+	if( jit->cfg.stack_align ) {
+		int align = (stack_offset + push_size) % jit->cfg.stack_align;
+		if( align ) stack_offset += jit->cfg.stack_align - align;
+	}
+
+	for(int cur_op=0;cur_op<jit->instr_count;cur_op++) {
+		einstr e = jit->instrs[cur_op];
+		ereg *ret_val = NULL;
+		int nread;
+		int instr_stack_offset = 0;
+		ctx->cur_op = cur_op;
+
+		value_info *vout = NULL;
+		ereg out = UNUSED;
+		if( write_index < jit->value_count && jit->values_writes[write_index] == cur_op ) {
+			vout = VAL(write_index++);
+			out = vout->reg;
+		}
+
+		if( IS_CALL(e.op) ) {
+			ereg *args = hl_emit_get_args(ctx->jit->emit,&e);
+			call_regs regs = {0};
+			int stack_args = 0;
+			int stack_bits = 0; // NOTE(review): one bit per argument index — confirm no call can pass a stack argument at index 31 or above
+			for(int k=0;k<e.nargs;k++) {
+				value_info *v = REG_IS_VAL(args[k]) ? VAL_REG(args[k]) : NULL;
+				emit_mode mode = v ? v->mode : M_I32;
+				ereg r = get_call_reg(ctx,regs,mode);
+				if( IS_NULL(r) ) {
+					stack_args += get_stack_size(ctx, mode);
+					stack_bits |= 1 << k;
+				} else if( !v || r != v->reg ) {
+					int_arr_add(ctx->pack_movs,r);
+					int_arr_add(ctx->pack_movs,v ? v->reg : args[k]);
+					int_arr_add(ctx->pack_movs,mode);
+				}
+			}
+			if( stack_args > 0 ) {
+				int offset = 0;
+				if( jit->cfg.stack_align ) {
+					int align = stack_args % jit->cfg.stack_align;
+					if( align ) offset = jit->cfg.stack_align - align;
+				}
+				if( offset )
+					regs_emit(ctx,UNUSED,STACK_OFFS,UNUSED,UNUSED,0,-offset);
+				for(int k=e.nargs-1;k>=0;k--) {
+					if( stack_bits & (1 << k) ) {
+						value_info *v = REG_IS_VAL(args[k]) ? VAL_REG(args[k]) : NULL;
+						EMIT(PUSH,v ? v->reg : args[k],UNUSED,v && IS_FLOAT(v->mode) ? v->mode : M_PTR); // a non-value operand is pushed as given, like the register path above
+					}
+				}
+				if( IS_WINCALL64 ) {
+					regs_emit(ctx,UNUSED,STACK_OFFS,UNUSED,UNUSED,0,-0x20);
+					offset += 0x20;
+				}
+				instr_stack_offset = stack_args+offset;
+			}
+			flush_movs(ctx,0);
+			e.nargs = 0xFF;
+			if( vout && vout->last_read > cur_op )
+				ret_val = &REG_CFG(REG_MODE(e.mode))->ret;
+			else if( e.mode != M_NORET ) {
+				e.mode = M_VOID; // ignore output
+				out = UNUSED;
+			}
+			if( e.op == CALL_REG )
+				e.a = VAL_REG(e.a)->reg;
+		} else {
+			ereg **regs = hl_emit_get_regs(&e,&nread);
+			for(int k=0;k<nread;k++) {
+				ereg *r = regs[k];
+				if( !REG_IS_VAL(*r) ) continue;
+				value_info *v = VAL_REG(*r);
+				*r = v->reg;
+			}
+		}
+		switch( e.op ) {
+		case ALLOC_STACK:
+		case CATCH:
+			break;
+		case BLOCK:
+			cur_block = jit->blocks + e.size_offs;
+			break;
+		case LOAD_ARG:
+			{
+				ereg def = get_call_reg(ctx,regs,e.mode);
+				if( def && out != def )
+					regs_emit_mov(ctx,out,def,e.mode);
+				else
+					regs_write_instr(ctx, &e, out);
+			}
+			break;
+		case ENTER:
+			{
+				EMIT(PUSH,jit->cfg.stack_pos,UNUSED,M_PTR);
+				regs_emit_mov(ctx,jit->cfg.stack_pos,jit->cfg.stack_reg,M_PTR);
+				if( stack_offset )
+					regs_emit(ctx,UNUSED,STACK_OFFS,UNUSED,UNUSED,M_PTR,-stack_offset);
+				for(int i=0;i<ctx->persists_uses[0];i++)
+					EMIT(PUSH,ctx->jit->cfg.regs.persist[i],UNUSED,M_PTR);
+				for(int i=0;i<ctx->persists_uses[1];i++)
+					EMIT(PUSH,ctx->jit->cfg.floats.persist[i],UNUSED,M_F64);
+				if( IS_WINCALL64 && ctx->has_direct_call )
+					regs_emit(ctx,UNUSED,STACK_OFFS,UNUSED,UNUSED,M_PTR,-0x20);
+			}
+			break;
+		case JCOND:
+		case JUMP:
+		case JUMP_TABLE:
+			flush_phis(ctx,cur_block, e.op == JCOND, false);
+			if( e.op == JUMP_TABLE ) {
+				// copy args (remap later)
+				hl_emit_store_args(jit->emit,&e,hl_emit_get_args(jit->emit,&e),e.nargs);
+			}
+			regs_write_instr(ctx, &e, out);
+			int_arr_add(ctx->jump_regs, ctx->emit_pos - 1);
+			int_arr_add(ctx->jump_regs, cur_op + 1 + (e.op == JUMP_TABLE ? 0 : e.size_offs));
+			if( e.op == JCOND ) flush_phis(ctx,cur_block, false, true);
+			break;
+		case RET:
+			if( e.a ) {
+				ereg ret = REG_CFG(REG_MODE(e.mode))->ret;
+				if( e.a != ret )
+					regs_emit_mov(ctx, ret, e.a, e.mode);
+			}
+#			ifdef WIN64_UNWIND_TABLES
+			// if we have our stack offset just after a call, the unwind algorithm
+			// will subtract and create invalid stack frame. this is because we do
+			// not register the stack offset in our unwind table so all functions
+			// can share the same definition
+			if( cur_op && IS_CALL(jit->instrs[cur_op-1].op) )
+				EMIT(NOP,UNUSED,UNUSED,M_NONE);
+#			endif
+			if( IS_WINCALL64 && ctx->has_direct_call )
+				regs_emit(ctx,UNUSED,STACK_OFFS,UNUSED,UNUSED,M_PTR,0x20);
+			for(int i=ctx->persists_uses[1]-1;i>=0;i--)
+				EMIT(POP,ctx->jit->cfg.floats.persist[i],UNUSED,M_F64);
+			for(int i=ctx->persists_uses[0]-1;i>=0;i--)
+				EMIT(POP,ctx->jit->cfg.regs.persist[i],UNUSED,M_PTR);
+			if( stack_offset ) {
+				regs_emit(ctx,UNUSED,STACK_OFFS,UNUSED,UNUSED,M_PTR,stack_offset);
+			}
+			EMIT(POP,jit->cfg.stack_pos,UNUSED,M_PTR);
+			EMIT(RET,UNUSED,UNUSED,M_NONE);
+			break;
+		case MOV:
+			if( out == e.a ) break;
+			// fallthrough
+		default:
+			if( e.op == ADDRESS ) {
+				e.op = LEA;
+				if( REG_KIND(e.a) != R_REG_PTR ) jit_assert();
+				e.a = (e.a & ~R_REG_PTR) | R_REG;
+			}
+			if( ret_val && out ) {
+				regs_write_instr(ctx, &e, *ret_val);
+				regs_emit_mov(ctx, out, *ret_val, e.mode);
+			} else
+				regs_write_instr(ctx, &e, out);
+			break;
+		}
+		if( instr_stack_offset )
+			regs_emit(ctx,UNUSED,STACK_OFFS,UNUSED,UNUSED,M_PTR,instr_stack_offset);
+		if( cur_block && cur_block->end_pos == cur_op+1 )
+			flush_phis(ctx,cur_block,false,true);
+		ctx->pos_map[cur_op+1] = ctx->emit_pos;
+	}
+}
+
+void hl_regs_flush( jit_ctx *jit ) {
+	regs_ctx *ctx = jit->regs;
+	if( ctx->flushed ) return; // does nothing when the current result was already published
+	ctx->flushed = true;
+	jit->reg_instr_count = ctx->emit_pos; // expose the emitted stream and its side tables through the jit context
+	jit->reg_instrs = ctx->instrs; // the buffers are still released by hl_regs_free
+	jit->reg_writes = ctx->out_write;
+	jit->reg_pos_map = ctx->pos_map;
+	if( ctx->pos_map ) ctx->pos_map[ctx->cur_op+1] = ctx->emit_pos; // close the map entry that follows the last processed source instruction
+	hl_emit_remap_jumps(jit->emit, &ctx->jump_regs, ctx->instrs, ctx->pos_map); // presumably rewrites the recorded jumps to emitted positions — see hl_emit_remap_jumps
+}
+
+void hl_regs_function( jit_ctx *jit ) {
+	// Allocates registers for the function held by jit: resets the per-function state, then runs liveness, assignment, emission and flush.
+	regs_ctx *ctx = jit->regs;
+	int nvalues = jit->value_count + jit->phi_count;
+	memset(ctx->persists_uses,0,sizeof(ctx->persists_uses));
+	free(ctx->pos_map);
+	ctx->flushed = false;
+	ctx->has_direct_call = false;
+	ctx->pos_map = (int*)malloc((jit->instr_count + 1) * sizeof(int));
+	if( ctx->pos_map == NULL ) jit_error("Out of memory"); // regs_emit_instrs writes pos_map[0] unconditionally
+	ctx->emit_pos = ctx->cur_op = ctx->stack_size = 0;
+	jit->reg_instrs = NULL;
+	values_free(&ctx->scratch);
+	int_arr_free(&ctx->jump_regs);
+	int_arr_free(&ctx->pack_movs);
+	ctx->blocks_phis = (int_arr*)hl_zalloc(&jit->falloc,sizeof(int_arr) * jit->block_count);
+	ctx->values = (value_info*)hl_zalloc(&jit->falloc,sizeof(value_info) * nvalues);
+	for(int i=1;i<nvalues;i++) { // entry 0 is not initialized here and stays as hl_zalloc returned it
+		value_info *v = VAL(i);
+		v->reg = UNUSED;
+		v->pref_reg = UNUSED;
+		v->stack_pos = INVALID;
+		v->last_read = -1;
+		if( i < jit->value_count ) {
+			v->id = i;
+			v->mode = jit->instrs[jit->values_writes[i]].mode;
+		} else {
+			v->id = -(i-jit->value_count) - 1; // entries past value_count belong to phis, identified by negative ids
+			v->mode = M_NONE;
+		}
+	}
+	regs_compute_liveness(ctx);
+	regs_assign_regs(ctx);
+	regs_emit_instrs(ctx);
+	hl_regs_flush(ctx->jit);
+}
+
+
+void hl_regs_alloc( jit_ctx *jit ) {
+	regs_ctx *ctx = (regs_ctx*)calloc(1,sizeof(regs_ctx)); // zero-filled: every buffer pointer and counter starts at 0
+	if( ctx == NULL ) jit_error("Out of memory"); // the result used to be written to without any check
+	ctx->jit = jit;
+	jit->regs = ctx;
+}
+
+void hl_regs_free( jit_ctx *jit ) {
+	regs_ctx *ctx = jit->regs; // jit->regs itself is not cleared and keeps pointing at the freed context
+	free(ctx->pos_map);
+	free(ctx->instrs);
+	free(ctx->out_write);
+	free(ctx); // NOTE(review): scratch, jump_regs and pack_movs are not released here — confirm their DEF_ALLOC storage does not need it
+}
+
diff --git a/src/jit_x86_64.c b/src/jit_x86_64.c
new file mode 100644
index 000000000..a2b6185c3
--- /dev/null
+++ b/src/jit_x86_64.c
@@ -0,0 +1,1722 @@
+/*
+ * Copyright (C)2015-2016 Haxe Foundation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+#include <hlmodule.h>
+#include <jit.h>
+#include "data_struct.h"
+
+#ifdef HL_DEBUG
+#	define GEN_DEBUG
+#endif
+// each #include of data_struct.c below generates one container type from the S_* parameters defined just before it
+#define S_TYPE			byte_arr
+#define S_NAME(name)	byte_##name
+#define S_VALUE			unsigned char
+#include "data_struct.c"
+#define byte_reserve(set,count)	byte_reserve_impl(DEF_ALLOC,&set,count)
+#define VAL_CONST		0x80000000
+#define VAL_MEM(reg)	(FL_MEMPTR | (reg))
+// value_arr: container of uint64 values
+#define S_TYPE			value_arr
+#define S_NAME(name)	value_arr_##name
+#define S_VALUE			uint64
+#include "data_struct.c"
+// value_map: uint64 keys with int values and S_DEFVAL -1 — NOTE(review): what S_SORTED and S_MAP select is defined in data_struct.c
+#define S_SORTED
+#define S_MAP
+#define S_TYPE			value_map
+#define S_NAME(name)	value_map_##name
+#define S_KEY			uint64
+#define S_VALUE			int
+#define S_DEFVAL		-1
+#include "data_struct.c"
+
+typedef enum {
+	RAX = 0, // the values are the x86 hardware register numbers
+	RCX = 1,
+	RDX = 2,
+	RBX = 3,
+	RSP = 4,
+	RBP = 5,
+	RSI = 6,
+	RDI = 7,
+#ifdef HL_64
+	R8 = 8,
+	R9 = 9,
+	R10	= 10,
+	R11	= 11,
+	R12	= 12,
+	R13	= 13,
+	R14	= 14,
+	R15	= 15,
+#endif
+	_UNUSED = 0xFF
+} CpuReg;
+// R(id) builds the ereg of register id; MMX(id) does the same with the id shifted up by 64
+#define R(id)		MK_REG(id,R_REG)
+#define MMX(id)		MK_REG((id)+64,R_REG)
+
+typedef enum { // the entries are listed in the same order as the rows of OP_FORMS — presumably used as its index, keep both in sync
+	_MOV, // NOTE(review): some names carry a leading underscore, presumably to avoid clashing with emit opcodes such as MOV
+	_LEA,
+	_PUSH,
+	ADD,
+	SUB,
+	IMUL,	// only overflow flag changes compared to MUL
+	DIV,
+	IDIV,
+	NEG,
+	CDQ,
+	CDQE,
+	_POP,
+	_RET,
+	_CALL,
+	AND,
+	OR,
+	XOR,
+	_CMP,
+	_TEST,
+	SHL,
+	SHR,
+	SAR,
+	INC,
+	DEC,
+	JMP,
+	MOVSXD,
+	// FPU
+	FSTP,
+	FSTP32,
+	FLD,
+	FLD32,
+	FLDCW,
+	// SSE
+	MOVSD,
+	MOVSS,
+	COMISD,
+	COMISS,
+	ADDSD,
+	SUBSD,
+	MULSD,
+	DIVSD,
+	ADDSS,
+	SUBSS,
+	MULSS,
+	DIVSS,
+	XORPS,
+	XORPD,
+	CVTSI2SD,
+	CVTSI2SS,
+	CVTTSD2SI,
+	CVTSD2SS,
+	CVTSS2SD,
+	CVTTSS2SI,
+	STMXCSR,
+	LDMXCSR,
+	STC,
+	CLC,
+	// 8-16 bits
+	ADD8,
+	SUB8,
+	MOV8,
+	MOVZX8,
+	MOVSX8,
+	CMP8,
+	TEST8,
+	PUSH8,
+	ADD16,
+	SUB16,
+	IMUL16,
+	MOV16,
+	MOVZX16,
+	MOVSX16,
+	CMP16,
+	TEST16,
+	// prefetchs
+	PREFETCHT0,
+	PREFETCHT1,
+	PREFETCHT2,
+	PREFETCHNTA,
+	PREFETCHW,
+	// --
+	_CPU_LAST
+} CpuOp;
+
+#define JAlways			0xE9
+#define JAlways_short	0xEB
+#define JOverflow	0x80
+#define JULt		0x82
+#define JUGte		0x83
+#define JEq			0x84
+#define JNeq		0x85
+#define JULte		0x86
+#define JUGt		0x87
+#define JParity		0x8A
+#define JNParity	0x8B
+#define JSLt		0x8C
+#define JSGte		0x8D
+#define JSLte		0x8E
+#define JSGt		0x8F
+// aliases named after the CPU flag tested; "carry set" is the unsigned-below condition 0x82 (JLt was never defined)
+#define JCarry		JULt
+#define JZero		JEq
+#define JNotZero	JNeq
+
+#define FLAG_LONGOP	0x80000000 // two-byte opcode: OP() emits (b)>>8 before the low byte
+#define FLAG_16B	0x40000000 // OP() emits the 0x66 prefix first
+#define FLAG_8B		0x20000000
+#define FLAG_DUAL   0x10000000
+#define FLAG_DEF64	0x08000000
+// RM/LONG_RM store an opcode extension digit as id+1 at bit 8 (bit 24 for FLAG_LONGOP encodings); GET_RM reads it back, choosing the shift from the sign bit
+#define RM(op,id) ((op) | (((id)+1)<<8))
+#define GET_RM(op)	(((op) >> ((op) < 0 ? 24 : 8)) & 15)
+#define SBYTE(op) ((op) << 16)
+#define LONG_OP(op)	((op) | FLAG_LONGOP)
+#define OP16(op)	((op) | FLAG_16B)
+#define LONG_RM(op,id)	LONG_OP(op | (((id) + 1) << 24))
+
+typedef struct { // encodings of one operation; each int packs opcode bytes with the FLAG_* / RM() bits, one field per operand form
+	const char *name;						// single operand
+	int r_mem;		// r32 / r/m32				r32
+	int mem_r;		// r/m32 / r32				r/m32
+	int r_const;	// r32 / imm32				imm32
+	int r_i8;		// r32 / imm8				imm8
+} opform; // NOTE(review): a 0 field presumably means the form is not available — confirm where the table is read
+
+static opform OP_FORMS[] = { // one row per CpuOp, in the same order as the enum; fields left out of a row are zero-initialized
+	{ "MOV", 0x8B, 0x89, 0xB8, 0 },
+	{ "LEA", 0x8D },
+	{ "PUSH", 0x50 | FLAG_DEF64, RM(0xFF,6), 0x68, 0x6A },
+	{ "ADD", 0x03, 0x01, RM(0x81,0), RM(0x83,0) },
+	{ "SUB", 0x2B, 0x29, RM(0x81,5), RM(0x83,5) },
+	{ "IMUL", LONG_OP(0x0FAF), 0, 0x69 | FLAG_DUAL, 0x6B | FLAG_DUAL },
+	{ "DIV", RM(0xF7,6), RM(0xF7,6) },
+	{ "IDIV", RM(0xF7,7), RM(0xF7,7) },
+	{ "NEG", RM(0xF7,3) },
+	{ "CDQ", 0x99 },
+	{ "CDQE", 0x98 },
+	{ "POP", 0x58 | FLAG_DEF64, RM(0x8F,0) },
+	{ "RET", 0xC3 },
+	{ "CALL", RM(0xFF,2) | FLAG_DEF64, RM(0xFF,2), 0xE8 },
+	{ "AND", 0x23, 0x21, RM(0x81,4), RM(0x83,4) },
+	{ "OR", 0x0B, 0x09, RM(0x81,1), RM(0x83,1) },
+	{ "XOR", 0x33, 0x31, RM(0x81,6), RM(0x83,6) },
+	{ "CMP", 0x3B, 0x39, RM(0x81,7), RM(0x83,7) },
+	{ "TEST", 0x85, 0x85/*SWP?*/, RM(0xF7,0) },
+	{ "SHL", RM(0xD3,4), 0, 0, RM(0xC1,4) },
+	{ "SHR", RM(0xD3,5), 0, 0, RM(0xC1,5) },
+	{ "SAR", RM(0xD3,7), 0, 0, RM(0xC1,7) },
+	{ "INC", IS_64 ? RM(0xFF,0) : 0x40, RM(0xFF,0) },
+	{ "DEC", IS_64 ? RM(0xFF,1) : 0x48, RM(0xFF,1) },
+	{ "JMP", RM(0xFF,4) },
+	{ "MOVSXD", 0x63 },
+	// FPU
+	{ "FSTP", 0, RM(0xDD,3) },
+	{ "FSTP32", 0, RM(0xD9,3) },
+	{ "FLD", 0, RM(0xDD,0) },
+	{ "FLD32", 0, RM(0xD9,0) },
+	{ "FLDCW", 0, RM(0xD9, 5) },
+	// SSE
+	{ "MOVSD", 0xF20F10, 0xF20F11  },
+	{ "MOVSS", 0xF30F10, 0xF30F11  },
+	{ "COMISD", LONG_RM(0x660F2F,1) },
+	{ "COMISS", LONG_RM(0x0F2F,1) },
+	{ "ADDSD", 0xF20F58 },
+	{ "SUBSD", 0xF20F5C },
+	{ "MULSD", 0xF20F59 },
+	{ "DIVSD", 0xF20F5E },
+	{ "ADDSS", 0xF30F58 },
+	{ "SUBSS", 0xF30F5C },
+	{ "MULSS", 0xF30F59 },
+	{ "DIVSS", 0xF30F5E },
+	{ "XORPS", LONG_OP(0x0F57) },
+	{ "XORPD", 0x660F57 },
+	{ "CVTSI2SD", 0xF20F2A },
+	{ "CVTSI2SS", 0xF30F2A },
+	{ "CVTTSD2SI", 0xF20F2C },
+	{ "CVTSD2SS", 0xF20F5A },
+	{ "CVTSS2SD", 0xF30F5A },
+	{ "CVTTSS2SI", 0xF30F2C },
+	{ "STMXCSR", 0, LONG_RM(0x0FAE,3) },
+	{ "LDMXCSR", 0, LONG_RM(0x0FAE,2) },
+	{ "STC", 0xF9 },
+	{ "CLC", 0xF8 },
+	// 8 bits,
+	{ "ADD8", 0, RM(0x00,3) },
+	{ "SUB8", 0, 0x28 },
+	{ "MOV8", 0x8A, 0x88, 0, RM(0xC6,0) },
+	{ "MOVZX8", LONG_OP(0x0FB6) },
+	{ "MOVSX8", LONG_OP(0x0FBE) },
+	{ "CMP8", 0x3A, 0x38, 0, RM(0x80,7) },
+	{ "TEST8", 0x84, 0x84, RM(0xF6,0) },
+	{ "PUSH8", FLAG_DEF64, 0, 0x6A | FLAG_8B },
+	{ "ADD16", 0, OP16(0x01) },
+	{ "SUB16", 0, OP16(0x29) },
+	{ "IMUL16", OP16(LONG_OP(0x0FAF)) },
+	{ "MOV16", OP16(0x8B), OP16(0x89), OP16(0xB8) },
+	{ "MOVZX16", LONG_OP(0x0FB7) },
+	{ "MOVSX16", LONG_OP(0x0FBF) },
+	{ "CMP16", OP16(0x3B), OP16(0x39) },
+	{ "TEST16", OP16(0x85) },
+	// prefetchs
+	{ "PREFETCHT0", FLAG_DEF64, LONG_RM(0x0F18,1) },
+	{ "PREFETCHT1", FLAG_DEF64, LONG_RM(0x0F18,2) },
+	{ "PREFETCHT2", FLAG_DEF64, LONG_RM(0x0F18,3) },
+	{ "PREFETCHNTA", FLAG_DEF64, LONG_RM(0x0F18,0) },
+	{ "PREFETCHW", FLAG_DEF64, LONG_RM(0x0F0D,1) },
+};
+
+#ifdef HL_64 /* REX(): emits the prefix byte (0x40 | r64) when the caller's local `r64` is non-zero; expands to nothing when HL_64 is not defined */
+#	define REX()	if( r64 ) B(r64 | 0x40)
+#else
+#	define REX()
+#endif
+
+static const int SIB_MULT[] = {-1, 0, 1, -1, 2, -1, -1, -1, 3}; // indexed by scale factor: 1,2,4,8 -> 0..3 (the value SIB() shifts into the top two bits); -1 for every other index
+
+#define B(v)					ctx->code.values[ctx->code.cur++] = (unsigned char)(v) // appends one byte to ctx->code (needs a local `ctx`)
+#define W(wv)					*(int*)&ctx->code.values[_incr(&ctx->code.cur,4)] = wv // appends a 4-byte int at the current position and advances by 4
+#define W64(v64)				*(int_val*)&ctx->code.values[_incr(&ctx->code.cur,8)] = v64 // appends an int_val and advances by 8
+
+#define MOD_RM(mod,reg,rm)		B(((mod) << 6) | (((reg)&7) << 3) | ((rm)&7)) // ModRM byte; only the low 3 bits of reg/rm are encoded here
+#define SIB(mult,rmult,rbase)	B((SIB_MULT[mult]<<6) | (((rmult)&7)<<3) | ((rbase)&7)) // SIB byte: scale from SIB_MULT, index = rmult, base = rbase
+#define IS_SBYTE(c)				( (c) >= -128 && (c) < 128 ) // true when c fits a signed 8-bit immediate/displacement
+
+#define BREAK()					B(0xCC) // single byte 0xCC (INT3 breakpoint)
+
+#define	OP(b)	/* Emits the opcode bytes of form `b`, reading the caller's local `r64`. If b has bits in 0xFF0000: byte b>>16 first, then the REX byte when r64 != 0, then bytes b>>8 and b. Otherwise: 0x66 first when FLAG_16B is set, REX(), an extra b>>8 byte when FLAG_LONGOP is set, then the low byte of b. */ \
+	if( (b) & 0xFF0000 ) { \
+		B((b)>>16); \
+		if( r64 ) B(r64 | 0x40); /* also in 32 bits mode */ \
+		B((b)>>8); \
+		B(b); \
+	} else { \
+		if( (b) & FLAG_16B ) { \
+			B(0x66); \
+			REX(); \
+		} else {\
+			REX(); \
+		}\
+		if( (b) & FLAG_LONGOP ) B((b)>>8); \
+		B(b); \
+	}
+
+struct _code_ctx { // per-JIT state of the x86-64 code generator
+	jit_ctx *jit; // owning JIT context (its galloc and out_pos are used by the emitters)
+	byte_arr code; // machine code of the function being generated; B()/W()/W64() append here
+	int_arr funs; // pairs (code position of a 32-bit displacement, function index), added for LOAD_FUN / CALL_FUN
+	int_arr short_jumps; // pairs (position of an 8-bit jump displacement, target op index), added by emit_jump
+	int_arr near_jumps; // pairs (position of a 32-bit jump displacement, target op index), added by emit_jump
+	value_map const_table_lookup; // 64-bit constant value -> its offset in const_table (see alloc_const)
+	byte_arr const_table; // constant data referenced by the generated code
+	int_arr const_refs; // pairs (position of a 32-bit displacement, const_table offset)
+	int_arr const_addr; // pairs (const_table offset, target op index), added for JUMP_TABLE entries
+	int *pos_map; // pos_map[i] = code offset at which reg instruction i starts; one extra slot is written by hl_codegen_flush
+	int cur_op; // index of the reg instruction currently being emitted
+	bool flushed; // set by hl_codegen_flush so it runs only once per generated function
+	int const_table_pos; // NOTE(review): not referenced in this part of the file - presumably where const_table is placed in the output; confirm
+	int null_access_pos; // call target used when a CALL_PTR value is hl_null_access
+	int null_field_pos; // call target used when a CALL_PTR value is hl_jit_null_field_access
+};
+
+// Post-increment helper: adds n to *v and returns the value *v held before.
+static int _incr( int*v, int n ) {
+	const int before = *v;
+	*v = before + n;
+	return before;
+}
+
+// Returns a printable name for native register `reg` as seen in emit mode `m`
+// (e.g. EAX / AX / AL / RAX, R9D, XMM3). The string lives in a static buffer
+// that is overwritten by the next call. Register numbers outside the expected
+// range get a "???" suffix.
+const char *hl_natreg_str( int reg, emit_mode m ) {
+	static char out[16];
+	static const char *regs_str[] = { "AX", "CX", "DX", "BX", "SP", "BP", "SI", "DI" };
+	static const char *regs_str8[] = { "AL", "CL", "DL", "BL", "SPL", "BPL", "SIL", "DIL" };
+	CpuReg r = REG_REG(reg);
+	// float registers are numbered from 64 upwards
+	if( m == M_F32 ) {
+		r -= 64;
+		sprintf(out,"XMM%df%s",r,r >= 0 && r < 16 ? "" : "???");
+		return out;
+	}
+	if( m == M_F64 ) {
+		r -= 64;
+		sprintf(out,"XMM%d%s",r,r >= 0 && r < 16 ? "" : "???");
+		return out;
+	}
+	// integer registers: the first eight have legacy names, the rest are R<n><size suffix>
+	const char **legacy = regs_str;
+	const char *legacy_prefix;
+	const char *size_suffix;
+	switch( m ) {
+	case M_I32:
+		legacy_prefix = "E";
+		size_suffix = "D";
+		break;
+	case M_UI16:
+		legacy_prefix = "";
+		size_suffix = "W";
+		break;
+	case M_UI8:
+		legacy = regs_str8;
+		legacy_prefix = "";
+		size_suffix = "B";
+		break;
+	default:
+		legacy_prefix = "R";
+		size_suffix = "";
+		break;
+	}
+	if( r < 8 )
+		sprintf(out,"%s%s",legacy_prefix,legacy[r]);
+	else
+		sprintf(out,"R%d%s%s",r,size_suffix,r<16?"":"???");
+	return out;
+}
+
+static int scratch_float_reg = -1; // index of the XMM register kept aside as float temporary (returned by get_tmp); set by hl_jit_init_regs
+
+static ereg scratch_not_param[] = { R(RAX), R(R10), R(R11) }; // NOTE(review): not referenced in this part of the file - presumably scratch registers that never carry call arguments; confirm
+
+void hl_jit_init_regs( regs_config *cfg ) { /* Fills `cfg` with the x86-64 register sets (scratch, persistent, call arguments,
+ * return register) for integers and floats, choosing the lists by HL_WIN_CALL, plus the
+ * registers required by shifts and divisions, the stack registers and the stack alignment.
+ * Also records in scratch_float_reg the float register withheld as temporary.
+ */
+	// R11 is left out of the scratch list because it is used as a temporary by various ops
+#	ifdef HL_WIN_CALL
+	static int scratch_regs[] = { R(RAX), R(RCX), R(RDX), R(R8), R(R9), R(R10), /*R(R11)*/ };
+	static int free_regs[] = { R(RSI), R(RDI), R(RBX), R(R12), R(R13), R(R14), R(R15) };
+	static int call_regs[] = { R(RCX), R(RDX), R(R8), R(R9) };
+#	else
+	static int scratch_regs[] = { R(RAX), R(RCX), R(RDX), R(RSI), R(RDI), R(R8), R(R9), R(R10), /*R(R11)*/ };
+	static int free_regs[] = { R(RBX), R(R12), R(R13), R(R14), R(R15) };
+	static int call_regs[] = { R(RDI), R(RSI), R(RDX), R(RCX), R(R8), R(R9) };
+#	endif
+	cfg->regs.ret = scratch_regs[0]; // RAX in both configurations
+	cfg->regs.nscratchs = sizeof(scratch_regs) / sizeof(int);
+	cfg->regs.npersists = sizeof(free_regs) / sizeof(int);
+	cfg->regs.nargs = sizeof(call_regs) / sizeof(int);
+	cfg->regs.scratch = (ereg*)scratch_regs; // NOTE(review): the casts rely on ereg having the same size as int - confirm
+	cfg->regs.persist = (ereg*)free_regs;
+	cfg->regs.arg = (ereg*)call_regs;
+	// floats
+	static int floats[] = {
+		MMX(0), MMX(1), MMX(2), MMX(3), 
+		MMX(4), MMX(5), MMX(6), MMX(7), 
+		MMX(8), MMX(9), MMX(10), MMX(11), 
+		MMX(12), MMX(13), MMX(14), MMX(15)
+	};
+#	ifdef HL_WIN_CALL
+	cfg->floats.nargs = 4;
+	cfg->floats.nscratchs = 6;
+#	else
+	cfg->floats.nargs = 8;
+	cfg->floats.nscratchs = 16;
+#	endif
+	scratch_float_reg = cfg->floats.nscratchs - 1; // the last scratch float register becomes the float temporary...
+	cfg->floats.nscratchs--; // ...and is removed from the set handed to the allocator
+	cfg->floats.ret = floats[0];
+	cfg->floats.scratch = (ereg*)floats;
+	cfg->floats.arg = (ereg*)floats;
+	cfg->floats.persist = (ereg*)floats + cfg->floats.nscratchs + 1; // starts right after the withheld temporary
+	cfg->floats.npersists = 15 - cfg->floats.nscratchs; // 10 with HL_WIN_CALL, 0 otherwise
+	// extra
+	cfg->req_bit_shifts = R(RCX);
+	cfg->req_div_a = R(RAX);
+	cfg->req_div_b = R(RCX);
+	cfg->stack_reg = R(RSP);
+	cfg->stack_pos = R(RBP);
+	cfg->stack_align = 16;
+#	ifdef GEN_DEBUG
+	cfg->debug_prefix_size = 6; // same 6 bytes that emit_jump adds to its per-op size estimate under GEN_DEBUG
+#	endif
+}
+
+#define EMIT(op,a,b,mode) emit_ext(ctx,op,a,b,mode,0) // shorthand for emit_ext with a zero immediate value; needs a local `ctx`
+#define ID2(a,b)	((a) | ((b)<<8)) // packs two small ids into one int so that a pair can be used as a switch case
+
+typedef enum { // operand classes produced by make_reg and dispatched on by emit_ext
+	RCPU = 0, // integer register
+	RFPU = 1, // float register (ereg register number >= 64, stored minus 64)
+	RSTACK = 2, // memory operand whose base register is RBP
+	RCONST = 3, // immediate value
+	RMEM = 4, // memory operand with any other base register
+	RUNUSED = 5, // operand absent
+} preg_kind;
+
+typedef struct { // decoded operand; make_reg leaves `reg` (and for RUNUSED also `value`) unset for RUNUSED / VAL_CONST operands
+	preg_kind kind;
+	CpuReg reg; // register number 0..15 (float registers are rebased to 0)
+	int64 value; // immediate for RCONST, otherwise REG_VALUE() of the operand (used as displacement by emit_ext)
+} preg;
+
+#define ERRIF(v)	if( v ) jit_assert() // calls jit_assert() when v is true
+
+// Decodes an ereg operand into a preg (kind, register number, value).
+// A null operand gives RUNUSED; VAL_CONST gives RCONST carrying `value`.
+// For every other operand the register number must end up in 0..15.
+static preg make_reg( ereg r, uint64 value ) {
+	preg res;
+	if( IS_NULL(r) ) {
+		res.kind = RUNUSED;
+	} else if( r == VAL_CONST ) {
+		res.kind = RCONST;
+		res.value = value;
+	} else {
+		res.reg = REG_REG(r);
+		res.value = REG_VALUE(r);
+		switch( REG_KIND(r) ) {
+		case R_REG:
+			// float registers are numbered from 64 upwards
+			res.kind = res.reg >= 64 ? RFPU : RCPU;
+			if( res.kind == RFPU )
+				res.reg -= 64;
+			break;
+		case R_REG_PTR:
+			res.kind = res.reg == RBP ? RSTACK : RMEM;
+			break;
+		case R_CONST:
+			res.kind = RCONST;
+			break;
+		default:
+			jit_assert();
+			break;
+		}
+		if( res.reg < 0 || res.reg > 15 )
+			jit_assert();
+	}
+	return res;
+}
+
+static void emit_ext( code_ctx *ctx, CpuOp op, ereg _a, ereg _b, emit_mode mode, int_val _value ) { /* Encodes one instruction `op` with operands _a and _b.
+ * The operands are decoded with make_reg (which receives _value for VAL_CONST operands) and the
+ * encoding is chosen from OP_FORMS[op] according to the pair of operand kinds: the r_mem, mem_r,
+ * r_const and r_i8 forms. `mode` only decides whether the REX.W bit is requested. Operand pairs
+ * without a matching form trigger jit_assert() through ERRIF.
+ */
+	opform *f = &OP_FORMS[op];
+	int mode64 = mode == M_PTR && (f->r_mem&FLAG_DEF64) == 0 ? 8 : 0; // 8 = REX.W, requested for M_PTR unless the form is flagged FLAG_DEF64
+	int r64 = mode64; // REX bits for this instruction, read by OP()/REX(): 1 extends the ModRM rm field, 4 extends the reg field
+	preg a = make_reg(_a,_value), b = make_reg(_b,_value);
+	switch( ID2(a.kind,b.kind) ) {
+	case ID2(RUNUSED,RUNUSED): // no operand: opcode only
+		ERRIF(f->r_mem == 0);
+		OP(f->r_mem);
+		break;
+	case ID2(RCPU,RCPU):
+	case ID2(RFPU,RFPU): // register, register
+		if( f->mem_r ) {
+			// canonical form
+			if( a.reg & 8 ) r64 |= 1;
+			if( b.reg & 8 ) r64 |= 4;
+			OP(f->mem_r);
+			MOD_RM(3,b.reg,a.reg);
+		} else {
+			ERRIF( f->r_mem == 0 );
+			if( a.reg & 8 ) r64 |= 4;
+			if( b.reg & 8 ) r64 |= 1;
+			OP(f->r_mem);
+			MOD_RM(3,a.reg,b.reg);
+		}
+		break;
+	case ID2(RCPU,RFPU):
+	case ID2(RFPU,RCPU): // mixed integer/float registers: only accepted for forms with bits above the low 16
+		ERRIF( (f->r_mem>>16) == 0 );
+		if( a.reg & 8 ) r64 |= 4;
+		if( b.reg & 8 ) r64 |= 1;
+		OP(f->r_mem);
+		MOD_RM(3,a.reg,b.reg);
+		break;
+	case ID2(RCPU,RUNUSED): // single register operand
+		ERRIF( f->r_mem == 0 );
+		if( a.reg & 8 ) r64 |= 1;
+		if( GET_RM(f->r_mem) > 0 ) { // form carries an opcode extension (stored plus one) for the ModRM reg field
+			OP(f->r_mem);
+			MOD_RM(3, GET_RM(f->r_mem)-1, a.reg);
+		} else
+			OP(f->r_mem + (a.reg&7)); // register number is added to the opcode byte
+		break;
+	case ID2(RSTACK,RUNUSED): // single [RBP+disp] operand, 8-bit displacement when it fits
+		ERRIF( f->mem_r == 0 || GET_RM(f->mem_r) == 0 );
+		OP(f->mem_r);
+		if( IS_SBYTE(a.value) ) {
+			MOD_RM(1,GET_RM(f->mem_r)-1,RBP);
+			B(a.value);
+		} else {
+			MOD_RM(2,GET_RM(f->mem_r)-1,RBP);
+			W((int)a.value);
+		}
+		break;
+	case ID2(RCPU,RCONST): // register, immediate
+		ERRIF( f->r_const == 0 && f->r_i8 == 0 );
+		if( a.reg & 8 ) r64 |= 1;
+		if( f->r_i8 && IS_SBYTE(b.value) ) { // 8-bit immediate form
+			if( (f->r_i8&FLAG_DUAL) && (a.reg & 8) ) r64 |= 4;
+			OP(f->r_i8);
+			if( (f->r_i8&FLAG_DUAL) ) MOD_RM(3,a.reg,a.reg); else MOD_RM(3,GET_RM(f->r_i8)-1,a.reg);
+			B(b.value);
+		} else if( GET_RM(f->r_const) > 0 || (f->r_const&FLAG_DUAL) ) { // 32-bit immediate form with a ModRM byte
+			if( (f->r_i8&FLAG_DUAL) && (a.reg & 8) ) r64 |= 4; // NOTE(review): tests r_i8, not r_const, for FLAG_DUAL in this branch - confirm this is intended
+			OP(f->r_const&0xFF);
+			if( (f->r_i8&FLAG_DUAL) ) MOD_RM(3,a.reg,a.reg); else MOD_RM(3,GET_RM(f->r_const)-1,a.reg);
+			if( mode64 && IS_64 && op == _MOV ) W64(b.value); else W((int)b.value);
+		} else { // register number added to the opcode byte; only _MOV in 64-bit mode gets a 64-bit immediate
+			ERRIF( f->r_const == 0);
+			OP((f->r_const&0xFF) + (a.reg&7));
+			if( mode64 && IS_64 && op == _MOV ) W64(b.value); else W((int)b.value);
+		}
+		break;
+	case ID2(RSTACK,RCPU):
+	case ID2(RSTACK,RFPU): // [RBP+disp], register
+		ERRIF( f->mem_r == 0 );
+		if( b.reg & 8 ) r64 |= 4;
+		OP(f->mem_r);
+		if( IS_SBYTE(a.value) ) {
+			MOD_RM(1,b.reg,RBP);
+			B(a.value);
+		} else {
+			MOD_RM(2,b.reg,RBP);
+			W((int)a.value);
+		}
+		break;
+	case ID2(RCPU,RSTACK):
+	case ID2(RFPU,RSTACK): // register, [RBP+disp]
+		ERRIF( f->r_mem == 0 );
+		if( a.reg & 8 ) r64 |= 4;
+		OP(f->r_mem);
+		if( IS_SBYTE(b.value) ) {
+			MOD_RM(1,a.reg,RBP);
+			B(b.value);
+		} else {
+			MOD_RM(2,a.reg,RBP);
+			W((int)b.value);
+		}
+		break;
+	case ID2(RCONST,RUNUSED): // single immediate operand, one byte wide when the form has FLAG_8B
+		ERRIF( f->r_const == 0 );
+		OP(f->r_const);
+		if( f->r_const & FLAG_8B ) B(a.value); else W((int)a.value);
+		break;
+	case ID2(RMEM,RUNUSED): // single [reg+disp] operand
+		ERRIF( f->mem_r == 0 );
+		if( a.reg & 8 ) r64 |= 1;
+		OP(f->mem_r);
+		if( a.value == 0 && (a.reg&7) != RBP ) { // no displacement; not used when the base's low bits equal RBP
+			MOD_RM(0,GET_RM(f->mem_r)-1,a.reg);
+			if( (a.reg&7) == RSP ) B(0x24); // extra SIB byte when the base's low bits equal RSP
+		} else if( IS_SBYTE(a.value) ) {
+			MOD_RM(1,GET_RM(f->mem_r)-1,a.reg);
+			if( (a.reg&7) == RSP ) B(0x24);
+			B(a.value);
+		} else {
+			MOD_RM(2,GET_RM(f->mem_r)-1,a.reg);
+			if( (a.reg&7) == RSP ) B(0x24);
+			W((int)a.value);
+		}
+		break;
+	case ID2(RCPU, RMEM):
+	case ID2(RFPU, RMEM): // register, [reg+disp]
+		ERRIF( f->r_mem == 0 );
+		if( a.reg & 8 ) r64 |= 4;
+		if( b.reg & 8 ) r64 |= 1;
+		OP(f->r_mem);
+		if( b.value == 0 && (b.reg&7) != RBP ) {
+			MOD_RM(0,a.reg,b.reg);
+			if( (b.reg&7) == RSP ) B(0x24);
+		} else if( IS_SBYTE(b.value) ) {
+			MOD_RM(1,a.reg,b.reg);
+			if( (b.reg&7) == RSP ) B(0x24);
+			B(b.value);
+		} else {
+			MOD_RM(2,a.reg,b.reg);
+			if( (b.reg&7) == RSP ) B(0x24);
+			W((int)b.value);
+		}
+		break;
+	case ID2(RMEM, RCPU):
+	case ID2(RMEM, RFPU): // [reg+disp], register
+		ERRIF( f->mem_r == 0 );
+		if( a.reg & 8 ) r64 |= 1;
+		if( b.reg & 8 ) r64 |= 4;
+		OP(f->mem_r);
+		if( a.value == 0 && (a.reg&7) != RBP ) {
+			MOD_RM(0,b.reg,a.reg);
+			if( (a.reg&7) == RSP ) B(0x24);
+		} else if( IS_SBYTE(a.value) ) {
+			MOD_RM(1,b.reg,a.reg);
+			if( (a.reg&7) == RSP ) B(0x24);
+			B(a.value);
+		} else {
+			MOD_RM(2,b.reg,a.reg);
+			if( (a.reg&7) == RSP ) B(0x24);
+			W((int)a.value);
+		}
+		break;
+	default: // operand combination with no encoding here
+		ERRIF(1);
+	}
+}
+
+// Emits a jump of kind `mode` (JAlways or a conditional near-jump opcode) to the
+// op located `offset` ops after the next one. The displacement is a placeholder;
+// its position and the target op index are recorded in short_jumps / near_jumps.
+// The short (8-bit) form is chosen from a size estimate per op, not from the real distance.
+static void emit_jump( code_ctx *ctx, int mode, int offset ) {
+#	ifdef GEN_DEBUG
+	const int est_op_size = 16 + 6; // additional debug info per op
+#	else
+	const int est_op_size = 16;
+#	endif
+	const int target_op = ctx->cur_op + offset + 1;
+	if( !IS_SBYTE(offset*est_op_size) ) {
+		// near form: conditional jumps take the 0x0F escape byte first
+		if( mode != JAlways ) {
+			B(0x0F);
+		}
+		B(mode);
+		int_arr_add(ctx->near_jumps, byte_count(ctx->code));
+		int_arr_add(ctx->near_jumps, target_op);
+		W(-5);
+		return;
+	}
+	// assume it's ok to use short jump
+	if( mode == JAlways ) {
+		B(JAlways_short);
+	} else {
+		B(mode - 0x10);
+	}
+	int_arr_add(ctx->short_jumps, byte_count(ctx->code));
+	int_arr_add(ctx->short_jumps, target_op);
+	B(-2);
+}
+
+// Integer temporary register reserved for the code generator.
+#define RTMP R(R11)
+// Returns the temporary register for `mode`: the reserved XMM register for
+// float modes, RTMP otherwise.
+static ereg get_tmp( emit_mode mode ) {
+	return IS_FLOAT(mode) ? MMX(scratch_float_reg) : RTMP;
+}
+
+// Copies `val` into `out` in the given mode. Nothing is emitted when both are
+// the same operand. When `out` is not a register and `val` is not a plain
+// register, the value goes through the temporary returned by get_tmp. A register
+// carrying an offset is materialised with LEA. 8/16-bit loads into a register
+// use the zero-extending move.
+static void emit_mov( code_ctx *ctx, ereg out, ereg val, emit_mode mode ) {
+	static CpuOp mov_by_mode[] = {_MOV,MOV8,MOV16,_MOV,_MOV,MOVSD,MOVSS,_MOV,_MOV};
+	if( out == val )
+		return;
+	bool val_plain_reg = IS_REG(val) && REG_VALUE(val) == 0;
+	if( !IS_REG(out) && !val_plain_reg ) {
+		ereg staging = get_tmp(mode);
+		emit_mov(ctx, staging, val, mode);
+		emit_mov(ctx, out, staging, mode);
+		return;
+	}
+	if( IS_REG(val) && !val_plain_reg ) {
+		// register + offset
+		emit_ext(ctx,_LEA,out,REG_PTR(val),M_PTR,0);
+		return;
+	}
+	CpuOp mov = mov_by_mode[mode];
+	if( IS_REG(out) && (mode == M_UI8 || mode == M_UI16) ) {
+		mov = (CpuOp)(mov + 1); // MOVZX
+		mode = M_PTR;
+	}
+	emit_ext(ctx,mov,out,val,mode,0);
+}
+
+// Emits a two-byte short jump and returns the code position of its opcode byte.
+//  - mode < 0: unconditional backward jump to code position -mode. The 8-bit
+//    displacement is computed immediately and must fit a signed byte.
+//  - otherwise: JAlways or a conditional near-jump opcode (converted to its short
+//    form). The displacement is written as 0 and must be fixed later with
+//    patch_jump_near using the returned position.
+static int jump_near( code_ctx *ctx, int mode ) {
+	int pos = byte_count(ctx->code);
+	if( mode < 0 ) {
+		// backwards
+		int target = -mode;
+		int delta = target - (pos + 2); // relative to the end of this 2-byte instruction
+		// a rel8 jump cannot reach further: fail loudly instead of truncating
+		if( !IS_SBYTE(delta) ) jit_assert();
+		B(JAlways_short);
+		B(delta);
+	} else {
+		B(mode == JAlways ? JAlways_short : mode - 0x10);
+		B(0);
+	}
+	return pos;
+}
+
+// Makes the short jump emitted by jump_near at position `jpos` land on the
+// current end of the code. A `jpos` of 0 means "no jump" and is ignored.
+// The distance must fit the signed 8-bit displacement of a short jump.
+static void patch_jump_near( code_ctx *ctx, int jpos ) {
+	if( !jpos ) return;
+	int delta = byte_count(ctx->code) - (jpos + 2); // relative to the end of the 2-byte jump
+	// more than 127 bytes would be stored as a negative rel8 and jump backwards
+	if( delta < 0 || delta > 127 ) jit_assert();
+	ctx->code.values[jpos + 1] = (unsigned char)delta;
+}
+
+static void emit_div_mod( code_ctx *ctx, hl_op op, ereg out, ereg a, ereg b, emit_mode mode ) { /* Emits an integer division or modulo
+ * (OSDiv, OUDiv, OSMod, OUMod) of `a` by `b` into `out`. The dividend is placed in RAX and the
+ * result is read from RAX (division) or RDX (modulo); both registers are saved and restored
+ * around the sequence unless they are `out`. The divide instruction is skipped when b is 0
+ * (all ops) or -1 (signed ops): the result is then 0, except for OSDiv which produces a * b.
+ * Float modes only emit a breakpoint.
+ */
+	if( IS_FLOAT(mode) ) {
+		BREAK();
+		return;
+	}
+	ereg bas = R(RAX), div = R(RDX);
+	if( out != bas ) EMIT(_PUSH,bas,UNUSED,M_PTR);
+	if( out != div ) EMIT(_PUSH,div,UNUSED,M_PTR);
+	if( b == bas || b == div || !IS_REG(b) ) { // the divisor must be a register other than RAX/RDX
+		EMIT(_MOV,RTMP,b,mode);
+		b = RTMP;
+	}
+	if( a != bas ) EMIT(_MOV,bas,a,mode);
+		
+	// check for div = 0
+	EMIT(_TEST,b,b,mode);
+	int jz = jump_near(ctx,JZero);
+	int jz1 = 0; // stays 0 (ignored by patch_jump_near) for the unsigned ops
+	// Prevent MIN/-1 overflow exception
+	// OSMod: r = (b == 0 || b == -1) ? 0 : a % b
+	// OSDiv: r = (b == 0 || b == -1) ? a * b : a / b
+	if( op == OSMod || op == OSDiv ) {
+		EMIT(_CMP,b,MK_CONST(-1),mode);
+		jz1 = jump_near(ctx,JZero);
+	}
+	bool unsign = op == OUDiv || op == OUMod;
+	if( unsign )
+		EMIT(XOR,div,div,mode); // unsigned: clear RDX
+	else
+		EMIT(CDQ, UNUSED, UNUSED, mode); // signed: CDQ fills RDX before IDIV
+	EMIT(unsign ? DIV : IDIV, b, UNUSED, mode);
+	ereg res = (op == OUDiv || op == OSDiv) ? bas : div;
+	int jn = jump_near(ctx,JAlways); // normal path jumps over the special-case code below
+	patch_jump_near(ctx,jz);
+	patch_jump_near(ctx,jz1);
+	if( op != OSDiv ) {
+		EMIT(XOR, res, res, mode);
+	} else {
+		if( res != bas ) EMIT(_MOV,res,bas,mode);
+		EMIT(IMUL,res,b,mode);
+	}
+	patch_jump_near(ctx,jn);
+	if( out != res ) EMIT(_MOV,out,res,mode);
+	if( out != div ) EMIT(_POP,div,UNUSED,M_PTR);	// restore in reverse push order
+	if( out != bas ) EMIT(_POP,bas,UNUSED,M_PTR);
+}
+
+// Emits the code for the HL binary/unary operation `op` in `mode`:
+// out = a <op> b, or out = <op> a for unary ops.
+//  - shifts need the count in RCX; RCX is saved/restored or routed through RTMP
+//    when it collides with `a` or `out`
+//  - integer divisions and modulos are delegated to emit_div_mod
+//  - ONot computes out = a ^ 1 without modifying `a`
+//  - float ONeg computes 0 - a
+//  - every other op copies `a` into the destination (or into a temporary when
+//    `out` is not a register or aliases `b`) and applies the two-operand
+//    instruction; 8-bit multiply uses the 16-bit instruction and masks the result
+static void emit_anyop( code_ctx *ctx, hl_op op, ereg out, ereg a, ereg b, emit_mode mode ) {
+	CpuOp cop;
+	int mask = 0;
+#	define F_OP(iop,f32,f64) cop = mode == M_F32 ? f32 : (mode == M_F64 ? f64 : iop);
+#	define DECL_OP(i8,i16,iop,f32,f64) static CpuOp ops_##iop[] = {-1,i8,i16,iop,iop,f64,f32,-1,-1}; cop = ops_##iop[mode]
+	switch( op ) {
+	case OAdd:
+		DECL_OP(ADD8,ADD16,ADD,ADDSS,ADDSD);
+		break;
+	case OSub:
+		DECL_OP(SUB8,SUB16,SUB,SUBSS,SUBSD);
+		break;
+	case OMul:
+		DECL_OP(IMUL16/*NO IMUL8*/,IMUL16,IMUL,MULSS,MULSD);
+		if( mode == M_UI8 ) mask = 0xFF;
+		break;
+	case OIncr:
+		cop = INC;
+		break;
+	case ODecr:
+		cop = DEC;
+		break;
+	case OAnd:
+		cop = AND;
+		break;
+	case OOr:
+		cop = OR;
+		break;
+	case OXor:
+		cop = XOR;
+		break;
+	case OShl:
+	case OSShr:
+	case OUShr:
+		{
+			// the shift count must be in RCX
+			ereg f = R(RCX);
+			if( b != f ) {
+				if( a == f || out == f ) {
+					EMIT(_MOV,RTMP,a,mode);
+					a = RTMP;
+				}
+				if( out == f ) {
+					// result is wanted in RCX itself: compute in RTMP, then move
+					EMIT(_MOV,f,b,mode);
+					emit_anyop(ctx, op, RTMP, RTMP, f, mode);
+					EMIT(_MOV,f,RTMP,mode);
+				} else {
+					// borrow RCX for the count and restore it afterwards
+					EMIT(_PUSH,f,UNUSED,M_PTR);
+					EMIT(_MOV,f,b,mode);
+					emit_anyop(ctx, op, out, a, f, mode);
+					EMIT(_POP,f,UNUSED,M_PTR);
+				}
+				return;
+			}
+		}
+		if( out == b ) {
+			ereg r = get_tmp(mode);
+			emit_anyop(ctx,op,r,a,b,mode);
+			emit_mov(ctx,out,r,mode);
+			return;
+		}
+		b = UNUSED; // the shift forms take a single explicit operand
+		cop = (op == OShl ? SHL : (op == OSShr ? SAR : SHR));
+		break;
+	case OSDiv:
+		F_OP(0,DIVSS,DIVSD);
+		if( IS_FLOAT(mode) ) break;
+		// integer division: fall through
+	case OSMod:
+	case OUMod:
+	case OUDiv:
+		emit_div_mod(ctx,op,out,a,b,mode);
+		return;
+	case ONot:
+		{
+			// out = a ^ 1. Work in `out` when it is a register, otherwise in the
+			// integer temporary, so that `a` is left untouched and the result always
+			// reaches `out`. When out == a is a register this emits the single XOR
+			// (emit_mov emits nothing for identical operands).
+			ereg r = IS_REG(out) ? out : RTMP;
+			emit_mov(ctx, r, a, mode);
+			EMIT(XOR,r,MK_CONST(1),M_I32);
+			emit_mov(ctx, out, r, mode);
+		}
+		return;
+	case ONeg:
+		if( IS_FLOAT(mode) ) {
+			// 0 - a
+			if( out != a && IS_REG(out) ) {
+				EMIT(mode == M_F32 ? XORPS : XORPD, out, out, mode);
+				EMIT(mode == M_F32 ? SUBSS : SUBSD, out, a, mode);
+			} else {
+				ereg tmp = get_tmp(mode);
+				EMIT(mode == M_F32 ? XORPS : XORPD, tmp, tmp, mode);
+				EMIT(mode == M_F32 ? SUBSS : SUBSD, tmp, a, mode);
+				EMIT(mode == M_F32 ? MOVSS : MOVSD, out, tmp, mode);
+			}
+			return;
+		}
+		cop = NEG;
+		break;
+	default:
+		jit_assert();
+		break;
+	}
+
+	if( out == a && IS_REG(a) ) {
+		// in-place
+		EMIT(cop,out,b,mode);
+	} else if( !IS_REG(out) || out == b ) {
+		// cannot build the result directly in `out`: go through a temporary
+		ereg tmp = get_tmp(mode);
+		emit_mov(ctx, tmp, a, mode);
+		EMIT(cop,tmp,b,mode);
+		if( mask ) {
+			EMIT(AND,tmp,MK_CONST(mask),M_I32);
+			mask = 0;
+		}
+		emit_mov(ctx, out, tmp, mode);
+	} else {
+		emit_mov(ctx, out, a, mode);
+		EMIT(cop,out,b,mode);
+	}
+	if( mask ) EMIT(AND,out,MK_CONST(mask),M_I32);
+}
+
+// Publishes the generated code of the current function to `jit` (size, bytes,
+// position map) and records the end offset in the position map slot following
+// the current op. Only the first call after hl_codegen_function has an effect.
+void hl_codegen_flush( jit_ctx *jit ) {
+	code_ctx *cg = jit->code;
+	if( !cg->flushed ) {
+		const int end_pos = cg->code.cur;
+		cg->flushed = true;
+		if( cg->pos_map ) cg->pos_map[cg->cur_op+1] = end_pos;
+		jit->code_pos_map = cg->pos_map;
+		jit->code_instrs = cg->code.values;
+		jit->code_size = end_pos;
+	}
+}
+
+// Emits ONE no-op instruction of at most `size` bytes (8, 5, 4, 3, 2 or 1 bytes
+// long). Callers that need exactly `size` bytes of padding must call again for
+// the remainder, as align_function does.
+static void emit_nop( code_ctx *ctx, int size ) {
+	// make sure the buffer can hold `size` bytes without moving the cursor
+	byte_reserve(ctx->code,size);
+	ctx->code.cur -= size;
+	if( size >= 8 ) {
+		W(0x841F0F);
+		W(0);
+	} else if( size >= 5 ) {
+		W(0x441F0F);
+		B(0);
+	} else if( size >= 4 ) {
+		W(0x401F0F);
+	} else if( size >= 3 ) {
+		B(0x0F);
+		B(0x1F);
+		B(0x00);
+	} else if( size >= 2 ) {
+		B(0x66);
+		B(0x90);
+	} else {
+		B(0x90);
+	}
+}
+
+#define CALC_REX(w,a,b) (((w)&8) ? 4 : 0) | (((b)&8) ? 2 : 0) | (((a) & 8) ? 1 : 0) // low REX bits from bit 3 of each operand: w -> 4, b -> 2, a -> 1; NOTE(review): the expansion is not parenthesised as a whole
+
+#define REX64(out,a,b)	B(0x48 | CALC_REX(out,a,b)) // always emits a REX byte, with the 0x08 (64-bit operand) bit set
+#define REX32(out,a,b)	{ int v = CALC_REX(out,a,b); if( v ) B(v|0x40); } // emits a REX byte only when one of the operands needs an extension bit
+
+static void emit_lea( code_ctx *ctx, ereg out, einstr *_e ) { /* Emits LEA out, [a + b*mult + offs] for the instruction `_e`.
+ * _e->size_offs packs the index scale in its low 8 bits (0 = no index, otherwise 1, 2, 4 or 8)
+ * and the displacement in the remaining bits; the offset carried by a register `a` is added to
+ * the displacement. Operands that are not registers are loaded into RTMP first (or, when neither
+ * is a register, `b` is loaded into `out`, which must then be a register). `_e` is not modified.
+ */
+	einstr e = *_e;
+
+	int mult = e.size_offs & 0xFF;
+	int offs = e.size_offs >> 8;
+	if( mult != 0 && (mult < 0 || mult > 8 || (mult & (mult - 1)) != 0) ) jit_assert(); // the scale must be a power of two no larger than 8
+
+	if( IS_REG(e.a) )
+		offs += REG_VALUE(e.a);
+
+	if( !IS_REG(e.a) ) {
+		// a is always a mem address !
+		emit_mov(ctx, RTMP, e.a, M_PTR);
+		e.a = RTMP;
+		if( e.b && !IS_REG(e.b) ) {
+			if( !IS_REG(out) ) jit_assert(); // RTMP already holds a, so out has to hold b
+			emit_mov(ctx, out, e.b, M_I32);
+			e.b = out;
+		}
+	} else if( e.b && !IS_REG(e.b) ) {
+		// b is always an int index !
+		emit_mov(ctx, RTMP, e.b, M_I32);
+		e.b = RTMP;
+	}
+
+	if( mult == 0 ) {
+		if( REG_KIND(e.a) != R_REG ) jit_assert();
+		// no index
+		emit_ext(ctx,_LEA,out,MK_ADDR(e.a,offs),M_PTR,0);
+		return;
+	}
+
+	bool use_offs = offs != 0 || (e.a&7) == RBP; // a displacement byte is also emitted when the base's low bits equal RBP
+	REX64(out,e.a,e.b);
+	B(0x8D);
+	MOD_RM(use_offs ? 1 : 0,out,4); // rm = 4: a SIB byte follows
+	SIB(mult,e.b,e.a);
+	if( use_offs ) {
+		if( !IS_SBYTE(offs) ) jit_assert(); // only 8-bit displacements are supported on this path
+		B(offs);
+	}
+}
+
+// Pads the code with no-op instructions until its size is a multiple of 16.
+static void align_function( code_ctx *ctx ) {
+	for(;;) {
+		int misalign = byte_count(ctx->code) & 15;
+		if( misalign == 0 )
+			break;
+		emit_nop(ctx,16 - misalign);
+	}
+}
+
+// Reserves `size` bytes in the constant table and returns their offset.
+// When `align` is non-zero the table is first padded so that the offset is a
+// multiple of `align` (the padding computation masks with align-1).
+static int reserve_const_segment( code_ctx *ctx, int size, int align ) {
+	if( align ) {
+		int rem = byte_count(ctx->const_table) & (align-1);
+		if( rem ) {
+			byte_reserve_impl(&ctx->jit->galloc,&ctx->const_table,align - rem);
+		}
+	}
+	int start = byte_count(ctx->const_table);
+	byte_reserve_impl(&ctx->jit->galloc,&ctx->const_table,size);
+	return start;
+}
+
+// Binds the 4 bytes just written at the end of the code to the 64-bit constant
+// `value`: the constant is added to the constant table unless already present,
+// and the pair (position of those 4 bytes, constant offset) is appended to const_refs.
+static void alloc_const( code_ctx *ctx, uint64 value ) {
+	// position of the 32-bit displacement that was emitted last
+	int disp_pos = ctx->jit->out_pos + byte_count(ctx->code) - 4;
+	int slot = value_map_find(ctx->const_table_lookup, value);
+	if( slot < 0 ) {
+		slot = reserve_const_segment(ctx,8,8);
+		*(uint64*)byte_addr(ctx->const_table,slot) = value;
+		value_map_add_impl(&ctx->jit->galloc,&ctx->const_table_lookup,value,slot);
+	}
+	int_arr_add_impl(&ctx->jit->galloc,&ctx->const_refs,disp_pos);
+	int_arr_add_impl(&ctx->jit->galloc,&ctx->const_refs,slot);
+}
+
+// Emits a 64-bit LEA into `out` with a zero 32-bit displacement placeholder
+// (ModRM mod=0, rm=5) and returns the position of that displacement
+// (jit->out_pos + offset in the code) so the caller can register it for patching.
+static int emit_lea_rel( code_ctx *ctx, ereg out ) {
+	int rex = 0x48;
+	if( out & 8 )
+		rex += 4;
+	B(rex);
+	B(0x8D);
+	MOD_RM(0,out&7,5);
+	int disp_pos = ctx->jit->out_pos + byte_count(ctx->code);
+	W(0);
+	return disp_pos;
+}
+
+// Returns the conditional-jump opcode matching the comparison that precedes the
+// current instruction. The instruction is found by walking backwards from
+// ctx->cur_op over MOV, JCOND, CMOV, XCHG and CXCHG; its size_offs holds the HL
+// jump opcode. Float comparisons use the unsigned conditions for the signed HL
+// jumps. A preceding DEBUG_BREAK emits a breakpoint and yields JZero.
+// Anything else is reported with jit_assert().
+static int get_cond_jump( code_ctx *ctx ) {
+	einstr *p;
+	int idx = ctx->cur_op;
+	do {
+		if( --idx < 0 ) {
+			// reached the start of the function without finding a comparison:
+			// do not read before the instruction array
+			jit_assert();
+			return JZero;
+		}
+		p = ctx->jit->reg_instrs + idx;
+	} while( p->op == MOV || p->op == JCOND || p->op == CMOV || p->op == XCHG || p->op == CXCHG );
+	int op = JZero; // defined value in case jit_assert() below returns
+	switch( p->size_offs ) {
+	case OJFalse:
+	case OJNull:
+		op = JZero;
+		break;
+	case OJTrue:
+	case OJNotNull:
+		op = JNotZero;
+		break;
+	case OJSGte:
+		op = IS_FLOAT(p->mode) ? JUGte : JSGte;
+		break;
+	case OJSGt:
+		op = IS_FLOAT(p->mode) ? JUGt : JSGt;
+		break;
+	case OJUGte:
+		op = JUGte;
+		break;
+	case OJSLt:
+		op = IS_FLOAT(p->mode) ? JULt : JSLt;
+		break;
+	case OJSLte:
+		op = IS_FLOAT(p->mode) ? JULte : JSLte;
+		break;
+	case OJULt:
+		op = JULt;
+		break;
+	case OJEq:
+		op = JEq;
+		break;
+	case OJNotEq:
+		op = JNeq;
+		break;
+	case OJNotLt:
+		op = JUGte;
+		break;
+	case OJNotGte:
+		op = JULt;
+		break;
+	case 0:
+		if( p->op == DEBUG_BREAK ) {
+			// found a debug break !
+			BREAK();
+			op = JZero;
+			break;
+		}
+		// fallback
+	default:
+		jit_assert();
+		break;
+	}
+	return op;
+}
+
+// Emits a conditional move of register `r` into register `out`. `cond` is a
+// conditional near-jump opcode; the CMOV opcode byte is derived from it by
+// subtracting 0x40. Float modes are rejected with jit_assert(). A REX byte is
+// always emitted for 8-byte modes and only when needed otherwise.
+static void emit_cmov( code_ctx *ctx, ereg out, ereg r, int cond, emit_mode m ) {
+	if( IS_FLOAT(m) ) jit_assert();
+	int rex_bits = CALC_REX(out,r,UNUSED);
+	if( hl_emit_mode_sizes[m] == 8 ) {
+		B(0x48 | rex_bits);
+	} else if( rex_bits ) {
+		B(rex_bits|0x40);
+	}
+	B(0x0F);
+	B(cond - 0x40);
+	MOD_RM(3,out,r);
+}
+
+void hl_codegen_function( jit_ctx *jit ) {
+	code_ctx *ctx = jit->code;
+	ctx->flushed = false;
+	byte_free(&ctx->code);
+	int_arr_free(&ctx->near_jumps);
+	int_arr_free(&ctx->short_jumps);
+	free(ctx->pos_map);
+	ctx->pos_map = (int*)malloc((jit->reg_instr_count + 1) * sizeof(int));
+	ctx->pos_map[0] = 0;
+	int const_addr_prev = int_arr_count(ctx->const_addr);
+	byte_reserve(ctx->code,64);
+	ctx->code.cur -= 64;
+#	ifdef GEN_DEBUG
+	int reg_index = 0;
+	int emit_index = 0;
+#	endif
+	for(int cur_pos=0;cur_pos<jit->reg_instr_count;cur_pos++) {
+		einstr *e = jit->reg_instrs + cur_pos;
+		ereg out = jit->reg_writes[cur_pos];
+		byte_reserve(ctx->code,64);
+		ctx->code.cur -= 64;
+		ctx->cur_op = cur_pos;
+		if( cur_pos > 0 ) ctx->pos_map[cur_pos] = ctx->code.cur;
+#		ifdef GEN_DEBUG
+		int rid = cur_pos | (jit->fun->findex << 16);
+		while( reg_index < jit->instr_count && jit->reg_pos_map[reg_index] <= cur_pos ) reg_index++;
+		int uid;
+		while( emit_index < jit->fun->nops && jit->emit_pos_map[emit_index] < reg_index ) {
+			uid = emit_index | (jit->fun->findex << 16);
+			__ignore(&uid);
+			__ignore(&rid);
+			emit_index++;
+			if( emit_index >= jit->fun->nops || jit->emit_pos_map[emit_index] >= reg_index )
+				emit_ext(ctx,_MOV,RTMP,VAL_CONST,M_I32,uid);
+		}
+#		endif
+		switch( e->op ) {
+		case LOAD_ARG:
+			continue; // nop
+		case MOV:
+			emit_mov(ctx, out, e->a, e->mode);
+			break;
+		case XCHG:
+			{
+				ereg tmp = get_tmp(e->mode);
+				if( !IS_REG(e->a) && !IS_REG(e->b) )
+					jit_assert();
+				emit_mov(ctx, tmp, e->a, M_PTR);
+				emit_mov(ctx, e->a, e->b, M_PTR);
+				emit_mov(ctx, e->b, tmp, M_PTR);
+			}
+			break;
+		case STORE:
+			if( !IS_REG(e->a) && !IS_REG(e->b) ) {
+				if( e->mode != M_PTR ) {
+					// no push/pop 32 bit
+					ereg tmp2 = R(RAX);
+					emit_mode mode = e->mode == M_F64 ? M_PTR : e->mode == M_F32 ? M_I32 : e->mode; 
+					EMIT(_PUSH,tmp2,UNUSED,M_PTR);
+					emit_mov(ctx, RTMP, e->a, M_PTR);
+					emit_mov(ctx, tmp2, e->b, mode);
+					emit_mov(ctx, MK_ADDR(RTMP,e->size_offs), tmp2, mode);
+					EMIT(_POP,tmp2,UNUSED,M_PTR);
+				} else {
+					if( IS_FLOAT(e->mode) ) BREAK();
+					EMIT(_PUSH,e->b,UNUSED,e->mode);
+					emit_mov(ctx, RTMP, e->a, M_PTR);
+					emit_ext(ctx, _POP,REG_ADD_OFFSET(REG_PTR(RTMP),e->size_offs), UNUSED, e->mode, 0);
+				}
+			} else if( !IS_REG(e->a) ) {
+				emit_mov(ctx, RTMP, e->a, M_PTR);
+				emit_mov(ctx, MK_ADDR(RTMP,e->size_offs), e->b, e->mode);
+			} else
+				emit_mov(ctx, REG_ADD_OFFSET(REG_PTR(e->a),e->size_offs), e->b, e->mode);
+			break;
+		case PUSH:
+			if( IS_FLOAT(e->mode) ) {
+				if( !IS_REG(e->a) )
+					EMIT(_PUSH,e->a,UNUSED,M_PTR);
+				else {
+					EMIT(SUB,R(RSP),MK_CONST(8),M_PTR);
+					EMIT(e->mode == M_F32 ? MOVSS : MOVSD,REG_PTR(R(RSP)),e->a,e->mode);
+				}
+			} else if( IS_REG(e->a) && REG_VALUE(e->a) != 0 ) {
+				emit_mov(ctx, RTMP, e->a, e->mode);
+				EMIT(_PUSH, RTMP, UNUSED, M_PTR);
+			} else
+				EMIT(_PUSH, e->a, UNUSED, M_PTR);
+			break;
+		case POP:
+			if( IS_FLOAT(e->mode) ) {
+				EMIT(e->mode == M_F32 ? MOVSS : MOVSD,REG_PTR(R(RSP)),e->a,e->mode);
+				EMIT(ADD,R(RSP),MK_CONST(8),M_PTR);
+			} else {
+				EMIT(_POP, e->a, UNUSED, M_PTR);
+			}
+			break;
+		case PUSH_CONST:
+			if( e->mode != M_PTR ) jit_assert();
+			if( (e->value&0xFF) == e->value )
+				emit_ext(ctx,PUSH8, VAL_CONST, UNUSED, M_PTR, e->value);
+			else if( (e->value&0xFFFFFFFF) == e->value )
+				emit_ext(ctx,_PUSH, VAL_CONST, UNUSED, M_I32, e->value); // will push 64bits
+			else
+				emit_ext(ctx,_PUSH, VAL_CONST, UNUSED, M_PTR, e->value);
+			break;
+		case DEBUG_BREAK:
+			BREAK();
+			break;
+		case RET:
+			if( !IS_NULL(e->a) ) {
+				ereg ret = IS_FLOAT(e->mode) ? MMX(0) : R(RAX);
+				if( e->a != ret ) emit_mov(ctx, ret, e->a, e->mode);
+			}
+			EMIT(_RET, UNUSED, UNUSED, M_NONE);
+			break;
+		case LOAD_CONST:
+			{
+				emit_mode mode = e->mode;
+				if( !IS_REG(out) )
+					mode = (mode == M_F32 ? M_I32 : mode == M_F64 ? M_PTR : mode); // don't use FP for stack ops
+				ereg w = IS_REG(out) ? out : get_tmp(mode);
+				if( e->value == 0 )
+					EMIT(mode == M_F32 ? XORPS : mode == M_F64 ? XORPD : XOR, w, w, mode);
+				else if( IS_FLOAT(mode) ) {
+					// MOVSS / MOVSD with data relative
+					B(e->mode == M_F32 ? 0xF3 : 0xF2);
+					if( out&8 ) B(0x44);
+					B(0x0F);
+					B(0x10);
+					MOD_RM(0,out&7,5);
+					W(0);					
+					alloc_const(ctx, e->value);
+				} else if( mode == M_PTR && (e->value&0xFFFFFFFF) == e->value )
+					emit_ext(ctx, _MOV, w, VAL_CONST, M_I32, e->value);
+				else
+					emit_ext(ctx, _MOV, w, VAL_CONST, mode, e->value);
+				if( w != out )
+					emit_mov(ctx, out, w, mode);
+			}
+			break;
+		case LOAD_ADDR:
+			if( IS_REG(e->a) && e->nargs == e->mode ) {
+				emit_mov(ctx, out, REG_ADD_OFFSET(REG_PTR(e->a),e->size_offs), e->nargs);
+			} else {
+				ereg tmp = IS_REG(out) || (e->nargs == e->mode) ? out : RTMP;
+				emit_mov(ctx, RTMP, e->a, M_PTR);
+				emit_mov(ctx, tmp, MK_ADDR(RTMP,e->size_offs), e->nargs);
+				if( out != tmp )
+					emit_mov(ctx, out, tmp, e->mode);
+			}
+			break;
+		case LOAD_FUN:
+			{
+				ereg w = IS_REG(out) ? out : RTMP;
+				int pos = emit_lea_rel(ctx,w);
+				int fid = e->size_offs;
+				int_arr_add_impl(&ctx->jit->galloc,&ctx->funs,pos);
+				int_arr_add_impl(&ctx->jit->galloc,&ctx->funs,fid);
+				if( w != out )
+					emit_mov(ctx, out, w, M_PTR);
+			}
+			break;
+		case CALL_FUN:
+			B(0xE8);
+			{
+				int pos = jit->out_pos + byte_count(ctx->code);
+				int fid = e->a;
+				int_arr_add_impl(&ctx->jit->galloc,&ctx->funs,pos);
+				int_arr_add_impl(&ctx->jit->galloc,&ctx->funs,fid);
+				W(0);
+			}
+			break;
+		case CALL_PTR:
+			if( e->value == (uint64)hl_null_access || e->value == (uint64)hl_jit_null_field_access ) {
+				// call near
+				int target = e->value == (uint64)hl_null_access ? ctx->null_access_pos : ctx->null_field_pos;
+				B(0xE8);
+				W(target - (jit->out_pos + byte_count(ctx->code) + 4));
+			} else {
+				// call near indirect
+				B(0xFF);
+				B(0x15);
+				W(0);
+				alloc_const(ctx, (uint64)e->value);
+				if( e->mode == M_UI8 || e->mode == M_UI16 ) {
+					// clear value upper bits
+					EMIT(e->mode == M_UI8 ? MOVZX8 : MOVZX16,R(RAX),R(RAX),M_PTR);
+				}
+			}
+			break;
+		case CALL_REG:
+			EMIT(_CALL, e->a, UNUSED, M_NONE);
+			break;
+		case TEST:
+			if( IS_FLOAT(e->mode) )
+				jit_assert();
+			if( !IS_REG(e->a) ) {
+				ereg tmp = get_tmp(e->mode);
+				emit_mov(ctx, tmp, e->a, e->mode);
+				EMIT(_TEST,tmp,tmp,e->mode);
+			} else
+				EMIT(_TEST,e->a,e->a,e->mode);
+			break;
+		case CMP:
+			{
+				CpuOp op;
+				switch( e->mode ) {
+				case M_UI8: op = CMP8; break;
+				case M_UI16: op = CMP16; break;
+				case M_F32: op = COMISS; break;
+				case M_F64: op = COMISD; break;
+				default: op = _CMP; break;
+				}
+				ereg a = e->a;
+				if( !IS_REG(e->a) && (IS_FLOAT(e->mode) || !IS_REG(e->b)) ) {
+					ereg tmp = get_tmp(e->mode);
+					emit_mov(ctx, tmp, e->a, e->mode);
+					a = tmp;
+				}
+				EMIT(op,a,e->b,e->mode);
+				if( IS_FLOAT(e->mode) && e->size_offs != OJSGt && e->size_offs != OJNull && e->size_offs != OJNotNull ) {
+					// handle NaNs
+					int jnotnan = jump_near(ctx,JNParity);
+					switch( e->size_offs ) {
+					case OJSLt:
+					case OJNotLt:
+						// set CF=0, ZF=1
+						EMIT(XOR,RTMP,RTMP,M_I32);
+						break;
+					case OJSGte:
+					case OJNotGte:
+						// set ZF=0, CF=1
+						EMIT(XOR,RTMP,RTMP,M_I32);
+						EMIT(STC,UNUSED,UNUSED,0);
+						break;
+					case OJNotEq:
+					case OJEq:
+						// set ZF=0, CF=?
+					case OJSLte:
+						// set ZF=0, CF=0
+						EMIT(TEST,R(RSP),R(RSP),M_PTR);
+						break;
+					default:
+						jit_assert();
+					}
+					patch_jump_near(ctx,jnotnan);
+				}
+			}
+			break;
+		case JCOND:
+			{
+				int jump = get_cond_jump(ctx);
+				emit_jump(ctx, jump, e->size_offs);
+			}
+			break;
+		case JUMP:
+			emit_jump(ctx, JAlways, e->size_offs);
+			break;
+		case JUMP_TABLE:
+			{
+				int start = reserve_const_segment(ctx,HL_WSIZE * e->nargs,16);
+				int pos = emit_lea_rel(ctx, RTMP);
+				int_arr_add_impl(&ctx->jit->galloc,&ctx->const_refs,pos);
+				int_arr_add_impl(&ctx->jit->galloc,&ctx->const_refs,start);
+				ereg a = RTMP;
+				ereg b = e->a;
+				if( IS_REG(b) ) {
+					// jump [a+b*8]
+					B(0x40 | ((a&8)?1:0) | ((b&8)?2:0));
+					B(0xFF);
+					B(0x24);
+					SIB(3,(b&7),(a&7));
+				} else {
+					ereg save = R(RAX);
+					EMIT(_PUSH,save,UNUSED,M_PTR);
+					EMIT(_MOV,save,b,M_I32);
+					// lea tmp, [tmp+save*8]
+					einstr etmp;
+					etmp.a = a;
+					etmp.b = save;
+					etmp.size_offs = 8;
+					emit_lea(ctx, RTMP, &etmp);
+					EMIT(_POP,save,UNUSED,M_PTR);
+					// jump [tmp]
+					B(0x40 | ((RTMP&8)?1:0));
+					B(0xFF);
+					MOD_RM(0,4,RTMP&7);
+				}
+				ereg *args = hl_emit_get_args(jit->emit,e);
+				for(int k=0;k<e->nargs;k++) {
+					int_arr_add_impl(&jit->galloc,&ctx->const_addr,start + k * HL_WSIZE);
+					int_arr_add_impl(&jit->galloc,&ctx->const_addr,ctx->cur_op + (int)args[k] + 1);
+				}
+			}
+			break;
+		case CONV_UNSIGNED:
+		case CONV:
+			{
+				emit_mode in_mode = e->size_offs;
+				ereg r = IS_REG(e->a) ? e->a : get_tmp(in_mode);
+				if( r != e->a ) emit_mov(ctx, r, e->a, in_mode);
+				CpuOp op = -1;
+				switch( ID2(e->mode,in_mode) ) {
+				case ID2(M_F32,M_UI8):
+				case ID2(M_F32,M_UI16):
+				case ID2(M_F32,M_I32):
+				case ID2(M_F32,M_PTR):
+					op = CVTSI2SS;
+					break;
+				case ID2(M_F64,M_UI8):
+				case ID2(M_F64,M_UI16):
+				case ID2(M_F64,M_I32):
+				case ID2(M_F64,M_PTR):
+					op = CVTSI2SD;
+					break;
+				case ID2(M_UI8,M_F32):
+				case ID2(M_UI16,M_F32):
+				case ID2(M_I32,M_F32):
+				case ID2(M_PTR,M_F32):
+					op = CVTTSS2SI;
+					break;
+				case ID2(M_UI8,M_F64):
+				case ID2(M_UI16,M_F64):
+				case ID2(M_I32,M_F64):
+				case ID2(M_PTR,M_F64):
+					op = CVTTSD2SI;
+					break;
+				case ID2(M_F32,M_F64):
+					op = CVTSD2SS;
+					break;
+				case ID2(M_F64,M_F32):
+					op = CVTSS2SD;
+					break;
+				case ID2(M_PTR,M_I32):
+					// sign extend 32-64 bit conv
+					op = MOVSXD;
+					break;
+				case ID2(M_UI16,M_UI8):
+				case ID2(M_I32,M_UI8):
+				case ID2(M_PTR,M_UI8):
+				case ID2(M_UI8, M_UI16):
+				case ID2(M_UI8, M_I32):
+				case ID2(M_UI8, M_PTR):
+					op = MOVZX8;
+					break;
+				case ID2(M_I32,M_UI16):
+				case ID2(M_PTR,M_UI16):
+				case ID2(M_UI16, M_I32):
+				case ID2(M_UI16, M_PTR):
+					op = MOVZX16;
+					break;
+				case ID2(M_I32,M_PTR):
+					op = _MOV;
+					break;
+				default:
+					jit_assert();
+					break;
+				}
+				if( IS_REG(out) || op == _MOV )
+					EMIT(op,out,r,e->op == CONV_UNSIGNED ? M_PTR : e->mode);
+				else {
+					ereg r2 = get_tmp(e->mode);
+					EMIT(op,r2,r,e->op == CONV_UNSIGNED ? M_PTR : e->mode);
+					emit_mov(ctx,out,r2,e->mode);
+				}
+			}
+			break;
+		case BINOP:
+		case UNOP:
+			emit_anyop(ctx, e->size_offs, out, e->a, e->b, e->mode);
+			break;
+		case LEA:
+			if( !IS_REG(out) ) {
+				ereg tmp = get_tmp(e->mode);
+				emit_lea(ctx,tmp,e);
+				emit_mov(ctx,out,tmp,e->mode);
+			} else
+				emit_lea(ctx,out,e);
+			break;
+		case STACK_OFFS:
+			if( e->size_offs >= 0 )
+				EMIT(ADD,R(RSP),MK_CONST(e->size_offs),M_PTR);
+			else
+				EMIT(SUB,R(RSP),MK_CONST(-e->size_offs),M_PTR);
+			break;
+		case PREFETCH:
+			{
+				CpuOp op;
+				switch( e->size_offs ) {
+				case 0: op = PREFETCHT0; break;
+				case 1: op = PREFETCHT1; break;
+				case 2: op = PREFETCHT2; break;
+				case 3: op = PREFETCHNTA; break;
+				case 4: op = PREFETCHW; break;
+				default: jit_assert();
+				}
+				ereg a = e->a;
+				if( !IS_REG(e->a) ) {
+					emit_mov(ctx,RTMP,e->a,M_PTR);
+					a = RTMP;
+				}
+				EMIT(op,REG_PTR(a),UNUSED,M_PTR);
+			}
+			break;
+		case CMOV:
+			{
+				int cond = get_cond_jump(ctx);
+				if( !IS_REG(out) ) jit_assert();
+				if( IS_REG(e->a) ) {
+					emit_cmov(ctx,out,e->a,cond,M_PTR);					
+				} else {
+					emit_mov(ctx,RTMP,e->a,e->mode);
+					emit_cmov(ctx,out,RTMP,cond,M_PTR);
+				}
+			}
+			break;
+		case CXCHG:
+			BREAK();
+			break;
+		case NOP:
+			emit_nop(ctx,1);
+			break;
+		default:
+			jit_assert();
+			break;
+		}
+		if( ctx->code.cur > ctx->code.max ) jit_assert();
+	}
+	align_function(ctx);
+	hl_codegen_flush(jit);
+	for(int i=0;i<int_arr_count(ctx->short_jumps);i+=2) {
+		int pos = int_arr_get(ctx->short_jumps,i);
+		int target = int_arr_get(ctx->short_jumps,i+1);
+		int offset = ctx->pos_map[target] - (pos + 1);
+		if( !IS_SBYTE(offset) ) jit_assert();
+		*(char*)&ctx->code.values[pos] = (char)offset;
+	}
+	for(int i=0;i<int_arr_count(ctx->near_jumps);i+=2) {
+		int pos = int_arr_get(ctx->near_jumps,i);
+		int target = int_arr_get(ctx->near_jumps,i+1);
+		int offset = ctx->pos_map[target] - (pos + 4);
+		*(int*)&ctx->code.values[pos] = offset;
+	}
+	for(int i=const_addr_prev;i<int_arr_count(ctx->const_addr);i+=2) {
+		int target = int_arr_get(ctx->const_addr,i+1);
+		int offs = jit->out_pos + ctx->pos_map[target];
+		ctx->const_addr.values[i+1] = offs;
+	}
+}
+
+// Creates the code generator state for `jit` and links the two together.
+// The state starts fully zeroed; hl_codegen_free releases it.
+void hl_codegen_alloc( jit_ctx *jit ) {
+	code_ctx *state = (code_ctx*)calloc(1,sizeof(code_ctx));
+	state->jit = jit;
+	jit->code = state;
+}
+
+// Declares the code emitted since `start` through hl_jit_define_function,
+// then aligns the buffer for whatever gets emitted next.
+// `start` is an absolute position (jit->out_pos + offset inside ctx->code),
+// which is how the callers compute it before emitting a stub.
+static void flush_function( code_ctx *ctx, int start ) {
+	jit_ctx *jit = ctx->jit;
+	int end = jit->out_pos + byte_count(ctx->code);
+	hl_jit_define_function(jit, start, end - start);
+	align_function(ctx);
+	// checked after aligning: the buffer content must still fit its capacity
+	if( ctx->code.max < byte_count(ctx->code) ) jit_assert();
+}
+
+void hl_codegen_init( jit_ctx *jit ) { // emits four helper stubs (null access, null field, c2hl, hl2c), then flushes them
+	code_ctx *ctx = jit->code;
+	byte_reserve(ctx->code,1024);
+	ctx->code.cur -= 1024; // move the cursor back by the reserved amount (presumably byte_reserve advanced it - TODO confirm)
+
+	// generate hl_null_access stub: build a frame, reserve 0x20 bytes and call hl_null_access
+	ctx->null_access_pos = jit->out_pos + byte_count(ctx->code); // absolute start: out_pos plus the bytes pending in ctx->code
+	EMIT(_PUSH,R(RBP),UNUSED,M_PTR);
+	EMIT(_MOV,R(RBP),R(RSP),M_PTR);
+	EMIT(SUB,R(RSP),MK_CONST(0x20),M_PTR);
+	emit_ext(ctx,_MOV,R(RAX),VAL_CONST,M_PTR,(int_val)hl_null_access); // absolute address of the C handler
+	EMIT(_CALL,R(RAX),UNUSED,M_PTR);
+	BREAK(); // presumably a trap in case the handler ever returns - TODO confirm
+	flush_function(ctx, ctx->null_access_pos);
+	
+	// generate hl_null_field access stub: same shape, but forwards one 32-bit argument to hl_jit_null_field_access
+	ctx->null_field_pos = jit->out_pos + byte_count(ctx->code);
+	EMIT(_PUSH,R(RBP),UNUSED,M_PTR);
+	EMIT(_MOV,R(RBP),R(RSP),M_PTR);
+	EMIT(SUB,R(RSP),MK_CONST(0x28),M_PTR); // NOTE(review): 0x28 here vs 0x20 above; presumably accounts for the extra word read below - confirm
+	EMIT(_MOV,jit->cfg.regs.arg[0],MK_ADDR(RBP,HL_WSIZE*2),M_I32); // first argument = slot two words above the saved RBP
+	emit_ext(ctx,_MOV,R(RAX),VAL_CONST,M_PTR,(int_val)hl_jit_null_field_access);
+	EMIT(_CALL,R(RAX),UNUSED,M_PTR);
+	BREAK(); // presumably a trap in case the handler ever returns - TODO confirm
+	flush_function(ctx, ctx->null_field_pos);
+
+	// generate c2hl stub: takes (fptr, vargs, nargs) in the first three argument registers and calls fptr with values read from vargs
+	jit->code_funs.c2hl = jit->out_pos + byte_count(ctx->code);
+	regs_config *cfg = &jit->cfg;
+	EMIT(_PUSH,R(RBP),UNUSED,M_PTR);
+	EMIT(_MOV,R(RBP),R(RSP),M_PTR);
+
+	ereg fptr = scratch_not_param[0]; // the three inputs are moved out of the argument registers, which get overwritten below
+	ereg vargs = scratch_not_param[1];
+	ereg nargs = scratch_not_param[2];
+	EMIT(_MOV,fptr,cfg->regs.arg[0],M_PTR);
+	EMIT(_MOV,vargs,cfg->regs.arg[1],M_PTR);
+	EMIT(_MOV,nargs,cfg->regs.arg[2],M_I32);
+
+	for(int i=0;i<cfg->regs.nargs;i++) // load every integer argument register from vargs[i]
+		EMIT(_MOV, cfg->regs.arg[i], MK_ADDR(vargs,i*8), M_PTR);
+	for(int i=0;i<cfg->floats.nargs;i++) // then every float argument register from the slots that follow
+		EMIT(MOVSD, cfg->floats.arg[i]-64, MK_ADDR(vargs,(i + cfg->regs.nargs) * 8), M_PTR); // NOTE(review): -64 is applied here but not to floats.arg[] in the hl2c stub - confirm both are intended
+
+	EMIT(ADD,vargs,MK_CONST((MAX_ARGS - 1) * HL_WSIZE),M_PTR); // vargs now points at slot MAX_ARGS-1
+	int begin = byte_count(ctx->code); // loop head
+	EMIT(_TEST,nargs,nargs,M_I32);
+	int pos = jump_near(ctx,JZero); // leave the loop once nargs reaches zero
+	EMIT(_PUSH,MK_ADDR(vargs,0),UNUSED,M_PTR); // push the current slot, then step one slot down
+	EMIT(SUB,vargs,MK_CONST(HL_WSIZE),M_PTR);
+	EMIT(DEC,nargs,UNUSED,M_I32);
+	jump_near(ctx,-begin); // negative operand: presumably a backward jump to offset `begin` - TODO confirm
+	patch_jump_near(ctx,pos);
+
+	if( IS_WINCALL64 ) EMIT(SUB,R(RSP),MK_CONST(0x20),M_PTR); // presumably the 32-byte Win64 shadow space - confirm
+	EMIT(_CALL, fptr, UNUSED, M_NONE);
+
+	EMIT(_MOV,R(RSP),R(RBP),M_PTR); // restoring RSP from RBP drops everything pushed since the prologue
+	EMIT(_POP,R(RBP),UNUSED,M_PTR);
+	EMIT(_RET,UNUSED,UNUSED,M_NONE);
+	
+	flush_function(ctx, jit->code_funs.c2hl);
+
+	// generate hl2c stub: saves all argument registers on the stack, then calls hl_jit_wrapper_ptr or hl_jit_wrapper_d
+	jit->code_funs.hl2c = jit->out_pos + byte_count(ctx->code);
+	ereg cl = cfg->regs.arg[0]; // first argument register, used below as the object whose type is inspected (arg0 in the pseudo-code)
+	ereg tmp = cfg->regs.arg[1]; // second argument register, reused as a temporary after its value has been pushed
+	EMIT(_PUSH,R(RBP),UNUSED,M_PTR);
+	EMIT(_MOV,R(RBP),R(RSP),M_PTR);
+	EMIT(SUB,R(RSP),MK_CONST(cfg->floats.nargs*8),M_PTR); // 8 bytes per float argument register
+
+	// push all possible call registers: floats first, then integers, both in reverse order
+	for(int i=0;i<cfg->floats.nargs;i++)
+		EMIT(MOVSD,MK_ADDR(RSP,i*8),cfg->floats.arg[cfg->floats.nargs - 1 - i],M_F64);
+	for(int i=0;i<cfg->regs.nargs;i++)
+		EMIT(_PUSH,cfg->regs.arg[cfg->regs.nargs - 1 - i],UNUSED,M_PTR);
+
+	// opcodes for:
+	//		switch( arg0->t->fun->ret->kind ) {
+	//		case HF32: case HF64: return jit_wrapper_d(arg0,&args);
+	//		default: return jit_wrapper_ptr(arg0,&args);
+	//		}
+	hl_type_fun *ft = NULL; // never dereferenced: only used to compute the offset of `ret`
+	ereg fun_ptr = scratch_not_param[0];
+
+	EMIT(_MOV,tmp,MK_ADDR(cl,0),M_PTR); // ->t
+	EMIT(_MOV,tmp,MK_ADDR(tmp,HL_WSIZE),M_PTR); // ->fun
+	EMIT(_MOV,tmp,MK_ADDR(tmp,(int)(int_val)&ft->ret),M_PTR); // ->ret
+	EMIT(_MOV,tmp,MK_ADDR(tmp,0),M_I32); // ->kind
+
+	EMIT(_CMP,tmp,MK_CONST(HF64),M_I32);
+	int float1 = jump_near(ctx,JEq);
+	EMIT(_CMP,tmp,MK_CONST(HF32),M_I32);
+	int float2 = jump_near(ctx,JEq);
+	emit_ext(ctx,_MOV,fun_ptr,VAL_CONST,M_PTR,(int_val)hl_jit_wrapper_ptr); // default case: non-float return
+	
+	int jexit = jump_near(ctx, JAlways); // skip the float case
+	patch_jump_near(ctx, float1);
+	patch_jump_near(ctx, float2);
+	emit_ext(ctx,_MOV,fun_ptr,VAL_CONST,M_PTR,(int_val)hl_jit_wrapper_d); // HF32 / HF64 return
+	patch_jump_near(ctx, jexit);
+
+	int stack_args_pos = HL_WSIZE * (IS_64?2:3); // offset from RBP; on 64-bit the two words are the saved RBP and the return address
+	if( IS_WINCALL64 ) {
+		stack_args_pos += 0x20; // presumably skips the caller's Win64 shadow space - confirm
+		EMIT(SUB,R(RSP),MK_CONST(0x20),M_PTR);
+	}
+	EMIT(_LEA,cfg->regs.arg[1],MK_ADDR(R(RBP),stack_args_pos),M_PTR); // second argument: address at RBP + stack_args_pos
+	EMIT(_LEA,cfg->regs.arg[2],MK_ADDR(R(RBP),-(cfg->floats.nargs * 8 + cfg->regs.nargs * HL_WSIZE)),M_PTR); // third argument: lowest address of the register save area built above
+	EMIT(_CALL,fun_ptr,UNUSED,M_PTR);
+
+	if( IS_WINCALL64 )
+		EMIT(ADD,R(RSP),MK_CONST(0x20),M_PTR);
+
+	EMIT(_MOV,R(RSP),R(RBP),M_PTR); // drop the save area and restore the caller's frame
+	EMIT(_POP,R(RBP),UNUSED,M_PTR);
+	EMIT(_RET,UNUSED,UNUSED,M_NONE);
+	
+	flush_function(ctx, jit->code_funs.hl2c);
+
+	
+	hl_codegen_flush(jit);
+}
+
+// Releases the code generator state attached to `jit` together with its
+// pos_map buffer.
+// NOTE(review): the other arrays held by the context (code, short_jumps,
+// near_jumps, ...) are not released here; presumably they are owned by
+// jit->galloc or freed elsewhere - confirm.
+void hl_codegen_free( jit_ctx *jit ) {
+	code_ctx *state = jit->code;
+	void *pos_map = state->pos_map;
+	free(state);
+	free(pos_map);
+}
+
+// Resolves the references recorded while emitting code, then drops the
+// constant table:
+//  - ctx->funs stores (pos, findex) pairs; the int at jit->output + pos receives
+//    the distance from the end of that 4-byte slot to the value stored in
+//    jit->mod->functions_ptrs[findex]
+//  - ctx->const_refs stores (pos, table offset) pairs, patched the same way
+//    against the constant table, whose position is the current jit->out_pos
+// The table content is exposed through jit->code_instrs / jit->code_size.
+void hl_codegen_flush_consts( jit_ctx *jit ) {
+	code_ctx *ctx = jit->code;
+
+	// function references
+	int nfuns = int_arr_count(ctx->funs);
+	for(int k=0;k<nfuns;k+=2) {
+		int slot = int_arr_get(ctx->funs,k);
+		int findex = int_arr_get(ctx->funs,k+1);
+		int target = (int)(int_val)jit->mod->functions_ptrs[findex];
+		*(int*)(jit->output + slot) = target - (slot + 4);
+	}
+	int_arr_reset(&ctx->funs);
+
+	// the constant table becomes the pending code, located at the current output position
+	ctx->const_table_pos = jit->out_pos;
+	jit->code_instrs = ctx->const_table.values;
+	jit->code_size = byte_count(ctx->const_table);
+
+	// constant references
+	int nrefs = int_arr_count(ctx->const_refs);
+	for(int k=0;k<nrefs;k+=2) {
+		int slot = int_arr_get(ctx->const_refs,k);
+		int table_offs = int_arr_get(ctx->const_refs,k+1);
+		*(int*)(jit->output + slot) = (ctx->const_table_pos + table_offs) - (slot + 4);
+	}
+	int_arr_reset(&ctx->const_refs);
+
+	// NOTE(review): jit->code_instrs still refers to const_table.values at this
+	// point; confirm that byte_free does not invalidate it before it is consumed.
+	byte_free(&ctx->const_table);
+	value_map_free(&ctx->const_table_lookup);
+}
+
+// Last pass, run when jit->final_code is known: each (pos, offs) pair of
+// ctx->const_addr designates a pointer-sized slot at `pos` inside the constant
+// table, which receives the absolute address jit->final_code + offs.
+// The pair list is freed afterwards.
+void hl_codegen_final( jit_ctx *jit ) {
+	code_ctx *ctx = jit->code;
+	int npairs = int_arr_count(ctx->const_addr);
+	for(int k=0;k<npairs;k+=2) {
+		int slot = ctx->const_table_pos + int_arr_get(ctx->const_addr,k);
+		int target = int_arr_get(ctx->const_addr,k+1);
+		*(void**)(jit->final_code + slot) = jit->final_code + target;
+	}
+	int_arr_free(&ctx->const_addr);
+}
diff --git a/src/main.c b/src/main.c
index 6054060d0..5ad605e36 100644
--- a/src/main.c
+++ b/src/main.c
@@ -20,7 +20,7 @@
  * DEALINGS IN THE SOFTWARE.
  */
 #include <hl.h>
-#include <hlmodule.h>
+#include <jit.h>
 #include "hlsystem.h"
 
 #ifdef HL_WIN
@@ -259,7 +259,7 @@ int main(int argc, pchar *argv[]) {
 		file = PSTR("hlboot.dat");
 		fchk = pfopen(file,"rb");
 		if( fchk == NULL ) {
-			printf("HL/JIT %d.%d.%d (c)2015-2025 Haxe Foundation\n  Usage : hl [--debug <port>] [--debug-wait] <file>\n",HL_VERSION>>16,(HL_VERSION>>8)&0xFF,HL_VERSION&0xFF);
+			printf("HL/JIT %d.%d.%d (c)2015-2026 Haxe Foundation\n  Usage : hl [--debug <port>] [--debug-wait] <file>\n",HL_VERSION>>16,(HL_VERSION>>8)&0xFF,HL_VERSION&0xFF);
 			return 1;
 		}
 		fclose(fchk);
diff --git a/src/module.c b/src/module.c
index e668b1064..b6d7a4a97 100644
--- a/src/module.c
+++ b/src/module.c
@@ -21,6 +21,7 @@
  */
 #include <hl.h>
 #include <hlmodule.h>
+#include <jit.h>
 
 #ifdef HL_WIN
 #	undef _GUID
@@ -34,6 +35,10 @@ EXTERN_C IMAGE_DOS_HEADER __ImageBase;
 
 #define HOT_RELOAD_EXTRA_GLOBALS	4096
 
+#ifdef HL_DEBUG
+#	define ALLOW_DUMP
+#endif
+
 HL_API void hl_prim_not_loaded( const uchar *err );
 
 static hl_module **cur_modules = NULL;
@@ -72,7 +77,7 @@ static bool module_resolve_pos( hl_module *m, void *addr, int *fidx, int *fpos )
 	while( min < max ) {
 		int mid = (min + max) >> 1;
 		int offset = dbg->large ? ((int*)dbg->offsets)[mid] : ((unsigned short*)dbg->offsets)[mid];
-		if( offset <= code_pos )
+		if( offset < code_pos )
 			min = mid + 1;
 		else
 			max = mid;
@@ -224,10 +229,8 @@ static int module_capture_stack( void **stack, int size ) {
 			unsigned char *code = m->jit_code;
 			int code_size = m->codesize;
 			if( module_addr >= (void*)code && module_addr < (void*)(code + code_size) ) {
-				if( stack && count == size ) {
+				if( stack && count == size )
 					break;
-				}
-
 				if( stack )
 					stack[count++] = module_addr;
 				else
@@ -248,6 +251,41 @@ static int module_capture_stack( void **stack, int size ) {
 		}
 	}
 	return count;
+#elif defined(__aarch64__) || defined(_M_ARM64)
+	// On AArch64, walk the frame pointer (X29) chain instead of scanning the stack.
+	// The heuristic scanner produces false positives from callee-saved register spills
+	// (STP X19,X20 etc.) that look like (stack_addr, code_addr) pairs.
+	void *stack_top = hl_get_thread()->stack_top;
+	void **fp = (void **)__builtin_frame_address(0);
+	int count = 0;
+	while( fp && (void *)fp < stack_top ) {
+		void *lr = fp[1];
+		void *next_fp = fp[0];
+		int i;
+		for(i=0;i<modules_count;i++) {
+			hl_module *m = cur_modules[i];
+			unsigned char *code = m->jit_code;
+			int code_size = m->codesize;
+			if( lr >= (void*)code && lr < (void*)(code + code_size) ) {
+				if( m->jit_debug ) {
+					int s = m->jit_debug[0].start;
+					code += s;
+					code_size -= s;
+					if( lr < (void*)code || lr >= (void*)(code + code_size) ) continue;
+				}
+				if( stack ) {
+					if( count == size ) return count;
+					stack[count] = lr;
+				}
+				count++;
+				break;
+			}
+		}
+		if( next_fp == NULL || next_fp <= (void *)fp || next_fp >= stack_top )
+			break;
+		fp = (void **)next_fp;
+	}
+	return count;
 #else
 	return hl_module_capture_stack_range(hl_get_thread()->stack_top, (void**)&stack, stack, size);
 #endif
@@ -705,21 +743,57 @@ int hl_module_init( hl_module *m, h_bool hot_reload ) {
 	if( hot_reload ) m->hash = hl_code_hash_alloc(m->code);
 	hl_module_init_natives(m);
 	hl_module_init_indexes(m);
+#	ifdef WIN64_UNWIND_TABLES
+	m->unwind_table_size = m->code->nfunctions + 10; // extra space for jit internals
+	m->unwind_table = malloc(sizeof(RUNTIME_FUNCTION) * m->unwind_table_size);
+	memset(m->unwind_table, 0, sizeof(RUNTIME_FUNCTION) * m->unwind_table_size);
+#	endif
 	// JIT
 	ctx = hl_jit_alloc();
 	if( ctx == NULL )
 		return 0;
 	hl_jit_init(ctx, m);
+#	ifdef ALLOW_DUMP
+	bool dump = false;
+	int filter = -1;
+	for(i=0;i<hl_setup.sys_nargs;i++) {
+		uchar *arg = hl_setup.sys_args[i];
+		if( ucmp(arg,USTR("--dump")) == 0 ) dump = true;
+		if( ucmp(arg,USTR("--dump-bin")) == 0 ) hl_jit_dump_bin = true;
+		if( memcmp(arg,USTR("--dump="),sizeof(USTR("--dump"))) == 0 ) {
+			dump = true;
+			filter = 0;
+			int pos = 7;
+			while( arg[pos] ) {
+				filter *= 16;
+				if( arg[pos] >= '0' && arg[pos] <= '9' )
+					filter |= arg[pos] - '0';
+				else
+					filter |= arg[pos] - 'A' + 10;
+				pos++;
+			}
+		}
+	}
+#	endif
 	for(i=0;i<m->code->nfunctions;i++) {
 		hl_function *f = m->code->functions + i;
+#		ifdef ALLOW_DUMP
+		if( filter >= 0 && filter != f->findex ) continue;
+#		endif
 		int fpos = hl_jit_function(ctx, m, f);
 		if( fpos < 0 ) {
 			hl_jit_free(ctx, false);
 			return 0;
 		}
 		m->functions_ptrs[f->findex] = (void*)(int_val)fpos;
+#		ifdef ALLOW_DUMP
+		if( dump ) hl_emit_dump(ctx);
+#		endif
 	}
 	m->jit_code = hl_jit_code(ctx, m, &m->codesize, &m->jit_debug, NULL);
+#	ifdef ALLOW_DUMP
+	if( filter >= 0 ) exit(0);
+#	endif
 	for(i=0;i<m->code->nfunctions;i++) {
 		hl_function *f = m->code->functions + i;
 		m->functions_ptrs[f->findex] = ((unsigned char*)m->jit_code) + ((int_val)m->functions_ptrs[f->findex]);
@@ -735,6 +809,9 @@ int hl_module_init( hl_module *m, h_bool hot_reload ) {
 	hl_gc_set_dump_types(hl_module_types_dump);
 #	ifdef HL_VTUNE
 	hl_setup.vtune_init = modules_init_vtune;
+#	endif
+#	ifdef WIN64_UNWIND_TABLES
+	RtlAddFunctionTable(m->unwind_table, m->unwind_table_size, (DWORD64)m->jit_code);
 #	endif
 	hl_jit_free(ctx, hot_reload);
 	if( hot_reload ) {
diff --git a/src/opcodes.h b/src/opcodes.h
index ab9b1fa51..9e4df7f60 100644
--- a/src/opcodes.h
+++ b/src/opcodes.h
@@ -67,8 +67,8 @@ OP_BEGIN
 	OP(OIncr,R,X,X)
 	OP(ODecr,R,X,X)
 
-	OP(OCall0,R,R,X)
-	OP(OCall1,R,R,R)
+	OP(OCall0,R,C,X)
+	OP(OCall1,R,C,R)
 	OP(OCall2,R,AR,4)
 	OP(OCall3,R,AR,5)
 	OP(OCall4,R,AR,6)
@@ -78,17 +78,17 @@ OP_BEGIN
 	OP(OCallClosure,R,AR,VAR_ARGS)
 
 	OP(OStaticClosure,R,G,X)
-	OP(OInstanceClosure,R,R,G)
+	OP(OInstanceClosure,R,C,R)
 	OP(OVirtualClosure,R,R,G)
 
 	OP(OGetGlobal,R,G,X)
-	OP(OSetGlobal,R_NW,G,X)
-	OP(OField,R,R,C)
-	OP(OSetField,R_NW,R,C)
-	OP(OGetThis,R,C,X)
-	OP(OSetThis,R_NW,R,X)
+	OP(OSetGlobal,G,R,X)
+	OP(OField,R,R,G)
+	OP(OSetField,R_NW,G,R)
+	OP(OGetThis,R,G,X)
+	OP(OSetThis,G,R,X)
 	OP(ODynGet,R,R,C)
-	OP(ODynSet,R_NW,R,C)
+	OP(ODynSet,R_NW,C,R)
 
 	OP(OJTrue,R_NW,J,X)
 	OP(OJFalse,R_NW,J,X)
@@ -134,7 +134,7 @@ OP_BEGIN
 
 	OP(ONew,R,X,X)
 	OP(OArraySize,R,R,X)
-	OP(OType,R,R,X)
+	OP(OType,R,G,X)
 	OP(OGetType,R,R,X)
 	OP(OGetTID,R,R,X)
 
diff --git a/src/profile.c b/src/profile.c
index e0df0efc3..09ba265ed 100644
--- a/src/profile.c
+++ b/src/profile.c
@@ -146,13 +146,23 @@ static void *get_thread_stackptr( thread_handle *t, void **eip ) {
 	return (void*)c.Esp;
 #	endif
 #elif defined(HL_LINUX)
-#	ifdef HL_64
+#	if defined(__aarch64__) || defined(_M_ARM64)
+	*eip = (void*)shared_context.context.uc_mcontext.pc;
+	return (void*)shared_context.context.uc_mcontext.sp;
+#	elif defined(HL_64)
 	*eip = (void*)shared_context.context.uc_mcontext.gregs[REG_RIP];
 	return (void*)shared_context.context.uc_mcontext.gregs[REG_RSP];
 #	else
 	*eip = (void*)shared_context.context.uc_mcontext.gregs[REG_EIP];
 	return (void*)shared_context.context.uc_mcontext.gregs[REG_ESP];
 #	endif
+#elif defined(HL_MAC) && defined(__aarch64__)
+	struct __darwin_mcontext64 *mcontext = shared_context.context.uc_mcontext;
+	if (mcontext != NULL) {
+		*eip = (void*)mcontext->__ss.__pc;
+		return (void*)mcontext->__ss.__sp;
+	}
+	return NULL;
 #elif defined(HL_MAC) && defined(__x86_64__)
 	struct __darwin_mcontext64 *mcontext = shared_context.context.uc_mcontext;
 	if (mcontext != NULL) {
diff --git a/src/std/types.c b/src/std/types.c
index eaf228db6..8db708185 100644
--- a/src/std/types.c
+++ b/src/std/types.c
@@ -35,7 +35,7 @@ HL_PRIM hl_type hlt_bool = { HBOOL };
 HL_PRIM hl_type hlt_abstract = { HABSTRACT, {USTR("<abstract>")} };
 
 static const uchar *TSTR[] = {
-	USTR("void"), USTR("i8"), USTR("i16"), USTR("i32"), USTR("i64"), USTR("f32"), USTR("f64"),
+	USTR("void"), USTR("ui8"), USTR("ui16"), USTR("i32"), USTR("i64"), USTR("f32"), USTR("f64"),
 	USTR("bool"), USTR("bytes"), USTR("dynamic"), NULL, NULL,
 	USTR("array"), USTR("type"), NULL, NULL, USTR("dynobj"),
 	NULL, NULL, NULL, NULL, NULL, NULL, USTR("guid")
@@ -43,8 +43,8 @@ static const uchar *TSTR[] = {
 
 static int T_SIZES[] = {
 	0, // VOID
-	1, // I8
-	2, // I16
+	1, // UI8
+	2, // UI16
 	4, // I32
 	8, // I64
 	4, // F32
@@ -160,8 +160,8 @@ HL_PRIM bool hl_same_type( hl_type *a, hl_type *b ) {
 HL_PRIM bool hl_is_dynamic( hl_type *t ) {
 	static bool T_IS_DYNAMIC[] = {
 		false, // HVOID,
-		false, // HI8
-		false, // HI16
+		false, // HUI8
+		false, // HUI16
 		false, // HI32
 		false, // HI64
 		false, // HF32
@@ -190,8 +190,8 @@ HL_PRIM bool hl_is_dynamic( hl_type *t ) {
 HL_PRIM bool hl_is_ptr( hl_type *t ) {
 	static bool T_IS_PTR[] = {
 		false, // HVOID,
-		false, // HI8
-		false, // HI16
+		false, // HUI8
+		false, // HUI16
 		false, // HI32
 		false, // HI64
 		false, // HF32