From 3c338b2760121570a2f7bae73535a7180c083171 Mon Sep 17 00:00:00 2001
From: Garrick Meeker <gmeeker@gmail.com>
Date: Wed, 11 Mar 2026 16:56:47 -0700
Subject: [PATCH 01/16] OpenCL: Initial backend

---
 CMakeLists.txt    |   34 +
 README.md         |    5 +
 main.cpp          |  787 +++++++++++---------
 spirv_cross_c.cpp |   12 +
 spirv_cross_c.h   | 1780 +++++++++++++++++++++++----------------------
 spirv_opencl.cpp  | 1067 +++++++++++++++++++++++++++
 spirv_opencl.hpp  |  125 ++++
 test_shaders.py   |  133 +++-
 8 files changed, 2726 insertions(+), 1217 deletions(-)
 create mode 100644 spirv_opencl.cpp
 create mode 100644 spirv_opencl.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8e5129c8b..d9ac3b141 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -45,6 +45,7 @@ option(SPIRV_CROSS_ENABLE_TESTS "Enable SPIRV-Cross tests." ON)
 option(SPIRV_CROSS_ENABLE_GLSL "Enable GLSL support." ON)
 option(SPIRV_CROSS_ENABLE_HLSL "Enable HLSL target support." ON)
 option(SPIRV_CROSS_ENABLE_MSL "Enable MSL target support." ON)
+option(SPIRV_CROSS_ENABLE_OPENCL "Enable OpenCL target support." ON)
 option(SPIRV_CROSS_ENABLE_CPP "Enable C++ target support." ON)
 option(SPIRV_CROSS_ENABLE_REFLECT "Enable JSON reflection target support." ON)
 option(SPIRV_CROSS_ENABLE_C_API "Enable C API wrapper support in static library." ON)
@@ -242,6 +243,10 @@ set(spirv-cross-msl-sources
 		${CMAKE_CURRENT_SOURCE_DIR}/spirv_msl.cpp
 		${CMAKE_CURRENT_SOURCE_DIR}/spirv_msl.hpp)
 
+set(spirv-cross-opencl-sources
+		${CMAKE_CURRENT_SOURCE_DIR}/spirv_opencl.cpp
+		${CMAKE_CURRENT_SOURCE_DIR}/spirv_opencl.hpp) # OpenCL backend sources, used by both the static and the shared library targets.
+
 set(spirv-cross-hlsl-sources
 		${CMAKE_CURRENT_SOURCE_DIR}/spirv_hlsl.cpp
 		${CMAKE_CURRENT_SOURCE_DIR}/spirv_hlsl.hpp)
@@ -306,6 +311,16 @@ if (SPIRV_CROSS_STATIC)
 		endif()
 	endif()
 
+	if (SPIRV_CROSS_ENABLE_OPENCL) # Build the OpenCL backend as its own static library.
+		spirv_cross_add_library(spirv-cross-opencl spirv_cross_opencl STATIC
+				${spirv-cross-opencl-sources})
+		if (SPIRV_CROSS_ENABLE_GLSL) # The OpenCL library links against spirv-cross-glsl, so GLSL must be enabled.
+			target_link_libraries(spirv-cross-opencl PRIVATE spirv-cross-glsl)
+		else()
+			message(FATAL_ERROR "Must enable GLSL support to enable OpenCL support.")
+		endif()
+	endif()
+
 	if (SPIRV_CROSS_ENABLE_HLSL)
 		spirv_cross_add_library(spirv-cross-hlsl spirv_cross_hlsl STATIC
 				${spirv-cross-hlsl-sources})
@@ -343,6 +358,11 @@ if (SPIRV_CROSS_STATIC)
 			target_compile_definitions(spirv-cross-c PRIVATE SPIRV_CROSS_C_API_MSL=1)
 		endif()
 
+		if (SPIRV_CROSS_ENABLE_OPENCL) # Link the OpenCL backend into the static C API library.
+			target_link_libraries(spirv-cross-c PRIVATE spirv-cross-opencl)
+			target_compile_definitions(spirv-cross-c PRIVATE SPIRV_CROSS_C_API_OPENCL=1)
+		endif()
+
 		if (SPIRV_CROSS_ENABLE_CPP)
 			target_link_libraries(spirv-cross-c PRIVATE spirv-cross-cpp)
 			target_compile_definitions(spirv-cross-c PRIVATE SPIRV_CROSS_C_API_CPP=1)
@@ -393,6 +413,15 @@ if (SPIRV_CROSS_SHARED)
 		target_compile_definitions(spirv-cross-c-shared PRIVATE SPIRV_CROSS_C_API_MSL=1)
 	endif()
 
+	if (SPIRV_CROSS_ENABLE_OPENCL) # Compile the OpenCL backend sources directly into the shared C library.
+		if (SPIRV_CROSS_ENABLE_GLSL) # OpenCL is only allowed when the GLSL backend is enabled.
+			target_sources(spirv-cross-c-shared PRIVATE ${spirv-cross-opencl-sources})
+		else()
+			message(FATAL_ERROR "Must enable GLSL support to enable OpenCL support.")
+		endif()
+		target_compile_definitions(spirv-cross-c-shared PRIVATE SPIRV_CROSS_C_API_OPENCL=1)
+	endif()
+
 	if (SPIRV_CROSS_ENABLE_CPP)
 		if (SPIRV_CROSS_ENABLE_GLSL)
 			target_sources(spirv-cross-c-shared PRIVATE ${spirv-cross-cpp-sources})
@@ -439,6 +468,10 @@ if (SPIRV_CROSS_CLI)
 		message(FATAL_ERROR "Must enable MSL if building CLI.")
 	endif()
 
+	if (NOT SPIRV_CROSS_ENABLE_OPENCL) # The CLI links spirv-cross-opencl unconditionally, so the backend is mandatory here.
+		message(FATAL_ERROR "Must enable OpenCL if building CLI.")
+	endif()
+
 	if (NOT SPIRV_CROSS_ENABLE_CPP)
 		message(FATAL_ERROR "Must enable C++ if building CLI.")
 	endif()
@@ -468,6 +501,7 @@ if (SPIRV_CROSS_CLI)
 			spirv-cross-cpp
 			spirv-cross-reflect
 			spirv-cross-msl
+			spirv-cross-opencl
 			spirv-cross-util
 			spirv-cross-core)
 
diff --git a/README.md b/README.md
index a1aa5511d..9030cd2a7 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,7 @@ SPIRV-Cross is a tool designed for parsing and converting SPIR-V to other shader
   - Convert SPIR-V to readable, usable and efficient GLSL
   - Convert SPIR-V to readable, usable and efficient Metal Shading Language (MSL)
   - Convert SPIR-V to readable, usable and efficient HLSL
+  - Convert SPIR-V to readable, usable and efficient OpenCL C (compute shaders only)
   - Convert SPIR-V to a JSON reflection format
   - Convert SPIR-V to debuggable C++ [DEPRECATED]
   - Reflection API to simplify the creation of Vulkan pipeline layouts
@@ -546,6 +547,10 @@ To test the roundtrip path GLSL -> SPIR-V -> MSL, `--msl` can be added, e.g. `./
 
 To test the roundtrip path GLSL -> SPIR-V -> HLSL, `--hlsl` can be added, e.g. `./test_shaders.py --hlsl shaders-hlsl`.
 
+### OpenCL backend
+
+To test the roundtrip path GLSL -> SPIR-V -> OpenCL, `--opencl` can be added, e.g. `./test_shaders.py --opencl shaders-opencl`.
+
 ### Updating regression tests
 
 When legitimate changes are found, use `--update` flag to update regression files.
diff --git a/main.cpp b/main.cpp
index 7dc5404dd..adcfccbdd 100644
--- a/main.cpp
+++ b/main.cpp
@@ -26,6 +26,7 @@
 #include "spirv_glsl.hpp"
 #include "spirv_hlsl.hpp"
 #include "spirv_msl.hpp"
+#include "spirv_opencl.hpp"
 #include "spirv_parser.hpp"
 #include "spirv_reflect.hpp"
 #include <algorithm>
@@ -39,8 +40,8 @@
 #include <unordered_set>
 
 #ifdef _WIN32
-#include <io.h>
 #include <fcntl.h>
+#include <io.h>
 #endif
 
 #ifdef HAVE_SPIRV_CROSS_GIT_VERSION
@@ -297,10 +298,17 @@ static void print_resources(const Compiler &compiler, StorageClass storage,
 		auto &type = compiler.get_type(res.value_type_id);
 		switch (type.basetype)
 		{
-		case SPIRType::Float: basetype = "float"; break;
-		case SPIRType::Int: basetype = "int"; break;
-		case SPIRType::UInt: basetype = "uint"; break;
-		default: break;
+		case SPIRType::Float:
+			basetype = "float";
+			break;
+		case SPIRType::Int:
+			basetype = "int";
+			break;
+		case SPIRType::UInt:
+			basetype = "uint";
+			break;
+		default:
+			break;
 		}
 
 		uint32_t array_size = 0;
@@ -326,16 +334,30 @@ static void print_resources(const Compiler &compiler, StorageClass storage,
 		string builtin_str;
 		switch (res.builtin)
 		{
-		case BuiltInPosition: builtin_str = "Position"; break;
-		case BuiltInPointSize: builtin_str = "PointSize"; break;
-		case BuiltInCullDistance: builtin_str = "CullDistance"; break;
-		case BuiltInClipDistance: builtin_str = "ClipDistance"; break;
-		case BuiltInTessLevelInner: builtin_str = "TessLevelInner"; break;
-		case BuiltInTessLevelOuter: builtin_str = "TessLevelOuter"; break;
-		default: builtin_str = string("builtin #") + to_string(res.builtin);
+		case BuiltInPosition:
+			builtin_str = "Position";
+			break;
+		case BuiltInPointSize:
+			builtin_str = "PointSize";
+			break;
+		case BuiltInCullDistance:
+			builtin_str = "CullDistance";
+			break;
+		case BuiltInClipDistance:
+			builtin_str = "ClipDistance";
+			break;
+		case BuiltInTessLevelInner:
+			builtin_str = "TessLevelInner";
+			break;
+		case BuiltInTessLevelOuter:
+			builtin_str = "TessLevelOuter";
+			break;
+		default:
+			builtin_str = string("builtin #") + to_string(res.builtin);
 		}
 
-		fprintf(stderr, "Builtin %s (%s) (active: %s).\n", builtin_str.c_str(), type_str.c_str(), active ? "yes" : "no");
+		fprintf(stderr, "Builtin %s (%s) (active: %s).\n", builtin_str.c_str(), type_str.c_str(),
+		        active ? "yes" : "no");
 	}
 	fprintf(stderr, "=============\n\n");
 }
@@ -465,63 +487,65 @@ static void print_resources(const Compiler &compiler, const ShaderResources &res
 	fprintf(stderr, "\n");
 
 	fprintf(stderr, "Execution modes:\n");
-	modes.for_each_bit([&](uint32_t i) {
-		auto mode = static_cast<ExecutionMode>(i);
-		uint32_t arg0 = compiler.get_execution_mode_argument(mode, 0);
-		uint32_t arg1 = compiler.get_execution_mode_argument(mode, 1);
-		uint32_t arg2 = compiler.get_execution_mode_argument(mode, 2);
-
-		switch (static_cast<ExecutionMode>(i))
-		{
-		case ExecutionModeInvocations:
-			fprintf(stderr, "  Invocations: %u\n", arg0);
-			break;
-
-		case ExecutionModeLocalSize:
-			fprintf(stderr, "  LocalSize: (%u, %u, %u)\n", arg0, arg1, arg2);
-			break;
-
-		case ExecutionModeOutputVertices:
-			fprintf(stderr, "  OutputVertices: %u\n", arg0);
-			break;
+	modes.for_each_bit(
+	    [&](uint32_t i)
+	    {
+		    auto mode = static_cast<ExecutionMode>(i);
+		    uint32_t arg0 = compiler.get_execution_mode_argument(mode, 0);
+		    uint32_t arg1 = compiler.get_execution_mode_argument(mode, 1);
+		    uint32_t arg2 = compiler.get_execution_mode_argument(mode, 2);
+
+		    switch (static_cast<ExecutionMode>(i))
+		    {
+		    case ExecutionModeInvocations:
+			    fprintf(stderr, "  Invocations: %u\n", arg0);
+			    break;
+
+		    case ExecutionModeLocalSize:
+			    fprintf(stderr, "  LocalSize: (%u, %u, %u)\n", arg0, arg1, arg2);
+			    break;
+
+		    case ExecutionModeOutputVertices:
+			    fprintf(stderr, "  OutputVertices: %u\n", arg0);
+			    break;
 
 #define CHECK_MODE(m)                  \
 	case ExecutionMode##m:             \
 		fprintf(stderr, "  %s\n", #m); \
 		break
-			CHECK_MODE(SpacingEqual);
-			CHECK_MODE(SpacingFractionalEven);
-			CHECK_MODE(SpacingFractionalOdd);
-			CHECK_MODE(VertexOrderCw);
-			CHECK_MODE(VertexOrderCcw);
-			CHECK_MODE(PixelCenterInteger);
-			CHECK_MODE(OriginUpperLeft);
-			CHECK_MODE(OriginLowerLeft);
-			CHECK_MODE(EarlyFragmentTests);
-			CHECK_MODE(PointMode);
-			CHECK_MODE(Xfb);
-			CHECK_MODE(DepthReplacing);
-			CHECK_MODE(DepthGreater);
-			CHECK_MODE(DepthLess);
-			CHECK_MODE(DepthUnchanged);
-			CHECK_MODE(LocalSizeHint);
-			CHECK_MODE(InputPoints);
-			CHECK_MODE(InputLines);
-			CHECK_MODE(InputLinesAdjacency);
-			CHECK_MODE(Triangles);
-			CHECK_MODE(InputTrianglesAdjacency);
-			CHECK_MODE(Quads);
-			CHECK_MODE(Isolines);
-			CHECK_MODE(OutputPoints);
-			CHECK_MODE(OutputLineStrip);
-			CHECK_MODE(OutputTriangleStrip);
-			CHECK_MODE(VecTypeHint);
-			CHECK_MODE(ContractionOff);
-
-		default:
-			break;
-		}
-	});
+			    CHECK_MODE(SpacingEqual);
+			    CHECK_MODE(SpacingFractionalEven);
+			    CHECK_MODE(SpacingFractionalOdd);
+			    CHECK_MODE(VertexOrderCw);
+			    CHECK_MODE(VertexOrderCcw);
+			    CHECK_MODE(PixelCenterInteger);
+			    CHECK_MODE(OriginUpperLeft);
+			    CHECK_MODE(OriginLowerLeft);
+			    CHECK_MODE(EarlyFragmentTests);
+			    CHECK_MODE(PointMode);
+			    CHECK_MODE(Xfb);
+			    CHECK_MODE(DepthReplacing);
+			    CHECK_MODE(DepthGreater);
+			    CHECK_MODE(DepthLess);
+			    CHECK_MODE(DepthUnchanged);
+			    CHECK_MODE(LocalSizeHint);
+			    CHECK_MODE(InputPoints);
+			    CHECK_MODE(InputLines);
+			    CHECK_MODE(InputLinesAdjacency);
+			    CHECK_MODE(Triangles);
+			    CHECK_MODE(InputTrianglesAdjacency);
+			    CHECK_MODE(Quads);
+			    CHECK_MODE(Isolines);
+			    CHECK_MODE(OutputPoints);
+			    CHECK_MODE(OutputLineStrip);
+			    CHECK_MODE(OutputTriangleStrip);
+			    CHECK_MODE(VecTypeHint);
+			    CHECK_MODE(ContractionOff);
+
+		    default:
+			    break;
+		    }
+	    });
 	fprintf(stderr, "\n");
 
 	print_resources(compiler, "subpass inputs", res.subpass_inputs);
@@ -646,7 +670,7 @@ struct CLIArguments
 	bool msl_pad_fragment_output = false;
 	bool msl_domain_lower_left = false;
 	bool msl_argument_buffers = false;
-	uint32_t msl_argument_buffers_tier = 0;		// Tier 1
+	uint32_t msl_argument_buffers_tier = 0; // Tier 1
 	bool msl_texture_buffer_native = false;
 	bool msl_framebuffer_fetch = false;
 	bool msl_invariant_float_math = false;
@@ -751,6 +775,11 @@ struct CLIArguments
 	bool use_420pack_extension = true;
 	bool remove_unused = false;
 	bool combined_samplers_inherit_bindings = false;
+
+	bool opencl = false; // Selects the CompilerOpenCL backend in compile_iteration().
+	uint32_t opencl_version = 120; // Copied to CompilerOpenCL::Options::opencl_version; 120 presumably means OpenCL C 1.2 - TODO confirm encoding.
+	bool opencl_enable_fp64 = false; // Copied to CompilerOpenCL::Options::enable_fp64.
+	bool opencl_enable_64bit_atomics = false; // NOTE(review): not forwarded to the compiler options in the visible part of compile_iteration() - confirm it is consumed.
 };
 
 static void print_version()
@@ -770,6 +799,7 @@ static void print_help_backend()
 	        "\t[--vulkan-semantics] or [-V]:\n\t\tEmit Vulkan GLSL instead of plain GLSL. Makes use of Vulkan-only features to match SPIR-V.\n"
 	        "\t[--msl]:\n\t\tEmit Metal Shading Language (MSL).\n"
 	        "\t[--hlsl]:\n\t\tEmit HLSL.\n"
+            "\t[--opencl]:\n\t\tEmit OpenCL C (compute shaders only).\n"
 	        "\t[--reflect]:\n\t\tEmit JSON reflection.\n"
 	        "\t[--cpp]:\n\t\tDEPRECATED. Emits C++ code.\n"
 	);
@@ -1267,7 +1297,8 @@ static string compile_iteration(const CLIArguments &args, std::vector<uint32_t>
 		msl_opts.pad_fragment_output_components = args.msl_pad_fragment_output;
 		msl_opts.tess_domain_origin_lower_left = args.msl_domain_lower_left;
 		msl_opts.argument_buffers = args.msl_argument_buffers;
-		msl_opts.argument_buffers_tier = static_cast<CompilerMSL::Options::ArgumentBuffersTier>(args.msl_argument_buffers_tier);
+		msl_opts.argument_buffers_tier =
+		    static_cast<CompilerMSL::Options::ArgumentBuffersTier>(args.msl_argument_buffers_tier);
 		msl_opts.texture_buffer_native = args.msl_texture_buffer_native;
 		msl_opts.multiview = args.msl_multiview;
 		msl_opts.multiview_layered_rendering = args.msl_multiview_layered_rendering;
@@ -1323,6 +1354,15 @@ static string compile_iteration(const CLIArguments &args, std::vector<uint32_t>
 		if (args.msl_combined_sampler_suffix)
 			msl_comp->set_combined_sampler_suffix(args.msl_combined_sampler_suffix);
 	}
+	else if (args.opencl) // OpenCL backend requested.
+	{
+		compiler.reset(new CompilerOpenCL(std::move(spirv_parser.get_parsed_ir())));
+		auto *ocl_comp = static_cast<CompilerOpenCL *>(compiler.get());
+		CompilerOpenCL::Options ocl_opts = ocl_comp->get_opencl_options(); // Start from the compiler's current options; only the fields below are overridden.
+		ocl_opts.opencl_version = args.opencl_version;
+		ocl_opts.enable_fp64 = args.opencl_enable_fp64; // NOTE(review): args.opencl_enable_64bit_atomics has no counterpart here - confirm whether it should be forwarded.
+		ocl_comp->set_opencl_options(ocl_opts);
+	}
 	else if (args.hlsl)
 		compiler.reset(new CompilerHLSL(std::move(spirv_parser.get_parsed_ir())));
 	else
@@ -1335,7 +1375,8 @@ static string compile_iteration(const CLIArguments &args, std::vector<uint32_t>
 
 	if (!args.variable_type_remaps.empty())
 	{
-		auto remap_cb = [&](const SPIRType &, const string &name, string &out) -> void {
+		auto remap_cb = [&](const SPIRType &, const string &name, string &out) -> void
+		{
 			for (const VariableTypeRemap &remap : args.variable_type_remaps)
 				if (name == remap.variable_name)
 					out = remap.new_variable_type;
@@ -1599,9 +1640,8 @@ static string compile_iteration(const CLIArguments &args, std::vector<uint32_t>
 
 		for (auto &named_remap : args.hlsl_attr_remap_named)
 		{
-			auto itr = std::find_if(res.stage_inputs.begin(), res.stage_inputs.end(), [&](const Resource &input_res) {
-				return input_res.name == named_remap.name;
-			});
+			auto itr = std::find_if(res.stage_inputs.begin(), res.stage_inputs.end(),
+			                        [&](const Resource &input_res) { return input_res.name == named_remap.name; });
 
 			if (itr != res.stage_inputs.end())
 			{
@@ -1633,55 +1673,79 @@ static int main_inner(int argc, char *argv[])
 	CLIArguments args;
 	CLICallbacks cbs;
 
-	cbs.add("--help", [](CLIParser &parser) {
-		print_help();
-		parser.end();
-	});
-	cbs.add("--help-all", [](CLIParser &parser) {
-		print_help_all();
-		parser.end();
-	});
-	cbs.add("--help-backend", [](CLIParser &parser) {
-		print_help_backend();
-		parser.end();
-	});
-	cbs.add("--help-common", [](CLIParser &parser) {
-		print_help_common();
-		parser.end();
-	});
-	cbs.add("--help-glsl", [](CLIParser &parser) {
-		print_help_glsl();
-		parser.end();
-	});
-	cbs.add("--help-msl", [](CLIParser &parser) {
-		print_help_msl();
-		parser.end();
-	});
-	cbs.add("--help-hlsl", [](CLIParser &parser) {
-		print_help_hlsl();
-		parser.end();
-	});
-	cbs.add("--help-obscure", [](CLIParser &parser) {
-		print_help_obscure();
-		parser.end();
-	});
-	cbs.add("--revision", [](CLIParser &parser) {
-		print_version();
-		parser.end();
-	});
+	cbs.add("--help",
+	        [](CLIParser &parser)
+	        {
+		        print_help();
+		        parser.end();
+	        });
+	cbs.add("--help-all",
+	        [](CLIParser &parser)
+	        {
+		        print_help_all();
+		        parser.end();
+	        });
+	cbs.add("--help-backend",
+	        [](CLIParser &parser)
+	        {
+		        print_help_backend();
+		        parser.end();
+	        });
+	cbs.add("--help-common",
+	        [](CLIParser &parser)
+	        {
+		        print_help_common();
+		        parser.end();
+	        });
+	cbs.add("--help-glsl",
+	        [](CLIParser &parser)
+	        {
+		        print_help_glsl();
+		        parser.end();
+	        });
+	cbs.add("--help-msl",
+	        [](CLIParser &parser)
+	        {
+		        print_help_msl();
+		        parser.end();
+	        });
+	cbs.add("--help-hlsl",
+	        [](CLIParser &parser)
+	        {
+		        print_help_hlsl();
+		        parser.end();
+	        });
+	cbs.add("--help-obscure",
+	        [](CLIParser &parser)
+	        {
+		        print_help_obscure();
+		        parser.end();
+	        });
+	cbs.add("--revision",
+	        [](CLIParser &parser)
+	        {
+		        print_version();
+		        parser.end();
+	        });
 	cbs.add("--output", [&args](CLIParser &parser) { args.output = parser.next_string(); });
-	cbs.add("--es", [&args](CLIParser &) {
-		args.es = true;
-		args.set_es = true;
-	});
-	cbs.add("--no-es", [&args](CLIParser &) {
-		args.es = false;
-		args.set_es = true;
-	});
-	cbs.add("--version", [&args](CLIParser &parser) {
-		args.version = parser.next_uint();
-		args.set_version = true;
-	});
+	cbs.add("--es",
+	        [&args](CLIParser &)
+	        {
+		        args.es = true;
+		        args.set_es = true;
+	        });
+	cbs.add("--no-es",
+	        [&args](CLIParser &)
+	        {
+		        args.es = false;
+		        args.set_es = true;
+	        });
+	cbs.add("--version",
+	        [&args](CLIParser &parser)
+	        {
+		        args.version = parser.next_uint();
+		        args.set_version = true;
+	        });
 	cbs.add("--dump-resources", [&args](CLIParser &) { args.dump_resources = true; });
 	cbs.add("--force-temporary", [&args](CLIParser &) { args.force_temporary = true; });
 	cbs.add("--flatten-ubo", [&args](CLIParser &) { args.flatten_ubo = true; });
@@ -1695,15 +1759,17 @@ static int main_inner(int argc, char *argv[])
 	cbs.add("--glsl-emit-push-constant-as-ubo", [&args](CLIParser &) { args.glsl_emit_push_constant_as_ubo = true; });
 	cbs.add("--glsl-emit-ubo-as-plain-uniforms", [&args](CLIParser &) { args.glsl_emit_ubo_as_plain_uniforms = true; });
 	cbs.add("--glsl-force-flattened-io-blocks", [&args](CLIParser &) { args.glsl_force_flattened_io_blocks = true; });
-	cbs.add("--glsl-ovr-multiview-view-count", [&args](CLIParser &parser) { args.glsl_ovr_multiview_view_count = parser.next_uint(); });
-	cbs.add("--glsl-remap-ext-framebuffer-fetch", [&args](CLIParser &parser) {
-		uint32_t input_index = parser.next_uint();
-		uint32_t color_attachment = parser.next_uint();
-		args.glsl_ext_framebuffer_fetch.push_back({ input_index, color_attachment });
-	});
-	cbs.add("--glsl-ext-framebuffer-fetch-noncoherent", [&args](CLIParser &) {
-		args.glsl_ext_framebuffer_fetch_noncoherent = true;
-	});
+	cbs.add("--glsl-ovr-multiview-view-count",
+	        [&args](CLIParser &parser) { args.glsl_ovr_multiview_view_count = parser.next_uint(); });
+	cbs.add("--glsl-remap-ext-framebuffer-fetch",
+	        [&args](CLIParser &parser)
+	        {
+		        uint32_t input_index = parser.next_uint();
+		        uint32_t color_attachment = parser.next_uint();
+		        args.glsl_ext_framebuffer_fetch.push_back({ input_index, color_attachment });
+	        });
+	cbs.add("--glsl-ext-framebuffer-fetch-noncoherent",
+	        [&args](CLIParser &) { args.glsl_ext_framebuffer_fetch_noncoherent = true; });
 	cbs.add("--vulkan-glsl-disable-ext-samplerless-texture-functions",
 	        [&args](CLIParser &) { args.vulkan_glsl_disable_ext_samplerless_texture_functions = true; });
 	cbs.add("--disable-storage-image-qualifier-deduction",
@@ -1715,14 +1781,15 @@ static int main_inner(int argc, char *argv[])
 	cbs.add("--hlsl-enable-compat", [&args](CLIParser &) { args.hlsl_compat = true; });
 	cbs.add("--hlsl-support-nonzero-basevertex-baseinstance",
 	        [&args](CLIParser &) { args.hlsl_support_nonzero_base = true; });
-	cbs.add("--hlsl-basevertex-baseinstance-binding", [&args](CLIParser &parser) {
-		args.hlsl_base_vertex_index_explicit_binding = true;
-		args.hlsl_base_vertex_index_register_index = parser.next_uint();
-		args.hlsl_base_vertex_index_register_space = parser.next_uint();
-	});
-	cbs.add("--hlsl-auto-binding", [&args](CLIParser &parser) {
-		args.hlsl_binding_flags |= hlsl_resource_type_to_flag(parser.next_string());
-	});
+	cbs.add("--hlsl-basevertex-baseinstance-binding",
+	        [&args](CLIParser &parser)
+	        {
+		        args.hlsl_base_vertex_index_explicit_binding = true;
+		        args.hlsl_base_vertex_index_register_index = parser.next_uint();
+		        args.hlsl_base_vertex_index_register_space = parser.next_uint();
+	        });
+	cbs.add("--hlsl-auto-binding", [&args](CLIParser &parser)
+	        { args.hlsl_binding_flags |= hlsl_resource_type_to_flag(parser.next_string()); });
 	cbs.add("--hlsl-force-storage-buffer-as-uav",
 	        [&args](CLIParser &) { args.hlsl_force_storage_buffer_as_uav = true; });
 	cbs.add("--hlsl-nonwritable-uav-texture-as-srv",
@@ -1730,7 +1797,8 @@ static int main_inner(int argc, char *argv[])
 	cbs.add("--hlsl-enable-16bit-types", [&args](CLIParser &) { args.hlsl_enable_16bit_types = true; });
 	cbs.add("--hlsl-flatten-matrix-vertex-input-semantics",
 	        [&args](CLIParser &) { args.hlsl_flatten_matrix_vertex_input_semantics = true; });
-	cbs.add("--hlsl-preserve-structured-buffers", [&args](CLIParser &) { args.hlsl_preserve_structured_buffers = true; });
+	cbs.add("--hlsl-preserve-structured-buffers",
+	        [&args](CLIParser &) { args.hlsl_preserve_structured_buffers = true; });
 	cbs.add("--hlsl-user-semantic", [&args](CLIParser &) { args.hlsl_user_semantic = true; });
 	cbs.add("--vulkan-semantics", [&args](CLIParser &) { args.vulkan_semantics = true; });
 	cbs.add("-V", [&args](CLIParser &) { args.vulkan_semantics = true; });
@@ -1758,23 +1826,27 @@ static int main_inner(int argc, char *argv[])
 	cbs.add("--msl-view-index-from-device-index",
 	        [&args](CLIParser &) { args.msl_view_index_from_device_index = true; });
 	cbs.add("--msl-dispatch-base", [&args](CLIParser &) { args.msl_dispatch_base = true; });
-	cbs.add("--msl-dynamic-buffer", [&args](CLIParser &parser) {
-		args.msl_argument_buffers = true;
-		// Make sure next_uint() is called in-order.
-		uint32_t desc_set = parser.next_uint();
-		uint32_t binding = parser.next_uint();
-		args.msl_dynamic_buffers.push_back(make_pair(desc_set, binding));
-	});
+	cbs.add("--msl-dynamic-buffer",
+	        [&args](CLIParser &parser)
+	        {
+		        args.msl_argument_buffers = true;
+		        // Make sure next_uint() is called in-order.
+		        uint32_t desc_set = parser.next_uint();
+		        uint32_t binding = parser.next_uint();
+		        args.msl_dynamic_buffers.push_back(make_pair(desc_set, binding));
+	        });
 	cbs.add("--msl-decoration-binding", [&args](CLIParser &) { args.msl_decoration_binding = true; });
 	cbs.add("--msl-force-active-argument-buffer-resources",
 	        [&args](CLIParser &) { args.msl_force_active_argument_buffer_resources = true; });
-	cbs.add("--msl-inline-uniform-block", [&args](CLIParser &parser) {
-		args.msl_argument_buffers = true;
-		// Make sure next_uint() is called in-order.
-		uint32_t desc_set = parser.next_uint();
-		uint32_t binding = parser.next_uint();
-		args.msl_inline_uniform_blocks.push_back(make_pair(desc_set, binding));
-	});
+	cbs.add("--msl-inline-uniform-block",
+	        [&args](CLIParser &parser)
+	        {
+		        args.msl_argument_buffers = true;
+		        // Make sure next_uint() is called in-order.
+		        uint32_t desc_set = parser.next_uint();
+		        uint32_t binding = parser.next_uint();
+		        args.msl_inline_uniform_blocks.push_back(make_pair(desc_set, binding));
+	        });
 	cbs.add("--msl-force-native-arrays", [&args](CLIParser &) { args.msl_force_native_arrays = true; });
 	cbs.add("--msl-disable-frag-depth-builtin", [&args](CLIParser &) { args.msl_enable_frag_depth_builtin = false; });
 	cbs.add("--msl-disable-frag-stencil-ref-builtin",
@@ -1783,92 +1855,100 @@ static int main_inner(int argc, char *argv[])
 	        [&args](CLIParser &parser) { args.msl_enable_frag_output_mask = parser.next_hex_uint(); });
 	cbs.add("--msl-no-clip-distance-user-varying",
 	        [&args](CLIParser &) { args.msl_enable_clip_distance_user_varying = false; });
-	cbs.add("--msl-add-shader-input", [&args](CLIParser &parser) {
-		MSLShaderInterfaceVariable input;
-		// Make sure next_uint() is called in-order.
-		input.location = parser.next_uint();
-		const char *format = parser.next_value_string("other");
-		if (strcmp(format, "any32") == 0)
-			input.format = MSL_SHADER_VARIABLE_FORMAT_ANY32;
-		else if (strcmp(format, "any16") == 0)
-			input.format = MSL_SHADER_VARIABLE_FORMAT_ANY16;
-		else if (strcmp(format, "u16") == 0)
-			input.format = MSL_SHADER_VARIABLE_FORMAT_UINT16;
-		else if (strcmp(format, "u8") == 0)
-			input.format = MSL_SHADER_VARIABLE_FORMAT_UINT8;
-		else
-			input.format = MSL_SHADER_VARIABLE_FORMAT_OTHER;
-		input.vecsize = parser.next_uint();
-		const char *rate = parser.next_value_string("vertex");
-		if (strcmp(rate, "primitive") == 0)
-			input.rate = MSL_SHADER_VARIABLE_RATE_PER_PRIMITIVE;
-		else if (strcmp(rate, "patch") == 0)
-			input.rate = MSL_SHADER_VARIABLE_RATE_PER_PATCH;
-		else
-			input.rate = MSL_SHADER_VARIABLE_RATE_PER_VERTEX;
-		args.msl_shader_inputs.push_back(input);
-	});
-	cbs.add("--msl-add-shader-output", [&args](CLIParser &parser) {
-		MSLShaderInterfaceVariable output;
-		// Make sure next_uint() is called in-order.
-		output.location = parser.next_uint();
-		const char *format = parser.next_value_string("other");
-		if (strcmp(format, "any32") == 0)
-			output.format = MSL_SHADER_VARIABLE_FORMAT_ANY32;
-		else if (strcmp(format, "any16") == 0)
-			output.format = MSL_SHADER_VARIABLE_FORMAT_ANY16;
-		else if (strcmp(format, "u16") == 0)
-			output.format = MSL_SHADER_VARIABLE_FORMAT_UINT16;
-		else if (strcmp(format, "u8") == 0)
-			output.format = MSL_SHADER_VARIABLE_FORMAT_UINT8;
-		else
-			output.format = MSL_SHADER_VARIABLE_FORMAT_OTHER;
-		output.vecsize = parser.next_uint();
-		const char *rate = parser.next_value_string("vertex");
-		if (strcmp(rate, "primitive") == 0)
-			output.rate = MSL_SHADER_VARIABLE_RATE_PER_PRIMITIVE;
-		else if (strcmp(rate, "patch") == 0)
-			output.rate = MSL_SHADER_VARIABLE_RATE_PER_PATCH;
-		else
-			output.rate = MSL_SHADER_VARIABLE_RATE_PER_VERTEX;
-		args.msl_shader_outputs.push_back(output);
-	});
-	cbs.add("--msl-shader-input", [&args](CLIParser &parser) {
-		MSLShaderInterfaceVariable input;
-		// Make sure next_uint() is called in-order.
-		input.location = parser.next_uint();
-		const char *format = parser.next_value_string("other");
-		if (strcmp(format, "any32") == 0)
-			input.format = MSL_SHADER_VARIABLE_FORMAT_ANY32;
-		else if (strcmp(format, "any16") == 0)
-			input.format = MSL_SHADER_VARIABLE_FORMAT_ANY16;
-		else if (strcmp(format, "u16") == 0)
-			input.format = MSL_SHADER_VARIABLE_FORMAT_UINT16;
-		else if (strcmp(format, "u8") == 0)
-			input.format = MSL_SHADER_VARIABLE_FORMAT_UINT8;
-		else
-			input.format = MSL_SHADER_VARIABLE_FORMAT_OTHER;
-		input.vecsize = parser.next_uint();
-		args.msl_shader_inputs.push_back(input);
-	});
-	cbs.add("--msl-shader-output", [&args](CLIParser &parser) {
-		MSLShaderInterfaceVariable output;
-		// Make sure next_uint() is called in-order.
-		output.location = parser.next_uint();
-		const char *format = parser.next_value_string("other");
-		if (strcmp(format, "any32") == 0)
-			output.format = MSL_SHADER_VARIABLE_FORMAT_ANY32;
-		else if (strcmp(format, "any16") == 0)
-			output.format = MSL_SHADER_VARIABLE_FORMAT_ANY16;
-		else if (strcmp(format, "u16") == 0)
-			output.format = MSL_SHADER_VARIABLE_FORMAT_UINT16;
-		else if (strcmp(format, "u8") == 0)
-			output.format = MSL_SHADER_VARIABLE_FORMAT_UINT8;
-		else
-			output.format = MSL_SHADER_VARIABLE_FORMAT_OTHER;
-		output.vecsize = parser.next_uint();
-		args.msl_shader_outputs.push_back(output);
-	});
+	cbs.add("--msl-add-shader-input",
+	        [&args](CLIParser &parser)
+	        {
+		        MSLShaderInterfaceVariable input;
+		        // Make sure next_uint() is called in-order.
+		        input.location = parser.next_uint();
+		        const char *format = parser.next_value_string("other");
+		        if (strcmp(format, "any32") == 0)
+			        input.format = MSL_SHADER_VARIABLE_FORMAT_ANY32;
+		        else if (strcmp(format, "any16") == 0)
+			        input.format = MSL_SHADER_VARIABLE_FORMAT_ANY16;
+		        else if (strcmp(format, "u16") == 0)
+			        input.format = MSL_SHADER_VARIABLE_FORMAT_UINT16;
+		        else if (strcmp(format, "u8") == 0)
+			        input.format = MSL_SHADER_VARIABLE_FORMAT_UINT8;
+		        else
+			        input.format = MSL_SHADER_VARIABLE_FORMAT_OTHER;
+		        input.vecsize = parser.next_uint();
+		        const char *rate = parser.next_value_string("vertex");
+		        if (strcmp(rate, "primitive") == 0)
+			        input.rate = MSL_SHADER_VARIABLE_RATE_PER_PRIMITIVE;
+		        else if (strcmp(rate, "patch") == 0)
+			        input.rate = MSL_SHADER_VARIABLE_RATE_PER_PATCH;
+		        else
+			        input.rate = MSL_SHADER_VARIABLE_RATE_PER_VERTEX;
+		        args.msl_shader_inputs.push_back(input);
+	        });
+	cbs.add("--msl-add-shader-output",
+	        [&args](CLIParser &parser)
+	        {
+		        MSLShaderInterfaceVariable output;
+		        // Make sure next_uint() is called in-order.
+		        output.location = parser.next_uint();
+		        const char *format = parser.next_value_string("other");
+		        if (strcmp(format, "any32") == 0)
+			        output.format = MSL_SHADER_VARIABLE_FORMAT_ANY32;
+		        else if (strcmp(format, "any16") == 0)
+			        output.format = MSL_SHADER_VARIABLE_FORMAT_ANY16;
+		        else if (strcmp(format, "u16") == 0)
+			        output.format = MSL_SHADER_VARIABLE_FORMAT_UINT16;
+		        else if (strcmp(format, "u8") == 0)
+			        output.format = MSL_SHADER_VARIABLE_FORMAT_UINT8;
+		        else
+			        output.format = MSL_SHADER_VARIABLE_FORMAT_OTHER;
+		        output.vecsize = parser.next_uint();
+		        const char *rate = parser.next_value_string("vertex");
+		        if (strcmp(rate, "primitive") == 0)
+			        output.rate = MSL_SHADER_VARIABLE_RATE_PER_PRIMITIVE;
+		        else if (strcmp(rate, "patch") == 0)
+			        output.rate = MSL_SHADER_VARIABLE_RATE_PER_PATCH;
+		        else
+			        output.rate = MSL_SHADER_VARIABLE_RATE_PER_VERTEX;
+		        args.msl_shader_outputs.push_back(output);
+	        });
+	cbs.add("--msl-shader-input",
+	        [&args](CLIParser &parser)
+	        {
+		        MSLShaderInterfaceVariable input;
+		        // Make sure next_uint() is called in-order.
+		        input.location = parser.next_uint();
+		        const char *format = parser.next_value_string("other");
+		        if (strcmp(format, "any32") == 0)
+			        input.format = MSL_SHADER_VARIABLE_FORMAT_ANY32;
+		        else if (strcmp(format, "any16") == 0)
+			        input.format = MSL_SHADER_VARIABLE_FORMAT_ANY16;
+		        else if (strcmp(format, "u16") == 0)
+			        input.format = MSL_SHADER_VARIABLE_FORMAT_UINT16;
+		        else if (strcmp(format, "u8") == 0)
+			        input.format = MSL_SHADER_VARIABLE_FORMAT_UINT8;
+		        else
+			        input.format = MSL_SHADER_VARIABLE_FORMAT_OTHER;
+		        input.vecsize = parser.next_uint();
+		        args.msl_shader_inputs.push_back(input);
+	        });
+	cbs.add("--msl-shader-output",
+	        [&args](CLIParser &parser)
+	        {
+		        MSLShaderInterfaceVariable output;
+		        // Make sure next_uint() is called in-order.
+		        output.location = parser.next_uint();
+		        const char *format = parser.next_value_string("other");
+		        if (strcmp(format, "any32") == 0)
+			        output.format = MSL_SHADER_VARIABLE_FORMAT_ANY32;
+		        else if (strcmp(format, "any16") == 0)
+			        output.format = MSL_SHADER_VARIABLE_FORMAT_ANY16;
+		        else if (strcmp(format, "u16") == 0)
+			        output.format = MSL_SHADER_VARIABLE_FORMAT_UINT16;
+		        else if (strcmp(format, "u8") == 0)
+			        output.format = MSL_SHADER_VARIABLE_FORMAT_UINT8;
+		        else
+			        output.format = MSL_SHADER_VARIABLE_FORMAT_OTHER;
+		        output.vecsize = parser.next_uint();
+		        args.msl_shader_outputs.push_back(output);
+	        });
 	cbs.add("--msl-raw-buffer-tese-input", [&args](CLIParser &) { args.msl_raw_buffer_tese_input = true; });
 	cbs.add("--msl-multi-patch-workgroup", [&args](CLIParser &) { args.msl_multi_patch_workgroup = true; });
 	cbs.add("--msl-vertex-for-tessellation", [&args](CLIParser &) { args.msl_vertex_for_tessellation = true; });
@@ -1888,92 +1968,118 @@ static int main_inner(int argc, char *argv[])
 	cbs.add("--msl-no-manual-helper-invocation-updates",
 	        [&args](CLIParser &) { args.msl_manual_helper_invocation_updates = false; });
 	cbs.add("--msl-check-discarded-frag-stores", [&args](CLIParser &) { args.msl_check_discarded_frag_stores = true; });
-	cbs.add("--msl-force-frag-with-side-effects-execution", [&args](CLIParser &) { args.msl_force_fragment_with_side_effects_execution = true; });
+	cbs.add("--msl-force-frag-with-side-effects-execution",
+	        [&args](CLIParser &) { args.msl_force_fragment_with_side_effects_execution = true; });
 	cbs.add("--msl-sample-dref-lod-array-as-grad",
 	        [&args](CLIParser &) { args.msl_sample_dref_lod_array_as_grad = true; });
 	cbs.add("--msl-no-readwrite-texture-fences", [&args](CLIParser &) { args.msl_readwrite_texture_fences = false; });
 	cbs.add("--msl-agx-manual-cube-grad-fixup", [&args](CLIParser &) { args.msl_agx_manual_cube_grad_fixup = true; });
-	cbs.add("--msl-combined-sampler-suffix", [&args](CLIParser &parser) {
-		args.msl_combined_sampler_suffix = parser.next_string();
-	});
+	cbs.add("--msl-combined-sampler-suffix",
+	        [&args](CLIParser &parser) { args.msl_combined_sampler_suffix = parser.next_string(); });
 	cbs.add("--msl-runtime-array-rich-descriptor",
 	        [&args](CLIParser &) { args.msl_runtime_array_rich_descriptor = true; });
-	cbs.add("--msl-replace-recursive-inputs",
-	        [&args](CLIParser &) { args.msl_replace_recursive_inputs = true; });
-	cbs.add("--msl-input-attachment-is-ds-attachment", [&args](CLIParser &) { args.msl_input_attachment_is_ds_attachment = true; });
+	cbs.add("--msl-replace-recursive-inputs", [&args](CLIParser &) { args.msl_replace_recursive_inputs = true; });
+	cbs.add("--msl-input-attachment-is-ds-attachment",
+	        [&args](CLIParser &) { args.msl_input_attachment_is_ds_attachment = true; });
 	cbs.add("--msl-disable-rasterization", [&args](CLIParser &) { args.msl_disable_rasterization = true; });
 	cbs.add("--msl-auto-disable-rasterization", [&args](CLIParser &) { args.msl_auto_disable_rasterization = true; });
-	cbs.add("--msl-default-point-size", [&args](CLIParser &parser) {
-		args.msl_enable_point_size_default = true;
-		args.msl_default_point_size = static_cast<float>(parser.next_double());
-	});
+	cbs.add("--msl-default-point-size",
+	        [&args](CLIParser &parser)
+	        {
+		        args.msl_enable_point_size_default = true;
+		        args.msl_default_point_size = static_cast<float>(parser.next_double());
+	        });
+	cbs.add("--opencl", [&args](CLIParser &) { args.opencl = true; });
+	cbs.add("--opencl-version", [&args](CLIParser &parser) { args.opencl_version = parser.next_uint(); });
+	cbs.add("--opencl-fp64", [&args](CLIParser &) { args.opencl_enable_fp64 = true; });
+	cbs.add("--opencl-64bit-atomics", [&args](CLIParser &) { args.opencl_enable_64bit_atomics = true; });
 	cbs.add("--extension", [&args](CLIParser &parser) { args.extensions.push_back(parser.next_string()); });
-	cbs.add("--rename-entry-point", [&args](CLIParser &parser) {
-		auto old_name = parser.next_string();
-		auto new_name = parser.next_string();
-		auto model = stage_to_execution_model(parser.next_string());
-		args.entry_point_rename.push_back({ old_name, new_name, std::move(model) });
-	});
+	cbs.add("--rename-entry-point",
+	        [&args](CLIParser &parser)
+	        {
+		        auto old_name = parser.next_string();
+		        auto new_name = parser.next_string();
+		        auto model = stage_to_execution_model(parser.next_string());
+		        args.entry_point_rename.push_back({ old_name, new_name, std::move(model) });
+	        });
 	cbs.add("--entry", [&args](CLIParser &parser) { args.entry = parser.next_string(); });
 	cbs.add("--stage", [&args](CLIParser &parser) { args.entry_stage = parser.next_string(); });
 	cbs.add("--separate-shader-objects", [&args](CLIParser &) { args.sso = true; });
-	cbs.add("--set-hlsl-vertex-input-semantic", [&args](CLIParser &parser) {
-		HLSLVertexAttributeRemap remap;
-		remap.location = parser.next_uint();
-		remap.semantic = parser.next_string();
-		args.hlsl_attr_remap.push_back(std::move(remap));
-	});
-	cbs.add("--set-hlsl-named-vertex-input-semantic", [&args](CLIParser &parser) {
-		HLSLVertexAttributeRemapNamed remap;
-		remap.name = parser.next_string();
-		remap.semantic = parser.next_string();
-		args.hlsl_attr_remap_named.push_back(std::move(remap));
-	});
-
-	cbs.add("--remap", [&args](CLIParser &parser) {
-		string src = parser.next_string();
-		string dst = parser.next_string();
-		uint32_t components = parser.next_uint();
-		args.remaps.push_back({ std::move(src), std::move(dst), components });
-	});
-
-	cbs.add("--remap-variable-type", [&args](CLIParser &parser) {
-		string var_name = parser.next_string();
-		string new_type = parser.next_string();
-		args.variable_type_remaps.push_back({ std::move(var_name), std::move(new_type) });
-	});
-
-	cbs.add("--rename-interface-variable", [&args](CLIParser &parser) {
-		StorageClass cls = StorageClassMax;
-		string clsStr = parser.next_string();
-		if (clsStr == "in")
-			cls = StorageClassInput;
-		else if (clsStr == "out")
-			cls = StorageClassOutput;
-
-		uint32_t loc = parser.next_uint();
-		string var_name = parser.next_string();
-		args.interface_variable_renames.push_back({ cls, loc, std::move(var_name) });
-	});
-
-	cbs.add("--pls-in", [&args](CLIParser &parser) {
-		auto fmt = pls_format(parser.next_string());
-		auto name = parser.next_string();
-		args.pls_in.push_back({ std::move(fmt), std::move(name) });
-	});
-	cbs.add("--pls-out", [&args](CLIParser &parser) {
-		auto fmt = pls_format(parser.next_string());
-		auto name = parser.next_string();
-		args.pls_out.push_back({ std::move(fmt), std::move(name) });
-	});
-	cbs.add("--shader-model", [&args](CLIParser &parser) {
-		args.shader_model = parser.next_uint();
-		args.set_shader_model = true;
-	});
-	cbs.add("--msl-version", [&args](CLIParser &parser) {
-		args.msl_version = parser.next_uint();
-		args.set_msl_version = true;
-	});
+	cbs.add("--set-hlsl-vertex-input-semantic",
+	        [&args](CLIParser &parser)
+	        {
+		        HLSLVertexAttributeRemap remap;
+		        remap.location = parser.next_uint();
+		        remap.semantic = parser.next_string();
+		        args.hlsl_attr_remap.push_back(std::move(remap));
+	        });
+	cbs.add("--set-hlsl-named-vertex-input-semantic",
+	        [&args](CLIParser &parser)
+	        {
+		        HLSLVertexAttributeRemapNamed remap;
+		        remap.name = parser.next_string();
+		        remap.semantic = parser.next_string();
+		        args.hlsl_attr_remap_named.push_back(std::move(remap));
+	        });
+
+	cbs.add("--remap",
+	        [&args](CLIParser &parser)
+	        {
+		        string src = parser.next_string();
+		        string dst = parser.next_string();
+		        uint32_t components = parser.next_uint();
+		        args.remaps.push_back({ std::move(src), std::move(dst), components });
+	        });
+
+	cbs.add("--remap-variable-type",
+	        [&args](CLIParser &parser)
+	        {
+		        string var_name = parser.next_string();
+		        string new_type = parser.next_string();
+		        args.variable_type_remaps.push_back({ std::move(var_name), std::move(new_type) });
+	        });
+
+	cbs.add("--rename-interface-variable",
+	        [&args](CLIParser &parser)
+	        {
+		        StorageClass cls = StorageClassMax;
+		        string clsStr = parser.next_string();
+		        if (clsStr == "in")
+			        cls = StorageClassInput;
+		        else if (clsStr == "out")
+			        cls = StorageClassOutput;
+
+		        uint32_t loc = parser.next_uint();
+		        string var_name = parser.next_string();
+		        args.interface_variable_renames.push_back({ cls, loc, std::move(var_name) });
+	        });
+
+	cbs.add("--pls-in",
+	        [&args](CLIParser &parser)
+	        {
+		        auto fmt = pls_format(parser.next_string());
+		        auto name = parser.next_string();
+		        args.pls_in.push_back({ std::move(fmt), std::move(name) });
+	        });
+	cbs.add("--pls-out",
+	        [&args](CLIParser &parser)
+	        {
+		        auto fmt = pls_format(parser.next_string());
+		        auto name = parser.next_string();
+		        args.pls_out.push_back({ std::move(fmt), std::move(name) });
+	        });
+	cbs.add("--shader-model",
+	        [&args](CLIParser &parser)
+	        {
+		        args.shader_model = parser.next_uint();
+		        args.set_shader_model = true;
+	        });
+	cbs.add("--msl-version",
+	        [&args](CLIParser &parser)
+	        {
+		        args.msl_version = parser.next_uint();
+		        args.set_msl_version = true;
+	        });
 
 	cbs.add("--remove-unused-variables", [&args](CLIParser &) { args.remove_unused = true; });
 	cbs.add("--combined-samplers-inherit-bindings",
@@ -1982,34 +2088,37 @@ static int main_inner(int argc, char *argv[])
 	cbs.add("--no-support-nonzero-baseinstance", [&](CLIParser &) { args.support_nonzero_baseinstance = false; });
 	cbs.add("--emit-line-directives", [&args](CLIParser &) { args.emit_line_directives = true; });
 
-	cbs.add("--mask-stage-output-location", [&](CLIParser &parser) {
-		uint32_t location = parser.next_uint();
-		uint32_t component = parser.next_uint();
-		args.masked_stage_outputs.push_back({ location, component });
-	});
-
-	cbs.add("--mask-stage-output-builtin", [&](CLIParser &parser) {
-		BuiltIn masked_builtin = BuiltInMax;
-		std::string builtin = parser.next_string();
-		if (builtin == "Position")
-			masked_builtin = BuiltInPosition;
-		else if (builtin == "PointSize")
-			masked_builtin = BuiltInPointSize;
-		else if (builtin == "CullDistance")
-			masked_builtin = BuiltInCullDistance;
-		else if (builtin == "ClipDistance")
-			masked_builtin = BuiltInClipDistance;
-		else
-		{
-			print_help();
-			exit(EXIT_FAILURE);
-		}
-		args.masked_stage_builtins.push_back(masked_builtin);
-	});
-
-	cbs.add("--force-recompile-max-debug-iterations", [&](CLIParser &parser) {
-		args.force_recompile_max_debug_iterations = parser.next_uint();
-	});
+	cbs.add("--mask-stage-output-location",
+	        [&](CLIParser &parser)
+	        {
+		        uint32_t location = parser.next_uint();
+		        uint32_t component = parser.next_uint();
+		        args.masked_stage_outputs.push_back({ location, component });
+	        });
+
+	cbs.add("--mask-stage-output-builtin",
+	        [&](CLIParser &parser)
+	        {
+		        BuiltIn masked_builtin = BuiltInMax;
+		        std::string builtin = parser.next_string();
+		        if (builtin == "Position")
+			        masked_builtin = BuiltInPosition;
+		        else if (builtin == "PointSize")
+			        masked_builtin = BuiltInPointSize;
+		        else if (builtin == "CullDistance")
+			        masked_builtin = BuiltInCullDistance;
+		        else if (builtin == "ClipDistance")
+			        masked_builtin = BuiltInClipDistance;
+		        else
+		        {
+			        print_help();
+			        exit(EXIT_FAILURE);
+		        }
+		        args.masked_stage_builtins.push_back(masked_builtin);
+	        });
+
+	cbs.add("--force-recompile-max-debug-iterations",
+	        [&](CLIParser &parser) { args.force_recompile_max_debug_iterations = parser.next_uint(); });
 
 	cbs.add("--relax-nan-checks", [&](CLIParser &) { args.relax_nan_checks = true; });
 
diff --git a/spirv_cross_c.cpp b/spirv_cross_c.cpp
index 1604385e5..4494700ed 100644
--- a/spirv_cross_c.cpp
+++ b/spirv_cross_c.cpp
@@ -40,6 +40,9 @@
 #if SPIRV_CROSS_C_API_REFLECT
 #include "spirv_reflect.hpp"
 #endif
+#if SPIRV_CROSS_C_API_OPENCL
+#include "spirv_opencl.hpp"
+#endif
 
 #ifdef HAVE_SPIRV_CROSS_GIT_VERSION
 #include "gitversion.h"
@@ -339,6 +342,15 @@ spvc_result spvc_context_create_compiler(spvc_context context, spvc_backend back
 			break;
 #endif
 
+#if SPIRV_CROSS_C_API_OPENCL
+		case SPVC_BACKEND_OPENCL:
+			if (mode == SPVC_CAPTURE_MODE_TAKE_OWNERSHIP)
+				comp->compiler.reset(new CompilerOpenCL(std::move(parsed_ir->parsed)));
+			else if (mode == SPVC_CAPTURE_MODE_COPY)
+				comp->compiler.reset(new CompilerOpenCL(parsed_ir->parsed));
+			break;
+#endif
+
 		default:
 			context->report_error("Invalid backend.");
 			return SPVC_ERROR_INVALID_ARGUMENT;
diff --git a/spirv_cross_c.h b/spirv_cross_c.h
index 30f1c459c..76d2b8155 100644
--- a/spirv_cross_c.h
+++ b/spirv_cross_c.h
@@ -24,8 +24,8 @@
 #ifndef SPIRV_CROSS_C_API_H
 #define SPIRV_CROSS_C_API_H
 
-#include <stddef.h>
 #include "spirv.h"
+#include <stddef.h>
 
 /*
  * C89-compatible wrapper for SPIRV-Cross' API.
@@ -34,7 +34,8 @@
  */
 
 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif
 
 /* Bumped if ABI or API breaks backwards compatibility. */
@@ -59,212 +60,213 @@ extern "C" {
 #endif
 #endif
 
-/*
+	/*
  * Gets the SPVC_C_API_VERSION_* used to build this library.
  * Can be used to check for ABI mismatch if so-versioning did not catch it.
  */
-SPVC_PUBLIC_API void spvc_get_version(unsigned *major, unsigned *minor, unsigned *patch);
-
-/* Gets a human readable version string to identify which commit a particular binary was created from. */
-SPVC_PUBLIC_API const char *spvc_get_commit_revision_and_timestamp(void);
-
-/* These types are opaque to the user. */
-typedef struct spvc_context_s *spvc_context;
-typedef struct spvc_parsed_ir_s *spvc_parsed_ir;
-typedef struct spvc_compiler_s *spvc_compiler;
-typedef struct spvc_compiler_options_s *spvc_compiler_options;
-typedef struct spvc_resources_s *spvc_resources;
-struct spvc_type_s;
-typedef const struct spvc_type_s *spvc_type;
-typedef struct spvc_constant_s *spvc_constant;
-struct spvc_set_s;
-typedef const struct spvc_set_s *spvc_set;
+	SPVC_PUBLIC_API void spvc_get_version(unsigned *major, unsigned *minor, unsigned *patch);
+
+	/* Gets a human readable version string to identify which commit a particular binary was created from. */
+	SPVC_PUBLIC_API const char *spvc_get_commit_revision_and_timestamp(void);
+
+	/* These types are opaque to the user. */
+	typedef struct spvc_context_s *spvc_context;
+	typedef struct spvc_parsed_ir_s *spvc_parsed_ir;
+	typedef struct spvc_compiler_s *spvc_compiler;
+	typedef struct spvc_compiler_options_s *spvc_compiler_options;
+	typedef struct spvc_resources_s *spvc_resources;
+	struct spvc_type_s;
+	typedef const struct spvc_type_s *spvc_type;
+	typedef struct spvc_constant_s *spvc_constant;
+	struct spvc_set_s;
+	typedef const struct spvc_set_s *spvc_set;
 
-/*
+	/*
  * Shallow typedefs. All SPIR-V IDs are plain 32-bit numbers, but this helps communicate which data is used.
  * Maps to a SPIRType.
  */
-typedef SpvId spvc_type_id;
-/* Maps to a SPIRVariable. */
-typedef SpvId spvc_variable_id;
-/* Maps to a SPIRConstant. */
-typedef SpvId spvc_constant_id;
-
-/* See C++ API. */
-typedef struct spvc_reflected_resource
-{
-	spvc_variable_id id;
-	spvc_type_id base_type_id;
-	spvc_type_id type_id;
-	const char *name;
-} spvc_reflected_resource;
-
-typedef struct spvc_reflected_builtin_resource
-{
-	SpvBuiltIn builtin;
-	spvc_type_id value_type_id;
-	spvc_reflected_resource resource;
-} spvc_reflected_builtin_resource;
-
-/* See C++ API. */
-typedef struct spvc_entry_point
-{
-	SpvExecutionModel execution_model;
-	const char *name;
-} spvc_entry_point;
-
-/* See C++ API. */
-typedef struct spvc_combined_image_sampler
-{
-	spvc_variable_id combined_id;
-	spvc_variable_id image_id;
-	spvc_variable_id sampler_id;
-} spvc_combined_image_sampler;
-
-/* See C++ API. */
-typedef struct spvc_specialization_constant
-{
-	spvc_constant_id id;
-	unsigned constant_id;
-} spvc_specialization_constant;
+	typedef SpvId spvc_type_id;
+	/* Maps to a SPIRVariable. */
+	typedef SpvId spvc_variable_id;
+	/* Maps to a SPIRConstant. */
+	typedef SpvId spvc_constant_id;
+
+	/* See C++ API. */
+	typedef struct spvc_reflected_resource
+	{
+		spvc_variable_id id;
+		spvc_type_id base_type_id;
+		spvc_type_id type_id;
+		const char *name;
+	} spvc_reflected_resource;
+
+	typedef struct spvc_reflected_builtin_resource
+	{
+		SpvBuiltIn builtin;
+		spvc_type_id value_type_id;
+		spvc_reflected_resource resource;
+	} spvc_reflected_builtin_resource;
+
+	/* See C++ API. */
+	typedef struct spvc_entry_point
+	{
+		SpvExecutionModel execution_model;
+		const char *name;
+	} spvc_entry_point;
+
+	/* See C++ API. */
+	typedef struct spvc_combined_image_sampler
+	{
+		spvc_variable_id combined_id;
+		spvc_variable_id image_id;
+		spvc_variable_id sampler_id;
+	} spvc_combined_image_sampler;
+
+	/* See C++ API. */
+	typedef struct spvc_specialization_constant
+	{
+		spvc_constant_id id;
+		unsigned constant_id;
+	} spvc_specialization_constant;
+
+	/* See C++ API. */
+	typedef struct spvc_buffer_range
+	{
+		unsigned index;
+		size_t offset;
+		size_t range;
+	} spvc_buffer_range;
+
+	/* See C++ API. */
+	typedef struct spvc_hlsl_root_constants
+	{
+		unsigned start;
+		unsigned end;
+		unsigned binding;
+		unsigned space;
+	} spvc_hlsl_root_constants;
+
+	/* See C++ API. */
+	typedef struct spvc_hlsl_vertex_attribute_remap
+	{
+		unsigned location;
+		const char *semantic;
+	} spvc_hlsl_vertex_attribute_remap;
 
-/* See C++ API. */
-typedef struct spvc_buffer_range
-{
-	unsigned index;
-	size_t offset;
-	size_t range;
-} spvc_buffer_range;
-
-/* See C++ API. */
-typedef struct spvc_hlsl_root_constants
-{
-	unsigned start;
-	unsigned end;
-	unsigned binding;
-	unsigned space;
-} spvc_hlsl_root_constants;
-
-/* See C++ API. */
-typedef struct spvc_hlsl_vertex_attribute_remap
-{
-	unsigned location;
-	const char *semantic;
-} spvc_hlsl_vertex_attribute_remap;
-
-/*
+	/*
  * Be compatible with non-C99 compilers, which do not have stdbool.
  * Only recent MSVC compilers supports this for example, and ideally SPIRV-Cross should be linkable
  * from a wide range of compilers in its C wrapper.
  */
-typedef unsigned char spvc_bool;
+	typedef unsigned char spvc_bool;
 #define SPVC_TRUE ((spvc_bool)1)
 #define SPVC_FALSE ((spvc_bool)0)
 
-typedef enum spvc_result
-{
-	/* Success. */
-	SPVC_SUCCESS = 0,
+	typedef enum spvc_result
+	{
+		/* Success. */
+		SPVC_SUCCESS = 0,
 
-	/* The SPIR-V is invalid. Should have been caught by validation ideally. */
-	SPVC_ERROR_INVALID_SPIRV = -1,
+		/* The SPIR-V is invalid. Should have been caught by validation ideally. */
+		SPVC_ERROR_INVALID_SPIRV = -1,
 
-	/* The SPIR-V might be valid or invalid, but SPIRV-Cross currently cannot correctly translate this to your target language. */
-	SPVC_ERROR_UNSUPPORTED_SPIRV = -2,
+		/* The SPIR-V might be valid or invalid, but SPIRV-Cross currently cannot correctly translate this to your target language. */
+		SPVC_ERROR_UNSUPPORTED_SPIRV = -2,
 
-	/* If for some reason we hit this, new or malloc failed. */
-	SPVC_ERROR_OUT_OF_MEMORY = -3,
+		/* If for some reason we hit this, new or malloc failed. */
+		SPVC_ERROR_OUT_OF_MEMORY = -3,
 
-	/* Invalid API argument. */
-	SPVC_ERROR_INVALID_ARGUMENT = -4,
+		/* Invalid API argument. */
+		SPVC_ERROR_INVALID_ARGUMENT = -4,
 
-	SPVC_ERROR_INT_MAX = 0x7fffffff
-} spvc_result;
+		SPVC_ERROR_INT_MAX = 0x7fffffff
+	} spvc_result;
 
-typedef enum spvc_capture_mode
-{
-	/* The Parsed IR payload will be copied, and the handle can be reused to create other compiler instances. */
-	SPVC_CAPTURE_MODE_COPY = 0,
+	typedef enum spvc_capture_mode
+	{
+		/* The Parsed IR payload will be copied, and the handle can be reused to create other compiler instances. */
+		SPVC_CAPTURE_MODE_COPY = 0,
 
-	/*
+		/*
 	 * The payload will now be owned by the compiler.
 	 * parsed_ir should now be considered a dead blob and must not be used further.
 	 * This is optimal for performance and should be the go-to option.
 	 */
-	SPVC_CAPTURE_MODE_TAKE_OWNERSHIP = 1,
-
-	SPVC_CAPTURE_MODE_INT_MAX = 0x7fffffff
-} spvc_capture_mode;
-
-typedef enum spvc_backend
-{
-	/* This backend can only perform reflection, no compiler options are supported. Maps to spirv_cross::Compiler. */
-	SPVC_BACKEND_NONE = 0,
-	SPVC_BACKEND_GLSL = 1, /* spirv_cross::CompilerGLSL */
-	SPVC_BACKEND_HLSL = 2, /* CompilerHLSL */
-	SPVC_BACKEND_MSL = 3, /* CompilerMSL */
-	SPVC_BACKEND_CPP = 4, /* CompilerCPP */
-	SPVC_BACKEND_JSON = 5, /* CompilerReflection w/ JSON backend */
-	SPVC_BACKEND_INT_MAX = 0x7fffffff
-} spvc_backend;
-
-/* Maps to C++ API. */
-typedef enum spvc_resource_type
-{
-	SPVC_RESOURCE_TYPE_UNKNOWN = 0,
-	SPVC_RESOURCE_TYPE_UNIFORM_BUFFER = 1,
-	SPVC_RESOURCE_TYPE_STORAGE_BUFFER = 2,
-	SPVC_RESOURCE_TYPE_STAGE_INPUT = 3,
-	SPVC_RESOURCE_TYPE_STAGE_OUTPUT = 4,
-	SPVC_RESOURCE_TYPE_SUBPASS_INPUT = 5,
-	SPVC_RESOURCE_TYPE_STORAGE_IMAGE = 6,
-	SPVC_RESOURCE_TYPE_SAMPLED_IMAGE = 7,
-	SPVC_RESOURCE_TYPE_ATOMIC_COUNTER = 8,
-	SPVC_RESOURCE_TYPE_PUSH_CONSTANT = 9,
-	SPVC_RESOURCE_TYPE_SEPARATE_IMAGE = 10,
-	SPVC_RESOURCE_TYPE_SEPARATE_SAMPLERS = 11,
-	SPVC_RESOURCE_TYPE_ACCELERATION_STRUCTURE = 12,
-	SPVC_RESOURCE_TYPE_RAY_QUERY = 13,
-	SPVC_RESOURCE_TYPE_SHADER_RECORD_BUFFER = 14,
-	SPVC_RESOURCE_TYPE_GL_PLAIN_UNIFORM = 15,
-	SPVC_RESOURCE_TYPE_TENSOR = 16,
-	SPVC_RESOURCE_TYPE_INT_MAX = 0x7fffffff
-} spvc_resource_type;
-
-typedef enum spvc_builtin_resource_type
-{
-	SPVC_BUILTIN_RESOURCE_TYPE_UNKNOWN = 0,
-	SPVC_BUILTIN_RESOURCE_TYPE_STAGE_INPUT = 1,
-	SPVC_BUILTIN_RESOURCE_TYPE_STAGE_OUTPUT = 2,
-	SPVC_BUILTIN_RESOURCE_TYPE_INT_MAX = 0x7fffffff
-} spvc_builtin_resource_type;
-
-/* Maps to spirv_cross::SPIRType::BaseType. */
-typedef enum spvc_basetype
-{
-	SPVC_BASETYPE_UNKNOWN = 0,
-	SPVC_BASETYPE_VOID = 1,
-	SPVC_BASETYPE_BOOLEAN = 2,
-	SPVC_BASETYPE_INT8 = 3,
-	SPVC_BASETYPE_UINT8 = 4,
-	SPVC_BASETYPE_INT16 = 5,
-	SPVC_BASETYPE_UINT16 = 6,
-	SPVC_BASETYPE_INT32 = 7,
-	SPVC_BASETYPE_UINT32 = 8,
-	SPVC_BASETYPE_INT64 = 9,
-	SPVC_BASETYPE_UINT64 = 10,
-	SPVC_BASETYPE_ATOMIC_COUNTER = 11,
-	SPVC_BASETYPE_FP16 = 12,
-	SPVC_BASETYPE_FP32 = 13,
-	SPVC_BASETYPE_FP64 = 14,
-	SPVC_BASETYPE_STRUCT = 15,
-	SPVC_BASETYPE_IMAGE = 16,
-	SPVC_BASETYPE_SAMPLED_IMAGE = 17,
-	SPVC_BASETYPE_SAMPLER = 18,
-	SPVC_BASETYPE_ACCELERATION_STRUCTURE = 19,
-
-	SPVC_BASETYPE_INT_MAX = 0x7fffffff
-} spvc_basetype;
+		SPVC_CAPTURE_MODE_TAKE_OWNERSHIP = 1,
+
+		SPVC_CAPTURE_MODE_INT_MAX = 0x7fffffff
+	} spvc_capture_mode;
+
+	typedef enum spvc_backend
+	{
+		/* This backend can only perform reflection, no compiler options are supported. Maps to spirv_cross::Compiler. */
+		SPVC_BACKEND_NONE = 0,
+		SPVC_BACKEND_GLSL = 1, /* spirv_cross::CompilerGLSL */
+		SPVC_BACKEND_HLSL = 2, /* CompilerHLSL */
+		SPVC_BACKEND_MSL = 3, /* CompilerMSL */
+		SPVC_BACKEND_CPP = 4, /* CompilerCPP */
+		SPVC_BACKEND_JSON = 5, /* CompilerReflection w/ JSON backend */
+		SPVC_BACKEND_OPENCL = 6, /* CompilerOpenCL */
+		SPVC_BACKEND_INT_MAX = 0x7fffffff
+	} spvc_backend;
+
+	/* Maps to C++ API. */
+	typedef enum spvc_resource_type
+	{
+		SPVC_RESOURCE_TYPE_UNKNOWN = 0,
+		SPVC_RESOURCE_TYPE_UNIFORM_BUFFER = 1,
+		SPVC_RESOURCE_TYPE_STORAGE_BUFFER = 2,
+		SPVC_RESOURCE_TYPE_STAGE_INPUT = 3,
+		SPVC_RESOURCE_TYPE_STAGE_OUTPUT = 4,
+		SPVC_RESOURCE_TYPE_SUBPASS_INPUT = 5,
+		SPVC_RESOURCE_TYPE_STORAGE_IMAGE = 6,
+		SPVC_RESOURCE_TYPE_SAMPLED_IMAGE = 7,
+		SPVC_RESOURCE_TYPE_ATOMIC_COUNTER = 8,
+		SPVC_RESOURCE_TYPE_PUSH_CONSTANT = 9,
+		SPVC_RESOURCE_TYPE_SEPARATE_IMAGE = 10,
+		SPVC_RESOURCE_TYPE_SEPARATE_SAMPLERS = 11,
+		SPVC_RESOURCE_TYPE_ACCELERATION_STRUCTURE = 12,
+		SPVC_RESOURCE_TYPE_RAY_QUERY = 13,
+		SPVC_RESOURCE_TYPE_SHADER_RECORD_BUFFER = 14,
+		SPVC_RESOURCE_TYPE_GL_PLAIN_UNIFORM = 15,
+		SPVC_RESOURCE_TYPE_TENSOR = 16,
+		SPVC_RESOURCE_TYPE_INT_MAX = 0x7fffffff
+	} spvc_resource_type;
+
+	typedef enum spvc_builtin_resource_type
+	{
+		SPVC_BUILTIN_RESOURCE_TYPE_UNKNOWN = 0,
+		SPVC_BUILTIN_RESOURCE_TYPE_STAGE_INPUT = 1,
+		SPVC_BUILTIN_RESOURCE_TYPE_STAGE_OUTPUT = 2,
+		SPVC_BUILTIN_RESOURCE_TYPE_INT_MAX = 0x7fffffff
+	} spvc_builtin_resource_type;
+
+	/* Maps to spirv_cross::SPIRType::BaseType. */
+	typedef enum spvc_basetype
+	{
+		SPVC_BASETYPE_UNKNOWN = 0,
+		SPVC_BASETYPE_VOID = 1,
+		SPVC_BASETYPE_BOOLEAN = 2,
+		SPVC_BASETYPE_INT8 = 3,
+		SPVC_BASETYPE_UINT8 = 4,
+		SPVC_BASETYPE_INT16 = 5,
+		SPVC_BASETYPE_UINT16 = 6,
+		SPVC_BASETYPE_INT32 = 7,
+		SPVC_BASETYPE_UINT32 = 8,
+		SPVC_BASETYPE_INT64 = 9,
+		SPVC_BASETYPE_UINT64 = 10,
+		SPVC_BASETYPE_ATOMIC_COUNTER = 11,
+		SPVC_BASETYPE_FP16 = 12,
+		SPVC_BASETYPE_FP32 = 13,
+		SPVC_BASETYPE_FP64 = 14,
+		SPVC_BASETYPE_STRUCT = 15,
+		SPVC_BASETYPE_IMAGE = 16,
+		SPVC_BASETYPE_SAMPLED_IMAGE = 17,
+		SPVC_BASETYPE_SAMPLER = 18,
+		SPVC_BASETYPE_ACCELERATION_STRUCTURE = 19,
+
+		SPVC_BASETYPE_INT_MAX = 0x7fffffff
+	} spvc_basetype;
 
 #define SPVC_COMPILER_OPTION_COMMON_BIT 0x1000000
 #define SPVC_COMPILER_OPTION_GLSL_BIT 0x2000000
@@ -275,143 +277,143 @@ typedef enum spvc_basetype
 
 #define SPVC_MAKE_MSL_VERSION(major, minor, patch) ((major) * 10000 + (minor) * 100 + (patch))
 
-/* Maps to C++ API. */
-typedef enum spvc_msl_platform
-{
-	SPVC_MSL_PLATFORM_IOS = 0,
-	SPVC_MSL_PLATFORM_MACOS = 1,
-	SPVC_MSL_PLATFORM_MAX_INT = 0x7fffffff
-} spvc_msl_platform;
-
-/* Maps to C++ API. */
-typedef enum spvc_msl_index_type
-{
-	SPVC_MSL_INDEX_TYPE_NONE = 0,
-	SPVC_MSL_INDEX_TYPE_UINT16 = 1,
-	SPVC_MSL_INDEX_TYPE_UINT32 = 2,
-	SPVC_MSL_INDEX_TYPE_MAX_INT = 0x7fffffff
-} spvc_msl_index_type;
-
-/* Maps to C++ API. */
-typedef enum spvc_msl_shader_variable_format
-{
-	SPVC_MSL_SHADER_VARIABLE_FORMAT_OTHER = 0,
-	SPVC_MSL_SHADER_VARIABLE_FORMAT_UINT8 = 1,
-	SPVC_MSL_SHADER_VARIABLE_FORMAT_UINT16 = 2,
-	SPVC_MSL_SHADER_VARIABLE_FORMAT_ANY16 = 3,
-	SPVC_MSL_SHADER_VARIABLE_FORMAT_ANY32 = 4,
-
-	/* Deprecated names. */
-	SPVC_MSL_VERTEX_FORMAT_OTHER = SPVC_MSL_SHADER_VARIABLE_FORMAT_OTHER,
-	SPVC_MSL_VERTEX_FORMAT_UINT8 = SPVC_MSL_SHADER_VARIABLE_FORMAT_UINT8,
-	SPVC_MSL_VERTEX_FORMAT_UINT16 = SPVC_MSL_SHADER_VARIABLE_FORMAT_UINT16,
-	SPVC_MSL_SHADER_INPUT_FORMAT_OTHER = SPVC_MSL_SHADER_VARIABLE_FORMAT_OTHER,
-	SPVC_MSL_SHADER_INPUT_FORMAT_UINT8 = SPVC_MSL_SHADER_VARIABLE_FORMAT_UINT8,
-	SPVC_MSL_SHADER_INPUT_FORMAT_UINT16 = SPVC_MSL_SHADER_VARIABLE_FORMAT_UINT16,
-	SPVC_MSL_SHADER_INPUT_FORMAT_ANY16 = SPVC_MSL_SHADER_VARIABLE_FORMAT_ANY16,
-	SPVC_MSL_SHADER_INPUT_FORMAT_ANY32 = SPVC_MSL_SHADER_VARIABLE_FORMAT_ANY32,
-
-
-	SPVC_MSL_SHADER_INPUT_FORMAT_INT_MAX = 0x7fffffff
-} spvc_msl_shader_variable_format, spvc_msl_shader_input_format, spvc_msl_vertex_format;
-
-/* Maps to C++ API. Deprecated; use spvc_msl_shader_interface_var. */
-typedef struct spvc_msl_vertex_attribute
-{
-	unsigned location;
-
-	/* Obsolete, do not use. Only lingers on for ABI compatibility. */
-	unsigned msl_buffer;
-	/* Obsolete, do not use. Only lingers on for ABI compatibility. */
-	unsigned msl_offset;
-	/* Obsolete, do not use. Only lingers on for ABI compatibility. */
-	unsigned msl_stride;
-	/* Obsolete, do not use. Only lingers on for ABI compatibility. */
-	spvc_bool per_instance;
-
-	spvc_msl_vertex_format format;
-	SpvBuiltIn builtin;
-} spvc_msl_vertex_attribute;
+	/* Maps to C++ API. */
+	typedef enum spvc_msl_platform
+	{
+		SPVC_MSL_PLATFORM_IOS = 0,
+		SPVC_MSL_PLATFORM_MACOS = 1,
+		SPVC_MSL_PLATFORM_MAX_INT = 0x7fffffff
+	} spvc_msl_platform;
+
+	/* Maps to C++ API. */
+	typedef enum spvc_msl_index_type
+	{
+		SPVC_MSL_INDEX_TYPE_NONE = 0,
+		SPVC_MSL_INDEX_TYPE_UINT16 = 1,
+		SPVC_MSL_INDEX_TYPE_UINT32 = 2,
+		SPVC_MSL_INDEX_TYPE_MAX_INT = 0x7fffffff
+	} spvc_msl_index_type;
+
+	/* Maps to C++ API. */
+	typedef enum spvc_msl_shader_variable_format
+	{
+		SPVC_MSL_SHADER_VARIABLE_FORMAT_OTHER = 0,
+		SPVC_MSL_SHADER_VARIABLE_FORMAT_UINT8 = 1,
+		SPVC_MSL_SHADER_VARIABLE_FORMAT_UINT16 = 2,
+		SPVC_MSL_SHADER_VARIABLE_FORMAT_ANY16 = 3,
+		SPVC_MSL_SHADER_VARIABLE_FORMAT_ANY32 = 4,
+
+		/* Deprecated names. */
+		SPVC_MSL_VERTEX_FORMAT_OTHER = SPVC_MSL_SHADER_VARIABLE_FORMAT_OTHER,
+		SPVC_MSL_VERTEX_FORMAT_UINT8 = SPVC_MSL_SHADER_VARIABLE_FORMAT_UINT8,
+		SPVC_MSL_VERTEX_FORMAT_UINT16 = SPVC_MSL_SHADER_VARIABLE_FORMAT_UINT16,
+		SPVC_MSL_SHADER_INPUT_FORMAT_OTHER = SPVC_MSL_SHADER_VARIABLE_FORMAT_OTHER,
+		SPVC_MSL_SHADER_INPUT_FORMAT_UINT8 = SPVC_MSL_SHADER_VARIABLE_FORMAT_UINT8,
+		SPVC_MSL_SHADER_INPUT_FORMAT_UINT16 = SPVC_MSL_SHADER_VARIABLE_FORMAT_UINT16,
+		SPVC_MSL_SHADER_INPUT_FORMAT_ANY16 = SPVC_MSL_SHADER_VARIABLE_FORMAT_ANY16,
+		SPVC_MSL_SHADER_INPUT_FORMAT_ANY32 = SPVC_MSL_SHADER_VARIABLE_FORMAT_ANY32,
+
+		SPVC_MSL_SHADER_INPUT_FORMAT_INT_MAX = 0x7fffffff
+	} spvc_msl_shader_variable_format,
+	    spvc_msl_shader_input_format, spvc_msl_vertex_format;
+
+	/* Maps to C++ API. Deprecated; use spvc_msl_shader_interface_var. */
+	typedef struct spvc_msl_vertex_attribute
+	{
+		unsigned location;
+
+		/* Obsolete, do not use. Only lingers on for ABI compatibility. */
+		unsigned msl_buffer;
+		/* Obsolete, do not use. Only lingers on for ABI compatibility. */
+		unsigned msl_offset;
+		/* Obsolete, do not use. Only lingers on for ABI compatibility. */
+		unsigned msl_stride;
+		/* Obsolete, do not use. Only lingers on for ABI compatibility. */
+		spvc_bool per_instance;
+
+		spvc_msl_vertex_format format;
+		SpvBuiltIn builtin;
+	} spvc_msl_vertex_attribute;
 
-/*
+	/*
  * Initializes the vertex attribute struct.
  */
-SPVC_PUBLIC_API void spvc_msl_vertex_attribute_init(spvc_msl_vertex_attribute *attr);
+	SPVC_PUBLIC_API void spvc_msl_vertex_attribute_init(spvc_msl_vertex_attribute *attr);
 
-/* Maps to C++ API. Deprecated; use spvc_msl_shader_interface_var_2. */
-typedef struct spvc_msl_shader_interface_var
-{
-	unsigned location;
-	spvc_msl_vertex_format format;
-	SpvBuiltIn builtin;
-	unsigned vecsize;
-} spvc_msl_shader_interface_var, spvc_msl_shader_input;
+	/* Maps to C++ API. Deprecated; use spvc_msl_shader_interface_var_2. */
+	typedef struct spvc_msl_shader_interface_var
+	{
+		unsigned location;
+		spvc_msl_vertex_format format;
+		SpvBuiltIn builtin;
+		unsigned vecsize;
+	} spvc_msl_shader_interface_var, spvc_msl_shader_input;
 
-/*
+	/*
  * Initializes the shader input struct.
  * Deprecated. Use spvc_msl_shader_interface_var_init_2().
  */
-SPVC_PUBLIC_API void spvc_msl_shader_interface_var_init(spvc_msl_shader_interface_var *var);
-/*
+	SPVC_PUBLIC_API void spvc_msl_shader_interface_var_init(spvc_msl_shader_interface_var *var);
+	/*
  * Deprecated. Use spvc_msl_shader_interface_var_init_2().
  */
-SPVC_PUBLIC_API void spvc_msl_shader_input_init(spvc_msl_shader_input *input);
+	SPVC_PUBLIC_API void spvc_msl_shader_input_init(spvc_msl_shader_input *input);
+
+	/* Maps to C++ API. The last enumerator has no trailing comma so this header stays C89-compatible. */
+	typedef enum spvc_msl_shader_variable_rate
+	{
+		SPVC_MSL_SHADER_VARIABLE_RATE_PER_VERTEX = 0,
+		SPVC_MSL_SHADER_VARIABLE_RATE_PER_PRIMITIVE = 1,
+		SPVC_MSL_SHADER_VARIABLE_RATE_PER_PATCH = 2,
+
+		SPVC_MSL_SHADER_VARIABLE_RATE_INT_MAX = 0x7fffffff
+	} spvc_msl_shader_variable_rate;
+
+	/* Maps to C++ API. */
+	typedef struct spvc_msl_shader_interface_var_2
+	{
+		unsigned location;
+		spvc_msl_shader_variable_format format;
+		SpvBuiltIn builtin;
+		unsigned vecsize;
+		spvc_msl_shader_variable_rate rate;
+	} spvc_msl_shader_interface_var_2;
 
-/* Maps to C++ API. */
-typedef enum spvc_msl_shader_variable_rate
-{
-	SPVC_MSL_SHADER_VARIABLE_RATE_PER_VERTEX = 0,
-	SPVC_MSL_SHADER_VARIABLE_RATE_PER_PRIMITIVE = 1,
-	SPVC_MSL_SHADER_VARIABLE_RATE_PER_PATCH = 2,
-
-	SPVC_MSL_SHADER_VARIABLE_RATE_INT_MAX = 0x7fffffff,
-} spvc_msl_shader_variable_rate;
-
-/* Maps to C++ API. */
-typedef struct spvc_msl_shader_interface_var_2
-{
-	unsigned location;
-	spvc_msl_shader_variable_format format;
-	SpvBuiltIn builtin;
-	unsigned vecsize;
-	spvc_msl_shader_variable_rate rate;
-} spvc_msl_shader_interface_var_2;
-
-/*
+	/*
  * Initializes the shader interface variable struct.
  */
-SPVC_PUBLIC_API void spvc_msl_shader_interface_var_init_2(spvc_msl_shader_interface_var_2 *var);
+	SPVC_PUBLIC_API void spvc_msl_shader_interface_var_init_2(spvc_msl_shader_interface_var_2 *var);
 
-/* Maps to C++ API.
+	/* Maps to C++ API.
  * Deprecated. Use spvc_msl_resource_binding_2. */
-typedef struct spvc_msl_resource_binding
-{
-	SpvExecutionModel stage;
-	unsigned desc_set;
-	unsigned binding;
-	unsigned msl_buffer;
-	unsigned msl_texture;
-	unsigned msl_sampler;
-} spvc_msl_resource_binding;
-
-typedef struct spvc_msl_resource_binding_2
-{
-	SpvExecutionModel stage;
-	unsigned desc_set;
-	unsigned binding;
-	unsigned count;
-	unsigned msl_buffer;
-	unsigned msl_texture;
-	unsigned msl_sampler;
-} spvc_msl_resource_binding_2;
+	typedef struct spvc_msl_resource_binding
+	{
+		SpvExecutionModel stage;
+		unsigned desc_set;
+		unsigned binding;
+		unsigned msl_buffer;
+		unsigned msl_texture;
+		unsigned msl_sampler;
+	} spvc_msl_resource_binding;
+
+	typedef struct spvc_msl_resource_binding_2
+	{
+		SpvExecutionModel stage;
+		unsigned desc_set;
+		unsigned binding;
+		unsigned count;
+		unsigned msl_buffer;
+		unsigned msl_texture;
+		unsigned msl_sampler;
+	} spvc_msl_resource_binding_2;
 
-/*
+	/*
  * Initializes the resource binding struct.
  * The defaults are non-zero.
  * Deprecated: Use spvc_msl_resource_binding_init_2.
  */
-SPVC_PUBLIC_API void spvc_msl_resource_binding_init(spvc_msl_resource_binding *binding);
-SPVC_PUBLIC_API void spvc_msl_resource_binding_init_2(spvc_msl_resource_binding_2 *binding);
+	SPVC_PUBLIC_API void spvc_msl_resource_binding_init(spvc_msl_resource_binding *binding);
+	SPVC_PUBLIC_API void spvc_msl_resource_binding_init_2(spvc_msl_resource_binding_2 *binding);
 
 #define SPVC_MSL_PUSH_CONSTANT_DESC_SET (~(0u))
 #define SPVC_MSL_PUSH_CONSTANT_BINDING (0)
@@ -422,707 +424,735 @@ SPVC_PUBLIC_API void spvc_msl_resource_binding_init_2(spvc_msl_resource_binding_
 /* Obsolete. Sticks around for backwards compatibility. */
 #define SPVC_MSL_AUX_BUFFER_STRUCT_VERSION 1
 
-/* Runtime check for incompatibility. Obsolete. */
-SPVC_PUBLIC_API unsigned spvc_msl_get_aux_buffer_struct_version(void);
-
-/* Maps to C++ API. */
-typedef enum spvc_msl_sampler_coord
-{
-	SPVC_MSL_SAMPLER_COORD_NORMALIZED = 0,
-	SPVC_MSL_SAMPLER_COORD_PIXEL = 1,
-	SPVC_MSL_SAMPLER_INT_MAX = 0x7fffffff
-} spvc_msl_sampler_coord;
-
-/* Maps to C++ API. */
-typedef enum spvc_msl_sampler_filter
-{
-	SPVC_MSL_SAMPLER_FILTER_NEAREST = 0,
-	SPVC_MSL_SAMPLER_FILTER_LINEAR = 1,
-	SPVC_MSL_SAMPLER_FILTER_INT_MAX = 0x7fffffff
-} spvc_msl_sampler_filter;
+	/* Runtime check for incompatibility. Obsolete. */
+	SPVC_PUBLIC_API unsigned spvc_msl_get_aux_buffer_struct_version(void);
+
+	/* Maps to C++ API. */
+	typedef enum spvc_msl_sampler_coord
+	{
+		SPVC_MSL_SAMPLER_COORD_NORMALIZED = 0,
+		SPVC_MSL_SAMPLER_COORD_PIXEL = 1,
+		SPVC_MSL_SAMPLER_INT_MAX = 0x7fffffff
+	} spvc_msl_sampler_coord;
+
+	/* Maps to C++ API. */
+	typedef enum spvc_msl_sampler_filter
+	{
+		SPVC_MSL_SAMPLER_FILTER_NEAREST = 0,
+		SPVC_MSL_SAMPLER_FILTER_LINEAR = 1,
+		SPVC_MSL_SAMPLER_FILTER_INT_MAX = 0x7fffffff
+	} spvc_msl_sampler_filter;
+
+	/* Maps to C++ API. */
+	typedef enum spvc_msl_sampler_mip_filter
+	{
+		SPVC_MSL_SAMPLER_MIP_FILTER_NONE = 0,
+		SPVC_MSL_SAMPLER_MIP_FILTER_NEAREST = 1,
+		SPVC_MSL_SAMPLER_MIP_FILTER_LINEAR = 2,
+		SPVC_MSL_SAMPLER_MIP_FILTER_INT_MAX = 0x7fffffff
+	} spvc_msl_sampler_mip_filter;
+
+	/* Maps to C++ API. */
+	typedef enum spvc_msl_sampler_address
+	{
+		SPVC_MSL_SAMPLER_ADDRESS_CLAMP_TO_ZERO = 0,
+		SPVC_MSL_SAMPLER_ADDRESS_CLAMP_TO_EDGE = 1,
+		SPVC_MSL_SAMPLER_ADDRESS_CLAMP_TO_BORDER = 2,
+		SPVC_MSL_SAMPLER_ADDRESS_REPEAT = 3,
+		SPVC_MSL_SAMPLER_ADDRESS_MIRRORED_REPEAT = 4,
+		SPVC_MSL_SAMPLER_ADDRESS_INT_MAX = 0x7fffffff
+	} spvc_msl_sampler_address;
+
+	/* Maps to C++ API. */
+	typedef enum spvc_msl_sampler_compare_func
+	{
+		SPVC_MSL_SAMPLER_COMPARE_FUNC_NEVER = 0,
+		SPVC_MSL_SAMPLER_COMPARE_FUNC_LESS = 1,
+		SPVC_MSL_SAMPLER_COMPARE_FUNC_LESS_EQUAL = 2,
+		SPVC_MSL_SAMPLER_COMPARE_FUNC_GREATER = 3,
+		SPVC_MSL_SAMPLER_COMPARE_FUNC_GREATER_EQUAL = 4,
+		SPVC_MSL_SAMPLER_COMPARE_FUNC_EQUAL = 5,
+		SPVC_MSL_SAMPLER_COMPARE_FUNC_NOT_EQUAL = 6,
+		SPVC_MSL_SAMPLER_COMPARE_FUNC_ALWAYS = 7,
+		SPVC_MSL_SAMPLER_COMPARE_FUNC_INT_MAX = 0x7fffffff
+	} spvc_msl_sampler_compare_func;
+
+	/* Maps to C++ API. */
+	typedef enum spvc_msl_sampler_border_color
+	{
+		SPVC_MSL_SAMPLER_BORDER_COLOR_TRANSPARENT_BLACK = 0,
+		SPVC_MSL_SAMPLER_BORDER_COLOR_OPAQUE_BLACK = 1,
+		SPVC_MSL_SAMPLER_BORDER_COLOR_OPAQUE_WHITE = 2,
+		SPVC_MSL_SAMPLER_BORDER_COLOR_INT_MAX = 0x7fffffff
+	} spvc_msl_sampler_border_color;
+
+	/* Maps to C++ API. */
+	typedef enum spvc_msl_format_resolution
+	{
+		SPVC_MSL_FORMAT_RESOLUTION_444 = 0,
+		SPVC_MSL_FORMAT_RESOLUTION_422,
+		SPVC_MSL_FORMAT_RESOLUTION_420,
+		SPVC_MSL_FORMAT_RESOLUTION_INT_MAX = 0x7fffffff
+	} spvc_msl_format_resolution;
+
+	/* Maps to C++ API. */
+	typedef enum spvc_msl_chroma_location
+	{
+		SPVC_MSL_CHROMA_LOCATION_COSITED_EVEN = 0,
+		SPVC_MSL_CHROMA_LOCATION_MIDPOINT,
+		SPVC_MSL_CHROMA_LOCATION_INT_MAX = 0x7fffffff
+	} spvc_msl_chroma_location;
+
+	/* Maps to C++ API. */
+	typedef enum spvc_msl_component_swizzle
+	{
+		SPVC_MSL_COMPONENT_SWIZZLE_IDENTITY = 0,
+		SPVC_MSL_COMPONENT_SWIZZLE_ZERO,
+		SPVC_MSL_COMPONENT_SWIZZLE_ONE,
+		SPVC_MSL_COMPONENT_SWIZZLE_R,
+		SPVC_MSL_COMPONENT_SWIZZLE_G,
+		SPVC_MSL_COMPONENT_SWIZZLE_B,
+		SPVC_MSL_COMPONENT_SWIZZLE_A,
+		SPVC_MSL_COMPONENT_SWIZZLE_INT_MAX = 0x7fffffff
+	} spvc_msl_component_swizzle;
+
+	/* Maps to C++ API. */
+	typedef enum spvc_msl_sampler_ycbcr_model_conversion
+	{
+		SPVC_MSL_SAMPLER_YCBCR_MODEL_CONVERSION_RGB_IDENTITY = 0,
+		SPVC_MSL_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_IDENTITY,
+		SPVC_MSL_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_BT_709,
+		SPVC_MSL_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_BT_601,
+		SPVC_MSL_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_BT_2020,
+		SPVC_MSL_SAMPLER_YCBCR_MODEL_CONVERSION_INT_MAX = 0x7fffffff
+	} spvc_msl_sampler_ycbcr_model_conversion;
+
+	/* Maps to C++ API. */
+	typedef enum spvc_msl_sampler_ycbcr_range
+	{
+		SPVC_MSL_SAMPLER_YCBCR_RANGE_ITU_FULL = 0,
+		SPVC_MSL_SAMPLER_YCBCR_RANGE_ITU_NARROW,
+		SPVC_MSL_SAMPLER_YCBCR_RANGE_INT_MAX = 0x7fffffff
+	} spvc_msl_sampler_ycbcr_range;
+
+	/* Maps to C++ API. */
+	typedef struct spvc_msl_constexpr_sampler
+	{
+		spvc_msl_sampler_coord coord;
+		spvc_msl_sampler_filter min_filter;
+		spvc_msl_sampler_filter mag_filter;
+		spvc_msl_sampler_mip_filter mip_filter;
+		spvc_msl_sampler_address s_address;
+		spvc_msl_sampler_address t_address;
+		spvc_msl_sampler_address r_address;
+		spvc_msl_sampler_compare_func compare_func;
+		spvc_msl_sampler_border_color border_color;
+		float lod_clamp_min;
+		float lod_clamp_max;
+		int max_anisotropy;
+
+		spvc_bool compare_enable;
+		spvc_bool lod_clamp_enable;
+		spvc_bool anisotropy_enable;
+	} spvc_msl_constexpr_sampler;
 
-/* Maps to C++ API. */
-typedef enum spvc_msl_sampler_mip_filter
-{
-	SPVC_MSL_SAMPLER_MIP_FILTER_NONE = 0,
-	SPVC_MSL_SAMPLER_MIP_FILTER_NEAREST = 1,
-	SPVC_MSL_SAMPLER_MIP_FILTER_LINEAR = 2,
-	SPVC_MSL_SAMPLER_MIP_FILTER_INT_MAX = 0x7fffffff
-} spvc_msl_sampler_mip_filter;
-
-/* Maps to C++ API. */
-typedef enum spvc_msl_sampler_address
-{
-	SPVC_MSL_SAMPLER_ADDRESS_CLAMP_TO_ZERO = 0,
-	SPVC_MSL_SAMPLER_ADDRESS_CLAMP_TO_EDGE = 1,
-	SPVC_MSL_SAMPLER_ADDRESS_CLAMP_TO_BORDER = 2,
-	SPVC_MSL_SAMPLER_ADDRESS_REPEAT = 3,
-	SPVC_MSL_SAMPLER_ADDRESS_MIRRORED_REPEAT = 4,
-	SPVC_MSL_SAMPLER_ADDRESS_INT_MAX = 0x7fffffff
-} spvc_msl_sampler_address;
-
-/* Maps to C++ API. */
-typedef enum spvc_msl_sampler_compare_func
-{
-	SPVC_MSL_SAMPLER_COMPARE_FUNC_NEVER = 0,
-	SPVC_MSL_SAMPLER_COMPARE_FUNC_LESS = 1,
-	SPVC_MSL_SAMPLER_COMPARE_FUNC_LESS_EQUAL = 2,
-	SPVC_MSL_SAMPLER_COMPARE_FUNC_GREATER = 3,
-	SPVC_MSL_SAMPLER_COMPARE_FUNC_GREATER_EQUAL = 4,
-	SPVC_MSL_SAMPLER_COMPARE_FUNC_EQUAL = 5,
-	SPVC_MSL_SAMPLER_COMPARE_FUNC_NOT_EQUAL = 6,
-	SPVC_MSL_SAMPLER_COMPARE_FUNC_ALWAYS = 7,
-	SPVC_MSL_SAMPLER_COMPARE_FUNC_INT_MAX = 0x7fffffff
-} spvc_msl_sampler_compare_func;
-
-/* Maps to C++ API. */
-typedef enum spvc_msl_sampler_border_color
-{
-	SPVC_MSL_SAMPLER_BORDER_COLOR_TRANSPARENT_BLACK = 0,
-	SPVC_MSL_SAMPLER_BORDER_COLOR_OPAQUE_BLACK = 1,
-	SPVC_MSL_SAMPLER_BORDER_COLOR_OPAQUE_WHITE = 2,
-	SPVC_MSL_SAMPLER_BORDER_COLOR_INT_MAX = 0x7fffffff
-} spvc_msl_sampler_border_color;
-
-/* Maps to C++ API. */
-typedef enum spvc_msl_format_resolution
-{
-	SPVC_MSL_FORMAT_RESOLUTION_444 = 0,
-	SPVC_MSL_FORMAT_RESOLUTION_422,
-	SPVC_MSL_FORMAT_RESOLUTION_420,
-	SPVC_MSL_FORMAT_RESOLUTION_INT_MAX = 0x7fffffff
-} spvc_msl_format_resolution;
-
-/* Maps to C++ API. */
-typedef enum spvc_msl_chroma_location
-{
-	SPVC_MSL_CHROMA_LOCATION_COSITED_EVEN = 0,
-	SPVC_MSL_CHROMA_LOCATION_MIDPOINT,
-	SPVC_MSL_CHROMA_LOCATION_INT_MAX = 0x7fffffff
-} spvc_msl_chroma_location;
-
-/* Maps to C++ API. */
-typedef enum spvc_msl_component_swizzle
-{
-	SPVC_MSL_COMPONENT_SWIZZLE_IDENTITY = 0,
-	SPVC_MSL_COMPONENT_SWIZZLE_ZERO,
-	SPVC_MSL_COMPONENT_SWIZZLE_ONE,
-	SPVC_MSL_COMPONENT_SWIZZLE_R,
-	SPVC_MSL_COMPONENT_SWIZZLE_G,
-	SPVC_MSL_COMPONENT_SWIZZLE_B,
-	SPVC_MSL_COMPONENT_SWIZZLE_A,
-	SPVC_MSL_COMPONENT_SWIZZLE_INT_MAX = 0x7fffffff
-} spvc_msl_component_swizzle;
-
-/* Maps to C++ API. */
-typedef enum spvc_msl_sampler_ycbcr_model_conversion
-{
-	SPVC_MSL_SAMPLER_YCBCR_MODEL_CONVERSION_RGB_IDENTITY = 0,
-	SPVC_MSL_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_IDENTITY,
-	SPVC_MSL_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_BT_709,
-	SPVC_MSL_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_BT_601,
-	SPVC_MSL_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_BT_2020,
-	SPVC_MSL_SAMPLER_YCBCR_MODEL_CONVERSION_INT_MAX = 0x7fffffff
-} spvc_msl_sampler_ycbcr_model_conversion;
-
-/* Maps to C+ API. */
-typedef enum spvc_msl_sampler_ycbcr_range
-{
-	SPVC_MSL_SAMPLER_YCBCR_RANGE_ITU_FULL = 0,
-	SPVC_MSL_SAMPLER_YCBCR_RANGE_ITU_NARROW,
-	SPVC_MSL_SAMPLER_YCBCR_RANGE_INT_MAX = 0x7fffffff
-} spvc_msl_sampler_ycbcr_range;
-
-/* Maps to C++ API. */
-typedef struct spvc_msl_constexpr_sampler
-{
-	spvc_msl_sampler_coord coord;
-	spvc_msl_sampler_filter min_filter;
-	spvc_msl_sampler_filter mag_filter;
-	spvc_msl_sampler_mip_filter mip_filter;
-	spvc_msl_sampler_address s_address;
-	spvc_msl_sampler_address t_address;
-	spvc_msl_sampler_address r_address;
-	spvc_msl_sampler_compare_func compare_func;
-	spvc_msl_sampler_border_color border_color;
-	float lod_clamp_min;
-	float lod_clamp_max;
-	int max_anisotropy;
-
-	spvc_bool compare_enable;
-	spvc_bool lod_clamp_enable;
-	spvc_bool anisotropy_enable;
-} spvc_msl_constexpr_sampler;
-
-/*
+	/*
  * Initializes the constexpr sampler struct.
  * The defaults are non-zero.
  */
-SPVC_PUBLIC_API void spvc_msl_constexpr_sampler_init(spvc_msl_constexpr_sampler *sampler);
+	SPVC_PUBLIC_API void spvc_msl_constexpr_sampler_init(spvc_msl_constexpr_sampler *sampler);
+
+	/* Maps to the sampler Y'CbCr conversion-related portions of MSLConstexprSampler. See C++ API for defaults and details. */
+	typedef struct spvc_msl_sampler_ycbcr_conversion
+	{
+		unsigned planes;
+		spvc_msl_format_resolution resolution;
+		spvc_msl_sampler_filter chroma_filter;
+		spvc_msl_chroma_location x_chroma_offset;
+		spvc_msl_chroma_location y_chroma_offset;
+		spvc_msl_component_swizzle swizzle[4];
+		spvc_msl_sampler_ycbcr_model_conversion ycbcr_model;
+		spvc_msl_sampler_ycbcr_range ycbcr_range;
+		unsigned bpc;
+	} spvc_msl_sampler_ycbcr_conversion;
 
-/* Maps to the sampler Y'CbCr conversion-related portions of MSLConstexprSampler. See C++ API for defaults and details. */
-typedef struct spvc_msl_sampler_ycbcr_conversion
-{
-	unsigned planes;
-	spvc_msl_format_resolution resolution;
-	spvc_msl_sampler_filter chroma_filter;
-	spvc_msl_chroma_location x_chroma_offset;
-	spvc_msl_chroma_location y_chroma_offset;
-	spvc_msl_component_swizzle swizzle[4];
-	spvc_msl_sampler_ycbcr_model_conversion ycbcr_model;
-	spvc_msl_sampler_ycbcr_range ycbcr_range;
-	unsigned bpc;
-} spvc_msl_sampler_ycbcr_conversion;
-
-/*
+	/*
  * Initializes the constexpr sampler struct.
  * The defaults are non-zero.
  */
-SPVC_PUBLIC_API void spvc_msl_sampler_ycbcr_conversion_init(spvc_msl_sampler_ycbcr_conversion *conv);
-
-/* Maps to C++ API. */
-typedef enum spvc_hlsl_binding_flag_bits
-{
-	SPVC_HLSL_BINDING_AUTO_NONE_BIT = 0,
-	SPVC_HLSL_BINDING_AUTO_PUSH_CONSTANT_BIT = 1 << 0,
-	SPVC_HLSL_BINDING_AUTO_CBV_BIT = 1 << 1,
-	SPVC_HLSL_BINDING_AUTO_SRV_BIT = 1 << 2,
-	SPVC_HLSL_BINDING_AUTO_UAV_BIT = 1 << 3,
-	SPVC_HLSL_BINDING_AUTO_SAMPLER_BIT = 1 << 4,
-	SPVC_HLSL_BINDING_AUTO_ALL = 0x7fffffff
-} spvc_hlsl_binding_flag_bits;
-typedef unsigned spvc_hlsl_binding_flags;
+	SPVC_PUBLIC_API void spvc_msl_sampler_ycbcr_conversion_init(spvc_msl_sampler_ycbcr_conversion *conv);
+
+	/* Maps to C++ API. */
+	typedef enum spvc_hlsl_binding_flag_bits
+	{
+		SPVC_HLSL_BINDING_AUTO_NONE_BIT = 0,
+		SPVC_HLSL_BINDING_AUTO_PUSH_CONSTANT_BIT = 1 << 0,
+		SPVC_HLSL_BINDING_AUTO_CBV_BIT = 1 << 1,
+		SPVC_HLSL_BINDING_AUTO_SRV_BIT = 1 << 2,
+		SPVC_HLSL_BINDING_AUTO_UAV_BIT = 1 << 3,
+		SPVC_HLSL_BINDING_AUTO_SAMPLER_BIT = 1 << 4,
+		SPVC_HLSL_BINDING_AUTO_ALL = 0x7fffffff
+	} spvc_hlsl_binding_flag_bits;
+	typedef unsigned spvc_hlsl_binding_flags;
 
 #define SPVC_HLSL_PUSH_CONSTANT_DESC_SET (~(0u))
 #define SPVC_HLSL_PUSH_CONSTANT_BINDING (0)
 
-/* Maps to C++ API. */
-typedef struct spvc_hlsl_resource_binding_mapping
-{
-	unsigned register_space;
-	unsigned register_binding;
-} spvc_hlsl_resource_binding_mapping;
+	/* Maps to C++ API. */
+	typedef struct spvc_hlsl_resource_binding_mapping
+	{
+		unsigned register_space;
+		unsigned register_binding;
+	} spvc_hlsl_resource_binding_mapping;
 
-typedef struct spvc_hlsl_resource_binding
-{
-	SpvExecutionModel stage;
-	unsigned desc_set;
-	unsigned binding;
+	typedef struct spvc_hlsl_resource_binding
+	{
+		SpvExecutionModel stage;
+		unsigned desc_set;
+		unsigned binding;
 
-	spvc_hlsl_resource_binding_mapping cbv, uav, srv, sampler;
-} spvc_hlsl_resource_binding;
+		spvc_hlsl_resource_binding_mapping cbv, uav, srv, sampler;
+	} spvc_hlsl_resource_binding;
 
-/*
+	/*
  * Initializes the resource binding struct.
  * The defaults are non-zero.
  */
-SPVC_PUBLIC_API void spvc_hlsl_resource_binding_init(spvc_hlsl_resource_binding *binding);
-
-/* Maps to the various spirv_cross::Compiler*::Option structures. See C++ API for defaults and details. */
-typedef enum spvc_compiler_option
-{
-	SPVC_COMPILER_OPTION_UNKNOWN = 0,
+	SPVC_PUBLIC_API void spvc_hlsl_resource_binding_init(spvc_hlsl_resource_binding *binding);
+
+	/* Maps to the various spirv_cross::Compiler*::Option structures. See C++ API for defaults and details. */
+	typedef enum spvc_compiler_option
+	{
+		SPVC_COMPILER_OPTION_UNKNOWN = 0,
 
-	SPVC_COMPILER_OPTION_FORCE_TEMPORARY = 1 | SPVC_COMPILER_OPTION_COMMON_BIT,
-	SPVC_COMPILER_OPTION_FLATTEN_MULTIDIMENSIONAL_ARRAYS = 2 | SPVC_COMPILER_OPTION_COMMON_BIT,
-	SPVC_COMPILER_OPTION_FIXUP_DEPTH_CONVENTION = 3 | SPVC_COMPILER_OPTION_COMMON_BIT,
-	SPVC_COMPILER_OPTION_FLIP_VERTEX_Y = 4 | SPVC_COMPILER_OPTION_COMMON_BIT,
+		SPVC_COMPILER_OPTION_FORCE_TEMPORARY = 1 | SPVC_COMPILER_OPTION_COMMON_BIT,
+		SPVC_COMPILER_OPTION_FLATTEN_MULTIDIMENSIONAL_ARRAYS = 2 | SPVC_COMPILER_OPTION_COMMON_BIT,
+		SPVC_COMPILER_OPTION_FIXUP_DEPTH_CONVENTION = 3 | SPVC_COMPILER_OPTION_COMMON_BIT,
+		SPVC_COMPILER_OPTION_FLIP_VERTEX_Y = 4 | SPVC_COMPILER_OPTION_COMMON_BIT,
+
+		SPVC_COMPILER_OPTION_GLSL_SUPPORT_NONZERO_BASE_INSTANCE = 5 | SPVC_COMPILER_OPTION_GLSL_BIT,
+		SPVC_COMPILER_OPTION_GLSL_SEPARATE_SHADER_OBJECTS = 6 | SPVC_COMPILER_OPTION_GLSL_BIT,
+		SPVC_COMPILER_OPTION_GLSL_ENABLE_420PACK_EXTENSION = 7 | SPVC_COMPILER_OPTION_GLSL_BIT,
+		SPVC_COMPILER_OPTION_GLSL_VERSION = 8 | SPVC_COMPILER_OPTION_GLSL_BIT,
+		SPVC_COMPILER_OPTION_GLSL_ES = 9 | SPVC_COMPILER_OPTION_GLSL_BIT,
+		SPVC_COMPILER_OPTION_GLSL_VULKAN_SEMANTICS = 10 | SPVC_COMPILER_OPTION_GLSL_BIT,
+		SPVC_COMPILER_OPTION_GLSL_ES_DEFAULT_FLOAT_PRECISION_HIGHP = 11 | SPVC_COMPILER_OPTION_GLSL_BIT,
+		SPVC_COMPILER_OPTION_GLSL_ES_DEFAULT_INT_PRECISION_HIGHP = 12 | SPVC_COMPILER_OPTION_GLSL_BIT,
 
-	SPVC_COMPILER_OPTION_GLSL_SUPPORT_NONZERO_BASE_INSTANCE = 5 | SPVC_COMPILER_OPTION_GLSL_BIT,
-	SPVC_COMPILER_OPTION_GLSL_SEPARATE_SHADER_OBJECTS = 6 | SPVC_COMPILER_OPTION_GLSL_BIT,
-	SPVC_COMPILER_OPTION_GLSL_ENABLE_420PACK_EXTENSION = 7 | SPVC_COMPILER_OPTION_GLSL_BIT,
-	SPVC_COMPILER_OPTION_GLSL_VERSION = 8 | SPVC_COMPILER_OPTION_GLSL_BIT,
-	SPVC_COMPILER_OPTION_GLSL_ES = 9 | SPVC_COMPILER_OPTION_GLSL_BIT,
-	SPVC_COMPILER_OPTION_GLSL_VULKAN_SEMANTICS = 10 | SPVC_COMPILER_OPTION_GLSL_BIT,
-	SPVC_COMPILER_OPTION_GLSL_ES_DEFAULT_FLOAT_PRECISION_HIGHP = 11 | SPVC_COMPILER_OPTION_GLSL_BIT,
-	SPVC_COMPILER_OPTION_GLSL_ES_DEFAULT_INT_PRECISION_HIGHP = 12 | SPVC_COMPILER_OPTION_GLSL_BIT,
+		SPVC_COMPILER_OPTION_HLSL_SHADER_MODEL = 13 | SPVC_COMPILER_OPTION_HLSL_BIT,
+		SPVC_COMPILER_OPTION_HLSL_POINT_SIZE_COMPAT = 14 | SPVC_COMPILER_OPTION_HLSL_BIT,
+		SPVC_COMPILER_OPTION_HLSL_POINT_COORD_COMPAT = 15 | SPVC_COMPILER_OPTION_HLSL_BIT,
+		SPVC_COMPILER_OPTION_HLSL_SUPPORT_NONZERO_BASE_VERTEX_BASE_INSTANCE = 16 | SPVC_COMPILER_OPTION_HLSL_BIT,
 
-	SPVC_COMPILER_OPTION_HLSL_SHADER_MODEL = 13 | SPVC_COMPILER_OPTION_HLSL_BIT,
-	SPVC_COMPILER_OPTION_HLSL_POINT_SIZE_COMPAT = 14 | SPVC_COMPILER_OPTION_HLSL_BIT,
-	SPVC_COMPILER_OPTION_HLSL_POINT_COORD_COMPAT = 15 | SPVC_COMPILER_OPTION_HLSL_BIT,
-	SPVC_COMPILER_OPTION_HLSL_SUPPORT_NONZERO_BASE_VERTEX_BASE_INSTANCE = 16 | SPVC_COMPILER_OPTION_HLSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_VERSION = 17 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_TEXEL_BUFFER_TEXTURE_WIDTH = 18 | SPVC_COMPILER_OPTION_MSL_BIT,
+
+		/* Obsolete, use SWIZZLE_BUFFER_INDEX instead. */
+		SPVC_COMPILER_OPTION_MSL_AUX_BUFFER_INDEX = 19 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_SWIZZLE_BUFFER_INDEX = 19 | SPVC_COMPILER_OPTION_MSL_BIT,
 
-	SPVC_COMPILER_OPTION_MSL_VERSION = 17 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_TEXEL_BUFFER_TEXTURE_WIDTH = 18 | SPVC_COMPILER_OPTION_MSL_BIT,
-
-	/* Obsolete, use SWIZZLE_BUFFER_INDEX instead. */
-	SPVC_COMPILER_OPTION_MSL_AUX_BUFFER_INDEX = 19 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_SWIZZLE_BUFFER_INDEX = 19 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_INDIRECT_PARAMS_BUFFER_INDEX = 20 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_SHADER_OUTPUT_BUFFER_INDEX = 21 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_SHADER_PATCH_OUTPUT_BUFFER_INDEX = 22 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_SHADER_TESS_FACTOR_OUTPUT_BUFFER_INDEX = 23 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_SHADER_INPUT_WORKGROUP_INDEX = 24 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_ENABLE_POINT_SIZE_BUILTIN = 25 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_DISABLE_RASTERIZATION = 26 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_CAPTURE_OUTPUT_TO_BUFFER = 27 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_SWIZZLE_TEXTURE_SAMPLES = 28 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_PAD_FRAGMENT_OUTPUT_COMPONENTS = 29 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_TESS_DOMAIN_ORIGIN_LOWER_LEFT = 30 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_PLATFORM = 31 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_ARGUMENT_BUFFERS = 32 | SPVC_COMPILER_OPTION_MSL_BIT,
 
-	SPVC_COMPILER_OPTION_MSL_INDIRECT_PARAMS_BUFFER_INDEX = 20 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_SHADER_OUTPUT_BUFFER_INDEX = 21 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_SHADER_PATCH_OUTPUT_BUFFER_INDEX = 22 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_SHADER_TESS_FACTOR_OUTPUT_BUFFER_INDEX = 23 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_SHADER_INPUT_WORKGROUP_INDEX = 24 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_ENABLE_POINT_SIZE_BUILTIN = 25 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_DISABLE_RASTERIZATION = 26 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_CAPTURE_OUTPUT_TO_BUFFER = 27 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_SWIZZLE_TEXTURE_SAMPLES = 28 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_PAD_FRAGMENT_OUTPUT_COMPONENTS = 29 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_TESS_DOMAIN_ORIGIN_LOWER_LEFT = 30 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_PLATFORM = 31 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_ARGUMENT_BUFFERS = 32 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_GLSL_EMIT_PUSH_CONSTANT_AS_UNIFORM_BUFFER = 33 | SPVC_COMPILER_OPTION_GLSL_BIT,
 
-	SPVC_COMPILER_OPTION_GLSL_EMIT_PUSH_CONSTANT_AS_UNIFORM_BUFFER = 33 | SPVC_COMPILER_OPTION_GLSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_TEXTURE_BUFFER_NATIVE = 34 | SPVC_COMPILER_OPTION_MSL_BIT,
 
-	SPVC_COMPILER_OPTION_MSL_TEXTURE_BUFFER_NATIVE = 34 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_GLSL_EMIT_UNIFORM_BUFFER_AS_PLAIN_UNIFORMS = 35 | SPVC_COMPILER_OPTION_GLSL_BIT,
 
-	SPVC_COMPILER_OPTION_GLSL_EMIT_UNIFORM_BUFFER_AS_PLAIN_UNIFORMS = 35 | SPVC_COMPILER_OPTION_GLSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_BUFFER_SIZE_BUFFER_INDEX = 36 | SPVC_COMPILER_OPTION_MSL_BIT,
 
-	SPVC_COMPILER_OPTION_MSL_BUFFER_SIZE_BUFFER_INDEX = 36 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_EMIT_LINE_DIRECTIVES = 37 | SPVC_COMPILER_OPTION_COMMON_BIT,
 
-	SPVC_COMPILER_OPTION_EMIT_LINE_DIRECTIVES = 37 | SPVC_COMPILER_OPTION_COMMON_BIT,
+		SPVC_COMPILER_OPTION_MSL_MULTIVIEW = 38 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_VIEW_MASK_BUFFER_INDEX = 39 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_DEVICE_INDEX = 40 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_VIEW_INDEX_FROM_DEVICE_INDEX = 41 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_DISPATCH_BASE = 42 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_DYNAMIC_OFFSETS_BUFFER_INDEX = 43 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_TEXTURE_1D_AS_2D = 44 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_ENABLE_BASE_INDEX_ZERO = 45 | SPVC_COMPILER_OPTION_MSL_BIT,
 
-	SPVC_COMPILER_OPTION_MSL_MULTIVIEW = 38 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_VIEW_MASK_BUFFER_INDEX = 39 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_DEVICE_INDEX = 40 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_VIEW_INDEX_FROM_DEVICE_INDEX = 41 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_DISPATCH_BASE = 42 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_DYNAMIC_OFFSETS_BUFFER_INDEX = 43 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_TEXTURE_1D_AS_2D = 44 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_ENABLE_BASE_INDEX_ZERO = 45 | SPVC_COMPILER_OPTION_MSL_BIT,
+		/* Obsolete. Use MSL_FRAMEBUFFER_FETCH_SUBPASS instead. */
+		SPVC_COMPILER_OPTION_MSL_IOS_FRAMEBUFFER_FETCH_SUBPASS = 46 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_FRAMEBUFFER_FETCH_SUBPASS = 46 | SPVC_COMPILER_OPTION_MSL_BIT,
 
-	/* Obsolete. Use MSL_FRAMEBUFFER_FETCH_SUBPASS instead. */
-	SPVC_COMPILER_OPTION_MSL_IOS_FRAMEBUFFER_FETCH_SUBPASS = 46 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_FRAMEBUFFER_FETCH_SUBPASS = 46 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_INVARIANT_FP_MATH = 47 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_EMULATE_CUBEMAP_ARRAY = 48 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_ENABLE_DECORATION_BINDING = 49 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_FORCE_ACTIVE_ARGUMENT_BUFFER_RESOURCES = 50 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_FORCE_NATIVE_ARRAYS = 51 | SPVC_COMPILER_OPTION_MSL_BIT,
 
-	SPVC_COMPILER_OPTION_MSL_INVARIANT_FP_MATH = 47 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_EMULATE_CUBEMAP_ARRAY = 48 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_ENABLE_DECORATION_BINDING = 49 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_FORCE_ACTIVE_ARGUMENT_BUFFER_RESOURCES = 50 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_FORCE_NATIVE_ARRAYS = 51 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_ENABLE_STORAGE_IMAGE_QUALIFIER_DEDUCTION = 52 | SPVC_COMPILER_OPTION_COMMON_BIT,
 
-	SPVC_COMPILER_OPTION_ENABLE_STORAGE_IMAGE_QUALIFIER_DEDUCTION = 52 | SPVC_COMPILER_OPTION_COMMON_BIT,
+		SPVC_COMPILER_OPTION_HLSL_FORCE_STORAGE_BUFFER_AS_UAV = 53 | SPVC_COMPILER_OPTION_HLSL_BIT,
 
-	SPVC_COMPILER_OPTION_HLSL_FORCE_STORAGE_BUFFER_AS_UAV = 53 | SPVC_COMPILER_OPTION_HLSL_BIT,
+		SPVC_COMPILER_OPTION_FORCE_ZERO_INITIALIZED_VARIABLES = 54 | SPVC_COMPILER_OPTION_COMMON_BIT,
 
-	SPVC_COMPILER_OPTION_FORCE_ZERO_INITIALIZED_VARIABLES = 54 | SPVC_COMPILER_OPTION_COMMON_BIT,
+		SPVC_COMPILER_OPTION_HLSL_NONWRITABLE_UAV_TEXTURE_AS_SRV = 55 | SPVC_COMPILER_OPTION_HLSL_BIT,
 
-	SPVC_COMPILER_OPTION_HLSL_NONWRITABLE_UAV_TEXTURE_AS_SRV = 55 | SPVC_COMPILER_OPTION_HLSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_ENABLE_FRAG_OUTPUT_MASK = 56 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_ENABLE_FRAG_DEPTH_BUILTIN = 57 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_ENABLE_FRAG_STENCIL_REF_BUILTIN = 58 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_ENABLE_CLIP_DISTANCE_USER_VARYING = 59 | SPVC_COMPILER_OPTION_MSL_BIT,
 
-	SPVC_COMPILER_OPTION_MSL_ENABLE_FRAG_OUTPUT_MASK = 56 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_ENABLE_FRAG_DEPTH_BUILTIN = 57 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_ENABLE_FRAG_STENCIL_REF_BUILTIN = 58 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_ENABLE_CLIP_DISTANCE_USER_VARYING = 59 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_HLSL_ENABLE_16BIT_TYPES = 60 | SPVC_COMPILER_OPTION_HLSL_BIT,
 
-	SPVC_COMPILER_OPTION_HLSL_ENABLE_16BIT_TYPES = 60 | SPVC_COMPILER_OPTION_HLSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_MULTI_PATCH_WORKGROUP = 61 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_SHADER_INPUT_BUFFER_INDEX = 62 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_SHADER_INDEX_BUFFER_INDEX = 63 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_VERTEX_FOR_TESSELLATION = 64 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_VERTEX_INDEX_TYPE = 65 | SPVC_COMPILER_OPTION_MSL_BIT,
 
-	SPVC_COMPILER_OPTION_MSL_MULTI_PATCH_WORKGROUP = 61 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_SHADER_INPUT_BUFFER_INDEX = 62 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_SHADER_INDEX_BUFFER_INDEX = 63 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_VERTEX_FOR_TESSELLATION = 64 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_VERTEX_INDEX_TYPE = 65 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_GLSL_FORCE_FLATTENED_IO_BLOCKS = 66 | SPVC_COMPILER_OPTION_GLSL_BIT,
 
-	SPVC_COMPILER_OPTION_GLSL_FORCE_FLATTENED_IO_BLOCKS = 66 | SPVC_COMPILER_OPTION_GLSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_MULTIVIEW_LAYERED_RENDERING = 67 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_ARRAYED_SUBPASS_INPUT = 68 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_R32UI_LINEAR_TEXTURE_ALIGNMENT = 69 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_R32UI_ALIGNMENT_CONSTANT_ID = 70 | SPVC_COMPILER_OPTION_MSL_BIT,
 
-	SPVC_COMPILER_OPTION_MSL_MULTIVIEW_LAYERED_RENDERING = 67 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_ARRAYED_SUBPASS_INPUT = 68 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_R32UI_LINEAR_TEXTURE_ALIGNMENT = 69 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_R32UI_ALIGNMENT_CONSTANT_ID = 70 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_HLSL_FLATTEN_MATRIX_VERTEX_INPUT_SEMANTICS = 71 | SPVC_COMPILER_OPTION_HLSL_BIT,
 
-	SPVC_COMPILER_OPTION_HLSL_FLATTEN_MATRIX_VERTEX_INPUT_SEMANTICS = 71 | SPVC_COMPILER_OPTION_HLSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_IOS_USE_SIMDGROUP_FUNCTIONS = 72 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_EMULATE_SUBGROUPS = 73 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_FIXED_SUBGROUP_SIZE = 74 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_FORCE_SAMPLE_RATE_SHADING = 75 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_IOS_SUPPORT_BASE_VERTEX_INSTANCE = 76 | SPVC_COMPILER_OPTION_MSL_BIT,
 
-	SPVC_COMPILER_OPTION_MSL_IOS_USE_SIMDGROUP_FUNCTIONS = 72 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_EMULATE_SUBGROUPS = 73 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_FIXED_SUBGROUP_SIZE = 74 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_FORCE_SAMPLE_RATE_SHADING = 75 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_IOS_SUPPORT_BASE_VERTEX_INSTANCE = 76 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_GLSL_OVR_MULTIVIEW_VIEW_COUNT = 77 | SPVC_COMPILER_OPTION_GLSL_BIT,
 
-	SPVC_COMPILER_OPTION_GLSL_OVR_MULTIVIEW_VIEW_COUNT = 77 | SPVC_COMPILER_OPTION_GLSL_BIT,
+		SPVC_COMPILER_OPTION_RELAX_NAN_CHECKS = 78 | SPVC_COMPILER_OPTION_COMMON_BIT,
 
-	SPVC_COMPILER_OPTION_RELAX_NAN_CHECKS = 78 | SPVC_COMPILER_OPTION_COMMON_BIT,
+		SPVC_COMPILER_OPTION_MSL_RAW_BUFFER_TESE_INPUT = 79 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_SHADER_PATCH_INPUT_BUFFER_INDEX = 80 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_MANUAL_HELPER_INVOCATION_UPDATES = 81 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_CHECK_DISCARDED_FRAG_STORES = 82 | SPVC_COMPILER_OPTION_MSL_BIT,
 
-	SPVC_COMPILER_OPTION_MSL_RAW_BUFFER_TESE_INPUT = 79 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_SHADER_PATCH_INPUT_BUFFER_INDEX = 80 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_MANUAL_HELPER_INVOCATION_UPDATES = 81 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_CHECK_DISCARDED_FRAG_STORES = 82 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_GLSL_ENABLE_ROW_MAJOR_LOAD_WORKAROUND = 83 | SPVC_COMPILER_OPTION_GLSL_BIT,
 
-	SPVC_COMPILER_OPTION_GLSL_ENABLE_ROW_MAJOR_LOAD_WORKAROUND = 83 | SPVC_COMPILER_OPTION_GLSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_ARGUMENT_BUFFERS_TIER = 84 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_SAMPLE_DREF_LOD_ARRAY_AS_GRAD = 85 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_READWRITE_TEXTURE_FENCES = 86 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_REPLACE_RECURSIVE_INPUTS = 87 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_AGX_MANUAL_CUBE_GRAD_FIXUP = 88 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_FORCE_FRAGMENT_WITH_SIDE_EFFECTS_EXECUTION = 89 | SPVC_COMPILER_OPTION_MSL_BIT,
 
-	SPVC_COMPILER_OPTION_MSL_ARGUMENT_BUFFERS_TIER = 84 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_SAMPLE_DREF_LOD_ARRAY_AS_GRAD = 85 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_READWRITE_TEXTURE_FENCES = 86 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_REPLACE_RECURSIVE_INPUTS = 87 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_AGX_MANUAL_CUBE_GRAD_FIXUP = 88 | SPVC_COMPILER_OPTION_MSL_BIT,
-	SPVC_COMPILER_OPTION_MSL_FORCE_FRAGMENT_WITH_SIDE_EFFECTS_EXECUTION = 89 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_HLSL_USE_ENTRY_POINT_NAME = 90 | SPVC_COMPILER_OPTION_HLSL_BIT,
+		SPVC_COMPILER_OPTION_HLSL_PRESERVE_STRUCTURED_BUFFERS = 91 | SPVC_COMPILER_OPTION_HLSL_BIT,
 
-	SPVC_COMPILER_OPTION_HLSL_USE_ENTRY_POINT_NAME = 90 | SPVC_COMPILER_OPTION_HLSL_BIT,
-	SPVC_COMPILER_OPTION_HLSL_PRESERVE_STRUCTURED_BUFFERS = 91 | SPVC_COMPILER_OPTION_HLSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_AUTO_DISABLE_RASTERIZATION = 92 | SPVC_COMPILER_OPTION_MSL_BIT,
 
-	SPVC_COMPILER_OPTION_MSL_AUTO_DISABLE_RASTERIZATION = 92 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_MSL_ENABLE_POINT_SIZE_DEFAULT = 93 | SPVC_COMPILER_OPTION_MSL_BIT,
 
-	SPVC_COMPILER_OPTION_MSL_ENABLE_POINT_SIZE_DEFAULT = 93 | SPVC_COMPILER_OPTION_MSL_BIT,
+		SPVC_COMPILER_OPTION_HLSL_USER_SEMANTIC = 94 | SPVC_COMPILER_OPTION_HLSL_BIT,
 
-	SPVC_COMPILER_OPTION_HLSL_USER_SEMANTIC = 94 | SPVC_COMPILER_OPTION_HLSL_BIT,
+		SPVC_COMPILER_OPTION_INT_MAX = 0x7fffffff
+	} spvc_compiler_option;
 
-	SPVC_COMPILER_OPTION_INT_MAX = 0x7fffffff
-} spvc_compiler_option;
-
-/*
+	/*
  * Context is the highest-level API construct.
  * The context owns all memory allocations made by its child object hierarchy, including various non-opaque structs and strings.
  * This means that the API user only has to care about one "destroy" call ever when using the C API.
  * All pointers handed out by the APIs are only valid as long as the context
  * is alive and spvc_context_release_allocations has not been called.
  */
-SPVC_PUBLIC_API spvc_result spvc_context_create(spvc_context *context);
+	SPVC_PUBLIC_API spvc_result spvc_context_create(spvc_context *context);
 
-/* Frees all memory allocations and objects associated with the context and its child objects. */
-SPVC_PUBLIC_API void spvc_context_destroy(spvc_context context);
+	/* Frees all memory allocations and objects associated with the context and its child objects. */
+	SPVC_PUBLIC_API void spvc_context_destroy(spvc_context context);
 
-/* Frees all memory allocations and objects associated with the context and its child objects, but keeps the context alive. */
-SPVC_PUBLIC_API void spvc_context_release_allocations(spvc_context context);
+	/* Frees all memory allocations and objects associated with the context and its child objects, but keeps the context alive. */
+	SPVC_PUBLIC_API void spvc_context_release_allocations(spvc_context context);
 
-/* Get the string for the last error which was logged. */
-SPVC_PUBLIC_API const char *spvc_context_get_last_error_string(spvc_context context);
+	/* Get the string for the last error which was logged. */
+	SPVC_PUBLIC_API const char *spvc_context_get_last_error_string(spvc_context context);
 
-/* Get notified in a callback when an error triggers. Useful for debugging. */
-typedef void (*spvc_error_callback)(void *userdata, const char *error);
-SPVC_PUBLIC_API void spvc_context_set_error_callback(spvc_context context, spvc_error_callback cb, void *userdata);
+	/* Get notified in a callback when an error triggers. Useful for debugging. */
+	typedef void (*spvc_error_callback)(void *userdata, const char *error);
+	SPVC_PUBLIC_API void spvc_context_set_error_callback(spvc_context context, spvc_error_callback cb, void *userdata);
 
-/* SPIR-V parsing interface. Maps to Parser which then creates a ParsedIR, and that IR is extracted into the handle. */
-SPVC_PUBLIC_API spvc_result spvc_context_parse_spirv(spvc_context context, const SpvId *spirv, size_t word_count,
-                                                     spvc_parsed_ir *parsed_ir);
+	/* SPIR-V parsing interface. Maps to Parser which then creates a ParsedIR, and that IR is extracted into the handle. */
+	SPVC_PUBLIC_API spvc_result spvc_context_parse_spirv(spvc_context context, const SpvId *spirv, size_t word_count,
+	                                                     spvc_parsed_ir *parsed_ir);
 
-/*
+	/*
  * Create a compiler backend. Capture mode controls if we construct by copy or move semantics.
  * It is always recommended to use SPVC_CAPTURE_MODE_TAKE_OWNERSHIP if you only intend to cross-compile the IR once.
  */
-SPVC_PUBLIC_API spvc_result spvc_context_create_compiler(spvc_context context, spvc_backend backend,
-                                                         spvc_parsed_ir parsed_ir, spvc_capture_mode mode,
-                                                         spvc_compiler *compiler);
-
-/* Maps directly to C++ API. */
-SPVC_PUBLIC_API unsigned spvc_compiler_get_current_id_bound(spvc_compiler compiler);
-
-/* Create compiler options, which will initialize defaults. */
-SPVC_PUBLIC_API spvc_result spvc_compiler_create_compiler_options(spvc_compiler compiler,
-                                                                  spvc_compiler_options *options);
-/* Override options. Will return error if e.g. MSL options are used for the HLSL backend, etc. */
-SPVC_PUBLIC_API spvc_result spvc_compiler_options_set_bool(spvc_compiler_options options,
-                                                           spvc_compiler_option option, spvc_bool value);
-SPVC_PUBLIC_API spvc_result spvc_compiler_options_set_uint(spvc_compiler_options options,
-                                                           spvc_compiler_option option, unsigned value);
-/* Set compiler options. */
-SPVC_PUBLIC_API spvc_result spvc_compiler_install_compiler_options(spvc_compiler compiler,
-                                                                   spvc_compiler_options options);
-
-/* Compile IR into a string. *source is owned by the context, and caller must not free it themselves. */
-SPVC_PUBLIC_API spvc_result spvc_compiler_compile(spvc_compiler compiler, const char **source);
-
-/* Maps to C++ API. */
-SPVC_PUBLIC_API spvc_result spvc_compiler_add_header_line(spvc_compiler compiler, const char *line);
-SPVC_PUBLIC_API spvc_result spvc_compiler_require_extension(spvc_compiler compiler, const char *ext);
-SPVC_PUBLIC_API size_t spvc_compiler_get_num_required_extensions(spvc_compiler compiler);
-SPVC_PUBLIC_API const char *spvc_compiler_get_required_extension(spvc_compiler compiler, size_t index);
-SPVC_PUBLIC_API spvc_result spvc_compiler_flatten_buffer_block(spvc_compiler compiler, spvc_variable_id id);
-
-SPVC_PUBLIC_API spvc_bool spvc_compiler_variable_is_depth_or_compare(spvc_compiler compiler, spvc_variable_id id);
-
-SPVC_PUBLIC_API spvc_result spvc_compiler_mask_stage_output_by_location(spvc_compiler compiler,
-                                                                        unsigned location, unsigned component);
-SPVC_PUBLIC_API spvc_result spvc_compiler_mask_stage_output_by_builtin(spvc_compiler compiler, SpvBuiltIn builtin);
+	SPVC_PUBLIC_API spvc_result spvc_context_create_compiler(spvc_context context, spvc_backend backend,
+	                                                         spvc_parsed_ir parsed_ir, spvc_capture_mode mode,
+	                                                         spvc_compiler *compiler);
+
+	/* Maps directly to C++ API. */
+	SPVC_PUBLIC_API unsigned spvc_compiler_get_current_id_bound(spvc_compiler compiler);
+
+	/* Create compiler options, which will initialize defaults. */
+	SPVC_PUBLIC_API spvc_result spvc_compiler_create_compiler_options(spvc_compiler compiler,
+	                                                                  spvc_compiler_options *options);
+	/* Override options. Will return an error if, e.g., MSL options are used for the HLSL backend. */
+	SPVC_PUBLIC_API spvc_result spvc_compiler_options_set_bool(spvc_compiler_options options,
+	                                                           spvc_compiler_option option, spvc_bool value);
+	SPVC_PUBLIC_API spvc_result spvc_compiler_options_set_uint(spvc_compiler_options options,
+	                                                           spvc_compiler_option option, unsigned value);
+	/* Set compiler options. */
+	SPVC_PUBLIC_API spvc_result spvc_compiler_install_compiler_options(spvc_compiler compiler,
+	                                                                   spvc_compiler_options options);
+
+	/* Compile IR into a string. *source is owned by the context; the caller must not free it. */
+	SPVC_PUBLIC_API spvc_result spvc_compiler_compile(spvc_compiler compiler, const char **source);
+
+	/* Maps to C++ API. */
+	SPVC_PUBLIC_API spvc_result spvc_compiler_add_header_line(spvc_compiler compiler, const char *line);
+	SPVC_PUBLIC_API spvc_result spvc_compiler_require_extension(spvc_compiler compiler, const char *ext);
+	SPVC_PUBLIC_API size_t spvc_compiler_get_num_required_extensions(spvc_compiler compiler);
+	SPVC_PUBLIC_API const char *spvc_compiler_get_required_extension(spvc_compiler compiler, size_t index);
+	SPVC_PUBLIC_API spvc_result spvc_compiler_flatten_buffer_block(spvc_compiler compiler, spvc_variable_id id);
+
+	SPVC_PUBLIC_API spvc_bool spvc_compiler_variable_is_depth_or_compare(spvc_compiler compiler, spvc_variable_id id);
+
+	SPVC_PUBLIC_API spvc_result spvc_compiler_mask_stage_output_by_location(spvc_compiler compiler, unsigned location,
+	                                                                        unsigned component);
+	SPVC_PUBLIC_API spvc_result spvc_compiler_mask_stage_output_by_builtin(spvc_compiler compiler, SpvBuiltIn builtin);
 
-/*
+	/*
  * HLSL specifics.
  * Maps to C++ API.
  */
-SPVC_PUBLIC_API spvc_result spvc_compiler_hlsl_set_root_constants_layout(spvc_compiler compiler,
-                                                                         const spvc_hlsl_root_constants *constant_info,
-                                                                         size_t count);
-SPVC_PUBLIC_API spvc_result spvc_compiler_hlsl_add_vertex_attribute_remap(spvc_compiler compiler,
-                                                                          const spvc_hlsl_vertex_attribute_remap *remap,
-                                                                          size_t remaps);
-SPVC_PUBLIC_API spvc_variable_id spvc_compiler_hlsl_remap_num_workgroups_builtin(spvc_compiler compiler);
-
-SPVC_PUBLIC_API spvc_result spvc_compiler_hlsl_set_resource_binding_flags(spvc_compiler compiler,
-                                                                          spvc_hlsl_binding_flags flags);
-
-SPVC_PUBLIC_API spvc_result spvc_compiler_hlsl_add_resource_binding(spvc_compiler compiler,
-                                                                    const spvc_hlsl_resource_binding *binding);
-SPVC_PUBLIC_API spvc_bool spvc_compiler_hlsl_is_resource_used(spvc_compiler compiler,
-                                                              SpvExecutionModel model,
-                                                              unsigned set,
-                                                              unsigned binding);
+	SPVC_PUBLIC_API spvc_result spvc_compiler_hlsl_set_root_constants_layout(
+	    spvc_compiler compiler, const spvc_hlsl_root_constants *constant_info, size_t count);
+	SPVC_PUBLIC_API spvc_result spvc_compiler_hlsl_add_vertex_attribute_remap(
+	    spvc_compiler compiler, const spvc_hlsl_vertex_attribute_remap *remap, size_t remaps);
+	SPVC_PUBLIC_API spvc_variable_id spvc_compiler_hlsl_remap_num_workgroups_builtin(spvc_compiler compiler);
 
-/*
+	SPVC_PUBLIC_API spvc_result spvc_compiler_hlsl_set_resource_binding_flags(spvc_compiler compiler,
+	                                                                          spvc_hlsl_binding_flags flags);
+
+	SPVC_PUBLIC_API spvc_result spvc_compiler_hlsl_add_resource_binding(spvc_compiler compiler,
+	                                                                    const spvc_hlsl_resource_binding *binding);
+	SPVC_PUBLIC_API spvc_bool spvc_compiler_hlsl_is_resource_used(spvc_compiler compiler, SpvExecutionModel model,
+	                                                              unsigned set, unsigned binding);
+
+	/*
  * MSL specifics.
  * Maps to C++ API.
  */
-SPVC_PUBLIC_API spvc_bool spvc_compiler_msl_is_rasterization_disabled(spvc_compiler compiler);
-
-/* Obsolete. Renamed to needs_swizzle_buffer. */
-SPVC_PUBLIC_API spvc_bool spvc_compiler_msl_needs_aux_buffer(spvc_compiler compiler);
-SPVC_PUBLIC_API spvc_bool spvc_compiler_msl_needs_swizzle_buffer(spvc_compiler compiler);
-SPVC_PUBLIC_API spvc_bool spvc_compiler_msl_needs_buffer_size_buffer(spvc_compiler compiler);
-
-SPVC_PUBLIC_API spvc_bool spvc_compiler_msl_needs_output_buffer(spvc_compiler compiler);
-SPVC_PUBLIC_API spvc_bool spvc_compiler_msl_needs_patch_output_buffer(spvc_compiler compiler);
-SPVC_PUBLIC_API spvc_bool spvc_compiler_msl_needs_input_threadgroup_mem(spvc_compiler compiler);
-SPVC_PUBLIC_API spvc_result spvc_compiler_msl_add_vertex_attribute(spvc_compiler compiler,
-                                                                   const spvc_msl_vertex_attribute *attrs);
-/* Deprecated; use spvc_compiler_msl_add_resource_binding_2(). */
-SPVC_PUBLIC_API spvc_result spvc_compiler_msl_add_resource_binding(spvc_compiler compiler,
-                                                                   const spvc_msl_resource_binding *binding);
-SPVC_PUBLIC_API spvc_result spvc_compiler_msl_add_resource_binding_2(spvc_compiler compiler,
-                                                                     const spvc_msl_resource_binding_2 *binding);
-/* Deprecated; use spvc_compiler_msl_add_shader_input_2(). */
-SPVC_PUBLIC_API spvc_result spvc_compiler_msl_add_shader_input(spvc_compiler compiler,
-                                                               const spvc_msl_shader_interface_var *input);
-SPVC_PUBLIC_API spvc_result spvc_compiler_msl_add_shader_input_2(spvc_compiler compiler,
-                                                                 const spvc_msl_shader_interface_var_2 *input);
-/* Deprecated; use spvc_compiler_msl_add_shader_output_2(). */
-SPVC_PUBLIC_API spvc_result spvc_compiler_msl_add_shader_output(spvc_compiler compiler,
-                                                                const spvc_msl_shader_interface_var *output);
-SPVC_PUBLIC_API spvc_result spvc_compiler_msl_add_shader_output_2(spvc_compiler compiler,
-                                                                  const spvc_msl_shader_interface_var_2 *output);
-SPVC_PUBLIC_API spvc_result spvc_compiler_msl_add_discrete_descriptor_set(spvc_compiler compiler, unsigned desc_set);
-SPVC_PUBLIC_API spvc_result spvc_compiler_msl_set_argument_buffer_device_address_space(spvc_compiler compiler, unsigned desc_set, spvc_bool device_address);
-
-/* Obsolete, use is_shader_input_used. */
-SPVC_PUBLIC_API spvc_bool spvc_compiler_msl_is_vertex_attribute_used(spvc_compiler compiler, unsigned location);
-SPVC_PUBLIC_API spvc_bool spvc_compiler_msl_is_shader_input_used(spvc_compiler compiler, unsigned location);
-SPVC_PUBLIC_API spvc_bool spvc_compiler_msl_is_shader_output_used(spvc_compiler compiler, unsigned location);
-
-SPVC_PUBLIC_API spvc_bool spvc_compiler_msl_is_resource_used(spvc_compiler compiler,
-                                                             SpvExecutionModel model,
-                                                             unsigned set,
-                                                             unsigned binding);
-SPVC_PUBLIC_API spvc_result spvc_compiler_msl_remap_constexpr_sampler(spvc_compiler compiler, spvc_variable_id id, const spvc_msl_constexpr_sampler *sampler);
-SPVC_PUBLIC_API spvc_result spvc_compiler_msl_remap_constexpr_sampler_by_binding(spvc_compiler compiler, unsigned desc_set, unsigned binding, const spvc_msl_constexpr_sampler *sampler);
-SPVC_PUBLIC_API spvc_result spvc_compiler_msl_remap_constexpr_sampler_ycbcr(spvc_compiler compiler, spvc_variable_id id, const spvc_msl_constexpr_sampler *sampler, const spvc_msl_sampler_ycbcr_conversion *conv);
-SPVC_PUBLIC_API spvc_result spvc_compiler_msl_remap_constexpr_sampler_by_binding_ycbcr(spvc_compiler compiler, unsigned desc_set, unsigned binding, const spvc_msl_constexpr_sampler *sampler, const spvc_msl_sampler_ycbcr_conversion *conv);
-SPVC_PUBLIC_API spvc_result spvc_compiler_msl_set_fragment_output_components(spvc_compiler compiler, unsigned location, unsigned components);
-
-SPVC_PUBLIC_API unsigned spvc_compiler_msl_get_automatic_resource_binding(spvc_compiler compiler, spvc_variable_id id);
-SPVC_PUBLIC_API unsigned spvc_compiler_msl_get_automatic_resource_binding_secondary(spvc_compiler compiler, spvc_variable_id id);
-
-SPVC_PUBLIC_API spvc_result spvc_compiler_msl_add_dynamic_buffer(spvc_compiler compiler, unsigned desc_set, unsigned binding, unsigned index);
-
-SPVC_PUBLIC_API spvc_result spvc_compiler_msl_add_inline_uniform_block(spvc_compiler compiler, unsigned desc_set, unsigned binding);
-
-SPVC_PUBLIC_API spvc_result spvc_compiler_msl_set_combined_sampler_suffix(spvc_compiler compiler, const char *suffix);
-SPVC_PUBLIC_API const char *spvc_compiler_msl_get_combined_sampler_suffix(spvc_compiler compiler);
+	SPVC_PUBLIC_API spvc_bool spvc_compiler_msl_is_rasterization_disabled(spvc_compiler compiler);
+
+	/* Obsolete. Renamed to needs_swizzle_buffer. */
+	SPVC_PUBLIC_API spvc_bool spvc_compiler_msl_needs_aux_buffer(spvc_compiler compiler);
+	SPVC_PUBLIC_API spvc_bool spvc_compiler_msl_needs_swizzle_buffer(spvc_compiler compiler);
+	SPVC_PUBLIC_API spvc_bool spvc_compiler_msl_needs_buffer_size_buffer(spvc_compiler compiler);
+
+	SPVC_PUBLIC_API spvc_bool spvc_compiler_msl_needs_output_buffer(spvc_compiler compiler);
+	SPVC_PUBLIC_API spvc_bool spvc_compiler_msl_needs_patch_output_buffer(spvc_compiler compiler);
+	SPVC_PUBLIC_API spvc_bool spvc_compiler_msl_needs_input_threadgroup_mem(spvc_compiler compiler);
+	SPVC_PUBLIC_API spvc_result spvc_compiler_msl_add_vertex_attribute(spvc_compiler compiler,
+	                                                                   const spvc_msl_vertex_attribute *attrs);
+	/* Deprecated; use spvc_compiler_msl_add_resource_binding_2(). */
+	SPVC_PUBLIC_API spvc_result spvc_compiler_msl_add_resource_binding(spvc_compiler compiler,
+	                                                                   const spvc_msl_resource_binding *binding);
+	SPVC_PUBLIC_API spvc_result spvc_compiler_msl_add_resource_binding_2(spvc_compiler compiler,
+	                                                                     const spvc_msl_resource_binding_2 *binding);
+	/* Deprecated; use spvc_compiler_msl_add_shader_input_2(). */
+	SPVC_PUBLIC_API spvc_result spvc_compiler_msl_add_shader_input(spvc_compiler compiler,
+	                                                               const spvc_msl_shader_interface_var *input);
+	SPVC_PUBLIC_API spvc_result spvc_compiler_msl_add_shader_input_2(spvc_compiler compiler,
+	                                                                 const spvc_msl_shader_interface_var_2 *input);
+	/* Deprecated; use spvc_compiler_msl_add_shader_output_2(). */
+	SPVC_PUBLIC_API spvc_result spvc_compiler_msl_add_shader_output(spvc_compiler compiler,
+	                                                                const spvc_msl_shader_interface_var *output);
+	SPVC_PUBLIC_API spvc_result spvc_compiler_msl_add_shader_output_2(spvc_compiler compiler,
+	                                                                  const spvc_msl_shader_interface_var_2 *output);
+	SPVC_PUBLIC_API spvc_result spvc_compiler_msl_add_discrete_descriptor_set(spvc_compiler compiler,
+	                                                                          unsigned desc_set);
+	SPVC_PUBLIC_API spvc_result spvc_compiler_msl_set_argument_buffer_device_address_space(spvc_compiler compiler,
+	                                                                                       unsigned desc_set,
+	                                                                                       spvc_bool device_address);
+
+	/* Obsolete, use is_shader_input_used. */
+	SPVC_PUBLIC_API spvc_bool spvc_compiler_msl_is_vertex_attribute_used(spvc_compiler compiler, unsigned location);
+	SPVC_PUBLIC_API spvc_bool spvc_compiler_msl_is_shader_input_used(spvc_compiler compiler, unsigned location);
+	SPVC_PUBLIC_API spvc_bool spvc_compiler_msl_is_shader_output_used(spvc_compiler compiler, unsigned location);
+
+	SPVC_PUBLIC_API spvc_bool spvc_compiler_msl_is_resource_used(spvc_compiler compiler, SpvExecutionModel model,
+	                                                             unsigned set, unsigned binding);
+	SPVC_PUBLIC_API spvc_result spvc_compiler_msl_remap_constexpr_sampler(spvc_compiler compiler, spvc_variable_id id,
+	                                                                      const spvc_msl_constexpr_sampler *sampler);
+	SPVC_PUBLIC_API spvc_result spvc_compiler_msl_remap_constexpr_sampler_by_binding(
+	    spvc_compiler compiler, unsigned desc_set, unsigned binding, const spvc_msl_constexpr_sampler *sampler);
+	SPVC_PUBLIC_API spvc_result spvc_compiler_msl_remap_constexpr_sampler_ycbcr(
+	    spvc_compiler compiler, spvc_variable_id id, const spvc_msl_constexpr_sampler *sampler,
+	    const spvc_msl_sampler_ycbcr_conversion *conv);
+	SPVC_PUBLIC_API spvc_result spvc_compiler_msl_remap_constexpr_sampler_by_binding_ycbcr(
+	    spvc_compiler compiler, unsigned desc_set, unsigned binding, const spvc_msl_constexpr_sampler *sampler,
+	    const spvc_msl_sampler_ycbcr_conversion *conv);
+	SPVC_PUBLIC_API spvc_result spvc_compiler_msl_set_fragment_output_components(spvc_compiler compiler,
+	                                                                             unsigned location,
+	                                                                             unsigned components);
+
+	SPVC_PUBLIC_API unsigned spvc_compiler_msl_get_automatic_resource_binding(spvc_compiler compiler,
+	                                                                          spvc_variable_id id);
+	SPVC_PUBLIC_API unsigned spvc_compiler_msl_get_automatic_resource_binding_secondary(spvc_compiler compiler,
+	                                                                                    spvc_variable_id id);
+
+	SPVC_PUBLIC_API spvc_result spvc_compiler_msl_add_dynamic_buffer(spvc_compiler compiler, unsigned desc_set,
+	                                                                 unsigned binding, unsigned index);
+
+	SPVC_PUBLIC_API spvc_result spvc_compiler_msl_add_inline_uniform_block(spvc_compiler compiler, unsigned desc_set,
+	                                                                       unsigned binding);
+
+	SPVC_PUBLIC_API spvc_result spvc_compiler_msl_set_combined_sampler_suffix(spvc_compiler compiler,
+	                                                                          const char *suffix);
+	SPVC_PUBLIC_API const char *spvc_compiler_msl_get_combined_sampler_suffix(spvc_compiler compiler);
 
-/*
+	/*
  * Reflect resources.
  * Maps almost 1:1 to C++ API.
  */
-SPVC_PUBLIC_API spvc_result spvc_compiler_get_active_interface_variables(spvc_compiler compiler, spvc_set *set);
-SPVC_PUBLIC_API spvc_result spvc_compiler_set_enabled_interface_variables(spvc_compiler compiler, spvc_set set);
-SPVC_PUBLIC_API spvc_result spvc_compiler_create_shader_resources(spvc_compiler compiler, spvc_resources *resources);
-SPVC_PUBLIC_API spvc_result spvc_compiler_create_shader_resources_for_active_variables(spvc_compiler compiler,
-                                                                                       spvc_resources *resources,
-                                                                                       spvc_set active);
-SPVC_PUBLIC_API spvc_result spvc_resources_get_resource_list_for_type(spvc_resources resources, spvc_resource_type type,
-                                                                      const spvc_reflected_resource **resource_list,
-                                                                      size_t *resource_size);
-
-SPVC_PUBLIC_API spvc_result spvc_resources_get_builtin_resource_list_for_type(
-		spvc_resources resources, spvc_builtin_resource_type type,
-		const spvc_reflected_builtin_resource **resource_list,
-		size_t *resource_size);
+	SPVC_PUBLIC_API spvc_result spvc_compiler_get_active_interface_variables(spvc_compiler compiler, spvc_set *set);
+	SPVC_PUBLIC_API spvc_result spvc_compiler_set_enabled_interface_variables(spvc_compiler compiler, spvc_set set);
+	SPVC_PUBLIC_API spvc_result spvc_compiler_create_shader_resources(spvc_compiler compiler,
+	                                                                  spvc_resources *resources);
+	SPVC_PUBLIC_API spvc_result spvc_compiler_create_shader_resources_for_active_variables(spvc_compiler compiler,
+	                                                                                       spvc_resources *resources,
+	                                                                                       spvc_set active);
+	SPVC_PUBLIC_API spvc_result spvc_resources_get_resource_list_for_type(spvc_resources resources,
+	                                                                      spvc_resource_type type,
+	                                                                      const spvc_reflected_resource **resource_list,
+	                                                                      size_t *resource_size);
+
+	SPVC_PUBLIC_API spvc_result spvc_resources_get_builtin_resource_list_for_type(
+	    spvc_resources resources, spvc_builtin_resource_type type,
+	    const spvc_reflected_builtin_resource **resource_list, size_t *resource_size);
 
-/*
+	/*
  * Decorations.
  * Maps to C++ API.
  */
-SPVC_PUBLIC_API void spvc_compiler_set_decoration(spvc_compiler compiler, SpvId id, SpvDecoration decoration,
-                                                  unsigned argument);
-SPVC_PUBLIC_API void spvc_compiler_set_decoration_string(spvc_compiler compiler, SpvId id, SpvDecoration decoration,
-                                                         const char *argument);
-SPVC_PUBLIC_API void spvc_compiler_set_name(spvc_compiler compiler, SpvId id, const char *argument);
-SPVC_PUBLIC_API void spvc_compiler_set_member_decoration(spvc_compiler compiler, spvc_type_id id, unsigned member_index,
-                                                         SpvDecoration decoration, unsigned argument);
-SPVC_PUBLIC_API void spvc_compiler_set_member_decoration_string(spvc_compiler compiler, spvc_type_id id,
-                                                                unsigned member_index, SpvDecoration decoration,
-                                                                const char *argument);
-SPVC_PUBLIC_API void spvc_compiler_set_member_name(spvc_compiler compiler, spvc_type_id id, unsigned member_index,
-                                                   const char *argument);
-SPVC_PUBLIC_API void spvc_compiler_unset_decoration(spvc_compiler compiler, SpvId id, SpvDecoration decoration);
-SPVC_PUBLIC_API void spvc_compiler_unset_member_decoration(spvc_compiler compiler, spvc_type_id id,
-                                                           unsigned member_index, SpvDecoration decoration);
-
-SPVC_PUBLIC_API spvc_bool spvc_compiler_has_decoration(spvc_compiler compiler, SpvId id, SpvDecoration decoration);
-SPVC_PUBLIC_API spvc_bool spvc_compiler_has_member_decoration(spvc_compiler compiler, spvc_type_id id,
-                                                              unsigned member_index, SpvDecoration decoration);
-SPVC_PUBLIC_API const char *spvc_compiler_get_name(spvc_compiler compiler, SpvId id);
-SPVC_PUBLIC_API unsigned spvc_compiler_get_decoration(spvc_compiler compiler, SpvId id, SpvDecoration decoration);
-SPVC_PUBLIC_API const char *spvc_compiler_get_decoration_string(spvc_compiler compiler, SpvId id,
-                                                                SpvDecoration decoration);
-SPVC_PUBLIC_API unsigned spvc_compiler_get_member_decoration(spvc_compiler compiler, spvc_type_id id,
-                                                             unsigned member_index, SpvDecoration decoration);
-SPVC_PUBLIC_API const char *spvc_compiler_get_member_decoration_string(spvc_compiler compiler, spvc_type_id id,
-                                                                       unsigned member_index, SpvDecoration decoration);
-SPVC_PUBLIC_API const char *spvc_compiler_get_member_name(spvc_compiler compiler, spvc_type_id id, unsigned member_index);
+	SPVC_PUBLIC_API void spvc_compiler_set_decoration(spvc_compiler compiler, SpvId id, SpvDecoration decoration,
+	                                                  unsigned argument);
+	SPVC_PUBLIC_API void spvc_compiler_set_decoration_string(spvc_compiler compiler, SpvId id, SpvDecoration decoration,
+	                                                         const char *argument);
+	SPVC_PUBLIC_API void spvc_compiler_set_name(spvc_compiler compiler, SpvId id, const char *argument);
+	SPVC_PUBLIC_API void spvc_compiler_set_member_decoration(spvc_compiler compiler, spvc_type_id id,
+	                                                         unsigned member_index, SpvDecoration decoration,
+	                                                         unsigned argument);
+	SPVC_PUBLIC_API void spvc_compiler_set_member_decoration_string(spvc_compiler compiler, spvc_type_id id,
+	                                                                unsigned member_index, SpvDecoration decoration,
+	                                                                const char *argument);
+	SPVC_PUBLIC_API void spvc_compiler_set_member_name(spvc_compiler compiler, spvc_type_id id, unsigned member_index,
+	                                                   const char *argument);
+	SPVC_PUBLIC_API void spvc_compiler_unset_decoration(spvc_compiler compiler, SpvId id, SpvDecoration decoration);
+	SPVC_PUBLIC_API void spvc_compiler_unset_member_decoration(spvc_compiler compiler, spvc_type_id id,
+	                                                           unsigned member_index, SpvDecoration decoration);
+
+	SPVC_PUBLIC_API spvc_bool spvc_compiler_has_decoration(spvc_compiler compiler, SpvId id, SpvDecoration decoration);
+	SPVC_PUBLIC_API spvc_bool spvc_compiler_has_member_decoration(spvc_compiler compiler, spvc_type_id id,
+	                                                              unsigned member_index, SpvDecoration decoration);
+	SPVC_PUBLIC_API const char *spvc_compiler_get_name(spvc_compiler compiler, SpvId id);
+	SPVC_PUBLIC_API unsigned spvc_compiler_get_decoration(spvc_compiler compiler, SpvId id, SpvDecoration decoration);
+	SPVC_PUBLIC_API const char *spvc_compiler_get_decoration_string(spvc_compiler compiler, SpvId id,
+	                                                                SpvDecoration decoration);
+	SPVC_PUBLIC_API unsigned spvc_compiler_get_member_decoration(spvc_compiler compiler, spvc_type_id id,
+	                                                             unsigned member_index, SpvDecoration decoration);
+	SPVC_PUBLIC_API const char *spvc_compiler_get_member_decoration_string(spvc_compiler compiler, spvc_type_id id,
+	                                                                       unsigned member_index,
+	                                                                       SpvDecoration decoration);
+	SPVC_PUBLIC_API const char *spvc_compiler_get_member_name(spvc_compiler compiler, spvc_type_id id,
+	                                                          unsigned member_index);
 
-/*
+	/*
  * Entry points.
  * Maps to C++ API.
  */
-SPVC_PUBLIC_API spvc_result spvc_compiler_get_entry_points(spvc_compiler compiler,
-                                                           const spvc_entry_point **entry_points,
-                                                           size_t *num_entry_points);
-SPVC_PUBLIC_API spvc_result spvc_compiler_set_entry_point(spvc_compiler compiler, const char *name,
-                                                          SpvExecutionModel model);
-SPVC_PUBLIC_API spvc_result spvc_compiler_rename_entry_point(spvc_compiler compiler, const char *old_name,
-                                                             const char *new_name, SpvExecutionModel model);
-SPVC_PUBLIC_API const char *spvc_compiler_get_cleansed_entry_point_name(spvc_compiler compiler, const char *name,
-                                                                        SpvExecutionModel model);
-SPVC_PUBLIC_API void spvc_compiler_set_execution_mode(spvc_compiler compiler, SpvExecutionMode mode);
-SPVC_PUBLIC_API void spvc_compiler_unset_execution_mode(spvc_compiler compiler, SpvExecutionMode mode);
-SPVC_PUBLIC_API void spvc_compiler_set_execution_mode_with_arguments(spvc_compiler compiler, SpvExecutionMode mode,
-                                                                     unsigned arg0, unsigned arg1, unsigned arg2);
-SPVC_PUBLIC_API spvc_result spvc_compiler_get_execution_modes(spvc_compiler compiler, const SpvExecutionMode **modes,
-                                                              size_t *num_modes);
-SPVC_PUBLIC_API unsigned spvc_compiler_get_execution_mode_argument(spvc_compiler compiler, SpvExecutionMode mode);
-SPVC_PUBLIC_API unsigned spvc_compiler_get_execution_mode_argument_by_index(spvc_compiler compiler,
-                                                                            SpvExecutionMode mode, unsigned index);
-SPVC_PUBLIC_API SpvExecutionModel spvc_compiler_get_execution_model(spvc_compiler compiler);
-SPVC_PUBLIC_API void spvc_compiler_update_active_builtins(spvc_compiler compiler);
-SPVC_PUBLIC_API spvc_bool spvc_compiler_has_active_builtin(spvc_compiler compiler, SpvBuiltIn builtin, SpvStorageClass storage);
+	SPVC_PUBLIC_API spvc_result spvc_compiler_get_entry_points(spvc_compiler compiler,
+	                                                           const spvc_entry_point **entry_points,
+	                                                           size_t *num_entry_points);
+	SPVC_PUBLIC_API spvc_result spvc_compiler_set_entry_point(spvc_compiler compiler, const char *name,
+	                                                          SpvExecutionModel model);
+	SPVC_PUBLIC_API spvc_result spvc_compiler_rename_entry_point(spvc_compiler compiler, const char *old_name,
+	                                                             const char *new_name, SpvExecutionModel model);
+	SPVC_PUBLIC_API const char *spvc_compiler_get_cleansed_entry_point_name(spvc_compiler compiler, const char *name,
+	                                                                        SpvExecutionModel model);
+	SPVC_PUBLIC_API void spvc_compiler_set_execution_mode(spvc_compiler compiler, SpvExecutionMode mode);
+	SPVC_PUBLIC_API void spvc_compiler_unset_execution_mode(spvc_compiler compiler, SpvExecutionMode mode);
+	SPVC_PUBLIC_API void spvc_compiler_set_execution_mode_with_arguments(spvc_compiler compiler, SpvExecutionMode mode,
+	                                                                     unsigned arg0, unsigned arg1, unsigned arg2);
+	SPVC_PUBLIC_API spvc_result spvc_compiler_get_execution_modes(spvc_compiler compiler,
+	                                                              const SpvExecutionMode **modes, size_t *num_modes);
+	SPVC_PUBLIC_API unsigned spvc_compiler_get_execution_mode_argument(spvc_compiler compiler, SpvExecutionMode mode);
+	SPVC_PUBLIC_API unsigned spvc_compiler_get_execution_mode_argument_by_index(spvc_compiler compiler,
+	                                                                            SpvExecutionMode mode, unsigned index);
+	SPVC_PUBLIC_API SpvExecutionModel spvc_compiler_get_execution_model(spvc_compiler compiler);
+	SPVC_PUBLIC_API void spvc_compiler_update_active_builtins(spvc_compiler compiler);
+	SPVC_PUBLIC_API spvc_bool spvc_compiler_has_active_builtin(spvc_compiler compiler, SpvBuiltIn builtin,
+	                                                           SpvStorageClass storage);
 
-/*
+	/*
  * Type query interface.
  * Maps to C++ API, except it's read-only.
  */
-SPVC_PUBLIC_API spvc_type spvc_compiler_get_type_handle(spvc_compiler compiler, spvc_type_id id);
+	SPVC_PUBLIC_API spvc_type spvc_compiler_get_type_handle(spvc_compiler compiler, spvc_type_id id);
 
-/* Pulls out SPIRType::self. This effectively gives the type ID without array or pointer qualifiers.
+	/* Pulls out SPIRType::self. This effectively gives the type ID without array or pointer qualifiers.
  * This is necessary when reflecting decoration/name information on members of a struct,
  * which are placed in the base type, not the qualified type.
  * This is similar to spvc_reflected_resource::base_type_id. */
-SPVC_PUBLIC_API spvc_type_id spvc_type_get_base_type_id(spvc_type type);
-
-SPVC_PUBLIC_API spvc_basetype spvc_type_get_basetype(spvc_type type);
-SPVC_PUBLIC_API unsigned spvc_type_get_bit_width(spvc_type type);
-SPVC_PUBLIC_API unsigned spvc_type_get_vector_size(spvc_type type);
-SPVC_PUBLIC_API unsigned spvc_type_get_columns(spvc_type type);
-SPVC_PUBLIC_API unsigned spvc_type_get_num_array_dimensions(spvc_type type);
-SPVC_PUBLIC_API spvc_bool spvc_type_array_dimension_is_literal(spvc_type type, unsigned dimension);
-SPVC_PUBLIC_API SpvId spvc_type_get_array_dimension(spvc_type type, unsigned dimension);
-SPVC_PUBLIC_API unsigned spvc_type_get_num_member_types(spvc_type type);
-SPVC_PUBLIC_API spvc_type_id spvc_type_get_member_type(spvc_type type, unsigned index);
-SPVC_PUBLIC_API SpvStorageClass spvc_type_get_storage_class(spvc_type type);
-
-/* Image type query. */
-SPVC_PUBLIC_API spvc_type_id spvc_type_get_image_sampled_type(spvc_type type);
-SPVC_PUBLIC_API SpvDim spvc_type_get_image_dimension(spvc_type type);
-SPVC_PUBLIC_API spvc_bool spvc_type_get_image_is_depth(spvc_type type);
-SPVC_PUBLIC_API spvc_bool spvc_type_get_image_arrayed(spvc_type type);
-SPVC_PUBLIC_API spvc_bool spvc_type_get_image_multisampled(spvc_type type);
-SPVC_PUBLIC_API spvc_bool spvc_type_get_image_is_storage(spvc_type type);
-SPVC_PUBLIC_API SpvImageFormat spvc_type_get_image_storage_format(spvc_type type);
-SPVC_PUBLIC_API SpvAccessQualifier spvc_type_get_image_access_qualifier(spvc_type type);
+	SPVC_PUBLIC_API spvc_type_id spvc_type_get_base_type_id(spvc_type type);
+
+	SPVC_PUBLIC_API spvc_basetype spvc_type_get_basetype(spvc_type type);
+	SPVC_PUBLIC_API unsigned spvc_type_get_bit_width(spvc_type type);
+	SPVC_PUBLIC_API unsigned spvc_type_get_vector_size(spvc_type type);
+	SPVC_PUBLIC_API unsigned spvc_type_get_columns(spvc_type type);
+	SPVC_PUBLIC_API unsigned spvc_type_get_num_array_dimensions(spvc_type type);
+	SPVC_PUBLIC_API spvc_bool spvc_type_array_dimension_is_literal(spvc_type type, unsigned dimension);
+	SPVC_PUBLIC_API SpvId spvc_type_get_array_dimension(spvc_type type, unsigned dimension);
+	SPVC_PUBLIC_API unsigned spvc_type_get_num_member_types(spvc_type type);
+	SPVC_PUBLIC_API spvc_type_id spvc_type_get_member_type(spvc_type type, unsigned index);
+	SPVC_PUBLIC_API SpvStorageClass spvc_type_get_storage_class(spvc_type type);
+
+	/* Image type query. */
+	SPVC_PUBLIC_API spvc_type_id spvc_type_get_image_sampled_type(spvc_type type);
+	SPVC_PUBLIC_API SpvDim spvc_type_get_image_dimension(spvc_type type);
+	SPVC_PUBLIC_API spvc_bool spvc_type_get_image_is_depth(spvc_type type);
+	SPVC_PUBLIC_API spvc_bool spvc_type_get_image_arrayed(spvc_type type);
+	SPVC_PUBLIC_API spvc_bool spvc_type_get_image_multisampled(spvc_type type);
+	SPVC_PUBLIC_API spvc_bool spvc_type_get_image_is_storage(spvc_type type);
+	SPVC_PUBLIC_API SpvImageFormat spvc_type_get_image_storage_format(spvc_type type);
+	SPVC_PUBLIC_API SpvAccessQualifier spvc_type_get_image_access_qualifier(spvc_type type);
 
-/*
+	/*
  * Buffer layout query.
  * Maps to C++ API.
  */
-SPVC_PUBLIC_API spvc_result spvc_compiler_get_declared_struct_size(spvc_compiler compiler, spvc_type struct_type, size_t *size);
-SPVC_PUBLIC_API spvc_result spvc_compiler_get_declared_struct_size_runtime_array(spvc_compiler compiler,
-                                                                                 spvc_type struct_type, size_t array_size, size_t *size);
-SPVC_PUBLIC_API spvc_result spvc_compiler_get_declared_struct_member_size(spvc_compiler compiler, spvc_type type, unsigned index, size_t *size);
-
-SPVC_PUBLIC_API spvc_result spvc_compiler_type_struct_member_offset(spvc_compiler compiler,
-                                                                    spvc_type type, unsigned index, unsigned *offset);
-SPVC_PUBLIC_API spvc_result spvc_compiler_type_struct_member_array_stride(spvc_compiler compiler,
-                                                                          spvc_type type, unsigned index, unsigned *stride);
-SPVC_PUBLIC_API spvc_result spvc_compiler_type_struct_member_matrix_stride(spvc_compiler compiler,
-                                                                           spvc_type type, unsigned index, unsigned *stride);
+	SPVC_PUBLIC_API spvc_result spvc_compiler_get_declared_struct_size(spvc_compiler compiler, spvc_type struct_type,
+	                                                                   size_t *size);
+	SPVC_PUBLIC_API spvc_result spvc_compiler_get_declared_struct_size_runtime_array(spvc_compiler compiler,
+	                                                                                 spvc_type struct_type,
+	                                                                                 size_t array_size, size_t *size);
+	SPVC_PUBLIC_API spvc_result spvc_compiler_get_declared_struct_member_size(spvc_compiler compiler, spvc_type type,
+	                                                                          unsigned index, size_t *size);
+
+	SPVC_PUBLIC_API spvc_result spvc_compiler_type_struct_member_offset(spvc_compiler compiler, spvc_type type,
+	                                                                    unsigned index, unsigned *offset);
+	SPVC_PUBLIC_API spvc_result spvc_compiler_type_struct_member_array_stride(spvc_compiler compiler, spvc_type type,
+	                                                                          unsigned index, unsigned *stride);
+	SPVC_PUBLIC_API spvc_result spvc_compiler_type_struct_member_matrix_stride(spvc_compiler compiler, spvc_type type,
+	                                                                           unsigned index, unsigned *stride);
 
-/*
+	/*
  * Workaround helper functions.
  * Maps to C++ API.
  */
-SPVC_PUBLIC_API spvc_result spvc_compiler_build_dummy_sampler_for_combined_images(spvc_compiler compiler, spvc_variable_id *id);
-SPVC_PUBLIC_API spvc_result spvc_compiler_build_combined_image_samplers(spvc_compiler compiler);
-SPVC_PUBLIC_API spvc_result spvc_compiler_get_combined_image_samplers(spvc_compiler compiler,
-                                                                      const spvc_combined_image_sampler **samplers,
-                                                                      size_t *num_samplers);
+	SPVC_PUBLIC_API spvc_result spvc_compiler_build_dummy_sampler_for_combined_images(spvc_compiler compiler,
+	                                                                                  spvc_variable_id *id);
+	SPVC_PUBLIC_API spvc_result spvc_compiler_build_combined_image_samplers(spvc_compiler compiler);
+	SPVC_PUBLIC_API spvc_result spvc_compiler_get_combined_image_samplers(spvc_compiler compiler,
+	                                                                      const spvc_combined_image_sampler **samplers,
+	                                                                      size_t *num_samplers);
 
-/*
+	/*
  * Constants
  * Maps to C++ API.
  */
-SPVC_PUBLIC_API spvc_result spvc_compiler_get_specialization_constants(spvc_compiler compiler,
-                                                                       const spvc_specialization_constant **constants,
-                                                                       size_t *num_constants);
-SPVC_PUBLIC_API spvc_constant spvc_compiler_get_constant_handle(spvc_compiler compiler,
-                                                                spvc_constant_id id);
+	SPVC_PUBLIC_API spvc_result spvc_compiler_get_specialization_constants(
+	    spvc_compiler compiler, const spvc_specialization_constant **constants, size_t *num_constants);
+	SPVC_PUBLIC_API spvc_constant spvc_compiler_get_constant_handle(spvc_compiler compiler, spvc_constant_id id);
 
-SPVC_PUBLIC_API spvc_constant_id spvc_compiler_get_work_group_size_specialization_constants(spvc_compiler compiler,
-                                                                                            spvc_specialization_constant *x,
-                                                                                            spvc_specialization_constant *y,
-                                                                                            spvc_specialization_constant *z);
+	SPVC_PUBLIC_API spvc_constant_id spvc_compiler_get_work_group_size_specialization_constants(
+	    spvc_compiler compiler, spvc_specialization_constant *x, spvc_specialization_constant *y,
+	    spvc_specialization_constant *z);
 
-/*
+	/*
  * Buffer ranges
  * Maps to C++ API.
  */
-SPVC_PUBLIC_API spvc_result spvc_compiler_get_active_buffer_ranges(spvc_compiler compiler,
-                                                                   spvc_variable_id id,
-                                                                   const spvc_buffer_range **ranges,
-                                                                   size_t *num_ranges);
+	SPVC_PUBLIC_API spvc_result spvc_compiler_get_active_buffer_ranges(spvc_compiler compiler, spvc_variable_id id,
+	                                                                   const spvc_buffer_range **ranges,
+	                                                                   size_t *num_ranges);
 
-/*
+	/*
  * No stdint.h until C99, sigh :(
  * For smaller types, the result is sign or zero-extended as appropriate.
  * Maps to C++ API.
  * TODO: The SPIRConstant query interface and modification interface is not quite complete.
  */
-SPVC_PUBLIC_API float spvc_constant_get_scalar_fp16(spvc_constant constant, unsigned column, unsigned row);
-SPVC_PUBLIC_API float spvc_constant_get_scalar_fp32(spvc_constant constant, unsigned column, unsigned row);
-SPVC_PUBLIC_API double spvc_constant_get_scalar_fp64(spvc_constant constant, unsigned column, unsigned row);
-SPVC_PUBLIC_API unsigned spvc_constant_get_scalar_u32(spvc_constant constant, unsigned column, unsigned row);
-SPVC_PUBLIC_API int spvc_constant_get_scalar_i32(spvc_constant constant, unsigned column, unsigned row);
-SPVC_PUBLIC_API unsigned spvc_constant_get_scalar_u16(spvc_constant constant, unsigned column, unsigned row);
-SPVC_PUBLIC_API int spvc_constant_get_scalar_i16(spvc_constant constant, unsigned column, unsigned row);
-SPVC_PUBLIC_API unsigned spvc_constant_get_scalar_u8(spvc_constant constant, unsigned column, unsigned row);
-SPVC_PUBLIC_API int spvc_constant_get_scalar_i8(spvc_constant constant, unsigned column, unsigned row);
-SPVC_PUBLIC_API void spvc_constant_get_subconstants(spvc_constant constant, const spvc_constant_id **constituents, size_t *count);
-SPVC_PUBLIC_API unsigned long long spvc_constant_get_scalar_u64(spvc_constant constant, unsigned column, unsigned row);
-SPVC_PUBLIC_API long long spvc_constant_get_scalar_i64(spvc_constant constant, unsigned column, unsigned row);
-SPVC_PUBLIC_API spvc_type_id spvc_constant_get_type(spvc_constant constant);
+	SPVC_PUBLIC_API float spvc_constant_get_scalar_fp16(spvc_constant constant, unsigned column, unsigned row);
+	SPVC_PUBLIC_API float spvc_constant_get_scalar_fp32(spvc_constant constant, unsigned column, unsigned row);
+	SPVC_PUBLIC_API double spvc_constant_get_scalar_fp64(spvc_constant constant, unsigned column, unsigned row);
+	SPVC_PUBLIC_API unsigned spvc_constant_get_scalar_u32(spvc_constant constant, unsigned column, unsigned row);
+	SPVC_PUBLIC_API int spvc_constant_get_scalar_i32(spvc_constant constant, unsigned column, unsigned row);
+	SPVC_PUBLIC_API unsigned spvc_constant_get_scalar_u16(spvc_constant constant, unsigned column, unsigned row);
+	SPVC_PUBLIC_API int spvc_constant_get_scalar_i16(spvc_constant constant, unsigned column, unsigned row);
+	SPVC_PUBLIC_API unsigned spvc_constant_get_scalar_u8(spvc_constant constant, unsigned column, unsigned row);
+	SPVC_PUBLIC_API int spvc_constant_get_scalar_i8(spvc_constant constant, unsigned column, unsigned row);
+	SPVC_PUBLIC_API void spvc_constant_get_subconstants(spvc_constant constant, const spvc_constant_id **constituents,
+	                                                    size_t *count);
+	SPVC_PUBLIC_API unsigned long long spvc_constant_get_scalar_u64(spvc_constant constant, unsigned column,
+	                                                                unsigned row);
+	SPVC_PUBLIC_API long long spvc_constant_get_scalar_i64(spvc_constant constant, unsigned column, unsigned row);
+	SPVC_PUBLIC_API spvc_type_id spvc_constant_get_type(spvc_constant constant);
 
-/*
+	/*
  * C implementation of the C++ api.
  */
-SPVC_PUBLIC_API void spvc_constant_set_scalar_fp16(spvc_constant constant, unsigned column, unsigned row, unsigned short value);
-SPVC_PUBLIC_API void spvc_constant_set_scalar_fp32(spvc_constant constant, unsigned column, unsigned row, float value);
-SPVC_PUBLIC_API void spvc_constant_set_scalar_fp64(spvc_constant constant, unsigned column, unsigned row, double value);
-SPVC_PUBLIC_API void spvc_constant_set_scalar_u32(spvc_constant constant, unsigned column, unsigned row, unsigned value);
-SPVC_PUBLIC_API void spvc_constant_set_scalar_i32(spvc_constant constant, unsigned column, unsigned row, int value);
-SPVC_PUBLIC_API void spvc_constant_set_scalar_u64(spvc_constant constant, unsigned column, unsigned row, unsigned long long value);
-SPVC_PUBLIC_API void spvc_constant_set_scalar_i64(spvc_constant constant, unsigned column, unsigned row, long long value);
-SPVC_PUBLIC_API void spvc_constant_set_scalar_u16(spvc_constant constant, unsigned column, unsigned row, unsigned short value);
-SPVC_PUBLIC_API void spvc_constant_set_scalar_i16(spvc_constant constant, unsigned column, unsigned row, signed short value);
-SPVC_PUBLIC_API void spvc_constant_set_scalar_u8(spvc_constant constant, unsigned column, unsigned row, unsigned char value);
-SPVC_PUBLIC_API void spvc_constant_set_scalar_i8(spvc_constant constant, unsigned column, unsigned row, signed char value);
+	SPVC_PUBLIC_API void spvc_constant_set_scalar_fp16(spvc_constant constant, unsigned column, unsigned row,
+	                                                   unsigned short value);
+	SPVC_PUBLIC_API void spvc_constant_set_scalar_fp32(spvc_constant constant, unsigned column, unsigned row,
+	                                                   float value);
+	SPVC_PUBLIC_API void spvc_constant_set_scalar_fp64(spvc_constant constant, unsigned column, unsigned row,
+	                                                   double value);
+	SPVC_PUBLIC_API void spvc_constant_set_scalar_u32(spvc_constant constant, unsigned column, unsigned row,
+	                                                  unsigned value);
+	SPVC_PUBLIC_API void spvc_constant_set_scalar_i32(spvc_constant constant, unsigned column, unsigned row, int value);
+	SPVC_PUBLIC_API void spvc_constant_set_scalar_u64(spvc_constant constant, unsigned column, unsigned row,
+	                                                  unsigned long long value);
+	SPVC_PUBLIC_API void spvc_constant_set_scalar_i64(spvc_constant constant, unsigned column, unsigned row,
+	                                                  long long value);
+	SPVC_PUBLIC_API void spvc_constant_set_scalar_u16(spvc_constant constant, unsigned column, unsigned row,
+	                                                  unsigned short value);
+	SPVC_PUBLIC_API void spvc_constant_set_scalar_i16(spvc_constant constant, unsigned column, unsigned row,
+	                                                  signed short value);
+	SPVC_PUBLIC_API void spvc_constant_set_scalar_u8(spvc_constant constant, unsigned column, unsigned row,
+	                                                 unsigned char value);
+	SPVC_PUBLIC_API void spvc_constant_set_scalar_i8(spvc_constant constant, unsigned column, unsigned row,
+	                                                 signed char value);
 
-/*
+	/*
  * Misc reflection
  * Maps to C++ API.
  */
-SPVC_PUBLIC_API spvc_bool spvc_compiler_get_binary_offset_for_decoration(spvc_compiler compiler,
-                                                                         spvc_variable_id id,
-                                                                         SpvDecoration decoration,
-                                                                         unsigned *word_offset);
-
-SPVC_PUBLIC_API spvc_bool spvc_compiler_buffer_is_hlsl_counter_buffer(spvc_compiler compiler, spvc_variable_id id);
-SPVC_PUBLIC_API spvc_bool spvc_compiler_buffer_get_hlsl_counter_buffer(spvc_compiler compiler, spvc_variable_id id,
-                                                                       spvc_variable_id *counter_id);
-
-SPVC_PUBLIC_API spvc_result spvc_compiler_get_declared_capabilities(spvc_compiler compiler,
-                                                                    const SpvCapability **capabilities,
-                                                                    size_t *num_capabilities);
-SPVC_PUBLIC_API spvc_result spvc_compiler_get_declared_extensions(spvc_compiler compiler, const char ***extensions,
-                                                                  size_t *num_extensions);
-
-SPVC_PUBLIC_API const char *spvc_compiler_get_remapped_declared_block_name(spvc_compiler compiler, spvc_variable_id id);
-SPVC_PUBLIC_API spvc_result spvc_compiler_get_buffer_block_decorations(spvc_compiler compiler, spvc_variable_id id,
-                                                                       const SpvDecoration **decorations,
-                                                                       size_t *num_decorations);
+	SPVC_PUBLIC_API spvc_bool spvc_compiler_get_binary_offset_for_decoration(spvc_compiler compiler,
+	                                                                         spvc_variable_id id,
+	                                                                         SpvDecoration decoration,
+	                                                                         unsigned *word_offset);
+
+	SPVC_PUBLIC_API spvc_bool spvc_compiler_buffer_is_hlsl_counter_buffer(spvc_compiler compiler, spvc_variable_id id);
+	SPVC_PUBLIC_API spvc_bool spvc_compiler_buffer_get_hlsl_counter_buffer(spvc_compiler compiler, spvc_variable_id id,
+	                                                                       spvc_variable_id *counter_id);
+
+	SPVC_PUBLIC_API spvc_result spvc_compiler_get_declared_capabilities(spvc_compiler compiler,
+	                                                                    const SpvCapability **capabilities,
+	                                                                    size_t *num_capabilities);
+	SPVC_PUBLIC_API spvc_result spvc_compiler_get_declared_extensions(spvc_compiler compiler, const char ***extensions,
+	                                                                  size_t *num_extensions);
+
+	SPVC_PUBLIC_API const char *spvc_compiler_get_remapped_declared_block_name(spvc_compiler compiler,
+	                                                                           spvc_variable_id id);
+	SPVC_PUBLIC_API spvc_result spvc_compiler_get_buffer_block_decorations(spvc_compiler compiler, spvc_variable_id id,
+	                                                                       const SpvDecoration **decorations,
+	                                                                       size_t *num_decorations);
 
 #ifdef __cplusplus
 }
diff --git a/spirv_opencl.cpp b/spirv_opencl.cpp
new file mode 100644
index 000000000..68c447aa3
--- /dev/null
+++ b/spirv_opencl.cpp
@@ -0,0 +1,1067 @@
+/*
+ * Copyright 2016-2021 The Brenwill Workshop Ltd.
+ * SPDX-License-Identifier: Apache-2.0 OR MIT
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * At your option, you may choose to accept this material under either:
+ * 1. The Apache License, Version 2.0, found at <http://www.apache.org/licenses/LICENSE-2.0>, or
+ * 2. The MIT License, found at <http://opensource.org/licenses/MIT>.
+ */
+
+#include "spirv_opencl.hpp"
+#include "GLSL.std.450.h"
+
+#include <algorithm>
+#include <iostream>
+#include <utility>
+
+using namespace SPIRV_CROSS_SPV_HEADER_NAMESPACE;
+using namespace SPIRV_CROSS_NAMESPACE;
+using namespace std;
+
+// Builds the compiler from an owned SPIR-V word vector, which is moved into the CompilerGLSL base.
+CompilerOpenCL::CompilerOpenCL(vector<uint32_t> spirv_) : CompilerGLSL(std::move(spirv_))
+{
+}
+
+// Builds the compiler from a SPIR-V word pointer and word count, both passed straight to CompilerGLSL.
+CompilerOpenCL::CompilerOpenCL(const uint32_t *ir_, size_t word_count) : CompilerGLSL(ir_, word_count)
+{
+}
+
+// Builds the compiler from an existing ParsedIR, handed to CompilerGLSL by const reference.
+CompilerOpenCL::CompilerOpenCL(const ParsedIR &ir_) : CompilerGLSL(ir_)
+{
+}
+
+// Builds the compiler from a ParsedIR rvalue, which is moved into the CompilerGLSL base.
+CompilerOpenCL::CompilerOpenCL(ParsedIR &&ir_) : CompilerGLSL(std::move(ir_))
+{
+}
+
+// Produces the OpenCL C source: sets up the backend, runs the analyses, then emits until no recompile is forced.
+string CompilerOpenCL::compile()
+{
+	if (get_execution_model() != ExecutionModelGLCompute)
+		SPIRV_CROSS_THROW("OpenCL backend only supports compute shaders (ExecutionModelGLCompute).");
+
+	ir.fixup_reserved_names();
+	// Pin the inherited options: Vulkan semantics, non-ES, version 450.
+	options.vulkan_semantics = true;
+	options.es = false;
+	options.version = 450;
+	// Literal spellings and scalar type names.
+	backend.null_pointer_literal = "NULL";
+	backend.float_literal_suffix = true;
+	backend.double_literal_suffix = true;
+	backend.uint32_t_literal_suffix = true;
+	backend.int16_t_literal_suffix = "s";
+	backend.uint16_t_literal_suffix = "us";
+	backend.basic_int_type = "int";
+	backend.basic_uint_type = "uint";
+	backend.basic_int8_type = "char";
+	backend.basic_uint8_type = "uchar";
+	backend.basic_int16_type = "short";
+	backend.basic_uint16_type = "ushort";
+	// Remaining backend capability flags.
+	backend.boolean_mix_function = "mix";
+	backend.swizzle_is_function = false;
+	backend.shared_is_implied = false;
+	backend.use_initializer_list = true;
+	backend.use_typed_initializer_list = true;
+	backend.native_row_major_matrix = false;
+	backend.unsized_array_supported = false;
+	backend.can_declare_arrays_inline = false;
+	backend.allow_truncated_access_chain = true;
+	backend.comparison_image_samples_scalar = true;
+	backend.native_pointers = true;
+	backend.nonuniform_qualifier = "";
+	backend.supports_empty_struct = true;
+	backend.support_64bit_switch = opencl_options.enable_64bit_atomics; // NOTE(review): keyed on the atomics option — confirm.
+	backend.boolean_in_struct_remapped_type = SPIRType::Boolean;
+	backend.discard_literal = "";
+	backend.demote_literal = "";
+	backend.workgroup_size_is_hidden = false;
+	backend.supports_extensions = true;
+	backend.force_gl_in_out_block = false;
+	backend.force_merged_mesh_block = false;
+	backend.array_is_value_type = false;
+	backend.array_is_value_type_in_buffer_blocks = false;
+	backend.support_pointer_to_pointer = true;
+	backend.implicit_c_integer_promotion_rules = true;
+	backend.supports_spec_constant_array_size = false;
+
+	// Module-wide fixups and analyses, performed once ahead of emission.
+	fixup_anonymous_struct_names();
+	fixup_type_alias();
+	replace_illegal_names();
+	build_function_control_flow_graphs_and_analyze();
+	update_active_builtins();
+	analyze_image_and_sampler_usage();
+	analyze_interlocked_resource_usage();
+	set_enabled_interface_variables(get_active_interface_variables());
+	reorder_type_alias();
+
+	// Start over from an empty buffer for as long as the finished pass forces a recompile.
+	for (uint32_t pass = 0;; pass++)
+	{
+		reset(pass);
+		buffer.reset();
+		emit_header();
+		emit_specialization_constants_and_structs();
+		emit_resources();
+		emit_function(get<SPIRFunction>(ir.default_entry_point), Bitset());
+		if (!is_forcing_recompilation())
+			break;
+	}
+	return buffer.str();
+}
+
+bool CompilerOpenCL::specialization_constant_is_macro(uint32_t const_id) const
+{
+	return constant_macro_ids.count(const_id) != 0; // True when const_id is present in constant_macro_ids.
+}
+
+// Writes the banner comment, the extension pragmas selected by opencl_options, and then any header_lines.
+void CompilerOpenCL::emit_header()
+{
+	const bool cl20_or_newer = opencl_options.opencl_version >= 200;
+	statement("// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)");
+	statement("");
+	if (cl20_or_newer) // NOTE(review): confirm 3D image writes should be requested for >= 200 rather than < 200.
+		statement("#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable");
+	if (opencl_options.enable_fp64)
+		statement("#pragma OPENCL EXTENSION cl_khr_fp64 : enable");
+	if (cl20_or_newer && opencl_options.enable_64bit_atomics)
+		statement("#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable");
+	statement("");
+	for (auto &line : header_lines)
+		statement(line);
+	if (!header_lines.empty())
+		statement("");
+}
+
+// Always yields an empty qualifier string; the variable argument is ignored.
+const char *CompilerOpenCL::to_storage_qualifiers_glsl(const SPIRVariable &)
+{
+	return ""; // OpenCL carries the address space in the type rather than in a GLSL-style "uniform" qualifier.
+}
+
+void CompilerOpenCL::emit_resources()
+{
+	this->replace_illegal_names(); // Only re-runs the reserved-name pass; no resource declarations are written here.
+}
+
+void CompilerOpenCL::replace_illegal_names()
+{
+	static const unordered_set<string> keywords = { // OpenCL C type names, qualifiers and constant macros to avoid.
+		"char",
+		"char2",
+		"char3",
+		"char4",
+		"char8",
+		"char16",
+		"uchar",
+		"uchar2",
+		"uchar3",
+		"uchar4",
+		"uchar8",
+		"uchar16",
+		"short",
+		"short2",
+		"short3",
+		"short4",
+		"short8",
+		"short16",
+		"ushort",
+		"ushort2",
+		"ushort3",
+		"ushort4",
+		"ushort8",
+		"ushort16",
+		"int",
+		"int2",
+		"int3",
+		"int4",
+		"int8",
+		"int16",
+		"uint",
+		"uint2",
+		"uint3",
+		"uint4",
+		"uint8",
+		"uint16",
+		"long",
+		"long2",
+		"long3",
+		"long4",
+		"long8",
+		"long16",
+		"ulong",
+		"ulong2",
+		"ulong3",
+		"ulong4",
+		"ulong8",
+		"ulong16",
+		"float",
+		"float2",
+		"float3",
+		"float4",
+		"float8",
+		"float16",
+		"double",
+		"double2",
+		"double3",
+		"double4",
+		"double8",
+		"double16",
+		"bool",
+		"bool2",
+		"bool3",
+		"bool4",
+		"bool8",
+		"bool16",
+		"half",
+		"half2",
+		"half3",
+		"half4",
+		"half8",
+		"half16",
+		"quad",
+		"quad2",
+		"quad3",
+		"quad4",
+		"quad8",
+		"quad16",
+		"complex",
+		"imaginary",
+		"__global",
+		"global",
+		"__local",
+		"local",
+		"__constant",
+		"constant",
+		"__private",
+		"private",
+		"image1d_t",
+		"image1d_buffer_t",
+		"image1d_array_t",
+		"image2d_t",
+		"image2d_array_t",
+		"image2d_depth_t",
+		"image2d_array_depth_t",
+		"image3d_t",
+		"sampler_t",
+		"event_t",
+		"clk_event_t",
+		"ndrange_t",
+		"queue_t",
+		"reserve_id_t",
+		"__kernel",
+		"kernel",
+		"__read_only",
+		"read_only",
+		"__write_only",
+		"write_only",
+		"__read_write",
+		"read_write",
+		"atomic",
+		"pipe",
+		"MAXFLOAT",
+		"HUGE_VALF",
+		"INFINITY",
+		"NAN",
+		"HUGE_VAL",
+		"M_E_F",
+		"M_LOG2E_F",
+		"M_LOG10E_F",
+		"M_LN2_F",
+		"M_LN10_F",
+		"M_PI_F",
+		"M_PI_2_F",
+		"M_PI_4_F",
+		"M_1_PI_F",
+		"M_2_PI_F",
+		"M_2_SQRTPI_F",
+		"M_SQRT2_F",
+		"M_SQRT1_2_F",
+	};
+	// Rename against the OpenCL list first, then let the base class apply its own default list.
+	CompilerGLSL::replace_illegal_names(keywords);
+	CompilerGLSL::replace_illegal_names();
+}
+
+// Emits the reqd_work_group_size attribute line for the current entry point.
+// A dimension stored as 0 is written out as 1.
+// NOTE(review): only the literal workgroup_size fields are read here — confirm spec-constant sizes are covered.
+void CompilerOpenCL::emit_workgroup_size_attribute()
+{
+	const auto &wg = get_entry_point().workgroup_size;
+
+	// For unsigned values, max(v, 1) turns 0 into 1 and leaves everything else untouched.
+	const uint32_t dim_x = std::max<uint32_t>(wg.x, 1u);
+	const uint32_t dim_y = std::max<uint32_t>(wg.y, 1u);
+	const uint32_t dim_z = std::max<uint32_t>(wg.z, 1u);
+
+	statement("__attribute__((reqd_work_group_size(", dim_x, ", ", dim_y, ", ", dim_z, ")))");
+}
+
+// Declares entry-point locals for the active compute builtins so that builtin_to_glsl() has a name
+// to return for each of them. Does nothing outside the entry point or for non-GLCompute models.
+void CompilerOpenCL::emit_entry_point_declarations()
+{
+	if (!processing_entry_point || get_entry_point().model != ExecutionModelGLCompute)
+		return;
+
+	// One declaration per builtin, listed in emission order.
+	static const struct
+	{
+		BuiltIn builtin;
+		const char *declaration;
+	} locals[] = {
+		{ BuiltInWorkgroupId,
+		  "uint3 spvWorkgroupId = (uint3)(get_group_id(0), get_group_id(1), get_group_id(2));" },
+		{ BuiltInLocalInvocationId,
+		  "uint3 spvLocalInvocationId = (uint3)(get_local_id(0), get_local_id(1), get_local_id(2));" },
+		{ BuiltInGlobalInvocationId,
+		  "uint3 spvGlobalInvocationId = (uint3)(get_global_id(0), get_global_id(1), get_global_id(2));" },
+		{ BuiltInNumWorkgroups,
+		  "uint3 spvNumWorkgroups = (uint3)(get_num_groups(0), get_num_groups(1), get_num_groups(2));" },
+		{ BuiltInWorkgroupSize,
+		  "uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));" },
+		{ BuiltInLocalInvocationIndex,
+		  "uint spvLocalInvocationIndex = get_local_id(2) * get_local_size(0) * get_local_size(1) + "
+		  "get_local_id(1) * get_local_size(0) + get_local_id(0);" },
+		{ BuiltInGlobalSize,
+		  "uint3 spvGlobalSize = (uint3)(get_global_size(0), get_global_size(1), get_global_size(2));" },
+	};
+
+	for (const auto &local : locals)
+		if (active_input_builtins.get(local.builtin))
+			statement(local.declaration);
+}
+
+// Returns the name of the entry-point local that stands in for a compute builtin.
+// Raises SPIRV_CROSS_THROW for subgroup builtins (not implemented) and for any other builtin (unsupported).
+string CompilerOpenCL::builtin_to_glsl(BuiltIn builtin, StorageClass storage)
+{
+	(void)storage; // The storage class plays no part in the lookup.
+
+	static const pair<BuiltIn, const char *> local_names[] = {
+		{ BuiltInWorkgroupId, "spvWorkgroupId" },
+		{ BuiltInLocalInvocationId, "spvLocalInvocationId" },
+		{ BuiltInGlobalInvocationId, "spvGlobalInvocationId" },
+		{ BuiltInNumWorkgroups, "spvNumWorkgroups" },
+		{ BuiltInWorkgroupSize, "spvWorkgroupSize" },
+		{ BuiltInLocalInvocationIndex, "spvLocalInvocationIndex" },
+		{ BuiltInGlobalSize, "spvGlobalSize" },
+	};
+
+	for (const auto &entry : local_names)
+		if (entry.first == builtin)
+			return entry.second;
+
+	// Subgroup builtins are recognised so they can be reported with their own message.
+	const bool is_subgroup_builtin = builtin == BuiltInNumSubgroups || builtin == BuiltInSubgroupId ||
+	                                 builtin == BuiltInSubgroupSize || builtin == BuiltInSubgroupLocalInvocationId;
+	if (is_subgroup_builtin)
+		SPIRV_CROSS_THROW("OpenCL subgroup builtins not yet implemented.");
+
+	SPIRV_CROSS_THROW("Unsupported builtin for OpenCL compute shader.");
+}
+
+bool CompilerOpenCL::builtin_translates_to_nonarray(BuiltIn builtin) const
+{
+	static_cast<void>(builtin); // The argument is intentionally not inspected.
+	return false; // No builtin is reported as translating to a non-array.
+}
+
+// OpenCL pointers need an address-space qualifier. For a variable it is looked up through
+// get_type_address_space() using the variable's base type and ID, with argument = true.
+string CompilerOpenCL::get_variable_address_space(const SPIRVariable &argument)
+{
+	return get_type_address_space(get<SPIRType>(argument.basetype), argument.self, true);
+}
+
+string CompilerOpenCL::get_type_address_space(const SPIRType &type, uint32_t id, bool argument)
+{
+	// This can be called for variable pointer contexts as well, so be very careful about which method we choose.
+	Bitset flags;
+	auto *var = maybe_get<SPIRVariable>(id);
+	if (var && type.basetype == SPIRType::Struct &&
+	    (has_decoration(type.self, DecorationBlock) || has_decoration(type.self, DecorationBufferBlock)))
+		flags = get_buffer_block_flags(id);
+	else
+	{
+		flags = get_decoration_bitset(id);
+
+		if (type.basetype == SPIRType::Struct &&
+		    (has_decoration(type.self, DecorationBlock) || has_decoration(type.self, DecorationBufferBlock)))
+		{
+			flags.merge_or(ir.get_buffer_block_type_flags(type));
+		}
+	}
+	// NOTE(review): `flags` and `argument` are never read below; the result depends only on type.storage.
+	const char *addr_space = "";
+	switch (type.storage)
+	{
+	case StorageClassUniform:
+	case StorageClassStorageBuffer:
+		addr_space = "__global";
+		break;
+	case StorageClassUniformConstant:
+	case StorageClassPushConstant:
+		addr_space = "__constant";
+		break;
+	case StorageClassWorkgroup:
+		addr_space = "__local";
+		break;
+	default:
+		// __private is default and would be redundant
+		break;
+	}
+	return addr_space; // Empty string when no explicit qualifier is emitted.
+}
+
+const char *CompilerOpenCL::to_restrict(uint32_t id, bool space)
+{
+	// This can be called for variable pointer contexts as well, so be very careful about which method we choose.
+	Bitset flags;
+	if (ir.ids[id].get_type() == TypeVariable)
+	{
+		uint32_t type_id = expression_type_id(id);
+		auto &type = expression_type(id);
+		if (type.basetype == SPIRType::Struct && // Block structs: take the flags from the buffer block.
+		    (has_decoration(type_id, DecorationBlock) || has_decoration(type_id, DecorationBufferBlock)))
+			flags = get_buffer_block_flags(id);
+		else
+			flags = get_decoration_bitset(id);
+	}
+	else
+		flags = get_decoration_bitset(id); // Not a variable: only the decorations on the ID itself apply.
+
+	return flags.get(DecorationRestrict) || flags.get(DecorationRestrictPointerEXT) ?
+	           (space ? "__restrict " : "__restrict") : // `space` requests a trailing blank after the keyword.
+	           "";
+}
+
+string CompilerOpenCL::type_to_glsl(const SPIRType &type, uint32_t id, bool member)
+{
+	string type_name; // Returns the OpenCL C spelling of `type`; `member` is only forwarded to image_type_glsl().
+
+	// Pointer?
+	if (is_pointer(type) || type_is_array_of_pointers(type))
+	{
+		assert(type.pointer_depth > 0);
+
+		const char *restrict_kw;
+
+		auto type_address_space = get_type_address_space(type, id);
+		const auto *p_parent_type = &get<SPIRType>(type.parent_type);
+
+		// Work around C pointer qualifier rules. If glsl_type is a pointer type as well
+		// we'll need to emit the address space to the right.
+		// We could always go this route, but it makes the code unnatural.
+		// Prefer emitting __global T *foo over T __global *foo since it's more readable,
+		// but a pointer-to-pointer has to come out as e.g. __global T* __global *bar.
+		if (is_pointer(type) && is_pointer(*p_parent_type))
+			type_name = join(type_to_glsl(*p_parent_type, id), " ", type_address_space, " ");
+		else
+		{
+			// Since this is not a pointer-to-pointer, ensure we've dug down to the base type.
+			// Some situations chain pointers even though they are not formally pointers-of-pointers.
+			while (is_pointer(*p_parent_type))
+				p_parent_type = &get<SPIRType>(p_parent_type->parent_type);
+
+			// NOTE(review): with an empty address space the joined name starts with a blank.
+			type_name = join(type_address_space, " ", type_to_glsl(*p_parent_type, id));
+		}
+
+		switch (type.basetype)
+		{
+		case SPIRType::Image:
+		case SPIRType::SampledImage:
+		case SPIRType::Sampler:
+			// These are handles.
+			break;
+		default:
+			// Anything else can be a raw pointer.
+			type_name += "*";
+			restrict_kw = to_restrict(id, false);
+			if (*restrict_kw) // Append "__restrict" only when the ID is decorated as restrict.
+			{
+				type_name += " ";
+				type_name += restrict_kw;
+			}
+			break;
+		}
+		return type_name;
+	}
+
+	switch (type.basetype)
+	{
+	case SPIRType::Struct:
+		// Need OpName lookup here to get a "sensible" name for a struct.
+		type_name = to_name(type.self);
+		break;
+
+	case SPIRType::Image:
+	case SPIRType::SampledImage:
+		return image_type_glsl(type, id, member);
+
+	case SPIRType::Sampler:
+		return "sampler_t";
+
+	case SPIRType::Void:
+		return "void";
+
+	case SPIRType::AtomicCounter:
+		return "atomic_uint";
+
+	// Scalars
+	case SPIRType::Boolean:
+		type_name = "bool";
+		break;
+
+	case SPIRType::Char:
+	case SPIRType::SByte:
+		type_name = "char";
+		break;
+	case SPIRType::UByte:
+		type_name = "uchar";
+		break;
+	case SPIRType::Short:
+		type_name = "short";
+		break;
+	case SPIRType::UShort:
+		type_name = "ushort";
+		break;
+	case SPIRType::Int:
+		type_name = "int";
+		break;
+	case SPIRType::UInt:
+		type_name = "uint";
+		break;
+	case SPIRType::Int64:
+		type_name = "long";
+		break;
+	case SPIRType::UInt64:
+		type_name = "ulong";
+		break;
+	case SPIRType::Half:
+		type_name = "half";
+		break;
+	case SPIRType::Float:
+		type_name = "float";
+		break;
+	case SPIRType::Double:
+		if (!opencl_options.enable_fp64) // Doubles are rejected unless the fp64 option is enabled.
+			SPIRV_CROSS_THROW("Double requires cl_khr_fp64.");
+		type_name = "double";
+		break;
+
+	default:
+		return "unknown_type"; // Placeholder name for base types this backend does not handle.
+	}
+
+	// Vector? NOTE(review): type.columns is not consulted, so a matrix type gets its column vector's name.
+	if (type.vecsize > 1)
+		type_name += to_string(type.vecsize);
+
+	return type_name;
+}
+
+string CompilerOpenCL::type_to_glsl(const SPIRType &type, uint32_t id)
+{
+	return type_to_glsl(type, id, false); // Two-argument form: forwards with member = false.
+}
+
+string CompilerOpenCL::image_type_glsl(const SPIRType &type, uint32_t id, bool member)
+{
+	(void)id; // Unused: decorations on the variable are not consulted for the access qualifier.
+	(void)member;
+	if (type.basetype != SPIRType::Image) // Anything else, including SPIRType::SampledImage, yields "".
+		return "";
+
+	bool readonly = type.image.sampled != 2; // Access is derived solely from the image type's `sampled` field.
+	const char *access = readonly ? "read_only" : "write_only"; // NOTE(review): sampled == 2 is always write_only.
+	switch (type.image.dim)
+	{
+	case Dim1D:
+		return join(access, " image1d_t");
+	case Dim2D:
+		return type.image.arrayed ? join(access, " image2d_array_t") : join(access, " image2d_t");
+	case Dim3D:
+		return join(access, " image3d_t");
+	case DimCube:
+		return join(access, " image2d_t"); // OpenCL has no cube; use 2D
+	case DimBuffer:
+		return join(access, " image1d_buffer_t");
+	default:
+		SPIRV_CROSS_THROW("Unsupported image dimension for OpenCL.");
+	}
+}
+
+std::string CompilerOpenCL::type_to_glsl_constructor(const SPIRType &type)
+{
+	string ret = CompilerGLSL::type_to_glsl_constructor(type);
+	// Wrap the base-class constructor name in parentheses so it reads as a cast, e.g. (float4)(...).
+	if (!ret.empty())
+		ret = join("(", ret, ")");
+	return ret;
+}
+
+// GCC workaround of lambdas calling protected funcs: a pure forwarder to the CompilerGLSL implementation.
+std::string CompilerOpenCL::variable_decl(const SPIRType &type, const std::string &name, uint32_t id)
+{
+	return CompilerGLSL::variable_decl(type, name, id);
+}
+
+std::string CompilerOpenCL::entry_point_args(bool append_comma)
+{
+	// Reset flattening maps for this compilation pass
+	flattened_buffer_vars.clear();
+	push_const_member_map.clear();
+
+	std::string ep_args; // Comma-separated kernel parameter list built up below and returned.
+
+	struct Resource // NOTE(review): this struct and `resources` below are not used in this function.
+	{
+		SPIRVariable *var;
+		SPIRVariable *discrete_descriptor_alias;
+		string name;
+		SPIRType::BaseType basetype;
+		uint32_t index;
+		uint32_t plane;
+		uint32_t secondary_index;
+	};
+
+	SmallVector<Resource> resources;
+
+	ir.for_each_typed_id<SPIRVariable>(
+	    [&](uint32_t var_id, SPIRVariable &var)
+	    {
+		    auto &type = get_variable_data_type(var);
+		    /*
+		    if (var.storage == StorageClassPushConstant)
+		    {
+			    for (uint32_t mbr_idx = 0; mbr_idx < uint32_t(type.member_types.size()); mbr_idx++)
+			    {
+				    if (!ep_args.empty())
+					    ep_args += ", ";
+
+				    auto mbr_name = to_member_name(type, mbr_idx);
+					const auto &member_type = this->get<SPIRType>(type.member_types[mbr_idx]);
+				    ep_args += join(this->type_to_glsl(member_type), " ", mbr_name);
+				    // Record the mapping so emit_instruction can rewrite access chains
+				    push_const_member_map[var_id][mbr_idx] = mbr_name;
+			    }
+		    }
+			*/
+		    // Storage buffers become a flat "__global [const] T* name" parameter.
+		    if (var.storage == StorageClassStorageBuffer || has_decoration(type.self, DecorationBufferBlock))
+		    {
+			    Bitset flags = ir.get_buffer_block_flags(var);
+			    bool is_readonly = flags.get(DecorationNonWritable);
+
+			    auto to_structuredbuffer_subtype_name = [this](const SPIRType &parent_type) -> std::string
+			    {
+				    if (parent_type.basetype == SPIRType::Struct && parent_type.member_types.size() == 1)
+				    {
+					    // Use type of first struct member as a StructuredBuffer will have only one '._m0' field in SPIR-V
+					    const auto &member0_type = this->get<SPIRType>(parent_type.member_types.front());
+					    return this->type_to_glsl(member0_type);
+				    }
+				    else
+				    {
+					    // Otherwise, this StructuredBuffer only has a basic subtype, e.g. StructuredBuffer<int>
+					    return this->type_to_glsl(parent_type);
+				    }
+			    };
+			    if (!ep_args.empty())
+				    ep_args += ", ";
+
+			    ep_args += join("__global ", is_readonly ? "const " : "", to_structuredbuffer_subtype_name(type), "* ",
+			                    to_name(var_id));
+			    // Record so emit_instruction can rewrite OpAccessChain against this var
+			    flattened_buffer_vars.insert(var_id);
+		    }
+		    else if ((var.storage == StorageClassUniform || var.storage == StorageClassUniformConstant ||
+		              var.storage == StorageClassPushConstant || var.storage == StorageClassStorageBuffer) &&
+		             !is_hidden_variable(var)) // The StorageBuffer test here is unreachable (taken by the branch above).
+		    {
+			    // NOTE(review): var.basetype is the variable's type ID, while the case labels are
+			    // SPIRType::BaseType values; `type.basetype` may have been intended. As written the
+			    // default branch is what normally runs - confirm before changing.
+			    switch (var.basetype)
+			    {
+			    case SPIRType::Struct:
+			    {
+				    break;
+			    }
+			    case SPIRType::Sampler:
+				    break;
+			    case SPIRType::Image:
+			    {
+				    if (!ep_args.empty())
+					    ep_args += ", ";
+
+				    ep_args += type_to_glsl(type, var_id) + " " + to_name(var_id);
+				    break;
+			    }
+			    case SPIRType::AccelerationStructure:
+			    {
+				    break;
+			    }
+			    default:
+				    if (!ep_args.empty())
+					    ep_args += ", ";
+
+				    ep_args += type_to_glsl(type, var_id) + " " + to_name(var_id);
+				    break;
+			    }
+		    }
+	    });
+
+	if (!ep_args.empty() && append_comma) // Trailing separator so the caller can append further arguments.
+		ep_args += ", ";
+
+	return ep_args;
+}
+
+string CompilerOpenCL::get_inner_entry_point_name() const
+{
+	return "comp_main"; // Fixed name; emit_function_prototype() uses it for the default entry point.
+}
+
+void CompilerOpenCL::emit_function_prototype(SPIRFunction &func, const Bitset &return_flags)
+{
+	(void)return_flags; // Not used by this backend.
+	string decl;
+	if (func.self == ir.default_entry_point)
+	{
+		// Only the entry point is a kernel: it alone gets the work-group size attribute and __kernel void.
+		emit_workgroup_size_attribute();
+		decl = join("__kernel void ", get_inner_entry_point_name());
+		processing_entry_point = true;
+	}
+	else
+	{
+		// Helper functions are plain functions and keep their declared return type.
+		add_function_overload(func);
+		decl = join(type_to_glsl(get<SPIRType>(func.return_type)), " ", to_name(func.self));
+	}
+	decl += "(";
+
+	if (processing_entry_point)
+	{
+		decl += entry_point_args(!func.arguments.empty());
+
+		// append entry point args to avoid conflicts in local variable names.
+		local_variable_names.insert(resource_names.begin(), resource_names.end());
+	}
+
+	for (auto &arg : func.arguments)
+	{
+		add_local_variable_name(arg.id);
+
+		decl += argument_decl(arg);
+		if (&arg != &func.arguments.back())
+			decl += ", ";
+
+		// Hold a pointer to the parameter so we can invalidate the readonly field if needed.
+		auto *var = maybe_get<SPIRVariable>(arg.id);
+		if (var)
+			var->parameter = &arg;
+	}
+
+	decl += ")";
+	statement(decl);
+}
+
+void CompilerOpenCL::emit_specialization_constants_and_structs()
+{
+	SpecializationConstant wg_x, wg_y, wg_z;
+	ID workgroup_size_id = get_work_group_size_specialization_constants(wg_x, wg_y, wg_z);
+
+	bool emitted = false; // True while the last thing written still needs a blank separator line after it.
+	unordered_set<uint32_t> declared_structs;
+	unordered_set<uint32_t> aligned_structs;
+
+	// Very particular use of the soft loop lock.
+	// align_struct may need to create custom types on the fly, but we don't care about
+	// these types for purpose of iterating over them in ir.ids_for_type and friends.
+	auto loop_lock = ir.create_loop_soft_lock();
+
+	// Physical storage buffer pointers can have cyclical references,
+	// so emit forward declarations of them before other structs.
+	// Ignore type_id because we want the underlying struct type from the pointer.
+	ir.for_each_typed_id<SPIRType>(
+	    [&](uint32_t /* type_id */, const SPIRType &type)
+	    {
+		    if (type.basetype == SPIRType::Struct && type.pointer &&
+		        type.storage == StorageClassPhysicalStorageBuffer && declared_structs.count(type.self) == 0)
+		    {
+			    statement("struct ", to_name(type.self), ";");
+			    declared_structs.insert(type.self);
+			    emitted = true;
+		    }
+	    });
+	if (emitted)
+		statement("");
+
+	emitted = false;
+	declared_structs.clear();
+
+	// It is possible to have multiple spec constants that use the same spec constant ID.
+	// The most common cause of this is defining spec constants in GLSL while also declaring
+	// the workgroup size to use those spec constants.
+	// NOTE(review): this map is declared but never read or written in this function. Every
+	// specialization constant below is emitted independently, so constants that share a
+	// SpecId each get their own declaration guarded by the same override macro.
+	// Either use the map to de-duplicate those declarations or remove it.
+	std::unordered_map<uint32_t, ConstantID> unique_func_constants;
+
+	for (auto &id_ : ir.ids_for_constant_undef_or_type)
+	{
+		auto &id = ir.ids[id_];
+
+		if (id.get_type() == TypeConstant)
+		{
+			auto &c = id.get<SPIRConstant>();
+
+			if (c.specialization)
+			{
+				auto &type = get<SPIRType>(c.constant_type);
+				string sc_type_name = type_to_glsl(type);
+				add_resource_name(c.self);
+				string sc_name = to_name(c.self);
+
+				// Specialization constants are only supported in SPIR-V not OpenCL C.
+				// Just declare the "default" directly.
+				if (has_decoration(c.self, DecorationSpecId))
+				{
+					// Fallback to macro overrides.
+					uint32_t constant_id = get_decoration(c.self, DecorationSpecId);
+					c.specialization_constant_macro_name = constant_value_macro_name(constant_id);
+
+					statement("#ifndef ", c.specialization_constant_macro_name);
+					statement("#define ", c.specialization_constant_macro_name, " ", constant_expression(c));
+					statement("#endif");
+					statement("constant ", sc_type_name, " ", sc_name, " = ", c.specialization_constant_macro_name,
+					          ";");
+
+					// Record the usage of macro
+					constant_macro_ids.insert(constant_id);
+				}
+				else
+				{
+					// Composite specialization constants must be built from other specialization constants.
+					statement("constant ", sc_type_name, " ", sc_name, " = ", constant_expression(c), ";");
+				}
+				emitted = true;
+			}
+		}
+		else if (id.get_type() == TypeConstantOp)
+		{
+			auto &c = id.get<SPIRConstantOp>();
+			auto &type = get<SPIRType>(c.basetype);
+			add_resource_name(c.self);
+			auto name = to_name(c.self);
+			statement("constant ", variable_decl(type, name), " = ", constant_op_expression(c), ";");
+			emitted = true;
+		}
+		else if (id.get_type() == TypeType)
+		{
+			// Output non-builtin interface structs. These include local function structs
+			// and structs nested within uniform and read-write buffers.
+			auto &type = id.get<SPIRType>();
+			TypeID type_id = type.self;
+
+			bool is_struct = (type.basetype == SPIRType::Struct) && type.array.empty() && !type.pointer;
+			bool is_block =
+			    has_decoration(type.self, DecorationBlock) || has_decoration(type.self, DecorationBufferBlock);
+
+			bool is_builtin_block = is_block && is_builtin_type(type);
+			bool is_declarable_struct = is_struct && !is_builtin_block;
+
+			// Align and emit declarable structs...but avoid declaring each more than once.
+			if (is_declarable_struct && declared_structs.count(type_id) == 0)
+			{
+				if (emitted)
+					statement("");
+				emitted = false;
+
+				declared_structs.insert(type_id);
+
+				// Make sure we declare the underlying struct type, and not the "decorated" type with pointers, etc.
+				emit_struct(get<SPIRType>(type_id));
+			}
+		}
+		else if (id.get_type() == TypeUndef)
+		{
+			auto &undef = id.get<SPIRUndef>();
+			auto &type = get<SPIRType>(undef.basetype);
+			// OpUndef can be void for some reason ... skip just this ID; the remaining IDs must still be emitted.
+			if (type.basetype == SPIRType::Void)
+				continue;
+		}
+	}
+
+	if (emitted)
+		statement("");
+}
+
+void CompilerOpenCL::emit_instruction(const Instruction &instruction)
+{
+	auto ops = stream(instruction);
+	auto opcode = static_cast<Op>(instruction.op);
+
+	// Map buffer atomics to OpenCL C names (atomic_add, atomic_sub, etc.)
+	auto opencl_atomic = [this, ops](const char *opencl_op)
+	{
+		if (check_atomic_image(ops[2]))
+			SPIRV_CROSS_THROW("Image atomics not yet implemented for OpenCL.");
+		emit_atomic_func_op(ops[0], ops[1], ops[2], ops[5], opencl_op);
+	};
+
+	switch (opcode) // Atomics and access chains are handled here; everything else goes to CompilerGLSL.
+	{
+	case OpAtomicExchange:
+		if (check_atomic_image(ops[2]))
+			SPIRV_CROSS_THROW("Image atomics not yet implemented for OpenCL.");
+		emit_atomic_func_op(ops[0], ops[1], ops[2], ops[5], "atomic_xchg");
+		break;
+	case OpAtomicCompareExchange:
+		if (check_atomic_image(ops[2]))
+			SPIRV_CROSS_THROW("Image atomics not yet implemented for OpenCL.");
+		// OpenCL atomic_cmpxchg(ptr, expected, desired)
+		emit_atomic_func_op(ops[0], ops[1], ops[2], ops[7], ops[6], "atomic_cmpxchg");
+		break;
+	case OpAtomicIAdd:
+	case OpAtomicFAddEXT:
+		opencl_atomic("atomic_add");
+		break;
+	case OpAtomicISub:
+	{
+		if (check_atomic_image(ops[2]))
+			SPIRV_CROSS_THROW("Image atomics not yet implemented for OpenCL.");
+		forced_temporaries.insert(ops[1]);
+		auto expr = join("atomic_sub(", to_atomic_ptr_expression(ops[2]), ", ", to_enclosed_expression(ops[5]), ")");
+		emit_op(ops[0], ops[1], expr, should_forward(ops[2]) && should_forward(ops[5]));
+		flush_all_atomic_capable_variables();
+		break;
+	}
+	case OpAtomicSMin:
+	case OpAtomicUMin:
+		opencl_atomic("atomic_min");
+		break;
+	case OpAtomicSMax:
+	case OpAtomicUMax:
+		opencl_atomic("atomic_max");
+		break;
+	case OpAtomicAnd:
+		opencl_atomic("atomic_and");
+		break;
+	case OpAtomicOr:
+		opencl_atomic("atomic_or");
+		break;
+	case OpAtomicXor:
+		opencl_atomic("atomic_xor");
+		break;
+	case OpAtomicLoad: // Expressed as an atomic add of zero, whose result is the loaded value.
+	{
+		if (check_atomic_image(ops[2]))
+			SPIRV_CROSS_THROW("Image atomics not yet implemented for OpenCL.");
+		auto &type = expression_type(ops[2]);
+		forced_temporaries.insert(ops[1]);
+		bool unsigned_type = (type.basetype == SPIRType::UInt);
+		const char *increment = unsigned_type ? "0u" : "0";
+		emit_op(ops[0], ops[1], join("atomic_add(", to_atomic_ptr_expression(ops[2]), ", ", increment, ")"), false);
+		flush_all_atomic_capable_variables();
+		break;
+	}
+	case OpAtomicStore: // Expressed as an exchange whose result is discarded.
+	{
+		if (check_atomic_image(ops[0]))
+			SPIRV_CROSS_THROW("Image atomics not yet implemented for OpenCL.");
+		statement("atomic_xchg(", to_atomic_ptr_expression(ops[0]), ", ", to_expression(ops[3]), ");");
+		flush_all_atomic_capable_variables();
+		break;
+	}
+	case OpAtomicIIncrement:
+	case OpAtomicIDecrement:
+	{
+		if (check_atomic_image(ops[2]))
+			SPIRV_CROSS_THROW("Image atomics not yet implemented for OpenCL.");
+		forced_temporaries.insert(ops[1]);
+		auto &type = expression_type(ops[2]);
+		bool unsigned_type = (type.basetype == SPIRType::UInt); // OpenCL C has no constructor-style casts:
+		const char *inc = (opcode == OpAtomicIIncrement && unsigned_type) ? "1u" :
+		                  (opcode == OpAtomicIIncrement)                  ? "1" :
+		                  unsigned_type                                   ? "(uint)(-1)" :
+		                                                                    "-1";
+		emit_op(ops[0], ops[1], join("atomic_add(", to_atomic_ptr_expression(ops[2]), ", ", inc, ")"), false);
+		flush_all_atomic_capable_variables();
+		break;
+	}
+	case OpAccessChain:
+	case OpInBoundsAccessChain:
+	{
+		uint32_t result_type = ops[0];
+		uint32_t result_id = ops[1];
+		uint32_t base_id = ops[2];
+		uint32_t length = instruction.length;
+
+		// SSBO flattened to __global T*: rewrite [base, member_0, element_idx] → base[element_idx]
+		if (flattened_buffer_vars.count(base_id) && length >= 5)
+		{
+			// ops[3] = struct member index (always 0 for single-member SSBO) — skip
+			// ops[4] = element index within the runtime array
+			auto expr = join(to_name(base_id), "[", to_expression(ops[4]), "]");
+			auto &e = set<SPIRExpression>(result_id, std::move(expr), result_type, true);
+			auto *backing_var = maybe_get_backing_variable(base_id);
+			e.loaded_from = backing_var ? backing_var->self : ID(base_id);
+			e.access_chain = true;
+			forwarded_temporaries.insert(result_id);
+			suppressed_usage_tracking.insert(result_id);
+			for (uint32_t i = 2; i < length; i++)
+				inherit_expression_dependencies(result_id, ops[i]);
+			if (get<SPIRExpression>(result_id).expression_dependencies.empty())
+				forwarded_temporaries.erase(result_id);
+			break; // NOTE(review): indices beyond ops[4] (length > 5) are not applied to the expression.
+		}
+
+		// Push constant expanded to scalar params: rewrite [p_var, member_idx] → scalar param name
+		auto push_it = push_const_member_map.find(base_id);
+		if (push_it != push_const_member_map.end() && length >= 4)
+		{
+			uint32_t mbr_idx = get<SPIRConstant>(ops[3]).scalar();
+			auto name_it = push_it->second.find(mbr_idx);
+			if (name_it != push_it->second.end())
+			{
+				auto &e = set<SPIRExpression>(result_id, name_it->second, result_type, false);
+				e.loaded_from = base_id;
+				e.access_chain = true;
+				break;
+			}
+		}
+
+		// Fall through to base class for all other access chains
+		CompilerGLSL::emit_instruction(instruction);
+		break;
+	}
+
+	default:
+		CompilerGLSL::emit_instruction(instruction);
+		break;
+	}
+}
diff --git a/spirv_opencl.hpp b/spirv_opencl.hpp
new file mode 100644
index 000000000..da64673d8
--- /dev/null
+++ b/spirv_opencl.hpp
@@ -0,0 +1,125 @@
+/*
+ * Copyright 2016-2021 The Brenwill Workshop Ltd.
+ * SPDX-License-Identifier: Apache-2.0 OR MIT
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * At your option, you may choose to accept this material under either:
+ * 1. The Apache License, Version 2.0, found at <http://www.apache.org/licenses/LICENSE-2.0>, or
+ * 2. The MIT License, found at <http://opensource.org/licenses/MIT>.
+ */
+
+#ifndef SPIRV_CROSS_OPENCL_HPP
+#define SPIRV_CROSS_OPENCL_HPP
+
+#include "spirv_glsl.hpp"
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+namespace SPIRV_CROSS_NAMESPACE
+{
+using namespace SPIRV_CROSS_SPV_HEADER_NAMESPACE;
+
+// Decompiles SPIR-V (compute only) to OpenCL C
+class CompilerOpenCL : public CompilerGLSL
+{
+public:
+	struct Options
+	{
+		// OpenCL C version: 120 = 1.2, 200 = 2.0
+		uint32_t opencl_version = make_opencl_version(1, 2);
+		// Enable cl_khr_fp64 (double) extension
+		bool enable_fp64 = false;
+		// Enable cl_khr_int64_extended_atomics extension
+		bool enable_64bit_atomics = false;
+
+		void set_opencl_version(uint32_t major, uint32_t minor = 0, uint32_t patch = 0)
+		{
+			opencl_version = make_opencl_version(major, minor, patch);
+		}
+
+		bool supports_opencl_version(uint32_t major, uint32_t minor = 0, uint32_t patch = 0) const
+		{
+			return opencl_version >= make_opencl_version(major, minor, patch);
+		}
+
+		static uint32_t make_opencl_version(uint32_t major, uint32_t minor = 0, uint32_t patch = 0)
+		{
+			return (major * 100) + (minor * 10) + patch; // Packs as major*100 + minor*10 + patch, e.g. 1.2 -> 120.
+		}
+	};
+
+	explicit CompilerOpenCL(std::vector<uint32_t> spirv_);
+	CompilerOpenCL(const uint32_t *ir_, size_t word_count);
+	explicit CompilerOpenCL(const ParsedIR &ir_);
+	explicit CompilerOpenCL(ParsedIR &&ir_);
+
+	const Options &get_opencl_options() const
+	{
+		return opencl_options;
+	}
+	void set_opencl_options(const Options &opts)
+	{
+		opencl_options = opts;
+	}
+
+	std::string compile() override;
+
+	// Information about specialization constants that are translated into macros
+	// instead of using constant declarations.
+	// These must only be called after a successful call to CompilerOpenCL::compile().
+	bool specialization_constant_is_macro(uint32_t constant_id) const;
+
+protected:
+	void emit_header() override;
+	void emit_resources();
+	void emit_specialization_constants_and_structs();
+	std::string type_to_glsl(const SPIRType &type, uint32_t id, bool member);
+	std::string type_to_glsl(const SPIRType &type, uint32_t id = 0) override;
+	std::string type_to_glsl_constructor(const SPIRType &type); // Defined in spirv_opencl.cpp; was undeclared.
+	std::string builtin_to_glsl(BuiltIn builtin, StorageClass storage) override;
+	std::string image_type_glsl(const SPIRType &type, uint32_t id = 0, bool member = false) override;
+	const char *to_storage_qualifiers_glsl(const SPIRVariable &var) override;
+	void emit_entry_point_declarations() override;
+	// GCC workaround of lambdas calling protected functions (for older GCC versions)
+	std::string variable_decl(const SPIRType &type, const std::string &name, uint32_t id = 0) override;
+	void emit_function_prototype(SPIRFunction &func, const Bitset &return_flags) override;
+	void emit_instruction(const Instruction &instruction) override;
+	bool builtin_translates_to_nonarray(BuiltIn builtin) const override;
+	std::string get_variable_address_space(const SPIRVariable &argument);
+	std::string get_type_address_space(const SPIRType &type, uint32_t id, bool argument = false);
+	const char *to_restrict(uint32_t id, bool space);
+	void replace_illegal_names() override;
+
+	Options opencl_options;
+
+	// SSBO variables emitted as flat element pointers (__global T*) in the kernel signature
+	std::unordered_set<uint32_t> flattened_buffer_vars;
+	// Push-constant variable → { member_index → scalar param name }
+	std::unordered_map<uint32_t, std::unordered_map<uint32_t, std::string>> push_const_member_map;
+
+	std::unordered_set<uint32_t> constant_macro_ids; // SpecIds whose constants were emitted through an override macro.
+
+	void emit_workgroup_size_attribute();
+
+	std::string entry_point_args(bool append_comma);
+	std::string get_inner_entry_point_name() const;
+};
+
+} // namespace SPIRV_CROSS_NAMESPACE
+
+#endif
diff --git a/test_shaders.py b/test_shaders.py
index 2e019f091..b3f87fc7f 100755
--- a/test_shaders.py
+++ b/test_shaders.py
@@ -33,12 +33,13 @@
 from functools import partial
 
 class Paths():
-    def __init__(self, spirv_cross, glslang, spirv_as, spirv_val, spirv_opt):
+    def __init__(self, spirv_cross, glslang, spirv_as, spirv_val, spirv_opt, clang):
         self.spirv_cross = spirv_cross
         self.glslang = glslang
         self.spirv_as = spirv_as
         self.spirv_val = spirv_val
         self.spirv_opt = spirv_opt
+        self.clang = clang
 
 def remove_file(path):
     #print('Removing file:', path)
@@ -57,7 +58,7 @@ def parse_stats(stats):
     m = re.search('([0-9]+) uniform registers', stats)
     uniform_regs = int(m.group(1)) if m else 0
 
-    m_list = re.findall('(-?[0-9]+)\s+(-?[0-9]+)\s+(-?[0-9]+)', stats)
+    m_list = re.findall(r'(-?[0-9]+)\s+(-?[0-9]+)\s+(-?[0-9]+)', stats)
     alu_short = float(m_list[1][0]) if m_list else 0
     ls_short = float(m_list[1][1]) if m_list else 0
     tex_short = float(m_list[1][2]) if m_list else 0
@@ -584,6 +585,104 @@ def cross_compile_hlsl(shader, spirv, opt, force_no_external_validation, iterati
 
     return (spirv_path, hlsl_path)
 
+def path_to_opencl_standard(shader):  # Returns the '-cl-std=...' flag selected by a '.clNN.' tag in the path.
+    if '.cl30.' in shader:
+        return '-cl-std=CL3.0'
+    elif '.cl22.' in shader:
+        return '-cl-std=CL2.2'
+    elif '.cl21.' in shader:
+        return '-cl-std=CL2.1'
+    elif '.cl20.' in shader:
+        return '-cl-std=CL2.0'
+    else:
+        return '-cl-std=CL1.2'  # Default when the path carries no version tag.
+
+def path_to_opencl_standard_cli(shader):  # Same '.clNN.' tag lookup, returned as a numeric version string.
+    if '.cl30.' in shader:
+        return '300'
+    elif '.cl22.' in shader:
+        return '220'
+    elif '.cl21.' in shader:
+        return '210'
+    elif '.cl20.' in shader:
+        return '200'
+    else:
+        return '120'  # Default when the path carries no version tag.
+
+ignore_clang = False
+def validate_shader_opencl(shader, opt, paths):
+    shader = reference_path(shader[0], shader[1], opt)
+    extensions = []
+
+    global ignore_clang
+    try:
+        defines = ['-D' + ext for ext in extensions]
+        version = path_to_opencl_standard_cli(shader)
+        subprocess.check_call([paths.clang, '-Xclang',
+                               path_to_opencl_standard(shader),
+                               '-D__OPENCL_C_VERSION__=' + version,
+                               '-D__OPENCL_VERSION__=' + version] + defines +
+                              [
+                               '-emit-llvm', '-target', 'spir64-unknown-unknown',
+                               '-Xclang', '-finclude-default-header', '-x', 'cl', '-c', shader])
+
+    except OSError as oe:
+        if (oe.errno != errno.ENOENT):   # Ignore clang not found error
+            raise
+        print('clang does not exist, ignoring further attempts to use it.')
+        ignore_clang = True
+    except subprocess.CalledProcessError:
+        print('Error compiling OpenCL kernel: ' + shader)
+        raise RuntimeError('Failed to compile OpenCL kernel')
+
+def cross_compile_opencl(shader, spirv, opt, iterations, paths):  # Returns (spirv_path, opencl_path) temporaries.
+    spirv_path = create_temporary()
+    opencl_path = create_temporary(os.path.basename(shader))
+
+    spirv_16 = '.spv16.' in shader  # '.spv16.' / '.spv14.' tags in the path pick the target environments below.
+    spirv_14 = '.spv14.' in shader
+
+    if spirv_16:
+        spirv_env = 'spv1.6'
+        glslang_env = 'vulkan1.3'
+    elif spirv_14:
+        spirv_env = 'vulkan1.1spv1.4'
+        glslang_env = 'spirv1.4'
+    else:
+        spirv_env = 'vulkan1.1'
+        glslang_env = 'vulkan1.1'
+
+    spirv_cmd = [paths.spirv_as, '--preserve-numeric-ids', '--target-env', spirv_env, '-o', spirv_path, shader]
+
+    if spirv:  # `spirv` inputs are assembled with spirv-as; anything else is compiled with glslang.
+        subprocess.check_call(spirv_cmd)
+    else:
+        glslang_cmd = [paths.glslang, '--amb' ,'--target-env', glslang_env, '-V', '-o', spirv_path, shader]
+        if '.g.' in shader:
+            glslang_cmd.append('-g')
+        if '.gV.' in shader:
+            glslang_cmd.append('-gV')
+        subprocess.check_call(glslang_cmd)
+
+    if opt and (not shader_is_invalid_spirv(shader)):  # Optimise in place (output path == input path).
+        if '.graphics-robust-access.' in shader:
+            subprocess.check_call([paths.spirv_opt, '--skip-validation', '-O', '--graphics-robust-access', '-o', spirv_path, spirv_path])
+        else:
+            subprocess.check_call([paths.spirv_opt, '--skip-validation', '-O', '-o', spirv_path, spirv_path])
+
+    spirv_cross_path = paths.spirv_cross
+
+    opencl_args = [spirv_cross_path, '--output', opencl_path, spirv_path, '--opencl', '--iterations', str(iterations)]
+    opencl_args.append('--opencl-version')
+    opencl_args.append(path_to_opencl_standard_cli(shader))
+
+    subprocess.check_call(opencl_args)
+
+    if not shader_is_invalid_spirv(opencl_path):  # The SPIR-V input is validated only after cross-compilation.
+        subprocess.check_call([paths.spirv_val, '--allow-localsizeid', '--scalar-block-layout', '--target-env', spirv_env, spirv_path])
+
+    return (spirv_path, opencl_path)
+
 def cross_compile_reflect(shader, spirv, opt, iterations, paths):
     spirv_path = create_temporary()
     reflect_path = create_temporary(os.path.basename(shader))
@@ -942,6 +1041,24 @@ def test_shader_hlsl(stats, shader, args, paths):
     regression_check(shader, hlsl, args)
     remove_file(spirv)
 
+def test_shader_opencl(stats, shader, args, paths):  # `shader` is a (folder, relative path) pair.
+    joined_path = os.path.join(shader[0], shader[1])
+
+    if os.path.splitext(joined_path)[1] == '.cl':  # Files ending in '.cl' are skipped, not cross-compiled.
+        return
+
+    print('Testing OpenCL kernel:', joined_path)
+    is_spirv = shader_is_spirv(shader[1])
+    noopt = shader_is_noopt(shader[1])
+    spirv, opencl = cross_compile_opencl(joined_path, is_spirv, args.opt and (not noopt), args.iterations, paths)
+    regression_check(shader, opencl, args)
+
+    skip_validation = '.invalid.' in joined_path  # '.invalid.' in the path opts out of clang validation.
+    if (not args.force_no_external_validation) and (not skip_validation):
+        validate_shader_opencl(shader, args.opt, paths)
+
+    remove_file(spirv)  # Only the temporary SPIR-V file is removed here.
+
 def test_shader_reflect(stats, shader, args, paths):
     joined_path = os.path.join(shader[0], shader[1])
     print('Testing shader reflection:', joined_path)
@@ -952,12 +1069,14 @@ def test_shader_reflect(stats, shader, args, paths):
     remove_file(spirv)
 
 def test_shader_file(relpath, stats, args, backend):
-    paths = Paths(args.spirv_cross, args.glslang, args.spirv_as, args.spirv_val, args.spirv_opt)
+    paths = Paths(args.spirv_cross, args.glslang, args.spirv_as, args.spirv_val, args.spirv_opt, args.clang)
     try:
         if backend == 'msl':
             test_shader_msl(stats, (args.folder, relpath), args, paths)
         elif backend == 'hlsl':
             test_shader_hlsl(stats, (args.folder, relpath), args, paths)
+        elif backend == 'opencl':
+            test_shader_opencl(stats, (args.folder, relpath), args, paths)
         elif backend == 'reflect':
             test_shader_reflect(stats, (args.folder, relpath), args, paths)
         else:
@@ -1033,6 +1152,9 @@ def main():
     parser.add_argument('--hlsl',
             action = 'store_true',
             help = 'Test HLSL backend.')
+    parser.add_argument('--opencl',
+            action = 'store_true',
+            help = 'Test OpenCL backend.')
     parser.add_argument('--force-no-external-validation',
             action = 'store_true',
             help = 'Disable all external validation.')
@@ -1060,6 +1182,9 @@ def main():
     parser.add_argument('--spirv-opt',
             default = 'spirv-opt',
             help = 'Explicit path to spirv-opt')
+    parser.add_argument('--clang',
+            default = 'clang',
+            help = 'Explicit path to clang')
     parser.add_argument('--iterations',
             default = 1,
             type = int,
@@ -1082,6 +1207,8 @@ def main():
         backend = 'msl'
     elif args.hlsl:
         backend = 'hlsl'
+    elif args.opencl:
+        backend = 'opencl'
     elif args.reflect:
         backend = 'reflect'
 

From 5c943a9383d49bf6c3fd15b4c7fabe474cee6efd Mon Sep 17 00:00:00 2001
From: Garrick Meeker <gmeeker@gmail.com>
Date: Thu, 12 Mar 2026 18:17:18 -0700
Subject: [PATCH 02/16] Adding shaders-opencl with more OpenCL backend support

---
 Package.swift                                 |    2 +
 main.cpp                                      |    6 +
 ...rent-structured-buffer.structured.asm.frag |   42 +-
 ...rent-structured-buffer.structured.asm.frag |   42 +-
 .../asm/comp/atomic-decrement.asm.comp        |   19 +
 .../asm/comp/atomic-increment.asm.comp        |   19 +
 .../asm/comp/bitcast_iadd.asm.comp            |   32 +
 .../asm/comp/bitcast_icmp.asm.comp            |   32 +
 .../asm/comp/bitcast_sar.asm.comp             |   34 +
 .../asm/comp/bitcast_sdiv.asm.comp            |   34 +
 .../asm/comp/bitcast_slr.asm.comp             |   34 +
 .../asm/comp/block-name-alias-global.asm.comp |   48 +
 .../comp/buffer-write-relative-addr.asm.comp  |   24 +
 .../asm/comp/buffer-write.asm.comp            |   16 +
 .../comp/copy-object-ssbo-to-ssbo.asm.comp    |   24 +
 .../asm/comp/copy-object-ubo-to-ssbo.asm.comp |   24 +
 .../asm/comp/duplicate-spec-id.asm.comp       |   26 +
 .../asm/comp/fma.spv16.asm.comp               |   23 +
 .../comp/global-parameter-name-alias.asm.comp |   30 +
 ...e-load-store-short-vector.invalid.asm.comp |   18 +
 ...p-spec-constant-op-vector-related.asm.comp |   78 +
 .../shaders-opencl/asm/comp/quantize.asm.comp |   35 +
 .../asm/comp/relaxed-block-layout.asm.comp    |   23 +
 .../comp/replicated-composites.spv16.asm.comp |   28 +
 ...specialization-constant-workgroup.asm.comp |   26 +
 .../struct-resource-name-aliasing.asm.comp    |   22 +
 .../asm/comp/uint_smulextended.asm.comp       |   28 +
 .../undefined-constant-composite.asm.comp     |   40 +
 ...undefined-spec-constant-composite.asm.comp |   46 +
 .../asm/comp/variable-pointers-2.asm.comp     |   68 +
 ...ariable-pointers-store-forwarding.asm.comp |   35 +
 .../vector-builtin-type-cast-func.asm.comp    |   34 +
 .../comp/vector-builtin-type-cast.asm.comp    |   24 +
 .../access-private-workgroup-in-function.comp |   36 +
 reference/shaders-opencl/comp/arguments.comp  |   25 +
 reference/shaders-opencl/comp/atomic.comp     |   53 +
 reference/shaders-opencl/comp/barriers.comp   |   80 +
 reference/shaders-opencl/comp/basic.comp      |   37 +
 .../comp/basic.dispatchbase.comp              |   43 +
 .../comp/buffer-push-const.comp               |   25 +
 reference/shaders-opencl/comp/builtins.comp   |   15 +
 .../comp/cfg-preserve-parameter.comp          |   75 +
 .../comp/complex-type-alias.comp              |   56 +
 .../comp/composite-construct.comp             |   37 +
 reference/shaders-opencl/comp/culling.comp    |   37 +
 .../shaders-opencl/comp/defer-parens.comp     |   25 +
 reference/shaders-opencl/comp/dowhile.comp    |   34 +
 .../shaders-opencl/comp/expect-assume.comp    |   17 +
 .../comp/force-recompile-hooks.swizzle.comp   |   12 +
 reference/shaders-opencl/comp/functions.comp  |   15 +
 .../comp/global-invocation-id.comp            |   18 +
 reference/shaders-opencl/comp/image.comp      |   11 +
 reference/shaders-opencl/comp/insert.comp     |   23 +
 .../comp/local-invocation-id.comp             |   18 +
 .../comp/local-invocation-index.comp          |   18 +
 .../comp/local-size-duplicate-spec-id.comp    |   42 +
 reference/shaders-opencl/comp/mod.comp        |   30 +
 reference/shaders-opencl/comp/modf.comp       |   37 +
 .../shaders-opencl/comp/outer-product.comp    |   42 +
 .../shaders-opencl/comp/packing-test-1.comp   |   36 +
 .../shaders-opencl/comp/packing-test-2.comp   |   33 +
 .../shaders-opencl/comp/read-write-only.comp  |   35 +
 reference/shaders-opencl/comp/rmw-opt.comp    |   31 +
 ...alar-std450-distance-length-normalize.comp |   25 +
 .../comp/shared-std450.double.comp            |   31 +
 .../comp/shared-struct-bool-cast.comp         |  106 +
 .../comp/shared-zero-init-simple.comp         |   27 +
 .../shaders-opencl/comp/shared-zero-init.comp |   32 +
 reference/shaders-opencl/comp/shared.comp     |   30 +
 .../comp/spec-constant-work-group-size.comp   |   39 +
 .../shaders-opencl/comp/struct-layout.comp    |   32 +
 .../shaders-opencl/comp/struct-nested.comp    |   33 +
 .../comp/struct-packing.invalid.comp          |    0
 .../shaders-opencl/comp/torture-loop.comp     |   55 +
 reference/shaders-opencl/comp/type-alias.comp |   61 +
 reference/shaders-opencl/comp/udiv.comp       |   24 +
 .../shaders-opencl/comp/writable-ssbo.comp    |   18 +
 .../asm/comp/atomic-decrement.asm.comp        |   71 +
 .../asm/comp/atomic-increment.asm.comp        |   71 +
 shaders-opencl/asm/comp/bitcast_iadd.asm.comp |   79 +
 shaders-opencl/asm/comp/bitcast_icmp.asm.comp |  101 +
 shaders-opencl/asm/comp/bitcast_sar.asm.comp  |   77 +
 shaders-opencl/asm/comp/bitcast_sdiv.asm.comp |   77 +
 shaders-opencl/asm/comp/bitcast_slr.asm.comp  |   77 +
 .../asm/comp/block-name-alias-global.asm.comp |  119 ++
 .../comp/buffer-write-relative-addr.asm.comp  |   93 +
 shaders-opencl/asm/comp/buffer-write.asm.comp |   59 +
 .../comp/copy-object-ssbo-to-ssbo.asm.comp    |   43 +
 .../asm/comp/copy-object-ubo-to-ssbo.asm.comp |   43 +
 .../asm/comp/duplicate-spec-id.asm.comp       |   54 +
 shaders-opencl/asm/comp/fma.spv16.asm.comp    |   65 +
 .../comp/global-parameter-name-alias.asm.comp |  102 +
 ...e-load-store-short-vector.invalid.asm.comp |   75 +
 ...p-spec-constant-op-vector-related.asm.comp |  107 +
 shaders-opencl/asm/comp/quantize.asm.comp     |   67 +
 .../asm/comp/relaxed-block-layout.asm.comp    |  108 +
 .../comp/replicated-composites.spv16.asm.comp |   81 +
 ...specialization-constant-workgroup.asm.comp |   47 +
 .../struct-resource-name-aliasing.asm.comp    |   49 +
 .../asm/comp/uint_smulextended.asm.comp       |   61 +
 .../undefined-constant-composite.asm.comp     |  102 +
 ...undefined-spec-constant-composite.asm.comp |  122 ++
 .../asm/comp/variable-pointers-2.asm.comp     |  117 +
 ...ariable-pointers-store-forwarding.asm.comp |   75 +
 .../vector-builtin-type-cast-func.asm.comp    |  147 ++
 .../comp/vector-builtin-type-cast.asm.comp    |  128 ++
 .../access-private-workgroup-in-function.comp |   31 +
 shaders-opencl/comp/arguments.comp            |   13 +
 shaders-opencl/comp/atomic.comp               |   56 +
 shaders-opencl/comp/barriers.comp             |   79 +
 shaders-opencl/comp/basic.comp                |   28 +
 shaders-opencl/comp/basic.dispatchbase.comp   |   29 +
 shaders-opencl/comp/buffer-push-const.comp    |    9 +
 shaders-opencl/comp/builtins.comp             |   12 +
 .../comp/cfg-preserve-parameter.comp          |   54 +
 shaders-opencl/comp/complex-type-alias.comp   |   41 +
 shaders-opencl/comp/composite-construct.comp  |   31 +
 shaders-opencl/comp/culling.comp              |   26 +
 shaders-opencl/comp/defer-parens.comp         |   30 +
 shaders-opencl/comp/dowhile.comp              |   31 +
 shaders-opencl/comp/expect-assume.comp        |   19 +
 .../comp/force-recompile-hooks.swizzle.comp   |    9 +
 shaders-opencl/comp/functions.comp            |   12 +
 shaders-opencl/comp/global-invocation-id.comp |    9 +
 shaders-opencl/comp/image.comp                |   12 +
 shaders-opencl/comp/insert.comp               |   18 +
 shaders-opencl/comp/local-invocation-id.comp  |    9 +
 .../comp/local-invocation-index.comp          |    9 +
 .../comp/local-size-duplicate-spec-id.comp    |   15 +
 shaders-opencl/comp/mod.comp                  |   26 +
 shaders-opencl/comp/modf.comp                 |   23 +
 shaders-opencl/comp/outer-product.comp        |   37 +
 shaders-opencl/comp/packing-test-1.comp       |   18 +
 shaders-opencl/comp/packing-test-2.comp       |   16 +
 shaders-opencl/comp/read-write-only.comp      |   26 +
 shaders-opencl/comp/rmw-opt.comp              |   27 +
 ...alar-std450-distance-length-normalize.comp |   20 +
 shaders-opencl/comp/shared-std450.double.comp |   27 +
 .../comp/shared-struct-bool-cast.comp         |   35 +
 .../comp/shared-zero-init-simple.comp         |   24 +
 shaders-opencl/comp/shared-zero-init.comp     |   28 +
 shaders-opencl/comp/shared.comp               |   27 +
 .../comp/spec-constant-work-group-size.comp   |   17 +
 shaders-opencl/comp/struct-layout.comp        |   24 +
 shaders-opencl/comp/struct-nested.comp        |   20 +
 .../comp/struct-packing.invalid.comp          |   77 +
 shaders-opencl/comp/torture-loop.comp         |   40 +
 shaders-opencl/comp/type-alias.comp           |   45 +
 shaders-opencl/comp/udiv.comp                 |   17 +
 shaders-opencl/comp/writable-ssbo.comp        |    9 +
 spirv_glsl.hpp                                |   64 +-
 spirv_opencl.cpp                              | 1874 +++++++++++++++--
 spirv_opencl.hpp                              |   55 +
 test_shaders.py                               |   19 +-
 test_shaders.sh                               |    2 +
 155 files changed, 7911 insertions(+), 210 deletions(-)
 create mode 100644 reference/shaders-opencl/asm/comp/atomic-decrement.asm.comp
 create mode 100644 reference/shaders-opencl/asm/comp/atomic-increment.asm.comp
 create mode 100644 reference/shaders-opencl/asm/comp/bitcast_iadd.asm.comp
 create mode 100644 reference/shaders-opencl/asm/comp/bitcast_icmp.asm.comp
 create mode 100644 reference/shaders-opencl/asm/comp/bitcast_sar.asm.comp
 create mode 100644 reference/shaders-opencl/asm/comp/bitcast_sdiv.asm.comp
 create mode 100644 reference/shaders-opencl/asm/comp/bitcast_slr.asm.comp
 create mode 100644 reference/shaders-opencl/asm/comp/block-name-alias-global.asm.comp
 create mode 100644 reference/shaders-opencl/asm/comp/buffer-write-relative-addr.asm.comp
 create mode 100644 reference/shaders-opencl/asm/comp/buffer-write.asm.comp
 create mode 100644 reference/shaders-opencl/asm/comp/copy-object-ssbo-to-ssbo.asm.comp
 create mode 100644 reference/shaders-opencl/asm/comp/copy-object-ubo-to-ssbo.asm.comp
 create mode 100644 reference/shaders-opencl/asm/comp/duplicate-spec-id.asm.comp
 create mode 100644 reference/shaders-opencl/asm/comp/fma.spv16.asm.comp
 create mode 100644 reference/shaders-opencl/asm/comp/global-parameter-name-alias.asm.comp
 create mode 100644 reference/shaders-opencl/asm/comp/image-load-store-short-vector.invalid.asm.comp
 create mode 100644 reference/shaders-opencl/asm/comp/op-spec-constant-op-vector-related.asm.comp
 create mode 100644 reference/shaders-opencl/asm/comp/quantize.asm.comp
 create mode 100644 reference/shaders-opencl/asm/comp/relaxed-block-layout.asm.comp
 create mode 100644 reference/shaders-opencl/asm/comp/replicated-composites.spv16.asm.comp
 create mode 100644 reference/shaders-opencl/asm/comp/specialization-constant-workgroup.asm.comp
 create mode 100644 reference/shaders-opencl/asm/comp/struct-resource-name-aliasing.asm.comp
 create mode 100644 reference/shaders-opencl/asm/comp/uint_smulextended.asm.comp
 create mode 100644 reference/shaders-opencl/asm/comp/undefined-constant-composite.asm.comp
 create mode 100644 reference/shaders-opencl/asm/comp/undefined-spec-constant-composite.asm.comp
 create mode 100644 reference/shaders-opencl/asm/comp/variable-pointers-2.asm.comp
 create mode 100644 reference/shaders-opencl/asm/comp/variable-pointers-store-forwarding.asm.comp
 create mode 100644 reference/shaders-opencl/asm/comp/vector-builtin-type-cast-func.asm.comp
 create mode 100644 reference/shaders-opencl/asm/comp/vector-builtin-type-cast.asm.comp
 create mode 100644 reference/shaders-opencl/comp/access-private-workgroup-in-function.comp
 create mode 100644 reference/shaders-opencl/comp/arguments.comp
 create mode 100644 reference/shaders-opencl/comp/atomic.comp
 create mode 100644 reference/shaders-opencl/comp/barriers.comp
 create mode 100644 reference/shaders-opencl/comp/basic.comp
 create mode 100644 reference/shaders-opencl/comp/basic.dispatchbase.comp
 create mode 100644 reference/shaders-opencl/comp/buffer-push-const.comp
 create mode 100644 reference/shaders-opencl/comp/builtins.comp
 create mode 100644 reference/shaders-opencl/comp/cfg-preserve-parameter.comp
 create mode 100644 reference/shaders-opencl/comp/complex-type-alias.comp
 create mode 100644 reference/shaders-opencl/comp/composite-construct.comp
 create mode 100644 reference/shaders-opencl/comp/culling.comp
 create mode 100644 reference/shaders-opencl/comp/defer-parens.comp
 create mode 100644 reference/shaders-opencl/comp/dowhile.comp
 create mode 100644 reference/shaders-opencl/comp/expect-assume.comp
 create mode 100644 reference/shaders-opencl/comp/force-recompile-hooks.swizzle.comp
 create mode 100644 reference/shaders-opencl/comp/functions.comp
 create mode 100644 reference/shaders-opencl/comp/global-invocation-id.comp
 create mode 100644 reference/shaders-opencl/comp/image.comp
 create mode 100644 reference/shaders-opencl/comp/insert.comp
 create mode 100644 reference/shaders-opencl/comp/local-invocation-id.comp
 create mode 100644 reference/shaders-opencl/comp/local-invocation-index.comp
 create mode 100644 reference/shaders-opencl/comp/local-size-duplicate-spec-id.comp
 create mode 100644 reference/shaders-opencl/comp/mod.comp
 create mode 100644 reference/shaders-opencl/comp/modf.comp
 create mode 100644 reference/shaders-opencl/comp/outer-product.comp
 create mode 100644 reference/shaders-opencl/comp/packing-test-1.comp
 create mode 100644 reference/shaders-opencl/comp/packing-test-2.comp
 create mode 100644 reference/shaders-opencl/comp/read-write-only.comp
 create mode 100644 reference/shaders-opencl/comp/rmw-opt.comp
 create mode 100644 reference/shaders-opencl/comp/scalar-std450-distance-length-normalize.comp
 create mode 100644 reference/shaders-opencl/comp/shared-std450.double.comp
 create mode 100644 reference/shaders-opencl/comp/shared-struct-bool-cast.comp
 create mode 100644 reference/shaders-opencl/comp/shared-zero-init-simple.comp
 create mode 100644 reference/shaders-opencl/comp/shared-zero-init.comp
 create mode 100644 reference/shaders-opencl/comp/shared.comp
 create mode 100644 reference/shaders-opencl/comp/spec-constant-work-group-size.comp
 create mode 100644 reference/shaders-opencl/comp/struct-layout.comp
 create mode 100644 reference/shaders-opencl/comp/struct-nested.comp
 create mode 100644 reference/shaders-opencl/comp/struct-packing.invalid.comp
 create mode 100644 reference/shaders-opencl/comp/torture-loop.comp
 create mode 100644 reference/shaders-opencl/comp/type-alias.comp
 create mode 100644 reference/shaders-opencl/comp/udiv.comp
 create mode 100644 reference/shaders-opencl/comp/writable-ssbo.comp
 create mode 100644 shaders-opencl/asm/comp/atomic-decrement.asm.comp
 create mode 100644 shaders-opencl/asm/comp/atomic-increment.asm.comp
 create mode 100644 shaders-opencl/asm/comp/bitcast_iadd.asm.comp
 create mode 100644 shaders-opencl/asm/comp/bitcast_icmp.asm.comp
 create mode 100644 shaders-opencl/asm/comp/bitcast_sar.asm.comp
 create mode 100644 shaders-opencl/asm/comp/bitcast_sdiv.asm.comp
 create mode 100644 shaders-opencl/asm/comp/bitcast_slr.asm.comp
 create mode 100644 shaders-opencl/asm/comp/block-name-alias-global.asm.comp
 create mode 100644 shaders-opencl/asm/comp/buffer-write-relative-addr.asm.comp
 create mode 100644 shaders-opencl/asm/comp/buffer-write.asm.comp
 create mode 100644 shaders-opencl/asm/comp/copy-object-ssbo-to-ssbo.asm.comp
 create mode 100644 shaders-opencl/asm/comp/copy-object-ubo-to-ssbo.asm.comp
 create mode 100644 shaders-opencl/asm/comp/duplicate-spec-id.asm.comp
 create mode 100644 shaders-opencl/asm/comp/fma.spv16.asm.comp
 create mode 100644 shaders-opencl/asm/comp/global-parameter-name-alias.asm.comp
 create mode 100644 shaders-opencl/asm/comp/image-load-store-short-vector.invalid.asm.comp
 create mode 100644 shaders-opencl/asm/comp/op-spec-constant-op-vector-related.asm.comp
 create mode 100644 shaders-opencl/asm/comp/quantize.asm.comp
 create mode 100644 shaders-opencl/asm/comp/relaxed-block-layout.asm.comp
 create mode 100644 shaders-opencl/asm/comp/replicated-composites.spv16.asm.comp
 create mode 100644 shaders-opencl/asm/comp/specialization-constant-workgroup.asm.comp
 create mode 100644 shaders-opencl/asm/comp/struct-resource-name-aliasing.asm.comp
 create mode 100644 shaders-opencl/asm/comp/uint_smulextended.asm.comp
 create mode 100644 shaders-opencl/asm/comp/undefined-constant-composite.asm.comp
 create mode 100644 shaders-opencl/asm/comp/undefined-spec-constant-composite.asm.comp
 create mode 100644 shaders-opencl/asm/comp/variable-pointers-2.asm.comp
 create mode 100644 shaders-opencl/asm/comp/variable-pointers-store-forwarding.asm.comp
 create mode 100644 shaders-opencl/asm/comp/vector-builtin-type-cast-func.asm.comp
 create mode 100644 shaders-opencl/asm/comp/vector-builtin-type-cast.asm.comp
 create mode 100644 shaders-opencl/comp/access-private-workgroup-in-function.comp
 create mode 100644 shaders-opencl/comp/arguments.comp
 create mode 100644 shaders-opencl/comp/atomic.comp
 create mode 100644 shaders-opencl/comp/barriers.comp
 create mode 100644 shaders-opencl/comp/basic.comp
 create mode 100644 shaders-opencl/comp/basic.dispatchbase.comp
 create mode 100644 shaders-opencl/comp/buffer-push-const.comp
 create mode 100644 shaders-opencl/comp/builtins.comp
 create mode 100644 shaders-opencl/comp/cfg-preserve-parameter.comp
 create mode 100644 shaders-opencl/comp/complex-type-alias.comp
 create mode 100644 shaders-opencl/comp/composite-construct.comp
 create mode 100644 shaders-opencl/comp/culling.comp
 create mode 100644 shaders-opencl/comp/defer-parens.comp
 create mode 100644 shaders-opencl/comp/dowhile.comp
 create mode 100644 shaders-opencl/comp/expect-assume.comp
 create mode 100644 shaders-opencl/comp/force-recompile-hooks.swizzle.comp
 create mode 100644 shaders-opencl/comp/functions.comp
 create mode 100644 shaders-opencl/comp/global-invocation-id.comp
 create mode 100644 shaders-opencl/comp/image.comp
 create mode 100644 shaders-opencl/comp/insert.comp
 create mode 100644 shaders-opencl/comp/local-invocation-id.comp
 create mode 100644 shaders-opencl/comp/local-invocation-index.comp
 create mode 100644 shaders-opencl/comp/local-size-duplicate-spec-id.comp
 create mode 100644 shaders-opencl/comp/mod.comp
 create mode 100644 shaders-opencl/comp/modf.comp
 create mode 100644 shaders-opencl/comp/outer-product.comp
 create mode 100644 shaders-opencl/comp/packing-test-1.comp
 create mode 100644 shaders-opencl/comp/packing-test-2.comp
 create mode 100644 shaders-opencl/comp/read-write-only.comp
 create mode 100644 shaders-opencl/comp/rmw-opt.comp
 create mode 100644 shaders-opencl/comp/scalar-std450-distance-length-normalize.comp
 create mode 100644 shaders-opencl/comp/shared-std450.double.comp
 create mode 100644 shaders-opencl/comp/shared-struct-bool-cast.comp
 create mode 100644 shaders-opencl/comp/shared-zero-init-simple.comp
 create mode 100644 shaders-opencl/comp/shared-zero-init.comp
 create mode 100644 shaders-opencl/comp/shared.comp
 create mode 100644 shaders-opencl/comp/spec-constant-work-group-size.comp
 create mode 100644 shaders-opencl/comp/struct-layout.comp
 create mode 100644 shaders-opencl/comp/struct-nested.comp
 create mode 100644 shaders-opencl/comp/struct-packing.invalid.comp
 create mode 100644 shaders-opencl/comp/torture-loop.comp
 create mode 100644 shaders-opencl/comp/type-alias.comp
 create mode 100644 shaders-opencl/comp/udiv.comp
 create mode 100644 shaders-opencl/comp/writable-ssbo.comp

diff --git a/Package.swift b/Package.swift
index 99e601936..2ab54c018 100644
--- a/Package.swift
+++ b/Package.swift
@@ -47,6 +47,8 @@ let package = Package(
                       "shaders-msl",
                       "shaders-msl-no-opt",
                       "shaders-no-opt",
+                      "shaders-opencl",
+                      "shaders-opencl-no-opt",
                       "shaders-other",
                       "shaders-reflection",
                       "shaders-ue4",
diff --git a/main.cpp b/main.cpp
index adcfccbdd..2fc6ced5c 100644
--- a/main.cpp
+++ b/main.cpp
@@ -780,6 +780,8 @@ struct CLIArguments
 	uint32_t opencl_version = 120;
 	bool opencl_enable_fp64 = false;
 	bool opencl_enable_64bit_atomics = false;
+	bool opencl_enable_subgroups = false;
+	bool opencl_enable_shuffle = false;
 };
 
 static void print_version()
@@ -1361,6 +1363,8 @@ static string compile_iteration(const CLIArguments &args, std::vector<uint32_t>
 		CompilerOpenCL::Options ocl_opts = ocl_comp->get_opencl_options();
 		ocl_opts.opencl_version = args.opencl_version;
 		ocl_opts.enable_fp64 = args.opencl_enable_fp64;
+		ocl_opts.enable_subgroups = args.opencl_enable_subgroups;
+		ocl_opts.enable_shuffle = args.opencl_enable_shuffle;
 		ocl_comp->set_opencl_options(ocl_opts);
 	}
 	else if (args.hlsl)
@@ -1993,6 +1997,8 @@ static int main_inner(int argc, char *argv[])
 	cbs.add("--opencl-version", [&args](CLIParser &parser) { args.opencl_version = parser.next_uint(); });
 	cbs.add("--opencl-fp64", [&args](CLIParser &) { args.opencl_enable_fp64 = true; });
 	cbs.add("--opencl-64bit-atomics", [&args](CLIParser &) { args.opencl_enable_64bit_atomics = true; });
+	cbs.add("--opencl-subgroups", [&args](CLIParser &) { args.opencl_enable_subgroups = true; });
+	cbs.add("--opencl-shuffle", [&args](CLIParser &) { args.opencl_enable_shuffle = true; });
 	cbs.add("--extension", [&args](CLIParser &parser) { args.extensions.push_back(parser.next_string()); });
 	cbs.add("--rename-entry-point",
 	        [&args](CLIParser &parser)
diff --git a/reference/opt/shaders-hlsl/asm/frag/globally-coherent-structured-buffer.structured.asm.frag b/reference/opt/shaders-hlsl/asm/frag/globally-coherent-structured-buffer.structured.asm.frag
index 996d5f6fb..23994db3a 100644
--- a/reference/opt/shaders-hlsl/asm/frag/globally-coherent-structured-buffer.structured.asm.frag
+++ b/reference/opt/shaders-hlsl/asm/frag/globally-coherent-structured-buffer.structured.asm.frag
@@ -1,21 +1,21 @@
-globallycoherent RWStructuredBuffer<float4> TestBuffer : register(u0);
-
-static float4 out_var_SV_Target0;
-
-struct SPIRV_Cross_Output
-{
-    float4 out_var_SV_Target0 : SV_Target0;
-};
-
-void frag_main()
-{
-    out_var_SV_Target0 = TestBuffer[0u];
-}
-
-SPIRV_Cross_Output main()
-{
-    frag_main();
-    SPIRV_Cross_Output stage_output;
-    stage_output.out_var_SV_Target0 = out_var_SV_Target0;
-    return stage_output;
-}
+globallycoherent RWStructuredBuffer<float4> TestBuffer : register(u0);
+
+static float4 out_var_SV_Target0;
+
+struct SPIRV_Cross_Output
+{
+    float4 out_var_SV_Target0 : SV_Target0;
+};
+
+void frag_main()
+{
+    out_var_SV_Target0 = TestBuffer[0u];
+}
+
+SPIRV_Cross_Output main()
+{
+    frag_main();
+    SPIRV_Cross_Output stage_output;
+    stage_output.out_var_SV_Target0 = out_var_SV_Target0;
+    return stage_output;
+}
diff --git a/reference/shaders-hlsl/asm/frag/globally-coherent-structured-buffer.structured.asm.frag b/reference/shaders-hlsl/asm/frag/globally-coherent-structured-buffer.structured.asm.frag
index 996d5f6fb..23994db3a 100644
--- a/reference/shaders-hlsl/asm/frag/globally-coherent-structured-buffer.structured.asm.frag
+++ b/reference/shaders-hlsl/asm/frag/globally-coherent-structured-buffer.structured.asm.frag
@@ -1,21 +1,21 @@
-globallycoherent RWStructuredBuffer<float4> TestBuffer : register(u0);
-
-static float4 out_var_SV_Target0;
-
-struct SPIRV_Cross_Output
-{
-    float4 out_var_SV_Target0 : SV_Target0;
-};
-
-void frag_main()
-{
-    out_var_SV_Target0 = TestBuffer[0u];
-}
-
-SPIRV_Cross_Output main()
-{
-    frag_main();
-    SPIRV_Cross_Output stage_output;
-    stage_output.out_var_SV_Target0 = out_var_SV_Target0;
-    return stage_output;
-}
+globallycoherent RWStructuredBuffer<float4> TestBuffer : register(u0);
+
+static float4 out_var_SV_Target0;
+
+struct SPIRV_Cross_Output
+{
+    float4 out_var_SV_Target0 : SV_Target0;
+};
+
+void frag_main()
+{
+    out_var_SV_Target0 = TestBuffer[0u];
+}
+
+SPIRV_Cross_Output main()
+{
+    frag_main();
+    SPIRV_Cross_Output stage_output;
+    stage_output.out_var_SV_Target0 = out_var_SV_Target0;
+    return stage_output;
+}
diff --git a/reference/shaders-opencl/asm/comp/atomic-decrement.asm.comp b/reference/shaders-opencl/asm/comp/atomic-decrement.asm.comp
new file mode 100644
index 000000000..36a844495
--- /dev/null
+++ b/reference/shaders-opencl/asm/comp/atomic-decrement.asm.comp
@@ -0,0 +1,19 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct u0_counters
+{
+    uint c;
+};
+
+typedef struct u0_counters u0_counters;
+
+__attribute__((reqd_work_group_size(4, 1, 1)))
+__kernel void comp_main(write_only image1d_buffer_t u0, __global uint* u0_counter)
+{
+    uint _24 = atomic_add(&(u0_counter[0]), (uint)(-1));
+    float4 r0;
+    r0.x = as_float(_24);
+    write_imageui(u0, as_int((as_uint(as_int(r0.x)) * 1u) + (as_uint(0) >> 2u)), (uint4)(as_uint(as_int(((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x))));
+}
+
diff --git a/reference/shaders-opencl/asm/comp/atomic-increment.asm.comp b/reference/shaders-opencl/asm/comp/atomic-increment.asm.comp
new file mode 100644
index 000000000..4c9563240
--- /dev/null
+++ b/reference/shaders-opencl/asm/comp/atomic-increment.asm.comp
@@ -0,0 +1,19 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct u0_counters
+{
+    uint c;
+};
+
+typedef struct u0_counters u0_counters;
+
+__attribute__((reqd_work_group_size(4, 1, 1)))
+__kernel void comp_main(write_only image1d_buffer_t u0, __global uint* u0_counter)
+{
+    uint _24 = atomic_add(&(u0_counter[0]), 1u);
+    float4 r0;
+    r0.x = as_float(_24);
+    write_imageui(u0, as_int((as_uint(as_int(r0.x)) * 1u) + (as_uint(0) >> 2u)), (uint4)(as_uint(as_int(((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x))));
+}
+
diff --git a/reference/shaders-opencl/asm/comp/bitcast_iadd.asm.comp b/reference/shaders-opencl/asm/comp/bitcast_iadd.asm.comp
new file mode 100644
index 000000000..5c0520b3a
--- /dev/null
+++ b/reference/shaders-opencl/asm/comp/bitcast_iadd.asm.comp
@@ -0,0 +1,32 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct _3
+{
+    int4 _m0;
+    uint4 _m1;
+};
+
+typedef struct _3 _3;
+
+struct _4
+{
+    uint4 _m0;
+    int4 _m1;
+};
+
+typedef struct _4 _4;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global _3* _5, __global _4* _6)
+{
+    _6->_m0 = _5->_m1 + as_uint4(_5->_m0);
+    _6->_m0 = as_uint4(_5->_m0) + _5->_m1;
+    _6->_m0 = _5->_m1 + _5->_m1;
+    _6->_m0 = as_uint4(_5->_m0 + _5->_m0);
+    _6->_m1 = as_int4(_5->_m1 + _5->_m1);
+    _6->_m1 = _5->_m0 + _5->_m0;
+    _6->_m1 = as_int4(_5->_m1) + _5->_m0;
+    _6->_m1 = _5->_m0 + as_int4(_5->_m1);
+}
+
diff --git a/reference/shaders-opencl/asm/comp/bitcast_icmp.asm.comp b/reference/shaders-opencl/asm/comp/bitcast_icmp.asm.comp
new file mode 100644
index 000000000..c2195a52c
--- /dev/null
+++ b/reference/shaders-opencl/asm/comp/bitcast_icmp.asm.comp
@@ -0,0 +1,32 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct _3
+{
+    int4 _m0;
+    uint4 _m1;
+};
+
+typedef struct _3 _3;
+
+struct _4
+{
+    uint4 _m0;
+    int4 _m1;
+};
+
+typedef struct _4 _4;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global _3* _5, __global _4* _6)
+{
+    _6->_m0 = select((uint4)(0u), (uint4)(1u), as_int4(_5->_m1) < _5->_m0);
+    _6->_m0 = select((uint4)(0u), (uint4)(1u), as_int4(_5->_m1) <= _5->_m0);
+    _6->_m0 = select((uint4)(0u), (uint4)(1u), _5->_m1 < as_uint4(_5->_m0));
+    _6->_m0 = select((uint4)(0u), (uint4)(1u), _5->_m1 <= as_uint4(_5->_m0));
+    _6->_m0 = select((uint4)(0u), (uint4)(1u), as_int4(_5->_m1) > _5->_m0);
+    _6->_m0 = select((uint4)(0u), (uint4)(1u), as_int4(_5->_m1) >= _5->_m0);
+    _6->_m0 = select((uint4)(0u), (uint4)(1u), _5->_m1 > as_uint4(_5->_m0));
+    _6->_m0 = select((uint4)(0u), (uint4)(1u), _5->_m1 >= as_uint4(_5->_m0));
+}
+
diff --git a/reference/shaders-opencl/asm/comp/bitcast_sar.asm.comp b/reference/shaders-opencl/asm/comp/bitcast_sar.asm.comp
new file mode 100644
index 000000000..93916384b
--- /dev/null
+++ b/reference/shaders-opencl/asm/comp/bitcast_sar.asm.comp
@@ -0,0 +1,34 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct _3
+{
+    int4 _m0;
+    uint4 _m1;
+};
+
+typedef struct _3 _3;
+
+struct _4
+{
+    uint4 _m0;
+    int4 _m1;
+};
+
+typedef struct _4 _4;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global _3* _5, __global _4* _6)
+{
+    int4 _22 = _5->_m0;
+    uint4 _23 = _5->_m1;
+    _6->_m0 = as_uint4(as_int4(_23) >> _22);
+    _6->_m0 = as_uint4(_22 >> as_int4(_23));
+    _6->_m0 = as_uint4(as_int4(_23) >> as_int4(_23));
+    _6->_m0 = as_uint4(_22 >> _22);
+    _6->_m1 = as_int4(_23) >> as_int4(_23);
+    _6->_m1 = _22 >> _22;
+    _6->_m1 = as_int4(_23) >> _22;
+    _6->_m1 = _22 >> as_int4(_23);
+}
+
diff --git a/reference/shaders-opencl/asm/comp/bitcast_sdiv.asm.comp b/reference/shaders-opencl/asm/comp/bitcast_sdiv.asm.comp
new file mode 100644
index 000000000..f5a1a3a67
--- /dev/null
+++ b/reference/shaders-opencl/asm/comp/bitcast_sdiv.asm.comp
@@ -0,0 +1,34 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct _3
+{
+    int4 _m0;
+    uint4 _m1;
+};
+
+typedef struct _3 _3;
+
+struct _4
+{
+    uint4 _m0;
+    int4 _m1;
+};
+
+typedef struct _4 _4;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global _3* _5, __global _4* _6)
+{
+    int4 _22 = _5->_m0;
+    uint4 _23 = _5->_m1;
+    _6->_m0 = as_uint4(as_int4(_23) / _22);
+    _6->_m0 = as_uint4(_22 / as_int4(_23));
+    _6->_m0 = as_uint4(as_int4(_23) / as_int4(_23));
+    _6->_m0 = as_uint4(_22 / _22);
+    _6->_m1 = as_int4(_23) / as_int4(_23);
+    _6->_m1 = _22 / _22;
+    _6->_m1 = as_int4(_23) / _22;
+    _6->_m1 = _22 / as_int4(_23);
+}
+
diff --git a/reference/shaders-opencl/asm/comp/bitcast_slr.asm.comp b/reference/shaders-opencl/asm/comp/bitcast_slr.asm.comp
new file mode 100644
index 000000000..525761cc2
--- /dev/null
+++ b/reference/shaders-opencl/asm/comp/bitcast_slr.asm.comp
@@ -0,0 +1,34 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct _3
+{
+    int4 _m0;
+    uint4 _m1;
+};
+
+typedef struct _3 _3;
+
+struct _4
+{
+    uint4 _m0;
+    int4 _m1;
+};
+
+typedef struct _4 _4;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global _3* _5, __global _4* _6)
+{
+    int4 _22 = _5->_m0;
+    uint4 _23 = _5->_m1;
+    _6->_m0 = _23 >> as_uint4(_22);
+    _6->_m0 = as_uint4(_22) >> _23;
+    _6->_m0 = _23 >> _23;
+    _6->_m0 = as_uint4(_22) >> as_uint4(_22);
+    _6->_m1 = as_int4(_23 >> _23);
+    _6->_m1 = as_int4(as_uint4(_22) >> as_uint4(_22));
+    _6->_m1 = as_int4(_23 >> as_uint4(_22));
+    _6->_m1 = as_int4(as_uint4(_22) >> _23);
+}
+
diff --git a/reference/shaders-opencl/asm/comp/block-name-alias-global.asm.comp b/reference/shaders-opencl/asm/comp/block-name-alias-global.asm.comp
new file mode 100644
index 000000000..166f01b62
--- /dev/null
+++ b/reference/shaders-opencl/asm/comp/block-name-alias-global.asm.comp
@@ -0,0 +1,48 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct A
+{
+    int a;
+    int b;
+};
+
+typedef struct A A;
+
+struct A_1
+{
+    A Data[1];
+};
+
+typedef struct A_1 A_1;
+
+struct A_2
+{
+    A Data[1024];
+};
+
+typedef struct A_2 A_2;
+
+struct B
+{
+    A Data[1];
+};
+
+typedef struct B B;
+
+struct B_1
+{
+    A Data[1024];
+};
+
+typedef struct B_1 B_1;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global A* C1, A_2 C2, __global A* C3, B_1 C4)
+{
+    C1[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].a = C2.Data[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].a;
+    C1[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].b = C2.Data[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].b;
+    C3[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].a = C4.Data[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].a;
+    C3[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].b = C4.Data[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].b;
+}
+
diff --git a/reference/shaders-opencl/asm/comp/buffer-write-relative-addr.asm.comp b/reference/shaders-opencl/asm/comp/buffer-write-relative-addr.asm.comp
new file mode 100644
index 000000000..d69fb8a51
--- /dev/null
+++ b/reference/shaders-opencl/asm/comp/buffer-write-relative-addr.asm.comp
@@ -0,0 +1,24 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct cb5_struct
+{
+    float4 _m0[5];
+};
+
+typedef struct cb5_struct cb5_struct;
+
+__attribute__((reqd_work_group_size(4, 1, 1)))
+__kernel void comp_main(cb5_struct cb0_5, write_only image1d_buffer_t u0)
+{
+    float4 r0_1;
+    r0_1.x = as_float(as_int(((uint3)(get_local_id(0), get_local_id(1), get_local_id(2))).x) << 4);
+    r0_1.y = as_float(as_int(((uint3)(get_local_id(0), get_local_id(1), get_local_id(2))).x));
+    uint _41 = as_uint(r0_1.x) >> 2u;
+    uint4 _50 = as_uint4(cb0_5._m0[as_uint(as_int(r0_1.y)) + 1u]);
+    write_imageui(u0, as_int(_41), _50.xxxx);
+    write_imageui(u0, as_int(_41 + 1u), _50.yyyy);
+    write_imageui(u0, as_int(_41 + 2u), _50.zzzz);
+    write_imageui(u0, as_int(_41 + 3u), _50.wwww);
+}
+
diff --git a/reference/shaders-opencl/asm/comp/buffer-write.asm.comp b/reference/shaders-opencl/asm/comp/buffer-write.asm.comp
new file mode 100644
index 000000000..ce88fd4e3
--- /dev/null
+++ b/reference/shaders-opencl/asm/comp/buffer-write.asm.comp
@@ -0,0 +1,16 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct cb
+{
+    float value;
+};
+
+typedef struct cb cb;
+
+__attribute__((reqd_work_group_size(32, 1, 1)))
+__kernel void comp_main(cb _8, write_only image1d_buffer_t _buffer)
+{
+    write_imagef(_buffer, as_int((32u * ((uint3)(get_group_id(0), get_group_id(1), get_group_id(2))).x) + ((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))), (float4)(_8.value));
+}
+
diff --git a/reference/shaders-opencl/asm/comp/copy-object-ssbo-to-ssbo.asm.comp b/reference/shaders-opencl/asm/comp/copy-object-ssbo-to-ssbo.asm.comp
new file mode 100644
index 000000000..8da6f6cfa
--- /dev/null
+++ b/reference/shaders-opencl/asm/comp/copy-object-ssbo-to-ssbo.asm.comp
@@ -0,0 +1,24 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct _19
+{
+};
+typedef struct _19 _19;
+
+struct _5
+{
+    int _m0;
+    _19 _m1;
+    _19 _m2;
+    int _m3;
+};
+
+typedef struct _5 _5;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global _5* _3, __global _5* _4)
+{
+    *_4 = (*_3);
+}
+
diff --git a/reference/shaders-opencl/asm/comp/copy-object-ubo-to-ssbo.asm.comp b/reference/shaders-opencl/asm/comp/copy-object-ubo-to-ssbo.asm.comp
new file mode 100644
index 000000000..3ab995c11
--- /dev/null
+++ b/reference/shaders-opencl/asm/comp/copy-object-ubo-to-ssbo.asm.comp
@@ -0,0 +1,24 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct _19
+{
+};
+typedef struct _19 _19;
+
+struct _5
+{
+    int _m0;
+    _19 _m1;
+    _19 _m2;
+    int _m3;
+};
+
+typedef struct _5 _5;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(_5 _3, __global _5* _4)
+{
+    *_4 = _3;
+}
+
diff --git a/reference/shaders-opencl/asm/comp/duplicate-spec-id.asm.comp b/reference/shaders-opencl/asm/comp/duplicate-spec-id.asm.comp
new file mode 100644
index 000000000..177a60dc6
--- /dev/null
+++ b/reference/shaders-opencl/asm/comp/duplicate-spec-id.asm.comp
@@ -0,0 +1,26 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct StorageBuffer
+{
+    float values[1];
+};
+
+typedef struct StorageBuffer StorageBuffer;
+
+#ifndef SPIRV_CROSS_CONSTANT_ID_0
+#define SPIRV_CROSS_CONSTANT_ID_0 1
+#endif
+constant int foo = SPIRV_CROSS_CONSTANT_ID_0;
+#ifndef SPIRV_CROSS_CONSTANT_ID_0
+#define SPIRV_CROSS_CONSTANT_ID_0 2.0f
+#endif
+constant float bar = SPIRV_CROSS_CONSTANT_ID_0;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global float* ssbo)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    ssbo[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] = convert_float(foo) + bar;
+}
+
diff --git a/reference/shaders-opencl/asm/comp/fma.spv16.asm.comp b/reference/shaders-opencl/asm/comp/fma.spv16.asm.comp
new file mode 100644
index 000000000..9343d7f25
--- /dev/null
+++ b/reference/shaders-opencl/asm/comp/fma.spv16.asm.comp
@@ -0,0 +1,23 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO2
+{
+    float4 out_data[1];
+};
+
+typedef struct SSBO2 SSBO2;
+
+struct SSBO
+{
+    float4 in_data[1];
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global float4* _4, __global const float4* _6)
+{
+    _4[0] = fma(_6[0], _6[1], _6[1]);
+}
+
diff --git a/reference/shaders-opencl/asm/comp/global-parameter-name-alias.asm.comp b/reference/shaders-opencl/asm/comp/global-parameter-name-alias.asm.comp
new file mode 100644
index 000000000..f3caf7d90
--- /dev/null
+++ b/reference/shaders-opencl/asm/comp/global-parameter-name-alias.asm.comp
@@ -0,0 +1,30 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct ssbo
+{
+    uint _data[1];
+};
+
+typedef struct ssbo ssbo;
+
+void Load( uint* size, __global const uint* ssbo)
+{
+    int byteAddrTemp = as_int((*size) >> as_uint(2));
+    uint4 data = (uint4)(ssbo[byteAddrTemp], ssbo[byteAddrTemp + 1], ssbo[byteAddrTemp + 2], ssbo[byteAddrTemp + 3]);
+}
+
+void _main( uint3* id, __global const uint* ssbo)
+{
+    uint param = 4u;
+    Load(&param, ssbo);
+}
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global const uint* ssbo)
+{
+    uint3 id_1 = ((uint3)(get_global_id(0), get_global_id(1), get_global_id(2)));
+    uint3 param_1 = id_1;
+    _main(&param_1, ssbo);
+}
+
diff --git a/reference/shaders-opencl/asm/comp/image-load-store-short-vector.invalid.asm.comp b/reference/shaders-opencl/asm/comp/image-load-store-short-vector.invalid.asm.comp
new file mode 100644
index 000000000..f7d65805e
--- /dev/null
+++ b/reference/shaders-opencl/asm/comp/image-load-store-short-vector.invalid.asm.comp
@@ -0,0 +1,18 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+void _main( uint3* id)
+{
+    float2 loaded = read_imagef(TargetTexture, as_int2((*id).xy)).xy;
+    float2 storeTemp = loaded + (float2)(1.0f);
+    write_imagef(TargetTexture, as_int2((*id).xy + (uint2)(1u)), (float4)(storeTemp));
+}
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(write_only image2d_t TargetTexture)
+{
+    uint3 id_1 = ((uint3)(get_group_id(0), get_group_id(1), get_group_id(2)));
+    uint3 param = id_1;
+    _main(&param);
+}
+
diff --git a/reference/shaders-opencl/asm/comp/op-spec-constant-op-vector-related.asm.comp b/reference/shaders-opencl/asm/comp/op-spec-constant-op-vector-related.asm.comp
new file mode 100644
index 000000000..463f75b99
--- /dev/null
+++ b/reference/shaders-opencl/asm/comp/op-spec-constant-op-vector-related.asm.comp
@@ -0,0 +1,78 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct _29
+{
+    int _m0[3][3];
+};
+
+typedef struct _29 _29;
+
+struct _7
+{
+    int _m0[1];
+};
+
+typedef struct _7 _7;
+
+constant int3 _32 = (int3)(0);
+constant int _33[3] = { 0, 0, 0 };
+constant int _34[3][3] = { { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 } };
+#ifndef SPIRV_CROSS_CONSTANT_ID_0
+#define SPIRV_CROSS_CONSTANT_ID_0 0
+#endif
+constant int _3 = SPIRV_CROSS_CONSTANT_ID_0;
+#ifndef SPIRV_CROSS_CONSTANT_ID_1
+#define SPIRV_CROSS_CONSTANT_ID_1 0
+#endif
+constant int _4 = SPIRV_CROSS_CONSTANT_ID_1;
+#ifndef SPIRV_CROSS_CONSTANT_ID_2
+#define SPIRV_CROSS_CONSTANT_ID_2 0
+#endif
+constant int _5 = SPIRV_CROSS_CONSTANT_ID_2;
+#define _36 ({ _3, 0, 0 })
+#define _37 ({ _3, _4, 0 })
+#define _38 ({ _3, _4, _5 })
+#define _39 ({ _4, 0, 0 })
+#define _40 ({ _4, _5, 0 })
+#define _41 ({ _4, _5, _3 })
+#define _42 ({ _5, 0, 0 })
+#define _43 ({ _5, _3, 0 })
+#define _44 ({ _5, _3, _4 })
+#define _45 ({ { _3, _4, _5 }, { 0, 0, 0 }, { 0, 0, 0 } })
+#define _46 ({ { _3, _4, _5 }, { _4, _5, _3 }, { 0, 0, 0 } })
+#define _47 ({ { _3, _4, _5 }, { _4, _5, _3 }, { _5, _3, _4 } })
+#define _48 ((_29){ { { _3, _4, _5 }, { _4, _5, _3 }, { _5, _3, _4 } } })
+#define _49 ((_29){ { { _3, _4, _5 }, { _4, _5, _5 }, { _5, _3, _4 } } })
+#define _50 (_48._m0[0][0])
+#define _51 (_48._m0[1][0])
+#define _52 (_48._m0[0][1])
+#define _53 (_48._m0[2][2])
+#define _54 (_48._m0[2][0])
+#define _55 (_48._m0[1][1])
+#define _56 ((_50 == _51))
+#define _57 ((_52 == _53))
+#define _58 ((_54 == _55))
+#define _59 ((int)(_56))
+#define _60 ((int)(_57))
+#define _61 (_58 ? 2 : 1)
+#define _62 ((int3)(_3, 0, 0))
+#define _63 ((int3)(0, _4, 0))
+#define _64 ((int3)(0, 0, _5))
+#define _65 ((int3)(_62.x, 0, _62.z))
+#define _66 ((int3)(0, _63.y, _63.x))
+#define _67 ((int3)(_64.z, 0, _64.z))
+#define _68 ((int3)(_65.y, _65.x, _66.y))
+#define _69 ((int3)(_67.z, _68.y, _68.z))
+#define _70 (_69.x)
+#define _71 (_69.y)
+#define _72 (_69.z)
+#define _73 ((_70 - _71))
+#define _74 ((_73 * _72))
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global int* _8, __global int* _9)
+{
+    _9[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] = _8[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] + ((((1 - _59) * _60) * (_61 - 1)) * _74);
+}
+
diff --git a/reference/shaders-opencl/asm/comp/quantize.asm.comp b/reference/shaders-opencl/asm/comp/quantize.asm.comp
new file mode 100644
index 000000000..3743c7776
--- /dev/null
+++ b/reference/shaders-opencl/asm/comp/quantize.asm.comp
@@ -0,0 +1,35 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO0
+{
+    float scalar;
+    float2 vec2_val;
+    float3 vec3_val;
+    float4 vec4_val;
+};
+
+typedef struct SSBO0 SSBO0;
+
+uint spvPackHalf2x16(float2 v) {
+    uint r;
+    vstore_half(v.x, 0, (__private half *)&r);
+    vstore_half(v.y, 1, (__private half *)&r);
+    return r;
+}
+
+float2 spvUnpackHalf2x16(uint u) {
+    const __private uint *p = &u;
+    return (float2)(vload_half(0, (const __private half *)p),
+                   vload_half(1, (const __private half *)p));
+}
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global SSBO0* _12)
+{
+    _12->scalar = spvUnpackHalf2x16(spvPackHalf2x16((float2)(_12->scalar, 0.0f))).x;
+    _12->vec2_val = spvUnpackHalf2x16(spvPackHalf2x16(_12->vec2_val));
+    _12->vec3_val = (float3)(spvUnpackHalf2x16(spvPackHalf2x16(_12->vec3_val.xy)), spvUnpackHalf2x16(spvPackHalf2x16((float2)(_12->vec3_val.z, 0.0f))).x);
+    _12->vec4_val = (float4)(spvUnpackHalf2x16(spvPackHalf2x16(_12->vec4_val.xy)), spvUnpackHalf2x16(spvPackHalf2x16(_12->vec4_val.zw)));
+}
+
diff --git a/reference/shaders-opencl/asm/comp/relaxed-block-layout.asm.comp b/reference/shaders-opencl/asm/comp/relaxed-block-layout.asm.comp
new file mode 100644
index 000000000..ddae4bb54
--- /dev/null
+++ b/reference/shaders-opencl/asm/comp/relaxed-block-layout.asm.comp
@@ -0,0 +1,23 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct foo
+{
+    uint bar;
+    float3 baz;
+    uchar quux;
+    uchar4 blah;
+    half2 wibble;
+};
+
+typedef struct foo foo;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global foo* _10)
+{
+    _10->bar = ((uint3)(get_local_id(0), get_local_id(1), get_local_id(2))).x;
+    _10->baz = convert_float3(((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))));
+    _10->blah = convert_uchar4((uint4)(convert_uint4(_10->blah).xyz + ((uint3)(get_group_id(0), get_group_id(1), get_group_id(2))), 0u));
+    _10->wibble = convert_half2(convert_float2(_10->wibble) * convert_float2(((uint3)(get_num_groups(0), get_num_groups(1), get_num_groups(2))).xy));
+}
+
diff --git a/reference/shaders-opencl/asm/comp/replicated-composites.spv16.asm.comp b/reference/shaders-opencl/asm/comp/replicated-composites.spv16.asm.comp
new file mode 100644
index 000000000..5bcad0013
--- /dev/null
+++ b/reference/shaders-opencl/asm/comp/replicated-composites.spv16.asm.comp
@@ -0,0 +1,28 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+#ifndef SPIRV_CROSS_CONSTANT_ID_0
+#define SPIRV_CROSS_CONSTANT_ID_0 0.0f
+#endif
+constant float spec_const = SPIRV_CROSS_CONSTANT_ID_0;
+constant float4 _20 = (float4)(spec_const);
+constant float _26[8] = { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f };
+
+struct UBO
+{
+    float uniform_float;
+};
+
+typedef struct UBO UBO;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(UBO ubo)
+{
+    float4 a = (float4)(0.0f);
+    float4 b = (float4)(1.0f);
+    float4 c = _20;
+    float4 d = (float4)(ubo.uniform_float);
+    float4 e = d;
+    float f[8] = {ubo.uniform_float, ubo.uniform_float, ubo.uniform_float, ubo.uniform_float, ubo.uniform_float, ubo.uniform_float, ubo.uniform_float, ubo.uniform_float};
+}
+
diff --git a/reference/shaders-opencl/asm/comp/specialization-constant-workgroup.asm.comp b/reference/shaders-opencl/asm/comp/specialization-constant-workgroup.asm.comp
new file mode 100644
index 000000000..20235cb7f
--- /dev/null
+++ b/reference/shaders-opencl/asm/comp/specialization-constant-workgroup.asm.comp
@@ -0,0 +1,26 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO
+{
+    float a;
+};
+
+typedef struct SSBO SSBO;
+
+#ifndef SPIRV_CROSS_CONSTANT_ID_10
+#define SPIRV_CROSS_CONSTANT_ID_10 9u
+#endif
+constant uint _19 = SPIRV_CROSS_CONSTANT_ID_10;
+#ifndef SPIRV_CROSS_CONSTANT_ID_12
+#define SPIRV_CROSS_CONSTANT_ID_12 4u
+#endif
+constant uint _21 = SPIRV_CROSS_CONSTANT_ID_12;
+constant uint3 spvWorkgroupSize = (uint3)(_19, 20u, _21);
+
+__attribute__((reqd_work_group_size(9, 20, 4)))
+__kernel void comp_main(__global float* _6)
+{
+    _6[0] += 1.0f;
+}
+
diff --git a/reference/shaders-opencl/asm/comp/struct-resource-name-aliasing.asm.comp b/reference/shaders-opencl/asm/comp/struct-resource-name-aliasing.asm.comp
new file mode 100644
index 000000000..bdcb6b78a
--- /dev/null
+++ b/reference/shaders-opencl/asm/comp/struct-resource-name-aliasing.asm.comp
@@ -0,0 +1,22 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct bufA
+{
+    uint _data[1];
+};
+
+typedef struct bufA bufA;
+
+void _main(__global uint* bufA, __global uint* bufB)
+{
+    bufA[0] = 0u;
+    bufB[0] = 0u;
+}
+
+__attribute__((reqd_work_group_size(8, 8, 1)))
+__kernel void comp_main(__global uint* bufA, __global uint* bufB)
+{
+    _main(bufA, bufB);
+}
+
diff --git a/reference/shaders-opencl/asm/comp/uint_smulextended.asm.comp b/reference/shaders-opencl/asm/comp/uint_smulextended.asm.comp
new file mode 100644
index 000000000..ab2d4a703
--- /dev/null
+++ b/reference/shaders-opencl/asm/comp/uint_smulextended.asm.comp
@@ -0,0 +1,28 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct _4
+{
+    uint _m0[1];
+};
+
+typedef struct _4 _4;
+
+struct _20
+{
+    uint _m0;
+    uint _m1;
+};
+
+typedef struct _20 _20;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global uint* _5, __global uint* _6, __global uint* _7, __global uint* _8)
+{
+    _20 _28;
+    _28._m0 = _5[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] * _6[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x];
+    _28._m1 = mul_hi(_5[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x], _6[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x]);
+    _7[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] = _28._m0;
+    _8[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] = _28._m1;
+}
+
diff --git a/reference/shaders-opencl/asm/comp/undefined-constant-composite.asm.comp b/reference/shaders-opencl/asm/comp/undefined-constant-composite.asm.comp
new file mode 100644
index 000000000..6a87c437b
--- /dev/null
+++ b/reference/shaders-opencl/asm/comp/undefined-constant-composite.asm.comp
@@ -0,0 +1,40 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct _20
+{
+    int _m0;
+    int _m1;
+};
+
+typedef struct _20 _20;
+
+struct _5
+{
+    int _m0[10];
+};
+
+typedef struct _5 _5;
+
+struct _7
+{
+    int _m0[10];
+};
+
+typedef struct _7 _7;
+
+constant int _28 = 0;
+
+int _39( int* _41,  _20* _42)
+{
+    return (*_41) + _42->_m1;
+}
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global int* _6, __global int* _8)
+{
+    int _32 = _8[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x];
+    _20 _33 = (_20){ _28, 200 };
+    _6[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] = _39(&_32, &_33);
+}
+
diff --git a/reference/shaders-opencl/asm/comp/undefined-spec-constant-composite.asm.comp b/reference/shaders-opencl/asm/comp/undefined-spec-constant-composite.asm.comp
new file mode 100644
index 000000000..7ff37b8a2
--- /dev/null
+++ b/reference/shaders-opencl/asm/comp/undefined-spec-constant-composite.asm.comp
@@ -0,0 +1,46 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct _21
+{
+    int _m0;
+    int _m1;
+};
+
+typedef struct _21 _21;
+
+struct _5
+{
+    int _m0[10];
+};
+
+typedef struct _5 _5;
+
+struct _7
+{
+    int _m0[10];
+};
+
+typedef struct _7 _7;
+
+constant int _29 = 0;
+#ifndef SPIRV_CROSS_CONSTANT_ID_0
+#define SPIRV_CROSS_CONSTANT_ID_0 0
+#endif
+constant int _9 = SPIRV_CROSS_CONSTANT_ID_0;
+constant _21 _30 = (_21){ _9, _29 };
+
+int _42( int* _44,  _21* _45,  _21* _46)
+{
+    return ((*_44) + _45->_m0) + _46->_m1;
+}
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global int* _6, __global int* _8)
+{
+    int _34 = _8[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x];
+    _21 _35 = _30;
+    _21 _36 = (_21){ _29, 200 };
+    _6[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] = _42(&_34, &_35, &_36);
+}
+
diff --git a/reference/shaders-opencl/asm/comp/variable-pointers-2.asm.comp b/reference/shaders-opencl/asm/comp/variable-pointers-2.asm.comp
new file mode 100644
index 000000000..3bfb4fcbd
--- /dev/null
+++ b/reference/shaders-opencl/asm/comp/variable-pointers-2.asm.comp
@@ -0,0 +1,68 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct foo
+{
+    int a[128];
+    uint b;
+    float2 c;
+};
+
+typedef struct foo foo;
+
+struct bar
+{
+    int d;
+};
+
+typedef struct bar bar;
+
+__global foo* select_buffer(__global foo* a_1_1, bar cb)
+{
+    return (cb.d != 0) ? a_1_1 : NULL;
+}
+
+__private uint3* select_input(__private uint3* _3_ptr, __private uint3* _4_ptr, bar cb)
+{
+    #define _3 (*_3_ptr)
+    #define _4 (*_4_ptr)
+    return (cb.d != 0) ? &_3 : &_4;
+    #undef _3
+    #undef _4
+}
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global foo* buf, bar cb)
+{
+    uint3 _3 = ((uint3)(get_global_id(0), get_global_id(1), get_global_id(2)));
+    uint3 _4 = ((uint3)(get_local_id(0), get_local_id(1), get_local_id(2)));
+    __global foo* _44 = select_buffer((__global foo*)buf, cb);
+    __global foo* _65 = _44;
+    __private uint3* _45 = select_input(&_3, &_4, cb);
+    __global foo* _66 = _65;
+    __global int* _49;
+    __global int* _52;
+    _49 = &_66->a[0u];
+    _52 = &buf->a[0u];
+    int _54;
+    int _55;
+    for (;;)
+    {
+        _54 = *_49;
+        _55 = *_52;
+        if (_54 != _55)
+        {
+            int _63 = (_54 + _55) + as_int((*_45).x);
+            *_49 = _63;
+            *_52 = _63;
+            _49 = &_49[1u];
+            _52 = &_52[1u];
+            continue;
+        }
+        else
+        {
+            break;
+        }
+    }
+}
+
diff --git a/reference/shaders-opencl/asm/comp/variable-pointers-store-forwarding.asm.comp b/reference/shaders-opencl/asm/comp/variable-pointers-store-forwarding.asm.comp
new file mode 100644
index 000000000..cbc654c1c
--- /dev/null
+++ b/reference/shaders-opencl/asm/comp/variable-pointers-store-forwarding.asm.comp
@@ -0,0 +1,35 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct foo
+{
+    int a;
+};
+
+typedef struct foo foo;
+
+struct bar
+{
+    int b;
+};
+
+typedef struct bar bar;
+
+__global int* _24(__global foo* a_1, __global bar* b_1, __private uint3* _3_ptr)
+{
+    #define _3 (*_3_ptr)
+    return (_3.x != 0u) ? &a_1->a : &b_1->b;
+    #undef _3
+}
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global int* x, __global int* y)
+{
+    uint3 _3 = ((uint3)(get_global_id(0), get_global_id(1), get_global_id(2)));
+    __global int* _34 = _24((__global foo*)x, (__global bar*)y, &_3);
+    __global int* _33 = _34;
+    int _37 = x[0];
+    *_33 = 0;
+    y[0] = _37 + _37;
+}
+
diff --git a/reference/shaders-opencl/asm/comp/vector-builtin-type-cast-func.asm.comp b/reference/shaders-opencl/asm/comp/vector-builtin-type-cast-func.asm.comp
new file mode 100644
index 000000000..d4f5be9be
--- /dev/null
+++ b/reference/shaders-opencl/asm/comp/vector-builtin-type-cast-func.asm.comp
@@ -0,0 +1,34 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct cb1_struct
+{
+    float4 _RESERVED_IDENTIFIER_FIXUP_m0[1];
+};
+
+typedef struct cb1_struct cb1_struct;
+
+int2 get_texcoord( int2* base,  int2* index, __private int3* _RESERVED_IDENTIFIER_FIXUP_gl_LocalInvocationID_ptr)
+{
+    #define _RESERVED_IDENTIFIER_FIXUP_gl_LocalInvocationID (*_RESERVED_IDENTIFIER_FIXUP_gl_LocalInvocationID_ptr)
+    return ((*base) * as_int3(_RESERVED_IDENTIFIER_FIXUP_gl_LocalInvocationID).xy) + (*index);
+    #undef _RESERVED_IDENTIFIER_FIXUP_gl_LocalInvocationID
+}
+
+__attribute__((reqd_work_group_size(16, 16, 1)))
+__kernel void comp_main(write_only image2d_t u0, cb1_struct cb0_1)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    int3 _RESERVED_IDENTIFIER_FIXUP_gl_LocalInvocationID = as_int3(((uint3)(get_local_id(0), get_local_id(1), get_local_id(2))));
+    int2 r0 = (int2)((int2)(get_image_width(u0), get_image_height(u0)) >> as_int2((uint2)(4u)));
+    for (int i = 0; i < r0.y; i++)
+    {
+        for (int j = 0; j < r0.x; j++)
+        {
+            int2 param = r0;
+            int2 param_1 = (int2)(i, j);
+            write_imagef(u0, get_texcoord(&param, &param_1, &_RESERVED_IDENTIFIER_FIXUP_gl_LocalInvocationID), cb0_1._RESERVED_IDENTIFIER_FIXUP_m0[0].xxxx);
+        }
+    }
+}
+
diff --git a/reference/shaders-opencl/asm/comp/vector-builtin-type-cast.asm.comp b/reference/shaders-opencl/asm/comp/vector-builtin-type-cast.asm.comp
new file mode 100644
index 000000000..22834dd8a
--- /dev/null
+++ b/reference/shaders-opencl/asm/comp/vector-builtin-type-cast.asm.comp
@@ -0,0 +1,24 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct cb1_struct
+{
+    float4 _RESERVED_IDENTIFIER_FIXUP_m0[1];
+};
+
+typedef struct cb1_struct cb1_struct;
+
+__attribute__((reqd_work_group_size(16, 16, 1)))
+__kernel void comp_main(write_only image2d_t u0, cb1_struct cb0_1)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    int2 r0 = (int2)((int2)(get_image_width(u0), get_image_height(u0)) >> as_int2((uint2)(4u)));
+    for (int i = 0; i < r0.y; i++)
+    {
+        for (int j = 0; j < r0.x; j++)
+        {
+            write_imagef(u0, (r0 * as_int3(((uint3)(get_local_id(0), get_local_id(1), get_local_id(2)))).xy) + (int2)(i, j), cb0_1._RESERVED_IDENTIFIER_FIXUP_m0[0].xxxx);
+        }
+    }
+}
+
diff --git a/reference/shaders-opencl/comp/access-private-workgroup-in-function.comp b/reference/shaders-opencl/comp/access-private-workgroup-in-function.comp
new file mode 100644
index 000000000..4aeedb66b
--- /dev/null
+++ b/reference/shaders-opencl/comp/access-private-workgroup-in-function.comp
@@ -0,0 +1,36 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+void set_f(int* f_ptr)
+{
+    #define f (*f_ptr)
+    f = 40;
+    #undef f
+}
+
+void set_shared_u(__local int* u_ptr)
+{
+    #define u (*u_ptr)
+    u = 50;
+    #undef u
+}
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main()
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    __local int u;
+    int f;
+    set_f(&f);
+    set_shared_u(&u);
+    if (((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0))) == 0u)
+    {
+        f = 10;
+    }
+    else
+    {
+        f = 30;
+        u = 20;
+    }
+}
+
diff --git a/reference/shaders-opencl/comp/arguments.comp b/reference/shaders-opencl/comp/arguments.comp
new file mode 100644
index 000000000..df95f2205
--- /dev/null
+++ b/reference/shaders-opencl/comp/arguments.comp
@@ -0,0 +1,25 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct Buf
+{
+    uint data[1];
+};
+
+typedef struct Buf Buf;
+
+struct parameter
+{
+    uint n;
+};
+
+typedef struct parameter parameter;
+
+__attribute__((reqd_work_group_size(64, 1, 1)))
+__kernel void comp_main(__global uint* _19, parameter p)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    uint i = ((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x;
+    _19[i] = i + p.n;
+}
+
diff --git a/reference/shaders-opencl/comp/atomic.comp b/reference/shaders-opencl/comp/atomic.comp
new file mode 100644
index 000000000..5c5d824eb
--- /dev/null
+++ b/reference/shaders-opencl/comp/atomic.comp
@@ -0,0 +1,53 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO
+{
+    uint u32;
+    int i32;
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global SSBO* ssbo)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    __local uint shared_u32;
+    __local int shared_i32;
+    uint _16 = atomic_add(&(ssbo->u32), 1u);
+    uint _18 = atomic_or(&(ssbo->u32), 1u);
+    uint _20 = atomic_xor(&(ssbo->u32), 1u);
+    uint _22 = atomic_and(&(ssbo->u32), 1u);
+    uint _24 = atomic_min(&(ssbo->u32), 1u);
+    uint _26 = atomic_max(&(ssbo->u32), 1u);
+    uint _28 = atomic_xchg(&(ssbo->u32), 1u);
+    uint _32 = atomic_cmpxchg(&(ssbo->u32), 10u, 2u);
+    int _36 = atomic_add(&(ssbo->i32), 1);
+    int _38 = atomic_or(&(ssbo->i32), 1);
+    int _40 = atomic_xor(&(ssbo->i32), 1);
+    int _42 = atomic_and(&(ssbo->i32), 1);
+    int _44 = atomic_min(&(ssbo->i32), 1);
+    int _46 = atomic_max(&(ssbo->i32), 1);
+    int _48 = atomic_xchg(&(ssbo->i32), 1);
+    int _52 = atomic_cmpxchg(&(ssbo->i32), 10, 2);
+    shared_u32 = 10u;
+    shared_i32 = 10;
+    uint _57 = atomic_add(&shared_u32, 1u);
+    uint _58 = atomic_or(&shared_u32, 1u);
+    uint _59 = atomic_xor(&shared_u32, 1u);
+    uint _60 = atomic_and(&shared_u32, 1u);
+    uint _61 = atomic_min(&shared_u32, 1u);
+    uint _62 = atomic_max(&shared_u32, 1u);
+    uint _63 = atomic_xchg(&shared_u32, 1u);
+    uint _64 = atomic_cmpxchg(&shared_u32, 10u, 2u);
+    int _65 = atomic_add(&shared_i32, 1);
+    int _66 = atomic_or(&shared_i32, 1);
+    int _67 = atomic_xor(&shared_i32, 1);
+    int _68 = atomic_and(&shared_i32, 1);
+    int _69 = atomic_min(&shared_i32, 1);
+    int _70 = atomic_max(&shared_i32, 1);
+    int _71 = atomic_xchg(&shared_i32, 1);
+    int _72 = atomic_cmpxchg(&shared_i32, 10, 2);
+}
+
diff --git a/reference/shaders-opencl/comp/barriers.comp b/reference/shaders-opencl/comp/barriers.comp
new file mode 100644
index 000000000..1a63caaf6
--- /dev/null
+++ b/reference/shaders-opencl/comp/barriers.comp
@@ -0,0 +1,80 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+void barrier_shared()
+{
+    mem_fence(CLK_LOCAL_MEM_FENCE);
+}
+
+void full_barrier()
+{
+    mem_fence(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
+}
+
+void image_barrier()
+{
+    mem_fence(CLK_GLOBAL_MEM_FENCE);
+}
+
+void buffer_barrier()
+{
+    mem_fence(CLK_GLOBAL_MEM_FENCE);
+}
+
+void group_barrier()
+{
+    mem_fence(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
+}
+
+void barrier_shared_exec()
+{
+    mem_fence(CLK_LOCAL_MEM_FENCE);
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+void full_barrier_exec()
+{
+    mem_fence(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+void image_barrier_exec()
+{
+    mem_fence(CLK_GLOBAL_MEM_FENCE);
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+void buffer_barrier_exec()
+{
+    mem_fence(CLK_GLOBAL_MEM_FENCE);
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+void group_barrier_exec()
+{
+    mem_fence(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+void exec_barrier()
+{
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+__attribute__((reqd_work_group_size(4, 1, 1)))
+__kernel void comp_main()
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    barrier_shared();
+    full_barrier();
+    image_barrier();
+    buffer_barrier();
+    group_barrier();
+    barrier_shared_exec();
+    full_barrier_exec();
+    image_barrier_exec();
+    buffer_barrier_exec();
+    group_barrier_exec();
+    exec_barrier();
+}
+
diff --git a/reference/shaders-opencl/comp/basic.comp b/reference/shaders-opencl/comp/basic.comp
new file mode 100644
index 000000000..541f0d8e2
--- /dev/null
+++ b/reference/shaders-opencl/comp/basic.comp
@@ -0,0 +1,37 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO
+{
+    float4 in_data[1];
+};
+
+typedef struct SSBO SSBO;
+
+struct SSBO2
+{
+    float4 out_data[1];
+};
+
+typedef struct SSBO2 SSBO2;
+
+struct SSBO3
+{
+    uint counter;
+};
+
+typedef struct SSBO3 SSBO3;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global const float4* _23, __global float4* _45, __global uint* _48)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    uint ident = ((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x;
+    float4 idata = _23[ident];
+    if (dot(idata, (float4)(1.0f, 5.0f, 6.0f, 2.0f)) > 8.19999980926513671875f)
+    {
+        uint _52 = atomic_add(&(_48[0]), 1u);
+        _45[_52] = idata;
+    }
+}
+
diff --git a/reference/shaders-opencl/comp/basic.dispatchbase.comp b/reference/shaders-opencl/comp/basic.dispatchbase.comp
new file mode 100644
index 000000000..fc994276b
--- /dev/null
+++ b/reference/shaders-opencl/comp/basic.dispatchbase.comp
@@ -0,0 +1,43 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO // Input block layout; not referenced by comp_main.
+{
+    float4 in_data[1];
+};
+
+typedef struct SSBO SSBO;
+
+struct SSBO2 // Output block layout; not referenced by comp_main.
+{
+    float4 out_data[1];
+};
+
+typedef struct SSBO2 SSBO2;
+
+struct SSBO3 // Counter block layout; not referenced by comp_main.
+{
+    uint counter;
+};
+
+typedef struct SSBO3 SSBO3;
+
+#ifndef SPIRV_CROSS_CONSTANT_ID_10
+#define SPIRV_CROSS_CONSTANT_ID_10 1u
+#endif // Default of 1u applies unless the macro is defined before this point.
+constant uint _59 = SPIRV_CROSS_CONSTANT_ID_10;
+constant uint3 spvWorkgroupSize = (uint3)(_59, 1u, 1u); // Program-scope here; not read by comp_main.
+
+__attribute__((reqd_work_group_size(1, 1, 1))) // NOTE(review): literal 1 is used for x even though _59 can be overridden via the macro - confirm intended.
+__kernel void comp_main(__global const float4* _27, __global float4* _49, __global uint* _52) // Appends inputs that pass a dot-product threshold to _49, using _52[0] as the append counter.
+{
+    uint ident = ((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x;
+    uint workgroup = ((uint3)(get_group_id(0), get_group_id(1), get_group_id(2))).x; // Computed but never read.
+    float4 idata = _27[ident];
+    if (dot(idata, (float4)(1.0f, 5.0f, 6.0f, 2.0f)) > 8.19999980926513671875f)
+    {
+        uint _56 = atomic_add(&(_52[0]), 1u); // Pre-increment counter value becomes the output slot.
+        _49[_56] = idata;
+    }
+}
+
diff --git a/reference/shaders-opencl/comp/buffer-push-const.comp b/reference/shaders-opencl/comp/buffer-push-const.comp
new file mode 100644
index 000000000..df95f2205
--- /dev/null
+++ b/reference/shaders-opencl/comp/buffer-push-const.comp
@@ -0,0 +1,25 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct Buf // Buffer block layout; not referenced by comp_main, which takes a uint pointer instead.
+{
+    uint data[1];
+};
+
+typedef struct Buf Buf;
+
+struct parameter // Single-uint block passed to the kernel by value.
+{
+    uint n;
+};
+
+typedef struct parameter parameter;
+
+__attribute__((reqd_work_group_size(64, 1, 1)))
+__kernel void comp_main(__global uint* _19, parameter p) // Writes (global x index + p.n) into _19 at the global x index.
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2)); // Never read in this kernel.
+    uint i = ((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x;
+    _19[i] = i + p.n;
+}
+
diff --git a/reference/shaders-opencl/comp/builtins.comp b/reference/shaders-opencl/comp/builtins.comp
new file mode 100644
index 000000000..d0a877bee
--- /dev/null
+++ b/reference/shaders-opencl/comp/builtins.comp
@@ -0,0 +1,15 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+__attribute__((reqd_work_group_size(8, 4, 2)))
+__kernel void comp_main() // Materializes each work-item/work-group query into a local; nothing is written to memory.
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    uint3 local_id = ((uint3)(get_local_id(0), get_local_id(1), get_local_id(2)));
+    uint3 global_id = ((uint3)(get_global_id(0), get_global_id(1), get_global_id(2)));
+    uint local_index = ((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0))); // Linearized local ID: z * (sx * sy) + y * sx + x.
+    uint3 work_group_size = spvWorkgroupSize; // Only use of spvWorkgroupSize in this kernel.
+    uint3 num_work_groups = ((uint3)(get_num_groups(0), get_num_groups(1), get_num_groups(2)));
+    uint3 work_group_id = ((uint3)(get_group_id(0), get_group_id(1), get_group_id(2)));
+}
+
diff --git a/reference/shaders-opencl/comp/cfg-preserve-parameter.comp b/reference/shaders-opencl/comp/cfg-preserve-parameter.comp
new file mode 100644
index 000000000..707d5ec40
--- /dev/null
+++ b/reference/shaders-opencl/comp/cfg-preserve-parameter.comp
@@ -0,0 +1,75 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+void out_test_0( int* cond_5,  int* i_5) // Stores 40 to *i_5 when *cond_5 is 0, otherwise 60; *i_5 is written on every path.
+{
+    if ((*cond_5) == 0)
+    {
+        *i_5 = 40;
+    }
+    else
+    {
+        *i_5 = 60;
+    }
+}
+
+void out_test_1( int* cond_1_1,  int* i_1_1) // Stores 40 to *i_1_1 when *cond_1_1 is 40, otherwise 70; written on every path.
+{
+    switch ((*cond_1_1))
+    {
+        case 40:
+        {
+            *i_1_1 = 40;
+            break;
+        }
+        default:
+        {
+            *i_1_1 = 70;
+            break;
+        }
+    }
+}
+
+void inout_test_0( int* cond_2_1,  int* i_2_1) // Stores 40 to *i_2_1 only when *cond_2_1 is 0; otherwise *i_2_1 keeps its incoming value.
+{
+    if ((*cond_2_1) == 0)
+    {
+        *i_2_1 = 40;
+    }
+}
+
+void inout_test_1( int* cond_3_1,  int* i_3_1) // Stores 40 to *i_3_1 only when *cond_3_1 is 40; there is no default case, so other values leave it untouched.
+{
+    switch ((*cond_3_1))
+    {
+        case 40:
+        {
+            *i_3_1 = 40;
+            break;
+        }
+    }
+}
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main() // Calls the four helpers in turn, passing copies of cond/i by pointer and copying i back after each call.
+{
+    int cond_4_1 = 40;
+    int i_4_1 = 50;
+    int param_8 = cond_4_1; // Each call gets fresh temporaries so the helper cannot modify cond_4_1.
+    int param_1_1 = i_4_1;
+    out_test_0(&param_8, &param_1_1);
+    i_4_1 = param_1_1; // Only the i temporary is copied back.
+    int param_2_1 = cond_4_1;
+    int param_3_1 = i_4_1;
+    out_test_1(&param_2_1, &param_3_1);
+    i_4_1 = param_3_1;
+    int param_4_1 = cond_4_1;
+    int param_5_1 = i_4_1; // Seeded with the current i, so the helper's untouched path preserves it.
+    inout_test_0(&param_4_1, &param_5_1);
+    i_4_1 = param_5_1;
+    int param_6_1 = cond_4_1;
+    int param_7_1 = i_4_1;
+    inout_test_1(&param_6_1, &param_7_1);
+    i_4_1 = param_7_1;
+}
+
diff --git a/reference/shaders-opencl/comp/complex-type-alias.comp b/reference/shaders-opencl/comp/complex-type-alias.comp
new file mode 100644
index 000000000..137313959
--- /dev/null
+++ b/reference/shaders-opencl/comp/complex-type-alias.comp
@@ -0,0 +1,56 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct Foo0 // Innermost struct: a single float.
+{
+    float a;
+};
+
+typedef struct Foo0 Foo0;
+
+struct Foo1 // Wraps one Foo0.
+{
+    Foo0 a;
+};
+
+typedef struct Foo1 Foo1;
+
+struct Foo2 // Wraps one Foo1 plus a float weight.
+{
+    Foo1 a;
+    float weight;
+};
+
+typedef struct Foo2 Foo2;
+
+struct SSBO // Buffer block layout holding Foo2 elements; the kernel in this file takes a Foo2 pointer rather than this type.
+{
+    Foo2 outputs[1];
+};
+
+typedef struct SSBO SSBO;
+
+void Zero( Foo0* v_1) // Sets the float member of *v_1 to 0.0f.
+{
+    v_1->a = 0.0f;
+}
+
+__attribute__((reqd_work_group_size(8, 8, 1)))
+__kernel void comp_main(__global Foo2* _53) // Each work-item stores a zeroed Foo2 into local memory; local index 0 then copies coeffs[0] to _53 at the group's x index.
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2)); // Never read in this kernel.
+    __local Foo2 coeffs[64]; // 64 entries matches the 8 * 8 * 1 required work-group size.
+    Foo2 data_1;
+    data_1.weight = 0.0f;
+    Foo0 param_1;
+    Zero(&param_1);
+    data_1.a.a = param_1; // data_1.a is a Foo1, so data_1.a.a is the Foo0 being assigned.
+    coeffs[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] = data_1; // Indexed by the linearized local ID.
+    barrier(CLK_LOCAL_MEM_FENCE); // All local stores are complete before coeffs[0] is read below.
+    if (((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0))) == 0u)
+    {
+        _53[((uint3)(get_group_id(0), get_group_id(1), get_group_id(2))).x].a.a.a = coeffs[0].a.a.a; // Copied member by member rather than as a whole struct.
+        _53[((uint3)(get_group_id(0), get_group_id(1), get_group_id(2))).x].weight = coeffs[0].weight;
+    }
+}
+
diff --git a/reference/shaders-opencl/comp/composite-construct.comp b/reference/shaders-opencl/comp/composite-construct.comp
new file mode 100644
index 000000000..995603ba4
--- /dev/null
+++ b/reference/shaders-opencl/comp/composite-construct.comp
@@ -0,0 +1,37 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO0 // Buffer block layout; not referenced by comp_main.
+{
+    float4 as[1];
+};
+
+typedef struct SSBO0 SSBO0;
+
+struct SSBO1 // Buffer block layout; not referenced by comp_main.
+{
+    float4 bs[1];
+};
+
+typedef struct SSBO1 SSBO1;
+
+constant float4 _43[2] = { (float4)(20.0f), (float4)(40.0f) }; // Each element is a float4 with all four lanes set to the given scalar.
+
+struct Composite // Pair of float4 values built in comp_main.
+{
+    float4 a;
+    float4 b;
+};
+
+typedef struct Composite Composite;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global float4* _16, __global float4* _32) // Builds a local array and a Composite from buffer and constant data, then writes one value back to each buffer.
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2)); // Never read in this kernel.
+    float4 values[2] = { _16[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x], _32[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] }; // One element from each buffer at the global x index.
+    Composite c = (Composite){ values[0], _43[1] };
+    _16[0] = values[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]; // Indexed by the linearized local ID.
+    _32[1] = c.b; // c.b was initialized from _43[1].
+}
+
diff --git a/reference/shaders-opencl/comp/culling.comp b/reference/shaders-opencl/comp/culling.comp
new file mode 100644
index 000000000..c6286177f
--- /dev/null
+++ b/reference/shaders-opencl/comp/culling.comp
@@ -0,0 +1,37 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO // Input block layout; not referenced by comp_main.
+{
+    float in_data[1];
+};
+
+typedef struct SSBO SSBO;
+
+struct SSBO2 // Output block layout; not referenced by comp_main.
+{
+    float out_data[1];
+};
+
+typedef struct SSBO2 SSBO2;
+
+struct SSBO3 // Counter block layout; not referenced by comp_main.
+{
+    uint count;
+};
+
+typedef struct SSBO3 SSBO3;
+
+__attribute__((reqd_work_group_size(4, 1, 1)))
+__kernel void comp_main(__global const float* _22, __global float* _38, __global uint* _41) // Appends inputs greater than 12.0f to _38, using _41[0] as the append counter.
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2)); // Never read in this kernel.
+    uint ident = ((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x;
+    float idata = _22[ident];
+    if (idata > 12.0f)
+    {
+        uint _45 = atomic_add(&(_41[0]), 1u); // Pre-increment counter value becomes the output slot.
+        _38[_45] = idata;
+    }
+}
+
diff --git a/reference/shaders-opencl/comp/defer-parens.comp b/reference/shaders-opencl/comp/defer-parens.comp
new file mode 100644
index 000000000..90d8ebb41
--- /dev/null
+++ b/reference/shaders-opencl/comp/defer-parens.comp
@@ -0,0 +1,25 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO // Block layout used directly as the kernel argument type.
+{
+    float4 data;
+    int index;
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global SSBO* _13) // Overwrites _13->data several times with expressions derived from its initial value d.
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2)); // Never read in this kernel.
+    float4 d = _13->data; // Snapshot taken before any store; every expression below reads d, not _13->data.
+    _13->data = (float4)(d.x, d.yz + (float2)(10.0f), d.w);
+    _13->data = (d + d) + d;
+    _13->data = (d.yz + (float2)(10.0f)).xxyy; // Swizzle applied to the parenthesized sum.
+    float t = (d.yz + (float2)(10.0f)).y;
+    _13->data = (float4)(t);
+    t = (d.zw + (float2)(10.0f))[_13->index]; // Component selected by a runtime index read from the buffer.
+    _13->data = (float4)(t);
+}
+
diff --git a/reference/shaders-opencl/comp/dowhile.comp b/reference/shaders-opencl/comp/dowhile.comp
new file mode 100644
index 000000000..2dca8bcda
--- /dev/null
+++ b/reference/shaders-opencl/comp/dowhile.comp
@@ -0,0 +1,34 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO // Block layout used directly as the kernel's first argument type.
+{
+    float4 mvp; // NOTE(review): the name suggests a matrix, but the declared type here is a single float4 - confirm.
+    float4 in_data[1];
+};
+
+typedef struct SSBO SSBO;
+
+struct SSBO2 // Output block layout; not referenced by comp_main.
+{
+    float4 out_data[1];
+};
+
+typedef struct SSBO2 SSBO2;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global const SSBO* _28, __global float4* _52) // Multiplies one input element by _28->mvp sixteen times and stores the result.
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2)); // Never read in this kernel.
+    int i;
+    uint ident = ((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x;
+    i = 0;
+    float4 idat = _28->in_data[ident];
+    do
+    {
+        idat = _28->mvp * idat; // Both operands are float4, so this is a component-wise product.
+        i++;
+    } while (i < 16); // Body runs 16 times: i goes 0..15 before the test fails.
+    _52[ident] = idat;
+}
+
diff --git a/reference/shaders-opencl/comp/expect-assume.comp b/reference/shaders-opencl/comp/expect-assume.comp
new file mode 100644
index 000000000..a9415be79
--- /dev/null
+++ b/reference/shaders-opencl/comp/expect-assume.comp
@@ -0,0 +1,17 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct buffer_t // Block layout; not referenced by comp_main, which takes a uint pointer instead.
+{
+    uint z;
+};
+
+typedef struct buffer_t buffer_t;
+
+__attribute__((reqd_work_group_size(32, 1, 1)))
+__kernel void comp_main(__global uint* buf) // Stores the z component of the work-group ID into buf[0].
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2)); // Never read in this kernel.
+    buf[0] = ((uint3)(get_group_id(0), get_group_id(1), get_group_id(2))).z;
+}
+
diff --git a/reference/shaders-opencl/comp/force-recompile-hooks.swizzle.comp b/reference/shaders-opencl/comp/force-recompile-hooks.swizzle.comp
new file mode 100644
index 000000000..c36ee1b4c
--- /dev/null
+++ b/reference/shaders-opencl/comp/force-recompile-hooks.swizzle.comp
@@ -0,0 +1,12 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+const sampler_t spvDefaultSampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; // Program-scope sampler: unnormalized coordinates, clamp addressing, nearest filtering.
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(read_only image2d_t foo, write_only image2d_t bar) // Samples foo at (1.0, 1.0) and writes the texel to bar at (0, 0).
+{
+    float4 a_1 = read_imagef(foo, spvDefaultSampler, (float2)(1.0f)); // Float coordinates are used with the sampler declared at program scope.
+    write_imagef(bar, (int2)(0), a_1);
+}
+
diff --git a/reference/shaders-opencl/comp/functions.comp b/reference/shaders-opencl/comp/functions.comp
new file mode 100644
index 000000000..490fc907e
--- /dev/null
+++ b/reference/shaders-opencl/comp/functions.comp
@@ -0,0 +1,15 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+void myfunc(__local int* foo) // Stores 13 into the first element of the local-memory array it is given.
+{
+    foo[0] = 13;
+}
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main() // Declares a local-memory array and passes it to myfunc.
+{
+    __local int foo[1337]; // Declared inside the kernel and handed to the helper as a __local pointer.
+    myfunc(foo);
+}
+
diff --git a/reference/shaders-opencl/comp/global-invocation-id.comp b/reference/shaders-opencl/comp/global-invocation-id.comp
new file mode 100644
index 000000000..84693b0ee
--- /dev/null
+++ b/reference/shaders-opencl/comp/global-invocation-id.comp
@@ -0,0 +1,18 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct myBlock // Block layout used directly as the kernel argument type.
+{
+    int a;
+    float b[1];
+};
+
+typedef struct myBlock myBlock;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global myBlock* myStorage) // Increments a modulo 256 and bumps b at the global x index.
+{
+    myStorage->a = (myStorage->a + 1) % 256; // Plain read-modify-write; no atomic is used.
+    myStorage->b[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] += 0.0199999995529651641845703125f;
+}
+
diff --git a/reference/shaders-opencl/comp/image.comp b/reference/shaders-opencl/comp/image.comp
new file mode 100644
index 000000000..4eca27cde
--- /dev/null
+++ b/reference/shaders-opencl/comp/image.comp
@@ -0,0 +1,11 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(read_only image2d_t uImageIn, write_only image2d_t uImageOut) // Reads uImageIn at (global xy + image size) and writes that texel to uImageOut at global xy.
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2)); // Never read in this kernel.
+    float4 v = read_imagef(uImageIn, as_int2(((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).xy) + (int2)(get_image_width(uImageIn), get_image_height(uImageIn))); // Sampler-less read with integer coordinates offset by the image width/height.
+    write_imagef(uImageOut, as_int2(((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).xy), v); // as_int2 reinterprets the uint2 bits as int2.
+}
+
diff --git a/reference/shaders-opencl/comp/insert.comp b/reference/shaders-opencl/comp/insert.comp
new file mode 100644
index 000000000..64d372262
--- /dev/null
+++ b/reference/shaders-opencl/comp/insert.comp
@@ -0,0 +1,23 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO // Output block layout; not referenced by comp_main.
+{
+    float4 out_data[1];
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global float4* _27) // Builds a float4 one component at a time, stores it, then overwrites component 1 in the buffer.
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2)); // Never read in this kernel.
+    float4 v;
+    v.x = 10.0f;
+    v.y = 30.0f;
+    v.z = 70.0f;
+    v.w = 90.0f;
+    _27[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] = v; // All four components were assigned above.
+    _27[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x][1u] = 20.0f; // Second subscript selects a vector component of the stored float4.
+}
+
diff --git a/reference/shaders-opencl/comp/local-invocation-id.comp b/reference/shaders-opencl/comp/local-invocation-id.comp
new file mode 100644
index 000000000..0def2374c
--- /dev/null
+++ b/reference/shaders-opencl/comp/local-invocation-id.comp
@@ -0,0 +1,18 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct myBlock // Block layout used directly as the kernel argument type.
+{
+    int a;
+    float b[1];
+};
+
+typedef struct myBlock myBlock;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global myBlock* myStorage) // Increments a modulo 256 and bumps b at the local x index.
+{
+    myStorage->a = (myStorage->a + 1) % 256; // Plain read-modify-write; no atomic is used.
+    myStorage->b[((uint3)(get_local_id(0), get_local_id(1), get_local_id(2))).x] += 0.0199999995529651641845703125f;
+}
+
diff --git a/reference/shaders-opencl/comp/local-invocation-index.comp b/reference/shaders-opencl/comp/local-invocation-index.comp
new file mode 100644
index 000000000..0a1a8ed3c
--- /dev/null
+++ b/reference/shaders-opencl/comp/local-invocation-index.comp
@@ -0,0 +1,18 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct myBlock // Block layout used directly as the kernel argument type.
+{
+    int a;
+    float b[1];
+};
+
+typedef struct myBlock myBlock;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global myBlock* myStorage) // Increments a modulo 256 and bumps b at the linearized local index.
+{
+    myStorage->a = (myStorage->a + 1) % 256; // Plain read-modify-write; no atomic is used.
+    myStorage->b[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += 0.0199999995529651641845703125f; // Index is z * (sx * sy) + y * sx + x.
+}
+
diff --git a/reference/shaders-opencl/comp/local-size-duplicate-spec-id.comp b/reference/shaders-opencl/comp/local-size-duplicate-spec-id.comp
new file mode 100644
index 000000000..5f462293d
--- /dev/null
+++ b/reference/shaders-opencl/comp/local-size-duplicate-spec-id.comp
@@ -0,0 +1,42 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct StorageBuffer // Block layout; not referenced by comp_main, which takes a uint pointer instead.
+{
+    uint values[1];
+};
+
+typedef struct StorageBuffer StorageBuffer;
+
+#ifndef SPIRV_CROSS_CONSTANT_ID_0
+#define SPIRV_CROSS_CONSTANT_ID_0 1
+#endif
+constant int local_size_x_val = SPIRV_CROSS_CONSTANT_ID_0;
+#ifndef SPIRV_CROSS_CONSTANT_ID_1
+#define SPIRV_CROSS_CONSTANT_ID_1 1
+#endif
+constant int local_size_y_val = SPIRV_CROSS_CONSTANT_ID_1;
+#ifndef SPIRV_CROSS_CONSTANT_ID_2
+#define SPIRV_CROSS_CONSTANT_ID_2 1
+#endif
+constant int local_size_z_val = SPIRV_CROSS_CONSTANT_ID_2;
+#ifndef SPIRV_CROSS_CONSTANT_ID_0
+#define SPIRV_CROSS_CONSTANT_ID_0 1u
+#endif // This guard can never fire: the macro is always defined by the first block for ID 0.
+constant uint _22 = SPIRV_CROSS_CONSTANT_ID_0; // So the uint constant takes whatever the earlier definition expands to.
+#ifndef SPIRV_CROSS_CONSTANT_ID_1
+#define SPIRV_CROSS_CONSTANT_ID_1 1u
+#endif // Same: already defined for ID 1.
+constant uint _23 = SPIRV_CROSS_CONSTANT_ID_1;
+#ifndef SPIRV_CROSS_CONSTANT_ID_2
+#define SPIRV_CROSS_CONSTANT_ID_2 1u
+#endif // Same: already defined for ID 2.
+constant uint _24 = SPIRV_CROSS_CONSTANT_ID_2;
+constant uint3 spvWorkgroupSize = (uint3)(_22, _23, _24); // Not read by comp_main.
+
+__attribute__((reqd_work_group_size(1, 1, 1))) // NOTE(review): literals are used even though IDs 0-2 can be overridden via the macros - confirm intended.
+__kernel void comp_main(__global uint* ssbo) // Stores 1u at the linearized local index.
+{
+    ssbo[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] = 1u;
+}
+
diff --git a/reference/shaders-opencl/comp/mod.comp b/reference/shaders-opencl/comp/mod.comp
new file mode 100644
index 000000000..ad1ead0ed
--- /dev/null
+++ b/reference/shaders-opencl/comp/mod.comp
@@ -0,0 +1,30 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO // Input block layout; not referenced by comp_main.
+{
+    float4 in_data[1];
+};
+
+typedef struct SSBO SSBO;
+
+struct SSBO2 // Output block layout; not referenced by comp_main.
+{
+    float4 out_data[1];
+};
+
+typedef struct SSBO2 SSBO2;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global const float4* _23, __global float4* _33) // Applies float, unsigned and signed remainder to the same element pair, storing each result over the previous one.
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2)); // Never read in this kernel.
+    uint ident = ((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x;
+    float4 v = fmod(_23[ident], _33[ident]);
+    _33[ident] = v;
+    uint4 vu = as_uint4(_23[ident]) % as_uint4(_33[ident]); // as_uint4 reinterprets the float bits; _33[ident] here is the fmod result stored above.
+    _33[ident] = as_float4(vu);
+    int4 vi = as_int4(_23[ident]) % as_int4(_33[ident]); // Divisor is the bit pattern stored by the previous statement.
+    _33[ident] = as_float4(vi);
+}
+
diff --git a/reference/shaders-opencl/comp/modf.comp b/reference/shaders-opencl/comp/modf.comp
new file mode 100644
index 000000000..e890b4341
--- /dev/null
+++ b/reference/shaders-opencl/comp/modf.comp
@@ -0,0 +1,37 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO // Input block layout; not referenced by comp_main.
+{
+    float4 in_data[1];
+};
+
+typedef struct SSBO SSBO;
+
+struct ResType // Holds both modf results: _m0 receives the return value, _m1 the value written through the pointer.
+{
+    float4 _m0;
+    float4 _m1;
+};
+
+typedef struct ResType ResType;
+
+struct SSBO2 // Output block layout; not referenced by comp_main.
+{
+    float4 out_data[1];
+};
+
+typedef struct SSBO2 SSBO2;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global const float4* _23, __global float4* _38) // Splits one input element with modf and stores the returned part.
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2)); // Never read in this kernel.
+    uint ident = ((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x;
+    ResType _32;
+    _32._m0 = modf(_23[ident], &_32._m1); // modf returns the fractional part and writes the integral part through the pointer.
+    float4 i = _32._m1; // Copied out but never read afterwards.
+    float4 v = _32._m0;
+    _38[ident] = v;
+}
+
diff --git a/reference/shaders-opencl/comp/outer-product.comp b/reference/shaders-opencl/comp/outer-product.comp
new file mode 100644
index 000000000..8441e6d2d
--- /dev/null
+++ b/reference/shaders-opencl/comp/outer-product.comp
@@ -0,0 +1,42 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO // Output block. NOTE(review): member names read like matrix shapes (m22..m44) but each is declared as a single vector - confirm.
+{
+    float2 m22;
+    float3 m23;
+    float4 m24;
+    float2 m32;
+    float3 m33;
+    float4 m34;
+    float2 m42;
+    float3 m43;
+    float4 m44;
+};
+
+typedef struct SSBO SSBO;
+
+struct ReadSSBO // Input block holding one vector of each width.
+{
+    float2 v2;
+    float3 v3;
+    float4 v4;
+};
+
+typedef struct ReadSSBO ReadSSBO;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global SSBO* _21, __global const ReadSSBO* _26) // Stores each input vector scaled by the x component of v2, v3 and v4 in turn.
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2)); // Never read in this kernel.
+    _21->m22 = _26->v2 * _26->v2.x; // Vector times scalar; only the .x component of the right-hand vector is used.
+    _21->m23 = _26->v3 * _26->v2.x;
+    _21->m24 = _26->v4 * _26->v2.x;
+    _21->m32 = _26->v2 * _26->v3.x;
+    _21->m33 = _26->v3 * _26->v3.x;
+    _21->m34 = _26->v4 * _26->v3.x;
+    _21->m42 = _26->v2 * _26->v4.x;
+    _21->m43 = _26->v3 * _26->v4.x;
+    _21->m44 = _26->v4 * _26->v4.x;
+}
+
diff --git a/reference/shaders-opencl/comp/packing-test-1.comp b/reference/shaders-opencl/comp/packing-test-1.comp
new file mode 100644
index 000000000..0afe8d4eb
--- /dev/null
+++ b/reference/shaders-opencl/comp/packing-test-1.comp
@@ -0,0 +1,36 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct T1 // Element type of the first kernel argument: a float3 followed by a float.
+{
+    float3 a;
+    float b;
+};
+
+typedef struct T1 T1;
+
+struct Buffer0 // Block layout; not referenced by comp_main, which takes a T1 pointer instead.
+{
+    T1 buf0[1];
+};
+
+typedef struct Buffer0 Buffer0;
+
+struct Buffer1 // Block layout; not referenced by comp_main.
+{
+    float buf1[1];
+};
+
+typedef struct Buffer1 Buffer1;
+
+__attribute__((reqd_work_group_size(32, 1, 1)))
+__kernel void comp_main(__global T1* _15, __global float* _34) // Copies _15[0] into a private T1 and writes its b member to _34 at the global x index.
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2)); // Never read in this kernel.
+    T1 v;
+    v.a = _15[0].a; // Copied member by member rather than as a whole struct.
+    v.b = _15[0].b;
+    float x = v.b;
+    _34[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] = x;
+}
+
diff --git a/reference/shaders-opencl/comp/packing-test-2.comp b/reference/shaders-opencl/comp/packing-test-2.comp
new file mode 100644
index 000000000..ddf27da61
--- /dev/null
+++ b/reference/shaders-opencl/comp/packing-test-2.comp
@@ -0,0 +1,33 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct T1 // Element type of the first kernel argument: a float3 followed by a float.
+{
+    float3 a;
+    float b;
+};
+
+typedef struct T1 T1;
+
+struct Buffer0 // Block layout; not referenced by comp_main, which takes a T1 pointer instead.
+{
+    T1 buf0[1];
+};
+
+typedef struct Buffer0 Buffer0;
+
+struct Buffer1 // Block layout; not referenced by comp_main.
+{
+    float buf1[1];
+};
+
+typedef struct Buffer1 Buffer1;
+
+__attribute__((reqd_work_group_size(32, 1, 1)))
+__kernel void comp_main(__global T1* _14, __global float* _24) // Writes _14[0].b to _24 at the global x index.
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2)); // Never read in this kernel.
+    float x = _14[0].b; // Only the b member is read; a is untouched.
+    _24[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] = x;
+}
+
diff --git a/reference/shaders-opencl/comp/read-write-only.comp b/reference/shaders-opencl/comp/read-write-only.comp
new file mode 100644
index 000000000..6b54b862b
--- /dev/null
+++ b/reference/shaders-opencl/comp/read-write-only.comp
@@ -0,0 +1,35 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO2 // Destination block: both members are only written by comp_main.
+{
+    float4 data4;
+    float4 data5;
+};
+
+typedef struct SSBO2 SSBO2;
+
+struct SSBO0 // Source block passed as a pointer to const.
+{
+    float4 data0;
+    float4 data1;
+};
+
+typedef struct SSBO0 SSBO0;
+
+struct SSBO1 // Source block passed as a non-const pointer, though comp_main only reads it.
+{
+    float4 data2;
+    float4 data3;
+};
+
+typedef struct SSBO1 SSBO1;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global SSBO2* _10, __global const SSBO0* _15, __global SSBO1* _21) // Stores the member-wise sums of _15 and _21 into _10.
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2)); // Never read in this kernel.
+    _10->data4 = _15->data0 + _21->data2;
+    _10->data5 = _15->data1 + _21->data3;
+}
+
diff --git a/reference/shaders-opencl/comp/rmw-opt.comp b/reference/shaders-opencl/comp/rmw-opt.comp
new file mode 100644
index 000000000..f205a3654
--- /dev/null
+++ b/reference/shaders-opencl/comp/rmw-opt.comp
@@ -0,0 +1,31 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO // Block layout; not referenced by comp_main, which takes an int pointer instead.
+{
+    int a;
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global int* _9) // Applies a sequence of read-modify-write operations to _9[0], then stores a boolean result as int.
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2)); // Never read in this kernel.
+    _9[0] += 10;
+    _9[0] -= 10;
+    _9[0] *= 10;
+    _9[0] /= 10;
+    _9[0] = _9[0] << 2; // Shifts are written out in full while the other operators use compound assignment.
+    _9[0] = _9[0] >> 3;
+    _9[0] &= 40;
+    _9[0] ^= 10;
+    _9[0] %= 40;
+    _9[0] |= 1;
+    bool c = false;
+    bool d = true;
+    c = c && d;
+    d = d || c;
+    _9[0] = (int)(c && d); // Overwrites the arithmetic result with 0 or 1.
+}
+
diff --git a/reference/shaders-opencl/comp/scalar-std450-distance-length-normalize.comp b/reference/shaders-opencl/comp/scalar-std450-distance-length-normalize.comp
new file mode 100644
index 000000000..59f3fb7ed
--- /dev/null
+++ b/reference/shaders-opencl/comp/scalar-std450-distance-length-normalize.comp
@@ -0,0 +1,25 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO // Block layout used directly as the kernel argument type: a and b are inputs, c..f receive results.
+{
+    float a;
+    float b;
+    float c;
+    float d;
+    float e;
+    float f;
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global SSBO* _9) // Calls distance, length and normalize with scalar float arguments.
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2)); // Never read in this kernel.
+    _9->c = distance(_9->a, _9->b);
+    _9->d = length(_9->a);
+    _9->e = normalize(_9->a);
+    _9->f = distance(_9->a - 1.0f, _9->b - 2.0f); // Same builtin applied to expression operands rather than plain loads.
+}
+
diff --git a/reference/shaders-opencl/comp/shared-std450.double.comp b/reference/shaders-opencl/comp/shared-std450.double.comp
new file mode 100644
index 000000000..84cb8354c
--- /dev/null
+++ b/reference/shaders-opencl/comp/shared-std450.double.comp
@@ -0,0 +1,31 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+struct SSBO // Input block layout; not referenced by comp_main.
+{
+    double in_data[1];
+};
+
+typedef struct SSBO SSBO;
+
+struct SSBO2 // Output block layout; not referenced by comp_main.
+{
+    double out_data[1];
+};
+
+typedef struct SSBO2 SSBO2;
+
+__attribute__((reqd_work_group_size(4, 1, 1)))
+__kernel void comp_main(__global const double* _22, __global double* _44) // Stages one double per work-item in local memory and reads it back in reversed local order.
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2)); // Never read in this kernel.
+    __local double sShared[4]; // Four entries matches the 4 * 1 * 1 required work-group size.
+    uint ident = ((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x;
+    double idata = _22[ident];
+    sShared[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] = idata; // Indexed by the linearized local ID.
+    mem_fence(CLK_LOCAL_MEM_FENCE);
+    barrier(CLK_LOCAL_MEM_FENCE); // All local stores are complete before the mirrored read below.
+    _44[ident] = sShared[(4u - ((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))) - 1u]; // Reads slot (3 - local index).
+}
+
diff --git a/reference/shaders-opencl/comp/shared-struct-bool-cast.comp b/reference/shaders-opencl/comp/shared-struct-bool-cast.comp
new file mode 100644
index 000000000..870672c31
--- /dev/null
+++ b/reference/shaders-opencl/comp/shared-struct-bool-cast.comp
@@ -0,0 +1,106 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct S1 // Type of the local-memory variable declared in this file's kernel.
+{
+    int3 a;
+    uint2 b;
+    int4 c; // Filled from boolean literals by the kernel in this file.
+    uint d;
+};
+
+typedef struct S1 S1;
+
+struct block // Block layout with one uint; the kernel in this file takes a uint pointer rather than this type.
+{
+    uint passed;
+};
+
+typedef struct block block;
+
+bool compare_ivec3( int3* a,  int3* b) // Returns true only when every component of *a equals the matching component of *b.
+{
+    return all((*a) == (*b)); // The vector comparison yields a per-lane mask that all() reduces.
+}
+
+bool compare_uvec2( uint2* a_1,  uint2* b_1) // Returns true only when both components of *a_1 equal those of *b_1.
+{
+    return all((*a_1) == (*b_1)); // The vector comparison yields a per-lane mask that all() reduces.
+}
+
+bool compare_bvec4( int4* a_2,  int4* b_2) // Returns true only when all four int components match; the bvec4 in the name is carried as int4 here.
+{
+    return all((*a_2) == (*b_2)); // The vector comparison yields a per-lane mask that all() reduces.
+}
+
+bool compare_uint( uint* a_3,  uint* b_3) // Returns whether the two pointed-to uints are equal.
+{
+    return (*a_3) == (*b_3);
+}
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global uint* _132) // Writes known values into a local S1, reads each member back through a compare helper, and bumps _132[0] if all match.
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2)); // Never read in this kernel.
+    __local S1 s1;
+    s1.a = (int3)(6, 8, 8);
+    s1.b = (uint2)(4u);
+    s1.c = (int4)(false, false, false, true); // Boolean literals are converted to int lanes.
+    s1.d = 6u;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    mem_fence(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); // Here the fence follows the barrier.
+    bool allOk = true;
+    bool _99; // Each check is skipped once allOk is false; the temporary then just carries allOk forward.
+    if (allOk)
+    {
+        int3 param = (int3)(6, 8, 8); // Expected value, identical to what was stored in s1.a.
+        int3 param_1 = s1.a; // Local-memory member copied to a private temporary before its address is taken.
+        _99 = compare_ivec3(&param, &param_1);
+    }
+    else
+    {
+        _99 = allOk;
+    }
+    allOk = _99;
+    bool _108;
+    if (allOk)
+    {
+        uint2 param_2 = (uint2)(4u);
+        uint2 param_3 = s1.b;
+        _108 = compare_uvec2(&param_2, &param_3);
+    }
+    else
+    {
+        _108 = allOk;
+    }
+    allOk = _108;
+    bool _117;
+    if (allOk)
+    {
+        int4 param_4 = (int4)(false, false, false, true);
+        int4 param_5 = s1.c;
+        _117 = compare_bvec4(&param_4, &param_5);
+    }
+    else
+    {
+        _117 = allOk;
+    }
+    allOk = _117;
+    bool _126;
+    if (allOk)
+    {
+        uint param_6 = 6u;
+        uint param_7 = s1.d;
+        _126 = compare_uint(&param_6, &param_7);
+    }
+    else
+    {
+        _126 = allOk;
+    }
+    allOk = _126;
+    if (allOk)
+    {
+        _132[0] += as_uint(1); // as_uint reinterprets the int literal's bits, so this adds 1u.
+    }
+}
+
diff --git a/reference/shaders-opencl/comp/shared-zero-init-simple.comp b/reference/shaders-opencl/comp/shared-zero-init-simple.comp
new file mode 100644
index 000000000..62136f145
--- /dev/null
+++ b/reference/shaders-opencl/comp/shared-zero-init-simple.comp
@@ -0,0 +1,27 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO // Input block layout; not referenced by comp_main.
+{
+    float in_data[1];
+};
+
+typedef struct SSBO SSBO;
+
+struct SSBO2 // Output block layout; not referenced by comp_main.
+{
+    float out_data[1];
+};
+
+typedef struct SSBO2 SSBO2;
+
+__attribute__((reqd_work_group_size(4, 1, 1)))
+__kernel void comp_main(__global const float* _22, __global float* _32) // Stores (sShared + input element) to _32 at the global x index.
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2)); // Never read in this kernel.
+    __local float sShared; // NOTE(review): read below with no visible store or initializer in this kernel - confirm the value is defined.
+    uint ident = ((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x;
+    float idata = _22[ident];
+    _32[ident] = sShared + idata;
+}
+
diff --git a/reference/shaders-opencl/comp/shared-zero-init.comp b/reference/shaders-opencl/comp/shared-zero-init.comp
new file mode 100644
index 000000000..ec05e3c9a
--- /dev/null
+++ b/reference/shaders-opencl/comp/shared-zero-init.comp
@@ -0,0 +1,32 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO // Input block layout; not referenced by comp_main.
+{
+    float in_data[1];
+};
+
+typedef struct SSBO SSBO;
+
+constant float _31[4] = { 0.0f, 0.0f, 0.0f, 0.0f }; // All-zero array; not referenced by comp_main.
+
+struct SSBO2 // Output block layout; not referenced by comp_main.
+{
+    float out_data[1];
+};
+
+typedef struct SSBO2 SSBO2;
+
+__attribute__((reqd_work_group_size(4, 1, 1)))
+__kernel void comp_main(__global const float* _22, __global float* _48) // Accumulates one input per work-item into local memory and reads it back in reversed local order.
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2)); // Never read in this kernel.
+    __local float sShared[4]; // NOTE(review): accumulated into below with no visible initializing store - confirm the starting value is defined.
+    uint ident = ((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x;
+    float idata = _22[ident];
+    sShared[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += idata; // Read-modify-write on the slot at the linearized local ID.
+    mem_fence(CLK_LOCAL_MEM_FENCE);
+    barrier(CLK_LOCAL_MEM_FENCE); // All local stores are complete before the mirrored read below.
+    _48[ident] = sShared[(4u - ((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))) - 1u]; // Reads slot (3 - local index).
+}
+
diff --git a/reference/shaders-opencl/comp/shared.comp b/reference/shaders-opencl/comp/shared.comp
new file mode 100644
index 000000000..a1b217b6f
--- /dev/null
+++ b/reference/shaders-opencl/comp/shared.comp
@@ -0,0 +1,30 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO // Input block layout; not referenced by comp_main.
+{
+    float in_data[1];
+};
+
+typedef struct SSBO SSBO;
+
+struct SSBO2 // Output block layout; not referenced by comp_main.
+{
+    float out_data[1];
+};
+
+typedef struct SSBO2 SSBO2;
+
+__attribute__((reqd_work_group_size(4, 1, 1)))
+__kernel void comp_main(__global const float* _22, __global float* _44) // Stages one float per work-item in local memory and reads it back in reversed local order.
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2)); // Never read in this kernel.
+    __local float sShared[4]; // Four entries matches the 4 * 1 * 1 required work-group size.
+    uint ident = ((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x;
+    float idata = _22[ident];
+    sShared[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] = idata; // Indexed by the linearized local ID.
+    mem_fence(CLK_LOCAL_MEM_FENCE);
+    barrier(CLK_LOCAL_MEM_FENCE); // All local stores are complete before the mirrored read below.
+    _44[ident] = sShared[(4u - ((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))) - 1u]; // Reads slot (3 - local index).
+}
+
diff --git a/reference/shaders-opencl/comp/spec-constant-work-group-size.comp b/reference/shaders-opencl/comp/spec-constant-work-group-size.comp
new file mode 100644
index 000000000..4bf86f53f
--- /dev/null
+++ b/reference/shaders-opencl/comp/spec-constant-work-group-size.comp
@@ -0,0 +1,39 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+#ifndef SPIRV_CROSS_CONSTANT_ID_1
+#define SPIRV_CROSS_CONSTANT_ID_1 2
+#endif
+constant int b = SPIRV_CROSS_CONSTANT_ID_1;
+#ifndef SPIRV_CROSS_CONSTANT_ID_0
+#define SPIRV_CROSS_CONSTANT_ID_0 1
+#endif
+constant int a = SPIRV_CROSS_CONSTANT_ID_0;
+
+struct SSBO
+{
+    int v[1];
+};
+
+typedef struct SSBO SSBO;
+
+#define _21 ((as_uint(a) + 0u))
+#ifndef SPIRV_CROSS_CONSTANT_ID_10
+#define SPIRV_CROSS_CONSTANT_ID_10 1u
+#endif
+constant uint _22 = SPIRV_CROSS_CONSTANT_ID_10;
+constant uint3 spvWorkgroupSize = (uint3)(_22, 20u, 1u);
+#define _27 (spvWorkgroupSize.x)
+#define _28 ((_21 + _27))
+#define _29 (spvWorkgroupSize.y)
+#define _30 ((_28 + _29))
+#define _32 ((1 - a))
+
+__attribute__((reqd_work_group_size(1, 20, 1)))
+__kernel void comp_main(__global int* _17)
+{
+    int spec_const_array_size[b];
+    spec_const_array_size[a] = a;
+    _17[_30] = b + spec_const_array_size[_32];
+}
+
diff --git a/reference/shaders-opencl/comp/struct-layout.comp b/reference/shaders-opencl/comp/struct-layout.comp
new file mode 100644
index 000000000..eb416ee27
--- /dev/null
+++ b/reference/shaders-opencl/comp/struct-layout.comp
@@ -0,0 +1,32 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct Foo
+{
+    float4 m;
+};
+
+typedef struct Foo Foo;
+
+struct SSBO2
+{
+    Foo out_data[1];
+};
+
+typedef struct SSBO2 SSBO2;
+
+struct SSBO
+{
+    Foo in_data[1];
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global Foo* _23, __global const Foo* _30)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    uint ident = ((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x;
+    _23[ident].m = _30[ident].m * _30[ident].m;
+}
+
diff --git a/reference/shaders-opencl/comp/struct-nested.comp b/reference/shaders-opencl/comp/struct-nested.comp
new file mode 100644
index 000000000..3aeed9be7
--- /dev/null
+++ b/reference/shaders-opencl/comp/struct-nested.comp
@@ -0,0 +1,33 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct s1
+{
+    int a;
+};
+
+typedef struct s1 s1;
+
+struct s2
+{
+    s1 b;
+};
+
+typedef struct s2 s2;
+
+struct dstbuffer
+{
+    s2 test[1];
+};
+
+typedef struct dstbuffer dstbuffer;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global s2* _19)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    s2 testVal;
+    testVal.b.a = 0;
+    _19[0].b.a = testVal.b.a;
+}
+
diff --git a/reference/shaders-opencl/comp/struct-packing.invalid.comp b/reference/shaders-opencl/comp/struct-packing.invalid.comp
new file mode 100644
index 000000000..e69de29bb
diff --git a/reference/shaders-opencl/comp/torture-loop.comp b/reference/shaders-opencl/comp/torture-loop.comp
new file mode 100644
index 000000000..45f32a55b
--- /dev/null
+++ b/reference/shaders-opencl/comp/torture-loop.comp
@@ -0,0 +1,55 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO
+{
+    float4 mvp;
+    float4 in_data[1];
+};
+
+typedef struct SSBO SSBO;
+
+struct SSBO2
+{
+    float4 out_data[1];
+};
+
+typedef struct SSBO2 SSBO2;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global const SSBO* _24, __global float4* _89)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    uint ident_1 = ((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x;
+    float4 idat_1 = _24->in_data[ident_1];
+    int k_1 = 0;
+    for (;;)
+    {
+        int _39 = k_1;
+        int _40 = _39 + 1;
+        k_1 = _40;
+        if (_40 < 10)
+        {
+            idat_1 *= 2.0f;
+            k_1++;
+            continue;
+        }
+        else
+        {
+            break;
+        }
+    }
+    for (uint i_1 = 0u; i_1 < 16u; i_1 += as_uint(1), k_1++)
+    {
+        for (uint j_1 = 0u; j_1 < 30u; j_1 += as_uint(1))
+        {
+            idat_1 = _24->mvp * idat_1;
+        }
+    }
+    do
+    {
+        k_1++;
+    } while (k_1 > 10);
+    _89[ident_1] = idat_1;
+}
+
diff --git a/reference/shaders-opencl/comp/type-alias.comp b/reference/shaders-opencl/comp/type-alias.comp
new file mode 100644
index 000000000..0a195bf2b
--- /dev/null
+++ b/reference/shaders-opencl/comp/type-alias.comp
@@ -0,0 +1,61 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct S0
+{
+    float4 a;
+};
+
+typedef struct S0 S0;
+
+struct S1
+{
+    float4 a;
+};
+
+typedef struct S1 S1;
+
+struct SSBO0
+{
+    S0 s0s[1];
+};
+
+typedef struct SSBO0 SSBO0;
+
+struct SSBO1
+{
+    S1 s1s[1];
+};
+
+typedef struct SSBO1 SSBO1;
+
+struct SSBO2
+{
+    float4 outputs[1];
+};
+
+typedef struct SSBO2 SSBO2;
+
+float4 overload( S0* s0)
+{
+    return s0->a;
+}
+
+float4 overload_1( S1* s1)
+{
+    return s1->a;
+}
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global S0* _36, __global S1* _55, __global float4* _66)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    S0 s0_1;
+    s0_1.a = _36[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].a;
+    S1 s1_1;
+    s1_1.a = _55[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].a;
+    S0 param = s0_1;
+    S1 param_1 = s1_1;
+    _66[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] = overload(&param) + overload_1(&param_1);
+}
+
diff --git a/reference/shaders-opencl/comp/udiv.comp b/reference/shaders-opencl/comp/udiv.comp
new file mode 100644
index 000000000..7e336b9b4
--- /dev/null
+++ b/reference/shaders-opencl/comp/udiv.comp
@@ -0,0 +1,24 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO2
+{
+    uint outputs[1];
+};
+
+typedef struct SSBO2 SSBO2;
+
+struct SSBO
+{
+    uint inputs[1];
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global uint* _10, __global uint* _23)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    _10[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] = _23[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] / 29u;
+}
+
diff --git a/reference/shaders-opencl/comp/writable-ssbo.comp b/reference/shaders-opencl/comp/writable-ssbo.comp
new file mode 100644
index 000000000..30716e427
--- /dev/null
+++ b/reference/shaders-opencl/comp/writable-ssbo.comp
@@ -0,0 +1,18 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct myBlock
+{
+    int a;
+    float b;
+};
+
+typedef struct myBlock myBlock;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global myBlock* myStorage)
+{
+    myStorage->a = (myStorage->a + 1) % 256;
+    myStorage->b += 0.0199999995529651641845703125f;
+}
+
diff --git a/shaders-opencl/asm/comp/atomic-decrement.asm.comp b/shaders-opencl/asm/comp/atomic-decrement.asm.comp
new file mode 100644
index 000000000..a87b93188
--- /dev/null
+++ b/shaders-opencl/asm/comp/atomic-decrement.asm.comp
@@ -0,0 +1,71 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Wine VKD3D Shader Compiler; 0
+; Bound: 43
+; Schema: 0
+               OpCapability Shader
+               OpCapability SampledBuffer
+               OpCapability ImageBuffer
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %3 "main" %15
+               OpExecutionMode %3 LocalSize 4 1 1
+               OpName %3 "main"
+               OpName %8 "u0"
+               OpName %9 "u0_counters"
+               OpMemberName %9 0 "c"
+               OpName %11 "u0_counter"
+               OpName %15 "vThreadID"
+               OpName %19 "r0"
+               OpDecorate %8 DescriptorSet 0
+               OpDecorate %8 Binding 0
+               OpMemberDecorate %9 0 Offset 0
+               OpDecorate %9 BufferBlock
+               OpDecorate %11 DescriptorSet 1
+               OpDecorate %11 Binding 0
+               OpDecorate %15 BuiltIn GlobalInvocationId
+          %1 = OpTypeVoid
+          %2 = OpTypeFunction %1
+          %5 = OpTypeInt 32 0
+          %6 = OpTypeImage %5 Buffer 0 0 0 2 R32ui
+          %7 = OpTypePointer UniformConstant %6
+          %8 = OpVariable %7 UniformConstant
+          %9 = OpTypeStruct %5
+         %10 = OpTypePointer Uniform %9
+         %11 = OpVariable %10 Uniform
+         %12 = OpTypeInt 32 1
+         %13 = OpTypeVector %12 3
+         %14 = OpTypePointer Input %13
+         %15 = OpVariable %14 Input
+         %16 = OpTypeFloat 32
+         %17 = OpTypeVector %16 4
+         %18 = OpTypePointer Function %17
+         %20 = OpTypePointer Uniform %5
+         %21 = OpConstant %5 0
+         %23 = OpConstant %5 1
+         %26 = OpTypePointer Function %16
+         %33 = OpConstant %12 0
+         %34 = OpConstant %5 2
+         %37 = OpTypePointer Input %12
+         %41 = OpTypeVector %5 4
+          %3 = OpFunction %1 None %2
+          %4 = OpLabel
+         %19 = OpVariable %18 Function
+         %22 = OpAccessChain %20 %11 %21
+         %24 = OpAtomicIDecrement %5 %22 %23 %21
+         %25 = OpBitcast %16 %24
+         %27 = OpInBoundsAccessChain %26 %19 %21
+               OpStore %27 %25
+         %28 = OpLoad %6 %8
+         %29 = OpInBoundsAccessChain %26 %19 %21
+         %30 = OpLoad %16 %29
+         %31 = OpBitcast %12 %30
+         %32 = OpIMul %5 %31 %23
+         %35 = OpShiftRightLogical %5 %33 %34
+         %36 = OpIAdd %5 %32 %35
+         %38 = OpInBoundsAccessChain %37 %15 %21
+         %39 = OpLoad %12 %38
+         %40 = OpBitcast %5 %39
+         %42 = OpCompositeConstruct %41 %40 %40 %40 %40
+               OpImageWrite %28 %36 %42
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl/asm/comp/atomic-increment.asm.comp b/shaders-opencl/asm/comp/atomic-increment.asm.comp
new file mode 100644
index 000000000..3acb7115f
--- /dev/null
+++ b/shaders-opencl/asm/comp/atomic-increment.asm.comp
@@ -0,0 +1,71 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Wine VKD3D Shader Compiler; 0
+; Bound: 43
+; Schema: 0
+               OpCapability Shader
+               OpCapability SampledBuffer
+               OpCapability ImageBuffer
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %3 "main" %15
+               OpExecutionMode %3 LocalSize 4 1 1
+               OpName %3 "main"
+               OpName %8 "u0"
+               OpName %9 "u0_counters"
+               OpMemberName %9 0 "c"
+               OpName %11 "u0_counter"
+               OpName %15 "vThreadID"
+               OpName %19 "r0"
+               OpDecorate %8 DescriptorSet 0
+               OpDecorate %8 Binding 0
+               OpMemberDecorate %9 0 Offset 0
+               OpDecorate %9 BufferBlock
+               OpDecorate %11 DescriptorSet 1
+               OpDecorate %11 Binding 0
+               OpDecorate %15 BuiltIn GlobalInvocationId
+          %1 = OpTypeVoid
+          %2 = OpTypeFunction %1
+          %5 = OpTypeInt 32 0
+          %6 = OpTypeImage %5 Buffer 0 0 0 2 R32ui
+          %7 = OpTypePointer UniformConstant %6
+          %8 = OpVariable %7 UniformConstant
+          %9 = OpTypeStruct %5
+         %10 = OpTypePointer Uniform %9
+         %11 = OpVariable %10 Uniform
+         %12 = OpTypeInt 32 1
+         %13 = OpTypeVector %12 3
+         %14 = OpTypePointer Input %13
+         %15 = OpVariable %14 Input
+         %16 = OpTypeFloat 32
+         %17 = OpTypeVector %16 4
+         %18 = OpTypePointer Function %17
+         %20 = OpTypePointer Uniform %5
+         %21 = OpConstant %5 0
+         %23 = OpConstant %5 1
+         %26 = OpTypePointer Function %16
+         %33 = OpConstant %12 0
+         %34 = OpConstant %5 2
+         %37 = OpTypePointer Input %12
+         %41 = OpTypeVector %5 4
+          %3 = OpFunction %1 None %2
+          %4 = OpLabel
+         %19 = OpVariable %18 Function
+         %22 = OpAccessChain %20 %11 %21
+         %24 = OpAtomicIIncrement %5 %22 %23 %21
+         %25 = OpBitcast %16 %24
+         %27 = OpInBoundsAccessChain %26 %19 %21
+               OpStore %27 %25
+         %28 = OpLoad %6 %8
+         %29 = OpInBoundsAccessChain %26 %19 %21
+         %30 = OpLoad %16 %29
+         %31 = OpBitcast %12 %30
+         %32 = OpIMul %5 %31 %23
+         %35 = OpShiftRightLogical %5 %33 %34
+         %36 = OpIAdd %5 %32 %35
+         %38 = OpInBoundsAccessChain %37 %15 %21
+         %39 = OpLoad %12 %38
+         %40 = OpBitcast %5 %39
+         %42 = OpCompositeConstruct %41 %40 %40 %40 %40
+               OpImageWrite %28 %36 %42
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl/asm/comp/bitcast_iadd.asm.comp b/shaders-opencl/asm/comp/bitcast_iadd.asm.comp
new file mode 100644
index 000000000..3b31ab285
--- /dev/null
+++ b/shaders-opencl/asm/comp/bitcast_iadd.asm.comp
@@ -0,0 +1,79 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 1
+; Bound: 30
+; Schema: 0
+               OpCapability Shader
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %func "main"
+               OpExecutionMode %func LocalSize 1 1 1
+               OpSource ESSL 310
+               OpSourceExtension "GL_GOOGLE_cpp_style_line_directive"
+               OpSourceExtension "GL_GOOGLE_include_directive"
+               OpMemberDecorate %input_struct 0 Offset 0
+               OpMemberDecorate %input_struct 1 Offset 16
+               OpMemberDecorate %output_struct 0 Offset 0
+               OpMemberDecorate %output_struct 1 Offset 16
+               OpDecorate %input_struct BufferBlock
+               OpDecorate %inputs DescriptorSet 0
+               OpDecorate %inputs Binding 0
+			   OpDecorate %inputs Restrict
+               OpDecorate %output_struct BufferBlock
+               OpDecorate %outputs DescriptorSet 0
+               OpDecorate %outputs Binding 1
+			   OpDecorate %outputs Restrict
+
+          %void = OpTypeVoid
+          %main_func = OpTypeFunction %void
+
+          %uint = OpTypeInt 32 0
+          %uvec4 = OpTypeVector %uint 4
+
+         %int = OpTypeInt 32 1
+         %ivec4 = OpTypeVector %int 4
+
+         %ivec4_ptr = OpTypePointer Uniform %ivec4
+         %uvec4_ptr = OpTypePointer Uniform %uvec4
+
+		 %zero = OpConstant %int 0
+		 %one = OpConstant %int 1
+
+         %input_struct = OpTypeStruct %ivec4 %uvec4
+         %input_struct_ptr = OpTypePointer Uniform %input_struct
+         %inputs = OpVariable %input_struct_ptr Uniform
+         %output_struct = OpTypeStruct %uvec4 %ivec4
+         %output_struct_ptr = OpTypePointer Uniform %output_struct
+         %outputs = OpVariable %output_struct_ptr Uniform
+
+          %func = OpFunction %void None %main_func
+          %block = OpLabel
+
+         %input1_ptr = OpAccessChain %ivec4_ptr %inputs %zero
+         %input0_ptr = OpAccessChain %uvec4_ptr %inputs %one
+         %input1 = OpLoad %ivec4 %input1_ptr
+         %input0 = OpLoad %uvec4 %input0_ptr
+
+         %output_ptr_uvec4 = OpAccessChain %uvec4_ptr %outputs %zero
+         %output_ptr_ivec4 = OpAccessChain %ivec4_ptr %outputs %one
+
+; Test all variants of IAdd
+         %result_iadd_0 = OpIAdd %uvec4 %input0 %input1
+         %result_iadd_1 = OpIAdd %uvec4 %input1 %input0
+         %result_iadd_2 = OpIAdd %uvec4 %input0 %input0
+         %result_iadd_3 = OpIAdd %uvec4 %input1 %input1
+         %result_iadd_4 = OpIAdd %ivec4 %input0 %input0
+         %result_iadd_5 = OpIAdd %ivec4 %input1 %input1
+         %result_iadd_6 = OpIAdd %ivec4 %input0 %input1
+         %result_iadd_7 = OpIAdd %ivec4 %input1 %input0
+			   OpStore %output_ptr_uvec4 %result_iadd_0
+			   OpStore %output_ptr_uvec4 %result_iadd_1
+			   OpStore %output_ptr_uvec4 %result_iadd_2
+			   OpStore %output_ptr_uvec4 %result_iadd_3
+			   OpStore %output_ptr_ivec4 %result_iadd_4
+			   OpStore %output_ptr_ivec4 %result_iadd_5
+			   OpStore %output_ptr_ivec4 %result_iadd_6
+			   OpStore %output_ptr_ivec4 %result_iadd_7
+
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl/asm/comp/bitcast_icmp.asm.comp b/shaders-opencl/asm/comp/bitcast_icmp.asm.comp
new file mode 100644
index 000000000..b7b4e0b2e
--- /dev/null
+++ b/shaders-opencl/asm/comp/bitcast_icmp.asm.comp
@@ -0,0 +1,101 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 1
+; Bound: 30
+; Schema: 0
+               OpCapability Shader
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %func "main"
+               OpExecutionMode %func LocalSize 1 1 1
+               OpSource ESSL 310
+               OpSourceExtension "GL_GOOGLE_cpp_style_line_directive"
+               OpSourceExtension "GL_GOOGLE_include_directive"
+               OpMemberDecorate %input_struct 0 Offset 0
+               OpMemberDecorate %input_struct 1 Offset 16
+               OpMemberDecorate %output_struct 0 Offset 0
+               OpMemberDecorate %output_struct 1 Offset 16
+               OpDecorate %input_struct BufferBlock
+               OpDecorate %inputs DescriptorSet 0
+               OpDecorate %inputs Binding 0
+			   OpDecorate %inputs Restrict
+               OpDecorate %output_struct BufferBlock
+               OpDecorate %outputs DescriptorSet 0
+               OpDecorate %outputs Binding 1
+			   OpDecorate %outputs Restrict
+
+          %void = OpTypeVoid
+          %main_func = OpTypeFunction %void
+
+		  %bool = OpTypeBool
+		  %bvec4 = OpTypeVector %bool 4
+
+          %uint = OpTypeInt 32 0
+          %uvec4 = OpTypeVector %uint 4
+
+         %int = OpTypeInt 32 1
+         %ivec4 = OpTypeVector %int 4
+
+         %ivec4_ptr = OpTypePointer Uniform %ivec4
+         %uvec4_ptr = OpTypePointer Uniform %uvec4
+
+		 %zero = OpConstant %int 0
+		 %one = OpConstant %int 1
+		 %uzero = OpConstant %uint 0
+		 %uone = OpConstant %uint 1
+		 %utrue = OpConstantComposite %uvec4 %uone %uone %uone %uone
+		 %ufalse = OpConstantComposite %uvec4 %uzero %uzero %uzero %uzero
+
+         %input_struct = OpTypeStruct %ivec4 %uvec4
+         %input_struct_ptr = OpTypePointer Uniform %input_struct
+         %inputs = OpVariable %input_struct_ptr Uniform
+         %output_struct = OpTypeStruct %uvec4 %ivec4
+         %output_struct_ptr = OpTypePointer Uniform %output_struct
+         %outputs = OpVariable %output_struct_ptr Uniform
+
+          %func = OpFunction %void None %main_func
+          %block = OpLabel
+
+         %input1_ptr = OpAccessChain %ivec4_ptr %inputs %zero
+         %input0_ptr = OpAccessChain %uvec4_ptr %inputs %one
+         %input1 = OpLoad %ivec4 %input1_ptr
+         %input0 = OpLoad %uvec4 %input0_ptr
+
+         %output_ptr_uvec4 = OpAccessChain %uvec4_ptr %outputs %zero
+
+         %result_slt = OpSLessThan %bvec4 %input0 %input1
+         %result_sle = OpSLessThanEqual %bvec4 %input0 %input1
+         %result_ult = OpULessThan %bvec4 %input0 %input1
+         %result_ule = OpULessThanEqual %bvec4 %input0 %input1
+         %result_sgt = OpSGreaterThan %bvec4 %input0 %input1
+         %result_sge = OpSGreaterThanEqual %bvec4 %input0 %input1
+         %result_ugt = OpUGreaterThan %bvec4 %input0 %input1
+         %result_uge = OpUGreaterThanEqual %bvec4 %input0 %input1
+
+		 %int_slt = OpSelect %uvec4 %result_slt %utrue %ufalse
+		 OpStore %output_ptr_uvec4 %int_slt
+
+		 %int_sle = OpSelect %uvec4 %result_sle %utrue %ufalse
+		 OpStore %output_ptr_uvec4 %int_sle
+
+		 %int_ult = OpSelect %uvec4 %result_ult %utrue %ufalse
+		 OpStore %output_ptr_uvec4 %int_ult
+
+		 %int_ule = OpSelect %uvec4 %result_ule %utrue %ufalse
+		 OpStore %output_ptr_uvec4 %int_ule
+
+		 %int_sgt = OpSelect %uvec4 %result_sgt %utrue %ufalse
+		 OpStore %output_ptr_uvec4 %int_sgt
+
+		 %int_sge = OpSelect %uvec4 %result_sge %utrue %ufalse
+		 OpStore %output_ptr_uvec4 %int_sge
+
+		 %int_ugt = OpSelect %uvec4 %result_ugt %utrue %ufalse
+		 OpStore %output_ptr_uvec4 %int_ugt
+
+		 %int_uge = OpSelect %uvec4 %result_uge %utrue %ufalse
+		 OpStore %output_ptr_uvec4 %int_uge
+
+
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl/asm/comp/bitcast_sar.asm.comp b/shaders-opencl/asm/comp/bitcast_sar.asm.comp
new file mode 100644
index 000000000..64f19fc34
--- /dev/null
+++ b/shaders-opencl/asm/comp/bitcast_sar.asm.comp
@@ -0,0 +1,77 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 1
+; Bound: 30
+; Schema: 0
+               OpCapability Shader
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %func "main"
+               OpExecutionMode %func LocalSize 1 1 1
+               OpSource ESSL 310
+               OpSourceExtension "GL_GOOGLE_cpp_style_line_directive"
+               OpSourceExtension "GL_GOOGLE_include_directive"
+               OpMemberDecorate %input_struct 0 Offset 0
+               OpMemberDecorate %input_struct 1 Offset 16
+               OpMemberDecorate %output_struct 0 Offset 0
+               OpMemberDecorate %output_struct 1 Offset 16
+               OpDecorate %input_struct BufferBlock
+               OpDecorate %inputs DescriptorSet 0
+               OpDecorate %inputs Binding 0
+               OpDecorate %output_struct BufferBlock
+               OpDecorate %outputs DescriptorSet 0
+               OpDecorate %outputs Binding 1
+
+          %void = OpTypeVoid
+          %main_func = OpTypeFunction %void
+
+          %uint = OpTypeInt 32 0
+          %uvec4 = OpTypeVector %uint 4
+
+         %int = OpTypeInt 32 1
+         %ivec4 = OpTypeVector %int 4
+
+         %ivec4_ptr = OpTypePointer Uniform %ivec4
+         %uvec4_ptr = OpTypePointer Uniform %uvec4
+
+		 %zero = OpConstant %int 0
+		 %one = OpConstant %int 1
+
+         %input_struct = OpTypeStruct %ivec4 %uvec4
+         %input_struct_ptr = OpTypePointer Uniform %input_struct
+         %inputs = OpVariable %input_struct_ptr Uniform
+         %output_struct = OpTypeStruct %uvec4 %ivec4
+         %output_struct_ptr = OpTypePointer Uniform %output_struct
+         %outputs = OpVariable %output_struct_ptr Uniform
+
+          %func = OpFunction %void None %main_func
+          %block = OpLabel
+
+         %input1_ptr = OpAccessChain %ivec4_ptr %inputs %zero
+         %input0_ptr = OpAccessChain %uvec4_ptr %inputs %one
+         %input1 = OpLoad %ivec4 %input1_ptr
+         %input0 = OpLoad %uvec4 %input0_ptr
+
+         %output_ptr_uvec4 = OpAccessChain %uvec4_ptr %outputs %zero
+         %output_ptr_ivec4 = OpAccessChain %ivec4_ptr %outputs %one
+
+; Test all variants of ShiftRightArithmetic
+         %result_iadd_0 = OpShiftRightArithmetic %uvec4 %input0 %input1
+         %result_iadd_1 = OpShiftRightArithmetic %uvec4 %input1 %input0
+         %result_iadd_2 = OpShiftRightArithmetic %uvec4 %input0 %input0
+         %result_iadd_3 = OpShiftRightArithmetic %uvec4 %input1 %input1
+         %result_iadd_4 = OpShiftRightArithmetic %ivec4 %input0 %input0
+         %result_iadd_5 = OpShiftRightArithmetic %ivec4 %input1 %input1
+         %result_iadd_6 = OpShiftRightArithmetic %ivec4 %input0 %input1
+         %result_iadd_7 = OpShiftRightArithmetic %ivec4 %input1 %input0
+			   OpStore %output_ptr_uvec4 %result_iadd_0
+			   OpStore %output_ptr_uvec4 %result_iadd_1
+			   OpStore %output_ptr_uvec4 %result_iadd_2
+			   OpStore %output_ptr_uvec4 %result_iadd_3
+			   OpStore %output_ptr_ivec4 %result_iadd_4
+			   OpStore %output_ptr_ivec4 %result_iadd_5
+			   OpStore %output_ptr_ivec4 %result_iadd_6
+			   OpStore %output_ptr_ivec4 %result_iadd_7
+
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl/asm/comp/bitcast_sdiv.asm.comp b/shaders-opencl/asm/comp/bitcast_sdiv.asm.comp
new file mode 100644
index 000000000..ab73ec83d
--- /dev/null
+++ b/shaders-opencl/asm/comp/bitcast_sdiv.asm.comp
@@ -0,0 +1,77 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 1
+; Bound: 30
+; Schema: 0
+               OpCapability Shader
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %func "main"
+               OpExecutionMode %func LocalSize 1 1 1
+               OpSource ESSL 310
+               OpSourceExtension "GL_GOOGLE_cpp_style_line_directive"
+               OpSourceExtension "GL_GOOGLE_include_directive"
+               OpMemberDecorate %input_struct 0 Offset 0
+               OpMemberDecorate %input_struct 1 Offset 16
+               OpMemberDecorate %output_struct 0 Offset 0
+               OpMemberDecorate %output_struct 1 Offset 16
+               OpDecorate %input_struct BufferBlock
+               OpDecorate %inputs DescriptorSet 0
+               OpDecorate %inputs Binding 0
+               OpDecorate %output_struct BufferBlock
+               OpDecorate %outputs DescriptorSet 0
+               OpDecorate %outputs Binding 1
+
+          %void = OpTypeVoid
+          %main_func = OpTypeFunction %void
+
+          %uint = OpTypeInt 32 0
+          %uvec4 = OpTypeVector %uint 4
+
+         %int = OpTypeInt 32 1
+         %ivec4 = OpTypeVector %int 4
+
+         %ivec4_ptr = OpTypePointer Uniform %ivec4
+         %uvec4_ptr = OpTypePointer Uniform %uvec4
+
+		 %zero = OpConstant %int 0
+		 %one = OpConstant %int 1
+
+         %input_struct = OpTypeStruct %ivec4 %uvec4
+         %input_struct_ptr = OpTypePointer Uniform %input_struct
+         %inputs = OpVariable %input_struct_ptr Uniform
+         %output_struct = OpTypeStruct %uvec4 %ivec4
+         %output_struct_ptr = OpTypePointer Uniform %output_struct
+         %outputs = OpVariable %output_struct_ptr Uniform
+
+          %func = OpFunction %void None %main_func
+          %block = OpLabel
+
+         %input1_ptr = OpAccessChain %ivec4_ptr %inputs %zero
+         %input0_ptr = OpAccessChain %uvec4_ptr %inputs %one
+         %input1 = OpLoad %ivec4 %input1_ptr
+         %input0 = OpLoad %uvec4 %input0_ptr
+
+         %output_ptr_uvec4 = OpAccessChain %uvec4_ptr %outputs %zero
+         %output_ptr_ivec4 = OpAccessChain %ivec4_ptr %outputs %one
+
+; Test all variants of SDiv
+         %result_iadd_0 = OpSDiv %uvec4 %input0 %input1
+         %result_iadd_1 = OpSDiv %uvec4 %input1 %input0
+         %result_iadd_2 = OpSDiv %uvec4 %input0 %input0
+         %result_iadd_3 = OpSDiv %uvec4 %input1 %input1
+         %result_iadd_4 = OpSDiv %ivec4 %input0 %input0
+         %result_iadd_5 = OpSDiv %ivec4 %input1 %input1
+         %result_iadd_6 = OpSDiv %ivec4 %input0 %input1
+         %result_iadd_7 = OpSDiv %ivec4 %input1 %input0
+			   OpStore %output_ptr_uvec4 %result_iadd_0
+			   OpStore %output_ptr_uvec4 %result_iadd_1
+			   OpStore %output_ptr_uvec4 %result_iadd_2
+			   OpStore %output_ptr_uvec4 %result_iadd_3
+			   OpStore %output_ptr_ivec4 %result_iadd_4
+			   OpStore %output_ptr_ivec4 %result_iadd_5
+			   OpStore %output_ptr_ivec4 %result_iadd_6
+			   OpStore %output_ptr_ivec4 %result_iadd_7
+
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl/asm/comp/bitcast_slr.asm.comp b/shaders-opencl/asm/comp/bitcast_slr.asm.comp
new file mode 100644
index 000000000..6741f5cb5
--- /dev/null
+++ b/shaders-opencl/asm/comp/bitcast_slr.asm.comp
@@ -0,0 +1,77 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 1
+; Bound: 30
+; Schema: 0
+               OpCapability Shader
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %func "main"
+               OpExecutionMode %func LocalSize 1 1 1
+               OpSource ESSL 310
+               OpSourceExtension "GL_GOOGLE_cpp_style_line_directive"
+               OpSourceExtension "GL_GOOGLE_include_directive"
+               OpMemberDecorate %input_struct 0 Offset 0
+               OpMemberDecorate %input_struct 1 Offset 16
+               OpMemberDecorate %output_struct 0 Offset 0
+               OpMemberDecorate %output_struct 1 Offset 16
+               OpDecorate %input_struct BufferBlock
+               OpDecorate %inputs DescriptorSet 0
+               OpDecorate %inputs Binding 0
+               OpDecorate %output_struct BufferBlock
+               OpDecorate %outputs DescriptorSet 0
+               OpDecorate %outputs Binding 1
+
+          %void = OpTypeVoid
+          %main_func = OpTypeFunction %void
+
+          %uint = OpTypeInt 32 0
+          %uvec4 = OpTypeVector %uint 4
+
+         %int = OpTypeInt 32 1
+         %ivec4 = OpTypeVector %int 4
+
+         %ivec4_ptr = OpTypePointer Uniform %ivec4
+         %uvec4_ptr = OpTypePointer Uniform %uvec4
+
+		 %zero = OpConstant %int 0
+		 %one = OpConstant %int 1
+
+         %input_struct = OpTypeStruct %ivec4 %uvec4
+         %input_struct_ptr = OpTypePointer Uniform %input_struct
+         %inputs = OpVariable %input_struct_ptr Uniform
+         %output_struct = OpTypeStruct %uvec4 %ivec4
+         %output_struct_ptr = OpTypePointer Uniform %output_struct
+         %outputs = OpVariable %output_struct_ptr Uniform
+
+          %func = OpFunction %void None %main_func
+          %block = OpLabel
+
+         %input1_ptr = OpAccessChain %ivec4_ptr %inputs %zero
+         %input0_ptr = OpAccessChain %uvec4_ptr %inputs %one
+         %input1 = OpLoad %ivec4 %input1_ptr
+         %input0 = OpLoad %uvec4 %input0_ptr
+
+         %output_ptr_uvec4 = OpAccessChain %uvec4_ptr %outputs %zero
+         %output_ptr_ivec4 = OpAccessChain %ivec4_ptr %outputs %one
+
+; Test all variants of ShiftRightLogical
+         %result_iadd_0 = OpShiftRightLogical %uvec4 %input0 %input1
+         %result_iadd_1 = OpShiftRightLogical %uvec4 %input1 %input0
+         %result_iadd_2 = OpShiftRightLogical %uvec4 %input0 %input0
+         %result_iadd_3 = OpShiftRightLogical %uvec4 %input1 %input1
+         %result_iadd_4 = OpShiftRightLogical %ivec4 %input0 %input0
+         %result_iadd_5 = OpShiftRightLogical %ivec4 %input1 %input1
+         %result_iadd_6 = OpShiftRightLogical %ivec4 %input0 %input1
+         %result_iadd_7 = OpShiftRightLogical %ivec4 %input1 %input0
+			   OpStore %output_ptr_uvec4 %result_iadd_0
+			   OpStore %output_ptr_uvec4 %result_iadd_1
+			   OpStore %output_ptr_uvec4 %result_iadd_2
+			   OpStore %output_ptr_uvec4 %result_iadd_3
+			   OpStore %output_ptr_ivec4 %result_iadd_4
+			   OpStore %output_ptr_ivec4 %result_iadd_5
+			   OpStore %output_ptr_ivec4 %result_iadd_6
+			   OpStore %output_ptr_ivec4 %result_iadd_7
+
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl/asm/comp/block-name-alias-global.asm.comp b/shaders-opencl/asm/comp/block-name-alias-global.asm.comp
new file mode 100644
index 000000000..85f6cc041
--- /dev/null
+++ b/shaders-opencl/asm/comp/block-name-alias-global.asm.comp
@@ -0,0 +1,119 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 7
+; Bound: 59
+; Schema: 0
+               OpCapability Shader
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main" %gl_GlobalInvocationID
+               OpExecutionMode %main LocalSize 1 1 1
+               OpSource GLSL 450
+               OpName %main "main"
+               OpName %Foo "A" ; element struct %Foo carries the same debug name "A" as block %A
+               OpMemberName %Foo 0 "a"
+               OpMemberName %Foo 1 "b"
+               OpName %A "A"
+               OpMemberName %A 0 "Data"
+               OpName %C1 "C1"
+               OpName %gl_GlobalInvocationID "gl_GlobalInvocationID"
+               OpName %Foo_0 "A" ; second element struct, again named "A"
+               OpMemberName %Foo_0 0 "a"
+               OpMemberName %Foo_0 1 "b"
+               OpName %A_0 "A" ; second block also named "A"
+               OpMemberName %A_0 0 "Data"
+               OpName %C2 "C2"
+               OpName %B "B"
+               OpMemberName %B 0 "Data"
+               OpName %C3 "C3"
+               OpName %B_0 "B" ; %B and %B_0 share the debug name "B"
+               OpMemberName %B_0 0 "Data"
+               OpName %C4 "C4"
+               OpMemberDecorate %Foo 0 Offset 0
+               OpMemberDecorate %Foo 1 Offset 4
+               OpDecorate %_runtimearr_Foo ArrayStride 8 ; tightly packed 8-byte elements
+               OpMemberDecorate %A 0 Offset 0
+               OpDecorate %A BufferBlock ; BufferBlock + Uniform storage class = storage buffer (SPIR-V 1.0 style)
+               OpDecorate %C1 DescriptorSet 0
+               OpDecorate %C1 Binding 1
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpMemberDecorate %Foo_0 0 Offset 0
+               OpMemberDecorate %Foo_0 1 Offset 4
+               OpDecorate %_arr_Foo_0_uint_1024 ArrayStride 16 ; same member offsets as %Foo but a 16-byte array stride
+               OpMemberDecorate %A_0 0 Offset 0
+               OpDecorate %A_0 Block ; Block + Uniform storage class = uniform buffer
+               OpDecorate %C2 DescriptorSet 0
+               OpDecorate %C2 Binding 2
+               OpDecorate %_runtimearr_Foo_0 ArrayStride 8
+               OpMemberDecorate %B 0 Offset 0
+               OpDecorate %B BufferBlock
+               OpDecorate %C3 DescriptorSet 0
+               OpDecorate %C3 Binding 0
+               OpDecorate %_arr_Foo_0_uint_1024_0 ArrayStride 16
+               OpMemberDecorate %B_0 0 Offset 0
+               OpDecorate %B_0 Block
+               OpDecorate %C4 DescriptorSet 0
+               OpDecorate %C4 Binding 3
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+        %int = OpTypeInt 32 1
+        %Foo = OpTypeStruct %int %int
+%_runtimearr_Foo = OpTypeRuntimeArray %Foo
+          %A = OpTypeStruct %_runtimearr_Foo
+%_ptr_Uniform_A = OpTypePointer Uniform %A
+         %C1 = OpVariable %_ptr_Uniform_A Uniform
+      %int_0 = OpConstant %int 0
+       %uint = OpTypeInt 32 0
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3uint Input
+     %uint_0 = OpConstant %uint 0
+%_ptr_Input_uint = OpTypePointer Input %uint
+      %Foo_0 = OpTypeStruct %int %int
+  %uint_1024 = OpConstant %uint 1024
+%_arr_Foo_0_uint_1024 = OpTypeArray %Foo_0 %uint_1024
+        %A_0 = OpTypeStruct %_arr_Foo_0_uint_1024
+%_ptr_Uniform_A_0 = OpTypePointer Uniform %A_0
+         %C2 = OpVariable %_ptr_Uniform_A_0 Uniform
+%_ptr_Uniform_Foo_0 = OpTypePointer Uniform %Foo_0
+%_ptr_Uniform_Foo = OpTypePointer Uniform %Foo
+%_ptr_Uniform_int = OpTypePointer Uniform %int
+      %int_1 = OpConstant %int 1
+%_runtimearr_Foo_0 = OpTypeRuntimeArray %Foo ; note: element type is %Foo, not %Foo_0, despite the id name
+          %B = OpTypeStruct %_runtimearr_Foo_0
+%_ptr_Uniform_B = OpTypePointer Uniform %B
+         %C3 = OpVariable %_ptr_Uniform_B Uniform
+%_arr_Foo_0_uint_1024_0 = OpTypeArray %Foo_0 %uint_1024
+        %B_0 = OpTypeStruct %_arr_Foo_0_uint_1024_0
+%_ptr_Uniform_B_0 = OpTypePointer Uniform %B_0
+         %C4 = OpVariable %_ptr_Uniform_B_0 Uniform
+       %main = OpFunction %void None %3 ; copies C2.Data[gid.x] -> C1.Data[gid.x], then C4.Data[gid.x] -> C3.Data[gid.x]
+          %5 = OpLabel
+         %19 = OpAccessChain %_ptr_Input_uint %gl_GlobalInvocationID %uint_0
+         %20 = OpLoad %uint %19 ; gid.x, used as the destination index into C1
+         %27 = OpAccessChain %_ptr_Input_uint %gl_GlobalInvocationID %uint_0
+         %28 = OpLoad %uint %27 ; gid.x loaded again, used as the source index into C2
+         %30 = OpAccessChain %_ptr_Uniform_Foo_0 %C2 %int_0 %28
+         %31 = OpLoad %Foo_0 %30 ; whole source struct is loaded by value
+         %33 = OpAccessChain %_ptr_Uniform_Foo %C1 %int_0 %20
+         %34 = OpCompositeExtract %int %31 0
+         %36 = OpAccessChain %_ptr_Uniform_int %33 %int_0
+               OpStore %36 %34 ; destination member 0 ("a") written individually
+         %37 = OpCompositeExtract %int %31 1
+         %39 = OpAccessChain %_ptr_Uniform_int %33 %int_1
+               OpStore %39 %37 ; destination member 1 ("b") written individually
+         %44 = OpAccessChain %_ptr_Input_uint %gl_GlobalInvocationID %uint_0
+         %45 = OpLoad %uint %44 ; gid.x, destination index into C3
+         %50 = OpAccessChain %_ptr_Input_uint %gl_GlobalInvocationID %uint_0
+         %51 = OpLoad %uint %50 ; gid.x, source index into C4
+         %52 = OpAccessChain %_ptr_Uniform_Foo_0 %C4 %int_0 %51
+         %53 = OpLoad %Foo_0 %52
+         %54 = OpAccessChain %_ptr_Uniform_Foo %C3 %int_0 %45
+         %55 = OpCompositeExtract %int %53 0
+         %56 = OpAccessChain %_ptr_Uniform_int %54 %int_0
+               OpStore %56 %55
+         %57 = OpCompositeExtract %int %53 1
+         %58 = OpAccessChain %_ptr_Uniform_int %54 %int_1
+               OpStore %58 %57
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl/asm/comp/buffer-write-relative-addr.asm.comp b/shaders-opencl/asm/comp/buffer-write-relative-addr.asm.comp
new file mode 100644
index 000000000..400690b04
--- /dev/null
+++ b/shaders-opencl/asm/comp/buffer-write-relative-addr.asm.comp
@@ -0,0 +1,93 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Wine VKD3D Shader Compiler; 0
+; Bound: 59
+; Schema: 0
+               OpCapability Shader
+               OpCapability UniformBufferArrayDynamicIndexing
+               OpCapability SampledBuffer
+               OpCapability ImageBuffer
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main" %vThreadIDInGroup
+               OpExecutionMode %main LocalSize 4 1 1
+               OpName %main "main"
+               OpName %cb5_struct "cb5_struct"
+               OpName %cb0_5 "cb0_5"
+               OpName %u0 "u0"
+               OpName %vThreadIDInGroup "vThreadIDInGroup"
+               OpName %r0 "r0"
+               OpDecorate %_arr_v4float_uint_5 ArrayStride 16
+               OpDecorate %cb5_struct Block
+               OpMemberDecorate %cb5_struct 0 Offset 0
+               OpDecorate %cb0_5 DescriptorSet 0
+               OpDecorate %cb0_5 Binding 1
+               OpDecorate %u0 DescriptorSet 0
+               OpDecorate %u0 Binding 0
+               OpDecorate %u0 NonReadable ; %u0 is only ever written (OpImageWrite) in this module
+               OpDecorate %vThreadIDInGroup BuiltIn LocalInvocationId
+       %void = OpTypeVoid
+          %2 = OpTypeFunction %void
+      %float = OpTypeFloat 32
+    %v4float = OpTypeVector %float 4
+       %uint = OpTypeInt 32 0
+     %uint_5 = OpConstant %uint 5
+%_arr_v4float_uint_5 = OpTypeArray %v4float %uint_5
+ %cb5_struct = OpTypeStruct %_arr_v4float_uint_5
+%_ptr_Uniform_cb5_struct = OpTypePointer Uniform %cb5_struct
+      %cb0_5 = OpVariable %_ptr_Uniform_cb5_struct Uniform
+         %13 = OpTypeImage %uint Buffer 0 0 0 2 R32ui ; Dim=Buffer, Sampled=2 (storage), format R32ui
+%_ptr_UniformConstant_13 = OpTypePointer UniformConstant %13
+         %u0 = OpVariable %_ptr_UniformConstant_13 UniformConstant
+        %int = OpTypeInt 32 1
+      %v3int = OpTypeVector %int 3
+%_ptr_Input_v3int = OpTypePointer Input %v3int
+%vThreadIDInGroup = OpVariable %_ptr_Input_v3int Input ; LocalInvocationId builtin declared with a signed v3int type
+%_ptr_Function_v4float = OpTypePointer Function %v4float
+%_ptr_Input_int = OpTypePointer Input %int
+     %uint_0 = OpConstant %uint 0
+      %int_4 = OpConstant %int 4
+%_ptr_Function_float = OpTypePointer Function %float
+     %uint_1 = OpConstant %uint 1
+     %uint_2 = OpConstant %uint 2
+%_ptr_Uniform_v4float = OpTypePointer Uniform %v4float
+     %v4uint = OpTypeVector %uint 4
+     %uint_3 = OpConstant %uint 3
+       %main = OpFunction %void None %2
+          %4 = OpLabel
+         %r0 = OpVariable %_ptr_Function_v4float Function ; float temp used to carry raw integer bits; only .x and .y are written
+         %24 = OpInBoundsAccessChain %_ptr_Input_int %vThreadIDInGroup %uint_0
+         %25 = OpLoad %int %24
+         %27 = OpShiftLeftLogical %int %25 %int_4 ; tid.x << 4
+         %28 = OpBitcast %float %27
+         %30 = OpInBoundsAccessChain %_ptr_Function_float %r0 %uint_0
+               OpStore %30 %28 ; r0.x = bits(tid.x << 4)
+         %31 = OpInBoundsAccessChain %_ptr_Input_int %vThreadIDInGroup %uint_0
+         %32 = OpLoad %int %31
+         %33 = OpBitcast %float %32
+         %35 = OpInBoundsAccessChain %_ptr_Function_float %r0 %uint_1
+               OpStore %35 %33 ; r0.y = bits(tid.x)
+         %36 = OpLoad %13 %u0
+         %37 = OpInBoundsAccessChain %_ptr_Function_float %r0 %uint_0
+         %38 = OpLoad %float %37
+         %39 = OpBitcast %uint %38
+         %41 = OpShiftRightLogical %uint %39 %uint_2 ; base texel coordinate = bits(r0.x) >> 2
+         %42 = OpInBoundsAccessChain %_ptr_Function_float %r0 %uint_1
+         %43 = OpLoad %float %42
+         %44 = OpBitcast %int %43
+         %45 = OpIAdd %uint %44 %uint_1 ; mixed signedness: int operand + uint operand -> uint result
+         %47 = OpAccessChain %_ptr_Uniform_v4float %cb0_5 %uint_0 %45 ; dynamically indexed element of the 5-entry array
+         %48 = OpLoad %v4float %47
+         %50 = OpBitcast %v4uint %48
+         %51 = OpVectorShuffle %v4uint %50 %50 0 0 0 0 ; splat component 0
+               OpImageWrite %36 %41 %51
+         %52 = OpVectorShuffle %v4uint %50 %50 1 1 1 1 ; splat component 1
+         %53 = OpIAdd %uint %41 %uint_1
+               OpImageWrite %36 %53 %52 ; written at base + 1
+         %54 = OpVectorShuffle %v4uint %50 %50 2 2 2 2 ; splat component 2
+         %55 = OpIAdd %uint %41 %uint_2
+               OpImageWrite %36 %55 %54 ; written at base + 2
+         %56 = OpVectorShuffle %v4uint %50 %50 3 3 3 3 ; splat component 3
+         %58 = OpIAdd %uint %41 %uint_3
+               OpImageWrite %36 %58 %56 ; written at base + 3
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl/asm/comp/buffer-write.asm.comp b/shaders-opencl/asm/comp/buffer-write.asm.comp
new file mode 100644
index 000000000..697324ba6
--- /dev/null
+++ b/shaders-opencl/asm/comp/buffer-write.asm.comp
@@ -0,0 +1,59 @@
+; SPIR-V
+; Version: 1.3
+; Generator: Khronos Glslang Reference Front End; 7
+; Bound: 63
+; Schema: 0
+               OpCapability Shader
+               OpCapability ImageBuffer
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main" %group_id %group_index
+               OpExecutionMode %main LocalSize 32 1 1
+               OpSource HLSL 500
+               OpName %main "main"
+               OpName %cb "cb"
+               OpMemberName %cb 0 "value"
+               OpName %_ "" ; the uniform block instance has an empty debug name
+               OpName %buffer "buffer"
+               OpName %group_id "group_id"
+               OpName %group_index "group_index"
+               OpMemberDecorate %cb 0 Offset 0
+               OpDecorate %cb Block
+               OpDecorate %_ DescriptorSet 0
+               OpDecorate %_ Binding 7
+               OpDecorate %buffer DescriptorSet 0
+               OpDecorate %buffer Binding 0
+               OpDecorate %group_id BuiltIn WorkgroupId
+               OpDecorate %group_index BuiltIn LocalInvocationIndex
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+       %uint = OpTypeInt 32 0
+     %v3uint = OpTypeVector %uint 3
+    %uint_32 = OpConstant %uint 32
+      %float = OpTypeFloat 32
+         %cb = OpTypeStruct %float
+%_ptr_Uniform_cb = OpTypePointer Uniform %cb
+          %_ = OpVariable %_ptr_Uniform_cb Uniform
+        %int = OpTypeInt 32 1
+      %int_0 = OpConstant %int 0
+%_ptr_Uniform_float = OpTypePointer Uniform %float
+         %34 = OpTypeImage %float Buffer 0 0 0 2 R32f ; Dim=Buffer, Sampled=2 (storage), format R32f
+%_ptr_UniformConstant_34 = OpTypePointer UniformConstant %34
+     %buffer = OpVariable %_ptr_UniformConstant_34 UniformConstant
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+   %group_id = OpVariable %_ptr_Input_v3uint Input
+%_ptr_Input_uint = OpTypePointer Input %uint
+%group_index = OpVariable %_ptr_Input_uint Input
+       %main = OpFunction %void None %3 ; writes cb.value to texel (32 * group_id.x + group_index)
+          %5 = OpLabel
+         %43 = OpLoad %v3uint %group_id
+         %47 = OpLoad %uint %group_index
+         %56 = OpCompositeExtract %uint %43 0 ; group_id.x
+         %57 = OpIMul %uint %uint_32 %56 ; 32 * group_id.x
+         %59 = OpIAdd %uint %57 %47 ; + group_index
+         %60 = OpAccessChain %_ptr_Uniform_float %_ %int_0
+         %61 = OpLoad %float %60 ; cb.value
+         %62 = OpLoad %34 %buffer
+               OpImageWrite %62 %59 %61 ; scalar texel value and scalar coordinate
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl/asm/comp/copy-object-ssbo-to-ssbo.asm.comp b/shaders-opencl/asm/comp/copy-object-ssbo-to-ssbo.asm.comp
new file mode 100644
index 000000000..b01262f5b
--- /dev/null
+++ b/shaders-opencl/asm/comp/copy-object-ssbo-to-ssbo.asm.comp
@@ -0,0 +1,43 @@
+OpCapability Shader
+OpExtension "SPV_KHR_storage_buffer_storage_class"
+OpMemoryModel Logical GLSL450
+OpEntryPoint GLCompute %main "main" %var_id
+OpExecutionMode %main LocalSize 1 1 1
+OpDecorate %var_id BuiltIn GlobalInvocationId
+OpDecorate %var_input Binding 0
+OpDecorate %var_input DescriptorSet 0
+OpDecorate %var_outdata Binding 1
+OpDecorate %var_outdata DescriptorSet 0
+OpMemberDecorate %type_container_struct 0 Offset 0
+OpMemberDecorate %type_container_struct 1 Offset 4 ; members 1 and 2 are empty structs
+OpMemberDecorate %type_container_struct 2 Offset 8
+OpMemberDecorate %type_container_struct 3 Offset 12
+OpDecorate %type_container_struct Block
+%bool      = OpTypeBool
+%void      = OpTypeVoid
+%voidf     = OpTypeFunction %void
+%u32       = OpTypeInt 32 0
+%i32       = OpTypeInt 32 1
+%f32       = OpTypeFloat 32
+%uvec3     = OpTypeVector %u32 3
+%fvec3     = OpTypeVector %f32 3
+%uvec3ptr  = OpTypePointer Input %uvec3
+%i32ptr    = OpTypePointer Uniform %i32
+%f32ptr    = OpTypePointer Uniform %f32
+%i32arr    = OpTypeRuntimeArray %i32
+%f32arr    = OpTypeRuntimeArray %f32
+%type_empty_struct					= OpTypeStruct ; struct with zero members
+%type_container_struct				= OpTypeStruct %i32 %type_empty_struct %type_empty_struct %i32
+%type_container_struct_ubo_ptr		= OpTypePointer Uniform %type_container_struct ; not referenced elsewhere in this module
+%type_container_struct_ssbo_ptr	= OpTypePointer StorageBuffer %type_container_struct
+%var_id							= OpVariable %uvec3ptr Input
+%var_input						= OpVariable %type_container_struct_ssbo_ptr StorageBuffer
+%var_outdata					= OpVariable %type_container_struct_ssbo_ptr StorageBuffer
+
+%main								= OpFunction %void None %voidf ; copies the whole struct from %var_input to %var_outdata
+%label								= OpLabel
+%input_copy					= OpCopyObject %type_container_struct_ssbo_ptr %var_input ; copies the pointer, not the data
+%result						= OpLoad %type_container_struct %input_copy ; load goes through the copied pointer
+OpStore %var_outdata %result
+OpReturn
+OpFunctionEnd
diff --git a/shaders-opencl/asm/comp/copy-object-ubo-to-ssbo.asm.comp b/shaders-opencl/asm/comp/copy-object-ubo-to-ssbo.asm.comp
new file mode 100644
index 000000000..63df59ac3
--- /dev/null
+++ b/shaders-opencl/asm/comp/copy-object-ubo-to-ssbo.asm.comp
@@ -0,0 +1,43 @@
+OpCapability Shader
+OpExtension "SPV_KHR_storage_buffer_storage_class"
+OpMemoryModel Logical GLSL450
+OpEntryPoint GLCompute %main "main" %var_id
+OpExecutionMode %main LocalSize 1 1 1
+OpDecorate %var_id BuiltIn GlobalInvocationId
+OpDecorate %var_input Binding 0
+OpDecorate %var_input DescriptorSet 0
+OpDecorate %var_outdata Binding 1
+OpDecorate %var_outdata DescriptorSet 0
+OpMemberDecorate %type_container_struct 0 Offset 0
+OpMemberDecorate %type_container_struct 1 Offset 16 ; members 1 and 2 are empty structs; every member sits on a 16-byte step
+OpMemberDecorate %type_container_struct 2 Offset 32
+OpMemberDecorate %type_container_struct 3 Offset 48
+OpDecorate %type_container_struct Block
+%bool      = OpTypeBool
+%void      = OpTypeVoid
+%voidf     = OpTypeFunction %void
+%u32       = OpTypeInt 32 0
+%i32       = OpTypeInt 32 1
+%f32       = OpTypeFloat 32
+%uvec3     = OpTypeVector %u32 3
+%fvec3     = OpTypeVector %f32 3
+%uvec3ptr  = OpTypePointer Input %uvec3
+%i32ptr    = OpTypePointer Uniform %i32
+%f32ptr    = OpTypePointer Uniform %f32
+%i32arr    = OpTypeRuntimeArray %i32
+%f32arr    = OpTypeRuntimeArray %f32
+%type_empty_struct					= OpTypeStruct ; struct with zero members
+%type_container_struct				= OpTypeStruct %i32 %type_empty_struct %type_empty_struct %i32
+%type_container_struct_ubo_ptr		= OpTypePointer Uniform %type_container_struct
+%type_container_struct_ssbo_ptr	= OpTypePointer StorageBuffer %type_container_struct
+%var_id							= OpVariable %uvec3ptr Input
+%var_input						= OpVariable %type_container_struct_ubo_ptr Uniform ; source lives in the Uniform storage class
+%var_outdata					= OpVariable %type_container_struct_ssbo_ptr StorageBuffer ; destination uses the same struct type in StorageBuffer
+
+%main								= OpFunction %void None %voidf ; copies the whole struct from %var_input to %var_outdata
+%label								= OpLabel
+%input_copy					= OpCopyObject %type_container_struct_ubo_ptr %var_input ; copies the pointer, not the data
+%result						= OpLoad %type_container_struct %input_copy ; load goes through the copied pointer
+OpStore %var_outdata %result
+OpReturn
+OpFunctionEnd
diff --git a/shaders-opencl/asm/comp/duplicate-spec-id.asm.comp b/shaders-opencl/asm/comp/duplicate-spec-id.asm.comp
new file mode 100644
index 000000000..4a5aa3d8b
--- /dev/null
+++ b/shaders-opencl/asm/comp/duplicate-spec-id.asm.comp
@@ -0,0 +1,54 @@
+; SPIR-V
+; Version: 1.3
+; Generator: Khronos Glslang Reference Front End; 11
+; Bound: 26
+; Schema: 0
+               OpCapability Shader
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main" %gl_LocalInvocationIndex
+               OpExecutionMode %main LocalSize 1 1 1
+               OpSource GLSL 450
+               OpName %main "main"
+               OpName %StorageBuffer "StorageBuffer"
+               OpMemberName %StorageBuffer 0 "values"
+               OpName %ssbo "ssbo"
+               OpName %gl_LocalInvocationIndex "gl_LocalInvocationIndex"
+               OpName %foo "foo"
+               OpName %bar "bar"
+               OpDecorate %_runtimearr_float ArrayStride 4
+               OpMemberDecorate %StorageBuffer 0 Offset 0
+               OpDecorate %StorageBuffer Block
+               OpDecorate %ssbo DescriptorSet 0
+               OpDecorate %ssbo Binding 0
+               OpDecorate %gl_LocalInvocationIndex BuiltIn LocalInvocationIndex
+               OpDecorate %foo SpecId 0
+               OpDecorate %bar SpecId 0 ; same SpecId as %foo although %bar is a float and %foo an int
+               OpDecorate %gl_WorkGroupSize BuiltIn WorkgroupSize
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+      %float = OpTypeFloat 32
+%_runtimearr_float = OpTypeRuntimeArray %float
+%StorageBuffer = OpTypeStruct %_runtimearr_float
+%_ptr_StorageBuffer_StorageBuffer = OpTypePointer StorageBuffer %StorageBuffer
+       %ssbo = OpVariable %_ptr_StorageBuffer_StorageBuffer StorageBuffer
+        %int = OpTypeInt 32 1
+      %int_0 = OpConstant %int 0
+       %uint = OpTypeInt 32 0
+%_ptr_Input_uint = OpTypePointer Input %uint
+%gl_LocalInvocationIndex = OpVariable %_ptr_Input_uint Input
+        %foo = OpSpecConstant %int 1 ; default value 1
+        %bar = OpSpecConstant %float 2 ; default value 2
+%_ptr_StorageBuffer_float = OpTypePointer StorageBuffer %float
+     %v3uint = OpTypeVector %uint 3
+     %uint_1 = OpConstant %uint 1
+%gl_WorkGroupSize = OpConstantComposite %v3uint %uint_1 %uint_1 %uint_1
+       %main = OpFunction %void None %3 ; ssbo.values[gl_LocalInvocationIndex] = float(foo) + bar
+          %5 = OpLabel
+         %16 = OpLoad %uint %gl_LocalInvocationIndex
+         %18 = OpConvertSToF %float %foo
+         %20 = OpFAdd %float %18 %bar
+         %22 = OpAccessChain %_ptr_StorageBuffer_float %ssbo %int_0 %16
+               OpStore %22 %20
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl/asm/comp/fma.spv16.asm.comp b/shaders-opencl/asm/comp/fma.spv16.asm.comp
new file mode 100644
index 000000000..ec57f19fe
--- /dev/null
+++ b/shaders-opencl/asm/comp/fma.spv16.asm.comp
@@ -0,0 +1,65 @@
+; SPIR-V
+; Version: 1.6
+; Generator: Khronos Glslang Reference Front End; 11
+; Bound: 30
+; Schema: 0
+               OpCapability Shader
+               OpCapability FMAKHR
+               OpExtension "SPV_KHR_fma"
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main" %_ %__0 ; both buffer variables are listed in the entry-point interface
+               OpExecutionMode %main LocalSize 1 1 1
+               OpSource ESSL 310
+               OpName %main "main"
+               OpName %SSBO2 "SSBO2"
+               OpMemberName %SSBO2 0 "out_data"
+               OpName %_ "" ; output buffer instance has an empty debug name
+               OpName %SSBO "SSBO"
+               OpMemberName %SSBO 0 "in_data"
+               OpName %__0 "" ; input buffer instance has an empty debug name
+               OpDecorate %_runtimearr_v4float ArrayStride 16
+               OpDecorate %SSBO2 Block
+               OpMemberDecorate %SSBO2 0 NonReadable
+               OpMemberDecorate %SSBO2 0 Offset 0
+               OpDecorate %_ NonReadable
+               OpDecorate %_ Binding 1
+               OpDecorate %_ DescriptorSet 0
+               OpDecorate %_runtimearr_v4float_0 ArrayStride 16
+               OpDecorate %SSBO Block
+               OpMemberDecorate %SSBO 0 NonWritable
+               OpMemberDecorate %SSBO 0 Offset 0
+               OpDecorate %__0 NonWritable
+               OpDecorate %__0 Binding 0
+               OpDecorate %__0 DescriptorSet 0
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+      %float = OpTypeFloat 32
+    %v4float = OpTypeVector %float 4
+%_runtimearr_v4float = OpTypeRuntimeArray %v4float
+      %SSBO2 = OpTypeStruct %_runtimearr_v4float
+%_ptr_StorageBuffer_SSBO2 = OpTypePointer StorageBuffer %SSBO2
+          %_ = OpVariable %_ptr_StorageBuffer_SSBO2 StorageBuffer
+        %int = OpTypeInt 32 1
+      %int_0 = OpConstant %int 0
+%_runtimearr_v4float_0 = OpTypeRuntimeArray %v4float
+       %SSBO = OpTypeStruct %_runtimearr_v4float_0
+%_ptr_StorageBuffer_SSBO = OpTypePointer StorageBuffer %SSBO
+        %__0 = OpVariable %_ptr_StorageBuffer_SSBO StorageBuffer
+%_ptr_StorageBuffer_v4float = OpTypePointer StorageBuffer %v4float
+      %int_1 = OpConstant %int 1
+       %uint = OpTypeInt 32 0
+     %v3uint = OpTypeVector %uint 3
+     %uint_1 = OpConstant %uint 1
+         %29 = OpConstantComposite %v3uint %uint_1 %uint_1 %uint_1 ; not referenced by any other instruction in this module
+       %main = OpFunction %void None %3 ; out_data[0] = fma(in_data[0], in_data[1], in_data[1])
+          %5 = OpLabel
+         %19 = OpAccessChain %_ptr_StorageBuffer_v4float %__0 %int_0 %int_0
+         %20 = OpLoad %v4float %19 ; in_data[0]
+         %22 = OpAccessChain %_ptr_StorageBuffer_v4float %__0 %int_0 %int_1
+         %23 = OpLoad %v4float %22 ; in_data[1]
+         %24 = OpFmaKHR %v4float %20 %23 %23 ; second and third operands are the same value
+         %25 = OpAccessChain %_ptr_StorageBuffer_v4float %_ %int_0 %int_0
+               OpStore %25 %24
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl/asm/comp/global-parameter-name-alias.asm.comp b/shaders-opencl/asm/comp/global-parameter-name-alias.asm.comp
new file mode 100644
index 000000000..78b1dc74e
--- /dev/null
+++ b/shaders-opencl/asm/comp/global-parameter-name-alias.asm.comp
@@ -0,0 +1,102 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 6
+; Bound: 61
+; Schema: 0
+               OpCapability Shader
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main" %id_1
+               OpExecutionMode %main LocalSize 1 1 1
+               OpSource HLSL 500
+               OpName %main "main"
+               OpName %Load_u1_ "Load(u1;"
+               OpName %size "size"
+               OpName %_main_vu3_ "@main(vu3;"
+               OpName %id "id" ; function parameter of %_main_vu3_
+               OpName %data "data"
+               OpName %byteAddrTemp "byteAddrTemp"
+               OpName %ssbo "ssbo" ; the struct type ...
+               OpMemberName %ssbo 0 "@data"
+               OpName %ssbo_0 "ssbo" ; ... and the global variable share the debug name "ssbo"
+               OpName %param "param"
+               OpName %id_0 "id" ; local in %main, same debug name as the parameter %id
+               OpName %id_1 "id" ; builtin input variable, also named "id"
+               OpName %param_0 "param" ; second id named "param"
+               OpDecorate %_runtimearr_uint ArrayStride 4
+               OpMemberDecorate %ssbo 0 NonWritable
+               OpMemberDecorate %ssbo 0 Offset 0
+               OpDecorate %ssbo BufferBlock
+               OpDecorate %ssbo_0 DescriptorSet 0
+               OpDecorate %ssbo_0 Binding 1
+               OpDecorate %id_1 BuiltIn GlobalInvocationId
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+       %uint = OpTypeInt 32 0
+%_ptr_Function_uint = OpTypePointer Function %uint
+          %8 = OpTypeFunction %void %_ptr_Function_uint
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Function_v3uint = OpTypePointer Function %v3uint
+         %14 = OpTypeFunction %void %_ptr_Function_v3uint
+     %v4uint = OpTypeVector %uint 4
+%_ptr_Function_v4uint = OpTypePointer Function %v4uint
+        %int = OpTypeInt 32 1
+%_ptr_Function_int = OpTypePointer Function %int
+      %int_2 = OpConstant %int 2
+%_runtimearr_uint = OpTypeRuntimeArray %uint
+       %ssbo = OpTypeStruct %_runtimearr_uint
+%_ptr_Uniform_ssbo = OpTypePointer Uniform %ssbo
+     %ssbo_0 = OpVariable %_ptr_Uniform_ssbo Uniform
+      %int_0 = OpConstant %int 0
+%_ptr_Uniform_uint = OpTypePointer Uniform %uint
+      %int_1 = OpConstant %int 1
+      %int_3 = OpConstant %int 3
+     %uint_4 = OpConstant %uint 4
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+       %id_1 = OpVariable %_ptr_Input_v3uint Input
+       %main = OpFunction %void None %3 ; entry point: forwards the builtin id to %_main_vu3_ through two local copies
+          %5 = OpLabel
+       %id_0 = OpVariable %_ptr_Function_v3uint Function
+    %param_0 = OpVariable %_ptr_Function_v3uint Function
+         %57 = OpLoad %v3uint %id_1
+               OpStore %id_0 %57
+         %59 = OpLoad %v3uint %id_0
+               OpStore %param_0 %59
+         %60 = OpFunctionCall %void %_main_vu3_ %param_0
+               OpReturn
+               OpFunctionEnd
+   %Load_u1_ = OpFunction %void None %8 ; reads four consecutive uints from the global %ssbo_0 starting at index (size >> 2)
+       %size = OpFunctionParameter %_ptr_Function_uint
+         %11 = OpLabel
+       %data = OpVariable %_ptr_Function_v4uint Function
+%byteAddrTemp = OpVariable %_ptr_Function_int Function
+         %24 = OpLoad %uint %size
+         %26 = OpShiftRightLogical %int %24 %int_2 ; uint operand shifted into an int result
+               OpStore %byteAddrTemp %26
+         %32 = OpLoad %int %byteAddrTemp
+         %34 = OpAccessChain %_ptr_Uniform_uint %ssbo_0 %int_0 %32
+         %35 = OpLoad %uint %34 ; element [base]
+         %36 = OpLoad %int %byteAddrTemp
+         %38 = OpIAdd %int %36 %int_1
+         %39 = OpAccessChain %_ptr_Uniform_uint %ssbo_0 %int_0 %38
+         %40 = OpLoad %uint %39 ; element [base + 1]
+         %41 = OpLoad %int %byteAddrTemp
+         %42 = OpIAdd %int %41 %int_2
+         %43 = OpAccessChain %_ptr_Uniform_uint %ssbo_0 %int_0 %42
+         %44 = OpLoad %uint %43 ; element [base + 2]
+         %45 = OpLoad %int %byteAddrTemp
+         %47 = OpIAdd %int %45 %int_3
+         %48 = OpAccessChain %_ptr_Uniform_uint %ssbo_0 %int_0 %47
+         %49 = OpLoad %uint %48 ; element [base + 3]
+         %50 = OpCompositeConstruct %v4uint %35 %40 %44 %49
+               OpStore %data %50 ; result only lands in the local %data; nothing is returned or written out
+               OpReturn
+               OpFunctionEnd
+ %_main_vu3_ = OpFunction %void None %14 ; calls %Load_u1_ with the constant 4; its %id parameter is not read
+         %id = OpFunctionParameter %_ptr_Function_v3uint
+         %17 = OpLabel
+      %param = OpVariable %_ptr_Function_uint Function
+               OpStore %param %uint_4
+         %53 = OpFunctionCall %void %Load_u1_ %param
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl/asm/comp/image-load-store-short-vector.invalid.asm.comp b/shaders-opencl/asm/comp/image-load-store-short-vector.invalid.asm.comp
new file mode 100644
index 000000000..8f759293e
--- /dev/null
+++ b/shaders-opencl/asm/comp/image-load-store-short-vector.invalid.asm.comp
@@ -0,0 +1,75 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 7
+; Bound: 44
+; Schema: 0
+               OpCapability Shader
+               OpCapability StorageImageExtendedFormats
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main" %id_1
+               OpExecutionMode %main LocalSize 1 1 1
+               OpSource HLSL 500
+               OpName %main "main"
+               OpName %_main_vu3_ "@main(vu3;"
+               OpName %id "id"
+               OpName %loaded "loaded"
+               OpName %TargetTexture "TargetTexture"
+               OpName %storeTemp "storeTemp"
+               OpName %id_0 "id"
+               OpName %id_1 "id"
+               OpName %param "param"
+               OpDecorate %TargetTexture DescriptorSet 0
+               OpDecorate %TargetTexture Binding 0
+               OpDecorate %id_1 BuiltIn WorkgroupId
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+       %uint = OpTypeInt 32 0
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Function_v3uint = OpTypePointer Function %v3uint
+          %9 = OpTypeFunction %void %_ptr_Function_v3uint
+      %float = OpTypeFloat 32
+    %v2float = OpTypeVector %float 2
+%_ptr_Function_v2float = OpTypePointer Function %v2float
+         %17 = OpTypeImage %float 2D 0 0 0 2 Rg32f ; 2D storage image (Sampled=2) with a two-component format
+%_ptr_UniformConstant_17 = OpTypePointer UniformConstant %17
+%TargetTexture = OpVariable %_ptr_UniformConstant_17 UniformConstant
+     %v2uint = OpTypeVector %uint 2
+    %float_1 = OpConstant %float 1
+     %uint_1 = OpConstant %uint 1
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+       %id_1 = OpVariable %_ptr_Input_v3uint Input
+       %main = OpFunction %void None %3 ; entry point: forwards the WorkgroupId builtin to %_main_vu3_ through two local copies
+          %5 = OpLabel
+       %id_0 = OpVariable %_ptr_Function_v3uint Function
+      %param = OpVariable %_ptr_Function_v3uint Function
+         %40 = OpLoad %v3uint %id_1
+               OpStore %id_0 %40
+         %42 = OpLoad %v3uint %id_0
+               OpStore %param %42
+         %43 = OpFunctionCall %void %_main_vu3_ %param
+               OpReturn
+               OpFunctionEnd
+ %_main_vu3_ = OpFunction %void None %9 ; reads the texel at id.xy, adds 1.0 to both components, writes it at id.xy + (1,1)
+         %id = OpFunctionParameter %_ptr_Function_v3uint
+         %12 = OpLabel
+     %loaded = OpVariable %_ptr_Function_v2float Function
+  %storeTemp = OpVariable %_ptr_Function_v2float Function
+         %20 = OpLoad %17 %TargetTexture
+         %22 = OpLoad %v3uint %id
+         %23 = OpVectorShuffle %v2uint %22 %22 0 1 ; id.xy
+         %24 = OpImageRead %v2float %20 %23 ; two-component result type; NOTE(review): presumably why the file is tagged ".invalid" - confirm
+               OpStore %loaded %24
+         %26 = OpLoad %v2float %loaded
+         %28 = OpCompositeConstruct %v2float %float_1 %float_1
+         %29 = OpFAdd %v2float %26 %28
+               OpStore %storeTemp %29
+         %30 = OpLoad %17 %TargetTexture
+         %31 = OpLoad %v3uint %id
+         %32 = OpVectorShuffle %v2uint %31 %31 0 1 ; id.xy
+         %34 = OpCompositeConstruct %v2uint %uint_1 %uint_1
+         %35 = OpIAdd %v2uint %32 %34 ; id.xy + (1,1)
+         %36 = OpLoad %v2float %storeTemp
+               OpImageWrite %30 %35 %36 ; two-component texel operand
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl/asm/comp/op-spec-constant-op-vector-related.asm.comp b/shaders-opencl/asm/comp/op-spec-constant-op-vector-related.asm.comp
new file mode 100644
index 000000000..65a7eedd9
--- /dev/null
+++ b/shaders-opencl/asm/comp/op-spec-constant-op-vector-related.asm.comp
@@ -0,0 +1,107 @@
+OpCapability Shader ; Test module: OpSpecConstantOp with CompositeInsert/CompositeExtract/VectorShuffle on arrays, a struct and vectors.
+OpMemoryModel Logical GLSL450
+OpEntryPoint GLCompute %main "main" %id ; compute entry point; %id is its only interface variable
+OpExecutionMode %main LocalSize 1 1 1
+OpName %main           "main"
+OpName %id             "gl_GlobalInvocationID"
+OpDecorate %id BuiltIn GlobalInvocationId
+OpDecorate %sc_0  SpecId 0 ; %sc_0, %sc_1, %sc_2 are the specializable scalars (SpecId 0, 1, 2)
+OpDecorate %sc_1  SpecId 1
+OpDecorate %sc_2  SpecId 2
+OpDecorate %i32arr ArrayStride 4
+OpDecorate %buf BufferBlock
+OpDecorate %indata DescriptorSet 0
+OpDecorate %indata Binding 0
+OpDecorate %outdata DescriptorSet 0
+OpDecorate %outdata Binding 1
+OpDecorate %f32arr ArrayStride 4
+OpMemberDecorate %buf 0 Offset 0
+%bool      = OpTypeBool
+%void      = OpTypeVoid
+%voidf     = OpTypeFunction %void
+%u32       = OpTypeInt 32 0
+%i32       = OpTypeInt 32 1
+%f32       = OpTypeFloat 32
+%uvec3     = OpTypeVector %u32 3
+%fvec3     = OpTypeVector %f32 3
+%uvec3ptr  = OpTypePointer Input %uvec3
+%i32ptr    = OpTypePointer Uniform %i32
+%f32ptr    = OpTypePointer Uniform %f32
+%i32arr    = OpTypeRuntimeArray %i32
+%f32arr    = OpTypeRuntimeArray %f32
+%ivec3       = OpTypeVector %i32 3
+%zero        = OpConstant %i32 0
+%one         = OpConstant %i32 1
+%two         = OpConstant %i32 2
+%three       = OpConstant %i32 3
+%iarr3       = OpTypeArray %i32 %three
+%imat3       = OpTypeArray %iarr3 %three ; 3x3 integer grid modeled as an array of arrays
+%struct      = OpTypeStruct %imat3
+%buf         = OpTypeStruct %i32arr
+%bufptr      = OpTypePointer Uniform %buf
+%indata      = OpVariable %bufptr Uniform
+%outdata     = OpVariable %bufptr Uniform
+%id          = OpVariable %uvec3ptr Input
+%ivec3_0     = OpConstantComposite %ivec3 %zero %zero %zero
+%vec3_undef  = OpUndef %ivec3 ; undefined vector used as a shuffle operand below
+%iarr3_0     = OpConstantComposite %iarr3 %zero %zero %zero
+%imat3_0     = OpConstantComposite %imat3 %iarr3_0 %iarr3_0 %iarr3_0
+%struct_0    = OpConstantComposite %struct %imat3_0
+%sc_0        = OpSpecConstant %i32 0 ; all three spec constants default to 0
+%sc_1        = OpSpecConstant %i32 0
+%sc_2        = OpSpecConstant %i32 0
+%iarr3_a     = OpSpecConstantOp %iarr3  CompositeInsert  %sc_0        %iarr3_0     0
+%iarr3_b     = OpSpecConstantOp %iarr3  CompositeInsert  %sc_1        %iarr3_a     1
+%iarr3_c     = OpSpecConstantOp %iarr3  CompositeInsert  %sc_2        %iarr3_b     2 ; {sc_0, sc_1, sc_2}
+%iarr3_d     = OpSpecConstantOp %iarr3  CompositeInsert  %sc_1        %iarr3_0     0
+%iarr3_e     = OpSpecConstantOp %iarr3  CompositeInsert  %sc_2        %iarr3_d     1
+%iarr3_f     = OpSpecConstantOp %iarr3  CompositeInsert  %sc_0        %iarr3_e     2 ; {sc_1, sc_2, sc_0}
+%iarr3_g     = OpSpecConstantOp %iarr3  CompositeInsert  %sc_2        %iarr3_0     0
+%iarr3_h     = OpSpecConstantOp %iarr3  CompositeInsert  %sc_0        %iarr3_g     1
+%iarr3_i     = OpSpecConstantOp %iarr3  CompositeInsert  %sc_1        %iarr3_h     2 ; {sc_2, sc_0, sc_1}
+%imat3_a     = OpSpecConstantOp %imat3  CompositeInsert  %iarr3_c     %imat3_0     0
+%imat3_b     = OpSpecConstantOp %imat3  CompositeInsert  %iarr3_f     %imat3_a     1
+%imat3_c     = OpSpecConstantOp %imat3  CompositeInsert  %iarr3_i     %imat3_b     2 ; rows are %iarr3_c, %iarr3_f, %iarr3_i
+%struct_a    = OpSpecConstantOp %struct CompositeInsert  %imat3_c     %struct_0    0
+%struct_b    = OpSpecConstantOp %struct CompositeInsert  %sc_2        %struct_a    0 1 2 ; nested index path; %struct_b is not referenced again in this module
+%comp_0_0    = OpSpecConstantOp %i32    CompositeExtract %struct_a    0 0 0 ; index path is member, row, column
+%comp_1_0    = OpSpecConstantOp %i32    CompositeExtract %struct_a    0 1 0
+%comp_0_1    = OpSpecConstantOp %i32    CompositeExtract %struct_a    0 0 1
+%comp_2_2    = OpSpecConstantOp %i32    CompositeExtract %struct_a    0 2 2
+%comp_2_0    = OpSpecConstantOp %i32    CompositeExtract %struct_a    0 2 0
+%comp_1_1    = OpSpecConstantOp %i32    CompositeExtract %struct_a    0 1 1
+%cmpres_0    = OpSpecConstantOp %bool   IEqual %comp_0_0 %comp_1_0 ; sc_0 == sc_1
+%cmpres_1    = OpSpecConstantOp %bool   IEqual %comp_0_1 %comp_2_2 ; both operands resolve to sc_1
+%cmpres_2    = OpSpecConstantOp %bool   IEqual %comp_2_0 %comp_1_1 ; both operands resolve to sc_2
+%mustbe_0    = OpSpecConstantOp %i32    Select %cmpres_0 %one %zero
+%mustbe_1    = OpSpecConstantOp %i32    Select %cmpres_1 %one %zero
+%mustbe_2    = OpSpecConstantOp %i32    Select %cmpres_2 %two %one
+%sc_vec3_0   = OpSpecConstantOp %ivec3 CompositeInsert  %sc_0        %ivec3_0     0 ; (sc_0, 0, 0)
+%sc_vec3_1   = OpSpecConstantOp %ivec3 CompositeInsert  %sc_1        %ivec3_0     1 ; (0, sc_1, 0)
+%sc_vec3_2   = OpSpecConstantOp %ivec3 CompositeInsert  %sc_2        %ivec3_0     2 ; (0, 0, sc_2)
+%sc_vec3_0_s = OpSpecConstantOp %ivec3 VectorShuffle    %sc_vec3_0   %vec3_undef  0          0xFFFFFFFF 2 ; 0xFFFFFFFF selects an undefined component
+%sc_vec3_1_s = OpSpecConstantOp %ivec3 VectorShuffle    %sc_vec3_1   %vec3_undef  0xFFFFFFFF 1          0
+%sc_vec3_2_s = OpSpecConstantOp %ivec3 VectorShuffle    %vec3_undef  %sc_vec3_2   5          0xFFFFFFFF 5 ; index 5 is component 2 of the second operand
+%sc_vec3_01  = OpSpecConstantOp %ivec3 VectorShuffle    %sc_vec3_0_s %sc_vec3_1_s 1 0 4 ; (undefined, sc_0, sc_1)
+%sc_vec3_012 = OpSpecConstantOp %ivec3 VectorShuffle    %sc_vec3_01  %sc_vec3_2_s 5 1 2 ; (sc_2, sc_0, sc_1)
+%sc_ext_0    = OpSpecConstantOp %i32   CompositeExtract %sc_vec3_012              0
+%sc_ext_1    = OpSpecConstantOp %i32   CompositeExtract %sc_vec3_012              1
+%sc_ext_2    = OpSpecConstantOp %i32   CompositeExtract %sc_vec3_012              2
+%sc_sub      = OpSpecConstantOp %i32   ISub             %sc_ext_0    %sc_ext_1 ; sc_2 - sc_0
+%sc_factor   = OpSpecConstantOp %i32   IMul             %sc_sub      %sc_ext_2 ; (sc_2 - sc_0) * sc_1
+%main      = OpFunction %void None %voidf ; outdata[x] = indata[x] + sc_final, with x = gl_GlobalInvocationID.x
+%label     = OpLabel
+%subf_a      = OpISub %i32 %one %mustbe_0
+%subf_b      = OpIMul %i32 %subf_a %mustbe_1
+%subf_c      = OpISub %i32 %mustbe_2 %one
+%factor      = OpIMul %i32 %subf_b %subf_c ; (1 - mustbe_0) * mustbe_1 * (mustbe_2 - 1)
+%sc_final    = OpIMul %i32 %factor %sc_factor
+%idval     = OpLoad %uvec3 %id
+%x         = OpCompositeExtract %u32 %idval 0
+%inloc     = OpAccessChain %i32ptr %indata %zero %x
+%inval     = OpLoad %i32 %inloc
+%final     = OpIAdd %i32 %inval %sc_final
+%outloc    = OpAccessChain %i32ptr %outdata %zero %x
+             OpStore %outloc %final
+             OpReturn
+             OpFunctionEnd
diff --git a/shaders-opencl/asm/comp/quantize.asm.comp b/shaders-opencl/asm/comp/quantize.asm.comp
new file mode 100644
index 000000000..f5afc6570
--- /dev/null
+++ b/shaders-opencl/asm/comp/quantize.asm.comp
@@ -0,0 +1,67 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 1
+; Bound: 38
+; Schema: 0
+               OpCapability Shader ; Test module: OpQuantizeToF16 applied in place to the float, vec2, vec3 and vec4 members of SSBO0.
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %4 "main"
+               OpExecutionMode %4 LocalSize 1 1 1
+               OpSource ESSL 310
+               OpName %4 "main"
+               OpName %10 "SSBO0"
+               OpMemberName %10 0 "scalar"
+               OpMemberName %10 1 "vec2_val"
+               OpMemberName %10 2 "vec3_val"
+               OpMemberName %10 3 "vec4_val"
+               OpName %12 ""
+               OpMemberDecorate %10 0 Offset 0
+               OpMemberDecorate %10 1 Offset 8
+               OpMemberDecorate %10 2 Offset 16
+               OpMemberDecorate %10 3 Offset 32
+               OpDecorate %10 BufferBlock
+               OpDecorate %12 DescriptorSet 0
+               OpDecorate %12 Binding 0
+          %2 = OpTypeVoid
+          %3 = OpTypeFunction %2
+          %6 = OpTypeFloat 32
+          %7 = OpTypeVector %6 2
+          %8 = OpTypeVector %6 3
+          %9 = OpTypeVector %6 4
+         %10 = OpTypeStruct %6 %7 %8 %9 ; SSBO0 { scalar, vec2_val, vec3_val, vec4_val }
+         %11 = OpTypePointer Uniform %10
+         %12 = OpVariable %11 Uniform
+         %13 = OpTypeInt 32 1
+         %14 = OpConstant %13 0
+         %15 = OpTypePointer Uniform %6
+         %20 = OpConstant %13 1
+         %21 = OpTypePointer Uniform %7
+         %26 = OpConstant %13 2
+         %27 = OpTypePointer Uniform %8
+         %32 = OpConstant %13 3
+         %33 = OpTypePointer Uniform %9
+          %4 = OpFunction %2 None %3 ; main: load, quantize and store each member in turn
+          %5 = OpLabel
+         %16 = OpAccessChain %15 %12 %14
+         %17 = OpLoad %6 %16
+         %18 = OpQuantizeToF16 %6 %17 ; member 0 (scalar)
+         %19 = OpAccessChain %15 %12 %14
+               OpStore %19 %18
+         %22 = OpAccessChain %21 %12 %20
+         %23 = OpLoad %7 %22
+         %24 = OpQuantizeToF16 %7 %23 ; member 1 (vec2_val)
+         %25 = OpAccessChain %21 %12 %20
+               OpStore %25 %24
+         %28 = OpAccessChain %27 %12 %26
+         %29 = OpLoad %8 %28
+         %30 = OpQuantizeToF16 %8 %29 ; member 2 (vec3_val)
+         %31 = OpAccessChain %27 %12 %26
+               OpStore %31 %30
+         %34 = OpAccessChain %33 %12 %32
+         %35 = OpLoad %9 %34
+         %36 = OpQuantizeToF16 %9 %35 ; member 3 (vec4_val)
+         %37 = OpAccessChain %33 %12 %32
+               OpStore %37 %36
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl/asm/comp/relaxed-block-layout.asm.comp b/shaders-opencl/asm/comp/relaxed-block-layout.asm.comp
new file mode 100644
index 000000000..dd909426d
--- /dev/null
+++ b/shaders-opencl/asm/comp/relaxed-block-layout.asm.comp
@@ -0,0 +1,108 @@
+; SPIR-V
+; Version: 1.3
+; Generator: Khronos Glslang Reference Front End; 7
+; Bound: 63
+; Schema: 0
+               OpCapability Shader ; Test module: buffer struct %foo mixing 32-bit, 8-bit and 16-bit members at tightly packed byte offsets.
+               OpCapability StorageBuffer16BitAccess
+               OpCapability StorageBuffer8BitAccess
+               OpCapability UniformAndStorageBuffer8BitAccess
+               OpExtension "SPV_KHR_8bit_storage"
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main" %gl_LocalInvocationID %gl_GlobalInvocationID %gl_WorkGroupID %gl_NumWorkGroups
+               OpExecutionMode %main LocalSize 1 1 1
+               OpSource GLSL 450
+               OpSourceExtension "GL_EXT_shader_16bit_storage"
+               OpSourceExtension "GL_EXT_shader_8bit_storage"
+               OpName %main "main"
+               OpName %foo "foo"
+               OpMemberName %foo 0 "bar"
+               OpMemberName %foo 1 "baz"
+               OpMemberName %foo 2 "quux"
+               OpMemberName %foo 3 "blah"
+               OpMemberName %foo 4 "wibble"
+               OpName %_ ""
+               OpName %gl_LocalInvocationID "gl_LocalInvocationID"
+               OpName %gl_GlobalInvocationID "gl_GlobalInvocationID"
+               OpName %gl_WorkGroupID "gl_WorkGroupID"
+               OpName %gl_NumWorkGroups "gl_NumWorkGroups"
+               OpMemberDecorate %foo 0 Offset 0
+               OpMemberDecorate %foo 1 Offset 4 ; baz (v3float) sits at byte 4, directly after the 4-byte bar
+               OpMemberDecorate %foo 2 Offset 16 ; quux (uchar)
+               OpMemberDecorate %foo 3 Offset 17 ; blah (v4uchar) starts at an odd byte offset
+               OpMemberDecorate %foo 4 Offset 22 ; wibble (v2half)
+               OpDecorate %foo BufferBlock
+               OpDecorate %_ DescriptorSet 0
+               OpDecorate %_ Binding 0
+               OpDecorate %gl_LocalInvocationID BuiltIn LocalInvocationId
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_WorkGroupID BuiltIn WorkgroupId
+               OpDecorate %gl_NumWorkGroups BuiltIn NumWorkgroups
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+       %uint = OpTypeInt 32 0
+      %float = OpTypeFloat 32
+    %v3float = OpTypeVector %float 3
+      %uchar = OpTypeInt 8 0
+    %v4uchar = OpTypeVector %uchar 4
+       %half = OpTypeFloat 16
+     %v2half = OpTypeVector %half 2
+        %foo = OpTypeStruct %uint %v3float %uchar %v4uchar %v2half
+%_ptr_Uniform_foo = OpTypePointer Uniform %foo
+          %_ = OpVariable %_ptr_Uniform_foo Uniform
+        %int = OpTypeInt 32 1
+      %int_0 = OpConstant %int 0
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+%gl_LocalInvocationID = OpVariable %_ptr_Input_v3uint Input
+     %uint_0 = OpConstant %uint 0
+%_ptr_Input_uint = OpTypePointer Input %uint
+%_ptr_Uniform_uint = OpTypePointer Uniform %uint
+      %int_1 = OpConstant %int 1
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3uint Input
+%_ptr_Uniform_v3float = OpTypePointer Uniform %v3float
+      %int_3 = OpConstant %int 3
+%_ptr_Uniform_v4uchar = OpTypePointer Uniform %v4uchar
+     %v4uint = OpTypeVector %uint 4
+%gl_WorkGroupID = OpVariable %_ptr_Input_v3uint Input
+      %int_4 = OpConstant %int 4
+%_ptr_Uniform_v2half = OpTypePointer Uniform %v2half
+    %v2float = OpTypeVector %float 2
+%gl_NumWorkGroups = OpVariable %_ptr_Input_v3uint Input
+     %v2uint = OpTypeVector %uint 2
+       %main = OpFunction %void None %3 ; main: writes bar and baz, then read-modify-writes blah and wibble (quux is never accessed)
+          %5 = OpLabel
+         %23 = OpAccessChain %_ptr_Input_uint %gl_LocalInvocationID %uint_0
+         %24 = OpLoad %uint %23
+         %26 = OpAccessChain %_ptr_Uniform_uint %_ %int_0
+               OpStore %26 %24 ; bar = gl_LocalInvocationID.x
+         %29 = OpLoad %v3uint %gl_GlobalInvocationID
+         %30 = OpConvertUToF %v3float %29
+         %32 = OpAccessChain %_ptr_Uniform_v3float %_ %int_1
+               OpStore %32 %30 ; baz = gl_GlobalInvocationID converted to float
+         %35 = OpAccessChain %_ptr_Uniform_v4uchar %_ %int_3
+         %36 = OpLoad %v4uchar %35
+         %38 = OpUConvert %v4uint %36 ; widen blah to 32-bit lanes before the add
+         %39 = OpVectorShuffle %v3uint %38 %38 0 1 2
+         %41 = OpLoad %v3uint %gl_WorkGroupID
+         %42 = OpIAdd %v3uint %39 %41
+         %43 = OpCompositeExtract %uint %42 0
+         %44 = OpCompositeExtract %uint %42 1
+         %45 = OpCompositeExtract %uint %42 2
+         %46 = OpCompositeConstruct %v4uint %43 %44 %45 %uint_0 ; fourth lane is the constant 0
+         %47 = OpUConvert %v4uchar %46
+         %48 = OpAccessChain %_ptr_Uniform_v4uchar %_ %int_3
+               OpStore %48 %47 ; blah = (blah.xyz + gl_WorkGroupID, 0) narrowed back to 8 bits
+         %51 = OpAccessChain %_ptr_Uniform_v2half %_ %int_4
+         %52 = OpLoad %v2half %51
+         %54 = OpFConvert %v2float %52 ; widen wibble to 32-bit float for the multiply
+         %57 = OpLoad %v3uint %gl_NumWorkGroups
+         %58 = OpVectorShuffle %v2uint %57 %57 0 1
+         %59 = OpConvertUToF %v2float %58
+         %60 = OpFMul %v2float %54 %59
+         %61 = OpFConvert %v2half %60
+         %62 = OpAccessChain %_ptr_Uniform_v2half %_ %int_4
+               OpStore %62 %61 ; wibble = wibble * gl_NumWorkGroups.xy, narrowed back to half
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl/asm/comp/replicated-composites.spv16.asm.comp b/shaders-opencl/asm/comp/replicated-composites.spv16.asm.comp
new file mode 100644
index 000000000..f6f699e74
--- /dev/null
+++ b/shaders-opencl/asm/comp/replicated-composites.spv16.asm.comp
@@ -0,0 +1,81 @@
+; SPIR-V
+; Version: 1.6
+; Generator: Khronos Glslang Reference Front End; 11
+; Bound: 41
+; Schema: 0
+               OpCapability Shader ; Test module: SPV_EXT_replicated_composites constant, spec-constant and runtime replicate instructions.
+               OpCapability ReplicatedCompositesEXT
+               OpExtension "SPV_EXT_replicated_composites"
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main" %ubo
+               OpExecutionMode %main LocalSize 1 1 1
+               OpSource GLSL 450
+               OpSourceExtension "GL_EXT_shader_explicit_arithmetic_types_float16"
+               OpSourceExtension "GL_KHR_memory_scope_semantics"
+               OpName %main "main"
+               OpName %a "a"
+               OpName %b "b"
+               OpName %c "c"
+               OpName %spec_const "spec_const"
+               OpName %array "array"
+               OpName %d "d"
+               OpName %UBO "UBO"
+               OpMemberName %UBO 0 "uniform_float"
+               OpName %ubo "ubo"
+               OpName %e "e"
+               OpName %f "f"
+               OpDecorate %spec_const SpecId 0
+               OpDecorate %UBO Block
+               OpMemberDecorate %UBO 0 Offset 0
+               OpDecorate %ubo Binding 0
+               OpDecorate %ubo DescriptorSet 0
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+      %float = OpTypeFloat 32
+    %v4float = OpTypeVector %float 4
+%_ptr_Function_v4float = OpTypePointer Function %v4float
+    %float_0 = OpConstant %float 0
+         %11 = OpConstantCompositeReplicateEXT %v4float %float_0 ; vector constant built from one replicated scalar
+%mat4v4float = OpTypeMatrix %v4float 4
+%_ptr_Function_mat4v4float = OpTypePointer Function %mat4v4float
+    %float_1 = OpConstant %float 1
+         %16 = OpConstantCompositeReplicateEXT %v4float %float_1
+         %17 = OpConstantCompositeReplicateEXT %mat4v4float %16 ; matrix constant built from one replicated column
+ %spec_const = OpSpecConstant %float 0
+         %20 = OpSpecConstantCompositeReplicateEXT %v4float %spec_const ; spec-constant variant, replicates %spec_const
+       %uint = OpTypeInt 32 0
+     %uint_8 = OpConstant %uint 8
+%_arr_float_uint_8 = OpTypeArray %float %uint_8
+%_ptr_Function__arr_float_uint_8 = OpTypePointer Function %_arr_float_uint_8
+         %26 = OpConstantCompositeReplicateEXT %_arr_float_uint_8 %float_1 ; array constant built from one replicated element
+        %UBO = OpTypeStruct %float
+%_ptr_Uniform_UBO = OpTypePointer Uniform %UBO
+        %ubo = OpVariable %_ptr_Uniform_UBO Uniform
+        %int = OpTypeInt 32 1
+      %int_0 = OpConstant %int 0
+%_ptr_Uniform_float = OpTypePointer Uniform %float
+       %main = OpFunction %void None %3 ; main: stores the four replicated constants, then builds three replicated composites at run time
+          %5 = OpLabel
+          %a = OpVariable %_ptr_Function_v4float Function
+          %b = OpVariable %_ptr_Function_mat4v4float Function
+          %c = OpVariable %_ptr_Function_v4float Function
+      %array = OpVariable %_ptr_Function__arr_float_uint_8 Function
+          %d = OpVariable %_ptr_Function_v4float Function
+          %e = OpVariable %_ptr_Function_mat4v4float Function
+          %f = OpVariable %_ptr_Function__arr_float_uint_8 Function
+               OpStore %a %11
+               OpStore %b %17
+               OpStore %c %20
+               OpStore %array %26
+         %34 = OpAccessChain %_ptr_Uniform_float %ubo %int_0
+         %35 = OpLoad %float %34 ; ubo.uniform_float, the run-time replicated value
+         %36 = OpCompositeConstructReplicateEXT %v4float %35
+               OpStore %d %36
+         %38 = OpLoad %v4float %d
+         %39 = OpCompositeConstructReplicateEXT %mat4v4float %38 ; replicates the vector loaded from %d as the matrix column
+               OpStore %e %39
+         %40 = OpCompositeConstructReplicateEXT %_arr_float_uint_8 %35
+               OpStore %f %40
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl/asm/comp/specialization-constant-workgroup.asm.comp b/shaders-opencl/asm/comp/specialization-constant-workgroup.asm.comp
new file mode 100644
index 000000000..188e3fec3
--- /dev/null
+++ b/shaders-opencl/asm/comp/specialization-constant-workgroup.asm.comp
@@ -0,0 +1,47 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 1
+; Bound: 24
+; Schema: 0
+               OpCapability Shader ; Test module: WorkgroupSize built-in whose x and z components are specialization constants.
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main"
+               OpExecutionMode %main LocalSize 1 20 1
+               OpSource ESSL 310
+               OpName %main "main"
+               OpName %SSBO "SSBO"
+               OpMemberName %SSBO 0 "a"
+               OpName %_ ""
+               OpMemberDecorate %SSBO 0 Offset 0
+               OpDecorate %SSBO BufferBlock
+               OpDecorate %_ DescriptorSet 0
+               OpDecorate %_ Binding 0
+               OpDecorate %19 SpecId 10 ; x component of the workgroup size
+               OpDecorate %21 SpecId 12 ; z component of the workgroup size
+               OpDecorate %gl_WorkGroupSize BuiltIn WorkgroupSize
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+      %float = OpTypeFloat 32
+       %SSBO = OpTypeStruct %float
+%_ptr_Uniform_SSBO = OpTypePointer Uniform %SSBO
+          %_ = OpVariable %_ptr_Uniform_SSBO Uniform
+        %int = OpTypeInt 32 1
+      %int_0 = OpConstant %int 0
+    %float_1 = OpConstant %float 1
+%_ptr_Uniform_float = OpTypePointer Uniform %float
+       %uint = OpTypeInt 32 0
+         %19 = OpSpecConstant %uint 9 ; default value 9
+    %uint_20 = OpConstant %uint 20
+         %21 = OpSpecConstant %uint 4 ; default value 4
+     %v3uint = OpTypeVector %uint 3
+%gl_WorkGroupSize = OpSpecConstantComposite %v3uint %19 %uint_20 %21 ; (spec 10, constant 20, spec 12)
+       %main = OpFunction %void None %3 ; main: a = a + 1.0 on the SSBO member
+          %5 = OpLabel
+         %14 = OpAccessChain %_ptr_Uniform_float %_ %int_0
+         %15 = OpLoad %float %14
+         %16 = OpFAdd %float %15 %float_1
+         %17 = OpAccessChain %_ptr_Uniform_float %_ %int_0
+               OpStore %17 %16
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl/asm/comp/struct-resource-name-aliasing.asm.comp b/shaders-opencl/asm/comp/struct-resource-name-aliasing.asm.comp
new file mode 100644
index 000000000..384da305a
--- /dev/null
+++ b/shaders-opencl/asm/comp/struct-resource-name-aliasing.asm.comp
@@ -0,0 +1,49 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 7
+; Bound: 21
+; Schema: 0
+               OpCapability Shader ; Test module: a buffer struct type and one of its variables carry the same debug name "bufA".
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main"
+               OpExecutionMode %main LocalSize 8 8 1
+               OpSource HLSL 500
+               OpName %main "main"
+               OpName %_main_ "@main("
+               OpName %bufA "bufA"
+               OpMemberName %bufA 0 "@data"
+               OpName %bufA_0 "bufA"
+               OpName %bufB "bufB"
+               OpDecorate %_runtimearr_uint ArrayStride 4
+               OpMemberDecorate %bufA 0 Offset 0
+               OpDecorate %bufA BufferBlock
+               OpDecorate %bufA_0 DescriptorSet 0
+               OpDecorate %bufB DescriptorSet 0
+               OpDecorate %bufA_0 Binding 0
+               OpDecorate %bufB Binding 1
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+       %uint = OpTypeInt 32 0
+%_runtimearr_uint = OpTypeRuntimeArray %uint
+       %bufA = OpTypeStruct %_runtimearr_uint ; struct type named "bufA"
+%_ptr_Uniform_bufA = OpTypePointer Uniform %bufA
+     %bufA_0 = OpVariable %_ptr_Uniform_bufA Uniform ; variable also named "bufA"
+        %int = OpTypeInt 32 1
+      %int_0 = OpConstant %int 0
+     %uint_0 = OpConstant %uint 0
+%_ptr_Uniform_uint = OpTypePointer Uniform %uint
+       %bufB = OpVariable %_ptr_Uniform_bufA Uniform ; second variable of the same struct type
+       %main = OpFunction %void None %3 ; entry point only forwards to %_main_
+          %5 = OpLabel
+         %20 = OpFunctionCall %void %_main_
+               OpReturn
+               OpFunctionEnd
+     %_main_ = OpFunction %void None %3 ; stores 0 into element 0 of the runtime array in both buffers
+          %7 = OpLabel
+         %17 = OpAccessChain %_ptr_Uniform_uint %bufA_0 %int_0 %int_0
+               OpStore %17 %uint_0
+         %19 = OpAccessChain %_ptr_Uniform_uint %bufB %int_0 %int_0
+               OpStore %19 %uint_0
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl/asm/comp/uint_smulextended.asm.comp b/shaders-opencl/asm/comp/uint_smulextended.asm.comp
new file mode 100644
index 000000000..32d483636
--- /dev/null
+++ b/shaders-opencl/asm/comp/uint_smulextended.asm.comp
@@ -0,0 +1,61 @@
+                         OpCapability Shader ; Test module: OpSMulExtended applied to operands declared with the unsigned 32-bit int type.
+
+                         OpMemoryModel Logical GLSL450
+                         OpEntryPoint GLCompute %main "main" %gl_GlobalInvocationId
+                         OpExecutionMode %main LocalSize 1 1 1
+
+                         OpDecorate %gl_GlobalInvocationId BuiltIn GlobalInvocationId
+                         OpDecorate %ra_uint ArrayStride 4
+                         OpDecorate %struct_uint4 BufferBlock
+                         OpMemberDecorate %struct_uint4 0 Offset 0
+                         OpDecorate %input0 DescriptorSet 0
+                         OpDecorate %input0 Binding 0
+                         OpDecorate %input1 DescriptorSet 0
+                         OpDecorate %input1 Binding 1
+                         OpDecorate %output0 DescriptorSet 0
+                         OpDecorate %output0 Binding 2
+                         OpDecorate %output1 DescriptorSet 0
+                         OpDecorate %output1 Binding 3
+
+                 %uint = OpTypeInt 32 0
+             %ptr_uint = OpTypePointer Uniform %uint
+       %ptr_input_uint = OpTypePointer Input %uint
+                %uint3 = OpTypeVector %uint 3
+      %ptr_input_uint3 = OpTypePointer Input %uint3
+                 %void = OpTypeVoid
+               %voidFn = OpTypeFunction %void
+
+               %uint_0 = OpConstant %uint 0
+               %uint_1 = OpConstant %uint 1
+              %ra_uint = OpTypeRuntimeArray %uint
+                %uint4 = OpTypeVector %uint 4
+         %struct_uint4 = OpTypeStruct %ra_uint
+     %ptr_struct_uint4 = OpTypePointer Uniform %struct_uint4
+           %resulttype = OpTypeStruct %uint %uint ; two-member result struct required by OpSMulExtended
+%gl_GlobalInvocationId = OpVariable %ptr_input_uint3 Input
+               %input0 = OpVariable %ptr_struct_uint4 Uniform
+               %input1 = OpVariable %ptr_struct_uint4 Uniform
+
+              %output0 = OpVariable %ptr_struct_uint4 Uniform
+              %output1 = OpVariable %ptr_struct_uint4 Uniform
+
+                 %main = OpFunction %void None %voidFn ; main: multiplies input0[index] by input1[index] and writes each result member to its own buffer
+            %mainStart = OpLabel
+            %index_ptr = OpAccessChain %ptr_input_uint %gl_GlobalInvocationId %uint_0
+                %index = OpLoad %uint %index_ptr
+              %in_ptr0 = OpAccessChain %ptr_uint %input0 %uint_0 %index
+             %invalue0 = OpLoad %uint %in_ptr0
+              %in_ptr1 = OpAccessChain %ptr_uint %input1 %uint_0 %index
+             %invalue1 = OpLoad %uint %in_ptr1
+
+             %outvalue = OpSMulExtended %resulttype %invalue0 %invalue1
+            %outvalue0 = OpCompositeExtract %uint %outvalue 0 ; member 0: low-order bits of the product
+             %out_ptr0 = OpAccessChain %ptr_uint %output0 %uint_0 %index
+                         OpStore %out_ptr0 %outvalue0
+            %outvalue1 = OpCompositeExtract %uint %outvalue 1 ; member 1: high-order bits of the product
+             %out_ptr1 = OpAccessChain %ptr_uint %output1 %uint_0 %index
+                         OpStore %out_ptr1 %outvalue1
+
+
+                         OpReturn
+                         OpFunctionEnd
diff --git a/shaders-opencl/asm/comp/undefined-constant-composite.asm.comp b/shaders-opencl/asm/comp/undefined-constant-composite.asm.comp
new file mode 100644
index 000000000..9de0501fe
--- /dev/null
+++ b/shaders-opencl/asm/comp/undefined-constant-composite.asm.comp
@@ -0,0 +1,102 @@
+;
+; The shader below is based on the following GLSL shader:
+;
+;     #version 450
+;
+;     struct Pair {
+;         int first;
+;         int second;
+;     };
+;
+;     const Pair constant_pair = { 100, 200 };
+;
+;     layout(set=0, binding=0, std430) buffer InputBlock {
+;         int array[10];
+;     } inputValues;
+;
+;     layout(set=0, binding=1, std430) buffer OutputBlock {
+;         int array[10];
+;     } outputValues;
+;
+;     int add_second (int value, Pair pair) {
+;         return value + pair.second;
+;     }
+;
+;     void main() {
+;         uint idx = gl_GlobalInvocationID.x;
+;         outputValues.array[idx] = add_second(inputValues.array[idx], constant_pair);
+;     }
+;
+; However, the first element of constant_pair has been modified to be undefined.
+;
+                            OpCapability Shader
+                  %std450 = OpExtInstImport "GLSL.std.450"
+                            OpMemoryModel Logical GLSL450
+                            OpEntryPoint GLCompute %main "main" %gl_GlobalInvocationID
+                            OpExecutionMode %main LocalSize 1 1 1
+                            OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+                            OpDecorate %_arr_int_uint_10 ArrayStride 4
+                            OpMemberDecorate %OutputBlock 0 Offset 0
+                            OpDecorate %OutputBlock BufferBlock
+                            OpDecorate %outputValues DescriptorSet 0
+                            OpDecorate %outputValues Binding 1
+                            OpMemberDecorate %InputBlock 0 Offset 0
+                            OpDecorate %InputBlock BufferBlock
+                            OpDecorate %inputValues DescriptorSet 0
+                            OpDecorate %inputValues Binding 0
+                    %void = OpTypeVoid
+               %void_func = OpTypeFunction %void
+                     %int = OpTypeInt 32 1
+                    %uint = OpTypeInt 32 0
+                  %v3uint = OpTypeVector %uint 3
+                   %int_0 = OpConstant %int 0
+                   %int_1 = OpConstant %int 1
+                 %int_200 = OpConstant %int 200
+                  %uint_0 = OpConstant %uint 0
+                 %uint_10 = OpConstant %uint 10
+       %_ptr_Function_int = OpTypePointer Function %int
+                    %Pair = OpTypeStruct %int %int
+      %_ptr_Function_Pair = OpTypePointer Function %Pair
+    %add_second_func_type = OpTypeFunction %int %_ptr_Function_int %_ptr_Function_Pair ; both arguments are passed by Function-storage pointer
+      %_ptr_Function_uint = OpTypePointer Function %uint
+       %_ptr_Input_v3uint = OpTypePointer Input %v3uint
+         %_ptr_Input_uint = OpTypePointer Input %uint
+        %_arr_int_uint_10 = OpTypeArray %int %uint_10
+             %OutputBlock = OpTypeStruct %_arr_int_uint_10
+%_ptr_Uniform_OutputBlock = OpTypePointer Uniform %OutputBlock
+            %outputValues = OpVariable %_ptr_Uniform_OutputBlock Uniform
+              %InputBlock = OpTypeStruct %_arr_int_uint_10
+ %_ptr_Uniform_InputBlock = OpTypePointer Uniform %InputBlock
+             %inputValues = OpVariable %_ptr_Uniform_InputBlock Uniform
+                            ; Replaced %int_100 with an undefined int.
+               %undef_int = OpUndef %int
+                            ; Composed a constant Pair with the undefined int in the first member.
+              %const_Pair = OpConstantComposite %Pair %undef_int %int_200
+        %_ptr_Uniform_int = OpTypePointer Uniform %int
+   %gl_GlobalInvocationID = OpVariable %_ptr_Input_v3uint Input
+                    %main = OpFunction %void None %void_func ; main: outputValues.array[gidx] = add_second(inputValues.array[gidx], const_Pair)
+              %main_label = OpLabel
+                 %param_1 = OpVariable %_ptr_Function_int Function
+                 %param_2 = OpVariable %_ptr_Function_Pair Function
+                %gidx_ptr = OpAccessChain %_ptr_Input_uint %gl_GlobalInvocationID %uint_0
+                    %gidx = OpLoad %uint %gidx_ptr
+         %input_value_ptr = OpAccessChain %_ptr_Uniform_int %inputValues %int_0 %gidx
+             %input_value = OpLoad %int %input_value_ptr
+                            OpStore %param_1 %input_value
+                            OpStore %param_2 %const_Pair ; the whole Pair, undefined first member included, is copied into the argument variable
+                  %retval = OpFunctionCall %int %add_second %param_1 %param_2 ; %add_second is defined after this function
+        %output_value_ptr = OpAccessChain %_ptr_Uniform_int %outputValues %int_0 %gidx
+                            OpStore %output_value_ptr %retval
+                            OpReturn
+                            OpFunctionEnd
+              %add_second = OpFunction %int None %add_second_func_type ; returns value + pair.second
+               %value_ptr = OpFunctionParameter %_ptr_Function_int
+                    %pair = OpFunctionParameter %_ptr_Function_Pair
+        %add_second_label = OpLabel
+                   %value = OpLoad %int %value_ptr
+                            ; Access the second struct member, which is defined.
+         %pair_second_ptr = OpAccessChain %_ptr_Function_int %pair %int_1
+             %pair_second = OpLoad %int %pair_second_ptr
+              %add_result = OpIAdd %int %value %pair_second
+                            OpReturnValue %add_result
+                            OpFunctionEnd
diff --git a/shaders-opencl/asm/comp/undefined-spec-constant-composite.asm.comp b/shaders-opencl/asm/comp/undefined-spec-constant-composite.asm.comp
new file mode 100644
index 000000000..d89a402bf
--- /dev/null
+++ b/shaders-opencl/asm/comp/undefined-spec-constant-composite.asm.comp
@@ -0,0 +1,122 @@
+;
+; The shader below is based on the following GLSL shader:
+;
+;     #version 450
+;
+;     struct Pair {
+;         int first;
+;         int second;
+;     };
+;
+;     const Pair constant_pair = { 100, 200 };
+;
+;     layout (constant_id=0) const int constantFirst = 0;
+;
+;     Pair spec_constant_pair = { constantFirst, 200 };
+;
+;     layout(set=0, binding=0, std430) buffer InputBlock {
+;         int array[10];
+;     } inputValues;
+;
+;     layout(set=0, binding=1, std430) buffer OutputBlock {
+;         int array[10];
+;     } outputValues;
+;
+;     int add_first_and_second (int value, Pair p1, Pair p2) {
+;         return value + p1.first + p2.second;
+;     }
+;
+;     void main() {
+;         uint idx = gl_GlobalInvocationID.x;
+;         outputValues.array[idx] = add_first_and_second(inputValues.array[idx], spec_constant_pair, constant_pair);
+;     }
+;
+; However, both the constant_pair and the spec_constant_pair have one of their members replaced by undefined values.
+;
+                              OpCapability Shader
+                    %std450 = OpExtInstImport "GLSL.std.450"
+                              OpMemoryModel Logical GLSL450
+                              OpEntryPoint GLCompute %main "main" %gl_GlobalInvocationID
+                              OpExecutionMode %main LocalSize 1 1 1
+                              OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+                              OpDecorate %_arr_int_uint_10 ArrayStride 4
+                              OpMemberDecorate %OutputBlock 0 Offset 0
+                              OpDecorate %OutputBlock BufferBlock
+                              OpDecorate %outputValues DescriptorSet 0
+                              OpDecorate %outputValues Binding 1
+                              OpMemberDecorate %InputBlock 0 Offset 0
+                              OpDecorate %InputBlock BufferBlock
+                              OpDecorate %inputValues DescriptorSet 0
+                              OpDecorate %inputValues Binding 0
+                              OpDecorate %spec_constant SpecId 0
+                      %void = OpTypeVoid
+                 %void_func = OpTypeFunction %void
+                       %int = OpTypeInt 32 1
+                      %uint = OpTypeInt 32 0
+                    %v3uint = OpTypeVector %uint 3
+                     %int_0 = OpConstant %int 0
+                     %int_1 = OpConstant %int 1
+                   %int_200 = OpConstant %int 200
+                    %uint_0 = OpConstant %uint 0
+                   %uint_10 = OpConstant %uint 10
+         %_ptr_Function_int = OpTypePointer Function %int
+                      %Pair = OpTypeStruct %int %int
+        %_ptr_Function_Pair = OpTypePointer Function %Pair
+%add_pair_members_func_type = OpTypeFunction %int %_ptr_Function_int %_ptr_Function_Pair %_ptr_Function_Pair
+        %_ptr_Function_uint = OpTypePointer Function %uint
+         %_ptr_Input_v3uint = OpTypePointer Input %v3uint
+           %_ptr_Input_uint = OpTypePointer Input %uint
+          %_arr_int_uint_10 = OpTypeArray %int %uint_10
+               %OutputBlock = OpTypeStruct %_arr_int_uint_10
+  %_ptr_Uniform_OutputBlock = OpTypePointer Uniform %OutputBlock
+              %outputValues = OpVariable %_ptr_Uniform_OutputBlock Uniform
+                %InputBlock = OpTypeStruct %_arr_int_uint_10
+   %_ptr_Uniform_InputBlock = OpTypePointer Uniform %InputBlock
+               %inputValues = OpVariable %_ptr_Uniform_InputBlock Uniform
+                              ; Replaced %int_100 with an undefined int.
+                 %undef_int = OpUndef %int
+                              ; Composed a spec constant Pair with an undefined int in the second member.
+             %spec_constant = OpSpecConstant %int 0
+           %spec_const_Pair = OpSpecConstantComposite %Pair %spec_constant %undef_int
+                              ; Composed a constant Pair with the undefined int in the first member.
+                %const_Pair = OpConstantComposite %Pair %undef_int %int_200
+          %_ptr_Uniform_int = OpTypePointer Uniform %int
+     %gl_GlobalInvocationID = OpVariable %_ptr_Input_v3uint Input
+                      %main = OpFunction %void None %void_func ; Entry point: stores add_pair_members(input, %spec_const_Pair, %const_Pair) to %outputValues at index %gidx.
+                %main_label = OpLabel
+                   %param_1 = OpVariable %_ptr_Function_int Function
+                   %param_2 = OpVariable %_ptr_Function_Pair Function
+                   %param_3 = OpVariable %_ptr_Function_Pair Function
+                  %gidx_ptr = OpAccessChain %_ptr_Input_uint %gl_GlobalInvocationID %uint_0
+                      %gidx = OpLoad %uint %gidx_ptr ; Component 0 of the global invocation ID.
+           %input_value_ptr = OpAccessChain %_ptr_Uniform_int %inputValues %int_0 %gidx ; Member 0 of %inputValues, element %gidx.
+               %input_value = OpLoad %int %input_value_ptr
+                              OpStore %param_1 %input_value
+                              OpStore %param_2 %spec_const_Pair
+                              OpStore %param_3 %const_Pair
+                              ; Pass the input value as the first argument.
+                              ; Pass the specialization constant Pair as the second argument.
+                              ; Pass the constant Pair as the third argument.
+                    %retval = OpFunctionCall %int %add_pair_members %param_1 %param_2 %param_3
+          %output_value_ptr = OpAccessChain %_ptr_Uniform_int %outputValues %int_0 %gidx
+                              OpStore %output_value_ptr %retval
+                              OpReturn
+                              OpFunctionEnd
+          %add_pair_members = OpFunction %int None %add_pair_members_func_type ; Returns the int behind %value_ptr + member 0 of %pair_1 + member 1 of %pair_2.
+                 %value_ptr = OpFunctionParameter %_ptr_Function_int
+                    %pair_1 = OpFunctionParameter %_ptr_Function_Pair
+                    %pair_2 = OpFunctionParameter %_ptr_Function_Pair
+    %add_pair_members_label = OpLabel
+                     %value = OpLoad %int %value_ptr
+                              ; Access the first struct member from the first pair.
+                              ; Access the second struct member from the second pair.
+                              ; Both should be defined according to the function call above.
+          %pair_1_first_ptr = OpAccessChain %_ptr_Function_int %pair_1 %int_0
+         %pair_2_second_ptr = OpAccessChain %_ptr_Function_int %pair_2 %int_1
+              %pair_1_first = OpLoad %int %pair_1_first_ptr ; Member 1 of %pair_1 and member 0 of %pair_2 are never loaded here.
+             %pair_2_second = OpLoad %int %pair_2_second_ptr
+            %partial_result = OpIAdd %int %value %pair_1_first
+              %final_result = OpIAdd %int %partial_result %pair_2_second
+                              OpReturnValue %final_result
+                              OpFunctionEnd
+
diff --git a/shaders-opencl/asm/comp/variable-pointers-2.asm.comp b/shaders-opencl/asm/comp/variable-pointers-2.asm.comp
new file mode 100644
index 000000000..308162f0b
--- /dev/null
+++ b/shaders-opencl/asm/comp/variable-pointers-2.asm.comp
@@ -0,0 +1,117 @@
+; SPIR-V
+; Version: 1.3
+; Generator: Khronos SPIR-V Tools Assembler; 0
+; Bound: 65
+; Schema: 0
+               OpCapability Shader
+               OpCapability VariablePointers
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main" %gl_GlobalInvocationID %gl_LocalInvocationID
+               OpExecutionMode %main LocalSize 1 1 1
+               OpSource GLSL 450
+               OpName %main "main"
+               OpName %foo "foo"
+               OpMemberName %foo 0 "a"
+               OpMemberName %foo 1 "b"
+               OpMemberName %foo 2 "c"
+               OpName %bar "bar"
+               OpMemberName %bar 0 "d"
+               OpName %buf "buf"
+               OpName %cb "cb"
+               OpName %select_buffer "select_buffer"
+               OpName %select_input "select_input"
+               OpName %a "a"
+               OpMemberDecorate %foo 0 Offset 0
+               OpMemberDecorate %foo 1 Offset 512
+               OpMemberDecorate %foo 2 Offset 520
+               OpMemberDecorate %bar 0 Offset 0
+               OpDecorate %foo Block
+               OpDecorate %bar Block
+               OpDecorate %buf DescriptorSet 0
+               OpDecorate %buf Binding 0
+               OpDecorate %cb DescriptorSet 0
+               OpDecorate %cb Binding 1
+               OpDecorate %_ptr_StorageBuffer_int ArrayStride 4
+               OpDecorate %_arr_int_uint_128 ArrayStride 4
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_LocalInvocationID BuiltIn LocalInvocationId
+       %void = OpTypeVoid
+         %15 = OpTypeFunction %void
+        %int = OpTypeInt 32 1
+       %uint = OpTypeInt 32 0
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3uint Input
+%gl_LocalInvocationID = OpVariable %_ptr_Input_v3uint Input
+   %uint_128 = OpConstant %uint 128
+%_arr_int_uint_128 = OpTypeArray %int %uint_128
+      %float = OpTypeFloat 32
+    %v2float = OpTypeVector %float 2
+        %foo = OpTypeStruct %_arr_int_uint_128 %uint %v2float
+%_ptr_StorageBuffer_foo = OpTypePointer StorageBuffer %foo
+        %buf = OpVariable %_ptr_StorageBuffer_foo StorageBuffer
+        %bar = OpTypeStruct %int
+%_ptr_Uniform_bar = OpTypePointer Uniform %bar
+         %cb = OpVariable %_ptr_Uniform_bar Uniform
+     %uint_0 = OpConstant %uint 0
+       %bool = OpTypeBool
+%_ptr_Uniform_int = OpTypePointer Uniform %int
+         %28 = OpTypeFunction %_ptr_StorageBuffer_foo %_ptr_StorageBuffer_foo
+      %int_0 = OpConstant %int 0
+     %uint_1 = OpConstant %uint 1
+         %31 = OpConstantNull %_ptr_StorageBuffer_foo
+         %32 = OpTypeFunction %_ptr_Input_v3uint
+%_ptr_StorageBuffer_int = OpTypePointer StorageBuffer %int
+%_ptr_Function__ptr_StorageBuffer_foo = OpTypePointer Function %_ptr_StorageBuffer_foo
+%select_buffer = OpFunction %_ptr_StorageBuffer_foo None %28 ; Returns %a when member 0 of %cb is non-zero, otherwise the null pointer constant %31.
+          %a = OpFunctionParameter %_ptr_StorageBuffer_foo
+         %33 = OpLabel
+         %34 = OpAccessChain %_ptr_Uniform_int %cb %uint_0
+         %35 = OpLoad %int %34
+         %36 = OpINotEqual %bool %35 %int_0
+         %37 = OpSelect %_ptr_StorageBuffer_foo %36 %a %31 ; OpSelect operating on pointers, not values.
+               OpReturnValue %37
+               OpFunctionEnd
+%select_input = OpFunction %_ptr_Input_v3uint None %32 ; Returns a pointer to gl_GlobalInvocationID when member 0 of %cb is non-zero, otherwise to gl_LocalInvocationID.
+         %38 = OpLabel
+         %39 = OpAccessChain %_ptr_Uniform_int %cb %uint_0
+         %40 = OpLoad %int %39
+         %41 = OpINotEqual %bool %40 %int_0
+         %42 = OpSelect %_ptr_Input_v3uint %41 %gl_GlobalInvocationID %gl_LocalInvocationID ; Selects between two Input-storage builtin variables.
+               OpReturnValue %42
+               OpFunctionEnd
+       %main = OpFunction %void None %15 ; Entry point: advances two StorageBuffer int pointers in lock-step, looping while the values they point at differ.
+         %43 = OpLabel
+         %65 = OpVariable %_ptr_Function__ptr_StorageBuffer_foo Function ; Function-storage variable whose value is itself a StorageBuffer pointer.
+         %44 = OpFunctionCall %_ptr_StorageBuffer_foo %select_buffer %buf
+               OpStore %65 %44
+         %45 = OpFunctionCall %_ptr_Input_v3uint %select_input
+         %66 = OpLoad %_ptr_StorageBuffer_foo %65
+         %46 = OpAccessChain %_ptr_StorageBuffer_int %66 %uint_0 %uint_0 ; Member 0, element 0, through the pointer returned by %select_buffer.
+         %47 = OpAccessChain %_ptr_StorageBuffer_int %buf %uint_0 %uint_0 ; Member 0, element 0, directly through %buf.
+               OpBranch %48
+         %48 = OpLabel
+         %49 = OpPhi %_ptr_StorageBuffer_int %46 %43 %50 %51 ; Pointer-typed phi: %46 on entry from %43, %50 when arriving from continue block %51.
+         %52 = OpPhi %_ptr_StorageBuffer_int %47 %43 %53 %51
+         %54 = OpLoad %int %49
+         %55 = OpLoad %int %52
+         %56 = OpINotEqual %bool %54 %55
+               OpLoopMerge %58 %51 None
+               OpBranchConditional %56 %57 %58 ; Enter the body %57 while the two loaded values differ; otherwise exit to %58.
+         %57 = OpLabel
+         %59 = OpIAdd %int %54 %55
+         %60 = OpLoad %v3uint %45 ; Load through the Input pointer returned by %select_input.
+         %61 = OpCompositeExtract %uint %60 0
+         %62 = OpBitcast %int %61
+         %63 = OpIAdd %int %59 %62
+               OpStore %49 %63
+               OpStore %52 %63
+               OpBranch %51
+         %51 = OpLabel
+         %50 = OpPtrAccessChain %_ptr_StorageBuffer_int %49 %uint_1 ; Step each pointer forward by one element for the next iteration.
+         %53 = OpPtrAccessChain %_ptr_StorageBuffer_int %52 %uint_1
+               OpBranch %48
+         %58 = OpLabel
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl/asm/comp/variable-pointers-store-forwarding.asm.comp b/shaders-opencl/asm/comp/variable-pointers-store-forwarding.asm.comp
new file mode 100644
index 000000000..3dcb04f02
--- /dev/null
+++ b/shaders-opencl/asm/comp/variable-pointers-store-forwarding.asm.comp
@@ -0,0 +1,75 @@
+; SPIR-V
+; Version: 1.3
+; Generator: Khronos SPIR-V Tools Assembler; 0
+; Bound: 40
+; Schema: 0
+               OpCapability Shader
+               OpCapability VariablePointers
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main" %gl_GlobalInvocationID
+               OpExecutionMode %main LocalSize 1 1 1
+               OpSource GLSL 450
+               OpName %main "main"
+               OpName %foo "foo"
+               OpMemberName %foo 0 "a"
+               OpName %bar "bar"
+               OpMemberName %bar 0 "b"
+               OpName %x "x"
+               OpName %y "y"
+               OpName %a "a"
+               OpName %b "b"
+               OpMemberDecorate %foo 0 Offset 0
+               OpMemberDecorate %bar 0 Offset 0
+               OpDecorate %foo Block
+               OpDecorate %bar Block
+               OpDecorate %x DescriptorSet 0
+               OpDecorate %x Binding 0
+               OpDecorate %y DescriptorSet 0
+               OpDecorate %y Binding 1
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+       %void = OpTypeVoid
+         %11 = OpTypeFunction %void
+        %int = OpTypeInt 32 1
+       %uint = OpTypeInt 32 0
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3uint Input
+        %foo = OpTypeStruct %int
+%_ptr_StorageBuffer_foo = OpTypePointer StorageBuffer %foo
+          %x = OpVariable %_ptr_StorageBuffer_foo StorageBuffer
+        %bar = OpTypeStruct %int
+%_ptr_StorageBuffer_bar = OpTypePointer StorageBuffer %bar
+          %y = OpVariable %_ptr_StorageBuffer_bar StorageBuffer
+     %uint_0 = OpConstant %uint 0
+      %int_0 = OpConstant %int 0
+       %bool = OpTypeBool
+%_ptr_StorageBuffer_int = OpTypePointer StorageBuffer %int
+         %22 = OpTypeFunction %_ptr_StorageBuffer_int %_ptr_StorageBuffer_foo %_ptr_StorageBuffer_bar
+%_ptr_Function__ptr_StorageBuffer_int = OpTypePointer Function %_ptr_StorageBuffer_int
+         %24 = OpFunction %_ptr_StorageBuffer_int None %22 ; Returns a pointer to member 0 of %a when gl_GlobalInvocationID component 0 is non-zero, otherwise to member 0 of %b.
+          %a = OpFunctionParameter %_ptr_StorageBuffer_foo
+          %b = OpFunctionParameter %_ptr_StorageBuffer_bar
+         %25 = OpLabel
+         %26 = OpLoad %v3uint %gl_GlobalInvocationID
+         %27 = OpCompositeExtract %uint %26 0
+         %28 = OpINotEqual %bool %27 %uint_0
+         %29 = OpAccessChain %_ptr_StorageBuffer_int %a %uint_0
+         %30 = OpAccessChain %_ptr_StorageBuffer_int %b %uint_0
+         %31 = OpSelect %_ptr_StorageBuffer_int %28 %29 %30 ; OpSelect operating on pointers, not values.
+               OpReturnValue %31
+               OpFunctionEnd
+       %main = OpFunction %void None %11 ; Entry point: stores 0 through the pointer returned by %24, then writes twice the previously loaded member 0 of %x to member 0 of %y.
+         %32 = OpLabel
+         %33 = OpVariable %_ptr_Function__ptr_StorageBuffer_int Function
+         %34 = OpFunctionCall %_ptr_StorageBuffer_int %24 %x %y
+               OpStore %33 %34
+         %35 = OpLoad %_ptr_StorageBuffer_int %33
+         %36 = OpAccessChain %_ptr_StorageBuffer_int %x %uint_0
+         %37 = OpLoad %int %36 ; Loaded before the store below; %35 may point at this same location, since %24 can return the member 0 pointer of %x.
+               OpStore %35 %int_0
+         %38 = OpIAdd %int %37 %37 ; Uses the value loaded before the store, not a reload.
+         %39 = OpAccessChain %_ptr_StorageBuffer_int %y %uint_0
+               OpStore %39 %38
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl/asm/comp/vector-builtin-type-cast-func.asm.comp b/shaders-opencl/asm/comp/vector-builtin-type-cast-func.asm.comp
new file mode 100644
index 000000000..c01432b5d
--- /dev/null
+++ b/shaders-opencl/asm/comp/vector-builtin-type-cast-func.asm.comp
@@ -0,0 +1,147 @@
+; SPIR-V
+; Version: 1.3
+; Generator: Khronos Glslang Reference Front End; 6
+; Bound: 90
+; Schema: 0
+               OpCapability Shader
+               OpCapability ImageQuery
+               OpCapability StorageImageWriteWithoutFormat
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main" %gl_LocalInvocationID
+               OpExecutionMode %main LocalSize 16 16 1
+               OpSource GLSL 450
+               OpName %main "main"
+               OpName %get_texcoord_vi2_vi2_ "get_texcoord(vi2;vi2;"
+               OpName %base "base"
+               OpName %index "index"
+               OpName %gl_LocalInvocationID "gl_LocalInvocationID"
+               OpName %r0 "r0"
+               OpName %u0 "u0"
+               OpName %i "i"
+               OpName %j "j"
+               OpName %param "param"
+               OpName %param_0 "param"
+               OpName %cb1_struct "cb1_struct"
+               OpMemberName %cb1_struct 0 "_m0"
+               OpName %cb0_1 "cb0_1"
+               OpDecorate %gl_LocalInvocationID BuiltIn LocalInvocationId
+               OpDecorate %u0 DescriptorSet 0
+               OpDecorate %u0 Binding 1
+               OpDecorate %u0 NonReadable
+               OpDecorate %_arr_v4float_uint_1 ArrayStride 16
+               OpMemberDecorate %cb1_struct 0 Offset 0
+               OpDecorate %cb1_struct Block
+               OpDecorate %cb0_1 DescriptorSet 0
+               OpDecorate %cb0_1 Binding 0
+               OpDecorate %gl_WorkGroupSize BuiltIn WorkgroupSize
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+        %int = OpTypeInt 32 1
+      %v2int = OpTypeVector %int 2
+%_ptr_Function_v2int = OpTypePointer Function %v2int
+          %9 = OpTypeFunction %v2int %_ptr_Function_v2int %_ptr_Function_v2int
+      %v3int = OpTypeVector %int 3
+%_ptr_Input_v3int = OpTypePointer Input %v3int
+%gl_LocalInvocationID = OpVariable %_ptr_Input_v3int Input
+       %uint = OpTypeInt 32 0
+     %v2uint = OpTypeVector %uint 2
+      %float = OpTypeFloat 32
+         %30 = OpTypeImage %float 2D 0 0 0 2 Unknown
+%_ptr_UniformConstant_30 = OpTypePointer UniformConstant %30
+         %u0 = OpVariable %_ptr_UniformConstant_30 UniformConstant
+     %uint_4 = OpConstant %uint 4
+%_ptr_Function_int = OpTypePointer Function %int
+      %int_0 = OpConstant %int 0
+     %uint_1 = OpConstant %uint 1
+       %bool = OpTypeBool
+     %uint_0 = OpConstant %uint 0
+    %v4float = OpTypeVector %float 4
+%_arr_v4float_uint_1 = OpTypeArray %v4float %uint_1
+ %cb1_struct = OpTypeStruct %_arr_v4float_uint_1
+%_ptr_Uniform_cb1_struct = OpTypePointer Uniform %cb1_struct
+      %cb0_1 = OpVariable %_ptr_Uniform_cb1_struct Uniform
+%_ptr_Uniform_v4float = OpTypePointer Uniform %v4float
+      %int_1 = OpConstant %int 1
+    %uint_16 = OpConstant %uint 16
+     %v3uint = OpTypeVector %uint 3
+%gl_WorkGroupSize = OpConstantComposite %v3uint %uint_16 %uint_16 %uint_1
+       %main = OpFunction %void None %3 ; Entry point: nested loops over i < r0[1] and j < r0[0]; each iteration writes to image %u0 at get_texcoord(r0, (i, j)).
+          %5 = OpLabel
+         %r0 = OpVariable %_ptr_Function_v2int Function
+          %i = OpVariable %_ptr_Function_int Function
+          %j = OpVariable %_ptr_Function_int Function
+      %param = OpVariable %_ptr_Function_v2int Function
+    %param_0 = OpVariable %_ptr_Function_v2int Function
+         %33 = OpLoad %30 %u0
+         %34 = OpImageQuerySize %v2int %33
+         %36 = OpCompositeConstruct %v2uint %uint_4 %uint_4
+         %37 = OpShiftRightArithmetic %v2int %34 %36 ; Image size >> 4 per component; signed base with an unsigned shift amount.
+         %38 = OpCompositeExtract %int %37 0
+         %39 = OpCompositeExtract %int %37 1
+         %40 = OpCompositeConstruct %v2int %38 %39
+               OpStore %r0 %40
+               OpStore %i %int_0
+               OpBranch %44
+         %44 = OpLabel
+               OpLoopMerge %46 %47 None
+               OpBranch %48
+         %48 = OpLabel
+         %49 = OpLoad %int %i
+         %51 = OpAccessChain %_ptr_Function_int %r0 %uint_1
+         %52 = OpLoad %int %51
+         %54 = OpSLessThan %bool %49 %52 ; Outer loop condition: i < r0[1].
+               OpBranchConditional %54 %45 %46
+         %45 = OpLabel
+               OpStore %j %int_0
+               OpBranch %56
+         %56 = OpLabel
+               OpLoopMerge %58 %59 None
+               OpBranch %60
+         %60 = OpLabel
+         %61 = OpLoad %int %j
+         %63 = OpAccessChain %_ptr_Function_int %r0 %uint_0
+         %64 = OpLoad %int %63
+         %65 = OpSLessThan %bool %61 %64 ; Inner loop condition: j < r0[0].
+               OpBranchConditional %65 %57 %58
+         %57 = OpLabel
+         %66 = OpLoad %30 %u0
+         %67 = OpLoad %int %i
+         %68 = OpLoad %int %j
+         %69 = OpCompositeConstruct %v2int %67 %68
+         %71 = OpLoad %v2int %r0
+               OpStore %param %71
+               OpStore %param_0 %69
+         %73 = OpFunctionCall %v2int %get_texcoord_vi2_vi2_ %param %param_0
+         %80 = OpAccessChain %_ptr_Uniform_v4float %cb0_1 %int_0 %int_0
+         %81 = OpLoad %v4float %80
+         %82 = OpVectorShuffle %v4float %81 %81 0 0 0 0 ; Broadcast component 0 to all four components.
+               OpImageWrite %66 %73 %82
+               OpBranch %59
+         %59 = OpLabel
+         %83 = OpLoad %int %j
+         %85 = OpIAdd %int %83 %int_1
+               OpStore %j %85
+               OpBranch %56
+         %58 = OpLabel
+               OpBranch %47
+         %47 = OpLabel
+         %86 = OpLoad %int %i
+         %87 = OpIAdd %int %86 %int_1
+               OpStore %i %87
+               OpBranch %44
+         %46 = OpLabel
+               OpReturn
+               OpFunctionEnd
+%get_texcoord_vi2_vi2_ = OpFunction %v2int None %9 ; Returns base * (components 0 and 1 of gl_LocalInvocationID) + index.
+       %base = OpFunctionParameter %_ptr_Function_v2int
+      %index = OpFunctionParameter %_ptr_Function_v2int
+         %13 = OpLabel
+         %14 = OpLoad %v2int %base
+         %20 = OpLoad %v3int %gl_LocalInvocationID ; The builtin variable is declared with a signed v3int type in this module.
+         %21 = OpVectorShuffle %v2int %20 %20 0 1
+         %23 = OpIMul %v2int %14 %21
+         %24 = OpLoad %v2int %index
+         %25 = OpIAdd %v2int %23 %24
+               OpReturnValue %25
+               OpFunctionEnd
diff --git a/shaders-opencl/asm/comp/vector-builtin-type-cast.asm.comp b/shaders-opencl/asm/comp/vector-builtin-type-cast.asm.comp
new file mode 100644
index 000000000..e79354026
--- /dev/null
+++ b/shaders-opencl/asm/comp/vector-builtin-type-cast.asm.comp
@@ -0,0 +1,128 @@
+; SPIR-V
+; Version: 1.3
+; Generator: Khronos Glslang Reference Front End; 6
+; Bound: 78
+; Schema: 0
+               OpCapability Shader
+               OpCapability ImageQuery
+               OpCapability StorageImageWriteWithoutFormat
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main" %gl_LocalInvocationID
+               OpExecutionMode %main LocalSize 16 16 1
+               OpSource GLSL 450
+               OpName %main "main"
+               OpName %r0 "r0"
+               OpName %u0 "u0"
+               OpName %i "i"
+               OpName %j "j"
+               OpName %gl_LocalInvocationID "gl_LocalInvocationID"
+               OpName %cb1_struct "cb1_struct"
+               OpMemberName %cb1_struct 0 "_m0"
+               OpName %cb0_1 "cb0_1"
+               OpDecorate %u0 DescriptorSet 0
+               OpDecorate %u0 Binding 1
+               OpDecorate %u0 NonReadable
+               OpDecorate %gl_LocalInvocationID BuiltIn LocalInvocationId
+               OpDecorate %_arr_v4float_uint_1 ArrayStride 16
+               OpMemberDecorate %cb1_struct 0 Offset 0
+               OpDecorate %cb1_struct Block
+               OpDecorate %cb0_1 DescriptorSet 0
+               OpDecorate %cb0_1 Binding 0
+               OpDecorate %gl_WorkGroupSize BuiltIn WorkgroupSize
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+        %int = OpTypeInt 32 1
+      %v2int = OpTypeVector %int 2
+%_ptr_Function_v2int = OpTypePointer Function %v2int
+      %float = OpTypeFloat 32
+         %11 = OpTypeImage %float 2D 0 0 0 2 Unknown
+%_ptr_UniformConstant_11 = OpTypePointer UniformConstant %11
+         %u0 = OpVariable %_ptr_UniformConstant_11 UniformConstant
+       %uint = OpTypeInt 32 0
+     %uint_4 = OpConstant %uint 4
+     %v2uint = OpTypeVector %uint 2
+%_ptr_Function_int = OpTypePointer Function %int
+      %int_0 = OpConstant %int 0
+     %uint_1 = OpConstant %uint 1
+       %bool = OpTypeBool
+     %uint_0 = OpConstant %uint 0
+      %v3int = OpTypeVector %int 3
+%_ptr_Input_v3int = OpTypePointer Input %v3int
+%gl_LocalInvocationID = OpVariable %_ptr_Input_v3int Input
+    %v4float = OpTypeVector %float 4
+%_arr_v4float_uint_1 = OpTypeArray %v4float %uint_1
+ %cb1_struct = OpTypeStruct %_arr_v4float_uint_1
+%_ptr_Uniform_cb1_struct = OpTypePointer Uniform %cb1_struct
+      %cb0_1 = OpVariable %_ptr_Uniform_cb1_struct Uniform
+%_ptr_Uniform_v4float = OpTypePointer Uniform %v4float
+      %int_1 = OpConstant %int 1
+    %uint_16 = OpConstant %uint 16
+     %v3uint = OpTypeVector %uint 3
+%gl_WorkGroupSize = OpConstantComposite %v3uint %uint_16 %uint_16 %uint_1
+       %main = OpFunction %void None %3 ; Entry point: nested loops over i < r0[1] and j < r0[0]; each iteration writes to image %u0 at r0 * gl_LocalInvocationID components 0,1 + (i, j).
+          %5 = OpLabel
+         %r0 = OpVariable %_ptr_Function_v2int Function
+          %i = OpVariable %_ptr_Function_int Function
+          %j = OpVariable %_ptr_Function_int Function
+         %14 = OpLoad %11 %u0
+         %15 = OpImageQuerySize %v2int %14
+         %19 = OpCompositeConstruct %v2uint %uint_4 %uint_4
+         %20 = OpShiftRightArithmetic %v2int %15 %19 ; Image size >> 4 per component; signed base with an unsigned shift amount.
+         %21 = OpCompositeExtract %int %20 0
+         %22 = OpCompositeExtract %int %20 1
+         %23 = OpCompositeConstruct %v2int %21 %22
+               OpStore %r0 %23
+               OpStore %i %int_0
+               OpBranch %27
+         %27 = OpLabel
+               OpLoopMerge %29 %30 None
+               OpBranch %31
+         %31 = OpLabel
+         %32 = OpLoad %int %i
+         %34 = OpAccessChain %_ptr_Function_int %r0 %uint_1
+         %35 = OpLoad %int %34
+         %37 = OpSLessThan %bool %32 %35 ; Outer loop condition: i < r0[1].
+               OpBranchConditional %37 %28 %29
+         %28 = OpLabel
+               OpStore %j %int_0
+               OpBranch %39
+         %39 = OpLabel
+               OpLoopMerge %41 %42 None
+               OpBranch %43
+         %43 = OpLabel
+         %44 = OpLoad %int %j
+         %46 = OpAccessChain %_ptr_Function_int %r0 %uint_0
+         %47 = OpLoad %int %46
+         %48 = OpSLessThan %bool %44 %47 ; Inner loop condition: j < r0[0].
+               OpBranchConditional %48 %40 %41
+         %40 = OpLabel
+         %49 = OpLoad %11 %u0
+         %50 = OpLoad %v2int %r0
+         %54 = OpLoad %v3int %gl_LocalInvocationID ; The builtin variable is declared with a signed v3int type in this module.
+         %55 = OpVectorShuffle %v2int %54 %54 0 1
+         %57 = OpIMul %v2int %50 %55
+         %58 = OpLoad %int %i
+         %59 = OpLoad %int %j
+         %60 = OpCompositeConstruct %v2int %58 %59
+         %61 = OpIAdd %v2int %57 %60
+         %68 = OpAccessChain %_ptr_Uniform_v4float %cb0_1 %int_0 %int_0
+         %69 = OpLoad %v4float %68
+         %70 = OpVectorShuffle %v4float %69 %69 0 0 0 0 ; Broadcast component 0 to all four components.
+               OpImageWrite %49 %61 %70
+               OpBranch %42
+         %42 = OpLabel
+         %71 = OpLoad %int %j
+         %73 = OpIAdd %int %71 %int_1
+               OpStore %j %73
+               OpBranch %39
+         %41 = OpLabel
+               OpBranch %30
+         %30 = OpLabel
+         %74 = OpLoad %int %i
+         %75 = OpIAdd %int %74 %int_1
+               OpStore %i %75
+               OpBranch %27
+         %29 = OpLabel
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl/comp/access-private-workgroup-in-function.comp b/shaders-opencl/comp/access-private-workgroup-in-function.comp
new file mode 100644
index 000000000..7cb1e6f13
--- /dev/null
+++ b/shaders-opencl/comp/access-private-workgroup-in-function.comp
@@ -0,0 +1,31 @@
+#version 450
+layout(local_size_x = 1) in;
+
+int f;
+shared int u;
+
+void set_f() // Assigns 40 to the non-shared global `f` from inside a helper function.
+{
+	f = 40;
+}
+
+void set_shared_u() // Assigns 50 to the `shared` global `u` from inside a helper function.
+{
+	u = 50;
+}
+
+void main() // Calls both helpers, then writes the same globals directly, branching on gl_LocalInvocationIndex.
+{
+	set_f();
+	set_shared_u();
+	if (gl_LocalInvocationIndex == 0u)
+	{
+		f = 10; // Only `f` is written in this branch.
+	}
+	else
+	{
+		f = 30;
+		u = 20; // The shared variable is written directly only in the else branch.
+	}
+}
+
diff --git a/shaders-opencl/comp/arguments.comp b/shaders-opencl/comp/arguments.comp
new file mode 100644
index 000000000..dd154edd3
--- /dev/null
+++ b/shaders-opencl/comp/arguments.comp
@@ -0,0 +1,13 @@
+#version 450
+layout(local_size_x = 64) in;
+layout(set = 0, binding = 0) buffer Buf { uint data[]; };
+layout(push_constant) uniform parameter
+{
+    uint n;
+} p;
+
+// Each invocation stores its global X index plus the push-constant `n` into data[] at that index.
+void main() {
+    uint i = gl_GlobalInvocationID.x;
+    data[i] = i + p.n;
+}
diff --git a/shaders-opencl/comp/atomic.comp b/shaders-opencl/comp/atomic.comp
new file mode 100644
index 000000000..e25c4f6d2
--- /dev/null
+++ b/shaders-opencl/comp/atomic.comp
@@ -0,0 +1,56 @@
+#version 310 es
+#extension GL_OES_shader_image_atomic : require
+layout(local_size_x = 1) in;
+
+layout(r32ui, binding = 0) uniform highp uimage2D uImage;
+layout(r32i, binding = 1) uniform highp iimage2D iImage;
+layout(binding = 2, std430) buffer SSBO
+{
+    uint u32;
+    int  i32;
+} ssbo;
+
+shared uint shared_u32;
+shared int shared_i32;
+
+void main() // Calls atomicAdd/Or/Xor/And/Min/Max/Exchange/CompSwap on SSBO members and on shared variables, in uint and int forms; return values are discarded.
+{
+    atomicAdd(ssbo.u32, 1u); // Unsigned SSBO member.
+    atomicOr(ssbo.u32, 1u);
+    atomicXor(ssbo.u32, 1u);
+    atomicAnd(ssbo.u32, 1u);
+    atomicMin(ssbo.u32, 1u);
+    atomicMax(ssbo.u32, 1u);
+    atomicExchange(ssbo.u32, 1u);
+    atomicCompSwap(ssbo.u32, 10u, 2u);
+
+    atomicAdd(ssbo.i32, 1); // Signed SSBO member.
+    atomicOr(ssbo.i32, 1);
+    atomicXor(ssbo.i32, 1);
+    atomicAnd(ssbo.i32, 1);
+    atomicMin(ssbo.i32, 1);
+    atomicMax(ssbo.i32, 1);
+    atomicExchange(ssbo.i32, 1);
+    atomicCompSwap(ssbo.i32, 10, 2);
+
+	shared_u32 = 10u; // Shared variables are assigned before the atomics below operate on them.
+	shared_i32 = 10;
+    atomicAdd(shared_u32, 1u); // Unsigned shared variable.
+    atomicOr(shared_u32, 1u);
+    atomicXor(shared_u32, 1u);
+    atomicAnd(shared_u32, 1u);
+    atomicMin(shared_u32, 1u);
+    atomicMax(shared_u32, 1u);
+    atomicExchange(shared_u32, 1u);
+    atomicCompSwap(shared_u32, 10u, 2u);
+
+    atomicAdd(shared_i32, 1); // Signed shared variable.
+    atomicOr(shared_i32, 1);
+    atomicXor(shared_i32, 1);
+    atomicAnd(shared_i32, 1);
+    atomicMin(shared_i32, 1);
+    atomicMax(shared_i32, 1);
+    atomicExchange(shared_i32, 1);
+    atomicCompSwap(shared_i32, 10, 2);
+} // NOTE(review): uImage and iImage are declared in this shader but not referenced in main; confirm that is intended.
+
diff --git a/shaders-opencl/comp/barriers.comp b/shaders-opencl/comp/barriers.comp
new file mode 100644
index 000000000..7e0ea42d4
--- /dev/null
+++ b/shaders-opencl/comp/barriers.comp
@@ -0,0 +1,79 @@
+#version 310 es
+layout(local_size_x = 4) in;
+
+void barrier_shared() // memoryBarrierShared() only; no barrier() call.
+{
+	memoryBarrierShared();
+}
+
+void full_barrier() // memoryBarrier() only; no barrier() call.
+{
+	memoryBarrier();
+}
+
+void image_barrier() // memoryBarrierImage() only; no barrier() call.
+{
+	memoryBarrierImage();
+}
+
+void buffer_barrier() // memoryBarrierBuffer() only; no barrier() call.
+{
+	memoryBarrierBuffer();
+}
+
+void group_barrier() // groupMemoryBarrier() only; no barrier() call.
+{
+	groupMemoryBarrier();
+}
+
+void barrier_shared_exec() // memoryBarrierShared() immediately followed by barrier().
+{
+	memoryBarrierShared();
+	barrier();
+}
+
+void full_barrier_exec() // memoryBarrier() immediately followed by barrier().
+{
+	memoryBarrier();
+	barrier();
+}
+
+void image_barrier_exec() // memoryBarrierImage() immediately followed by barrier().
+{
+	memoryBarrierImage();
+	barrier();
+}
+
+void buffer_barrier_exec() // memoryBarrierBuffer() immediately followed by barrier().
+{
+	memoryBarrierBuffer();
+	barrier();
+}
+
+void group_barrier_exec() // groupMemoryBarrier() immediately followed by barrier().
+{
+	groupMemoryBarrier();
+	barrier();
+}
+
+void exec_barrier() // barrier() only, with no preceding memory barrier call.
+{
+	barrier();
+}
+
+void main() // Calls each helper once: memory-only barriers first, then the memory + barrier() variants, then the lone barrier().
+{
+	barrier_shared(); // Memory barriers without barrier().
+	full_barrier();
+	image_barrier();
+	buffer_barrier();
+	group_barrier();
+
+	barrier_shared_exec(); // Same memory barriers, each followed by barrier().
+	full_barrier_exec();
+	image_barrier_exec();
+	buffer_barrier_exec();
+	group_barrier_exec();
+
+	exec_barrier(); // barrier() on its own.
+}
diff --git a/shaders-opencl/comp/basic.comp b/shaders-opencl/comp/basic.comp
new file mode 100644
index 000000000..f9bf55670
--- /dev/null
+++ b/shaders-opencl/comp/basic.comp
@@ -0,0 +1,28 @@
+#version 310 es
+layout(local_size_x = 1) in;
+
+layout(std430, binding = 0) readonly buffer SSBO
+{
+    vec4 in_data[];
+};
+
+layout(std430, binding = 1) writeonly buffer SSBO2
+{
+    vec4 out_data[];
+};
+
+layout(std430, binding = 2) buffer SSBO3
+{
+    uint counter;
+};
+
+void main()
+{
+    uint ident = gl_GlobalInvocationID.x;
+    vec4 idata = in_data[ident];
+    if (dot(idata, vec4(1.0, 5.0, 6.0, 2.0)) > 8.2)
+    {
+        out_data[atomicAdd(counter, 1u)] = idata;
+    }
+}
+
diff --git a/shaders-opencl/comp/basic.dispatchbase.comp b/shaders-opencl/comp/basic.dispatchbase.comp
new file mode 100644
index 000000000..2c873468c
--- /dev/null
+++ b/shaders-opencl/comp/basic.dispatchbase.comp
@@ -0,0 +1,29 @@
+#version 310 es
+layout(local_size_x_id = 10) in;
+
+layout(std430, binding = 0) readonly buffer SSBO
+{
+    vec4 in_data[];
+};
+
+layout(std430, binding = 1) writeonly buffer SSBO2
+{
+    vec4 out_data[];
+};
+
+layout(std430, binding = 2) buffer SSBO3
+{
+    uint counter;
+};
+
+void main()
+{
+    uint ident = gl_GlobalInvocationID.x;
+    uint workgroup = gl_WorkGroupID.x;
+    vec4 idata = in_data[ident];
+    if (dot(idata, vec4(1.0, 5.0, 6.0, 2.0)) > 8.2)
+    {
+        out_data[atomicAdd(counter, 1u)] = idata;
+    }
+}
+
diff --git a/shaders-opencl/comp/buffer-push-const.comp b/shaders-opencl/comp/buffer-push-const.comp
new file mode 100644
index 000000000..d3f102e46
--- /dev/null
+++ b/shaders-opencl/comp/buffer-push-const.comp
@@ -0,0 +1,9 @@
+#version 450
+layout(local_size_x = 64) in;
+layout(set = 0, binding = 0) buffer Buf { uint data[]; };
+layout(push_constant) uniform parameter { uint n; } p;
+
+void main() {
+    uint i = gl_GlobalInvocationID.x;
+    data[i] = i + p.n;
+}
diff --git a/shaders-opencl/comp/builtins.comp b/shaders-opencl/comp/builtins.comp
new file mode 100644
index 000000000..88bb5951e
--- /dev/null
+++ b/shaders-opencl/comp/builtins.comp
@@ -0,0 +1,12 @@
+#version 310 es
+layout(local_size_x = 8, local_size_y = 4, local_size_z = 2) in;
+
+void main()
+{
+	uvec3 local_id = gl_LocalInvocationID;
+	uvec3 global_id = gl_GlobalInvocationID;
+	uint local_index = gl_LocalInvocationIndex;
+	uvec3 work_group_size = gl_WorkGroupSize;
+	uvec3 num_work_groups = gl_NumWorkGroups;
+	uvec3 work_group_id = gl_WorkGroupID;
+}
diff --git a/shaders-opencl/comp/cfg-preserve-parameter.comp b/shaders-opencl/comp/cfg-preserve-parameter.comp
new file mode 100644
index 000000000..9ef909200
--- /dev/null
+++ b/shaders-opencl/comp/cfg-preserve-parameter.comp
@@ -0,0 +1,54 @@
+#version 310 es
+
+// We write in all paths (and no reads), so should just be out.
+void out_test_0(int cond, inout int i)
+{
+   if (cond == 0)
+      i = 40;
+   else
+      i = 60;
+}
+
+// We write in all paths (and no reads), so should just be out.
+void out_test_1(int cond, inout int i)
+{
+   switch (cond)
+   {
+      case 40:
+         i = 40;
+         break;
+
+      default:
+         i = 70;
+         break;
+   }
+}
+
+// We don't write in all paths, so should be inout.
+void inout_test_0(int cond, inout int i)
+{
+   if (cond == 0)
+      i = 40;
+}
+
+void inout_test_1(int cond, inout int i)
+{
+   switch (cond)
+   {
+      case 40:
+         i = 40;
+         break;
+   }
+}
+
+
+void main()
+{
+   int cond = 40;
+   int i = 50;
+
+   out_test_0(cond, i);
+   out_test_1(cond, i);
+   inout_test_0(cond, i);
+   inout_test_1(cond, i);
+}
diff --git a/shaders-opencl/comp/complex-type-alias.comp b/shaders-opencl/comp/complex-type-alias.comp
new file mode 100644
index 000000000..4b9b6eddb
--- /dev/null
+++ b/shaders-opencl/comp/complex-type-alias.comp
@@ -0,0 +1,41 @@
+#version 450
+layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
+
+struct Foo0
+{
+    float a;
+};
+
+struct Foo1
+{
+	Foo0 a;
+};
+
+void Zero(out Foo0 v)
+{
+	v.a = 0.0;
+}
+
+struct Foo2
+{
+    Foo1 a;
+	float weight;
+};
+
+layout(std430, binding = 0) buffer SSBO
+{
+    Foo2 outputs[];
+};
+
+shared Foo2 coeffs[64];
+
+void main()
+{
+    Foo2 data;
+    data.weight = 0.0;
+    Zero(data.a.a);
+    coeffs[gl_LocalInvocationIndex] = data;
+	barrier();
+    if (gl_LocalInvocationIndex == 0u)
+        outputs[gl_WorkGroupID.x] = coeffs[0];
+}
diff --git a/shaders-opencl/comp/composite-construct.comp b/shaders-opencl/comp/composite-construct.comp
new file mode 100644
index 000000000..305477532
--- /dev/null
+++ b/shaders-opencl/comp/composite-construct.comp
@@ -0,0 +1,31 @@
+#version 310 es
+layout(local_size_x = 1) in;
+
+layout(std430, binding = 0) buffer SSBO0
+{
+   vec4 as[];
+};
+
+layout(std430, binding = 1) buffer SSBO1
+{
+   vec4 bs[];
+};
+
+struct Composite
+{
+   vec4 a;
+   vec4 b;
+};
+
+const vec4 const_values[2] = vec4[](vec4(20.0), vec4(40.0));
+
+void main()
+{
+   vec4 values[2] = vec4[](as[gl_GlobalInvocationID.x], bs[gl_GlobalInvocationID.x]);
+   vec4 copy_values[2];
+   copy_values = const_values;
+   Composite c = Composite(values[0], copy_values[1]);
+
+   as[0] = values[gl_LocalInvocationIndex];
+   bs[1] = c.b;
+}
diff --git a/shaders-opencl/comp/culling.comp b/shaders-opencl/comp/culling.comp
new file mode 100644
index 000000000..9f8331b10
--- /dev/null
+++ b/shaders-opencl/comp/culling.comp
@@ -0,0 +1,26 @@
+#version 310 es
+layout(local_size_x = 4) in;
+
+layout(std430, binding = 0) readonly buffer SSBO
+{
+    float in_data[];
+};
+
+layout(std430, binding = 1) writeonly buffer SSBO2
+{
+    float out_data[];
+};
+
+layout(std430, binding = 2) buffer SSBO3
+{
+    uint count;
+};
+
+void main()
+{
+    uint ident = gl_GlobalInvocationID.x;
+    float idata = in_data[ident];
+    if (idata > 12.0)
+        out_data[atomicAdd(count, 1u)] = idata;
+}
+
diff --git a/shaders-opencl/comp/defer-parens.comp b/shaders-opencl/comp/defer-parens.comp
new file mode 100644
index 000000000..4e8ea6b39
--- /dev/null
+++ b/shaders-opencl/comp/defer-parens.comp
@@ -0,0 +1,30 @@
+#version 310 es
+layout(local_size_x = 1) in;
+
+layout(binding = 0, std430) buffer SSBO
+{
+	vec4 data;
+	int index;
+};
+
+void main()
+{
+	// Tests defer-parens behavior where a binary expression is OpCompositeExtracted chained together
+	// with an OpCompositeConstruct optimization.
+	vec4 d = data;
+	data = vec4(d.x, d.yz + 10.0, d.w);
+
+	// Verify binary ops.
+	data = d + d + d;
+
+	// Verify swizzles.
+	data = (d.yz + 10.0).xxyy;
+
+	// OpCompositeExtract
+	float t = (d.yz + 10.0).y;
+	data = vec4(t);
+
+	// OpVectorExtractDynamic
+	t = (d.zw + 10.0)[index];
+	data = vec4(t);
+}
diff --git a/shaders-opencl/comp/dowhile.comp b/shaders-opencl/comp/dowhile.comp
new file mode 100644
index 000000000..709db75a1
--- /dev/null
+++ b/shaders-opencl/comp/dowhile.comp
@@ -0,0 +1,31 @@
+#version 310 es
+layout(local_size_x = 1) in;
+
+layout(std430, binding = 0) readonly buffer SSBO
+{
+    mat4 mvp;
+    vec4 in_data[];
+};
+
+layout(std430, binding = 1) writeonly buffer SSBO2
+{
+    vec4 out_data[];
+};
+
+int i;
+
+void main()
+{
+    uint ident = gl_GlobalInvocationID.x;
+
+    i = 0;
+    vec4 idat = in_data[ident];
+    do
+    {
+        idat = mvp * idat;
+        i++;
+    } while(i < 16);
+
+    out_data[ident] = idat;
+}
+
diff --git a/shaders-opencl/comp/expect-assume.comp b/shaders-opencl/comp/expect-assume.comp
new file mode 100644
index 000000000..767019e5b
--- /dev/null
+++ b/shaders-opencl/comp/expect-assume.comp
@@ -0,0 +1,19 @@
+#version 450
+#extension GL_EXT_spirv_intrinsics : require
+
+layout(local_size_x = 32) in;
+
+layout(std430, binding = 0) buffer buffer_t {
+    uint z;
+} buf;
+
+spirv_instruction (extensions = ["SPV_KHR_expect_assume"], capabilities = [5629], id = 5630)
+void assume_true(bool condition);
+
+spirv_instruction (extensions = ["SPV_KHR_expect_assume"], capabilities = [5629], id = 5631)
+uint expect(uint value, uint exp_value);
+
+void main() {
+    assume_true(gl_WorkGroupID.x < 32);
+    buf.z = expect(gl_WorkGroupID.z, uint(0));
+}
diff --git a/shaders-opencl/comp/force-recompile-hooks.swizzle.comp b/shaders-opencl/comp/force-recompile-hooks.swizzle.comp
new file mode 100644
index 000000000..2752d3051
--- /dev/null
+++ b/shaders-opencl/comp/force-recompile-hooks.swizzle.comp
@@ -0,0 +1,9 @@
+#version 450
+
+layout(binding = 0) uniform sampler2D foo;
+layout(binding = 1, rgba8) uniform image2D bar;
+
+void main() {
+	vec4 a = texture(foo, vec2(1, 1));
+	imageStore(bar, ivec2(0, 0), a);
+}
diff --git a/shaders-opencl/comp/functions.comp b/shaders-opencl/comp/functions.comp
new file mode 100644
index 000000000..478c8ebe8
--- /dev/null
+++ b/shaders-opencl/comp/functions.comp
@@ -0,0 +1,12 @@
+#version 450
+shared int foo[1337];
+
+void myfunc()
+{
+	foo[0]=13;
+}
+
+void main()
+{
+	myfunc();
+}
diff --git a/shaders-opencl/comp/global-invocation-id.comp b/shaders-opencl/comp/global-invocation-id.comp
new file mode 100644
index 000000000..404ca36a8
--- /dev/null
+++ b/shaders-opencl/comp/global-invocation-id.comp
@@ -0,0 +1,9 @@
+#version 450
+layout(set = 0, binding = 0) buffer myBlock {
+    int a;
+    float b[1];
+} myStorage;
+void main() {
+    myStorage.a = (myStorage.a + 1) % 256;
+    myStorage.b[gl_GlobalInvocationID.x] = myStorage.b[gl_GlobalInvocationID.x] + 0.02;
+}
diff --git a/shaders-opencl/comp/image.comp b/shaders-opencl/comp/image.comp
new file mode 100644
index 000000000..e375534a5
--- /dev/null
+++ b/shaders-opencl/comp/image.comp
@@ -0,0 +1,12 @@
+#version 310 es
+layout(local_size_x = 1) in;
+
+layout(rgba8, binding = 0) uniform readonly mediump image2D uImageIn;
+layout(rgba8, binding = 1) uniform writeonly mediump image2D uImageOut;
+
+void main()
+{
+    vec4 v = imageLoad(uImageIn, ivec2(gl_GlobalInvocationID.xy) + imageSize(uImageIn));
+    imageStore(uImageOut, ivec2(gl_GlobalInvocationID.xy), v);
+}
+
diff --git a/shaders-opencl/comp/insert.comp b/shaders-opencl/comp/insert.comp
new file mode 100644
index 000000000..07c1f8d7a
--- /dev/null
+++ b/shaders-opencl/comp/insert.comp
@@ -0,0 +1,18 @@
+#version 310 es
+layout(local_size_x = 1) in;
+
+layout(std430, binding = 0) writeonly buffer SSBO
+{
+    vec4 out_data[];
+};
+
+void main()
+{
+    vec4 v;
+    v.x = 10.0;
+    v.y = 30.0;
+    v.z = 70.0;
+    v.w = 90.0;
+    out_data[gl_GlobalInvocationID.x] = v;
+    out_data[gl_GlobalInvocationID.x].y = 20.0;
+}
diff --git a/shaders-opencl/comp/local-invocation-id.comp b/shaders-opencl/comp/local-invocation-id.comp
new file mode 100644
index 000000000..ff2033f66
--- /dev/null
+++ b/shaders-opencl/comp/local-invocation-id.comp
@@ -0,0 +1,9 @@
+#version 450
+layout(set = 0, binding = 0) buffer myBlock {
+    int a;
+    float b[1];
+} myStorage;
+void main() {
+    myStorage.a = (myStorage.a + 1) % 256;
+    myStorage.b[gl_LocalInvocationID.x] = myStorage.b[gl_LocalInvocationID.x] + 0.02;
+}
diff --git a/shaders-opencl/comp/local-invocation-index.comp b/shaders-opencl/comp/local-invocation-index.comp
new file mode 100644
index 000000000..b661d9002
--- /dev/null
+++ b/shaders-opencl/comp/local-invocation-index.comp
@@ -0,0 +1,9 @@
+#version 450
+layout(set = 0, binding = 0) buffer myBlock {
+    int a;
+    float b[1];
+} myStorage;
+void main() {
+    myStorage.a = (myStorage.a + 1) % 256;
+    myStorage.b[gl_LocalInvocationIndex.x] = myStorage.b[gl_LocalInvocationIndex.x] + 0.02;
+}
diff --git a/shaders-opencl/comp/local-size-duplicate-spec-id.comp b/shaders-opencl/comp/local-size-duplicate-spec-id.comp
new file mode 100644
index 000000000..060858b97
--- /dev/null
+++ b/shaders-opencl/comp/local-size-duplicate-spec-id.comp
@@ -0,0 +1,15 @@
+#version 450
+
+layout(constant_id=0) const int local_size_x_val = 1;
+layout(constant_id=1) const int local_size_y_val = 1;
+layout(constant_id=2) const int local_size_z_val = 1;
+
+layout(local_size_x_id=0, local_size_y_id=1, local_size_z_id=2) in;
+
+layout(set=0, binding=0) buffer StorageBuffer {
+    uint values[];
+} ssbo;
+
+void main() {
+    ssbo.values[gl_LocalInvocationIndex] = 1u;
+}
diff --git a/shaders-opencl/comp/mod.comp b/shaders-opencl/comp/mod.comp
new file mode 100644
index 000000000..1631456e3
--- /dev/null
+++ b/shaders-opencl/comp/mod.comp
@@ -0,0 +1,26 @@
+#version 310 es
+layout(local_size_x = 1) in;
+
+layout(std430, binding = 0) readonly buffer SSBO
+{
+    vec4 in_data[];
+};
+
+layout(std430, binding = 1) writeonly buffer SSBO2
+{
+    vec4 out_data[];
+};
+
+void main()
+{
+    uint ident = gl_GlobalInvocationID.x;
+    vec4 v = mod(in_data[ident], out_data[ident]);
+    out_data[ident] = v;
+
+    uvec4 vu = floatBitsToUint(in_data[ident]) % floatBitsToUint(out_data[ident]);
+    out_data[ident] = uintBitsToFloat(vu);
+
+    ivec4 vi = floatBitsToInt(in_data[ident]) % floatBitsToInt(out_data[ident]);
+    out_data[ident] = intBitsToFloat(vi);
+}
+
diff --git a/shaders-opencl/comp/modf.comp b/shaders-opencl/comp/modf.comp
new file mode 100644
index 000000000..edadefcf0
--- /dev/null
+++ b/shaders-opencl/comp/modf.comp
@@ -0,0 +1,23 @@
+#version 310 es
+layout(local_size_x = 1) in;
+
+layout(std430, binding = 0) readonly buffer SSBO
+{
+    vec4 in_data[];
+};
+
+layout(std430, binding = 1) writeonly buffer SSBO2
+{
+    vec4 out_data[];
+};
+
+void main()
+{
+    uint ident = gl_GlobalInvocationID.x;
+    vec4 i;
+    //vec4 v = frexp(in_data[ident], i);
+    //out_data[ident] = ldexp(v, i);
+    vec4 v = modf(in_data[ident], i);
+    out_data[ident] = v;
+}
+
diff --git a/shaders-opencl/comp/outer-product.comp b/shaders-opencl/comp/outer-product.comp
new file mode 100644
index 000000000..9aba2a54b
--- /dev/null
+++ b/shaders-opencl/comp/outer-product.comp
@@ -0,0 +1,37 @@
+#version 450
+layout(local_size_x = 1) in;
+
+layout(set = 0, binding = 0, std430) writeonly buffer SSBO
+{
+	mat2 m22;
+	mat2x3 m23;
+	mat2x4 m24;
+	mat3x2 m32;
+	mat3 m33;
+	mat3x4 m34;
+	mat4x2 m42;
+	mat4x3 m43;
+	mat4 m44;
+};
+
+layout(set = 0, binding = 1, std430) readonly buffer ReadSSBO
+{
+	vec2 v2;
+	vec3 v3;
+	vec4 v4;
+};
+
+void main()
+{
+	m22 = outerProduct(v2, v2);
+	m23 = outerProduct(v3, v2);
+	m24 = outerProduct(v4, v2);
+
+	m32 = outerProduct(v2, v3);
+	m33 = outerProduct(v3, v3);
+	m34 = outerProduct(v4, v3);
+
+	m42 = outerProduct(v2, v4);
+	m43 = outerProduct(v3, v4);
+	m44 = outerProduct(v4, v4);
+}
diff --git a/shaders-opencl/comp/packing-test-1.comp b/shaders-opencl/comp/packing-test-1.comp
new file mode 100644
index 000000000..1a8a39e21
--- /dev/null
+++ b/shaders-opencl/comp/packing-test-1.comp
@@ -0,0 +1,18 @@
+#version 450
+struct T1
+{
+    vec3 a;
+    float b;
+};
+
+layout(std430, binding = 1) buffer Buffer0 { T1 buf0[]; };
+layout(std430, binding = 2) buffer Buffer1 { float buf1[]; };
+
+layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
+void main()
+{
+    // Known-broken case in the Metal backend (whole-struct load, then member read); exercised here for OpenCL.
+    T1 v = buf0[0];
+    float x = v.b;
+    buf1[gl_GlobalInvocationID.x] = x;
+}
diff --git a/shaders-opencl/comp/packing-test-2.comp b/shaders-opencl/comp/packing-test-2.comp
new file mode 100644
index 000000000..73268beec
--- /dev/null
+++ b/shaders-opencl/comp/packing-test-2.comp
@@ -0,0 +1,16 @@
+#version 450
+struct T1
+{
+    vec3 a;
+    float b;
+};
+
+layout(std430, binding = 1) buffer Buffer0 { T1 buf0[]; };
+layout(std430, binding = 2) buffer Buffer1 { float buf1[]; };
+
+layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
+void main()
+{
+    float x = buf0[0].b;
+    buf1[gl_GlobalInvocationID.x] = x;
+}
diff --git a/shaders-opencl/comp/read-write-only.comp b/shaders-opencl/comp/read-write-only.comp
new file mode 100644
index 000000000..b224b6f12
--- /dev/null
+++ b/shaders-opencl/comp/read-write-only.comp
@@ -0,0 +1,26 @@
+#version 310 es
+layout(local_size_x = 1) in;
+
+layout(binding = 0, std430) readonly buffer SSBO0
+{
+   vec4 data0;
+   vec4 data1;
+};
+
+layout(binding = 1, std430) restrict buffer SSBO1
+{
+   vec4 data2;
+   vec4 data3;
+};
+
+layout(binding = 2, std430) restrict writeonly buffer SSBO2
+{
+   vec4 data4;
+   vec4 data5;
+};
+
+void main()
+{
+   data4 = data0 + data2;
+   data5 = data1 + data3;
+}
diff --git a/shaders-opencl/comp/rmw-opt.comp b/shaders-opencl/comp/rmw-opt.comp
new file mode 100644
index 000000000..a6e1e7fe7
--- /dev/null
+++ b/shaders-opencl/comp/rmw-opt.comp
@@ -0,0 +1,27 @@
+#version 310 es
+layout(local_size_x = 1) in;
+
+layout(std430, binding = 0) buffer SSBO
+{
+	int a;
+};
+
+void main()
+{
+	a += 10;
+	a -= 10;
+	a *= 10;
+	a /= 10;
+	a <<= 2;
+	a >>= 3;
+	a &= 40;
+	a ^= 10;
+	a %= 40;
+	a |= 1;
+
+	bool c = false;
+	bool d = true;
+	c = c && d;
+	d = d || c;
+	a = c && d ? 1 : 0;
+}
diff --git a/shaders-opencl/comp/scalar-std450-distance-length-normalize.comp b/shaders-opencl/comp/scalar-std450-distance-length-normalize.comp
new file mode 100644
index 000000000..635463229
--- /dev/null
+++ b/shaders-opencl/comp/scalar-std450-distance-length-normalize.comp
@@ -0,0 +1,20 @@
+#version 450
+layout(local_size_x = 1) in;
+
+layout(std430, set = 0, binding = 0) buffer SSBO
+{
+	float a;
+	float b;
+	float c;
+	float d;
+	float e;
+	float f;
+};
+
+void main()
+{
+	c = distance(a, b);
+	d = length(a);
+	e = normalize(a);
+	f = distance(a-1, b-2);
+}
diff --git a/shaders-opencl/comp/shared-std450.double.comp b/shaders-opencl/comp/shared-std450.double.comp
new file mode 100644
index 000000000..07e96e6bc
--- /dev/null
+++ b/shaders-opencl/comp/shared-std450.double.comp
@@ -0,0 +1,27 @@
+#version 450
+layout(local_size_x = 4) in;
+
+shared double sShared[gl_WorkGroupSize.x];
+
+layout(std430, binding = 0) readonly buffer SSBO
+{
+    double in_data[];
+};
+
+layout(std430, binding = 1) writeonly buffer SSBO2
+{
+    double out_data[];
+};
+
+void main()
+{
+    uint ident = gl_GlobalInvocationID.x;
+    double idata = in_data[ident];
+
+    sShared[gl_LocalInvocationIndex] = idata;
+    memoryBarrierShared();
+    barrier();
+
+    out_data[ident] = sShared[gl_WorkGroupSize.x - gl_LocalInvocationIndex - 1u];
+}
+
diff --git a/shaders-opencl/comp/shared-struct-bool-cast.comp b/shaders-opencl/comp/shared-struct-bool-cast.comp
new file mode 100644
index 000000000..d6479b3e4
--- /dev/null
+++ b/shaders-opencl/comp/shared-struct-bool-cast.comp
@@ -0,0 +1,35 @@
+#version 450
+layout(local_size_x = 1) in;
+
+layout(std140, binding = 0) buffer block { highp uint passed; };
+struct S1 {
+	mediump ivec3 a;
+	highp uvec2 b;
+	bvec4 c;
+	mediump uint d;
+};
+
+bool compare_ivec3    (highp ivec3 a, highp ivec3 b)  { return a == b; }
+bool compare_uint     (highp uint a, highp uint b)    { return a == b; }
+bool compare_uvec2    (highp uvec2 a, highp uvec2 b)  { return a == b; }
+bool compare_bvec4    (bvec4 a, bvec4 b)              { return a == b; }
+
+shared S1 s1;
+
+void main (void) {
+	s1.a = ivec3(6, 8, 8);
+	s1.b = uvec2(4u, 4u);
+	s1.c = bvec4(false, false, false, true);
+	s1.d = 6u;
+
+	barrier();
+	memoryBarrier();
+	bool allOk = true;
+	allOk = allOk && compare_ivec3(ivec3(6, 8, 8), s1.a);
+	allOk = allOk && compare_uvec2(uvec2(4u, 4u), s1.b);
+	allOk = allOk && compare_bvec4(bvec4(false, false, false, true), s1.c);
+	allOk = allOk && compare_uint(6u, s1.d);
+	if (allOk)
+		passed++;
+
+}
diff --git a/shaders-opencl/comp/shared-zero-init-simple.comp b/shaders-opencl/comp/shared-zero-init-simple.comp
new file mode 100644
index 000000000..fe9bac5ad
--- /dev/null
+++ b/shaders-opencl/comp/shared-zero-init-simple.comp
@@ -0,0 +1,24 @@
+#version 450
+#extension GL_EXT_null_initializer : enable
+layout(local_size_x = 4) in;
+
+shared float sShared = {};
+
+layout(std430, binding = 0) readonly buffer SSBO
+{
+    float in_data[];
+};
+
+layout(std430, binding = 1) writeonly buffer SSBO2
+{
+    float out_data[];
+};
+
+void main()
+{
+    uint ident = gl_GlobalInvocationID.x;
+    float idata = in_data[ident];
+
+    out_data[ident] = sShared + idata;
+}
+
diff --git a/shaders-opencl/comp/shared-zero-init.comp b/shaders-opencl/comp/shared-zero-init.comp
new file mode 100644
index 000000000..f30522c77
--- /dev/null
+++ b/shaders-opencl/comp/shared-zero-init.comp
@@ -0,0 +1,28 @@
+#version 450
+#extension GL_EXT_null_initializer : enable
+layout(local_size_x = 4) in;
+
+shared float sShared[gl_WorkGroupSize.x] = {};
+
+layout(std430, binding = 0) readonly buffer SSBO
+{
+    float in_data[];
+};
+
+layout(std430, binding = 1) writeonly buffer SSBO2
+{
+    float out_data[];
+};
+
+void main()
+{
+    uint ident = gl_GlobalInvocationID.x;
+    float idata = in_data[ident];
+
+    sShared[gl_LocalInvocationIndex] += idata;
+    memoryBarrierShared();
+    barrier();
+
+    out_data[ident] = sShared[gl_WorkGroupSize.x - gl_LocalInvocationIndex - 1u];
+}
+
diff --git a/shaders-opencl/comp/shared.comp b/shaders-opencl/comp/shared.comp
new file mode 100644
index 000000000..4deff9359
--- /dev/null
+++ b/shaders-opencl/comp/shared.comp
@@ -0,0 +1,27 @@
+#version 310 es
+layout(local_size_x = 4) in;
+
+shared float sShared[gl_WorkGroupSize.x];
+
+layout(std430, binding = 0) readonly buffer SSBO
+{
+    float in_data[];
+};
+
+layout(std430, binding = 1) writeonly buffer SSBO2
+{
+    float out_data[];
+};
+
+void main()
+{
+    uint ident = gl_GlobalInvocationID.x;
+    float idata = in_data[ident];
+
+    sShared[gl_LocalInvocationIndex] = idata;
+    memoryBarrierShared();
+    barrier();
+
+    out_data[ident] = sShared[gl_WorkGroupSize.x - gl_LocalInvocationIndex - 1u];
+}
+
diff --git a/shaders-opencl/comp/spec-constant-work-group-size.comp b/shaders-opencl/comp/spec-constant-work-group-size.comp
new file mode 100644
index 000000000..09b65dc99
--- /dev/null
+++ b/shaders-opencl/comp/spec-constant-work-group-size.comp
@@ -0,0 +1,17 @@
+#version 450
+layout(local_size_x_id = 10, local_size_y = 20) in;
+
+layout(constant_id = 0) const int a = 1;
+layout(constant_id = 1) const int b = 2;
+
+layout(set = 1, binding = 0) writeonly buffer SSBO
+{
+	int v[];
+};
+
+void main()
+{
+	int spec_const_array_size[b];
+	spec_const_array_size[a] = a;
+	v[a + gl_WorkGroupSize.x + gl_WorkGroupSize.y] = b + spec_const_array_size[1 - a];
+}
diff --git a/shaders-opencl/comp/struct-layout.comp b/shaders-opencl/comp/struct-layout.comp
new file mode 100644
index 000000000..5a2b7802d
--- /dev/null
+++ b/shaders-opencl/comp/struct-layout.comp
@@ -0,0 +1,24 @@
+#version 310 es
+layout(local_size_x = 1) in;
+
+struct Foo
+{
+    mat4 m;
+};
+
+layout(std430, binding = 0) readonly buffer SSBO
+{
+    Foo in_data[];
+};
+
+layout(std430, binding = 1) writeonly buffer SSBO2
+{
+    Foo out_data[];
+};
+
+void main()
+{
+    uint ident = gl_GlobalInvocationID.x;
+    out_data[ident].m = in_data[ident].m * in_data[ident].m;
+}
+
diff --git a/shaders-opencl/comp/struct-nested.comp b/shaders-opencl/comp/struct-nested.comp
new file mode 100644
index 000000000..d9645cbc4
--- /dev/null
+++ b/shaders-opencl/comp/struct-nested.comp
@@ -0,0 +1,20 @@
+#version 450
+				   
+struct s1
+{
+	int a;
+};
+
+struct s2
+{
+	s1 b;
+};
+
+layout(std430, binding = 1) buffer dstbuffer{ s2 test[]; };
+layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
+void main()
+{
+	s2 testVal;
+	testVal.b.a = 0;
+	test[0] = testVal;
+}
\ No newline at end of file
diff --git a/shaders-opencl/comp/struct-packing.invalid.comp b/shaders-opencl/comp/struct-packing.invalid.comp
new file mode 100644
index 000000000..5baf45cb3
--- /dev/null
+++ b/shaders-opencl/comp/struct-packing.invalid.comp
@@ -0,0 +1,77 @@
+#version 310 es
+layout(local_size_x = 1) in;
+
+struct S0
+{
+    vec2 a[1];
+    float b;
+};
+
+struct S1
+{
+    vec3 a;
+    float b;
+};
+
+struct S2
+{
+    vec3 a[1];
+    float b;
+};
+
+struct S3
+{
+    vec2 a;
+    float b;
+};
+
+struct S4
+{
+	vec2 c;
+};
+
+struct Content
+{
+    S0 m0s[1];
+    S1 m1s[1];
+    S2 m2s[1];
+    S0 m0;
+    S1 m1;
+    S2 m2;
+    S3 m3;
+    float m4;
+
+	S4 m3s[8];
+};
+
+layout(binding = 1, std430) buffer SSBO1
+{
+    Content content;
+    Content content1[2];
+    Content content2;
+
+    layout(column_major) mat2 m0;
+    layout(column_major) mat2 m1;
+    layout(column_major) mat2x3 m2[4];
+    layout(column_major) mat3x2 m3;
+    layout(row_major) mat2 m4;
+    layout(row_major) mat2 m5[9];
+    layout(row_major) mat2x3 m6[4][2];
+    layout(row_major) mat3x2 m7;
+    float array[];
+} ssbo_430;
+
+layout(binding = 0, std140) buffer SSBO0
+{
+    Content content;
+    Content content1[2];
+    Content content2;
+    float array[];
+} ssbo_140;
+
+void main()
+{
+    ssbo_430.content = ssbo_140.content;
+    ssbo_430.content.m1.a = ssbo_430.m6[1][1] * ssbo_430.content.m3.a;	// test packed matrix access
+}
+
diff --git a/shaders-opencl/comp/torture-loop.comp b/shaders-opencl/comp/torture-loop.comp
new file mode 100644
index 000000000..54a1221a1
--- /dev/null
+++ b/shaders-opencl/comp/torture-loop.comp
@@ -0,0 +1,40 @@
+#version 310 es
+layout(local_size_x = 1) in;
+
+layout(std430, binding = 0) readonly buffer SSBO
+{
+    mat4 mvp;
+    vec4 in_data[];
+};
+
+layout(std430, binding = 1) writeonly buffer SSBO2
+{
+    vec4 out_data[];
+};
+
+void main()
+{
+    uint ident = gl_GlobalInvocationID.x;
+    vec4 idat = in_data[ident];
+
+    int k = 0;
+
+    // Continue with side effects.
+    while (++k < 10)
+    {
+        idat *= 2.0;
+        k++;
+    }
+
+    // Again used here ...
+    for (uint i = 0u; i < 16u; i++, k++)
+        for (uint j = 0u; j < 30u; j++)
+            idat = mvp * idat;
+
+    do
+    {
+        k++;
+    } while (k > 10);
+    out_data[ident] = idat;
+}
+
diff --git a/shaders-opencl/comp/type-alias.comp b/shaders-opencl/comp/type-alias.comp
new file mode 100644
index 000000000..343d350a2
--- /dev/null
+++ b/shaders-opencl/comp/type-alias.comp
@@ -0,0 +1,45 @@
+#version 310 es
+layout(local_size_x = 1) in;
+
+struct S0
+{
+   vec4 a;
+};
+
+struct S1
+{
+   vec4 a;
+};
+
+vec4 overload(S0 s0)
+{
+   return s0.a;
+}
+
+vec4 overload(S1 s1)
+{
+   return s1.a;
+}
+
+layout(std430, binding = 0) buffer SSBO0
+{
+   S0 s0s[];
+};
+
+layout(std430, binding = 1) buffer SSBO1
+{
+   S1 s1s[];
+};
+
+layout(std430, binding = 2) buffer SSBO2
+{
+   vec4 outputs[];
+};
+
+
+void main()
+{
+   S0 s0 = s0s[gl_GlobalInvocationID.x];
+   S1 s1 = s1s[gl_GlobalInvocationID.x];
+   outputs[gl_GlobalInvocationID.x] = overload(s0) + overload(s1); 
+}
diff --git a/shaders-opencl/comp/udiv.comp b/shaders-opencl/comp/udiv.comp
new file mode 100644
index 000000000..d4e1133bc
--- /dev/null
+++ b/shaders-opencl/comp/udiv.comp
@@ -0,0 +1,17 @@
+#version 310 es
+layout(local_size_x = 1) in;
+
+layout(std430, binding = 0) buffer SSBO
+{
+    uint inputs[];
+};
+
+layout(std430, binding = 1) buffer SSBO2
+{
+    uint outputs[];
+};
+
+void main()
+{
+    outputs[gl_GlobalInvocationID.x] = inputs[gl_GlobalInvocationID.x] / 29u;
+}
diff --git a/shaders-opencl/comp/writable-ssbo.comp b/shaders-opencl/comp/writable-ssbo.comp
new file mode 100644
index 000000000..d0cc18deb
--- /dev/null
+++ b/shaders-opencl/comp/writable-ssbo.comp
@@ -0,0 +1,9 @@
+#version 450
+layout(set = 0, binding = 0) buffer myBlock {
+    int a;
+    float b;
+} myStorage;
+void main() {
+    myStorage.a = (myStorage.a + 1) % 256;
+    myStorage.b = myStorage.b + 0.02;
+}
diff --git a/spirv_glsl.hpp b/spirv_glsl.hpp
index d6c20247a..4773595db 100644
--- a/spirv_glsl.hpp
+++ b/spirv_glsl.hpp
@@ -266,7 +266,7 @@ class CompilerGLSL : public Compiler
 	// require_extension("GL_KHR_my_extension");
 	void require_extension(const std::string &ext);
 
-	// Returns the list of required extensions. After compilation this will contains any other 
+	// Returns the list of required extensions. After compilation this will contain any other
 	// extensions that the compiler used automatically, in addition to the user specified ones.
 	const SmallVector<std::string> &get_required_extensions() const;
 
@@ -403,6 +403,16 @@ class CompilerGLSL : public Compiler
 
 	// Virtualize methods which need to be overridden by subclass targets like C++ and such.
 	virtual void emit_function_prototype(SPIRFunction &func, const Bitset &return_flags);
+	// Called right after the opening { of a non-entry helper function body.
+	// Override to emit per-function preamble declarations (e.g. #define aliases); the default emits nothing.
+	virtual void emit_function_local_declarations(SPIRFunction &)
+	{
+	}
+	// Called right before the closing } of a non-entry helper function body.
+	// Override to clean up anything emitted by emit_function_local_declarations; the default emits nothing.
+	virtual void emit_function_local_epilogue(SPIRFunction &)
+	{
+	}
 
 	SPIRBlock *current_emitting_block = nullptr;
 	SmallVector<SPIRBlock *> current_emitting_switch_stack;
@@ -451,9 +461,8 @@ class CompilerGLSL : public Compiler
 	virtual void emit_struct_member(const SPIRType &type, uint32_t member_type_id, uint32_t index,
 	                                const std::string &qualifier = "", uint32_t base_offset = 0);
 	virtual std::string image_type_glsl(const SPIRType &type, uint32_t id = 0, bool member = false);
-	std::string constant_expression(const SPIRConstant &c,
-	                                bool inside_block_like_struct_scope = false,
-	                                bool inside_struct_scope = false);
+	virtual std::string constant_expression(const SPIRConstant &c, bool inside_block_like_struct_scope = false,
+	                                        bool inside_struct_scope = false);
 	virtual std::string constant_op_expression(const SPIRConstantOp &cop);
 	virtual std::string constant_expression_vector(const SPIRConstant &c, uint32_t vector);
 	virtual void emit_fixup();
@@ -520,7 +529,7 @@ class CompilerGLSL : public Compiler
 	}
 
 	template <typename T, typename... Ts>
-	inline void statement_inner(T &&t, Ts &&... ts)
+	inline void statement_inner(T &&t, Ts &&...ts)
 	{
 		buffer << std::forward<T>(t);
 		statement_count++;
@@ -528,7 +537,7 @@ class CompilerGLSL : public Compiler
 	}
 
 	template <typename... Ts>
-	inline void statement(Ts &&... ts)
+	inline void statement(Ts &&...ts)
 	{
 		if (is_forcing_recompilation())
 		{
@@ -553,7 +562,7 @@ class CompilerGLSL : public Compiler
 	}
 
 	template <typename... Ts>
-	inline void statement_no_indent(Ts &&... ts)
+	inline void statement_no_indent(Ts &&...ts)
 	{
 		auto old_indent = indent;
 		indent = 0;
@@ -588,15 +597,14 @@ class CompilerGLSL : public Compiler
 	void add_local_variable_name(uint32_t id);
 	void add_resource_name(uint32_t id);
 	void add_member_name(SPIRType &type, uint32_t name);
-	void add_function_overload(const SPIRFunction &func);
+	virtual void add_function_overload(const SPIRFunction &func);
 
 	virtual bool is_non_native_row_major_matrix(uint32_t id);
 	virtual bool member_is_non_native_row_major_matrix(const SPIRType &type, uint32_t index);
 	bool member_is_remapped_physical_type(const SPIRType &type, uint32_t index) const;
 	bool member_is_packed_physical_type(const SPIRType &type, uint32_t index) const;
 	virtual std::string convert_row_major_matrix(std::string exp_str, const SPIRType &exp_type,
-	                                             uint32_t physical_type_id, bool is_packed,
-	                                             bool relaxed = false);
+	                                             uint32_t physical_type_id, bool is_packed, bool relaxed = false);
 
 	std::unordered_set<std::string> local_variable_names;
 	std::unordered_set<std::string> resource_names;
@@ -672,7 +680,7 @@ class CompilerGLSL : public Compiler
 		bool supports_spec_constant_array_size = true;
 	} backend;
 
-	void emit_struct(SPIRType &type);
+	virtual void emit_struct(SPIRType &type);
 	void emit_resources();
 	void emit_extension_workarounds(ExecutionModel model);
 	void emit_subgroup_arithmetic_workaround(const std::string &func, Op op, GroupOperation group_op);
@@ -724,7 +732,8 @@ class CompilerGLSL : public Compiler
 	                          const char *op);
 	void emit_binary_func_op(uint32_t result_type, uint32_t result_id, uint32_t op0, uint32_t op1, const char *op);
 	void emit_atomic_func_op(uint32_t result_type, uint32_t result_id, uint32_t op0, uint32_t op1, const char *op);
-	void emit_atomic_func_op(uint32_t result_type, uint32_t result_id, uint32_t op0, uint32_t op1, uint32_t op2, const char *op);
+	void emit_atomic_func_op(uint32_t result_type, uint32_t result_id, uint32_t op0, uint32_t op1, uint32_t op2,
+	                         const char *op);
 
 	void emit_unary_func_op_cast(uint32_t result_type, uint32_t result_id, uint32_t op0, const char *op,
 	                             SPIRType::BaseType input_type, SPIRType::BaseType expected_result_type);
@@ -747,7 +756,8 @@ class CompilerGLSL : public Compiler
 	void emit_unrolled_binary_op(uint32_t result_type, uint32_t result_id, uint32_t op0, uint32_t op1, const char *op,
 	                             bool negate, SPIRType::BaseType expected_type);
 	void emit_binary_op_cast(uint32_t result_type, uint32_t result_id, uint32_t op0, uint32_t op1, const char *op,
-	                         SPIRType::BaseType input_type, bool skip_cast_if_equal_type, bool implicit_integer_promotion);
+	                         SPIRType::BaseType input_type, bool skip_cast_if_equal_type,
+	                         bool implicit_integer_promotion);
 
 	SPIRType binary_op_bitcast_helper(std::string &cast_op0, std::string &cast_op1, SPIRType::BaseType &input_type,
 	                                  uint32_t op0, uint32_t op1, bool skip_cast_if_equal_type);
@@ -781,8 +791,8 @@ class CompilerGLSL : public Compiler
 	virtual bool access_chain_needs_stage_io_builtin_translation(uint32_t base);
 
 	virtual bool check_physical_type_cast(std::string &expr, const SPIRType *type, uint32_t physical_type);
-	virtual bool prepare_access_chain_for_scalar_access(std::string &expr, const SPIRType &type,
-	                                                    StorageClass storage, bool &is_packed);
+	virtual bool prepare_access_chain_for_scalar_access(std::string &expr, const SPIRType &type, StorageClass storage,
+	                                                    bool &is_packed);
 
 	std::string access_chain(uint32_t base, const uint32_t *indices, uint32_t count, const SPIRType &target_type,
 	                         AccessChainMeta *meta = nullptr, bool ptr_chain = false);
@@ -813,14 +823,14 @@ class CompilerGLSL : public Compiler
 	SPIRExpression &emit_uninitialized_temporary_expression(uint32_t type, uint32_t id);
 	virtual void append_global_func_args(const SPIRFunction &func, uint32_t index, SmallVector<std::string> &arglist);
 	std::string to_non_uniform_aware_expression(uint32_t id);
-	std::string to_atomic_ptr_expression(uint32_t id);
-	std::string to_pretty_expression_if_int_constant(
-			uint32_t id,
-			const GlslConstantNameMapping *mapping_start, const GlslConstantNameMapping *mapping_end,
-			bool register_expression_read = true);
+	virtual std::string to_atomic_ptr_expression(uint32_t id);
+	std::string to_pretty_expression_if_int_constant(uint32_t id, const GlslConstantNameMapping *mapping_start,
+	                                                 const GlslConstantNameMapping *mapping_end,
+	                                                 bool register_expression_read = true);
 	std::string to_expression(uint32_t id, bool register_expression_read = true);
 	std::string to_composite_constructor_expression(const SPIRType &parent_type, uint32_t id, bool block_like_type);
-	std::string to_rerolled_array_expression(const SPIRType &parent_type, const std::string &expr, const SPIRType &type);
+	std::string to_rerolled_array_expression(const SPIRType &parent_type, const std::string &expr,
+	                                         const SPIRType &type);
 	std::string to_enclosed_expression(uint32_t id, bool register_expression_read = true);
 	std::string to_unpacked_expression(uint32_t id, bool register_expression_read = true);
 	std::string to_unpacked_row_major_matrix_expression(uint32_t id);
@@ -837,9 +847,10 @@ class CompilerGLSL : public Compiler
 	std::string address_of_expression(const std::string &expr);
 	void strip_enclosed_expression(std::string &expr);
 	std::string to_member_name(const SPIRType &type, uint32_t index);
-	virtual std::string to_member_reference(uint32_t base, const SPIRType &type, uint32_t index, bool ptr_chain_is_resolved);
+	virtual std::string to_member_reference(uint32_t base, const SPIRType &type, uint32_t index,
+	                                        bool ptr_chain_is_resolved);
 	std::string to_multi_member_reference(const SPIRType &type, const SmallVector<uint32_t> &indices);
-	std::string type_to_glsl_constructor(const SPIRType &type);
+	virtual std::string type_to_glsl_constructor(const SPIRType &type);
 	std::string argument_decl(const SPIRFunction::Parameter &arg);
 	virtual std::string to_qualifiers_glsl(uint32_t id);
 	void fixup_io_block_patch_primitive_qualifiers(const SPIRVariable &var);
@@ -853,8 +864,8 @@ class CompilerGLSL : public Compiler
 	std::string layout_for_variable(const SPIRVariable &variable);
 	std::string to_combined_image_sampler(VariableID image_id, VariableID samp_id);
 	virtual bool skip_argument(uint32_t id) const;
-	virtual bool emit_array_copy(const char *expr, uint32_t lhs_id, uint32_t rhs_id,
-	                             StorageClass lhs_storage, StorageClass rhs_storage);
+	virtual bool emit_array_copy(const char *expr, uint32_t lhs_id, uint32_t rhs_id, StorageClass lhs_storage,
+	                             StorageClass rhs_storage);
 	virtual void emit_block_hints(const SPIRBlock &block);
 	virtual std::string to_initializer_expression(const SPIRVariable &var);
 	virtual std::string to_zero_initialized_expression(uint32_t type_id);
@@ -863,8 +874,7 @@ class CompilerGLSL : public Compiler
 	bool buffer_is_packing_standard(const SPIRType &type, BufferPackingStandard packing,
 	                                uint32_t *failed_index = nullptr, uint32_t start_offset = 0,
 	                                uint32_t end_offset = ~(0u));
-	std::string buffer_to_packing_standard(const SPIRType &type,
-	                                       bool support_std430_without_scalar_layout,
+	std::string buffer_to_packing_standard(const SPIRType &type, bool support_std430_without_scalar_layout,
 	                                       bool support_enhanced_layouts);
 
 	uint32_t type_to_packed_base_size(const SPIRType &type, BufferPackingStandard packing);
diff --git a/spirv_opencl.cpp b/spirv_opencl.cpp
index 68c447aa3..522ba7d92 100644
--- a/spirv_opencl.cpp
+++ b/spirv_opencl.cpp
@@ -25,7 +25,6 @@
 #include "GLSL.std.450.h"
 
 #include <algorithm>
-#include <iostream>
 #include <utility>
 
 using namespace SPIRV_CROSS_SPV_HEADER_NAMESPACE;
@@ -59,6 +58,18 @@ string CompilerOpenCL::compile()
 
 	ir.fixup_reserved_names();
 
+	// Rename WorkgroupSize spec constants to "spvWorkgroupSize" so that builtin_to_glsl
+	// and the emitted constant declaration both refer to the same identifier.
+	ir.for_each_typed_id<SPIRConstant>(
+	    [&](uint32_t id, SPIRConstant &c)
+	    {
+		    if (c.specialization && has_decoration(c.self, DecorationBuiltIn) &&
+		        BuiltIn(get_decoration(c.self, DecorationBuiltIn)) == BuiltInWorkgroupSize)
+		    {
+			    ir.set_name(id, "spvWorkgroupSize");
+		    }
+	    });
+
 	options.vulkan_semantics = true;
 	options.es = false;
 	options.version = 450;
@@ -82,7 +93,7 @@ string CompilerOpenCL::compile()
 	backend.use_typed_initializer_list = true;
 	backend.native_row_major_matrix = false;
 	backend.unsized_array_supported = false;
-	backend.can_declare_arrays_inline = false;
+	backend.can_declare_arrays_inline = true;
 	backend.allow_truncated_access_chain = true;
 	backend.comparison_image_samples_scalar = true;
 	backend.native_pointers = true;
@@ -160,9 +171,294 @@ const char *CompilerOpenCL::to_storage_qualifiers_glsl(const SPIRVariable &)
 	return "";
 }
 
+void CompilerOpenCL::compute_kernel_resources()
+{
+	// Builds the per-helper-function resource lists. Step 1: SSBOs/BufferBlocks flattened to __global T* kernel params.
+	flattened_buffer_vars.clear();
+	flattened_var_type_decl.clear();
+
+	ir.for_each_typed_id<SPIRVariable>(
+	    [&](uint32_t var_id, SPIRVariable &var)
+	    {
+		    auto &type = get_variable_data_type(var);
+		    if (var.storage == StorageClassStorageBuffer || has_decoration(type.self, DecorationBufferBlock))
+		    {
+			    Bitset flags = ir.get_buffer_block_flags(var);
+			    bool is_readonly = flags.get(DecorationNonWritable);
+
+			    // Pointee type: a one-member struct uses its member's type (same logic as entry_point_args).
+			    string subtype;
+			    if (type.basetype == SPIRType::Struct && type.member_types.size() == 1)
+			    {
+				    const auto &member0_type = get<SPIRType>(type.member_types.front());
+				    subtype = type_to_glsl(member0_type);
+			    }
+			    else
+			    {
+				    subtype = type_to_glsl(type);
+			    }
+
+			    flattened_buffer_vars.insert(var_id);
+			    flattened_var_type_decl[var_id] = join("__global ", is_readonly ? "const " : "", subtype, "* ");
+		    }
+	    });
+
+	// Step 2: for each non-entry function, find which flattened buffer vars it reaches (directly or via callees).
+	func_flattened_args.clear();
+
+	// First pass: direct accesses. Every operand word is tested, so a literal that equals a var ID also matches.
+	unordered_map<uint32_t, unordered_set<uint32_t>> direct_accesses;
+	ir.for_each_typed_id<SPIRFunction>(
+	    [&](uint32_t func_id, SPIRFunction &func)
+	    {
+		    if (func_id == ir.default_entry_point)
+			    return;
+
+		    auto &accessed = direct_accesses[func_id];
+		    for (auto block_id : func.blocks)
+		    {
+			    auto &block = get<SPIRBlock>(block_id);
+			    for (auto &insn : block.ops)
+			    {
+				    const uint32_t *ops = stream(insn);
+				    for (uint32_t i = 0; i < insn.length; i++)
+				    {
+					    if (flattened_buffer_vars.count(ops[i]))
+						    accessed.insert(ops[i]);
+				    }
+			    }
+		    }
+	    });
+
+	// Second pass: propagate callee sets into callers via OpFunctionCall until nothing changes.
+	bool changed = true;
+	while (changed)
+	{
+		changed = false;
+		ir.for_each_typed_id<SPIRFunction>(
+		    [&](uint32_t func_id, SPIRFunction &func)
+		    {
+			    if (func_id == ir.default_entry_point)
+				    return;
+			    auto &my_accesses = direct_accesses[func_id];
+			    for (auto block_id : func.blocks)
+			    {
+				    auto &block = get<SPIRBlock>(block_id);
+				    for (auto &insn : block.ops)
+				    {
+					    if (static_cast<Op>(insn.op) == OpFunctionCall)
+					    {
+						    const uint32_t *ops = stream(insn);
+						    uint32_t callee_id = ops[2];
+						    if (callee_id != ir.default_entry_point && direct_accesses.count(callee_id))
+						    {
+							    for (auto var_id : direct_accesses[callee_id])
+							    {
+								    if (!my_accesses.count(var_id))
+								    {
+									    my_accesses.insert(var_id);
+									    changed = true;
+								    }
+							    }
+						    }
+					    }
+				    }
+			    }
+		    });
+	}
+
+	// Convert the non-empty sets to vectors sorted by ascending var ID so the ordering is stable.
+	for (auto &kv : direct_accesses)
+	{
+		if (!kv.second.empty())
+		{
+			SmallVector<uint32_t> sorted;
+			for (auto var_id : kv.second)
+				sorted.push_back(var_id);
+			std::sort(sorted.begin(), sorted.end());
+			func_flattened_args[kv.first] = sorted;
+		}
+	}
+
+	// Collect candidates for threading into helper functions: Workgroup and Private variables (pointer params),
+	// UBO / push-constant structs (value params) and Input builtins (__private pointer params). In OpenCL C 1.2
+	// Workgroup/Private variables cannot be at file scope, so they are declared in the kernel body instead.
+	workgroup_var_ptr_type.clear();
+	workgroup_scalar_vars.clear();
+	threaded_input_builtins.clear();
+
+	unordered_set<uint32_t> threadable_vars;
+	ir.for_each_typed_id<SPIRVariable>(
+	    [&](uint32_t var_id, SPIRVariable &var)
+	    {
+		    if (var.storage == StorageClassWorkgroup || var.storage == StorageClassPrivate)
+		    {
+			    auto &type = get_variable_data_type(var);
+			    bool is_array = !type.array.empty();
+			    bool is_workgroup = (var.storage == StorageClassWorkgroup);
+
+			    // Determine element/base type for the pointer parameter.
+			    string elem_type_str;
+			    if (is_array)
+			    {
+				    // Strip outermost array dimension to get element type.
+				    auto elem_type = type;
+				    elem_type.array.pop_back();
+				    if (!elem_type.array_size_literal.empty())
+					    elem_type.array_size_literal.pop_back();
+				    elem_type_str = type_to_glsl(elem_type);
+			    }
+			    else
+			    {
+				    elem_type_str = type_to_glsl(type);
+			    }
+
+			    string addr_space = is_workgroup ? "__local " : "";
+			    workgroup_var_ptr_type[var_id] = addr_space + elem_type_str + "*";
+			    if (!is_array)
+				    workgroup_scalar_vars.insert(var_id);
+
+			    threadable_vars.insert(var_id);
+		    }
+		    // UBO (Uniform + Block) and PushConstant variables become kernel params.
+		    // Helper functions can't see them, so they must be threaded as value params.
+		    else if (var.storage == StorageClassPushConstant ||
+		             (var.storage == StorageClassUniform && !is_hidden_variable(var) &&
+		              has_decoration(get_variable_data_type(var).self, DecorationBlock) &&
+		              !has_decoration(get_variable_data_type(var).self, DecorationBufferBlock)))
+		    {
+			    auto &type = get_variable_data_type(var);
+			    if (type.basetype == SPIRType::Struct)
+			    {
+				    // Pass by value — no pointer, no #define trick needed.
+				    workgroup_var_ptr_type[var_id] = type_to_glsl(type);
+				    // NOT added to workgroup_scalar_vars (no #define needed — pass by value)
+				    threadable_vars.insert(var_id);
+			    }
+		    }
+		    // Input builtin variables (gl_GlobalInvocationID, etc.) accessed in non-entry functions
+		    // need to be materialized as __private local variables and threaded as pointers.
+		    else if (var.storage == StorageClassInput && has_decoration(var_id, DecorationBuiltIn))
+		    {
+			    auto &type = get_variable_data_type(var);
+			    workgroup_var_ptr_type[var_id] = join("__private ", type_to_glsl(type), "*");
+			    workgroup_scalar_vars.insert(var_id);
+			    threadable_vars.insert(var_id);
+			    auto builtin = BuiltIn(get_decoration(var_id, DecorationBuiltIn));
+			    threaded_input_builtins[static_cast<uint32_t>(builtin)] = var_id;
+		    }
+	    });
+
+	// Direct accesses of threadable vars in non-entry functions (same operand-word scan as for the buffers above).
+	unordered_map<uint32_t, unordered_set<uint32_t>> wg_direct;
+	ir.for_each_typed_id<SPIRFunction>(
+	    [&](uint32_t func_id, SPIRFunction &func)
+	    {
+		    if (func_id == ir.default_entry_point)
+			    return;
+		    auto &accessed = wg_direct[func_id];
+		    for (auto block_id : func.blocks)
+		    {
+			    auto &block = get<SPIRBlock>(block_id);
+			    for (auto &insn : block.ops)
+			    {
+				    const uint32_t *ops = stream(insn);
+				    for (uint32_t i = 0; i < insn.length; i++)
+				    {
+					    if (threadable_vars.count(ops[i]))
+						    accessed.insert(ops[i]);
+				    }
+			    }
+		    }
+	    });
+
+	// Transitively propagate callee sets into callers until a fixed point is reached.
+	changed = true;
+	while (changed)
+	{
+		changed = false;
+		ir.for_each_typed_id<SPIRFunction>(
+		    [&](uint32_t func_id, SPIRFunction &func)
+		    {
+			    if (func_id == ir.default_entry_point)
+				    return;
+			    auto &my = wg_direct[func_id];
+			    for (auto block_id : func.blocks)
+			    {
+				    auto &block = get<SPIRBlock>(block_id);
+				    for (auto &insn : block.ops)
+				    {
+					    if (static_cast<Op>(insn.op) == OpFunctionCall)
+					    {
+						    const uint32_t *ops = stream(insn);
+						    uint32_t callee_id = ops[2];
+						    if (callee_id != ir.default_entry_point && wg_direct.count(callee_id))
+						    {
+							    for (auto var_id : wg_direct[callee_id])
+							    {
+								    if (!my.count(var_id))
+								    {
+									    my.insert(var_id);
+									    changed = true;
+								    }
+							    }
+						    }
+					    }
+				    }
+			    }
+		    });
+	}
+
+	// Convert the non-empty sets to vectors sorted by ascending var ID.
+	func_workgroup_args.clear();
+	for (auto &kv : wg_direct)
+	{
+		if (!kv.second.empty())
+		{
+			SmallVector<uint32_t> sorted;
+			for (auto var_id : kv.second)
+				sorted.push_back(var_id);
+			std::sort(sorted.begin(), sorted.end());
+			func_workgroup_args[kv.first] = sorted;
+		}
+	}
+}
+
 void CompilerOpenCL::emit_resources()
 {
 	replace_illegal_names();
+	compute_kernel_resources();
+
+	// Polyfills for GLSL packHalf2x16 / unpackHalf2x16, each emitted only when its flag is set.
+	// OpenCL C has vstore_half / vload_half which convert float <-> half in memory.
+	// These flags are set by emit_glsl_op and trigger a recompile on the next pass.
+	if (needs_half_pack_polyfill)
+	{
+		statement("uint spvPackHalf2x16(float2 v) {");
+		statement("    uint r;");
+		statement("    vstore_half(v.x, 0, (__private half *)&r);");
+		statement("    vstore_half(v.y, 1, (__private half *)&r);");
+		statement("    return r;");
+		statement("}");
+		statement("");
+	}
+	if (needs_half_unpack_polyfill)
+	{
+		statement("float2 spvUnpackHalf2x16(uint u) {");
+		statement("    const __private uint *p = &u;");
+		statement("    return (float2)(vload_half(0, (const __private half *)p),");
+		statement("                   vload_half(1, (const __private half *)p));");
+		statement("}");
+		statement("");
+	}
+
+	// Default sampler for combined image+sampler usage (OpenCL C requires file-scope const sampler_t).
+	if (needs_default_sampler)
+	{
+		statement("const sampler_t spvDefaultSampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | "
+		          "CLK_FILTER_NEAREST;");
+		statement("");
+	}
 }
 
 void CompilerOpenCL::replace_illegal_names()
@@ -247,7 +543,7 @@ void CompilerOpenCL::replace_illegal_names()
 		"quad8",
 		"quad16",
 		"complex",
-		"imaginary"
+		"imaginary",
 		"__global",
 		"global",
 		"__local",
@@ -329,50 +625,154 @@ void CompilerOpenCL::emit_entry_point_declarations()
 	if (execution.model != ExecutionModelGLCompute)
 		return;
 
-	bool need_workgroup_id = active_input_builtins.get(BuiltInWorkgroupId);
-	bool need_local_id = active_input_builtins.get(BuiltInLocalInvocationId);
-	bool need_global_id = active_input_builtins.get(BuiltInGlobalInvocationId);
-	bool need_num_workgroups = active_input_builtins.get(BuiltInNumWorkgroups);
+	// Builtins are emitted as inline calls by builtin_to_glsl, so the only local needed here is
+	// spvWorkgroupSize, and only when there is no spec-constant version (which lives at file scope).
+	// Check whether there is a specialization constant decorated BuiltInWorkgroupSize.
+	bool has_spec_workgroup_size = false;
+	ir.for_each_typed_id<SPIRConstant>(
+	    [&](uint32_t, const SPIRConstant &c)
+	    {
+		    if (c.specialization && has_decoration(c.self, DecorationBuiltIn) &&
+		        BuiltIn(get_decoration(c.self, DecorationBuiltIn)) == BuiltInWorkgroupSize)
+			    has_spec_workgroup_size = true;
+	    });
+
 	bool need_workgroup_size = active_input_builtins.get(BuiltInWorkgroupSize);
-	bool need_local_invocation_index = active_input_builtins.get(BuiltInLocalInvocationIndex);
-	bool need_global_size = active_input_builtins.get(BuiltInGlobalSize);
-
-	if (need_workgroup_id)
-		statement("uint3 spvWorkgroupId = (uint3)(get_group_id(0), get_group_id(1), get_group_id(2));");
-	if (need_local_id)
-		statement("uint3 spvLocalInvocationId = (uint3)(get_local_id(0), get_local_id(1), get_local_id(2));");
-	if (need_global_id)
-		statement("uint3 spvGlobalInvocationId = (uint3)(get_global_id(0), get_global_id(1), get_global_id(2));");
-	if (need_num_workgroups)
-		statement("uint3 spvNumWorkgroups = (uint3)(get_num_groups(0), get_num_groups(1), get_num_groups(2));");
-	if (need_workgroup_size)
+	if (!need_workgroup_size)
+	{
+		ir.for_each_typed_id<SPIRConstant>(
+		    [&](uint32_t, const SPIRConstant &c)
+		    {
+			    if (has_decoration(c.self, DecorationBuiltIn) &&
+			        BuiltIn(get_decoration(c.self, DecorationBuiltIn)) == BuiltInWorkgroupSize)
+				    need_workgroup_size = true;
+		    });
+	}
+
+	// Only emit the kernel-local spvWorkgroupSize variable when there is no file-scope spec constant.
+	// When a spec constant exists, it is already emitted as a file-scope `constant uint3 spvWorkgroupSize`.
+	if (need_workgroup_size && !has_spec_workgroup_size)
 		statement("uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));");
-	if (need_local_invocation_index)
-		statement("uint spvLocalInvocationIndex = get_local_id(2) * get_local_size(0) * get_local_size(1) + "
-		          "get_local_id(1) * get_local_size(0) + get_local_id(0);");
-	if (need_global_size)
-		statement("uint3 spvGlobalSize = (uint3)(get_global_size(0), get_global_size(1), get_global_size(2));");
+
+	// Emit __local declarations for workgroup (shared) variables.
+	// In OpenCL C 1.x, __local variables must be declared inside kernel functions.
+	ir.for_each_typed_id<SPIRVariable>(
+	    [&](uint32_t, SPIRVariable &var)
+	    {
+		    if (var.storage == StorageClassWorkgroup && !is_hidden_variable(var))
+		    {
+			    auto &type = get_variable_data_type(var);
+			    statement("__local ", variable_decl(type, to_name(var.self), var.self), ";");
+		    }
+	    });
+
+	// Emit private global variables as kernel-local variables.
+	// OpenCL C 1.x doesn't support __private file-scope variables, so we move them inside.
+	for (auto global : global_variables)
+	{
+		auto &var = get<SPIRVariable>(global);
+		if (var.storage == StorageClassPrivate && !is_hidden_variable(var, true))
+		{
+			add_local_variable_name(var.self);
+			string initializer;
+			if (var.initializer)
+				initializer = join(" = ", to_expression(var.initializer));
+			statement(CompilerGLSL::variable_decl(var), initializer, ";");
+		}
+	}
+
+	// Materialize Input builtin variables as local variables.
+	// In OpenCL C, builtins like get_global_id() are function calls, not variables.
+	// When code needs variable pointers to these builtins (either threaded to non-entry
+	// functions or used in pointer select within the entry point), we must create
+	// actual __private local variables.
+	//
+	// Collect all builtins that need materialization: union of threaded and entry-point sets.
+	unordered_map<uint32_t, uint32_t> builtins_to_materialize;
+	for (auto &kv : threaded_input_builtins)
+	{
+		auto var_id = kv.second;
+		bool actually_threaded = false;
+		for (auto &fa : func_workgroup_args)
+		{
+			for (auto vid : fa.second)
+			{
+				if (vid == var_id)
+				{
+					actually_threaded = true;
+					break;
+				}
+			}
+			if (actually_threaded)
+				break;
+		}
+		if (actually_threaded)
+			builtins_to_materialize[kv.first] = kv.second;
+	}
+	for (auto &kv : entry_point_materialized_builtins)
+		builtins_to_materialize[kv.first] = kv.second;
+
+	// Use a guard flag so builtin_to_glsl returns the function call form (not the variable name).
+	emitting_builtin_materialization = true;
+	for (auto &kv : builtins_to_materialize)
+	{
+		auto var_id = kv.second;
+		auto &type = get_variable_data_type(get<SPIRVariable>(var_id));
+		auto builtin = BuiltIn(kv.first);
+		string rhs = builtin_to_glsl(builtin, StorageClassInput);
+		// Builtins return uint3 but the SPIR-V variable may be declared as int3.
+		string var_type_str = type_to_glsl(type);
+		if (type.basetype == SPIRType::Int && type.vecsize == 3)
+			rhs = join("as_int3(", rhs, ")");
+		else if (type.basetype == SPIRType::Int && type.vecsize == 2)
+			rhs = join("as_int2(", rhs, ")");
+		statement(var_type_str, " ", to_name(var_id), " = ", rhs, ";");
+	}
+	emitting_builtin_materialization = false;
 }
 
 string CompilerOpenCL::builtin_to_glsl(BuiltIn builtin, StorageClass storage)
 {
 	(void)storage;
+	if (!emitting_builtin_materialization)
+	{
+		auto key = static_cast<uint32_t>(builtin);
+		// If this builtin is threaded as a pointer param to non-entry functions,
+		// return the variable name so the #define macro can dereference it.
+		if (!processing_entry_point)
+		{
+			auto it = threaded_input_builtins.find(key);
+			if (it != threaded_input_builtins.end())
+				return to_name(it->second);
+		}
+		// If this builtin is materialized as a local variable in the entry point,
+		// return the variable name so that &var_name gives a valid lvalue pointer.
+		if (processing_entry_point)
+		{
+			auto it = entry_point_materialized_builtins.find(key);
+			if (it != entry_point_materialized_builtins.end())
+				return to_name(it->second);
+		}
+	}
 	switch (builtin)
 	{
 	case BuiltInWorkgroupId:
-		return "spvWorkgroupId";
+		return "((uint3)(get_group_id(0), get_group_id(1), get_group_id(2)))";
 	case BuiltInLocalInvocationId:
-		return "spvLocalInvocationId";
+		return "((uint3)(get_local_id(0), get_local_id(1), get_local_id(2)))";
 	case BuiltInGlobalInvocationId:
-		return "spvGlobalInvocationId";
+		return "((uint3)(get_global_id(0), get_global_id(1), get_global_id(2)))";
 	case BuiltInNumWorkgroups:
-		return "spvNumWorkgroups";
+		return "((uint3)(get_num_groups(0), get_num_groups(1), get_num_groups(2)))";
 	case BuiltInWorkgroupSize:
+		// spvWorkgroupSize is either a kernel-local variable or a file-scope spec constant;
+		// both are named "spvWorkgroupSize" so returning this name works in both cases.
 		return "spvWorkgroupSize";
 	case BuiltInLocalInvocationIndex:
-		return "spvLocalInvocationIndex";
+		return "((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) "
+		       "+ get_local_id(0)))";
 	case BuiltInGlobalSize:
-		return "spvGlobalSize";
+		return "((uint3)(get_global_size(0), get_global_size(1), get_global_size(2)))";
 	case BuiltInNumSubgroups:
 	case BuiltInSubgroupId:
 	case BuiltInSubgroupSize:
@@ -396,7 +796,7 @@ string CompilerOpenCL::get_variable_address_space(const SPIRVariable &argument)
 	return get_type_address_space(type, argument.self, true);
 }
 
-string CompilerOpenCL::get_type_address_space(const SPIRType &type, uint32_t id, bool argument)
+string CompilerOpenCL::get_type_address_space(const SPIRType &type, uint32_t id, bool)
 {
 	// This can be called for variable pointer contexts as well, so be very careful about which method we choose.
 	Bitset flags;
@@ -429,6 +829,10 @@ string CompilerOpenCL::get_type_address_space(const SPIRType &type, uint32_t id,
 	case StorageClassWorkgroup:
 		addr_space = "__local";
 		break;
+	case StorageClassInput:
+		// Input builtins materialized as __private local variables.
+		addr_space = "__private";
+		break;
 	default:
 		// __private is default and would be redundant
 		break;
@@ -532,7 +936,9 @@ string CompilerOpenCL::type_to_glsl(const SPIRType &type, uint32_t id, bool memb
 
 	// Scalars
 	case SPIRType::Boolean:
-		type_name = "bool";
+		// OpenCL C has no bool vector types (bool2/bool4 etc.). Map bool vectors to int.
+		// Scalar bool is fine, but vector bool must be int (comparisons return intN in OpenCL).
+		type_name = (type.vecsize > 1) ? "int" : "bool";
 		break;
 
 	case SPIRType::Char:
@@ -590,13 +996,40 @@ string CompilerOpenCL::type_to_glsl(const SPIRType &type, uint32_t id)
 
 string CompilerOpenCL::image_type_glsl(const SPIRType &type, uint32_t id, bool member)
 {
-	(void)id;
 	(void)member;
-	if (type.basetype != SPIRType::Image)
+	if (type.basetype != SPIRType::Image && type.basetype != SPIRType::SampledImage)
 		return "";
 
-	bool readonly = type.image.sampled != 2;
-	const char *access = readonly ? "read_only" : "write_only";
+	// Determine access qualifier.
+	// SampledImage or sampled==1 means the image is used with a sampler (read-only).
+	// sampled==2 means storage image (check decorations for read/write).
+	const char *access;
+	if (type.basetype == SPIRType::SampledImage || type.image.sampled == 1)
+	{
+		access = "read_only";
+	}
+	else
+	{
+		auto *var = (id != 0) ? maybe_get<SPIRVariable>(id) : nullptr;
+		if (var)
+		{
+			bool non_readable = has_decoration(id, DecorationNonReadable);
+			bool non_writable = has_decoration(id, DecorationNonWritable);
+			if (non_readable)
+				access = "write_only";
+			else if (non_writable)
+				access = "read_only";
+			else if (opencl_options.opencl_version >= 200)
+				access = "read_write";
+			else
+				access = "write_only"; // OCL 1.2: default to write_only when no decoration
+		}
+		else
+		{
+			access = "write_only";
+		}
+	}
+
 	switch (type.image.dim)
 	{
 	case Dim1D:
@@ -614,102 +1047,390 @@ string CompilerOpenCL::image_type_glsl(const SPIRType &type, uint32_t id, bool m
 	}
 }
 
+// Byte stride of one element of the given type. Modelled on CompilerMSL::get_physical_type_id_stride
+// so that OpPtrAccessChain (used by VariablePointers) does not throw on the OpenCL backend.
+uint32_t CompilerOpenCL::get_physical_type_id_stride(TypeID type_id) const
+{
+	const auto &elem_type = get<SPIRType>(type_id);
+
+	// A PhysicalStorageBuffer pointer is 64 bits wide, i.e. 8 bytes.
+	if (elem_type.pointer && elem_type.storage == StorageClassPhysicalStorageBuffer)
+		return 8u;
+
+	switch (elem_type.basetype)
+	{
+	case SPIRType::Unknown:
+	case SPIRType::Void:
+	case SPIRType::AtomicCounter:
+	case SPIRType::Image:
+	case SPIRType::SampledImage:
+	case SPIRType::Sampler:
+		SPIRV_CROSS_THROW("Querying stride of opaque type.");
+
+	case SPIRType::Struct:
+		return static_cast<uint32_t>(get_declared_struct_size(elem_type));
+
+	default:
+		break;
+	}
+
+	// Scalar / vector / matrix: components * columns * bytes per component,
+	// where a 3-component vector is padded to 4 components.
+	const uint32_t bytes_per_component = elem_type.width / 8u;
+	const uint32_t padded_vecsize = (elem_type.vecsize == 3) ? 4u : elem_type.vecsize;
+	return padded_vecsize * elem_type.columns * bytes_per_component;
+}
+
 std::string CompilerOpenCL::type_to_glsl_constructor(const SPIRType &type)
 {
 	string ret = CompilerGLSL::type_to_glsl_constructor(type);
-	printf("type_to_glsl_constructor: %s\n", ret.c_str());
 	if (!ret.empty())
 		ret = join("(", ret, ")");
 	return ret;
 }
 
-// GCC workaround of lambdas calling protected funcs
-std::string CompilerOpenCL::variable_decl(const SPIRType &type, const std::string &name, uint32_t id)
+// Replicated vector constants must use OpenCL C cast syntax, (float4)(val), instead of the
+// GLSL-style float4(val); all other constants are delegated to CompilerGLSL::constant_expression.
+std::string CompilerOpenCL::constant_expression(const SPIRConstant &c, bool inside_block_like_struct_scope,
+                                                bool inside_struct_scope)
 {
-	return CompilerGLSL::variable_decl(type, name, id);
+	const auto &const_type = get<SPIRType>(c.constant_type);
+
+	// Only replicated, non-array composites need OpenCL-specific treatment.
+	const bool is_replicated_composite = c.replicated && const_type.op != OpTypeArray;
+	if (!is_replicated_composite)
+		return CompilerGLSL::constant_expression(c, inside_block_like_struct_scope, inside_struct_scope);
+
+	// Replicated composites are built from subconstant 0 only.
+	auto element_expr = to_expression(c.subconstants[0]);
+
+	// Matrix: OpenCL C has no native matrix type and matrices are represented as their column
+	// vector type, so the replicated column expression is returned as-is.
+	if (const_type.op == OpTypeMatrix)
+		return element_expr;
+
+	// Vector: wrap the scalar in the cast-style constructor, e.g. (float4)(scalar).
+	return join(type_to_glsl_constructor(const_type), "(", element_expr, ")");
 }
 
-std::string CompilerOpenCL::entry_point_args(bool append_comma)
+// Emits a constant vector in OpenCL C cast syntax, e.g. (float4)(1.0, 2.0, 3.0, 4.0).
+// The GLSL base emits float4(1.0, 2.0, 3.0, 4.0), which is invalid in OpenCL C, so its output is rewritten.
+std::string CompilerOpenCL::constant_expression_vector(const SPIRConstant &c, uint32_t vector)
 {
-	// Reset flattening maps for this compilation pass
-	flattened_buffer_vars.clear();
-	push_const_member_map.clear();
+	string res = CompilerGLSL::constant_expression_vector(c, vector);
 
-	std::string ep_args;
+	auto type = get<SPIRType>(c.constant_type); // Deliberate copy: the local type is modified below.
+	type.columns = 1;
 
-	struct Resource
+	if (type.vecsize > 1)
 	{
-		SPIRVariable *var;
-		SPIRVariable *discrete_descriptor_alias;
-		string name;
-		SPIRType::BaseType basetype;
-		uint32_t index;
-		uint32_t plane;
-		uint32_t secondary_index;
-	};
+		// Rewrite a leading "typename(" from the base class into "(typename)("; other output is left untouched.
+		auto type_name = type_to_glsl(type);
+		if (res.size() > type_name.size() + 1 && res.substr(0, type_name.size()) == type_name &&
+		    res[type_name.size()] == '(')
+		{
+			res = "(" + type_name + ")(" + res.substr(type_name.size() + 1);
+		}
+	}
 
-	SmallVector<Resource> resources;
+	return res;
+}
 
-	ir.for_each_typed_id<SPIRVariable>(
-	    [&](uint32_t var_id, SPIRVariable &var)
-	    {
-		    auto &type = get_variable_data_type(var);
-		    /*
-		    if (var.storage == StorageClassPushConstant)
-		    {
-			    for (uint32_t mbr_idx = 0; mbr_idx < uint32_t(type.member_types.size()); mbr_idx++)
-			    {
-				    if (!ep_args.empty())
-					    ep_args += ", ";
+// Emits GLSLstd450 extended instructions, special-casing the ops whose OpenCL C form differs from GLSL.
+void CompilerOpenCL::emit_glsl_op(uint32_t result_type, uint32_t result_id, uint32_t op, const uint32_t *args,
+                                  uint32_t count)
+{
+	auto glsl_op = static_cast<GLSLstd450>(op);
 
-				    auto mbr_name = to_member_name(type, mbr_idx);
-					const auto &member_type = this->get<SPIRType>(type.member_types[mbr_idx]);
-				    ep_args += join(this->type_to_glsl(member_type), " ", mbr_name);
-				    // Record the mapping so emit_instruction can rewrite access chains
-				    push_const_member_map[var_id][mbr_idx] = mbr_name;
-			    }
-		    }
-			*/
-		    if (var.storage == StorageClassStorageBuffer || has_decoration(type.self, DecorationBufferBlock))
-		    {
-			    Bitset flags = ir.get_buffer_block_flags(var);
-			    bool is_readonly = flags.get(DecorationNonWritable);
+	switch (glsl_op)
+	{
+	case GLSLstd450Modf:
+	{
+		// OpenCL modf takes a pointer for the integer part, so args[1] is passed as &args[1]: modf(x, &iptr)
+		register_call_out_argument(args[1]);
+		forced_temporaries.insert(result_id);
+		emit_op(result_type, result_id, join("modf(", to_expression(args[0]), ", &", to_expression(args[1]), ")"),
+		        false);
+		break;
+	}
 
-			    auto to_structuredbuffer_subtype_name = [this](const SPIRType &parent_type) -> std::string
-			    {
-				    if (parent_type.basetype == SPIRType::Struct && parent_type.member_types.size() == 1)
-				    {
-					    // Use type of first struct member as a StructuredBuffer will have only one '._m0' field in SPIR-V
-					    const auto &member0_type = this->get<SPIRType>(parent_type.member_types.front());
-					    return this->type_to_glsl(member0_type);
-				    }
-				    else
-				    {
-					    // Otherwise, this StructuredBuffer only has a basic subtype, e.g. StructuredBuffer<int>
-					    return this->type_to_glsl(parent_type);
-				    }
-			    };
-			    if (!ep_args.empty())
-				    ep_args += ", ";
+	case GLSLstd450ModfStruct:
+	{
+		// Member 0 of the result receives modf's return value; member 1 is written through the pointer argument.
+		auto &type = get<SPIRType>(result_type);
+		emit_uninitialized_temporary_expression(result_type, result_id);
+		statement(to_expression(result_id), ".", to_member_name(type, 0), " = modf(", to_expression(args[0]), ", &",
+		          to_expression(result_id), ".", to_member_name(type, 1), ");");
+		break;
+	}
 
-			    ep_args += join("__global ", is_readonly ? "const " : "", to_structuredbuffer_subtype_name(type), "* ",
-			                    to_name(var_id));
-			    // Record so emit_instruction can rewrite OpAccessChain against this var
-			    flattened_buffer_vars.insert(var_id);
-		    }
-		    else if ((var.storage == StorageClassUniform || var.storage == StorageClassUniformConstant ||
-		              var.storage == StorageClassPushConstant || var.storage == StorageClassStorageBuffer) &&
-		             !is_hidden_variable(var))
-		    {
-			    switch (var.basetype)
-			    {
-			    case SPIRType::Struct:
-			    {
-				    break;
-			    }
-			    case SPIRType::Sampler:
+	// Task #14: GLSL half-precision pack/unpack map to the spvPackHalf2x16 / spvUnpackHalf2x16 polyfills.
+	// The first time either op is seen, its needs_* flag is set and a recompile is forced
+	// so that emit_resources() will emit the helper functions before they are called.
+	case GLSLstd450PackHalf2x16:
+		if (!needs_half_pack_polyfill)
+		{
+			needs_half_pack_polyfill = true;
+			force_recompile();
+		}
+		emit_unary_func_op(result_type, result_id, args[0], "spvPackHalf2x16");
+		break;
+	case GLSLstd450UnpackHalf2x16:
+		if (!needs_half_unpack_polyfill)
+		{
+			needs_half_unpack_polyfill = true;
+			force_recompile();
+		}
+		emit_unary_func_op(result_type, result_id, args[0], "spvUnpackHalf2x16");
+		break;
+
+	default: // Every other op keeps the CompilerGLSL implementation.
+		CompilerGLSL::emit_glsl_op(result_type, result_id, op, args, count);
+		break;
+	}
+}
+
+// Task #8: Returns the OpenCL C as_TYPE() function name used to bitcast in_type to out_type,
+// e.g. int4 -> uint4 yields "as_uint4" (not uint4()). Empty when both base types already match.
+std::string CompilerOpenCL::bitcast_glsl_op(const SPIRType &out_type, const SPIRType &in_type)
+{
+	const bool same_basetype = out_type.basetype == in_type.basetype;
+
+	// All bitcasts (float<->int, int<->uint, half<->short, etc.) use as_TYPE() in OpenCL C.
+	// type_to_glsl supplies the full target type name including vector size
+	// (e.g. "float4", "uint"), which is appended to the "as_" prefix.
+	if (!same_basetype)
+		return join("as_", type_to_glsl(out_type));
+	return std::string();
+}
+
+// Task #7: OpenCL C atomic functions take a pointer argument; this returns the pointer
+// expression for the atomic operand 'id'. Access chains and Workgroup / StorageBuffer / Uniform
+// variables are prefixed with '&'; any other id is emitted as its plain expression.
+// NOTE(review): '&' is applied to every access chain, subscripted or not - confirm intended.
+std::string CompilerOpenCL::to_atomic_ptr_expression(uint32_t id)
+{
+	const auto *chain_expr = maybe_get<SPIRExpression>(id);
+	const bool is_access_chain = chain_expr != nullptr && chain_expr->access_chain;
+	if (is_access_chain)
+	{
+		// Ids in subscripted_deref_exprs are access chains emitted as C values (e.g. _48[0]);
+		// their expression is parenthesized before the address is taken: &(_48[0]).
+		// Other access chains (e.g. ssbo->u32) get the '&' directly in front.
+		const bool is_subscripted = subscripted_deref_exprs.count(id) != 0;
+		const auto target = to_expression(id);
+		return is_subscripted ? join("&(", target, ")") : join("&", target);
+	}
+
+	// A variable used directly as the atomic operand (e.g. shared_u32, a workgroup variable)
+	// is an lvalue in C, so '&' is needed to obtain a pointer.
+	const auto *var = maybe_get<SPIRVariable>(id);
+	const bool is_addressable_var =
+	    var != nullptr && (var->storage == StorageClassWorkgroup ||
+	                       var->storage == StorageClassStorageBuffer || var->storage == StorageClassUniform);
+	auto operand = to_expression(id);
+	if (is_addressable_var)
+		return join("&", operand);
+	return operand;
+}
+
+// Task #3: Picks "->member" or ".member" for a struct member access, since OpenCL C needs ->
+// when the base is a pointer. ptr_chain_is_resolved == false marks the first access from the base.
+std::string CompilerOpenCL::to_member_reference(uint32_t base, const SPIRType &type, uint32_t index,
+                                                bool ptr_chain_is_resolved)
+{
+	const auto member_name = to_member_name(type, index);
+
+	// Later links of a chain, and bases recorded in subscripted_deref_exprs, use '.'.
+	if (ptr_chain_is_resolved || subscripted_deref_exprs.count(base) != 0)
+		return join(".", member_name);
+
+	// A base whose type is not a pointer (e.g. a loaded struct value) uses '.' as well.
+	const auto &base_type = expression_type(base);
+	if (!is_pointer(base_type))
+		return join(".", member_name);
+
+	switch (base_type.storage)
+	{
+	case StorageClassFunction:
+	case StorageClassPrivate:
+	{
+		// Only real function parameters (out/inout, declared as __private T*) are pointers;
+		// regular local variables are emitted as value types.
+		const auto *var = maybe_get<SPIRVariable>(base);
+		return join((var != nullptr && var->parameter != nullptr) ? "->" : ".", member_name);
+	}
+	case StorageClassStorageBuffer:
+	case StorageClassCrossWorkgroup:
+		// SSBOs / __global pointers are always dereferenced with ->.
+		return join("->", member_name);
+
+	default:
+		// Workgroup (__local) variables and Uniform blocks (UBOs) are emitted by value: '.'.
+		return join(".", member_name);
+	}
+}
+
+// Task #4: Emits "typedef struct X X;" after each struct so it can be named without 'struct' in OpenCL C.
+void CompilerOpenCL::emit_struct(SPIRType &type)
+{
+	// Mirror the base class's early return for aliases to know whether a struct gets emitted:
+	// an alias is only emitted when its target carries the BufferBlockRepacked decoration.
+	const bool is_alias = type.type_alias != TypeID(0);
+	const bool base_emits_struct =
+	    !is_alias || has_extended_decoration(type.type_alias, SPIRVCrossDecorationBufferBlockRepacked);
+
+	CompilerGLSL::emit_struct(type);
+	if (!base_emits_struct)
+		return;
+	const auto struct_name = to_name(type.self);
+	statement("typedef struct ", struct_name, " ", struct_name, ";");
+	statement("");
+}
+
+// Plain forwarder to CompilerGLSL::variable_decl; a GCC workaround for lambdas calling protected functions.
+std::string CompilerOpenCL::variable_decl(const SPIRType &type, const std::string &name, uint32_t id)
+{
+	return CompilerGLSL::variable_decl(type, name, id);
+}
+
+// OpenCL C does not support function overloading. The GLSL base class lets two functions that
+// share a name but differ in signature (different type hashes) keep that name, because GLSL
+// allows overloads. This override renames the function whenever that happens.
+void CompilerOpenCL::add_function_overload(const SPIRFunction &func)
+{
+	// Base-class bookkeeping runs first and may already assign a new name.
+	CompilerGLSL::add_function_overload(func);
+
+	// function_overloads maps a function name to the set of type hashes registered under it.
+	// More than one hash under this function's (possibly just reassigned) name means that
+	// differently-typed functions share it, which GLSL accepts but C does not.
+	const auto registered = function_overloads.find(to_name(func.self));
+	const bool name_is_shared =
+	    registered != end(function_overloads) && registered->second.size() > 1;
+	if (!name_is_shared)
+		return;
+
+	// Rename this function, then record the new name in function_overloads. The inserted 0
+	// is a sentinel, not a real type hash.
+	add_resource_name(func.self);
+	function_overloads[to_name(func.self)].insert(0); // sentinel
+}
+
+// Out/inout function parameters (pointer types in SPIR-V) are emitted as '__private T *param',
+// so the call site has to pass '&arg'. Flattened buffer variables, which are pointers already,
+// are cast to the callee's pointer type instead.
+std::string CompilerOpenCL::to_func_call_arg(const SPIRFunction::Parameter &callee_param, uint32_t id)
+{
+	const auto &callee_type = expression_type(callee_param.id);
+	const bool callee_is_pointer = is_pointer(callee_type);
+
+	// Function-storage pointer parameter (out/inout): pass the address of the argument.
+	if (callee_is_pointer && callee_type.storage == StorageClassFunction)
+		return join("&", to_expression(id));
+
+	// Flattened buffer vars are already pointers (__global T*); their address must not be
+	// taken when the callee expects a StorageBuffer or Uniform pointer.
+	const bool callee_wants_buffer_ptr =
+	    callee_is_pointer &&
+	    (callee_type.storage == StorageClassStorageBuffer || callee_type.storage == StorageClassUniform);
+	const bool arg_is_flattened_ptr = flattened_buffer_vars.count(id) != 0 && is_pointer(expression_type(id));
+
+	if (arg_is_flattened_ptr && callee_wants_buffer_ptr)
+	{
+		// The flattened var is __global T* while the callee expects __global struct_type*,
+		// so the argument is cast to the parameter's type.
+		const auto callee_type_name = type_to_glsl(callee_type);
+		return join("(", callee_type_name, ")", to_expression(id));
+	}
+
+	// Everything else is handled by the GLSL backend.
+	return CompilerGLSL::to_func_call_arg(callee_param, id);
+}
+
+std::string CompilerOpenCL::entry_point_args(bool append_comma)
+{
+	// Note: flattened_buffer_vars is already populated by compute_kernel_resources() in emit_resources().
+	// Only reset push_const_member_map here.
+	push_const_member_map.clear();
+
+	std::string ep_args;
+
+	struct Resource
+	{
+		SPIRVariable *var;
+		SPIRVariable *discrete_descriptor_alias;
+		string name;
+		SPIRType::BaseType basetype;
+		uint32_t index;
+		uint32_t plane;
+		uint32_t secondary_index;
+	};
+
+	SmallVector<Resource> resources;
+
+	ir.for_each_typed_id<SPIRVariable>(
+	    [&](uint32_t var_id, SPIRVariable &var)
+	    {
+		    auto &type = get_variable_data_type(var);
+		    // Push constants: emit as struct value parameter.
+		    if (var.storage == StorageClassPushConstant)
+		    {
+			    if (!ep_args.empty())
+				    ep_args += ", ";
+			    ep_args += join(type_to_glsl(type), " ", to_name(var_id));
+		    }
+		    else if (var.storage == StorageClassStorageBuffer || has_decoration(type.self, DecorationBufferBlock))
+		    {
+			    Bitset flags = ir.get_buffer_block_flags(var);
+			    bool is_readonly = flags.get(DecorationNonWritable);
+
+			    auto to_structuredbuffer_subtype_name = [this](const SPIRType &parent_type) -> std::string
+			    {
+				    if (parent_type.basetype == SPIRType::Struct && parent_type.member_types.size() == 1)
+				    {
+					    // Use type of first struct member as a StructuredBuffer will have only one '._m0' field in SPIR-V
+					    const auto &member0_type = this->get<SPIRType>(parent_type.member_types.front());
+					    return this->type_to_glsl(member0_type);
+				    }
+				    else
+				    {
+					    // Otherwise, this StructuredBuffer only has a basic subtype, e.g. StructuredBuffer<int>
+					    return this->type_to_glsl(parent_type);
+				    }
+			    };
+			    if (!ep_args.empty())
+				    ep_args += ", ";
+
+			    ep_args += join("__global ", is_readonly ? "const " : "", to_structuredbuffer_subtype_name(type), "* ",
+			                    to_name(var_id));
+			    // Record so emit_instruction can rewrite OpAccessChain against this var
+			    flattened_buffer_vars.insert(var_id);
+		    }
+		    else if ((var.storage == StorageClassUniform || var.storage == StorageClassUniformConstant ||
+		              var.storage == StorageClassPushConstant || var.storage == StorageClassStorageBuffer) &&
+		             !is_hidden_variable(var))
+		    {
+			    switch (type.basetype)
+			    {
+			    case SPIRType::Struct:
+			    {
+				    // UBO (Uniform + Block): emit as value parameter
+				    if (var.storage == StorageClassUniform && has_decoration(type.self, DecorationBlock))
+				    {
+					    if (!ep_args.empty())
+						    ep_args += ", ";
+					    ep_args += join(type_to_glsl(type), " ", to_name(var_id));
+				    }
+				    break;
+			    }
+			    case SPIRType::Sampler:
+				    if (!ep_args.empty())
+					    ep_args += ", ";
+				    ep_args += "sampler_t " + to_name(var_id);
 				    break;
 			    case SPIRType::Image:
+			    case SPIRType::SampledImage:
 			    {
 				    if (!ep_args.empty())
 					    ep_args += ", ";
@@ -748,17 +1469,24 @@ void CompilerOpenCL::emit_function_prototype(SPIRFunction &func, const Bitset &r
 	if (func.self != ir.default_entry_point)
 		add_function_overload(func);
 
-	// Entry point: __kernel void name(...)
-	emit_workgroup_size_attribute();
+	bool is_entry_point = (func.self == ir.default_entry_point);
+
 	string decl;
-	decl += "__kernel void ";
-	if (func.self == ir.default_entry_point)
+	if (is_entry_point)
 	{
+		// Emit work group size attribute and __kernel qualifier for entry point
+		emit_workgroup_size_attribute();
+		decl += "__kernel void ";
 		decl += get_inner_entry_point_name();
 		processing_entry_point = true;
 	}
 	else
+	{
+		// Regular helper function
+		auto &type = get<SPIRType>(func.return_type);
+		decl += type_to_glsl(type) + " ";
 		decl += to_name(func.self);
+	}
 	decl += "(";
 
 	if (processing_entry_point)
@@ -773,7 +1501,11 @@ void CompilerOpenCL::emit_function_prototype(SPIRFunction &func, const Bitset &r
 	{
 		add_local_variable_name(arg.id);
 
-		decl += argument_decl(arg);
+		// OpenCL C has no in/out/inout qualifiers — skip direction prefix from argument_decl.
+		auto &arg_type = expression_type(arg.id);
+		decl += to_qualifiers_glsl(arg.id);
+		decl += variable_decl(arg_type, to_name(arg.id), arg.id);
+
 		if (&arg != &func.arguments.back())
 			decl += ", ";
 
@@ -783,15 +1515,119 @@ void CompilerOpenCL::emit_function_prototype(SPIRFunction &func, const Bitset &r
 			var->parameter = &arg;
 	}
 
+	// For non-entry helper functions: append extra __global T* params for any flattened buffer
+	// vars that this function (directly or transitively) accesses. This "threads" kernel resources
+	// down through the call chain since OpenCL C has no global address space for buffer pointers.
+	if (!is_entry_point)
+	{
+		bool first_resource = func.arguments.empty();
+
+		auto it = func_flattened_args.find(func.self);
+		if (it != func_flattened_args.end())
+		{
+			for (auto var_id : it->second)
+			{
+				auto type_it = flattened_var_type_decl.find(var_id);
+				if (type_it != flattened_var_type_decl.end())
+				{
+					if (!first_resource)
+						decl += ", ";
+					first_resource = false;
+					decl += type_it->second + to_name(var_id);
+				}
+			}
+		}
+
+		// Also thread workgroup/private global vars as pointer params.
+		auto wg_it = func_workgroup_args.find(func.self);
+		if (wg_it != func_workgroup_args.end())
+		{
+			for (auto var_id : wg_it->second)
+			{
+				auto type_it = workgroup_var_ptr_type.find(var_id);
+				if (type_it != workgroup_var_ptr_type.end())
+				{
+					if (!first_resource)
+						decl += ", ";
+					first_resource = false;
+					bool is_scalar = workgroup_scalar_vars.count(var_id) != 0;
+					string param_name = is_scalar ? (to_name(var_id) + "_ptr") : to_name(var_id);
+					decl += type_it->second + " " + param_name;
+				}
+			}
+		}
+	}
+
 	decl += ")";
 	statement(decl);
 }
 
-void CompilerOpenCL::emit_specialization_constants_and_structs()
+// Appends the extra call arguments for kernel resources that are threaded through helper functions.
+void CompilerOpenCL::append_global_func_args(const SPIRFunction &func, uint32_t index, SmallVector<string> &arglist)
+{
+	// The base class goes first: it handles combined image samplers and other shadow args.
+	CompilerGLSL::append_global_func_args(func, index, arglist);
+
+	// Flattened kernel buffer vars with a recorded type declaration are passed by name.
+	const auto buffer_args = func_flattened_args.find(func.self);
+	if (buffer_args != func_flattened_args.end())
+	{
+		for (auto var_id : buffer_args->second)
+			if (flattened_var_type_decl.count(var_id) != 0)
+				arglist.push_back(to_name(var_id));
+	}
+
+	// Workgroup/private global vars with a recorded pointer type are threaded next.
+	const auto workgroup_args = func_workgroup_args.find(func.self);
+	if (workgroup_args == func_workgroup_args.end())
+		return;
+
+	for (auto var_id : workgroup_args->second)
+	{
+		if (workgroup_var_ptr_type.count(var_id) == 0)
+			continue;
+
+		// Arrays decay to pointer; scalars need address-of.
+		const bool pass_by_address = workgroup_scalar_vars.count(var_id) != 0;
+		auto arg_name = to_name(var_id);
+		arglist.push_back(pass_by_address ? "&" + arg_name : arg_name);
+	}
+}
+
+// Emits "#define name (*name_ptr)" for each scalar workgroup/private global variable that this
+// helper function receives as a pointer parameter, so existing expressions (e.g. "u = 50;")
+// transparently dereference that parameter.
+void CompilerOpenCL::emit_function_local_declarations(SPIRFunction &func)
+{
+	const auto threaded_vars = func_workgroup_args.find(func.self);
+	if (threaded_vars == func_workgroup_args.end())
+		return;
+
+	for (auto var_id : threaded_vars->second)
+	{
+		// Only variables recorded in workgroup_scalar_vars get a macro.
+		if (workgroup_scalar_vars.count(var_id) == 0)
+			continue;
+		const auto var_name = to_name(var_id);
+		statement("#define ", var_name, " (*", var_name, "_ptr)");
+	}
+}
+
+void CompilerOpenCL::emit_function_local_epilogue(SPIRFunction &func)
 {
-	SpecializationConstant wg_x, wg_y, wg_z;
-	ID workgroup_size_id = get_work_group_size_specialization_constants(wg_x, wg_y, wg_z);
+	// Emits "#undef <name>" for every variable in this function's func_workgroup_args entry
+	// that is also recorded in workgroup_scalar_vars.
+	const auto threaded_vars = func_workgroup_args.find(func.self);
+	if (threaded_vars == func_workgroup_args.end())
+		return;
+
+	for (auto var_id : threaded_vars->second)
+		if (workgroup_scalar_vars.count(var_id) != 0)
+			statement("#undef ", to_name(var_id));
+}
 
+void CompilerOpenCL::emit_specialization_constants_and_structs()
+{
 	bool emitted = false;
 	unordered_set<uint32_t> declared_structs;
 	unordered_set<uint32_t> aligned_structs;
@@ -869,14 +1705,32 @@ void CompilerOpenCL::emit_specialization_constants_and_structs()
 				}
 				emitted = true;
 			}
+			else
+			{
+				// Non-specialization constant arrays need to be declared at file scope
+				// because OpenCL C does not support arrays as value types (can't inline them).
+				auto &type = get<SPIRType>(c.constant_type);
+				if (is_array(type))
+				{
+					add_resource_name(c.self);
+					auto name = to_name(c.self);
+					statement("constant ", variable_decl(type, name, c.self), " = ", constant_expression(c), ";");
+					emitted = true;
+				}
+			}
 		}
 		else if (id.get_type() == TypeConstantOp)
 		{
+			// OpSpecConstantOp results are derived from spec constants via arithmetic ops.
+			// In OpenCL C, "constant T name = expr;" requires a compile-time constant initializer,
+			// but expressions like "as_uint(spec_const)" (function calls) and "vec.x" (component
+			// access on a constant variable) are NOT constant expressions in OpenCL C.
+			// Emit as a #define macro so the expression is inlined at each use site (evaluated at
+			// runtime when used in a function body, which is the only valid use location).
 			auto &c = id.get<SPIRConstantOp>();
-			auto &type = get<SPIRType>(c.basetype);
 			add_resource_name(c.self);
 			auto name = to_name(c.self);
-			statement("constant ", variable_decl(type, name), " = ", constant_op_expression(c), ";");
+			statement("#define ", name, " (", constant_op_expression(c), ")");
 			emitted = true;
 		}
 		else if (id.get_type() == TypeType)
@@ -913,6 +1767,19 @@ void CompilerOpenCL::emit_specialization_constants_and_structs()
 			// OpUndef can be void for some reason ...
 			if (type.basetype == SPIRType::Void)
 				return;
+			// Emit a zero-initialized constant so composite uses of this undef can compile.
+			// OpUndef values are semantically undefined; zero is a safe placeholder.
+			add_resource_name(undef.self);
+			auto name = to_name(undef.self);
+			string zero_expr;
+			if (type.basetype == SPIRType::Struct)
+				zero_expr = join("(", type_to_glsl(type), "){ 0 }");
+			else if (type.vecsize > 1)
+				zero_expr = join(type_to_glsl_constructor(type), "(0)");
+			else
+				zero_expr = "0";
+			statement("constant ", type_to_glsl(type), " ", name, " = ", zero_expr, ";");
+			emitted = true;
 		}
 	}
 
@@ -920,11 +1787,65 @@ void CompilerOpenCL::emit_specialization_constants_and_structs()
 		statement("");
 }
 
+// OpenCL C cannot assign arrays (array_is_value_type = false). A copy of an array with a literal
+// size is therefore emitted as one "lhs[i] = rhs[i];" statement per element; anything else
+// falls back to a plain "lhs = rhs;" statement. Always returns true.
+bool CompilerOpenCL::emit_array_copy(const char *expr, uint32_t lhs_id, uint32_t rhs_id, StorageClass, StorageClass)
+{
+	// Destination: the caller-supplied expression when present, otherwise lhs_id's expression.
+	const string dst = expr ? string(expr) : to_expression(lhs_id);
+	const auto src = to_expression(rhs_id);
+	const auto &src_type = expression_type(rhs_id);
+
+	// Shared fallback for the cases that cannot be expanded element by element.
+	const auto emit_plain_assignment = [&]() -> bool
+	{
+		statement(dst, " = ", src, ";");
+		return true;
+	};
+
+	// Not actually an array: a simple assignment is all that is needed.
+	const bool is_real_array = is_array(src_type) && !src_type.array.empty();
+	if (!is_real_array)
+		return emit_plain_assignment();
+
+	// Spec-constant-sized array: the element count is not a literal here, so the plain
+	// assignment is emitted as a best effort.
+	const bool size_is_literal = src_type.array_size_literal.back();
+	if (!size_is_literal)
+		return emit_plain_assignment();
+
+	// Literal size: write out one assignment statement per element.
+	const uint32_t element_count = src_type.array.back();
+	for (uint32_t element = 0; element < element_count; ++element)
+		statement(dst, "[", element, "] = ", src, "[", element, "];");
+
+	return true;
+}
+
 void CompilerOpenCL::emit_instruction(const Instruction &instruction)
 {
 	auto ops = stream(instruction);
 	auto opcode = static_cast<Op>(instruction.op);
 
+	// Task #5: Handle barrier/fence ops with OpenCL C equivalents.
+	// Returns the CLK_*_MEM_FENCE flags string for the given memory semantics.
+	auto opencl_mem_fence_flags = [](uint32_t semantics) -> string
+	{
+		// We only care about workgroup and uniform/image memory.
+		bool local = (semantics & MemorySemanticsWorkgroupMemoryMask) != 0;
+		bool global = (semantics & (MemorySemanticsUniformMemoryMask | MemorySemanticsImageMemoryMask |
+		                            MemorySemanticsCrossWorkgroupMemoryMask)) != 0;
+		if (local && global)
+			return "CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE";
+		else if (local)
+			return "CLK_LOCAL_MEM_FENCE";
+		else if (global)
+			return "CLK_GLOBAL_MEM_FENCE";
+		else
+			return "CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE"; // default: fence everything
+	};
+
 	// Map buffer atomics to OpenCL C names (atomic_add, atomic_sub, etc.)
 	auto opencl_atomic = [this, ops](const char *opencl_op)
 	{
@@ -933,8 +1854,241 @@ void CompilerOpenCL::emit_instruction(const Instruction &instruction)
 		emit_atomic_func_op(ops[0], ops[1], ops[2], ops[5], opencl_op);
 	};
 
+	// Helper: operand 'id' as expression text whose integer signedness matches 'want_signed'. OpenCL C
+	// has no implicit signed/unsigned vector conversion, so mismatches are reinterpreted via as_TYPE().
+	auto cast_int_for_icmp = [this](uint32_t id, bool want_signed) -> string
+	{
+		auto &operand_type = expression_type(id);
+		if (!type_is_integral(operand_type))
+			return to_enclosed_expression(id);
+
+		const auto base = operand_type.basetype;
+		const bool operand_signed = base == SPIRType::SByte || base == SPIRType::Short ||
+		                            base == SPIRType::Int || base == SPIRType::Int64;
+		if (operand_signed == want_signed)
+			return to_enclosed_expression(id);
+
+		auto cast_type = operand_type;
+		cast_type.basetype = want_signed ? to_signed_basetype(cast_type.width) : to_unsigned_basetype(cast_type.width);
+		return join("as_", type_to_glsl(cast_type), "(", to_expression(id), ")");
+	};
+
+	// Helper: true when 'id' is a function parameter carrying a Function-storage pointer. GLSL emits
+	// out/inout params as 'out T'; OpenCL C uses '__private T *', so accesses need a dereference.
+	auto is_func_ptr_param = [&](uint32_t id) -> bool
+	{
+		auto *param_var = maybe_get<SPIRVariable>(id);
+		if (!param_var || param_var->parameter == nullptr)
+			return false;
+		return is_pointer(expression_type(id)) && expression_type(id).storage == StorageClassFunction;
+	};
+
 	switch (opcode)
 	{
+	// OpLoad from an out/inout function parameter pointer: dereference.
+	case OpLoad:
+	{
+		uint32_t ptr = ops[2];
+		if (is_func_ptr_param(ptr))
+		{
+			uint32_t result_type = ops[0];
+			uint32_t result_id = ops[1];
+			emit_op(result_type, result_id, join("(*", to_name(ptr), ")"), true);
+			inherit_expression_dependencies(result_id, ptr);
+			break;
+		}
+		// Loading the whole struct from a flattened buffer pointer (or OpCopyObject of one)
+		// needs dereference. Only applies to direct loads from the variable, not access chains.
+		if (flattened_buffer_vars.count(ptr) ||
+		    (maybe_get<SPIRExpression>(ptr) && !get<SPIRExpression>(ptr).access_chain &&
+		     maybe_get_backing_variable(ptr) && flattened_buffer_vars.count(maybe_get_backing_variable(ptr)->self)))
+		{
+			uint32_t result_type = ops[0];
+			uint32_t result_id = ops[1];
+			emit_op(result_type, result_id, join("(*", to_expression(ptr), ")"), true);
+			inherit_expression_dependencies(result_id, ptr);
+			break;
+		}
+		CompilerGLSL::emit_instruction(instruction);
+		break;
+	}
+
+	// OpStore to an out/inout function parameter pointer or flattened buffer: dereference.
+	case OpStore:
+	{
+		uint32_t ptr = ops[0];
+		if (is_func_ptr_param(ptr))
+		{
+			statement("*", to_name(ptr), " = ", to_expression(ops[1]), ";");
+			register_write(ptr);
+			break;
+		}
+		// Flattened buffer vars are __global T* pointers; storing to them needs dereference.
+		if (flattened_buffer_vars.count(ptr))
+		{
+			statement("*", to_name(ptr), " = ", to_expression(ops[1]), ";");
+			register_write(ptr);
+			break;
+		}
+		CompilerGLSL::emit_instruction(instruction);
+		break;
+	}
+
+	// OpenCL C uses fmod() instead of GLSL's mod().
+	case OpFMod:
+		emit_binary_func_op(ops[0], ops[1], ops[2], ops[3], "fmod");
+		break;
+
+	// SPV_KHR_fma: fused multiply-add — OpenCL C has a native fma() builtin.
+	case OpFmaKHR:
+		emit_trinary_func_op(ops[0], ops[1], ops[2], ops[3], ops[4], "fma");
+		break;
+
+	// SPV_KHR_expect_assume: no equivalent in OpenCL C.
+	// OpAssumeTrueKHR: hint that a condition is always true — emit nothing.
+	case OpAssumeTrueKHR:
+		break;
+	// OpExpectKHR: hint that value has an expected value — emit the value unchanged.
+	case OpExpectKHR:
+		emit_op(ops[0], ops[1], to_expression(ops[2]), should_forward(ops[2]));
+		inherit_expression_dependencies(ops[1], ops[2]);
+		break;
+
+	// Type conversion ops: use OpenCL C convert_TYPE() for numeric value conversions.
+	// The GLSL base class emits (TYPE)(expr) which in OpenCL C is a bitcast for vector types,
+	// not a value conversion. convert_TYPE() is correct for both scalar and vector operands.
+	case OpConvertUToF:
+	case OpConvertSToF:
+	case OpConvertFToU:
+	case OpConvertFToS:
+	case OpFConvert:
+	case OpUConvert:
+	case OpSConvert:
+	{
+		uint32_t result_type = ops[0];
+		uint32_t result_id = ops[1];
+		auto convert_func = join("convert_", type_to_glsl(get<SPIRType>(result_type)));
+		emit_unary_func_op(result_type, result_id, ops[2], convert_func.c_str());
+		break;
+	}
+
+	// OpOuterProduct: no OpenCL builtin and no native matrix type.
+	// The result matrix type is represented as its column vector type in OpenCL C.
+	// Emit only the first column (col_vec * row_vec.x).
+	case OpOuterProduct:
+	{
+		uint32_t result_type = ops[0];
+		uint32_t result_id = ops[1];
+		uint32_t col_vec = ops[2]; // column vector
+		uint32_t row_vec = ops[3]; // row vector
+		auto &row_type = expression_type(row_vec);
+
+		// First column of the outer product: col_vec * row_vec.x
+		string first_row_elem =
+		    row_type.vecsize > 1 ? join(to_expression(row_vec), ".", index_to_swizzle(0)) : to_expression(row_vec);
+		string expr = join(to_expression(col_vec), " * ", first_row_elem);
+		emit_op(result_type, result_id, expr, should_forward(col_vec) && should_forward(row_vec));
+		inherit_expression_dependencies(result_id, col_vec);
+		inherit_expression_dependencies(result_id, row_vec);
+		break;
+	}
+
+	// Task #9: Map GLSL vector comparison functions to OpenCL C operators.
+	// GLSL: lessThan(a, b) → OpenCL: (a < b)
+	// For integer ops, add explicit as_TYPE() casts so operands match the comparison signedness.
+	// OpenCL C does not allow implicit conversion between signed and unsigned vector types.
+#define SPIRV_OPENCL_ICMP_OP(signed_op, unsigned_op, float_op_1, float_op_2, op_str) \
+	case signed_op:                                                                  \
+	{                                                                                \
+		auto left = cast_int_for_icmp(ops[2], true);                                 \
+		auto right = cast_int_for_icmp(ops[3], true);                                \
+		bool fwd = should_forward(ops[2]) && should_forward(ops[3]);                 \
+		emit_op(ops[0], ops[1], join(left, " " op_str " ", right), fwd);             \
+		inherit_expression_dependencies(ops[1], ops[2]);                             \
+		inherit_expression_dependencies(ops[1], ops[3]);                             \
+		break;                                                                       \
+	}                                                                                \
+	case unsigned_op:                                                                \
+	{                                                                                \
+		auto left = cast_int_for_icmp(ops[2], false);                                \
+		auto right = cast_int_for_icmp(ops[3], false);                               \
+		bool fwd = should_forward(ops[2]) && should_forward(ops[3]);                 \
+		emit_op(ops[0], ops[1], join(left, " " op_str " ", right), fwd);             \
+		inherit_expression_dependencies(ops[1], ops[2]);                             \
+		inherit_expression_dependencies(ops[1], ops[3]);                             \
+		break;                                                                       \
+	}                                                                                \
+	case float_op_1:                                                                 \
+	case float_op_2:                                                                 \
+		emit_binary_op(ops[0], ops[1], ops[2], ops[3], op_str);                      \
+		break;
+
+		SPIRV_OPENCL_ICMP_OP(OpSLessThan, OpULessThan, OpFOrdLessThan, OpFUnordLessThan, "<")
+		SPIRV_OPENCL_ICMP_OP(OpSLessThanEqual, OpULessThanEqual, OpFOrdLessThanEqual, OpFUnordLessThanEqual, "<=")
+		SPIRV_OPENCL_ICMP_OP(OpSGreaterThan, OpUGreaterThan, OpFOrdGreaterThan, OpFUnordGreaterThan, ">")
+		SPIRV_OPENCL_ICMP_OP(OpSGreaterThanEqual, OpUGreaterThanEqual, OpFOrdGreaterThanEqual, OpFUnordGreaterThanEqual,
+		                     ">=")
+#undef SPIRV_OPENCL_ICMP_OP
+
+	case OpIEqual:
+	case OpFOrdEqual:
+	case OpFUnordEqual:
+	case OpLogicalEqual:
+		emit_binary_op(ops[0], ops[1], ops[2], ops[3], "==");
+		break;
+	case OpINotEqual:
+	case OpFOrdNotEqual:
+	case OpFUnordNotEqual:
+	case OpLogicalNotEqual:
+		emit_binary_op(ops[0], ops[1], ops[2], ops[3], "!=");
+		break;
+
+	case OpControlBarrier:
+	{
+		// ops[0]=execution_scope, ops[1]=memory_scope, ops[2]=semantics
+		uint32_t semantics = evaluate_constant_u32(ops[2]);
+		semantics = mask_relevant_memory_semantics(semantics);
+
+		flush_control_dependent_expressions(current_emitting_block->self);
+		flush_all_active_variables();
+
+		// Emit memory fence before the execution barrier if needed
+		string fence_flags = opencl_mem_fence_flags(semantics);
+		if (semantics != 0)
+		{
+			if (opencl_options.supports_opencl_version(2, 0))
+				statement("work_group_barrier(", fence_flags, ");");
+			else
+				statement("barrier(", fence_flags, ");");
+		}
+		else
+		{
+			// Execution barrier with default local fence
+			if (opencl_options.supports_opencl_version(2, 0))
+				statement("work_group_barrier(CLK_LOCAL_MEM_FENCE);");
+			else
+				statement("barrier(CLK_LOCAL_MEM_FENCE);");
+		}
+		break;
+	}
+
+	case OpMemoryBarrier:
+	{
+		// ops[0]=memory_scope, ops[1]=semantics
+		uint32_t semantics = evaluate_constant_u32(ops[1]);
+		semantics = mask_relevant_memory_semantics(semantics);
+
+		flush_control_dependent_expressions(current_emitting_block->self);
+		flush_all_active_variables();
+
+		if (semantics != 0)
+		{
+			string fence_flags = opencl_mem_fence_flags(semantics);
+			statement("mem_fence(", fence_flags, ");");
+		}
+		break;
+	}
+
 	case OpAtomicExchange:
 		if (check_atomic_image(ops[2]))
 			SPIRV_CROSS_THROW("Image atomics not yet implemented for OpenCL.");
@@ -943,8 +2097,13 @@ void CompilerOpenCL::emit_instruction(const Instruction &instruction)
 	case OpAtomicCompareExchange:
 		if (check_atomic_image(ops[2]))
 			SPIRV_CROSS_THROW("Image atomics not yet implemented for OpenCL.");
-		// OpenCL atomic_cmpxchg(ptr, expected, desired)
-		emit_atomic_func_op(ops[0], ops[1], ops[2], ops[7], ops[6], "atomic_cmpxchg");
+		// OpenCL atomic_cmpxchg(&ptr, expected, desired)
+		forced_temporaries.insert(ops[1]);
+		emit_op(ops[0], ops[1],
+		        join("atomic_cmpxchg(", to_atomic_ptr_expression(ops[2]), ", ", to_unpacked_expression(ops[7]), ", ",
+		             to_unpacked_expression(ops[6]), ")"),
+		        false);
+		flush_all_atomic_capable_variables();
 		break;
 	case OpAtomicIAdd:
 	case OpAtomicFAddEXT:
@@ -1007,7 +2166,7 @@ void CompilerOpenCL::emit_instruction(const Instruction &instruction)
 		bool unsigned_type = (type.basetype == SPIRType::UInt);
 		const char *inc = (opcode == OpAtomicIIncrement && unsigned_type) ? "1u" :
 		                  (opcode == OpAtomicIIncrement)                  ? "1" :
-		                  unsigned_type                                   ? "uint(-1)" :
+		                  unsigned_type                                   ? "(uint)(-1)" :
 		                                                                    "-1";
 		emit_op(ops[0], ops[1], join("atomic_add(", to_atomic_ptr_expression(ops[2]), ", ", inc, ")"), false);
 		flush_all_atomic_capable_variables();
@@ -1021,23 +2180,91 @@ void CompilerOpenCL::emit_instruction(const Instruction &instruction)
 		uint32_t base_id = ops[2];
 		uint32_t length = instruction.length;
 
-		// SSBO flattened to __global T*: rewrite [base, member_0, element_idx] → base[element_idx]
-		if (flattened_buffer_vars.count(base_id) && length >= 5)
+		if (flattened_buffer_vars.count(base_id))
 		{
-			// ops[3] = struct member index (always 0 for single-member SSBO) — skip
-			// ops[4] = element index within the runtime array
-			auto expr = join(to_name(base_id), "[", to_expression(ops[4]), "]");
-			auto &e = set<SPIRExpression>(result_id, std::move(expr), result_type, true);
-			auto *backing_var = maybe_get_backing_variable(base_id);
-			e.loaded_from = backing_var ? backing_var->self : ID(base_id);
-			e.access_chain = true;
-			forwarded_temporaries.insert(result_id);
-			suppressed_usage_tracking.insert(result_id);
-			for (uint32_t i = 2; i < length; i++)
-				inherit_expression_dependencies(result_id, ops[i]);
-			if (get<SPIRExpression>(result_id).expression_dependencies.empty())
-				forwarded_temporaries.erase(result_id);
-			break;
+			// Handle SSBO access chains for buffer vars.
+			// Get the original SPIR-V struct type to determine single vs multi-member.
+			auto *base_var = maybe_get<SPIRVariable>(base_id);
+			const SPIRType *struct_type = base_var ? &get_variable_data_type(*base_var) : nullptr;
+			bool is_single_member = struct_type && struct_type->member_types.size() == 1;
+
+			string expr;
+			bool handled = false;
+
+			bool is_subscript_deref = false; // result is a C value (subscripted), not a pointer
+
+			if (length >= 5 && is_single_member)
+			{
+				// Single-member SSBO flattened to __global T*: ptr[element_idx][.member]*
+				// ops[3] = struct member index (always 0, skip)
+				// ops[4] = element index within the runtime array
+				// ops[5+] = optional sub-member indices
+				expr = join(to_name(base_id), "[", to_expression(ops[4]), "]");
+				is_subscript_deref = true;
+				// Walk additional sub-member indices using type info.
+				if (length >= 6 && struct_type)
+				{
+					const SPIRType *cur_type = &get<SPIRType>(struct_type->member_types[0]);
+					for (uint32_t i = 5; i < length; i++)
+					{
+						if (cur_type->basetype == SPIRType::Struct)
+						{
+							uint32_t mbr_idx = get<SPIRConstant>(ops[i]).scalar();
+							expr += join(".", to_member_name(*cur_type, mbr_idx));
+							cur_type = &get<SPIRType>(cur_type->member_types[mbr_idx]);
+						}
+						else
+						{
+							// Array or other type - fall back to index notation
+							expr += join("[", to_expression(ops[i]), "]");
+						}
+					}
+				}
+				handled = true;
+			}
+			else if (length == 5 && !is_single_member && struct_type)
+			{
+				// Multi-member SSBO: ptr->member_name[element_idx]
+				// ops[3] = member index, ops[4] = array element index
+				uint32_t mbr_idx = get<SPIRConstant>(ops[3]).scalar();
+				auto mbr_name = to_member_name(*struct_type, mbr_idx);
+				expr = join(to_name(base_id), "->", mbr_name, "[", to_expression(ops[4]), "]");
+				is_subscript_deref = true;
+				handled = true;
+			}
+			else if (length == 4 && is_single_member)
+			{
+				// Single-member SSBO flattened to T*: accessing the one member gives element 0.
+				expr = join(to_name(base_id), "[0]");
+				is_subscript_deref = true;
+				handled = true;
+			}
+			else if (length == 4 && !is_single_member && struct_type)
+			{
+				// Multi-member SSBO: ptr->member_name (lvalue, not address-of)
+				uint32_t mbr_idx = get<SPIRConstant>(ops[3]).scalar();
+				auto mbr_name = to_member_name(*struct_type, mbr_idx);
+				expr = join(to_name(base_id), "->", mbr_name);
+				is_subscript_deref = true; // result is a struct value (accessed through ->), use . for children
+				handled = true;
+			}
+
+			if (handled)
+			{
+				auto &e = set<SPIRExpression>(result_id, std::move(expr), result_type, true);
+				auto *backing_var = maybe_get_backing_variable(base_id);
+				e.loaded_from = backing_var ? backing_var->self : ID(base_id);
+				e.access_chain = true;
+				if (is_subscript_deref)
+					subscripted_deref_exprs.insert(result_id);
+				forwarded_temporaries.insert(result_id);
+				suppressed_usage_tracking.insert(result_id);
+				for (uint32_t i = 2; i < length; i++)
+					inherit_expression_dependencies(result_id, ops[i]);
+				if (get<SPIRExpression>(result_id).expression_dependencies.empty())
+					forwarded_temporaries.erase(result_id);
+				break;
+			}
 		}
 
 		// Push constant expanded to scalar params: rewrite [p_var, member_idx] → scalar param name
@@ -1055,8 +2282,377 @@ void CompilerOpenCL::emit_instruction(const Instruction &instruction)
 			}
 		}
 
+		// If the base expression is already a subscripted/dereferenced C value (e.g. ptr[idx]),
+		// the result of further member access is also a C value. Propagate the tracking so
+		// to_member_reference continues to use '.' instead of '->'.
+		bool base_is_deref = subscripted_deref_exprs.count(base_id) != 0;
+
 		// Fall through to base class for all other access chains
 		CompilerGLSL::emit_instruction(instruction);
+
+		if (base_is_deref)
+			subscripted_deref_exprs.insert(result_id);
+		break;
+	}
+
+	case OpSelect:
+	{
+		uint32_t result_type = ops[0];
+		uint32_t result_id = ops[1];
+		uint32_t condition = ops[2];
+		uint32_t true_val = ops[3];
+		uint32_t false_val = ops[4];
+		auto &cond_type = expression_type(condition);
+		auto &res_type = get<SPIRType>(result_type);
+
+		if (res_type.pointer)
+		{
+			// If result is a pointer, the pointed-to values may be written through it.
+			register_write(true_val);
+			register_write(false_val);
+
+			// Pointer select in OpenCL C: need special handling because
+			// flattened buffer vars are already pointers (no & needed),
+			// Input builtins are function calls (can't take &), and
+			// null pointer constants need to be emitted as NULL.
+			auto make_ptr_expr = [&](uint32_t val) -> string
+			{
+				// Null pointer constant
+				if (ir.ids[val].get_type() == TypeConstant)
+					return "NULL";
+				// Flattened buffer var — already a pointer value
+				if (flattened_buffer_vars.count(val))
+					return to_enclosed_expression(val);
+				// Input builtin variable — materialize as local var and take address
+				auto *var = maybe_get<SPIRVariable>(val);
+				if (var && var->storage == StorageClassInput && has_decoration(val, DecorationBuiltIn))
+				{
+					if (processing_entry_point)
+					{
+						// Entry point: materialize the builtin as a local variable.
+						auto builtin = BuiltIn(get_decoration(val, DecorationBuiltIn));
+						auto key = static_cast<uint32_t>(builtin);
+						if (entry_point_materialized_builtins.emplace(key, val).second)
+							force_recompile();
+						return "&" + to_name(val);
+					}
+					else
+					{
+						// Non-entry function: builtins are threaded via #define trick,
+						// so to_name(val) is a valid lvalue via the macro.
+						return "&" + to_name(val);
+					}
+				}
+				// Default: use base class pointer expression
+				return to_enclosed_pointer_expression(val);
+			};
+
+			auto expr = join(to_enclosed_expression(condition), " ? ", make_ptr_expr(true_val), " : ",
+			                 make_ptr_expr(false_val));
+			emit_op(result_type, result_id, expr,
+			        should_forward(condition) && should_forward(true_val) && should_forward(false_val));
+			inherit_expression_dependencies(result_id, condition);
+			inherit_expression_dependencies(result_id, true_val);
+			inherit_expression_dependencies(result_id, false_val);
+		}
+		else if (cond_type.vecsize > 1 && cond_type.basetype == SPIRType::Boolean && res_type.vecsize > 1)
+		{
+			// In OpenCL C, vector ternary and bool-to-int casts don't work like GLSL.
+			// Use OpenCL's select(false_val, true_val, cond) instead.
+			emit_trinary_func_op(result_type, result_id, false_val, true_val, condition, "select");
+		}
+		else
+		{
+			CompilerGLSL::emit_instruction(instruction);
+		}
+		break;
+	}
+
+	case OpCompositeConstructReplicateEXT:
+	{
+		// GLSL base uses type(value) for vector splat, but OpenCL C needs (type)(value).
+		uint32_t result_type = ops[0];
+		uint32_t result_id = ops[1];
+		auto &type = get<SPIRType>(result_type);
+		if (type.op == OpTypeMatrix)
+		{
+			// OpenCL C has no native matrix type; matrices are represented as their column vector type.
+			// Just use the sub-value directly (representing the first/only column).
+			emit_op(result_type, result_id, to_expression(ops[2]), should_forward(ops[2]));
+			inherit_expression_dependencies(result_id, ops[2]);
+		}
+		else if (type.op != OpTypeArray && type.vecsize > 1)
+		{
+			// Vector replicate: (float4)(scalar_value)
+			auto rhs = join(type_to_glsl_constructor(type), "(", to_expression(ops[2]), ")");
+			emit_op(result_type, result_id, rhs, true);
+			inherit_expression_dependencies(result_id, ops[2]);
+		}
+		else
+		{
+			// Array replicate: delegate to base
+			CompilerGLSL::emit_instruction(instruction);
+		}
+		break;
+	}
+
+	// Map GLSL imulExtended/umulExtended to OpenCL C mul_hi + multiply.
+	case OpUMulExtended:
+	case OpSMulExtended:
+	{
+		uint32_t result_type = ops[0];
+		uint32_t result_id = ops[1];
+		uint32_t op0 = ops[2];
+		uint32_t op1 = ops[3];
+		auto &type = get<SPIRType>(result_type);
+		emit_uninitialized_temporary_expression(result_type, result_id);
+		// _m0 = low bits (a * b), _m1 = high bits (mul_hi(a, b))
+		statement(to_expression(result_id), ".", to_member_name(type, 0), " = ", to_expression(op0), " * ",
+		          to_expression(op1), ";");
+		statement(to_expression(result_id), ".", to_member_name(type, 1), " = mul_hi(", to_expression(op0), ", ",
+		          to_expression(op1), ");");
+		break;
+	}
+
+	case OpQuantizeToF16:
+	{
+		// GLSL emits unpackHalf2x16/packHalf2x16 which aren't OpenCL builtins.
+		// Use our polyfill functions instead.
+		if (!needs_half_pack_polyfill || !needs_half_unpack_polyfill)
+		{
+			needs_half_pack_polyfill = true;
+			needs_half_unpack_polyfill = true;
+			force_recompile();
+		}
+		uint32_t result_type = ops[0];
+		uint32_t id = ops[1];
+		uint32_t arg = ops[2];
+		string op;
+		auto &type = get<SPIRType>(result_type);
+		switch (type.vecsize)
+		{
+		case 1:
+			op = join("spvUnpackHalf2x16(spvPackHalf2x16((float2)(", to_expression(arg), ", 0.0f))).x");
+			break;
+		case 2:
+			op = join("spvUnpackHalf2x16(spvPackHalf2x16(", to_expression(arg), "))");
+			break;
+		case 3:
+		{
+			auto op0 = join("spvUnpackHalf2x16(spvPackHalf2x16(", to_expression(arg), ".xy))");
+			auto op1 = join("spvUnpackHalf2x16(spvPackHalf2x16((float2)(", to_expression(arg), ".z, 0.0f))).x");
+			op = join("(float3)(", op0, ", ", op1, ")");
+			break;
+		}
+		case 4:
+		{
+			auto op0 = join("spvUnpackHalf2x16(spvPackHalf2x16(", to_expression(arg), ".xy))");
+			auto op1 = join("spvUnpackHalf2x16(spvPackHalf2x16(", to_expression(arg), ".zw))");
+			op = join("(float4)(", op0, ", ", op1, ")");
+			break;
+		}
+		default:
+			SPIRV_CROSS_THROW("Illegal argument to OpQuantizeToF16.");
+		}
+		emit_op(result_type, id, op, should_forward(arg));
+		inherit_expression_dependencies(id, arg);
+		break;
+	}
+
+	// Map OpImageSample* (texture sampling) to OpenCL read_image* with sampler.
+	case OpImageSampleExplicitLod:
+	case OpImageSampleImplicitLod:
+	{
+		uint32_t result_type = ops[0];
+		uint32_t result_id = ops[1];
+		uint32_t combined_id = ops[2];
+		uint32_t coord_id = ops[3];
+
+		if (!needs_default_sampler)
+		{
+			needs_default_sampler = true;
+			force_recompile();
+		}
+
+		auto &result_spirtype = get<SPIRType>(result_type);
+		const char *read_func;
+		switch (result_spirtype.basetype)
+		{
+		case SPIRType::UInt:
+			read_func = "read_imageui";
+			break;
+		case SPIRType::Int:
+			read_func = "read_imagei";
+			break;
+		default:
+			read_func = "read_imagef";
+			break;
+		}
+
+		// For combined image+sampler, get the underlying image expression.
+		auto img_expr = to_expression(combined_id);
+
+		// Sampler-based read_image* takes float coordinates.
+		auto &coord_type = expression_type(coord_id);
+		string coord_expr;
+		if (coord_type.basetype == SPIRType::Float)
+			coord_expr = to_expression(coord_id);
+		else
+			coord_expr = join("convert_float", coord_type.vecsize > 1 ? to_string(coord_type.vecsize) : "", "(",
+			                  to_expression(coord_id), ")");
+
+		auto raw_expr = join(read_func, "(", img_expr, ", spvDefaultSampler, ", coord_expr, ")");
+		auto swizzled = remap_swizzle(result_spirtype, 4, raw_expr);
+
+		bool forward = should_forward(combined_id) && should_forward(coord_id);
+		emit_op(result_type, result_id, swizzled, forward);
+		inherit_expression_dependencies(result_id, combined_id);
+		inherit_expression_dependencies(result_id, coord_id);
+		break;
+	}
+
+	// Task #10: Map image read/write/query ops to OpenCL C equivalents.
+	case OpImageRead:
+	{
+		uint32_t result_type = ops[0];
+		uint32_t result_id = ops[1];
+		uint32_t image_id = ops[2];
+		uint32_t coord_id = ops[3];
+
+		auto &img_type = expression_type(image_id);
+		// SubpassData is not supported; fall through to base class.
+		if (img_type.image.dim == DimSubpassData)
+		{
+			CompilerGLSL::emit_instruction(instruction);
+			break;
+		}
+
+		auto &result_spirtype = get<SPIRType>(result_type);
+		const char *read_func;
+		switch (result_spirtype.basetype)
+		{
+		case SPIRType::UInt:
+			read_func = "read_imageui";
+			break;
+		case SPIRType::Int:
+			read_func = "read_imagei";
+			break;
+		default:
+			read_func = "read_imagef";
+			break;
+		}
+
+		// Convert coordinate to int.
+		auto coord_type = expression_type(coord_id);
+		coord_type.basetype = SPIRType::Int;
+		auto coord_expr = bitcast_expression(coord_type, expression_type(coord_id).basetype, to_expression(coord_id));
+
+		// OpenCL read functions always return a vec4; swizzle down to the required vecsize.
+		auto raw_expr = join(read_func, "(", to_expression(image_id), ", ", coord_expr, ")");
+		// Build a temporary vec4 type for the result of the read function.
+		SPIRType vec4_type = result_spirtype;
+		vec4_type.vecsize = 4;
+		auto swizzled = remap_swizzle(result_spirtype, 4, raw_expr);
+
+		bool forward = should_forward(image_id) && should_forward(coord_id);
+		emit_op(result_type, result_id, swizzled, forward);
+		inherit_expression_dependencies(result_id, image_id);
+		inherit_expression_dependencies(result_id, coord_id);
+		break;
+	}
+
+	case OpImageWrite:
+	{
+		uint32_t image_id = ops[0];
+		uint32_t coord_id = ops[1];
+		uint32_t texel_id = ops[2];
+
+		// Unset NonWritable so the variable can be written (mirroring GLSL backend).
+		auto *image_var = maybe_get_backing_variable(image_id);
+		if (image_var)
+			unset_decoration(image_var->self, DecorationNonWritable);
+
+		auto &value_type = expression_type(texel_id);
+		const char *write_func;
+		switch (value_type.basetype)
+		{
+		case SPIRType::UInt:
+			write_func = "write_imageui";
+			break;
+		case SPIRType::Int:
+			write_func = "write_imagei";
+			break;
+		default:
+			write_func = "write_imagef";
+			break;
+		}
+
+		// Convert coordinate to int.
+		auto coord_type = expression_type(coord_id);
+		coord_type.basetype = SPIRType::Int;
+		auto coord_expr = bitcast_expression(coord_type, expression_type(coord_id).basetype, to_expression(coord_id));
+
+		// OpenCL write functions expect a vec4 texel; expand if necessary.
+		// Use (vec4_type)(expr) C-style cast which is valid for scalar-to-vector broadcast.
+		SPIRType vec4_type = value_type;
+		vec4_type.vecsize = 4;
+		string texel_raw = to_expression(texel_id);
+		string texel_expr;
+		if (value_type.vecsize == 4)
+			texel_expr = texel_raw;
+		else
+			texel_expr = join("(", type_to_glsl(vec4_type), ")(", texel_raw, ")");
+
+		statement(write_func, "(", to_expression(image_id), ", ", coord_expr, ", ", texel_expr, ");");
+
+		if (image_var && variable_storage_is_aliased(*image_var))
+			flush_all_aliased_variables();
+		break;
+	}
+
+	case OpImageQuerySize:
+	{
+		uint32_t result_type = ops[0];
+		uint32_t result_id = ops[1];
+		uint32_t image_id = ops[2];
+
+		auto &img_type = expression_type(image_id);
+		if (img_type.basetype != SPIRType::Image)
+		{
+			CompilerGLSL::emit_instruction(instruction);
+			break;
+		}
+
+		auto img_expr = to_expression(image_id);
+		string size_expr;
+		auto dim = img_type.image.dim;
+		bool arrayed = img_type.image.arrayed;
+
+		if (dim == Dim1D || dim == DimBuffer)
+		{
+			size_expr = join("get_image_width(", img_expr, ")");
+		}
+		else if (dim == Dim2D || dim == DimCube)
+		{
+			if (arrayed)
+				size_expr = join("(int3)(get_image_width(", img_expr, "), get_image_height(", img_expr,
+				                 "), get_image_array_size(", img_expr, "))");
+			else
+				size_expr = join("(int2)(get_image_width(", img_expr, "), get_image_height(", img_expr, "))");
+		}
+		else if (dim == Dim3D)
+		{
+			size_expr = join("(int3)(get_image_width(", img_expr, "), get_image_height(", img_expr,
+			                 "), get_image_depth(", img_expr, "))");
+		}
+		else
+		{
+			CompilerGLSL::emit_instruction(instruction);
+			break;
+		}
+
+		emit_op(result_type, result_id, size_expr, true);
+		inherit_expression_dependencies(result_id, image_id);
 		break;
 	}
 
diff --git a/spirv_opencl.hpp b/spirv_opencl.hpp
index da64673d8..90a4291e0 100644
--- a/spirv_opencl.hpp
+++ b/spirv_opencl.hpp
@@ -46,10 +46,16 @@ class CompilerOpenCL : public CompilerGLSL
 		bool enable_fp64 = false;
 		// Enable cl_khr_int64_extended_atomics extension
 		bool enable_64bit_atomics = false;
+		// Enable cl_khr_subgroups extension
+		bool enable_subgroups = false;
+		// Enable cl_khr_subgroup_shuffle extension
+		bool enable_shuffle = false;
 
 		void set_opencl_version(uint32_t major, uint32_t minor = 0, uint32_t patch = 0)
 		{
 			opencl_version = make_opencl_version(major, minor, patch);
+			if (supports_opencl_version(2) && !supports_opencl_version(3)) // any 2.x; no hard-coded encoding
+				enable_subgroups = true; // only ever switched on here, never cleared
 		}
 
 		bool supports_opencl_version(uint32_t major, uint32_t minor = 0, uint32_t patch = 0) const
@@ -98,12 +104,30 @@ class CompilerOpenCL : public CompilerGLSL
 	std::string variable_decl(const SPIRType &type, const std::string &name, uint32_t id = 0) override;
 	void emit_function_prototype(SPIRFunction &func, const Bitset &return_flags) override;
 	void emit_instruction(const Instruction &instruction) override;
+	std::string to_member_reference(uint32_t base, const SPIRType &type, uint32_t index,
+	                                bool ptr_chain_is_resolved) override;
+	std::string to_func_call_arg(const SPIRFunction::Parameter &arg, uint32_t id) override;
+	void add_function_overload(const SPIRFunction &func) override;
+	void emit_struct(SPIRType &type) override;
+	std::string type_to_glsl_constructor(const SPIRType &type) override;
+	bool emit_array_copy(const char *expr, uint32_t lhs_id, uint32_t rhs_id, StorageClass lhs_storage,
+	                     StorageClass rhs_storage) override;
+	std::string constant_expression(const SPIRConstant &c, bool inside_block_like_struct_scope = false,
+	                                bool inside_struct_scope = false) override;
+	std::string constant_expression_vector(const SPIRConstant &c, uint32_t vector) override;
+	std::string bitcast_glsl_op(const SPIRType &result_type, const SPIRType &argument_type) override;
+	std::string to_atomic_ptr_expression(uint32_t id) override;
+	void emit_glsl_op(uint32_t result_type, uint32_t result_id, uint32_t op, const uint32_t *args,
+	                  uint32_t count) override;
 	virtual bool builtin_translates_to_nonarray(BuiltIn builtin) const override;
 	std::string get_variable_address_space(const SPIRVariable &argument);
 	std::string get_type_address_space(const SPIRType &type, uint32_t id, bool argument = false);
 	const char *to_restrict(uint32_t id, bool space);
+	uint32_t get_physical_type_id_stride(TypeID type_id) const override;
 
 	void replace_illegal_names() override;
+	void emit_function_local_declarations(SPIRFunction &func) override;
+	void emit_function_local_epilogue(SPIRFunction &func) override;
 
 	Options opencl_options;
 
@@ -114,6 +138,37 @@ class CompilerOpenCL : public CompilerGLSL
 
 	std::unordered_set<uint32_t> constant_macro_ids;
 
+	// Expression IDs that were produced by subscripting a flattened SSBO pointer (e.g. ptr[idx]).
+	// These are C values (not pointers), so subsequent member accesses must use '.' not '->'.
+	std::unordered_set<uint32_t> subscripted_deref_exprs;
+
+	// Set when packHalf2x16/unpackHalf2x16 polyfill helpers are needed.
+	bool needs_half_pack_polyfill = false;
+	bool needs_half_unpack_polyfill = false;
+	// Set when a default sampler is needed for combined image+sampler usage.
+	bool needs_default_sampler = false;
+
+	// For each non-entry function, the ordered list of flattened buffer var IDs to thread as extra params.
+	std::unordered_map<uint32_t, SmallVector<uint32_t>> func_flattened_args;
+	// Map from flattened buffer var ID to its OpenCL type declaration prefix ("__global T*" etc.)
+	std::unordered_map<uint32_t, std::string> flattened_var_type_decl;
+
+	// For each non-entry function, workgroup/private global vars accessed and needing pointer threading.
+	std::unordered_map<uint32_t, SmallVector<uint32_t>> func_workgroup_args;
+	// Map from workgroup/private var ID to its pointer type declaration prefix
+	std::unordered_map<uint32_t, std::string> workgroup_var_ptr_type;
+	// Set of scalar (non-array) workgroup/private vars that need #define dereference inside callees
+	std::unordered_set<uint32_t> workgroup_scalar_vars;
+
+	// Input builtin variables threaded to non-entry functions (BuiltIn enum → variable ID)
+	std::unordered_map<uint32_t, uint32_t> threaded_input_builtins;
+	// Input builtin variables materialized as local vars in the entry point (BuiltIn enum → variable ID)
+	std::unordered_map<uint32_t, uint32_t> entry_point_materialized_builtins;
+	// Guard flag to avoid circular reference during builtin materialization emission
+	bool emitting_builtin_materialization = false;
+
+	void compute_kernel_resources();
+	void append_global_func_args(const SPIRFunction &func, uint32_t index, SmallVector<std::string> &arglist) override;
 	void emit_workgroup_size_attribute();
 
 	std::string entry_point_args(bool append_comma);
diff --git a/test_shaders.py b/test_shaders.py
index b3f87fc7f..dbc38ba5c 100755
--- a/test_shaders.py
+++ b/test_shaders.py
@@ -613,6 +613,12 @@ def path_to_opencl_standard_cli(shader):
 def validate_shader_opencl(shader, opt, paths):
     shader = reference_path(shader[0], shader[1], opt)
     extensions = []
+    if '.double.' in shader:
+        extensions.append('cl_khr_fp64')
+    if '.subgroup.' in shader:
+        extensions.append('cl_khr_subgroups')
+    if '.shuffle.' in shader:
+        extensions.append('cl_khr_subgroup_shuffle')
 
     global ignore_clang
     try:
@@ -675,8 +681,17 @@ def cross_compile_opencl(shader, spirv, opt, iterations, paths):
     opencl_args = [spirv_cross_path, '--output', opencl_path, spirv_path, '--opencl', '--iterations', str(iterations)]
     opencl_args.append('--opencl-version')
     opencl_args.append(path_to_opencl_standard_cli(shader))
-
-    subprocess.check_call(opencl_args)
+    if '.double.' in shader:
+        opencl_args.append('--opencl-fp64')
+    if '.subgroup.' in shader:
+        opencl_args.append('--opencl-subgroups')
+    if '.shuffle.' in shader:
+        opencl_args.append('--opencl-shuffle')
+
+    if shader_is_invalid_spirv(shader):
+        subprocess.run(opencl_args)
+    else:
+        subprocess.check_call(opencl_args)
 
     if not shader_is_invalid_spirv(opencl_path):
         subprocess.check_call([paths.spirv_val, '--allow-localsizeid', '--scalar-block-layout', '--target-env', spirv_env, spirv_path])
diff --git a/test_shaders.sh b/test_shaders.sh
index 54bf700ca..a054710ed 100755
--- a/test_shaders.sh
+++ b/test_shaders.sh
@@ -21,6 +21,8 @@ echo "Using SPIRV-Cross in: \"$SPIRV_CROSS_PATH\"."
 ./test_shaders.py shaders-msl ${OPTS} --msl --spirv-cross "$SPIRV_CROSS_PATH" || exit 1
 ./test_shaders.py shaders-msl ${OPTS} --msl --opt --spirv-cross "$SPIRV_CROSS_PATH" || exit 1
 ./test_shaders.py shaders-msl-no-opt ${OPTS} --msl --spirv-cross "$SPIRV_CROSS_PATH" || exit 1
+./test_shaders.py shaders-opencl ${OPTS} --opencl --spirv-cross "$SPIRV_CROSS_PATH" || exit 1
+./test_shaders.py shaders-opencl ${OPTS} --opencl --opt --spirv-cross "$SPIRV_CROSS_PATH" || exit 1
 ./test_shaders.py shaders-hlsl ${OPTS} --hlsl --spirv-cross "$SPIRV_CROSS_PATH" || exit 1
 ./test_shaders.py shaders-hlsl ${OPTS} --hlsl --opt --spirv-cross "$SPIRV_CROSS_PATH" || exit 1
 ./test_shaders.py shaders-hlsl-no-opt ${OPTS} --hlsl --spirv-cross "$SPIRV_CROSS_PATH" || exit 1

From 7308550f9e29aa9fe1a2230de36463f96e8420b3 Mon Sep 17 00:00:00 2001
From: Garrick Meeker <gmeeker@gmail.com>
Date: Thu, 12 Mar 2026 18:19:39 -0700
Subject: [PATCH 03/16] Adding shaders-opencl opt references

---
 .../asm/comp/atomic-decrement.asm.comp        | 17 ++++
 .../asm/comp/atomic-increment.asm.comp        | 17 ++++
 .../asm/comp/bitcast_iadd.asm.comp            | 32 ++++++++
 .../asm/comp/bitcast_icmp.asm.comp            | 32 ++++++++
 .../asm/comp/bitcast_sar.asm.comp             | 34 ++++++++
 .../asm/comp/bitcast_sdiv.asm.comp            | 34 ++++++++
 .../asm/comp/bitcast_slr.asm.comp             | 34 ++++++++
 .../asm/comp/block-name-alias-global.asm.comp | 48 ++++++++++++
 .../comp/buffer-write-relative-addr.asm.comp  | 21 +++++
 .../asm/comp/buffer-write.asm.comp            | 16 ++++
 .../comp/copy-object-ssbo-to-ssbo.asm.comp    | 24 ++++++
 .../asm/comp/copy-object-ubo-to-ssbo.asm.comp | 24 ++++++
 .../asm/comp/duplicate-spec-id.asm.comp       | 26 +++++++
 .../asm/comp/fma.spv16.asm.comp               | 23 ++++++
 .../comp/global-parameter-name-alias.asm.comp |  8 ++
 ...e-load-store-short-vector.invalid.asm.comp | 18 +++++
 ...p-spec-constant-op-vector-related.asm.comp | 77 +++++++++++++++++++
 .../shaders-opencl/asm/comp/quantize.asm.comp | 35 +++++++++
 .../asm/comp/relaxed-block-layout.asm.comp    | 23 ++++++
 .../comp/replicated-composites.spv16.asm.comp | 30 ++++++++
 ...specialization-constant-workgroup.asm.comp | 26 +++++++
 .../struct-resource-name-aliasing.asm.comp    | 17 ++++
 .../asm/comp/uint_smulextended.asm.comp       | 28 +++++++
 .../undefined-constant-composite.asm.comp     | 33 ++++++++
 ...undefined-spec-constant-composite.asm.comp | 38 +++++++++
 .../asm/comp/variable-pointers-2.asm.comp     | 56 ++++++++++++++
 ...ariable-pointers-store-forwarding.asm.comp | 28 +++++++
 .../vector-builtin-type-cast-func.asm.comp    | 28 +++++++
 .../comp/vector-builtin-type-cast.asm.comp    | 28 +++++++
 .../access-private-workgroup-in-function.comp |  9 +++
 .../opt/shaders-opencl/comp/arguments.comp    | 24 ++++++
 reference/opt/shaders-opencl/comp/atomic.comp | 53 +++++++++++++
 .../opt/shaders-opencl/comp/barriers.comp     | 25 ++++++
 reference/opt/shaders-opencl/comp/basic.comp  | 36 +++++++++
 .../comp/basic.dispatchbase.comp              | 41 ++++++++++
 .../comp/buffer-push-const.comp               | 24 ++++++
 .../opt/shaders-opencl/comp/builtins.comp     |  9 +++
 .../comp/cfg-preserve-parameter.comp          |  8 ++
 .../comp/complex-type-alias.comp              | 46 +++++++++++
 .../comp/composite-construct.comp             | 26 +++++++
 .../opt/shaders-opencl/comp/culling.comp      | 36 +++++++++
 .../opt/shaders-opencl/comp/defer-parens.comp | 24 ++++++
 .../opt/shaders-opencl/comp/dowhile.comp      | 44 +++++++++++
 .../shaders-opencl/comp/expect-assume.comp    | 17 ++++
 .../comp/force-recompile-hooks.swizzle.comp   | 11 +++
 .../opt/shaders-opencl/comp/functions.comp    |  8 ++
 .../comp/global-invocation-id.comp            | 18 +++++
 reference/opt/shaders-opencl/comp/image.comp  | 11 +++
 reference/opt/shaders-opencl/comp/insert.comp | 18 +++++
 .../comp/local-invocation-id.comp             | 18 +++++
 .../comp/local-invocation-index.comp          | 18 +++++
 .../comp/local-size-duplicate-spec-id.comp    | 30 ++++++++
 reference/opt/shaders-opencl/comp/mod.comp    | 27 +++++++
 reference/opt/shaders-opencl/comp/modf.comp   | 34 ++++++++
 .../shaders-opencl/comp/outer-product.comp    | 45 +++++++++++
 .../shaders-opencl/comp/packing-test-1.comp   | 32 ++++++++
 .../shaders-opencl/comp/packing-test-2.comp   | 32 ++++++++
 .../shaders-opencl/comp/read-write-only.comp  | 35 +++++++++
 .../opt/shaders-opencl/comp/rmw-opt.comp      | 27 +++++++
 ...alar-std450-distance-length-normalize.comp | 25 ++++++
 .../comp/shared-std450.double.comp            | 29 +++++++
 .../comp/shared-struct-bool-cast.comp         | 65 ++++++++++++++++
 .../comp/shared-zero-init-simple.comp         | 25 ++++++
 .../shaders-opencl/comp/shared-zero-init.comp | 30 ++++++++
 reference/opt/shaders-opencl/comp/shared.comp | 28 +++++++
 .../comp/spec-constant-work-group-size.comp   | 39 ++++++++++
 .../shaders-opencl/comp/struct-layout.comp    | 31 ++++++++
 .../shaders-opencl/comp/struct-nested.comp    | 31 ++++++++
 .../comp/struct-packing.invalid.comp          |  0
 .../opt/shaders-opencl/comp/torture-loop.comp | 46 +++++++++++
 .../opt/shaders-opencl/comp/type-alias.comp   | 45 +++++++++++
 reference/opt/shaders-opencl/comp/udiv.comp   | 24 ++++++
 .../shaders-opencl/comp/writable-ssbo.comp    | 18 +++++
 73 files changed, 2077 insertions(+)
 create mode 100644 reference/opt/shaders-opencl/asm/comp/atomic-decrement.asm.comp
 create mode 100644 reference/opt/shaders-opencl/asm/comp/atomic-increment.asm.comp
 create mode 100644 reference/opt/shaders-opencl/asm/comp/bitcast_iadd.asm.comp
 create mode 100644 reference/opt/shaders-opencl/asm/comp/bitcast_icmp.asm.comp
 create mode 100644 reference/opt/shaders-opencl/asm/comp/bitcast_sar.asm.comp
 create mode 100644 reference/opt/shaders-opencl/asm/comp/bitcast_sdiv.asm.comp
 create mode 100644 reference/opt/shaders-opencl/asm/comp/bitcast_slr.asm.comp
 create mode 100644 reference/opt/shaders-opencl/asm/comp/block-name-alias-global.asm.comp
 create mode 100644 reference/opt/shaders-opencl/asm/comp/buffer-write-relative-addr.asm.comp
 create mode 100644 reference/opt/shaders-opencl/asm/comp/buffer-write.asm.comp
 create mode 100644 reference/opt/shaders-opencl/asm/comp/copy-object-ssbo-to-ssbo.asm.comp
 create mode 100644 reference/opt/shaders-opencl/asm/comp/copy-object-ubo-to-ssbo.asm.comp
 create mode 100644 reference/opt/shaders-opencl/asm/comp/duplicate-spec-id.asm.comp
 create mode 100644 reference/opt/shaders-opencl/asm/comp/fma.spv16.asm.comp
 create mode 100644 reference/opt/shaders-opencl/asm/comp/global-parameter-name-alias.asm.comp
 create mode 100644 reference/opt/shaders-opencl/asm/comp/image-load-store-short-vector.invalid.asm.comp
 create mode 100644 reference/opt/shaders-opencl/asm/comp/op-spec-constant-op-vector-related.asm.comp
 create mode 100644 reference/opt/shaders-opencl/asm/comp/quantize.asm.comp
 create mode 100644 reference/opt/shaders-opencl/asm/comp/relaxed-block-layout.asm.comp
 create mode 100644 reference/opt/shaders-opencl/asm/comp/replicated-composites.spv16.asm.comp
 create mode 100644 reference/opt/shaders-opencl/asm/comp/specialization-constant-workgroup.asm.comp
 create mode 100644 reference/opt/shaders-opencl/asm/comp/struct-resource-name-aliasing.asm.comp
 create mode 100644 reference/opt/shaders-opencl/asm/comp/uint_smulextended.asm.comp
 create mode 100644 reference/opt/shaders-opencl/asm/comp/undefined-constant-composite.asm.comp
 create mode 100644 reference/opt/shaders-opencl/asm/comp/undefined-spec-constant-composite.asm.comp
 create mode 100644 reference/opt/shaders-opencl/asm/comp/variable-pointers-2.asm.comp
 create mode 100644 reference/opt/shaders-opencl/asm/comp/variable-pointers-store-forwarding.asm.comp
 create mode 100644 reference/opt/shaders-opencl/asm/comp/vector-builtin-type-cast-func.asm.comp
 create mode 100644 reference/opt/shaders-opencl/asm/comp/vector-builtin-type-cast.asm.comp
 create mode 100644 reference/opt/shaders-opencl/comp/access-private-workgroup-in-function.comp
 create mode 100644 reference/opt/shaders-opencl/comp/arguments.comp
 create mode 100644 reference/opt/shaders-opencl/comp/atomic.comp
 create mode 100644 reference/opt/shaders-opencl/comp/barriers.comp
 create mode 100644 reference/opt/shaders-opencl/comp/basic.comp
 create mode 100644 reference/opt/shaders-opencl/comp/basic.dispatchbase.comp
 create mode 100644 reference/opt/shaders-opencl/comp/buffer-push-const.comp
 create mode 100644 reference/opt/shaders-opencl/comp/builtins.comp
 create mode 100644 reference/opt/shaders-opencl/comp/cfg-preserve-parameter.comp
 create mode 100644 reference/opt/shaders-opencl/comp/complex-type-alias.comp
 create mode 100644 reference/opt/shaders-opencl/comp/composite-construct.comp
 create mode 100644 reference/opt/shaders-opencl/comp/culling.comp
 create mode 100644 reference/opt/shaders-opencl/comp/defer-parens.comp
 create mode 100644 reference/opt/shaders-opencl/comp/dowhile.comp
 create mode 100644 reference/opt/shaders-opencl/comp/expect-assume.comp
 create mode 100644 reference/opt/shaders-opencl/comp/force-recompile-hooks.swizzle.comp
 create mode 100644 reference/opt/shaders-opencl/comp/functions.comp
 create mode 100644 reference/opt/shaders-opencl/comp/global-invocation-id.comp
 create mode 100644 reference/opt/shaders-opencl/comp/image.comp
 create mode 100644 reference/opt/shaders-opencl/comp/insert.comp
 create mode 100644 reference/opt/shaders-opencl/comp/local-invocation-id.comp
 create mode 100644 reference/opt/shaders-opencl/comp/local-invocation-index.comp
 create mode 100644 reference/opt/shaders-opencl/comp/local-size-duplicate-spec-id.comp
 create mode 100644 reference/opt/shaders-opencl/comp/mod.comp
 create mode 100644 reference/opt/shaders-opencl/comp/modf.comp
 create mode 100644 reference/opt/shaders-opencl/comp/outer-product.comp
 create mode 100644 reference/opt/shaders-opencl/comp/packing-test-1.comp
 create mode 100644 reference/opt/shaders-opencl/comp/packing-test-2.comp
 create mode 100644 reference/opt/shaders-opencl/comp/read-write-only.comp
 create mode 100644 reference/opt/shaders-opencl/comp/rmw-opt.comp
 create mode 100644 reference/opt/shaders-opencl/comp/scalar-std450-distance-length-normalize.comp
 create mode 100644 reference/opt/shaders-opencl/comp/shared-std450.double.comp
 create mode 100644 reference/opt/shaders-opencl/comp/shared-struct-bool-cast.comp
 create mode 100644 reference/opt/shaders-opencl/comp/shared-zero-init-simple.comp
 create mode 100644 reference/opt/shaders-opencl/comp/shared-zero-init.comp
 create mode 100644 reference/opt/shaders-opencl/comp/shared.comp
 create mode 100644 reference/opt/shaders-opencl/comp/spec-constant-work-group-size.comp
 create mode 100644 reference/opt/shaders-opencl/comp/struct-layout.comp
 create mode 100644 reference/opt/shaders-opencl/comp/struct-nested.comp
 create mode 100644 reference/opt/shaders-opencl/comp/struct-packing.invalid.comp
 create mode 100644 reference/opt/shaders-opencl/comp/torture-loop.comp
 create mode 100644 reference/opt/shaders-opencl/comp/type-alias.comp
 create mode 100644 reference/opt/shaders-opencl/comp/udiv.comp
 create mode 100644 reference/opt/shaders-opencl/comp/writable-ssbo.comp

diff --git a/reference/opt/shaders-opencl/asm/comp/atomic-decrement.asm.comp b/reference/opt/shaders-opencl/asm/comp/atomic-decrement.asm.comp
new file mode 100644
index 000000000..8560908e5
--- /dev/null
+++ b/reference/opt/shaders-opencl/asm/comp/atomic-decrement.asm.comp
@@ -0,0 +1,17 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct u0_counters
+{
+    uint c;
+};
+
+typedef struct u0_counters u0_counters;
+
+__attribute__((reqd_work_group_size(4, 1, 1)))
+__kernel void comp_main(write_only image1d_buffer_t u0, __global uint* u0_counter)
+{
+    uint _24 = atomic_add(&(u0_counter[0]), (uint)(-1));
+    write_imageui(u0, as_int(as_float(_24)), (uint4)(as_uint(as_int(((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x))));
+}
+
diff --git a/reference/opt/shaders-opencl/asm/comp/atomic-increment.asm.comp b/reference/opt/shaders-opencl/asm/comp/atomic-increment.asm.comp
new file mode 100644
index 000000000..8ddebf840
--- /dev/null
+++ b/reference/opt/shaders-opencl/asm/comp/atomic-increment.asm.comp
@@ -0,0 +1,17 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct u0_counters
+{
+    uint c;
+};
+
+typedef struct u0_counters u0_counters;
+
+__attribute__((reqd_work_group_size(4, 1, 1)))
+__kernel void comp_main(write_only image1d_buffer_t u0, __global uint* u0_counter)
+{
+    uint _24 = atomic_add(&(u0_counter[0]), 1u);
+    write_imageui(u0, as_int(as_float(_24)), (uint4)(as_uint(as_int(((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x))));
+}
+
diff --git a/reference/opt/shaders-opencl/asm/comp/bitcast_iadd.asm.comp b/reference/opt/shaders-opencl/asm/comp/bitcast_iadd.asm.comp
new file mode 100644
index 000000000..5c0520b3a
--- /dev/null
+++ b/reference/opt/shaders-opencl/asm/comp/bitcast_iadd.asm.comp
@@ -0,0 +1,32 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct _3
+{
+    int4 _m0;
+    uint4 _m1;
+};
+
+typedef struct _3 _3;
+
+struct _4
+{
+    uint4 _m0;
+    int4 _m1;
+};
+
+typedef struct _4 _4;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global _3* _5, __global _4* _6)
+{
+    _6->_m0 = _5->_m1 + as_uint4(_5->_m0);
+    _6->_m0 = as_uint4(_5->_m0) + _5->_m1;
+    _6->_m0 = _5->_m1 + _5->_m1;
+    _6->_m0 = as_uint4(_5->_m0 + _5->_m0);
+    _6->_m1 = as_int4(_5->_m1 + _5->_m1);
+    _6->_m1 = _5->_m0 + _5->_m0;
+    _6->_m1 = as_int4(_5->_m1) + _5->_m0;
+    _6->_m1 = _5->_m0 + as_int4(_5->_m1);
+}
+
diff --git a/reference/opt/shaders-opencl/asm/comp/bitcast_icmp.asm.comp b/reference/opt/shaders-opencl/asm/comp/bitcast_icmp.asm.comp
new file mode 100644
index 000000000..c2195a52c
--- /dev/null
+++ b/reference/opt/shaders-opencl/asm/comp/bitcast_icmp.asm.comp
@@ -0,0 +1,32 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct _3
+{
+    int4 _m0;
+    uint4 _m1;
+};
+
+typedef struct _3 _3;
+
+struct _4
+{
+    uint4 _m0;
+    int4 _m1;
+};
+
+typedef struct _4 _4;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global _3* _5, __global _4* _6)
+{
+    _6->_m0 = select((uint4)(0u), (uint4)(1u), as_int4(_5->_m1) < _5->_m0);
+    _6->_m0 = select((uint4)(0u), (uint4)(1u), as_int4(_5->_m1) <= _5->_m0);
+    _6->_m0 = select((uint4)(0u), (uint4)(1u), _5->_m1 < as_uint4(_5->_m0));
+    _6->_m0 = select((uint4)(0u), (uint4)(1u), _5->_m1 <= as_uint4(_5->_m0));
+    _6->_m0 = select((uint4)(0u), (uint4)(1u), as_int4(_5->_m1) > _5->_m0);
+    _6->_m0 = select((uint4)(0u), (uint4)(1u), as_int4(_5->_m1) >= _5->_m0);
+    _6->_m0 = select((uint4)(0u), (uint4)(1u), _5->_m1 > as_uint4(_5->_m0));
+    _6->_m0 = select((uint4)(0u), (uint4)(1u), _5->_m1 >= as_uint4(_5->_m0));
+}
+
diff --git a/reference/opt/shaders-opencl/asm/comp/bitcast_sar.asm.comp b/reference/opt/shaders-opencl/asm/comp/bitcast_sar.asm.comp
new file mode 100644
index 000000000..93916384b
--- /dev/null
+++ b/reference/opt/shaders-opencl/asm/comp/bitcast_sar.asm.comp
@@ -0,0 +1,34 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct _3
+{
+    int4 _m0;
+    uint4 _m1;
+};
+
+typedef struct _3 _3;
+
+struct _4
+{
+    uint4 _m0;
+    int4 _m1;
+};
+
+typedef struct _4 _4;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global _3* _5, __global _4* _6)
+{
+    int4 _22 = _5->_m0;
+    uint4 _23 = _5->_m1;
+    _6->_m0 = as_uint4(as_int4(_23) >> _22);
+    _6->_m0 = as_uint4(_22 >> as_int4(_23));
+    _6->_m0 = as_uint4(as_int4(_23) >> as_int4(_23));
+    _6->_m0 = as_uint4(_22 >> _22);
+    _6->_m1 = as_int4(_23) >> as_int4(_23);
+    _6->_m1 = _22 >> _22;
+    _6->_m1 = as_int4(_23) >> _22;
+    _6->_m1 = _22 >> as_int4(_23);
+}
+
diff --git a/reference/opt/shaders-opencl/asm/comp/bitcast_sdiv.asm.comp b/reference/opt/shaders-opencl/asm/comp/bitcast_sdiv.asm.comp
new file mode 100644
index 000000000..f5a1a3a67
--- /dev/null
+++ b/reference/opt/shaders-opencl/asm/comp/bitcast_sdiv.asm.comp
@@ -0,0 +1,34 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct _3
+{
+    int4 _m0;
+    uint4 _m1;
+};
+
+typedef struct _3 _3;
+
+struct _4
+{
+    uint4 _m0;
+    int4 _m1;
+};
+
+typedef struct _4 _4;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global _3* _5, __global _4* _6)
+{
+    int4 _22 = _5->_m0;
+    uint4 _23 = _5->_m1;
+    _6->_m0 = as_uint4(as_int4(_23) / _22);
+    _6->_m0 = as_uint4(_22 / as_int4(_23));
+    _6->_m0 = as_uint4(as_int4(_23) / as_int4(_23));
+    _6->_m0 = as_uint4(_22 / _22);
+    _6->_m1 = as_int4(_23) / as_int4(_23);
+    _6->_m1 = _22 / _22;
+    _6->_m1 = as_int4(_23) / _22;
+    _6->_m1 = _22 / as_int4(_23);
+}
+
diff --git a/reference/opt/shaders-opencl/asm/comp/bitcast_slr.asm.comp b/reference/opt/shaders-opencl/asm/comp/bitcast_slr.asm.comp
new file mode 100644
index 000000000..525761cc2
--- /dev/null
+++ b/reference/opt/shaders-opencl/asm/comp/bitcast_slr.asm.comp
@@ -0,0 +1,34 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct _3
+{
+    int4 _m0;
+    uint4 _m1;
+};
+
+typedef struct _3 _3;
+
+struct _4
+{
+    uint4 _m0;
+    int4 _m1;
+};
+
+typedef struct _4 _4;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global _3* _5, __global _4* _6)
+{
+    int4 _22 = _5->_m0;
+    uint4 _23 = _5->_m1;
+    _6->_m0 = _23 >> as_uint4(_22);
+    _6->_m0 = as_uint4(_22) >> _23;
+    _6->_m0 = _23 >> _23;
+    _6->_m0 = as_uint4(_22) >> as_uint4(_22);
+    _6->_m1 = as_int4(_23 >> _23);
+    _6->_m1 = as_int4(as_uint4(_22) >> as_uint4(_22));
+    _6->_m1 = as_int4(_23 >> as_uint4(_22));
+    _6->_m1 = as_int4(as_uint4(_22) >> _23);
+}
+
diff --git a/reference/opt/shaders-opencl/asm/comp/block-name-alias-global.asm.comp b/reference/opt/shaders-opencl/asm/comp/block-name-alias-global.asm.comp
new file mode 100644
index 000000000..166f01b62
--- /dev/null
+++ b/reference/opt/shaders-opencl/asm/comp/block-name-alias-global.asm.comp
@@ -0,0 +1,48 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct A
+{
+    int a;
+    int b;
+};
+
+typedef struct A A;
+
+struct A_1
+{
+    A Data[1];
+};
+
+typedef struct A_1 A_1;
+
+struct A_2
+{
+    A Data[1024];
+};
+
+typedef struct A_2 A_2;
+
+struct B
+{
+    A Data[1];
+};
+
+typedef struct B B;
+
+struct B_1
+{
+    A Data[1024];
+};
+
+typedef struct B_1 B_1;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global A* C1, A_2 C2, __global A* C3, B_1 C4)
+{
+    C1[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].a = C2.Data[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].a;
+    C1[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].b = C2.Data[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].b;
+    C3[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].a = C4.Data[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].a;
+    C3[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].b = C4.Data[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].b;
+}
+
diff --git a/reference/opt/shaders-opencl/asm/comp/buffer-write-relative-addr.asm.comp b/reference/opt/shaders-opencl/asm/comp/buffer-write-relative-addr.asm.comp
new file mode 100644
index 000000000..af86ed757
--- /dev/null
+++ b/reference/opt/shaders-opencl/asm/comp/buffer-write-relative-addr.asm.comp
@@ -0,0 +1,21 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct cb5_struct
+{
+    float4 _m0[5];
+};
+
+typedef struct cb5_struct cb5_struct;
+
+__attribute__((reqd_work_group_size(4, 1, 1)))
+__kernel void comp_main(cb5_struct cb0_5, write_only image1d_buffer_t u0)
+{
+    uint _41 = as_uint(as_float(as_int(((uint3)(get_local_id(0), get_local_id(1), get_local_id(2))).x) << 4)) >> 2u;
+    uint4 _50 = as_uint4(cb0_5._m0[as_uint(as_int(as_float(as_int(((uint3)(get_local_id(0), get_local_id(1), get_local_id(2))).x)))) + 1u]);
+    write_imageui(u0, as_int(_41), _50.xxxx);
+    write_imageui(u0, as_int(_41 + 1u), _50.yyyy);
+    write_imageui(u0, as_int(_41 + 2u), _50.zzzz);
+    write_imageui(u0, as_int(_41 + 3u), _50.wwww);
+}
+
diff --git a/reference/opt/shaders-opencl/asm/comp/buffer-write.asm.comp b/reference/opt/shaders-opencl/asm/comp/buffer-write.asm.comp
new file mode 100644
index 000000000..ce88fd4e3
--- /dev/null
+++ b/reference/opt/shaders-opencl/asm/comp/buffer-write.asm.comp
@@ -0,0 +1,16 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct cb
+{
+    float value;
+};
+
+typedef struct cb cb;
+
+__attribute__((reqd_work_group_size(32, 1, 1)))
+__kernel void comp_main(cb _8, write_only image1d_buffer_t _buffer)
+{
+    write_imagef(_buffer, as_int((32u * ((uint3)(get_group_id(0), get_group_id(1), get_group_id(2))).x) + ((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))), (float4)(_8.value));
+}
+
diff --git a/reference/opt/shaders-opencl/asm/comp/copy-object-ssbo-to-ssbo.asm.comp b/reference/opt/shaders-opencl/asm/comp/copy-object-ssbo-to-ssbo.asm.comp
new file mode 100644
index 000000000..8da6f6cfa
--- /dev/null
+++ b/reference/opt/shaders-opencl/asm/comp/copy-object-ssbo-to-ssbo.asm.comp
@@ -0,0 +1,24 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct _19
+{
+};
+typedef struct _19 _19;
+
+struct _5
+{
+    int _m0;
+    _19 _m1;
+    _19 _m2;
+    int _m3;
+};
+
+typedef struct _5 _5;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global _5* _3, __global _5* _4)
+{
+    *_4 = (*_3);
+}
+
diff --git a/reference/opt/shaders-opencl/asm/comp/copy-object-ubo-to-ssbo.asm.comp b/reference/opt/shaders-opencl/asm/comp/copy-object-ubo-to-ssbo.asm.comp
new file mode 100644
index 000000000..3ab995c11
--- /dev/null
+++ b/reference/opt/shaders-opencl/asm/comp/copy-object-ubo-to-ssbo.asm.comp
@@ -0,0 +1,24 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct _19
+{
+};
+typedef struct _19 _19;
+
+struct _5
+{
+    int _m0;
+    _19 _m1;
+    _19 _m2;
+    int _m3;
+};
+
+typedef struct _5 _5;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(_5 _3, __global _5* _4)
+{
+    *_4 = _3;
+}
+
diff --git a/reference/opt/shaders-opencl/asm/comp/duplicate-spec-id.asm.comp b/reference/opt/shaders-opencl/asm/comp/duplicate-spec-id.asm.comp
new file mode 100644
index 000000000..177a60dc6
--- /dev/null
+++ b/reference/opt/shaders-opencl/asm/comp/duplicate-spec-id.asm.comp
@@ -0,0 +1,26 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct StorageBuffer
+{
+    float values[1];
+};
+
+typedef struct StorageBuffer StorageBuffer;
+
+#ifndef SPIRV_CROSS_CONSTANT_ID_0
+#define SPIRV_CROSS_CONSTANT_ID_0 1
+#endif
+constant int foo = SPIRV_CROSS_CONSTANT_ID_0;
+#ifndef SPIRV_CROSS_CONSTANT_ID_0
+#define SPIRV_CROSS_CONSTANT_ID_0 2.0f
+#endif
+constant float bar = SPIRV_CROSS_CONSTANT_ID_0;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global float* ssbo)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    ssbo[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] = convert_float(foo) + bar;
+}
+
diff --git a/reference/opt/shaders-opencl/asm/comp/fma.spv16.asm.comp b/reference/opt/shaders-opencl/asm/comp/fma.spv16.asm.comp
new file mode 100644
index 000000000..9343d7f25
--- /dev/null
+++ b/reference/opt/shaders-opencl/asm/comp/fma.spv16.asm.comp
@@ -0,0 +1,23 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO2
+{
+    float4 out_data[1];
+};
+
+typedef struct SSBO2 SSBO2;
+
+struct SSBO
+{
+    float4 in_data[1];
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global float4* _4, __global const float4* _6)
+{
+    _4[0] = fma(_6[0], _6[1], _6[1]);
+}
+
diff --git a/reference/opt/shaders-opencl/asm/comp/global-parameter-name-alias.asm.comp b/reference/opt/shaders-opencl/asm/comp/global-parameter-name-alias.asm.comp
new file mode 100644
index 000000000..7135f7ae1
--- /dev/null
+++ b/reference/opt/shaders-opencl/asm/comp/global-parameter-name-alias.asm.comp
@@ -0,0 +1,8 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main()
+{
+}
+
diff --git a/reference/opt/shaders-opencl/asm/comp/image-load-store-short-vector.invalid.asm.comp b/reference/opt/shaders-opencl/asm/comp/image-load-store-short-vector.invalid.asm.comp
new file mode 100644
index 000000000..f7d65805e
--- /dev/null
+++ b/reference/opt/shaders-opencl/asm/comp/image-load-store-short-vector.invalid.asm.comp
@@ -0,0 +1,18 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+void _main( uint3* id)
+{
+    float2 loaded = read_imagef(TargetTexture, as_int2((*id).xy)).xy;
+    float2 storeTemp = loaded + (float2)(1.0f);
+    write_imagef(TargetTexture, as_int2((*id).xy + (uint2)(1u)), (float4)(storeTemp));
+}
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(write_only image2d_t TargetTexture)
+{
+    uint3 id_1 = ((uint3)(get_group_id(0), get_group_id(1), get_group_id(2)));
+    uint3 param = id_1;
+    _main(&param);
+}
+
diff --git a/reference/opt/shaders-opencl/asm/comp/op-spec-constant-op-vector-related.asm.comp b/reference/opt/shaders-opencl/asm/comp/op-spec-constant-op-vector-related.asm.comp
new file mode 100644
index 000000000..b2059cd0d
--- /dev/null
+++ b/reference/opt/shaders-opencl/asm/comp/op-spec-constant-op-vector-related.asm.comp
@@ -0,0 +1,77 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct _29
+{
+    int _m0[3][3];
+};
+
+typedef struct _29 _29;
+
+struct _7
+{
+    int _m0[1];
+};
+
+typedef struct _7 _7;
+
+constant int3 _32 = (int3)(0);
+constant int _33[3] = { 0, 0, 0 };
+constant int _34[3][3] = { { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 } };
+#ifndef SPIRV_CROSS_CONSTANT_ID_0
+#define SPIRV_CROSS_CONSTANT_ID_0 0
+#endif
+constant int _3 = SPIRV_CROSS_CONSTANT_ID_0;
+#ifndef SPIRV_CROSS_CONSTANT_ID_1
+#define SPIRV_CROSS_CONSTANT_ID_1 0
+#endif
+constant int _4 = SPIRV_CROSS_CONSTANT_ID_1;
+#ifndef SPIRV_CROSS_CONSTANT_ID_2
+#define SPIRV_CROSS_CONSTANT_ID_2 0
+#endif
+constant int _5 = SPIRV_CROSS_CONSTANT_ID_2;
+#define _36 ({ _3, 0, 0 })
+#define _37 ({ _3, _4, 0 })
+#define _38 ({ _3, _4, _5 })
+#define _39 ({ _4, 0, 0 })
+#define _40 ({ _4, _5, 0 })
+#define _41 ({ _4, _5, _3 })
+#define _42 ({ _5, 0, 0 })
+#define _43 ({ _5, _3, 0 })
+#define _44 ({ _5, _3, _4 })
+#define _45 ({ { _3, _4, _5 }, { 0, 0, 0 }, { 0, 0, 0 } })
+#define _46 ({ { _3, _4, _5 }, { _4, _5, _3 }, { 0, 0, 0 } })
+#define _47 ({ { _3, _4, _5 }, { _4, _5, _3 }, { _5, _3, _4 } })
+#define _48 ((_29){ { { _3, _4, _5 }, { _4, _5, _3 }, { _5, _3, _4 } } })
+#define _50 (_48._m0[0][0])
+#define _51 (_48._m0[1][0])
+#define _52 (_48._m0[0][1])
+#define _53 (_48._m0[2][2])
+#define _54 (_48._m0[2][0])
+#define _55 (_48._m0[1][1])
+#define _56 ((_50 == _51))
+#define _57 ((_52 == _53))
+#define _58 ((_54 == _55))
+#define _59 ((int)(_56))
+#define _60 ((int)(_57))
+#define _61 (_58 ? 2 : 1)
+#define _62 ((int3)(_3, 0, 0))
+#define _63 ((int3)(0, _4, 0))
+#define _64 ((int3)(0, 0, _5))
+#define _65 ((int3)(_62.x, 0, _62.z))
+#define _66 ((int3)(0, _63.y, _63.x))
+#define _67 ((int3)(_64.z, 0, _64.z))
+#define _68 ((int3)(_65.y, _65.x, _66.y))
+#define _69 ((int3)(_67.z, _68.y, _68.z))
+#define _70 (_69.x)
+#define _71 (_69.y)
+#define _72 (_69.z)
+#define _73 ((_70 - _71))
+#define _74 ((_73 * _72))
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global int* _8, __global int* _9)
+{
+    _9[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] = _8[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] + ((((1 - _59) * _60) * (_61 - 1)) * _74);
+}
+
diff --git a/reference/opt/shaders-opencl/asm/comp/quantize.asm.comp b/reference/opt/shaders-opencl/asm/comp/quantize.asm.comp
new file mode 100644
index 000000000..3743c7776
--- /dev/null
+++ b/reference/opt/shaders-opencl/asm/comp/quantize.asm.comp
@@ -0,0 +1,35 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO0
+{
+    float scalar;
+    float2 vec2_val;
+    float3 vec3_val;
+    float4 vec4_val;
+};
+
+typedef struct SSBO0 SSBO0;
+
+uint spvPackHalf2x16(float2 v) {
+    uint r;
+    vstore_half(v.x, 0, (__private half *)&r);
+    vstore_half(v.y, 1, (__private half *)&r);
+    return r;
+}
+
+float2 spvUnpackHalf2x16(uint u) {
+    const __private uint *p = &u;
+    return (float2)(vload_half(0, (const __private half *)p),
+                   vload_half(1, (const __private half *)p));
+}
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global SSBO0* _12)
+{
+    _12->scalar = spvUnpackHalf2x16(spvPackHalf2x16((float2)(_12->scalar, 0.0f))).x;
+    _12->vec2_val = spvUnpackHalf2x16(spvPackHalf2x16(_12->vec2_val));
+    _12->vec3_val = (float3)(spvUnpackHalf2x16(spvPackHalf2x16(_12->vec3_val.xy)), spvUnpackHalf2x16(spvPackHalf2x16((float2)(_12->vec3_val.z, 0.0f))).x);
+    _12->vec4_val = (float4)(spvUnpackHalf2x16(spvPackHalf2x16(_12->vec4_val.xy)), spvUnpackHalf2x16(spvPackHalf2x16(_12->vec4_val.zw)));
+}
+
diff --git a/reference/opt/shaders-opencl/asm/comp/relaxed-block-layout.asm.comp b/reference/opt/shaders-opencl/asm/comp/relaxed-block-layout.asm.comp
new file mode 100644
index 000000000..ddae4bb54
--- /dev/null
+++ b/reference/opt/shaders-opencl/asm/comp/relaxed-block-layout.asm.comp
@@ -0,0 +1,23 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct foo
+{
+    uint bar;
+    float3 baz;
+    uchar quux;
+    uchar4 blah;
+    half2 wibble;
+};
+
+typedef struct foo foo;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global foo* _10)
+{
+    _10->bar = ((uint3)(get_local_id(0), get_local_id(1), get_local_id(2))).x;
+    _10->baz = convert_float3(((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))));
+    _10->blah = convert_uchar4((uint4)(convert_uint4(_10->blah).xyz + ((uint3)(get_group_id(0), get_group_id(1), get_group_id(2))), 0u));
+    _10->wibble = convert_half2(convert_float2(_10->wibble) * convert_float2(((uint3)(get_num_groups(0), get_num_groups(1), get_num_groups(2))).xy));
+}
+
diff --git a/reference/opt/shaders-opencl/asm/comp/replicated-composites.spv16.asm.comp b/reference/opt/shaders-opencl/asm/comp/replicated-composites.spv16.asm.comp
new file mode 100644
index 000000000..545ecf547
--- /dev/null
+++ b/reference/opt/shaders-opencl/asm/comp/replicated-composites.spv16.asm.comp
@@ -0,0 +1,30 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+#ifndef SPIRV_CROSS_CONSTANT_ID_0
+#define SPIRV_CROSS_CONSTANT_ID_0 0.0f
+#endif
+constant float spec_const = SPIRV_CROSS_CONSTANT_ID_0;
+constant float4 _20 = (float4)(spec_const);
+constant float _26[8] = { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f };
+
+struct UBO
+{
+    float uniform_float;
+};
+
+typedef struct UBO UBO;
+
+constant float _42 = 0;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(UBO ubo)
+{
+    float4 a_1 = (float4)(0.0f);
+    float4 b_1 = (float4)(1.0f);
+    float4 c_1 = _20;
+    float4 _36 = (float4)(ubo.uniform_float);
+    float4 d_1 = _36;
+    float4 e_1 = _36;
+}
+
diff --git a/reference/opt/shaders-opencl/asm/comp/specialization-constant-workgroup.asm.comp b/reference/opt/shaders-opencl/asm/comp/specialization-constant-workgroup.asm.comp
new file mode 100644
index 000000000..20235cb7f
--- /dev/null
+++ b/reference/opt/shaders-opencl/asm/comp/specialization-constant-workgroup.asm.comp
@@ -0,0 +1,26 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO
+{
+    float a;
+};
+
+typedef struct SSBO SSBO;
+
+#ifndef SPIRV_CROSS_CONSTANT_ID_10
+#define SPIRV_CROSS_CONSTANT_ID_10 9u
+#endif
+constant uint _19 = SPIRV_CROSS_CONSTANT_ID_10;
+#ifndef SPIRV_CROSS_CONSTANT_ID_12
+#define SPIRV_CROSS_CONSTANT_ID_12 4u
+#endif
+constant uint _21 = SPIRV_CROSS_CONSTANT_ID_12;
+constant uint3 spvWorkgroupSize = (uint3)(_19, 20u, _21);
+
+__attribute__((reqd_work_group_size(9, 20, 4)))
+__kernel void comp_main(__global float* _6)
+{
+    _6[0] += 1.0f;
+}
+
diff --git a/reference/opt/shaders-opencl/asm/comp/struct-resource-name-aliasing.asm.comp b/reference/opt/shaders-opencl/asm/comp/struct-resource-name-aliasing.asm.comp
new file mode 100644
index 000000000..853e0afac
--- /dev/null
+++ b/reference/opt/shaders-opencl/asm/comp/struct-resource-name-aliasing.asm.comp
@@ -0,0 +1,17 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct bufA
+{
+    uint _data[1];
+};
+
+typedef struct bufA bufA;
+
+__attribute__((reqd_work_group_size(8, 8, 1)))
+__kernel void comp_main(__global uint* bufA, __global uint* bufB)
+{
+    bufA[0] = 0u;
+    bufB[0] = 0u;
+}
+
diff --git a/reference/opt/shaders-opencl/asm/comp/uint_smulextended.asm.comp b/reference/opt/shaders-opencl/asm/comp/uint_smulextended.asm.comp
new file mode 100644
index 000000000..ab2d4a703
--- /dev/null
+++ b/reference/opt/shaders-opencl/asm/comp/uint_smulextended.asm.comp
@@ -0,0 +1,28 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct _4
+{
+    uint _m0[1];
+};
+
+typedef struct _4 _4;
+
+struct _20
+{
+    uint _m0;
+    uint _m1;
+};
+
+typedef struct _20 _20;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global uint* _5, __global uint* _6, __global uint* _7, __global uint* _8)
+{
+    _20 _28;
+    _28._m0 = _5[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] * _6[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x];
+    _28._m1 = mul_hi(_5[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x], _6[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x]);
+    _7[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] = _28._m0;
+    _8[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] = _28._m1;
+}
+
diff --git a/reference/opt/shaders-opencl/asm/comp/undefined-constant-composite.asm.comp b/reference/opt/shaders-opencl/asm/comp/undefined-constant-composite.asm.comp
new file mode 100644
index 000000000..53694c4b8
--- /dev/null
+++ b/reference/opt/shaders-opencl/asm/comp/undefined-constant-composite.asm.comp
@@ -0,0 +1,33 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct _20
+{
+    int _m0;
+    int _m1;
+};
+
+typedef struct _20 _20;
+
+struct _5
+{
+    int _m0[10];
+};
+
+typedef struct _5 _5;
+
+struct _7
+{
+    int _m0[10];
+};
+
+typedef struct _7 _7;
+
+constant int _28 = 0;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global int* _6, __global int* _8)
+{
+    _6[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] = _8[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] + ((_20){ _28, 200 })._m1;
+}
+
diff --git a/reference/opt/shaders-opencl/asm/comp/undefined-spec-constant-composite.asm.comp b/reference/opt/shaders-opencl/asm/comp/undefined-spec-constant-composite.asm.comp
new file mode 100644
index 000000000..852b7b315
--- /dev/null
+++ b/reference/opt/shaders-opencl/asm/comp/undefined-spec-constant-composite.asm.comp
@@ -0,0 +1,38 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct _21
+{
+    int _m0;
+    int _m1;
+};
+
+typedef struct _21 _21;
+
+struct _5
+{
+    int _m0[10];
+};
+
+typedef struct _5 _5;
+
+struct _7
+{
+    int _m0[10];
+};
+
+typedef struct _7 _7;
+
+constant int _29 = 0;
+#ifndef SPIRV_CROSS_CONSTANT_ID_0
+#define SPIRV_CROSS_CONSTANT_ID_0 0
+#endif
+constant int _9 = SPIRV_CROSS_CONSTANT_ID_0;
+constant _21 _30 = (_21){ _9, _29 };
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global int* _6, __global int* _8)
+{
+    _6[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] = (_8[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] + _30._m0) + ((_21){ _29, 200 })._m1;
+}
+
diff --git a/reference/opt/shaders-opencl/asm/comp/variable-pointers-2.asm.comp b/reference/opt/shaders-opencl/asm/comp/variable-pointers-2.asm.comp
new file mode 100644
index 000000000..1e39d3aab
--- /dev/null
+++ b/reference/opt/shaders-opencl/asm/comp/variable-pointers-2.asm.comp
@@ -0,0 +1,56 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct foo
+{
+    int a[128];
+    uint b;
+    float2 c;
+};
+
+typedef struct foo foo;
+
+struct bar
+{
+    int d;
+};
+
+typedef struct bar bar;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global foo* buf, bar cb)
+{
+    uint3 _3 = ((uint3)(get_global_id(0), get_global_id(1), get_global_id(2)));
+    uint3 _4 = ((uint3)(get_local_id(0), get_local_id(1), get_local_id(2)));
+    bool _71 = cb.d != 0;
+    __global foo* _72 = _71 ? buf : NULL;
+    __global foo* _67 = _72;
+    __global foo* _65 = _72;
+    __private uint3* _79 = _71 ? &_3 : &_4;
+    __private uint3* _74 = _79;
+    __global int* _49;
+    __global int* _52;
+    _49 = &_72->a[0u];
+    _52 = &buf->a[0u];
+    int _54;
+    int _55;
+    for (;;)
+    {
+        _54 = *_49;
+        _55 = *_52;
+        if (_54 != _55)
+        {
+            int _63 = (_54 + _55) + as_int((*_79).x);
+            *_49 = _63;
+            *_52 = _63;
+            _49 = &_49[1u];
+            _52 = &_52[1u];
+            continue;
+        }
+        else
+        {
+            break;
+        }
+    }
+}
+
diff --git a/reference/opt/shaders-opencl/asm/comp/variable-pointers-store-forwarding.asm.comp b/reference/opt/shaders-opencl/asm/comp/variable-pointers-store-forwarding.asm.comp
new file mode 100644
index 000000000..1f27af228
--- /dev/null
+++ b/reference/opt/shaders-opencl/asm/comp/variable-pointers-store-forwarding.asm.comp
@@ -0,0 +1,28 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct foo
+{
+    int a;
+};
+
+typedef struct foo foo;
+
+struct bar
+{
+    int b;
+};
+
+typedef struct bar bar;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global int* x, __global int* y)
+{
+    __global int* _47 = (((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x != 0u) ? &x[0] : &y[0];
+    __global int* _40 = _47;
+    __global int* _33 = _47;
+    int _37 = x[0];
+    *_47 = 0;
+    y[0] = _37 + _37;
+}
+
diff --git a/reference/opt/shaders-opencl/asm/comp/vector-builtin-type-cast-func.asm.comp b/reference/opt/shaders-opencl/asm/comp/vector-builtin-type-cast-func.asm.comp
new file mode 100644
index 000000000..6afcb643c
--- /dev/null
+++ b/reference/opt/shaders-opencl/asm/comp/vector-builtin-type-cast-func.asm.comp
@@ -0,0 +1,28 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct cb1_struct
+{
+    float4 _RESERVED_IDENTIFIER_FIXUP_m0[1];
+};
+
+typedef struct cb1_struct cb1_struct;
+
+__attribute__((reqd_work_group_size(16, 16, 1)))
+__kernel void comp_main(write_only image2d_t u0, cb1_struct cb0_1)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    int2 _37 = (int2)(get_image_width(u0), get_image_height(u0)) >> as_int2((uint2)(4u));
+    int _98;
+    _98 = 0;
+    for (; _98 < _37.y; _98++)
+    {
+        for (int _99 = 0; _99 < _37.x; )
+        {
+            write_imagef(u0, (_37 * as_int3(((uint3)(get_local_id(0), get_local_id(1), get_local_id(2)))).xy) + (int2)(_98, _99), cb0_1._RESERVED_IDENTIFIER_FIXUP_m0[0].xxxx);
+            _99++;
+            continue;
+        }
+    }
+}
+
diff --git a/reference/opt/shaders-opencl/asm/comp/vector-builtin-type-cast.asm.comp b/reference/opt/shaders-opencl/asm/comp/vector-builtin-type-cast.asm.comp
new file mode 100644
index 000000000..549b251a5
--- /dev/null
+++ b/reference/opt/shaders-opencl/asm/comp/vector-builtin-type-cast.asm.comp
@@ -0,0 +1,28 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct cb1_struct
+{
+    float4 _RESERVED_IDENTIFIER_FIXUP_m0[1];
+};
+
+typedef struct cb1_struct cb1_struct;
+
+__attribute__((reqd_work_group_size(16, 16, 1)))
+__kernel void comp_main(write_only image2d_t u0, cb1_struct cb0_1)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    int2 _20 = (int2)(get_image_width(u0), get_image_height(u0)) >> as_int2((uint2)(4u));
+    int _80;
+    _80 = 0;
+    for (; _80 < _20.y; _80++)
+    {
+        for (int _81 = 0; _81 < _20.x; )
+        {
+            write_imagef(u0, (_20 * as_int3(((uint3)(get_local_id(0), get_local_id(1), get_local_id(2)))).xy) + (int2)(_80, _81), cb0_1._RESERVED_IDENTIFIER_FIXUP_m0[0].xxxx);
+            _81++;
+            continue;
+        }
+    }
+}
+
diff --git a/reference/opt/shaders-opencl/comp/access-private-workgroup-in-function.comp b/reference/opt/shaders-opencl/comp/access-private-workgroup-in-function.comp
new file mode 100644
index 000000000..25ff92694
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/access-private-workgroup-in-function.comp
@@ -0,0 +1,9 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main()
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+}
+
diff --git a/reference/opt/shaders-opencl/comp/arguments.comp b/reference/opt/shaders-opencl/comp/arguments.comp
new file mode 100644
index 000000000..aa81c7a82
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/arguments.comp
@@ -0,0 +1,24 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct Buf
+{
+    uint data[1];
+};
+
+typedef struct Buf Buf;
+
+struct parameter
+{
+    uint n;
+};
+
+typedef struct parameter parameter;
+
+__attribute__((reqd_work_group_size(64, 1, 1)))
+__kernel void comp_main(__global uint* _19, parameter p)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    _19[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] = ((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x + p.n;
+}
+
diff --git a/reference/opt/shaders-opencl/comp/atomic.comp b/reference/opt/shaders-opencl/comp/atomic.comp
new file mode 100644
index 000000000..5c5d824eb
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/atomic.comp
@@ -0,0 +1,53 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO
+{
+    uint u32;
+    int i32;
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global SSBO* ssbo)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    __local uint shared_u32;
+    __local int shared_i32;
+    uint _16 = atomic_add(&(ssbo->u32), 1u);
+    uint _18 = atomic_or(&(ssbo->u32), 1u);
+    uint _20 = atomic_xor(&(ssbo->u32), 1u);
+    uint _22 = atomic_and(&(ssbo->u32), 1u);
+    uint _24 = atomic_min(&(ssbo->u32), 1u);
+    uint _26 = atomic_max(&(ssbo->u32), 1u);
+    uint _28 = atomic_xchg(&(ssbo->u32), 1u);
+    uint _32 = atomic_cmpxchg(&(ssbo->u32), 10u, 2u);
+    int _36 = atomic_add(&(ssbo->i32), 1);
+    int _38 = atomic_or(&(ssbo->i32), 1);
+    int _40 = atomic_xor(&(ssbo->i32), 1);
+    int _42 = atomic_and(&(ssbo->i32), 1);
+    int _44 = atomic_min(&(ssbo->i32), 1);
+    int _46 = atomic_max(&(ssbo->i32), 1);
+    int _48 = atomic_xchg(&(ssbo->i32), 1);
+    int _52 = atomic_cmpxchg(&(ssbo->i32), 10, 2);
+    shared_u32 = 10u;
+    shared_i32 = 10;
+    uint _57 = atomic_add(&shared_u32, 1u);
+    uint _58 = atomic_or(&shared_u32, 1u);
+    uint _59 = atomic_xor(&shared_u32, 1u);
+    uint _60 = atomic_and(&shared_u32, 1u);
+    uint _61 = atomic_min(&shared_u32, 1u);
+    uint _62 = atomic_max(&shared_u32, 1u);
+    uint _63 = atomic_xchg(&shared_u32, 1u);
+    uint _64 = atomic_cmpxchg(&shared_u32, 10u, 2u);
+    int _65 = atomic_add(&shared_i32, 1);
+    int _66 = atomic_or(&shared_i32, 1);
+    int _67 = atomic_xor(&shared_i32, 1);
+    int _68 = atomic_and(&shared_i32, 1);
+    int _69 = atomic_min(&shared_i32, 1);
+    int _70 = atomic_max(&shared_i32, 1);
+    int _71 = atomic_xchg(&shared_i32, 1);
+    int _72 = atomic_cmpxchg(&shared_i32, 10, 2);
+}
+
diff --git a/reference/opt/shaders-opencl/comp/barriers.comp b/reference/opt/shaders-opencl/comp/barriers.comp
new file mode 100644
index 000000000..1bd4de28a
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/barriers.comp
@@ -0,0 +1,25 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+__attribute__((reqd_work_group_size(4, 1, 1)))
+__kernel void comp_main()
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    mem_fence(CLK_LOCAL_MEM_FENCE);
+    mem_fence(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
+    mem_fence(CLK_GLOBAL_MEM_FENCE);
+    mem_fence(CLK_GLOBAL_MEM_FENCE);
+    mem_fence(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
+    mem_fence(CLK_LOCAL_MEM_FENCE);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    mem_fence(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    mem_fence(CLK_GLOBAL_MEM_FENCE);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    mem_fence(CLK_GLOBAL_MEM_FENCE);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    mem_fence(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+
diff --git a/reference/opt/shaders-opencl/comp/basic.comp b/reference/opt/shaders-opencl/comp/basic.comp
new file mode 100644
index 000000000..1c6c16212
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/basic.comp
@@ -0,0 +1,36 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO
+{
+    float4 in_data[1];
+};
+
+typedef struct SSBO SSBO;
+
+struct SSBO2
+{
+    float4 out_data[1];
+};
+
+typedef struct SSBO2 SSBO2;
+
+struct SSBO3
+{
+    uint counter;
+};
+
+typedef struct SSBO3 SSBO3;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global const float4* _23, __global float4* _45, __global uint* _48)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    float4 _29 = _23[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x];
+    if (dot(_29, (float4)(1.0f, 5.0f, 6.0f, 2.0f)) > 8.19999980926513671875f)
+    {
+        uint _52 = atomic_add(&(_48[0]), 1u);
+        _45[_52] = _29;
+    }
+}
+
diff --git a/reference/opt/shaders-opencl/comp/basic.dispatchbase.comp b/reference/opt/shaders-opencl/comp/basic.dispatchbase.comp
new file mode 100644
index 000000000..dfdb35d6f
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/basic.dispatchbase.comp
@@ -0,0 +1,41 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO
+{
+    float4 in_data[1];
+};
+
+typedef struct SSBO SSBO;
+
+struct SSBO2
+{
+    float4 out_data[1];
+};
+
+typedef struct SSBO2 SSBO2;
+
+struct SSBO3
+{
+    uint counter;
+};
+
+typedef struct SSBO3 SSBO3;
+
+#ifndef SPIRV_CROSS_CONSTANT_ID_10
+#define SPIRV_CROSS_CONSTANT_ID_10 1u
+#endif
+constant uint _59 = SPIRV_CROSS_CONSTANT_ID_10;
+constant uint3 spvWorkgroupSize = (uint3)(_59, 1u, 1u);
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global const float4* _27, __global float4* _49, __global uint* _52)
+{
+    float4 _33 = _27[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x];
+    if (dot(_33, (float4)(1.0f, 5.0f, 6.0f, 2.0f)) > 8.19999980926513671875f)
+    {
+        uint _56 = atomic_add(&(_52[0]), 1u);
+        _49[_56] = _33;
+    }
+}
+
diff --git a/reference/opt/shaders-opencl/comp/buffer-push-const.comp b/reference/opt/shaders-opencl/comp/buffer-push-const.comp
new file mode 100644
index 000000000..aa81c7a82
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/buffer-push-const.comp
@@ -0,0 +1,24 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct Buf
+{
+    uint data[1];
+};
+
+typedef struct Buf Buf;
+
+struct parameter
+{
+    uint n;
+};
+
+typedef struct parameter parameter;
+
+__attribute__((reqd_work_group_size(64, 1, 1)))
+__kernel void comp_main(__global uint* _19, parameter p)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    _19[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] = ((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x + p.n;
+}
+
diff --git a/reference/opt/shaders-opencl/comp/builtins.comp b/reference/opt/shaders-opencl/comp/builtins.comp
new file mode 100644
index 000000000..1d457fdfa
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/builtins.comp
@@ -0,0 +1,9 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+__attribute__((reqd_work_group_size(8, 4, 2)))
+__kernel void comp_main()
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+}
+
diff --git a/reference/opt/shaders-opencl/comp/cfg-preserve-parameter.comp b/reference/opt/shaders-opencl/comp/cfg-preserve-parameter.comp
new file mode 100644
index 000000000..7135f7ae1
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/cfg-preserve-parameter.comp
@@ -0,0 +1,8 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main()
+{
+}
+
diff --git a/reference/opt/shaders-opencl/comp/complex-type-alias.comp b/reference/opt/shaders-opencl/comp/complex-type-alias.comp
new file mode 100644
index 000000000..39e2347ae
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/complex-type-alias.comp
@@ -0,0 +1,46 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct Foo0
+{
+    float a;
+};
+
+typedef struct Foo0 Foo0;
+
+struct Foo1
+{
+    Foo0 a;
+};
+
+typedef struct Foo1 Foo1;
+
+struct Foo2
+{
+    Foo1 a;
+    float weight;
+};
+
+typedef struct Foo2 Foo2;
+
+struct SSBO
+{
+    Foo2 outputs[1];
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(8, 8, 1)))
+__kernel void comp_main(__global Foo2* _53)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    __local Foo2 coeffs[64];
+    coeffs[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] = (Foo2){ (Foo1){ (Foo0){ 0.0f } }, 0.0f };
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0))) == 0u)
+    {
+        _53[((uint3)(get_group_id(0), get_group_id(1), get_group_id(2))).x].a.a.a = coeffs[0].a.a.a;
+        _53[((uint3)(get_group_id(0), get_group_id(1), get_group_id(2))).x].weight = coeffs[0].weight;
+    }
+}
+
diff --git a/reference/opt/shaders-opencl/comp/composite-construct.comp b/reference/opt/shaders-opencl/comp/composite-construct.comp
new file mode 100644
index 000000000..6f9957e3b
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/composite-construct.comp
@@ -0,0 +1,26 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO0
+{
+    float4 as[1];
+};
+
+typedef struct SSBO0 SSBO0;
+
+struct SSBO1
+{
+    float4 bs[1];
+};
+
+typedef struct SSBO1 SSBO1;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global float4* _16, __global float4* _32)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    float4 values[2] = { _16[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x], _32[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] };
+    _16[0] = values[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))];
+    _32[1] = (float4)(40.0f);
+}
+
diff --git a/reference/opt/shaders-opencl/comp/culling.comp b/reference/opt/shaders-opencl/comp/culling.comp
new file mode 100644
index 000000000..93e215d06
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/culling.comp
@@ -0,0 +1,36 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO
+{
+    float in_data[1];
+};
+
+typedef struct SSBO SSBO;
+
+struct SSBO2
+{
+    float out_data[1];
+};
+
+typedef struct SSBO2 SSBO2;
+
+struct SSBO3
+{
+    uint count;
+};
+
+typedef struct SSBO3 SSBO3;
+
+__attribute__((reqd_work_group_size(4, 1, 1)))
+__kernel void comp_main(__global const float* _22, __global float* _38, __global uint* _41)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    float _28 = _22[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x];
+    if (_28 > 12.0f)
+    {
+        uint _45 = atomic_add(&(_41[0]), 1u);
+        _38[_45] = _28;
+    }
+}
+
diff --git a/reference/opt/shaders-opencl/comp/defer-parens.comp b/reference/opt/shaders-opencl/comp/defer-parens.comp
new file mode 100644
index 000000000..252986498
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/defer-parens.comp
@@ -0,0 +1,24 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO
+{
+    float4 data;
+    int index;
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global SSBO* _13)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    float4 _17 = _13->data;
+    float2 _28 = _17.yz + (float2)(10.0f);
+    _13->data = (float4)(_17.x, _28, _17.w);
+    _13->data = (_17 + _17) + _17;
+    _13->data = _28.xxyy;
+    _13->data = (float4)(_28.y);
+    _13->data = (float4)((_17.zw + (float2)(10.0f))[_13->index]);
+}
+
diff --git a/reference/opt/shaders-opencl/comp/dowhile.comp b/reference/opt/shaders-opencl/comp/dowhile.comp
new file mode 100644
index 000000000..e5a51f6be
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/dowhile.comp
@@ -0,0 +1,44 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO
+{
+    float4 mvp;
+    float4 in_data[1];
+};
+
+typedef struct SSBO SSBO;
+
+struct SSBO2
+{
+    float4 out_data[1];
+};
+
+typedef struct SSBO2 SSBO2;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global const SSBO* _28, __global float4* _52)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    float4 _59;
+    int _60;
+    _60 = 0;
+    _59 = _28->in_data[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x];
+    float4 _42;
+    for (;;)
+    {
+        _42 = _28->mvp * _59;
+        int _44 = _60 + 1;
+        if (_44 < 16)
+        {
+            _60 = _44;
+            _59 = _42;
+        }
+        else
+        {
+            break;
+        }
+    }
+    _52[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] = _42;
+}
+
diff --git a/reference/opt/shaders-opencl/comp/expect-assume.comp b/reference/opt/shaders-opencl/comp/expect-assume.comp
new file mode 100644
index 000000000..a9415be79
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/expect-assume.comp
@@ -0,0 +1,17 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct buffer_t
+{
+    uint z;
+};
+
+typedef struct buffer_t buffer_t;
+
+__attribute__((reqd_work_group_size(32, 1, 1)))
+__kernel void comp_main(__global uint* buf)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    buf[0] = ((uint3)(get_group_id(0), get_group_id(1), get_group_id(2))).z;
+}
+
diff --git a/reference/opt/shaders-opencl/comp/force-recompile-hooks.swizzle.comp b/reference/opt/shaders-opencl/comp/force-recompile-hooks.swizzle.comp
new file mode 100644
index 000000000..23990866b
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/force-recompile-hooks.swizzle.comp
@@ -0,0 +1,11 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+const sampler_t spvDefaultSampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(read_only image2d_t foo, write_only image2d_t bar)
+{
+    write_imagef(bar, (int2)(0), read_imagef(foo, spvDefaultSampler, (float2)(1.0f)));
+}
+
diff --git a/reference/opt/shaders-opencl/comp/functions.comp b/reference/opt/shaders-opencl/comp/functions.comp
new file mode 100644
index 000000000..7135f7ae1
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/functions.comp
@@ -0,0 +1,8 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main()
+{
+}
+
diff --git a/reference/opt/shaders-opencl/comp/global-invocation-id.comp b/reference/opt/shaders-opencl/comp/global-invocation-id.comp
new file mode 100644
index 000000000..84693b0ee
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/global-invocation-id.comp
@@ -0,0 +1,18 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct myBlock
+{
+    int a;
+    float b[1];
+};
+
+typedef struct myBlock myBlock;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global myBlock* myStorage)
+{
+    myStorage->a = (myStorage->a + 1) % 256;
+    myStorage->b[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] += 0.0199999995529651641845703125f;
+}
+
diff --git a/reference/opt/shaders-opencl/comp/image.comp b/reference/opt/shaders-opencl/comp/image.comp
new file mode 100644
index 000000000..da5e16cf5
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/image.comp
@@ -0,0 +1,11 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(read_only image2d_t uImageIn, write_only image2d_t uImageOut)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    int2 _23 = as_int2(((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).xy);
+    write_imagef(uImageOut, _23, read_imagef(uImageIn, _23 + (int2)(get_image_width(uImageIn), get_image_height(uImageIn))));
+}
+
diff --git a/reference/opt/shaders-opencl/comp/insert.comp b/reference/opt/shaders-opencl/comp/insert.comp
new file mode 100644
index 000000000..930313528
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/insert.comp
@@ -0,0 +1,18 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO
+{
+    float4 out_data[1];
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global float4* _27)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    _27[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] = (float4)(10.0f, 30.0f, 70.0f, 90.0f);
+    _27[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x][1u] = 20.0f;
+}
+
diff --git a/reference/opt/shaders-opencl/comp/local-invocation-id.comp b/reference/opt/shaders-opencl/comp/local-invocation-id.comp
new file mode 100644
index 000000000..0def2374c
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/local-invocation-id.comp
@@ -0,0 +1,18 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct myBlock
+{
+    int a;
+    float b[1];
+};
+
+typedef struct myBlock myBlock;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global myBlock* myStorage)
+{
+    myStorage->a = (myStorage->a + 1) % 256;
+    myStorage->b[((uint3)(get_local_id(0), get_local_id(1), get_local_id(2))).x] += 0.0199999995529651641845703125f;
+}
+
diff --git a/reference/opt/shaders-opencl/comp/local-invocation-index.comp b/reference/opt/shaders-opencl/comp/local-invocation-index.comp
new file mode 100644
index 000000000..0a1a8ed3c
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/local-invocation-index.comp
@@ -0,0 +1,18 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct myBlock
+{
+    int a;
+    float b[1];
+};
+
+typedef struct myBlock myBlock;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global myBlock* myStorage)
+{
+    myStorage->a = (myStorage->a + 1) % 256;
+    myStorage->b[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += 0.0199999995529651641845703125f;
+}
+
diff --git a/reference/opt/shaders-opencl/comp/local-size-duplicate-spec-id.comp b/reference/opt/shaders-opencl/comp/local-size-duplicate-spec-id.comp
new file mode 100644
index 000000000..99b804f76
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/local-size-duplicate-spec-id.comp
@@ -0,0 +1,30 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct StorageBuffer
+{
+    uint values[1];
+};
+
+typedef struct StorageBuffer StorageBuffer;
+
+#ifndef SPIRV_CROSS_CONSTANT_ID_0
+#define SPIRV_CROSS_CONSTANT_ID_0 1u
+#endif
+constant uint _22 = SPIRV_CROSS_CONSTANT_ID_0;
+#ifndef SPIRV_CROSS_CONSTANT_ID_1
+#define SPIRV_CROSS_CONSTANT_ID_1 1u
+#endif
+constant uint _23 = SPIRV_CROSS_CONSTANT_ID_1;
+#ifndef SPIRV_CROSS_CONSTANT_ID_2
+#define SPIRV_CROSS_CONSTANT_ID_2 1u
+#endif
+constant uint _24 = SPIRV_CROSS_CONSTANT_ID_2;
+constant uint3 spvWorkgroupSize = (uint3)(_22, _23, _24);
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global uint* ssbo)
+{
+    ssbo[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] = 1u;
+}
+
diff --git a/reference/opt/shaders-opencl/comp/mod.comp b/reference/opt/shaders-opencl/comp/mod.comp
new file mode 100644
index 000000000..c52e0e95f
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/mod.comp
@@ -0,0 +1,27 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO
+{
+    float4 in_data[1];
+};
+
+typedef struct SSBO SSBO;
+
+struct SSBO2
+{
+    float4 out_data[1];
+};
+
+typedef struct SSBO2 SSBO2;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global const float4* _23, __global float4* _33)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    float4 _29 = _23[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x];
+    _33[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] = fmod(_29, _33[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x]);
+    _33[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] = as_float4(as_uint4(_29) % as_uint4(_33[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x]));
+    _33[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] = as_float4(as_int4(_29) % as_int4(_33[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x]));
+}
+
diff --git a/reference/opt/shaders-opencl/comp/modf.comp b/reference/opt/shaders-opencl/comp/modf.comp
new file mode 100644
index 000000000..de38e3aa2
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/modf.comp
@@ -0,0 +1,34 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO
+{
+    float4 in_data[1];
+};
+
+typedef struct SSBO SSBO;
+
+struct ResType
+{
+    float4 _m0;
+    float4 _m1;
+};
+
+typedef struct ResType ResType;
+
+struct SSBO2
+{
+    float4 out_data[1];
+};
+
+typedef struct SSBO2 SSBO2;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global const float4* _23, __global float4* _38)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    ResType _32;
+    _32._m0 = modf(_23[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x], &_32._m1);
+    _38[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] = _32._m0;
+}
+
diff --git a/reference/opt/shaders-opencl/comp/outer-product.comp b/reference/opt/shaders-opencl/comp/outer-product.comp
new file mode 100644
index 000000000..4462fc221
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/outer-product.comp
@@ -0,0 +1,45 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO
+{
+    float2 m22;
+    float3 m23;
+    float4 m24;
+    float2 m32;
+    float3 m33;
+    float4 m34;
+    float2 m42;
+    float3 m43;
+    float4 m44;
+};
+
+typedef struct SSBO SSBO;
+
+struct ReadSSBO
+{
+    float2 v2;
+    float3 v3;
+    float4 v4;
+};
+
+typedef struct ReadSSBO ReadSSBO;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global SSBO* _21, __global const ReadSSBO* _26)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    float2 _29 = _26->v2;
+    _21->m22 = _29 * _29.x;
+    float3 _38 = _26->v3;
+    _21->m23 = _38 * _29.x;
+    float4 _47 = _26->v4;
+    _21->m24 = _47 * _29.x;
+    _21->m32 = _29 * _38.x;
+    _21->m33 = _38 * _38.x;
+    _21->m34 = _47 * _38.x;
+    _21->m42 = _29 * _47.x;
+    _21->m43 = _38 * _47.x;
+    _21->m44 = _47 * _47.x;
+}
+
diff --git a/reference/opt/shaders-opencl/comp/packing-test-1.comp b/reference/opt/shaders-opencl/comp/packing-test-1.comp
new file mode 100644
index 000000000..9955dd0ea
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/packing-test-1.comp
@@ -0,0 +1,32 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct T1
+{
+    float3 a;
+    float b;
+};
+
+typedef struct T1 T1;
+
+struct Buffer0
+{
+    T1 buf0[1];
+};
+
+typedef struct Buffer0 Buffer0;
+
+struct Buffer1
+{
+    float buf1[1];
+};
+
+typedef struct Buffer1 Buffer1;
+
+__attribute__((reqd_work_group_size(32, 1, 1)))
+__kernel void comp_main(__global T1* _15, __global float* _34)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    _34[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] = _15[0].b;
+}
+
diff --git a/reference/opt/shaders-opencl/comp/packing-test-2.comp b/reference/opt/shaders-opencl/comp/packing-test-2.comp
new file mode 100644
index 000000000..224b89a54
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/packing-test-2.comp
@@ -0,0 +1,32 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct T1
+{
+    float3 a;
+    float b;
+};
+
+typedef struct T1 T1;
+
+struct Buffer0
+{
+    T1 buf0[1];
+};
+
+typedef struct Buffer0 Buffer0;
+
+struct Buffer1
+{
+    float buf1[1];
+};
+
+typedef struct Buffer1 Buffer1;
+
+__attribute__((reqd_work_group_size(32, 1, 1)))
+__kernel void comp_main(__global T1* _14, __global float* _24)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    _24[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] = _14[0].b;
+}
+
diff --git a/reference/opt/shaders-opencl/comp/read-write-only.comp b/reference/opt/shaders-opencl/comp/read-write-only.comp
new file mode 100644
index 000000000..6b54b862b
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/read-write-only.comp
@@ -0,0 +1,35 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO2
+{
+    float4 data4;
+    float4 data5;
+};
+
+typedef struct SSBO2 SSBO2;
+
+struct SSBO0
+{
+    float4 data0;
+    float4 data1;
+};
+
+typedef struct SSBO0 SSBO0;
+
+struct SSBO1
+{
+    float4 data2;
+    float4 data3;
+};
+
+typedef struct SSBO1 SSBO1;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global SSBO2* _10, __global const SSBO0* _15, __global SSBO1* _21)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    _10->data4 = _15->data0 + _21->data2;
+    _10->data5 = _15->data1 + _21->data3;
+}
+
diff --git a/reference/opt/shaders-opencl/comp/rmw-opt.comp b/reference/opt/shaders-opencl/comp/rmw-opt.comp
new file mode 100644
index 000000000..4127d311c
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/rmw-opt.comp
@@ -0,0 +1,27 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO
+{
+    int a;
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global int* _9)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    _9[0] += 10;
+    _9[0] -= 10;
+    _9[0] *= 10;
+    _9[0] /= 10;
+    _9[0] = _9[0] << 2;
+    _9[0] = _9[0] >> 3;
+    _9[0] &= 40;
+    _9[0] ^= 10;
+    _9[0] %= 40;
+    _9[0] |= 1;
+    _9[0] = 0;
+}
+
diff --git a/reference/opt/shaders-opencl/comp/scalar-std450-distance-length-normalize.comp b/reference/opt/shaders-opencl/comp/scalar-std450-distance-length-normalize.comp
new file mode 100644
index 000000000..59f3fb7ed
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/scalar-std450-distance-length-normalize.comp
@@ -0,0 +1,25 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO
+{
+    float a;
+    float b;
+    float c;
+    float d;
+    float e;
+    float f;
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global SSBO* _9)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    _9->c = distance(_9->a, _9->b);
+    _9->d = length(_9->a);
+    _9->e = normalize(_9->a);
+    _9->f = distance(_9->a - 1.0f, _9->b - 2.0f);
+}
+
diff --git a/reference/opt/shaders-opencl/comp/shared-std450.double.comp b/reference/opt/shaders-opencl/comp/shared-std450.double.comp
new file mode 100644
index 000000000..5859d791c
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/shared-std450.double.comp
@@ -0,0 +1,29 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+struct SSBO
+{
+    double in_data[1];
+};
+
+typedef struct SSBO SSBO;
+
+struct SSBO2
+{
+    double out_data[1];
+};
+
+typedef struct SSBO2 SSBO2;
+
+__attribute__((reqd_work_group_size(4, 1, 1)))
+__kernel void comp_main(__global const double* _22, __global double* _44)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    __local double sShared[4];
+    sShared[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] = _22[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x];
+    mem_fence(CLK_LOCAL_MEM_FENCE);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    _44[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] = sShared[3u - ((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))];
+}
+
diff --git a/reference/opt/shaders-opencl/comp/shared-struct-bool-cast.comp b/reference/opt/shaders-opencl/comp/shared-struct-bool-cast.comp
new file mode 100644
index 000000000..68d589539
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/shared-struct-bool-cast.comp
@@ -0,0 +1,65 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct S1
+{
+    int3 a;
+    uint2 b;
+    int4 c;
+    uint d;
+};
+
+typedef struct S1 S1;
+
+struct block
+{
+    uint passed;
+};
+
+typedef struct block block;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global uint* _132)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    __local S1 s1;
+    s1.a = (int3)(6, 8, 8);
+    s1.b = (uint2)(4u);
+    s1.c = (int4)(false, false, false, true);
+    s1.d = 6u;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    mem_fence(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
+    bool _144 = all((int3)(6, 8, 8) == s1.a);
+    bool _108;
+    if (_144)
+    {
+        _108 = all((uint2)(4u) == s1.b);
+    }
+    else
+    {
+        _108 = _144;
+    }
+    bool _117;
+    if (_108)
+    {
+        _117 = all((int4)(false, false, false, true) == s1.c);
+    }
+    else
+    {
+        _117 = _108;
+    }
+    bool _126;
+    if (_117)
+    {
+        _126 = 6u == s1.d;
+    }
+    else
+    {
+        _126 = _117;
+    }
+    if (_126)
+    {
+        _132[0] += as_uint(1);
+    }
+}
+
diff --git a/reference/opt/shaders-opencl/comp/shared-zero-init-simple.comp b/reference/opt/shaders-opencl/comp/shared-zero-init-simple.comp
new file mode 100644
index 000000000..0bec24063
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/shared-zero-init-simple.comp
@@ -0,0 +1,25 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO
+{
+    float in_data[1];
+};
+
+typedef struct SSBO SSBO;
+
+struct SSBO2
+{
+    float out_data[1];
+};
+
+typedef struct SSBO2 SSBO2;
+
+__attribute__((reqd_work_group_size(4, 1, 1)))
+__kernel void comp_main(__global const float* _22, __global float* _32)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    __local float sShared;
+    _32[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] = sShared + _22[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x];
+}
+
diff --git a/reference/opt/shaders-opencl/comp/shared-zero-init.comp b/reference/opt/shaders-opencl/comp/shared-zero-init.comp
new file mode 100644
index 000000000..b587d8f44
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/shared-zero-init.comp
@@ -0,0 +1,30 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO
+{
+    float in_data[1];
+};
+
+typedef struct SSBO SSBO;
+
+constant float _31[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
+
+struct SSBO2
+{
+    float out_data[1];
+};
+
+typedef struct SSBO2 SSBO2;
+
+__attribute__((reqd_work_group_size(4, 1, 1)))
+__kernel void comp_main(__global const float* _22, __global float* _48)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    __local float sShared[4];
+    sShared[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += _22[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x];
+    mem_fence(CLK_LOCAL_MEM_FENCE);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    _48[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] = sShared[3u - ((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))];
+}
+
diff --git a/reference/opt/shaders-opencl/comp/shared.comp b/reference/opt/shaders-opencl/comp/shared.comp
new file mode 100644
index 000000000..836b2bf9f
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/shared.comp
@@ -0,0 +1,28 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO
+{
+    float in_data[1];
+};
+
+typedef struct SSBO SSBO;
+
+struct SSBO2
+{
+    float out_data[1];
+};
+
+typedef struct SSBO2 SSBO2;
+
+__attribute__((reqd_work_group_size(4, 1, 1)))
+__kernel void comp_main(__global const float* _22, __global float* _44)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    __local float sShared[4];
+    sShared[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] = _22[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x];
+    mem_fence(CLK_LOCAL_MEM_FENCE);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    _44[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] = sShared[3u - ((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))];
+}
+
diff --git a/reference/opt/shaders-opencl/comp/spec-constant-work-group-size.comp b/reference/opt/shaders-opencl/comp/spec-constant-work-group-size.comp
new file mode 100644
index 000000000..4bf86f53f
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/spec-constant-work-group-size.comp
@@ -0,0 +1,39 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+#ifndef SPIRV_CROSS_CONSTANT_ID_1
+#define SPIRV_CROSS_CONSTANT_ID_1 2
+#endif
+constant int b = SPIRV_CROSS_CONSTANT_ID_1;
+#ifndef SPIRV_CROSS_CONSTANT_ID_0
+#define SPIRV_CROSS_CONSTANT_ID_0 1
+#endif
+constant int a = SPIRV_CROSS_CONSTANT_ID_0;
+
+struct SSBO
+{
+    int v[1];
+};
+
+typedef struct SSBO SSBO;
+
+#define _21 ((as_uint(a) + 0u))
+#ifndef SPIRV_CROSS_CONSTANT_ID_10
+#define SPIRV_CROSS_CONSTANT_ID_10 1u
+#endif
+constant uint _22 = SPIRV_CROSS_CONSTANT_ID_10;
+constant uint3 spvWorkgroupSize = (uint3)(_22, 20u, 1u);
+#define _27 (spvWorkgroupSize.x)
+#define _28 ((_21 + _27))
+#define _29 (spvWorkgroupSize.y)
+#define _30 ((_28 + _29))
+#define _32 ((1 - a))
+
+__attribute__((reqd_work_group_size(1, 20, 1)))
+__kernel void comp_main(__global int* _17)
+{
+    int spec_const_array_size[b];
+    spec_const_array_size[a] = a;
+    _17[_30] = b + spec_const_array_size[_32];
+}
+
diff --git a/reference/opt/shaders-opencl/comp/struct-layout.comp b/reference/opt/shaders-opencl/comp/struct-layout.comp
new file mode 100644
index 000000000..39cabe2a8
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/struct-layout.comp
@@ -0,0 +1,31 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct Foo
+{
+    float4 m;
+};
+
+typedef struct Foo Foo;
+
+struct SSBO2
+{
+    Foo out_data[1];
+};
+
+typedef struct SSBO2 SSBO2;
+
+struct SSBO
+{
+    Foo in_data[1];
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global Foo* _23, __global const Foo* _30)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    _23[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].m = _30[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].m * _30[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].m;
+}
+
diff --git a/reference/opt/shaders-opencl/comp/struct-nested.comp b/reference/opt/shaders-opencl/comp/struct-nested.comp
new file mode 100644
index 000000000..264ad2ddb
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/struct-nested.comp
@@ -0,0 +1,31 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct s1
+{
+    int a;
+};
+
+typedef struct s1 s1;
+
+struct s2
+{
+    s1 b;
+};
+
+typedef struct s2 s2;
+
+struct dstbuffer
+{
+    s2 test[1];
+};
+
+typedef struct dstbuffer dstbuffer;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global s2* _19)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    _19[0].b.a = 0;
+}
+
diff --git a/reference/opt/shaders-opencl/comp/struct-packing.invalid.comp b/reference/opt/shaders-opencl/comp/struct-packing.invalid.comp
new file mode 100644
index 000000000..e69de29bb
diff --git a/reference/opt/shaders-opencl/comp/torture-loop.comp b/reference/opt/shaders-opencl/comp/torture-loop.comp
new file mode 100644
index 000000000..1ca9606c7
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/torture-loop.comp
@@ -0,0 +1,46 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO
+{
+    float4 mvp;
+    float4 in_data[1];
+};
+
+typedef struct SSBO SSBO;
+
+struct SSBO2
+{
+    float4 out_data[1];
+};
+
+typedef struct SSBO2 SSBO2;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global const SSBO* _24, __global float4* _89)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    float4 _101;
+    _101 = _24->in_data[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x];
+    for (int _95 = 0; (_95 + 1) < 10; )
+    {
+        _101 *= 2.0f;
+        _95 += 2;
+        continue;
+    }
+    float4 _100;
+    _100 = _101;
+    float4 _105;
+    for (uint _96 = 0u; _96 < 16u; _100 = _105, _96 += as_uint(1))
+    {
+        _105 = _100;
+        for (uint _102 = 0u; _102 < 30u; )
+        {
+            _105 = _24->mvp * _105;
+            _102 += as_uint(1);
+            continue;
+        }
+    }
+    _89[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] = _100;
+}
+
diff --git a/reference/opt/shaders-opencl/comp/type-alias.comp b/reference/opt/shaders-opencl/comp/type-alias.comp
new file mode 100644
index 000000000..32329cb1a
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/type-alias.comp
@@ -0,0 +1,45 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct S0
+{
+    float4 a;
+};
+
+typedef struct S0 S0;
+
+struct SSBO0
+{
+    S0 s0s[1];
+};
+
+typedef struct SSBO0 SSBO0;
+
+struct S1
+{
+    float4 a;
+};
+
+typedef struct S1 S1;
+
+struct SSBO1
+{
+    S1 s1s[1];
+};
+
+typedef struct SSBO1 SSBO1;
+
+struct SSBO2
+{
+    float4 outputs[1];
+};
+
+typedef struct SSBO2 SSBO2;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global S0* _36, __global S1* _55, __global float4* _66)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    _66[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] = _36[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].a + _55[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].a;
+}
+
diff --git a/reference/opt/shaders-opencl/comp/udiv.comp b/reference/opt/shaders-opencl/comp/udiv.comp
new file mode 100644
index 000000000..7e336b9b4
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/udiv.comp
@@ -0,0 +1,24 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO2
+{
+    uint outputs[1];
+};
+
+typedef struct SSBO2 SSBO2;
+
+struct SSBO
+{
+    uint inputs[1];
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global uint* _10, __global uint* _23)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    _10[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] = _23[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] / 29u;
+}
+
diff --git a/reference/opt/shaders-opencl/comp/writable-ssbo.comp b/reference/opt/shaders-opencl/comp/writable-ssbo.comp
new file mode 100644
index 000000000..30716e427
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/writable-ssbo.comp
@@ -0,0 +1,18 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct myBlock
+{
+    int a;
+    float b;
+};
+
+typedef struct myBlock myBlock;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global myBlock* myStorage)
+{
+    myStorage->a = (myStorage->a + 1) % 256;
+    myStorage->b += 0.0199999995529651641845703125f;
+}
+

From 6f820620b72c08a0ab7075bc2f8d1cf479057379 Mon Sep 17 00:00:00 2001
From: Garrick Meeker <gmeeker@gmail.com>
Date: Fri, 13 Mar 2026 13:49:12 -0700
Subject: [PATCH 04/16] OpenCL: Adding C API, cleanup

---
 .../asm/comp/variable-pointers-2.asm.comp     |  9 +-
 ...ariable-pointers-store-forwarding.asm.comp |  5 +-
 .../vector-builtin-type-cast-func.asm.comp    |  5 +-
 .../access-private-workgroup-in-function.comp | 10 +-
 spirv_cross_c.cpp                             | 36 +++++++
 spirv_cross_c.h                               |  9 +-
 spirv_glsl.hpp                                | 12 +--
 spirv_opencl.cpp                              | 93 +++++++++++++++----
 spirv_opencl.hpp                              |  6 +-
 test_shaders.sh                               |  1 +
 10 files changed, 144 insertions(+), 42 deletions(-)

diff --git a/reference/shaders-opencl/asm/comp/variable-pointers-2.asm.comp b/reference/shaders-opencl/asm/comp/variable-pointers-2.asm.comp
index 3bfb4fcbd..fa9b7c971 100644
--- a/reference/shaders-opencl/asm/comp/variable-pointers-2.asm.comp
+++ b/reference/shaders-opencl/asm/comp/variable-pointers-2.asm.comp
@@ -22,15 +22,16 @@ __global foo* select_buffer(__global foo* a_1_1, bar cb)
     return (cb.d != 0) ? a_1_1 : NULL;
 }
 
+#define _3 (*_3_ptr)
+#define _4 (*_4_ptr)
 __private uint3* select_input(__private uint3* _3_ptr, __private uint3* _4_ptr, bar cb)
 {
-    #define _3 (*_3_ptr)
-    #define _4 (*_4_ptr)
     return (cb.d != 0) ? &_3 : &_4;
-    #undef _3
-    #undef _4
 }
 
+#undef _3
+#undef _4
+
 __attribute__((reqd_work_group_size(1, 1, 1)))
 __kernel void comp_main(__global foo* buf, bar cb)
 {
diff --git a/reference/shaders-opencl/asm/comp/variable-pointers-store-forwarding.asm.comp b/reference/shaders-opencl/asm/comp/variable-pointers-store-forwarding.asm.comp
index cbc654c1c..f9a3b49ec 100644
--- a/reference/shaders-opencl/asm/comp/variable-pointers-store-forwarding.asm.comp
+++ b/reference/shaders-opencl/asm/comp/variable-pointers-store-forwarding.asm.comp
@@ -15,13 +15,14 @@ struct bar
 
 typedef struct bar bar;
 
+#define _3 (*_3_ptr)
 __global int* _24(__global foo* a_1, __global bar* b_1, __private uint3* _3_ptr)
 {
-    #define _3 (*_3_ptr)
     return (_3.x != 0u) ? &a_1->a : &b_1->b;
-    #undef _3
 }
 
+#undef _3
+
 __attribute__((reqd_work_group_size(1, 1, 1)))
 __kernel void comp_main(__global int* x, __global int* y)
 {
diff --git a/reference/shaders-opencl/asm/comp/vector-builtin-type-cast-func.asm.comp b/reference/shaders-opencl/asm/comp/vector-builtin-type-cast-func.asm.comp
index d4f5be9be..61d0b595f 100644
--- a/reference/shaders-opencl/asm/comp/vector-builtin-type-cast-func.asm.comp
+++ b/reference/shaders-opencl/asm/comp/vector-builtin-type-cast-func.asm.comp
@@ -8,13 +8,14 @@ struct cb1_struct
 
 typedef struct cb1_struct cb1_struct;
 
+#define _RESERVED_IDENTIFIER_FIXUP_gl_LocalInvocationID (*_RESERVED_IDENTIFIER_FIXUP_gl_LocalInvocationID_ptr)
 int2 get_texcoord( int2* base,  int2* index, __private int3* _RESERVED_IDENTIFIER_FIXUP_gl_LocalInvocationID_ptr)
 {
-    #define _RESERVED_IDENTIFIER_FIXUP_gl_LocalInvocationID (*_RESERVED_IDENTIFIER_FIXUP_gl_LocalInvocationID_ptr)
     return ((*base) * as_int3(_RESERVED_IDENTIFIER_FIXUP_gl_LocalInvocationID).xy) + (*index);
-    #undef _RESERVED_IDENTIFIER_FIXUP_gl_LocalInvocationID
 }
 
+#undef _RESERVED_IDENTIFIER_FIXUP_gl_LocalInvocationID
+
 __attribute__((reqd_work_group_size(16, 16, 1)))
 __kernel void comp_main(write_only image2d_t u0, cb1_struct cb0_1)
 {
diff --git a/reference/shaders-opencl/comp/access-private-workgroup-in-function.comp b/reference/shaders-opencl/comp/access-private-workgroup-in-function.comp
index 4aeedb66b..7a532044f 100644
--- a/reference/shaders-opencl/comp/access-private-workgroup-in-function.comp
+++ b/reference/shaders-opencl/comp/access-private-workgroup-in-function.comp
@@ -1,20 +1,22 @@
 // Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
 
 
+#define f (*f_ptr)
 void set_f(int* f_ptr)
 {
-    #define f (*f_ptr)
     f = 40;
-    #undef f
 }
 
+#undef f
+
+#define u (*u_ptr)
 void set_shared_u(__local int* u_ptr)
 {
-    #define u (*u_ptr)
     u = 50;
-    #undef u
 }
 
+#undef u
+
 __attribute__((reqd_work_group_size(1, 1, 1)))
 __kernel void comp_main()
 {
diff --git a/spirv_cross_c.cpp b/spirv_cross_c.cpp
index 4494700ed..f49366ac4 100644
--- a/spirv_cross_c.cpp
+++ b/spirv_cross_c.cpp
@@ -170,6 +170,9 @@ struct spvc_compiler_options_s : ScratchMemoryAllocation
 #if SPIRV_CROSS_C_API_HLSL
 	CompilerHLSL::Options hlsl;
 #endif
+#if SPIRV_CROSS_C_API_OPENCL
+	CompilerOpenCL::Options opencl;
+#endif
 };
 
 struct spvc_set_s : ScratchMemoryAllocation
@@ -394,6 +397,14 @@ spvc_result spvc_compiler_create_compiler_options(spvc_compiler compiler, spvc_c
 			break;
 #endif
 
+#if SPIRV_CROSS_C_API_OPENCL
+		case SPVC_BACKEND_OPENCL:
+			opt->backend_flags |= SPVC_COMPILER_OPTION_OPENCL_BIT | SPVC_COMPILER_OPTION_COMMON_BIT;
+			opt->glsl = static_cast<CompilerOpenCL *>(compiler->compiler.get())->get_common_options();
+			opt->opencl = static_cast<CompilerOpenCL *>(compiler->compiler.get())->get_opencl_options();
+			break;
+#endif
+
 #if SPIRV_CROSS_C_API_GLSL
 		case SPVC_BACKEND_GLSL:
 			opt->backend_flags |= SPVC_COMPILER_OPTION_GLSL_BIT | SPVC_COMPILER_OPTION_COMMON_BIT;
@@ -797,6 +808,24 @@ spvc_result spvc_compiler_options_set_uint(spvc_compiler_options options, spvc_c
 		break;
 #endif
 
+#if SPIRV_CROSS_C_API_OPENCL
+	case SPVC_COMPILER_OPTION_OPENCL_VERSION:
+		options->opencl.opencl_version = value;
+		break;
+	case SPVC_COMPILER_OPTION_OPENCL_ENABLE_FP64:
+		options->opencl.enable_fp64 = value != 0;
+		break;
+	case SPVC_COMPILER_OPTION_OPENCL_ENABLE_64BIT_ATOMICS:
+		options->opencl.enable_64bit_atomics = value != 0;
+		break;
+	case SPVC_COMPILER_OPTION_OPENCL_ENABLE_SUBGROUPS:
+		options->opencl.enable_subgroups = value != 0;
+		break;
+	case SPVC_COMPILER_OPTION_OPENCL_ENABLE_SHUFFLE:
+		options->opencl.enable_shuffle = value != 0;
+		break;
+#endif
+
 	default:
 		options->context->report_error("Unknown option.");
 		return SPVC_ERROR_INVALID_ARGUMENT;
@@ -830,6 +859,13 @@ spvc_result spvc_compiler_install_compiler_options(spvc_compiler compiler, spvc_
 		break;
 #endif
 
+#if SPIRV_CROSS_C_API_OPENCL
+	case SPVC_BACKEND_OPENCL:
+		static_cast<CompilerOpenCL &>(*compiler->compiler).set_common_options(options->glsl);
+		static_cast<CompilerOpenCL &>(*compiler->compiler).set_opencl_options(options->opencl);
+		break;
+#endif
+
 	default:
 		break;
 	}
diff --git a/spirv_cross_c.h b/spirv_cross_c.h
index 76d2b8155..c59c299d0 100644
--- a/spirv_cross_c.h
+++ b/spirv_cross_c.h
@@ -272,7 +272,8 @@ extern "C"
 #define SPVC_COMPILER_OPTION_GLSL_BIT 0x2000000
 #define SPVC_COMPILER_OPTION_HLSL_BIT 0x4000000
 #define SPVC_COMPILER_OPTION_MSL_BIT 0x8000000
-#define SPVC_COMPILER_OPTION_LANG_BITS 0x0f000000
+#define SPVC_COMPILER_OPTION_OPENCL_BIT 0x10000000
+#define SPVC_COMPILER_OPTION_LANG_BITS 0x1f000000
 #define SPVC_COMPILER_OPTION_ENUM_BITS 0xffffff
 
 #define SPVC_MAKE_MSL_VERSION(major, minor, patch) ((major) * 10000 + (minor) * 100 + (patch))
@@ -757,6 +758,12 @@ extern "C"
 
 		SPVC_COMPILER_OPTION_HLSL_USER_SEMANTIC = 94 | SPVC_COMPILER_OPTION_HLSL_BIT,
 
+		SPVC_COMPILER_OPTION_OPENCL_VERSION = 95 | SPVC_COMPILER_OPTION_OPENCL_BIT,
+		SPVC_COMPILER_OPTION_OPENCL_ENABLE_FP64 = 96 | SPVC_COMPILER_OPTION_OPENCL_BIT,
+		SPVC_COMPILER_OPTION_OPENCL_ENABLE_64BIT_ATOMICS = 97 | SPVC_COMPILER_OPTION_OPENCL_BIT,
+		SPVC_COMPILER_OPTION_OPENCL_ENABLE_SUBGROUPS = 98 | SPVC_COMPILER_OPTION_OPENCL_BIT,
+		SPVC_COMPILER_OPTION_OPENCL_ENABLE_SHUFFLE = 99 | SPVC_COMPILER_OPTION_OPENCL_BIT,
+
 		SPVC_COMPILER_OPTION_INT_MAX = 0x7fffffff
 	} spvc_compiler_option;
 
diff --git a/spirv_glsl.hpp b/spirv_glsl.hpp
index 4773595db..24e34d7b0 100644
--- a/spirv_glsl.hpp
+++ b/spirv_glsl.hpp
@@ -396,23 +396,13 @@ class CompilerGLSL : public Compiler
 	static bool is_supported_subgroup_op_in_opengl(Op op, const uint32_t *ops);
 
 	void reset(uint32_t iteration_count);
-	void emit_function(SPIRFunction &func, const Bitset &return_flags);
+	virtual void emit_function(SPIRFunction &func, const Bitset &return_flags);
 
 	bool has_extension(const std::string &ext) const;
 	void require_extension_internal(const std::string &ext);
 
 	// Virtualize methods which need to be overridden by subclass targets like C++ and such.
 	virtual void emit_function_prototype(SPIRFunction &func, const Bitset &return_flags);
-	// Called right after the opening { of a non-entry helper function body.
-	// Override to emit per-function preamble declarations (e.g. #define aliases).
-	virtual void emit_function_local_declarations(SPIRFunction &)
-	{
-	}
-	// Called right before the closing } of a non-entry helper function body.
-	// Override to clean up anything emitted by emit_function_local_declarations.
-	virtual void emit_function_local_epilogue(SPIRFunction &)
-	{
-	}
 
 	SPIRBlock *current_emitting_block = nullptr;
 	SmallVector<SPIRBlock *> current_emitting_switch_stack;
diff --git a/spirv_opencl.cpp b/spirv_opencl.cpp
index 522ba7d92..435979c1b 100644
--- a/spirv_opencl.cpp
+++ b/spirv_opencl.cpp
@@ -159,6 +159,31 @@ void CompilerOpenCL::emit_header()
 		statement("#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable");
 	statement("");
 
+	// Emit FP_CONTRACT pragma based on ContractionOff execution mode and FPFastMathDefault.
+	{
+		auto &ep = get_entry_point();
+		bool contract = true;
+
+		if (ep.flags.get(ExecutionModeContractionOff))
+			contract = false;
+
+		for (auto &fp_pair : ep.fp_fast_math_defaults)
+		{
+			if (fp_pair.second)
+			{
+				uint32_t flags = get<SPIRConstant>(fp_pair.second).scalar();
+				if (!(flags & FPFastMathModeAllowContractMask))
+					contract = false;
+			}
+		}
+
+		if (!contract)
+		{
+			statement("#pragma OPENCL FP_CONTRACT OFF");
+			statement("");
+		}
+	}
+
 	for (auto &header : header_lines)
 		statement(header);
 	if (!header_lines.empty())
@@ -1594,12 +1619,11 @@ void CompilerOpenCL::append_global_func_args(const SPIRFunction &func, uint32_t
 	}
 }
 
-void CompilerOpenCL::emit_function_local_declarations(SPIRFunction &func)
+void CompilerOpenCL::emit_function(SPIRFunction &func, const Bitset &return_flags)
 {
-	// For helper functions that access workgroup/private global scalar variables via pointer params:
-	// emit #define var_name (*var_name_ptr) so that existing expressions (e.g. "u = 50;")
-	// transparently dereference the pointer parameter.
+	// Emit #define macros before the function for workgroup scalar pointer aliasing.
 	auto wg_it = func_workgroup_args.find(func.self);
+	bool has_defines = false;
 	if (wg_it != func_workgroup_args.end())
 	{
 		for (auto var_id : wg_it->second)
@@ -1608,24 +1632,38 @@ void CompilerOpenCL::emit_function_local_declarations(SPIRFunction &func)
 			{
 				auto var_name = to_name(var_id);
 				statement("#define ", var_name, " (*", var_name, "_ptr)");
+				has_defines = true;
 			}
 		}
 	}
-}
 
-void CompilerOpenCL::emit_function_local_epilogue(SPIRFunction &func)
-{
-	auto wg_it = func_workgroup_args.find(func.self);
-	if (wg_it != func_workgroup_args.end())
+	CompilerGLSL::emit_function(func, return_flags);
+
+	// Emit #undef after the function.
+	if (has_defines)
 	{
 		for (auto var_id : wg_it->second)
 		{
 			if (workgroup_scalar_vars.count(var_id))
 				statement("#undef ", to_name(var_id));
 		}
+		statement("");
 	}
 }
 
+void CompilerOpenCL::emit_struct_member(const SPIRType &type, uint32_t member_type_id, uint32_t index,
+                                        const string &qualifier, uint32_t)
+{
+	auto &membertype = get<SPIRType>(member_type_id);
+	// OpenCL C does not use GLSL layout qualifiers or interpolation qualifiers.
+	statement(qualifier, variable_decl(membertype, to_member_name(type, index)), ";");
+}
+
+void CompilerOpenCL::emit_block_hints(const SPIRBlock &)
+{
+	// OpenCL C has no control-flow hint attributes; suppress SPIRV_CROSS_BRANCH/FLATTEN etc.
+}
+
 void CompilerOpenCL::emit_specialization_constants_and_structs()
 {
 	bool emitted = false;
@@ -1850,7 +1888,7 @@ void CompilerOpenCL::emit_instruction(const Instruction &instruction)
 	auto opencl_atomic = [this, ops](const char *opencl_op)
 	{
 		if (check_atomic_image(ops[2]))
-			SPIRV_CROSS_THROW("Image atomics not yet implemented for OpenCL.");
+			SPIRV_CROSS_THROW("Image atomics are not supported in OpenCL.");
 		emit_atomic_func_op(ops[0], ops[1], ops[2], ops[5], opencl_op);
 	};
 
@@ -2091,12 +2129,12 @@ void CompilerOpenCL::emit_instruction(const Instruction &instruction)
 
 	case OpAtomicExchange:
 		if (check_atomic_image(ops[2]))
-			SPIRV_CROSS_THROW("Image atomics not yet implemented for OpenCL.");
+			SPIRV_CROSS_THROW("Image atomics are not supported in OpenCL.");
 		emit_atomic_func_op(ops[0], ops[1], ops[2], ops[5], "atomic_xchg");
 		break;
 	case OpAtomicCompareExchange:
 		if (check_atomic_image(ops[2]))
-			SPIRV_CROSS_THROW("Image atomics not yet implemented for OpenCL.");
+			SPIRV_CROSS_THROW("Image atomics are not supported in OpenCL.");
 		// OpenCL atomic_cmpxchg(&ptr, expected, desired)
 		forced_temporaries.insert(ops[1]);
 		emit_op(ops[0], ops[1],
@@ -2112,7 +2150,7 @@ void CompilerOpenCL::emit_instruction(const Instruction &instruction)
 	case OpAtomicISub:
 	{
 		if (check_atomic_image(ops[2]))
-			SPIRV_CROSS_THROW("Image atomics not yet implemented for OpenCL.");
+			SPIRV_CROSS_THROW("Image atomics are not supported in OpenCL.");
 		forced_temporaries.insert(ops[1]);
 		auto expr = join("atomic_sub(", to_atomic_ptr_expression(ops[2]), ", ", to_enclosed_expression(ops[5]), ")");
 		emit_op(ops[0], ops[1], expr, should_forward(ops[2]) && should_forward(ops[5]));
@@ -2139,7 +2177,7 @@ void CompilerOpenCL::emit_instruction(const Instruction &instruction)
 	case OpAtomicLoad:
 	{
 		if (check_atomic_image(ops[2]))
-			SPIRV_CROSS_THROW("Image atomics not yet implemented for OpenCL.");
+			SPIRV_CROSS_THROW("Image atomics are not supported in OpenCL.");
 		auto &type = expression_type(ops[2]);
 		forced_temporaries.insert(ops[1]);
 		bool unsigned_type = (type.basetype == SPIRType::UInt);
@@ -2151,7 +2189,7 @@ void CompilerOpenCL::emit_instruction(const Instruction &instruction)
 	case OpAtomicStore:
 	{
 		if (check_atomic_image(ops[0]))
-			SPIRV_CROSS_THROW("Image atomics not yet implemented for OpenCL.");
+			SPIRV_CROSS_THROW("Image atomics are not supported in OpenCL.");
 		statement("atomic_xchg(", to_atomic_ptr_expression(ops[0]), ", ", to_expression(ops[3]), ");");
 		flush_all_atomic_capable_variables();
 		break;
@@ -2160,7 +2198,7 @@ void CompilerOpenCL::emit_instruction(const Instruction &instruction)
 	case OpAtomicIDecrement:
 	{
 		if (check_atomic_image(ops[2]))
-			SPIRV_CROSS_THROW("Image atomics not yet implemented for OpenCL.");
+			SPIRV_CROSS_THROW("Image atomics are not supported in OpenCL.");
 		forced_temporaries.insert(ops[1]);
 		auto &type = expression_type(ops[2]);
 		bool unsigned_type = (type.basetype == SPIRType::UInt);
@@ -2656,6 +2694,29 @@ void CompilerOpenCL::emit_instruction(const Instruction &instruction)
 		break;
 	}
 
+	case OpPtrEqual:
+	case OpPtrNotEqual:
+	case OpPtrDiff:
+	{
+		uint32_t result_type = ops[0];
+		uint32_t result_id = ops[1];
+		uint32_t op0 = ops[2];
+		uint32_t op1 = ops[3];
+		const char *op = "";
+		if (opcode == OpPtrEqual)
+			op = "==";
+		else if (opcode == OpPtrNotEqual)
+			op = "!=";
+		else if (opcode == OpPtrDiff)
+			op = "-";
+		bool forward = should_forward(op0) && should_forward(op1);
+		emit_op(result_type, result_id, join(to_pointer_expression(op0), " ", op, " ", to_pointer_expression(op1)),
+		        forward);
+		inherit_expression_dependencies(result_id, op0);
+		inherit_expression_dependencies(result_id, op1);
+		break;
+	}
+
 	default:
 		CompilerGLSL::emit_instruction(instruction);
 		break;
diff --git a/spirv_opencl.hpp b/spirv_opencl.hpp
index 90a4291e0..defe75032 100644
--- a/spirv_opencl.hpp
+++ b/spirv_opencl.hpp
@@ -126,8 +126,10 @@ class CompilerOpenCL : public CompilerGLSL
 	uint32_t get_physical_type_id_stride(TypeID type_id) const override;
 
 	void replace_illegal_names() override;
-	void emit_function_local_declarations(SPIRFunction &func) override;
-	void emit_function_local_epilogue(SPIRFunction &func) override;
+	void emit_function(SPIRFunction &func, const Bitset &return_flags) override;
+	void emit_block_hints(const SPIRBlock &block) override;
+	void emit_struct_member(const SPIRType &type, uint32_t member_type_id, uint32_t index,
+	                        const std::string &qualifier = "", uint32_t base_offset = 0) override;
 
 	Options opencl_options;
 
diff --git a/test_shaders.sh b/test_shaders.sh
index a054710ed..a6dd41c14 100755
--- a/test_shaders.sh
+++ b/test_shaders.sh
@@ -23,6 +23,7 @@ echo "Using SPIRV-Cross in: \"$SPIRV_CROSS_PATH\"."
 ./test_shaders.py shaders-msl-no-opt ${OPTS} --msl --spirv-cross "$SPIRV_CROSS_PATH" || exit 1
 ./test_shaders.py shaders-opencl ${OPTS} --opencl --spirv-cross "$SPIRV_CROSS_PATH" || exit 1
 ./test_shaders.py shaders-opencl ${OPTS} --opencl --opt --spirv-cross "$SPIRV_CROSS_PATH" || exit 1
+./test_shaders.py shaders-opencl-no-opt ${OPTS} --opencl --spirv-cross "$SPIRV_CROSS_PATH" || exit 1
 ./test_shaders.py shaders-hlsl ${OPTS} --hlsl --spirv-cross "$SPIRV_CROSS_PATH" || exit 1
 ./test_shaders.py shaders-hlsl ${OPTS} --hlsl --opt --spirv-cross "$SPIRV_CROSS_PATH" || exit 1
 ./test_shaders.py shaders-hlsl-no-opt ${OPTS} --hlsl --spirv-cross "$SPIRV_CROSS_PATH" || exit 1

From 6586da175224b9d8eab2d12f3cdb77dc221d1d0c Mon Sep 17 00:00:00 2001
From: Garrick Meeker <gmeeker@gmail.com>
Date: Fri, 13 Mar 2026 22:30:11 -0700
Subject: [PATCH 05/16] OpenCL: more GLSL mappings

---
 spirv_opencl.cpp | 590 ++++++++++++++++++++++++++++++++++++++++++++++-
 spirv_opencl.hpp |   2 +
 2 files changed, 591 insertions(+), 1 deletion(-)

diff --git a/spirv_opencl.cpp b/spirv_opencl.cpp
index 435979c1b..5a72f14e4 100644
--- a/spirv_opencl.cpp
+++ b/spirv_opencl.cpp
@@ -477,6 +477,19 @@ void CompilerOpenCL::emit_resources()
 		statement("");
 	}
 
+	// Polyfill for bitfieldReverse (32-bit scalar only — vectors call per-component).
+	if (needs_bitreverse_polyfill)
+	{
+		statement("uint spvBitReverse(uint v) {");
+		statement("    v = ((v >> 1u) & 0x55555555u) | ((v & 0x55555555u) << 1u);");
+		statement("    v = ((v >> 2u) & 0x33333333u) | ((v & 0x33333333u) << 2u);");
+		statement("    v = ((v >> 4u) & 0x0F0F0F0Fu) | ((v & 0x0F0F0F0Fu) << 4u);");
+		statement("    v = ((v >> 8u) & 0x00FF00FFu) | ((v & 0x00FF00FFu) << 8u);");
+		statement("    return (v >> 16u) | (v << 16u);");
+		statement("}");
+		statement("");
+	}
+
 	// Default sampler for combined image+sampler usage (OpenCL C requires file-scope const sampler_t).
 	if (needs_default_sampler)
 	{
@@ -854,6 +867,9 @@ string CompilerOpenCL::get_type_address_space(const SPIRType &type, uint32_t id,
 	case StorageClassWorkgroup:
 		addr_space = "__local";
 		break;
+	case StorageClassPhysicalStorageBuffer:
+		addr_space = "__global";
+		break;
 	case StorageClassInput:
 		// Input builtins materialized as __private local variables.
 		addr_space = "__private";
@@ -1209,6 +1225,132 @@ void CompilerOpenCL::emit_glsl_op(uint32_t result_type, uint32_t result_id, uint
 		emit_unary_func_op(result_type, result_id, args[0], "spvUnpackHalf2x16");
 		break;
 
+	case GLSLstd450SAbs:
+	{
+		// OpenCL abs() on integer types returns unsigned. Need bitcast back to signed if result is signed.
+		auto &out_type = get<SPIRType>(result_type);
+		auto &expr_type = expression_type(args[0]);
+
+		// Cast input to signed if needed.
+		string input_expr;
+		auto expected_basetype = to_signed_basetype(expr_type.width);
+		if (expr_type.basetype != expected_basetype)
+			input_expr = bitcast_expression(expected_basetype, args[0]);
+		else
+			input_expr = to_expression(args[0]);
+
+		string expr = join("abs(", input_expr, ")");
+
+		// abs() returns unsigned in OpenCL. Cast to result type if it's signed.
+		auto unsigned_basetype = to_unsigned_basetype(expr_type.width);
+		if (out_type.basetype != unsigned_basetype)
+		{
+			// Build the unsigned return type to bitcast from.
+			SPIRType abs_ret_type = out_type;
+			abs_ret_type.basetype = unsigned_basetype;
+			expr = join(bitcast_glsl_op(out_type, abs_ret_type), "(", expr, ")");
+		}
+
+		emit_op(result_type, result_id, expr, should_forward(args[0]));
+		inherit_expression_dependencies(result_id, args[0]);
+		break;
+	}
+
+	case GLSLstd450SSign:
+	{
+		// OpenCL has no integer sign(). Use clamp(x, -1, 1).
+		auto &expr_type = expression_type(args[0]);
+		auto &out_type = get<SPIRType>(result_type);
+
+		auto expected_basetype = to_signed_basetype(expr_type.width);
+		string input_expr;
+		if (expr_type.basetype != expected_basetype)
+			input_expr = bitcast_expression(expected_basetype, args[0]);
+		else
+			input_expr = to_expression(args[0]);
+
+		string expr = join("clamp(", input_expr, ", -1, 1)");
+
+		// Cast to result type if needed (e.g. result is unsigned).
+		if (out_type.basetype != expected_basetype)
+		{
+			SPIRType signed_type = out_type;
+			signed_type.basetype = expected_basetype;
+			expr = join(bitcast_glsl_op(out_type, signed_type), "(", expr, ")");
+		}
+
+		emit_op(result_type, result_id, expr, should_forward(args[0]));
+		inherit_expression_dependencies(result_id, args[0]);
+		break;
+	}
+
+	case GLSLstd450FindSMsb:
+	{
+		// GLSL findMSB for signed: position of highest bit that differs from sign bit.
+		// OpenCL: (W-1) - clz(x ^ (x >> (W-1)))
+		// x >> (W-1) is arithmetic shift: 0 for positive, -1 for negative.
+		// x ^ -1 = ~x, x ^ 0 = x. So this gives clz(x) for positive, clz(~x) for negative.
+		auto &expr_type = expression_type(args[0]);
+		auto &out_type = get<SPIRType>(result_type);
+		uint32_t width = expr_type.width;
+
+		// Input must be signed for arithmetic right shift.
+		auto signed_basetype = to_signed_basetype(width);
+		SPIRType signed_type = expr_type;
+		signed_type.basetype = signed_basetype;
+
+		string input_expr;
+		if (expr_type.basetype != signed_basetype)
+			input_expr = bitcast_expression(signed_basetype, args[0]);
+		else
+			input_expr = to_enclosed_expression(args[0]);
+
+		string xor_expr = join(input_expr, " ^ (", input_expr, " >> ", width - 1, ")");
+		string expr = join(width - 1, " - clz(", xor_expr, ")");
+
+		// clz on signed type returns signed, so result is signed. Cast if output is unsigned.
+		if (out_type.basetype != signed_basetype)
+			expr = join(bitcast_glsl_op(out_type, signed_type), "(", expr, ")");
+
+		emit_op(result_type, result_id, expr, should_forward(args[0]));
+		inherit_expression_dependencies(result_id, args[0]);
+		break;
+	}
+
+	case GLSLstd450FindUMsb:
+	{
+		// GLSL findMSB for unsigned: position of highest set bit, -1 for 0.
+		// OpenCL: (W-1) - clz(x). clz(0) = W, so result = -1 for 0.
+		auto &expr_type = expression_type(args[0]);
+		auto &out_type = get<SPIRType>(result_type);
+		uint32_t width = expr_type.width;
+
+		auto unsigned_basetype = to_unsigned_basetype(width);
+		string input_expr;
+		if (expr_type.basetype != unsigned_basetype)
+			input_expr = bitcast_expression(unsigned_basetype, args[0]);
+		else
+			input_expr = to_expression(args[0]);
+
+		// Cast to signed for the subtraction so result can be -1.
+		auto signed_basetype = to_signed_basetype(width);
+		SPIRType signed_type = out_type;
+		signed_type.basetype = signed_basetype;
+		string clz_expr = join("as_", type_to_glsl(signed_type), "(clz(", input_expr, "))");
+
+		string expr = join(width - 1, " - ", clz_expr);
+
+		// findMSB returns int (signed). Cast if output type differs.
+		if (out_type.basetype != signed_basetype)
+		{
+			expr = join(bitcast_glsl_op(out_type, signed_type), "(", expr, ")");
+		}
+
+		emit_op(result_type, result_id, expr, should_forward(args[0]));
+		inherit_expression_dependencies(result_id, args[0]);
+		break;
+	}
+
 	default:
 		CompilerGLSL::emit_glsl_op(result_type, result_id, op, args, count);
 		break;
@@ -1223,6 +1365,13 @@ std::string CompilerOpenCL::bitcast_glsl_op(const SPIRType &out_type, const SPIR
 	if (out_type.basetype == in_type.basetype)
 		return "";
 
+	// Pointer types are handled by emit_instruction for OpBitcast.
+	// If we get here as a fallback, use a simple C-style cast.
+	if (is_pointer(out_type))
+		return join("(", type_to_glsl(out_type), ")");
+	if (is_pointer(in_type))
+		return "as_ulong";
+
 	// All bitcasts (float↔int, int↔uint, half↔short, etc.) use as_TYPE() in OpenCL C.
 	// type_to_glsl gives us the full type name including vector size (e.g. "float4", "uint").
 	auto out_name = type_to_glsl(out_type);
@@ -1656,7 +1805,16 @@ void CompilerOpenCL::emit_struct_member(const SPIRType &type, uint32_t member_ty
 {
 	auto &membertype = get<SPIRType>(member_type_id);
 	// OpenCL C does not use GLSL layout qualifiers or interpolation qualifiers.
-	statement(qualifier, variable_decl(membertype, to_member_name(type, index)), ";");
+	// PhysicalStorageBuffer pointers in structs must be emitted as ulong since
+	// OpenCL C does not allow pointer types in kernel parameter structs.
+	if (is_pointer(membertype) && membertype.storage == StorageClassPhysicalStorageBuffer)
+	{
+		statement(qualifier, "ulong ", to_member_name(type, index), ";");
+	}
+	else
+	{
+		statement(qualifier, variable_decl(membertype, to_member_name(type, index)), ";");
+	}
 }
 
 void CompilerOpenCL::emit_block_hints(const SPIRBlock &)
@@ -1947,6 +2105,26 @@ void CompilerOpenCL::emit_instruction(const Instruction &instruction)
 			inherit_expression_dependencies(result_id, ptr);
 			break;
 		}
+		// When loading a PhysicalStorageBuffer pointer from a struct member that was
+		// emitted as ulong (because OpenCL doesn't allow pointer types in kernel struct params),
+		// cast the loaded ulong value to the typed pointer.
+		{
+			auto &result_type_obj = get<SPIRType>(ops[0]);
+			if (is_pointer(result_type_obj) && result_type_obj.storage == StorageClassPhysicalStorageBuffer)
+			{
+				auto *expr = maybe_get<SPIRExpression>(ptr);
+				if (expr && expr->access_chain)
+				{
+					uint32_t result_type = ops[0];
+					uint32_t result_id = ops[1];
+					auto ptr_type_str = type_to_glsl(result_type_obj);
+					emit_op(result_type, result_id, join("((", ptr_type_str, ")(", to_expression(ptr), "))"),
+					        should_forward(ptr));
+					inherit_expression_dependencies(result_id, ptr);
+					break;
+				}
+			}
+		}
 		CompilerGLSL::emit_instruction(instruction);
 		break;
 	}
@@ -2210,6 +2388,269 @@ void CompilerOpenCL::emit_instruction(const Instruction &instruction)
 		flush_all_atomic_capable_variables();
 		break;
 	}
+	case OpBitCount:
+	{
+		// GLSL bitCount → OpenCL popcount.
+		// popcount returns the same type as its input in OpenCL (unlike GLSL which returns int).
+		uint32_t result_type = ops[0];
+		uint32_t result_id = ops[1];
+		uint32_t arg = ops[2];
+		auto &in_type = expression_type(arg);
+		auto &out_type = get<SPIRType>(result_type);
+
+		string expr = join("popcount(", to_expression(arg), ")");
+
+		// Cast result if types differ (e.g. popcount(int4) → uint4 needs as_uint4).
+		if (out_type.basetype != in_type.basetype)
+		{
+			expr = join(bitcast_glsl_op(out_type, in_type), "(", expr, ")");
+		}
+
+		emit_op(result_type, result_id, expr, should_forward(arg));
+		inherit_expression_dependencies(result_id, arg);
+		break;
+	}
+
+	case OpBitReverse:
+	{
+		// GLSL bitfieldReverse has no OpenCL C builtin: call the scalar spvBitReverse polyfill, once per
+		// component for vectors (the operand is enclosed first because ".sN" binds tighter than any operator).
+		uint32_t result_type = ops[0];
+		uint32_t result_id = ops[1];
+		uint32_t arg = ops[2];
+		auto &type = get<SPIRType>(result_type);
+
+		// The polyfill is uint -> uint; any other width would be truncated or reversed over the wrong bit count.
+		if (type.width != 32)
+			SPIRV_CROSS_THROW("OpBitReverse is only supported for 32-bit integers in OpenCL.");
+
+		if (!needs_bitreverse_polyfill)
+		{
+			needs_bitreverse_polyfill = true;
+			force_recompile();
+		}
+
+		auto unsigned_basetype = to_unsigned_basetype(type.width);
+		SPIRType uint_type = type;
+		uint_type.basetype = unsigned_basetype;
+		string input_expr = bitcast_expression(unsigned_basetype, arg);
+
+		string expr;
+		if (type.vecsize > 1)
+		{
+			string vec_expr = enclose_expression(input_expr);
+			expr = join("(", type_to_glsl(uint_type), ")(");
+			for (uint32_t i = 0; i < type.vecsize; i++)
+			{
+				if (i > 0)
+					expr += ", ";
+				expr += join("spvBitReverse(", vec_expr, ".s", i, ")");
+			}
+			expr += ")";
+		}
+		else
+			expr = join("spvBitReverse(", input_expr, ")");
+
+		// The polyfill returns the unsigned type; reinterpret if the SPIR-V result type is signed.
+		if (type.basetype != unsigned_basetype)
+			expr = join(bitcast_glsl_op(type, uint_type), "(", expr, ")");
+
+		emit_op(result_type, result_id, expr, should_forward(arg));
+		inherit_expression_dependencies(result_id, arg);
+		break;
+	}
+
+	case OpBitFieldSExtract:
+	case OpBitFieldUExtract:
+	{
+		// GLSL bitfieldExtract(value, offset, bits) has no OpenCL C builtin, so it is expanded inline.
+		// Unsigned: (value >> offset) & ((1u << bits) - 1u)
+		// Signed:   (value << (W - bits - offset)) >> (W - bits)  [arithmetic right shift sign-extends]
+		uint32_t result_type = ops[0];
+		uint32_t result_id = ops[1];
+		uint32_t value = ops[2];
+		uint32_t offset_id = ops[3];
+		uint32_t bits_id = ops[4];
+		auto &type = get<SPIRType>(result_type);
+		uint32_t width = type.width;
+
+		// Every operand is spliced next to shift/subtract/mask operators, so forwarded expressions must be
+		// enclosed or C precedence would regroup them (e.g. "W - a + b" instead of "W - (a + b)").
+		string offset_expr = to_enclosed_expression(offset_id);
+		string bits_expr = to_enclosed_expression(bits_id);
+
+		if (opcode == OpBitFieldSExtract)
+		{
+			auto signed_basetype = to_signed_basetype(width);
+			string val_expr = enclose_expression(bitcast_expression(signed_basetype, value));
+			string expr = join("(", val_expr, " << (", width, " - ", bits_expr, " - ", offset_expr, ")) >> (",
+			                   width, " - ", bits_expr, ")");
+
+			if (type.basetype != signed_basetype)
+			{
+				SPIRType signed_type = type;
+				signed_type.basetype = signed_basetype;
+				expr = join(bitcast_glsl_op(type, signed_type), "(", expr, ")");
+			}
+
+			emit_op(result_type, result_id, expr, should_forward(value));
+		}
+		else
+		{
+			auto unsigned_basetype = to_unsigned_basetype(width);
+			string val_expr = enclose_expression(bitcast_expression(unsigned_basetype, value));
+			SPIRType uint_type = type;
+			uint_type.basetype = unsigned_basetype;
+			auto utype = type_to_glsl(uint_type);
+			string expr = join("(", val_expr, " >> ", offset_expr, ") & ((", utype, ")(1u << ", bits_expr, ") - (",
+			                   utype, ")1u)");
+
+			if (type.basetype != unsigned_basetype)
+				expr = join(bitcast_glsl_op(type, uint_type), "(", expr, ")");
+
+			emit_op(result_type, result_id, expr, should_forward(value));
+		}
+		inherit_expression_dependencies(result_id, value);
+		inherit_expression_dependencies(result_id, offset_id);
+		inherit_expression_dependencies(result_id, bits_id);
+		break;
+	}
+
+	case OpBitFieldInsert:
+	{
+		// GLSL bitfieldInsert(base, insert, offset, bits) is expanded inline; operands are enclosed so that
+		// forwarded expressions keep their grouping next to the &, << and - operators:
+		//   mask = ((1u << bits) - 1u) << offset;  result = (base & ~mask) | ((insert << offset) & mask)
+		uint32_t result_type = ops[0];
+		uint32_t result_id = ops[1];
+		uint32_t base_id = ops[2];
+		uint32_t insert_id = ops[3];
+		uint32_t offset_id = ops[4];
+		uint32_t bits_id = ops[5];
+		auto &type = get<SPIRType>(result_type);
+
+		auto unsigned_basetype = to_unsigned_basetype(type.width);
+		string base_expr = enclose_expression(bitcast_expression(unsigned_basetype, base_id));
+		string insert_expr = enclose_expression(bitcast_expression(unsigned_basetype, insert_id));
+		string offset_expr = to_enclosed_expression(offset_id);
+		string bits_expr = to_enclosed_expression(bits_id);
+		SPIRType uint_type = type;
+		uint_type.basetype = unsigned_basetype;
+		auto utype = type_to_glsl(uint_type);
+
+		string mask = join("((", utype, ")(1u << ", bits_expr, ") - (", utype, ")1u) << ", offset_expr);
+		string expr =
+		    join("(", base_expr, " & ~(", mask, ")) | ((", insert_expr, " << ", offset_expr, ") & (", mask, "))");
+
+		if (type.basetype != unsigned_basetype)
+			expr = join(bitcast_glsl_op(type, uint_type), "(", expr, ")");
+
+		emit_op(result_type, result_id, expr, should_forward(base_id) && should_forward(insert_id));
+		inherit_expression_dependencies(result_id, base_id);
+		inherit_expression_dependencies(result_id, insert_id);
+		inherit_expression_dependencies(result_id, offset_id);
+		inherit_expression_dependencies(result_id, bits_id);
+		break;
+	}
+
+	case OpBitcast:
+	{
+		auto &out_type = get<SPIRType>(ops[0]);
+		auto &in_type = expression_type(ops[2]);
+
+		// Bitcast involving pointer types needs special handling in OpenCL C.
+		if (is_pointer(out_type) || is_pointer(in_type))
+		{
+			uint32_t result_type = ops[0];
+			uint32_t result_id = ops[1];
+			uint32_t arg = ops[2];
+
+			string expr;
+			if (is_pointer(out_type) && !is_pointer(in_type))
+			{
+				// Non-pointer → pointer: cast via ulong if input is a vector (e.g. uvec2).
+				auto ptr_type_str = type_to_glsl(out_type);
+				if (in_type.vecsize > 1)
+					expr = join("((", ptr_type_str, ")as_ulong(", to_expression(arg), "))");
+				else
+					expr = join("((", ptr_type_str, ")(", to_expression(arg), "))");
+			}
+			else if (!is_pointer(out_type) && is_pointer(in_type))
+			{
+				// Pointer → non-pointer: cast to ulong, then to target type.
+				if (out_type.vecsize > 1)
+					expr = join("as_", type_to_glsl(out_type), "((ulong)(", to_expression(arg), "))");
+				else
+					expr = join("(", type_to_glsl(out_type), ")((ulong)(", to_expression(arg), "))");
+			}
+			else
+			{
+				// Pointer → pointer: direct C-style cast.
+				expr = join("((", type_to_glsl(out_type), ")(", to_expression(arg), "))");
+			}
+
+			emit_op(result_type, result_id, std::move(expr), should_forward(arg));
+			inherit_expression_dependencies(result_id, arg);
+			break;
+		}
+
+		CompilerGLSL::emit_instruction(instruction);
+		break;
+	}
+
+	case OpPtrAccessChain:
+	{
+		uint32_t result_type = ops[0];
+		uint32_t result_id = ops[1];
+		uint32_t base_id = ops[2];
+
+		TypeID base_type_id = expression_type_id(base_id);
+
+		// An explicit ArrayStride on the pointer type may differ from the stride reported for the pointee
+		// by get_physical_type_id_stride(); only that case needs the hand-written byte-offset path below.
+		if (has_decoration(base_type_id, DecorationArrayStride))
+		{
+			TypeID pointee_type_id = get_pointee_type_id(base_type_id);
+			uint32_t physical_stride = get_physical_type_id_stride(pointee_type_id);
+			uint32_t requested_stride = get_decoration(base_type_id, DecorationArrayStride);
+
+			if (physical_stride != requested_stride)
+			{
+				// Custom stride: use pointer arithmetic via ulong cast.
+				// *((__global T*)((ulong)ptr + index * stride))
+				uint32_t index_id = ops[3];
+				auto &pointee_type = get<SPIRType>(pointee_type_id);
+				auto &ptr_type = get<SPIRType>(base_type_id);
+				auto addr_space = get_type_address_space(ptr_type, 0);
+
+				string base_expr = to_enclosed_expression(base_id);
+				string intptr_expr =
+				    join("(ulong)(", base_expr, ") + ", to_enclosed_expression(index_id), " * ", requested_stride);
+				string ptr_cast = join("(", addr_space, " ", type_to_glsl(pointee_type), "*)(", intptr_expr, ")");
+				string expr = join("*(", ptr_cast, ")");
+
+				auto &e = set<SPIRExpression>(result_id, std::move(expr), result_type, should_forward(base_id));
+				auto *backing_var = maybe_get_backing_variable(base_id);
+				e.loaded_from = backing_var ? backing_var->self : ID(base_id);
+				e.access_chain = true;
+				forwarded_temporaries.insert(result_id);
+				suppressed_usage_tracking.insert(result_id);
+				inherit_expression_dependencies(result_id, base_id);
+				inherit_expression_dependencies(result_id, index_id);
+
+				// The strides are already known to differ here, so a vector pointee is marked as packed.
+				if (is_vector(pointee_type))
+					set_extended_decoration(result_id, SPIRVCrossDecorationPhysicalTypePacked);
+
+				break;
+			}
+		}
+
+		// No custom stride: defer to the base class implementation.
+		CompilerGLSL::emit_instruction(instruction);
+		break;
+	}
+
 	case OpAccessChain:
 	case OpInBoundsAccessChain:
 	{
@@ -2260,6 +2701,16 @@ void CompilerOpenCL::emit_instruction(const Instruction &instruction)
 				}
 				handled = true;
 			}
+			else if (length == 5 && !is_single_member && struct_type && !struct_type->array.empty())
+			{
+				// Array of multi-member SSBOs: ptr[array_idx].member_name
+				// ops[3] = array index (dynamic), ops[4] = member index (constant)
+				uint32_t mbr_idx = get<SPIRConstant>(ops[4]).scalar();
+				auto mbr_name = to_member_name(*struct_type, mbr_idx);
+				expr = join(to_name(base_id), "[", to_expression(ops[3]), "].", mbr_name);
+				is_subscript_deref = true;
+				handled = true;
+			}
 			else if (length == 5 && !is_single_member && struct_type)
 			{
 				// Multi-member SSBO: ptr->member_name[element_idx]
@@ -2277,6 +2728,13 @@ void CompilerOpenCL::emit_instruction(const Instruction &instruction)
 				is_subscript_deref = true;
 				handled = true;
 			}
+			else if (length == 4 && !is_single_member && struct_type && !struct_type->array.empty())
+			{
+				// Array of multi-member SSBOs: ptr[array_idx] (result is struct)
+				expr = join(to_name(base_id), "[", to_expression(ops[3]), "]");
+				is_subscript_deref = true;
+				handled = true;
+			}
 			else if (length == 4 && !is_single_member && struct_type)
 			{
 				// Multi-member SSBO: ptr->member_name (lvalue, not address-of)
@@ -2717,6 +3175,136 @@ void CompilerOpenCL::emit_instruction(const Instruction &instruction)
 		break;
 	}
 
+	case OpSDot:
+	case OpUDot:
+	case OpSUDot:
+	{
+		uint32_t result_type = ops[0];
+		uint32_t id = ops[1];
+		uint32_t vec1 = ops[2];
+		uint32_t vec2 = ops[3];
+
+		auto &input_type1 = expression_type(vec1);
+		auto &input_type2 = expression_type(vec2);
+		auto &type = get<SPIRType>(result_type);
+
+		string vec1input, vec2input;
+		uint32_t input_size = input_type1.vecsize;
+
+		if (instruction.length == 5)
+		{
+			if (ops[4] == PackedVectorFormatPackedVectorFormat4x8Bit)
+			{
+				string type1 = opcode == OpSDot || opcode == OpSUDot ? "char4" : "uchar4";
+				vec1input = join("as_", type1, "(", to_expression(vec1), ")");
+				string type2 = opcode == OpSDot ? "char4" : "uchar4";
+				vec2input = join("as_", type2, "(", to_expression(vec2), ")");
+				input_size = 4;
+			}
+			else
+				SPIRV_CROSS_THROW("Packed vector formats other than 4x8Bit for integer dot product is not supported.");
+		}
+		else
+		{
+			SPIRType::BaseType vec1_expected_type =
+			    opcode != OpUDot ? to_signed_basetype(input_type1.width) : to_unsigned_basetype(input_type1.width);
+			SPIRType::BaseType vec2_expected_type =
+			    opcode != OpSDot ? to_unsigned_basetype(input_type2.width) : to_signed_basetype(input_type2.width);
+
+			vec1input = enclose_expression(bitcast_expression(vec1_expected_type, vec1));
+			vec2input = enclose_expression(bitcast_expression(vec2_expected_type, vec2));
+		}
+
+		// Emit inline sum of component-wise products (inputs are enclosed above so ".sN" selects from the whole
+		// operand): (result_type)(a.s0) * (result_type)(b.s0) + ... + (result_type)(a.sN) * (result_type)(b.sN)
+		auto result_type_str = type_to_glsl(type);
+		string exp;
+		for (uint32_t i = 0; i < input_size; i++)
+		{
+			if (i > 0)
+				exp += " + ";
+			string comp = input_size > 1 ? join(".s", i) : "";
+			exp +=
+			    join("(", result_type_str, ")(", vec1input, comp, ") * (", result_type_str, ")(", vec2input, comp, ")");
+		}
+
+		emit_op(result_type, id, exp, should_forward(vec1) && should_forward(vec2));
+		inherit_expression_dependencies(id, vec1);
+		inherit_expression_dependencies(id, vec2);
+		break;
+	}
+
+	case OpSDotAccSat:
+	case OpUDotAccSat:
+	case OpSUDotAccSat:
+	{
+		uint32_t result_type = ops[0];
+		uint32_t id = ops[1];
+		uint32_t vec1 = ops[2];
+		uint32_t vec2 = ops[3];
+		uint32_t acc = ops[4];
+
+		auto &input_type1 = expression_type(vec1);
+		auto &input_type2 = expression_type(vec2);
+		auto &type = get<SPIRType>(result_type);
+
+		string vec1input, vec2input;
+		uint32_t input_size = input_type1.vecsize;
+
+		if (instruction.length == 6)
+		{
+			if (ops[5] == PackedVectorFormatPackedVectorFormat4x8Bit)
+			{
+				string type1 = opcode == OpSDotAccSat || opcode == OpSUDotAccSat ? "char4" : "uchar4";
+				vec1input = join("as_", type1, "(", to_expression(vec1), ")");
+				string type2 = opcode == OpSDotAccSat ? "char4" : "uchar4";
+				vec2input = join("as_", type2, "(", to_expression(vec2), ")");
+				input_size = 4;
+			}
+			else
+				SPIRV_CROSS_THROW("Packed vector formats other than 4x8Bit for integer dot product is not supported.");
+		}
+		else
+		{
+			SPIRType::BaseType vec1_expected_type = opcode != OpUDotAccSat ? to_signed_basetype(input_type1.width) :
+			                                                                 to_unsigned_basetype(input_type1.width);
+			SPIRType::BaseType vec2_expected_type = opcode != OpSDotAccSat ? to_unsigned_basetype(input_type2.width) :
+			                                                                 to_signed_basetype(input_type2.width);
+
+			vec1input = enclose_expression(bitcast_expression(vec1_expected_type, vec1));
+			vec2input = enclose_expression(bitcast_expression(vec2_expected_type, vec2));
+		}
+
+		SPIRType::BaseType pre_saturate_type =
+		    opcode != OpUDotAccSat ? to_signed_basetype(type.width) : to_unsigned_basetype(type.width);
+
+		// Use the pre-saturate type for internal computation so add_sat arguments match.
+		SPIRType sat_type = type;
+		sat_type.basetype = pre_saturate_type;
+		auto sat_type_str = type_to_glsl(sat_type);
+		auto result_type_str = type_to_glsl(type);
+
+		// Sum the component-wise products (inputs enclosed above so ".sN" selects from the whole operand),
+		// then add_sat with the accumulator and cast to the result type.
+		string dot_exp;
+		for (uint32_t i = 0; i < input_size; i++)
+		{
+			if (i > 0)
+				dot_exp += " + ";
+			string comp = input_size > 1 ? join(".s", i) : "";
+			dot_exp +=
+			    join("(", sat_type_str, ")(", vec1input, comp, ") * (", sat_type_str, ")(", vec2input, comp, ")");
+		}
+		string exp =
+		    join("(", result_type_str, ")add_sat(", dot_exp, ", ", bitcast_expression(pre_saturate_type, acc), ")");
+
+		emit_op(result_type, id, exp, should_forward(vec1) && should_forward(vec2));
+		inherit_expression_dependencies(id, vec1);
+		inherit_expression_dependencies(id, vec2);
+		inherit_expression_dependencies(id, acc);
+		break;
+	}
+
 	default:
 		CompilerGLSL::emit_instruction(instruction);
 		break;
diff --git a/spirv_opencl.hpp b/spirv_opencl.hpp
index defe75032..a9991fa90 100644
--- a/spirv_opencl.hpp
+++ b/spirv_opencl.hpp
@@ -147,6 +147,8 @@ class CompilerOpenCL : public CompilerGLSL
 	// Set when packHalf2x16/unpackHalf2x16 polyfill helpers are needed.
 	bool needs_half_pack_polyfill = false;
 	bool needs_half_unpack_polyfill = false;
+	// Set when bitfieldReverse polyfill is needed.
+	bool needs_bitreverse_polyfill = false;
 	// Set when a default sampler is needed for combined image+sampler usage.
 	bool needs_default_sampler = false;
 

From cb78845e2d2438f04b03984ea7bd62914a47793c Mon Sep 17 00:00:00 2001
From: Garrick Meeker <gmeeker@gmail.com>
Date: Sat, 14 Mar 2026 10:12:47 -0700
Subject: [PATCH 06/16] OpenCL: support for matrices and half types

---
 main.cpp                                      |   16 +-
 .../comp/relaxed-block-layout.fp16.asm.comp}  |    1 +
 .../comp/replicated-composites.spv16.asm.comp |    6 +-
 .../opt/shaders-opencl/comp/dowhile.comp      |   11 +-
 .../opt/shaders-opencl/comp/inverse.comp      |   53 +
 .../comp/mat3-row-maj-read-write-const.comp   |   45 +
 reference/opt/shaders-opencl/comp/mat3.comp   |   19 +
 .../shaders-opencl/comp/outer-product.comp    |  127 +-
 .../opt/shaders-opencl/comp/rmw-matrix.comp   |   41 +
 .../comp/shared-matrix-array-of-array.comp    |  430 ++++
 .../comp/shared-matrix-cast.comp              |  131 ++
 .../shared-matrix-nested-struct-array.comp    |  463 ++++
 .../comp/shared-matrix-nested-struct.comp     |  583 +++++
 ...50.double.comp => shared-std450.fp64.comp} |    0
 .../shaders-opencl/comp/struct-layout.comp    |   21 +-
 .../shaders-opencl/comp/struct-packing.comp   |  125 ++
 .../comp/relaxed-block-layout.fp16.asm.comp   |   24 +
 .../comp/replicated-composites.spv16.asm.comp |    6 +-
 reference/shaders-opencl/comp/dowhile.comp    |   25 +-
 reference/shaders-opencl/comp/inverse.comp    |   53 +
 .../comp/mat3-row-maj-read-write-const.comp   |   47 +
 reference/shaders-opencl/comp/mat3.comp       |   20 +
 .../shaders-opencl/comp/outer-product.comp    |  127 +-
 reference/shaders-opencl/comp/rmw-matrix.comp |   41 +
 ...alar-std450-distance-length-normalize.comp |    8 +-
 .../comp/shared-matrix-array-of-array.comp    |  357 ++++
 .../comp/shared-matrix-cast.comp              |  174 ++
 .../shared-matrix-nested-struct-array.comp    |  401 ++++
 .../comp/shared-matrix-nested-struct.comp     |  598 ++++++
 ...50.double.comp => shared-std450.fp64.comp} |    0
 .../shaders-opencl/comp/struct-layout.comp    |   23 +-
 .../shaders-opencl/comp/struct-packing.comp   |  125 ++
 .../comp/struct-packing.invalid.comp          |    0
 .../shaders-opencl/comp/torture-loop.comp     |   11 +-
 ...omp => relaxed-block-layout.fp16.asm.comp} |    0
 shaders-opencl/comp/inverse.comp              |   23 +
 .../comp/mat3-row-maj-read-write-const.comp   |   17 +
 shaders-opencl/comp/mat3.comp                 |   14 +
 shaders-opencl/comp/rmw-matrix.comp           |   20 +
 .../comp/shared-matrix-array-of-array.comp    |   65 +
 shaders-opencl/comp/shared-matrix-cast.comp   |   33 +
 .../shared-matrix-nested-struct-array.comp    |   87 +
 .../comp/shared-matrix-nested-struct.comp     |  141 ++
 ...50.double.comp => shared-std450.fp64.comp} |    0
 ...cking.invalid.comp => struct-packing.comp} |    0
 spirv_cross_c.cpp                             |   13 +-
 spirv_cross_c.h                               |   11 +-
 spirv_glsl.cpp                                | 1899 +++++++++--------
 spirv_glsl.hpp                                |    5 +-
 spirv_opencl.cpp                              | 1670 ++++++++++++++-
 spirv_opencl.hpp                              |   84 +-
 test_shaders.py                               |   30 +-
 52 files changed, 7127 insertions(+), 1097 deletions(-)
 rename reference/{shaders-opencl/asm/comp/relaxed-block-layout.asm.comp => opt/shaders-opencl/asm/comp/relaxed-block-layout.fp16.asm.comp} (94%)
 create mode 100644 reference/opt/shaders-opencl/comp/inverse.comp
 create mode 100644 reference/opt/shaders-opencl/comp/mat3-row-maj-read-write-const.comp
 create mode 100644 reference/opt/shaders-opencl/comp/mat3.comp
 create mode 100644 reference/opt/shaders-opencl/comp/rmw-matrix.comp
 create mode 100644 reference/opt/shaders-opencl/comp/shared-matrix-array-of-array.comp
 create mode 100644 reference/opt/shaders-opencl/comp/shared-matrix-cast.comp
 create mode 100644 reference/opt/shaders-opencl/comp/shared-matrix-nested-struct-array.comp
 create mode 100644 reference/opt/shaders-opencl/comp/shared-matrix-nested-struct.comp
 rename reference/opt/shaders-opencl/comp/{shared-std450.double.comp => shared-std450.fp64.comp} (100%)
 create mode 100644 reference/opt/shaders-opencl/comp/struct-packing.comp
 create mode 100644 reference/shaders-opencl/asm/comp/relaxed-block-layout.fp16.asm.comp
 create mode 100644 reference/shaders-opencl/comp/inverse.comp
 create mode 100644 reference/shaders-opencl/comp/mat3-row-maj-read-write-const.comp
 create mode 100644 reference/shaders-opencl/comp/mat3.comp
 create mode 100644 reference/shaders-opencl/comp/rmw-matrix.comp
 create mode 100644 reference/shaders-opencl/comp/shared-matrix-array-of-array.comp
 create mode 100644 reference/shaders-opencl/comp/shared-matrix-cast.comp
 create mode 100644 reference/shaders-opencl/comp/shared-matrix-nested-struct-array.comp
 create mode 100644 reference/shaders-opencl/comp/shared-matrix-nested-struct.comp
 rename reference/shaders-opencl/comp/{shared-std450.double.comp => shared-std450.fp64.comp} (100%)
 create mode 100644 reference/shaders-opencl/comp/struct-packing.comp
 delete mode 100644 reference/shaders-opencl/comp/struct-packing.invalid.comp
 rename shaders-opencl/asm/comp/{relaxed-block-layout.asm.comp => relaxed-block-layout.fp16.asm.comp} (100%)
 create mode 100644 shaders-opencl/comp/inverse.comp
 create mode 100644 shaders-opencl/comp/mat3-row-maj-read-write-const.comp
 create mode 100644 shaders-opencl/comp/mat3.comp
 create mode 100644 shaders-opencl/comp/rmw-matrix.comp
 create mode 100644 shaders-opencl/comp/shared-matrix-array-of-array.comp
 create mode 100644 shaders-opencl/comp/shared-matrix-cast.comp
 create mode 100644 shaders-opencl/comp/shared-matrix-nested-struct-array.comp
 create mode 100644 shaders-opencl/comp/shared-matrix-nested-struct.comp
 rename shaders-opencl/comp/{shared-std450.double.comp => shared-std450.fp64.comp} (100%)
 rename shaders-opencl/comp/{struct-packing.invalid.comp => struct-packing.comp} (100%)

diff --git a/main.cpp b/main.cpp
index 2fc6ced5c..a53f5e758 100644
--- a/main.cpp
+++ b/main.cpp
@@ -778,10 +778,13 @@ struct CLIArguments
 
 	bool opencl = false;
 	uint32_t opencl_version = 120;
+	bool opencl_enable_fp16 = false;
 	bool opencl_enable_fp64 = false;
 	bool opencl_enable_64bit_atomics = false;
 	bool opencl_enable_subgroups = false;
-	bool opencl_enable_shuffle = false;
+	bool opencl_enable_subgroups_all = false;
+	bool opencl_emulate_subgroups = false;
+	uint32_t opencl_fixed_subgroup_size = 0;
 };
 
 static void print_version()
@@ -1362,9 +1365,12 @@ static string compile_iteration(const CLIArguments &args, std::vector<uint32_t>
 		auto *ocl_comp = static_cast<CompilerOpenCL *>(compiler.get());
 		CompilerOpenCL::Options ocl_opts = ocl_comp->get_opencl_options();
 		ocl_opts.opencl_version = args.opencl_version;
+		ocl_opts.enable_fp16 = args.opencl_enable_fp16;
 		ocl_opts.enable_fp64 = args.opencl_enable_fp64;
 		ocl_opts.enable_subgroups = args.opencl_enable_subgroups;
-		ocl_opts.enable_shuffle = args.opencl_enable_shuffle;
+		ocl_opts.enable_subgroups_all = args.opencl_enable_subgroups_all;
+		ocl_opts.emulate_subgroups = args.opencl_emulate_subgroups;
+		ocl_opts.fixed_subgroup_size = args.opencl_fixed_subgroup_size;
 		ocl_comp->set_opencl_options(ocl_opts);
 	}
 	else if (args.hlsl)
@@ -1995,10 +2001,14 @@ static int main_inner(int argc, char *argv[])
 	        });
 	cbs.add("--opencl", [&args](CLIParser &) { args.opencl = true; });
 	cbs.add("--opencl-version", [&args](CLIParser &parser) { args.opencl_version = parser.next_uint(); });
+	cbs.add("--opencl-fp16", [&args](CLIParser &) { args.opencl_enable_fp16 = true; });
 	cbs.add("--opencl-fp64", [&args](CLIParser &) { args.opencl_enable_fp64 = true; });
 	cbs.add("--opencl-64bit-atomics", [&args](CLIParser &) { args.opencl_enable_64bit_atomics = true; });
 	cbs.add("--opencl-subgroups", [&args](CLIParser &) { args.opencl_enable_subgroups = true; });
-	cbs.add("--opencl-shuffle", [&args](CLIParser &) { args.opencl_enable_shuffle = true; });
+	cbs.add("--opencl-subgroups-all", [&args](CLIParser &) { args.opencl_enable_subgroups_all = true; });
+	cbs.add("--opencl-emulate-subgroups", [&args](CLIParser &) { args.opencl_emulate_subgroups = true; });
+	cbs.add("--opencl-fixed-subgroup-size",
+	        [&args](CLIParser &parser) { args.opencl_fixed_subgroup_size = parser.next_uint(); });
 	cbs.add("--extension", [&args](CLIParser &parser) { args.extensions.push_back(parser.next_string()); });
 	cbs.add("--rename-entry-point",
 	        [&args](CLIParser &parser)
diff --git a/reference/shaders-opencl/asm/comp/relaxed-block-layout.asm.comp b/reference/opt/shaders-opencl/asm/comp/relaxed-block-layout.fp16.asm.comp
similarity index 94%
rename from reference/shaders-opencl/asm/comp/relaxed-block-layout.asm.comp
rename to reference/opt/shaders-opencl/asm/comp/relaxed-block-layout.fp16.asm.comp
index ddae4bb54..a8926a145 100644
--- a/reference/shaders-opencl/asm/comp/relaxed-block-layout.asm.comp
+++ b/reference/opt/shaders-opencl/asm/comp/relaxed-block-layout.fp16.asm.comp
@@ -1,5 +1,6 @@
 // Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
 
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
 struct foo
 {
diff --git a/reference/opt/shaders-opencl/asm/comp/replicated-composites.spv16.asm.comp b/reference/opt/shaders-opencl/asm/comp/replicated-composites.spv16.asm.comp
index 545ecf547..90501e9a1 100644
--- a/reference/opt/shaders-opencl/asm/comp/replicated-composites.spv16.asm.comp
+++ b/reference/opt/shaders-opencl/asm/comp/replicated-composites.spv16.asm.comp
@@ -1,6 +1,8 @@
 // Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
 
 
+typedef struct { float4 columns[4]; } spvMat4;
+
 #ifndef SPIRV_CROSS_CONSTANT_ID_0
 #define SPIRV_CROSS_CONSTANT_ID_0 0.0f
 #endif
@@ -21,10 +23,10 @@ __attribute__((reqd_work_group_size(1, 1, 1)))
 __kernel void comp_main(UBO ubo)
 {
     float4 a_1 = (float4)(0.0f);
-    float4 b_1 = (float4)(1.0f);
+    spvMat4 b_1 = (spvMat4){ { (float4)(1.0f), (float4)(1.0f), (float4)(1.0f), (float4)(1.0f) } };
     float4 c_1 = _20;
     float4 _36 = (float4)(ubo.uniform_float);
     float4 d_1 = _36;
-    float4 e_1 = _36;
+    spvMat4 e_1 = (spvMat4){ { _36, _36, _36, _36 } };
 }
 
diff --git a/reference/opt/shaders-opencl/comp/dowhile.comp b/reference/opt/shaders-opencl/comp/dowhile.comp
index e5a51f6be..d858c8428 100644
--- a/reference/opt/shaders-opencl/comp/dowhile.comp
+++ b/reference/opt/shaders-opencl/comp/dowhile.comp
@@ -1,9 +1,11 @@
 // Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
 
 
+typedef struct { float4 columns[4]; } spvMat4;
+
 struct SSBO
 {
-    float4 mvp;
+    spvMat4 mvp;
     float4 in_data[1];
 };
 
@@ -16,6 +18,11 @@ struct SSBO2
 
 typedef struct SSBO2 SSBO2;
 
+static float4 spvMulMat4Vec4(spvMat4 m, float4 v)
+{
+    return m.columns[0] * v.x + m.columns[1] * v.y + m.columns[2] * v.z + m.columns[3] * v.w;
+}
+
 __attribute__((reqd_work_group_size(1, 1, 1)))
 __kernel void comp_main(__global const SSBO* _28, __global float4* _52)
 {
@@ -27,7 +34,7 @@ __kernel void comp_main(__global const SSBO* _28, __global float4* _52)
     float4 _42;
     for (;;)
     {
-        _42 = _28->mvp * _59;
+        _42 = spvMulMat4Vec4(_28->mvp, _59);
         int _44 = _60 + 1;
         if (_44 < 16)
         {
diff --git a/reference/opt/shaders-opencl/comp/inverse.comp b/reference/opt/shaders-opencl/comp/inverse.comp
new file mode 100644
index 000000000..3db4ff542
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/inverse.comp
@@ -0,0 +1,53 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+typedef struct { float2 columns[2]; } spvMat2;
+typedef struct { float3 columns[3]; } spvMat3;
+typedef struct { float4 columns[4]; } spvMat4;
+
+struct MatrixOut
+{
+    spvMat2 m2out;
+    spvMat3 m3out;
+    spvMat4 m4out;
+};
+
+typedef struct MatrixOut MatrixOut;
+
+struct MatrixIn
+{
+    spvMat2 m2in;
+    spvMat3 m3in;
+    spvMat4 m4in;
+};
+
+typedef struct MatrixIn MatrixIn;
+
+static spvMat2 spvInverse2(spvMat2 m) {
+    float d = 1.0f / (m.columns[0].x * m.columns[1].y - m.columns[1].x * m.columns[0].y);
+    return (spvMat2){ { (float2)(m.columns[1].y * d, -m.columns[0].y * d), (float2)(-m.columns[1].x * d, m.columns[0].x * d) } };
+}
+
+static spvMat3 spvInverse3(spvMat3 m) {
+    float3 t = (float3)(m.columns[1].y * m.columns[2].z - m.columns[1].z * m.columns[2].y, m.columns[1].z * m.columns[2].x - m.columns[1].x * m.columns[2].z, m.columns[1].x * m.columns[2].y - m.columns[1].y * m.columns[2].x);
+    float d = 1.0f / dot(m.columns[0], t);
+    return (spvMat3){ { t * d, (float3)(m.columns[0].z * m.columns[2].y - m.columns[0].y * m.columns[2].z, m.columns[0].x * m.columns[2].z - m.columns[0].z * m.columns[2].x, m.columns[0].y * m.columns[2].x - m.columns[0].x * m.columns[2].y) * d, (float3)(m.columns[0].y * m.columns[1].z - m.columns[0].z * m.columns[1].y, m.columns[0].z * m.columns[1].x - m.columns[0].x * m.columns[1].z, m.columns[0].x * m.columns[1].y - m.columns[0].y * m.columns[1].x) * d } };
+}
+
+static spvMat4 spvInverse4(spvMat4 m) {
+    float4 t = (float4)(m.columns[2].y * m.columns[3].z * m.columns[1].w - m.columns[3].y * m.columns[2].z * m.columns[1].w + m.columns[3].y * m.columns[1].z * m.columns[2].w - m.columns[1].y * m.columns[3].z * m.columns[2].w - m.columns[2].y * m.columns[1].z * m.columns[3].w + m.columns[1].y * m.columns[2].z * m.columns[3].w, m.columns[3].x * m.columns[2].z * m.columns[1].w - m.columns[2].x * m.columns[3].z * m.columns[1].w - m.columns[3].x * m.columns[1].z * m.columns[2].w + m.columns[1].x * m.columns[3].z * m.columns[2].w + m.columns[2].x * m.columns[1].z * m.columns[3].w - m.columns[1].x * m.columns[2].z * m.columns[3].w, m.columns[2].x * m.columns[3].y * m.columns[1].w - m.columns[3].x * m.columns[2].y * m.columns[1].w + m.columns[3].x * m.columns[1].y * m.columns[2].w - m.columns[1].x * m.columns[3].y * m.columns[2].w - m.columns[2].x * m.columns[1].y * m.columns[3].w + m.columns[1].x * m.columns[2].y * m.columns[3].w, m.columns[3].x * m.columns[2].y * m.columns[1].z - m.columns[2].x * m.columns[3].y * m.columns[1].z - m.columns[3].x * m.columns[1].y * m.columns[2].z + m.columns[1].x * m.columns[3].y * m.columns[2].z + m.columns[2].x * m.columns[1].y * m.columns[3].z - m.columns[1].x * m.columns[2].y * m.columns[3].z);
+    spvMat4 r = (spvMat4){ { (float4)(t.x, m.columns[3].y * m.columns[2].z * m.columns[0].w - m.columns[2].y * m.columns[3].z * m.columns[0].w - m.columns[3].y * m.columns[0].z * m.columns[2].w + m.columns[0].y * m.columns[3].z * m.columns[2].w + m.columns[2].y * m.columns[0].z * m.columns[3].w - m.columns[0].y * m.columns[2].z * m.columns[3].w, m.columns[1].y * m.columns[3].z * m.columns[0].w - m.columns[3].y * m.columns[1].z * m.columns[0].w + m.columns[3].y * m.columns[0].z * m.columns[1].w - m.columns[0].y * m.columns[3].z * m.columns[1].w - m.columns[1].y * m.columns[0].z * m.columns[3].w + m.columns[0].y * m.columns[1].z * m.columns[3].w, m.columns[2].y * m.columns[1].z * m.columns[0].w - m.columns[1].y * m.columns[2].z * m.columns[0].w - m.columns[2].y * m.columns[0].z * m.columns[1].w + m.columns[0].y * m.columns[2].z * m.columns[1].w + m.columns[1].y * m.columns[0].z * m.columns[2].w - m.columns[0].y * m.columns[1].z * m.columns[2].w), (float4)(t.y, m.columns[2].x * m.columns[3].z * m.columns[0].w - m.columns[3].x * m.columns[2].z * m.columns[0].w + m.columns[3].x * m.columns[0].z * m.columns[2].w - m.columns[0].x * m.columns[3].z * m.columns[2].w - m.columns[2].x * m.columns[0].z * m.columns[3].w + m.columns[0].x * m.columns[2].z * m.columns[3].w, m.columns[3].x * m.columns[1].z * m.columns[0].w - m.columns[1].x * m.columns[3].z * m.columns[0].w - m.columns[3].x * m.columns[0].z * m.columns[1].w + m.columns[0].x * m.columns[3].z * m.columns[1].w + m.columns[1].x * m.columns[0].z * m.columns[3].w - m.columns[0].x * m.columns[1].z * m.columns[3].w, m.columns[1].x * m.columns[2].z * m.columns[0].w - m.columns[2].x * m.columns[1].z * m.columns[0].w + m.columns[2].x * m.columns[0].z * m.columns[1].w - m.columns[0].x * m.columns[2].z * m.columns[1].w - m.columns[1].x * m.columns[0].z * m.columns[2].w + m.columns[0].x * m.columns[1].z * m.columns[2].w), (float4)(t.z, m.columns[3].x * m.columns[2].y * m.columns[0].w - m.columns[2].x * m.columns[3].y * 
m.columns[0].w - m.columns[3].x * m.columns[0].y * m.columns[2].w + m.columns[0].x * m.columns[3].y * m.columns[2].w + m.columns[2].x * m.columns[0].y * m.columns[3].w - m.columns[0].x * m.columns[2].y * m.columns[3].w, m.columns[1].x * m.columns[3].y * m.columns[0].w - m.columns[3].x * m.columns[1].y * m.columns[0].w + m.columns[3].x * m.columns[0].y * m.columns[1].w - m.columns[0].x * m.columns[3].y * m.columns[1].w - m.columns[1].x * m.columns[0].y * m.columns[3].w + m.columns[0].x * m.columns[1].y * m.columns[3].w, m.columns[2].x * m.columns[1].y * m.columns[0].w - m.columns[1].x * m.columns[2].y * m.columns[0].w - m.columns[2].x * m.columns[0].y * m.columns[1].w + m.columns[0].x * m.columns[2].y * m.columns[1].w + m.columns[1].x * m.columns[0].y * m.columns[2].w - m.columns[0].x * m.columns[1].y * m.columns[2].w), (float4)(t.w, m.columns[2].x * m.columns[3].y * m.columns[0].z - m.columns[3].x * m.columns[2].y * m.columns[0].z + m.columns[3].x * m.columns[0].y * m.columns[2].z - m.columns[0].x * m.columns[3].y * m.columns[2].z - m.columns[2].x * m.columns[0].y * m.columns[3].z + m.columns[0].x * m.columns[2].y * m.columns[3].z, m.columns[3].x * m.columns[1].y * m.columns[0].z - m.columns[1].x * m.columns[3].y * m.columns[0].z - m.columns[3].x * m.columns[0].y * m.columns[1].z + m.columns[0].x * m.columns[3].y * m.columns[1].z + m.columns[1].x * m.columns[0].y * m.columns[3].z - m.columns[0].x * m.columns[1].y * m.columns[3].z, m.columns[1].x * m.columns[2].y * m.columns[0].z - m.columns[2].x * m.columns[1].y * m.columns[0].z + m.columns[2].x * m.columns[0].y * m.columns[1].z - m.columns[0].x * m.columns[2].y * m.columns[1].z - m.columns[1].x * m.columns[0].y * m.columns[2].z + m.columns[0].x * m.columns[1].y * m.columns[2].z) } };
+    float d = 1.0f / dot(m.columns[0], t);
+    r.columns[0] *= d; r.columns[1] *= d; r.columns[2] *= d; r.columns[3] *= d;
+    return r;
+}
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global MatrixOut* _15, __global const MatrixIn* _20)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    _15->m2out = spvInverse2(_20->m2in);
+    _15->m3out = spvInverse3(_20->m3in);
+    _15->m4out = spvInverse4(_20->m4in);
+}
+
diff --git a/reference/opt/shaders-opencl/comp/mat3-row-maj-read-write-const.comp b/reference/opt/shaders-opencl/comp/mat3-row-maj-read-write-const.comp
new file mode 100644
index 000000000..a1b4522fa
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/mat3-row-maj-read-write-const.comp
@@ -0,0 +1,45 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+typedef struct { float3 columns[3]; } spvMat3;
+
+struct model_t
+{
+    spvMat3 mtx_rm;
+};
+
+typedef struct model_t model_t;
+
+static float3 spvMulMat3Vec3(spvMat3 m, float3 v)
+{
+    return m.columns[0] * v.x + m.columns[1] * v.y + m.columns[2] * v.z;
+}
+
+static spvMat3 spvMulMat3Mat3(spvMat3 a, spvMat3 b)
+{
+    spvMat3 r;
+    r.columns[0] = spvMulMat3Vec3(a, b.columns[0]);
+    r.columns[1] = spvMulMat3Vec3(a, b.columns[1]);
+    r.columns[2] = spvMulMat3Vec3(a, b.columns[2]);
+    return r;
+}
+
+static spvMat3 spvTransposeMat3(spvMat3 m)
+{
+    spvMat3 r;
+    r.columns[0] = (float3)(m.columns[0].x, m.columns[1].x, m.columns[2].x);
+    r.columns[1] = (float3)(m.columns[0].y, m.columns[1].y, m.columns[2].y);
+    r.columns[2] = (float3)(m.columns[0].z, m.columns[1].z, m.columns[2].z);
+    return r;
+}
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global spvMat3* model)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    if (spvMulMat3Mat3(spvTransposeMat3(model[0]), (spvMat3){ { (float3)(4.0f, -3.0f, 1.0f), (float3)(-7.0f, 7.0f, -7.0f), (float3)(-5.0f, 6.0f, -8.0f) } }).columns[0].x != 0.0f)
+    {
+        model[0] = spvTransposeMat3((spvMat3){ { (float3)(-5.0f, -3.0f, -5.0f), (float3)(-2.0f, 2.0f, -5.0f), (float3)(6.0f, 3.0f, -8.0f) } });
+    }
+}
+
diff --git a/reference/opt/shaders-opencl/comp/mat3.comp b/reference/opt/shaders-opencl/comp/mat3.comp
new file mode 100644
index 000000000..bc825c561
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/mat3.comp
@@ -0,0 +1,19 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+typedef struct { float3 columns[3]; } spvMat3;
+
+struct SSBO2
+{
+    spvMat3 out_data[1];
+};
+
+typedef struct SSBO2 SSBO2;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global spvMat3* _22)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    _22[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] = (spvMat3){ { (float3)(10.0f), (float3)(20.0f), (float3)(40.0f) } };
+}
+
diff --git a/reference/opt/shaders-opencl/comp/outer-product.comp b/reference/opt/shaders-opencl/comp/outer-product.comp
index 4462fc221..b607c18c3 100644
--- a/reference/opt/shaders-opencl/comp/outer-product.comp
+++ b/reference/opt/shaders-opencl/comp/outer-product.comp
@@ -1,17 +1,27 @@
 // Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
 
 
+typedef struct { float2 columns[2]; } spvMat2;
+typedef struct { float3 columns[2]; } spvMat2x3;
+typedef struct { float4 columns[2]; } spvMat2x4;
+typedef struct { float2 columns[3]; } spvMat3x2;
+typedef struct { float3 columns[3]; } spvMat3;
+typedef struct { float4 columns[3]; } spvMat3x4;
+typedef struct { float2 columns[4]; } spvMat4x2;
+typedef struct { float3 columns[4]; } spvMat4x3;
+typedef struct { float4 columns[4]; } spvMat4;
+
 struct SSBO
 {
-    float2 m22;
-    float3 m23;
-    float4 m24;
-    float2 m32;
-    float3 m33;
-    float4 m34;
-    float2 m42;
-    float3 m43;
-    float4 m44;
+    spvMat2 m22;
+    spvMat2x3 m23;
+    spvMat2x4 m24;
+    spvMat3x2 m32;
+    spvMat3 m33;
+    spvMat3x4 m34;
+    spvMat4x2 m42;
+    spvMat4x3 m43;
+    spvMat4 m44;
 };
 
 typedef struct SSBO SSBO;
@@ -25,21 +35,102 @@ struct ReadSSBO
 
 typedef struct ReadSSBO ReadSSBO;
 
+static spvMat2 spvOuterProductVec2Vec2(float2 c, float2 r)
+{
+    spvMat2 m;
+    m.columns[0] = c * r.x;
+    m.columns[1] = c * r.y;
+    return m;
+}
+
+static spvMat2x3 spvOuterProductVec3Vec2(float3 c, float2 r)
+{
+    spvMat2x3 m;
+    m.columns[0] = c * r.x;
+    m.columns[1] = c * r.y;
+    return m;
+}
+
+static spvMat2x4 spvOuterProductVec4Vec2(float4 c, float2 r)
+{
+    spvMat2x4 m;
+    m.columns[0] = c * r.x;
+    m.columns[1] = c * r.y;
+    return m;
+}
+
+static spvMat3x2 spvOuterProductVec2Vec3(float2 c, float3 r)
+{
+    spvMat3x2 m;
+    m.columns[0] = c * r.x;
+    m.columns[1] = c * r.y;
+    m.columns[2] = c * r.z;
+    return m;
+}
+
+static spvMat3 spvOuterProductVec3Vec3(float3 c, float3 r)
+{
+    spvMat3 m;
+    m.columns[0] = c * r.x;
+    m.columns[1] = c * r.y;
+    m.columns[2] = c * r.z;
+    return m;
+}
+
+static spvMat3x4 spvOuterProductVec4Vec3(float4 c, float3 r)
+{
+    spvMat3x4 m;
+    m.columns[0] = c * r.x;
+    m.columns[1] = c * r.y;
+    m.columns[2] = c * r.z;
+    return m;
+}
+
+static spvMat4x2 spvOuterProductVec2Vec4(float2 c, float4 r)
+{
+    spvMat4x2 m;
+    m.columns[0] = c * r.x;
+    m.columns[1] = c * r.y;
+    m.columns[2] = c * r.z;
+    m.columns[3] = c * r.w;
+    return m;
+}
+
+static spvMat4x3 spvOuterProductVec3Vec4(float3 c, float4 r)
+{
+    spvMat4x3 m;
+    m.columns[0] = c * r.x;
+    m.columns[1] = c * r.y;
+    m.columns[2] = c * r.z;
+    m.columns[3] = c * r.w;
+    return m;
+}
+
+static spvMat4 spvOuterProductVec4Vec4(float4 c, float4 r)
+{
+    spvMat4 m;
+    m.columns[0] = c * r.x;
+    m.columns[1] = c * r.y;
+    m.columns[2] = c * r.z;
+    m.columns[3] = c * r.w;
+    return m;
+}
+
 __attribute__((reqd_work_group_size(1, 1, 1)))
 __kernel void comp_main(__global SSBO* _21, __global const ReadSSBO* _26)
 {
     uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
     float2 _29 = _26->v2;
-    _21->m22 = _29 * _29.x;
+    _21->m22 = spvOuterProductVec2Vec2(_29, _29);
     float3 _38 = _26->v3;
-    _21->m23 = _38 * _29.x;
+    _21->m23 = spvOuterProductVec3Vec2(_38, _29);
     float4 _47 = _26->v4;
-    _21->m24 = _47 * _29.x;
-    _21->m32 = _29 * _38.x;
-    _21->m33 = _38 * _38.x;
-    _21->m34 = _47 * _38.x;
-    _21->m42 = _29 * _47.x;
-    _21->m43 = _38 * _47.x;
-    _21->m44 = _47 * _47.x;
+    _21->m24 = spvOuterProductVec4Vec2(_47, _29);
+    _21->m32 = spvOuterProductVec2Vec3(_29, _38);
+    _21->m33 = spvOuterProductVec3Vec3(_38, _38);
+    _21->m34 = spvOuterProductVec4Vec3(_47, _38);
+    _21->m42 = spvOuterProductVec2Vec4(_29, _47);
+    _21->m43 = spvOuterProductVec3Vec4(_38, _47);
+    _21->m44 = spvOuterProductVec4Vec4(_47, _47);
 }
 
diff --git a/reference/opt/shaders-opencl/comp/rmw-matrix.comp b/reference/opt/shaders-opencl/comp/rmw-matrix.comp
new file mode 100644
index 000000000..9fdc47c62
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/rmw-matrix.comp
@@ -0,0 +1,41 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+typedef struct { float4 columns[4]; } spvMat4;
+
+struct SSBO
+{
+    float a;
+    float4 b;
+    spvMat4 c;
+    float a1;
+    float4 b1;
+    spvMat4 c1;
+};
+
+typedef struct SSBO SSBO;
+
+static float4 spvMulMat4Vec4(spvMat4 m, float4 v)
+{
+    return m.columns[0] * v.x + m.columns[1] * v.y + m.columns[2] * v.z + m.columns[3] * v.w;
+}
+
+static spvMat4 spvMulMat4Mat4(spvMat4 a, spvMat4 b)
+{
+    spvMat4 r;
+    r.columns[0] = spvMulMat4Vec4(a, b.columns[0]);
+    r.columns[1] = spvMulMat4Vec4(a, b.columns[1]);
+    r.columns[2] = spvMulMat4Vec4(a, b.columns[2]);
+    r.columns[3] = spvMulMat4Vec4(a, b.columns[3]);
+    return r;
+}
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global SSBO* _11)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    _11->a *= _11->a1;
+    _11->b *= _11->b1;
+    _11->c = spvMulMat4Mat4(_11->c, _11->c1);
+}
+
diff --git a/reference/opt/shaders-opencl/comp/shared-matrix-array-of-array.comp b/reference/opt/shaders-opencl/comp/shared-matrix-array-of-array.comp
new file mode 100644
index 000000000..63af2dc47
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/shared-matrix-array-of-array.comp
@@ -0,0 +1,430 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+typedef struct { float3 columns[4]; } spvMat4x3;
+
+struct S1
+{
+    spvMat4x3 a[2];
+    float b;
+    float2 c[3];
+};
+
+typedef struct S1 S1;
+
+struct S2
+{
+    int4 a;
+    bool b[3][1][3];
+};
+
+typedef struct S2 S2;
+
+struct block
+{
+    uint passed;
+};
+
+typedef struct block block;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global uint* _383)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    __local S1 s1;
+    __local S2 s2;
+    s1.a[0] = (spvMat4x3){ { (float3)(0.0f, 2.0f, -8.0f), (float3)(6.0f, 7.0f, 5.0f), (float3)(-6.0f, 1.0f, 9.0f), (float3)(-4.0f, -3.0f, 4.0f) } };
+    s1.a[1] = (spvMat4x3){ { (float3)(4.0f, 9.0f, -9.0f), (float3)(-8.0f, -9.0f, 8.0f), (float3)(0.0f, 4.0f, -4.0f), (float3)(7.0f, 2.0f, -1.0f) } };
+    s1.b = 7.0f;
+    s1.c[0] = (float2)(-5.0f, -4.0f);
+    s1.c[1] = (float2)(3.0f, -5.0f);
+    s1.c[2] = (float2)(-3.0f, -1.0f);
+    s2.a = (int4)(1, 0, -3, 1);
+    s2.b[0][0][0] = true;
+    s2.b[0][0][1] = false;
+    s2.b[0][0][2] = false;
+    s2.b[1][0][0] = true;
+    s2.b[1][0][1] = false;
+    s2.b[1][0][2] = true;
+    s2.b[2][0][0] = false;
+    s2.b[2][0][1] = true;
+    s2.b[2][0][2] = true;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    mem_fence(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
+    bool _464 = fabs(-s1.a[0].columns[0].x) < 0.0500000007450580596923828125f;
+    bool _449;
+    if (_464)
+    {
+        _449 = fabs(2.0f - s1.a[0].columns[0].y) < 0.0500000007450580596923828125f;
+    }
+    else
+    {
+        _449 = _464;
+    }
+    bool _457;
+    if (_449)
+    {
+        _457 = fabs((-8.0f) - s1.a[0].columns[0].z) < 0.0500000007450580596923828125f;
+    }
+    else
+    {
+        _457 = _449;
+    }
+    bool _412;
+    if (_457)
+    {
+        bool _514 = fabs(6.0f - s1.a[0].columns[1].x) < 0.0500000007450580596923828125f;
+        bool _499;
+        if (_514)
+        {
+            _499 = fabs(7.0f - s1.a[0].columns[1].y) < 0.0500000007450580596923828125f;
+        }
+        else
+        {
+            _499 = _514;
+        }
+        bool _507;
+        if (_499)
+        {
+            _507 = fabs(5.0f - s1.a[0].columns[1].z) < 0.0500000007450580596923828125f;
+        }
+        else
+        {
+            _507 = _499;
+        }
+        _412 = _507;
+    }
+    else
+    {
+        _412 = _457;
+    }
+    bool _420;
+    if (_412)
+    {
+        bool _564 = fabs((-6.0f) - s1.a[0].columns[2].x) < 0.0500000007450580596923828125f;
+        bool _549;
+        if (_564)
+        {
+            _549 = fabs(1.0f - s1.a[0].columns[2].y) < 0.0500000007450580596923828125f;
+        }
+        else
+        {
+            _549 = _564;
+        }
+        bool _557;
+        if (_549)
+        {
+            _557 = fabs(9.0f - s1.a[0].columns[2].z) < 0.0500000007450580596923828125f;
+        }
+        else
+        {
+            _557 = _549;
+        }
+        _420 = _557;
+    }
+    else
+    {
+        _420 = _412;
+    }
+    bool _428;
+    if (_420)
+    {
+        bool _614 = fabs((-4.0f) - s1.a[0].columns[3].x) < 0.0500000007450580596923828125f;
+        bool _599;
+        if (_614)
+        {
+            _599 = fabs((-3.0f) - s1.a[0].columns[3].y) < 0.0500000007450580596923828125f;
+        }
+        else
+        {
+            _599 = _614;
+        }
+        bool _607;
+        if (_599)
+        {
+            _607 = fabs(4.0f - s1.a[0].columns[3].z) < 0.0500000007450580596923828125f;
+        }
+        else
+        {
+            _607 = _599;
+        }
+        _428 = _607;
+    }
+    else
+    {
+        _428 = _420;
+    }
+    bool _251;
+    if (_428)
+    {
+        bool _703 = fabs(4.0f - s1.a[1].columns[0].x) < 0.0500000007450580596923828125f;
+        bool _688;
+        if (_703)
+        {
+            _688 = fabs(9.0f - s1.a[1].columns[0].y) < 0.0500000007450580596923828125f;
+        }
+        else
+        {
+            _688 = _703;
+        }
+        bool _696;
+        if (_688)
+        {
+            _696 = fabs((-9.0f) - s1.a[1].columns[0].z) < 0.0500000007450580596923828125f;
+        }
+        else
+        {
+            _696 = _688;
+        }
+        bool _651;
+        if (_696)
+        {
+            bool _753 = fabs((-8.0f) - s1.a[1].columns[1].x) < 0.0500000007450580596923828125f;
+            bool _738;
+            if (_753)
+            {
+                _738 = fabs((-9.0f) - s1.a[1].columns[1].y) < 0.0500000007450580596923828125f;
+            }
+            else
+            {
+                _738 = _753;
+            }
+            bool _746;
+            if (_738)
+            {
+                _746 = fabs(8.0f - s1.a[1].columns[1].z) < 0.0500000007450580596923828125f;
+            }
+            else
+            {
+                _746 = _738;
+            }
+            _651 = _746;
+        }
+        else
+        {
+            _651 = _696;
+        }
+        bool _659;
+        if (_651)
+        {
+            bool _803 = fabs(-s1.a[1].columns[2].x) < 0.0500000007450580596923828125f;
+            bool _788;
+            if (_803)
+            {
+                _788 = fabs(4.0f - s1.a[1].columns[2].y) < 0.0500000007450580596923828125f;
+            }
+            else
+            {
+                _788 = _803;
+            }
+            bool _796;
+            if (_788)
+            {
+                _796 = fabs((-4.0f) - s1.a[1].columns[2].z) < 0.0500000007450580596923828125f;
+            }
+            else
+            {
+                _796 = _788;
+            }
+            _659 = _796;
+        }
+        else
+        {
+            _659 = _651;
+        }
+        bool _667;
+        if (_659)
+        {
+            bool _853 = fabs(7.0f - s1.a[1].columns[3].x) < 0.0500000007450580596923828125f;
+            bool _838;
+            if (_853)
+            {
+                _838 = fabs(2.0f - s1.a[1].columns[3].y) < 0.0500000007450580596923828125f;
+            }
+            else
+            {
+                _838 = _853;
+            }
+            bool _846;
+            if (_838)
+            {
+                _846 = fabs((-1.0f) - s1.a[1].columns[3].z) < 0.0500000007450580596923828125f;
+            }
+            else
+            {
+                _846 = _838;
+            }
+            _667 = _846;
+        }
+        else
+        {
+            _667 = _659;
+        }
+        _251 = _667;
+    }
+    else
+    {
+        _251 = _428;
+    }
+    bool _260;
+    if (_251)
+    {
+        _260 = fabs(7.0f - s1.b) < 0.0500000007450580596923828125f;
+    }
+    else
+    {
+        _260 = _251;
+    }
+    bool _269;
+    if (_260)
+    {
+        bool _900 = fabs((-5.0f) - s1.c[0].x) < 0.0500000007450580596923828125f;
+        bool _893;
+        if (_900)
+        {
+            _893 = fabs((-4.0f) - s1.c[0].y) < 0.0500000007450580596923828125f;
+        }
+        else
+        {
+            _893 = _900;
+        }
+        _269 = _893;
+    }
+    else
+    {
+        _269 = _260;
+    }
+    bool _278;
+    if (_269)
+    {
+        bool _933 = fabs(3.0f - s1.c[1].x) < 0.0500000007450580596923828125f;
+        bool _926;
+        if (_933)
+        {
+            _926 = fabs((-5.0f) - s1.c[1].y) < 0.0500000007450580596923828125f;
+        }
+        else
+        {
+            _926 = _933;
+        }
+        _278 = _926;
+    }
+    else
+    {
+        _278 = _269;
+    }
+    bool _287;
+    if (_278)
+    {
+        bool _966 = fabs((-3.0f) - s1.c[2].x) < 0.0500000007450580596923828125f;
+        bool _959;
+        if (_966)
+        {
+            _959 = fabs((-1.0f) - s1.c[2].y) < 0.0500000007450580596923828125f;
+        }
+        else
+        {
+            _959 = _966;
+        }
+        _287 = _959;
+    }
+    else
+    {
+        _287 = _278;
+    }
+    bool _296;
+    if (_287)
+    {
+        _296 = all((int4)(1, 0, -3, 1) == s2.a);
+    }
+    else
+    {
+        _296 = _287;
+    }
+    bool _305;
+    if (_296)
+    {
+        _305 = true == s2.b[0][0][0];
+    }
+    else
+    {
+        _305 = _296;
+    }
+    bool _314;
+    if (_305)
+    {
+        _314 = false == s2.b[0][0][1];
+    }
+    else
+    {
+        _314 = _305;
+    }
+    bool _323;
+    if (_314)
+    {
+        _323 = false == s2.b[0][0][2];
+    }
+    else
+    {
+        _323 = _314;
+    }
+    bool _332;
+    if (_323)
+    {
+        _332 = true == s2.b[1][0][0];
+    }
+    else
+    {
+        _332 = _323;
+    }
+    bool _341;
+    if (_332)
+    {
+        _341 = false == s2.b[1][0][1];
+    }
+    else
+    {
+        _341 = _332;
+    }
+    bool _350;
+    if (_341)
+    {
+        _350 = true == s2.b[1][0][2];
+    }
+    else
+    {
+        _350 = _341;
+    }
+    bool _359;
+    if (_350)
+    {
+        _359 = false == s2.b[2][0][0];
+    }
+    else
+    {
+        _359 = _350;
+    }
+    bool _368;
+    if (_359)
+    {
+        _368 = true == s2.b[2][0][1];
+    }
+    else
+    {
+        _368 = _359;
+    }
+    bool _377;
+    if (_368)
+    {
+        _377 = true == s2.b[2][0][2];
+    }
+    else
+    {
+        _377 = _368;
+    }
+    if (_377)
+    {
+        _383[0] += as_uint(1);
+    }
+}
+
diff --git a/reference/opt/shaders-opencl/comp/shared-matrix-cast.comp b/reference/opt/shaders-opencl/comp/shared-matrix-cast.comp
new file mode 100644
index 000000000..6734de200
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/shared-matrix-cast.comp
@@ -0,0 +1,131 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+typedef struct { float2 columns[3]; } spvMat3x2;
+
+struct S1
+{
+    float4 a;
+    spvMat3x2 b;
+    int4 c;
+};
+
+typedef struct S1 S1;
+
+struct block
+{
+    uint passed;
+};
+
+typedef struct block block;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global uint* _212)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    __local S1 s1;
+    s1.a = (float4)(1.0f, -5.0f, -9.0f, -5.0f);
+    s1.b = (spvMat3x2){ { (float2)(1.0f, -7.0f), (float2)(1.0f, 2.0f), (float2)(8.0f, 7.0f) } };
+    s1.c = (int4)(false, true, false, false);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    mem_fence(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
+    bool _264 = fabs(1.0f - s1.a.x) < 0.0500000007450580596923828125f;
+    bool _241;
+    if (_264)
+    {
+        _241 = fabs((-5.0f) - s1.a.y) < 0.0500000007450580596923828125f;
+    }
+    else
+    {
+        _241 = _264;
+    }
+    bool _249;
+    if (_241)
+    {
+        _249 = fabs((-9.0f) - s1.a.z) < 0.0500000007450580596923828125f;
+    }
+    else
+    {
+        _249 = _241;
+    }
+    bool _257;
+    if (_249)
+    {
+        _257 = fabs((-5.0f) - s1.a.w) < 0.0500000007450580596923828125f;
+    }
+    else
+    {
+        _257 = _249;
+    }
+    bool _197;
+    if (_257)
+    {
+        bool _340 = fabs(1.0f - s1.b.columns[0].x) < 0.0500000007450580596923828125f;
+        bool _333;
+        if (_340)
+        {
+            _333 = fabs((-7.0f) - s1.b.columns[0].y) < 0.0500000007450580596923828125f;
+        }
+        else
+        {
+            _333 = _340;
+        }
+        bool _306;
+        if (_333)
+        {
+            bool _373 = fabs(1.0f - s1.b.columns[1].x) < 0.0500000007450580596923828125f;
+            bool _366;
+            if (_373)
+            {
+                _366 = fabs(2.0f - s1.b.columns[1].y) < 0.0500000007450580596923828125f;
+            }
+            else
+            {
+                _366 = _373;
+            }
+            _306 = _366;
+        }
+        else
+        {
+            _306 = _333;
+        }
+        bool _314;
+        if (_306)
+        {
+            bool _406 = fabs(8.0f - s1.b.columns[2].x) < 0.0500000007450580596923828125f;
+            bool _399;
+            if (_406)
+            {
+                _399 = fabs(7.0f - s1.b.columns[2].y) < 0.0500000007450580596923828125f;
+            }
+            else
+            {
+                _399 = _406;
+            }
+            _314 = _399;
+        }
+        else
+        {
+            _314 = _306;
+        }
+        _197 = _314;
+    }
+    else
+    {
+        _197 = _257;
+    }
+    bool _206;
+    if (_197)
+    {
+        _206 = all((int4)(false, true, false, false) == s1.c);
+    }
+    else
+    {
+        _206 = _197;
+    }
+    if (_206)
+    {
+        _212[0] += as_uint(1);
+    }
+}
+
diff --git a/reference/opt/shaders-opencl/comp/shared-matrix-nested-struct-array.comp b/reference/opt/shaders-opencl/comp/shared-matrix-nested-struct-array.comp
new file mode 100644
index 000000000..33748669a
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/shared-matrix-nested-struct-array.comp
@@ -0,0 +1,463 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+typedef struct { float2 columns[2]; } spvMat2;
+typedef struct { float3 columns[2]; } spvMat2x3;
+typedef struct { float2 columns[3]; } spvMat3x2;
+typedef struct { float3 columns[4]; } spvMat4x3;
+
+struct sA
+{
+    spvMat2x3 mA;
+};
+
+typedef struct sA sA;
+
+struct sB
+{
+    spvMat2 mA;
+    spvMat3x2 mB;
+    uint3 mC;
+};
+
+typedef struct sB sB;
+
+struct sC
+{
+    sA mA;
+    sB mB;
+};
+
+typedef struct sC sC;
+
+struct sD
+{
+    sC mA;
+};
+
+typedef struct sD sD;
+
+struct sE
+{
+    spvMat3x2 mA;
+    spvMat4x3 mB;
+};
+
+typedef struct sE sE;
+
+struct sF
+{
+    sE mA;
+};
+
+typedef struct sF sF;
+
+struct sG
+{
+    sF mA;
+};
+
+typedef struct sG sG;
+
+struct sH
+{
+    int3 mA[2];
+};
+
+typedef struct sH sH;
+
+struct S1
+{
+    sD a;
+    sG b;
+    sH c[2];
+};
+
+typedef struct S1 S1;
+
+struct block
+{
+    uint passed;
+};
+
+typedef struct block block;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global uint* _424)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    __local S1 s1;
+    s1.a.mA.mA.mA = (spvMat2x3){ { (float3)(6.0f, 8.0f, 8.0f), (float3)(0.0f, -4.0f, -5.0f) } };
+    s1.a.mA.mB.mA = (spvMat2){ { (float2)(9.0f, -4.0f), (float2)(-6.0f, -1.0f) } };
+    s1.a.mA.mB.mB = (spvMat3x2){ { (float2)(-1.0f, -2.0f), (float2)(1.0f, 6.0f), (float2)(5.0f, 7.0f) } };
+    s1.a.mA.mB.mC = (uint3)(3u, 1u, 5u);
+    s1.b.mA.mA.mA = (spvMat3x2){ { (float2)(8.0f, 3.0f), (float2)(0.0f, 2.0f), (float2)(1.0f, 8.0f) } };
+    s1.b.mA.mA.mB = (spvMat4x3){ { (float3)(0.0f, 9.0f, -1.0f), (float3)(-1.0f, -7.0f, 7.0f), (float3)(-4.0f, -3.0f, 1.0f), (float3)(-4.0f, -9.0f, 1.0f) } };
+    s1.c[0].mA[0] = (int3)(true, false, false);
+    s1.c[0].mA[1] = (int3)(true, false, false);
+    s1.c[1].mA[0] = (int3)(false);
+    s1.c[1].mA[1] = (int3)(false);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    mem_fence(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
+    bool _484 = fabs(6.0f - s1.a.mA.mA.mA.columns[0].x) < 0.0500000007450580596923828125f;
+    bool _469;
+    if (_484)
+    {
+        _469 = fabs(8.0f - s1.a.mA.mA.mA.columns[0].y) < 0.0500000007450580596923828125f;
+    }
+    else
+    {
+        _469 = _484;
+    }
+    bool _477;
+    if (_469)
+    {
+        _477 = fabs(8.0f - s1.a.mA.mA.mA.columns[0].z) < 0.0500000007450580596923828125f;
+    }
+    else
+    {
+        _477 = _469;
+    }
+    bool _448;
+    if (_477)
+    {
+        bool _534 = fabs(-s1.a.mA.mA.mA.columns[1].x) < 0.0500000007450580596923828125f;
+        bool _519;
+        if (_534)
+        {
+            _519 = fabs((-4.0f) - s1.a.mA.mA.mA.columns[1].y) < 0.0500000007450580596923828125f;
+        }
+        else
+        {
+            _519 = _534;
+        }
+        bool _527;
+        if (_519)
+        {
+            _527 = fabs((-5.0f) - s1.a.mA.mA.mA.columns[1].z) < 0.0500000007450580596923828125f;
+        }
+        else
+        {
+            _527 = _519;
+        }
+        _448 = _527;
+    }
+    else
+    {
+        _448 = _477;
+    }
+    bool _346;
+    if (_448)
+    {
+        bool _593 = fabs(9.0f - s1.a.mA.mB.mA.columns[0].x) < 0.0500000007450580596923828125f;
+        bool _586;
+        if (_593)
+        {
+            _586 = fabs((-4.0f) - s1.a.mA.mB.mA.columns[0].y) < 0.0500000007450580596923828125f;
+        }
+        else
+        {
+            _586 = _593;
+        }
+        bool _567;
+        if (_586)
+        {
+            bool _626 = fabs((-6.0f) - s1.a.mA.mB.mA.columns[1].x) < 0.0500000007450580596923828125f;
+            bool _619;
+            if (_626)
+            {
+                _619 = fabs((-1.0f) - s1.a.mA.mB.mA.columns[1].y) < 0.0500000007450580596923828125f;
+            }
+            else
+            {
+                _619 = _626;
+            }
+            _567 = _619;
+        }
+        else
+        {
+            _567 = _586;
+        }
+        _346 = _567;
+    }
+    else
+    {
+        _346 = _448;
+    }
+    bool _355;
+    if (_346)
+    {
+        bool _688 = fabs((-1.0f) - s1.a.mA.mB.mB.columns[0].x) < 0.0500000007450580596923828125f;
+        bool _681;
+        if (_688)
+        {
+            _681 = fabs((-2.0f) - s1.a.mA.mB.mB.columns[0].y) < 0.0500000007450580596923828125f;
+        }
+        else
+        {
+            _681 = _688;
+        }
+        bool _654;
+        if (_681)
+        {
+            bool _721 = fabs(1.0f - s1.a.mA.mB.mB.columns[1].x) < 0.0500000007450580596923828125f;
+            bool _714;
+            if (_721)
+            {
+                _714 = fabs(6.0f - s1.a.mA.mB.mB.columns[1].y) < 0.0500000007450580596923828125f;
+            }
+            else
+            {
+                _714 = _721;
+            }
+            _654 = _714;
+        }
+        else
+        {
+            _654 = _681;
+        }
+        bool _662;
+        if (_654)
+        {
+            bool _754 = fabs(5.0f - s1.a.mA.mB.mB.columns[2].x) < 0.0500000007450580596923828125f;
+            bool _747;
+            if (_754)
+            {
+                _747 = fabs(7.0f - s1.a.mA.mB.mB.columns[2].y) < 0.0500000007450580596923828125f;
+            }
+            else
+            {
+                _747 = _754;
+            }
+            _662 = _747;
+        }
+        else
+        {
+            _662 = _654;
+        }
+        _355 = _662;
+    }
+    else
+    {
+        _355 = _346;
+    }
+    bool _364;
+    if (_355)
+    {
+        _364 = all((uint3)(3u, 1u, 5u) == s1.a.mA.mB.mC);
+    }
+    else
+    {
+        _364 = _355;
+    }
+    bool _373;
+    if (_364)
+    {
+        bool _822 = fabs(8.0f - s1.b.mA.mA.mA.columns[0].x) < 0.0500000007450580596923828125f;
+        bool _815;
+        if (_822)
+        {
+            _815 = fabs(3.0f - s1.b.mA.mA.mA.columns[0].y) < 0.0500000007450580596923828125f;
+        }
+        else
+        {
+            _815 = _822;
+        }
+        bool _788;
+        if (_815)
+        {
+            bool _855 = fabs(-s1.b.mA.mA.mA.columns[1].x) < 0.0500000007450580596923828125f;
+            bool _848;
+            if (_855)
+            {
+                _848 = fabs(2.0f - s1.b.mA.mA.mA.columns[1].y) < 0.0500000007450580596923828125f;
+            }
+            else
+            {
+                _848 = _855;
+            }
+            _788 = _848;
+        }
+        else
+        {
+            _788 = _815;
+        }
+        bool _796;
+        if (_788)
+        {
+            bool _888 = fabs(1.0f - s1.b.mA.mA.mA.columns[2].x) < 0.0500000007450580596923828125f;
+            bool _881;
+            if (_888)
+            {
+                _881 = fabs(8.0f - s1.b.mA.mA.mA.columns[2].y) < 0.0500000007450580596923828125f;
+            }
+            else
+            {
+                _881 = _888;
+            }
+            _796 = _881;
+        }
+        else
+        {
+            _796 = _788;
+        }
+        _373 = _796;
+    }
+    else
+    {
+        _373 = _364;
+    }
+    bool _382;
+    if (_373)
+    {
+        bool _970 = fabs(-s1.b.mA.mA.mB.columns[0].x) < 0.0500000007450580596923828125f;
+        bool _955;
+        if (_970)
+        {
+            _955 = fabs(9.0f - s1.b.mA.mA.mB.columns[0].y) < 0.0500000007450580596923828125f;
+        }
+        else
+        {
+            _955 = _970;
+        }
+        bool _963;
+        if (_955)
+        {
+            _963 = fabs((-1.0f) - s1.b.mA.mA.mB.columns[0].z) < 0.0500000007450580596923828125f;
+        }
+        else
+        {
+            _963 = _955;
+        }
+        bool _918;
+        if (_963)
+        {
+            bool _1020 = fabs((-1.0f) - s1.b.mA.mA.mB.columns[1].x) < 0.0500000007450580596923828125f;
+            bool _1005;
+            if (_1020)
+            {
+                _1005 = fabs((-7.0f) - s1.b.mA.mA.mB.columns[1].y) < 0.0500000007450580596923828125f;
+            }
+            else
+            {
+                _1005 = _1020;
+            }
+            bool _1013;
+            if (_1005)
+            {
+                _1013 = fabs(7.0f - s1.b.mA.mA.mB.columns[1].z) < 0.0500000007450580596923828125f;
+            }
+            else
+            {
+                _1013 = _1005;
+            }
+            _918 = _1013;
+        }
+        else
+        {
+            _918 = _963;
+        }
+        bool _926;
+        if (_918)
+        {
+            bool _1070 = fabs((-4.0f) - s1.b.mA.mA.mB.columns[2].x) < 0.0500000007450580596923828125f;
+            bool _1055;
+            if (_1070)
+            {
+                _1055 = fabs((-3.0f) - s1.b.mA.mA.mB.columns[2].y) < 0.0500000007450580596923828125f;
+            }
+            else
+            {
+                _1055 = _1070;
+            }
+            bool _1063;
+            if (_1055)
+            {
+                _1063 = fabs(1.0f - s1.b.mA.mA.mB.columns[2].z) < 0.0500000007450580596923828125f;
+            }
+            else
+            {
+                _1063 = _1055;
+            }
+            _926 = _1063;
+        }
+        else
+        {
+            _926 = _918;
+        }
+        bool _934;
+        if (_926)
+        {
+            bool _1120 = fabs((-4.0f) - s1.b.mA.mA.mB.columns[3].x) < 0.0500000007450580596923828125f;
+            bool _1105;
+            if (_1120)
+            {
+                _1105 = fabs((-9.0f) - s1.b.mA.mA.mB.columns[3].y) < 0.0500000007450580596923828125f;
+            }
+            else
+            {
+                _1105 = _1120;
+            }
+            bool _1113;
+            if (_1105)
+            {
+                _1113 = fabs(1.0f - s1.b.mA.mA.mB.columns[3].z) < 0.0500000007450580596923828125f;
+            }
+            else
+            {
+                _1113 = _1105;
+            }
+            _934 = _1113;
+        }
+        else
+        {
+            _934 = _926;
+        }
+        _382 = _934;
+    }
+    else
+    {
+        _382 = _373;
+    }
+    bool _391;
+    if (_382)
+    {
+        _391 = all((int3)(true, false, false) == s1.c[0].mA[0]);
+    }
+    else
+    {
+        _391 = _382;
+    }
+    bool _400;
+    if (_391)
+    {
+        _400 = all((int3)(true, false, false) == s1.c[0].mA[1]);
+    }
+    else
+    {
+        _400 = _391;
+    }
+    bool _409;
+    if (_400)
+    {
+        _409 = all((int3)(false) == s1.c[1].mA[0]);
+    }
+    else
+    {
+        _409 = _400;
+    }
+    bool _418;
+    if (_409)
+    {
+        _418 = all((int3)(false) == s1.c[1].mA[1]);
+    }
+    else
+    {
+        _418 = _409;
+    }
+    if (_418)
+    {
+        _424[0] += as_uint(1);
+    }
+}
+
diff --git a/reference/opt/shaders-opencl/comp/shared-matrix-nested-struct.comp b/reference/opt/shaders-opencl/comp/shared-matrix-nested-struct.comp
new file mode 100644
index 000000000..5440da2f1
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/shared-matrix-nested-struct.comp
@@ -0,0 +1,583 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+typedef struct { float2 columns[2]; } spvMat2;
+typedef struct { float2 columns[3]; } spvMat3x2;
+typedef struct { float4 columns[4]; } spvMat4;
+
+struct S1
+{
+    uint a;
+    float4 b;
+};
+
+typedef struct S1 S1;
+
+struct sA
+{
+    spvMat4 mA;
+    int3 mB;
+    int4 mC;
+};
+
+typedef struct sA sA;
+
+struct sB
+{
+    int2 mA;
+};
+
+typedef struct sB sB;
+
+struct sC
+{
+    float mA;
+    uint4 mB;
+    float mC;
+};
+
+typedef struct sC sC;
+
+struct sD
+{
+    sA mA;
+    sB mB;
+    sC mC;
+};
+
+typedef struct sD sD;
+
+struct sE
+{
+    sD mA;
+};
+
+typedef struct sE sE;
+
+struct sF
+{
+    uint3 mA;
+    bool mB;
+};
+
+typedef struct sF sF;
+
+struct sG
+{
+    sF mA;
+    spvMat3x2 mB;
+};
+
+typedef struct sG sG;
+
+struct sH
+{
+    sG mA;
+    float2 mB;
+};
+
+typedef struct sH sH;
+
+struct sI
+{
+    spvMat2 mA;
+    int3 mB;
+    int4 mC;
+};
+
+typedef struct sI sI;
+
+struct sJ
+{
+    sI mA;
+    int3 mB;
+};
+
+typedef struct sJ sJ;
+
+struct sK
+{
+    int2 mA;
+    sJ mB;
+    int2 mC;
+};
+
+typedef struct sK sK;
+
+struct S2
+{
+    sE a;
+    int3 b;
+    sH c;
+    sK d;
+};
+
+typedef struct S2 S2;
+
+struct block
+{
+    uint passed;
+};
+
+typedef struct block block;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global uint* _612)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    __local S1 s1;
+    __local S2 s2;
+    s1.a = 0u;
+    s1.b = (float4)(8.0f, 8.0f, 0.0f, -4.0f);
+    s2.a.mA.mA.mA = (spvMat4){ { (float4)(-5.0f, 9.0f, -4.0f, -6.0f), (float4)(-1.0f, -1.0f, -2.0f, 1.0f), (float4)(6.0f, 5.0f, 7.0f, -2.0f), (float4)(-4.0f, -9.0f, 8.0f, 3.0f) } };
+    s2.a.mA.mA.mB = (int3)(true, false, false);
+    s2.a.mA.mA.mC = (int4)(true, true, true, false);
+    s2.a.mA.mB.mA = (int2)(true);
+    s2.a.mA.mC.mA = 7.0f;
+    s2.a.mA.mC.mB = (uint4)(8u, 6u, 2u, 0u);
+    s2.a.mA.mC.mC = -9.0f;
+    s2.b = (int3)(1, -4, 0);
+    s2.c.mA.mA.mA = (uint3)(4u, 9u, 1u);
+    s2.c.mA.mA.mB = false;
+    s2.c.mA.mB = (spvMat3x2){ { (float2)(3.0f, -5.0f), (float2)(-1.0f, -5.0f), (float2)(-1.0f, -9.0f) } };
+    s2.c.mB = (float2)(-6.0f, -9.0f);
+    s2.d.mA = (int2)(true, false);
+    s2.d.mB.mA.mA = (spvMat2){ { (float2)(-2.0f, 3.0f), (float2)(7.0f, 2.0f) } };
+    s2.d.mB.mA.mB = (int3)(false);
+    s2.d.mB.mA.mC = (int4)(false, false, false, true);
+    s2.d.mB.mB = (int3)(true, false, false);
+    s2.d.mC = (int2)(-9, 0);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    mem_fence(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
+    bool _622 = 0u == s1.a;
+    bool _444;
+    if (_622)
+    {
+        bool _668 = fabs(8.0f - s1.b.x) < 0.0500000007450580596923828125f;
+        bool _645;
+        if (_668)
+        {
+            _645 = fabs(8.0f - s1.b.y) < 0.0500000007450580596923828125f;
+        }
+        else
+        {
+            _645 = _668;
+        }
+        bool _653;
+        if (_645)
+        {
+            _653 = fabs(-s1.b.z) < 0.0500000007450580596923828125f;
+        }
+        else
+        {
+            _653 = _645;
+        }
+        bool _661;
+        if (_653)
+        {
+            _661 = fabs((-4.0f) - s1.b.w) < 0.0500000007450580596923828125f;
+        }
+        else
+        {
+            _661 = _653;
+        }
+        _444 = _661;
+    }
+    else
+    {
+        _444 = _622;
+    }
+    bool _453;
+    if (_444)
+    {
+        bool _774 = fabs((-5.0f) - s2.a.mA.mA.mA.columns[0].x) < 0.0500000007450580596923828125f;
+        bool _751;
+        if (_774)
+        {
+            _751 = fabs(9.0f - s2.a.mA.mA.mA.columns[0].y) < 0.0500000007450580596923828125f;
+        }
+        else
+        {
+            _751 = _774;
+        }
+        bool _759;
+        if (_751)
+        {
+            _759 = fabs((-4.0f) - s2.a.mA.mA.mA.columns[0].z) < 0.0500000007450580596923828125f;
+        }
+        else
+        {
+            _759 = _751;
+        }
+        bool _767;
+        if (_759)
+        {
+            _767 = fabs((-6.0f) - s2.a.mA.mA.mA.columns[0].w) < 0.0500000007450580596923828125f;
+        }
+        else
+        {
+            _767 = _759;
+        }
+        bool _712;
+        if (_767)
+        {
+            bool _841 = fabs((-1.0f) - s2.a.mA.mA.mA.columns[1].x) < 0.0500000007450580596923828125f;
+            bool _818;
+            if (_841)
+            {
+                _818 = fabs((-1.0f) - s2.a.mA.mA.mA.columns[1].y) < 0.0500000007450580596923828125f;
+            }
+            else
+            {
+                _818 = _841;
+            }
+            bool _826;
+            if (_818)
+            {
+                _826 = fabs((-2.0f) - s2.a.mA.mA.mA.columns[1].z) < 0.0500000007450580596923828125f;
+            }
+            else
+            {
+                _826 = _818;
+            }
+            bool _834;
+            if (_826)
+            {
+                _834 = fabs(1.0f - s2.a.mA.mA.mA.columns[1].w) < 0.0500000007450580596923828125f;
+            }
+            else
+            {
+                _834 = _826;
+            }
+            _712 = _834;
+        }
+        else
+        {
+            _712 = _767;
+        }
+        bool _720;
+        if (_712)
+        {
+            bool _908 = fabs(6.0f - s2.a.mA.mA.mA.columns[2].x) < 0.0500000007450580596923828125f;
+            bool _885;
+            if (_908)
+            {
+                _885 = fabs(5.0f - s2.a.mA.mA.mA.columns[2].y) < 0.0500000007450580596923828125f;
+            }
+            else
+            {
+                _885 = _908;
+            }
+            bool _893;
+            if (_885)
+            {
+                _893 = fabs(7.0f - s2.a.mA.mA.mA.columns[2].z) < 0.0500000007450580596923828125f;
+            }
+            else
+            {
+                _893 = _885;
+            }
+            bool _901;
+            if (_893)
+            {
+                _901 = fabs((-2.0f) - s2.a.mA.mA.mA.columns[2].w) < 0.0500000007450580596923828125f;
+            }
+            else
+            {
+                _901 = _893;
+            }
+            _720 = _901;
+        }
+        else
+        {
+            _720 = _712;
+        }
+        bool _728;
+        if (_720)
+        {
+            bool _975 = fabs((-4.0f) - s2.a.mA.mA.mA.columns[3].x) < 0.0500000007450580596923828125f;
+            bool _952;
+            if (_975)
+            {
+                _952 = fabs((-9.0f) - s2.a.mA.mA.mA.columns[3].y) < 0.0500000007450580596923828125f;
+            }
+            else
+            {
+                _952 = _975;
+            }
+            bool _960;
+            if (_952)
+            {
+                _960 = fabs(8.0f - s2.a.mA.mA.mA.columns[3].z) < 0.0500000007450580596923828125f;
+            }
+            else
+            {
+                _960 = _952;
+            }
+            bool _968;
+            if (_960)
+            {
+                _968 = fabs(3.0f - s2.a.mA.mA.mA.columns[3].w) < 0.0500000007450580596923828125f;
+            }
+            else
+            {
+                _968 = _960;
+            }
+            _728 = _968;
+        }
+        else
+        {
+            _728 = _720;
+        }
+        _453 = _728;
+    }
+    else
+    {
+        _453 = _444;
+    }
+    bool _462;
+    if (_453)
+    {
+        _462 = all((int3)(true, false, false) == s2.a.mA.mA.mB);
+    }
+    else
+    {
+        _462 = _453;
+    }
+    bool _471;
+    if (_462)
+    {
+        _471 = all((int4)(true, true, true, false) == s2.a.mA.mA.mC);
+    }
+    else
+    {
+        _471 = _462;
+    }
+    bool _480;
+    if (_471)
+    {
+        _480 = all((int2)(true) == s2.a.mA.mB.mA);
+    }
+    else
+    {
+        _480 = _471;
+    }
+    bool _489;
+    if (_480)
+    {
+        _489 = fabs(7.0f - s2.a.mA.mC.mA) < 0.0500000007450580596923828125f;
+    }
+    else
+    {
+        _489 = _480;
+    }
+    bool _498;
+    if (_489)
+    {
+        _498 = all((uint4)(8u, 6u, 2u, 0u) == s2.a.mA.mC.mB);
+    }
+    else
+    {
+        _498 = _489;
+    }
+    bool _507;
+    if (_498)
+    {
+        _507 = fabs((-9.0f) - s2.a.mA.mC.mC) < 0.0500000007450580596923828125f;
+    }
+    else
+    {
+        _507 = _498;
+    }
+    bool _516;
+    if (_507)
+    {
+        _516 = all((int3)(1, -4, 0) == s2.b);
+    }
+    else
+    {
+        _516 = _507;
+    }
+    bool _525;
+    if (_516)
+    {
+        _525 = all((uint3)(4u, 9u, 1u) == s2.c.mA.mA.mA);
+    }
+    else
+    {
+        _525 = _516;
+    }
+    bool _534;
+    if (_525)
+    {
+        _534 = false == s2.c.mA.mA.mB;
+    }
+    else
+    {
+        _534 = _525;
+    }
+    bool _543;
+    if (_534)
+    {
+        bool _1106 = fabs(3.0f - s2.c.mA.mB.columns[0].x) < 0.0500000007450580596923828125f;
+        bool _1099;
+        if (_1106)
+        {
+            _1099 = fabs((-5.0f) - s2.c.mA.mB.columns[0].y) < 0.0500000007450580596923828125f;
+        }
+        else
+        {
+            _1099 = _1106;
+        }
+        bool _1072;
+        if (_1099)
+        {
+            bool _1139 = fabs((-1.0f) - s2.c.mA.mB.columns[1].x) < 0.0500000007450580596923828125f;
+            bool _1132;
+            if (_1139)
+            {
+                _1132 = fabs((-5.0f) - s2.c.mA.mB.columns[1].y) < 0.0500000007450580596923828125f;
+            }
+            else
+            {
+                _1132 = _1139;
+            }
+            _1072 = _1132;
+        }
+        else
+        {
+            _1072 = _1099;
+        }
+        bool _1080;
+        if (_1072)
+        {
+            bool _1172 = fabs((-1.0f) - s2.c.mA.mB.columns[2].x) < 0.0500000007450580596923828125f;
+            bool _1165;
+            if (_1172)
+            {
+                _1165 = fabs((-9.0f) - s2.c.mA.mB.columns[2].y) < 0.0500000007450580596923828125f;
+            }
+            else
+            {
+                _1165 = _1172;
+            }
+            _1080 = _1165;
+        }
+        else
+        {
+            _1080 = _1072;
+        }
+        _543 = _1080;
+    }
+    else
+    {
+        _543 = _534;
+    }
+    bool _552;
+    if (_543)
+    {
+        bool _1205 = fabs((-6.0f) - s2.c.mB.x) < 0.0500000007450580596923828125f;
+        bool _1198;
+        if (_1205)
+        {
+            _1198 = fabs((-9.0f) - s2.c.mB.y) < 0.0500000007450580596923828125f;
+        }
+        else
+        {
+            _1198 = _1205;
+        }
+        _552 = _1198;
+    }
+    else
+    {
+        _552 = _543;
+    }
+    bool _561;
+    if (_552)
+    {
+        _561 = all((int2)(true, false) == s2.d.mA);
+    }
+    else
+    {
+        _561 = _552;
+    }
+    bool _570;
+    if (_561)
+    {
+        bool _1263 = fabs((-2.0f) - s2.d.mB.mA.mA.columns[0].x) < 0.0500000007450580596923828125f;
+        bool _1256;
+        if (_1263)
+        {
+            _1256 = fabs(3.0f - s2.d.mB.mA.mA.columns[0].y) < 0.0500000007450580596923828125f;
+        }
+        else
+        {
+            _1256 = _1263;
+        }
+        bool _1237;
+        if (_1256)
+        {
+            bool _1296 = fabs(7.0f - s2.d.mB.mA.mA.columns[1].x) < 0.0500000007450580596923828125f;
+            bool _1289;
+            if (_1296)
+            {
+                _1289 = fabs(2.0f - s2.d.mB.mA.mA.columns[1].y) < 0.0500000007450580596923828125f;
+            }
+            else
+            {
+                _1289 = _1296;
+            }
+            _1237 = _1289;
+        }
+        else
+        {
+            _1237 = _1256;
+        }
+        _570 = _1237;
+    }
+    else
+    {
+        _570 = _561;
+    }
+    bool _579;
+    if (_570)
+    {
+        _579 = all((int3)(false) == s2.d.mB.mA.mB);
+    }
+    else
+    {
+        _579 = _570;
+    }
+    bool _588;
+    if (_579)
+    {
+        _588 = all((int4)(false, false, false, true) == s2.d.mB.mA.mC);
+    }
+    else
+    {
+        _588 = _579;
+    }
+    bool _597;
+    if (_588)
+    {
+        _597 = all((int3)(true, false, false) == s2.d.mB.mB);
+    }
+    else
+    {
+        _597 = _588;
+    }
+    bool _606;
+    if (_597)
+    {
+        _606 = all((int2)(-9, 0) == s2.d.mC);
+    }
+    else
+    {
+        _606 = _597;
+    }
+    if (_606)
+    {
+        _612[0] += as_uint(1);
+    }
+}
+
diff --git a/reference/opt/shaders-opencl/comp/shared-std450.double.comp b/reference/opt/shaders-opencl/comp/shared-std450.fp64.comp
similarity index 100%
rename from reference/opt/shaders-opencl/comp/shared-std450.double.comp
rename to reference/opt/shaders-opencl/comp/shared-std450.fp64.comp
diff --git a/reference/opt/shaders-opencl/comp/struct-layout.comp b/reference/opt/shaders-opencl/comp/struct-layout.comp
index 39cabe2a8..b2df43cd3 100644
--- a/reference/opt/shaders-opencl/comp/struct-layout.comp
+++ b/reference/opt/shaders-opencl/comp/struct-layout.comp
@@ -1,9 +1,11 @@
 // Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
 
 
+typedef struct { float4 columns[4]; } spvMat4;
+
 struct Foo
 {
-    float4 m;
+    spvMat4 m;
 };
 
 typedef struct Foo Foo;
@@ -22,10 +24,25 @@ struct SSBO
 
 typedef struct SSBO SSBO;
 
+static float4 spvMulMat4Vec4(spvMat4 m, float4 v)
+{
+    return m.columns[0] * v.x + m.columns[1] * v.y + m.columns[2] * v.z + m.columns[3] * v.w;
+}
+
+static spvMat4 spvMulMat4Mat4(spvMat4 a, spvMat4 b)
+{
+    spvMat4 r;
+    r.columns[0] = spvMulMat4Vec4(a, b.columns[0]);
+    r.columns[1] = spvMulMat4Vec4(a, b.columns[1]);
+    r.columns[2] = spvMulMat4Vec4(a, b.columns[2]);
+    r.columns[3] = spvMulMat4Vec4(a, b.columns[3]);
+    return r;
+}
+
 __attribute__((reqd_work_group_size(1, 1, 1)))
 __kernel void comp_main(__global Foo* _23, __global const Foo* _30)
 {
     uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
-    _23[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].m = _30[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].m * _30[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].m;
+    _23[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].m = spvMulMat4Mat4(_30[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].m, _30[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].m);
 }
 
diff --git a/reference/opt/shaders-opencl/comp/struct-packing.comp b/reference/opt/shaders-opencl/comp/struct-packing.comp
new file mode 100644
index 000000000..3f0a147f0
--- /dev/null
+++ b/reference/opt/shaders-opencl/comp/struct-packing.comp
@@ -0,0 +1,125 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+typedef struct { float2 columns[2]; } spvMat2;
+typedef struct { float3 columns[2]; } spvMat2x3;
+typedef struct { float2 columns[3]; } spvMat3x2;
+
+struct S0
+{
+    float2 a[1];
+    float b;
+};
+
+typedef struct S0 S0;
+
+struct S1
+{
+    float3 a;
+    float b;
+};
+
+typedef struct S1 S1;
+
+struct S2
+{
+    float3 a[1];
+    float b;
+};
+
+typedef struct S2 S2;
+
+struct S3
+{
+    float2 a;
+    float b;
+};
+
+typedef struct S3 S3;
+
+struct S4
+{
+    float2 c;
+};
+
+typedef struct S4 S4;
+
+struct Content
+{
+    S0 m0s[1];
+    S1 m1s[1];
+    S2 m2s[1];
+    S0 m0;
+    S1 m1;
+    S2 m2;
+    S3 m3;
+    float m4;
+    S4 m3s[8];
+};
+
+typedef struct Content Content;
+
+struct SSBO1
+{
+    Content content;
+    Content content1[2];
+    Content content2;
+    spvMat2 m0;
+    spvMat2 m1;
+    spvMat2x3 m2[4];
+    spvMat3x2 m3;
+    spvMat2 m4;
+    spvMat2 m5[9];
+    spvMat3x2 m6[4][2];
+    spvMat2x3 m7;
+    float array[1];
+};
+
+typedef struct SSBO1 SSBO1;
+
+struct SSBO0
+{
+    Content content;
+    Content content1[2];
+    Content content2;
+    float array[1];
+};
+
+typedef struct SSBO0 SSBO0;
+
+static float3 spvMulVec2Mat3x2(float2 v, spvMat3x2 m)
+{
+    return (float3)(dot(v, m.columns[0]), dot(v, m.columns[1]), dot(v, m.columns[2]));
+}
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global SSBO1* ssbo_430, __global SSBO0* ssbo_140)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    Content _60 = ssbo_140->content;
+    ssbo_430->content.m0s[0].a[0] = _60.m0s[0].a[0];
+    ssbo_430->content.m0s[0].b = _60.m0s[0].b;
+    ssbo_430->content.m1s[0].a = _60.m1s[0].a;
+    ssbo_430->content.m1s[0].b = _60.m1s[0].b;
+    ssbo_430->content.m2s[0].a[0] = _60.m2s[0].a[0];
+    ssbo_430->content.m2s[0].b = _60.m2s[0].b;
+    ssbo_430->content.m0.a[0] = _60.m0.a[0];
+    ssbo_430->content.m0.b = _60.m0.b;
+    ssbo_430->content.m1.a = _60.m1.a;
+    ssbo_430->content.m1.b = _60.m1.b;
+    ssbo_430->content.m2.a[0] = _60.m2.a[0];
+    ssbo_430->content.m2.b = _60.m2.b;
+    ssbo_430->content.m3.a = _60.m3.a;
+    ssbo_430->content.m3.b = _60.m3.b;
+    ssbo_430->content[7] = _60.m4;
+    ssbo_430->content.m3s[0].c = _60.m3s[0].c;
+    ssbo_430->content.m3s[1].c = _60.m3s[1].c;
+    ssbo_430->content.m3s[2].c = _60.m3s[2].c;
+    ssbo_430->content.m3s[3].c = _60.m3s[3].c;
+    ssbo_430->content.m3s[4].c = _60.m3s[4].c;
+    ssbo_430->content.m3s[5].c = _60.m3s[5].c;
+    ssbo_430->content.m3s[6].c = _60.m3s[6].c;
+    ssbo_430->content.m3s[7].c = _60.m3s[7].c;
+    ssbo_430->content.m1.a = spvMulVec2Mat3x2(ssbo_430->content.m3.a, ssbo_430->m6[1][1]);
+}
+
diff --git a/reference/shaders-opencl/asm/comp/relaxed-block-layout.fp16.asm.comp b/reference/shaders-opencl/asm/comp/relaxed-block-layout.fp16.asm.comp
new file mode 100644
index 000000000..a8926a145
--- /dev/null
+++ b/reference/shaders-opencl/asm/comp/relaxed-block-layout.fp16.asm.comp
@@ -0,0 +1,24 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+struct foo
+{
+    uint bar;
+    float3 baz;
+    uchar quux;
+    uchar4 blah;
+    half2 wibble;
+};
+
+typedef struct foo foo;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global foo* _10)
+{
+    _10->bar = ((uint3)(get_local_id(0), get_local_id(1), get_local_id(2))).x;
+    _10->baz = convert_float3(((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))));
+    _10->blah = convert_uchar4((uint4)(convert_uint4(_10->blah).xyz + ((uint3)(get_group_id(0), get_group_id(1), get_group_id(2))), 0u));
+    _10->wibble = convert_half2(convert_float2(_10->wibble) * convert_float2(((uint3)(get_num_groups(0), get_num_groups(1), get_num_groups(2))).xy));
+}
+
diff --git a/reference/shaders-opencl/asm/comp/replicated-composites.spv16.asm.comp b/reference/shaders-opencl/asm/comp/replicated-composites.spv16.asm.comp
index 5bcad0013..6113e59a9 100644
--- a/reference/shaders-opencl/asm/comp/replicated-composites.spv16.asm.comp
+++ b/reference/shaders-opencl/asm/comp/replicated-composites.spv16.asm.comp
@@ -1,6 +1,8 @@
 // Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
 
 
+typedef struct { float4 columns[4]; } spvMat4;
+
 #ifndef SPIRV_CROSS_CONSTANT_ID_0
 #define SPIRV_CROSS_CONSTANT_ID_0 0.0f
 #endif
@@ -19,10 +21,10 @@ __attribute__((reqd_work_group_size(1, 1, 1)))
 __kernel void comp_main(UBO ubo)
 {
     float4 a = (float4)(0.0f);
-    float4 b = (float4)(1.0f);
+    spvMat4 b = (spvMat4){ { (float4)(1.0f), (float4)(1.0f), (float4)(1.0f), (float4)(1.0f) } };
     float4 c = _20;
     float4 d = (float4)(ubo.uniform_float);
-    float4 e = d;
+    spvMat4 e = (spvMat4){ { d, d, d, d } };
     float f[8] = {ubo.uniform_float, ubo.uniform_float, ubo.uniform_float, ubo.uniform_float, ubo.uniform_float, ubo.uniform_float, ubo.uniform_float, ubo.uniform_float};
 }
 
diff --git a/reference/shaders-opencl/comp/dowhile.comp b/reference/shaders-opencl/comp/dowhile.comp
index 2dca8bcda..1e518a628 100644
--- a/reference/shaders-opencl/comp/dowhile.comp
+++ b/reference/shaders-opencl/comp/dowhile.comp
@@ -1,9 +1,11 @@
 // Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
 
 
+typedef struct { float4 columns[4]; } spvMat4;
+
 struct SSBO
 {
-    float4 mvp;
+    spvMat4 mvp;
     float4 in_data[1];
 };
 
@@ -16,19 +18,24 @@ struct SSBO2
 
 typedef struct SSBO2 SSBO2;
 
+static float4 spvMulMat4Vec4(spvMat4 m, float4 v)
+{
+    return m.columns[0] * v.x + m.columns[1] * v.y + m.columns[2] * v.z + m.columns[3] * v.w;
+}
+
 __attribute__((reqd_work_group_size(1, 1, 1)))
 __kernel void comp_main(__global const SSBO* _28, __global float4* _52)
 {
     uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
-    int i;
-    uint ident = ((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x;
-    i = 0;
-    float4 idat = _28->in_data[ident];
+    int i_1;
+    uint ident_1 = ((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x;
+    i_1 = 0;
+    float4 idat_1 = _28->in_data[ident_1];
     do
     {
-        idat = _28->mvp * idat;
-        i++;
-    } while (i < 16);
-    _52[ident] = idat;
+        idat_1 = spvMulMat4Vec4(_28->mvp, idat_1);
+        i_1++;
+    } while (i_1 < 16);
+    _52[ident_1] = idat_1;
 }
 
diff --git a/reference/shaders-opencl/comp/inverse.comp b/reference/shaders-opencl/comp/inverse.comp
new file mode 100644
index 000000000..3db4ff542
--- /dev/null
+++ b/reference/shaders-opencl/comp/inverse.comp
@@ -0,0 +1,53 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+typedef struct { float2 columns[2]; } spvMat2;
+typedef struct { float3 columns[3]; } spvMat3;
+typedef struct { float4 columns[4]; } spvMat4;
+
+struct MatrixOut
+{
+    spvMat2 m2out;
+    spvMat3 m3out;
+    spvMat4 m4out;
+};
+
+typedef struct MatrixOut MatrixOut;
+
+struct MatrixIn
+{
+    spvMat2 m2in;
+    spvMat3 m3in;
+    spvMat4 m4in;
+};
+
+typedef struct MatrixIn MatrixIn;
+
+static spvMat2 spvInverse2(spvMat2 m) {
+    float d = 1.0f / (m.columns[0].x * m.columns[1].y - m.columns[1].x * m.columns[0].y);
+    return (spvMat2){ { (float2)(m.columns[1].y * d, -m.columns[0].y * d), (float2)(-m.columns[1].x * d, m.columns[0].x * d) } };
+}
+
+static spvMat3 spvInverse3(spvMat3 m) {
+    float3 t = (float3)(m.columns[1].y * m.columns[2].z - m.columns[1].z * m.columns[2].y, m.columns[1].z * m.columns[2].x - m.columns[1].x * m.columns[2].z, m.columns[1].x * m.columns[2].y - m.columns[1].y * m.columns[2].x);
+    float d = 1.0f / dot(m.columns[0], t);
+    return (spvMat3){ { t * d, (float3)(m.columns[0].z * m.columns[2].y - m.columns[0].y * m.columns[2].z, m.columns[0].x * m.columns[2].z - m.columns[0].z * m.columns[2].x, m.columns[0].y * m.columns[2].x - m.columns[0].x * m.columns[2].y) * d, (float3)(m.columns[0].y * m.columns[1].z - m.columns[0].z * m.columns[1].y, m.columns[0].z * m.columns[1].x - m.columns[0].x * m.columns[1].z, m.columns[0].x * m.columns[1].y - m.columns[0].y * m.columns[1].x) * d } };
+}
+
+static spvMat4 spvInverse4(spvMat4 m) {
+    float4 t = (float4)(m.columns[2].y * m.columns[3].z * m.columns[1].w - m.columns[3].y * m.columns[2].z * m.columns[1].w + m.columns[3].y * m.columns[1].z * m.columns[2].w - m.columns[1].y * m.columns[3].z * m.columns[2].w - m.columns[2].y * m.columns[1].z * m.columns[3].w + m.columns[1].y * m.columns[2].z * m.columns[3].w, m.columns[3].x * m.columns[2].z * m.columns[1].w - m.columns[2].x * m.columns[3].z * m.columns[1].w - m.columns[3].x * m.columns[1].z * m.columns[2].w + m.columns[1].x * m.columns[3].z * m.columns[2].w + m.columns[2].x * m.columns[1].z * m.columns[3].w - m.columns[1].x * m.columns[2].z * m.columns[3].w, m.columns[2].x * m.columns[3].y * m.columns[1].w - m.columns[3].x * m.columns[2].y * m.columns[1].w + m.columns[3].x * m.columns[1].y * m.columns[2].w - m.columns[1].x * m.columns[3].y * m.columns[2].w - m.columns[2].x * m.columns[1].y * m.columns[3].w + m.columns[1].x * m.columns[2].y * m.columns[3].w, m.columns[3].x * m.columns[2].y * m.columns[1].z - m.columns[2].x * m.columns[3].y * m.columns[1].z - m.columns[3].x * m.columns[1].y * m.columns[2].z + m.columns[1].x * m.columns[3].y * m.columns[2].z + m.columns[2].x * m.columns[1].y * m.columns[3].z - m.columns[1].x * m.columns[2].y * m.columns[3].z);
+    spvMat4 r = (spvMat4){ { (float4)(t.x, m.columns[3].y * m.columns[2].z * m.columns[0].w - m.columns[2].y * m.columns[3].z * m.columns[0].w - m.columns[3].y * m.columns[0].z * m.columns[2].w + m.columns[0].y * m.columns[3].z * m.columns[2].w + m.columns[2].y * m.columns[0].z * m.columns[3].w - m.columns[0].y * m.columns[2].z * m.columns[3].w, m.columns[1].y * m.columns[3].z * m.columns[0].w - m.columns[3].y * m.columns[1].z * m.columns[0].w + m.columns[3].y * m.columns[0].z * m.columns[1].w - m.columns[0].y * m.columns[3].z * m.columns[1].w - m.columns[1].y * m.columns[0].z * m.columns[3].w + m.columns[0].y * m.columns[1].z * m.columns[3].w, m.columns[2].y * m.columns[1].z * m.columns[0].w - m.columns[1].y * m.columns[2].z * m.columns[0].w - m.columns[2].y * m.columns[0].z * m.columns[1].w + m.columns[0].y * m.columns[2].z * m.columns[1].w + m.columns[1].y * m.columns[0].z * m.columns[2].w - m.columns[0].y * m.columns[1].z * m.columns[2].w), (float4)(t.y, m.columns[2].x * m.columns[3].z * m.columns[0].w - m.columns[3].x * m.columns[2].z * m.columns[0].w + m.columns[3].x * m.columns[0].z * m.columns[2].w - m.columns[0].x * m.columns[3].z * m.columns[2].w - m.columns[2].x * m.columns[0].z * m.columns[3].w + m.columns[0].x * m.columns[2].z * m.columns[3].w, m.columns[3].x * m.columns[1].z * m.columns[0].w - m.columns[1].x * m.columns[3].z * m.columns[0].w - m.columns[3].x * m.columns[0].z * m.columns[1].w + m.columns[0].x * m.columns[3].z * m.columns[1].w + m.columns[1].x * m.columns[0].z * m.columns[3].w - m.columns[0].x * m.columns[1].z * m.columns[3].w, m.columns[1].x * m.columns[2].z * m.columns[0].w - m.columns[2].x * m.columns[1].z * m.columns[0].w + m.columns[2].x * m.columns[0].z * m.columns[1].w - m.columns[0].x * m.columns[2].z * m.columns[1].w - m.columns[1].x * m.columns[0].z * m.columns[2].w + m.columns[0].x * m.columns[1].z * m.columns[2].w), (float4)(t.z, m.columns[3].x * m.columns[2].y * m.columns[0].w - m.columns[2].x * m.columns[3].y * 
m.columns[0].w - m.columns[3].x * m.columns[0].y * m.columns[2].w + m.columns[0].x * m.columns[3].y * m.columns[2].w + m.columns[2].x * m.columns[0].y * m.columns[3].w - m.columns[0].x * m.columns[2].y * m.columns[3].w, m.columns[1].x * m.columns[3].y * m.columns[0].w - m.columns[3].x * m.columns[1].y * m.columns[0].w + m.columns[3].x * m.columns[0].y * m.columns[1].w - m.columns[0].x * m.columns[3].y * m.columns[1].w - m.columns[1].x * m.columns[0].y * m.columns[3].w + m.columns[0].x * m.columns[1].y * m.columns[3].w, m.columns[2].x * m.columns[1].y * m.columns[0].w - m.columns[1].x * m.columns[2].y * m.columns[0].w - m.columns[2].x * m.columns[0].y * m.columns[1].w + m.columns[0].x * m.columns[2].y * m.columns[1].w + m.columns[1].x * m.columns[0].y * m.columns[2].w - m.columns[0].x * m.columns[1].y * m.columns[2].w), (float4)(t.w, m.columns[2].x * m.columns[3].y * m.columns[0].z - m.columns[3].x * m.columns[2].y * m.columns[0].z + m.columns[3].x * m.columns[0].y * m.columns[2].z - m.columns[0].x * m.columns[3].y * m.columns[2].z - m.columns[2].x * m.columns[0].y * m.columns[3].z + m.columns[0].x * m.columns[2].y * m.columns[3].z, m.columns[3].x * m.columns[1].y * m.columns[0].z - m.columns[1].x * m.columns[3].y * m.columns[0].z - m.columns[3].x * m.columns[0].y * m.columns[1].z + m.columns[0].x * m.columns[3].y * m.columns[1].z + m.columns[1].x * m.columns[0].y * m.columns[3].z - m.columns[0].x * m.columns[1].y * m.columns[3].z, m.columns[1].x * m.columns[2].y * m.columns[0].z - m.columns[2].x * m.columns[1].y * m.columns[0].z + m.columns[2].x * m.columns[0].y * m.columns[1].z - m.columns[0].x * m.columns[2].y * m.columns[1].z - m.columns[1].x * m.columns[0].y * m.columns[2].z + m.columns[0].x * m.columns[1].y * m.columns[2].z) } };
+    float d = 1.0f / dot(m.columns[0], t);
+    r.columns[0] *= d; r.columns[1] *= d; r.columns[2] *= d; r.columns[3] *= d;
+    return r;
+}
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global MatrixOut* _15, __global const MatrixIn* _20)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    _15->m2out = spvInverse2(_20->m2in);
+    _15->m3out = spvInverse3(_20->m3in);
+    _15->m4out = spvInverse4(_20->m4in);
+}
+
diff --git a/reference/shaders-opencl/comp/mat3-row-maj-read-write-const.comp b/reference/shaders-opencl/comp/mat3-row-maj-read-write-const.comp
new file mode 100644
index 000000000..eb4beccdd
--- /dev/null
+++ b/reference/shaders-opencl/comp/mat3-row-maj-read-write-const.comp
@@ -0,0 +1,47 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+typedef struct { float3 columns[3]; } spvMat3;
+
+struct model_t
+{
+    spvMat3 mtx_rm;
+};
+
+typedef struct model_t model_t;
+
+static float3 spvMulMat3Vec3(spvMat3 m, float3 v)
+{
+    return m.columns[0] * v.x + m.columns[1] * v.y + m.columns[2] * v.z;
+}
+
+static spvMat3 spvMulMat3Mat3(spvMat3 a, spvMat3 b)
+{
+    spvMat3 r;
+    r.columns[0] = spvMulMat3Vec3(a, b.columns[0]);
+    r.columns[1] = spvMulMat3Vec3(a, b.columns[1]);
+    r.columns[2] = spvMulMat3Vec3(a, b.columns[2]);
+    return r;
+}
+
+static spvMat3 spvTransposeMat3(spvMat3 m)
+{
+    spvMat3 r;
+    r.columns[0] = (float3)(m.columns[0].x, m.columns[1].x, m.columns[2].x);
+    r.columns[1] = (float3)(m.columns[0].y, m.columns[1].y, m.columns[2].y);
+    r.columns[2] = (float3)(m.columns[0].z, m.columns[1].z, m.columns[2].z);
+    return r;
+}
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global spvMat3* model)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    spvMat3 mtx_cm_1 = spvTransposeMat3(model[0]);
+    spvMat3 mtx1_1 = spvMulMat3Mat3(mtx_cm_1, (spvMat3){ { (float3)(4.0f, -3.0f, 1.0f), (float3)(-7.0f, 7.0f, -7.0f), (float3)(-5.0f, 6.0f, -8.0f) } });
+    if (mtx1_1.columns[0].x != 0.0f)
+    {
+        model[0] = spvTransposeMat3((spvMat3){ { (float3)(-5.0f, -3.0f, -5.0f), (float3)(-2.0f, 2.0f, -5.0f), (float3)(6.0f, 3.0f, -8.0f) } });
+    }
+}
+
diff --git a/reference/shaders-opencl/comp/mat3.comp b/reference/shaders-opencl/comp/mat3.comp
new file mode 100644
index 000000000..12663175a
--- /dev/null
+++ b/reference/shaders-opencl/comp/mat3.comp
@@ -0,0 +1,20 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+typedef struct { float3 columns[3]; } spvMat3;
+
+struct SSBO2
+{
+    spvMat3 out_data[1];
+};
+
+typedef struct SSBO2 SSBO2;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global spvMat3* _22)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    uint ident = ((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x;
+    _22[ident] = (spvMat3){ { (float3)(10.0f), (float3)(20.0f), (float3)(40.0f) } };
+}
+
diff --git a/reference/shaders-opencl/comp/outer-product.comp b/reference/shaders-opencl/comp/outer-product.comp
index 8441e6d2d..d0ab225fd 100644
--- a/reference/shaders-opencl/comp/outer-product.comp
+++ b/reference/shaders-opencl/comp/outer-product.comp
@@ -1,17 +1,27 @@
 // Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
 
 
+typedef struct { float2 columns[2]; } spvMat2;
+typedef struct { float3 columns[2]; } spvMat2x3;
+typedef struct { float4 columns[2]; } spvMat2x4;
+typedef struct { float2 columns[3]; } spvMat3x2;
+typedef struct { float3 columns[3]; } spvMat3;
+typedef struct { float4 columns[3]; } spvMat3x4;
+typedef struct { float2 columns[4]; } spvMat4x2;
+typedef struct { float3 columns[4]; } spvMat4x3;
+typedef struct { float4 columns[4]; } spvMat4;
+
 struct SSBO
 {
-    float2 m22;
-    float3 m23;
-    float4 m24;
-    float2 m32;
-    float3 m33;
-    float4 m34;
-    float2 m42;
-    float3 m43;
-    float4 m44;
+    spvMat2 m22;
+    spvMat2x3 m23;
+    spvMat2x4 m24;
+    spvMat3x2 m32;
+    spvMat3 m33;
+    spvMat3x4 m34;
+    spvMat4x2 m42;
+    spvMat4x3 m43;
+    spvMat4 m44;
 };
 
 typedef struct SSBO SSBO;
@@ -25,18 +35,99 @@ struct ReadSSBO
 
 typedef struct ReadSSBO ReadSSBO;
 
+static spvMat2 spvOuterProductVec2Vec2(float2 c, float2 r)
+{
+    spvMat2 m;
+    m.columns[0] = c * r.x;
+    m.columns[1] = c * r.y;
+    return m;
+}
+
+static spvMat2x3 spvOuterProductVec3Vec2(float3 c, float2 r)
+{
+    spvMat2x3 m;
+    m.columns[0] = c * r.x;
+    m.columns[1] = c * r.y;
+    return m;
+}
+
+static spvMat2x4 spvOuterProductVec4Vec2(float4 c, float2 r)
+{
+    spvMat2x4 m;
+    m.columns[0] = c * r.x;
+    m.columns[1] = c * r.y;
+    return m;
+}
+
+static spvMat3x2 spvOuterProductVec2Vec3(float2 c, float3 r)
+{
+    spvMat3x2 m;
+    m.columns[0] = c * r.x;
+    m.columns[1] = c * r.y;
+    m.columns[2] = c * r.z;
+    return m;
+}
+
+static spvMat3 spvOuterProductVec3Vec3(float3 c, float3 r)
+{
+    spvMat3 m;
+    m.columns[0] = c * r.x;
+    m.columns[1] = c * r.y;
+    m.columns[2] = c * r.z;
+    return m;
+}
+
+static spvMat3x4 spvOuterProductVec4Vec3(float4 c, float3 r)
+{
+    spvMat3x4 m;
+    m.columns[0] = c * r.x;
+    m.columns[1] = c * r.y;
+    m.columns[2] = c * r.z;
+    return m;
+}
+
+static spvMat4x2 spvOuterProductVec2Vec4(float2 c, float4 r)
+{
+    spvMat4x2 m;
+    m.columns[0] = c * r.x;
+    m.columns[1] = c * r.y;
+    m.columns[2] = c * r.z;
+    m.columns[3] = c * r.w;
+    return m;
+}
+
+static spvMat4x3 spvOuterProductVec3Vec4(float3 c, float4 r)
+{
+    spvMat4x3 m;
+    m.columns[0] = c * r.x;
+    m.columns[1] = c * r.y;
+    m.columns[2] = c * r.z;
+    m.columns[3] = c * r.w;
+    return m;
+}
+
+static spvMat4 spvOuterProductVec4Vec4(float4 c, float4 r)
+{
+    spvMat4 m;
+    m.columns[0] = c * r.x;
+    m.columns[1] = c * r.y;
+    m.columns[2] = c * r.z;
+    m.columns[3] = c * r.w;
+    return m;
+}
+
 __attribute__((reqd_work_group_size(1, 1, 1)))
 __kernel void comp_main(__global SSBO* _21, __global const ReadSSBO* _26)
 {
     uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
-    _21->m22 = _26->v2 * _26->v2.x;
-    _21->m23 = _26->v3 * _26->v2.x;
-    _21->m24 = _26->v4 * _26->v2.x;
-    _21->m32 = _26->v2 * _26->v3.x;
-    _21->m33 = _26->v3 * _26->v3.x;
-    _21->m34 = _26->v4 * _26->v3.x;
-    _21->m42 = _26->v2 * _26->v4.x;
-    _21->m43 = _26->v3 * _26->v4.x;
-    _21->m44 = _26->v4 * _26->v4.x;
+    _21->m22 = spvOuterProductVec2Vec2(_26->v2, _26->v2);
+    _21->m23 = spvOuterProductVec3Vec2(_26->v3, _26->v2);
+    _21->m24 = spvOuterProductVec4Vec2(_26->v4, _26->v2);
+    _21->m32 = spvOuterProductVec2Vec3(_26->v2, _26->v3);
+    _21->m33 = spvOuterProductVec3Vec3(_26->v3, _26->v3);
+    _21->m34 = spvOuterProductVec4Vec3(_26->v4, _26->v3);
+    _21->m42 = spvOuterProductVec2Vec4(_26->v2, _26->v4);
+    _21->m43 = spvOuterProductVec3Vec4(_26->v3, _26->v4);
+    _21->m44 = spvOuterProductVec4Vec4(_26->v4, _26->v4);
 }
 
diff --git a/reference/shaders-opencl/comp/rmw-matrix.comp b/reference/shaders-opencl/comp/rmw-matrix.comp
new file mode 100644
index 000000000..9fdc47c62
--- /dev/null
+++ b/reference/shaders-opencl/comp/rmw-matrix.comp
@@ -0,0 +1,41 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+typedef struct { float4 columns[4]; } spvMat4;
+
+struct SSBO
+{
+    float a;
+    float4 b;
+    spvMat4 c;
+    float a1;
+    float4 b1;
+    spvMat4 c1;
+};
+
+typedef struct SSBO SSBO;
+
+static float4 spvMulMat4Vec4(spvMat4 m, float4 v)
+{
+    return m.columns[0] * v.x + m.columns[1] * v.y + m.columns[2] * v.z + m.columns[3] * v.w;
+}
+
+static spvMat4 spvMulMat4Mat4(spvMat4 a, spvMat4 b)
+{
+    spvMat4 r;
+    r.columns[0] = spvMulMat4Vec4(a, b.columns[0]);
+    r.columns[1] = spvMulMat4Vec4(a, b.columns[1]);
+    r.columns[2] = spvMulMat4Vec4(a, b.columns[2]);
+    r.columns[3] = spvMulMat4Vec4(a, b.columns[3]);
+    return r;
+}
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global SSBO* _11)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    _11->a *= _11->a1;
+    _11->b *= _11->b1;
+    _11->c = spvMulMat4Mat4(_11->c, _11->c1);
+}
+
diff --git a/reference/shaders-opencl/comp/scalar-std450-distance-length-normalize.comp b/reference/shaders-opencl/comp/scalar-std450-distance-length-normalize.comp
index 59f3fb7ed..c607a22f3 100644
--- a/reference/shaders-opencl/comp/scalar-std450-distance-length-normalize.comp
+++ b/reference/shaders-opencl/comp/scalar-std450-distance-length-normalize.comp
@@ -17,9 +17,9 @@ __attribute__((reqd_work_group_size(1, 1, 1)))
 __kernel void comp_main(__global SSBO* _9)
 {
     uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
-    _9->c = distance(_9->a, _9->b);
-    _9->d = length(_9->a);
-    _9->e = normalize(_9->a);
-    _9->f = distance(_9->a - 1.0f, _9->b - 2.0f);
+    _9->c = fabs(_9->a - _9->b);
+    _9->d = fabs(_9->a);
+    _9->e = sign(_9->a);
+    _9->f = fabs(_9->a - 1.0f - _9->b - 2.0f);
 }
 
diff --git a/reference/shaders-opencl/comp/shared-matrix-array-of-array.comp b/reference/shaders-opencl/comp/shared-matrix-array-of-array.comp
new file mode 100644
index 000000000..f474aede3
--- /dev/null
+++ b/reference/shaders-opencl/comp/shared-matrix-array-of-array.comp
@@ -0,0 +1,357 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+typedef struct { float3 columns[4]; } spvMat4x3;
+
+struct S1
+{
+    spvMat4x3 a[2];
+    float b;
+    float2 c[3];
+};
+
+typedef struct S1 S1;
+
+struct S2
+{
+    int4 a;
+    bool b[3][1][3];
+};
+
+typedef struct S2 S2;
+
+struct block
+{
+    uint passed;
+};
+
+typedef struct block block;
+
+bool compare_float( float* a_6,  float* b_6)
+{
+    return fabs((*a_6) - (*b_6)) < 0.0500000007450580596923828125f;
+}
+
+bool compare_vec3( float3* a_1_1,  float3* b_1_1)
+{
+    float param_50 = (*a_1_1).x;
+    float param_1_1 = (*b_1_1).x;
+    bool _85 = compare_float(&param_50, &param_1_1);
+    bool _95;
+    if (_85)
+    {
+        float param_2_1 = (*a_1_1).y;
+        float param_3_1 = (*b_1_1).y;
+        _95 = compare_float(&param_2_1, &param_3_1);
+    }
+    else
+    {
+        _95 = _85;
+    }
+    bool _106;
+    if (_95)
+    {
+        float param_4_1 = (*a_1_1).z;
+        float param_5_1 = (*b_1_1).z;
+        _106 = compare_float(&param_4_1, &param_5_1);
+    }
+    else
+    {
+        _106 = _95;
+    }
+    return _106;
+}
+
+bool compare_mat4x3( spvMat4x3* a_2_1,  spvMat4x3* b_2_1)
+{
+    float3 param_6_1 = (*a_2_1).columns[0];
+    float3 param_7_1 = (*b_2_1).columns[0];
+    bool _116 = compare_vec3(&param_6_1, &param_7_1);
+    bool _127;
+    if (_116)
+    {
+        float3 param_8_1 = (*a_2_1).columns[1];
+        float3 param_9_1 = (*b_2_1).columns[1];
+        _127 = compare_vec3(&param_8_1, &param_9_1);
+    }
+    else
+    {
+        _127 = _116;
+    }
+    bool _138;
+    if (_127)
+    {
+        float3 param_10_1 = (*a_2_1).columns[2];
+        float3 param_11_1 = (*b_2_1).columns[2];
+        _138 = compare_vec3(&param_10_1, &param_11_1);
+    }
+    else
+    {
+        _138 = _127;
+    }
+    bool _149;
+    if (_138)
+    {
+        float3 param_12_1 = (*a_2_1).columns[3];
+        float3 param_13_1 = (*b_2_1).columns[3];
+        _149 = compare_vec3(&param_12_1, &param_13_1);
+    }
+    else
+    {
+        _149 = _138;
+    }
+    return _149;
+}
+
+bool compare_vec2( float2* a_3_1,  float2* b_3_1)
+{
+    float param_14_1 = (*a_3_1).x;
+    float param_15_1 = (*b_3_1).x;
+    bool _65 = compare_float(&param_14_1, &param_15_1);
+    bool _76;
+    if (_65)
+    {
+        float param_16_1 = (*a_3_1).y;
+        float param_17_1 = (*b_3_1).y;
+        _76 = compare_float(&param_16_1, &param_17_1);
+    }
+    else
+    {
+        _76 = _65;
+    }
+    return _76;
+}
+
+bool compare_ivec4( int4* a_4_1,  int4* b_4_1)
+{
+    return all((*a_4_1) == (*b_4_1));
+}
+
+bool compare_bool( bool* a_5_1,  bool* b_5_1)
+{
+    return (*a_5_1) == (*b_5_1);
+}
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global uint* _383)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    __local S1 s1;
+    __local S2 s2;
+    s1.a[0] = (spvMat4x3){ { (float3)(0.0f, 2.0f, -8.0f), (float3)(6.0f, 7.0f, 5.0f), (float3)(-6.0f, 1.0f, 9.0f), (float3)(-4.0f, -3.0f, 4.0f) } };
+    s1.a[1] = (spvMat4x3){ { (float3)(4.0f, 9.0f, -9.0f), (float3)(-8.0f, -9.0f, 8.0f), (float3)(0.0f, 4.0f, -4.0f), (float3)(7.0f, 2.0f, -1.0f) } };
+    s1.b = 7.0f;
+    s1.c[0] = (float2)(-5.0f, -4.0f);
+    s1.c[1] = (float2)(3.0f, -5.0f);
+    s1.c[2] = (float2)(-3.0f, -1.0f);
+    s2.a = (int4)(1, 0, -3, 1);
+    s2.b[0][0][0] = true;
+    s2.b[0][0][1] = false;
+    s2.b[0][0][2] = false;
+    s2.b[1][0][0] = true;
+    s2.b[1][0][1] = false;
+    s2.b[1][0][2] = true;
+    s2.b[2][0][0] = false;
+    s2.b[2][0][1] = true;
+    s2.b[2][0][2] = true;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    mem_fence(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
+    bool allOk_1 = true;
+    bool _242;
+    if (allOk_1)
+    {
+        spvMat4x3 param_18_1 = (spvMat4x3){ { (float3)(0.0f, 2.0f, -8.0f), (float3)(6.0f, 7.0f, 5.0f), (float3)(-6.0f, 1.0f, 9.0f), (float3)(-4.0f, -3.0f, 4.0f) } };
+        spvMat4x3 param_19_1 = s1.a[0];
+        _242 = compare_mat4x3(&param_18_1, &param_19_1);
+    }
+    else
+    {
+        _242 = allOk_1;
+    }
+    allOk_1 = _242;
+    bool _251;
+    if (allOk_1)
+    {
+        spvMat4x3 param_20_1 = (spvMat4x3){ { (float3)(4.0f, 9.0f, -9.0f), (float3)(-8.0f, -9.0f, 8.0f), (float3)(0.0f, 4.0f, -4.0f), (float3)(7.0f, 2.0f, -1.0f) } };
+        spvMat4x3 param_21_1 = s1.a[1];
+        _251 = compare_mat4x3(&param_20_1, &param_21_1);
+    }
+    else
+    {
+        _251 = allOk_1;
+    }
+    allOk_1 = _251;
+    bool _260;
+    if (allOk_1)
+    {
+        float param_22_1 = 7.0f;
+        float param_23_1 = s1.b;
+        _260 = compare_float(&param_22_1, &param_23_1);
+    }
+    else
+    {
+        _260 = allOk_1;
+    }
+    allOk_1 = _260;
+    bool _269;
+    if (allOk_1)
+    {
+        float2 param_24_1 = (float2)(-5.0f, -4.0f);
+        float2 param_25_1 = s1.c[0];
+        _269 = compare_vec2(&param_24_1, &param_25_1);
+    }
+    else
+    {
+        _269 = allOk_1;
+    }
+    allOk_1 = _269;
+    bool _278;
+    if (allOk_1)
+    {
+        float2 param_26_1 = (float2)(3.0f, -5.0f);
+        float2 param_27_1 = s1.c[1];
+        _278 = compare_vec2(&param_26_1, &param_27_1);
+    }
+    else
+    {
+        _278 = allOk_1;
+    }
+    allOk_1 = _278;
+    bool _287;
+    if (allOk_1)
+    {
+        float2 param_28_1 = (float2)(-3.0f, -1.0f);
+        float2 param_29_1 = s1.c[2];
+        _287 = compare_vec2(&param_28_1, &param_29_1);
+    }
+    else
+    {
+        _287 = allOk_1;
+    }
+    allOk_1 = _287;
+    bool _296;
+    if (allOk_1)
+    {
+        int4 param_30_1 = (int4)(1, 0, -3, 1);
+        int4 param_31_1 = s2.a;
+        _296 = compare_ivec4(&param_30_1, &param_31_1);
+    }
+    else
+    {
+        _296 = allOk_1;
+    }
+    allOk_1 = _296;
+    bool _305;
+    if (allOk_1)
+    {
+        bool param_32_1 = true;
+        bool param_33_1 = s2.b[0][0][0];
+        _305 = compare_bool(&param_32_1, &param_33_1);
+    }
+    else
+    {
+        _305 = allOk_1;
+    }
+    allOk_1 = _305;
+    bool _314;
+    if (allOk_1)
+    {
+        bool param_34_1 = false;
+        bool param_35_1 = s2.b[0][0][1];
+        _314 = compare_bool(&param_34_1, &param_35_1);
+    }
+    else
+    {
+        _314 = allOk_1;
+    }
+    allOk_1 = _314;
+    bool _323;
+    if (allOk_1)
+    {
+        bool param_36_1 = false;
+        bool param_37_1 = s2.b[0][0][2];
+        _323 = compare_bool(&param_36_1, &param_37_1);
+    }
+    else
+    {
+        _323 = allOk_1;
+    }
+    allOk_1 = _323;
+    bool _332;
+    if (allOk_1)
+    {
+        bool param_38_1 = true;
+        bool param_39_1 = s2.b[1][0][0];
+        _332 = compare_bool(&param_38_1, &param_39_1);
+    }
+    else
+    {
+        _332 = allOk_1;
+    }
+    allOk_1 = _332;
+    bool _341;
+    if (allOk_1)
+    {
+        bool param_40_1 = false;
+        bool param_41_1 = s2.b[1][0][1];
+        _341 = compare_bool(&param_40_1, &param_41_1);
+    }
+    else
+    {
+        _341 = allOk_1;
+    }
+    allOk_1 = _341;
+    bool _350;
+    if (allOk_1)
+    {
+        bool param_42_1 = true;
+        bool param_43_1 = s2.b[1][0][2];
+        _350 = compare_bool(&param_42_1, &param_43_1);
+    }
+    else
+    {
+        _350 = allOk_1;
+    }
+    allOk_1 = _350;
+    bool _359;
+    if (allOk_1)
+    {
+        bool param_44_1 = false;
+        bool param_45_1 = s2.b[2][0][0];
+        _359 = compare_bool(&param_44_1, &param_45_1);
+    }
+    else
+    {
+        _359 = allOk_1;
+    }
+    allOk_1 = _359;
+    bool _368;
+    if (allOk_1)
+    {
+        bool param_46_1 = true;
+        bool param_47_1 = s2.b[2][0][1];
+        _368 = compare_bool(&param_46_1, &param_47_1);
+    }
+    else
+    {
+        _368 = allOk_1;
+    }
+    allOk_1 = _368;
+    bool _377;
+    if (allOk_1)
+    {
+        bool param_48_1 = true;
+        bool param_49_1 = s2.b[2][0][2];
+        _377 = compare_bool(&param_48_1, &param_49_1);
+    }
+    else
+    {
+        _377 = allOk_1;
+    }
+    allOk_1 = _377;
+    if (allOk_1)
+    {
+        _383[0] += as_uint(1);
+    }
+}
+
diff --git a/reference/shaders-opencl/comp/shared-matrix-cast.comp b/reference/shaders-opencl/comp/shared-matrix-cast.comp
new file mode 100644
index 000000000..d5404ae8a
--- /dev/null
+++ b/reference/shaders-opencl/comp/shared-matrix-cast.comp
@@ -0,0 +1,174 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+typedef struct { float2 columns[3]; } spvMat3x2;
+
+struct S1
+{
+    float4 a;
+    spvMat3x2 b;
+    int4 c;
+};
+
+typedef struct S1 S1;
+
+struct block
+{
+    uint passed;
+};
+
+typedef struct block block;
+
+bool compare_float( float* a_5,  float* b_5)
+{
+    return fabs((*a_5) - (*b_5)) < 0.0500000007450580596923828125f;
+}
+
+bool compare_vec4( float4* a_1_1,  float4* b_1_1)
+{
+    float param_24 = (*a_1_1).x;
+    float param_1_1 = (*b_1_1).x;
+    bool _78 = compare_float(&param_24, &param_1_1);
+    bool _88;
+    if (_78)
+    {
+        float param_2_1 = (*a_1_1).y;
+        float param_3_1 = (*b_1_1).y;
+        _88 = compare_float(&param_2_1, &param_3_1);
+    }
+    else
+    {
+        _88 = _78;
+    }
+    bool _99;
+    if (_88)
+    {
+        float param_4_1 = (*a_1_1).z;
+        float param_5_1 = (*b_1_1).z;
+        _99 = compare_float(&param_4_1, &param_5_1);
+    }
+    else
+    {
+        _99 = _88;
+    }
+    bool _110;
+    if (_99)
+    {
+        float param_6_1 = (*a_1_1).w;
+        float param_7_1 = (*b_1_1).w;
+        _110 = compare_float(&param_6_1, &param_7_1);
+    }
+    else
+    {
+        _110 = _99;
+    }
+    return _110;
+}
+
+bool compare_vec2( float2* a_2_1,  float2* b_2_1)
+{
+    float param_8_1 = (*a_2_1).x;
+    float param_9_1 = (*b_2_1).x;
+    bool _58 = compare_float(&param_8_1, &param_9_1);
+    bool _69;
+    if (_58)
+    {
+        float param_10_1 = (*a_2_1).y;
+        float param_11_1 = (*b_2_1).y;
+        _69 = compare_float(&param_10_1, &param_11_1);
+    }
+    else
+    {
+        _69 = _58;
+    }
+    return _69;
+}
+
+bool compare_mat3x2( spvMat3x2* a_3_1,  spvMat3x2* b_3_1)
+{
+    float2 param_12_1 = (*a_3_1).columns[0];
+    float2 param_13_1 = (*b_3_1).columns[0];
+    bool _121 = compare_vec2(&param_12_1, &param_13_1);
+    bool _132;
+    if (_121)
+    {
+        float2 param_14_1 = (*a_3_1).columns[1];
+        float2 param_15_1 = (*b_3_1).columns[1];
+        _132 = compare_vec2(&param_14_1, &param_15_1);
+    }
+    else
+    {
+        _132 = _121;
+    }
+    bool _143;
+    if (_132)
+    {
+        float2 param_16_1 = (*a_3_1).columns[2];
+        float2 param_17_1 = (*b_3_1).columns[2];
+        _143 = compare_vec2(&param_16_1, &param_17_1);
+    }
+    else
+    {
+        _143 = _132;
+    }
+    return _143;
+}
+
+bool compare_bvec4( int4* a_4_1,  int4* b_4_1)
+{
+    return all((*a_4_1) == (*b_4_1));
+}
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global uint* _212)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    __local S1 s1;
+    s1.a = (float4)(1.0f, -5.0f, -9.0f, -5.0f);
+    s1.b = (spvMat3x2){ { (float2)(1.0f, -7.0f), (float2)(1.0f, 2.0f), (float2)(8.0f, 7.0f) } };
+    s1.c = (int4)(false, true, false, false);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    mem_fence(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
+    bool allOk_1 = true;
+    bool _188;
+    if (allOk_1)
+    {
+        float4 param_18_1 = (float4)(1.0f, -5.0f, -9.0f, -5.0f);
+        float4 param_19_1 = s1.a;
+        _188 = compare_vec4(&param_18_1, &param_19_1);
+    }
+    else
+    {
+        _188 = allOk_1;
+    }
+    allOk_1 = _188;
+    bool _197;
+    if (allOk_1)
+    {
+        spvMat3x2 param_20_1 = (spvMat3x2){ { (float2)(1.0f, -7.0f), (float2)(1.0f, 2.0f), (float2)(8.0f, 7.0f) } };
+        spvMat3x2 param_21_1 = s1.b;
+        _197 = compare_mat3x2(&param_20_1, &param_21_1);
+    }
+    else
+    {
+        _197 = allOk_1;
+    }
+    allOk_1 = _197;
+    bool _206;
+    if (allOk_1)
+    {
+        int4 param_22_1 = (int4)(false, true, false, false);
+        int4 param_23_1 = s1.c;
+        _206 = compare_bvec4(&param_22_1, &param_23_1);
+    }
+    else
+    {
+        _206 = allOk_1;
+    }
+    allOk_1 = _206;
+    if (allOk_1)
+    {
+        _212[0] += as_uint(1);
+    }
+}
+
diff --git a/reference/shaders-opencl/comp/shared-matrix-nested-struct-array.comp b/reference/shaders-opencl/comp/shared-matrix-nested-struct-array.comp
new file mode 100644
index 000000000..9b2d8f159
--- /dev/null
+++ b/reference/shaders-opencl/comp/shared-matrix-nested-struct-array.comp
@@ -0,0 +1,401 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+typedef struct { float2 columns[2]; } spvMat2;
+typedef struct { float3 columns[2]; } spvMat2x3;
+typedef struct { float2 columns[3]; } spvMat3x2;
+typedef struct { float3 columns[4]; } spvMat4x3;
+
+struct sA
+{
+    spvMat2x3 mA;
+};
+
+typedef struct sA sA;
+
+struct sB
+{
+    spvMat2 mA;
+    spvMat3x2 mB;
+    uint3 mC;
+};
+
+typedef struct sB sB;
+
+struct sC
+{
+    sA mA;
+    sB mB;
+};
+
+typedef struct sC sC;
+
+struct sD
+{
+    sC mA;
+};
+
+typedef struct sD sD;
+
+struct sE
+{
+    spvMat3x2 mA;
+    spvMat4x3 mB;
+};
+
+typedef struct sE sE;
+
+struct sF
+{
+    sE mA;
+};
+
+typedef struct sF sF;
+
+struct sG
+{
+    sF mA;
+};
+
+typedef struct sG sG;
+
+struct sH
+{
+    int3 mA[2];
+};
+
+typedef struct sH sH;
+
+struct S1
+{
+    sD a;
+    sG b;
+    sH c[2];
+};
+
+typedef struct S1 S1;
+
+struct block
+{
+    uint passed;
+};
+
+typedef struct block block;
+
+bool compare_float( float* a_9,  float* b_9)
+{
+    return fabs((*a_9) - (*b_9)) < 0.0500000007450580596923828125f;
+}
+
+bool compare_vec3( float3* a_1_1,  float3* b_1_1)
+{
+    float param_52 = (*a_1_1).x;
+    float param_1_1 = (*b_1_1).x;
+    bool _106 = compare_float(&param_52, &param_1_1);
+    bool _116;
+    if (_106)
+    {
+        float param_2_1 = (*a_1_1).y;
+        float param_3_1 = (*b_1_1).y;
+        _116 = compare_float(&param_2_1, &param_3_1);
+    }
+    else
+    {
+        _116 = _106;
+    }
+    bool _127;
+    if (_116)
+    {
+        float param_4_1 = (*a_1_1).z;
+        float param_5_1 = (*b_1_1).z;
+        _127 = compare_float(&param_4_1, &param_5_1);
+    }
+    else
+    {
+        _127 = _116;
+    }
+    return _127;
+}
+
+bool compare_mat2x3( spvMat2x3* a_2_1,  spvMat2x3* b_2_1)
+{
+    float3 param_6_1 = (*a_2_1).columns[0];
+    float3 param_7_1 = (*b_2_1).columns[0];
+    bool _158 = compare_vec3(&param_6_1, &param_7_1);
+    bool _168;
+    if (_158)
+    {
+        float3 param_8_1 = (*a_2_1).columns[1];
+        float3 param_9_1 = (*b_2_1).columns[1];
+        _168 = compare_vec3(&param_8_1, &param_9_1);
+    }
+    else
+    {
+        _168 = _158;
+    }
+    return _168;
+}
+
+bool compare_vec2( float2* a_3_1,  float2* b_3_1)
+{
+    float param_10_1 = (*a_3_1).x;
+    float param_11_1 = (*b_3_1).x;
+    bool _86 = compare_float(&param_10_1, &param_11_1);
+    bool _97;
+    if (_86)
+    {
+        float param_12_1 = (*a_3_1).y;
+        float param_13_1 = (*b_3_1).y;
+        _97 = compare_float(&param_12_1, &param_13_1);
+    }
+    else
+    {
+        _97 = _86;
+    }
+    return _97;
+}
+
+bool compare_mat2( spvMat2* a_4_1,  spvMat2* b_4_1)
+{
+    float2 param_14_1 = (*a_4_1).columns[0];
+    float2 param_15_1 = (*b_4_1).columns[0];
+    bool _138 = compare_vec2(&param_14_1, &param_15_1);
+    bool _149;
+    if (_138)
+    {
+        float2 param_16_1 = (*a_4_1).columns[1];
+        float2 param_17_1 = (*b_4_1).columns[1];
+        _149 = compare_vec2(&param_16_1, &param_17_1);
+    }
+    else
+    {
+        _149 = _138;
+    }
+    return _149;
+}
+
+bool compare_mat3x2( spvMat3x2* a_5_1,  spvMat3x2* b_5_1)
+{
+    float2 param_18_1 = (*a_5_1).columns[0];
+    float2 param_19_1 = (*b_5_1).columns[0];
+    bool _177 = compare_vec2(&param_18_1, &param_19_1);
+    bool _187;
+    if (_177)
+    {
+        float2 param_20_1 = (*a_5_1).columns[1];
+        float2 param_21_1 = (*b_5_1).columns[1];
+        _187 = compare_vec2(&param_20_1, &param_21_1);
+    }
+    else
+    {
+        _187 = _177;
+    }
+    bool _198;
+    if (_187)
+    {
+        float2 param_22_1 = (*a_5_1).columns[2];
+        float2 param_23_1 = (*b_5_1).columns[2];
+        _198 = compare_vec2(&param_22_1, &param_23_1);
+    }
+    else
+    {
+        _198 = _187;
+    }
+    return _198;
+}
+
+bool compare_uvec3( uint3* a_6_1,  uint3* b_6_1)
+{
+    return all((*a_6_1) == (*b_6_1));
+}
+
+bool compare_mat4x3( spvMat4x3* a_7_1,  spvMat4x3* b_7_1)
+{
+    float3 param_24_1 = (*a_7_1).columns[0];
+    float3 param_25_1 = (*b_7_1).columns[0];
+    bool _207 = compare_vec3(&param_24_1, &param_25_1);
+    bool _217;
+    if (_207)
+    {
+        float3 param_26_1 = (*a_7_1).columns[1];
+        float3 param_27_1 = (*b_7_1).columns[1];
+        _217 = compare_vec3(&param_26_1, &param_27_1);
+    }
+    else
+    {
+        _217 = _207;
+    }
+    bool _227;
+    if (_217)
+    {
+        float3 param_28_1 = (*a_7_1).columns[2];
+        float3 param_29_1 = (*b_7_1).columns[2];
+        _227 = compare_vec3(&param_28_1, &param_29_1);
+    }
+    else
+    {
+        _227 = _217;
+    }
+    bool _238;
+    if (_227)
+    {
+        float3 param_30_1 = (*a_7_1).columns[3];
+        float3 param_31_1 = (*b_7_1).columns[3];
+        _238 = compare_vec3(&param_30_1, &param_31_1);
+    }
+    else
+    {
+        _238 = _227;
+    }
+    return _238;
+}
+
+bool compare_bvec3( int3* a_8_1,  int3* b_8_1)
+{
+    return all((*a_8_1) == (*b_8_1));
+}
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global uint* _424)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    __local S1 s1;
+    s1.a.mA.mA.mA = (spvMat2x3){ { (float3)(6.0f, 8.0f, 8.0f), (float3)(0.0f, -4.0f, -5.0f) } };
+    s1.a.mA.mB.mA = (spvMat2){ { (float2)(9.0f, -4.0f), (float2)(-6.0f, -1.0f) } };
+    s1.a.mA.mB.mB = (spvMat3x2){ { (float2)(-1.0f, -2.0f), (float2)(1.0f, 6.0f), (float2)(5.0f, 7.0f) } };
+    s1.a.mA.mB.mC = (uint3)(3u, 1u, 5u);
+    s1.b.mA.mA.mA = (spvMat3x2){ { (float2)(8.0f, 3.0f), (float2)(0.0f, 2.0f), (float2)(1.0f, 8.0f) } };
+    s1.b.mA.mA.mB = (spvMat4x3){ { (float3)(0.0f, 9.0f, -1.0f), (float3)(-1.0f, -7.0f, 7.0f), (float3)(-4.0f, -3.0f, 1.0f), (float3)(-4.0f, -9.0f, 1.0f) } };
+    s1.c[0].mA[0] = (int3)(true, false, false);
+    s1.c[0].mA[1] = (int3)(true, false, false);
+    s1.c[1].mA[0] = (int3)(false);
+    s1.c[1].mA[1] = (int3)(false);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    mem_fence(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
+    bool allOk_1 = true;
+    bool _337;
+    if (allOk_1)
+    {
+        spvMat2x3 param_32_1 = (spvMat2x3){ { (float3)(6.0f, 8.0f, 8.0f), (float3)(0.0f, -4.0f, -5.0f) } };
+        spvMat2x3 param_33_1 = s1.a.mA.mA.mA;
+        _337 = compare_mat2x3(&param_32_1, &param_33_1);
+    }
+    else
+    {
+        _337 = allOk_1;
+    }
+    allOk_1 = _337;
+    bool _346;
+    if (allOk_1)
+    {
+        spvMat2 param_34_1 = (spvMat2){ { (float2)(9.0f, -4.0f), (float2)(-6.0f, -1.0f) } };
+        spvMat2 param_35_1 = s1.a.mA.mB.mA;
+        _346 = compare_mat2(&param_34_1, &param_35_1);
+    }
+    else
+    {
+        _346 = allOk_1;
+    }
+    allOk_1 = _346;
+    bool _355;
+    if (allOk_1)
+    {
+        spvMat3x2 param_36_1 = (spvMat3x2){ { (float2)(-1.0f, -2.0f), (float2)(1.0f, 6.0f), (float2)(5.0f, 7.0f) } };
+        spvMat3x2 param_37_1 = s1.a.mA.mB.mB;
+        _355 = compare_mat3x2(&param_36_1, &param_37_1);
+    }
+    else
+    {
+        _355 = allOk_1;
+    }
+    allOk_1 = _355;
+    bool _364;
+    if (allOk_1)
+    {
+        uint3 param_38_1 = (uint3)(3u, 1u, 5u);
+        uint3 param_39_1 = s1.a.mA.mB.mC;
+        _364 = compare_uvec3(&param_38_1, &param_39_1);
+    }
+    else
+    {
+        _364 = allOk_1;
+    }
+    allOk_1 = _364;
+    bool _373;
+    if (allOk_1)
+    {
+        spvMat3x2 param_40_1 = (spvMat3x2){ { (float2)(8.0f, 3.0f), (float2)(0.0f, 2.0f), (float2)(1.0f, 8.0f) } };
+        spvMat3x2 param_41_1 = s1.b.mA.mA.mA;
+        _373 = compare_mat3x2(&param_40_1, &param_41_1);
+    }
+    else
+    {
+        _373 = allOk_1;
+    }
+    allOk_1 = _373;
+    bool _382;
+    if (allOk_1)
+    {
+        spvMat4x3 param_42_1 = (spvMat4x3){ { (float3)(0.0f, 9.0f, -1.0f), (float3)(-1.0f, -7.0f, 7.0f), (float3)(-4.0f, -3.0f, 1.0f), (float3)(-4.0f, -9.0f, 1.0f) } };
+        spvMat4x3 param_43_1 = s1.b.mA.mA.mB;
+        _382 = compare_mat4x3(&param_42_1, &param_43_1);
+    }
+    else
+    {
+        _382 = allOk_1;
+    }
+    allOk_1 = _382;
+    bool _391;
+    if (allOk_1)
+    {
+        int3 param_44_1 = (int3)(true, false, false);
+        int3 param_45_1 = s1.c[0].mA[0];
+        _391 = compare_bvec3(&param_44_1, &param_45_1);
+    }
+    else
+    {
+        _391 = allOk_1;
+    }
+    allOk_1 = _391;
+    bool _400;
+    if (allOk_1)
+    {
+        int3 param_46_1 = (int3)(true, false, false);
+        int3 param_47_1 = s1.c[0].mA[1];
+        _400 = compare_bvec3(&param_46_1, &param_47_1);
+    }
+    else
+    {
+        _400 = allOk_1;
+    }
+    allOk_1 = _400;
+    bool _409;
+    if (allOk_1)
+    {
+        int3 param_48_1 = (int3)(false);
+        int3 param_49_1 = s1.c[1].mA[0];
+        _409 = compare_bvec3(&param_48_1, &param_49_1);
+    }
+    else
+    {
+        _409 = allOk_1;
+    }
+    allOk_1 = _409;
+    bool _418;
+    if (allOk_1)
+    {
+        int3 param_50_1 = (int3)(false);
+        int3 param_51_1 = s1.c[1].mA[1];
+        _418 = compare_bvec3(&param_50_1, &param_51_1);
+    }
+    else
+    {
+        _418 = allOk_1;
+    }
+    allOk_1 = _418;
+    if (allOk_1)
+    {
+        _424[0] += as_uint(1);
+    }
+}
+
diff --git a/reference/shaders-opencl/comp/shared-matrix-nested-struct.comp b/reference/shaders-opencl/comp/shared-matrix-nested-struct.comp
new file mode 100644
index 000000000..b9f2423e4
--- /dev/null
+++ b/reference/shaders-opencl/comp/shared-matrix-nested-struct.comp
@@ -0,0 +1,598 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+typedef struct { float2 columns[2]; } spvMat2;
+typedef struct { float2 columns[3]; } spvMat3x2;
+typedef struct { float4 columns[4]; } spvMat4;
+
+struct S1
+{
+    uint a;
+    float4 b;
+};
+
+typedef struct S1 S1;
+
+struct sA
+{
+    spvMat4 mA;
+    int3 mB;
+    int4 mC;
+};
+
+typedef struct sA sA;
+
+struct sB
+{
+    int2 mA;
+};
+
+typedef struct sB sB;
+
+struct sC
+{
+    float mA;
+    uint4 mB;
+    float mC;
+};
+
+typedef struct sC sC;
+
+struct sD
+{
+    sA mA;
+    sB mB;
+    sC mC;
+};
+
+typedef struct sD sD;
+
+struct sE
+{
+    sD mA;
+};
+
+typedef struct sE sE;
+
+struct sF
+{
+    uint3 mA;
+    bool mB;
+};
+
+typedef struct sF sF;
+
+struct sG
+{
+    sF mA;
+    spvMat3x2 mB;
+};
+
+typedef struct sG sG;
+
+struct sH
+{
+    sG mA;
+    float2 mB;
+};
+
+typedef struct sH sH;
+
+struct sI
+{
+    spvMat2 mA;
+    int3 mB;
+    int4 mC;
+};
+
+typedef struct sI sI;
+
+struct sJ
+{
+    sI mA;
+    int3 mB;
+};
+
+typedef struct sJ sJ;
+
+struct sK
+{
+    int2 mA;
+    sJ mB;
+    int2 mC;
+};
+
+typedef struct sK sK;
+
+struct S2
+{
+    sE a;
+    int3 b;
+    sH c;
+    sK d;
+};
+
+typedef struct S2 S2;
+
+struct block
+{
+    uint passed;
+};
+
+typedef struct block block;
+
+bool compare_uint( uint* a_15,  uint* b_15)
+{
+    return (*a_15) == (*b_15);
+}
+
+bool compare_float( float* a_1_1,  float* b_1_1)
+{
+    return fabs((*a_1_1) - (*b_1_1)) < 0.0500000007450580596923828125f;
+}
+
+bool compare_vec4( float4* a_2_1,  float4* b_2_1)
+{
+    float param_70 = (*a_2_1).x;
+    float param_1_1 = (*b_2_1).x;
+    bool _147 = compare_float(&param_70, &param_1_1);
+    bool _157;
+    if (_147)
+    {
+        float param_2_1 = (*a_2_1).y;
+        float param_3_1 = (*b_2_1).y;
+        _157 = compare_float(&param_2_1, &param_3_1);
+    }
+    else
+    {
+        _157 = _147;
+    }
+    bool _168;
+    if (_157)
+    {
+        float param_4_1 = (*a_2_1).z;
+        float param_5_1 = (*b_2_1).z;
+        _168 = compare_float(&param_4_1, &param_5_1);
+    }
+    else
+    {
+        _168 = _157;
+    }
+    bool _179;
+    if (_168)
+    {
+        float param_6_1 = (*a_2_1).w;
+        float param_7_1 = (*b_2_1).w;
+        _179 = compare_float(&param_6_1, &param_7_1);
+    }
+    else
+    {
+        _179 = _168;
+    }
+    return _179;
+}
+
+bool compare_mat4( spvMat4* a_3_1,  spvMat4* b_3_1)
+{
+    float4 param_8_1 = (*a_3_1).columns[0];
+    float4 param_9_1 = (*b_3_1).columns[0];
+    bool _239 = compare_vec4(&param_8_1, &param_9_1);
+    bool _249;
+    if (_239)
+    {
+        float4 param_10_1 = (*a_3_1).columns[1];
+        float4 param_11_1 = (*b_3_1).columns[1];
+        _249 = compare_vec4(&param_10_1, &param_11_1);
+    }
+    else
+    {
+        _249 = _239;
+    }
+    bool _259;
+    if (_249)
+    {
+        float4 param_12_1 = (*a_3_1).columns[2];
+        float4 param_13_1 = (*b_3_1).columns[2];
+        _259 = compare_vec4(&param_12_1, &param_13_1);
+    }
+    else
+    {
+        _259 = _249;
+    }
+    bool _270;
+    if (_259)
+    {
+        float4 param_14_1 = (*a_3_1).columns[3];
+        float4 param_15_1 = (*b_3_1).columns[3];
+        _270 = compare_vec4(&param_14_1, &param_15_1);
+    }
+    else
+    {
+        _270 = _259;
+    }
+    return _270;
+}
+
+bool compare_bvec3( int3* a_4_1,  int3* b_4_1)
+{
+    return all((*a_4_1) == (*b_4_1));
+}
+
+bool compare_bvec4( int4* a_5_1,  int4* b_5_1)
+{
+    return all((*a_5_1) == (*b_5_1));
+}
+
+bool compare_bvec2( int2* a_6_1,  int2* b_6_1)
+{
+    return all((*a_6_1) == (*b_6_1));
+}
+
+bool compare_uvec4( uint4* a_7_1,  uint4* b_7_1)
+{
+    return all((*a_7_1) == (*b_7_1));
+}
+
+bool compare_ivec3( int3* a_8_1,  int3* b_8_1)
+{
+    return all((*a_8_1) == (*b_8_1));
+}
+
+bool compare_uvec3( uint3* a_9_1,  uint3* b_9_1)
+{
+    return all((*a_9_1) == (*b_9_1));
+}
+
+bool compare_bool( bool* a_10_1,  bool* b_10_1)
+{
+    return (*a_10_1) == (*b_10_1);
+}
+
+bool compare_vec2( float2* a_11_1,  float2* b_11_1)
+{
+    float param_16_1 = (*a_11_1).x;
+    float param_17_1 = (*b_11_1).x;
+    bool _127 = compare_float(&param_16_1, &param_17_1);
+    bool _138;
+    if (_127)
+    {
+        float param_18_1 = (*a_11_1).y;
+        float param_19_1 = (*b_11_1).y;
+        _138 = compare_float(&param_18_1, &param_19_1);
+    }
+    else
+    {
+        _138 = _127;
+    }
+    return _138;
+}
+
+bool compare_mat3x2( spvMat3x2* a_12_1,  spvMat3x2* b_12_1)
+{
+    float2 param_20_1 = (*a_12_1).columns[0];
+    float2 param_21_1 = (*b_12_1).columns[0];
+    bool _209 = compare_vec2(&param_20_1, &param_21_1);
+    bool _219;
+    if (_209)
+    {
+        float2 param_22_1 = (*a_12_1).columns[1];
+        float2 param_23_1 = (*b_12_1).columns[1];
+        _219 = compare_vec2(&param_22_1, &param_23_1);
+    }
+    else
+    {
+        _219 = _209;
+    }
+    bool _230;
+    if (_219)
+    {
+        float2 param_24_1 = (*a_12_1).columns[2];
+        float2 param_25_1 = (*b_12_1).columns[2];
+        _230 = compare_vec2(&param_24_1, &param_25_1);
+    }
+    else
+    {
+        _230 = _219;
+    }
+    return _230;
+}
+
+bool compare_mat2( spvMat2* a_13_1,  spvMat2* b_13_1)
+{
+    float2 param_26_1 = (*a_13_1).columns[0];
+    float2 param_27_1 = (*b_13_1).columns[0];
+    bool _189 = compare_vec2(&param_26_1, &param_27_1);
+    bool _200;
+    if (_189)
+    {
+        float2 param_28_1 = (*a_13_1).columns[1];
+        float2 param_29_1 = (*b_13_1).columns[1];
+        _200 = compare_vec2(&param_28_1, &param_29_1);
+    }
+    else
+    {
+        _200 = _189;
+    }
+    return _200;
+}
+
+bool compare_ivec2( int2* a_14_1,  int2* b_14_1)
+{
+    return all((*a_14_1) == (*b_14_1));
+}
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global uint* _612)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    __local S1 s1;
+    __local S2 s2;
+    s1.a = 0u;
+    s1.b = (float4)(8.0f, 8.0f, 0.0f, -4.0f);
+    s2.a.mA.mA.mA = (spvMat4){ { (float4)(-5.0f, 9.0f, -4.0f, -6.0f), (float4)(-1.0f, -1.0f, -2.0f, 1.0f), (float4)(6.0f, 5.0f, 7.0f, -2.0f), (float4)(-4.0f, -9.0f, 8.0f, 3.0f) } };
+    s2.a.mA.mA.mB = (int3)(true, false, false);
+    s2.a.mA.mA.mC = (int4)(true, true, true, false);
+    s2.a.mA.mB.mA = (int2)(true);
+    s2.a.mA.mC.mA = 7.0f;
+    s2.a.mA.mC.mB = (uint4)(8u, 6u, 2u, 0u);
+    s2.a.mA.mC.mC = -9.0f;
+    s2.b = (int3)(1, -4, 0);
+    s2.c.mA.mA.mA = (uint3)(4u, 9u, 1u);
+    s2.c.mA.mA.mB = false;
+    s2.c.mA.mB = (spvMat3x2){ { (float2)(3.0f, -5.0f), (float2)(-1.0f, -5.0f), (float2)(-1.0f, -9.0f) } };
+    s2.c.mB = (float2)(-6.0f, -9.0f);
+    s2.d.mA = (int2)(true, false);
+    s2.d.mB.mA.mA = (spvMat2){ { (float2)(-2.0f, 3.0f), (float2)(7.0f, 2.0f) } };
+    s2.d.mB.mA.mB = (int3)(false);
+    s2.d.mB.mA.mC = (int4)(false, false, false, true);
+    s2.d.mB.mB = (int3)(true, false, false);
+    s2.d.mC = (int2)(-9, 0);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    mem_fence(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
+    bool allOk_1 = true;
+    bool _435;
+    if (allOk_1)
+    {
+        uint param_30_1 = 0u;
+        uint param_31_1 = s1.a;
+        _435 = compare_uint(&param_30_1, &param_31_1);
+    }
+    else
+    {
+        _435 = allOk_1;
+    }
+    allOk_1 = _435;
+    bool _444;
+    if (allOk_1)
+    {
+        float4 param_32_1 = (float4)(8.0f, 8.0f, 0.0f, -4.0f);
+        float4 param_33_1 = s1.b;
+        _444 = compare_vec4(&param_32_1, &param_33_1);
+    }
+    else
+    {
+        _444 = allOk_1;
+    }
+    allOk_1 = _444;
+    bool _453;
+    if (allOk_1)
+    {
+        spvMat4 param_34_1 = (spvMat4){ { (float4)(-5.0f, 9.0f, -4.0f, -6.0f), (float4)(-1.0f, -1.0f, -2.0f, 1.0f), (float4)(6.0f, 5.0f, 7.0f, -2.0f), (float4)(-4.0f, -9.0f, 8.0f, 3.0f) } };
+        spvMat4 param_35_1 = s2.a.mA.mA.mA;
+        _453 = compare_mat4(&param_34_1, &param_35_1);
+    }
+    else
+    {
+        _453 = allOk_1;
+    }
+    allOk_1 = _453;
+    bool _462;
+    if (allOk_1)
+    {
+        int3 param_36_1 = (int3)(true, false, false);
+        int3 param_37_1 = s2.a.mA.mA.mB;
+        _462 = compare_bvec3(&param_36_1, &param_37_1);
+    }
+    else
+    {
+        _462 = allOk_1;
+    }
+    allOk_1 = _462;
+    bool _471;
+    if (allOk_1)
+    {
+        int4 param_38_1 = (int4)(true, true, true, false);
+        int4 param_39_1 = s2.a.mA.mA.mC;
+        _471 = compare_bvec4(&param_38_1, &param_39_1);
+    }
+    else
+    {
+        _471 = allOk_1;
+    }
+    allOk_1 = _471;
+    bool _480;
+    if (allOk_1)
+    {
+        int2 param_40_1 = (int2)(true);
+        int2 param_41_1 = s2.a.mA.mB.mA;
+        _480 = compare_bvec2(&param_40_1, &param_41_1);
+    }
+    else
+    {
+        _480 = allOk_1;
+    }
+    allOk_1 = _480;
+    bool _489;
+    if (allOk_1)
+    {
+        float param_42_1 = 7.0f;
+        float param_43_1 = s2.a.mA.mC.mA;
+        _489 = compare_float(&param_42_1, &param_43_1);
+    }
+    else
+    {
+        _489 = allOk_1;
+    }
+    allOk_1 = _489;
+    bool _498;
+    if (allOk_1)
+    {
+        uint4 param_44_1 = (uint4)(8u, 6u, 2u, 0u);
+        uint4 param_45_1 = s2.a.mA.mC.mB;
+        _498 = compare_uvec4(&param_44_1, &param_45_1);
+    }
+    else
+    {
+        _498 = allOk_1;
+    }
+    allOk_1 = _498;
+    bool _507;
+    if (allOk_1)
+    {
+        float param_46_1 = -9.0f;
+        float param_47_1 = s2.a.mA.mC.mC;
+        _507 = compare_float(&param_46_1, &param_47_1);
+    }
+    else
+    {
+        _507 = allOk_1;
+    }
+    allOk_1 = _507;
+    bool _516;
+    if (allOk_1)
+    {
+        int3 param_48_1 = (int3)(1, -4, 0);
+        int3 param_49_1 = s2.b;
+        _516 = compare_ivec3(&param_48_1, &param_49_1);
+    }
+    else
+    {
+        _516 = allOk_1;
+    }
+    allOk_1 = _516;
+    bool _525;
+    if (allOk_1)
+    {
+        uint3 param_50_1 = (uint3)(4u, 9u, 1u);
+        uint3 param_51_1 = s2.c.mA.mA.mA;
+        _525 = compare_uvec3(&param_50_1, &param_51_1);
+    }
+    else
+    {
+        _525 = allOk_1;
+    }
+    allOk_1 = _525;
+    bool _534;
+    if (allOk_1)
+    {
+        bool param_52_1 = false;
+        bool param_53_1 = s2.c.mA.mA.mB;
+        _534 = compare_bool(&param_52_1, &param_53_1);
+    }
+    else
+    {
+        _534 = allOk_1;
+    }
+    allOk_1 = _534;
+    bool _543;
+    if (allOk_1)
+    {
+        spvMat3x2 param_54_1 = (spvMat3x2){ { (float2)(3.0f, -5.0f), (float2)(-1.0f, -5.0f), (float2)(-1.0f, -9.0f) } };
+        spvMat3x2 param_55_1 = s2.c.mA.mB;
+        _543 = compare_mat3x2(&param_54_1, &param_55_1);
+    }
+    else
+    {
+        _543 = allOk_1;
+    }
+    allOk_1 = _543;
+    bool _552;
+    if (allOk_1)
+    {
+        float2 param_56_1 = (float2)(-6.0f, -9.0f);
+        float2 param_57_1 = s2.c.mB;
+        _552 = compare_vec2(&param_56_1, &param_57_1);
+    }
+    else
+    {
+        _552 = allOk_1;
+    }
+    allOk_1 = _552;
+    bool _561;
+    if (allOk_1)
+    {
+        int2 param_58_1 = (int2)(true, false);
+        int2 param_59_1 = s2.d.mA;
+        _561 = compare_bvec2(&param_58_1, &param_59_1);
+    }
+    else
+    {
+        _561 = allOk_1;
+    }
+    allOk_1 = _561;
+    bool _570;
+    if (allOk_1)
+    {
+        spvMat2 param_60_1 = (spvMat2){ { (float2)(-2.0f, 3.0f), (float2)(7.0f, 2.0f) } };
+        spvMat2 param_61_1 = s2.d.mB.mA.mA;
+        _570 = compare_mat2(&param_60_1, &param_61_1);
+    }
+    else
+    {
+        _570 = allOk_1;
+    }
+    allOk_1 = _570;
+    bool _579;
+    if (allOk_1)
+    {
+        int3 param_62_1 = (int3)(false);
+        int3 param_63_1 = s2.d.mB.mA.mB;
+        _579 = compare_bvec3(&param_62_1, &param_63_1);
+    }
+    else
+    {
+        _579 = allOk_1;
+    }
+    allOk_1 = _579;
+    bool _588;
+    if (allOk_1)
+    {
+        int4 param_64_1 = (int4)(false, false, false, true);
+        int4 param_65_1 = s2.d.mB.mA.mC;
+        _588 = compare_bvec4(&param_64_1, &param_65_1);
+    }
+    else
+    {
+        _588 = allOk_1;
+    }
+    allOk_1 = _588;
+    bool _597;
+    if (allOk_1)
+    {
+        int3 param_66_1 = (int3)(true, false, false);
+        int3 param_67_1 = s2.d.mB.mB;
+        _597 = compare_bvec3(&param_66_1, &param_67_1);
+    }
+    else
+    {
+        _597 = allOk_1;
+    }
+    allOk_1 = _597;
+    bool _606;
+    if (allOk_1)
+    {
+        int2 param_68_1 = (int2)(-9, 0);
+        int2 param_69_1 = s2.d.mC;
+        _606 = compare_ivec2(&param_68_1, &param_69_1);
+    }
+    else
+    {
+        _606 = allOk_1;
+    }
+    allOk_1 = _606;
+    if (allOk_1)
+    {
+        _612[0] += as_uint(1);
+    }
+}
+
diff --git a/reference/shaders-opencl/comp/shared-std450.double.comp b/reference/shaders-opencl/comp/shared-std450.fp64.comp
similarity index 100%
rename from reference/shaders-opencl/comp/shared-std450.double.comp
rename to reference/shaders-opencl/comp/shared-std450.fp64.comp
diff --git a/reference/shaders-opencl/comp/struct-layout.comp b/reference/shaders-opencl/comp/struct-layout.comp
index eb416ee27..375cfed37 100644
--- a/reference/shaders-opencl/comp/struct-layout.comp
+++ b/reference/shaders-opencl/comp/struct-layout.comp
@@ -1,9 +1,11 @@
 // Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
 
 
+typedef struct { float4 columns[4]; } spvMat4;
+
 struct Foo
 {
-    float4 m;
+    spvMat4 m;
 };
 
 typedef struct Foo Foo;
@@ -22,11 +24,26 @@ struct SSBO
 
 typedef struct SSBO SSBO;
 
+static float4 spvMulMat4Vec4(spvMat4 m, float4 v)
+{
+    return m.columns[0] * v.x + m.columns[1] * v.y + m.columns[2] * v.z + m.columns[3] * v.w;
+}
+
+static spvMat4 spvMulMat4Mat4(spvMat4 a, spvMat4 b)
+{
+    spvMat4 r;
+    r.columns[0] = spvMulMat4Vec4(a, b.columns[0]);
+    r.columns[1] = spvMulMat4Vec4(a, b.columns[1]);
+    r.columns[2] = spvMulMat4Vec4(a, b.columns[2]);
+    r.columns[3] = spvMulMat4Vec4(a, b.columns[3]);
+    return r;
+}
+
 __attribute__((reqd_work_group_size(1, 1, 1)))
 __kernel void comp_main(__global Foo* _23, __global const Foo* _30)
 {
     uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
-    uint ident = ((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x;
-    _23[ident].m = _30[ident].m * _30[ident].m;
+    uint ident_1 = ((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x;
+    _23[ident_1].m = spvMulMat4Mat4(_30[ident_1].m, _30[ident_1].m);
 }
 
diff --git a/reference/shaders-opencl/comp/struct-packing.comp b/reference/shaders-opencl/comp/struct-packing.comp
new file mode 100644
index 000000000..6fd919f9f
--- /dev/null
+++ b/reference/shaders-opencl/comp/struct-packing.comp
@@ -0,0 +1,125 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+typedef struct { float2 columns[2]; } spvMat2;
+typedef struct { float3 columns[2]; } spvMat2x3;
+typedef struct { float2 columns[3]; } spvMat3x2;
+
+struct S0
+{
+    float2 a[1];
+    float b;
+};
+
+typedef struct S0 S0;
+
+struct S1
+{
+    float3 a;
+    float b;
+};
+
+typedef struct S1 S1;
+
+struct S2
+{
+    float3 a[1];
+    float b;
+};
+
+typedef struct S2 S2;
+
+struct S3
+{
+    float2 a;
+    float b;
+};
+
+typedef struct S3 S3;
+
+struct S4
+{
+    float2 c;
+};
+
+typedef struct S4 S4;
+
+struct Content
+{
+    S0 m0s[1];
+    S1 m1s[1];
+    S2 m2s[1];
+    S0 m0;
+    S1 m1;
+    S2 m2;
+    S3 m3;
+    float m4;
+    S4 m3s[8];
+};
+
+typedef struct Content Content;
+
+struct SSBO1
+{
+    Content content;
+    Content content1[2];
+    Content content2;
+    spvMat2 m0;
+    spvMat2 m1;
+    spvMat2x3 m2[4];
+    spvMat3x2 m3;
+    spvMat2 m4;
+    spvMat2 m5[9];
+    spvMat3x2 m6[4][2];
+    spvMat2x3 m7;
+    float array[1];
+};
+
+typedef struct SSBO1 SSBO1;
+
+struct SSBO0
+{
+    Content content;
+    Content content1[2];
+    Content content2;
+    float array[1];
+};
+
+typedef struct SSBO0 SSBO0;
+
+static float3 spvMulVec2Mat3x2(float2 v, spvMat3x2 m)
+{
+    return (float3)(dot(v, m.columns[0]), dot(v, m.columns[1]), dot(v, m.columns[2]));
+}
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global SSBO1* ssbo_430, __global SSBO0* ssbo_140)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    Content _60 = ssbo_140->content;
+    ssbo_430->content.m0s[0].a[0] = _60.m0s[0].a[0];
+    ssbo_430->content.m0s[0].b = _60.m0s[0].b;
+    ssbo_430->content.m1s[0].a = _60.m1s[0].a;
+    ssbo_430->content.m1s[0].b = _60.m1s[0].b;
+    ssbo_430->content.m2s[0].a[0] = _60.m2s[0].a[0];
+    ssbo_430->content.m2s[0].b = _60.m2s[0].b;
+    ssbo_430->content.m0.a[0] = _60.m0.a[0];
+    ssbo_430->content.m0.b = _60.m0.b;
+    ssbo_430->content.m1.a = _60.m1.a;
+    ssbo_430->content.m1.b = _60.m1.b;
+    ssbo_430->content.m2.a[0] = _60.m2.a[0];
+    ssbo_430->content.m2.b = _60.m2.b;
+    ssbo_430->content.m3.a = _60.m3.a;
+    ssbo_430->content.m3.b = _60.m3.b;
+    ssbo_430->content.m4 = _60.m4;
+    ssbo_430->content.m3s[0].c = _60.m3s[0].c;
+    ssbo_430->content.m3s[1].c = _60.m3s[1].c;
+    ssbo_430->content.m3s[2].c = _60.m3s[2].c;
+    ssbo_430->content.m3s[3].c = _60.m3s[3].c;
+    ssbo_430->content.m3s[4].c = _60.m3s[4].c;
+    ssbo_430->content.m3s[5].c = _60.m3s[5].c;
+    ssbo_430->content.m3s[6].c = _60.m3s[6].c;
+    ssbo_430->content.m3s[7].c = _60.m3s[7].c;
+    ssbo_430->content.m1.a = spvMulVec2Mat3x2(ssbo_430->content.m3.a, ssbo_430->m6[1][1]);
+}
+
diff --git a/reference/shaders-opencl/comp/struct-packing.invalid.comp b/reference/shaders-opencl/comp/struct-packing.invalid.comp
deleted file mode 100644
index e69de29bb..000000000
diff --git a/reference/shaders-opencl/comp/torture-loop.comp b/reference/shaders-opencl/comp/torture-loop.comp
index 45f32a55b..6a978b9d6 100644
--- a/reference/shaders-opencl/comp/torture-loop.comp
+++ b/reference/shaders-opencl/comp/torture-loop.comp
@@ -1,9 +1,11 @@
 // Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
 
 
+typedef struct { float4 columns[4]; } spvMat4;
+
 struct SSBO
 {
-    float4 mvp;
+    spvMat4 mvp;
     float4 in_data[1];
 };
 
@@ -16,6 +18,11 @@ struct SSBO2
 
 typedef struct SSBO2 SSBO2;
 
+static float4 spvMulMat4Vec4(spvMat4 m, float4 v)
+{
+    return m.columns[0] * v.x + m.columns[1] * v.y + m.columns[2] * v.z + m.columns[3] * v.w;
+}
+
 __attribute__((reqd_work_group_size(1, 1, 1)))
 __kernel void comp_main(__global const SSBO* _24, __global float4* _89)
 {
@@ -43,7 +50,7 @@ __kernel void comp_main(__global const SSBO* _24, __global float4* _89)
     {
         for (uint j_1 = 0u; j_1 < 30u; j_1 += as_uint(1))
         {
-            idat_1 = _24->mvp * idat_1;
+            idat_1 = spvMulMat4Vec4(_24->mvp, idat_1);
         }
     }
     do
diff --git a/shaders-opencl/asm/comp/relaxed-block-layout.asm.comp b/shaders-opencl/asm/comp/relaxed-block-layout.fp16.asm.comp
similarity index 100%
rename from shaders-opencl/asm/comp/relaxed-block-layout.asm.comp
rename to shaders-opencl/asm/comp/relaxed-block-layout.fp16.asm.comp
diff --git a/shaders-opencl/comp/inverse.comp b/shaders-opencl/comp/inverse.comp
new file mode 100644
index 000000000..03b06d646
--- /dev/null
+++ b/shaders-opencl/comp/inverse.comp
@@ -0,0 +1,23 @@
+#version 450
+layout(local_size_x = 1) in;
+
+layout(std430, binding = 0) writeonly buffer MatrixOut
+{
+	mat2 m2out;
+	mat3 m3out;
+	mat4 m4out;
+};
+
+layout(std430, binding = 1) readonly buffer MatrixIn
+{
+	mat2 m2in;
+	mat3 m3in;
+	mat4 m4in;
+};
+
+void main()
+{
+	m2out = inverse(m2in);
+	m3out = inverse(m3in);
+	m4out = inverse(m4in);
+}
diff --git a/shaders-opencl/comp/mat3-row-maj-read-write-const.comp b/shaders-opencl/comp/mat3-row-maj-read-write-const.comp
new file mode 100644
index 000000000..068ad7972
--- /dev/null
+++ b/shaders-opencl/comp/mat3-row-maj-read-write-const.comp
@@ -0,0 +1,17 @@
+#version 450
+layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
+
+layout(set = 0, binding = 1, std430) buffer model_t
+{
+    layout(row_major) mediump mat3 mtx_rm;
+} model;
+
+void main()
+{
+    mat3 mtx_cm = model.mtx_rm;
+    mat3 mtx1 = mtx_cm * mat3(vec3(4.0, -3.0, 1.0), vec3(-7.0, 7.0, -7.0), vec3(-5.0, 6.0, -8.0));
+    if (mtx1[0][0] != 0.0)
+    {
+	    model.mtx_rm = mat3(vec3(-5.0, -3.0, -5.0), vec3(-2.0, 2.0, -5.0), vec3(6.0, 3.0, -8.0));
+    }
+}
diff --git a/shaders-opencl/comp/mat3.comp b/shaders-opencl/comp/mat3.comp
new file mode 100644
index 000000000..7c5bb1e4f
--- /dev/null
+++ b/shaders-opencl/comp/mat3.comp
@@ -0,0 +1,14 @@
+#version 310 es
+layout(local_size_x = 1) in;
+
+layout(std430, binding = 1) writeonly buffer SSBO2
+{
+    mat3 out_data[];
+};
+
+void main()
+{
+    uint ident = gl_GlobalInvocationID.x;
+    out_data[ident] = mat3(vec3(10.0), vec3(20.0), vec3(40.0));
+}
+
diff --git a/shaders-opencl/comp/rmw-matrix.comp b/shaders-opencl/comp/rmw-matrix.comp
new file mode 100644
index 000000000..c158ab4dd
--- /dev/null
+++ b/shaders-opencl/comp/rmw-matrix.comp
@@ -0,0 +1,20 @@
+#version 310 es
+layout(local_size_x = 1) in;
+
+layout(std430, binding = 0) buffer SSBO
+{
+	float a;
+	vec4 b;
+	mat4 c;
+
+	float a1;
+	vec4 b1;
+	mat4 c1;
+};
+
+void main()
+{
+	a *= a1;
+	b *= b1;
+	c *= c1;
+}
diff --git a/shaders-opencl/comp/shared-matrix-array-of-array.comp b/shaders-opencl/comp/shared-matrix-array-of-array.comp
new file mode 100644
index 000000000..3bbd4c0f0
--- /dev/null
+++ b/shaders-opencl/comp/shared-matrix-array-of-array.comp
@@ -0,0 +1,65 @@
+#version 450
+layout(local_size_x = 1) in;
+
+layout(std140, binding = 0) buffer block { highp uint passed; };
+struct S1 {
+	mediump mat4x3 a[2];
+	lowp float b;
+	lowp vec2 c[3];
+};
+struct S2 {
+	highp ivec4 a;
+	bool b[3][1][3];
+};
+
+bool compare_float    (highp float a, highp float b)  { return abs(a - b) < 0.05; }
+bool compare_vec2     (highp vec2 a, highp vec2 b)    { return compare_float(a.x, b.x)&&compare_float(a.y, b.y); }
+bool compare_vec3     (highp vec3 a, highp vec3 b)    { return compare_float(a.x, b.x)&&compare_float(a.y, b.y)&&compare_float(a.z, b.z); }
+bool compare_mat4x3   (highp mat4x3 a, highp mat4x3 b){ return compare_vec3(a[0], b[0])&&compare_vec3(a[1], b[1])&&compare_vec3(a[2], b[2])&&compare_vec3(a[3], b[3]); }
+bool compare_ivec4    (highp ivec4 a, highp ivec4 b)  { return a == b; }
+bool compare_bool     (bool a, bool b)                { return a == b; }
+
+shared S1 s1;
+shared S2 s2;
+
+void main (void) {
+	s1.a[0] = mat4x3(0.0, 2.0, -8.0, 6.0, 7.0, 5.0, -6.0, 1.0, 9.0, -4.0, -3.0, 4.0);
+	s1.a[1] = mat4x3(4.0, 9.0, -9.0, -8.0, -9.0, 8.0, 0.0, 4.0, -4.0, 7.0, 2.0, -1.0);
+	s1.b = 7.0;
+	s1.c[0] = vec2(-5.0, -4.0);
+	s1.c[1] = vec2(3.0, -5.0);
+	s1.c[2] = vec2(-3.0, -1.0);
+	s2.a = ivec4(1, 0, -3, 1);
+	s2.b[0][0][0] = true;
+	s2.b[0][0][1] = false;
+	s2.b[0][0][2] = false;
+	s2.b[1][0][0] = true;
+	s2.b[1][0][1] = false;
+	s2.b[1][0][2] = true;
+	s2.b[2][0][0] = false;
+	s2.b[2][0][1] = true;
+	s2.b[2][0][2] = true;
+
+	barrier();
+	memoryBarrier();
+	bool allOk = true;
+	allOk = allOk && compare_mat4x3(mat4x3(0.0, 2.0, -8.0, 6.0, 7.0, 5.0, -6.0, 1.0, 9.0, -4.0, -3.0, 4.0), s1.a[0]);
+	allOk = allOk && compare_mat4x3(mat4x3(4.0, 9.0, -9.0, -8.0, -9.0, 8.0, 0.0, 4.0, -4.0, 7.0, 2.0, -1.0), s1.a[1]);
+	allOk = allOk && compare_float(7.0, s1.b);
+	allOk = allOk && compare_vec2(vec2(-5.0, -4.0), s1.c[0]);
+	allOk = allOk && compare_vec2(vec2(3.0, -5.0), s1.c[1]);
+	allOk = allOk && compare_vec2(vec2(-3.0, -1.0), s1.c[2]);
+	allOk = allOk && compare_ivec4(ivec4(1, 0, -3, 1), s2.a);
+	allOk = allOk && compare_bool(true, s2.b[0][0][0]);
+	allOk = allOk && compare_bool(false, s2.b[0][0][1]);
+	allOk = allOk && compare_bool(false, s2.b[0][0][2]);
+	allOk = allOk && compare_bool(true, s2.b[1][0][0]);
+	allOk = allOk && compare_bool(false, s2.b[1][0][1]);
+	allOk = allOk && compare_bool(true, s2.b[1][0][2]);
+	allOk = allOk && compare_bool(false, s2.b[2][0][0]);
+	allOk = allOk && compare_bool(true, s2.b[2][0][1]);
+	allOk = allOk && compare_bool(true, s2.b[2][0][2]);
+	if (allOk)
+		passed++;
+
+}
diff --git a/shaders-opencl/comp/shared-matrix-cast.comp b/shaders-opencl/comp/shared-matrix-cast.comp
new file mode 100644
index 000000000..7e46fed7a
--- /dev/null
+++ b/shaders-opencl/comp/shared-matrix-cast.comp
@@ -0,0 +1,33 @@
+#version 450
+layout(local_size_x = 1) in;
+
+layout(std140, binding = 0) buffer block { highp uint passed; };
+struct S1 {
+	mediump vec4 a;
+	highp mat3x2 b;
+	bvec4 c;
+};
+
+bool compare_float    (highp float a, highp float b)  { return abs(a - b) < 0.05; }
+bool compare_vec2     (highp vec2 a, highp vec2 b)    { return compare_float(a.x, b.x)&&compare_float(a.y, b.y); }
+bool compare_vec4     (highp vec4 a, highp vec4 b)    { return compare_float(a.x, b.x)&&compare_float(a.y, b.y)&&compare_float(a.z, b.z)&&compare_float(a.w, b.w); }
+bool compare_mat3x2   (highp mat3x2 a, highp mat3x2 b){ return compare_vec2(a[0], b[0])&&compare_vec2(a[1], b[1])&&compare_vec2(a[2], b[2]); }
+bool compare_bvec4    (bvec4 a, bvec4 b)              { return a == b; }
+
+shared S1 s1;
+
+void main (void) {
+	s1.a = vec4(1.0, -5.0, -9.0, -5.0);
+	s1.b = mat3x2(1.0, -7.0, 1.0, 2.0, 8.0, 7.0);
+	s1.c = bvec4(false, true, false, false);
+
+	barrier();
+	memoryBarrier();
+	bool allOk = true;
+	allOk = allOk && compare_vec4(vec4(1.0, -5.0, -9.0, -5.0), s1.a);
+	allOk = allOk && compare_mat3x2(mat3x2(1.0, -7.0, 1.0, 2.0, 8.0, 7.0), s1.b);
+	allOk = allOk && compare_bvec4(bvec4(false, true, false, false), s1.c);
+	if (allOk)
+		passed++;
+
+}
diff --git a/shaders-opencl/comp/shared-matrix-nested-struct-array.comp b/shaders-opencl/comp/shared-matrix-nested-struct-array.comp
new file mode 100644
index 000000000..59ab24d84
--- /dev/null
+++ b/shaders-opencl/comp/shared-matrix-nested-struct-array.comp
@@ -0,0 +1,87 @@
+#version 450
+layout(local_size_x = 1) in;
+
+layout(std140, binding = 0) buffer block { highp uint passed; };
+struct sA
+{
+	mediump mat2x3 mA;
+};
+struct sB
+{
+	mediump mat2 mA;
+	mediump mat3x2 mB;
+	highp uvec3 mC;
+};
+struct sC
+{
+	sA mA;
+	sB mB;
+};
+struct sD
+{
+	sC mA;
+};
+struct sE
+{
+	lowp mat3x2 mA;
+	lowp mat4x3 mB;
+};
+struct sF
+{
+	sE mA;
+};
+struct sG
+{
+	sF mA;
+};
+struct sH
+{
+	bvec3 mA[2];
+};
+struct S1 {
+	sD a;
+	sG b;
+	sH c[2];
+};
+
+bool compare_float    (highp float a, highp float b)  { return abs(a - b) < 0.05; }
+bool compare_vec2     (highp vec2 a, highp vec2 b)    { return compare_float(a.x, b.x)&&compare_float(a.y, b.y); }
+bool compare_vec3     (highp vec3 a, highp vec3 b)    { return compare_float(a.x, b.x)&&compare_float(a.y, b.y)&&compare_float(a.z, b.z); }
+bool compare_mat2     (highp mat2 a, highp mat2 b)    { return compare_vec2(a[0], b[0])&&compare_vec2(a[1], b[1]); }
+bool compare_mat2x3   (highp mat2x3 a, highp mat2x3 b){ return compare_vec3(a[0], b[0])&&compare_vec3(a[1], b[1]); }
+bool compare_mat3x2   (highp mat3x2 a, highp mat3x2 b){ return compare_vec2(a[0], b[0])&&compare_vec2(a[1], b[1])&&compare_vec2(a[2], b[2]); }
+bool compare_mat4x3   (highp mat4x3 a, highp mat4x3 b){ return compare_vec3(a[0], b[0])&&compare_vec3(a[1], b[1])&&compare_vec3(a[2], b[2])&&compare_vec3(a[3], b[3]); }
+bool compare_uvec3    (highp uvec3 a, highp uvec3 b)  { return a == b; }
+bool compare_bvec3    (bvec3 a, bvec3 b)              { return a == b; }
+
+shared S1 s1;
+
+void main (void) {
+	s1.a.mA.mA.mA = mat2x3(6.0, 8.0, 8.0, 0.0, -4.0, -5.0);
+	s1.a.mA.mB.mA = mat2(9.0, -4.0, -6.0, -1.0);
+	s1.a.mA.mB.mB = mat3x2(-1.0, -2.0, 1.0, 6.0, 5.0, 7.0);
+	s1.a.mA.mB.mC = uvec3(3u, 1u, 5u);
+	s1.b.mA.mA.mA = mat3x2(8.0, 3.0, 0.0, 2.0, 1.0, 8.0);
+	s1.b.mA.mA.mB = mat4x3(0.0, 9.0, -1.0, -1.0, -7.0, 7.0, -4.0, -3.0, 1.0, -4.0, -9.0, 1.0);
+	s1.c[0].mA[0] = bvec3(true, false, false);
+	s1.c[0].mA[1] = bvec3(true, false, false);
+	s1.c[1].mA[0] = bvec3(false, false, false);
+	s1.c[1].mA[1] = bvec3(false, false, false);
+
+	barrier();
+	memoryBarrier();
+	bool allOk = true;
+	allOk = allOk && compare_mat2x3(mat2x3(6.0, 8.0, 8.0, 0.0, -4.0, -5.0), s1.a.mA.mA.mA);
+	allOk = allOk && compare_mat2(mat2(9.0, -4.0, -6.0, -1.0), s1.a.mA.mB.mA);
+	allOk = allOk && compare_mat3x2(mat3x2(-1.0, -2.0, 1.0, 6.0, 5.0, 7.0), s1.a.mA.mB.mB);
+	allOk = allOk && compare_uvec3(uvec3(3u, 1u, 5u), s1.a.mA.mB.mC);
+	allOk = allOk && compare_mat3x2(mat3x2(8.0, 3.0, 0.0, 2.0, 1.0, 8.0), s1.b.mA.mA.mA);
+	allOk = allOk && compare_mat4x3(mat4x3(0.0, 9.0, -1.0, -1.0, -7.0, 7.0, -4.0, -3.0, 1.0, -4.0, -9.0, 1.0), s1.b.mA.mA.mB);
+	allOk = allOk && compare_bvec3(bvec3(true, false, false), s1.c[0].mA[0]);
+	allOk = allOk && compare_bvec3(bvec3(true, false, false), s1.c[0].mA[1]);
+	allOk = allOk && compare_bvec3(bvec3(false, false, false), s1.c[1].mA[0]);
+	allOk = allOk && compare_bvec3(bvec3(false, false, false), s1.c[1].mA[1]);
+	if (allOk)
+		passed++;
+
+}
diff --git a/shaders-opencl/comp/shared-matrix-nested-struct.comp b/shaders-opencl/comp/shared-matrix-nested-struct.comp
new file mode 100644
index 000000000..c481f54a8
--- /dev/null
+++ b/shaders-opencl/comp/shared-matrix-nested-struct.comp
@@ -0,0 +1,141 @@
+#version 450
+layout(local_size_x = 1) in;
+
+layout(std140, binding = 0) buffer block { highp uint passed; };
+struct sA
+{
+	highp mat4 mA;
+	bvec3 mB;
+	bvec4 mC;
+};
+struct sB
+{
+	bvec2 mA;
+};
+struct sC
+{
+	highp float mA;
+	mediump uvec4 mB;
+	mediump float mC;
+};
+struct sD
+{
+	sA mA;
+	sB mB;
+	sC mC;
+};
+struct sE
+{
+	sD mA;
+};
+struct sF
+{
+	lowp uvec3 mA;
+	bool mB;
+};
+struct sG
+{
+	sF mA;
+	highp mat3x2 mB;
+};
+struct sH
+{
+	sG mA;
+	mediump vec2 mB;
+};
+struct sI
+{
+	mediump mat2 mA;
+	bvec3 mB;
+	bvec4 mC;
+};
+struct sJ
+{
+	sI mA;
+	bvec3 mB;
+};
+struct sK
+{
+	bvec2 mA;
+	sJ mB;
+	mediump ivec2 mC;
+};
+struct S1 {
+	lowp uint a;
+	mediump vec4 b;
+};
+struct S2 {
+	sE a;
+	highp ivec3 b;
+	sH c;
+	sK d;
+};
+
+bool compare_float    (highp float a, highp float b)  { return abs(a - b) < 0.05; }
+bool compare_vec2     (highp vec2 a, highp vec2 b)    { return compare_float(a.x, b.x)&&compare_float(a.y, b.y); }
+bool compare_vec4     (highp vec4 a, highp vec4 b)    { return compare_float(a.x, b.x)&&compare_float(a.y, b.y)&&compare_float(a.z, b.z)&&compare_float(a.w, b.w); }
+bool compare_mat2     (highp mat2 a, highp mat2 b)    { return compare_vec2(a[0], b[0])&&compare_vec2(a[1], b[1]); }
+bool compare_mat3x2   (highp mat3x2 a, highp mat3x2 b){ return compare_vec2(a[0], b[0])&&compare_vec2(a[1], b[1])&&compare_vec2(a[2], b[2]); }
+bool compare_mat4     (highp mat4 a, highp mat4 b)    { return compare_vec4(a[0], b[0])&&compare_vec4(a[1], b[1])&&compare_vec4(a[2], b[2])&&compare_vec4(a[3], b[3]); }
+bool compare_ivec2    (highp ivec2 a, highp ivec2 b)  { return a == b; }
+bool compare_ivec3    (highp ivec3 a, highp ivec3 b)  { return a == b; }
+bool compare_uint     (highp uint a, highp uint b)    { return a == b; }
+bool compare_uvec3    (highp uvec3 a, highp uvec3 b)  { return a == b; }
+bool compare_uvec4    (highp uvec4 a, highp uvec4 b)  { return a == b; }
+bool compare_bool     (bool a, bool b)                { return a == b; }
+bool compare_bvec2    (bvec2 a, bvec2 b)              { return a == b; }
+bool compare_bvec3    (bvec3 a, bvec3 b)              { return a == b; }
+bool compare_bvec4    (bvec4 a, bvec4 b)              { return a == b; }
+
+shared S1 s1;
+shared S2 s2;
+
+void main (void) {
+	s1.a = 0u;
+	s1.b = vec4(8.0, 8.0, 0.0, -4.0);
+	s2.a.mA.mA.mA = mat4(-5.0, 9.0, -4.0, -6.0, -1.0, -1.0, -2.0, 1.0, 6.0, 5.0, 7.0, -2.0, -4.0, -9.0, 8.0, 3.0);
+	s2.a.mA.mA.mB = bvec3(true, false, false);
+	s2.a.mA.mA.mC = bvec4(true, true, true, false);
+	s2.a.mA.mB.mA = bvec2(true, true);
+	s2.a.mA.mC.mA = 7.0;
+	s2.a.mA.mC.mB = uvec4(8u, 6u, 2u, 0u);
+	s2.a.mA.mC.mC = -9.0;
+	s2.b = ivec3(1, -4, 0);
+	s2.c.mA.mA.mA = uvec3(4u, 9u, 1u);
+	s2.c.mA.mA.mB = false;
+	s2.c.mA.mB = mat3x2(3.0, -5.0, -1.0, -5.0, -1.0, -9.0);
+	s2.c.mB = vec2(-6.0, -9.0);
+	s2.d.mA = bvec2(true, false);
+	s2.d.mB.mA.mA = mat2(-2.0, 3.0, 7.0, 2.0);
+	s2.d.mB.mA.mB = bvec3(false, false, false);
+	s2.d.mB.mA.mC = bvec4(false, false, false, true);
+	s2.d.mB.mB = bvec3(true, false, false);
+	s2.d.mC = ivec2(-9, 0);
+
+	barrier();
+	memoryBarrier();
+	bool allOk = true;
+	allOk = allOk && compare_uint(0u, s1.a);
+	allOk = allOk && compare_vec4(vec4(8.0, 8.0, 0.0, -4.0), s1.b);
+	allOk = allOk && compare_mat4(mat4(-5.0, 9.0, -4.0, -6.0, -1.0, -1.0, -2.0, 1.0, 6.0, 5.0, 7.0, -2.0, -4.0, -9.0, 8.0, 3.0), s2.a.mA.mA.mA);
+	allOk = allOk && compare_bvec3(bvec3(true, false, false), s2.a.mA.mA.mB);
+	allOk = allOk && compare_bvec4(bvec4(true, true, true, false), s2.a.mA.mA.mC);
+	allOk = allOk && compare_bvec2(bvec2(true, true), s2.a.mA.mB.mA);
+	allOk = allOk && compare_float(7.0, s2.a.mA.mC.mA);
+	allOk = allOk && compare_uvec4(uvec4(8u, 6u, 2u, 0u), s2.a.mA.mC.mB);
+	allOk = allOk && compare_float(-9.0, s2.a.mA.mC.mC);
+	allOk = allOk && compare_ivec3(ivec3(1, -4, 0), s2.b);
+	allOk = allOk && compare_uvec3(uvec3(4u, 9u, 1u), s2.c.mA.mA.mA);
+	allOk = allOk && compare_bool(false, s2.c.mA.mA.mB);
+	allOk = allOk && compare_mat3x2(mat3x2(3.0, -5.0, -1.0, -5.0, -1.0, -9.0), s2.c.mA.mB);
+	allOk = allOk && compare_vec2(vec2(-6.0, -9.0), s2.c.mB);
+	allOk = allOk && compare_bvec2(bvec2(true, false), s2.d.mA);
+	allOk = allOk && compare_mat2(mat2(-2.0, 3.0, 7.0, 2.0), s2.d.mB.mA.mA);
+	allOk = allOk && compare_bvec3(bvec3(false, false, false), s2.d.mB.mA.mB);
+	allOk = allOk && compare_bvec4(bvec4(false, false, false, true), s2.d.mB.mA.mC);
+	allOk = allOk && compare_bvec3(bvec3(true, false, false), s2.d.mB.mB);
+	allOk = allOk && compare_ivec2(ivec2(-9, 0), s2.d.mC);
+	if (allOk)
+		passed++;
+
+}
diff --git a/shaders-opencl/comp/shared-std450.double.comp b/shaders-opencl/comp/shared-std450.fp64.comp
similarity index 100%
rename from shaders-opencl/comp/shared-std450.double.comp
rename to shaders-opencl/comp/shared-std450.fp64.comp
diff --git a/shaders-opencl/comp/struct-packing.invalid.comp b/shaders-opencl/comp/struct-packing.comp
similarity index 100%
rename from shaders-opencl/comp/struct-packing.invalid.comp
rename to shaders-opencl/comp/struct-packing.comp
diff --git a/spirv_cross_c.cpp b/spirv_cross_c.cpp
index f49366ac4..1146f92d0 100644
--- a/spirv_cross_c.cpp
+++ b/spirv_cross_c.cpp
@@ -812,6 +812,9 @@ spvc_result spvc_compiler_options_set_uint(spvc_compiler_options options, spvc_c
 	case SPVC_COMPILER_OPTION_OPENCL_VERSION:
 		options->opencl.opencl_version = value;
 		break;
+	case SPVC_COMPILER_OPTION_OPENCL_ENABLE_FP16:
+		options->opencl.enable_fp16 = value != 0;
+		break;
 	case SPVC_COMPILER_OPTION_OPENCL_ENABLE_FP64:
 		options->opencl.enable_fp64 = value != 0;
 		break;
@@ -821,8 +824,14 @@ spvc_result spvc_compiler_options_set_uint(spvc_compiler_options options, spvc_c
 	case SPVC_COMPILER_OPTION_OPENCL_ENABLE_SUBGROUPS:
 		options->opencl.enable_subgroups = value != 0;
 		break;
-	case SPVC_COMPILER_OPTION_OPENCL_ENABLE_SHUFFLE:
-		options->opencl.enable_shuffle = value != 0;
+	case SPVC_COMPILER_OPTION_OPENCL_ENABLE_SUBGROUPS_ALL:
+		options->opencl.enable_subgroups_all = value != 0;
+		break;
+	case SPVC_COMPILER_OPTION_OPENCL_EMULATE_SUBGROUPS:
+		options->opencl.emulate_subgroups = value != 0;
+		break;
+	case SPVC_COMPILER_OPTION_OPENCL_FIXED_SUBGROUP_SIZE:
+		options->opencl.fixed_subgroup_size = value;
 		break;
 #endif
 
diff --git a/spirv_cross_c.h b/spirv_cross_c.h
index c59c299d0..e4d37ce46 100644
--- a/spirv_cross_c.h
+++ b/spirv_cross_c.h
@@ -759,10 +759,13 @@ extern "C"
 		SPVC_COMPILER_OPTION_HLSL_USER_SEMANTIC = 94 | SPVC_COMPILER_OPTION_HLSL_BIT,
 
 		SPVC_COMPILER_OPTION_OPENCL_VERSION = 95 | SPVC_COMPILER_OPTION_OPENCL_BIT,
-		SPVC_COMPILER_OPTION_OPENCL_ENABLE_FP64 = 96 | SPVC_COMPILER_OPTION_OPENCL_BIT,
-		SPVC_COMPILER_OPTION_OPENCL_ENABLE_64BIT_ATOMICS = 97 | SPVC_COMPILER_OPTION_OPENCL_BIT,
-		SPVC_COMPILER_OPTION_OPENCL_ENABLE_SUBGROUPS = 98 | SPVC_COMPILER_OPTION_OPENCL_BIT,
-		SPVC_COMPILER_OPTION_OPENCL_ENABLE_SHUFFLE = 99 | SPVC_COMPILER_OPTION_OPENCL_BIT,
+		SPVC_COMPILER_OPTION_OPENCL_ENABLE_FP16 = 96 | SPVC_COMPILER_OPTION_OPENCL_BIT,
+		SPVC_COMPILER_OPTION_OPENCL_ENABLE_FP64 = 97 | SPVC_COMPILER_OPTION_OPENCL_BIT,
+		SPVC_COMPILER_OPTION_OPENCL_ENABLE_64BIT_ATOMICS = 98 | SPVC_COMPILER_OPTION_OPENCL_BIT,
+		SPVC_COMPILER_OPTION_OPENCL_ENABLE_SUBGROUPS = 99 | SPVC_COMPILER_OPTION_OPENCL_BIT,
+		SPVC_COMPILER_OPTION_OPENCL_ENABLE_SUBGROUPS_ALL = 100 | SPVC_COMPILER_OPTION_OPENCL_BIT,
+		SPVC_COMPILER_OPTION_OPENCL_EMULATE_SUBGROUPS = 101 | SPVC_COMPILER_OPTION_OPENCL_BIT,
+		SPVC_COMPILER_OPTION_OPENCL_FIXED_SUBGROUP_SIZE = 102 | SPVC_COMPILER_OPTION_OPENCL_BIT,
 
 		SPVC_COMPILER_OPTION_INT_MAX = 0x7fffffff
 	} spvc_compiler_option;
diff --git a/spirv_glsl.cpp b/spirv_glsl.cpp
index 10accf077..66ab8c560 100644
--- a/spirv_glsl.cpp
+++ b/spirv_glsl.cpp
@@ -25,12 +25,12 @@
 #include "GLSL.std.450.h"
 #include "spirv_common.hpp"
 #include <algorithm>
+#include <array>
 #include <assert.h>
 #include <cmath>
 #include <limits>
 #include <locale.h>
 #include <utility>
-#include <array>
 
 #ifndef _WIN32
 #include <langinfo.h>
@@ -202,7 +202,7 @@ static BufferPackingStandard packing_to_substruct_packing(BufferPackingStandard
 		return packing;
 	}
 }
-}
+} // namespace SPIRV_CROSS_NAMESPACE
 
 void CompilerGLSL::init()
 {
@@ -352,7 +352,8 @@ void CompilerGLSL::reset(uint32_t iteration_count)
 	// and it is not practical with the current architecture
 	// to resolve everything up front.
 	if (iteration_count >= options.force_recompile_max_debug_iterations && !is_force_recompile_forward_progress)
-		SPIRV_CROSS_THROW("Maximum compilation loops detected and no forward progress was made. Must be a SPIRV-Cross bug!");
+		SPIRV_CROSS_THROW(
+		    "Maximum compilation loops detected and no forward progress was made. Must be a SPIRV-Cross bug!");
 
 	// We do some speculative optimizations which should pretty much always work out,
 	// but just in case the SPIR-V is rather weird, recompile until it's happy.
@@ -376,10 +377,12 @@ void CompilerGLSL::reset(uint32_t iteration_count)
 
 	reset_name_caches();
 
-	ir.for_each_typed_id<SPIRFunction>([&](uint32_t, SPIRFunction &func) {
-		func.active = false;
-		func.flush_undeclared = true;
-	});
+	ir.for_each_typed_id<SPIRFunction>(
+	    [&](uint32_t, SPIRFunction &func)
+	    {
+		    func.active = false;
+		    func.flush_undeclared = true;
+	    });
 
 	ir.for_each_typed_id<SPIRVariable>([&](uint32_t, SPIRVariable &var) { var.dependees.clear(); });
 
@@ -427,54 +430,54 @@ void CompilerGLSL::remap_ext_framebuffer_fetch(uint32_t input_attachment_index,
 bool CompilerGLSL::location_is_framebuffer_fetch(uint32_t location) const
 {
 	return std::find_if(begin(inout_color_attachments), end(inout_color_attachments),
-	                    [&](const std::pair<uint32_t, bool> &elem) {
-		                    return elem.first == location;
-	                    }) != end(inout_color_attachments);
+	                    [&](const std::pair<uint32_t, bool> &elem)
+	                    { return elem.first == location; }) != end(inout_color_attachments);
 }
 
 bool CompilerGLSL::location_is_non_coherent_framebuffer_fetch(uint32_t location) const
 {
 	return std::find_if(begin(inout_color_attachments), end(inout_color_attachments),
-	                    [&](const std::pair<uint32_t, bool> &elem) {
-		                    return elem.first == location && !elem.second;
-	                    }) != end(inout_color_attachments);
+	                    [&](const std::pair<uint32_t, bool> &elem)
+	                    { return elem.first == location && !elem.second; }) != end(inout_color_attachments);
 }
 
 void CompilerGLSL::find_static_extensions()
 {
-	ir.for_each_typed_id<SPIRType>([&](uint32_t, const SPIRType &type) {
-		if (type.basetype == SPIRType::Double)
-		{
-			if (options.es)
-				SPIRV_CROSS_THROW("FP64 not supported in ES profile.");
-			if (!options.es && options.version < 400)
-				require_extension_internal("GL_ARB_gpu_shader_fp64");
-		}
-		else if (type.basetype == SPIRType::Int64 || type.basetype == SPIRType::UInt64)
-		{
-			if (options.es && options.version < 310) // GL_NV_gpu_shader5 fallback requires 310.
-				SPIRV_CROSS_THROW("64-bit integers not supported in ES profile before version 310.");
-			require_extension_internal("GL_ARB_gpu_shader_int64");
-		}
-		else if (type.basetype == SPIRType::Half)
-		{
-			require_extension_internal("GL_EXT_shader_explicit_arithmetic_types_float16");
-			if (options.vulkan_semantics)
-				require_extension_internal("GL_EXT_shader_16bit_storage");
-		}
-		else if (type.basetype == SPIRType::SByte || type.basetype == SPIRType::UByte)
-		{
-			require_extension_internal("GL_EXT_shader_explicit_arithmetic_types_int8");
-			if (options.vulkan_semantics)
-				require_extension_internal("GL_EXT_shader_8bit_storage");
-		}
-		else if (type.basetype == SPIRType::Short || type.basetype == SPIRType::UShort)
-		{
-			require_extension_internal("GL_EXT_shader_explicit_arithmetic_types_int16");
-			if (options.vulkan_semantics)
-				require_extension_internal("GL_EXT_shader_16bit_storage");
-		}
-	});
+	ir.for_each_typed_id<SPIRType>(
+	    [&](uint32_t, const SPIRType &type)
+	    {
+		    if (type.basetype == SPIRType::Double)
+		    {
+			    if (options.es)
+				    SPIRV_CROSS_THROW("FP64 not supported in ES profile.");
+			    if (!options.es && options.version < 400)
+				    require_extension_internal("GL_ARB_gpu_shader_fp64");
+		    }
+		    else if (type.basetype == SPIRType::Int64 || type.basetype == SPIRType::UInt64)
+		    {
+			    if (options.es && options.version < 310) // GL_NV_gpu_shader5 fallback requires 310.
+				    SPIRV_CROSS_THROW("64-bit integers not supported in ES profile before version 310.");
+			    require_extension_internal("GL_ARB_gpu_shader_int64");
+		    }
+		    else if (type.basetype == SPIRType::Half)
+		    {
+			    require_extension_internal("GL_EXT_shader_explicit_arithmetic_types_float16");
+			    if (options.vulkan_semantics)
+				    require_extension_internal("GL_EXT_shader_16bit_storage");
+		    }
+		    else if (type.basetype == SPIRType::SByte || type.basetype == SPIRType::UByte)
+		    {
+			    require_extension_internal("GL_EXT_shader_explicit_arithmetic_types_int8");
+			    if (options.vulkan_semantics)
+				    require_extension_internal("GL_EXT_shader_8bit_storage");
+		    }
+		    else if (type.basetype == SPIRType::Short || type.basetype == SPIRType::UShort)
+		    {
+			    require_extension_internal("GL_EXT_shader_explicit_arithmetic_types_int16");
+			    if (options.vulkan_semantics)
+				    require_extension_internal("GL_EXT_shader_16bit_storage");
+		    }
+	    });
 
 	auto &execution = get_entry_point();
 	switch (execution.model)
@@ -716,8 +719,8 @@ void CompilerGLSL::find_static_extensions()
 
 void CompilerGLSL::require_polyfill(Polyfill polyfill, bool relaxed)
 {
-	uint32_t &polyfills = (relaxed && (options.es || options.vulkan_semantics)) ?
-	                      required_polyfills_relaxed : required_polyfills;
+	uint32_t &polyfills =
+	    (relaxed && (options.es || options.vulkan_semantics)) ? required_polyfills_relaxed : required_polyfills;
 
 	if ((polyfills & polyfill) == 0)
 	{
@@ -729,15 +732,17 @@ void CompilerGLSL::require_polyfill(Polyfill polyfill, bool relaxed)
 void CompilerGLSL::ray_tracing_khr_fixup_locations()
 {
 	uint32_t location = 0;
-	ir.for_each_typed_id<SPIRVariable>([&](uint32_t, SPIRVariable &var) {
-		// Incoming payload storage can also be used for tracing.
-		if (var.storage != StorageClassRayPayloadKHR && var.storage != StorageClassCallableDataKHR &&
-		    var.storage != StorageClassIncomingRayPayloadKHR && var.storage != StorageClassIncomingCallableDataKHR)
-			return;
-		if (is_hidden_variable(var))
-			return;
-		set_decoration(var.self, DecorationLocation, location++);
-	});
+	ir.for_each_typed_id<SPIRVariable>(
+	    [&](uint32_t, SPIRVariable &var)
+	    {
+		    // Incoming payload storage can also be used for tracing.
+		    if (var.storage != StorageClassRayPayloadKHR && var.storage != StorageClassCallableDataKHR &&
+		        var.storage != StorageClassIncomingRayPayloadKHR && var.storage != StorageClassIncomingCallableDataKHR)
+			    return;
+		    if (is_hidden_variable(var))
+			    return;
+		    set_decoration(var.self, DecorationLocation, location++);
+	    });
 }
 
 string CompilerGLSL::compile()
@@ -757,7 +762,7 @@ string CompilerGLSL::compile()
 	backend.workgroup_size_is_hidden = true;
 	backend.requires_relaxed_precision_analysis = options.es || options.vulkan_semantics;
 	backend.support_precise_qualifier =
-			(!options.es && options.version >= 400) || (options.es && options.version >= 320);
+	    (!options.es && options.version >= 400) || (options.es && options.version >= 320);
 	backend.constant_null_initializer = "{ }";
 	backend.requires_matching_array_initializer = true;
 
@@ -2300,8 +2305,7 @@ string CompilerGLSL::layout_for_variable(const SPIRVariable &var)
 	return res;
 }
 
-string CompilerGLSL::buffer_to_packing_standard(const SPIRType &type,
-                                                bool support_std430_without_scalar_layout,
+string CompilerGLSL::buffer_to_packing_standard(const SPIRType &type, bool support_std430_without_scalar_layout,
                                                 bool support_enhanced_layouts)
 {
 	if (support_std430_without_scalar_layout && buffer_is_packing_standard(type, BufferPackingStd430))
@@ -2313,8 +2317,7 @@ string CompilerGLSL::buffer_to_packing_standard(const SPIRType &type,
 		require_extension_internal("GL_EXT_scalar_block_layout");
 		return "scalar";
 	}
-	else if (support_std430_without_scalar_layout &&
-	         support_enhanced_layouts &&
+	else if (support_std430_without_scalar_layout && support_enhanced_layouts &&
 	         buffer_is_packing_standard(type, BufferPackingStd430EnhancedLayout))
 	{
 		if (options.es && !options.vulkan_semantics)
@@ -2326,8 +2329,7 @@ string CompilerGLSL::buffer_to_packing_standard(const SPIRType &type,
 		set_extended_decoration(type.self, SPIRVCrossDecorationExplicitOffset);
 		return "std430";
 	}
-	else if (support_enhanced_layouts &&
-	         buffer_is_packing_standard(type, BufferPackingStd140EnhancedLayout))
+	else if (support_enhanced_layouts && buffer_is_packing_standard(type, BufferPackingStd140EnhancedLayout))
 	{
 		// Fallback time. We might be able to use the ARB_enhanced_layouts to deal with this difference,
 		// however, we can only use layout(offset) on the block itself, not any substructs, so the substructs better be the appropriate layout.
@@ -2341,8 +2343,7 @@ string CompilerGLSL::buffer_to_packing_standard(const SPIRType &type,
 		set_extended_decoration(type.self, SPIRVCrossDecorationExplicitOffset);
 		return "std140";
 	}
-	else if (options.vulkan_semantics &&
-	         support_enhanced_layouts &&
+	else if (options.vulkan_semantics && support_enhanced_layouts &&
 	         buffer_is_packing_standard(type, BufferPackingScalarEnhancedLayout))
 	{
 		set_extended_decoration(type.self, SPIRVCrossDecorationExplicitOffset);
@@ -2356,8 +2357,7 @@ string CompilerGLSL::buffer_to_packing_standard(const SPIRType &type,
 		require_extension_internal("GL_EXT_scalar_block_layout");
 		return "std430";
 	}
-	else if (!support_std430_without_scalar_layout && options.vulkan_semantics &&
-	         support_enhanced_layouts &&
+	else if (!support_std430_without_scalar_layout && options.vulkan_semantics && support_enhanced_layouts &&
 	         buffer_is_packing_standard(type, BufferPackingStd430EnhancedLayout))
 	{
 		// UBOs can support std430 with GL_EXT_scalar_block_layout.
@@ -2532,7 +2532,7 @@ void CompilerGLSL::emit_buffer_reference_block(uint32_t type_id, bool forward_de
 			}
 			else if (is_array(get_pointee_type(type)))
 			{
-				SPIRType wrap_type{OpTypeStruct};
+				SPIRType wrap_type{ OpTypeStruct };
 				wrap_type.self = ir.increase_bound_by(1);
 				wrap_type.member_types.push_back(get_pointee_type_id(type_id));
 				ir.set_member_decoration(wrap_type.self, 0, DecorationOffset, 0);
@@ -2540,7 +2540,8 @@ void CompilerGLSL::emit_buffer_reference_block(uint32_t type_id, bool forward_de
 			}
 
 			if (alignment)
-				statement("layout(", packing_standard, "buffer_reference, buffer_reference_align = ", alignment, ") buffer ", buffer_name);
+				statement("layout(", packing_standard, "buffer_reference, buffer_reference_align = ", alignment,
+				          ") buffer ", buffer_name);
 			else
 				statement("layout(", packing_standard, "buffer_reference) buffer ", buffer_name);
 		}
@@ -2656,7 +2657,7 @@ void CompilerGLSL::emit_buffer_block_flattened(const SPIRVariable &var)
 	SPIRType::BaseType basic_type;
 	if (get_common_basic_type(type, basic_type))
 	{
-		SPIRType tmp { OpTypeVector };
+		SPIRType tmp{ OpTypeVector };
 		tmp.basetype = basic_type;
 		tmp.vecsize = 4;
 		if (basic_type != SPIRType::Float && basic_type != SPIRType::Int && basic_type != SPIRType::UInt)
@@ -2831,8 +2832,7 @@ void CompilerGLSL::emit_interface_block(const SPIRVariable &var)
 {
 	auto &type = get<SPIRType>(var.basetype);
 
-	if (var.storage == StorageClassInput && type.basetype == SPIRType::Double &&
-	    !options.es && options.version < 410)
+	if (var.storage == StorageClassInput && type.basetype == SPIRType::Double && !options.es && options.version < 410)
 	{
 		require_extension_internal("GL_ARB_vertex_attrib_64bit");
 	}
@@ -3072,42 +3072,48 @@ void CompilerGLSL::emit_entry_point_declarations()
 
 void CompilerGLSL::replace_illegal_names(const unordered_set<string> &keywords)
 {
-	ir.for_each_typed_id<SPIRVariable>([&](uint32_t, const SPIRVariable &var) {
-		if (is_hidden_variable(var))
-			return;
-
-		auto *meta = ir.find_meta(var.self);
-		if (!meta)
-			return;
-
-		auto &m = meta->decoration;
-		if (keywords.find(m.alias) != end(keywords))
-			m.alias = join("_", m.alias);
-	});
-
-	ir.for_each_typed_id<SPIRFunction>([&](uint32_t, const SPIRFunction &func) {
-		auto *meta = ir.find_meta(func.self);
-		if (!meta)
-			return;
-
-		auto &m = meta->decoration;
-		if (keywords.find(m.alias) != end(keywords))
-			m.alias = join("_", m.alias);
-	});
-
-	ir.for_each_typed_id<SPIRType>([&](uint32_t, const SPIRType &type) {
-		auto *meta = ir.find_meta(type.self);
-		if (!meta)
-			return;
-
-		auto &m = meta->decoration;
-		if (keywords.find(m.alias) != end(keywords))
-			m.alias = join("_", m.alias);
-
-		for (auto &memb : meta->members)
-			if (keywords.find(memb.alias) != end(keywords))
-				memb.alias = join("_", memb.alias);
-	});
+	ir.for_each_typed_id<SPIRVariable>(
+	    [&](uint32_t, const SPIRVariable &var)
+	    {
+		    if (is_hidden_variable(var))
+			    return;
+
+		    auto *meta = ir.find_meta(var.self);
+		    if (!meta)
+			    return;
+
+		    auto &m = meta->decoration;
+		    if (keywords.find(m.alias) != end(keywords))
+			    m.alias = join("_", m.alias);
+	    });
+
+	ir.for_each_typed_id<SPIRFunction>(
+	    [&](uint32_t, const SPIRFunction &func)
+	    {
+		    auto *meta = ir.find_meta(func.self);
+		    if (!meta)
+			    return;
+
+		    auto &m = meta->decoration;
+		    if (keywords.find(m.alias) != end(keywords))
+			    m.alias = join("_", m.alias);
+	    });
+
+	ir.for_each_typed_id<SPIRType>(
+	    [&](uint32_t, const SPIRType &type)
+	    {
+		    auto *meta = ir.find_meta(type.self);
+		    if (!meta)
+			    return;
+
+		    auto &m = meta->decoration;
+		    if (keywords.find(m.alias) != end(keywords))
+			    m.alias = join("_", m.alias);
+
+		    for (auto &memb : meta->members)
+			    if (keywords.find(memb.alias) != end(keywords))
+				    memb.alias = join("_", memb.alias);
+	    });
 }
 
 void CompilerGLSL::replace_illegal_names()
@@ -3207,12 +3213,15 @@ void CompilerGLSL::replace_fragment_output(SPIRVariable &var)
 
 void CompilerGLSL::replace_fragment_outputs()
 {
-	ir.for_each_typed_id<SPIRVariable>([&](uint32_t, SPIRVariable &var) {
-		auto &type = this->get<SPIRType>(var.basetype);
+	ir.for_each_typed_id<SPIRVariable>(
+	    [&](uint32_t, SPIRVariable &var)
+	    {
+		    auto &type = this->get<SPIRType>(var.basetype);
 
-		if (!is_builtin_variable(var) && !var.remapped_variable && type.pointer && var.storage == StorageClassOutput)
-			replace_fragment_output(var);
-	});
+		    if (!is_builtin_variable(var) && !var.remapped_variable && type.pointer &&
+		        var.storage == StorageClassOutput)
+			    replace_fragment_output(var);
+	    });
 }
 
 string CompilerGLSL::remap_swizzle(const SPIRType &out_type, uint32_t input_components, const string &expr)
@@ -3274,21 +3283,23 @@ void CompilerGLSL::fixup_image_load_store_access()
 	if (!options.enable_storage_image_qualifier_deduction)
 		return;
 
-	ir.for_each_typed_id<SPIRVariable>([&](uint32_t var, const SPIRVariable &) {
-		auto &vartype = expression_type(var);
-		if (vartype.basetype == SPIRType::Image && vartype.image.sampled == 2)
-		{
-			// Very old glslangValidator and HLSL compilers do not emit required qualifiers here.
-			// Solve this by making the image access as restricted as possible and loosen up if we need to.
-			// If any no-read/no-write flags are actually set, assume that the compiler knows what it's doing.
-
-			if (!has_decoration(var, DecorationNonWritable) && !has_decoration(var, DecorationNonReadable))
-			{
-				set_decoration(var, DecorationNonWritable);
-				set_decoration(var, DecorationNonReadable);
-			}
-		}
-	});
+	ir.for_each_typed_id<SPIRVariable>(
+	    [&](uint32_t var, const SPIRVariable &)
+	    {
+		    auto &vartype = expression_type(var);
+		    if (vartype.basetype == SPIRType::Image && vartype.image.sampled == 2)
+		    {
+			    // Very old glslangValidator and HLSL compilers do not emit required qualifiers here.
+			    // Solve this by making the image access as restricted as possible and loosen up if we need to.
+			    // If any no-read/no-write flags are actually set, assume that the compiler knows what it's doing.
+
+			    if (!has_decoration(var, DecorationNonWritable) && !has_decoration(var, DecorationNonReadable))
+			    {
+				    set_decoration(var, DecorationNonWritable);
+				    set_decoration(var, DecorationNonReadable);
+			    }
+		    }
+	    });
 }
 
 static bool is_block_builtin(BuiltIn builtin)
@@ -3305,34 +3316,36 @@ bool CompilerGLSL::should_force_emit_builtin_block(StorageClass storage)
 		return false;
 	bool should_force = false;
 
-	ir.for_each_typed_id<SPIRVariable>([&](uint32_t, SPIRVariable &var) {
-		if (should_force)
-			return;
-
-		auto &type = this->get<SPIRType>(var.basetype);
-		bool block = has_decoration(type.self, DecorationBlock);
-		if (var.storage == storage && block && is_builtin_variable(var))
-		{
-			uint32_t member_count = uint32_t(type.member_types.size());
-			for (uint32_t i = 0; i < member_count; i++)
-			{
-				if (has_member_decoration(type.self, i, DecorationBuiltIn) &&
-				    is_block_builtin(BuiltIn(get_member_decoration(type.self, i, DecorationBuiltIn))) &&
-				    has_member_decoration(type.self, i, DecorationOffset))
-				{
-					should_force = true;
-				}
-			}
-		}
-		else if (var.storage == storage && !block && is_builtin_variable(var))
-		{
-			if (is_block_builtin(BuiltIn(get_decoration(type.self, DecorationBuiltIn))) &&
-			    has_decoration(var.self, DecorationOffset))
-			{
-				should_force = true;
-			}
-		}
-	});
+	ir.for_each_typed_id<SPIRVariable>(
+	    [&](uint32_t, SPIRVariable &var)
+	    {
+		    if (should_force)
+			    return;
+
+		    auto &type = this->get<SPIRType>(var.basetype);
+		    bool block = has_decoration(type.self, DecorationBlock);
+		    if (var.storage == storage && block && is_builtin_variable(var))
+		    {
+			    uint32_t member_count = uint32_t(type.member_types.size());
+			    for (uint32_t i = 0; i < member_count; i++)
+			    {
+				    if (has_member_decoration(type.self, i, DecorationBuiltIn) &&
+				        is_block_builtin(BuiltIn(get_member_decoration(type.self, i, DecorationBuiltIn))) &&
+				        has_member_decoration(type.self, i, DecorationOffset))
+				    {
+					    should_force = true;
+				    }
+			    }
+		    }
+		    else if (var.storage == storage && !block && is_builtin_variable(var))
+		    {
+			    if (is_block_builtin(BuiltIn(get_decoration(type.self, DecorationBuiltIn))) &&
+			        has_decoration(var.self, DecorationOffset))
+			    {
+				    should_force = true;
+			    }
+		    }
+	    });
 
 	// If we're declaring clip/cull planes with control points we need to force block declaration.
 	if ((get_execution_model() == ExecutionModelTessellationControl ||
@@ -3351,51 +3364,53 @@ bool CompilerGLSL::should_force_emit_builtin_block(StorageClass storage)
 
 void CompilerGLSL::fixup_implicit_builtin_block_names(ExecutionModel model)
 {
-	ir.for_each_typed_id<SPIRVariable>([&](uint32_t, SPIRVariable &var) {
-		auto &type = this->get<SPIRType>(var.basetype);
-		bool block = has_decoration(type.self, DecorationBlock);
-		if ((var.storage == StorageClassOutput || var.storage == StorageClassInput) && block &&
-		    is_builtin_variable(var))
-		{
-			if (model != ExecutionModelMeshEXT)
-			{
-				// Make sure the array has a supported name in the code.
-				if (var.storage == StorageClassOutput)
-					set_name(var.self, "gl_out");
-				else if (var.storage == StorageClassInput)
-					set_name(var.self, "gl_in");
-			}
-			else
-			{
-				auto flags = get_buffer_block_flags(var.self);
-				if (flags.get(DecorationPerPrimitiveEXT))
-				{
-					set_name(var.self, "gl_MeshPrimitivesEXT");
-					set_name(type.self, "gl_MeshPerPrimitiveEXT");
-				}
-				else
-				{
-					set_name(var.self, "gl_MeshVerticesEXT");
-					set_name(type.self, "gl_MeshPerVertexEXT");
-				}
-			}
-		}
-
-		if (model == ExecutionModelMeshEXT && var.storage == StorageClassOutput && !block)
-		{
-			auto *m = ir.find_meta(var.self);
-			if (m && m->decoration.builtin)
-			{
-				auto builtin_type = m->decoration.builtin_type;
-				if (builtin_type == BuiltInPrimitivePointIndicesEXT)
-					set_name(var.self, "gl_PrimitivePointIndicesEXT");
-				else if (builtin_type == BuiltInPrimitiveLineIndicesEXT)
-					set_name(var.self, "gl_PrimitiveLineIndicesEXT");
-				else if (builtin_type == BuiltInPrimitiveTriangleIndicesEXT)
-					set_name(var.self, "gl_PrimitiveTriangleIndicesEXT");
-			}
-		}
-	});
+	ir.for_each_typed_id<SPIRVariable>(
+	    [&](uint32_t, SPIRVariable &var)
+	    {
+		    auto &type = this->get<SPIRType>(var.basetype);
+		    bool block = has_decoration(type.self, DecorationBlock);
+		    if ((var.storage == StorageClassOutput || var.storage == StorageClassInput) && block &&
+		        is_builtin_variable(var))
+		    {
+			    if (model != ExecutionModelMeshEXT)
+			    {
+				    // Make sure the array has a supported name in the code.
+				    if (var.storage == StorageClassOutput)
+					    set_name(var.self, "gl_out");
+				    else if (var.storage == StorageClassInput)
+					    set_name(var.self, "gl_in");
+			    }
+			    else
+			    {
+				    auto flags = get_buffer_block_flags(var.self);
+				    if (flags.get(DecorationPerPrimitiveEXT))
+				    {
+					    set_name(var.self, "gl_MeshPrimitivesEXT");
+					    set_name(type.self, "gl_MeshPerPrimitiveEXT");
+				    }
+				    else
+				    {
+					    set_name(var.self, "gl_MeshVerticesEXT");
+					    set_name(type.self, "gl_MeshPerVertexEXT");
+				    }
+			    }
+		    }
+
+		    if (model == ExecutionModelMeshEXT && var.storage == StorageClassOutput && !block)
+		    {
+			    auto *m = ir.find_meta(var.self);
+			    if (m && m->decoration.builtin)
+			    {
+				    auto builtin_type = m->decoration.builtin_type;
+				    if (builtin_type == BuiltInPrimitivePointIndicesEXT)
+					    set_name(var.self, "gl_PrimitivePointIndicesEXT");
+				    else if (builtin_type == BuiltInPrimitiveLineIndicesEXT)
+					    set_name(var.self, "gl_PrimitiveLineIndicesEXT");
+				    else if (builtin_type == BuiltInPrimitiveTriangleIndicesEXT)
+					    set_name(var.self, "gl_PrimitiveTriangleIndicesEXT");
+			    }
+		    }
+	    });
 }
 
 void CompilerGLSL::emit_declared_builtin_block(StorageClass storage, ExecutionModel model)
@@ -3416,121 +3431,124 @@ void CompilerGLSL::emit_declared_builtin_block(StorageClass storage, ExecutionMo
 	uint32_t xfb_stride = 0, xfb_buffer = 0, geom_stream = 0;
 	std::unordered_map<uint32_t, uint32_t> builtin_xfb_offsets;
 
-	const auto builtin_is_per_vertex_set = [](BuiltIn builtin) -> bool {
-		return builtin == BuiltInPosition || builtin == BuiltInPointSize ||
-			builtin == BuiltInClipDistance || builtin == BuiltInCullDistance;
+	const auto builtin_is_per_vertex_set = [](BuiltIn builtin) -> bool
+	{
+		return builtin == BuiltInPosition || builtin == BuiltInPointSize || builtin == BuiltInClipDistance ||
+		       builtin == BuiltInCullDistance;
 	};
 
-	ir.for_each_typed_id<SPIRVariable>([&](uint32_t, SPIRVariable &var) {
-		auto &type = this->get<SPIRType>(var.basetype);
-		bool block = has_decoration(type.self, DecorationBlock);
-		Bitset builtins;
-
-		if (var.storage == storage && block && is_builtin_variable(var))
-		{
-			uint32_t index = 0;
-			for (auto &m : ir.meta[type.self].members)
-			{
-				if (m.builtin && builtin_is_per_vertex_set(m.builtin_type))
-				{
-					builtins.set(m.builtin_type);
-					if (m.builtin_type == BuiltInCullDistance)
-						cull_distance_size = to_array_size_literal(this->get<SPIRType>(type.member_types[index]));
-					else if (m.builtin_type == BuiltInClipDistance)
-						clip_distance_size = to_array_size_literal(this->get<SPIRType>(type.member_types[index]));
-
-					if (is_block_builtin(m.builtin_type) && m.decoration_flags.get(DecorationOffset))
-					{
-						have_any_xfb_offset = true;
-						builtin_xfb_offsets[m.builtin_type] = m.offset;
-					}
-
-					if (is_block_builtin(m.builtin_type) && m.decoration_flags.get(DecorationStream))
-					{
-						uint32_t stream = m.stream;
-						if (have_geom_stream && geom_stream != stream)
-							SPIRV_CROSS_THROW("IO block member Stream mismatch.");
-						have_geom_stream = true;
-						geom_stream = stream;
-					}
-				}
-				index++;
-			}
-
-			if (storage == StorageClassOutput && has_decoration(var.self, DecorationXfbBuffer) &&
-			    has_decoration(var.self, DecorationXfbStride))
-			{
-				uint32_t buffer_index = get_decoration(var.self, DecorationXfbBuffer);
-				uint32_t stride = get_decoration(var.self, DecorationXfbStride);
-				if (have_xfb_buffer_stride && buffer_index != xfb_buffer)
-					SPIRV_CROSS_THROW("IO block member XfbBuffer mismatch.");
-				if (have_xfb_buffer_stride && stride != xfb_stride)
-					SPIRV_CROSS_THROW("IO block member XfbBuffer mismatch.");
-				have_xfb_buffer_stride = true;
-				xfb_buffer = buffer_index;
-				xfb_stride = stride;
-			}
-
-			if (storage == StorageClassOutput && has_decoration(var.self, DecorationStream))
-			{
-				uint32_t stream = get_decoration(var.self, DecorationStream);
-				if (have_geom_stream && geom_stream != stream)
-					SPIRV_CROSS_THROW("IO block member Stream mismatch.");
-				have_geom_stream = true;
-				geom_stream = stream;
-			}
-		}
-		else if (var.storage == storage && !block && is_builtin_variable(var))
-		{
-			// While we're at it, collect all declared global builtins (HLSL mostly ...).
-			auto &m = ir.meta[var.self].decoration;
-			if (m.builtin && builtin_is_per_vertex_set(m.builtin_type))
-			{
-				// For mesh/tesc output, Clip/Cull is an array-of-array. Look at innermost array type
-				// for correct result.
-				global_builtins.set(m.builtin_type);
-				if (m.builtin_type == BuiltInCullDistance)
-					cull_distance_size = to_array_size_literal(type, 0);
-				else if (m.builtin_type == BuiltInClipDistance)
-					clip_distance_size = to_array_size_literal(type, 0);
-
-				if (is_block_builtin(m.builtin_type) && m.decoration_flags.get(DecorationXfbStride) &&
-				    m.decoration_flags.get(DecorationXfbBuffer) && m.decoration_flags.get(DecorationOffset))
-				{
-					have_any_xfb_offset = true;
-					builtin_xfb_offsets[m.builtin_type] = m.offset;
-					uint32_t buffer_index = m.xfb_buffer;
-					uint32_t stride = m.xfb_stride;
-					if (have_xfb_buffer_stride && buffer_index != xfb_buffer)
-						SPIRV_CROSS_THROW("IO block member XfbBuffer mismatch.");
-					if (have_xfb_buffer_stride && stride != xfb_stride)
-						SPIRV_CROSS_THROW("IO block member XfbBuffer mismatch.");
-					have_xfb_buffer_stride = true;
-					xfb_buffer = buffer_index;
-					xfb_stride = stride;
-				}
-
-				if (is_block_builtin(m.builtin_type) && m.decoration_flags.get(DecorationStream))
-				{
-					uint32_t stream = get_decoration(var.self, DecorationStream);
-					if (have_geom_stream && geom_stream != stream)
-						SPIRV_CROSS_THROW("IO block member Stream mismatch.");
-					have_geom_stream = true;
-					geom_stream = stream;
-				}
-			}
-		}
-
-		if (builtins.empty())
-			return;
-
-		if (emitted_block)
-			SPIRV_CROSS_THROW("Cannot use more than one builtin I/O block.");
-
-		emitted_builtins = builtins;
-		emitted_block = true;
-		block_var = &var;
-	});
+	ir.for_each_typed_id<SPIRVariable>(
+	    [&](uint32_t, SPIRVariable &var)
+	    {
+		    auto &type = this->get<SPIRType>(var.basetype);
+		    bool block = has_decoration(type.self, DecorationBlock);
+		    Bitset builtins;
+
+		    if (var.storage == storage && block && is_builtin_variable(var))
+		    {
+			    uint32_t index = 0;
+			    for (auto &m : ir.meta[type.self].members)
+			    {
+				    if (m.builtin && builtin_is_per_vertex_set(m.builtin_type))
+				    {
+					    builtins.set(m.builtin_type);
+					    if (m.builtin_type == BuiltInCullDistance)
+						    cull_distance_size = to_array_size_literal(this->get<SPIRType>(type.member_types[index]));
+					    else if (m.builtin_type == BuiltInClipDistance)
+						    clip_distance_size = to_array_size_literal(this->get<SPIRType>(type.member_types[index]));
+
+					    if (is_block_builtin(m.builtin_type) && m.decoration_flags.get(DecorationOffset))
+					    {
+						    have_any_xfb_offset = true;
+						    builtin_xfb_offsets[m.builtin_type] = m.offset;
+					    }
+
+					    if (is_block_builtin(m.builtin_type) && m.decoration_flags.get(DecorationStream))
+					    {
+						    uint32_t stream = m.stream;
+						    if (have_geom_stream && geom_stream != stream)
+							    SPIRV_CROSS_THROW("IO block member Stream mismatch.");
+						    have_geom_stream = true;
+						    geom_stream = stream;
+					    }
+				    }
+				    index++;
+			    }
+
+			    if (storage == StorageClassOutput && has_decoration(var.self, DecorationXfbBuffer) &&
+			        has_decoration(var.self, DecorationXfbStride))
+			    {
+				    uint32_t buffer_index = get_decoration(var.self, DecorationXfbBuffer);
+				    uint32_t stride = get_decoration(var.self, DecorationXfbStride);
+				    if (have_xfb_buffer_stride && buffer_index != xfb_buffer)
+					    SPIRV_CROSS_THROW("IO block member XfbBuffer mismatch.");
+				    if (have_xfb_buffer_stride && stride != xfb_stride)
+					    SPIRV_CROSS_THROW("IO block member XfbBuffer mismatch.");
+				    have_xfb_buffer_stride = true;
+				    xfb_buffer = buffer_index;
+				    xfb_stride = stride;
+			    }
+
+			    if (storage == StorageClassOutput && has_decoration(var.self, DecorationStream))
+			    {
+				    uint32_t stream = get_decoration(var.self, DecorationStream);
+				    if (have_geom_stream && geom_stream != stream)
+					    SPIRV_CROSS_THROW("IO block member Stream mismatch.");
+				    have_geom_stream = true;
+				    geom_stream = stream;
+			    }
+		    }
+		    else if (var.storage == storage && !block && is_builtin_variable(var))
+		    {
+			    // While we're at it, collect all declared global builtins (HLSL mostly ...).
+			    auto &m = ir.meta[var.self].decoration;
+			    if (m.builtin && builtin_is_per_vertex_set(m.builtin_type))
+			    {
+				    // For mesh/tesc output, Clip/Cull is an array-of-array. Look at innermost array type
+				    // for correct result.
+				    global_builtins.set(m.builtin_type);
+				    if (m.builtin_type == BuiltInCullDistance)
+					    cull_distance_size = to_array_size_literal(type, 0);
+				    else if (m.builtin_type == BuiltInClipDistance)
+					    clip_distance_size = to_array_size_literal(type, 0);
+
+				    if (is_block_builtin(m.builtin_type) && m.decoration_flags.get(DecorationXfbStride) &&
+				        m.decoration_flags.get(DecorationXfbBuffer) && m.decoration_flags.get(DecorationOffset))
+				    {
+					    have_any_xfb_offset = true;
+					    builtin_xfb_offsets[m.builtin_type] = m.offset;
+					    uint32_t buffer_index = m.xfb_buffer;
+					    uint32_t stride = m.xfb_stride;
+					    if (have_xfb_buffer_stride && buffer_index != xfb_buffer)
+						    SPIRV_CROSS_THROW("IO block member XfbBuffer mismatch.");
+					    if (have_xfb_buffer_stride && stride != xfb_stride)
+						    SPIRV_CROSS_THROW("IO block member XfbBuffer mismatch.");
+					    have_xfb_buffer_stride = true;
+					    xfb_buffer = buffer_index;
+					    xfb_stride = stride;
+				    }
+
+				    if (is_block_builtin(m.builtin_type) && m.decoration_flags.get(DecorationStream))
+				    {
+					    uint32_t stream = get_decoration(var.self, DecorationStream);
+					    if (have_geom_stream && geom_stream != stream)
+						    SPIRV_CROSS_THROW("IO block member Stream mismatch.");
+					    have_geom_stream = true;
+					    geom_stream = stream;
+				    }
+			    }
+		    }
+
+		    if (builtins.empty())
+			    return;
+
+		    if (emitted_block)
+			    SPIRV_CROSS_THROW("Cannot use more than one builtin I/O block.");
+
+		    emitted_builtins = builtins;
+		    emitted_block = true;
+		    block_var = &var;
+	    });
 
 	global_builtins =
 	    Bitset(global_builtins.get_lower() & ((1ull << BuiltInPosition) | (1ull << BuiltInPointSize) |
@@ -3754,27 +3772,28 @@ void CompilerGLSL::emit_resources()
 	if (ir.addressing_model == AddressingModelPhysicalStorageBuffer64)
 	{
 		// Output buffer reference block forward declarations.
-		ir.for_each_typed_id<SPIRType>([&](uint32_t id, SPIRType &type)
-		{
-			if (is_physical_pointer(type))
-			{
-				bool emit_type = true;
-				if (!is_physical_pointer_to_buffer_block(type))
-				{
-					// Only forward-declare if we intend to emit it in the non_block_pointer types.
-					// Otherwise, these are just "benign" pointer types that exist as a result of access chains.
-					emit_type = std::find(physical_storage_non_block_pointer_types.begin(),
-					                      physical_storage_non_block_pointer_types.end(),
-					                      id) != physical_storage_non_block_pointer_types.end();
-				}
-
-				if (emit_type)
-				{
-					emit_buffer_reference_block(id, true);
-					emitted = true;
-				}
-			}
-		});
+		ir.for_each_typed_id<SPIRType>(
+		    [&](uint32_t id, SPIRType &type)
+		    {
+			    if (is_physical_pointer(type))
+			    {
+				    bool emit_type = true;
+				    if (!is_physical_pointer_to_buffer_block(type))
+				    {
+					    // Only forward-declare if we intend to emit it in the non_block_pointer types.
+					    // Otherwise, these are just "benign" pointer types that exist as a result of access chains.
+					    emit_type = std::find(physical_storage_non_block_pointer_types.begin(),
+					                          physical_storage_non_block_pointer_types.end(),
+					                          id) != physical_storage_non_block_pointer_types.end();
+				    }
+
+				    if (emit_type)
+				    {
+					    emit_buffer_reference_block(id, true);
+					    emitted = true;
+				    }
+			    }
+		    });
 	}
 
 	if (emitted)
@@ -3897,66 +3916,74 @@ void CompilerGLSL::emit_resources()
 		for (auto type : physical_storage_non_block_pointer_types)
 			emit_buffer_reference_block(type, false);
 
-		ir.for_each_typed_id<SPIRType>([&](uint32_t id, SPIRType &type) {
-			if (is_physical_pointer_to_buffer_block(type))
-				emit_buffer_reference_block(id, false);
-		});
+		ir.for_each_typed_id<SPIRType>(
+		    [&](uint32_t id, SPIRType &type)
+		    {
+			    if (is_physical_pointer_to_buffer_block(type))
+				    emit_buffer_reference_block(id, false);
+		    });
 	}
 
 	// Output UBOs and SSBOs
-	ir.for_each_typed_id<SPIRVariable>([&](uint32_t, SPIRVariable &var) {
-		auto &type = this->get<SPIRType>(var.basetype);
-
-		bool is_block_storage = type.storage == StorageClassStorageBuffer || type.storage == StorageClassUniform ||
-		                        type.storage == StorageClassShaderRecordBufferKHR;
-		bool has_block_flags = ir.meta[type.self].decoration.decoration_flags.get(DecorationBlock) ||
-		                       ir.meta[type.self].decoration.decoration_flags.get(DecorationBufferBlock);
-
-		if (var.storage != StorageClassFunction && type.pointer && is_block_storage && !is_hidden_variable(var) &&
-		    has_block_flags)
-		{
-			emit_buffer_block(var);
-		}
-	});
+	ir.for_each_typed_id<SPIRVariable>(
+	    [&](uint32_t, SPIRVariable &var)
+	    {
+		    auto &type = this->get<SPIRType>(var.basetype);
+
+		    bool is_block_storage = type.storage == StorageClassStorageBuffer || type.storage == StorageClassUniform ||
+		                            type.storage == StorageClassShaderRecordBufferKHR;
+		    bool has_block_flags = ir.meta[type.self].decoration.decoration_flags.get(DecorationBlock) ||
+		                           ir.meta[type.self].decoration.decoration_flags.get(DecorationBufferBlock);
+
+		    if (var.storage != StorageClassFunction && type.pointer && is_block_storage && !is_hidden_variable(var) &&
+		        has_block_flags)
+		    {
+			    emit_buffer_block(var);
+		    }
+	    });
 
 	// Output push constant blocks
-	ir.for_each_typed_id<SPIRVariable>([&](uint32_t, SPIRVariable &var) {
-		auto &type = this->get<SPIRType>(var.basetype);
-		if (var.storage != StorageClassFunction && type.pointer && type.storage == StorageClassPushConstant &&
-		    !is_hidden_variable(var))
-		{
-			emit_push_constant_block(var);
-		}
-	});
+	ir.for_each_typed_id<SPIRVariable>(
+	    [&](uint32_t, SPIRVariable &var)
+	    {
+		    auto &type = this->get<SPIRType>(var.basetype);
+		    if (var.storage != StorageClassFunction && type.pointer && type.storage == StorageClassPushConstant &&
+		        !is_hidden_variable(var))
+		    {
+			    emit_push_constant_block(var);
+		    }
+	    });
 
 	bool skip_separate_image_sampler = !combined_image_samplers.empty() || !options.vulkan_semantics;
 
 	// Output Uniform Constants (values, samplers, images, etc).
-	ir.for_each_typed_id<SPIRVariable>([&](uint32_t, SPIRVariable &var) {
-		auto &type = this->get<SPIRType>(var.basetype);
-
-		// If we're remapping separate samplers and images, only emit the combined samplers.
-		if (skip_separate_image_sampler)
-		{
-			// Sampler buffers are always used without a sampler, and they will also work in regular GL.
-			bool sampler_buffer = type.basetype == SPIRType::Image && type.image.dim == DimBuffer;
-			bool separate_image = type.basetype == SPIRType::Image && type.image.sampled == 1;
-			bool separate_sampler = type.basetype == SPIRType::Sampler;
-			if (!sampler_buffer && (separate_image || separate_sampler))
-				return;
-		}
-
-		if (var.storage != StorageClassFunction && type.pointer &&
-		    (type.storage == StorageClassUniformConstant || type.storage == StorageClassAtomicCounter ||
-		     type.storage == StorageClassRayPayloadKHR || type.storage == StorageClassIncomingRayPayloadKHR ||
-		     type.storage == StorageClassCallableDataKHR || type.storage == StorageClassIncomingCallableDataKHR ||
-		     type.storage == StorageClassHitAttributeKHR) &&
-		    !is_hidden_variable(var))
-		{
-			emit_uniform(var);
-			emitted = true;
-		}
-	});
+	ir.for_each_typed_id<SPIRVariable>(
+	    [&](uint32_t, SPIRVariable &var)
+	    {
+		    auto &type = this->get<SPIRType>(var.basetype);
+
+		    // If we're remapping separate samplers and images, only emit the combined samplers.
+		    if (skip_separate_image_sampler)
+		    {
+			    // Sampler buffers are always used without a sampler, and they will also work in regular GL.
+			    bool sampler_buffer = type.basetype == SPIRType::Image && type.image.dim == DimBuffer;
+			    bool separate_image = type.basetype == SPIRType::Image && type.image.sampled == 1;
+			    bool separate_sampler = type.basetype == SPIRType::Sampler;
+			    if (!sampler_buffer && (separate_image || separate_sampler))
+				    return;
+		    }
+
+		    if (var.storage != StorageClassFunction && type.pointer &&
+		        (type.storage == StorageClassUniformConstant || type.storage == StorageClassAtomicCounter ||
+		         type.storage == StorageClassRayPayloadKHR || type.storage == StorageClassIncomingRayPayloadKHR ||
+		         type.storage == StorageClassCallableDataKHR || type.storage == StorageClassIncomingCallableDataKHR ||
+		         type.storage == StorageClassHitAttributeKHR) &&
+		        !is_hidden_variable(var))
+		    {
+			    emit_uniform(var);
+			    emitted = true;
+		    }
+	    });
 
 	if (emitted)
 		statement("");
@@ -3965,71 +3992,73 @@ void CompilerGLSL::emit_resources()
 	bool emitted_base_instance = false;
 
 	// Output in/out interfaces.
-	ir.for_each_typed_id<SPIRVariable>([&](uint32_t, SPIRVariable &var) {
-		auto &type = this->get<SPIRType>(var.basetype);
-
-		bool is_hidden = is_hidden_variable(var);
-
-		// Unused output I/O variables might still be required to implement framebuffer fetch.
-		if (var.storage == StorageClassOutput && !is_legacy() &&
-		    location_is_framebuffer_fetch(get_decoration(var.self, DecorationLocation)) != 0)
-		{
-			is_hidden = false;
-		}
-
-		if (var.storage != StorageClassFunction && type.pointer &&
-		    (var.storage == StorageClassInput || var.storage == StorageClassOutput) &&
-		    interface_variable_exists_in_entry_point(var.self) && !is_hidden)
-		{
-			if (options.es && get_execution_model() == ExecutionModelVertex && var.storage == StorageClassInput &&
-			    type.array.size() == 1)
-			{
-				SPIRV_CROSS_THROW("OpenGL ES doesn't support array input variables in vertex shader.");
-			}
-			emit_interface_block(var);
-			emitted = true;
-		}
-		else if (is_builtin_variable(var))
-		{
-			auto builtin = BuiltIn(get_decoration(var.self, DecorationBuiltIn));
-			// For gl_InstanceIndex emulation on GLES, the API user needs to
-			// supply this uniform.
-
-			// The draw parameter extension is soft-enabled on GL with some fallbacks.
-			if (!options.vulkan_semantics)
-			{
-				if (!emitted_base_instance &&
-				    ((options.vertex.support_nonzero_base_instance && builtin == BuiltInInstanceIndex) ||
-				     (builtin == BuiltInBaseInstance)))
-				{
-					statement("#ifdef GL_ARB_shader_draw_parameters");
-					statement("#define SPIRV_Cross_BaseInstance gl_BaseInstanceARB");
-					statement("#else");
-					// A crude, but simple workaround which should be good enough for non-indirect draws.
-					statement("uniform int SPIRV_Cross_BaseInstance;");
-					statement("#endif");
-					emitted = true;
-					emitted_base_instance = true;
-				}
-				else if (builtin == BuiltInBaseVertex)
-				{
-					statement("#ifdef GL_ARB_shader_draw_parameters");
-					statement("#define SPIRV_Cross_BaseVertex gl_BaseVertexARB");
-					statement("#else");
-					// A crude, but simple workaround which should be good enough for non-indirect draws.
-					statement("uniform int SPIRV_Cross_BaseVertex;");
-					statement("#endif");
-				}
-				else if (builtin == BuiltInDrawIndex)
-				{
-					statement("#ifndef GL_ARB_shader_draw_parameters");
-					// Cannot really be worked around.
-					statement("#error GL_ARB_shader_draw_parameters is not supported.");
-					statement("#endif");
-				}
-			}
-		}
-	});
+	ir.for_each_typed_id<SPIRVariable>(
+	    [&](uint32_t, SPIRVariable &var)
+	    {
+		    auto &type = this->get<SPIRType>(var.basetype);
+
+		    bool is_hidden = is_hidden_variable(var);
+
+		    // Unused output I/O variables might still be required to implement framebuffer fetch.
+		    if (var.storage == StorageClassOutput && !is_legacy() &&
+		        location_is_framebuffer_fetch(get_decoration(var.self, DecorationLocation)) != 0)
+		    {
+			    is_hidden = false;
+		    }
+
+		    if (var.storage != StorageClassFunction && type.pointer &&
+		        (var.storage == StorageClassInput || var.storage == StorageClassOutput) &&
+		        interface_variable_exists_in_entry_point(var.self) && !is_hidden)
+		    {
+			    if (options.es && get_execution_model() == ExecutionModelVertex && var.storage == StorageClassInput &&
+			        type.array.size() == 1)
+			    {
+				    SPIRV_CROSS_THROW("OpenGL ES doesn't support array input variables in vertex shader.");
+			    }
+			    emit_interface_block(var);
+			    emitted = true;
+		    }
+		    else if (is_builtin_variable(var))
+		    {
+			    auto builtin = BuiltIn(get_decoration(var.self, DecorationBuiltIn));
+			    // For gl_InstanceIndex emulation on GLES, the API user needs to
+			    // supply this uniform.
+
+			    // The draw parameter extension is soft-enabled on GL with some fallbacks.
+			    if (!options.vulkan_semantics)
+			    {
+				    if (!emitted_base_instance &&
+				        ((options.vertex.support_nonzero_base_instance && builtin == BuiltInInstanceIndex) ||
+				         (builtin == BuiltInBaseInstance)))
+				    {
+					    statement("#ifdef GL_ARB_shader_draw_parameters");
+					    statement("#define SPIRV_Cross_BaseInstance gl_BaseInstanceARB");
+					    statement("#else");
+					    // A crude, but simple workaround which should be good enough for non-indirect draws.
+					    statement("uniform int SPIRV_Cross_BaseInstance;");
+					    statement("#endif");
+					    emitted = true;
+					    emitted_base_instance = true;
+				    }
+				    else if (builtin == BuiltInBaseVertex)
+				    {
+					    statement("#ifdef GL_ARB_shader_draw_parameters");
+					    statement("#define SPIRV_Cross_BaseVertex gl_BaseVertexARB");
+					    statement("#else");
+					    // A crude, but simple workaround which should be good enough for non-indirect draws.
+					    statement("uniform int SPIRV_Cross_BaseVertex;");
+					    statement("#endif");
+				    }
+				    else if (builtin == BuiltInDrawIndex)
+				    {
+					    statement("#ifndef GL_ARB_shader_draw_parameters");
+					    // Cannot really be worked around.
+					    statement("#error GL_ARB_shader_draw_parameters is not supported.");
+					    statement("#endif");
+				    }
+			    }
+		    }
+	    });
 
 	// Global variables.
 	for (auto global : global_variables)
@@ -4123,94 +4152,103 @@ void CompilerGLSL::emit_output_variable_initializer(const SPIRVariable &var)
 
 			for (uint32_t j = 0; j < iteration_count; j++)
 			{
-				entry_func.fixup_hooks_in.push_back([=, &var]() {
-					AccessChainMeta meta;
-					auto &c = this->get<SPIRConstant>(var.initializer);
-
-					uint32_t invocation_id = 0;
-					uint32_t member_index_id = 0;
-					if (is_control_point)
-					{
-						uint32_t ids = ir.increase_bound_by(3);
-						auto &uint_type = set<SPIRType>(ids, OpTypeInt);
-						uint_type.basetype = SPIRType::UInt;
-						uint_type.width = 32;
-						set<SPIRExpression>(ids + 1, builtin_to_glsl(BuiltInInvocationId, StorageClassInput), ids, true);
-						set<SPIRConstant>(ids + 2, ids, i, false);
-						invocation_id = ids + 1;
-						member_index_id = ids + 2;
-					}
-
-					if (is_patch)
-					{
-						statement("if (gl_InvocationID == 0)");
-						begin_scope();
-					}
-
-					if (type_is_array && !is_control_point)
-					{
-						uint32_t indices[2] = { j, i };
-						auto chain = access_chain_internal(var.self, indices, 2, ACCESS_CHAIN_INDEX_IS_LITERAL_BIT, &meta);
-						statement(chain, " = ", lut_name, "[", j, "];");
-					}
-					else if (is_control_point)
-					{
-						uint32_t indices[2] = { invocation_id, member_index_id };
-						auto chain = access_chain_internal(var.self, indices, 2, 0, &meta);
-						statement(chain, " = ", lut_name, "[", builtin_to_glsl(BuiltInInvocationId, StorageClassInput), "];");
-					}
-					else
-					{
-						auto chain =
-								access_chain_internal(var.self, &i, 1, ACCESS_CHAIN_INDEX_IS_LITERAL_BIT, &meta);
-						statement(chain, " = ", to_expression(c.subconstants[i]), ";");
-					}
-
-					if (is_patch)
-						end_scope();
-				});
+				entry_func.fixup_hooks_in.push_back(
+				    [=, &var]()
+				    {
+					    AccessChainMeta meta;
+					    auto &c = this->get<SPIRConstant>(var.initializer);
+
+					    uint32_t invocation_id = 0;
+					    uint32_t member_index_id = 0;
+					    if (is_control_point)
+					    {
+						    uint32_t ids = ir.increase_bound_by(3);
+						    auto &uint_type = set<SPIRType>(ids, OpTypeInt);
+						    uint_type.basetype = SPIRType::UInt;
+						    uint_type.width = 32;
+						    set<SPIRExpression>(ids + 1, builtin_to_glsl(BuiltInInvocationId, StorageClassInput), ids,
+						                        true);
+						    set<SPIRConstant>(ids + 2, ids, i, false);
+						    invocation_id = ids + 1;
+						    member_index_id = ids + 2;
+					    }
+
+					    if (is_patch)
+					    {
+						    statement("if (gl_InvocationID == 0)");
+						    begin_scope();
+					    }
+
+					    if (type_is_array && !is_control_point)
+					    {
+						    uint32_t indices[2] = { j, i };
+						    auto chain =
+						        access_chain_internal(var.self, indices, 2, ACCESS_CHAIN_INDEX_IS_LITERAL_BIT, &meta);
+						    statement(chain, " = ", lut_name, "[", j, "];");
+					    }
+					    else if (is_control_point)
+					    {
+						    uint32_t indices[2] = { invocation_id, member_index_id };
+						    auto chain = access_chain_internal(var.self, indices, 2, 0, &meta);
+						    statement(chain, " = ", lut_name, "[",
+						              builtin_to_glsl(BuiltInInvocationId, StorageClassInput), "];");
+					    }
+					    else
+					    {
+						    auto chain =
+						        access_chain_internal(var.self, &i, 1, ACCESS_CHAIN_INDEX_IS_LITERAL_BIT, &meta);
+						    statement(chain, " = ", to_expression(c.subconstants[i]), ";");
+					    }
+
+					    if (is_patch)
+						    end_scope();
+				    });
 			}
 		}
 	}
 	else if (is_control_point)
 	{
 		auto lut_name = join("_", var.self, "_init");
-		statement("const ", type_to_glsl(type), " ", lut_name, type_to_array_glsl(type, 0),
-		          " = ", to_expression(var.initializer), ";");
-		entry_func.fixup_hooks_in.push_back([&, lut_name]() {
-			statement(to_expression(var.self), "[gl_InvocationID] = ", lut_name, "[gl_InvocationID];");
-		});
+		statement("const ", type_to_glsl(type), " ", lut_name, type_to_array_glsl(type, 0), " = ",
+		          to_expression(var.initializer), ";");
+		entry_func.fixup_hooks_in.push_back(
+		    [&, lut_name]()
+		    { statement(to_expression(var.self), "[gl_InvocationID] = ", lut_name, "[gl_InvocationID];"); });
 	}
 	else if (has_decoration(var.self, DecorationBuiltIn) &&
 	         BuiltIn(get_decoration(var.self, DecorationBuiltIn)) == BuiltInSampleMask)
 	{
 		// We cannot copy the array since gl_SampleMask is unsized in GLSL. Unroll time! <_<
-		entry_func.fixup_hooks_in.push_back([&] {
-			auto &c = this->get<SPIRConstant>(var.initializer);
-			uint32_t num_constants = uint32_t(c.subconstants.size());
-			for (uint32_t i = 0; i < num_constants; i++)
-			{
-				// Don't use to_expression on constant since it might be uint, just fish out the raw int.
-				statement(to_expression(var.self), "[", i, "] = ",
-				          convert_to_string(this->get<SPIRConstant>(c.subconstants[i]).scalar_i32()), ";");
-			}
-		});
+		entry_func.fixup_hooks_in.push_back(
+		    [&]
+		    {
+			    auto &c = this->get<SPIRConstant>(var.initializer);
+			    uint32_t num_constants = uint32_t(c.subconstants.size());
+			    for (uint32_t i = 0; i < num_constants; i++)
+			    {
+				    // Don't use to_expression on constant since it might be uint, just fish out the raw int.
+				    statement(to_expression(var.self), "[", i,
+				              "] = ", convert_to_string(this->get<SPIRConstant>(c.subconstants[i]).scalar_i32()), ";");
+			    }
+		    });
 	}
 	else
 	{
 		auto lut_name = join("_", var.self, "_init");
-		statement("const ", type_to_glsl(type), " ", lut_name,
-		          type_to_array_glsl(type, var.self), " = ", to_expression(var.initializer), ";");
-		entry_func.fixup_hooks_in.push_back([&, lut_name, is_patch]() {
-			if (is_patch)
-			{
-				statement("if (gl_InvocationID == 0)");
-				begin_scope();
-			}
-			statement(to_expression(var.self), " = ", lut_name, ";");
-			if (is_patch)
-				end_scope();
-		});
+		statement("const ", type_to_glsl(type), " ", lut_name, type_to_array_glsl(type, var.self), " = ",
+		          to_expression(var.initializer), ";");
+		entry_func.fixup_hooks_in.push_back(
+		    [&, lut_name, is_patch]()
+		    {
+			    if (is_patch)
+			    {
+				    statement("if (gl_InvocationID == 0)");
+				    begin_scope();
+			    }
+			    statement(to_expression(var.self), " = ", lut_name, ";");
+			    if (is_patch)
+				    end_scope();
+		    });
 	}
 }
 
@@ -4781,8 +4819,7 @@ void CompilerGLSL::emit_extension_workarounds(ExecutionModel model)
 			statement("");
 		}
 
-		auto arithmetic_feature_helper =
-		    [&](Supp::Feature feat, std::string func_name, Op op, GroupOperation group_op)
+		auto arithmetic_feature_helper = [&](Supp::Feature feat, std::string func_name, Op op, GroupOperation group_op)
 		{
 			if (shader_subgroup_supporter.is_feature_requested(feat))
 			{
@@ -4843,8 +4880,10 @@ void CompilerGLSL::emit_extension_workarounds(ExecutionModel model)
 			{
 				// Need both variants.
 				// GLSL cannot overload on precision, so need to dispatch appropriately.
-				statement("highp ", type_to_glsl(type), " spvWorkaroundRowMajor(highp ", type_to_glsl(type), " wrap) { return wrap; }");
-				statement("mediump ", type_to_glsl(type), " spvWorkaroundRowMajorMP(mediump ", type_to_glsl(type), " wrap) { return wrap; }");
+				statement("highp ", type_to_glsl(type), " spvWorkaroundRowMajor(highp ", type_to_glsl(type),
+				          " wrap) { return wrap; }");
+				statement("mediump ", type_to_glsl(type), " spvWorkaroundRowMajorMP(mediump ", type_to_glsl(type),
+				          " wrap) { return wrap; }");
 			}
 			else
 			{
@@ -4904,8 +4943,8 @@ void CompilerGLSL::emit_polyfills(uint32_t polyfills, bool relaxed)
 		statement(qual, "float spvDeterminant", suffix, "(", qual, "mat3 m)");
 		begin_scope();
 		statement("return dot(m[0], vec3(m[1][1] * m[2][2] - m[1][2] * m[2][1], "
-		                                "m[1][2] * m[2][0] - m[1][0] * m[2][2], "
-		                                "m[1][0] * m[2][1] - m[1][1] * m[2][0]));");
+		          "m[1][2] * m[2][0] - m[1][0] * m[2][2], "
+		          "m[1][0] * m[2][1] - m[1][1] * m[2][0]));");
 		end_scope();
 		statement("");
 	}
@@ -4915,10 +4954,14 @@ void CompilerGLSL::emit_polyfills(uint32_t polyfills, bool relaxed)
 		statement(qual, "float spvDeterminant", suffix, "(", qual, "mat4 m)");
 		begin_scope();
 		statement("return dot(m[0], vec4("
-		          "m[2][1] * m[3][2] * m[1][3] - m[3][1] * m[2][2] * m[1][3] + m[3][1] * m[1][2] * m[2][3] - m[1][1] * m[3][2] * m[2][3] - m[2][1] * m[1][2] * m[3][3] + m[1][1] * m[2][2] * m[3][3], "
-		          "m[3][0] * m[2][2] * m[1][3] - m[2][0] * m[3][2] * m[1][3] - m[3][0] * m[1][2] * m[2][3] + m[1][0] * m[3][2] * m[2][3] + m[2][0] * m[1][2] * m[3][3] - m[1][0] * m[2][2] * m[3][3], "
-		          "m[2][0] * m[3][1] * m[1][3] - m[3][0] * m[2][1] * m[1][3] + m[3][0] * m[1][1] * m[2][3] - m[1][0] * m[3][1] * m[2][3] - m[2][0] * m[1][1] * m[3][3] + m[1][0] * m[2][1] * m[3][3], "
-		          "m[3][0] * m[2][1] * m[1][2] - m[2][0] * m[3][1] * m[1][2] - m[3][0] * m[1][1] * m[2][2] + m[1][0] * m[3][1] * m[2][2] + m[2][0] * m[1][1] * m[3][2] - m[1][0] * m[2][1] * m[3][2]));");
+		          "m[2][1] * m[3][2] * m[1][3] - m[3][1] * m[2][2] * m[1][3] + m[3][1] * m[1][2] * m[2][3] - m[1][1] * "
+		          "m[3][2] * m[2][3] - m[2][1] * m[1][2] * m[3][3] + m[1][1] * m[2][2] * m[3][3], "
+		          "m[3][0] * m[2][2] * m[1][3] - m[2][0] * m[3][2] * m[1][3] - m[3][0] * m[1][2] * m[2][3] + m[1][0] * "
+		          "m[3][2] * m[2][3] + m[2][0] * m[1][2] * m[3][3] - m[1][0] * m[2][2] * m[3][3], "
+		          "m[2][0] * m[3][1] * m[1][3] - m[3][0] * m[2][1] * m[1][3] + m[3][0] * m[1][1] * m[2][3] - m[1][0] * "
+		          "m[3][1] * m[2][3] - m[2][0] * m[1][1] * m[3][3] + m[1][0] * m[2][1] * m[3][3], "
+		          "m[3][0] * m[2][1] * m[1][2] - m[2][0] * m[3][1] * m[1][2] - m[3][0] * m[1][1] * m[2][2] + m[1][0] * "
+		          "m[3][1] * m[2][2] + m[2][0] * m[1][1] * m[3][2] - m[1][0] * m[2][1] * m[3][2]));");
 		end_scope();
 		statement("");
 	}
@@ -4937,17 +4980,18 @@ void CompilerGLSL::emit_polyfills(uint32_t polyfills, bool relaxed)
 	{
 		statement(qual, "mat3 spvInverse", suffix, "(", qual, "mat3 m)");
 		begin_scope();
-		statement(qual, "vec3 t = vec3(m[1][1] * m[2][2] - m[1][2] * m[2][1], m[1][2] * m[2][0] - m[1][0] * m[2][2], m[1][0] * m[2][1] - m[1][1] * m[2][0]);");
+		statement(qual, "vec3 t = vec3(m[1][1] * m[2][2] - m[1][2] * m[2][1], m[1][2] * m[2][0] - m[1][0] * m[2][2], "
+		                "m[1][0] * m[2][1] - m[1][1] * m[2][0]);");
 		statement("return mat3(t[0], "
-		                      "m[0][2] * m[2][1] - m[0][1] * m[2][2], "
-		                      "m[0][1] * m[1][2] - m[0][2] * m[1][1], "
-		                      "t[1], "
-		                      "m[0][0] * m[2][2] - m[0][2] * m[2][0], "
-		                      "m[0][2] * m[1][0] - m[0][0] * m[1][2], "
-		                      "t[2], "
-		                      "m[0][1] * m[2][0] - m[0][0] * m[2][1], "
-		                      "m[0][0] * m[1][1] - m[0][1] * m[1][0]) "
-		                      "* (1.0 / dot(m[0], t));");
+		          "m[0][2] * m[2][1] - m[0][1] * m[2][2], "
+		          "m[0][1] * m[1][2] - m[0][2] * m[1][1], "
+		          "t[1], "
+		          "m[0][0] * m[2][2] - m[0][2] * m[2][0], "
+		          "m[0][2] * m[1][0] - m[0][0] * m[1][2], "
+		          "t[2], "
+		          "m[0][1] * m[2][0] - m[0][0] * m[2][1], "
+		          "m[0][0] * m[1][1] - m[0][1] * m[1][0]) "
+		          "* (1.0 / dot(m[0], t));");
 		end_scope();
 		statement("");
 	}
@@ -4957,27 +5001,43 @@ void CompilerGLSL::emit_polyfills(uint32_t polyfills, bool relaxed)
 		statement(qual, "mat4 spvInverse", suffix, "(", qual, "mat4 m)");
 		begin_scope();
 		statement(qual, "vec4 t = vec4("
-		          "m[2][1] * m[3][2] * m[1][3] - m[3][1] * m[2][2] * m[1][3] + m[3][1] * m[1][2] * m[2][3] - m[1][1] * m[3][2] * m[2][3] - m[2][1] * m[1][2] * m[3][3] + m[1][1] * m[2][2] * m[3][3], "
-		          "m[3][0] * m[2][2] * m[1][3] - m[2][0] * m[3][2] * m[1][3] - m[3][0] * m[1][2] * m[2][3] + m[1][0] * m[3][2] * m[2][3] + m[2][0] * m[1][2] * m[3][3] - m[1][0] * m[2][2] * m[3][3], "
-		          "m[2][0] * m[3][1] * m[1][3] - m[3][0] * m[2][1] * m[1][3] + m[3][0] * m[1][1] * m[2][3] - m[1][0] * m[3][1] * m[2][3] - m[2][0] * m[1][1] * m[3][3] + m[1][0] * m[2][1] * m[3][3], "
-		          "m[3][0] * m[2][1] * m[1][2] - m[2][0] * m[3][1] * m[1][2] - m[3][0] * m[1][1] * m[2][2] + m[1][0] * m[3][1] * m[2][2] + m[2][0] * m[1][1] * m[3][2] - m[1][0] * m[2][1] * m[3][2]);");
+		                "m[2][1] * m[3][2] * m[1][3] - m[3][1] * m[2][2] * m[1][3] + m[3][1] * m[1][2] * m[2][3] - "
+		                "m[1][1] * m[3][2] * m[2][3] - m[2][1] * m[1][2] * m[3][3] + m[1][1] * m[2][2] * m[3][3], "
+		                "m[3][0] * m[2][2] * m[1][3] - m[2][0] * m[3][2] * m[1][3] - m[3][0] * m[1][2] * m[2][3] + "
+		                "m[1][0] * m[3][2] * m[2][3] + m[2][0] * m[1][2] * m[3][3] - m[1][0] * m[2][2] * m[3][3], "
+		                "m[2][0] * m[3][1] * m[1][3] - m[3][0] * m[2][1] * m[1][3] + m[3][0] * m[1][1] * m[2][3] - "
+		                "m[1][0] * m[3][1] * m[2][3] - m[2][0] * m[1][1] * m[3][3] + m[1][0] * m[2][1] * m[3][3], "
+		                "m[3][0] * m[2][1] * m[1][2] - m[2][0] * m[3][1] * m[1][2] - m[3][0] * m[1][1] * m[2][2] + "
+		                "m[1][0] * m[3][1] * m[2][2] + m[2][0] * m[1][1] * m[3][2] - m[1][0] * m[2][1] * m[3][2]);");
 		statement("return mat4("
 		          "t[0], "
-		          "m[3][1] * m[2][2] * m[0][3] - m[2][1] * m[3][2] * m[0][3] - m[3][1] * m[0][2] * m[2][3] + m[0][1] * m[3][2] * m[2][3] + m[2][1] * m[0][2] * m[3][3] - m[0][1] * m[2][2] * m[3][3], "
-		          "m[1][1] * m[3][2] * m[0][3] - m[3][1] * m[1][2] * m[0][3] + m[3][1] * m[0][2] * m[1][3] - m[0][1] * m[3][2] * m[1][3] - m[1][1] * m[0][2] * m[3][3] + m[0][1] * m[1][2] * m[3][3], "
-		          "m[2][1] * m[1][2] * m[0][3] - m[1][1] * m[2][2] * m[0][3] - m[2][1] * m[0][2] * m[1][3] + m[0][1] * m[2][2] * m[1][3] + m[1][1] * m[0][2] * m[2][3] - m[0][1] * m[1][2] * m[2][3], "
+		          "m[3][1] * m[2][2] * m[0][3] - m[2][1] * m[3][2] * m[0][3] - m[3][1] * m[0][2] * m[2][3] + m[0][1] * "
+		          "m[3][2] * m[2][3] + m[2][1] * m[0][2] * m[3][3] - m[0][1] * m[2][2] * m[3][3], "
+		          "m[1][1] * m[3][2] * m[0][3] - m[3][1] * m[1][2] * m[0][3] + m[3][1] * m[0][2] * m[1][3] - m[0][1] * "
+		          "m[3][2] * m[1][3] - m[1][1] * m[0][2] * m[3][3] + m[0][1] * m[1][2] * m[3][3], "
+		          "m[2][1] * m[1][2] * m[0][3] - m[1][1] * m[2][2] * m[0][3] - m[2][1] * m[0][2] * m[1][3] + m[0][1] * "
+		          "m[2][2] * m[1][3] + m[1][1] * m[0][2] * m[2][3] - m[0][1] * m[1][2] * m[2][3], "
 		          "t[1], "
-		          "m[2][0] * m[3][2] * m[0][3] - m[3][0] * m[2][2] * m[0][3] + m[3][0] * m[0][2] * m[2][3] - m[0][0] * m[3][2] * m[2][3] - m[2][0] * m[0][2] * m[3][3] + m[0][0] * m[2][2] * m[3][3], "
-		          "m[3][0] * m[1][2] * m[0][3] - m[1][0] * m[3][2] * m[0][3] - m[3][0] * m[0][2] * m[1][3] + m[0][0] * m[3][2] * m[1][3] + m[1][0] * m[0][2] * m[3][3] - m[0][0] * m[1][2] * m[3][3], "
-		          "m[1][0] * m[2][2] * m[0][3] - m[2][0] * m[1][2] * m[0][3] + m[2][0] * m[0][2] * m[1][3] - m[0][0] * m[2][2] * m[1][3] - m[1][0] * m[0][2] * m[2][3] + m[0][0] * m[1][2] * m[2][3], "
+		          "m[2][0] * m[3][2] * m[0][3] - m[3][0] * m[2][2] * m[0][3] + m[3][0] * m[0][2] * m[2][3] - m[0][0] * "
+		          "m[3][2] * m[2][3] - m[2][0] * m[0][2] * m[3][3] + m[0][0] * m[2][2] * m[3][3], "
+		          "m[3][0] * m[1][2] * m[0][3] - m[1][0] * m[3][2] * m[0][3] - m[3][0] * m[0][2] * m[1][3] + m[0][0] * "
+		          "m[3][2] * m[1][3] + m[1][0] * m[0][2] * m[3][3] - m[0][0] * m[1][2] * m[3][3], "
+		          "m[1][0] * m[2][2] * m[0][3] - m[2][0] * m[1][2] * m[0][3] + m[2][0] * m[0][2] * m[1][3] - m[0][0] * "
+		          "m[2][2] * m[1][3] - m[1][0] * m[0][2] * m[2][3] + m[0][0] * m[1][2] * m[2][3], "
 		          "t[2], "
-		          "m[3][0] * m[2][1] * m[0][3] - m[2][0] * m[3][1] * m[0][3] - m[3][0] * m[0][1] * m[2][3] + m[0][0] * m[3][1] * m[2][3] + m[2][0] * m[0][1] * m[3][3] - m[0][0] * m[2][1] * m[3][3], "
-		          "m[1][0] * m[3][1] * m[0][3] - m[3][0] * m[1][1] * m[0][3] + m[3][0] * m[0][1] * m[1][3] - m[0][0] * m[3][1] * m[1][3] - m[1][0] * m[0][1] * m[3][3] + m[0][0] * m[1][1] * m[3][3], "
-		          "m[2][0] * m[1][1] * m[0][3] - m[1][0] * m[2][1] * m[0][3] - m[2][0] * m[0][1] * m[1][3] + m[0][0] * m[2][1] * m[1][3] + m[1][0] * m[0][1] * m[2][3] - m[0][0] * m[1][1] * m[2][3], "
+		          "m[3][0] * m[2][1] * m[0][3] - m[2][0] * m[3][1] * m[0][3] - m[3][0] * m[0][1] * m[2][3] + m[0][0] * "
+		          "m[3][1] * m[2][3] + m[2][0] * m[0][1] * m[3][3] - m[0][0] * m[2][1] * m[3][3], "
+		          "m[1][0] * m[3][1] * m[0][3] - m[3][0] * m[1][1] * m[0][3] + m[3][0] * m[0][1] * m[1][3] - m[0][0] * "
+		          "m[3][1] * m[1][3] - m[1][0] * m[0][1] * m[3][3] + m[0][0] * m[1][1] * m[3][3], "
+		          "m[2][0] * m[1][1] * m[0][3] - m[1][0] * m[2][1] * m[0][3] - m[2][0] * m[0][1] * m[1][3] + m[0][0] * "
+		          "m[2][1] * m[1][3] + m[1][0] * m[0][1] * m[2][3] - m[0][0] * m[1][1] * m[2][3], "
 		          "t[3], "
-		          "m[2][0] * m[3][1] * m[0][2] - m[3][0] * m[2][1] * m[0][2] + m[3][0] * m[0][1] * m[2][2] - m[0][0] * m[3][1] * m[2][2] - m[2][0] * m[0][1] * m[3][2] + m[0][0] * m[2][1] * m[3][2], "
-		          "m[3][0] * m[1][1] * m[0][2] - m[1][0] * m[3][1] * m[0][2] - m[3][0] * m[0][1] * m[1][2] + m[0][0] * m[3][1] * m[1][2] + m[1][0] * m[0][1] * m[3][2] - m[0][0] * m[1][1] * m[3][2], "
-		          "m[1][0] * m[2][1] * m[0][2] - m[2][0] * m[1][1] * m[0][2] + m[2][0] * m[0][1] * m[1][2] - m[0][0] * m[2][1] * m[1][2] - m[1][0] * m[0][1] * m[2][2] + m[0][0] * m[1][1] * m[2][2]) "
+		          "m[2][0] * m[3][1] * m[0][2] - m[3][0] * m[2][1] * m[0][2] + m[3][0] * m[0][1] * m[2][2] - m[0][0] * "
+		          "m[3][1] * m[2][2] - m[2][0] * m[0][1] * m[3][2] + m[0][0] * m[2][1] * m[3][2], "
+		          "m[3][0] * m[1][1] * m[0][2] - m[1][0] * m[3][1] * m[0][2] - m[3][0] * m[0][1] * m[1][2] + m[0][0] * "
+		          "m[3][1] * m[1][2] + m[1][0] * m[0][1] * m[3][2] - m[0][0] * m[1][1] * m[3][2], "
+		          "m[1][0] * m[2][1] * m[0][2] - m[2][0] * m[1][1] * m[0][2] + m[2][0] * m[0][1] * m[1][2] - m[0][0] * "
+		          "m[2][1] * m[1][2] - m[1][0] * m[0][1] * m[2][2] + m[0][0] * m[1][1] * m[2][2]) "
 		          "* (1.0 / dot(m[0], t));");
 		end_scope();
 		statement("");
@@ -5004,8 +5064,8 @@ void CompilerGLSL::emit_polyfills(uint32_t polyfills, bool relaxed)
 
 				const char *types[3][4] = {
 					{ "float16_t", "f16vec2", "f16vec3", "f16vec4" },
-					{ "float",     "vec2",    "vec3",    "vec4" },
-					{ "double",    "dvec2",   "dvec3",   "dvec4" },
+					{ "float", "vec2", "vec3", "vec4" },
+					{ "double", "dvec2", "dvec3", "dvec4" },
 				};
 
 				for (uint32_t k = 0; k < 4; k++)
@@ -5014,13 +5074,13 @@ void CompilerGLSL::emit_polyfills(uint32_t polyfills, bool relaxed)
 
 					if (i < 2)
 					{
-						statement("spirv_instruction(set = \"GLSL.std.450\", id = ", glsl_ops[i], ") ",
-						          type, " ", spv_ops[i], "(", type, ", ", type, ");");
+						statement("spirv_instruction(set = \"GLSL.std.450\", id = ", glsl_ops[i], ") ", type, " ",
+						          spv_ops[i], "(", type, ", ", type, ");");
 					}
 					else
 					{
-						statement("spirv_instruction(set = \"GLSL.std.450\", id = ", glsl_ops[i], ") ",
-						          type, " ", spv_ops[i], "(", type, ", ", type, ", ", type, ");");
+						statement("spirv_instruction(set = \"GLSL.std.450\", id = ", glsl_ops[i], ") ", type, " ",
+						          spv_ops[i], "(", type, ", ", type, ", ", type, ");");
 					}
 
 					has_poly = true;
@@ -5053,8 +5113,8 @@ void CompilerGLSL::emit_polyfills(uint32_t polyfills, bool relaxed)
 
 				const char *types[3][4] = {
 					{ "float16_t", "f16vec2", "f16vec3", "f16vec4" },
-					{ "float",     "vec2",    "vec3",    "vec4" },
-					{ "double",    "dvec2",   "dvec3",   "dvec4" },
+					{ "float", "vec2", "vec3", "vec4" },
+					{ "double", "dvec2", "dvec3", "dvec4" },
 				};
 
 				for (uint32_t k = 0; k < 4; k++)
@@ -5063,8 +5123,8 @@ void CompilerGLSL::emit_polyfills(uint32_t polyfills, bool relaxed)
 
 					if (i < 2)
 					{
-						statement("mediump ", type, " ", spv_ops[i], "Relaxed(",
-						          "mediump ", type, " a, mediump ", type, " b)");
+						statement("mediump ", type, " ", spv_ops[i], "Relaxed(", "mediump ", type, " a, mediump ", type,
+						          " b)");
 						begin_scope();
 						statement("mediump ", type, " res = ", spv_ops[i], "(a, b);");
 						statement("return res;");
@@ -5073,8 +5133,8 @@ void CompilerGLSL::emit_polyfills(uint32_t polyfills, bool relaxed)
 					}
 					else
 					{
-						statement("mediump ", type, " ", spv_ops[i], "Relaxed(",
-						          "mediump ", type, " a, mediump ", type, " b, mediump ", type, " c)");
+						statement("mediump ", type, " ", spv_ops[i], "Relaxed(", "mediump ", type, " a, mediump ", type,
+						          " b, mediump ", type, " c)");
 						begin_scope();
 						statement("mediump ", type, " res = ", spv_ops[i], "(a, b, c);");
 						statement("return res;");
@@ -5115,7 +5175,8 @@ void CompilerGLSL::force_temporary_and_recompile(uint32_t id)
 		force_recompile();
 }
 
-uint32_t CompilerGLSL::consume_temporary_in_precision_context(uint32_t type_id, uint32_t id, Options::Precision precision)
+uint32_t CompilerGLSL::consume_temporary_in_precision_context(uint32_t type_id, uint32_t id,
+                                                              Options::Precision precision)
 {
 	// Constants do not have innate precision.
 	auto handle_type = ir.ids[id].get_type();
@@ -5428,14 +5489,13 @@ string CompilerGLSL::to_extract_constant_composite_expression(uint32_t result_ty
 	return constant_expression(tmp);
 }
 
-string CompilerGLSL::to_rerolled_array_expression(const SPIRType &parent_type,
-                                                  const string &base_expr, const SPIRType &type)
+string CompilerGLSL::to_rerolled_array_expression(const SPIRType &parent_type, const string &base_expr,
+                                                  const SPIRType &type)
 {
-	bool remapped_boolean = parent_type.basetype == SPIRType::Struct &&
-	                        type.basetype == SPIRType::Boolean &&
+	bool remapped_boolean = parent_type.basetype == SPIRType::Struct && type.basetype == SPIRType::Boolean &&
 	                        backend.boolean_in_struct_remapped_type != SPIRType::Boolean;
 
-	SPIRType tmp_type { OpNop };
+	SPIRType tmp_type{ OpNop };
 	if (remapped_boolean)
 	{
 		tmp_type = get<SPIRType>(type.parent_type);
@@ -5478,14 +5538,13 @@ string CompilerGLSL::to_composite_constructor_expression(const SPIRType &parent_
 	auto &type = expression_type(id);
 
 	bool reroll_array = false;
-	bool remapped_boolean = parent_type.basetype == SPIRType::Struct &&
-	                        type.basetype == SPIRType::Boolean &&
+	bool remapped_boolean = parent_type.basetype == SPIRType::Struct && type.basetype == SPIRType::Boolean &&
 	                        backend.boolean_in_struct_remapped_type != SPIRType::Boolean;
 
 	if (is_array(type))
 	{
-		reroll_array = !backend.array_is_value_type ||
-		               (block_like_type && !backend.array_is_value_type_in_buffer_blocks);
+		reroll_array =
+		    !backend.array_is_value_type || (block_like_type && !backend.array_is_value_type_in_buffer_blocks);
 
 		if (remapped_boolean)
 		{
@@ -5583,8 +5642,8 @@ string CompilerGLSL::to_expression(uint32_t id, bool register_expression_read)
 			uint32_t physical_type_id = get_extended_decoration(id, SPIRVCrossDecorationPhysicalTypeID);
 			bool is_packed = has_extended_decoration(id, SPIRVCrossDecorationPhysicalTypePacked);
 			bool relaxed = has_decoration(id, DecorationRelaxedPrecision);
-			return convert_row_major_matrix(e.expression, get<SPIRType>(e.expression_type), physical_type_id,
-			                                is_packed, relaxed);
+			return convert_row_major_matrix(e.expression, get<SPIRType>(e.expression_type), physical_type_id, is_packed,
+			                                relaxed);
 		}
 		else if (flattened_structs.count(id))
 		{
@@ -5619,7 +5678,8 @@ string CompilerGLSL::to_expression(uint32_t id, bool register_expression_read)
 				int wg_index = get_constant_mapping_to_workgroup_component(c);
 				if (wg_index >= 0)
 				{
-					auto wg_size = join(builtin_to_glsl(BuiltInWorkgroupSize, StorageClassInput), vector_swizzle(1, wg_index));
+					auto wg_size =
+					    join(builtin_to_glsl(BuiltInWorkgroupSize, StorageClassInput), vector_swizzle(1, wg_index));
 					if (type.basetype != SPIRType::UInt)
 						wg_size = bitcast_expression(type, SPIRType::UInt, wg_size);
 					return wg_size;
@@ -5841,7 +5901,7 @@ string CompilerGLSL::constant_op_expression(const SPIRConstantOp &cop)
 		uint32_t op0 = cop.arguments[0];
 		uint32_t op1 = cop.arguments[1];
 		return join(to_enclosed_expression(op0), " - ", to_enclosed_expression(op1), " * ", "(",
-		                 to_enclosed_expression(op0), " / ", to_enclosed_expression(op1), ")");
+		            to_enclosed_expression(op0), " / ", to_enclosed_expression(op1), ")");
 	}
 
 	case OpSelect:
@@ -5910,8 +5970,7 @@ string CompilerGLSL::constant_op_expression(const SPIRConstantOp &cop)
 
 		string expr;
 		if (c && cop.arguments.size() == 2 && c->is_used_as_array_length &&
-		    !backend.supports_spec_constant_array_size &&
-		    is_vector(get<SPIRType>(c->constant_type)))
+		    !backend.supports_spec_constant_array_size && is_vector(get<SPIRType>(c->constant_type)))
 		{
 			expr = to_expression(c->specialization_constant_id(0, cop.arguments[1]));
 		}
@@ -6052,8 +6111,7 @@ string CompilerGLSL::constant_op_expression(const SPIRConstantOp &cop)
 	}
 }
 
-string CompilerGLSL::constant_expression(const SPIRConstant &c,
-                                         bool inside_block_like_struct_scope,
+string CompilerGLSL::constant_expression(const SPIRConstant &c, bool inside_block_like_struct_scope,
                                          bool inside_struct_scope)
 {
 	auto &type = get<SPIRType>(c.constant_type);
@@ -6101,9 +6159,8 @@ string CompilerGLSL::constant_expression(const SPIRConstant &c,
 		// Should look at ArrayStride here as well, but it's possible to declare a constant struct
 		// with Offset = 0, using no ArrayStride on the enclosed array type.
 		// A particular CTS test hits this scenario.
-		bool array_type_decays = inside_block_like_struct_scope &&
-		                         is_array(type) &&
-		                         !backend.array_is_value_type_in_buffer_blocks;
+		bool array_type_decays =
+		    inside_block_like_struct_scope && is_array(type) && !backend.array_is_value_type_in_buffer_blocks;
 
 		// Allow Metal to use the array<T> template to make arrays a value type
 		bool needs_trailing_tracket = false;
@@ -6116,10 +6173,9 @@ string CompilerGLSL::constant_expression(const SPIRConstant &c,
 		         is_array(type) && !array_type_decays)
 		{
 			const auto *p_type = &type;
-			SPIRType tmp_type { OpNop };
+			SPIRType tmp_type{ OpNop };
 
-			if (inside_struct_scope &&
-			    backend.boolean_in_struct_remapped_type != SPIRType::Boolean &&
+			if (inside_struct_scope && backend.boolean_in_struct_remapped_type != SPIRType::Boolean &&
 			    type.basetype == SPIRType::Boolean)
 			{
 				tmp_type = type;
@@ -6208,8 +6264,7 @@ string CompilerGLSL::constant_expression(const SPIRConstant &c,
 	{
 		auto res = constant_expression_vector(c, 0);
 
-		if (inside_struct_scope &&
-		    backend.boolean_in_struct_remapped_type != SPIRType::Boolean &&
+		if (inside_struct_scope && backend.boolean_in_struct_remapped_type != SPIRType::Boolean &&
 		    type.basetype == SPIRType::Boolean)
 		{
 			SPIRType tmp_type = type;
@@ -6234,8 +6289,7 @@ string CompilerGLSL::constant_expression(const SPIRConstant &c,
 		}
 		res += ")";
 
-		if (inside_struct_scope &&
-		    backend.boolean_in_struct_remapped_type != SPIRType::Boolean &&
+		if (inside_struct_scope && backend.boolean_in_struct_remapped_type != SPIRType::Boolean &&
 		    type.basetype == SPIRType::Boolean)
 		{
 			SPIRType tmp_type = type;
@@ -6262,7 +6316,7 @@ string CompilerGLSL::convert_floate4m3_to_string(const SPIRConstant &c, uint32_t
 	// There is no infinity in e4m3.
 	if (std::isnan(float_value))
 	{
-		SPIRType type { OpTypeFloat };
+		SPIRType type{ OpTypeFloat };
 		type.basetype = SPIRType::Half;
 		type.vecsize = 1;
 		type.columns = 1;
@@ -6270,7 +6324,7 @@ string CompilerGLSL::convert_floate4m3_to_string(const SPIRConstant &c, uint32_t
 	}
 	else
 	{
-		SPIRType type { OpTypeFloat };
+		SPIRType type{ OpTypeFloat };
 		type.basetype = SPIRType::FloatE4M3;
 		type.vecsize = 1;
 		type.columns = 1;
@@ -6290,7 +6344,7 @@ string CompilerGLSL::convert_half_to_string(const SPIRConstant &c, uint32_t col,
 	// of complicated workarounds, just value-cast to the half type always.
 	if (std::isnan(float_value) || std::isinf(float_value))
 	{
-		SPIRType type { OpTypeFloat };
+		SPIRType type{ OpTypeFloat };
 		type.basetype = is_bfloat8 ? SPIRType::FloatE5M2 : SPIRType::Half;
 		type.vecsize = 1;
 		type.columns = 1;
@@ -6306,7 +6360,7 @@ string CompilerGLSL::convert_half_to_string(const SPIRConstant &c, uint32_t col,
 	}
 	else
 	{
-		SPIRType type { OpTypeFloat };
+		SPIRType type{ OpTypeFloat };
 		type.basetype = is_bfloat8 ? SPIRType::FloatE5M2 : SPIRType::Half;
 		type.vecsize = 1;
 		type.columns = 1;
@@ -6328,8 +6382,8 @@ string CompilerGLSL::convert_float_to_string(const SPIRConstant &c, uint32_t col
 		// Use special representation.
 		if (!is_legacy())
 		{
-			SPIRType out_type { OpTypeFloat };
-			SPIRType in_type { OpTypeInt };
+			SPIRType out_type{ OpTypeFloat };
+			SPIRType in_type{ OpTypeInt };
 			out_type.basetype = SPIRType::Float;
 			in_type.basetype = SPIRType::UInt;
 			out_type.vecsize = 1;
@@ -6401,8 +6455,8 @@ std::string CompilerGLSL::convert_double_to_string(const SPIRConstant &c, uint32
 		// Use special representation.
 		if (!is_legacy())
 		{
-			SPIRType out_type { OpTypeFloat };
-			SPIRType in_type { OpTypeInt };
+			SPIRType out_type{ OpTypeFloat };
+			SPIRType in_type{ OpTypeInt };
 			out_type.basetype = SPIRType::Double;
 			in_type.basetype = SPIRType::UInt64;
 			out_type.vecsize = 1;
@@ -6910,9 +6964,8 @@ void CompilerGLSL::emit_uninitialized_temporary(uint32_t result_type, uint32_t r
 	{
 		auto &header = get<SPIRBlock>(current_continue_block->loop_dominator);
 		if (find_if(begin(header.declare_temporary), end(header.declare_temporary),
-		            [result_type, result_id](const pair<uint32_t, uint32_t> &tmp) {
-			            return tmp.first == result_type && tmp.second == result_id;
-		            }) == end(header.declare_temporary))
+		            [result_type, result_id](const pair<uint32_t, uint32_t> &tmp)
+		            { return tmp.first == result_type && tmp.second == result_id; }) == end(header.declare_temporary))
 		{
 			header.declare_temporary.emplace_back(result_type, result_id);
 			hoisted_temporaries.insert(result_id);
@@ -6931,7 +6984,8 @@ void CompilerGLSL::emit_uninitialized_temporary(uint32_t result_type, uint32_t r
 		if (options.force_zero_initialized_variables && type_can_zero_initialize(type))
 			initializer = join(" = ", to_zero_initialized_expression(result_type));
 
-		statement(flags_to_qualifiers_glsl(type, result_id, flags), variable_decl(type, to_name(result_id)), initializer, ";");
+		statement(flags_to_qualifiers_glsl(type, result_id, flags), variable_decl(type, to_name(result_id)),
+		          initializer, ";");
 	}
 }
 
@@ -6955,9 +7009,8 @@ string CompilerGLSL::declare_temporary(uint32_t result_type, uint32_t result_id)
 	{
 		auto &header = get<SPIRBlock>(current_continue_block->loop_dominator);
 		if (find_if(begin(header.declare_temporary), end(header.declare_temporary),
-		            [result_type, result_id](const pair<uint32_t, uint32_t> &tmp) {
-			            return tmp.first == result_type && tmp.second == result_id;
-		            }) == end(header.declare_temporary))
+		            [result_type, result_id](const pair<uint32_t, uint32_t> &tmp)
+		            { return tmp.first == result_type && tmp.second == result_id; }) == end(header.declare_temporary))
 		{
 			header.declare_temporary.emplace_back(result_type, result_id);
 			hoisted_temporaries.insert(result_id);
@@ -7069,23 +7122,21 @@ void CompilerGLSL::emit_unary_op_cast(uint32_t result_type, uint32_t result_id,
 {
 	auto &type = get<SPIRType>(result_type);
 	bool forward = should_forward(op0);
-	emit_op(result_type, result_id, join(type_to_glsl(type), "(", op, to_enclosed_unpacked_expression(op0), ")"), forward);
+	emit_op(result_type, result_id, join(type_to_glsl(type), "(", op, to_enclosed_unpacked_expression(op0), ")"),
+	        forward);
 	inherit_expression_dependencies(result_id, op0);
 }
 
 void CompilerGLSL::emit_mesh_tasks(SPIRBlock &block)
 {
-	statement("EmitMeshTasksEXT(",
-	          to_unpacked_expression(block.mesh.groups[0]), ", ",
-	          to_unpacked_expression(block.mesh.groups[1]), ", ",
-	          to_unpacked_expression(block.mesh.groups[2]), ");");
+	statement("EmitMeshTasksEXT(", to_unpacked_expression(block.mesh.groups[0]), ", ",
+	          to_unpacked_expression(block.mesh.groups[1]), ", ", to_unpacked_expression(block.mesh.groups[2]), ");");
 }
 
 void CompilerGLSL::emit_binary_op(uint32_t result_type, uint32_t result_id, uint32_t op0, uint32_t op1, const char *op)
 {
 	// Various FP arithmetic opcodes such as add, sub, mul will hit this.
-	bool force_temporary_precise = backend.support_precise_qualifier &&
-	                               has_legacy_nocontract(result_type, result_id) &&
+	bool force_temporary_precise = backend.support_precise_qualifier && has_legacy_nocontract(result_type, result_id) &&
 	                               type_is_floating_point(get<SPIRType>(result_type));
 	bool forward = should_forward(op0) && should_forward(op1) && !force_temporary_precise;
 
@@ -7180,7 +7231,7 @@ SPIRType CompilerGLSL::binary_op_bitcast_helper(string &cast_op0, string &cast_o
 
 	// Create a fake type so we can bitcast to it.
 	// We only deal with regular arithmetic types here like int, uints and so on.
-	SPIRType expected_type{type0.op};
+	SPIRType expected_type{ type0.op };
 	expected_type.basetype = input_type;
 	expected_type.vecsize = type0.vecsize;
 	expected_type.columns = type0.columns;
@@ -7224,8 +7275,7 @@ bool CompilerGLSL::emit_complex_bitcast(uint32_t result_type, uint32_t id, uint3
 }
 
 void CompilerGLSL::emit_binary_op_cast(uint32_t result_type, uint32_t result_id, uint32_t op0, uint32_t op1,
-                                       const char *op, SPIRType::BaseType input_type,
-                                       bool skip_cast_if_equal_type,
+                                       const char *op, SPIRType::BaseType input_type, bool skip_cast_if_equal_type,
                                        bool implicit_integer_promotion)
 {
 	string cast_op0, cast_op1;
@@ -7296,19 +7346,18 @@ void CompilerGLSL::emit_atomic_func_op(uint32_t result_type, uint32_t result_id,
 
 	forced_temporaries.insert(result_id);
 	emit_op(result_type, result_id,
-	        join(op, "(", to_atomic_ptr_expression(op0), ", ",
-	             to_unpacked_expression(op1), ")"), false);
+	        join(op, "(", to_atomic_ptr_expression(op0), ", ", to_unpacked_expression(op1), ")"), false);
 	flush_all_atomic_capable_variables();
 }
 
-void CompilerGLSL::emit_atomic_func_op(uint32_t result_type, uint32_t result_id,
-                                       uint32_t op0, uint32_t op1, uint32_t op2,
-                                       const char *op)
+void CompilerGLSL::emit_atomic_func_op(uint32_t result_type, uint32_t result_id, uint32_t op0, uint32_t op1,
+                                       uint32_t op2, const char *op)
 {
 	forced_temporaries.insert(result_id);
 	emit_op(result_type, result_id,
-	        join(op, "(", to_non_uniform_aware_expression(op0), ", ",
-	             to_unpacked_expression(op1), ", ", to_unpacked_expression(op2), ")"), false);
+	        join(op, "(", to_non_uniform_aware_expression(op0), ", ", to_unpacked_expression(op1), ", ",
+	             to_unpacked_expression(op2), ")"),
+	        false);
 	flush_all_atomic_capable_variables();
 }
 
@@ -7538,7 +7587,7 @@ void CompilerGLSL::emit_bitfield_insert_op(uint32_t result_type, uint32_t result
 	auto op3_expr = to_unpacked_expression(op3);
 
 	assert(offset_count_type == SPIRType::UInt || offset_count_type == SPIRType::Int);
-	SPIRType target_type { OpTypeInt };
+	SPIRType target_type{ OpTypeInt };
 	target_type.width = 32;
 	target_type.vecsize = 1;
 	target_type.basetype = offset_count_type;
@@ -7667,7 +7716,9 @@ string CompilerGLSL::legacy_tex_op(const std::string &op, const SPIRType &imgtyp
 		return join(type_prefix, type, "LodOffset");
 	else if (op == "textureProjGrad")
 		return join(type_prefix, type,
-		            is_legacy_es() ? "ProjGradEXT" : is_legacy_desktop() ? "ProjGradARB" : "ProjGrad");
+		            is_legacy_es()      ? "ProjGradEXT" :
+		            is_legacy_desktop() ? "ProjGradARB" :
+		                                  "ProjGrad");
 	else if (op == "textureProjLodOffset")
 		return join(type_prefix, type, "ProjLodOffset");
 	else if (op == "textureSize")
@@ -7869,10 +7920,12 @@ string CompilerGLSL::to_combined_image_sampler(VariableID image_id, VariableID s
 		VariableID sid = global_sampler ? samp_id : VariableID(uint32_t(sampler_itr - begin(args)));
 
 		auto &combined = current_function->combined_parameters;
-		auto itr = find_if(begin(combined), end(combined), [=](const SPIRFunction::CombinedImageSamplerParameter &p) {
-			return p.global_image == global_image && p.global_sampler == global_sampler && p.image_id == iid &&
-			       p.sampler_id == sid;
-		});
+		auto itr = find_if(begin(combined), end(combined),
+		                   [=](const SPIRFunction::CombinedImageSamplerParameter &p)
+		                   {
+			                   return p.global_image == global_image && p.global_sampler == global_sampler &&
+			                          p.image_id == iid && p.sampler_id == sid;
+		                   });
 
 		if (itr != end(combined))
 			return to_expression(itr->id) + array_expr;
@@ -7887,9 +7940,8 @@ string CompilerGLSL::to_combined_image_sampler(VariableID image_id, VariableID s
 	{
 		// For global sampler2D, look directly at the global remapping table.
 		auto &mapping = combined_image_samplers;
-		auto itr = find_if(begin(mapping), end(mapping), [image_id, samp_id](const CombinedImageSampler &combined) {
-			return combined.image_id == image_id && combined.sampler_id == samp_id;
-		});
+		auto itr = find_if(begin(mapping), end(mapping), [image_id, samp_id](const CombinedImageSampler &combined)
+		                   { return combined.image_id == image_id && combined.sampler_id == samp_id; });
 
 		if (itr != end(combined_image_samplers))
 			return to_expression(itr->combined_id) + array_expr;
@@ -8208,7 +8260,8 @@ std::string CompilerGLSL::to_texture_op(const Instruction &i, bool sparse, bool
 		length--;
 	}
 
-	auto test = [&](uint32_t &v, uint32_t flag) {
+	auto test = [&](uint32_t &v, uint32_t flag)
+	{
 		if (length && (flags & flag))
 		{
 			v = *opt++;
@@ -8470,7 +8523,8 @@ string CompilerGLSL::to_function_args(const TextureFunctionArguments &args, bool
 	}
 
 	bool swizz_func = backend.swizzle_is_function;
-	auto swizzle = [swizz_func](uint32_t comps, uint32_t in_comps) -> const char * {
+	auto swizzle = [swizz_func](uint32_t comps, uint32_t in_comps) -> const char *
+	{
 		if (comps == in_comps)
 			return "";
 
@@ -8608,9 +8662,8 @@ string CompilerGLSL::to_function_args(const TextureFunctionArguments &args, bool
 			{
 				if (imgtype.image.arrayed)
 				{
-					coord_expr = join("ivec3(", enclose_expression(coord_expr),
-									  ".x, 0, ",
-									  enclose_expression(coord_expr), ".y)");
+					coord_expr = join("ivec3(", enclose_expression(coord_expr), ".x, 0, ",
+					                  enclose_expression(coord_expr), ".y)");
 				}
 				else
 					coord_expr = join("ivec2(", coord_expr, ", 0)");
@@ -8878,9 +8931,8 @@ void CompilerGLSL::emit_glsl_op(uint32_t result_type, uint32_t id, uint32_t eop,
 			auto &op1_type = expression_type(args[1]);
 			auto via_type = op1_type;
 			via_type.basetype = SPIRType::Int;
-			statement(to_expression(args[1]), " = ",
-			          type_to_glsl(op1_type), "(", type_to_glsl(via_type),
-			          "(", to_expression(args[0]), "));");
+			statement(to_expression(args[1]), " = ", type_to_glsl(op1_type), "(", type_to_glsl(via_type), "(",
+			          to_expression(args[0]), "));");
 			emit_binary_op(result_type, id, args[0], args[1], "-");
 		}
 		break;
@@ -8900,8 +8952,8 @@ void CompilerGLSL::emit_glsl_op(uint32_t result_type, uint32_t id, uint32_t eop,
 			auto &op0_type = expression_type(args[0]);
 			auto via_type = op0_type;
 			via_type.basetype = SPIRType::Int;
-			statement(to_expression(id), ".", to_member_name(type, 1), " = ", type_to_glsl(op0_type),
-			          "(", type_to_glsl(via_type), "(", to_expression(args[0]), "));");
+			statement(to_expression(id), ".", to_member_name(type, 1), " = ", type_to_glsl(op0_type), "(",
+			          type_to_glsl(via_type), "(", to_expression(args[0]), "));");
 			statement(to_expression(id), ".", to_member_name(type, 0), " = ", to_enclosed_expression(args[0]), " - ",
 			          to_expression(id), ".", to_member_name(type, 1), ";");
 		}
@@ -9012,8 +9064,10 @@ void CompilerGLSL::emit_glsl_op(uint32_t result_type, uint32_t id, uint32_t eop,
 			inherit_expression_dependencies(epos_id, args[0]);
 			inherit_expression_dependencies(eneg_id, args[0]);
 
-			auto expr = join("(", to_enclosed_expression(epos_id), " - ", to_enclosed_expression(eneg_id), ") / "
-			                 "(", to_enclosed_expression(epos_id), " + ", to_enclosed_expression(eneg_id), ")");
+			auto expr = join("(", to_enclosed_expression(epos_id), " - ", to_enclosed_expression(eneg_id),
+			                 ") / "
+			                 "(",
+			                 to_enclosed_expression(epos_id), " + ", to_enclosed_expression(eneg_id), ")");
 			emit_op(result_type, id, expr, true);
 			inherit_expression_dependencies(id, epos_id);
 			inherit_expression_dependencies(id, eneg_id);
@@ -9084,8 +9138,7 @@ void CompilerGLSL::emit_glsl_op(uint32_t result_type, uint32_t id, uint32_t eop,
 				SPIRV_CROSS_THROW("Unsupported type for matrix determinant");
 
 			bool relaxed = has_decoration(id, DecorationRelaxedPrecision);
-			require_polyfill(static_cast<Polyfill>(PolyfillDeterminant2x2 << (type.vecsize - 2)),
-			                 relaxed);
+			require_polyfill(static_cast<Polyfill>(PolyfillDeterminant2x2 << (type.vecsize - 2)), relaxed);
 			emit_unary_func_op(result_type, id, args[0],
 			                   (options.es && relaxed) ? "spvDeterminantMP" : "spvDeterminant");
 		}
@@ -9118,8 +9171,7 @@ void CompilerGLSL::emit_glsl_op(uint32_t result_type, uint32_t id, uint32_t eop,
 				SPIRV_CROSS_THROW("Unsupported type for matrix inverse");
 
 			bool relaxed = has_decoration(id, DecorationRelaxedPrecision);
-			require_polyfill(static_cast<Polyfill>(PolyfillMatrixInverse2x2 << (type.vecsize - 2)),
-			                 relaxed);
+			require_polyfill(static_cast<Polyfill>(PolyfillMatrixInverse2x2 << (type.vecsize - 2)), relaxed);
 			func = (options.es && relaxed) ? "spvInverseMP" : "spvInverse";
 		}
 
@@ -9353,7 +9405,8 @@ void CompilerGLSL::emit_glsl_op(uint32_t result_type, uint32_t id, uint32_t eop,
 			if (relaxed)
 				require_polyfill(poly, false);
 
-			emit_trinary_func_op(result_type, id, args[0], args[1], args[2], relaxed ? "spvNClampRelaxed" : "spvNClamp");
+			emit_trinary_func_op(result_type, id, args[0], args[1], args[2],
+			                     relaxed ? "spvNClampRelaxed" : "spvNClamp");
 		}
 		else
 		{
@@ -9430,20 +9483,21 @@ void CompilerGLSL::emit_emulated_ahyper_op(uint32_t result_type, uint32_t id, ui
 	switch (op)
 	{
 	case GLSLstd450Asinh:
-		expr = join("log(", to_enclosed_expression(op0), " + sqrt(",
-		            to_enclosed_expression(op0), " * ", to_enclosed_expression(op0), " + ", one, "))");
+		expr = join("log(", to_enclosed_expression(op0), " + sqrt(", to_enclosed_expression(op0), " * ",
+		            to_enclosed_expression(op0), " + ", one, "))");
 		emit_op(result_type, id, expr, forward);
 		break;
 
 	case GLSLstd450Acosh:
-		expr = join("log(", to_enclosed_expression(op0), " + sqrt(",
-		            to_enclosed_expression(op0), " * ", to_enclosed_expression(op0), " - ", one, "))");
+		expr = join("log(", to_enclosed_expression(op0), " + sqrt(", to_enclosed_expression(op0), " * ",
+		            to_enclosed_expression(op0), " - ", one, "))");
 		break;
 
 	case GLSLstd450Atanh:
-		expr = join("log((", one, " + ", to_enclosed_expression(op0), ") / "
-		            "(", one, " - ", to_enclosed_expression(op0), ")) * 0.5",
-		            backend.float_literal_suffix ? "f" : "");
+		expr = join("log((", one, " + ", to_enclosed_expression(op0),
+		            ") / "
+		            "(",
+		            one, " - ", to_enclosed_expression(op0), ")) * 0.5", backend.float_literal_suffix ? "f" : "");
 		break;
 
 	default:
@@ -9733,7 +9787,7 @@ void CompilerGLSL::emit_subgroup_op(const Instruction &i)
 	GLSL_GROUP_OP(FMul)
 
 #undef GLSL_GROUP_OP
-	// clang-format on
+		// clang-format on
 
 	case OpGroupNonUniformFMin:
 	case OpGroupNonUniformFMax:
@@ -10532,7 +10586,7 @@ const char *CompilerGLSL::index_to_swizzle(uint32_t index)
 	case 3:
 		return "w";
 	default:
-		return "x";		// Don't crash, but engage the "undefined behavior" described for out-of-bounds logical addressing in spec.
+		return "x"; // Don't crash, but engage the "undefined behavior" described for out-of-bounds logical addressing in spec.
 	}
 }
 
@@ -10544,7 +10598,8 @@ void CompilerGLSL::access_chain_internal_append_index(std::string &expr, uint32_
 	bool ptr_chain = (flags & ACCESS_CHAIN_PTR_CHAIN_BIT) != 0;
 	bool register_expression_read = (flags & ACCESS_CHAIN_SKIP_REGISTER_EXPRESSION_READ_BIT) == 0;
 
-	string idx_expr = index_is_literal ? convert_to_string(index) : to_unpacked_expression(index, register_expression_read);
+	string idx_expr =
+	    index_is_literal ? convert_to_string(index) : to_unpacked_expression(index, register_expression_read);
 
 	// For the case where the base of an OpPtrAccessChain already ends in [n],
 	// we need to use the index as an offset to the existing index, otherwise,
@@ -10636,7 +10691,8 @@ string CompilerGLSL::access_chain_internal(uint32_t base, const uint32_t *indice
 	// If we are translating access to a structured buffer, the first subscript '._m0' must be hidden
 	bool hide_first_subscript = count > 1 && is_user_type_structured(base);
 
-	const auto append_index = [&](uint32_t index, bool is_literal, bool is_ptr_chain = false) {
+	const auto append_index = [&](uint32_t index, bool is_literal, bool is_ptr_chain = false)
+	{
 		AccessChainFlags mod_flags = flags;
 		if (!is_literal)
 			mod_flags &= ~ACCESS_CHAIN_INDEX_IS_LITERAL_BIT;
@@ -10731,8 +10787,8 @@ string CompilerGLSL::access_chain_internal(uint32_t base, const uint32_t *indice
 					if (flags & ACCESS_CHAIN_PTR_CHAIN_CAST_TO_SCALAR_BIT)
 					{
 						is_packed = true;
-						expr = join("*reinterpret_cast<device packed_", type_to_glsl(pointee_type),
-						            " *>(", intptr_expr, ")");
+						expr = join("*reinterpret_cast<device packed_", type_to_glsl(pointee_type), " *>(", intptr_expr,
+						            ")");
 					}
 					else
 					{
@@ -10816,7 +10872,8 @@ string CompilerGLSL::access_chain_internal(uint32_t base, const uint32_t *indice
 				case BuiltInCullPrimitiveEXT:
 				case BuiltInPrimitiveShadingRateKHR:
 					if (mesh_shader)
-						expr = join("gl_MeshPrimitivesEXT[", to_expression(index, register_expression_read), "].", expr);
+						expr =
+						    join("gl_MeshPrimitivesEXT[", to_expression(index, register_expression_read), "].", expr);
 					else
 						append_index(index, is_literal);
 					break;
@@ -10826,8 +10883,8 @@ string CompilerGLSL::access_chain_internal(uint32_t base, const uint32_t *indice
 					break;
 				}
 			}
-			else if (backend.force_merged_mesh_block && i == 0 && var &&
-			         !is_builtin_variable(*var) && var->storage == StorageClassOutput)
+			else if (backend.force_merged_mesh_block && i == 0 && var && !is_builtin_variable(*var) &&
+			         var->storage == StorageClassOutput)
 			{
 				if (is_per_primitive_variable(*var))
 					expr = join("gl_MeshPrimitivesEXT[", to_expression(index, register_expression_read), "].", expr);
@@ -10858,7 +10915,8 @@ string CompilerGLSL::access_chain_internal(uint32_t base, const uint32_t *indice
 				if (!pending_array_enclose)
 					expr += "]";
 			}
-			else if (index_is_literal || !builtin_translates_to_nonarray(BuiltIn(get_decoration(base, DecorationBuiltIn))))
+			else if (index_is_literal ||
+			         !builtin_translates_to_nonarray(BuiltIn(get_decoration(base, DecorationBuiltIn))))
 			{
 				// Some builtins are arrays in SPIR-V but not in other languages, e.g. gl_SampleMask[] is an array in SPIR-V but not in Metal.
 				// By throwing away the index, we imply the index was 0, which it must be for gl_SampleMask.
@@ -10977,6 +11035,8 @@ string CompilerGLSL::access_chain_internal(uint32_t base, const uint32_t *indice
 			// is used to store a column. We can resolve it right here and now if we access a scalar directly,
 			// by flipping indexing order of the matrix.
 
+			if (!backend.matrix_column_accessor.empty())
+				expr += "." + backend.matrix_column_accessor;
 			expr += "[";
 			if (is_literal)
 				expr += convert_to_string(index);
@@ -11016,8 +11076,8 @@ string CompilerGLSL::access_chain_internal(uint32_t base, const uint32_t *indice
 						// E.g. [0].data followed by [1] would be shuffled to [1][0].data which is wrong,
 						// and needs to be [1].data[0] instead.
 						end_deferred_index++;
-						deferred_index = deferred_index.substr(end_deferred_index) +
-						                 deferred_index.substr(0, end_deferred_index);
+						deferred_index =
+						    deferred_index.substr(end_deferred_index) + deferred_index.substr(0, end_deferred_index);
 					}
 
 					expr.resize(column_index);
@@ -11742,9 +11802,8 @@ bool CompilerGLSL::should_forward(uint32_t id) const
 	if (expr && expr->expression_dependencies.size() >= max_expression_dependencies)
 		return false;
 
-	if (expr && expr->loaded_from
-		&& has_decoration(expr->loaded_from, DecorationBuiltIn)
-		&& has_decoration(expr->loaded_from, DecorationVolatile))
+	if (expr && expr->loaded_from && has_decoration(expr->loaded_from, DecorationBuiltIn) &&
+	    has_decoration(expr->loaded_from, DecorationVolatile))
 	{
 		// Never forward volatile builtin variables, e.g. SPIR-V 1.6 HelperInvocation.
 		return false;
@@ -11873,7 +11932,8 @@ void CompilerGLSL::emit_variable_temporary_copies(const SPIRVariable &var)
 	{
 		auto &type = get<SPIRType>(var.basetype);
 		auto &flags = get_decoration_bitset(var.self);
-		statement(flags_to_qualifiers_glsl(type, var.self, flags), variable_decl(type, join("_", var.self, "_copy")), ";");
+		statement(flags_to_qualifiers_glsl(type, var.self, flags), variable_decl(type, join("_", var.self, "_copy")),
+		          ";");
 		flushed_phi_variables.insert(var.self);
 	}
 }
@@ -12605,9 +12665,9 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction)
 	uint32_t length = instruction.length;
 
 #define GLSL_BOP(op) emit_binary_op(ops[0], ops[1], ops[2], ops[3], #op)
-#define GLSL_BOP_CAST(op, type) \
-	emit_binary_op_cast(ops[0], ops[1], ops[2], ops[3], #op, type, \
-	                    opcode_is_sign_invariant(opcode), implicit_integer_promotion)
+#define GLSL_BOP_CAST(op, type)                                                                      \
+	emit_binary_op_cast(ops[0], ops[1], ops[2], ops[3], #op, type, opcode_is_sign_invariant(opcode), \
+	                    implicit_integer_promotion)
 #define GLSL_UOP(op) emit_unary_op(ops[0], ops[1], ops[2], #op)
 #define GLSL_UOP_CAST(op) emit_unary_op_cast(ops[0], ops[1], ops[2], #op)
 #define GLSL_QFOP(op) emit_quaternary_func_op(ops[0], ops[1], ops[2], ops[3], ops[4], ops[5], #op)
@@ -12791,9 +12851,8 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction)
 		if (flattened_buffer_blocks.count(ops[2]) && target_type.basetype == SPIRType::Struct)
 			requires_temporary = !backend.can_declare_struct_inline;
 
-		auto &expr = requires_temporary ?
-                         emit_op(ops[0], ops[1], std::move(e), false) :
-                         set<SPIRExpression>(ops[1], std::move(e), ops[0], should_forward(ops[2]));
+		auto &expr = requires_temporary ? emit_op(ops[0], ops[1], std::move(e), false) :
+		                                  set<SPIRExpression>(ops[1], std::move(e), ops[0], should_forward(ops[2]));
 
 		auto *backing_variable = maybe_get_backing_variable(ops[2]);
 		expr.loaded_from = backing_variable ? backing_variable->self : ID(ops[2]);
@@ -13163,7 +13222,8 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction)
 			auto expr = to_extract_constant_composite_expression(result_type, *c, ops + 3, length);
 			e = &emit_op(result_type, id, expr, true, true);
 		}
-		else if (allow_base_expression && should_forward(ops[2]) && type.vecsize == 1 && type.columns == 1 && length == 1)
+		else if (allow_base_expression && should_forward(ops[2]) && type.vecsize == 1 && type.columns == 1 &&
+		         length == 1)
 		{
 			// Only apply this optimization if result is scalar.
 
@@ -13181,7 +13241,8 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction)
 			// from expression causing it to be forced to an actual temporary in GLSL.
 			auto expr = access_chain_internal(ops[2], &ops[3], length,
 			                                  ACCESS_CHAIN_INDEX_IS_LITERAL_BIT | ACCESS_CHAIN_CHAIN_ONLY_BIT |
-			                                  ACCESS_CHAIN_FORCE_COMPOSITE_BIT, &meta);
+			                                      ACCESS_CHAIN_FORCE_COMPOSITE_BIT,
+			                                  &meta);
 			e = &emit_op(result_type, id, expr, true, should_suppress_usage_tracking(ops[2]));
 			inherit_expression_dependencies(id, ops[2]);
 			e->base_expression = ops[2];
@@ -13191,8 +13252,8 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction)
 		}
 		else
 		{
-			auto expr = access_chain_internal(ops[2], &ops[3], length,
-			                                  ACCESS_CHAIN_INDEX_IS_LITERAL_BIT | ACCESS_CHAIN_FORCE_COMPOSITE_BIT, &meta);
+			auto expr = access_chain_internal(
+			    ops[2], &ops[3], length, ACCESS_CHAIN_INDEX_IS_LITERAL_BIT | ACCESS_CHAIN_FORCE_COMPOSITE_BIT, &meta);
 			e = &emit_op(result_type, id, expr, should_forward(ops[2]), should_suppress_usage_tracking(ops[2]));
 			inherit_expression_dependencies(id, ops[2]);
 		}
@@ -13236,8 +13297,7 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction)
 		// that loop variable, since we won't be able to override the expression after the fact.
 		// If the composite is hoisted, we might never be able to properly invalidate any usage
 		// of that composite in a subsequent loop iteration.
-		if (invalid_expressions.count(composite) ||
-		    block_composite_insert_overwrite.count(composite) ||
+		if (invalid_expressions.count(composite) || block_composite_insert_overwrite.count(composite) ||
 		    hoisted_temporaries.count(id) || hoisted_temporaries.count(composite) ||
 		    maybe_get<SPIRExpression>(composite) == nullptr)
 		{
@@ -13245,7 +13305,7 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction)
 		}
 		else if (backend.requires_relaxed_precision_analysis &&
 		         has_decoration(composite, DecorationRelaxedPrecision) !=
-		         has_decoration(id, DecorationRelaxedPrecision) &&
+		             has_decoration(id, DecorationRelaxedPrecision) &&
 		         get<SPIRType>(result_type).basetype != SPIRType::Struct)
 		{
 			// Similarly, if precision does not match for input and output,
@@ -13863,9 +13923,9 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction)
 			auto &op0_type = expression_type(op0);
 			auto via_type = op0_type;
 			via_type.basetype = SPIRType::Int;
-			expr = join(to_enclosed_expression(op0), " - ", to_enclosed_expression(op1), " * ",
-			            type_to_glsl(op0_type), "(", type_to_glsl(via_type),  "(",
-			            to_enclosed_expression(op0), " / ", to_enclosed_expression(op1), "))");
+			expr = join(to_enclosed_expression(op0), " - ", to_enclosed_expression(op1), " * ", type_to_glsl(op0_type),
+			            "(", type_to_glsl(via_type), "(", to_enclosed_expression(op0), " / ",
+			            to_enclosed_expression(op1), "))");
 		}
 
 		emit_op(result_type, result_id, expr, forward);
@@ -13919,7 +13979,7 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction)
 	{
 		auto &type = get<SPIRType>(ops[0]);
 		if (type.vecsize > 1)
-			GLSL_UFOP(not );
+			GLSL_UFOP(not);
 		else
 			GLSL_UOP(!);
 		break;
@@ -14337,9 +14397,7 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction)
 		                     (atomic_image && get<SPIRType>(type.image.type).basetype == SPIRType::UInt);
 		const char *op = atomic_image ? "imageAtomicAdd" : "atomicAdd";
 		const char *increment = unsigned_type ? "0u" : "0";
-		emit_op(ops[0], ops[1],
-		        join(op, "(",
-		             to_atomic_ptr_expression(ops[2]), ", ", increment, ")"), false);
+		emit_op(ops[0], ops[1], join(op, "(", to_atomic_ptr_expression(ops[2]), ", ", increment, ")"), false);
 		flush_all_atomic_capable_variables();
 
 		if (type.basetype == SPIRType::UInt64 || type.basetype == SPIRType::Int64)
@@ -14394,8 +14452,7 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction)
 			else
 				increment = "-1";
 
-			emit_op(ops[0], ops[1],
-			        join(op, "(", to_atomic_ptr_expression(ops[2]), ", ", increment, ")"), false);
+			emit_op(ops[0], ops[1], join(op, "(", to_atomic_ptr_expression(ops[2]), ", ", increment, ")"), false);
 
 			if (type.basetype == SPIRType::UInt64 || type.basetype == SPIRType::Int64)
 				require_extension_internal("GL_EXT_shader_atomic_int64");
@@ -14582,9 +14639,7 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction)
 		}
 
 		bool forward = should_forward(ops[3]);
-		emit_op(ops[0], ops[1],
-		        join(op, "(", sampler_expr, ", ", to_unpacked_expression(ops[3]), ")"),
-		        forward);
+		emit_op(ops[0], ops[1], join(op, "(", sampler_expr, ", ", to_unpacked_expression(ops[3]), ")"), forward);
 		inherit_expression_dependencies(ops[1], ops[2]);
 		inherit_expression_dependencies(ops[1], ops[3]);
 		register_control_dependent_expression(ops[1]);
@@ -14737,7 +14792,8 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction)
 						                  "operand mask was used.");
 
 					uint32_t samples = ops[5];
-					imgexpr = join("subpassLoad(", to_non_uniform_aware_expression(ops[2]), ", ", to_expression(samples), ")");
+					imgexpr = join("subpassLoad(", to_non_uniform_aware_expression(ops[2]), ", ",
+					               to_expression(samples), ")");
 				}
 				else
 					imgexpr = join("subpassLoad(", to_non_uniform_aware_expression(ops[2]), ")");
@@ -14758,7 +14814,8 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction)
 				else
 				{
 					// Implement subpass loads via texture barrier style sampling.
-					imgexpr = join("texelFetch(", to_non_uniform_aware_expression(ops[2]), ", ivec2(gl_FragCoord.xy), 0)");
+					imgexpr =
+					    join("texelFetch(", to_non_uniform_aware_expression(ops[2]), ", ivec2(gl_FragCoord.xy), 0)");
 				}
 			}
 			imgexpr = remap_swizzle(get<SPIRType>(result_type), 4, imgexpr);
@@ -14793,13 +14850,15 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction)
 						                  "operand mask was used.");
 
 					uint32_t samples = ops[5];
-					statement(to_expression(sparse_code_id), " = sparseImageLoadARB(", to_non_uniform_aware_expression(ops[2]), ", ",
-					          coord_expr, ", ", to_expression(samples), ", ", to_expression(sparse_texel_id), ");");
+					statement(to_expression(sparse_code_id), " = sparseImageLoadARB(",
+					          to_non_uniform_aware_expression(ops[2]), ", ", coord_expr, ", ", to_expression(samples),
+					          ", ", to_expression(sparse_texel_id), ");");
 				}
 				else
 				{
-					statement(to_expression(sparse_code_id), " = sparseImageLoadARB(", to_non_uniform_aware_expression(ops[2]), ", ",
-					          coord_expr, ", ", to_expression(sparse_texel_id), ");");
+					statement(to_expression(sparse_code_id), " = sparseImageLoadARB(",
+					          to_non_uniform_aware_expression(ops[2]), ", ", coord_expr, ", ",
+					          to_expression(sparse_texel_id), ");");
 				}
 				imgexpr = join(type_to_glsl(get<SPIRType>(result_type)), "(", to_expression(sparse_code_id), ", ",
 				               to_expression(sparse_texel_id), ")");
@@ -14814,8 +14873,8 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction)
 						                  "operand mask was used.");
 
 					uint32_t samples = ops[5];
-					imgexpr =
-					    join("imageLoad(", to_non_uniform_aware_expression(ops[2]), ", ", coord_expr, ", ", to_expression(samples), ")");
+					imgexpr = join("imageLoad(", to_non_uniform_aware_expression(ops[2]), ", ", coord_expr, ", ",
+					               to_expression(samples), ")");
 				}
 				else
 					imgexpr = join("imageLoad(", to_non_uniform_aware_expression(ops[2]), ", ", coord_expr, ")");
@@ -14904,7 +14963,8 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction)
 			if (operands != ImageOperandsSampleMask || length != 5)
 				SPIRV_CROSS_THROW("Multisampled image used in OpImageWrite, but unexpected operand mask was used.");
 			uint32_t samples = ops[4];
-			statement("imageStore(", to_non_uniform_aware_expression(ops[0]), ", ", coord_expr, ", ", to_expression(samples), ", ",
+			statement("imageStore(", to_non_uniform_aware_expression(ops[0]), ", ", coord_expr, ", ",
+			          to_expression(samples), ", ",
 			          remap_swizzle(store_type, value_type.vecsize, to_expression(ops[2])), ");");
 		}
 		else
@@ -15231,8 +15291,7 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction)
 	{
 		uint32_t extension_set = ops[2];
 		auto ext = get<SPIRExtension>(extension_set).ext;
-		if (ext != SPIRExtension::SPV_debug_info &&
-		    ext != SPIRExtension::NonSemanticShaderDebugInfo &&
+		if (ext != SPIRExtension::SPV_debug_info && ext != SPIRExtension::NonSemanticShaderDebugInfo &&
 		    ext != SPIRExtension::NonSemanticGeneric)
 		{
 			SPIRV_CROSS_THROW("Unexpected use of ExtInstWithForwardRefsKHR.");
@@ -15270,8 +15329,7 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction)
 		{
 			emit_non_semantic_shader_debug_info(ops[0], ops[1], ops[3], &ops[4], length - 4);
 		}
-		else if (ext == SPIRExtension::SPV_debug_info ||
-		         ext == SPIRExtension::NonSemanticGeneric)
+		else if (ext == SPIRExtension::SPV_debug_info || ext == SPIRExtension::NonSemanticGeneric)
 		{
 			break; // Ignore SPIR-V debug information extended instructions.
 		}
@@ -15594,19 +15652,20 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction)
 		flush_control_dependent_expressions(current_emitting_block->self);
 		break;
 	case OpTraceNV:
-		statement("traceNV(", to_non_uniform_aware_expression(ops[0]), ", ", to_expression(ops[1]), ", ", to_expression(ops[2]), ", ",
-		          to_expression(ops[3]), ", ", to_expression(ops[4]), ", ", to_expression(ops[5]), ", ",
-		          to_expression(ops[6]), ", ", to_expression(ops[7]), ", ", to_expression(ops[8]), ", ",
-		          to_expression(ops[9]), ", ", to_expression(ops[10]), ");");
+		statement("traceNV(", to_non_uniform_aware_expression(ops[0]), ", ", to_expression(ops[1]), ", ",
+		          to_expression(ops[2]), ", ", to_expression(ops[3]), ", ", to_expression(ops[4]), ", ",
+		          to_expression(ops[5]), ", ", to_expression(ops[6]), ", ", to_expression(ops[7]), ", ",
+		          to_expression(ops[8]), ", ", to_expression(ops[9]), ", ", to_expression(ops[10]), ");");
 		flush_control_dependent_expressions(current_emitting_block->self);
 		break;
 	case OpTraceRayKHR:
 		if (!has_decoration(ops[10], DecorationLocation))
 			SPIRV_CROSS_THROW("A memory declaration object must be used in TraceRayKHR.");
-		statement("traceRayEXT(", to_non_uniform_aware_expression(ops[0]), ", ", to_expression(ops[1]), ", ", to_expression(ops[2]), ", ",
-		          to_expression(ops[3]), ", ", to_expression(ops[4]), ", ", to_expression(ops[5]), ", ",
-		          to_expression(ops[6]), ", ", to_expression(ops[7]), ", ", to_expression(ops[8]), ", ",
-		          to_expression(ops[9]), ", ", get_decoration(ops[10], DecorationLocation), ");");
+		statement("traceRayEXT(", to_non_uniform_aware_expression(ops[0]), ", ", to_expression(ops[1]), ", ",
+		          to_expression(ops[2]), ", ", to_expression(ops[3]), ", ", to_expression(ops[4]), ", ",
+		          to_expression(ops[5]), ", ", to_expression(ops[6]), ", ", to_expression(ops[7]), ", ",
+		          to_expression(ops[8]), ", ", to_expression(ops[9]), ", ", get_decoration(ops[10], DecorationLocation),
+		          ");");
 		flush_control_dependent_expressions(current_emitting_block->self);
 		break;
 	case OpExecuteCallableNV:
@@ -15623,11 +15682,9 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction)
 		// Don't bother forwarding temporaries. Avoids having to test expression invalidation with ray query objects.
 	case OpRayQueryInitializeKHR:
 		flush_variable_declaration(ops[0]);
-		statement("rayQueryInitializeEXT(",
-		          to_expression(ops[0]), ", ", to_expression(ops[1]), ", ",
-		          to_expression(ops[2]), ", ", to_expression(ops[3]), ", ",
-		          to_expression(ops[4]), ", ", to_expression(ops[5]), ", ",
-		          to_expression(ops[6]), ", ", to_expression(ops[7]), ");");
+		statement("rayQueryInitializeEXT(", to_expression(ops[0]), ", ", to_expression(ops[1]), ", ",
+		          to_expression(ops[2]), ", ", to_expression(ops[3]), ", ", to_expression(ops[4]), ", ",
+		          to_expression(ops[5]), ", ", to_expression(ops[6]), ", ", to_expression(ops[7]), ");");
 		break;
 	case OpRayQueryProceedKHR:
 		flush_variable_declaration(ops[0]);
@@ -15648,41 +15705,47 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction)
 	case OpRayQueryGetIntersectionTriangleVertexPositionsKHR:
 		flush_variable_declaration(ops[1]);
 		emit_uninitialized_temporary_expression(ops[0], ops[1]);
-		statement("rayQueryGetIntersectionTriangleVertexPositionsEXT(", to_expression(ops[2]), ", bool(", to_expression(ops[3]), "), ", to_expression(ops[1]), ");");
+		statement("rayQueryGetIntersectionTriangleVertexPositionsEXT(", to_expression(ops[2]), ", bool(",
+		          to_expression(ops[3]), "), ", to_expression(ops[1]), ");");
 		break;
-#define GLSL_RAY_QUERY_GET_OP(op) \
-	case OpRayQueryGet##op##KHR: \
-		flush_variable_declaration(ops[2]); \
+#define GLSL_RAY_QUERY_GET_OP(op)                                                                   \
+	case OpRayQueryGet##op##KHR:                                                                    \
+		flush_variable_declaration(ops[2]);                                                         \
 		emit_op(ops[0], ops[1], join("rayQueryGet" #op "EXT(", to_expression(ops[2]), ")"), false); \
 		break
-#define GLSL_RAY_QUERY_GET_OP2(op) \
-	case OpRayQueryGet##op##KHR: \
-		flush_variable_declaration(ops[2]); \
-		emit_op(ops[0], ops[1], join("rayQueryGet" #op "EXT(", to_expression(ops[2]), ", ", "bool(", to_expression(ops[3]), "))"), false); \
+#define GLSL_RAY_QUERY_GET_OP2(op)                                                                                 \
+	case OpRayQueryGet##op##KHR:                                                                                   \
+		flush_variable_declaration(ops[2]);                                                                        \
+		emit_op(ops[0], ops[1],                                                                                    \
+		        join("rayQueryGet" #op "EXT(", to_expression(ops[2]), ", ", "bool(", to_expression(ops[3]), "))"), \
+		        false);                                                                                            \
 		break
-	GLSL_RAY_QUERY_GET_OP(RayTMin);
-	GLSL_RAY_QUERY_GET_OP(RayFlags);
-	GLSL_RAY_QUERY_GET_OP(WorldRayOrigin);
-	GLSL_RAY_QUERY_GET_OP(WorldRayDirection);
-	GLSL_RAY_QUERY_GET_OP(IntersectionCandidateAABBOpaque);
-	GLSL_RAY_QUERY_GET_OP2(IntersectionType);
-	GLSL_RAY_QUERY_GET_OP2(IntersectionT);
-	GLSL_RAY_QUERY_GET_OP2(IntersectionInstanceCustomIndex);
-	GLSL_RAY_QUERY_GET_OP2(IntersectionInstanceId);
-	GLSL_RAY_QUERY_GET_OP2(IntersectionInstanceShaderBindingTableRecordOffset);
-	GLSL_RAY_QUERY_GET_OP2(IntersectionGeometryIndex);
-	GLSL_RAY_QUERY_GET_OP2(IntersectionPrimitiveIndex);
-	GLSL_RAY_QUERY_GET_OP2(IntersectionBarycentrics);
-	GLSL_RAY_QUERY_GET_OP2(IntersectionFrontFace);
-	GLSL_RAY_QUERY_GET_OP2(IntersectionObjectRayDirection);
-	GLSL_RAY_QUERY_GET_OP2(IntersectionObjectRayOrigin);
-	GLSL_RAY_QUERY_GET_OP2(IntersectionObjectToWorld);
-	GLSL_RAY_QUERY_GET_OP2(IntersectionWorldToObject);
+		GLSL_RAY_QUERY_GET_OP(RayTMin);
+		GLSL_RAY_QUERY_GET_OP(RayFlags);
+		GLSL_RAY_QUERY_GET_OP(WorldRayOrigin);
+		GLSL_RAY_QUERY_GET_OP(WorldRayDirection);
+		GLSL_RAY_QUERY_GET_OP(IntersectionCandidateAABBOpaque);
+		GLSL_RAY_QUERY_GET_OP2(IntersectionType);
+		GLSL_RAY_QUERY_GET_OP2(IntersectionT);
+		GLSL_RAY_QUERY_GET_OP2(IntersectionInstanceCustomIndex);
+		GLSL_RAY_QUERY_GET_OP2(IntersectionInstanceId);
+		GLSL_RAY_QUERY_GET_OP2(IntersectionInstanceShaderBindingTableRecordOffset);
+		GLSL_RAY_QUERY_GET_OP2(IntersectionGeometryIndex);
+		GLSL_RAY_QUERY_GET_OP2(IntersectionPrimitiveIndex);
+		GLSL_RAY_QUERY_GET_OP2(IntersectionBarycentrics);
+		GLSL_RAY_QUERY_GET_OP2(IntersectionFrontFace);
+		GLSL_RAY_QUERY_GET_OP2(IntersectionObjectRayDirection);
+		GLSL_RAY_QUERY_GET_OP2(IntersectionObjectRayOrigin);
+		GLSL_RAY_QUERY_GET_OP2(IntersectionObjectToWorld);
+		GLSL_RAY_QUERY_GET_OP2(IntersectionWorldToObject);
 #undef GLSL_RAY_QUERY_GET_OP
 #undef GLSL_RAY_QUERY_GET_OP2
 	case OpRayQueryGetClusterIdNV:
 		flush_variable_declaration(ops[2]);
-		emit_op(ops[0], ops[1], join("rayQueryGetIntersectionClusterIdNV(", to_expression(ops[2]), ", ", "bool(", to_expression(ops[3]), "))"), false);
+		emit_op(ops[0], ops[1],
+		        join("rayQueryGetIntersectionClusterIdNV(", to_expression(ops[2]), ", ", "bool(", to_expression(ops[3]),
+		             "))"),
+		        false);
 		break;
 	case OpTensorQuerySizeARM:
 		flush_variable_declaration(ops[1]);
@@ -15694,7 +15757,7 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction)
 		flush_variable_declaration(ops[1]);
 		emit_uninitialized_temporary_expression(ops[0], ops[1]);
 
-		SmallVector<std::string> args {
+		SmallVector<std::string> args{
 			to_expression(ops[2]), // tensor
 			to_expression(ops[3]), // coordinates
 			to_expression(ops[1]), // out value
@@ -15731,7 +15794,7 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction)
 	{
 		flush_variable_declaration(ops[0]);
 
-		SmallVector<std::string> args {
+		SmallVector<std::string> args{
 			to_expression(ops[0]), // tensor
 			to_expression(ops[1]), // coordinates
 			to_expression(ops[2]), // out value
@@ -15759,8 +15822,8 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction)
 	{
 		require_extension_internal("GL_EXT_ray_tracing");
 
-		bool elide_temporary = should_forward(ops[2]) && forced_temporaries.count(ops[1]) == 0 &&
-		                       !hoisted_temporaries.count(ops[1]);
+		bool elide_temporary =
+		    should_forward(ops[2]) && forced_temporaries.count(ops[1]) == 0 && !hoisted_temporaries.count(ops[1]);
 
 		if (elide_temporary)
 		{
@@ -15930,14 +15993,15 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction)
 		auto matrix_layout_id = ops[4];
 		auto matrix_iterpretation_id = ops[5];
 		auto matrix_stride_id = length >= 6 ? ops[6] : 0;
-		statement(join("coopVecOuterProductAccumulateNV(", to_expression(v1), ", ", to_expression(v2), ", ",
-		               to_expression(buf), ", ", to_expression(offset), ", ",
-		               matrix_stride_id ? to_expression(matrix_stride_id) : "0",
-					   ", ", to_pretty_expression_if_int_constant(
-							   matrix_layout_id, std::begin(CoopVecMatrixLayoutNames), std::end(CoopVecMatrixLayoutNames)),
-		               ", ", to_pretty_expression_if_int_constant(
-							   matrix_iterpretation_id, std::begin(CoopVecComponentTypeNames), std::end(CoopVecComponentTypeNames)),
-		               ");"));
+		statement(join(
+		    "coopVecOuterProductAccumulateNV(", to_expression(v1), ", ", to_expression(v2), ", ", to_expression(buf),
+		    ", ", to_expression(offset), ", ", matrix_stride_id ? to_expression(matrix_stride_id) : "0", ", ",
+		    to_pretty_expression_if_int_constant(matrix_layout_id, std::begin(CoopVecMatrixLayoutNames),
+		                                         std::end(CoopVecMatrixLayoutNames)),
+		    ", ",
+		    to_pretty_expression_if_int_constant(matrix_iterpretation_id, std::begin(CoopVecComponentTypeNames),
+		                                         std::end(CoopVecComponentTypeNames)),
+		    ");"));
 		register_write(ops[0]);
 		break;
 	}
@@ -15978,14 +16042,14 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction)
 			// arguments 3, 6 and in case of MulAddNv also 9 use component type int constants
 			if (i == 3 || i == 6 || (i == 9 && opcode == OpCooperativeVectorMatrixMulAddNV))
 			{
-				stmt += to_pretty_expression_if_int_constant(
-						ops[i], std::begin(CoopVecComponentTypeNames), std::end(CoopVecComponentTypeNames));
+				stmt += to_pretty_expression_if_int_constant(ops[i], std::begin(CoopVecComponentTypeNames),
+				                                             std::end(CoopVecComponentTypeNames));
 			}
 			else if ((i == 12 && opcode == OpCooperativeVectorMatrixMulAddNV) ||
 			         (i == 9 && opcode == OpCooperativeVectorMatrixMulNV))
 			{
-				stmt += to_pretty_expression_if_int_constant(
-						ops[i], std::begin(CoopVecMatrixLayoutNames), std::end(CoopVecMatrixLayoutNames));
+				stmt += to_pretty_expression_if_int_constant(ops[i], std::begin(CoopVecMatrixLayoutNames),
+				                                             std::end(CoopVecMatrixLayoutNames));
 			}
 			else
 				stmt += to_expression(ops[i]);
@@ -16004,9 +16068,9 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction)
 		uint32_t result_type = ops[0];
 		uint32_t id = ops[1];
 		set<SPIRExpression>(
-				id, join(type_to_glsl(get<SPIRType>(result_type)),
-				         "(", type_to_glsl(get<SPIRType>(ops[2])), "(0).length())"),
-				result_type, true);
+		    id,
+		    join(type_to_glsl(get<SPIRType>(result_type)), "(", type_to_glsl(get<SPIRType>(ops[2])), "(0).length())"),
+		    result_type, true);
 		break;
 	}
 
@@ -16025,8 +16089,8 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction)
 		if (!is_forcing_recompilation())
 			split_expr = split_coopmat_pointer(expr);
 
-		string layout_expr = to_pretty_expression_if_int_constant(
-				ops[3], std::begin(CoopMatMatrixLayoutNames), std::end(CoopMatMatrixLayoutNames));
+		string layout_expr = to_pretty_expression_if_int_constant(ops[3], std::begin(CoopMatMatrixLayoutNames),
+		                                                          std::end(CoopMatMatrixLayoutNames));
 		statement("coopMatLoad(", to_expression(id), ", ", split_expr.first, ", ", split_expr.second, ", ",
 		          to_expression(ops[4]), ", ", layout_expr, ");");
 
@@ -16048,8 +16112,8 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction)
 		if (!is_forcing_recompilation())
 			split_expr = split_coopmat_pointer(expr);
 
-		string layout_expr = to_pretty_expression_if_int_constant(
-				ops[2], std::begin(CoopMatMatrixLayoutNames), std::end(CoopMatMatrixLayoutNames));
+		string layout_expr = to_pretty_expression_if_int_constant(ops[2], std::begin(CoopMatMatrixLayoutNames),
+		                                                          std::end(CoopMatMatrixLayoutNames));
 
 		statement("coopMatStore(", to_expression(ops[1]), ", ", split_expr.first, ", ", split_expr.second, ", ",
 		          to_expression(ops[3]), ", ", layout_expr, ");");
@@ -16069,12 +16133,8 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction)
 		uint32_t C = ops[4];
 		bool forward = should_forward(A) && should_forward(B) && should_forward(C);
 		emit_op(result_type, id,
-		        join("coopMatMulAdd(",
-		             to_unpacked_expression(A), ", ",
-		             to_unpacked_expression(B), ", ",
-		             to_unpacked_expression(C), ", ",
-		             (length >= 6 ? ops[5] : 0),
-		             ")"),
+		        join("coopMatMulAdd(", to_unpacked_expression(A), ", ", to_unpacked_expression(B), ", ",
+		             to_unpacked_expression(C), ", ", (length >= 6 ? ops[5] : 0), ")"),
 		        forward);
 
 		inherit_expression_dependencies(id, A);
@@ -16279,8 +16339,7 @@ string CompilerGLSL::convert_row_major_matrix(string exp_str, const SPIRType &ex
 			// E.g. [0].data followed by [1] would be shuffled to [1][0].data which is wrong,
 			// and needs to be [1].data[0] instead.
 			end_deferred_index++;
-			column_expr = column_expr.substr(end_deferred_index) +
-			              column_expr.substr(0, end_deferred_index);
+			column_expr = column_expr.substr(end_deferred_index) + column_expr.substr(0, end_deferred_index);
 		}
 
 		auto transposed_expr = type_to_glsl_constructor(exp_type) + "(";
@@ -16345,7 +16404,8 @@ void CompilerGLSL::emit_struct_member(const SPIRType &type, uint32_t member_type
 	if (is_block)
 		qualifiers = to_interpolation_qualifiers(memberflags);
 
-	statement(layout_for_member(type, index), qualifiers, qualifier, flags_to_qualifiers_glsl(membertype, 0, memberflags),
+	statement(layout_for_member(type, index), qualifiers, qualifier,
+	          flags_to_qualifiers_glsl(membertype, 0, memberflags),
 	          variable_decl(membertype, to_member_name(type, index)), ";");
 }
 
@@ -16365,10 +16425,9 @@ string CompilerGLSL::flags_to_qualifiers_glsl(const SPIRType &type, uint32_t id,
 	}
 
 	// Structs do not have precision qualifiers, neither do doubles (desktop only anyways, so no mediump/highp).
-	bool type_supports_precision =
-			type.basetype == SPIRType::Float || type.basetype == SPIRType::Int || type.basetype == SPIRType::UInt ||
-			type.basetype == SPIRType::Image || type.basetype == SPIRType::SampledImage ||
-			type.basetype == SPIRType::Sampler;
+	bool type_supports_precision = type.basetype == SPIRType::Float || type.basetype == SPIRType::Int ||
+	                               type.basetype == SPIRType::UInt || type.basetype == SPIRType::Image ||
+	                               type.basetype == SPIRType::SampledImage || type.basetype == SPIRType::Sampler;
 
 	if (!type_supports_precision)
 		return qual;
@@ -16533,10 +16592,8 @@ string CompilerGLSL::argument_decl(const SPIRFunction::Parameter &arg)
 	auto &type = expression_type(arg.id);
 	const char *direction = "";
 
-	if (is_pointer(type) &&
-	    (type.storage == StorageClassFunction ||
-	     type.storage == StorageClassPrivate ||
-	     type.storage == StorageClassOutput))
+	if (is_pointer(type) && (type.storage == StorageClassFunction || type.storage == StorageClassPrivate ||
+	                         type.storage == StorageClassOutput))
 	{
 		// If we're passing around block types to function, we really mean reference in a pointer sense,
 		// but DXC does not like inout for mesh blocks, so workaround that. out is technically not correct,
@@ -16648,7 +16705,7 @@ string CompilerGLSL::pls_decl(const PlsRemap &var)
 
 	auto op_and_basetype = pls_format_to_basetype(var.format);
 
-	SPIRType type { op_and_basetype.first };
+	SPIRType type{ op_and_basetype.first };
 	type.basetype = op_and_basetype.second;
 	auto vecsize = pls_format_to_components(var.format);
 	if (vecsize > 1)
@@ -16857,8 +16914,7 @@ string CompilerGLSL::image_type_glsl(const SPIRType &type, uint32_t id, bool /*m
 	}
 
 	// "Shadow" state in GLSL only exists for samplers and combined image samplers.
-	if (((type.basetype == SPIRType::SampledImage) || (type.basetype == SPIRType::Sampler)) &&
-	    is_depth_image(type, id))
+	if (((type.basetype == SPIRType::SampledImage) || (type.basetype == SPIRType::Sampler)) && is_depth_image(type, id))
 	{
 		res += "Shadow";
 
@@ -16961,7 +17017,7 @@ string CompilerGLSL::type_to_glsl(const SPIRType &type, uint32_t id)
 		if (type.ext.tensor.shape != 0)
 			SPIRV_CROSS_THROW("GLSL tensors cannot have a Shape.");
 		return join("tensorARM<", type_to_glsl(get<SPIRType>(type.ext.tensor.type)), ", ",
-								to_expression(type.ext.tensor.rank), ">");
+		            to_expression(type.ext.tensor.rank), ">");
 
 	case SPIRType::Void:
 		return "void";
@@ -17050,8 +17106,7 @@ string CompilerGLSL::type_to_glsl(const SPIRType &type, uint32_t id)
 		if (scope_expr.empty())
 			scope_expr = to_expression(coop_type->ext.cooperative.scope_id);
 
-		return join("coopmat<", type_to_glsl(get<SPIRType>(coop_type->parent_type)), ", ",
-		            scope_expr, ", ",
+		return join("coopmat<", type_to_glsl(get<SPIRType>(coop_type->parent_type)), ", ", scope_expr, ", ",
 		            to_expression(coop_type->ext.cooperative.rows_id), ", ",
 		            to_expression(coop_type->ext.cooperative.columns_id), ", ", use, ">");
 	}
@@ -17643,10 +17698,12 @@ void CompilerGLSL::flush_phi(BlockID from, BlockID to)
 				// as part of another Phi node in our target block.
 				// For this case, we will need to copy phi.function_variable to a temporary, and use that for future reads.
 				// This is judged to be extremely rare, so deal with it here using a simple, but suboptimal algorithm.
-				bool need_saved_temporary =
-				    find_if(itr + 1, end(child.phi_variables), [&](const SPIRBlock::Phi &future_phi) -> bool {
-					    return future_phi.local_variable == ID(phi.function_variable) && future_phi.parent == from;
-				    }) != end(child.phi_variables);
+				bool need_saved_temporary = find_if(itr + 1, end(child.phi_variables),
+				                                    [&](const SPIRBlock::Phi &future_phi) -> bool
+				                                    {
+					                                    return future_phi.local_variable == ID(phi.function_variable) &&
+					                                           future_phi.parent == from;
+				                                    }) != end(child.phi_variables);
 
 				if (need_saved_temporary)
 				{
@@ -18053,8 +18110,8 @@ bool CompilerGLSL::attempt_emit_loop_header(SPIRBlock &block, SPIRBlock::Method
 
 		bool condition_is_temporary = forced_temporaries.find(block.condition) == end(forced_temporaries);
 
-		bool flushes_phi = flush_phi_required(block.self, block.true_block) ||
-		                   flush_phi_required(block.self, block.false_block);
+		bool flushes_phi =
+		    flush_phi_required(block.self, block.true_block) || flush_phi_required(block.self, block.false_block);
 
 		// This can work! We only did trivial things which could be forwarded in block body!
 		if (!flushes_phi && current_count == statement_count && condition_is_temporary)
@@ -18136,8 +18193,8 @@ bool CompilerGLSL::attempt_emit_loop_header(SPIRBlock &block, SPIRBlock::Method
 
 		bool condition_is_temporary = forced_temporaries.find(child.condition) == end(forced_temporaries);
 
-		bool flushes_phi = flush_phi_required(child.self, child.true_block) ||
-		                   flush_phi_required(child.self, child.false_block);
+		bool flushes_phi =
+		    flush_phi_required(child.self, child.true_block) || flush_phi_required(child.self, child.false_block);
 
 		if (!flushes_phi && current_count == statement_count && condition_is_temporary)
 		{
@@ -18236,7 +18293,8 @@ void CompilerGLSL::emit_hoisted_temporaries(SmallVector<pair<TypeID, ID>> &tempo
 		if (options.force_zero_initialized_variables && type_can_zero_initialize(type))
 			initializer = join(" = ", to_zero_initialized_expression(tmp.first));
 
-		statement(flags_to_qualifiers_glsl(type, tmp.second, flags), variable_decl(type, to_name(tmp.second)), initializer, ";");
+		statement(flags_to_qualifiers_glsl(type, tmp.second, flags), variable_decl(type, to_name(tmp.second)),
+		          initializer, ";");
 
 		hoisted_temporaries.insert(tmp.second);
 		forced_temporaries.insert(tmp.second);
@@ -18251,8 +18309,7 @@ void CompilerGLSL::emit_hoisted_temporaries(SmallVector<pair<TypeID, ID>> &tempo
 		{
 			uint32_t mirror_id = mirrored_precision_itr->second;
 			auto &mirror_flags = get_decoration_bitset(mirror_id);
-			statement(flags_to_qualifiers_glsl(type, mirror_id, mirror_flags),
-			          variable_decl(type, to_name(mirror_id)),
+			statement(flags_to_qualifiers_glsl(type, mirror_id, mirror_flags), variable_decl(type, to_name(mirror_id)),
 			          initializer, ";");
 			// The temporary might be read from before it's assigned, set up the expression now.
 			set<SPIRExpression>(mirror_id, to_name(mirror_id), tmp.first, true);
@@ -18302,9 +18359,8 @@ BlockID CompilerGLSL::emit_block_chain_inner(SPIRBlock &block)
 			auto mirrored_precision_itr = temporary_to_mirror_precision_alias.find(var_id);
 			if (mirrored_precision_itr != temporary_to_mirror_precision_alias.end() &&
 			    find_if(block.declare_temporary.begin(), block.declare_temporary.end(),
-			            [mirrored_precision_itr](const std::pair<TypeID, VariableID> &p) {
-			              return p.second == mirrored_precision_itr->second;
-			            }) == block.declare_temporary.end())
+			            [mirrored_precision_itr](const std::pair<TypeID, VariableID> &p)
+			            { return p.second == mirrored_precision_itr->second; }) == block.declare_temporary.end())
 			{
 				block.declare_temporary.push_back({ var.basetype, mirrored_precision_itr->second });
 			}
@@ -18612,7 +18668,8 @@ BlockID CompilerGLSL::emit_block_chain_inner(SPIRBlock &block)
 		};
 
 		const auto to_legacy_case_label = [&](uint32_t condition, const SmallVector<uint64_t> &labels,
-		                                      const char *suffix) -> string {
+		                                      const char *suffix) -> string
+		{
 			string ret;
 			size_t count = labels.size();
 			for (size_t i = 0; i < count; i++)
@@ -18889,11 +18946,10 @@ BlockID CompilerGLSL::emit_block_chain_inner(SPIRBlock &block)
 		while (id)
 		{
 			auto &iter_block = get<SPIRBlock>(id);
-			if (iter_block.terminator == SPIRBlock::MultiSelect ||
-			    iter_block.merge == SPIRBlock::MergeLoop)
+			if (iter_block.terminator == SPIRBlock::MultiSelect || iter_block.merge == SPIRBlock::MergeLoop)
 			{
-				ID next_block = iter_block.merge == SPIRBlock::MergeLoop ?
-				                iter_block.merge_block : iter_block.next_block;
+				ID next_block =
+				    iter_block.merge == SPIRBlock::MergeLoop ? iter_block.merge_block : iter_block.next_block;
 				bool outside_construct = next_block && cfg.find_common_dominator(next_block, block.self) == next_block;
 				if (!outside_construct)
 				{
@@ -19157,14 +19213,13 @@ bool CompilerGLSL::unroll_array_to_complex_store(uint32_t target_id, uint32_t so
 	else
 		array_expr = to_expression(type.array.back());
 
-	SPIRType target_type { OpTypeInt };
+	SPIRType target_type{ OpTypeInt };
 	target_type.basetype = SPIRType::Int;
 
 	statement("for (int i = 0; i < int(", array_expr, "); i++)");
 	begin_scope();
-	statement(to_expression(target_id), "[i] = ",
-	          bitcast_expression(target_type, type.basetype, join(to_expression(source_id), "[i]")),
-	          ";");
+	statement(to_expression(target_id),
+	          "[i] = ", bitcast_expression(target_type, type.basetype, join(to_expression(source_id), "[i]")), ";");
 	end_scope();
 
 	return true;
@@ -19189,9 +19244,7 @@ void CompilerGLSL::unroll_array_from_complex_load(uint32_t target_id, uint32_t s
 
 	auto builtin = BuiltIn(get_decoration(var->self, DecorationBuiltIn));
 	bool is_builtin = is_builtin_variable(*var) &&
-	                  (builtin == BuiltInPointSize ||
-	                   builtin == BuiltInPosition ||
-	                   builtin == BuiltInSampleMask);
+	                  (builtin == BuiltInPointSize || builtin == BuiltInPosition || builtin == BuiltInSampleMask);
 	bool is_tess = is_tessellation_shader();
 	bool is_patch = has_decoration(var->self, DecorationPatch);
 	bool is_sample_mask = is_builtin && builtin == BuiltInSampleMask;
@@ -19222,7 +19275,7 @@ void CompilerGLSL::unroll_array_from_complex_load(uint32_t target_id, uint32_t s
 			statement(new_expr, "[i] = gl_in[i].", expr, ";");
 		else if (is_sample_mask)
 		{
-			SPIRType target_type { OpTypeInt };
+			SPIRType target_type{ OpTypeInt };
 			target_type.basetype = SPIRType::Int;
 			statement(new_expr, "[i] = ", bitcast_expression(target_type, type.basetype, join(expr, "[i]")), ";");
 		}
@@ -19350,8 +19403,7 @@ void CompilerGLSL::convert_non_uniform_expression(string &expr, uint32_t ptr_id)
 	if (!var)
 		return;
 
-	if (var->storage != StorageClassUniformConstant &&
-	    var->storage != StorageClassStorageBuffer &&
+	if (var->storage != StorageClassUniformConstant && var->storage != StorageClassStorageBuffer &&
 	    var->storage != StorageClassUniform)
 		return;
 
@@ -19483,45 +19535,50 @@ void CompilerGLSL::fixup_anonymous_struct_names()
 	// Breaks exponential explosion with weird type trees.
 	std::unordered_set<uint32_t> visited;
 
-	ir.for_each_typed_id<SPIRType>([&](uint32_t, SPIRType &type) {
-		if (type.basetype == SPIRType::Struct &&
-		    (has_decoration(type.self, DecorationBlock) ||
-		     has_decoration(type.self, DecorationBufferBlock)))
-		{
-			fixup_anonymous_struct_names(visited, type);
-		}
-	});
+	ir.for_each_typed_id<SPIRType>(
+	    [&](uint32_t, SPIRType &type)
+	    {
+		    if (type.basetype == SPIRType::Struct &&
+		        (has_decoration(type.self, DecorationBlock) || has_decoration(type.self, DecorationBufferBlock)))
+		    {
+			    fixup_anonymous_struct_names(visited, type);
+		    }
+	    });
 }
 
 void CompilerGLSL::fixup_type_alias()
 {
 	// Due to how some backends work, the "master" type of type_alias must be a block-like type if it exists.
-	ir.for_each_typed_id<SPIRType>([&](uint32_t self, SPIRType &type) {
-		if (!type.type_alias)
-			return;
-
-		if (has_decoration(type.self, DecorationBlock) || has_decoration(type.self, DecorationBufferBlock))
-		{
-			// Top-level block types should never alias anything else.
-			type.type_alias = 0;
-		}
-		else if (type_is_block_like(type) && type.self == ID(self))
-		{
-			// A block-like type is any type which contains Offset decoration, but not top-level blocks,
-			// i.e. blocks which are placed inside buffers.
-			// Become the master.
-			ir.for_each_typed_id<SPIRType>([&](uint32_t other_id, SPIRType &other_type) {
-				if (other_id == self)
-					return;
-
-				if (other_type.type_alias == type.type_alias)
-					other_type.type_alias = self;
-			});
-
-			this->get<SPIRType>(type.type_alias).type_alias = self;
-			type.type_alias = 0;
-		}
-	});
+	ir.for_each_typed_id<SPIRType>(
+	    [&](uint32_t self, SPIRType &type)
+	    {
+		    if (!type.type_alias)
+			    return;
+
+		    if (has_decoration(type.self, DecorationBlock) || has_decoration(type.self, DecorationBufferBlock))
+		    {
+			    // Top-level block types should never alias anything else.
+			    type.type_alias = 0;
+		    }
+		    else if (type_is_block_like(type) && type.self == ID(self))
+		    {
+			    // A block-like type is any type which contains Offset decoration, but not top-level blocks,
+			    // i.e. blocks which are placed inside buffers.
+			    // Become the master.
+			    ir.for_each_typed_id<SPIRType>(
+			        [&](uint32_t other_id, SPIRType &other_type)
+			        {
+				        if (other_id == self)
+					        return;
+
+				        if (other_type.type_alias == type.type_alias)
+					        other_type.type_alias = self;
+			        });
+
+			    this->get<SPIRType>(type.type_alias).type_alias = self;
+			    type.type_alias = 0;
+		    }
+	    });
 }
 
 void CompilerGLSL::reorder_type_alias()
@@ -19577,8 +19634,8 @@ void CompilerGLSL::emit_line_directive(uint32_t file_id, uint32_t line_literal)
 	}
 }
 
-void CompilerGLSL::emit_non_semantic_shader_debug_info(uint32_t, uint32_t result_id, uint32_t eop,
-                                                       const uint32_t *args, uint32_t)
+void CompilerGLSL::emit_non_semantic_shader_debug_info(uint32_t, uint32_t result_id, uint32_t eop, const uint32_t *args,
+                                                       uint32_t)
 {
 	if (!options.emit_line_directives)
 		return;
@@ -19699,23 +19756,27 @@ bool CompilerGLSL::subpass_input_is_framebuffer_fetch(uint32_t id) const
 const SPIRVariable *CompilerGLSL::find_subpass_input_by_attachment_index(uint32_t index) const
 {
 	const SPIRVariable *ret = nullptr;
-	ir.for_each_typed_id<SPIRVariable>([&](uint32_t, const SPIRVariable &var) {
-		if (has_decoration(var.self, DecorationInputAttachmentIndex) &&
-		    get_decoration(var.self, DecorationInputAttachmentIndex) == index)
-		{
-			ret = &var;
-		}
-	});
+	ir.for_each_typed_id<SPIRVariable>(
+	    [&](uint32_t, const SPIRVariable &var)
+	    {
+		    if (has_decoration(var.self, DecorationInputAttachmentIndex) &&
+		        get_decoration(var.self, DecorationInputAttachmentIndex) == index)
+		    {
+			    ret = &var;
+		    }
+	    });
 	return ret;
 }
 
 const SPIRVariable *CompilerGLSL::find_color_output_by_location(uint32_t location) const
 {
 	const SPIRVariable *ret = nullptr;
-	ir.for_each_typed_id<SPIRVariable>([&](uint32_t, const SPIRVariable &var) {
-		if (var.storage == StorageClassOutput && get_decoration(var.self, DecorationLocation) == location)
-			ret = &var;
-	});
+	ir.for_each_typed_id<SPIRVariable>(
+	    [&](uint32_t, const SPIRVariable &var)
+	    {
+		    if (var.storage == StorageClassOutput && get_decoration(var.self, DecorationLocation) == location)
+			    ret = &var;
+	    });
 	return ret;
 }
 
@@ -19734,19 +19795,21 @@ void CompilerGLSL::emit_inout_fragment_outputs_copy_to_subpass_inputs()
 			SPIRV_CROSS_THROW("Cannot use GL_EXT_shader_framebuffer_fetch with arrays of color outputs.");
 
 		auto &func = get<SPIRFunction>(get_entry_point().self);
-		func.fixup_hooks_in.push_back([=]() {
-			if (is_legacy())
-			{
-				statement(to_expression(subpass_var->self), " = ", "gl_LastFragData[",
-				          get_decoration(output_var->self, DecorationLocation), "];");
-			}
-			else
-			{
-				uint32_t num_rt_components = this->get<SPIRType>(output_var->basetype).vecsize;
-				statement(to_expression(subpass_var->self), vector_swizzle(num_rt_components, 0), " = ",
-				          to_expression(output_var->self), ";");
-			}
-		});
+		func.fixup_hooks_in.push_back(
+		    [=]()
+		    {
+			    if (is_legacy())
+			    {
+				    statement(to_expression(subpass_var->self), " = ", "gl_LastFragData[",
+				              get_decoration(output_var->self, DecorationLocation), "];");
+			    }
+			    else
+			    {
+				    uint32_t num_rt_components = this->get<SPIRType>(output_var->basetype).vecsize;
+				    statement(to_expression(subpass_var->self), vector_swizzle(num_rt_components, 0), " = ",
+				              to_expression(output_var->self), ";");
+			    }
+		    });
 	}
 }
 
@@ -19842,8 +19905,7 @@ bool CompilerGLSL::ShaderSubgroupSupportHelper::can_feature_be_implemented_witho
 		true, // SubgroupBalloFindLSB_MSB
 		false, false, false, false,
 		true, // SubgroupMemBarrier - replaced with workgroup memory barriers
-		false, false, true, false,
-		false, false, false, false, false, false, // iadd, fadd
+		false, false, true,  false, false, false, false, false, false, false, // iadd, fadd
 		false, false, false, false, false, false, // imul , fmul
 	};
 
@@ -19854,14 +19916,16 @@ CompilerGLSL::ShaderSubgroupSupportHelper::Candidate CompilerGLSL::ShaderSubgrou
     get_KHR_extension_for_feature(Feature feature)
 {
 	static const Candidate extensions[FeatureCount] = {
-		KHR_shader_subgroup_ballot, KHR_shader_subgroup_basic,  KHR_shader_subgroup_basic,  KHR_shader_subgroup_basic,
-		KHR_shader_subgroup_basic,  KHR_shader_subgroup_ballot, KHR_shader_subgroup_ballot, KHR_shader_subgroup_vote,
-		KHR_shader_subgroup_vote,   KHR_shader_subgroup_basic,  KHR_shader_subgroup_basic, KHR_shader_subgroup_basic,
-		KHR_shader_subgroup_ballot, KHR_shader_subgroup_ballot, KHR_shader_subgroup_ballot, KHR_shader_subgroup_ballot,
-		KHR_shader_subgroup_arithmetic, KHR_shader_subgroup_arithmetic, KHR_shader_subgroup_arithmetic,
+		KHR_shader_subgroup_ballot,     KHR_shader_subgroup_basic,      KHR_shader_subgroup_basic,
+		KHR_shader_subgroup_basic,      KHR_shader_subgroup_basic,      KHR_shader_subgroup_ballot,
+		KHR_shader_subgroup_ballot,     KHR_shader_subgroup_vote,       KHR_shader_subgroup_vote,
+		KHR_shader_subgroup_basic,      KHR_shader_subgroup_basic,      KHR_shader_subgroup_basic,
+		KHR_shader_subgroup_ballot,     KHR_shader_subgroup_ballot,     KHR_shader_subgroup_ballot,
+		KHR_shader_subgroup_ballot,     KHR_shader_subgroup_arithmetic, KHR_shader_subgroup_arithmetic,
 		KHR_shader_subgroup_arithmetic, KHR_shader_subgroup_arithmetic, KHR_shader_subgroup_arithmetic,
 		KHR_shader_subgroup_arithmetic, KHR_shader_subgroup_arithmetic, KHR_shader_subgroup_arithmetic,
 		KHR_shader_subgroup_arithmetic, KHR_shader_subgroup_arithmetic, KHR_shader_subgroup_arithmetic,
+		KHR_shader_subgroup_arithmetic,
 	};
 
 	return extensions[feature];
@@ -19911,7 +19975,8 @@ CompilerGLSL::ShaderSubgroupSupportHelper::CandidateVector CompilerGLSL::ShaderS
     get_candidates_for_feature(Feature ft, const Result &r)
 {
 	auto c = get_candidates_for_feature(ft);
-	auto cmp = [&r](Candidate a, Candidate b) {
+	auto cmp = [&r](Candidate a, Candidate b)
+	{
 		if (r.weights[a] == r.weights[b])
 			return a < b; // Prefer candidates with lower enum value
 		return r.weights[a] > r.weights[b];
@@ -20093,9 +20158,8 @@ bool CompilerGLSL::is_stage_output_variable_masked(const SPIRVariable &var) cons
 		if (!has_decoration(var.self, DecorationLocation))
 			return false;
 
-		return is_stage_output_location_masked(
-				get_decoration(var.self, DecorationLocation),
-				get_decoration(var.self, DecorationComponent));
+		return is_stage_output_location_masked(get_decoration(var.self, DecorationLocation),
+		                                       get_decoration(var.self, DecorationComponent));
 	}
 }
 
@@ -20154,7 +20218,8 @@ uint32_t CompilerGLSL::get_declared_member_location(const SPIRVariable &var, uin
 		return get_accumulated_member_location(var, mbr_idx, strip_array);
 }
 
-uint32_t CompilerGLSL::get_accumulated_member_location(const SPIRVariable &var, uint32_t mbr_idx, bool strip_array) const
+uint32_t CompilerGLSL::get_accumulated_member_location(const SPIRVariable &var, uint32_t mbr_idx,
+                                                       bool strip_array) const
 {
 	auto &type = strip_array ? get_variable_element_type(var) : get_variable_data_type(var);
 	uint32_t location = get_decoration(var.self, DecorationLocation);
@@ -20244,17 +20309,17 @@ std::string CompilerGLSL::format_double(double value) const
 	return convert_to_string(value, current_locale_radix_character);
 }
 
-std::string CompilerGLSL::to_pretty_expression_if_int_constant(
-		uint32_t id,
-		const GlslConstantNameMapping *mapping_start, const GlslConstantNameMapping *mapping_end,
-		bool register_expression_read)
+std::string CompilerGLSL::to_pretty_expression_if_int_constant(uint32_t id,
+                                                               const GlslConstantNameMapping *mapping_start,
+                                                               const GlslConstantNameMapping *mapping_end,
+                                                               bool register_expression_read)
 {
 	auto *c = maybe_get<SPIRConstant>(id);
 	if (c && !c->specialization)
 	{
 		auto value = c->scalar();
-		auto pretty_name = std::find_if(mapping_start, mapping_end,
-		                                [value](const GlslConstantNameMapping &mapping) { return mapping.value == value; });
+		auto pretty_name = std::find_if(mapping_start, mapping_end, [value](const GlslConstantNameMapping &mapping)
+		                                { return mapping.value == value; });
 		if (pretty_name != mapping_end)
 			return pretty_name->alias;
 	}
@@ -20306,7 +20371,8 @@ uint32_t CompilerGLSL::get_fp_fast_math_flags_for_op(uint32_t result_type, uint3
 
 	// Legacy NoContraction deals with any kind of transform to the expression.
 	if (id != 0 && has_decoration(id, DecorationNoContraction))
-		fp_flags &= ~(FPFastMathModeAllowContractMask | FPFastMathModeAllowTransformMask | FPFastMathModeAllowReassocMask);
+		fp_flags &=
+		    ~(FPFastMathModeAllowContractMask | FPFastMathModeAllowTransformMask | FPFastMathModeAllowReassocMask);
 
 	// Handle float_controls2 execution modes.
 	bool found_default = false;
@@ -20332,8 +20398,7 @@ uint32_t CompilerGLSL::get_fp_fast_math_flags_for_op(uint32_t result_type, uint3
 
 bool CompilerGLSL::has_legacy_nocontract(uint32_t result_type, uint32_t id) const
 {
-	const auto fp_flags = FPFastMathModeAllowContractMask |
-	                      FPFastMathModeAllowTransformMask |
-	                      FPFastMathModeAllowReassocMask;
+	const auto fp_flags =
+	    FPFastMathModeAllowContractMask | FPFastMathModeAllowTransformMask | FPFastMathModeAllowReassocMask;
 	return (get_fp_fast_math_flags_for_op(result_type, id) & fp_flags) != fp_flags;
 }
diff --git a/spirv_glsl.hpp b/spirv_glsl.hpp
index 24e34d7b0..70c93fcd4 100644
--- a/spirv_glsl.hpp
+++ b/spirv_glsl.hpp
@@ -668,6 +668,9 @@ class CompilerGLSL : public Compiler
 		bool requires_relaxed_precision_analysis = false;
 		bool implicit_c_integer_promotion_rules = false;
 		bool supports_spec_constant_array_size = true;
+		// When non-empty, matrix column access uses this member name instead of raw array indexing.
+		// e.g., "columns" -> m.columns[i] instead of m[i].
+		std::string matrix_column_accessor;
 	} backend;
 
 	virtual void emit_struct(SPIRType &type);
@@ -708,7 +711,7 @@ class CompilerGLSL : public Compiler
 	void flush_undeclared_variables(SPIRBlock &block);
 	void emit_variable_temporary_copies(const SPIRVariable &var);
 
-	bool should_dereference(uint32_t id);
+	virtual bool should_dereference(uint32_t id);
 	bool should_dereference_caller_param(uint32_t id);
 	bool should_forward(uint32_t id) const;
 	bool should_suppress_usage_tracking(uint32_t id) const;
diff --git a/spirv_opencl.cpp b/spirv_opencl.cpp
index 5a72f14e4..3aaaa6150 100644
--- a/spirv_opencl.cpp
+++ b/spirv_opencl.cpp
@@ -112,6 +112,7 @@ string CompilerOpenCL::compile()
 	backend.support_pointer_to_pointer = true;
 	backend.implicit_c_integer_promotion_rules = true;
 	backend.supports_spec_constant_array_size = false;
+	backend.matrix_column_accessor = "columns";
 
 	fixup_anonymous_struct_names();
 	fixup_type_alias();
@@ -124,17 +125,32 @@ string CompilerOpenCL::compile()
 	set_enabled_interface_variables(get_active_interface_variables());
 	reorder_type_alias();
 
+	// Pre-scan: discover all matrix types used in the IR so that typedefs
+	// and helpers can be emitted in the first pass without forcing a recompile.
+	prepass_discover_matrix_types();
+
 	uint32_t pass_count = 0;
 	do
 	{
+		auto prev_matrix_types = used_matrix_types;
+		auto prev_helpers = need_mul_mat_vec.size() + need_mul_vec_mat.size() + need_mul_mat_mat.size() +
+		                    need_mul_mat_scalar.size() + need_transpose.size() + need_outer_product.size();
+
 		reset(pass_count);
 		buffer.reset();
 
 		emit_header();
+		emit_matrix_typedefs();
 		emit_specialization_constants_and_structs();
+		emit_matrix_helpers();
 		emit_resources();
 		emit_function(get<SPIRFunction>(ir.default_entry_point), Bitset());
 
+		auto new_helpers = need_mul_mat_vec.size() + need_mul_vec_mat.size() + need_mul_mat_mat.size() +
+		                   need_mul_mat_scalar.size() + need_transpose.size() + need_outer_product.size();
+		if (used_matrix_types != prev_matrix_types || new_helpers != prev_helpers)
+			force_recompile();
+
 		pass_count++;
 	} while (is_forcing_recompilation());
 
@@ -153,6 +169,8 @@ void CompilerOpenCL::emit_header()
 
 	if (opencl_options.opencl_version >= 200)
 		statement("#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable");
+	if (opencl_options.enable_fp16)
+		statement("#pragma OPENCL EXTENSION cl_khr_fp16 : enable");
 	if (opencl_options.enable_fp64)
 		statement("#pragma OPENCL EXTENSION cl_khr_fp64 : enable");
 	if (opencl_options.enable_64bit_atomics && opencl_options.opencl_version >= 200)
@@ -490,6 +508,227 @@ void CompilerOpenCL::emit_resources()
 		statement("");
 	}
 
+	// FindLSB polyfill: GLSL findLSB returns the bit position of the lowest set bit, or -1 if 0.
+	// OpenCL 2.0+ has ctz() but OpenCL 1.2 does not. Use (x & -x) to isolate lowest bit,
+	// then 31 - clz() to get its position.
+	if (needs_findlsb_polyfill)
+	{
+		statement("static int spvFindLSB(uint x) {");
+		statement("    if (x == 0u) return -1;");
+		statement("    return 31 - as_int(clz(x & (0u - x)));");
+		statement("}");
+		statement("");
+	}
+
+	// Pack/Unpack Snorm/Unorm polyfills.
+	if (needs_pack_snorm_4x8)
+	{
+		// Clamp to [-1, 1] before scaling: saturation alone maps inputs below -1 to -128 instead of -127.
+		statement("static uint spvPackSnorm4x8(float4 v) {");
+		statement("    return as_uint(convert_char4_sat_rte(clamp(v, -1.0f, 1.0f) * 127.0f));");
+		statement("}");
+		statement("");
+	}
+	if (needs_pack_unorm_4x8)
+	{
+		statement("static uint spvPackUnorm4x8(float4 v) {");
+		statement("    uchar4 packed = convert_uchar4_sat_rte(v * 255.0f);");
+		statement("    return as_uint(packed);");
+		statement("}");
+		statement("");
+	}
+	if (needs_pack_snorm_2x16)
+	{
+		// Clamp to [-1, 1] before scaling: saturation alone maps inputs below -1 to -32768 instead of -32767.
+		statement("static uint spvPackSnorm2x16(float2 v) {");
+		statement("    return as_uint(convert_short2_sat_rte(clamp(v, -1.0f, 1.0f) * 32767.0f));");
+		statement("}");
+		statement("");
+	}
+	if (needs_pack_unorm_2x16)
+	{
+		statement("static uint spvPackUnorm2x16(float2 v) {");
+		statement("    ushort2 packed = convert_ushort2_sat_rte(v * 65535.0f);");
+		statement("    return as_uint(packed);");
+		statement("}");
+		statement("");
+	}
+	if (needs_unpack_snorm_4x8)
+	{
+		statement("static float4 spvUnpackSnorm4x8(uint v) {");
+		statement("    char4 packed = as_char4(v);");
+		statement("    return max(convert_float4(packed) / 127.0f, (float4)(-1.0f));");
+		statement("}");
+		statement("");
+	}
+	if (needs_unpack_unorm_4x8)
+	{
+		statement("static float4 spvUnpackUnorm4x8(uint v) {");
+		statement("    uchar4 packed = as_uchar4(v);");
+		statement("    return convert_float4(packed) / 255.0f;");
+		statement("}");
+		statement("");
+	}
+	if (needs_unpack_snorm_2x16)
+	{
+		statement("static float2 spvUnpackSnorm2x16(uint v) {");
+		statement("    short2 packed = as_short2(v);");
+		statement("    return max(convert_float2(packed) / 32767.0f, (float2)(-1.0f));");
+		statement("}");
+		statement("");
+	}
+	if (needs_unpack_unorm_2x16)
+	{
+		statement("static float2 spvUnpackUnorm2x16(uint v) {");
+		statement("    ushort2 packed = as_ushort2(v);");
+		statement("    return convert_float2(packed) / 65535.0f;");
+		statement("}");
+		statement("");
+	}
+
+	// Determinant polyfills using struct-wrapped matrix types (unique names per size for C).
+	if (needs_determinant_2)
+	{
+		auto mat = opencl_matrix_type_name(SPIRType::Float, 2, 2);
+		statement("static float spvDeterminant2(", mat, " m) {");
+		statement("    return m.columns[0].x * m.columns[1].y - m.columns[0].y * m.columns[1].x;");
+		statement("}");
+		statement("");
+	}
+	if (needs_determinant_3)
+	{
+		auto mat = opencl_matrix_type_name(SPIRType::Float, 3, 3);
+		statement("static float spvDeterminant3(", mat, " m) {");
+		statement("    return dot(m.columns[0], (float3)("
+		          "m.columns[1].y * m.columns[2].z - m.columns[1].z * m.columns[2].y, "
+		          "m.columns[1].z * m.columns[2].x - m.columns[1].x * m.columns[2].z, "
+		          "m.columns[1].x * m.columns[2].y - m.columns[1].y * m.columns[2].x));");
+		statement("}");
+		statement("");
+	}
+	if (needs_determinant_4)
+	{
+		auto mat = opencl_matrix_type_name(SPIRType::Float, 4, 4);
+		statement("static float spvDeterminant4(", mat, " m) {");
+		statement(
+		    "    return dot(m.columns[0], (float4)("
+		    "m.columns[2].y * m.columns[3].z * m.columns[1].w - m.columns[3].y * m.columns[2].z * m.columns[1].w + "
+		    "m.columns[3].y * m.columns[1].z * m.columns[2].w - m.columns[1].y * m.columns[3].z * m.columns[2].w - "
+		    "m.columns[2].y * m.columns[1].z * m.columns[3].w + m.columns[1].y * m.columns[2].z * m.columns[3].w, "
+		    "m.columns[3].x * m.columns[2].z * m.columns[1].w - m.columns[2].x * m.columns[3].z * m.columns[1].w - "
+		    "m.columns[3].x * m.columns[1].z * m.columns[2].w + m.columns[1].x * m.columns[3].z * m.columns[2].w + "
+		    "m.columns[2].x * m.columns[1].z * m.columns[3].w - m.columns[1].x * m.columns[2].z * m.columns[3].w, "
+		    "m.columns[2].x * m.columns[3].y * m.columns[1].w - m.columns[3].x * m.columns[2].y * m.columns[1].w + "
+		    "m.columns[3].x * m.columns[1].y * m.columns[2].w - m.columns[1].x * m.columns[3].y * m.columns[2].w - "
+		    "m.columns[2].x * m.columns[1].y * m.columns[3].w + m.columns[1].x * m.columns[2].y * m.columns[3].w, "
+		    "m.columns[3].x * m.columns[2].y * m.columns[1].z - m.columns[2].x * m.columns[3].y * m.columns[1].z - "
+		    "m.columns[3].x * m.columns[1].y * m.columns[2].z + m.columns[1].x * m.columns[3].y * m.columns[2].z + "
+		    "m.columns[2].x * m.columns[1].y * m.columns[3].z - m.columns[1].x * m.columns[2].y * m.columns[3].z));");
+		statement("}");
+		statement("");
+	}
+
+	// Matrix inverse polyfills.
+	if (needs_inverse_2)
+	{
+		auto mat = opencl_matrix_type_name(SPIRType::Float, 2, 2);
+		statement("static ", mat, " spvInverse2(", mat, " m) {");
+		statement("    float d = 1.0f / (m.columns[0].x * m.columns[1].y - m.columns[1].x * m.columns[0].y);");
+		statement("    return (", mat,
+		          "){ { (float2)(m.columns[1].y * d, -m.columns[0].y * d), (float2)(-m.columns[1].x * d, "
+		          "m.columns[0].x * d) } };");
+		statement("}");
+		statement("");
+	}
+	if (needs_inverse_3)
+	{
+		// r0, r1 and r2 are the cross products c1 x c2, c2 x c0 and c0 x c1 of the input
+		// columns. Each one is orthogonal to two of the input columns, so after scaling
+		// by 1 / determinant they are the ROWS of the inverse, not its columns. Every
+		// result column therefore gathers one component (x, y or z) of all three
+		// products.
+		// The determinant itself is dot(c0, r0).
+		auto mat = opencl_matrix_type_name(SPIRType::Float, 3, 3);
+		statement("static ", mat, " spvInverse3(", mat, " m) {");
+		statement("    float3 r0 = cross(m.columns[1], m.columns[2]);");
+		statement("    float3 r1 = cross(m.columns[2], m.columns[0]);");
+		statement("    float3 r2 = cross(m.columns[0], m.columns[1]);");
+		statement("    float d = 1.0f / dot(m.columns[0], r0);");
+		statement("    return (", mat,
+		          "){ { (float3)(r0.x, r1.x, r2.x) * d, (float3)(r0.y, r1.y, r2.y) * d, "
+		          "(float3)(r0.z, r1.z, r2.z) * d } };");
+		statement("}");
+		statement("");
+	}
+	if (needs_inverse_4)
+	{
+		auto mat = opencl_matrix_type_name(SPIRType::Float, 4, 4);
+		statement("static ", mat, " spvInverse4(", mat, " m) {");
+		statement(
+		    "    float4 t = (float4)("
+		    "m.columns[2].y * m.columns[3].z * m.columns[1].w - m.columns[3].y * m.columns[2].z * m.columns[1].w + "
+		    "m.columns[3].y * m.columns[1].z * m.columns[2].w - m.columns[1].y * m.columns[3].z * m.columns[2].w - "
+		    "m.columns[2].y * m.columns[1].z * m.columns[3].w + m.columns[1].y * m.columns[2].z * m.columns[3].w, "
+		    "m.columns[3].x * m.columns[2].z * m.columns[1].w - m.columns[2].x * m.columns[3].z * m.columns[1].w - "
+		    "m.columns[3].x * m.columns[1].z * m.columns[2].w + m.columns[1].x * m.columns[3].z * m.columns[2].w + "
+		    "m.columns[2].x * m.columns[1].z * m.columns[3].w - m.columns[1].x * m.columns[2].z * m.columns[3].w, "
+		    "m.columns[2].x * m.columns[3].y * m.columns[1].w - m.columns[3].x * m.columns[2].y * m.columns[1].w + "
+		    "m.columns[3].x * m.columns[1].y * m.columns[2].w - m.columns[1].x * m.columns[3].y * m.columns[2].w - "
+		    "m.columns[2].x * m.columns[1].y * m.columns[3].w + m.columns[1].x * m.columns[2].y * m.columns[3].w, "
+		    "m.columns[3].x * m.columns[2].y * m.columns[1].z - m.columns[2].x * m.columns[3].y * m.columns[1].z - "
+		    "m.columns[3].x * m.columns[1].y * m.columns[2].z + m.columns[1].x * m.columns[3].y * m.columns[2].z + "
+		    "m.columns[2].x * m.columns[1].y * m.columns[3].z - m.columns[1].x * m.columns[2].y * m.columns[3].z);");
+		statement(
+		    "    ", mat, " r = (", mat,
+		    "){ { "
+		    "(float4)(t.x, "
+		    "m.columns[3].y * m.columns[2].z * m.columns[0].w - m.columns[2].y * m.columns[3].z * m.columns[0].w - "
+		    "m.columns[3].y * m.columns[0].z * m.columns[2].w + m.columns[0].y * m.columns[3].z * m.columns[2].w + "
+		    "m.columns[2].y * m.columns[0].z * m.columns[3].w - m.columns[0].y * m.columns[2].z * m.columns[3].w, "
+		    "m.columns[1].y * m.columns[3].z * m.columns[0].w - m.columns[3].y * m.columns[1].z * m.columns[0].w + "
+		    "m.columns[3].y * m.columns[0].z * m.columns[1].w - m.columns[0].y * m.columns[3].z * m.columns[1].w - "
+		    "m.columns[1].y * m.columns[0].z * m.columns[3].w + m.columns[0].y * m.columns[1].z * m.columns[3].w, "
+		    "m.columns[2].y * m.columns[1].z * m.columns[0].w - m.columns[1].y * m.columns[2].z * m.columns[0].w - "
+		    "m.columns[2].y * m.columns[0].z * m.columns[1].w + m.columns[0].y * m.columns[2].z * m.columns[1].w + "
+		    "m.columns[1].y * m.columns[0].z * m.columns[2].w - m.columns[0].y * m.columns[1].z * m.columns[2].w), "
+		    "(float4)(t.y, "
+		    "m.columns[2].x * m.columns[3].z * m.columns[0].w - m.columns[3].x * m.columns[2].z * m.columns[0].w + "
+		    "m.columns[3].x * m.columns[0].z * m.columns[2].w - m.columns[0].x * m.columns[3].z * m.columns[2].w - "
+		    "m.columns[2].x * m.columns[0].z * m.columns[3].w + m.columns[0].x * m.columns[2].z * m.columns[3].w, "
+		    "m.columns[3].x * m.columns[1].z * m.columns[0].w - m.columns[1].x * m.columns[3].z * m.columns[0].w - "
+		    "m.columns[3].x * m.columns[0].z * m.columns[1].w + m.columns[0].x * m.columns[3].z * m.columns[1].w + "
+		    "m.columns[1].x * m.columns[0].z * m.columns[3].w - m.columns[0].x * m.columns[1].z * m.columns[3].w, "
+		    "m.columns[1].x * m.columns[2].z * m.columns[0].w - m.columns[2].x * m.columns[1].z * m.columns[0].w + "
+		    "m.columns[2].x * m.columns[0].z * m.columns[1].w - m.columns[0].x * m.columns[2].z * m.columns[1].w - "
+		    "m.columns[1].x * m.columns[0].z * m.columns[2].w + m.columns[0].x * m.columns[1].z * m.columns[2].w), "
+		    "(float4)(t.z, "
+		    "m.columns[3].x * m.columns[2].y * m.columns[0].w - m.columns[2].x * m.columns[3].y * m.columns[0].w - "
+		    "m.columns[3].x * m.columns[0].y * m.columns[2].w + m.columns[0].x * m.columns[3].y * m.columns[2].w + "
+		    "m.columns[2].x * m.columns[0].y * m.columns[3].w - m.columns[0].x * m.columns[2].y * m.columns[3].w, "
+		    "m.columns[1].x * m.columns[3].y * m.columns[0].w - m.columns[3].x * m.columns[1].y * m.columns[0].w + "
+		    "m.columns[3].x * m.columns[0].y * m.columns[1].w - m.columns[0].x * m.columns[3].y * m.columns[1].w - "
+		    "m.columns[1].x * m.columns[0].y * m.columns[3].w + m.columns[0].x * m.columns[1].y * m.columns[3].w, "
+		    "m.columns[2].x * m.columns[1].y * m.columns[0].w - m.columns[1].x * m.columns[2].y * m.columns[0].w - "
+		    "m.columns[2].x * m.columns[0].y * m.columns[1].w + m.columns[0].x * m.columns[2].y * m.columns[1].w + "
+		    "m.columns[1].x * m.columns[0].y * m.columns[2].w - m.columns[0].x * m.columns[1].y * m.columns[2].w), "
+		    "(float4)(t.w, "
+		    "m.columns[2].x * m.columns[3].y * m.columns[0].z - m.columns[3].x * m.columns[2].y * m.columns[0].z + "
+		    "m.columns[3].x * m.columns[0].y * m.columns[2].z - m.columns[0].x * m.columns[3].y * m.columns[2].z - "
+		    "m.columns[2].x * m.columns[0].y * m.columns[3].z + m.columns[0].x * m.columns[2].y * m.columns[3].z, "
+		    "m.columns[3].x * m.columns[1].y * m.columns[0].z - m.columns[1].x * m.columns[3].y * m.columns[0].z - "
+		    "m.columns[3].x * m.columns[0].y * m.columns[1].z + m.columns[0].x * m.columns[3].y * m.columns[1].z + "
+		    "m.columns[1].x * m.columns[0].y * m.columns[3].z - m.columns[0].x * m.columns[1].y * m.columns[3].z, "
+		    "m.columns[1].x * m.columns[2].y * m.columns[0].z - m.columns[2].x * m.columns[1].y * m.columns[0].z + "
+		    "m.columns[2].x * m.columns[0].y * m.columns[1].z - m.columns[0].x * m.columns[2].y * m.columns[1].z - "
+		    "m.columns[1].x * m.columns[0].y * m.columns[2].z + m.columns[0].x * m.columns[1].y * m.columns[2].z) } "
+		    "};");
+		statement("    float d = 1.0f / dot(m.columns[0], t);");
+		statement("    r.columns[0] *= d; r.columns[1] *= d; r.columns[2] *= d; r.columns[3] *= d;");
+		statement("    return r;");
+		statement("}");
+		statement("");
+	}
+
 	// Default sampler for combined image+sampler usage (OpenCL C requires file-scope const sampler_t).
 	if (needs_default_sampler)
 	{
@@ -1008,6 +1247,8 @@ string CompilerOpenCL::type_to_glsl(const SPIRType &type, uint32_t id, bool memb
 		type_name = "ulong";
 		break;
 	case SPIRType::Half:
+		if (!opencl_options.enable_fp16)
+			SPIRV_CROSS_THROW("Half requires cl_khr_fp16.");
 		type_name = "half";
 		break;
 	case SPIRType::Float:
@@ -1023,6 +1264,13 @@ string CompilerOpenCL::type_to_glsl(const SPIRType &type, uint32_t id, bool memb
 		return "unknown_type";
 	}
 
+	// Matrix? (columns > 1)
+	if (type.columns > 1)
+	{
+		used_matrix_types.insert(make_matrix_key(type));
+		return opencl_matrix_type_name(type);
+	}
+
 	// Vector?
 	if (type.vecsize > 1)
 		type_name += to_string(type.vecsize);
@@ -1035,6 +1283,366 @@ string CompilerOpenCL::type_to_glsl(const SPIRType &type, uint32_t id)
 	return type_to_glsl(type, id, false);
 }
 
+CompilerOpenCL::MatrixTypeKey CompilerOpenCL::make_matrix_key(const SPIRType &type)
+{
+	return { type.basetype, type.vecsize, type.columns }; // (component type, column vector size, column count)
+}
+
+// Spells the OpenCL C type of a single matrix column.
+//   basetype - component type; Double maps to "double", Half to "half",
+//              everything else (including Float) to "float".
+//   vecsize  - components per column; appended to the name when > 1.
+// Returns the type name, e.g. "float4" or "double".
+string CompilerOpenCL::opencl_column_type_name(SPIRType::BaseType basetype, uint32_t vecsize)
+{
+	// "float" is both the spelling for SPIRType::Float and the fallback for
+	// every base type that has no dedicated spelling here.
+	const char *scalar = "float";
+	if (basetype == SPIRType::Double)
+		scalar = "double";
+	else if (basetype == SPIRType::Half)
+		scalar = "half";
+
+	// A size of 1 keeps the bare scalar name; wider columns get the
+	// component count appended.
+	string result = scalar;
+	if (vecsize > 1)
+		result += to_string(vecsize);
+	return result;
+}
+
+// Name of the struct typedef wrapping a matrix: "spv" + optional D/H + "Mat" + columns[x vector size].
+string CompilerOpenCL::opencl_matrix_type_name(SPIRType::BaseType basetype, uint32_t vecsize, uint32_t columns)
+{
+	// Precision tag: D for double, H for half, nothing for any other base type.
+	const char *tag = basetype == SPIRType::Double ? "D" : (basetype == SPIRType::Half ? "H" : "");
+	string name = string("spv") + tag + "Mat" + to_string(columns);
+
+	// Square matrices use the short form (spvMat4); others also carry the column vector size (spvMat4x3).
+	if (columns != vecsize)
+		name += "x" + to_string(vecsize);
+	return name;
+}
+
+string CompilerOpenCL::opencl_matrix_type_name(const SPIRType &type)
+{
+	return opencl_matrix_type_name(type.basetype, type.vecsize, type.columns); // overload: unpacks the SPIRType fields
+}
+
+// Suffix used in helper function names: optional D/H + "Mat" + columns[x vector size],
+// e.g. "Mat4", "DMat4", "HMat4", "Mat4x3", "DMat4x3".
+string CompilerOpenCL::opencl_matrix_short_name(SPIRType::BaseType basetype, uint32_t vecsize, uint32_t columns)
+{
+	// Precision tag: D for double, H for half, nothing for any other base type.
+	const char *tag = basetype == SPIRType::Double ? "D" : (basetype == SPIRType::Half ? "H" : "");
+	string name = string(tag) + "Mat" + to_string(columns);
+
+	// Non-square matrices additionally carry the column vector size.
+	if (columns != vecsize)
+		name += "x" + to_string(vecsize);
+	return name;
+}
+
+// Suffix used in helper function names for a vector or scalar operand:
+// "Vec4", "DVec4", "HVec4", ... or "Scalar", "DScalar", "HScalar" when vecsize is 1.
+string CompilerOpenCL::opencl_vector_short_name(SPIRType::BaseType basetype, uint32_t vecsize)
+{
+	// Precision tag shared by both forms: D for double, H for half, otherwise empty.
+	string name;
+	if (basetype == SPIRType::Double)
+		name = "D";
+	else if (basetype == SPIRType::Half)
+		name = "H";
+
+	// A single component is named as a scalar and carries no count.
+	if (vecsize == 1)
+		name += "Scalar";
+	else
+		name += "Vec" + to_string(vecsize);
+	return name;
+}
+
+// Pre-scan of the IR that runs before the first emission pass.
+//
+// Purpose: find matrix types and matrix helpers up front so that their typedefs
+// and helper functions can already be emitted by the first pass instead of
+// being discovered during emission.
+//
+// Steps:
+//   1. Reset every bookkeeping set (used_matrix_types and all need_* sets).
+//   2. Walk all SPIRType entries and record each matrix type, whether it is
+//      declared on its own or appears as a member of another type.
+//   3. Walk every instruction of every function and register:
+//        - OpOuterProduct: result matrix typedef + outer-product helper,
+//        - OpTranspose:    input and result typedefs + transpose helper.
+//
+// The multiply opcodes (OpMatrixTimesVector, OpMatrixTimesScalar,
+// OpMatrixTimesMatrix, OpVectorTimesMatrix) are intentionally not handled
+// here. Not all value IDs have a resolved type at pre-scan time, so no operand
+// type lookup is attempted; those helpers are registered later, while
+// instructions are emitted, and rely on the recompile mechanism.
+void CompilerOpenCL::prepass_discover_matrix_types()
+{
+	// Step 1: start from a clean slate. Everything after this point only ever
+	// inserts into these sets.
+	used_matrix_types.clear();
+	need_mul_mat_vec.clear();
+	need_mul_vec_mat.clear();
+	need_mul_mat_mat.clear();
+	need_mul_mat_scalar.clear();
+	need_transpose.clear();
+	need_outer_product.clear();
+
+	// Step 2: scan all types for matrices and record their
+	// (basetype, vecsize, columns) key.
+	ir.for_each_typed_id<SPIRType>(
+	    [&](uint32_t, SPIRType &type)
+	    {
+		    // A non-struct type with more than one column is itself a matrix.
+		    if (type.columns > 1 && type.basetype != SPIRType::Struct)
+			    used_matrix_types.insert(make_matrix_key(type));
+
+		    // Matrix-typed members are recorded through their own type entry.
+		    for (auto &member_type_id : type.member_types)
+		    {
+			    auto &member_type = get<SPIRType>(member_type_id);
+			    if (member_type.columns > 1)
+				    used_matrix_types.insert(make_matrix_key(member_type));
+		    }
+	    });
+
+	// Step 3: scan all instructions for matrix operations whose helper can be
+	// keyed from the declared result type.
+	ir.for_each_typed_id<SPIRFunction>(
+	    [&](uint32_t, SPIRFunction &f)
+	    {
+		    // Visit every instruction of every basic block in this function.
+		    for (auto &block_id : f.blocks)
+		    {
+			    auto &block = get<SPIRBlock>(block_id);
+			    for (auto &instruction : block.ops)
+			    {
+				    // Operand words of this instruction. For the opcodes handled
+				    // below, ops[0] is the result type ID.
+				    auto ops = stream(instruction);
+				    auto opcode = static_cast<Op>(instruction.op);
+
+				    switch (opcode)
+				    {
+				    case OpOuterProduct:
+				    {
+					    // The outer-product helper is keyed by the matrix it produces.
+					    auto &res_type = get<SPIRType>(ops[0]);
+					    if (res_type.columns > 1)
+					    {
+						    used_matrix_types.insert(make_matrix_key(res_type));
+						    need_outer_product.insert(make_matrix_key(res_type));
+					    }
+					    break;
+				    }
+
+				    case OpTranspose:
+				    {
+					    // The transpose helper is keyed by its INPUT matrix type, and
+					    // both the input and the result type need a typedef.
+					    auto &res_type = get<SPIRType>(ops[0]);
+					    if (res_type.columns > 1)
+					    {
+						    used_matrix_types.insert(make_matrix_key(res_type));
+
+						    // The input type has swapped dimensions.
+						    MatrixTypeKey input_key = { res_type.basetype, res_type.columns, res_type.vecsize };
+						    used_matrix_types.insert(input_key);
+						    need_transpose.insert(input_key);
+					    }
+					    break;
+				    }
+
+				    case OpMatrixTimesVector:
+				    case OpMatrixTimesScalar:
+				    case OpMatrixTimesMatrix:
+				    case OpVectorTimesMatrix:
+					    // Not pre-discovered: these are picked up during emit_instruction
+					    // and trigger a recompile if a new helper turns out to be needed.
+					    break;
+
+				    default:
+					    break;
+				    }
+			    }
+		    }
+	    });
+}
+
+// Emits one struct typedef per recorded matrix type, followed by a blank line.
+// Each struct wraps an array of column vectors named "columns".
+void CompilerOpenCL::emit_matrix_typedefs()
+{
+	if (!used_matrix_types.empty())
+	{
+		for (auto &entry : used_matrix_types)
+			statement("typedef struct { ", opencl_column_type_name(entry.basetype, entry.vecsize), " columns[",
+			          entry.columns, "]; } ", opencl_matrix_type_name(entry.basetype, entry.vecsize, entry.columns),
+			          ";");
+		statement("");
+	}
+}
+
+void CompilerOpenCL::emit_matrix_helpers()
+{
+	for (auto &key : need_mul_mat_vec) // matrix * vector; emitted before the matrix * matrix helpers, which call them
+		emit_mul_mat_vec_helper(key.basetype, key.vecsize, key.columns);
+	for (auto &key : need_mul_vec_mat) // vector * matrix
+		emit_mul_vec_mat_helper(key.basetype, key.vecsize, key.columns);
+	for (auto &key : need_mul_mat_mat) // matrix * matrix; each entry is a (left, right) pair of matrix keys
+		emit_mul_mat_mat_helper(key.first, key.second);
+	for (auto &key : need_mul_mat_scalar) // matrix * scalar
+		emit_mul_mat_scalar_helper(key.basetype, key.vecsize, key.columns);
+	for (auto &key : need_transpose) // transpose; the key is passed on as the input matrix shape
+		emit_transpose_helper(key.basetype, key.vecsize, key.columns);
+	for (auto &key : need_outer_product) // outer product; the key is passed on as the result matrix shape
+		emit_outer_product_helper(key.basetype, key.vecsize, key.columns);
+}
+
+// Emits spvMul<Mat><Vec>(m, v): the matrix * vector product, written as the sum of
+// each matrix column scaled by the matching component of v.
+void CompilerOpenCL::emit_mul_mat_vec_helper(SPIRType::BaseType basetype, uint32_t vecsize, uint32_t columns)
+{
+	const char *components[] = { "x",  "y",  "z",  "w",  "s4", "s5", "s6", "s7",
+		                         "s8", "s9", "sa", "sb", "sc", "sd", "se", "sf" };
+
+	// Signature: <column type> spvMul<Mat><Vec>(<matrix type> m, <vector of `columns` components> v)
+	const string helper = "spvMul" + opencl_matrix_short_name(basetype, vecsize, columns) +
+	                      opencl_vector_short_name(basetype, columns);
+	statement("static ", opencl_column_type_name(basetype, vecsize), " ", helper, "(",
+	          opencl_matrix_type_name(basetype, vecsize, columns), " m, ", opencl_column_type_name(basetype, columns),
+	          " v)");
+	begin_scope();
+	// Body: a single return statement summing one term per column.
+	string body = "return ";
+	for (uint32_t col = 0; col < columns; ++col)
+	{
+		body += (col == 0 ? "" : " + ");
+		body += "m.columns[" + to_string(col) + "]";
+		if (columns > 1)
+			body += string(" * v.") + components[col];
+	}
+	statement(body + ";");
+	end_scope();
+	statement("");
+}
+
+// Emits spvMul<Vec><Mat>(v, m): the vector * matrix product, where component i of
+// the result is dot(v, column i of m).
+void CompilerOpenCL::emit_mul_vec_mat_helper(SPIRType::BaseType basetype, uint32_t vecsize, uint32_t columns)
+{
+	const string result_vec = opencl_column_type_name(basetype, columns);
+	const string helper = "spvMul" + opencl_vector_short_name(basetype, vecsize) +
+	                      opencl_matrix_short_name(basetype, vecsize, columns);
+
+	statement("static ", result_vec, " ", helper, "(", opencl_column_type_name(basetype, vecsize), " v, ",
+	          opencl_matrix_type_name(basetype, vecsize, columns), " m)");
+	begin_scope();
+	// Body: construct the result vector from one dot product per column.
+	string body = "return (" + result_vec + ")(";
+	for (uint32_t col = 0; col < columns; ++col)
+	{
+		if (col != 0)
+			body += ", ";
+		body += "dot(v, m.columns[" + to_string(col) + "])";
+	}
+	statement(body + ");");
+	end_scope();
+	statement("");
+}
+
+// Emits spvMul<MatA><MatB>(a, b). Column i of the result is a * (column i of b), computed
+// with the spvMul<MatA><Vec> matrix * vector helper, which is not emitted by this function.
+void CompilerOpenCL::emit_mul_mat_mat_helper(const MatrixTypeKey &a, const MatrixTypeKey &b)
+{
+	const string a_short = opencl_matrix_short_name(a.basetype, a.vecsize, a.columns);
+	const string helper = "spvMul" + a_short + opencl_matrix_short_name(b.basetype, b.vecsize, b.columns);
+	const string column_mul = "spvMul" + a_short + opencl_vector_short_name(a.basetype, a.columns);
+
+	// The product has the column vector size of a and the column count of b.
+	const string product_type = opencl_matrix_type_name(a.basetype, a.vecsize, b.columns);
+
+	statement("static ", product_type, " ", helper, "(", opencl_matrix_type_name(a.basetype, a.vecsize, a.columns),
+	          " a, ", opencl_matrix_type_name(b.basetype, b.vecsize, b.columns), " b)");
+	begin_scope();
+	statement(product_type, " r;");
+	for (uint32_t col = 0; col < b.columns; ++col)
+		statement("r.columns[", col, "] = ", column_mul, "(a, b.columns[", col, "]);");
+	statement("return r;");
+	end_scope();
+	statement("");
+}
+
+// Emits spvMul<Mat>Scalar(m, s): every column of m multiplied by the scalar s.
+void CompilerOpenCL::emit_mul_mat_scalar_helper(SPIRType::BaseType basetype, uint32_t vecsize, uint32_t columns)
+{
+	const string matrix = opencl_matrix_type_name(basetype, vecsize, columns);
+	const string helper = "spvMul" + opencl_matrix_short_name(basetype, vecsize, columns) + "Scalar";
+
+	// A vector size of 1 yields the bare scalar type name for the second parameter.
+	statement("static ", matrix, " ", helper, "(", matrix, " m, ", opencl_column_type_name(basetype, 1), " s)");
+	begin_scope();
+	statement(matrix, " r;");
+	for (uint32_t col = 0; col < columns; ++col)
+		statement("r.columns[", col, "] = m.columns[", col, "] * s;");
+	statement("return r;");
+	end_scope();
+	statement("");
+}
+
+// Emits spvTranspose<Mat>(m) for the INPUT shape (vecsize, columns). The result has `vecsize`
+// columns of `columns` components; result column i collects component i of every input column.
+void CompilerOpenCL::emit_transpose_helper(SPIRType::BaseType basetype, uint32_t vecsize, uint32_t columns)
+{
+	const char *components[] = { "x", "y", "z", "w" };
+	const string result_type = opencl_matrix_type_name(basetype, columns, vecsize);
+	const string result_column = opencl_column_type_name(basetype, columns);
+	const string helper = "spvTranspose" + opencl_matrix_short_name(basetype, vecsize, columns);
+
+	statement("static ", result_type, " ", helper, "(", opencl_matrix_type_name(basetype, vecsize, columns), " m)");
+	begin_scope();
+	statement(result_type, " r;");
+	for (uint32_t row = 0; row < vecsize; ++row)
+	{
+		// r.columns[row] = (<result column type>)(m.columns[0].<c>, m.columns[1].<c>, ...);
+		string line = "r.columns[" + to_string(row) + "] = (" + result_column + ")(";
+		for (uint32_t col = 0; col < columns; ++col)
+		{
+			line += (col == 0 ? "" : ", ");
+			line += "m.columns[" + to_string(col) + "]." + components[row];
+		}
+		statement(line + ");");
+	}
+	statement("return r;");
+	end_scope();
+	statement("");
+}
+
+// Emits spvOuterProduct<ColVec><RowVec>(c, r): result column i is c scaled by component i of r.
+void CompilerOpenCL::emit_outer_product_helper(SPIRType::BaseType basetype, uint32_t vecsize, uint32_t columns)
+{
+	const char *components[] = { "x", "y", "z", "w" };
+	const string matrix = opencl_matrix_type_name(basetype, vecsize, columns);
+	const string helper = "spvOuterProduct" + opencl_vector_short_name(basetype, vecsize) +
+	                      opencl_vector_short_name(basetype, columns);
+
+	// c has `vecsize` components (one matrix column), r has `columns` components.
+	statement("static ", matrix, " ", helper, "(", opencl_column_type_name(basetype, vecsize), " c, ",
+	          opencl_column_type_name(basetype, columns), " r)");
+	begin_scope();
+	statement(matrix, " m;");
+	for (uint32_t col = 0; col < columns; ++col)
+		statement("m.columns[", col, "] = c * r.", components[col], ";");
+	statement("return m;");
+	end_scope();
+	statement("");
+}
+
 string CompilerOpenCL::image_type_glsl(const SPIRType &type, uint32_t id, bool member)
 {
 	(void)member;
@@ -1122,6 +1730,46 @@ uint32_t CompilerOpenCL::get_physical_type_id_stride(TypeID type_id) const
 	return vecsize * type.columns * (type.width / 8u);
 }
 
+bool CompilerOpenCL::member_is_non_native_row_major_matrix(const SPIRType &type, uint32_t index)
+{
+	// Unlike the base GLSL class, non-square row-major matrices are accepted here: the OpenCL
+	// backend converts them through struct-wrapped matrices and transpose helpers.
+	if (!has_member_decoration(type.self, index, DecorationRowMajor))
+		return false;
+
+	// Bind by reference: copying the whole SPIRType only to read `columns` is needless work.
+	const auto &mbr_type = get<SPIRType>(type.member_types[index]);
+
+	// A RowMajor decoration only calls for conversion when the member has more than one column.
+	return mbr_type.columns > 1;
+}
+
+string CompilerOpenCL::convert_row_major_matrix(string exp_str, const SPIRType &exp_type, uint32_t physical_type_id,
+                                                bool is_packed, bool relaxed)
+{
+	strip_enclosed_expression(exp_str);
+
+	// Whole-matrix case: exp_str names the physical (transposed) storage while exp_type is the
+	// logical SPIR-V type, so wrap the expression in the helper that goes physical -> logical.
+	if (is_matrix(exp_type))
+	{
+		// The physical matrix has the logical dimensions swapped.
+		const MatrixTypeKey physical = { exp_type.basetype, exp_type.columns, exp_type.vecsize };
+		const auto physical_suffix = opencl_matrix_short_name(exp_type.basetype, exp_type.columns, exp_type.vecsize);
+
+		// Request the transpose helper for the physical type and register both matrix types involved.
+		need_transpose.insert(physical);
+		used_matrix_types.insert(physical);
+		used_matrix_types.insert(make_matrix_key(exp_type));
+
+		return join("spvTranspose", physical_suffix, "(", exp_str, ")");
+	}
+
+	// Anything else (e.g. a column read from a row-major matrix) is left to the base class unrolling.
+	return CompilerGLSL::convert_row_major_matrix(std::move(exp_str), exp_type, physical_type_id, is_packed,
+	                                              relaxed);
+}
+
 std::string CompilerOpenCL::type_to_glsl_constructor(const SPIRType &type)
 {
 	string ret = CompilerGLSL::type_to_glsl_constructor(type);
@@ -1136,20 +1784,40 @@ std::string CompilerOpenCL::constant_expression(const SPIRConstant &c, bool insi
                                                 bool inside_struct_scope)
 {
 	auto &type = get<SPIRType>(c.constant_type);
-	if (c.replicated && type.op != OpTypeArray)
+
+	// Matrix constant: emit as struct compound literal.
+	if (type.columns > 1)
 	{
-		auto sub_expr = to_expression(c.subconstants[0]);
-		if (type.op == OpTypeMatrix)
+		auto mat_name = opencl_matrix_type_name(type);
+		string expr = "(" + mat_name + "){ { ";
+		if (c.replicated)
 		{
-			// OpenCL C has no native matrix type; matrices are represented as their column vector type.
-			// For a replicated matrix constant, just use the column value directly.
-			return sub_expr;
+			auto sub_expr = to_expression(c.subconstants[0]);
+			for (uint32_t i = 0; i < type.columns; i++)
+			{
+				if (i > 0)
+					expr += ", ";
+				expr += sub_expr;
+			}
 		}
 		else
 		{
-			// Vector replicate: (float4)(scalar)
-			return join(type_to_glsl_constructor(type), "(", sub_expr, ")");
+			for (uint32_t i = 0; i < type.columns; i++)
+			{
+				if (i > 0)
+					expr += ", ";
+				expr += constant_expression_vector(c, i);
+			}
 		}
+		expr += " } }";
+		return expr;
+	}
+
+	if (c.replicated && type.op != OpTypeArray)
+	{
+		auto sub_expr = to_expression(c.subconstants[0]);
+		// Vector replicate: (float4)(scalar)
+		return join(type_to_glsl_constructor(type), "(", sub_expr, ")");
 	}
 	return CompilerGLSL::constant_expression(c, inside_block_like_struct_scope, inside_struct_scope);
 }
@@ -1251,105 +1919,531 @@ void CompilerOpenCL::emit_glsl_op(uint32_t result_type, uint32_t result_id, uint
 			expr = join(bitcast_glsl_op(out_type, abs_ret_type), "(", expr, ")");
 		}
 
-		emit_op(result_type, result_id, expr, should_forward(args[0]));
-		inherit_expression_dependencies(result_id, args[0]);
+		emit_op(result_type, result_id, expr, should_forward(args[0]));
+		inherit_expression_dependencies(result_id, args[0]);
+		break;
+	}
+
+	case GLSLstd450SSign:
+	{
+		// OpenCL has no integer sign(). Use clamp(x, -1, 1).
+		auto &expr_type = expression_type(args[0]);
+		auto &out_type = get<SPIRType>(result_type);
+
+		auto expected_basetype = to_signed_basetype(expr_type.width);
+		string input_expr;
+		if (expr_type.basetype != expected_basetype)
+			input_expr = bitcast_expression(expected_basetype, args[0]);
+		else
+			input_expr = to_expression(args[0]);
+
+		string expr = join("clamp(", input_expr, ", -1, 1)");
+
+		// Cast to result type if needed (e.g. result is unsigned).
+		if (out_type.basetype != expected_basetype)
+		{
+			SPIRType signed_type = out_type;
+			signed_type.basetype = expected_basetype;
+			expr = join(bitcast_glsl_op(out_type, signed_type), "(", expr, ")");
+		}
+
+		emit_op(result_type, result_id, expr, should_forward(args[0]));
+		inherit_expression_dependencies(result_id, args[0]);
+		break;
+	}
+
+	case GLSLstd450FindSMsb:
+	{
+		// GLSL findMSB for signed: position of highest bit that differs from sign bit.
+		// OpenCL: (W-1) - clz(x ^ (x >> (W-1)))
+		// x >> (W-1) is arithmetic shift: 0 for positive, -1 for negative.
+		// x ^ -1 = ~x, x ^ 0 = x. So this gives clz(x) for positive, clz(~x) for negative.
+		auto &expr_type = expression_type(args[0]);
+		auto &out_type = get<SPIRType>(result_type);
+		uint32_t width = expr_type.width;
+
+		// Input must be signed for arithmetic right shift.
+		auto signed_basetype = to_signed_basetype(width);
+		SPIRType signed_type = expr_type;
+		signed_type.basetype = signed_basetype;
+
+		string input_expr;
+		if (expr_type.basetype != signed_basetype)
+			input_expr = bitcast_expression(signed_basetype, args[0]);
+		else
+			input_expr = to_enclosed_expression(args[0]);
+
+		string xor_expr = join(input_expr, " ^ (", input_expr, " >> ", width - 1, ")");
+		string expr = join(width - 1, " - clz(", xor_expr, ")");
+
+		// clz on signed type returns signed, so result is signed. Cast if output is unsigned.
+		if (out_type.basetype != signed_basetype)
+			expr = join(bitcast_glsl_op(out_type, signed_type), "(", expr, ")");
+
+		emit_op(result_type, result_id, expr, should_forward(args[0]));
+		inherit_expression_dependencies(result_id, args[0]);
+		break;
+	}
+
+	case GLSLstd450FindUMsb:
+	{
+		// GLSL findMSB for unsigned: position of highest set bit, -1 for 0.
+		// OpenCL: (W-1) - clz(x). clz(0) = W, so result = -1 for 0.
+		auto &expr_type = expression_type(args[0]);
+		auto &out_type = get<SPIRType>(result_type);
+		uint32_t width = expr_type.width;
+
+		auto unsigned_basetype = to_unsigned_basetype(width);
+		string input_expr;
+		if (expr_type.basetype != unsigned_basetype)
+			input_expr = bitcast_expression(unsigned_basetype, args[0]);
+		else
+			input_expr = to_expression(args[0]);
+
+		// Cast to signed for the subtraction so result can be -1.
+		auto signed_basetype = to_signed_basetype(width);
+		SPIRType signed_type = out_type;
+		signed_type.basetype = signed_basetype;
+		string clz_expr = join("as_", type_to_glsl(signed_type), "(clz(", input_expr, "))");
+
+		string expr = join(width - 1, " - ", clz_expr);
+
+		// findMSB returns int (signed). Cast if output type differs.
+		if (out_type.basetype != signed_basetype)
+		{
+			expr = join(bitcast_glsl_op(out_type, signed_type), "(", expr, ")");
+		}
+
+		emit_op(result_type, result_id, expr, should_forward(args[0]));
+		inherit_expression_dependencies(result_id, args[0]);
+		break;
+	}
+
+	case GLSLstd450InverseSqrt:
+		emit_unary_func_op(result_type, result_id, args[0], "rsqrt");
+		break;
+
+	case GLSLstd450RoundEven:
+		emit_unary_func_op(result_type, result_id, args[0], "rint");
+		break;
+
+	case GLSLstd450Fract:
+	{
+		// OpenCL fract() requires a pointer argument. Use (x - floor(x)) inline.
+		auto expr = join("(", to_expression(args[0]), " - floor(", to_expression(args[0]), "))");
+		emit_op(result_type, result_id, expr, should_forward(args[0]));
+		inherit_expression_dependencies(result_id, args[0]);
+		break;
+	}
+
+	case GLSLstd450Atan2:
+		emit_binary_func_op(result_type, result_id, args[0], args[1], "atan2");
+		break;
+
+	case GLSLstd450Radians:
+		emit_unary_func_op(result_type, result_id, args[0], "radians");
+		break;
+
+	case GLSLstd450Degrees:
+		emit_unary_func_op(result_type, result_id, args[0], "degrees");
+		break;
+
+	case GLSLstd450FindILsb:
+	{
+		if (!needs_findlsb_polyfill)
+		{
+			needs_findlsb_polyfill = true;
+			force_recompile();
+		}
+		auto &input_type = expression_type(args[0]);
+		auto &out_type = get<SPIRType>(result_type);
+		// spvFindLSB takes uint. Cast input to uint if signed, and handle vector by component.
+		if (input_type.vecsize > 1)
+		{
+			// Vector: apply per-component using the .x, .y, .z, .w swizzles.
+			string expr = "(" + type_to_glsl(out_type) + ")(";
+			const char *swizzles[] = { "x", "y", "z", "w" };
+			for (uint32_t i = 0; i < input_type.vecsize; i++)
+			{
+				if (i > 0)
+					expr += ", ";
+				if (input_type.basetype == SPIRType::Int)
+					expr += join("spvFindLSB(as_uint(", to_expression(args[0]), ".", swizzles[i], "))");
+				else
+					expr += join("spvFindLSB(", to_expression(args[0]), ".", swizzles[i], ")");
+			}
+			expr += ")";
+			emit_op(result_type, result_id, expr, should_forward(args[0]));
+			inherit_expression_dependencies(result_id, args[0]);
+		}
+		else
+		{
+			string input_expr;
+			if (input_type.basetype == SPIRType::Int)
+				input_expr = join("spvFindLSB(as_uint(", to_expression(args[0]), "))");
+			else
+				input_expr = join("spvFindLSB(", to_expression(args[0]), ")");
+			emit_op(result_type, result_id, input_expr, should_forward(args[0]));
+			inherit_expression_dependencies(result_id, args[0]);
+		}
+		break;
+	}
+
+	case GLSLstd450FaceForward:
+	{
+		// OpenCL C has no faceforward(). Implement inline.
+		// faceforward(N, I, Nref) = dot(Nref, I) < 0 ? N : -N
+		auto &type = get<SPIRType>(result_type);
+		if (type.vecsize == 1)
+		{
+			auto expr = join("(", to_expression(args[2]), " * ", to_expression(args[1]), " < 0.0f ? ",
+			                 to_expression(args[0]), " : -", to_expression(args[0]), ")");
+			emit_op(result_type, result_id, expr,
+			        should_forward(args[0]) && should_forward(args[1]) && should_forward(args[2]));
+		}
+		else
+		{
+			auto expr = join("(dot(", to_expression(args[2]), ", ", to_expression(args[1]), ") < 0.0f ? ",
+			                 to_expression(args[0]), " : -", to_expression(args[0]), ")");
+			emit_op(result_type, result_id, expr,
+			        should_forward(args[0]) && should_forward(args[1]) && should_forward(args[2]));
+		}
+		for (uint32_t i = 0; i < 3; i++)
+			inherit_expression_dependencies(result_id, args[i]);
+		break;
+	}
+
+	case GLSLstd450Reflect:
+	{
+		// OpenCL C has no reflect(). Implement inline.
+		// reflect(I, N) = I - 2 * dot(N, I) * N
+		auto &type = get<SPIRType>(result_type);
+		if (type.vecsize == 1)
+		{
+			auto expr = join(to_enclosed_expression(args[0]), " - 2.0f * ", to_enclosed_expression(args[1]), " * ",
+			                 to_enclosed_expression(args[0]), " * ", to_enclosed_expression(args[1]));
+			emit_op(result_type, result_id, expr, should_forward(args[0]) && should_forward(args[1]));
+		}
+		else
+		{
+			auto expr = join(to_expression(args[0]), " - 2.0f * dot(", to_expression(args[1]), ", ",
+			                 to_expression(args[0]), ") * ", to_expression(args[1]));
+			emit_op(result_type, result_id, expr, should_forward(args[0]) && should_forward(args[1]));
+		}
+		inherit_expression_dependencies(result_id, args[0]);
+		inherit_expression_dependencies(result_id, args[1]);
+		break;
+	}
+
+	case GLSLstd450Refract:
+	{
+		// OpenCL C has no refract(). Implement inline.
+		// refract(I, N, eta): k = 1 - eta^2*(1 - dot(N,I)^2); k < 0 ? 0 : eta*I - (eta*dot(N,I)+sqrt(k))*N
+		auto &type = get<SPIRType>(result_type);
+		forced_temporaries.insert(result_id);
+		auto type_name = type_to_glsl(type);
+		emit_op(result_type, result_id, join("(", type_name, ")(0.0f)"), false);
+		auto I = to_expression(args[0]);
+		auto N = to_expression(args[1]);
+		auto eta = to_expression(args[2]);
+		auto res = to_expression(result_id);
+		statement("{");
+		if (type.vecsize == 1)
+		{
+			statement("    float spv_NdotI = ", N, " * ", I, ";");
+		}
+		else
+		{
+			statement("    float spv_NdotI = dot(", N, ", ", I, ");");
+		}
+		statement("    float spv_k = 1.0f - ", eta, " * ", eta, " * (1.0f - spv_NdotI * spv_NdotI);");
+		statement("    if (spv_k >= 0.0f)");
+		statement("        ", res, " = ", eta, " * ", I, " - (", eta, " * spv_NdotI + sqrt(spv_k)) * ", N, ";");
+		statement("}");
+		break;
+	}
+
+	case GLSLstd450Length:
+	{
+		auto &type = expression_type(args[0]);
+		if (type.vecsize == 1)
+			emit_unary_func_op(result_type, result_id, args[0], "fabs");
+		else
+			emit_unary_func_op(result_type, result_id, args[0], "length");
+		break;
+	}
+
+	case GLSLstd450Distance:
+	{
+		auto &type = expression_type(args[0]);
+		if (type.vecsize == 1)
+		{
+			auto expr = join("fabs(", to_expression(args[0]), " - ", to_expression(args[1]), ")");
+			emit_op(result_type, result_id, expr, should_forward(args[0]) && should_forward(args[1]));
+			inherit_expression_dependencies(result_id, args[0]);
+			inherit_expression_dependencies(result_id, args[1]);
+		}
+		else
+			emit_binary_func_op(result_type, result_id, args[0], args[1], "distance");
+		break;
+	}
+
+	case GLSLstd450Normalize:
+	{
+		auto &type = expression_type(args[0]);
+		if (type.vecsize == 1)
+			emit_unary_func_op(result_type, result_id, args[0], "sign");
+		else
+			emit_unary_func_op(result_type, result_id, args[0], "normalize");
+		break;
+	}
+
+	case GLSLstd450PackSnorm4x8:
+		if (!needs_pack_snorm_4x8)
+		{
+			needs_pack_snorm_4x8 = true;
+			force_recompile();
+		}
+		emit_unary_func_op(result_type, result_id, args[0], "spvPackSnorm4x8");
+		break;
+	case GLSLstd450PackUnorm4x8:
+		if (!needs_pack_unorm_4x8)
+		{
+			needs_pack_unorm_4x8 = true;
+			force_recompile();
+		}
+		emit_unary_func_op(result_type, result_id, args[0], "spvPackUnorm4x8");
+		break;
+	case GLSLstd450PackSnorm2x16:
+		if (!needs_pack_snorm_2x16)
+		{
+			needs_pack_snorm_2x16 = true;
+			force_recompile();
+		}
+		emit_unary_func_op(result_type, result_id, args[0], "spvPackSnorm2x16");
+		break;
+	case GLSLstd450PackUnorm2x16:
+		if (!needs_pack_unorm_2x16)
+		{
+			needs_pack_unorm_2x16 = true;
+			force_recompile();
+		}
+		emit_unary_func_op(result_type, result_id, args[0], "spvPackUnorm2x16");
+		break;
+	case GLSLstd450UnpackSnorm4x8:
+		if (!needs_unpack_snorm_4x8)
+		{
+			needs_unpack_snorm_4x8 = true;
+			force_recompile();
+		}
+		emit_unary_func_op(result_type, result_id, args[0], "spvUnpackSnorm4x8");
+		break;
+	case GLSLstd450UnpackUnorm4x8:
+		if (!needs_unpack_unorm_4x8)
+		{
+			needs_unpack_unorm_4x8 = true;
+			force_recompile();
+		}
+		emit_unary_func_op(result_type, result_id, args[0], "spvUnpackUnorm4x8");
+		break;
+	case GLSLstd450UnpackSnorm2x16:
+		if (!needs_unpack_snorm_2x16)
+		{
+			needs_unpack_snorm_2x16 = true;
+			force_recompile();
+		}
+		emit_unary_func_op(result_type, result_id, args[0], "spvUnpackSnorm2x16");
+		break;
+	case GLSLstd450UnpackUnorm2x16:
+		if (!needs_unpack_unorm_2x16)
+		{
+			needs_unpack_unorm_2x16 = true;
+			force_recompile();
+		}
+		emit_unary_func_op(result_type, result_id, args[0], "spvUnpackUnorm2x16");
+		break;
+
+	case GLSLstd450Determinant:
+	{
+		auto *e = maybe_get<SPIRExpression>(args[0]);
+		bool old_transpose = e && e->need_transpose;
+		if (old_transpose)
+			e->need_transpose = false;
+
+		auto &type = expression_type(args[0]);
+		assert(type.vecsize == type.columns);
+		const char *func = "spvDeterminant2";
+		if (type.vecsize == 2)
+		{
+			if (!needs_determinant_2)
+			{
+				needs_determinant_2 = true;
+				force_recompile();
+			}
+		}
+		else if (type.vecsize == 3)
+		{
+			func = "spvDeterminant3";
+			if (!needs_determinant_3)
+			{
+				needs_determinant_3 = true;
+				force_recompile();
+			}
+		}
+		else if (type.vecsize == 4)
+		{
+			func = "spvDeterminant4";
+			if (!needs_determinant_4)
+			{
+				needs_determinant_4 = true;
+				force_recompile();
+			}
+		}
+
+		emit_unary_func_op(result_type, result_id, args[0], func);
+
+		if (old_transpose)
+			e->need_transpose = true;
 		break;
 	}
 
-	case GLSLstd450SSign:
+	case GLSLstd450MatrixInverse:
 	{
-		// OpenCL has no integer sign(). Use clamp(x, -1, 1).
-		auto &expr_type = expression_type(args[0]);
-		auto &out_type = get<SPIRType>(result_type);
-
-		auto expected_basetype = to_signed_basetype(expr_type.width);
-		string input_expr;
-		if (expr_type.basetype != expected_basetype)
-			input_expr = bitcast_expression(expected_basetype, args[0]);
-		else
-			input_expr = to_expression(args[0]);
-
-		string expr = join("clamp(", input_expr, ", -1, 1)");
+		auto *a = maybe_get<SPIRExpression>(args[0]);
+		bool old_transpose = a && a->need_transpose;
+		if (old_transpose)
+			a->need_transpose = false;
 
-		// Cast to result type if needed (e.g. result is unsigned).
-		if (out_type.basetype != expected_basetype)
+		auto &type = get<SPIRType>(result_type);
+		assert(type.vecsize == type.columns);
+		const char *inv_func = "spvInverse2";
+		if (type.vecsize == 2)
 		{
-			SPIRType signed_type = out_type;
-			signed_type.basetype = expected_basetype;
-			expr = join(bitcast_glsl_op(out_type, signed_type), "(", expr, ")");
+			if (!needs_inverse_2)
+			{
+				needs_inverse_2 = true;
+				force_recompile();
+			}
+		}
+		else if (type.vecsize == 3)
+		{
+			inv_func = "spvInverse3";
+			if (!needs_inverse_3)
+			{
+				needs_inverse_3 = true;
+				force_recompile();
+			}
+		}
+		else if (type.vecsize == 4)
+		{
+			inv_func = "spvInverse4";
+			if (!needs_inverse_4)
+			{
+				needs_inverse_4 = true;
+				force_recompile();
+			}
 		}
 
-		emit_op(result_type, result_id, expr, should_forward(args[0]));
+		bool forward = should_forward(args[0]);
+		auto &expr_out =
+		    emit_op(result_type, result_id, join(inv_func, "(", to_unpacked_expression(args[0]), ")"), forward);
 		inherit_expression_dependencies(result_id, args[0]);
+
+		if (old_transpose)
+		{
+			expr_out.need_transpose = true;
+			a->need_transpose = true;
+		}
 		break;
 	}
 
-	case GLSLstd450FindSMsb:
-	{
-		// GLSL findMSB for signed: position of highest bit that differs from sign bit.
-		// OpenCL: (W-1) - clz(x ^ (x >> (W-1)))
-		// x >> (W-1) is arithmetic shift: 0 for positive, -1 for negative.
-		// x ^ -1 = ~x, x ^ 0 = x. So this gives clz(x) for positive, clz(~x) for negative.
-		auto &expr_type = expression_type(args[0]);
-		auto &out_type = get<SPIRType>(result_type);
-		uint32_t width = expr_type.width;
-
-		// Input must be signed for arithmetic right shift.
-		auto signed_basetype = to_signed_basetype(width);
-		SPIRType signed_type = expr_type;
-		signed_type.basetype = signed_basetype;
-
-		string input_expr;
-		if (expr_type.basetype != signed_basetype)
-			input_expr = bitcast_expression(signed_basetype, args[0]);
-		else
-			input_expr = to_enclosed_expression(args[0]);
-
-		string xor_expr = join(input_expr, " ^ (", input_expr, " >> ", width - 1, ")");
-		string expr = join(width - 1, " - clz(", xor_expr, ")");
-
-		// clz on signed type returns signed, so result is signed. Cast if output is unsigned.
-		if (out_type.basetype != signed_basetype)
-			expr = join(bitcast_glsl_op(out_type, signed_type), "(", expr, ")");
+	// NMin / NMax / NClamp: OpenCL fmin/fmax return the non-NaN operand, as the N* ops require; use them directly.
+	case GLSLstd450NMin:
+		emit_binary_func_op(result_type, result_id, args[0], args[1], "fmin");
+		break;
+	case GLSLstd450NMax:
+		emit_binary_func_op(result_type, result_id, args[0], args[1], "fmax");
+		break;
+	case GLSLstd450NClamp:
+		emit_trinary_func_op(result_type, result_id, args[0], args[1], args[2], "clamp");
+		break;
 
-		emit_op(result_type, result_id, expr, should_forward(args[0]));
-		inherit_expression_dependencies(result_id, args[0]);
+	case GLSLstd450Frexp:
+	{
+		// OpenCL frexp signature matches GLSL: frexp(x, &exp)
+		register_call_out_argument(args[1]);
+		forced_temporaries.insert(result_id);
+		emit_op(result_type, result_id, join("frexp(", to_expression(args[0]), ", &", to_expression(args[1]), ")"),
+		        false);
 		break;
 	}
 
-	case GLSLstd450FindUMsb:
+	case GLSLstd450FrexpStruct:
 	{
-		// GLSL findMSB for unsigned: position of highest set bit, -1 for 0.
-		// OpenCL: (W-1) - clz(x). clz(0) = W, so result = -1 for 0.
-		auto &expr_type = expression_type(args[0]);
-		auto &out_type = get<SPIRType>(result_type);
-		uint32_t width = expr_type.width;
+		auto &type = get<SPIRType>(result_type);
+		emit_uninitialized_temporary_expression(result_type, result_id);
+		statement(to_expression(result_id), ".", to_member_name(type, 0), " = frexp(", to_expression(args[0]), ", &",
+		          to_expression(result_id), ".", to_member_name(type, 1), ");");
+		break;
+	}
 
-		auto unsigned_basetype = to_unsigned_basetype(width);
-		string input_expr;
-		if (expr_type.basetype != unsigned_basetype)
-			input_expr = bitcast_expression(unsigned_basetype, args[0]);
-		else
-			input_expr = to_expression(args[0]);
+	case GLSLstd450Ldexp:
+		emit_binary_func_op(result_type, result_id, args[0], args[1], "ldexp");
+		break;
 
-		// Cast to signed for the subtraction so result can be -1.
-		auto signed_basetype = to_signed_basetype(width);
-		SPIRType signed_type = out_type;
-		signed_type.basetype = signed_basetype;
-		string clz_expr = join("as_", type_to_glsl(signed_type), "(clz(", input_expr, "))");
+	case GLSLstd450Cross:
+		emit_binary_func_op(result_type, result_id, args[0], args[1], "cross");
+		break;
 
-		string expr = join(width - 1, " - ", clz_expr);
+	case GLSLstd450FSign:
+		emit_unary_func_op(result_type, result_id, args[0], "sign");
+		break;
 
-		// findMSB returns int (signed). Cast if output type differs.
-		if (out_type.basetype != signed_basetype)
-		{
-			expr = join(bitcast_glsl_op(out_type, signed_type), "(", expr, ")");
-		}
+	case GLSLstd450FAbs:
+		emit_unary_func_op(result_type, result_id, args[0], "fabs");
+		break;
 
-		emit_op(result_type, result_id, expr, should_forward(args[0]));
-		inherit_expression_dependencies(result_id, args[0]);
+	case GLSLstd450FMin:
+		emit_binary_func_op(result_type, result_id, args[0], args[1], "fmin");
+		break;
+	case GLSLstd450FMax:
+		emit_binary_func_op(result_type, result_id, args[0], args[1], "fmax");
+		break;
+	case GLSLstd450FClamp:
+		emit_trinary_func_op(result_type, result_id, args[0], args[1], args[2], "clamp");
+		break;
+	case GLSLstd450SMin:
+		emit_binary_func_op(result_type, result_id, args[0], args[1], "min");
+		break;
+	case GLSLstd450SMax:
+		emit_binary_func_op(result_type, result_id, args[0], args[1], "max");
+		break;
+	case GLSLstd450UMin:
+		emit_binary_func_op(result_type, result_id, args[0], args[1], "min");
+		break;
+	case GLSLstd450UMax:
+		emit_binary_func_op(result_type, result_id, args[0], args[1], "max");
+		break;
+	case GLSLstd450SClamp:
+		emit_trinary_func_op(result_type, result_id, args[0], args[1], args[2], "clamp");
+		break;
+	case GLSLstd450UClamp:
+		emit_trinary_func_op(result_type, result_id, args[0], args[1], args[2], "clamp");
+		break;
+
+	case GLSLstd450FMix:
+	case GLSLstd450IMix:
+		emit_trinary_func_op(result_type, result_id, args[0], args[1], args[2], "mix");
+		break;
+	case GLSLstd450Step:
+		emit_binary_func_op(result_type, result_id, args[0], args[1], "step");
+		break;
+	case GLSLstd450SmoothStep:
+		emit_trinary_func_op(result_type, result_id, args[0], args[1], args[2], "smoothstep");
+		break;
+	case GLSLstd450Fma:
+		emit_trinary_func_op(result_type, result_id, args[0], args[1], args[2], "fma");
 		break;
-	}
 
 	default:
 		CompilerGLSL::emit_glsl_op(result_type, result_id, op, args, count);
@@ -1410,6 +2504,21 @@ std::string CompilerOpenCL::to_atomic_ptr_expression(uint32_t id)
 
 // Task #3: In OpenCL C, pointer-to-struct member access uses -> instead of .
 // ptr_chain_is_resolved == false means this is the first member access from the base.
+bool CompilerOpenCL::should_dereference(uint32_t id)
+{
+	// Function parameters whose type is a StorageClassFunction pointer are emitted as real
+	// pointers (T*) in OpenCL C, so member/component access must go through a dereference,
+	// e.g. (*a).x rather than a.x. Every other case follows the base class rules.
+	const auto &expr_type = expression_type(id);
+	const bool function_pointer = is_pointer(expr_type) && expr_type.storage == StorageClassFunction;
+	if (!function_pointer)
+		return CompilerGLSL::should_dereference(id);
+	const auto *param_var = maybe_get<SPIRVariable>(id);
+	if (param_var && param_var->parameter != nullptr)
+		return true;
+	return CompilerGLSL::should_dereference(id);
+}
+
 std::string CompilerOpenCL::to_member_reference(uint32_t base, const SPIRType &type, uint32_t index,
                                                 bool ptr_chain_is_resolved)
 {
@@ -1439,7 +2548,19 @@ std::string CompilerOpenCL::to_member_reference(uint32_t base, const SPIRType &t
 			{
 				return join("->", to_member_name(type, index));
 			}
-			// StorageClassUniform (UBO): emitted by value in OpenCL — use '.'
+			// StorageClassUniform with BufferBlock decoration is a legacy SSBO (GLSL 430 style),
+			// emitted as __global T* in OpenCL C — use ->.
+			// Plain Uniform with Block decoration is a UBO, emitted by value — use '.'.
+			if (sc == StorageClassUniform)
+			{
+				auto *var = maybe_get_backing_variable(base);
+				if (var)
+				{
+					auto &var_type = get<SPIRType>(var->basetype);
+					if (has_decoration(var_type.self, DecorationBufferBlock))
+						return join("->", to_member_name(type, index));
+				}
+			}
 		}
 	}
 	return join(".", to_member_name(type, index));
@@ -1800,6 +2921,67 @@ void CompilerOpenCL::emit_function(SPIRFunction &func, const Bitset &return_flag
 	}
 }
 
+void CompilerOpenCL::emit_store_statement(uint32_t lhs_expression, uint32_t rhs_expression)
+{
+	const auto &rhs_type = expression_type(rhs_expression);
+	auto *dst = maybe_get<SPIRExpression>(lhs_expression);
+
+	// Only stores into a destination flagged need_transpose (row-major storage) are special;
+	// every other store is handled by the base class.
+	if (!dst || !dst->need_transpose)
+	{
+		CompilerGLSL::emit_store_statement(lhs_expression, rhs_expression);
+		return;
+	}
+
+	// A function call result is an rvalue in OpenCL C, so "spvTranspose(lhs) = rhs" (what the
+	// base class would produce via convert_row_major_matrix) cannot be emitted. The destination
+	// is therefore written in its physical layout: clear the flag while building the statement
+	// and restore it afterwards.
+	dst->need_transpose = false;
+
+	if (is_matrix(rhs_type))
+	{
+		auto *src = maybe_get<SPIRExpression>(rhs_expression);
+		if (src && src->need_transpose)
+		{
+			// Source and destination are both transposed, so the two transposes cancel out
+			// and the value can be copied without a helper call.
+			src->need_transpose = false;
+			statement(to_expression(lhs_expression), " = ", to_unpacked_row_major_matrix_expression(rhs_expression),
+			          ";");
+			src->need_transpose = true;
+		}
+		else
+		{
+			// Logical source, physical destination: transpose the value being stored.
+			const MatrixTypeKey logical_key = { rhs_type.basetype, rhs_type.vecsize, rhs_type.columns };
+			need_transpose.insert(logical_key);
+			used_matrix_types.insert(logical_key);
+			const auto suffix = opencl_matrix_short_name(rhs_type.basetype, rhs_type.vecsize, rhs_type.columns);
+			statement(to_expression(lhs_expression), " = spvTranspose", suffix, "(",
+			          to_unpacked_expression(rhs_expression), ");");
+		}
+	}
+	else
+	{
+		// Storing a single column into a row-major matrix: unroll into one write per component,
+		// placing the component subscript ahead of the last '[' of the target expression.
+		for (uint32_t component = 0; component < rhs_type.vecsize; component++)
+		{
+			auto target = to_dereferenced_expression(lhs_expression);
+			const auto bracket = target.find_last_of('[');
+			if (bracket == string::npos)
+				continue;
+			statement(target.insert(bracket, join('[', component, ']')), " = ",
+			          to_extract_component_expression(rhs_expression, component), ";");
+		}
+	}
+
+	dst->need_transpose = true;
+	register_write(lhs_expression);
+}
+
 void CompilerOpenCL::emit_struct_member(const SPIRType &type, uint32_t member_type_id, uint32_t index,
                                         const string &qualifier, uint32_t)
 {
@@ -1811,6 +2993,30 @@ void CompilerOpenCL::emit_struct_member(const SPIRType &type, uint32_t member_ty
 	{
 		statement(qualifier, "ulong ", to_member_name(type, index), ";");
 	}
+	else if (has_member_decoration(type.self, index, DecorationRowMajor))
+	{
+		// Row-major matrix: the physical layout has transposed dimensions.
+		// Emit the member with the physical (transposed) type so struct layout matches buffer.
+		// Walk through array nesting to find the inner matrix type.
+		const auto *inner = &membertype;
+		while (is_array(*inner))
+			inner = &get<SPIRType>(inner->parent_type);
+
+		if (inner->columns > 1)
+		{
+			auto phys_type_name = opencl_matrix_type_name(inner->basetype, inner->columns, inner->vecsize);
+			MatrixTypeKey phys_key = { inner->basetype, inner->columns, inner->vecsize };
+			used_matrix_types.insert(phys_key);
+
+			statement(qualifier, phys_type_name, " ", to_member_name(type, index), type_to_array_glsl(membertype, 0),
+			          ";");
+		}
+		else
+		{
+			// Not actually a matrix member, fall through to default.
+			statement(qualifier, variable_decl(membertype, to_member_name(type, index)), ";");
+		}
+	}
 	else
 	{
 		statement(qualifier, variable_decl(membertype, to_member_name(type, index)), ";");
@@ -2188,24 +3394,240 @@ void CompilerOpenCL::emit_instruction(const Instruction &instruction)
 		break;
 	}
 
-	// OpOuterProduct: no OpenCL builtin and no native matrix type.
-	// The result matrix type is represented as its column vector type in OpenCL C.
-	// Emit only the first column (col_vec * row_vec.x).
+	// OpOuterProduct: use struct-wrapped matrix helper.
 	case OpOuterProduct:
 	{
 		uint32_t result_type = ops[0];
 		uint32_t result_id = ops[1];
 		uint32_t col_vec = ops[2]; // column vector
 		uint32_t row_vec = ops[3]; // row vector
+		auto &res_type = get<SPIRType>(result_type);
+		auto &col_type = expression_type(col_vec);
 		auto &row_type = expression_type(row_vec);
 
-		// First column of the outer product: col_vec * row_vec.x
-		string first_row_elem =
-		    row_type.vecsize > 1 ? join(to_expression(row_vec), ".", index_to_swizzle(0)) : to_expression(row_vec);
-		string expr = join(to_expression(col_vec), " * ", first_row_elem);
-		emit_op(result_type, result_id, expr, should_forward(col_vec) && should_forward(row_vec));
-		inherit_expression_dependencies(result_id, col_vec);
-		inherit_expression_dependencies(result_id, row_vec);
+		need_outer_product.insert(make_matrix_key(res_type)); // keyed by the result matrix type
+		// Ensure the result matrix type is registered in used_matrix_types as well.
+		used_matrix_types.insert(make_matrix_key(res_type));
+
+		auto col_short = opencl_vector_short_name(col_type.basetype, col_type.vecsize);
+		auto row_short = opencl_vector_short_name(row_type.basetype, row_type.vecsize);
+		string func_name = "spvOuterProduct" + col_short + row_short; // name = prefix + column-vector + row-vector names
+
+		emit_binary_func_op(result_type, result_id, col_vec, row_vec, func_name.c_str());
+		break;
+	}
+
+	// Matrix arithmetic operations using struct-wrapped matrix helpers.
+	case OpMatrixTimesVector:
+	{
+		uint32_t result_type = ops[0];
+		uint32_t result_id = ops[1];
+		uint32_t mat_id = ops[2];
+		uint32_t vec_id = ops[3];
+		auto &mat_type = expression_type(mat_id);
+
+		auto *e = maybe_get<SPIRExpression>(mat_id);
+		if (e && e->need_transpose)
+		{
+			// Transposed M * v = v * M_untransposed, so the vector-times-matrix helper is called instead.
+			// mat_type is the SPIR-V type (e.g., mat2x3 = 2 cols, vecsize=3).
+			// The untransposed (physical) matrix is mat3x2 = 3 cols, vecsize=2.
+			e->need_transpose = false; // cleared while the call expression is built; restored after emit_op
+			uint32_t phys_cols = mat_type.vecsize;
+			uint32_t phys_rows = mat_type.columns;
+			MatrixTypeKey phys_key = { mat_type.basetype, phys_rows, phys_cols }; // { basetype, vecsize, columns }
+			need_mul_vec_mat.insert(phys_key);
+			used_matrix_types.insert(phys_key);
+
+			auto vec_short = opencl_vector_short_name(mat_type.basetype, phys_rows); // length == physical vecsize
+			auto mat_short = opencl_matrix_short_name(mat_type.basetype, phys_rows, phys_cols);
+			string func_name = "spvMul" + vec_short + mat_short;
+
+			string expr =
+			    join(func_name, "(", to_expression(vec_id), ", ", to_unpacked_row_major_matrix_expression(mat_id), ")");
+			bool forward = should_forward(mat_id) && should_forward(vec_id);
+			emit_op(result_type, result_id, expr, forward);
+			e->need_transpose = true;
+		}
+		else
+		{
+			auto key = make_matrix_key(mat_type);
+			need_mul_mat_vec.insert(key);
+
+			auto mat_short = opencl_matrix_short_name(mat_type.basetype, mat_type.vecsize, mat_type.columns);
+			auto vec_short = opencl_vector_short_name(mat_type.basetype, mat_type.columns); // length == matrix columns
+			string func_name = "spvMul" + mat_short + vec_short;
+
+			emit_binary_func_op(result_type, result_id, mat_id, vec_id, func_name.c_str());
+		}
+		inherit_expression_dependencies(result_id, mat_id);
+		inherit_expression_dependencies(result_id, vec_id);
+		break;
+	}
+
+	case OpVectorTimesMatrix:
+	{
+		uint32_t result_type = ops[0];
+		uint32_t result_id = ops[1];
+		uint32_t vec_id = ops[2];
+		uint32_t mat_id = ops[3];
+		auto &mat_type = expression_type(mat_id);
+
+		auto *e = maybe_get<SPIRExpression>(mat_id);
+		if (e && e->need_transpose)
+		{
+			// v * M^T = M_untransposed * v, so the matrix-times-vector helper is called on the physical matrix.
+			// mat_type is the SPIR-V type (e.g., mat2x3 = 2 cols, vecsize=3), so v has 3 components.
+			// The untransposed (physical) matrix is mat3x2 = 3 cols, vecsize=2.
+			e->need_transpose = false; // cleared while the call expression is built; restored after emit_op
+			uint32_t phys_cols = mat_type.vecsize;
+			uint32_t phys_rows = mat_type.columns;
+			MatrixTypeKey phys_key = { mat_type.basetype, phys_rows, phys_cols }; // { basetype, vecsize, columns }
+			need_mul_mat_vec.insert(phys_key);
+			used_matrix_types.insert(phys_key);
+
+			auto mat_short = opencl_matrix_short_name(mat_type.basetype, phys_rows, phys_cols);
+			auto vec_short = opencl_vector_short_name(mat_type.basetype, phys_cols); // length == physical columns
+			string func_name = "spvMul" + mat_short + vec_short;
+
+			string expr =
+			    join(func_name, "(", to_unpacked_row_major_matrix_expression(mat_id), ", ", to_expression(vec_id), ")");
+			bool forward = should_forward(mat_id) && should_forward(vec_id);
+			emit_op(result_type, result_id, expr, forward);
+			e->need_transpose = true;
+		}
+		else
+		{
+			auto key = make_matrix_key(mat_type);
+			need_mul_vec_mat.insert(key);
+
+			auto vec_short = opencl_vector_short_name(mat_type.basetype, mat_type.vecsize); // length == matrix vecsize
+			auto mat_short = opencl_matrix_short_name(mat_type.basetype, mat_type.vecsize, mat_type.columns);
+			string func_name = "spvMul" + vec_short + mat_short;
+
+			emit_binary_func_op(result_type, result_id, vec_id, mat_id, func_name.c_str());
+		}
+		inherit_expression_dependencies(result_id, vec_id);
+		inherit_expression_dependencies(result_id, mat_id);
+		break;
+	}
+
+	case OpMatrixTimesMatrix:
+	{
+		uint32_t result_type = ops[0];
+		uint32_t result_id = ops[1];
+		uint32_t a_id = ops[2];
+		uint32_t b_id = ops[3];
+		auto &a_type = expression_type(a_id);
+		auto &b_type = expression_type(b_id);
+
+		auto *ea = maybe_get<SPIRExpression>(a_id);
+		auto *eb = maybe_get<SPIRExpression>(b_id);
+
+		if (ea && eb && ea->need_transpose && eb->need_transpose)
+		{
+			// (A^T * B^T) = (B * A)^T: multiply the physical operands in swapped order.
+			// Physical (untransposed) matrices have swapped dimensions.
+			ea->need_transpose = false; // both flags are restored after the expression is emitted
+			eb->need_transpose = false;
+
+			MatrixTypeKey phys_b = { b_type.basetype, b_type.columns, b_type.vecsize }; // { basetype, vecsize, columns }
+			MatrixTypeKey phys_a = { a_type.basetype, a_type.columns, a_type.vecsize }; // { basetype, vecsize, columns }
+			need_mul_mat_mat.insert({ phys_b, phys_a });
+			need_mul_mat_vec.insert(phys_b); // MatVec helper for the left operand, as in the branch below
+			used_matrix_types.insert(phys_b);
+			used_matrix_types.insert(phys_a);
+
+			auto mat_b_short = opencl_matrix_short_name(phys_b.basetype, phys_b.vecsize, phys_b.columns);
+			auto mat_a_short = opencl_matrix_short_name(phys_a.basetype, phys_a.vecsize, phys_a.columns);
+			string func_name = "spvMul" + mat_b_short + mat_a_short;
+
+			string expr = join(func_name, "(", to_unpacked_row_major_matrix_expression(b_id), ", ",
+			                   to_unpacked_row_major_matrix_expression(a_id), ")");
+			bool forward = should_forward(a_id) && should_forward(b_id);
+			emit_transposed_op(result_type, result_id, expr, forward); // NOTE(review): presumably flags the result as transposed - confirm
+
+			ea->need_transpose = true;
+			eb->need_transpose = true;
+		}
+		else
+		{
+			auto key_a = make_matrix_key(a_type);
+			auto key_b = make_matrix_key(b_type);
+			need_mul_mat_mat.insert({ key_a, key_b });
+			// Also need the MatVec helper for the inner multiplication.
+			need_mul_mat_vec.insert(key_a);
+
+			auto mat_a_short = opencl_matrix_short_name(a_type.basetype, a_type.vecsize, a_type.columns);
+			auto mat_b_short = opencl_matrix_short_name(b_type.basetype, b_type.vecsize, b_type.columns);
+			string func_name = "spvMul" + mat_a_short + mat_b_short;
+
+			emit_binary_func_op(result_type, result_id, a_id, b_id, func_name.c_str());
+		}
+		inherit_expression_dependencies(result_id, a_id);
+		inherit_expression_dependencies(result_id, b_id);
+		break;
+	}
+
+	case OpMatrixTimesScalar:
+	{
+		uint32_t result_type = ops[0];
+		uint32_t result_id = ops[1];
+		uint32_t mat_id = ops[2];
+		uint32_t scalar_id = ops[3];
+		auto &mat_type = expression_type(mat_id);
+
+		auto *e = maybe_get<SPIRExpression>(mat_id);
+		if (e && e->need_transpose)
+		{
+			// Scale the physical (untransposed) matrix, which has swapped dimensions.
+			e->need_transpose = false; // cleared while the call expression is built; restored after the emit
+			MatrixTypeKey phys_key = { mat_type.basetype, mat_type.columns, mat_type.vecsize }; // { basetype, vecsize, columns }
+			need_mul_mat_scalar.insert(phys_key);
+			used_matrix_types.insert(phys_key);
+
+			auto mat_short = opencl_matrix_short_name(phys_key.basetype, phys_key.vecsize, phys_key.columns);
+			string func_name = "spvMul" + mat_short + "Scalar";
+
+			string expr = join(func_name, "(", to_unpacked_row_major_matrix_expression(mat_id), ", ",
+			                   to_expression(scalar_id), ")");
+			bool forward = should_forward(mat_id) && should_forward(scalar_id);
+			emit_transposed_op(result_type, result_id, expr, forward); // NOTE(review): presumably flags the result as transposed - confirm
+			e->need_transpose = true;
+		}
+		else
+		{
+			auto key = make_matrix_key(mat_type);
+			need_mul_mat_scalar.insert(key);
+
+			auto mat_short = opencl_matrix_short_name(mat_type.basetype, mat_type.vecsize, mat_type.columns);
+			string func_name = "spvMul" + mat_short + "Scalar";
+
+			emit_binary_func_op(result_type, result_id, mat_id, scalar_id, func_name.c_str());
+		}
+		inherit_expression_dependencies(result_id, mat_id);
+		inherit_expression_dependencies(result_id, scalar_id);
+		break;
+	}
+
+	case OpTranspose:
+	{
+		uint32_t result_type = ops[0];
+		uint32_t result_id = ops[1];
+		uint32_t input_id = ops[2];
+		auto &in_type = expression_type(input_id);
+		auto &res_type = get<SPIRType>(result_type);
+
+		auto key = make_matrix_key(in_type);
+		need_transpose.insert(key); // keyed by the input matrix type
+		// Ensure both input and output matrix types are registered in used_matrix_types.
+		used_matrix_types.insert(key);
+		used_matrix_types.insert(make_matrix_key(res_type));
+
+		auto in_short = opencl_matrix_short_name(in_type.basetype, in_type.vecsize, in_type.columns);
+		string func_name = "spvTranspose" + in_short; // helper name is derived from the input matrix shape
+
+		emit_unary_func_op(result_type, result_id, input_id, func_name.c_str());
 		break;
 	}
 
@@ -2753,6 +4175,15 @@ void CompilerOpenCL::emit_instruction(const Instruction &instruction)
 				e.access_chain = true;
 				if (is_subscript_deref)
 					subscripted_deref_exprs.insert(result_id);
+
+				// Propagate row-major transpose flag for matrix members.
+				if (struct_type && length >= 4)
+				{
+					uint32_t mbr_idx = get<SPIRConstant>(ops[3]).scalar();
+					if (member_is_non_native_row_major_matrix(*struct_type, mbr_idx))
+						e.need_transpose = true;
+				}
+
 				forwarded_temporaries.insert(result_id);
 				suppressed_usage_tracking.insert(result_id);
 				for (uint32_t i = 2; i < length; i++)
@@ -2864,6 +4295,41 @@ void CompilerOpenCL::emit_instruction(const Instruction &instruction)
 		break;
 	}
 
+	case OpCompositeConstruct:
+	{
+		uint32_t result_type = ops[0];
+		uint32_t result_id = ops[1];
+		auto &type = get<SPIRType>(result_type);
+		if (type.columns > 1)
+		{
+			// Matrix composite construct: emit a compound literal, e.g. (spvMat4){ { col0, col1, ... } }
+			const auto *elems = &ops[2]; // constituents follow the result type and result id
+			uint32_t length = instruction.length - 2;
+
+			bool forward = true; // stays true only if every constituent should be forwarded
+			for (uint32_t i = 0; i < length; i++)
+				forward = forward && should_forward(elems[i]);
+
+			auto mat_name = opencl_matrix_type_name(type);
+			string expr = "(" + mat_name + "){ { ";
+			for (uint32_t i = 0; i < length; i++)
+			{
+				if (i > 0)
+					expr += ", ";
+				expr += to_unpacked_expression(elems[i]);
+			}
+			expr += " } }";
+			emit_op(result_type, result_id, expr, forward);
+			for (uint32_t i = 0; i < length; i++)
+				inherit_expression_dependencies(result_id, elems[i]);
+		}
+		else
+		{
+			CompilerGLSL::emit_instruction(instruction); // non-matrix results use the base-class handling
+		}
+		break;
+	}
+
 	case OpCompositeConstructReplicateEXT:
 	{
 		// GLSL base uses type(value) for vector splat, but OpenCL C needs (type)(value).
@@ -2872,9 +4338,17 @@ void CompilerOpenCL::emit_instruction(const Instruction &instruction)
 		auto &type = get<SPIRType>(result_type);
 		if (type.op == OpTypeMatrix)
 		{
-			// OpenCL C has no native matrix type; matrices are represented as their column vector type.
-			// Just use the sub-value directly (representing the first/only column).
-			emit_op(result_type, result_id, to_expression(ops[2]), should_forward(ops[2]));
+			// Struct-wrapped matrix: replicate the column value across all columns.
+			auto mat_name = opencl_matrix_type_name(type);
+			string expr = "(" + mat_name + "){ { ";
+			for (uint32_t i = 0; i < type.columns; i++)
+			{
+				if (i > 0)
+					expr += ", ";
+				expr += to_expression(ops[2]);
+			}
+			expr += " } }";
+			emit_op(result_type, result_id, expr, should_forward(ops[2]));
 			inherit_expression_dependencies(result_id, ops[2]);
 		}
 		else if (type.op != OpTypeArray && type.vecsize > 1)
diff --git a/spirv_opencl.hpp b/spirv_opencl.hpp
index a9991fa90..82e551be9 100644
--- a/spirv_opencl.hpp
+++ b/spirv_opencl.hpp
@@ -25,6 +25,7 @@
 #define SPIRV_CROSS_OPENCL_HPP
 
 #include "spirv_glsl.hpp"
+#include <set>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
@@ -42,14 +43,20 @@ class CompilerOpenCL : public CompilerGLSL
 	{
 		// OpenCL C version: 120 = 1.2, 200 = 2.0
 		uint32_t opencl_version = make_opencl_version(1, 2);
+		// Enable cl_khr_fp16 (half) extension
+		bool enable_fp16 = false;
 		// Enable cl_khr_fp64 (double) extension
 		bool enable_fp64 = false;
 		// Enable cl_khr_int64_extended_atomics extension
 		bool enable_64bit_atomics = false;
 		// Enable cl_khr_subgroups extension
 		bool enable_subgroups = false;
-		// Enable cl_khr_subgroup_shuffle extension
-		bool enable_shuffle = false;
+		// Enable all subgroup extensions
+		bool enable_subgroups_all = false;
+		// Emulate missing subgroup extensions
+		bool emulate_subgroups = false;
+		// Size of subgroup emulation
+		uint32_t fixed_subgroup_size = 0;
 
 		void set_opencl_version(uint32_t major, uint32_t minor = 0, uint32_t patch = 0)
 		{
@@ -104,6 +111,7 @@ class CompilerOpenCL : public CompilerGLSL
 	std::string variable_decl(const SPIRType &type, const std::string &name, uint32_t id = 0) override;
 	void emit_function_prototype(SPIRFunction &func, const Bitset &return_flags) override;
 	void emit_instruction(const Instruction &instruction) override;
+	bool should_dereference(uint32_t id) override;
 	std::string to_member_reference(uint32_t base, const SPIRType &type, uint32_t index,
 	                                bool ptr_chain_is_resolved) override;
 	std::string to_func_call_arg(const SPIRFunction::Parameter &arg, uint32_t id) override;
@@ -124,10 +132,14 @@ class CompilerOpenCL : public CompilerGLSL
 	std::string get_type_address_space(const SPIRType &type, uint32_t id, bool argument = false);
 	const char *to_restrict(uint32_t id, bool space);
 	uint32_t get_physical_type_id_stride(TypeID type_id) const override;
+	bool member_is_non_native_row_major_matrix(const SPIRType &type, uint32_t index) override;
+	std::string convert_row_major_matrix(std::string exp_str, const SPIRType &exp_type, uint32_t physical_type_id,
+	                                     bool is_packed, bool relaxed) override;
 
 	void replace_illegal_names() override;
 	void emit_function(SPIRFunction &func, const Bitset &return_flags) override;
 	void emit_block_hints(const SPIRBlock &block) override;
+	void emit_store_statement(uint32_t lhs_expression, uint32_t rhs_expression) override;
 	void emit_struct_member(const SPIRType &type, uint32_t member_type_id, uint32_t index,
 	                        const std::string &qualifier = "", uint32_t base_offset = 0) override;
 
@@ -151,6 +163,74 @@ class CompilerOpenCL : public CompilerGLSL
 	bool needs_bitreverse_polyfill = false;
 	// Set when a default sampler is needed for combined image+sampler usage.
 	bool needs_default_sampler = false;
+	// Set when findLSB polyfill is needed.
+	bool needs_findlsb_polyfill = false;
+	// Set when pack/unpack Snorm/Unorm polyfills are needed.
+	bool needs_pack_snorm_4x8 = false;
+	bool needs_pack_unorm_4x8 = false;
+	bool needs_pack_snorm_2x16 = false;
+	bool needs_pack_unorm_2x16 = false;
+	bool needs_unpack_snorm_4x8 = false;
+	bool needs_unpack_unorm_4x8 = false;
+	bool needs_unpack_snorm_2x16 = false;
+	bool needs_unpack_unorm_2x16 = false;
+	// Set when determinant/inverse polyfills are needed (per size).
+	bool needs_determinant_2 = false;
+	bool needs_determinant_3 = false;
+	bool needs_determinant_4 = false;
+	bool needs_inverse_2 = false;
+	bool needs_inverse_3 = false;
+	bool needs_inverse_4 = false;
+
+	// Matrix type support: tracks which matrix signatures (basetype, vecsize, columns) are needed.
+	struct MatrixTypeKey
+	{
+		SPIRType::BaseType basetype;
+		uint32_t vecsize; // components per column vector
+		uint32_t columns; // number of column vectors
+		bool operator<(const MatrixTypeKey &o) const // orders by basetype, then columns, then vecsize (std::set key)
+		{
+			if (basetype != o.basetype)
+				return basetype < o.basetype;
+			if (columns != o.columns)
+				return columns < o.columns;
+			return vecsize < o.vecsize;
+		}
+		bool operator==(const MatrixTypeKey &o) const // member-wise equality
+		{
+			return basetype == o.basetype && vecsize == o.vecsize && columns == o.columns;
+		}
+		bool operator!=(const MatrixTypeKey &o) const
+		{
+			return !(*this == o);
+		}
+	};
+	std::set<MatrixTypeKey> used_matrix_types;
+
+	// Flags for which matrix helper functions need to be emitted.
+	std::set<MatrixTypeKey> need_mul_mat_vec; // MatrixTimesVector
+	std::set<MatrixTypeKey> need_mul_vec_mat; // VectorTimesMatrix
+	std::set<std::pair<MatrixTypeKey, MatrixTypeKey>> need_mul_mat_mat; // MatrixTimesMatrix
+	std::set<MatrixTypeKey> need_mul_mat_scalar; // MatrixTimesScalar
+	std::set<MatrixTypeKey> need_transpose; // OpTranspose (key is input matrix type)
+	std::set<MatrixTypeKey> need_outer_product; // OpOuterProduct (key is result matrix type)
+
+	std::string opencl_matrix_type_name(const SPIRType &type);
+	std::string opencl_matrix_type_name(SPIRType::BaseType basetype, uint32_t vecsize, uint32_t columns);
+	std::string opencl_column_type_name(SPIRType::BaseType basetype, uint32_t vecsize);
+	// Short names for building helper function names (e.g. "Mat4", "Vec4", "DVec4").
+	std::string opencl_matrix_short_name(SPIRType::BaseType basetype, uint32_t vecsize, uint32_t columns);
+	std::string opencl_vector_short_name(SPIRType::BaseType basetype, uint32_t vecsize);
+	void emit_matrix_typedefs();
+	void emit_matrix_helpers();
+	void emit_mul_mat_vec_helper(SPIRType::BaseType basetype, uint32_t vecsize, uint32_t columns);
+	void emit_mul_vec_mat_helper(SPIRType::BaseType basetype, uint32_t vecsize, uint32_t columns);
+	void emit_mul_mat_mat_helper(const MatrixTypeKey &a, const MatrixTypeKey &b);
+	void emit_mul_mat_scalar_helper(SPIRType::BaseType basetype, uint32_t vecsize, uint32_t columns);
+	void emit_transpose_helper(SPIRType::BaseType basetype, uint32_t vecsize, uint32_t columns);
+	void emit_outer_product_helper(SPIRType::BaseType basetype, uint32_t vecsize, uint32_t columns);
+	MatrixTypeKey make_matrix_key(const SPIRType &type);
+	void prepass_discover_matrix_types();
 
 	// For each non-entry function, the ordered list of flattened buffer var IDs to thread as extra params.
 	std::unordered_map<uint32_t, SmallVector<uint32_t>> func_flattened_args;
diff --git a/test_shaders.py b/test_shaders.py
index dbc38ba5c..9343d9a9d 100755
--- a/test_shaders.py
+++ b/test_shaders.py
@@ -613,12 +613,22 @@ def path_to_opencl_standard_cli(shader):
 def validate_shader_opencl(shader, opt, paths):
     shader = reference_path(shader[0], shader[1], opt)
     extensions = []
-    if '.double.' in shader:
+    if '.fp16.' in shader:
+        extensions.append('cl_khr_fp16')
+    if '.fp64.' in shader:
         extensions.append('cl_khr_fp64')
-    if '.subgroup.' in shader:
+    if '.subgroups-emulate.' in shader:
+        if '.subgroups.' in shader:
+            extensions.append('cl_khr_subgroups')
+    elif '.subgroups.' in shader:
         extensions.append('cl_khr_subgroups')
-    if '.shuffle.' in shader:
+        extensions.append('cl_khr_subgroup_ballot')
+        extensions.append('cl_khr_subgroup_clustered_reduce')
+        extensions.append('cl_khr_subgroup_non_uniform_arithmetic')
+        extensions.append('cl_khr_subgroup_non_uniform_vote')
+        extensions.append('cl_khr_subgroup_rotate')
         extensions.append('cl_khr_subgroup_shuffle')
+        extensions.append('cl_khr_subgroup_shuffle_relative')
 
     global ignore_clang
     try:
@@ -681,12 +691,16 @@ def cross_compile_opencl(shader, spirv, opt, iterations, paths):
     opencl_args = [spirv_cross_path, '--output', opencl_path, spirv_path, '--opencl', '--iterations', str(iterations)]
     opencl_args.append('--opencl-version')
     opencl_args.append(path_to_opencl_standard_cli(shader))
-    if '.double.' in shader:
+    if '.fp16.' in shader:
+        opencl_args.append('--opencl-fp16')
+    if '.fp64.' in shader:
         opencl_args.append('--opencl-fp64')
-    if '.subgroup.' in shader:
-        opencl_args.append('--opencl-subgroups')
-    if '.shuffle.' in shader:
-        opencl_args.append('--opencl-shuffle')
+    if '.subgroups.' in shader:
+        opencl_args.append('--opencl-subgroups-all')
+    if '.subgroups-emulate.' in shader:
+        opencl_args.append('--opencl-emulate-subgroups')
+        opencl_args.append('--opencl-fixed-subgroup-size')
+        opencl_args.append('32')
 
     if shader_is_invalid_spirv(shader):
         subprocess.run(opencl_args)

From cfc761b9a5a4ddcc948dcd8212e0e463d801a72c Mon Sep 17 00:00:00 2001
From: Garrick Meeker <gmeeker@gmail.com>
Date: Sat, 14 Mar 2026 11:47:54 -0700
Subject: [PATCH 07/16] OpenCL: add more tests

---
 ...ased-struct-divergent-member-name.asm.comp |  32 +++
 .../comp/arithmetic-conversion-signs.asm.comp |  43 ++++
 ...-physical-layout-mismatch.invalid.asm.comp |  20 ++
 .../asm/comp/atomic-load-store.asm.comp       |  20 ++
 .../asm/comp/atomic-min-max-sign.asm.comp     |  25 ++
 .../asm/comp/atomic-result-temporary.asm.comp |  21 ++
 .../asm/comp/bda-arguments.asm.comp           |  41 +++
 ...-to-array-in-buffer.invalid.asm.spv16.comp |  24 ++
 .../bitcast-fp16-fp32.fp16.invalid.asm.comp   |  22 ++
 .../comp/bitfield-signed-operations.asm.comp  |  38 +++
 .../asm/comp/bitscan.asm.comp                 |  35 +++
 ...block-like-array-type-construct-2.asm.comp |  41 +++
 ...like-array-type-construct.invalid.asm.comp |  37 +++
 ...buffer-device-address-ptr-casting.asm.comp |  36 +++
 ...e-construct-buffer-struct.asm.invalid.comp |  25 ++
 .../comp/constant-composite-undef.asm.comp    |  18 ++
 .../comp/constant-lut-name-aliasing.asm.comp  |  20 ++
 .../asm/comp/copy-logical-2.spv14.asm.comp    |  83 +++++++
 ...fset-and-array-stride-diffs.spv14.asm.comp |  61 +++++
 .../asm/comp/copy-logical.spv14.asm.comp      |  56 +++++
 ...vice-array-load-temporary.asm.invalid.comp |  29 +++
 ...porary.force-native-array.asm.invalid.comp |  29 +++
 ...constant-array-load-store.asm.invalid.comp |  33 +++
 ...-store.force-native-array.asm.invalid.comp |  33 +++
 ...-in-entry-point.noeliminate.spv14.asm.comp |  23 ++
 .../asm/comp/glsl-signed-operations.asm.comp  |  50 ++++
 .../glsl.std450.frexp-modf-struct.asm.comp    |  40 +++
 ...nner-array-of-struct-copy.invalid.asm.comp |  38 +++
 ...mage-atomic-mismatch-sign.asm.invalid.comp |   0
 .../asm/comp/local-size-id-override.asm.comp  |  34 +++
 .../asm/comp/local-size-id.asm.invalid.comp   |  35 +++
 .../asm/comp/modf-storage-class.asm.comp      |  49 ++++
 .../opptrdiff-basic.spv14.invalid.asm.comp    |  56 +++++
 ...pptraccesschain-elem-offset.spv14.asm.comp |  50 ++++
 .../asm/comp/opptrequal-basic.spv14.asm.comp  |  34 +++
 ...tx-bypass-transpose.spv14.asm.invalid.comp |  52 ++++
 .../comp/opptrnotequal-basic.spv14.asm.comp   |  34 +++
 ...-access-chain-custom-array-stride.asm.comp |  21 ++
 .../comp/spec-constant-name-aliasing.asm.comp |  48 ++++
 .../storage-buffer-basic.invalid.asm.comp     |  27 ++
 .../storage-buffer-pointer-argument.asm.comp  |  28 +++
 ...entals-float-controls-2-fp16.fp16.asm.comp |  35 +++
 ...entals-float-controls-2-fp32.fp16.asm.comp |  35 +++
 .../asm/comp/variable-pointers-2.asm.comp     |  16 ++
 .../comp/variable-pointers-3.invalid.asm.comp |  12 +
 ...ariable-pointers-vector-to-scalar.asm.comp |  12 +
 .../comp/variable-pointers.asm.invalid.comp   |  77 ++++++
 .../variable-ssbo-argument.spv16.asm.comp     |  21 ++
 ...ssbo-array-argument.spv16.invalid.asm.comp |  21 ++
 ...ar-alias-ptr-access-chain.asm.invalid.comp | 145 +++++++++++
 ...tier-1.device-argument-buffer.invalid.comp |   0
 ...array-copy-threadgroup-memory.invalid.comp |  20 ++
 .../atomic-cmpxchg-packed-vector.invalid.comp |  31 +++
 .../comp/basic.invalid.comp                   |   0
 .../comp/bda-atomics.invalid.comp             |  44 ++++
 ...a-load-std140-arrayed-pointer.invalid.comp |  27 ++
 .../bda-nonwritable-glslang-workaround.comp   |  26 ++
 ...bda-restrict-pointer-variable.invalid.comp |  26 ++
 .../comp/bitcast-16bit-1.invalid.comp         |   0
 .../comp/bitcast-16bit-2.invalid.comp         |   0
 .../shaders-opencl-no-opt/comp/bitfield.comp  |  35 +++
 ...ce-address-from-pointer-complex-chain.comp |  33 +++
 ...extract-atomics-from-function.invalid.comp |  90 +++++++
 ...vocation-id-writable-ssbo-in-function.comp |  27 ++
 .../comp/glsl.std450.comp                     | 234 ++++++++++++++++++
 .../comp/illegal-struct-name.asm.comp         |  27 ++
 ...plicit-integer-promotion.fp16.invalid.comp |  89 +++++++
 .../comp/int16min-literal.fp16.invalid.comp   |  27 ++
 .../comp/int64.invalid.comp                   |  75 ++++++
 .../comp/int64min-literal.comp                |  26 ++
 .../comp/integer-dot-product.comp             |  58 +++++
 .../comp/intmin-literal.comp                  |  24 ++
 .../shaders-opencl-no-opt/comp/loop.comp      | 100 ++++++++
 .../read-only-coherent-image.invalid.comp     |  17 ++
 .../shaders-opencl-no-opt/comp/return.comp    |  39 +++
 ...tier-1.device-argument-buffer.invalid.comp |   0
 ...std140-array-load-composite-construct.comp |  18 ++
 ...ct-packing-scalar.nocompat.invalid.vk.comp | 144 +++++++++++
 ....vk.opencl12.emulate-subgroup.invalid.comp |   0
 ...at.vk.subgroup.fixed-subgroup.invalid.comp |   0
 ...ubgroups.nocompat.vk.subgroup.invalid.comp |   0
 ....nocompat.vk.subgroup.swizzle.invalid.comp |   0
 ...ncendental-float-controls-1-fp16.fp16.comp |  35 +++
 ...ncendental-float-controls-1-fp32.fp16.comp |  35 +++
 .../transposed-temporary-expression-2.comp    |  56 +++++
 .../comp/transposed-temporary-expression.comp |  41 +++
 .../comp/trivial-select-cast-vector.comp      |  19 ++
 .../comp/trivial-select-matrix.spv14.comp     |  22 ++
 ...roup-size-spec-constant-array.invalid.comp |  58 +++++
 ...ize-spec-constant-array.spv16.invalid.comp |  70 ++++++
 ...ased-struct-divergent-member-name.asm.comp |  77 ++++++
 .../comp/arithmetic-conversion-signs.asm.comp | 131 ++++++++++
 ...-physical-layout-mismatch.invalid.asm.comp |  47 ++++
 .../asm/comp/atomic-load-store.asm.comp       |  48 ++++
 .../asm/comp/atomic-min-max-sign.asm.comp     |  56 +++++
 .../asm/comp/atomic-result-temporary.asm.comp |  59 +++++
 .../asm/comp/bda-arguments.asm.comp           |  81 ++++++
 ...-to-array-in-buffer.invalid.asm.spv16.comp |  71 ++++++
 .../bitcast-fp16-fp32.fp16.invalid.asm.comp   |  63 +++++
 .../comp/bitfield-signed-operations.asm.comp  |  97 ++++++++
 .../asm/comp/bitscan.asm.comp                 |  72 ++++++
 ...block-like-array-type-construct-2.asm.comp |  85 +++++++
 ...like-array-type-construct.invalid.asm.comp |  80 ++++++
 ...buffer-device-address-ptr-casting.asm.comp | 106 ++++++++
 ...e-construct-buffer-struct.asm.invalid.comp |  54 ++++
 .../comp/constant-composite-undef.asm.comp    |  40 +++
 .../comp/constant-lut-name-aliasing.asm.comp  |  81 ++++++
 .../asm/comp/copy-logical-2.spv14.asm.comp    |  81 ++++++
 ...fset-and-array-stride-diffs.spv14.asm.comp |  60 +++++
 .../asm/comp/copy-logical.spv14.asm.comp      |  69 ++++++
 ...vice-array-load-temporary.asm.invalid.comp |  53 ++++
 ...porary.force-native-array.asm.invalid.comp |  53 ++++
 ...constant-array-load-store.asm.invalid.comp |  81 ++++++
 ...-store.force-native-array.asm.invalid.comp |  81 ++++++
 ...-in-entry-point.noeliminate.spv14.asm.comp |  59 +++++
 .../asm/comp/glsl-signed-operations.asm.comp  | 123 +++++++++
 .../glsl.std450.frexp-modf-struct.asm.comp    |  55 ++++
 ...nner-array-of-struct-copy.invalid.asm.comp | 137 ++++++++++
 ...mage-atomic-mismatch-sign.asm.invalid.comp |  71 ++++++
 .../asm/comp/local-size-id-override.asm.comp  |  60 +++++
 .../asm/comp/local-size-id.asm.invalid.comp   |  76 ++++++
 .../asm/comp/modf-storage-class.asm.comp      | 116 +++++++++
 .../opptrdiff-basic.spv14.invalid.asm.comp    |  98 ++++++++
 ...pptraccesschain-elem-offset.spv14.asm.comp |  79 ++++++
 .../asm/comp/opptrequal-basic.spv14.asm.comp  |  96 +++++++
 ...tx-bypass-transpose.spv14.asm.invalid.comp |  98 ++++++++
 .../comp/opptrnotequal-basic.spv14.asm.comp   |  96 +++++++
 ...-access-chain-custom-array-stride.asm.comp |  98 ++++++++
 .../comp/spec-constant-name-aliasing.asm.comp |  78 ++++++
 .../storage-buffer-basic.invalid.asm.comp     |  58 +++++
 .../storage-buffer-pointer-argument.asm.comp  |  63 +++++
 ...entals-float-controls-2-fp16.fp16.asm.comp | 225 +++++++++++++++++
 ...entals-float-controls-2-fp32.fp16.asm.comp | 224 +++++++++++++++++
 .../asm/comp/variable-pointers-2.asm.comp     |  71 ++++++
 .../comp/variable-pointers-3.invalid.asm.comp |  60 +++++
 ...ariable-pointers-vector-to-scalar.asm.comp |  60 +++++
 .../comp/variable-pointers.asm.invalid.comp   | 152 ++++++++++++
 .../variable-ssbo-argument.spv16.asm.comp     |  44 ++++
 ...ssbo-array-argument.spv16.invalid.asm.comp |  45 ++++
 ...ar-alias-ptr-access-chain.asm.invalid.comp | 214 ++++++++++++++++
 ...tier-1.device-argument-buffer.invalid.comp |   9 +
 ...array-copy-threadgroup-memory.invalid.comp |  18 ++
 .../atomic-cmpxchg-packed-vector.invalid.comp |  17 ++
 shaders-opencl-no-opt/comp/basic.invalid.comp |  27 ++
 .../comp/bda-atomics.invalid.comp             |  34 +++
 ...a-load-std140-arrayed-pointer.invalid.comp |  20 ++
 .../bda-nonwritable-glslang-workaround.comp   |  22 ++
 ...bda-restrict-pointer-variable.invalid.comp |  18 ++
 .../comp/bitcast-16bit-1.invalid.comp         |  23 ++
 .../comp/bitcast-16bit-2.invalid.comp         |  26 ++
 shaders-opencl-no-opt/comp/bitfield.comp      |  23 ++
 ...ce-address-from-pointer-complex-chain.comp |  21 ++
 ...extract-atomics-from-function.invalid.comp |  69 ++++++
 ...vocation-id-writable-ssbo-in-function.comp |  12 +
 shaders-opencl-no-opt/comp/glsl.std450.comp   | 129 ++++++++++
 .../comp/illegal-struct-name.asm.comp         |  62 +++++
 ...plicit-integer-promotion.fp16.invalid.comp |  85 +++++++
 .../comp/int16min-literal.fp16.invalid.comp   |  22 ++
 shaders-opencl-no-opt/comp/int64.invalid.comp |  65 +++++
 .../comp/int64min-literal.comp                |  21 ++
 .../comp/integer-dot-product.comp             | 114 +++++++++
 .../comp/intmin-literal.comp                  |  18 ++
 shaders-opencl-no-opt/comp/loop.comp          |  98 ++++++++
 .../read-only-coherent-image.invalid.comp     |  17 ++
 shaders-opencl-no-opt/comp/return.comp        |  33 +++
 ...tier-1.device-argument-buffer.invalid.comp |  13 +
 ...std140-array-load-composite-construct.comp |  13 +
 ...ct-packing-scalar.nocompat.invalid.vk.comp | 100 ++++++++
 ....vk.opencl12.emulate-subgroup.invalid.comp |  25 ++
 ...at.vk.subgroup.fixed-subgroup.invalid.comp | 211 ++++++++++++++++
 ...ubgroups.nocompat.vk.subgroup.invalid.comp | 211 ++++++++++++++++
 ....nocompat.vk.subgroup.swizzle.invalid.comp | 211 ++++++++++++++++
 ...ncendental-float-controls-1-fp16.fp16.comp |  35 +++
 ...ncendental-float-controls-1-fp32.fp16.comp |  35 +++
 .../transposed-temporary-expression-2.comp    |  24 ++
 .../comp/transposed-temporary-expression.comp |  17 ++
 .../comp/trivial-select-cast-vector.comp      |  14 ++
 .../comp/trivial-select-matrix.spv14.comp     |  16 ++
 ...roup-size-spec-constant-array.invalid.comp |  21 ++
 ...ize-spec-constant-array.spv16.invalid.comp |  21 ++
 spirv_opencl.cpp                              | 111 +++++++--
 spirv_opencl.hpp                              |   5 +
 182 files changed, 9737 insertions(+), 19 deletions(-)
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/aliased-struct-divergent-member-name.asm.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/arithmetic-conversion-signs.asm.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/array-copy-physical-layout-mismatch.invalid.asm.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/atomic-load-store.asm.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/atomic-min-max-sign.asm.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/atomic-result-temporary.asm.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/bda-arguments.asm.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/bda-to-array-in-buffer.invalid.asm.spv16.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/bitcast-fp16-fp32.fp16.invalid.asm.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/bitfield-signed-operations.asm.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/bitscan.asm.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/block-like-array-type-construct-2.asm.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/block-like-array-type-construct.invalid.asm.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/buffer-device-address-ptr-casting.asm.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/composite-construct-buffer-struct.asm.invalid.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/constant-composite-undef.asm.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/constant-lut-name-aliasing.asm.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/copy-logical-2.spv14.asm.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/copy-logical-offset-and-array-stride-diffs.spv14.asm.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/copy-logical.spv14.asm.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/device-array-load-temporary.asm.invalid.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/device-array-load-temporary.force-native-array.asm.invalid.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/device-constant-array-load-store.asm.invalid.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/device-constant-array-load-store.force-native-array.asm.invalid.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/eliminate-globals-not-in-entry-point.noeliminate.spv14.asm.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/glsl-signed-operations.asm.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/glsl.std450.frexp-modf-struct.asm.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/groupshared-inner-array-of-struct-copy.invalid.asm.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/image-atomic-mismatch-sign.asm.invalid.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/local-size-id-override.asm.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/local-size-id.asm.invalid.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/modf-storage-class.asm.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/opptrdiff-basic.spv14.invalid.asm.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/opptrdiff-opptraccesschain-elem-offset.spv14.asm.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/opptrequal-basic.spv14.asm.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/opptrequal-row-maj-mtx-bypass-transpose.spv14.asm.invalid.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/opptrnotequal-basic.spv14.asm.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/ptr-access-chain-custom-array-stride.asm.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/spec-constant-name-aliasing.asm.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/storage-buffer-basic.invalid.asm.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/storage-buffer-pointer-argument.asm.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/trancendentals-float-controls-2-fp16.fp16.asm.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/trancendentals-float-controls-2-fp32.fp16.asm.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/variable-pointers-2.asm.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/variable-pointers-3.invalid.asm.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/variable-pointers-vector-to-scalar.asm.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/variable-pointers.asm.invalid.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/variable-ssbo-argument.spv16.asm.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/variable-ssbo-array-argument.spv16.invalid.asm.comp
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/workgroup-uint-to-uchar-alias-ptr-access-chain.asm.invalid.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/argument-buffer-readonly-writeonly-alias.argument.argument-tier-1.device-argument-buffer.invalid.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/array-copy-threadgroup-memory.invalid.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/atomic-cmpxchg-packed-vector.invalid.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/basic.invalid.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/bda-atomics.invalid.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/bda-load-std140-arrayed-pointer.invalid.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/bda-nonwritable-glslang-workaround.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/bda-restrict-pointer-variable.invalid.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/bitcast-16bit-1.invalid.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/bitcast-16bit-2.invalid.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/bitfield.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/buffer-device-address-from-pointer-complex-chain.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/extract-atomics-from-function.invalid.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/global-invocation-id-writable-ssbo-in-function.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/glsl.std450.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/illegal-struct-name.asm.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/implicit-integer-promotion.fp16.invalid.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/int16min-literal.fp16.invalid.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/int64.invalid.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/int64min-literal.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/integer-dot-product.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/intmin-literal.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/loop.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/read-only-coherent-image.invalid.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/return.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/simple-bindless-ssbo.argument.argument-tier-1.device-argument-buffer.invalid.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/std140-array-load-composite-construct.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/struct-packing-scalar.nocompat.invalid.vk.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.opencl12.emulate-subgroup.invalid.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.subgroup.fixed-subgroup.invalid.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.subgroup.invalid.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.subgroup.swizzle.invalid.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/trancendental-float-controls-1-fp16.fp16.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/trancendental-float-controls-1-fp32.fp16.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/transposed-temporary-expression-2.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/transposed-temporary-expression.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/trivial-select-cast-vector.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/trivial-select-matrix.spv14.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/workgroup-size-spec-constant-array.invalid.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/workgroup-size-spec-constant-array.spv16.invalid.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/aliased-struct-divergent-member-name.asm.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/arithmetic-conversion-signs.asm.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/array-copy-physical-layout-mismatch.invalid.asm.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/atomic-load-store.asm.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/atomic-min-max-sign.asm.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/atomic-result-temporary.asm.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/bda-arguments.asm.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/bda-to-array-in-buffer.invalid.asm.spv16.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/bitcast-fp16-fp32.fp16.invalid.asm.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/bitfield-signed-operations.asm.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/bitscan.asm.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/block-like-array-type-construct-2.asm.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/block-like-array-type-construct.invalid.asm.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/buffer-device-address-ptr-casting.asm.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/composite-construct-buffer-struct.asm.invalid.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/constant-composite-undef.asm.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/constant-lut-name-aliasing.asm.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/copy-logical-2.spv14.asm.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/copy-logical-offset-and-array-stride-diffs.spv14.asm.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/copy-logical.spv14.asm.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/device-array-load-temporary.asm.invalid.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/device-array-load-temporary.force-native-array.asm.invalid.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/device-constant-array-load-store.asm.invalid.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/device-constant-array-load-store.force-native-array.asm.invalid.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/eliminate-globals-not-in-entry-point.noeliminate.spv14.asm.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/glsl-signed-operations.asm.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/glsl.std450.frexp-modf-struct.asm.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/groupshared-inner-array-of-struct-copy.invalid.asm.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/image-atomic-mismatch-sign.asm.invalid.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/local-size-id-override.asm.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/local-size-id.asm.invalid.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/modf-storage-class.asm.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/opptrdiff-basic.spv14.invalid.asm.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/opptrdiff-opptraccesschain-elem-offset.spv14.asm.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/opptrequal-basic.spv14.asm.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/opptrequal-row-maj-mtx-bypass-transpose.spv14.asm.invalid.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/opptrnotequal-basic.spv14.asm.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/ptr-access-chain-custom-array-stride.asm.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/spec-constant-name-aliasing.asm.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/storage-buffer-basic.invalid.asm.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/storage-buffer-pointer-argument.asm.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/trancendentals-float-controls-2-fp16.fp16.asm.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/trancendentals-float-controls-2-fp32.fp16.asm.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/variable-pointers-2.asm.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/variable-pointers-3.invalid.asm.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/variable-pointers-vector-to-scalar.asm.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/variable-pointers.asm.invalid.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/variable-ssbo-argument.spv16.asm.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/variable-ssbo-array-argument.spv16.invalid.asm.comp
 create mode 100644 shaders-opencl-no-opt/asm/comp/workgroup-uint-to-uchar-alias-ptr-access-chain.asm.invalid.comp
 create mode 100644 shaders-opencl-no-opt/comp/argument-buffer-readonly-writeonly-alias.argument.argument-tier-1.device-argument-buffer.invalid.comp
 create mode 100644 shaders-opencl-no-opt/comp/array-copy-threadgroup-memory.invalid.comp
 create mode 100644 shaders-opencl-no-opt/comp/atomic-cmpxchg-packed-vector.invalid.comp
 create mode 100644 shaders-opencl-no-opt/comp/basic.invalid.comp
 create mode 100644 shaders-opencl-no-opt/comp/bda-atomics.invalid.comp
 create mode 100644 shaders-opencl-no-opt/comp/bda-load-std140-arrayed-pointer.invalid.comp
 create mode 100644 shaders-opencl-no-opt/comp/bda-nonwritable-glslang-workaround.comp
 create mode 100644 shaders-opencl-no-opt/comp/bda-restrict-pointer-variable.invalid.comp
 create mode 100644 shaders-opencl-no-opt/comp/bitcast-16bit-1.invalid.comp
 create mode 100644 shaders-opencl-no-opt/comp/bitcast-16bit-2.invalid.comp
 create mode 100644 shaders-opencl-no-opt/comp/bitfield.comp
 create mode 100644 shaders-opencl-no-opt/comp/buffer-device-address-from-pointer-complex-chain.comp
 create mode 100644 shaders-opencl-no-opt/comp/extract-atomics-from-function.invalid.comp
 create mode 100644 shaders-opencl-no-opt/comp/global-invocation-id-writable-ssbo-in-function.comp
 create mode 100644 shaders-opencl-no-opt/comp/glsl.std450.comp
 create mode 100644 shaders-opencl-no-opt/comp/illegal-struct-name.asm.comp
 create mode 100644 shaders-opencl-no-opt/comp/implicit-integer-promotion.fp16.invalid.comp
 create mode 100644 shaders-opencl-no-opt/comp/int16min-literal.fp16.invalid.comp
 create mode 100644 shaders-opencl-no-opt/comp/int64.invalid.comp
 create mode 100644 shaders-opencl-no-opt/comp/int64min-literal.comp
 create mode 100644 shaders-opencl-no-opt/comp/integer-dot-product.comp
 create mode 100644 shaders-opencl-no-opt/comp/intmin-literal.comp
 create mode 100644 shaders-opencl-no-opt/comp/loop.comp
 create mode 100644 shaders-opencl-no-opt/comp/read-only-coherent-image.invalid.comp
 create mode 100644 shaders-opencl-no-opt/comp/return.comp
 create mode 100644 shaders-opencl-no-opt/comp/simple-bindless-ssbo.argument.argument-tier-1.device-argument-buffer.invalid.comp
 create mode 100644 shaders-opencl-no-opt/comp/std140-array-load-composite-construct.comp
 create mode 100644 shaders-opencl-no-opt/comp/struct-packing-scalar.nocompat.invalid.vk.comp
 create mode 100644 shaders-opencl-no-opt/comp/subgroups.nocompat.vk.opencl12.emulate-subgroup.invalid.comp
 create mode 100644 shaders-opencl-no-opt/comp/subgroups.nocompat.vk.subgroup.fixed-subgroup.invalid.comp
 create mode 100644 shaders-opencl-no-opt/comp/subgroups.nocompat.vk.subgroup.invalid.comp
 create mode 100644 shaders-opencl-no-opt/comp/subgroups.nocompat.vk.subgroup.swizzle.invalid.comp
 create mode 100644 shaders-opencl-no-opt/comp/trancendental-float-controls-1-fp16.fp16.comp
 create mode 100644 shaders-opencl-no-opt/comp/trancendental-float-controls-1-fp32.fp16.comp
 create mode 100644 shaders-opencl-no-opt/comp/transposed-temporary-expression-2.comp
 create mode 100644 shaders-opencl-no-opt/comp/transposed-temporary-expression.comp
 create mode 100644 shaders-opencl-no-opt/comp/trivial-select-cast-vector.comp
 create mode 100644 shaders-opencl-no-opt/comp/trivial-select-matrix.spv14.comp
 create mode 100644 shaders-opencl-no-opt/comp/workgroup-size-spec-constant-array.invalid.comp
 create mode 100644 shaders-opencl-no-opt/comp/workgroup-size-spec-constant-array.spv16.invalid.comp

diff --git a/reference/shaders-opencl-no-opt/asm/comp/aliased-struct-divergent-member-name.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/aliased-struct-divergent-member-name.asm.comp
new file mode 100644
index 000000000..583813d01
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/aliased-struct-divergent-member-name.asm.comp
@@ -0,0 +1,32 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct T
+{
+    float c;
+};
+
+typedef struct T T;
+
+struct SSBO1
+{
+    T foo[1];
+};
+
+typedef struct SSBO1 SSBO1;
+
+struct SSBO2
+{
+    T bar[1];
+};
+
+typedef struct SSBO2 SSBO2;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global T* _9, __global T* _13)
+{
+    T v = (T){ 40.0f };
+    _9[10].c = v.c;
+    _13[30].c = v.c;
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/arithmetic-conversion-signs.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/arithmetic-conversion-signs.asm.comp
new file mode 100644
index 000000000..19a82a8fb
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/arithmetic-conversion-signs.asm.comp
@@ -0,0 +1,43 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO
+{
+    int s32;
+    uint u32;
+    short s16;
+    ushort u16;
+    float f32;
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global SSBO* _6)
+{
+    int _29 = _6->s32;
+    uint _30 = _6->u32;
+    short _31 = _6->s16;
+    ushort _32 = _6->u16;
+    float _33 = _6->f32;
+    _6->s32 = convert_int(_31);
+    _6->u32 = convert_uint(_31);
+    _6->s32 = convert_int(_32);
+    _6->u32 = convert_uint(_32);
+    _6->u32 = convert_uint(_31);
+    _6->u32 = convert_uint(_32);
+    _6->s16 = convert_short(_29);
+    _6->u16 = convert_ushort(_29);
+    _6->s16 = convert_short(_30);
+    _6->u16 = convert_ushort(_30);
+    _6->u16 = convert_ushort(_29);
+    _6->u16 = convert_ushort(_30);
+    _6->f32 = convert_float(_31);
+    _6->f32 = convert_float(_32);
+    _6->f32 = convert_float(_31);
+    _6->f32 = convert_float(_32);
+    _6->s16 = convert_short(_33);
+    _6->u16 = convert_ushort(_33);
+    _6->u16 = convert_ushort(_33);
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/array-copy-physical-layout-mismatch.invalid.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/array-copy-physical-layout-mismatch.invalid.asm.comp
new file mode 100644
index 000000000..45aaa65c5
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/array-copy-physical-layout-mismatch.invalid.asm.comp
@@ -0,0 +1,20 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO
+{
+    float b[5];
+    float c[5];
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global SSBO* _7)
+{
+    float a[5] = _7->b;
+    a = _7->b;
+    _7->b = a;
+    _7->c = a;
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/atomic-load-store.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/atomic-load-store.asm.comp
new file mode 100644
index 000000000..64dd5c4dc
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/atomic-load-store.asm.comp
@@ -0,0 +1,20 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO
+{
+    uint a;
+    uint b;
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global SSBO* _7)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    uint _16 = atomic_add(&(_7->b), 0u);
+    uint c = _16;
+    atomic_xchg(&(_7->a), c);
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/atomic-min-max-sign.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/atomic-min-max-sign.asm.comp
new file mode 100644
index 000000000..51a153da0
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/atomic-min-max-sign.asm.comp
@@ -0,0 +1,25 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO
+{
+    uint a;
+    int b;
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global SSBO* _6)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    uint _30 = atomic_max(&(_6->a), 1u);
+    uint _31 = atomic_min(&(_6->a), 1u);
+    uint _32 = atomic_min(&(_6->a), 4294967295u);
+    uint _33 = atomic_max(&(_6->a), 4294967295u);
+    int _34 = atomic_max(&(_6->b), -3);
+    int _35 = atomic_min(&(_6->b), -3);
+    int _36 = atomic_min(&(_6->b), 4);
+    int _37 = atomic_max(&(_6->b), 4);
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/atomic-result-temporary.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/atomic-result-temporary.asm.comp
new file mode 100644
index 000000000..e68c8925b
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/atomic-result-temporary.asm.comp
@@ -0,0 +1,21 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO
+{
+    uint count;
+    uint data[1];
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global SSBO* _7)
+{
+    uint _19 = atomic_add(&(_7->count), 1u);
+    if (_19 < 1024u)
+    {
+        _7->data[_19] = ((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x;
+    }
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/bda-arguments.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/bda-arguments.asm.comp
new file mode 100644
index 000000000..e927b1917
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/bda-arguments.asm.comp
@@ -0,0 +1,41 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct _4;
+
+struct _4
+{
+    int _m0;
+    ulong _m1;
+};
+
+typedef struct _4 _4;
+
+struct _16
+{
+    ulong _m0;
+};
+
+typedef struct _16 _16;
+
+void _43(__global _4* __restrict _10, int _44, __global int* __restrict _12, __global int* __restrict __global * __restrict _13, __global int* __restrict _14)
+{
+}
+
+void _40(__global _4* _6, int _41, __global int* _7, __global int* __global * _8, __global int* _9)
+{
+    _43(_6, _41, _7, _8, _9);
+    _6->_m0 = _41;
+    *_7 = _41;
+    *_9 = _41;
+    *_8 = _9;
+}
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(_16 _32)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    __global _4* _28 = ((__global _4*)(_32._m0));
+    _40(_28, 40, &_28->_m0, &_28->_m1, ((__global int*)(_28->_m1)));
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/bda-to-array-in-buffer.invalid.asm.spv16.comp b/reference/shaders-opencl-no-opt/asm/comp/bda-to-array-in-buffer.invalid.asm.spv16.comp
new file mode 100644
index 000000000..803d9421e
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/bda-to-array-in-buffer.invalid.asm.spv16.comp
@@ -0,0 +1,24 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct _7
+{
+    ulong _m0;
+    ulong _m1;
+};
+
+typedef struct _7 _7;
+
+__global uint* _23(__global _7* _2)
+{
+    __global uint* _29 = ((__global uint*)((ulong)(((__global uchar*)(_2->_m1))) + 16ul));
+    *_29 = 1u;
+    return _29;
+}
+
+__attribute__((reqd_work_group_size(16, 16, 1)))
+__kernel void comp_main(__global _7* _2)
+{
+    __global uint* _31 = _23(_2);
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/bitcast-fp16-fp32.fp16.invalid.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/bitcast-fp16-fp32.fp16.invalid.asm.comp
new file mode 100644
index 000000000..b02f295d9
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/bitcast-fp16-fp32.fp16.invalid.asm.comp
@@ -0,0 +1,22 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+struct SSBO
+{
+    half2 a;
+    float b;
+    float c;
+    half2 d;
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global SSBO* _6)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    _6->b = uintBitsToFloat(packFloat2x16(_6->a));
+    _6->d = unpackFloat2x16(floatBitsToUint(_6->c));
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/bitfield-signed-operations.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/bitfield-signed-operations.asm.comp
new file mode 100644
index 000000000..3b5bd0001
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/bitfield-signed-operations.asm.comp
@@ -0,0 +1,38 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO
+{
+    int4 ints;
+    uint4 uints;
+};
+
+typedef struct SSBO SSBO;
+
+uint spvBitReverse(uint v) {
+    v = ((v >> 1u) & 0x55555555u) | ((v & 0x55555555u) << 1u);
+    v = ((v >> 2u) & 0x33333333u) | ((v & 0x33333333u) << 2u);
+    v = ((v >> 4u) & 0x0F0F0F0Fu) | ((v & 0x0F0F0F0Fu) << 4u);
+    v = ((v >> 8u) & 0x00FF00FFu) | ((v & 0x00FF00FFu) << 8u);
+    return (v >> 16u) | (v << 16u);
+}
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global SSBO* _4)
+{
+    int4 _19 = _4->ints;
+    uint4 _20 = _4->uints;
+    _4->ints = popcount(_19);
+    _4->uints = as_uint4(popcount(_19));
+    _4->ints = as_int4(popcount(_20));
+    _4->uints = popcount(_20);
+    _4->ints = as_int4((uint4)(spvBitReverse(as_uint4(_19).s0), spvBitReverse(as_uint4(_19).s1), spvBitReverse(as_uint4(_19).s2), spvBitReverse(as_uint4(_19).s3)));
+    _4->uints = (uint4)(spvBitReverse(_20.s0), spvBitReverse(_20.s1), spvBitReverse(_20.s2), spvBitReverse(_20.s3));
+    _4->ints = (_19 << (32 - 11u - 1)) >> (32 - 11u);
+    _4->uints = as_uint4((as_int4(_20) << (32 - 1 - 11u)) >> (32 - 1));
+    _4->ints = as_int4((as_uint4(_19) >> 1) & ((uint4)(1u << 11u) - (uint4)1u));
+    _4->uints = (_20 >> 11u) & ((uint4)(1u << 1) - (uint4)1u);
+    _4->ints = as_int4((as_uint4(_19) & ~(((uint4)(1u << 11u) - (uint4)1u) << 1)) | ((as_uint4(_19.wzyx) << 1) & (((uint4)(1u << 11u) - (uint4)1u) << 1)));
+    _4->uints = (_20 & ~(((uint4)(1u << 1) - (uint4)1u) << 11u)) | ((_20.wzyx << 11u) & (((uint4)(1u << 1) - (uint4)1u) << 11u));
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/bitscan.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/bitscan.asm.comp
new file mode 100644
index 000000000..f538fab1a
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/bitscan.asm.comp
@@ -0,0 +1,35 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO
+{
+    uint4 u;
+    int4 i;
+};
+
+typedef struct SSBO SSBO;
+
+static int spvFindLSB(uint x) {
+    if (x == 0u) return -1;
+    return 31 - as_int(clz(x & (0u - x)));
+}
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global SSBO* _6)
+{
+    uint4 _19 = _6->u;
+    int4 _20 = _6->i;
+    _6->u = (uint4)(spvFindLSB(_19.x), spvFindLSB(_19.y), spvFindLSB(_19.z), spvFindLSB(_19.w));
+    _6->i = (int4)(spvFindLSB(_19.x), spvFindLSB(_19.y), spvFindLSB(_19.z), spvFindLSB(_19.w));
+    _6->u = (uint4)(spvFindLSB(as_uint(_20.x)), spvFindLSB(as_uint(_20.y)), spvFindLSB(as_uint(_20.z)), spvFindLSB(as_uint(_20.w)));
+    _6->i = (int4)(spvFindLSB(as_uint(_20.x)), spvFindLSB(as_uint(_20.y)), spvFindLSB(as_uint(_20.z)), spvFindLSB(as_uint(_20.w)));
+    _6->u = as_uint4(31 - as_int4(clz(_19)));
+    _6->i = 31 - as_int4(clz(_19));
+    _6->u = as_uint4(31 - as_int4(clz(as_uint4(_20))));
+    _6->i = 31 - as_int4(clz(as_uint4(_20)));
+    _6->u = as_uint4(31 - clz(as_int4(_19) ^ (as_int4(_19) >> 31)));
+    _6->i = 31 - clz(as_int4(_19) ^ (as_int4(_19) >> 31));
+    _6->u = as_uint4(31 - clz(_20 ^ (_20 >> 31)));
+    _6->i = 31 - clz(_20 ^ (_20 >> 31));
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/block-like-array-type-construct-2.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/block-like-array-type-construct-2.asm.comp
new file mode 100644
index 000000000..4ba95f9d4
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/block-like-array-type-construct-2.asm.comp
@@ -0,0 +1,41 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct type_CommonConstants
+{
+    uint g_count;
+    uint3 g_padding4;
+};
+
+typedef struct type_CommonConstants type_CommonConstants;
+
+struct MyStruct
+{
+    float4 m_coefficients[4];
+};
+
+typedef struct MyStruct MyStruct;
+
+struct type_RWStructuredBuffer_MyStruct
+{
+    MyStruct _m0[1];
+};
+
+typedef struct type_RWStructuredBuffer_MyStruct type_RWStructuredBuffer_MyStruct;
+
+constant float4 _27[4] = { (float4)(0.0f), (float4)(0.0f), (float4)(0.0f), (float4)(0.0f) };
+
+__attribute__((reqd_work_group_size(64, 1, 1)))
+__kernel void comp_main(type_CommonConstants CommonConstants, __global MyStruct* g_data)
+{
+    do
+    {
+        if (((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x >= CommonConstants.g_count)
+        {
+            break;
+        }
+        g_data[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] = (MyStruct){ { (float4)(0.0f), (float4)(0.0f), (float4)(0.0f), (float4)(0.0f) } };
+        break;
+    } while(false);
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/block-like-array-type-construct.invalid.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/block-like-array-type-construct.invalid.asm.comp
new file mode 100644
index 000000000..421377b4d
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/block-like-array-type-construct.invalid.asm.comp
@@ -0,0 +1,37 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct _12
+{
+    float _m0[4];
+    float _m1[4];
+};
+
+typedef struct _12 _12;
+
+constant float _36[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
+constant _12 _39[2] = { (_12){ { 1.0f, 2.0f, 3.0f, 4.0f }, { 1.0f, 2.0f, 3.0f, 4.0f } }, (_12){ { 1.0f, 2.0f, 3.0f, 4.0f }, { 1.0f, 2.0f, 3.0f, 4.0f } } };
+
+struct SSBO
+{
+    uint a;
+    int b;
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global SSBO* _8)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    float foo[4];
+    float foo2[4];
+    foo[0] = 1.0f;
+    foo = { 1.0f, 2.0f, 3.0f, 4.0f };
+    foo[1] = 2.0f;
+    foo[2] = 3.0f;
+    foo[3] = 4.0f;
+    foo2 = foo;
+    _12 _41 = (_12){ { foo[0], foo[1], foo[2], foo[3] }, { foo2[0], foo2[1], foo2[2], foo2[3] } };
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/buffer-device-address-ptr-casting.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/buffer-device-address-ptr-casting.asm.comp
new file mode 100644
index 000000000..bd7015044
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/buffer-device-address-ptr-casting.asm.comp
@@ -0,0 +1,36 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SomeBuffer;
+
+struct SomeBuffer
+{
+    float4 v;
+    ulong a;
+    uint2 b;
+};
+
+typedef struct SomeBuffer SomeBuffer;
+
+struct Registers
+{
+    ulong address;
+    uint2 address2;
+};
+
+typedef struct Registers Registers;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(Registers registers)
+{
+    __global SomeBuffer* _44 = ((__global SomeBuffer*)(registers.address));
+    __global SomeBuffer* _45 = ((__global SomeBuffer*)(registers.address));
+    __global SomeBuffer* _46 = ((__global SomeBuffer*)as_ulong(registers.address2));
+    _44->v = (float4)(1.0f, 2.0f, 3.0f, 4.0f);
+    _45->v = (float4)(1.0f, 2.0f, 3.0f, 4.0f);
+    _46->v = (float4)(1.0f, 2.0f, 3.0f, 4.0f);
+    _44->a = (ulong)(_44);
+    _45->a = (ulong)((ulong)(_45));
+    _46->b = as_uint2((ulong)(_46));
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/composite-construct-buffer-struct.asm.invalid.comp b/reference/shaders-opencl-no-opt/asm/comp/composite-construct-buffer-struct.asm.invalid.comp
new file mode 100644
index 000000000..c9bf180b8
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/composite-construct-buffer-struct.asm.invalid.comp
@@ -0,0 +1,25 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct Block
+{
+    uint2 _m0[2];
+    uint2 _m1[2];
+};
+
+typedef struct Block Block;
+
+struct SSBO
+{
+    Block _m0[3];
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global Block* ssbo)
+{
+    __local uint2 _18[2];
+    ssbo[0u] = (Block){ { ssbo[0u]._m1[0], ssbo[0u]._m1[1] }, { ssbo[0u]._m1[0], ssbo[0u]._m1[1] } };
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/constant-composite-undef.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/constant-composite-undef.asm.comp
new file mode 100644
index 000000000..b59d2814c
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/constant-composite-undef.asm.comp
@@ -0,0 +1,18 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct Block
+{
+    float4 f;
+};
+
+typedef struct Block Block;
+
+constant float _15 = 0;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global float4* block)
+{
+    block[0] = (float4)(0.100000001490116119384765625f, 0.20000000298023223876953125f, 0.300000011920928955078125f, 0.0f);
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/constant-lut-name-aliasing.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/constant-lut-name-aliasing.asm.comp
new file mode 100644
index 000000000..6e0851769
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/constant-lut-name-aliasing.asm.comp
@@ -0,0 +1,20 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO
+{
+    int values[1];
+};
+
+typedef struct SSBO SSBO;
+
+constant int indexable[4] = { 0, 1, 2, 3 };
+constant int indexable_1[4] = { 4, 5, 6, 7 };
+
+__attribute__((reqd_work_group_size(4, 4, 1)))
+__kernel void comp_main(__global int* _8)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    _8[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] = indexable[((uint3)(get_local_id(0), get_local_id(1), get_local_id(2))).x] + indexable_1[((uint3)(get_local_id(0), get_local_id(1), get_local_id(2))).y];
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/copy-logical-2.spv14.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/copy-logical-2.spv14.asm.comp
new file mode 100644
index 000000000..e237c895e
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/copy-logical-2.spv14.asm.comp
@@ -0,0 +1,83 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+typedef struct { float2 columns[2]; } spvMat2;
+
+struct _13
+{
+    spvMat2 _m0;
+};
+
+typedef struct _13 _13;
+
+struct _14
+{
+    spvMat2 _m0;
+};
+
+typedef struct _14 _14;
+
+struct B2
+{
+    float4 elem2;
+};
+
+typedef struct B2 B2;
+
+struct C
+{
+    float4 c;
+    B2 b2;
+    B2 b2_array[4];
+    _14 _m3;
+};
+
+typedef struct C C;
+
+struct B1
+{
+    float4 elem1;
+};
+
+typedef struct B1 B1;
+
+struct A
+{
+    float4 a;
+    B1 b1;
+    B1 b1_array[4];
+    _13 _m3;
+};
+
+typedef struct A A;
+
+struct _10
+{
+    A a_block;
+    C c_block;
+};
+
+typedef struct _10 _10;
+
+static spvMat2 spvTransposeMat2(spvMat2 m)
+{
+    spvMat2 r;
+    r.columns[0] = (float2)(m.columns[0].x, m.columns[1].x);
+    r.columns[1] = (float2)(m.columns[0].y, m.columns[1].y);
+    return r;
+}
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global _10* _4)
+{
+    A _24;
+    _24.a = _4->c_block.c;
+    _24.b1.elem1 = _4->c_block.b2.elem2;
+    _24.b1_array[0].elem1 = _4->c_block.b2_array[0].elem2;
+    _24.b1_array[1].elem1 = _4->c_block.b2_array[1].elem2;
+    _24.b1_array[2].elem1 = _4->c_block.b2_array[2].elem2;
+    _24.b1_array[3].elem1 = _4->c_block.b2_array[3].elem2;
+    _24._m3._m0 = spvTransposeMat2(_4->c_block._m3._m0);
+    _4->a_block = _24;
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/copy-logical-offset-and-array-stride-diffs.spv14.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/copy-logical-offset-and-array-stride-diffs.spv14.asm.comp
new file mode 100644
index 000000000..c411b48d0
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/copy-logical-offset-and-array-stride-diffs.spv14.asm.comp
@@ -0,0 +1,61 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+typedef struct { float4 columns[4]; } spvMat4;
+
+struct _9
+{
+    uint _m0;
+};
+
+typedef struct _9 _9;
+
+struct _10
+{
+    uint _m0;
+};
+
+typedef struct _10 _10;
+
+struct _5
+{
+    uint _m0;
+    uint _m1[2];
+    uint _m2;
+    _9 _m3;
+    float4 _m4;
+    float3 _m5;
+    float2 _m6;
+};
+
+typedef struct _5 _5;
+
+struct _6
+{
+    uint _m0;
+    uint _m1[2];
+    uint _m2;
+    _10 _m3;
+    float4 _m4;
+    float3 _m5;
+    float2 _m6;
+};
+
+typedef struct _6 _6;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global _6* _3, __global _5* _4)
+{
+    _6 _22 = (*_3);
+    _5 _23;
+    _23._m0 = _22._m0;
+    _23._m1[0] = _22._m1[0];
+    _23._m1[1] = _22._m1[1];
+    _23._m2 = _22._m2;
+    _23._m3._m0 = _22._m3._m0;
+    _23._m4 = _22._m4;
+    _23._m5 = _22._m5;
+    _23._m6 = _22._m6;
+    *_4 = _23;
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/copy-logical.spv14.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/copy-logical.spv14.asm.comp
new file mode 100644
index 000000000..069e04c31
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/copy-logical.spv14.asm.comp
@@ -0,0 +1,56 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct B2
+{
+    float4 elem2;
+};
+
+typedef struct B2 B2;
+
+struct C
+{
+    float4 c;
+    B2 b2;
+    B2 b2_array[4];
+};
+
+typedef struct C C;
+
+struct B1
+{
+    float4 elem1;
+};
+
+typedef struct B1 B1;
+
+struct A
+{
+    float4 a;
+    B1 b1;
+    B1 b1_array[4];
+};
+
+typedef struct A A;
+
+struct _10
+{
+    A a_block;
+    C c_block;
+};
+
+typedef struct _10 _10;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global _10* _4)
+{
+    A _24;
+    _24.a = _4->c_block.c;
+    _24.b1.elem1 = _4->c_block.b2.elem2;
+    _24.b1_array[0].elem1 = _4->c_block.b2_array[0].elem2;
+    _24.b1_array[1].elem1 = _4->c_block.b2_array[1].elem2;
+    _24.b1_array[2].elem1 = _4->c_block.b2_array[2].elem2;
+    _24.b1_array[3].elem1 = _4->c_block.b2_array[3].elem2;
+    _4->a_block = _24;
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/device-array-load-temporary.asm.invalid.comp b/reference/shaders-opencl-no-opt/asm/comp/device-array-load-temporary.asm.invalid.comp
new file mode 100644
index 000000000..e4387c0c9
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/device-array-load-temporary.asm.invalid.comp
@@ -0,0 +1,29 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct Block
+{
+    uint2 _m0[2];
+    uint2 _m1[2];
+};
+
+typedef struct Block Block;
+
+struct SSBO
+{
+    Block _m0[3];
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global Block* ssbo)
+{
+    __local uint2 _18[2];
+    uint2 _27[2];
+    _27[0] = ssbo[0u]._m1[0];
+    _27[1] = ssbo[0u]._m1[1];
+    ssbo[0u]._m0 = _27;
+    ssbo[0u]._m0 = _27;
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/device-array-load-temporary.force-native-array.asm.invalid.comp b/reference/shaders-opencl-no-opt/asm/comp/device-array-load-temporary.force-native-array.asm.invalid.comp
new file mode 100644
index 000000000..e4387c0c9
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/device-array-load-temporary.force-native-array.asm.invalid.comp
@@ -0,0 +1,29 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct Block
+{
+    uint2 _m0[2];
+    uint2 _m1[2];
+};
+
+typedef struct Block Block;
+
+struct SSBO
+{
+    Block _m0[3];
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global Block* ssbo)
+{
+    __local uint2 _18[2];
+    uint2 _27[2];
+    _27[0] = ssbo[0u]._m1[0];
+    _27[1] = ssbo[0u]._m1[1];
+    ssbo[0u]._m0 = _27;
+    ssbo[0u]._m0 = _27;
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/device-constant-array-load-store.asm.invalid.comp b/reference/shaders-opencl-no-opt/asm/comp/device-constant-array-load-store.asm.invalid.comp
new file mode 100644
index 000000000..f8a5f221b
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/device-constant-array-load-store.asm.invalid.comp
@@ -0,0 +1,33 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct Block
+{
+    uint2 _m0[2];
+    uint2 _m1[2];
+};
+
+typedef struct Block Block;
+
+struct SSBO
+{
+    Block _m0[3];
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global Block* ssbo, SSBO ubo)
+{
+    __local uint2 _18[2];
+    ssbo[0u]._m0 = ssbo[0u]._m1;
+    ssbo[0u]._m0 = ubo._m0[0u]._m1;
+    uint2 _23[2];
+    ssbo[0u]._m0 = _23;
+    ssbo[0u]._m0 = _18;
+    _18 = ssbo[0u]._m1;
+    _23 = ssbo[0u]._m1;
+    _18 = ubo._m0[0u]._m1;
+    _23 = ubo._m0[0u]._m1;
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/device-constant-array-load-store.force-native-array.asm.invalid.comp b/reference/shaders-opencl-no-opt/asm/comp/device-constant-array-load-store.force-native-array.asm.invalid.comp
new file mode 100644
index 000000000..f8a5f221b
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/device-constant-array-load-store.force-native-array.asm.invalid.comp
@@ -0,0 +1,33 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct Block
+{
+    uint2 _m0[2];
+    uint2 _m1[2];
+};
+
+typedef struct Block Block;
+
+struct SSBO
+{
+    Block _m0[3];
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global Block* ssbo, SSBO ubo)
+{
+    __local uint2 _18[2];
+    ssbo[0u]._m0 = ssbo[0u]._m1;
+    ssbo[0u]._m0 = ubo._m0[0u]._m1;
+    uint2 _23[2];
+    ssbo[0u]._m0 = _23;
+    ssbo[0u]._m0 = _18;
+    _18 = ssbo[0u]._m1;
+    _23 = ssbo[0u]._m1;
+    _18 = ubo._m0[0u]._m1;
+    _23 = ubo._m0[0u]._m1;
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/eliminate-globals-not-in-entry-point.noeliminate.spv14.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/eliminate-globals-not-in-entry-point.noeliminate.spv14.asm.comp
new file mode 100644
index 000000000..0d6820a4f
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/eliminate-globals-not-in-entry-point.noeliminate.spv14.asm.comp
@@ -0,0 +1,23 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct UBO
+{
+    float v;
+};
+
+typedef struct UBO UBO;
+
+struct SSBO
+{
+    float v;
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(64, 1, 1)))
+__kernel void comp_main(__global float* ssbo)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/glsl-signed-operations.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/glsl-signed-operations.asm.comp
new file mode 100644
index 000000000..e59e39dae
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/glsl-signed-operations.asm.comp
@@ -0,0 +1,50 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO
+{
+    int4 ints;
+    uint4 uints;
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global SSBO* _6)
+{
+    int4 _19 = _6->ints;
+    uint4 _20 = _6->uints;
+    _6->ints = as_int4(abs(_19));
+    _6->uints = abs(_19);
+    _6->ints = as_int4(abs(as_int4(_20)));
+    _6->uints = abs(as_int4(_20));
+    _6->ints = clamp(_19, -1, 1);
+    _6->uints = as_uint4(clamp(_19, -1, 1));
+    _6->ints = clamp(as_int4(_20), -1, 1);
+    _6->uints = as_uint4(clamp(as_int4(_20), -1, 1));
+    _6->ints = 31 - clz(as_int4(_20) ^ (as_int4(_20) >> 31));
+    _6->uints = as_uint4(31 - clz(as_int4(_20) ^ (as_int4(_20) >> 31)));
+    _6->ints = 31 - as_int4(clz(as_uint4(_19)));
+    _6->uints = as_uint4(31 - as_int4(clz(as_uint4(_19))));
+    _6->ints = min(_19, _19);
+    _6->uints = as_uint4(min(_19, as_int4(_20)));
+    _6->ints = min(as_int4(_20), as_int4(_20));
+    _6->uints = as_uint4(min(as_int4(_20), _19));
+    _6->ints = as_int4(min(as_uint4(_19), _20));
+    _6->uints = min(as_uint4(_19), _20);
+    _6->ints = as_int4(min(_20, as_uint4(_19)));
+    _6->uints = min(_20, as_uint4(_19));
+    _6->ints = max(_19, _19);
+    _6->uints = as_uint4(max(_19, _19));
+    _6->ints = max(as_int4(_20), _19);
+    _6->uints = as_uint4(max(as_int4(_20), _19));
+    _6->ints = as_int4(max(as_uint4(_19), _20));
+    _6->uints = max(as_uint4(_19), as_uint4(_19));
+    _6->ints = as_int4(max(_20, as_uint4(_19)));
+    _6->uints = max(_20, as_uint4(_19));
+    _6->ints = clamp(as_int4(_20), as_int4(_20), as_int4(_20));
+    _6->uints = as_uint4(clamp(as_int4(_20), as_int4(_20), as_int4(_20)));
+    _6->ints = as_int4(clamp(as_uint4(_19), as_uint4(_19), as_uint4(_19)));
+    _6->uints = clamp(as_uint4(_19), as_uint4(_19), as_uint4(_19));
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/glsl.std450.frexp-modf-struct.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/glsl.std450.frexp-modf-struct.asm.comp
new file mode 100644
index 000000000..f44d85023
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/glsl.std450.frexp-modf-struct.asm.comp
@@ -0,0 +1,40 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct _9
+{
+    float _m0;
+    float _m1;
+};
+
+typedef struct _9 _9;
+
+struct _16
+{
+    float _m0;
+    int _m1;
+};
+
+typedef struct _16 _16;
+
+struct _4
+{
+    float _m0;
+    int _m1;
+};
+
+typedef struct _4 _4;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global _4* _6)
+{
+    _9 _23;
+    _23._m0 = modf(20.0f, &_23._m1);
+    _16 _24;
+    _24._m0 = frexp(40.0f, &_24._m1);
+    _6->_m0 = _23._m0;
+    _6->_m0 = _23._m1;
+    _6->_m0 = _24._m0;
+    _6->_m1 = _24._m1;
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/groupshared-inner-array-of-struct-copy.invalid.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/groupshared-inner-array-of-struct-copy.invalid.asm.comp
new file mode 100644
index 000000000..cedb4d5d6
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/groupshared-inner-array-of-struct-copy.invalid.asm.comp
@@ -0,0 +1,38 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct Data
+{
+    float3 sourceData[16];
+};
+
+typedef struct Data Data;
+
+__attribute__((reqd_work_group_size(8, 8, 1)))
+__kernel void comp_main(read_only image2d_t g_inputTexture, write_only image2d_t g_output)
+{
+    __local Data g_data[64];
+    uint _49;
+    _49 = 0u;
+    for (; _49 < 4u; _49++)
+    {
+        for (uint _56 = 0u; _56 < 4u; )
+        {
+            int3 _65 = as_int3(((uint3)(get_global_id(0), get_global_id(1), get_global_id(2)))) + (int3)(as_int(_56), as_int(_49), 0);
+            g_data[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].sourceData[(_49 * 4u) + _56] = texelFetch(g_inputTexture, _65.xy, _65.z).xyz;
+            _56++;
+            continue;
+        }
+    }
+    float3 _45[16] = g_data[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].sourceData;
+    uint _77;
+    _77 = 0u;
+    for (int _80 = 0; _80 < 16; )
+    {
+        _77 |= convert_uint(clamp(dot(_45[_80], (float3)(-1.0f)), 0.0f, 1.0f));
+        _80++;
+        continue;
+    }
+    write_imageui(g_output, as_int2(((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).xy), (uint4)(_77));
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/image-atomic-mismatch-sign.asm.invalid.comp b/reference/shaders-opencl-no-opt/asm/comp/image-atomic-mismatch-sign.asm.invalid.comp
new file mode 100644
index 000000000..e69de29bb
diff --git a/reference/shaders-opencl-no-opt/asm/comp/local-size-id-override.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/local-size-id-override.asm.comp
new file mode 100644
index 000000000..22b5b4066
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/local-size-id-override.asm.comp
@@ -0,0 +1,34 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO
+{
+    float4 values[1];
+};
+
+typedef struct SSBO SSBO;
+
+#ifndef SPIRV_CROSS_CONSTANT_ID_1
+#define SPIRV_CROSS_CONSTANT_ID_1 11u
+#endif
+constant uint _12 = SPIRV_CROSS_CONSTANT_ID_1;
+#ifndef SPIRV_CROSS_CONSTANT_ID_2
+#define SPIRV_CROSS_CONSTANT_ID_2 12u
+#endif
+constant uint _13 = SPIRV_CROSS_CONSTANT_ID_2;
+#ifndef SPIRV_CROSS_CONSTANT_ID_3
+#define SPIRV_CROSS_CONSTANT_ID_3 13u
+#endif
+constant uint _6 = SPIRV_CROSS_CONSTANT_ID_3;
+#ifndef SPIRV_CROSS_CONSTANT_ID_4
+#define SPIRV_CROSS_CONSTANT_ID_4 14u
+#endif
+constant uint _7 = SPIRV_CROSS_CONSTANT_ID_4;
+constant uint3 spvWorkgroupSize = (uint3)(3u, _12, _13);
+
+__attribute__((reqd_work_group_size(3, 11, 12)))
+__kernel void comp_main(__global float4* _10)
+{
+    _10[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] += (float4)(2.0f);
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/local-size-id.asm.invalid.comp b/reference/shaders-opencl-no-opt/asm/comp/local-size-id.asm.invalid.comp
new file mode 100644
index 000000000..8e6e38e20
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/local-size-id.asm.invalid.comp
@@ -0,0 +1,35 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO
+{
+    float4 values[1];
+};
+
+typedef struct SSBO SSBO;
+
+#ifndef SPIRV_CROSS_CONSTANT_ID_1
+#define SPIRV_CROSS_CONSTANT_ID_1 11
+#endif
+constant int _12 = SPIRV_CROSS_CONSTANT_ID_1;
+#ifndef SPIRV_CROSS_CONSTANT_ID_2
+#define SPIRV_CROSS_CONSTANT_ID_2 12
+#endif
+constant int _13 = SPIRV_CROSS_CONSTANT_ID_2;
+#ifndef SPIRV_CROSS_CONSTANT_ID_3
+#define SPIRV_CROSS_CONSTANT_ID_3 13
+#endif
+constant int _6 = SPIRV_CROSS_CONSTANT_ID_3;
+#ifndef SPIRV_CROSS_CONSTANT_ID_4
+#define SPIRV_CROSS_CONSTANT_ID_4 14
+#endif
+constant int _7 = SPIRV_CROSS_CONSTANT_ID_4;
+#define _37 ((as_uint(_6) + 3u))
+constant uint3 _38 = (uint3)(_37, _7, 2u);
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global float4* _10)
+{
+    _10[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] = ((((_10[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] + (float4)(2.0f)) + convert_float3(_38).xyzz) * convert_float(_6)) * convert_float(_7)) * convert_float(2u);
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/modf-storage-class.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/modf-storage-class.asm.comp
new file mode 100644
index 000000000..7522b61b7
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/modf-storage-class.asm.comp
@@ -0,0 +1,49 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct _19
+{
+    float2 _m0;
+    float2 _m1;
+};
+
+typedef struct _19 _19;
+
+struct _6
+{
+    uint2 _m0[324];
+};
+
+typedef struct _6 _6;
+
+struct _9
+{
+    float2 _m0[648];
+};
+
+typedef struct _9 _9;
+
+struct _13
+{
+    float2 _m0[648];
+};
+
+typedef struct _13 _13;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global const uint2* _7, __global float2* _11, __global float2* _14)
+{
+    for (uint _46 = 0u; _46 < 648u; _46 += 2u)
+    {
+        uint2 _47 = _7[_46 / 2u];
+        float2 _48 = as_float2(_47);
+        float2 _69 = modf(_48, &_11[_46]);
+        _11[_46 + 1u] = _69;
+        _19 _74;
+        _74._m0 = modf(_48, &_74._m1);
+        _19 _50 = _74;
+        _14[_46] = _50._m1;
+        _14[_46 + 1u] = _50._m0;
+    }
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/opptrdiff-basic.spv14.invalid.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/opptrdiff-basic.spv14.invalid.asm.comp
new file mode 100644
index 000000000..32195c0b5
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/opptrdiff-basic.spv14.invalid.asm.comp
@@ -0,0 +1,56 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct _7
+{
+    int _m0[1][4];
+};
+
+typedef struct _7 _7;
+
+struct _9
+{
+    int _m0[1][17];
+};
+
+typedef struct _9 _9;
+
+struct _11
+{
+    int _m0;
+};
+
+typedef struct _11 _11;
+
+__attribute__((reqd_work_group_size(4, 1, 1)))
+__kernel void comp_main(__global int* _2, __global int* _3, _11 _4)
+{
+    if (as_int3(((uint3)(get_group_id(0), get_group_id(1), get_group_id(2)))).x >= _4._m0)
+    {
+        return;
+    }
+    int _49;
+    if (as_int3(((uint3)(get_local_id(0), get_local_id(1), get_local_id(2)))).x == 1)
+    {
+        _3[as_int3(((uint3)(get_group_id(0), get_group_id(1), get_group_id(2)))).x][16] = &_2[as_int3(((uint3)(get_group_id(0), get_group_id(1), get_group_id(2)))).x] - &_2[0];
+        _49 = 0;
+    }
+    else
+    {
+        _49 = 0;
+    }
+    for (;;)
+    {
+        int _50 = _49 + 1;
+        _3[as_int3(((uint3)(get_group_id(0), get_group_id(1), get_group_id(2)))).x][(as_int3(((uint3)(get_local_id(0), get_local_id(1), get_local_id(2)))).x * 4) + _49] = &_2[as_int3(((uint3)(get_group_id(0), get_group_id(1), get_group_id(2)))).x][as_int3(((uint3)(get_local_id(0), get_local_id(1), get_local_id(2)))).x] - &_2[as_int3(((uint3)(get_group_id(0), get_group_id(1), get_group_id(2)))).x][_49];
+        if (_50 == 4)
+        {
+            break;
+        }
+        else
+        {
+            _49 = _50;
+        }
+    }
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/opptrdiff-opptraccesschain-elem-offset.spv14.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/opptrdiff-opptraccesschain-elem-offset.spv14.asm.comp
new file mode 100644
index 000000000..eb979ab04
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/opptrdiff-opptraccesschain-elem-offset.spv14.asm.comp
@@ -0,0 +1,50 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct _7
+{
+    int _m0;
+    int _m1[1];
+};
+
+typedef struct _7 _7;
+
+struct _9
+{
+    int2 _m0[1];
+};
+
+typedef struct _9 _9;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global _7* _2, __global int2* _3)
+{
+    __global int* _4;
+    __global int* _5;
+    int _28 = _2->_m0;
+    _4 = &_2->_m1[0];
+    _5 = &_2->_m1[0 + _28];
+    int _34;
+    if (!(_28 <= 0))
+    {
+        _34 = 0;
+        for (;;)
+        {
+            __global int* _36 = _4;
+            __global int* _37 = _5;
+            int _35 = _34 + 1;
+            _4 = &_36[1];
+            _5 = &_37[-1];
+            _3[_34] = (int2)(_36 - _37, _37 - _36);
+            if (_34 >= _28)
+            {
+                break;
+            }
+            else
+            {
+                _34 = _35;
+            }
+        }
+    }
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/opptrequal-basic.spv14.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/opptrequal-basic.spv14.asm.comp
new file mode 100644
index 000000000..3f60f0746
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/opptrequal-basic.spv14.asm.comp
@@ -0,0 +1,34 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct _7
+{
+    uint _m0[1];
+};
+
+typedef struct _7 _7;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global uint* _2, __global uint* _3, __global uint* _4, __global uint* _5)
+{
+    uint _18 = 0u;
+    uint _28 = _18 + 1u;
+    _5[_18] = (uint)(&_2 == &_3);
+    uint _32 = _28 + 1u;
+    _5[_28] = (uint)(&_2[0] == &_3[0]);
+    uint _36 = _32 + 1u;
+    _5[_32] = (uint)(&_2[0u] == &_3[0u]);
+    uint _40 = _36 + 1u;
+    _5[_36] = (uint)(&_2 == &_4);
+    uint _44 = _40 + 1u;
+    _5[_40] = (uint)(&_2[0] == &_4[0]);
+    uint _48 = _44 + 1u;
+    _5[_44] = (uint)(&_2[0u] == &_4[0u]);
+    uint _52 = _48 + 1u;
+    _5[_48] = (uint)(&_3 == &_4);
+    uint _56 = _52 + 1u;
+    _5[_52] = (uint)(&_3[0] == &_4[0]);
+    _5[_56] = (uint)(&_3[0u] == &_4[0u]);
+    _5[_56 + 1u] = (uint)(&_2 == &_2);
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/opptrequal-row-maj-mtx-bypass-transpose.spv14.asm.invalid.comp b/reference/shaders-opencl-no-opt/asm/comp/opptrequal-row-maj-mtx-bypass-transpose.spv14.asm.invalid.comp
new file mode 100644
index 000000000..0f41e332f
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/opptrequal-row-maj-mtx-bypass-transpose.spv14.asm.invalid.comp
@@ -0,0 +1,52 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+typedef struct { float4 columns[4]; } spvMat4;
+
+struct _6
+{
+    spvMat4 _m0;
+    spvMat4 _m1;
+    float _m2;
+    float _m3;
+};
+
+typedef struct _6 _6;
+
+struct _7
+{
+    uint _m0[1];
+};
+
+typedef struct _7 _7;
+
+static spvMat4 spvTransposeMat4(spvMat4 m)
+{
+    spvMat4 r;
+    r.columns[0] = (float4)(m.columns[0].x, m.columns[1].x, m.columns[2].x, m.columns[3].x);
+    r.columns[1] = (float4)(m.columns[0].y, m.columns[1].y, m.columns[2].y, m.columns[3].y);
+    r.columns[2] = (float4)(m.columns[0].z, m.columns[1].z, m.columns[2].z, m.columns[3].z);
+    r.columns[3] = (float4)(m.columns[0].w, m.columns[1].w, m.columns[2].w, m.columns[3].w);
+    return r;
+}
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global _6* _2, __global _6* _3, __global uint* _4)
+{
+    uint _26 = 0u;
+    uint _39 = _26 + 1u;
+    _4[_26] = (&_2->_m2 == &_2->_m3) ? 0u : 1u;
+    bool _40 = &_2->_m2 == &_3->_m2;
+    uint _43 = _39 + 1u;
+    _4[_39] = _40 ? 0u : 1u;
+    bool _46 = _40 ? &_2->_m2 : &_2->_m3 == _40 ? &_3->_m2 : &_3->_m3;
+    uint _49 = _43 + 1u;
+    _4[_43] = _46 ? 0u : 1u;
+    uint _54 = _49 + 1u;
+    _4[_49] = (_46 ? &_2->_m2 : &_2->_m3 == &_2->_m0.columns[0u].x) ? 0u : 1u;
+    uint _56 = (&_2->_m0 == &spvTransposeMat4(_2->_m1)) ? 0u : 1u;
+    uint _58 = _54 + 1u;
+    _4[_54] = _56;
+    _4[_58] = _56;
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/opptrnotequal-basic.spv14.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/opptrnotequal-basic.spv14.asm.comp
new file mode 100644
index 000000000..2d46e33bc
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/opptrnotequal-basic.spv14.asm.comp
@@ -0,0 +1,34 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct _7
+{
+    uint _m0[1];
+};
+
+typedef struct _7 _7;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global uint* _2, __global uint* _3, __global uint* _4, __global uint* _5)
+{
+    uint _18 = 0u;
+    uint _28 = _18 + 1u;
+    _5[_18] = (uint)(&_2 != &_3);
+    uint _32 = _28 + 1u;
+    _5[_28] = (uint)(&_2[0] != &_3[0]);
+    uint _36 = _32 + 1u;
+    _5[_32] = (uint)(&_2[0u] != &_3[0u]);
+    uint _40 = _36 + 1u;
+    _5[_36] = (uint)(&_2 != &_4);
+    uint _44 = _40 + 1u;
+    _5[_40] = (uint)(&_2[0] != &_4[0]);
+    uint _48 = _44 + 1u;
+    _5[_44] = (uint)(&_2[0u] != &_4[0u]);
+    uint _52 = _48 + 1u;
+    _5[_48] = (uint)(&_3 != &_4);
+    uint _56 = _52 + 1u;
+    _5[_52] = (uint)(&_3[0] != &_4[0]);
+    _5[_56] = (uint)(&_3[0u] != &_4[0u]);
+    _5[_56 + 1u] = (uint)(&_2 != &_2);
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/ptr-access-chain-custom-array-stride.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/ptr-access-chain-custom-array-stride.asm.comp
new file mode 100644
index 000000000..9d37c04c1
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/ptr-access-chain-custom-array-stride.asm.comp
@@ -0,0 +1,21 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct Registers
+{
+    ulong a;
+    ulong b;
+    uint2 c;
+    uint2 d;
+};
+
+typedef struct Registers Registers;
+
+__attribute__((reqd_work_group_size(64, 1, 1)))
+__kernel void comp_main(Registers _7)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    *((__global float3*)((ulong)(((__global float3*)(_7.a))) + ((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x * 12)) = (*((__global float3*)((ulong)(((__global float3*)(_7.a))) + ((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x * 12))) + ((__global float3*)(_7.b))[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x];
+    *((__global float3*)((ulong)(((__global float3*)as_ulong(_7.c))) + ((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x * 12)) = (*((__global float3*)((ulong)(((__global float3*)as_ulong(_7.c))) + ((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x * 12))) + ((__global float3*)as_ulong(_7.d))[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x];
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/spec-constant-name-aliasing.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/spec-constant-name-aliasing.asm.comp
new file mode 100644
index 000000000..d07a53e83
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/spec-constant-name-aliasing.asm.comp
@@ -0,0 +1,48 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO
+{
+    int values[1];
+};
+
+typedef struct SSBO SSBO;
+
+#ifndef SPIRV_CROSS_CONSTANT_ID_0
+#define SPIRV_CROSS_CONSTANT_ID_0 0
+#endif
+constant int A = SPIRV_CROSS_CONSTANT_ID_0;
+#ifndef SPIRV_CROSS_CONSTANT_ID_1
+#define SPIRV_CROSS_CONSTANT_ID_1 1
+#endif
+constant int A_1 = SPIRV_CROSS_CONSTANT_ID_1;
+#ifndef SPIRV_CROSS_CONSTANT_ID_2
+#define SPIRV_CROSS_CONSTANT_ID_2 2
+#endif
+constant int A_2 = SPIRV_CROSS_CONSTANT_ID_2;
+#ifndef SPIRV_CROSS_CONSTANT_ID_3
+#define SPIRV_CROSS_CONSTANT_ID_3 3
+#endif
+constant int A_3 = SPIRV_CROSS_CONSTANT_ID_3;
+#ifndef SPIRV_CROSS_CONSTANT_ID_4
+#define SPIRV_CROSS_CONSTANT_ID_4 4
+#endif
+constant int A_4 = SPIRV_CROSS_CONSTANT_ID_4;
+#ifndef SPIRV_CROSS_CONSTANT_ID_5
+#define SPIRV_CROSS_CONSTANT_ID_5 5
+#endif
+constant int A_5 = SPIRV_CROSS_CONSTANT_ID_5;
+#define A_6 ((A - A_1))
+#define A_7 ((A_6 - A_2))
+#define A_8 ((A_7 - A_3))
+#define A_9 ((A_8 - A_4))
+#define A_10 ((A_9 - A_5))
+#define A_11 ((A_10 + A_5))
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global int* _7)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    _7[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] = A_11;
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/storage-buffer-basic.invalid.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/storage-buffer-basic.invalid.asm.comp
new file mode 100644
index 000000000..952585e08
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/storage-buffer-basic.invalid.asm.comp
@@ -0,0 +1,27 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct _3
+{
+    float _m0[1];
+};
+
+typedef struct _3 _3;
+
+#ifndef SPIRV_CROSS_CONSTANT_ID_0
+#define SPIRV_CROSS_CONSTANT_ID_0 1u
+#endif
+constant uint _15 = SPIRV_CROSS_CONSTANT_ID_0;
+#ifndef SPIRV_CROSS_CONSTANT_ID_2
+#define SPIRV_CROSS_CONSTANT_ID_2 3u
+#endif
+constant uint _17 = SPIRV_CROSS_CONSTANT_ID_2;
+constant uint3 spvWorkgroupSize = (uint3)(_15, 2u, _17);
+
+__attribute__((reqd_work_group_size(1, 2, 3)))
+__kernel void comp_main(__global float* _20, __global float* _21)
+{
+    uint3 _19 = spvWorkgroupSize = spvWorkgroupSize;
+    _20[((uint3)(get_group_id(0), get_group_id(1), get_group_id(2))).x] = _21[((uint3)(get_group_id(0), get_group_id(1), get_group_id(2))).x] + _20[((uint3)(get_group_id(0), get_group_id(1), get_group_id(2))).x];
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/storage-buffer-pointer-argument.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/storage-buffer-pointer-argument.asm.comp
new file mode 100644
index 000000000..9a56784a5
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/storage-buffer-pointer-argument.asm.comp
@@ -0,0 +1,28 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO
+{
+    float a;
+};
+
+typedef struct SSBO SSBO;
+
+struct SSBORead
+{
+    float b;
+};
+
+typedef struct SSBORead SSBORead;
+
+void copy_out(__global float* A_1, __global float* B_1)
+{
+    *A_1 = *B_1;
+}
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global float* _10, __global const float* _14)
+{
+    copy_out(&_10[0], &_14[0]);
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/trancendentals-float-controls-2-fp16.fp16.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/trancendentals-float-controls-2-fp16.fp16.asm.comp
new file mode 100644
index 000000000..230929e90
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/trancendentals-float-controls-2-fp16.fp16.asm.comp
@@ -0,0 +1,35 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+struct SSBO
+{
+    float v[4];
+    half f16[4];
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(4, 1, 1)))
+__kernel void comp_main(__global SSBO* _9)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    _9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] = cos(_9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += sin(_9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += tan(_9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += acos(_9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += asin(_9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += atan(_9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += exp(_9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += exp2(_9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += log(_9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += log2(_9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += sqrt(_9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += rsqrt(_9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += pow(_9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))], 4.0f);
+    _9->f16[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] = cos(_9->f16[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _9->f16[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += sin(_9->f16[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _9->f16[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += cosh(_9->f16[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _9->f16[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += sinh(_9->f16[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/trancendentals-float-controls-2-fp32.fp16.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/trancendentals-float-controls-2-fp32.fp16.asm.comp
new file mode 100644
index 000000000..230929e90
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/trancendentals-float-controls-2-fp32.fp16.asm.comp
@@ -0,0 +1,35 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+struct SSBO
+{
+    float v[4];
+    half f16[4];
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(4, 1, 1)))
+__kernel void comp_main(__global SSBO* _9)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    _9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] = cos(_9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += sin(_9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += tan(_9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += acos(_9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += asin(_9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += atan(_9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += exp(_9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += exp2(_9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += log(_9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += log2(_9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += sqrt(_9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += rsqrt(_9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += pow(_9->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))], 4.0f);
+    _9->f16[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] = cos(_9->f16[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _9->f16[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += sin(_9->f16[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _9->f16[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += cosh(_9->f16[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _9->f16[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += sinh(_9->f16[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/variable-pointers-2.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/variable-pointers-2.asm.comp
new file mode 100644
index 000000000..dfbbef692
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/variable-pointers-2.asm.comp
@@ -0,0 +1,16 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+__attribute__((reqd_work_group_size(64, 1, 1)))
+__kernel void comp_main()
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    __local float2 test[64];
+    float _21 = convert_float(((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x);
+    float2 _22 = (float2)(_21);
+    ((&((&test)[0u]))[0u])[1u + 2u] = _22;
+    ((&test)[0u])[1u + 2u] = _22;
+    ((&test)[0u])[3u] = _22;
+    ((&test)[0u])[2u + 1u].x = _21;
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/variable-pointers-3.invalid.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/variable-pointers-3.invalid.asm.comp
new file mode 100644
index 000000000..99ce6ceff
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/variable-pointers-3.invalid.asm.comp
@@ -0,0 +1,12 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+__attribute__((reqd_work_group_size(64, 1, 1)))
+__kernel void comp_main()
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    __local float2 test[64];
+    float _21 = convert_float(((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x);
+    (true ? &((&test)[0u])[2u].x : &((&test)[0u])[2u].x)[1u] = _21;
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/variable-pointers-vector-to-scalar.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/variable-pointers-vector-to-scalar.asm.comp
new file mode 100644
index 000000000..5af73eb76
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/variable-pointers-vector-to-scalar.asm.comp
@@ -0,0 +1,12 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+__attribute__((reqd_work_group_size(64, 1, 1)))
+__kernel void comp_main()
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    __local float2 test[64];
+    float _21 = convert_float(((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x);
+    (*(true ? &test[1u] : &test[2u])).y = _21;
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/variable-pointers.asm.invalid.comp b/reference/shaders-opencl-no-opt/asm/comp/variable-pointers.asm.invalid.comp
new file mode 100644
index 000000000..7b9c6e61d
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/variable-pointers.asm.invalid.comp
@@ -0,0 +1,77 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct foo
+{
+    int a[128];
+    uint b;
+    float2 c;
+};
+
+typedef struct foo foo;
+
+struct bar
+{
+    int d;
+};
+
+typedef struct bar bar;
+
+struct baz
+{
+    int e[128];
+};
+
+typedef struct baz baz;
+
+__global int* select_buffer(__global foo* buf, __global int* buf2, bar cb)
+{
+    return (cb.d != 0) ? &buf->a[0u] : &buf2[0u];
+}
+
+__global int* select_buffer_null(__global foo* buf, bar cb)
+{
+    return (cb.d != 0) ? &buf->a[0u] : NULL;
+}
+
+__local int* select_tgsm(bar cb, __local int* tgsm)
+{
+    return (cb.d != 0) ? &tgsm[0u] : NULL;
+}
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global foo* buf, bar cb, __global int* buf2)
+{
+    __local int tgsm[128];
+    __global int* sbuf_1_1;
+    __global int* sbuf2_1_1;
+    __local int* stgsm_1_1;
+    sbuf_1_1 = select_buffer(buf, buf2, cb);
+    sbuf2_1_1 = select_buffer_null(buf, cb);
+    stgsm_1_1 = select_tgsm(cb, tgsm);
+    __local int* cur_1_1 = stgsm_1_1;
+    __global int* _78;
+    _78 = &buf->a[0u];
+    __local int* _81;
+    int _82;
+    for (;;)
+    {
+        _81 = cur_1_1;
+        _82 = *_78;
+        if (_82 != 0)
+        {
+            int _86 = *_81;
+            int _87 = _82 + _86;
+            *_78 = _87;
+            *_81 = _87;
+            cur_1_1 = &_81[1u];
+            _78 = &_78[1u];
+            continue;
+        }
+        else
+        {
+            break;
+        }
+    }
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/variable-ssbo-argument.spv16.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/variable-ssbo-argument.spv16.asm.comp
new file mode 100644
index 000000000..86ba9b715
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/variable-ssbo-argument.spv16.asm.comp
@@ -0,0 +1,21 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct _3
+{
+    uchar _m0[1];
+};
+
+typedef struct _3 _3;
+
+void _20(__global uchar* _21)
+{
+    _21[2u] = (uchar)(0);
+}
+
+__attribute__((reqd_work_group_size(16, 1, 1)))
+__kernel void comp_main(__global uchar* _2)
+{
+    _20(&_2[1u]);
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/variable-ssbo-array-argument.spv16.invalid.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/variable-ssbo-array-argument.spv16.invalid.asm.comp
new file mode 100644
index 000000000..2b20027cd
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/variable-ssbo-array-argument.spv16.invalid.asm.comp
@@ -0,0 +1,21 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct _3
+{
+    uchar _m0[16];
+};
+
+typedef struct _3 _3;
+
+void _20(__global uchar* _21[16])
+{
+    (*_21)[2u] = (uchar)(0);
+}
+
+__attribute__((reqd_work_group_size(16, 1, 1)))
+__kernel void comp_main(__global uchar* _2)
+{
+    _20(&_2[0]);
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/workgroup-uint-to-uchar-alias-ptr-access-chain.asm.invalid.comp b/reference/shaders-opencl-no-opt/asm/comp/workgroup-uint-to-uchar-alias-ptr-access-chain.asm.invalid.comp
new file mode 100644
index 000000000..c6c583b90
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/workgroup-uint-to-uchar-alias-ptr-access-chain.asm.invalid.comp
@@ -0,0 +1,145 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+#ifndef SPIRV_CROSS_CONSTANT_ID_0
+#define SPIRV_CROSS_CONSTANT_ID_0 1u
+#endif
+constant uint _15 = SPIRV_CROSS_CONSTANT_ID_0;
+#ifndef SPIRV_CROSS_CONSTANT_ID_1
+#define SPIRV_CROSS_CONSTANT_ID_1 1u
+#endif
+constant uint _16 = SPIRV_CROSS_CONSTANT_ID_1;
+#ifndef SPIRV_CROSS_CONSTANT_ID_2
+#define SPIRV_CROSS_CONSTANT_ID_2 1u
+#endif
+constant uint _17 = SPIRV_CROSS_CONSTANT_ID_2;
+constant uint3 spvWorkgroupSize = (uint3)(_15, _16, _17);
+
+struct _6
+{
+    uint4 _m0[1];
+};
+
+typedef struct _6 _6;
+
+struct _7
+{
+    uint _m0;
+};
+
+typedef struct _7 _7;
+
+struct _8
+{
+    _7 _m0;
+};
+
+typedef struct _8 _8;
+
+constant uchar4 _137 = (uchar4)(0);
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global uint4* _25, _8 _29)
+{
+    __local uint _5[256];
+    __local uchar _10[1024];
+    uint3 _20 = spvWorkgroupSize = spvWorkgroupSize;
+    bool _40 = _29._m0._m0 != 0u;
+    if (_40)
+    {
+        uchar _58 = convert_uchar(((((uint3)(get_local_id(0), get_local_id(1), get_local_id(2))).y * ((uint3)(get_local_id(0), get_local_id(1), get_local_id(2))).x) / ((uint3)(get_group_id(0), get_group_id(1), get_group_id(2))).y) % 255u);
+        uint _66;
+        uint _61 = 0u;
+        uint _62;
+        for (;;)
+        {
+            _62 = _61 * _29._m0._m0;
+            _66 = 0u;
+            for (;;)
+            {
+                uint _67 = _66 + _62;
+                uint _68 = _66 * _61;
+                _5[_67] = ((uint3)(get_group_id(0), get_group_id(1), get_group_id(2))).x + _68;
+                uint _74 = _67 << 2u;
+                uint _76 = _74 >> 10u;
+                uint _78 = _74 & 1020u;
+                uchar4 _80 = as_uchar4(((uint3)(get_group_id(0), get_group_id(1), get_group_id(2))).y + _68);
+                ((&_10)[_76])[_78 | 1u] = _80.y;
+                ((&_10)[_76])[_78 | 2u] = _80.z;
+                ((&_10)[_76])[_78 | 3u] = _80.w;
+                ((&_10)[_76])[_78] = _58;
+                uint _93 = _66 + 1u;
+                if (_93 >= _29._m0._m0)
+                {
+                    break;
+                }
+                else
+                {
+                    _66 = _93;
+                }
+            }
+            uint _100 = _61 + 1u;
+            if (_100 >= _29._m0._m0)
+            {
+                break;
+            }
+            else
+            {
+                _61 = _100;
+                continue;
+            }
+        }
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint _112;
+    if (_40)
+    {
+        _112 = 0u;
+        uint _117;
+        uint _113;
+        for (;;)
+        {
+            _113 = _112 * _29._m0._m0;
+            _117 = 0u;
+            for (;;)
+            {
+                uint _118 = _117 + _113;
+                uint _123 = _118 << 2u;
+                uint _124 = _123 >> 10u;
+                uint _125 = _123 & 1020u;
+                uchar4 _138;
+                _138.x = ((&_10)[_124])[_125];
+                _138.y = ((&_10)[_124])[_125 | 1u];
+                _138.z = ((&_10)[_124])[_125 | 2u];
+                _138.w = ((&_10)[_124])[_125 | 3u];
+                uint _143 = _5[_118] + as_uint(_138);
+                uint4 _144 = _25[_118];
+                _144.x = _143;
+                _144.y = _143 >> 2u;
+                _144.w = _143 >> 3u;
+                _25[_118] = _144;
+                uint _150 = _117 + 1u;
+                if (_150 >= _29._m0._m0)
+                {
+                    break;
+                }
+                else
+                {
+                    _117 = _150;
+                }
+            }
+            uint _157 = _112 + 1u;
+            if (_157 >= _29._m0._m0)
+            {
+                break;
+            }
+            else
+            {
+                _112 = _157;
+                continue;
+            }
+        }
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/argument-buffer-readonly-writeonly-alias.argument.argument-tier-1.device-argument-buffer.invalid.comp b/reference/shaders-opencl-no-opt/comp/argument-buffer-readonly-writeonly-alias.argument.argument-tier-1.device-argument-buffer.invalid.comp
new file mode 100644
index 000000000..e69de29bb
diff --git a/reference/shaders-opencl-no-opt/comp/array-copy-threadgroup-memory.invalid.comp b/reference/shaders-opencl-no-opt/comp/array-copy-threadgroup-memory.invalid.comp
new file mode 100644
index 000000000..33bdcbea5
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/array-copy-threadgroup-memory.invalid.comp
@@ -0,0 +1,20 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+__attribute__((reqd_work_group_size(8, 1, 1)))
+__kernel void comp_main()
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    __local float shared_group[8][8];
+    __local float shared_group_alt[8][8];
+    float blob[8];
+    for (int i = 0; i < 8; i++)
+    {
+        blob[i] = convert_float(i);
+    }
+    shared_group[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] = blob;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    float copied_blob[8] = shared_group[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0))) ^ 1u];
+    shared_group_alt[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] = shared_group[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))];
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/atomic-cmpxchg-packed-vector.invalid.comp b/reference/shaders-opencl-no-opt/comp/atomic-cmpxchg-packed-vector.invalid.comp
new file mode 100644
index 000000000..e7237a064
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/atomic-cmpxchg-packed-vector.invalid.comp
@@ -0,0 +1,31 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct AttData0
+{
+    uint3 att0[1];
+};
+
+typedef struct AttData0 AttData0;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global uint3* _22)
+{
+    uint newVal_1 = 432u;
+    uint prevVal_1 = 0u;
+    uint curVal_1 = 0u;
+    for (;;)
+    {
+        uint _30 = atomic_cmpxchg(&(_22[0][0u]), prevVal_1, newVal_1);
+        curVal_1 = _30;
+        if (_30 != prevVal_1)
+        {
+            continue;
+        }
+        else
+        {
+            break;
+        }
+    }
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/basic.invalid.comp b/reference/shaders-opencl-no-opt/comp/basic.invalid.comp
new file mode 100644
index 000000000..e69de29bb
diff --git a/reference/shaders-opencl-no-opt/comp/bda-atomics.invalid.comp b/reference/shaders-opencl-no-opt/comp/bda-atomics.invalid.comp
new file mode 100644
index 000000000..5b9a08f9b
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/bda-atomics.invalid.comp
@@ -0,0 +1,44 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct Ptr;
+
+struct Registers
+{
+    ulong ptr;
+};
+
+typedef struct Registers Registers;
+
+struct Ptr
+{
+    uint i;
+    uint2 i2;
+};
+
+typedef struct Ptr Ptr;
+
+struct UBO
+{
+    ulong ptr_ubo;
+};
+
+typedef struct UBO UBO;
+
+struct SSBO
+{
+    ulong ptr_ssbo;
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(Registers _12, UBO _26, __global const __global Ptr** _35)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    uint _23 = atomic_add(&((__global Ptr*)(_12.ptr))->i, 10u);
+    uint _32 = atomic_add(&((__global Ptr*)(_26.ptr_ubo))->i, 11u);
+    uint _41 = atomic_add(&((__global Ptr*)(_35[0]))->i, 12u);
+    uint _51 = atomic_add(&((__global Ptr*)as_ulong(((__global Ptr*)(_12.ptr))->i2))->i, 13u);
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/bda-load-std140-arrayed-pointer.invalid.comp b/reference/shaders-opencl-no-opt/comp/bda-load-std140-arrayed-pointer.invalid.comp
new file mode 100644
index 000000000..cb3ef0331
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/bda-load-std140-arrayed-pointer.invalid.comp
@@ -0,0 +1,27 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO;
+
+struct SSBO
+{
+    float data[1];
+};
+
+typedef struct SSBO SSBO;
+
+struct UBO
+{
+    __global SSBO* ptrs[2];
+};
+
+typedef struct UBO UBO;
+
+__attribute__((reqd_work_group_size(256, 1, 1)))
+__kernel void comp_main(UBO _17)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    __global SSBO* s0 = ((__global SSBO*)(_17.ptrs[0]));
+    s0->data[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] += 1.0f;
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/bda-nonwritable-glslang-workaround.comp b/reference/shaders-opencl-no-opt/comp/bda-nonwritable-glslang-workaround.comp
new file mode 100644
index 000000000..3decaac79
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/bda-nonwritable-glslang-workaround.comp
@@ -0,0 +1,26 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO;
+
+struct Registers
+{
+    uint2 bda;
+};
+
+typedef struct Registers Registers;
+
+struct SSBO
+{
+    float data[1];
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(256, 1, 1)))
+__kernel void comp_main(Registers _10)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    ((__global SSBO*)as_ulong(_10.bda))->data[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] = 0.0f;
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/bda-restrict-pointer-variable.invalid.comp b/reference/shaders-opencl-no-opt/comp/bda-restrict-pointer-variable.invalid.comp
new file mode 100644
index 000000000..5d82fb4d5
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/bda-restrict-pointer-variable.invalid.comp
@@ -0,0 +1,26 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct Ref;
+
+struct Ref
+{
+    float4 v;
+};
+
+typedef struct Ref Ref;
+
+struct Registers
+{
+    ulong foo;
+};
+
+typedef struct Registers Registers;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(Registers _14)
+{
+    restrict __global Ref* __restrict ref = ((__global Ref*)(_14.foo));
+    ref->v = (float4)(1.0f);
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/bitcast-16bit-1.invalid.comp b/reference/shaders-opencl-no-opt/comp/bitcast-16bit-1.invalid.comp
new file mode 100644
index 000000000..e69de29bb
diff --git a/reference/shaders-opencl-no-opt/comp/bitcast-16bit-2.invalid.comp b/reference/shaders-opencl-no-opt/comp/bitcast-16bit-2.invalid.comp
new file mode 100644
index 000000000..e69de29bb
diff --git a/reference/shaders-opencl-no-opt/comp/bitfield.comp b/reference/shaders-opencl-no-opt/comp/bitfield.comp
new file mode 100644
index 000000000..754ec2495
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/bitfield.comp
@@ -0,0 +1,35 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+uint spvBitReverse(uint v) {
+    v = ((v >> 1u) & 0x55555555u) | ((v & 0x55555555u) << 1u);
+    v = ((v >> 2u) & 0x33333333u) | ((v & 0x33333333u) << 2u);
+    v = ((v >> 4u) & 0x0F0F0F0Fu) | ((v & 0x0F0F0F0Fu) << 4u);
+    v = ((v >> 8u) & 0x00FF00FFu) | ((v & 0x00FF00FFu) << 8u);
+    return (v >> 16u) | (v << 16u);
+}
+
+static int spvFindLSB(uint x) {
+    if (x == 0u) return -1;
+    return 31 - as_int(clz(x & (0u - x)));
+}
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main()
+{
+    int signed_value_1 = 0;
+    uint unsigned_value_1 = 0u;
+    int s_1 = (signed_value_1 << (32 - 20 - 5)) >> (32 - 20);
+    uint u_1 = (unsigned_value_1 >> 6) & ((uint)(1u << 21) - (uint)1u);
+    s_1 = as_int((as_uint(s_1) & ~(((uint)(1u << 4) - (uint)1u) << 5)) | ((as_uint(40) << 5) & (((uint)(1u << 4) - (uint)1u) << 5)));
+    u_1 = (u_1 & ~(((uint)(1u << 4) - (uint)1u) << 5)) | ((60u << 5) & (((uint)(1u << 4) - (uint)1u) << 5));
+    u_1 = spvBitReverse(u_1);
+    s_1 = as_int(spvBitReverse(as_uint(s_1)));
+    int v0_1 = as_int(popcount(u_1));
+    int v1_1 = popcount(s_1);
+    int v2_1 = 31 - as_int(clz(u_1));
+    int v3_1 = 31 - clz(s_1 ^ (s_1 >> 31));
+    int v4_1 = spvFindLSB(u_1);
+    int v5_1 = spvFindLSB(as_uint(s_1));
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/buffer-device-address-from-pointer-complex-chain.comp b/reference/shaders-opencl-no-opt/comp/buffer-device-address-from-pointer-complex-chain.comp
new file mode 100644
index 000000000..350c009c8
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/buffer-device-address-from-pointer-complex-chain.comp
@@ -0,0 +1,33 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO;
+
+struct S
+{
+    float3 v;
+};
+
+typedef struct S S;
+
+struct SSBO
+{
+    S s[1];
+};
+
+typedef struct SSBO SSBO;
+
+struct PC
+{
+    uint2 ptr;
+};
+
+typedef struct PC PC;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(PC pc)
+{
+    __global SSBO* ssbo = ((__global SSBO*)as_ulong(pc.ptr));
+    ssbo->s[0].v = (float3)(1.0f);
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/extract-atomics-from-function.invalid.comp b/reference/shaders-opencl-no-opt/comp/extract-atomics-from-function.invalid.comp
new file mode 100644
index 000000000..016fbcd95
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/extract-atomics-from-function.invalid.comp
@@ -0,0 +1,90 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+#define var (*var_ptr)
+#define var (*var_ptr)
+void testAdd(__local uint* var_ptr)
+{
+    uint _29 = atomic_add(&var, 1u);
+}
+
+#undef var
+
+#define var (*var_ptr)
+void testMin(__local uint* var_ptr)
+{
+    uint _31 = atomic_min(&var, 2u);
+}
+
+#undef var
+
+#define var (*var_ptr)
+void testMax(__local uint* var_ptr)
+{
+    uint _33 = atomic_max(&var, 3u);
+}
+
+#undef var
+
+#define var (*var_ptr)
+void testOr(__local uint* var_ptr)
+{
+    uint _35 = atomic_or(&var, 5u);
+}
+
+#undef var
+
+#define var (*var_ptr)
+void testXor(__local uint* var_ptr)
+{
+    uint _37 = atomic_xor(&var, 6u);
+}
+
+#undef var
+
+#define var (*var_ptr)
+void testExchange(__local uint* var_ptr)
+{
+    uint _39 = atomic_xchg(&var, 7u);
+}
+
+#undef var
+
+#define var (*var_ptr)
+void testCompSwap(__local uint* var_ptr)
+{
+    uint _42 = atomic_cmpxchg(&var, 8u, 9u);
+}
+
+#undef var
+
+#define var (*var_ptr)
+void testStore(__local uint* var_ptr)
+{
+    atomic_xchg(&var, 10u);
+}
+
+#undef var
+
+void foo(__local uint* var_ptr)
+{
+    testAdd(&var);
+    testMin(&var);
+    testMax(&var);
+    testOr(&var);
+    testXor(&var);
+    testExchange(&var);
+    testCompSwap(&var);
+    testStore(&var);
+}
+
+#undef var
+
+__attribute__((reqd_work_group_size(64, 1, 1)))
+__kernel void comp_main()
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    __local uint var;
+    foo(&var);
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/global-invocation-id-writable-ssbo-in-function.comp b/reference/shaders-opencl-no-opt/comp/global-invocation-id-writable-ssbo-in-function.comp
new file mode 100644
index 000000000..8edae4313
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/global-invocation-id-writable-ssbo-in-function.comp
@@ -0,0 +1,27 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct myBlock
+{
+    int a;
+    float b[1];
+};
+
+typedef struct myBlock myBlock;
+
+#define _RESERVED_IDENTIFIER_FIXUP_gl_GlobalInvocationID (*_RESERVED_IDENTIFIER_FIXUP_gl_GlobalInvocationID_ptr)
+float getB(__global myBlock* myStorage, __private uint3* _RESERVED_IDENTIFIER_FIXUP_gl_GlobalInvocationID_ptr)
+{
+    return myStorage->b[_RESERVED_IDENTIFIER_FIXUP_gl_GlobalInvocationID.x];
+}
+
+#undef _RESERVED_IDENTIFIER_FIXUP_gl_GlobalInvocationID
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global myBlock* myStorage)
+{
+    uint3 _RESERVED_IDENTIFIER_FIXUP_gl_GlobalInvocationID = ((uint3)(get_global_id(0), get_global_id(1), get_global_id(2)));
+    myStorage->a = (myStorage->a + 1) % 256;
+    myStorage->b[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] = fmod(getB(myStorage, &_RESERVED_IDENTIFIER_FIXUP_gl_GlobalInvocationID) + 0.0199999995529651641845703125f, 1.0f);
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/glsl.std450.comp b/reference/shaders-opencl-no-opt/comp/glsl.std450.comp
new file mode 100644
index 000000000..9188ba1e6
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/glsl.std450.comp
@@ -0,0 +1,234 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+typedef struct { float2 columns[2]; } spvMat2;
+typedef struct { float3 columns[3]; } spvMat3;
+typedef struct { float4 columns[4]; } spvMat4;
+
+struct SSBO
+{
+    float res;
+    int ires;
+    uint ures;
+    float4 f32;
+    int4 s32;
+    uint4 u32;
+    spvMat2 m2;
+    spvMat3 m3;
+    spvMat4 m4;
+};
+
+typedef struct SSBO SSBO;
+
+struct ResType
+{
+    float _m0;
+    float _m1;
+};
+
+typedef struct ResType ResType;
+
+struct ResType_1
+{
+    float _m0;
+    int _m1;
+};
+
+typedef struct ResType_1 ResType_1;
+
+uint spvPackHalf2x16(float2 v) {
+    uint r;
+    vstore_half(v.x, 0, (__private half *)&r);
+    vstore_half(v.y, 1, (__private half *)&r);
+    return r;
+}
+
+float2 spvUnpackHalf2x16(uint u) {
+    const __private uint *p = &u;
+    return (float2)(vload_half(0, (const __private half *)p),
+                   vload_half(1, (const __private half *)p));
+}
+
+static int spvFindLSB(uint x) {
+    if (x == 0u) return -1;
+    return 31 - as_int(clz(x & (0u - x)));
+}
+
+static uint spvPackSnorm4x8(float4 v) {
+    char4 packed = convert_char4_sat_rte(v * 127.0f);
+    return as_uint(packed);
+}
+
+static uint spvPackUnorm4x8(float4 v) {
+    uchar4 packed = convert_uchar4_sat_rte(v * 255.0f);
+    return as_uint(packed);
+}
+
+static uint spvPackSnorm2x16(float2 v) {
+    short2 packed = convert_short2_sat_rte(v * 32767.0f);
+    return as_uint(packed);
+}
+
+static uint spvPackUnorm2x16(float2 v) {
+    ushort2 packed = convert_ushort2_sat_rte(v * 65535.0f);
+    return as_uint(packed);
+}
+
+static float4 spvUnpackSnorm4x8(uint v) {
+    char4 packed = as_char4(v);
+    return max(convert_float4(packed) / 127.0f, (float4)(-1.0f));
+}
+
+static float4 spvUnpackUnorm4x8(uint v) {
+    uchar4 packed = as_uchar4(v);
+    return convert_float4(packed) / 255.0f;
+}
+
+static float2 spvUnpackSnorm2x16(uint v) {
+    short2 packed = as_short2(v);
+    return max(convert_float2(packed) / 32767.0f, (float2)(-1.0f));
+}
+
+static float2 spvUnpackUnorm2x16(uint v) {
+    ushort2 packed = as_ushort2(v);
+    return convert_float2(packed) / 65535.0f;
+}
+
+static float spvDeterminant2(spvMat2 m) {
+    return m.columns[0].x * m.columns[1].y - m.columns[0].y * m.columns[1].x;
+}
+
+static float spvDeterminant3(spvMat3 m) {
+    return dot(m.columns[0], (float3)(m.columns[1].y * m.columns[2].z - m.columns[1].z * m.columns[2].y, m.columns[1].z * m.columns[2].x - m.columns[1].x * m.columns[2].z, m.columns[1].x * m.columns[2].y - m.columns[1].y * m.columns[2].x));
+}
+
+static float spvDeterminant4(spvMat4 m) {
+    return dot(m.columns[0], (float4)(m.columns[2].y * m.columns[3].z * m.columns[1].w - m.columns[3].y * m.columns[2].z * m.columns[1].w + m.columns[3].y * m.columns[1].z * m.columns[2].w - m.columns[1].y * m.columns[3].z * m.columns[2].w - m.columns[2].y * m.columns[1].z * m.columns[3].w + m.columns[1].y * m.columns[2].z * m.columns[3].w, m.columns[3].x * m.columns[2].z * m.columns[1].w - m.columns[2].x * m.columns[3].z * m.columns[1].w - m.columns[3].x * m.columns[1].z * m.columns[2].w + m.columns[1].x * m.columns[3].z * m.columns[2].w + m.columns[2].x * m.columns[1].z * m.columns[3].w - m.columns[1].x * m.columns[2].z * m.columns[3].w, m.columns[2].x * m.columns[3].y * m.columns[1].w - m.columns[3].x * m.columns[2].y * m.columns[1].w + m.columns[3].x * m.columns[1].y * m.columns[2].w - m.columns[1].x * m.columns[3].y * m.columns[2].w - m.columns[2].x * m.columns[1].y * m.columns[3].w + m.columns[1].x * m.columns[2].y * m.columns[3].w, m.columns[3].x * m.columns[2].y * m.columns[1].z - m.columns[2].x * m.columns[3].y * m.columns[1].z - m.columns[3].x * m.columns[1].y * m.columns[2].z + m.columns[1].x * m.columns[3].y * m.columns[2].z + m.columns[2].x * m.columns[1].y * m.columns[3].z - m.columns[1].x * m.columns[2].y * m.columns[3].z));
+}
+
+static spvMat2 spvInverse2(spvMat2 m) {
+    float d = 1.0f / (m.columns[0].x * m.columns[1].y - m.columns[1].x * m.columns[0].y);
+    return (spvMat2){ { (float2)(m.columns[1].y * d, -m.columns[0].y * d), (float2)(-m.columns[1].x * d, m.columns[0].x * d) } };
+}
+
+static spvMat3 spvInverse3(spvMat3 m) {
+    float3 t = (float3)(m.columns[1].y * m.columns[2].z - m.columns[1].z * m.columns[2].y, m.columns[1].z * m.columns[2].x - m.columns[1].x * m.columns[2].z, m.columns[1].x * m.columns[2].y - m.columns[1].y * m.columns[2].x);
+    float d = 1.0f / dot(m.columns[0], t);
+    return (spvMat3){ { t * d, (float3)(m.columns[0].z * m.columns[2].y - m.columns[0].y * m.columns[2].z, m.columns[0].x * m.columns[2].z - m.columns[0].z * m.columns[2].x, m.columns[0].y * m.columns[2].x - m.columns[0].x * m.columns[2].y) * d, (float3)(m.columns[0].y * m.columns[1].z - m.columns[0].z * m.columns[1].y, m.columns[0].z * m.columns[1].x - m.columns[0].x * m.columns[1].z, m.columns[0].x * m.columns[1].y - m.columns[0].y * m.columns[1].x) * d } };
+}
+
+static spvMat4 spvInverse4(spvMat4 m) {
+    float4 t = (float4)(m.columns[2].y * m.columns[3].z * m.columns[1].w - m.columns[3].y * m.columns[2].z * m.columns[1].w + m.columns[3].y * m.columns[1].z * m.columns[2].w - m.columns[1].y * m.columns[3].z * m.columns[2].w - m.columns[2].y * m.columns[1].z * m.columns[3].w + m.columns[1].y * m.columns[2].z * m.columns[3].w, m.columns[3].x * m.columns[2].z * m.columns[1].w - m.columns[2].x * m.columns[3].z * m.columns[1].w - m.columns[3].x * m.columns[1].z * m.columns[2].w + m.columns[1].x * m.columns[3].z * m.columns[2].w + m.columns[2].x * m.columns[1].z * m.columns[3].w - m.columns[1].x * m.columns[2].z * m.columns[3].w, m.columns[2].x * m.columns[3].y * m.columns[1].w - m.columns[3].x * m.columns[2].y * m.columns[1].w + m.columns[3].x * m.columns[1].y * m.columns[2].w - m.columns[1].x * m.columns[3].y * m.columns[2].w - m.columns[2].x * m.columns[1].y * m.columns[3].w + m.columns[1].x * m.columns[2].y * m.columns[3].w, m.columns[3].x * m.columns[2].y * m.columns[1].z - m.columns[2].x * m.columns[3].y * m.columns[1].z - m.columns[3].x * m.columns[1].y * m.columns[2].z + m.columns[1].x * m.columns[3].y * m.columns[2].z + m.columns[2].x * m.columns[1].y * m.columns[3].z - m.columns[1].x * m.columns[2].y * m.columns[3].z);
+    spvMat4 r = (spvMat4){ { (float4)(t.x, m.columns[3].y * m.columns[2].z * m.columns[0].w - m.columns[2].y * m.columns[3].z * m.columns[0].w - m.columns[3].y * m.columns[0].z * m.columns[2].w + m.columns[0].y * m.columns[3].z * m.columns[2].w + m.columns[2].y * m.columns[0].z * m.columns[3].w - m.columns[0].y * m.columns[2].z * m.columns[3].w, m.columns[1].y * m.columns[3].z * m.columns[0].w - m.columns[3].y * m.columns[1].z * m.columns[0].w + m.columns[3].y * m.columns[0].z * m.columns[1].w - m.columns[0].y * m.columns[3].z * m.columns[1].w - m.columns[1].y * m.columns[0].z * m.columns[3].w + m.columns[0].y * m.columns[1].z * m.columns[3].w, m.columns[2].y * m.columns[1].z * m.columns[0].w - m.columns[1].y * m.columns[2].z * m.columns[0].w - m.columns[2].y * m.columns[0].z * m.columns[1].w + m.columns[0].y * m.columns[2].z * m.columns[1].w + m.columns[1].y * m.columns[0].z * m.columns[2].w - m.columns[0].y * m.columns[1].z * m.columns[2].w), (float4)(t.y, m.columns[2].x * m.columns[3].z * m.columns[0].w - m.columns[3].x * m.columns[2].z * m.columns[0].w + m.columns[3].x * m.columns[0].z * m.columns[2].w - m.columns[0].x * m.columns[3].z * m.columns[2].w - m.columns[2].x * m.columns[0].z * m.columns[3].w + m.columns[0].x * m.columns[2].z * m.columns[3].w, m.columns[3].x * m.columns[1].z * m.columns[0].w - m.columns[1].x * m.columns[3].z * m.columns[0].w - m.columns[3].x * m.columns[0].z * m.columns[1].w + m.columns[0].x * m.columns[3].z * m.columns[1].w + m.columns[1].x * m.columns[0].z * m.columns[3].w - m.columns[0].x * m.columns[1].z * m.columns[3].w, m.columns[1].x * m.columns[2].z * m.columns[0].w - m.columns[2].x * m.columns[1].z * m.columns[0].w + m.columns[2].x * m.columns[0].z * m.columns[1].w - m.columns[0].x * m.columns[2].z * m.columns[1].w - m.columns[1].x * m.columns[0].z * m.columns[2].w + m.columns[0].x * m.columns[1].z * m.columns[2].w), (float4)(t.z, m.columns[3].x * m.columns[2].y * m.columns[0].w - m.columns[2].x * m.columns[3].y * 
m.columns[0].w - m.columns[3].x * m.columns[0].y * m.columns[2].w + m.columns[0].x * m.columns[3].y * m.columns[2].w + m.columns[2].x * m.columns[0].y * m.columns[3].w - m.columns[0].x * m.columns[2].y * m.columns[3].w, m.columns[1].x * m.columns[3].y * m.columns[0].w - m.columns[3].x * m.columns[1].y * m.columns[0].w + m.columns[3].x * m.columns[0].y * m.columns[1].w - m.columns[0].x * m.columns[3].y * m.columns[1].w - m.columns[1].x * m.columns[0].y * m.columns[3].w + m.columns[0].x * m.columns[1].y * m.columns[3].w, m.columns[2].x * m.columns[1].y * m.columns[0].w - m.columns[1].x * m.columns[2].y * m.columns[0].w - m.columns[2].x * m.columns[0].y * m.columns[1].w + m.columns[0].x * m.columns[2].y * m.columns[1].w + m.columns[1].x * m.columns[0].y * m.columns[2].w - m.columns[0].x * m.columns[1].y * m.columns[2].w), (float4)(t.w, m.columns[2].x * m.columns[3].y * m.columns[0].z - m.columns[3].x * m.columns[2].y * m.columns[0].z + m.columns[3].x * m.columns[0].y * m.columns[2].z - m.columns[0].x * m.columns[3].y * m.columns[2].z - m.columns[2].x * m.columns[0].y * m.columns[3].z + m.columns[0].x * m.columns[2].y * m.columns[3].z, m.columns[3].x * m.columns[1].y * m.columns[0].z - m.columns[1].x * m.columns[3].y * m.columns[0].z - m.columns[3].x * m.columns[0].y * m.columns[1].z + m.columns[0].x * m.columns[3].y * m.columns[1].z + m.columns[1].x * m.columns[0].y * m.columns[3].z - m.columns[0].x * m.columns[1].y * m.columns[3].z, m.columns[1].x * m.columns[2].y * m.columns[0].z - m.columns[2].x * m.columns[1].y * m.columns[0].z + m.columns[2].x * m.columns[0].y * m.columns[1].z - m.columns[0].x * m.columns[2].y * m.columns[1].z - m.columns[1].x * m.columns[0].y * m.columns[2].z + m.columns[0].x * m.columns[1].y * m.columns[2].z) } };
+    float d = 1.0f / dot(m.columns[0], t);
+    r.columns[0] *= d; r.columns[1] *= d; r.columns[2] *= d; r.columns[3] *= d;
+    return r;
+}
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global SSBO* _19)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    _19->res = round(_19->f32[0u]);
+    _19->res = rint(_19->f32[0u]);
+    _19->res = trunc(_19->f32[0u]);
+    _19->res = fabs(_19->f32[0u]);
+    _19->ires = as_int(abs(_19->s32[0u]));
+    _19->res = sign(_19->f32[0u]);
+    _19->ires = clamp(_19->s32[0u], -1, 1);
+    _19->res = floor(_19->f32[0u]);
+    _19->res = ceil(_19->f32[0u]);
+    _19->res = (_19->f32[0u] - floor(_19->f32[0u]));
+    _19->res = radians(_19->f32[0u]);
+    _19->res = degrees(_19->f32[0u]);
+    _19->res = sin(_19->f32[0u]);
+    _19->res = cos(_19->f32[0u]);
+    _19->res = tan(_19->f32[0u]);
+    _19->res = asin(_19->f32[0u]);
+    _19->res = acos(_19->f32[0u]);
+    _19->res = atan(_19->f32[0u]);
+    _19->res = sinh(_19->f32[0u]);
+    _19->res = cosh(_19->f32[0u]);
+    _19->res = tanh(_19->f32[0u]);
+    _19->res = asinh(_19->f32[0u]);
+    _19->res = acosh(_19->f32[0u]);
+    _19->res = atanh(_19->f32[0u]);
+    _19->res = atan2(_19->f32[0u], _19->f32[1u]);
+    _19->res = pow(_19->f32[0u], _19->f32[1u]);
+    _19->res = exp(_19->f32[0u]);
+    _19->res = log(_19->f32[0u]);
+    _19->res = exp2(_19->f32[0u]);
+    _19->res = log2(_19->f32[0u]);
+    _19->res = sqrt(_19->f32[0u]);
+    _19->res = rsqrt(_19->f32[0u]);
+    _19->res = fabs(_19->f32[0u]);
+    _19->res = fabs(_19->f32[0u] - _19->f32[1u]);
+    _19->res = sign(_19->f32[0u]);
+    _19->res = (_19->f32[2u] * _19->f32[1u] < 0.0f ? _19->f32[0u] : -_19->f32[0u]);
+    _19->res = _19->f32[0u] - 2.0f * _19->f32[1u] * _19->f32[0u] * _19->f32[1u];
+    float _195 = (float)(0.0f);
+    {
+        float spv_NdotI = _19->f32[1u] * _19->f32[0u];
+        float spv_k = 1.0f - _19->f32[2u] * _19->f32[2u] * (1.0f - spv_NdotI * spv_NdotI);
+        if (spv_k >= 0.0f)
+            _195 = _19->f32[2u] * _19->f32[0u] - (_19->f32[2u] * spv_NdotI + sqrt(spv_k)) * _19->f32[1u];
+    }
+    _19->res = _195;
+    _19->res = length(_19->f32.xy);
+    _19->res = distance(_19->f32.xy, _19->f32.zw);
+    float2 v2_1 = normalize(_19->f32.xy);
+    v2_1 = (dot(_19->f32.zw, _19->f32.yz) < 0.0f ? _19->f32.xy : -_19->f32.xy);
+    v2_1 = _19->f32.xy - 2.0f * dot(_19->f32.zw, _19->f32.xy) * _19->f32.zw;
+    float2 _243 = (float2)(0.0f);
+    {
+        float spv_NdotI = dot(_19->f32.yz, _19->f32.xy);
+        float spv_k = 1.0f - _19->f32[3u] * _19->f32[3u] * (1.0f - spv_NdotI * spv_NdotI);
+        if (spv_k >= 0.0f)
+            _243 = _19->f32[3u] * _19->f32.xy - (_19->f32[3u] * spv_NdotI + sqrt(spv_k)) * _19->f32.yz;
+    }
+    v2_1 = _243;
+    float3 v3_1 = cross(_19->f32.xyz, _19->f32.yzw);
+    _19->res = spvDeterminant2(_19->m2);
+    _19->res = spvDeterminant3(_19->m3);
+    _19->res = spvDeterminant4(_19->m4);
+    _19->m2 = spvInverse2(_19->m2);
+    _19->m3 = spvInverse3(_19->m3);
+    _19->m4 = spvInverse4(_19->m4);
+    ResType _288;
+    _288._m0 = modf(_19->f32[0u], &_288._m1);
+    float tmp_1 = _288._m1;
+    _19->res = _288._m0;
+    _19->res = fmin(_19->f32[0u], _19->f32[1u]);
+    _19->ures = min(_19->u32[0u], _19->u32[1u]);
+    _19->ires = min(_19->s32[0u], _19->s32[1u]);
+    _19->res = fmax(_19->f32[0u], _19->f32[1u]);
+    _19->ures = max(_19->u32[0u], _19->u32[1u]);
+    _19->ires = max(_19->s32[0u], _19->s32[1u]);
+    _19->res = clamp(_19->f32[0u], _19->f32[1u], _19->f32[2u]);
+    _19->ures = clamp(_19->u32[0u], _19->u32[1u], _19->u32[2u]);
+    _19->ires = clamp(_19->s32[0u], _19->s32[1u], _19->s32[2u]);
+    _19->res = mix(_19->f32[0u], _19->f32[1u], _19->f32[2u]);
+    _19->res = step(_19->f32[0u], _19->f32[1u]);
+    _19->res = smoothstep(_19->f32[0u], _19->f32[1u], _19->f32[2u]);
+    _19->res = fma(_19->f32[0u], _19->f32[1u], _19->f32[2u]);
+    ResType_1 _390;
+    _390._m0 = frexp(_19->f32[0u], &_390._m1);
+    int itmp_1 = _390._m1;
+    _19->res = _390._m0;
+    _19->res = ldexp(_19->f32[0u], itmp_1);
+    _19->ures = spvPackSnorm4x8(_19->f32);
+    _19->ures = spvPackUnorm4x8(_19->f32);
+    _19->ures = spvPackSnorm2x16(_19->f32.xy);
+    _19->ures = spvPackUnorm2x16(_19->f32.xy);
+    _19->ures = spvPackHalf2x16(_19->f32.xy);
+    v2_1 = spvUnpackSnorm2x16(_19->u32[0u]);
+    v2_1 = spvUnpackUnorm2x16(_19->u32[0u]);
+    v2_1 = spvUnpackHalf2x16(_19->u32[0u]);
+    float4 v4_1 = spvUnpackSnorm4x8(_19->u32[0u]);
+    v4_1 = spvUnpackUnorm4x8(_19->u32[0u]);
+    _19->s32 = (int4)(spvFindLSB(as_uint(_19->s32.x)), spvFindLSB(as_uint(_19->s32.y)), spvFindLSB(as_uint(_19->s32.z)), spvFindLSB(as_uint(_19->s32.w)));
+    _19->s32 = (int4)(spvFindLSB(_19->u32.x), spvFindLSB(_19->u32.y), spvFindLSB(_19->u32.z), spvFindLSB(_19->u32.w));
+    _19->s32 = 31 - clz(_19->s32 ^ (_19->s32 >> 31));
+    _19->s32 = 31 - as_int4(clz(_19->u32));
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/illegal-struct-name.asm.comp b/reference/shaders-opencl-no-opt/comp/illegal-struct-name.asm.comp
new file mode 100644
index 000000000..7dde55fc7
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/illegal-struct-name.asm.comp
@@ -0,0 +1,27 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct Foo
+{
+    float _abs;
+};
+
+typedef struct Foo Foo;
+
+struct SSBO
+{
+    Foo foo;
+    Foo foo2;
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global SSBO* _9)
+{
+    Foo f;
+    f._abs = _9->foo._abs;
+    int _abs = 10;
+    _9->foo2._abs = f._abs;
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/implicit-integer-promotion.fp16.invalid.comp b/reference/shaders-opencl-no-opt/comp/implicit-integer-promotion.fp16.invalid.comp
new file mode 100644
index 000000000..7ee5a5f89
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/implicit-integer-promotion.fp16.invalid.comp
@@ -0,0 +1,89 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+struct BUF0
+{
+    half2 f16s;
+    ushort2 u16;
+    short2 i16;
+    ushort4 u16s;
+    short4 i16s;
+    half f16;
+};
+
+typedef struct BUF0 BUF0;
+
+void test_u16(__global BUF0* _24)
+{
+    _24->f16 += as_half(ushort(_24->u16[0u] + _24->u16[1u]));
+    _24->f16 += as_half(ushort(_24->u16[0u] - _24->u16[1u]));
+    _24->f16 += as_half(ushort(_24->u16[0u] * _24->u16[1u]));
+    _24->f16 += as_half(ushort(_24->u16[0u] / _24->u16[1u]));
+    _24->f16 += as_half(ushort(_24->u16[0u] % _24->u16[1u]));
+    _24->f16 += as_half(ushort(_24->u16[0u] << _24->u16[1u]));
+    _24->f16 += as_half(ushort(_24->u16[0u] >> _24->u16[1u]));
+    _24->f16 += as_half(ushort(~_24->u16[0u]));
+    _24->f16 += as_half(ushort(-_24->u16[0u]));
+    _24->f16 += as_half(ushort(_24->u16[0u] ^ _24->u16[1u]));
+    _24->f16 += as_half(ushort(_24->u16[0u] & _24->u16[1u]));
+    _24->f16 += as_half(ushort(_24->u16[0u] | _24->u16[1u]));
+}
+
+void test_i16(__global BUF0* _24)
+{
+    _24->f16 += as_half(short(_24->i16[0u] + _24->i16[1u]));
+    _24->f16 += as_half(short(_24->i16[0u] - _24->i16[1u]));
+    _24->f16 += as_half(short(_24->i16[0u] * _24->i16[1u]));
+    _24->f16 += as_half(short(_24->i16[0u] / _24->i16[1u]));
+    _24->f16 += as_half(short(_24->i16[0u] % _24->i16[1u]));
+    _24->f16 += as_half(short(_24->i16[0u] << _24->i16[1u]));
+    _24->f16 += as_half(short(_24->i16[0u] >> _24->i16[1u]));
+    _24->f16 += as_half(short(~_24->i16[0u]));
+    _24->f16 += as_half(short(-_24->i16[0u]));
+    _24->f16 += as_half(short(_24->i16[0u] ^ _24->i16[1u]));
+    _24->f16 += as_half(short(_24->i16[0u] & _24->i16[1u]));
+    _24->f16 += as_half(short(_24->i16[0u] | _24->i16[1u]));
+}
+
+void test_u16s(__global BUF0* _24)
+{
+    _24->f16s += as_half2(_24->u16s.xy + _24->u16s.zw);
+    _24->f16s += as_half2(_24->u16s.xy - _24->u16s.zw);
+    _24->f16s += as_half2(_24->u16s.xy * _24->u16s.zw);
+    _24->f16s += as_half2(_24->u16s.xy / _24->u16s.zw);
+    _24->f16s += as_half2(_24->u16s.xy % _24->u16s.zw);
+    _24->f16s += as_half2(_24->u16s.xy << _24->u16s.zw);
+    _24->f16s += as_half2(_24->u16s.xy >> _24->u16s.zw);
+    _24->f16s += as_half2(~_24->u16s.xy);
+    _24->f16s += as_half2(-_24->u16s.xy);
+    _24->f16s += as_half2(_24->u16s.xy ^ _24->u16s.zw);
+    _24->f16s += as_half2(_24->u16s.xy & _24->u16s.zw);
+    _24->f16s += as_half2(_24->u16s.xy | _24->u16s.zw);
+}
+
+void test_i16s(__global BUF0* _24)
+{
+    _24->f16s += as_half2(_24->i16s.xy + _24->i16s.zw);
+    _24->f16s += as_half2(_24->i16s.xy - _24->i16s.zw);
+    _24->f16s += as_half2(_24->i16s.xy * _24->i16s.zw);
+    _24->f16s += as_half2(_24->i16s.xy / _24->i16s.zw);
+    _24->f16s += as_half2(_24->i16s.xy % _24->i16s.zw);
+    _24->f16s += as_half2(_24->i16s.xy << _24->i16s.zw);
+    _24->f16s += as_half2(_24->i16s.xy >> _24->i16s.zw);
+    _24->f16s += as_half2(~_24->i16s.xy);
+    _24->f16s += as_half2(-_24->i16s.xy);
+    _24->f16s += as_half2(_24->i16s.xy ^ _24->i16s.zw);
+    _24->f16s += as_half2(_24->i16s.xy & _24->i16s.zw);
+    _24->f16s += as_half2(_24->i16s.xy | _24->i16s.zw);
+}
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global BUF0* _24)
+{
+    test_u16(_24);
+    test_i16(_24);
+    test_u16s(_24);
+    test_i16s(_24);
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/int16min-literal.fp16.invalid.comp b/reference/shaders-opencl-no-opt/comp/int16min-literal.fp16.invalid.comp
new file mode 100644
index 000000000..4d3324d9e
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/int16min-literal.fp16.invalid.comp
@@ -0,0 +1,27 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+struct UBO
+{
+    half b;
+};
+
+typedef struct UBO UBO;
+
+struct SSBO
+{
+    half a;
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(UBO _12, __global half* _24)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    short v = as_short(_12.b);
+    v = short(v ^ (-32768s));
+    _24[0] = as_half(v);
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/int64.invalid.comp b/reference/shaders-opencl-no-opt/comp/int64.invalid.comp
new file mode 100644
index 000000000..13fc8bf8b
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/int64.invalid.comp
@@ -0,0 +1,75 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct M0
+{
+    long v;
+    long2 b[2];
+    ulong c;
+    ulong d[5];
+};
+
+typedef struct M0 M0;
+
+struct SSBO0_Type
+{
+    long4 a;
+    M0 m0;
+};
+
+typedef struct SSBO0_Type SSBO0_Type;
+
+struct SSBO1_Type
+{
+    ulong4 b;
+    M0 m0;
+};
+
+typedef struct SSBO1_Type SSBO1_Type;
+
+struct SSBO2_Type
+{
+    long a[4];
+    long2 b[4];
+};
+
+typedef struct SSBO2_Type SSBO2_Type;
+
+struct SSBO3_Type
+{
+    long a[4];
+    long2 b[4];
+};
+
+typedef struct SSBO3_Type SSBO3_Type;
+
+struct SSBO
+{
+    int s32;
+    uint u32;
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global SSBO* _96)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    SSBO0_Type ssbo_0;
+    ssbo_0.a += (long4)(10l, 20l, 30l, 40l);
+    SSBO1_Type ssbo_1;
+    ssbo_1.b += (ulong4)(999999999999999999ul, 8888888888888888ul, 77777777777777777ul, 6666666666666666ul);
+    ssbo_0.a += (long4)(20l);
+    ssbo_0.a = as_long4(abs(ssbo_0.a + as_long4(ssbo_1.b)));
+    ssbo_0.a += (long4)(1l);
+    ssbo_1.b += as_ulong4((long4)(1l));
+    ssbo_0.a -= (long4)(1l);
+    ssbo_1.b -= as_ulong4((long4)(1l));
+    SSBO2_Type ssbo_2;
+    ssbo_2.a[0] += 1l;
+    SSBO3_Type ssbo_3;
+    ssbo_3.a[0] += 2l;
+    _96->s32 = as_int(convert_uint(((as_ulong(ssbo_0.a.x) + ssbo_1.b.y) + as_ulong(ssbo_2.a[1])) + as_ulong(ssbo_3.a[2])));
+    _96->u32 = convert_uint(((as_ulong(ssbo_0.a.y) + ssbo_1.b.z) + as_ulong(ssbo_2.a[0])) + as_ulong(ssbo_3.a[1]));
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/int64min-literal.comp b/reference/shaders-opencl-no-opt/comp/int64min-literal.comp
new file mode 100644
index 000000000..1697efd5b
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/int64min-literal.comp
@@ -0,0 +1,26 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct UBO
+{
+    float b;
+};
+
+typedef struct UBO UBO;
+
+struct SSBO
+{
+    float a;
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(UBO _12, __global float* _25)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    long v = convert_long(as_int(_12.b));
+    v ^= (long)(0x8000000000000000ul);
+    _25[0] = as_float(convert_int(v));
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/integer-dot-product.comp b/reference/shaders-opencl-no-opt/comp/integer-dot-product.comp
new file mode 100644
index 000000000..a76cd28a7
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/integer-dot-product.comp
@@ -0,0 +1,58 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct InOut3
+{
+    ushort4 x;
+    ushort4 y;
+    int acc;
+    int result;
+};
+
+typedef struct InOut3 InOut3;
+
+struct InOut2
+{
+    uint x;
+    uint y;
+    uint result;
+};
+
+typedef struct InOut2 InOut2;
+
+struct InOut
+{
+    uint4 x;
+    uint4 y;
+    int result;
+};
+
+typedef struct InOut InOut;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global InOut3* comp3, __global InOut2* comp2, __global InOut* comp)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    int sdot_int = (int)(as_short4(comp3->x).s0) * (int)(as_short4(comp3->y).s0) + (int)(as_short4(comp3->x).s1) * (int)(as_short4(comp3->y).s1) + (int)(as_short4(comp3->x).s2) * (int)(as_short4(comp3->y).s2) + (int)(as_short4(comp3->x).s3) * (int)(as_short4(comp3->y).s3);
+    uint sdot_uint = (uint)(as_short4(comp3->x).s0) * (uint)(as_short4(comp3->y).s0) + (uint)(as_short4(comp3->x).s1) * (uint)(as_short4(comp3->y).s1) + (uint)(as_short4(comp3->x).s2) * (uint)(as_short4(comp3->y).s2) + (uint)(as_short4(comp3->x).s3) * (uint)(as_short4(comp3->y).s3);
+    uint udot_uint = (uint)(comp3->x.s0) * (uint)(comp3->y.s0) + (uint)(comp3->x.s1) * (uint)(comp3->y.s1) + (uint)(comp3->x.s2) * (uint)(comp3->y.s2) + (uint)(comp3->x.s3) * (uint)(comp3->y.s3);
+    int sudot_int = (int)(as_short4(comp3->x).s0) * (int)(comp3->y.s0) + (int)(as_short4(comp3->x).s1) * (int)(comp3->y.s1) + (int)(as_short4(comp3->x).s2) * (int)(comp3->y.s2) + (int)(as_short4(comp3->x).s3) * (int)(comp3->y.s3);
+    uint sudot_uint = (uint)(as_short4(comp3->x).s0) * (uint)(comp3->y.s0) + (uint)(as_short4(comp3->x).s1) * (uint)(comp3->y.s1) + (uint)(as_short4(comp3->x).s2) * (uint)(comp3->y.s2) + (uint)(as_short4(comp3->x).s3) * (uint)(comp3->y.s3);
+    uchar spdot8 = (uchar)(as_char4(comp2->x).s0) * (uchar)(as_char4(comp2->y).s0) + (uchar)(as_char4(comp2->x).s1) * (uchar)(as_char4(comp2->y).s1) + (uchar)(as_char4(comp2->x).s2) * (uchar)(as_char4(comp2->y).s2) + (uchar)(as_char4(comp2->x).s3) * (uchar)(as_char4(comp2->y).s3);
+    ushort spdot16 = (ushort)(as_char4(comp2->x).s0) * (ushort)(as_char4(comp2->y).s0) + (ushort)(as_char4(comp2->x).s1) * (ushort)(as_char4(comp2->y).s1) + (ushort)(as_char4(comp2->x).s2) * (ushort)(as_char4(comp2->y).s2) + (ushort)(as_char4(comp2->x).s3) * (ushort)(as_char4(comp2->y).s3);
+    uint spdot32 = (uint)(as_char4(comp2->x).s0) * (uint)(as_char4(comp2->y).s0) + (uint)(as_char4(comp2->x).s1) * (uint)(as_char4(comp2->y).s1) + (uint)(as_char4(comp2->x).s2) * (uint)(as_char4(comp2->y).s2) + (uint)(as_char4(comp2->x).s3) * (uint)(as_char4(comp2->y).s3);
+    int spdoti32 = (int)(as_char4(comp2->x).s0) * (int)(as_char4(comp2->y).s0) + (int)(as_char4(comp2->x).s1) * (int)(as_char4(comp2->y).s1) + (int)(as_char4(comp2->x).s2) * (int)(as_char4(comp2->y).s2) + (int)(as_char4(comp2->x).s3) * (int)(as_char4(comp2->y).s3);
+    uchar updot8 = (uchar)(as_uchar4(comp2->x).s0) * (uchar)(as_uchar4(comp2->y).s0) + (uchar)(as_uchar4(comp2->x).s1) * (uchar)(as_uchar4(comp2->y).s1) + (uchar)(as_uchar4(comp2->x).s2) * (uchar)(as_uchar4(comp2->y).s2) + (uchar)(as_uchar4(comp2->x).s3) * (uchar)(as_uchar4(comp2->y).s3);
+    ushort updot16 = (ushort)(as_uchar4(comp2->x).s0) * (ushort)(as_uchar4(comp2->y).s0) + (ushort)(as_uchar4(comp2->x).s1) * (ushort)(as_uchar4(comp2->y).s1) + (ushort)(as_uchar4(comp2->x).s2) * (ushort)(as_uchar4(comp2->y).s2) + (ushort)(as_uchar4(comp2->x).s3) * (ushort)(as_uchar4(comp2->y).s3);
+    uint updot32 = (uint)(as_uchar4(comp2->x).s0) * (uint)(as_uchar4(comp2->y).s0) + (uint)(as_uchar4(comp2->x).s1) * (uint)(as_uchar4(comp2->y).s1) + (uint)(as_uchar4(comp2->x).s2) * (uint)(as_uchar4(comp2->y).s2) + (uint)(as_uchar4(comp2->x).s3) * (uint)(as_uchar4(comp2->y).s3);
+    uchar supdot8 = (uchar)(as_char4(comp2->x).s0) * (uchar)(as_uchar4(comp2->y).s0) + (uchar)(as_char4(comp2->x).s1) * (uchar)(as_uchar4(comp2->y).s1) + (uchar)(as_char4(comp2->x).s2) * (uchar)(as_uchar4(comp2->y).s2) + (uchar)(as_char4(comp2->x).s3) * (uchar)(as_uchar4(comp2->y).s3);
+    ushort supdot16 = (ushort)(as_char4(comp2->x).s0) * (ushort)(as_uchar4(comp2->y).s0) + (ushort)(as_char4(comp2->x).s1) * (ushort)(as_uchar4(comp2->y).s1) + (ushort)(as_char4(comp2->x).s2) * (ushort)(as_uchar4(comp2->y).s2) + (ushort)(as_char4(comp2->x).s3) * (ushort)(as_uchar4(comp2->y).s3);
+    uint supdot32 = (uint)(as_char4(comp2->x).s0) * (uint)(as_uchar4(comp2->y).s0) + (uint)(as_char4(comp2->x).s1) * (uint)(as_uchar4(comp2->y).s1) + (uint)(as_char4(comp2->x).s2) * (uint)(as_uchar4(comp2->y).s2) + (uint)(as_char4(comp2->x).s3) * (uint)(as_uchar4(comp2->y).s3);
+    int supdoti32 = (int)(as_char4(comp2->x).s0) * (int)(as_uchar4(comp2->y).s0) + (int)(as_char4(comp2->x).s1) * (int)(as_uchar4(comp2->y).s1) + (int)(as_char4(comp2->x).s2) * (int)(as_uchar4(comp2->y).s2) + (int)(as_char4(comp2->x).s3) * (int)(as_uchar4(comp2->y).s3);
+    int sdotaddsat_int = (int)add_sat((int)(as_short4(comp3->x).s0) * (int)(as_short4(comp3->y).s0) + (int)(as_short4(comp3->x).s1) * (int)(as_short4(comp3->y).s1) + (int)(as_short4(comp3->x).s2) * (int)(as_short4(comp3->y).s2) + (int)(as_short4(comp3->x).s3) * (int)(as_short4(comp3->y).s3), comp3->acc);
+    uint sdotaddsat_uint = (uint)add_sat((int)(as_short4(comp3->x).s0) * (int)(as_short4(comp3->y).s0) + (int)(as_short4(comp3->x).s1) * (int)(as_short4(comp3->y).s1) + (int)(as_short4(comp3->x).s2) * (int)(as_short4(comp3->y).s2) + (int)(as_short4(comp3->x).s3) * (int)(as_short4(comp3->y).s3), comp3->acc);
+    uint udotaddsat_uint = (uint)add_sat((uint)(comp3->x.s0) * (uint)(comp3->y.s0) + (uint)(comp3->x.s1) * (uint)(comp3->y.s1) + (uint)(comp3->x.s2) * (uint)(comp3->y.s2) + (uint)(comp3->x.s3) * (uint)(comp3->y.s3), as_uint(comp3->acc));
+    int sudotaddsat_int = (int)add_sat((int)(as_short4(comp3->x).s0) * (int)(comp3->y.s0) + (int)(as_short4(comp3->x).s1) * (int)(comp3->y.s1) + (int)(as_short4(comp3->x).s2) * (int)(comp3->y.s2) + (int)(as_short4(comp3->x).s3) * (int)(comp3->y.s3), comp3->acc);
+    uint sudotaddsat_uint = (uint)add_sat((int)(as_short4(comp3->x).s0) * (int)(comp3->y.s0) + (int)(as_short4(comp3->x).s1) * (int)(comp3->y.s1) + (int)(as_short4(comp3->x).s2) * (int)(comp3->y.s2) + (int)(as_short4(comp3->x).s3) * (int)(comp3->y.s3), comp3->acc);
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/intmin-literal.comp b/reference/shaders-opencl-no-opt/comp/intmin-literal.comp
new file mode 100644
index 000000000..78eca3f61
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/intmin-literal.comp
@@ -0,0 +1,24 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO
+{
+    float a;
+};
+
+typedef struct SSBO SSBO;
+
+struct UBO
+{
+    float b;
+};
+
+typedef struct UBO UBO;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global float* _9, UBO _14)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    _9[0] = as_float(as_int(_14.b) ^ (int)(0x80000000));
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/loop.comp b/reference/shaders-opencl-no-opt/comp/loop.comp
new file mode 100644
index 000000000..84472618d
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/loop.comp
@@ -0,0 +1,100 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+typedef struct { float4 columns[4]; } spvMat4;
+
+struct SSBO
+{
+    spvMat4 mvp;
+    float4 in_data[1];
+};
+
+typedef struct SSBO SSBO;
+
+struct SSBO2
+{
+    float4 out_data[1];
+};
+
+typedef struct SSBO2 SSBO2;
+
+static float4 spvMulMat4Vec4(spvMat4 m, float4 v)
+{
+    return m.columns[0] * v.x + m.columns[1] * v.y + m.columns[2] * v.z + m.columns[3] * v.w;
+}
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global const SSBO* _24, __global float4* _177)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    uint ident_1 = ((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x;
+    float4 idat_1 = _24->in_data[ident_1];
+    int k_1 = 0;
+    uint i_2 = 0u;
+    if (idat_1.y == 20.0f)
+    {
+        do
+        {
+            k_1 *= 2;
+            i_2 += as_uint(1);
+        } while (i_2 < ident_1);
+    }
+    switch (k_1)
+    {
+        case 10:
+        {
+            for (;;)
+            {
+                i_2 += as_uint(1);
+                if (i_2 > 10u)
+                {
+                    break;
+                }
+                continue;
+            }
+            break;
+        }
+        default:
+        {
+            for (;;)
+            {
+                i_2 += 2u;
+                if (i_2 > 20u)
+                {
+                    break;
+                }
+                continue;
+            }
+            break;
+        }
+    }
+    while (k_1 < 10)
+    {
+        idat_1 *= 2.0f;
+        k_1++;
+    }
+    for (uint i_1_1 = 0u; i_1_1 < 16u; i_1_1 += as_uint(1), k_1++)
+    {
+        for (uint j_1 = 0u; j_1 < 30u; j_1 += as_uint(1))
+        {
+            idat_1 = spvMulMat4Vec4(_24->mvp, idat_1);
+        }
+    }
+    k_1 = 0;
+    for (;;)
+    {
+        k_1++;
+        if (k_1 > 10)
+        {
+            k_1 += 2;
+        }
+        else
+        {
+            k_1 += 3;
+            continue;
+        }
+        k_1 += 10;
+        continue;
+    }
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/read-only-coherent-image.invalid.comp b/reference/shaders-opencl-no-opt/comp/read-only-coherent-image.invalid.comp
new file mode 100644
index 000000000..0e8f8174f
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/read-only-coherent-image.invalid.comp
@@ -0,0 +1,17 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO
+{
+    uint val;
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global uint* _9, write_only image2d_t img)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    _9[0] = read_imageui(img, (int2)(10)).x;
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/return.comp b/reference/shaders-opencl-no-opt/comp/return.comp
new file mode 100644
index 000000000..1f5bb5cb6
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/return.comp
@@ -0,0 +1,39 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO2
+{
+    float4 out_data[1];
+};
+
+typedef struct SSBO2 SSBO2;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global float4* _27)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    uint ident = ((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x;
+    if (ident == 2u)
+    {
+        _27[ident] = (float4)(20.0f);
+    }
+    else
+    {
+        if (ident == 4u)
+        {
+            _27[ident] = (float4)(10.0f);
+            return;
+        }
+    }
+    int i = 0;
+    while (i < 20)
+    {
+        if (i == 10)
+        {
+            break;
+        }
+        return;
+    }
+    _27[ident] = (float4)(10.0f);
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/simple-bindless-ssbo.argument.argument-tier-1.device-argument-buffer.invalid.comp b/reference/shaders-opencl-no-opt/comp/simple-bindless-ssbo.argument.argument-tier-1.device-argument-buffer.invalid.comp
new file mode 100644
index 000000000..e69de29bb
diff --git a/reference/shaders-opencl-no-opt/comp/std140-array-load-composite-construct.comp b/reference/shaders-opencl-no-opt/comp/std140-array-load-composite-construct.comp
new file mode 100644
index 000000000..ba07a77f0
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/std140-array-load-composite-construct.comp
@@ -0,0 +1,18 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO
+{
+    float a[16];
+    float4 b[16];
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global SSBO* _14)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    _14->b[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x] = (float4)(_14->a[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x]);
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/struct-packing-scalar.nocompat.invalid.vk.comp b/reference/shaders-opencl-no-opt/comp/struct-packing-scalar.nocompat.invalid.vk.comp
new file mode 100644
index 000000000..ae9db77c3
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/struct-packing-scalar.nocompat.invalid.vk.comp
@@ -0,0 +1,144 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+typedef struct { float2 columns[2]; } spvMat2;
+typedef struct { float3 columns[2]; } spvMat2x3;
+typedef struct { float2 columns[3]; } spvMat3x2;
+
+struct S0
+{
+    float2 a[1];
+    float b;
+};
+
+typedef struct S0 S0;
+
+struct S1
+{
+    float3 a;
+    float b;
+};
+
+typedef struct S1 S1;
+
+struct S2
+{
+    float3 a[1];
+    float b;
+};
+
+typedef struct S2 S2;
+
+struct S3
+{
+    float2 a;
+    float b;
+};
+
+typedef struct S3 S3;
+
+struct Content
+{
+    S0 m0s[1];
+    S1 m1s[1];
+    S2 m2s[1];
+    S0 m0;
+    S1 m1;
+    S2 m2;
+    S3 m3;
+    float m4;
+};
+
+typedef struct Content Content;
+
+struct SSBO1
+{
+    Content content;
+    Content content1[2];
+    Content content2;
+    spvMat2 m0;
+    spvMat2 m1;
+    spvMat2x3 m2[4];
+    spvMat3x2 m3;
+    spvMat2 m4;
+    spvMat2 m5[9];
+    spvMat3x2 m6[4][2];
+    spvMat2x3 m7;
+    float array[1];
+};
+
+typedef struct SSBO1 SSBO1;
+
+struct SSBO0
+{
+    Content content;
+    Content content1[2];
+    Content content2;
+    spvMat2 m0;
+    spvMat2 m1;
+    spvMat2x3 m2[4];
+    spvMat3x2 m3;
+    spvMat2 m4;
+    spvMat2 m5[9];
+    spvMat3x2 m6[4][2];
+    spvMat2x3 m7;
+    float array[1];
+};
+
+typedef struct SSBO0 SSBO0;
+
+struct SSBO2
+{
+    float m0;
+    spvMat2 m1;
+    spvMat2x3 m2;
+};
+
+typedef struct SSBO2 SSBO2;
+
+static float3 spvMulMat2x3Vec2(spvMat2x3 m, float2 v)
+{
+    return m.columns[0] * v.x + m.columns[1] * v.y;
+}
+
+static spvMat2 spvTransposeMat2(spvMat2 m)
+{
+    spvMat2 r;
+    r.columns[0] = (float2)(m.columns[0].x, m.columns[1].x);
+    r.columns[1] = (float2)(m.columns[0].y, m.columns[1].y);
+    return r;
+}
+
+static spvMat2x3 spvTransposeMat3x2(spvMat3x2 m)
+{
+    spvMat2x3 r;
+    r.columns[0] = (float3)(m.columns[0].x, m.columns[1].x, m.columns[2].x);
+    r.columns[1] = (float3)(m.columns[0].y, m.columns[1].y, m.columns[2].y);
+    return r;
+}
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global SSBO1* ssbo_scalar, __global SSBO0* ssbo_140, __global SSBO2* ssbo_scalar2)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    ssbo_scalar->content.m0s[0].a[0] = ssbo_140->content.m0s[0].a[0];
+    ssbo_scalar->content.m0s[0].b = ssbo_140->content.m0s[0].b;
+    ssbo_scalar->content.m1s[0].a = ssbo_140->content.m1s[0].a;
+    ssbo_scalar->content.m1s[0].b = ssbo_140->content.m1s[0].b;
+    ssbo_scalar->content.m2s[0].a[0] = ssbo_140->content.m2s[0].a[0];
+    ssbo_scalar->content.m2s[0].b = ssbo_140->content.m2s[0].b;
+    ssbo_scalar->content.m0.a[0] = ssbo_140->content.m0.a[0];
+    ssbo_scalar->content.m0.b = ssbo_140->content.m0.b;
+    ssbo_scalar->content.m1.a = ssbo_140->content.m1.a;
+    ssbo_scalar->content.m1.b = ssbo_140->content.m1.b;
+    ssbo_scalar->content.m2.a[0] = ssbo_140->content.m2.a[0];
+    ssbo_scalar->content.m2.b = ssbo_140->content.m2.b;
+    ssbo_scalar->content.m3.a = ssbo_140->content.m3.a;
+    ssbo_scalar->content.m3.b = ssbo_140->content.m3.b;
+    ssbo_scalar->content.m4 = ssbo_140->content.m4;
+    ssbo_scalar->content.m1.a = spvMulMat2x3Vec2(ssbo_scalar->m2[1], ssbo_scalar->content.m0.a[0]);
+    ssbo_scalar->m0 = ssbo_scalar2->m1;
+    ssbo_scalar2->m1 = spvTransposeMat2(ssbo_scalar->m4);
+    ssbo_scalar2->m2 = spvTransposeMat3x2(ssbo_scalar->m3);
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.opencl12.emulate-subgroup.invalid.comp b/reference/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.opencl12.emulate-subgroup.invalid.comp
new file mode 100644
index 000000000..e69de29bb
diff --git a/reference/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.subgroup.fixed-subgroup.invalid.comp b/reference/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.subgroup.fixed-subgroup.invalid.comp
new file mode 100644
index 000000000..e69de29bb
diff --git a/reference/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.subgroup.invalid.comp b/reference/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.subgroup.invalid.comp
new file mode 100644
index 000000000..e69de29bb
diff --git a/reference/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.subgroup.swizzle.invalid.comp b/reference/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.subgroup.swizzle.invalid.comp
new file mode 100644
index 000000000..e69de29bb
diff --git a/reference/shaders-opencl-no-opt/comp/trancendental-float-controls-1-fp16.fp16.comp b/reference/shaders-opencl-no-opt/comp/trancendental-float-controls-1-fp16.fp16.comp
new file mode 100644
index 000000000..dcc01cef3
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/trancendental-float-controls-1-fp16.fp16.comp
@@ -0,0 +1,35 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+struct SSBO
+{
+    float v[4];
+    half f16[4];
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(4, 1, 1)))
+__kernel void comp_main(__global SSBO* _14)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    _14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] = cos(_14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += sin(_14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += tan(_14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += acos(_14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += asin(_14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += atan(_14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += exp(_14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += exp2(_14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += log(_14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += log2(_14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += sqrt(_14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += rsqrt(_14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += pow(_14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))], 4.0f);
+    _14->f16[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] = cos(_14->f16[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _14->f16[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += sin(_14->f16[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _14->f16[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += cosh(_14->f16[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _14->f16[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += sinh(_14->f16[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/trancendental-float-controls-1-fp32.fp16.comp b/reference/shaders-opencl-no-opt/comp/trancendental-float-controls-1-fp32.fp16.comp
new file mode 100644
index 000000000..dcc01cef3
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/trancendental-float-controls-1-fp32.fp16.comp
@@ -0,0 +1,35 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+struct SSBO
+{
+    float v[4];
+    half f16[4];
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(4, 1, 1)))
+__kernel void comp_main(__global SSBO* _14)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    _14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] = cos(_14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += sin(_14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += tan(_14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += acos(_14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += asin(_14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += atan(_14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += exp(_14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += exp2(_14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += log(_14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += log2(_14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += sqrt(_14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += rsqrt(_14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += pow(_14->v[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))], 4.0f);
+    _14->f16[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] = cos(_14->f16[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _14->f16[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += sin(_14->f16[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _14->f16[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += cosh(_14->f16[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+    _14->f16[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] += sinh(_14->f16[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))]);
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/transposed-temporary-expression-2.comp b/reference/shaders-opencl-no-opt/comp/transposed-temporary-expression-2.comp
new file mode 100644
index 000000000..be55a5acb
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/transposed-temporary-expression-2.comp
@@ -0,0 +1,56 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+typedef struct { float4 columns[3]; } spvMat3x4;
+typedef struct { float3 columns[4]; } spvMat4x3;
+
+struct SSBO
+{
+    spvMat3x4 A;
+    spvMat3x4 B;
+    spvMat3x4 C;
+    float4 D;
+    float w0;
+    float w1;
+};
+
+typedef struct SSBO SSBO;
+
+static float3 spvMulMat4x3Vec4(spvMat4x3 m, float4 v)
+{
+    return m.columns[0] * v.x + m.columns[1] * v.y + m.columns[2] * v.z + m.columns[3] * v.w;
+}
+
+static spvMat3x4 spvMulMat3x4Scalar(spvMat3x4 m, float s)
+{
+    spvMat3x4 r;
+    r.columns[0] = m.columns[0] * s;
+    r.columns[1] = m.columns[1] * s;
+    r.columns[2] = m.columns[2] * s;
+    return r;
+}
+
+static spvMat4x3 spvTransposeMat3x4(spvMat3x4 m)
+{
+    spvMat4x3 r;
+    r.columns[0] = (float3)(m.columns[0].x, m.columns[1].x, m.columns[2].x);
+    r.columns[1] = (float3)(m.columns[0].y, m.columns[1].y, m.columns[2].y);
+    r.columns[2] = (float3)(m.columns[0].z, m.columns[1].z, m.columns[2].z);
+    r.columns[3] = (float3)(m.columns[0].w, m.columns[1].w, m.columns[2].w);
+    return r;
+}
+
+__attribute__((reqd_work_group_size(64, 1, 1)))
+__kernel void comp_main(__global SSBO* _18)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    spvMat4x3 Anew_1;
+    spvMat4x3 Bnew_1;
+    do
+    {
+        Anew_1 = spvTransposeMat3x4(spvMulMat3x4Scalar(_18->A, _18->w0));
+        Bnew_1 = spvTransposeMat3x4(spvMulMat3x4Scalar(_18->B, _18->w1));
+    } while (false);
+    _18->D = (float4)(spvMulMat4x3Vec4((spvMat4x3){ { Anew_1.columns[0] + Bnew_1.columns[0], Anew_1.columns[1] + Bnew_1.columns[1], Anew_1.columns[2] + Bnew_1.columns[2], Anew_1.columns[3] + Bnew_1.columns[3] } }, _18->D), 1.0f);
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/transposed-temporary-expression.comp b/reference/shaders-opencl-no-opt/comp/transposed-temporary-expression.comp
new file mode 100644
index 000000000..669835c16
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/transposed-temporary-expression.comp
@@ -0,0 +1,41 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+typedef struct { float4 columns[3]; } spvMat3x4;
+typedef struct { float3 columns[4]; } spvMat4x3;
+
+struct SSBO
+{
+    spvMat3x4 A;
+    spvMat3x4 B;
+    spvMat3x4 C;
+    float4 D;
+    float w0;
+    float w1;
+};
+
+typedef struct SSBO SSBO;
+
+static float3 spvMulMat4x3Vec4(spvMat4x3 m, float4 v)
+{
+    return m.columns[0] * v.x + m.columns[1] * v.y + m.columns[2] * v.z + m.columns[3] * v.w;
+}
+
+static spvMat3x4 spvMulMat3x4Scalar(spvMat3x4 m, float s)
+{
+    spvMat3x4 r;
+    r.columns[0] = m.columns[0] * s;
+    r.columns[1] = m.columns[1] * s;
+    r.columns[2] = m.columns[2] * s;
+    return r;
+}
+
+__attribute__((reqd_work_group_size(64, 1, 1)))
+__kernel void comp_main(__global SSBO* _12)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    spvMat3x4 _23 = spvMulMat3x4Scalar(_12->A, _12->w0);
+    spvMat3x4 _30 = spvMulMat3x4Scalar(_12->B, _12->w1);
+    _12->D = (float4)(spvMulMat4x3Vec4((spvMat4x3){ { (float3)(_23.columns[0][0], _23.columns[1][0], _23.columns[2][0]) + (float3)(_30.columns[0][0], _30.columns[1][0], _30.columns[2][0]), (float3)(_23.columns[0][1], _23.columns[1][1], _23.columns[2][1]) + (float3)(_30.columns[0][1], _30.columns[1][1], _30.columns[2][1]), (float3)(_23.columns[0][2], _23.columns[1][2], _23.columns[2][2]) + (float3)(_30.columns[0][2], _30.columns[1][2], _30.columns[2][2]), (float3)(_23.columns[0][3], _23.columns[1][3], _23.columns[2][3]) + (float3)(_30.columns[0][3], _30.columns[1][3], _30.columns[2][3]) } }, _12->D), 1.0f);
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/trivial-select-cast-vector.comp b/reference/shaders-opencl-no-opt/comp/trivial-select-cast-vector.comp
new file mode 100644
index 000000000..6e105e28a
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/trivial-select-cast-vector.comp
@@ -0,0 +1,19 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct A
+{
+    float3 a;
+    float3 b;
+};
+
+typedef struct A A;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global A* _14)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    int3 c = _14->b < (float3)(1.0f);
+    _14->a = select((float3)(1.0f, 0.0f, 0.0f), (float3)(0.0f, 0.0f, 1.0f), c);
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/trivial-select-matrix.spv14.comp b/reference/shaders-opencl-no-opt/comp/trivial-select-matrix.spv14.comp
new file mode 100644
index 000000000..cf0dbbc85
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/trivial-select-matrix.spv14.comp
@@ -0,0 +1,22 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+typedef struct { float3 columns[3]; } spvMat3;
+
+struct A
+{
+    spvMat3 a;
+    float b;
+};
+
+typedef struct A A;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global A* _14)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    bool c = _14->b < 1.0f;
+    _14->a = c ? ((spvMat3){ { (float3)(1.0f), (float3)(1.0f), (float3)(1.0f) } }) : ((spvMat3){ { (float3)(0.0f), (float3)(0.0f), (float3)(0.0f) } });
+    _14->a = c ? ((spvMat3){ { (float3)(1.0f, 0.0f, 0.0f), (float3)(0.0f, 1.0f, 0.0f), (float3)(0.0f, 0.0f, 1.0f) } }) : ((spvMat3){ { (float3)(0.0f), (float3)(0.0f), (float3)(0.0f) } });
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/workgroup-size-spec-constant-array.invalid.comp b/reference/shaders-opencl-no-opt/comp/workgroup-size-spec-constant-array.invalid.comp
new file mode 100644
index 000000000..2f0fe4a37
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/workgroup-size-spec-constant-array.invalid.comp
@@ -0,0 +1,58 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+#ifndef SPIRV_CROSS_CONSTANT_ID_1
+#define SPIRV_CROSS_CONSTANT_ID_1 2
+#endif
+constant int A = SPIRV_CROSS_CONSTANT_ID_1;
+#define _20 ((as_uint(A) + 0u))
+#ifndef SPIRV_CROSS_CONSTANT_ID_0
+#define SPIRV_CROSS_CONSTANT_ID_0 1u
+#endif
+constant uint _21 = SPIRV_CROSS_CONSTANT_ID_0;
+#ifndef SPIRV_CROSS_CONSTANT_ID_2
+#define SPIRV_CROSS_CONSTANT_ID_2 1u
+#endif
+constant uint _22 = SPIRV_CROSS_CONSTANT_ID_2;
+#ifndef SPIRV_CROSS_CONSTANT_ID_3
+#define SPIRV_CROSS_CONSTANT_ID_3 1u
+#endif
+constant uint _23 = SPIRV_CROSS_CONSTANT_ID_3;
+constant uint3 spvWorkgroupSize = (uint3)(_21, _22, _23);
+#define _26 (_21)
+#define _27 ((_20 * _26))
+#define _31 ((as_uint(A) + 0u))
+#define _32 (_21)
+#define _33 ((_31 * _32))
+#define _59 ((as_uint(A) + 0u))
+#define _61 (_23)
+#define _62 ((_59 * _61))
+
+struct SSBO
+{
+    int I;
+    float V;
+};
+
+typedef struct SSBO SSBO;
+
+#define _88 ((as_uint(A) + 0u))
+#define _89 (_23)
+#define _90 ((_88 * _89))
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global SSBO* _76)
+{
+    float D[_33];
+    float E[_90];
+    for (int i = 0; as_uint(i) < _27; i++)
+    {
+        D[i] = 1.0f + convert_float(as_uint(i) + spvWorkgroupSize.y);
+    }
+    for (int i_1 = 0; as_uint(i_1) < _62; i_1++)
+    {
+        D[i_1] = 1.0f + convert_float(as_uint(i_1) + spvWorkgroupSize.y);
+    }
+    _76->V = (D[_76->I] + D[_76->I ^ 1]) + E[_76->I];
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/workgroup-size-spec-constant-array.spv16.invalid.comp b/reference/shaders-opencl-no-opt/comp/workgroup-size-spec-constant-array.spv16.invalid.comp
new file mode 100644
index 000000000..b04d8391b
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/workgroup-size-spec-constant-array.spv16.invalid.comp
@@ -0,0 +1,70 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+#ifndef SPIRV_CROSS_CONSTANT_ID_0
+#define SPIRV_CROSS_CONSTANT_ID_0 1u
+#endif
+constant uint _7 = SPIRV_CROSS_CONSTANT_ID_0;
+#ifndef SPIRV_CROSS_CONSTANT_ID_2
+#define SPIRV_CROSS_CONSTANT_ID_2 1u
+#endif
+constant uint _8 = SPIRV_CROSS_CONSTANT_ID_2;
+#ifndef SPIRV_CROSS_CONSTANT_ID_3
+#define SPIRV_CROSS_CONSTANT_ID_3 1u
+#endif
+constant uint _9 = SPIRV_CROSS_CONSTANT_ID_3;
+#ifndef SPIRV_CROSS_CONSTANT_ID_1
+#define SPIRV_CROSS_CONSTANT_ID_1 2
+#endif
+constant int A = SPIRV_CROSS_CONSTANT_ID_1;
+#define _23 ((as_uint(A) + 0u))
+#ifndef SPIRV_CROSS_CONSTANT_ID_0
+#define SPIRV_CROSS_CONSTANT_ID_0 1u
+#endif
+constant uint _24 = SPIRV_CROSS_CONSTANT_ID_0;
+#ifndef SPIRV_CROSS_CONSTANT_ID_2
+#define SPIRV_CROSS_CONSTANT_ID_2 1u
+#endif
+constant uint _25 = SPIRV_CROSS_CONSTANT_ID_2;
+#ifndef SPIRV_CROSS_CONSTANT_ID_3
+#define SPIRV_CROSS_CONSTANT_ID_3 1u
+#endif
+constant uint _26 = SPIRV_CROSS_CONSTANT_ID_3;
+constant uint3 _28 = (uint3)(_24, _25, _26);
+#define _29 (_24)
+#define _30 ((_23 * _29))
+#define _34 ((as_uint(A) + 0u))
+#define _35 (_24)
+#define _36 ((_34 * _35))
+#define _62 ((as_uint(A) + 0u))
+#define _64 (_26)
+#define _65 ((_62 * _64))
+
+struct SSBO
+{
+    int I;
+    float V;
+};
+
+typedef struct SSBO SSBO;
+
+#define _91 ((as_uint(A) + 0u))
+#define _92 (_26)
+#define _93 ((_91 * _92))
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global SSBO* _79)
+{
+    float D[_36];
+    float E[_93];
+    for (int i = 0; as_uint(i) < _30; i++)
+    {
+        D[i] = 1.0f + convert_float(as_uint(i) + _28.y);
+    }
+    for (int i_1 = 0; as_uint(i_1) < _65; i_1++)
+    {
+        D[i_1] = 1.0f + convert_float(as_uint(i_1) + _28.y);
+    }
+    _79->V = (D[_79->I] + D[_79->I ^ 1]) + E[_79->I];
+}
+
diff --git a/shaders-opencl-no-opt/asm/comp/aliased-struct-divergent-member-name.asm.comp b/shaders-opencl-no-opt/asm/comp/aliased-struct-divergent-member-name.asm.comp
new file mode 100644
index 000000000..87aee2db5
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/aliased-struct-divergent-member-name.asm.comp
@@ -0,0 +1,77 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 7
+; Bound: 37
+; Schema: 0
+               OpCapability Shader
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main"
+               OpExecutionMode %main LocalSize 1 1 1
+               OpSource GLSL 450
+               OpName %main "main"
+               OpName %T "T"
+               OpMemberName %T 0 "a"
+               OpName %v "v"
+               OpName %T_0 "T"
+               OpMemberName %T_0 0 "b"
+               OpName %SSBO1 "SSBO1"
+               OpMemberName %SSBO1 0 "foo"
+               OpName %_ ""
+               OpName %T_1 "T"
+               OpMemberName %T_1 0 "c"
+               OpName %SSBO2 "SSBO2"
+               OpMemberName %SSBO2 0 "bar"
+               OpName %__0 ""
+               OpMemberDecorate %T_0 0 Offset 0
+               OpDecorate %_runtimearr_T_0 ArrayStride 4
+               OpMemberDecorate %SSBO1 0 Offset 0
+               OpDecorate %SSBO1 BufferBlock
+               OpDecorate %_ DescriptorSet 0
+               OpDecorate %_ Binding 0
+               OpMemberDecorate %T_1 0 Offset 0
+               OpDecorate %_runtimearr_T_1 ArrayStride 16
+               OpMemberDecorate %SSBO2 0 Offset 0
+               OpDecorate %SSBO2 BufferBlock
+               OpDecorate %__0 DescriptorSet 0
+               OpDecorate %__0 Binding 1
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+      %float = OpTypeFloat 32
+          %T = OpTypeStruct %float
+%_ptr_Function_T = OpTypePointer Function %T
+   %float_40 = OpConstant %float 40
+         %11 = OpConstantComposite %T %float_40
+        %T_0 = OpTypeStruct %float
+%_runtimearr_T_0 = OpTypeRuntimeArray %T_0
+      %SSBO1 = OpTypeStruct %_runtimearr_T_0
+%_ptr_Uniform_SSBO1 = OpTypePointer Uniform %SSBO1
+          %_ = OpVariable %_ptr_Uniform_SSBO1 Uniform
+        %int = OpTypeInt 32 1
+      %int_0 = OpConstant %int 0
+     %int_10 = OpConstant %int 10
+%_ptr_Uniform_T_0 = OpTypePointer Uniform %T_0
+%_ptr_Uniform_float = OpTypePointer Uniform %float
+        %T_1 = OpTypeStruct %float
+%_runtimearr_T_1 = OpTypeRuntimeArray %T_1
+      %SSBO2 = OpTypeStruct %_runtimearr_T_1
+%_ptr_Uniform_SSBO2 = OpTypePointer Uniform %SSBO2
+        %__0 = OpVariable %_ptr_Uniform_SSBO2 Uniform
+     %int_30 = OpConstant %int 30
+%_ptr_Uniform_T_1 = OpTypePointer Uniform %T_1
+       %main = OpFunction %void None %3
+          %5 = OpLabel
+          %v = OpVariable %_ptr_Function_T Function
+               OpStore %v %11
+         %20 = OpLoad %T %v
+         %22 = OpAccessChain %_ptr_Uniform_T_0 %_ %int_0 %int_10
+         %23 = OpCompositeExtract %float %20 0
+         %25 = OpAccessChain %_ptr_Uniform_float %22 %int_0
+               OpStore %25 %23
+         %32 = OpLoad %T %v
+         %34 = OpAccessChain %_ptr_Uniform_T_1 %__0 %int_0 %int_30
+         %35 = OpCompositeExtract %float %32 0
+         %36 = OpAccessChain %_ptr_Uniform_float %34 %int_0
+               OpStore %36 %35
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/arithmetic-conversion-signs.asm.comp b/shaders-opencl-no-opt/asm/comp/arithmetic-conversion-signs.asm.comp
new file mode 100644
index 000000000..0e1ce235d
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/arithmetic-conversion-signs.asm.comp
@@ -0,0 +1,131 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 7
+; Bound: 76
+; Schema: 0
+               OpCapability Shader
+               OpCapability Int16
+               OpCapability StorageBuffer16BitAccess
+               OpExtension "SPV_KHR_16bit_storage"
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main"
+               OpExecutionMode %main LocalSize 1 1 1
+               OpSource GLSL 450
+               OpSourceExtension "GL_EXT_shader_explicit_arithmetic_types_int16"
+               OpName %main "main"
+               OpName %SSBO "SSBO"
+               OpMemberName %SSBO 0 "s32"
+               OpMemberName %SSBO 1 "u32"
+               OpMemberName %SSBO 2 "s16"
+               OpMemberName %SSBO 3 "u16"
+               OpMemberName %SSBO 4 "f32"
+               OpName %_ ""
+               OpMemberDecorate %SSBO 0 Offset 0
+               OpMemberDecorate %SSBO 1 Offset 4
+               OpMemberDecorate %SSBO 2 Offset 8
+               OpMemberDecorate %SSBO 3 Offset 10
+               OpMemberDecorate %SSBO 4 Offset 12
+               OpDecorate %SSBO BufferBlock
+               OpDecorate %_ DescriptorSet 0
+               OpDecorate %_ Binding 0
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+        %int = OpTypeInt 32 1
+       %uint = OpTypeInt 32 0
+      %short = OpTypeInt 16 1
+     %ushort = OpTypeInt 16 0
+      %float = OpTypeFloat 32
+       %SSBO = OpTypeStruct %int %uint %short %ushort %float
+%_ptr_Uniform_SSBO = OpTypePointer Uniform %SSBO
+          %_ = OpVariable %_ptr_Uniform_SSBO Uniform
+      %int_2 = OpConstant %int 2
+      %int_0 = OpConstant %int 0
+%_ptr_Uniform_int = OpTypePointer Uniform %int
+%_ptr_Uniform_short = OpTypePointer Uniform %short
+      %int_1 = OpConstant %int 1
+%_ptr_Uniform_uint = OpTypePointer Uniform %uint
+      %int_3 = OpConstant %int 3
+%_ptr_Uniform_ushort = OpTypePointer Uniform %ushort
+      %int_4 = OpConstant %int 4
+%_ptr_Uniform_float = OpTypePointer Uniform %float
+       %main = OpFunction %void None %3
+          %5 = OpLabel
+         %ptr_s32 = OpAccessChain %_ptr_Uniform_int %_ %int_0
+         %ptr_u32 = OpAccessChain %_ptr_Uniform_uint %_ %int_1
+         %ptr_s16 = OpAccessChain %_ptr_Uniform_short %_ %int_2
+         %ptr_u16 = OpAccessChain %_ptr_Uniform_ushort %_ %int_3
+         %ptr_f32 = OpAccessChain %_ptr_Uniform_float %_ %int_4
+         %s32 = OpLoad %int %ptr_s32
+         %u32 = OpLoad %uint %ptr_u32
+         %s16 = OpLoad %short %ptr_s16
+         %u16 = OpLoad %ushort %ptr_u16
+		 %f32 = OpLoad %float %ptr_f32
+
+		; Sign-extend
+		 %s16_to_s32_signed = OpSConvert %int %s16
+		 OpStore %ptr_s32 %s16_to_s32_signed
+		 %s16_to_u32_signed = OpSConvert %uint %s16
+		 OpStore %ptr_u32 %s16_to_u32_signed
+
+		 %u16_to_s32_signed = OpSConvert %int %u16
+		 OpStore %ptr_s32 %u16_to_s32_signed
+		 %u16_to_u32_signed = OpSConvert %uint %u16
+		 OpStore %ptr_u32 %u16_to_u32_signed
+
+		; Zero-extend
+		; Result must be unsigned for OpUConvert.
+		 ;%s16_to_s32_unsigned = OpUConvert %int %s16
+		 ;OpStore %ptr_s32 %s16_to_s32_unsigned
+		 %s16_to_u32_unsigned = OpUConvert %uint %s16
+		 OpStore %ptr_u32 %s16_to_u32_unsigned
+
+		 ;%u16_to_s32_unsigned = OpUConvert %int %u16
+		 ;OpStore %ptr_s32 %u16_to_s32_unsigned
+		 %u16_to_u32_unsigned = OpUConvert %uint %u16
+		 OpStore %ptr_u32 %u16_to_u32_unsigned
+
+		; Truncate (SConvert and UConvert are equivalent when narrowing)
+		 %s32_to_s16_signed = OpSConvert %short %s32
+		 OpStore %ptr_s16 %s32_to_s16_signed
+		 %s32_to_u16_signed = OpSConvert %ushort %s32
+		 OpStore %ptr_u16 %s32_to_u16_signed
+
+		 %u32_to_s16_signed = OpSConvert %short %u32
+		 OpStore %ptr_s16 %u32_to_s16_signed
+		 %u32_to_u16_signed = OpSConvert %ushort %u32
+		 OpStore %ptr_u16 %u32_to_u16_signed
+
+		 ;%s32_to_s16_unsigned = OpUConvert %short %s32
+		 ;OpStore %ptr_s16 %s32_to_s16_unsigned
+		 %s32_to_u16_unsigned = OpUConvert %ushort %s32
+		 OpStore %ptr_u16 %s32_to_u16_unsigned
+
+		 ;%u32_to_s16_unsigned = OpUConvert %short %u32
+		 ;OpStore %ptr_s16 %u32_to_s16_unsigned
+		 %u32_to_u16_unsigned = OpUConvert %ushort %u32
+		 OpStore %ptr_u16 %u32_to_u16_unsigned
+
+		; SToF / UToF
+		%s16_to_f32_signed = OpConvertSToF %float %s16
+		OpStore %ptr_f32 %s16_to_f32_signed
+		%u16_to_f32_signed = OpConvertSToF %float %u16
+		OpStore %ptr_f32 %u16_to_f32_signed
+		%s16_to_f32_unsigned = OpConvertUToF %float %s16
+		OpStore %ptr_f32 %s16_to_f32_unsigned
+		%u16_to_f32_unsigned = OpConvertUToF %float %u16
+		OpStore %ptr_f32 %u16_to_f32_unsigned
+
+		; FToS
+		%f32_to_s16_signed = OpConvertFToS %short %f32
+		OpStore %ptr_s16 %f32_to_s16_signed
+		%f32_to_u16_signed = OpConvertFToS %ushort %f32
+		OpStore %ptr_u16 %f32_to_u16_signed
+
+		; FToU
+		%f32_to_u16_unsigned = OpConvertFToU %ushort %f32
+		OpStore %ptr_u16 %f32_to_u16_unsigned
+		; Result must be unsigned for FToU, so don't bother testing that.
+
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/array-copy-physical-layout-mismatch.invalid.asm.comp b/shaders-opencl-no-opt/asm/comp/array-copy-physical-layout-mismatch.invalid.asm.comp
new file mode 100644
index 000000000..bc465285e
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/array-copy-physical-layout-mismatch.invalid.asm.comp
@@ -0,0 +1,47 @@
+               OpCapability Shader
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main"
+               OpExecutionMode %main LocalSize 1 1 1
+               OpSource GLSL 450
+               OpName %main "main"
+               OpName %a "a"
+               OpName %SSBO "SSBO"
+               OpMemberName %SSBO 0 "b"
+               OpMemberName %SSBO 1 "c"
+               OpName %_ ""
+               OpDecorate %_arr_float_uint_5 ArrayStride 16
+               OpDecorate %SSBO BufferBlock
+               OpMemberDecorate %SSBO 0 Offset 0
+               OpMemberDecorate %SSBO 1 Offset 80
+               OpDecorate %_ Binding 0
+               OpDecorate %_ DescriptorSet 0
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+      %float = OpTypeFloat 32
+       %uint = OpTypeInt 32 0
+     %uint_5 = OpConstant %uint 5
+%_arr_float_uint_5 = OpTypeArray %float %uint_5
+%_ptr_Function__arr_float_uint_5 = OpTypePointer Function %_arr_float_uint_5
+%_ptr_Uniform__arr_float_uint_5 = OpTypePointer Uniform %_arr_float_uint_5
+%_ptr_Function_float = OpTypePointer Function %float
+%_ptr_Uniform_float = OpTypePointer Uniform %float
+       %SSBO = OpTypeStruct %_arr_float_uint_5 %_arr_float_uint_5
+%_ptr_Uniform_SSBO = OpTypePointer Uniform %SSBO
+          %_ = OpVariable %_ptr_Uniform_SSBO Uniform
+        %int = OpTypeInt 32 1
+      %int_0 = OpConstant %int 0
+      %int_1 = OpConstant %int 1
+       %main = OpFunction %void None %3
+          %5 = OpLabel
+          %a = OpVariable %_ptr_Function__arr_float_uint_5 Function
+         %ptr_b = OpAccessChain %_ptr_Uniform__arr_float_uint_5 %_ %int_0
+         %ptr_c = OpAccessChain %_ptr_Uniform__arr_float_uint_5 %_ %int_1
+		 %loaded_b = OpLoad %_arr_float_uint_5 %ptr_b
+		 OpStore %a %loaded_b
+		 OpCopyMemory %a %ptr_b
+		 %loaded_a = OpLoad %_arr_float_uint_5 %a
+		 OpStore %ptr_b %loaded_a
+		 OpCopyMemory %ptr_c %a
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/atomic-load-store.asm.comp b/shaders-opencl-no-opt/asm/comp/atomic-load-store.asm.comp
new file mode 100644
index 000000000..3f2d141a1
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/atomic-load-store.asm.comp
@@ -0,0 +1,48 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 8
+; Bound: 23
+; Schema: 0
+               OpCapability Shader
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main"
+               OpExecutionMode %main LocalSize 1 1 1
+               OpSource GLSL 450
+               OpName %main "main"
+               OpName %c "c"
+               OpName %SSBO "SSBO"
+               OpMemberName %SSBO 0 "a"
+               OpMemberName %SSBO 1 "b"
+               OpName %_ ""
+               OpMemberDecorate %SSBO 0 Offset 0
+               OpMemberDecorate %SSBO 1 Offset 4
+               OpDecorate %SSBO BufferBlock
+               OpDecorate %_ DescriptorSet 0
+               OpDecorate %_ Binding 0
+               OpDecorate %gl_WorkGroupSize BuiltIn WorkgroupSize
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+       %uint = OpTypeInt 32 0
+%_ptr_Function_uint = OpTypePointer Function %uint
+       %SSBO = OpTypeStruct %uint %uint
+%_ptr_Uniform_SSBO = OpTypePointer Uniform %SSBO
+          %_ = OpVariable %_ptr_Uniform_SSBO Uniform
+        %int = OpTypeInt 32 1
+      %int_1 = OpConstant %int 1
+%_ptr_Uniform_uint = OpTypePointer Uniform %uint
+      %int_0 = OpConstant %int 0
+     %v3uint = OpTypeVector %uint 3
+     %uint_1 = OpConstant %uint 1
+%gl_WorkGroupSize = OpConstantComposite %v3uint %uint_1 %uint_1 %uint_1
+       %main = OpFunction %void None %3
+          %5 = OpLabel
+          %c = OpVariable %_ptr_Function_uint Function
+         %15 = OpAccessChain %_ptr_Uniform_uint %_ %int_1
+         %16 = OpAtomicLoad %uint %15 %int_1 %int_0
+               OpStore %c %16
+         %18 = OpLoad %uint %c
+         %19 = OpAccessChain %_ptr_Uniform_uint %_ %int_0
+               OpAtomicStore %19 %int_1 %int_0 %18 
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/atomic-min-max-sign.asm.comp b/shaders-opencl-no-opt/asm/comp/atomic-min-max-sign.asm.comp
new file mode 100644
index 000000000..832a27354
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/atomic-min-max-sign.asm.comp
@@ -0,0 +1,56 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 10
+; Bound: 30
+; Schema: 0
+               OpCapability Shader
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main"
+               OpExecutionMode %main LocalSize 1 1 1
+               OpSource GLSL 450
+               OpName %main "main"
+               OpName %SSBO "SSBO"
+               OpMemberName %SSBO 0 "a"
+               OpMemberName %SSBO 1 "b"
+               OpName %_ ""
+               OpMemberDecorate %SSBO 0 Offset 0
+               OpMemberDecorate %SSBO 1 Offset 4
+               OpDecorate %SSBO BufferBlock
+               OpDecorate %_ DescriptorSet 0
+               OpDecorate %_ Binding 0
+               OpDecorate %gl_WorkGroupSize BuiltIn WorkgroupSize
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+       %uint = OpTypeInt 32 0
+        %int = OpTypeInt 32 1
+       %SSBO = OpTypeStruct %uint %int
+%_ptr_Uniform_SSBO = OpTypePointer Uniform %SSBO
+          %_ = OpVariable %_ptr_Uniform_SSBO Uniform
+      %int_0 = OpConstant %int 0
+%_ptr_Uniform_uint = OpTypePointer Uniform %uint
+     %uint_1 = OpConstant %uint 1
+     %uint_0 = OpConstant %uint 0
+%uint_4294967295 = OpConstant %uint 4294967295
+      %int_1 = OpConstant %int 1
+%_ptr_Uniform_int = OpTypePointer Uniform %int
+     %int_n3 = OpConstant %int -3
+      %int_4 = OpConstant %int 4
+     %v3uint = OpTypeVector %uint 3
+%gl_WorkGroupSize = OpConstantComposite %v3uint %uint_1 %uint_1 %uint_1
+       %main = OpFunction %void None %3
+          %5 = OpLabel
+         %13 = OpAccessChain %_ptr_Uniform_uint %_ %int_0
+         %18 = OpAccessChain %_ptr_Uniform_uint %_ %int_0
+         %22 = OpAccessChain %_ptr_Uniform_int %_ %int_1
+         %25 = OpAccessChain %_ptr_Uniform_int %_ %int_1
+         %30 = OpAtomicUMax %uint %13 %uint_1 %uint_0 %uint_1
+         %31 = OpAtomicSMin %uint %13 %uint_1 %uint_0 %uint_1
+         %32 = OpAtomicUMin %uint %18 %uint_1 %uint_0 %uint_4294967295
+         %33 = OpAtomicSMax %uint %18 %uint_1 %uint_0 %uint_4294967295
+         %34 = OpAtomicSMax %int %22 %uint_1 %uint_0 %int_n3
+         %35 = OpAtomicUMin %int %22 %uint_1 %uint_0 %int_n3
+         %36 = OpAtomicSMin %int %25 %uint_1 %uint_0 %int_4
+         %37 = OpAtomicUMax %int %25 %uint_1 %uint_0 %int_4
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/atomic-result-temporary.asm.comp b/shaders-opencl-no-opt/asm/comp/atomic-result-temporary.asm.comp
new file mode 100644
index 000000000..a32384159
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/atomic-result-temporary.asm.comp
@@ -0,0 +1,59 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 7
+; Bound: 35
+; Schema: 0
+               OpCapability Shader
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main" %gl_GlobalInvocationID
+               OpExecutionMode %main LocalSize 1 1 1
+               OpSource GLSL 450
+               OpName %main "main"
+               OpName %SSBO "SSBO"
+               OpMemberName %SSBO 0 "count"
+               OpMemberName %SSBO 1 "data"
+               OpName %_ ""
+               OpName %gl_GlobalInvocationID "gl_GlobalInvocationID"
+               OpDecorate %_runtimearr_uint ArrayStride 4
+               OpMemberDecorate %SSBO 0 Offset 0
+               OpMemberDecorate %SSBO 1 Offset 4
+               OpDecorate %SSBO BufferBlock
+               OpDecorate %_ DescriptorSet 0
+               OpDecorate %_ Binding 0
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+       %uint = OpTypeInt 32 0
+%_runtimearr_uint = OpTypeRuntimeArray %uint
+       %SSBO = OpTypeStruct %uint %_runtimearr_uint
+%_ptr_Uniform_SSBO = OpTypePointer Uniform %SSBO
+          %_ = OpVariable %_ptr_Uniform_SSBO Uniform
+        %int = OpTypeInt 32 1
+      %int_0 = OpConstant %int 0
+%_ptr_Uniform_uint = OpTypePointer Uniform %uint
+     %uint_1 = OpConstant %uint 1
+     %uint_0 = OpConstant %uint 0
+  %uint_1024 = OpConstant %uint 1024
+       %bool = OpTypeBool
+      %int_1 = OpConstant %int 1
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3uint Input
+%_ptr_Input_uint = OpTypePointer Input %uint
+       %main = OpFunction %void None %3
+          %5 = OpLabel
+         %16 = OpAccessChain %_ptr_Uniform_uint %_ %int_0
+         %19 = OpAtomicIAdd %uint %16 %uint_1 %uint_0 %uint_1
+         %23 = OpULessThan %bool %19 %uint_1024
+               OpSelectionMerge %25 None
+               OpBranchConditional %23 %24 %25
+         %24 = OpLabel
+         %32 = OpAccessChain %_ptr_Input_uint %gl_GlobalInvocationID %uint_0
+         %33 = OpLoad %uint %32
+         %34 = OpAccessChain %_ptr_Uniform_uint %_ %int_1 %19
+               OpStore %34 %33
+               OpBranch %25
+         %25 = OpLabel
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/bda-arguments.asm.comp b/shaders-opencl-no-opt/asm/comp/bda-arguments.asm.comp
new file mode 100644
index 000000000..034e5ae0c
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/bda-arguments.asm.comp
@@ -0,0 +1,81 @@
+               OpCapability Shader
+               OpCapability PhysicalStorageBufferAddresses
+               OpExtension "SPV_KHR_physical_storage_buffer"
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel PhysicalStorageBuffer64 GLSL450
+               OpEntryPoint GLCompute %main "main"
+               OpExecutionMode %main LocalSize 1 1 1
+               OpSource GLSL 450
+               OpSourceExtension "GL_EXT_buffer_reference"
+               OpDecorate %Foo Block
+               OpMemberDecorate %Foo 0 Offset 0
+               OpMemberDecorate %Foo 1 Offset 8
+               OpDecorate %foo Aliased
+               OpDecorate %vp Aliased
+               OpDecorate %ppp Aliased
+               OpDecorate %p2 Aliased
+               OpDecorate %dummyarg1 Restrict
+               OpDecorate %dummyarg3 Restrict
+               OpDecorate %dummyarg4 Restrict
+               OpDecorate %dummyarg5 Restrict
+
+               OpDecorate %Registers Block
+               OpMemberDecorate %Registers 0 Offset 0
+               OpDecorate %gl_WorkGroupSize BuiltIn WorkgroupSize
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+               OpTypeForwardPointer %_ptr_PhysicalStorageBuffer_Foo PhysicalStorageBuffer
+        %int = OpTypeInt 32 1
+%_ptr_PhysicalStorageBuffer_int = OpTypePointer PhysicalStorageBuffer %int
+%_ptr_PhysicalStorageBuffer_int_int = OpTypePointer PhysicalStorageBuffer %_ptr_PhysicalStorageBuffer_int
+        %Foo = OpTypeStruct %int %_ptr_PhysicalStorageBuffer_int
+%_ptr_PhysicalStorageBuffer_Foo = OpTypePointer PhysicalStorageBuffer %Foo
+%_ptr_Function__ptr_PhysicalStorageBuffer_Foo = OpTypePointer Function %_ptr_PhysicalStorageBuffer_Foo
+         %11 = OpTypeFunction %void %_ptr_PhysicalStorageBuffer_Foo %int %_ptr_PhysicalStorageBuffer_int %_ptr_PhysicalStorageBuffer_int_int %_ptr_PhysicalStorageBuffer_int
+      %int_0 = OpConstant %int 0
+      %int_1 = OpConstant %int 1
+  %Registers = OpTypeStruct %_ptr_PhysicalStorageBuffer_Foo
+%_ptr_PushConstant_Registers = OpTypePointer PushConstant %Registers
+          %_ = OpVariable %_ptr_PushConstant_Registers PushConstant
+     %int_40 = OpConstant %int 40
+%_ptr_PushConstant__ptr_PhysicalStorageBuffer_Foo = OpTypePointer PushConstant %_ptr_PhysicalStorageBuffer_Foo
+       %uint = OpTypeInt 32 0
+     %v3uint = OpTypeVector %uint 3
+     %uint_1 = OpConstant %uint 1
+%gl_WorkGroupSize = OpConstantComposite %v3uint %uint_1 %uint_1 %uint_1
+       %main = OpFunction %void None %3
+          %5 = OpLabel
+         %27 = OpAccessChain %_ptr_PushConstant__ptr_PhysicalStorageBuffer_Foo %_ %int_0
+         %28 = OpLoad %_ptr_PhysicalStorageBuffer_Foo %27
+		 %29 = OpAccessChain %_ptr_PhysicalStorageBuffer_int %28 %int_0
+		 %pp = OpAccessChain %_ptr_PhysicalStorageBuffer_int_int %28 %int_1
+		 %pp_loaded = OpLoad %_ptr_PhysicalStorageBuffer_int %pp Aligned 8
+         %30 = OpFunctionCall %void %func_1_i1_ %28 %int_40 %29 %pp %pp_loaded
+               OpReturn
+               OpFunctionEnd
+ %func_1_i1_ = OpFunction %void None %11
+        %foo = OpFunctionParameter %_ptr_PhysicalStorageBuffer_Foo
+          %v = OpFunctionParameter %int
+          %vp = OpFunctionParameter %_ptr_PhysicalStorageBuffer_int
+          %ppp = OpFunctionParameter %_ptr_PhysicalStorageBuffer_int_int
+          %p2 = OpFunctionParameter %_ptr_PhysicalStorageBuffer_int
+         %15 = OpLabel
+		 %dummy_call = OpFunctionCall %void %func_dummy %foo %v %vp %ppp %p2
+         %20 = OpAccessChain %_ptr_PhysicalStorageBuffer_int %foo %int_0
+               OpStore %20 %v Aligned 16
+               OpStore %vp %v Aligned 4
+               OpStore %p2 %v Aligned 4
+		OpStore %ppp %p2 Aligned 8	   
+               OpReturn
+               OpFunctionEnd
+
+%func_dummy = OpFunction %void None %11
+	   %dummyarg1 = OpFunctionParameter %_ptr_PhysicalStorageBuffer_Foo
+	   %dummyarg2 = OpFunctionParameter %int
+	   %dummyarg3 = OpFunctionParameter %_ptr_PhysicalStorageBuffer_int
+	   %dummyarg4 = OpFunctionParameter %_ptr_PhysicalStorageBuffer_int_int
+	   %dummyarg5 = OpFunctionParameter %_ptr_PhysicalStorageBuffer_int
+	   %dummylabel = OpLabel
+		   OpReturn
+	   OpFunctionEnd
+
diff --git a/shaders-opencl-no-opt/asm/comp/bda-to-array-in-buffer.invalid.asm.spv16.comp b/shaders-opencl-no-opt/asm/comp/bda-to-array-in-buffer.invalid.asm.spv16.comp
new file mode 100644
index 000000000..0cca78f6a
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/bda-to-array-in-buffer.invalid.asm.spv16.comp
@@ -0,0 +1,71 @@
+; SPIR-V
+; Version: 1.5
+; Generator: Khronos; 35
+; Bound: 5550
+; Schema: 0
+               OpCapability Int8
+               OpCapability Int64
+               OpCapability Int64
+               OpCapability Shader
+               OpCapability PhysicalStorageBufferAddresses
+               OpExtension "SPV_KHR_physical_storage_buffer"
+               OpMemoryModel PhysicalStorageBuffer64 Simple
+               OpEntryPoint GLCompute %main "main" %globals
+               OpExecutionMode %main LocalSize 16 16 1
+
+               OpDecorate %ptr_uchar ArrayStride 8
+               OpDecorate %ptr_uint ArrayStride 8
+               OpDecorate %ptr_array_t ArrayStride 8
+               OpDecorate %array_t ArrayStride 1
+               OpDecorate %struct_t Block
+               OpMemberDecorate %struct_t 0 Offset 0
+               OpMemberDecorate %struct_t 1 Offset 8
+               OpDecorate %ptr_struct ArrayStride 32
+               OpDecorate %globals DescriptorSet 0
+               OpDecorate %globals Binding 0
+
+       %void = OpTypeVoid
+      %uchar = OpTypeInt 8 0
+       %uint = OpTypeInt 32 0
+      %ulong = OpTypeInt 64 0
+       %bool = OpTypeBool
+
+   %ulong_12 = OpConstant %ulong 12
+     %uint_0 = OpConstant %uint 0
+     %uint_1 = OpConstant %uint 1
+     %uint_2 = OpConstant %uint 2
+   %uchar_69 = OpConstant %uchar 69
+   %ulong_16 = OpConstant %ulong 16
+
+   %ptr_uint = OpTypePointer PhysicalStorageBuffer %uint
+  %ptr_uchar = OpTypePointer PhysicalStorageBuffer %uchar
+
+    %array_t = OpTypeArray %uchar %ulong_12
+%ptr_array_t = OpTypePointer PhysicalStorageBuffer %array_t
+
+   %struct_t = OpTypeStruct %ptr_uchar %ptr_array_t
+ %ptr_struct = OpTypePointer StorageBuffer %struct_t
+
+    %void_fn = OpTypeFunction %void
+      %foo_t = OpTypeFunction %ptr_uint
+
+%ptr_uchararr_sb = OpTypePointer StorageBuffer %ptr_array_t
+
+    %globals = OpVariable %ptr_struct StorageBuffer
+
+        %foo = OpFunction %ptr_uint None %foo_t
+  %foo_entry = OpLabel
+       %lea2 = OpAccessChain %ptr_uchararr_sb %globals %uint_1
+    %loaded2 = OpLoad %ptr_array_t %lea2
+       %cast = OpConvertPtrToU %ulong %loaded2
+   %adjusted = OpIAdd %ulong %cast %ulong_16
+      %cast2 = OpConvertUToPtr %ptr_uint %adjusted
+               OpStore %cast2 %uint_1 Aligned 4 ; eliminating this store generates different code and the problem disappears
+               OpReturnValue %cast2
+               OpFunctionEnd
+
+       %main = OpFunction %void None %void_fn
+ %main_entry = OpLabel
+    %nothing = OpFunctionCall %ptr_uint %foo
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/bitcast-fp16-fp32.fp16.invalid.asm.comp b/shaders-opencl-no-opt/asm/comp/bitcast-fp16-fp32.fp16.invalid.asm.comp
new file mode 100644
index 000000000..3651a4de5
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/bitcast-fp16-fp32.fp16.invalid.asm.comp
@@ -0,0 +1,63 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 8
+; Bound: 33
+; Schema: 0
+               OpCapability Shader
+               OpCapability Float16
+               OpCapability StorageBuffer16BitAccess
+               OpExtension "SPV_KHR_16bit_storage"
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main"
+               OpExecutionMode %main LocalSize 1 1 1
+               OpSource GLSL 450
+               OpSourceExtension "GL_EXT_shader_explicit_arithmetic_types"
+               OpName %main "main"
+               OpName %SSBO "SSBO"
+               OpMemberName %SSBO 0 "a"
+               OpMemberName %SSBO 1 "b"
+               OpMemberName %SSBO 2 "c"
+               OpMemberName %SSBO 3 "d"
+               OpName %_ ""
+               OpMemberDecorate %SSBO 0 Offset 0
+               OpMemberDecorate %SSBO 1 Offset 4
+               OpMemberDecorate %SSBO 2 Offset 8
+               OpMemberDecorate %SSBO 3 Offset 12
+               OpDecorate %SSBO BufferBlock
+               OpDecorate %_ DescriptorSet 0
+               OpDecorate %_ Binding 0
+               OpDecorate %gl_WorkGroupSize BuiltIn WorkgroupSize
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+       %half = OpTypeFloat 16
+     %v2half = OpTypeVector %half 2
+      %float = OpTypeFloat 32
+       %SSBO = OpTypeStruct %v2half %float %float %v2half
+%_ptr_Uniform_SSBO = OpTypePointer Uniform %SSBO
+          %_ = OpVariable %_ptr_Uniform_SSBO Uniform
+        %int = OpTypeInt 32 1
+      %int_1 = OpConstant %int 1
+      %int_0 = OpConstant %int 0
+%_ptr_Uniform_v2half = OpTypePointer Uniform %v2half
+       %uint = OpTypeInt 32 0
+%_ptr_Uniform_float = OpTypePointer Uniform %float
+      %int_3 = OpConstant %int 3
+      %int_2 = OpConstant %int 2
+     %v3uint = OpTypeVector %uint 3
+     %uint_1 = OpConstant %uint 1
+%gl_WorkGroupSize = OpConstantComposite %v3uint %uint_1 %uint_1 %uint_1
+       %main = OpFunction %void None %3
+          %5 = OpLabel
+         %16 = OpAccessChain %_ptr_Uniform_v2half %_ %int_0
+         %17 = OpLoad %v2half %16
+         %20 = OpBitcast %float %17
+         %22 = OpAccessChain %_ptr_Uniform_float %_ %int_1
+               OpStore %22 %20
+         %25 = OpAccessChain %_ptr_Uniform_float %_ %int_2
+         %26 = OpLoad %float %25
+         %28 = OpBitcast %v2half %26
+         %29 = OpAccessChain %_ptr_Uniform_v2half %_ %int_3
+               OpStore %29 %28
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/bitfield-signed-operations.asm.comp b/shaders-opencl-no-opt/asm/comp/bitfield-signed-operations.asm.comp
new file mode 100644
index 000000000..435fa3222
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/bitfield-signed-operations.asm.comp
@@ -0,0 +1,97 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 7
+; Bound: 26
+; Schema: 0
+               OpCapability Shader
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main"
+               OpExecutionMode %main LocalSize 1 1 1
+               OpSource GLSL 450
+               OpName %main "main"
+               OpName %SSBO "SSBO"
+               OpMemberName %SSBO 0 "ints"
+               OpMemberName %SSBO 1 "uints"
+               OpName %_ ""
+               OpMemberDecorate %SSBO 0 Offset 0
+               OpMemberDecorate %SSBO 1 Offset 16
+               OpDecorate %SSBO BufferBlock
+               OpDecorate %_ DescriptorSet 0
+               OpDecorate %_ Binding 0
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+        %int = OpTypeInt 32 1
+      %v4int = OpTypeVector %int 4
+       %uint = OpTypeInt 32 0
+     %v4uint = OpTypeVector %uint 4
+
+	 %int_1 = OpConstant %int 1
+	 %uint_11 = OpConstant %uint 11
+
+       %SSBO = OpTypeStruct %v4int %v4uint
+%_ptr_Uniform_SSBO = OpTypePointer Uniform %SSBO
+          %_ = OpVariable %_ptr_Uniform_SSBO Uniform
+      %int_0 = OpConstant %int 0
+%_ptr_Uniform_v4int = OpTypePointer Uniform %v4int
+%_ptr_Uniform_v4uint = OpTypePointer Uniform %v4uint
+       %main = OpFunction %void None %3
+          %5 = OpLabel
+         %ints_ptr = OpAccessChain %_ptr_Uniform_v4int %_ %int_0
+         %uints_ptr = OpAccessChain %_ptr_Uniform_v4uint %_ %int_1
+         %ints = OpLoad %v4int %ints_ptr
+         %uints = OpLoad %v4uint %uints_ptr
+
+		 %ints_alt = OpVectorShuffle %v4int %ints %ints 3 2 1 0
+		 %uints_alt = OpVectorShuffle %v4uint %uints %uints 3 2 1 0
+
+         %int_to_int_popcount = OpBitCount %v4int %ints
+         %int_to_uint_popcount = OpBitCount %v4uint %ints
+         %uint_to_int_popcount = OpBitCount %v4int %uints
+         %uint_to_uint_popcount = OpBitCount %v4uint %uints
+
+		; BitReverse requires the Base type to match the Result type, including signedness.
+         %int_to_int_reverse = OpBitReverse %v4int %ints
+         ;%int_to_uint_reverse = OpBitReverse %v4uint %ints
+         ;%uint_to_int_reverse = OpBitReverse %v4int %uints
+         %uint_to_uint_reverse = OpBitReverse %v4uint %uints
+
+		; Base and Result must match.
+         %int_to_int_sbit = OpBitFieldSExtract %v4int %ints %int_1 %uint_11
+         ;%int_to_uint_sbit = OpBitFieldSExtract %v4uint %ints %offset %count
+         ;%uint_to_int_sbit = OpBitFieldSExtract %v4int %uints %offset %count
+         %uint_to_uint_sbit = OpBitFieldSExtract %v4uint %uints %uint_11 %int_1
+
+		; Base and Result must match.
+         %int_to_int_ubit = OpBitFieldUExtract %v4int %ints %int_1 %uint_11
+         ;%int_to_uint_ubit = OpBitFieldUExtract %v4uint %ints %offset %count
+         ;%uint_to_int_ubit = OpBitFieldUExtract %v4int %uints %offset %count
+         %uint_to_uint_ubit = OpBitFieldUExtract %v4uint %uints %uint_11 %int_1
+
+		 %int_to_int_insert = OpBitFieldInsert %v4int %ints %ints_alt %int_1 %uint_11
+		 %uint_to_uint_insert = OpBitFieldInsert %v4uint %uints %uints_alt %uint_11 %int_1
+
+               OpStore %ints_ptr %int_to_int_popcount
+               OpStore %uints_ptr %int_to_uint_popcount
+               OpStore %ints_ptr %uint_to_int_popcount
+               OpStore %uints_ptr %uint_to_uint_popcount
+
+               OpStore %ints_ptr %int_to_int_reverse
+               ;OpStore %uints_ptr %int_to_uint_reverse
+               ;OpStore %ints_ptr %uint_to_int_reverse
+               OpStore %uints_ptr %uint_to_uint_reverse
+
+               OpStore %ints_ptr %int_to_int_sbit
+               ;OpStore %uints_ptr %int_to_uint_sbit
+               ;OpStore %ints_ptr %uint_to_int_sbit
+               OpStore %uints_ptr %uint_to_uint_sbit
+
+               OpStore %ints_ptr %int_to_int_ubit
+               ;OpStore %uints_ptr %int_to_uint_ubit
+               ;OpStore %ints_ptr %uint_to_int_ubit
+               OpStore %uints_ptr %uint_to_uint_ubit
+
+			   OpStore %ints_ptr %int_to_int_insert
+			   OpStore %uints_ptr %uint_to_uint_insert
+
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/bitscan.asm.comp b/shaders-opencl-no-opt/asm/comp/bitscan.asm.comp
new file mode 100644
index 000000000..e3b785cd5
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/bitscan.asm.comp
@@ -0,0 +1,72 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 7
+; Bound: 35
+; Schema: 0
+               OpCapability Shader
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main"
+               OpExecutionMode %main LocalSize 1 1 1
+               OpSource GLSL 450
+               OpName %main "main"
+               OpName %SSBO "SSBO"
+               OpMemberName %SSBO 0 "u"
+               OpMemberName %SSBO 1 "i"
+               OpMemberDecorate %SSBO 0 Offset 0
+               OpMemberDecorate %SSBO 1 Offset 16
+               OpDecorate %SSBO BufferBlock
+               OpDecorate %_ DescriptorSet 0
+               OpDecorate %_ Binding 0
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+        %int = OpTypeInt 32 1
+      %ivec4 = OpTypeVector %int 4
+       %uint = OpTypeInt 32 0
+      %uvec4 = OpTypeVector %uint 4
+       %SSBO = OpTypeStruct %uvec4 %ivec4
+%_ptr_Uniform_SSBO = OpTypePointer Uniform %SSBO
+          %_ = OpVariable %_ptr_Uniform_SSBO Uniform
+      %int_0 = OpConstant %int 0
+%_ptr_Uniform_uvec4 = OpTypePointer Uniform %uvec4
+      %int_1 = OpConstant %int 1
+%_ptr_Uniform_ivec4 = OpTypePointer Uniform %ivec4
+       %main = OpFunction %void None %3 ; Applies FindILsb, FindUMsb and FindSMsb to SSBO.u and SSBO.i with both result signednesses.
+          %5 = OpLabel
+         %uptr = OpAccessChain %_ptr_Uniform_uvec4 %_ %int_0 ; SSBO member 0 ("u", uvec4)
+         %iptr = OpAccessChain %_ptr_Uniform_ivec4 %_ %int_1 ; SSBO member 1 ("i", ivec4)
+         %uvalue = OpLoad %uvec4 %uptr
+         %ivalue = OpLoad %ivec4 %iptr
+         ; FindILsb: unsigned and signed operands, each with an unsigned and a signed result type.
+         %lsb_uint_to_uint = OpExtInst %uvec4 %1 FindILsb %uvalue
+         %lsb_uint_to_int = OpExtInst %ivec4 %1 FindILsb %uvalue
+         %lsb_int_to_uint = OpExtInst %uvec4 %1 FindILsb %ivalue
+         %lsb_int_to_int = OpExtInst %ivec4 %1 FindILsb %ivalue
+         ; FindUMsb: same four operand/result signedness combinations.
+         %umsb_uint_to_uint = OpExtInst %uvec4 %1 FindUMsb %uvalue
+         %umsb_uint_to_int = OpExtInst %ivec4 %1 FindUMsb %uvalue
+         %umsb_int_to_uint = OpExtInst %uvec4 %1 FindUMsb %ivalue
+         %umsb_int_to_int = OpExtInst %ivec4 %1 FindUMsb %ivalue
+         ; FindSMsb: same four operand/result signedness combinations.
+         %smsb_uint_to_uint = OpExtInst %uvec4 %1 FindSMsb %uvalue
+         %smsb_uint_to_int = OpExtInst %ivec4 %1 FindSMsb %uvalue
+         %smsb_int_to_uint = OpExtInst %uvec4 %1 FindSMsb %ivalue
+         %smsb_int_to_int = OpExtInst %ivec4 %1 FindSMsb %ivalue
+	; Store every result: uvec4 results through %uptr, ivec4 results through %iptr.
+	OpStore %uptr %lsb_uint_to_uint
+	OpStore %iptr %lsb_uint_to_int
+	OpStore %uptr %lsb_int_to_uint
+	OpStore %iptr %lsb_int_to_int
+
+	OpStore %uptr %umsb_uint_to_uint
+	OpStore %iptr %umsb_uint_to_int
+	OpStore %uptr %umsb_int_to_uint
+	OpStore %iptr %umsb_int_to_int
+
+	OpStore %uptr %smsb_uint_to_uint
+	OpStore %iptr %smsb_uint_to_int
+	OpStore %uptr %smsb_int_to_uint
+	OpStore %iptr %smsb_int_to_int
+
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/block-like-array-type-construct-2.asm.comp b/shaders-opencl-no-opt/asm/comp/block-like-array-type-construct-2.asm.comp
new file mode 100644
index 000000000..37ff035fa
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/block-like-array-type-construct-2.asm.comp
@@ -0,0 +1,85 @@
+; SPIR-V
+; Version: 1.3
+; Generator: Google spiregg; 0
+; Bound: 40
+; Schema: 0
+               OpCapability Shader
+               OpExtension "SPV_GOOGLE_hlsl_functionality1"
+               OpExtension "SPV_GOOGLE_user_type"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %csMainClear "main" %gl_GlobalInvocationID
+               OpExecutionMode %csMainClear LocalSize 64 1 1
+               OpSource HLSL 600
+               OpName %type_CommonConstants "type.CommonConstants"
+               OpMemberName %type_CommonConstants 0 "g_count"
+               OpMemberName %type_CommonConstants 1 "g_padding4"
+               OpName %CommonConstants "CommonConstants"
+               OpName %type_RWStructuredBuffer_MyStruct "type.RWStructuredBuffer.MyStruct"
+               OpName %MyStruct "MyStruct"
+               OpMemberName %MyStruct 0 "m_coefficients"
+               OpName %g_data "g_data"
+               OpName %csMainClear "csMainClear"
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorateString %gl_GlobalInvocationID UserSemantic "SV_DispatchThreadID"
+               OpDecorate %CommonConstants DescriptorSet 0
+               OpDecorate %CommonConstants Binding 0
+               OpDecorate %g_data DescriptorSet 0
+               OpDecorate %g_data Binding 1
+               OpMemberDecorate %type_CommonConstants 0 Offset 0
+               OpMemberDecorate %type_CommonConstants 1 Offset 4
+               OpDecorate %type_CommonConstants Block
+               OpDecorateString %CommonConstants UserTypeGOOGLE "cbuffer"
+               OpDecorate %_arr_v4float_uint_4 ArrayStride 16
+               OpMemberDecorate %MyStruct 0 Offset 0
+               OpDecorate %_runtimearr_MyStruct ArrayStride 64
+               OpMemberDecorate %type_RWStructuredBuffer_MyStruct 0 Offset 0
+               OpDecorate %type_RWStructuredBuffer_MyStruct BufferBlock
+               OpDecorateString %g_data UserTypeGOOGLE "rwstructuredbuffer"
+        %int = OpTypeInt 32 1
+      %int_0 = OpConstant %int 0
+       %uint = OpTypeInt 32 0
+     %uint_4 = OpConstant %uint 4
+      %float = OpTypeFloat 32
+    %v4float = OpTypeVector %float 4
+     %v3uint = OpTypeVector %uint 3
+%type_CommonConstants = OpTypeStruct %uint %v3uint
+%_ptr_Uniform_type_CommonConstants = OpTypePointer Uniform %type_CommonConstants
+%_arr_v4float_uint_4 = OpTypeArray %v4float %uint_4
+   %MyStruct = OpTypeStruct %_arr_v4float_uint_4
+%_runtimearr_MyStruct = OpTypeRuntimeArray %MyStruct
+%type_RWStructuredBuffer_MyStruct = OpTypeStruct %_runtimearr_MyStruct
+%_ptr_Uniform_type_RWStructuredBuffer_MyStruct = OpTypePointer Uniform %type_RWStructuredBuffer_MyStruct
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+       %void = OpTypeVoid
+         %21 = OpTypeFunction %void
+%_ptr_Uniform_uint = OpTypePointer Uniform %uint
+       %bool = OpTypeBool
+%_ptr_Uniform_MyStruct = OpTypePointer Uniform %MyStruct
+%CommonConstants = OpVariable %_ptr_Uniform_type_CommonConstants Uniform
+     %g_data = OpVariable %_ptr_Uniform_type_RWStructuredBuffer_MyStruct Uniform
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3uint Input
+     %uint_0 = OpConstant %uint 0
+         %26 = OpConstantNull %v4float
+         %27 = OpConstantComposite %_arr_v4float_uint_4 %26 %26 %26 %26
+         %28 = OpConstantComposite %MyStruct %27
+%csMainClear = OpFunction %void None %21 ; Stores a zero-filled MyStruct into g_data[id.x] when id.x < g_count.
+         %29 = OpLabel
+         %30 = OpLoad %v3uint %gl_GlobalInvocationID
+               OpSelectionMerge %31 None
+               OpSwitch %uint_0 %32 ; constant selector with only a default target (%32); merge block is %31
+         %32 = OpLabel
+         %33 = OpCompositeExtract %uint %30 0 ; x component of the global invocation ID
+         %34 = OpAccessChain %_ptr_Uniform_uint %CommonConstants %int_0 ; member 0 ("g_count")
+         %35 = OpLoad %uint %34
+         %36 = OpUGreaterThanEqual %bool %33 %35
+               OpSelectionMerge %37 DontFlatten
+               OpBranchConditional %36 %38 %37 ; id.x >= g_count takes %38, otherwise falls to %37
+         %38 = OpLabel
+               OpBranch %31 ; out of range: leave the switch without storing
+         %37 = OpLabel
+         %39 = OpAccessChain %_ptr_Uniform_MyStruct %g_data %int_0 %33 ; element id.x of the runtime array in g_data
+               OpStore %39 %28 ; %28 is a MyStruct constant built from four OpConstantNull v4float values
+               OpBranch %31
+         %31 = OpLabel
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/block-like-array-type-construct.invalid.asm.comp b/shaders-opencl-no-opt/asm/comp/block-like-array-type-construct.invalid.asm.comp
new file mode 100644
index 000000000..8aaa9500a
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/block-like-array-type-construct.invalid.asm.comp
@@ -0,0 +1,80 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 10
+; Bound: 32
+; Schema: 0
+               OpCapability Shader
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main"
+               OpExecutionMode %main LocalSize 1 1 1
+               OpSource GLSL 450
+               OpName %main "main"
+               OpName %foo "foo"
+               OpName %foo2 "foo2"
+               OpName %SSBO "SSBO"
+               OpMemberName %SSBO 0 "a"
+               OpMemberName %SSBO 1 "b"
+               OpName %_ ""
+               OpDecorate %gl_WorkGroupSize BuiltIn WorkgroupSize
+               OpMemberDecorate %SSBO 0 Offset 0
+               OpMemberDecorate %SSBO 1 Offset 4
+               OpDecorate %SSBO BufferBlock
+               OpDecorate %_ DescriptorSet 0
+               OpDecorate %_ Binding 0
+			   OpDecorate %_arr_float_uint_4 ArrayStride 4
+			   OpDecorate %struct_arr ArrayStride 32
+			   OpMemberDecorate %struct 0 Offset 0
+			   OpMemberDecorate %struct 1 Offset 16
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+      %float = OpTypeFloat 32
+       %uint = OpTypeInt 32 0
+     %uint_2 = OpConstant %uint 2
+     %uint_4 = OpConstant %uint 4
+%_arr_float_uint_4 = OpTypeArray %float %uint_4
+%_ptr_Private__arr_float_uint_4 = OpTypePointer Private %_arr_float_uint_4
+        %foo = OpVariable %_ptr_Private__arr_float_uint_4 Private
+        %foo2 = OpVariable %_ptr_Private__arr_float_uint_4 Private
+        %int = OpTypeInt 32 1
+      %int_0 = OpConstant %int 0
+    %float_1 = OpConstant %float 1
+	%struct = OpTypeStruct %_arr_float_uint_4 %_arr_float_uint_4
+	%struct_arr = OpTypeArray %struct %uint_2
+	%ptr_struct = OpTypePointer Function %struct
+%_ptr_Private_float = OpTypePointer Private %float
+      %int_1 = OpConstant %int 1
+    %float_2 = OpConstant %float 2
+      %int_2 = OpConstant %int 2
+    %float_3 = OpConstant %float 3
+      %int_3 = OpConstant %int 3
+    %float_4 = OpConstant %float 4
+     %v3uint = OpTypeVector %uint 3
+     %uint_1 = OpConstant %uint 1
+%gl_WorkGroupSize = OpConstantComposite %v3uint %uint_1 %uint_1 %uint_1
+		%carr = OpConstantComposite %_arr_float_uint_4 %float_1 %float_2 %float_3 %float_4
+		%struct_constant_0 = OpConstantComposite %struct %carr %carr
+		%struct_constant_1 = OpConstantComposite %struct %carr %carr
+		%struct_arr_constant = OpConstantComposite %struct_arr %struct_constant_0 %struct_constant_1
+       %SSBO = OpTypeStruct %uint %int
+%_ptr_Uniform_SSBO = OpTypePointer Uniform %SSBO
+          %_ = OpVariable %_ptr_Uniform_SSBO Uniform
+       %main = OpFunction %void None %3 ; Fills Private array %foo, copies it to %foo2, then builds a %struct from both loaded arrays.
+          %5 = OpLabel
+	   %struct_var = OpVariable %ptr_struct Function
+         %16 = OpAccessChain %_ptr_Private_float %foo %int_0
+               OpStore %16 %float_1
+			   OpStore %foo %carr ; whole-array store of the constant {1, 2, 3, 4}
+         %19 = OpAccessChain %_ptr_Private_float %foo %int_1
+               OpStore %19 %float_2
+         %22 = OpAccessChain %_ptr_Private_float %foo %int_2
+               OpStore %22 %float_3
+         %25 = OpAccessChain %_ptr_Private_float %foo %int_3
+               OpStore %25 %float_4
+			   OpCopyMemory %foo2 %foo ; array-to-array copy between the two Private variables
+			%l0 = OpLoad %_arr_float_uint_4 %foo
+			%l1 = OpLoad %_arr_float_uint_4 %foo2
+			%struct0 = OpCompositeConstruct %struct %l0 %l1 ; struct whose two members are both float[4] arrays
+			OpStore %struct_var %struct0
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/buffer-device-address-ptr-casting.asm.comp b/shaders-opencl-no-opt/asm/comp/buffer-device-address-ptr-casting.asm.comp
new file mode 100644
index 000000000..ed8d0ba6f
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/buffer-device-address-ptr-casting.asm.comp
@@ -0,0 +1,106 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 10
+; Bound: 62
+; Schema: 0
+               OpCapability Shader
+               OpCapability Int64
+               OpCapability PhysicalStorageBufferAddresses
+               OpExtension "SPV_KHR_physical_storage_buffer"
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel PhysicalStorageBuffer64 GLSL450
+               OpEntryPoint GLCompute %main "main"
+               OpExecutionMode %main LocalSize 1 1 1
+               OpSource GLSL 450
+               OpSourceExtension "GL_ARB_gpu_shader_int64"
+               OpSourceExtension "GL_EXT_buffer_reference"
+               OpSourceExtension "GL_EXT_buffer_reference_uvec2"
+               OpName %main "main"
+               OpName %SomeBuffer "SomeBuffer"
+               OpMemberName %SomeBuffer 0 "v"
+               OpMemberName %SomeBuffer 1 "a"
+               OpMemberName %SomeBuffer 2 "b"
+               OpName %Registers "Registers"
+               OpMemberName %Registers 0 "address"
+               OpMemberName %Registers 1 "address2"
+               OpName %registers "registers"
+               OpName %a "a"
+               OpName %b "b"
+               OpMemberDecorate %SomeBuffer 0 Offset 0
+               OpMemberDecorate %SomeBuffer 1 Offset 16
+               OpMemberDecorate %SomeBuffer 2 Offset 24
+               OpDecorate %SomeBuffer Block
+               OpMemberDecorate %Registers 0 Offset 0
+               OpMemberDecorate %Registers 1 Offset 8
+               OpDecorate %Registers Block
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+               OpTypeForwardPointer %_ptr_PhysicalStorageBuffer_SomeBuffer PhysicalStorageBuffer
+      %float = OpTypeFloat 32
+    %v4float = OpTypeVector %float 4
+      %ulong = OpTypeInt 64 0
+       %uint = OpTypeInt 32 0
+     %v2uint = OpTypeVector %uint 2
+ %SomeBuffer = OpTypeStruct %v4float %ulong %v2uint
+%_ptr_PhysicalStorageBuffer_SomeBuffer = OpTypePointer PhysicalStorageBuffer %SomeBuffer
+%_ptr_Function__ptr_PhysicalStorageBuffer_SomeBuffer = OpTypePointer Function %_ptr_PhysicalStorageBuffer_SomeBuffer
+  %Registers = OpTypeStruct %ulong %v2uint
+%_ptr_PushConstant_Registers = OpTypePointer PushConstant %Registers
+  %registers = OpVariable %_ptr_PushConstant_Registers PushConstant
+        %int = OpTypeInt 32 1
+      %int_0 = OpConstant %int 0
+%_ptr_PushConstant_ulong = OpTypePointer PushConstant %ulong
+      %int_1 = OpConstant %int 1
+%_ptr_PushConstant_v2uint = OpTypePointer PushConstant %v2uint
+    %float_1 = OpConstant %float 1
+    %float_2 = OpConstant %float 2
+    %float_3 = OpConstant %float 3
+    %float_4 = OpConstant %float 4
+         %35 = OpConstantComposite %v4float %float_1 %float_2 %float_3 %float_4
+%_ptr_PhysicalStorageBuffer_v4float = OpTypePointer PhysicalStorageBuffer %v4float
+    %float_5 = OpConstant %float 5
+    %float_6 = OpConstant %float 6
+    %float_7 = OpConstant %float 7
+    %float_8 = OpConstant %float 8
+         %43 = OpConstantComposite %v4float %float_5 %float_6 %float_7 %float_8
+%_ptr_Function_ulong = OpTypePointer Function %ulong
+%_ptr_Function_v2uint = OpTypePointer Function %v2uint
+%_ptr_PhysicalStorageBuffer_ulong = OpTypePointer PhysicalStorageBuffer %ulong
+      %int_2 = OpConstant %int 2
+%_ptr_PhysicalStorageBuffer_v2uint = OpTypePointer PhysicalStorageBuffer %v2uint
+       %main = OpFunction %void None %3 ; Converts push-constant addresses to SomeBuffer pointers and back, writing through each pointer.
+          %5 = OpLabel
+          %a = OpVariable %_ptr_Function_ulong Function ; not referenced again in this function
+          %b = OpVariable %_ptr_Function_v2uint Function ; not referenced again in this function
+         %21 = OpAccessChain %_ptr_PushConstant_ulong %registers %int_0 ; member 0 ("address", ulong)
+         %27 = OpAccessChain %_ptr_PushConstant_v2uint %registers %int_1 ; member 1 ("address2", v2uint)
+         %uint_ptr0 = OpLoad %ulong %21
+         %uint_ptr1 = OpLoad %v2uint %27
+
+		 ; OpConvertUToPtr/OpConvertPtrToU do not accept vectors, so the v2uint address goes through OpBitcast.
+         %ulong_ptr0 = OpConvertUToPtr %_ptr_PhysicalStorageBuffer_SomeBuffer %uint_ptr0
+         %ulong_ptr1 = OpBitcast %_ptr_PhysicalStorageBuffer_SomeBuffer %uint_ptr0
+         %uvec2_ptr0 = OpBitcast %_ptr_PhysicalStorageBuffer_SomeBuffer %uint_ptr1
+         ; Pointers to member 0 ("v", v4float) through each of the three converted pointers.
+         %vec4_write0 = OpAccessChain %_ptr_PhysicalStorageBuffer_v4float %ulong_ptr0 %int_0
+         %vec4_write1 = OpAccessChain %_ptr_PhysicalStorageBuffer_v4float %ulong_ptr1 %int_0
+         %vec4_write2 = OpAccessChain %_ptr_PhysicalStorageBuffer_v4float %uvec2_ptr0 %int_0
+
+		   OpStore %vec4_write0 %35 Aligned 16 ; %35 is the constant (1, 2, 3, 4)
+		   OpStore %vec4_write1 %35 Aligned 16
+		   OpStore %vec4_write2 %35 Aligned 16
+         ; Convert each pointer back to an integer: scalar via OpConvertPtrToU/OpBitcast, v2uint via OpBitcast.
+         %ulong_from_ptr0 = OpConvertPtrToU %ulong %ulong_ptr0
+         %ulong_from_ptr1 = OpBitcast %ulong %ulong_ptr1
+         %uvec2_from_ptr0 = OpBitcast %v2uint %uvec2_ptr0
+         ; Destinations: member 1 ("a", ulong) for the scalar values, member 2 ("b", v2uint) for the vector value.
+         %ptr0 = OpAccessChain %_ptr_PhysicalStorageBuffer_ulong %ulong_ptr0 %int_1
+         %ptr1 = OpAccessChain %_ptr_PhysicalStorageBuffer_ulong %ulong_ptr1 %int_1
+         %ptr2 = OpAccessChain %_ptr_PhysicalStorageBuffer_v2uint %uvec2_ptr0 %int_2
+
+		   OpStore %ptr0 %ulong_from_ptr0 Aligned 8
+		   OpStore %ptr1 %ulong_from_ptr1 Aligned 8
+		   OpStore %ptr2 %uvec2_from_ptr0 Aligned 8
+
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/composite-construct-buffer-struct.asm.invalid.comp b/shaders-opencl-no-opt/asm/comp/composite-construct-buffer-struct.asm.invalid.comp
new file mode 100644
index 000000000..c7b76a8c0
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/composite-construct-buffer-struct.asm.invalid.comp
@@ -0,0 +1,54 @@
+               OpCapability Shader
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main"
+               OpExecutionMode %main LocalSize 1 1 1
+			   OpName %Block "Block"
+			   OpName %SSBO "SSBO"
+			   OpName %SSBO_Var "ssbo"
+			   OpName %UBO_Var "ubo"
+			   OpDecorate %SSBO_Var Binding 0
+			   OpDecorate %SSBO_Var DescriptorSet 0
+			   OpDecorate %UBO_Var Binding 1
+			   OpDecorate %UBO_Var DescriptorSet 0
+			   OpMemberDecorate %SSBO 0 Offset 0
+			   OpMemberDecorate %Block 0 Offset 0
+			   OpMemberDecorate %Block 1 Offset 16
+			   OpDecorate %BlockArray ArrayStride 32
+			   OpDecorate %arr_uvec2_2 ArrayStride 8
+			   OpDecorate %SSBO Block
+       %uint = OpTypeInt 32 0
+     %uint_0 = OpConstant %uint 0
+     %uint_1 = OpConstant %uint 1
+     %uint_2 = OpConstant %uint 2
+     %uint_3 = OpConstant %uint 3
+     %uvec2 = OpTypeVector %uint 2
+     %arr_uvec2_2 = OpTypeArray %uvec2 %uint_2
+	 %arr_uvec2_2_ptr = OpTypePointer StorageBuffer %arr_uvec2_2
+	 %arr_uvec2_2_ptr_const = OpTypePointer Uniform %arr_uvec2_2
+	 %arr_uvec2_2_ptr_func = OpTypePointer Function %arr_uvec2_2
+	 %arr_uvec2_2_ptr_workgroup = OpTypePointer Workgroup %arr_uvec2_2
+	 %wg = OpVariable %arr_uvec2_2_ptr_workgroup Workgroup
+   %Block = OpTypeStruct %arr_uvec2_2 %arr_uvec2_2
+   %Block_ptr = OpTypePointer StorageBuffer %Block
+%BlockArray = OpTypeArray %Block %uint_3
+%SSBO = OpTypeStruct %BlockArray
+%SSBO_Ptr = OpTypePointer StorageBuffer %SSBO
+%SSBO_Var = OpVariable %SSBO_Ptr StorageBuffer
+%UBO_Ptr = OpTypePointer Uniform %SSBO
+%UBO_Var = OpVariable %UBO_Ptr Uniform
+%void = OpTypeVoid
+%func_type = OpTypeFunction %void
+
+    %main = OpFunction %void None %func_type ; Loads an array from the storage buffer and uses it to construct and store a Block.
+         %25 = OpLabel
+		 %func = OpVariable %arr_uvec2_2_ptr_func Function ; not referenced again in this function
+
+		; Load a storage-buffer array into a temporary, build a Block from it, and store the Block back.
+		 %ptr = OpAccessChain %Block_ptr %SSBO_Var %uint_0 %uint_0 ; element 0 of SSBO member 0 (a Block)
+		 %ptr_arr_1 = OpAccessChain %arr_uvec2_2_ptr %SSBO_Var %uint_0 %uint_0 %uint_1 ; member 1 of that Block
+		 %loaded_array = OpLoad %arr_uvec2_2 %ptr_arr_1
+		 %constructed = OpCompositeConstruct %Block %loaded_array %loaded_array ; same array used for both members
+		 OpStore %ptr %constructed
+
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/constant-composite-undef.asm.comp b/shaders-opencl-no-opt/asm/comp/constant-composite-undef.asm.comp
new file mode 100644
index 000000000..8997d0aaf
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/constant-composite-undef.asm.comp
@@ -0,0 +1,40 @@
+; SPIR-V
+; Version: 1.3
+; Generator: Khronos SPIR-V Tools Assembler; 0
+; Bound: 20
+; Schema: 0
+               OpCapability Shader
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main"
+               OpExecutionMode %main LocalSize 1 1 1
+               OpSource GLSL 450
+               OpName %main "main"
+               OpName %Block "Block"
+               OpMemberName %Block 0 "f"
+               OpName %block "block"
+               OpMemberDecorate %Block 0 Offset 0
+               OpDecorate %Block BufferBlock
+               OpDecorate %block DescriptorSet 0
+               OpDecorate %block Binding 0
+       %void = OpTypeVoid
+          %6 = OpTypeFunction %void
+      %float = OpTypeFloat 32
+    %v4float = OpTypeVector %float 4
+      %Block = OpTypeStruct %v4float
+%_ptr_Uniform_Block = OpTypePointer Uniform %Block
+      %block = OpVariable %_ptr_Uniform_Block Uniform
+        %int = OpTypeInt 32 1
+      %int_0 = OpConstant %int 0
+%float_0_100000001 = OpConstant %float 0.100000001
+%float_0_200000003 = OpConstant %float 0.200000003
+%float_0_300000012 = OpConstant %float 0.300000012
+         %15 = OpUndef %float
+         %16 = OpConstantComposite %v4float %float_0_100000001 %float_0_200000003 %float_0_300000012 %15
+%_ptr_Uniform_v4float = OpTypePointer Uniform %v4float
+       %main = OpFunction %void None %6 ; Stores a constant vector with an OpUndef component into Block.f.
+         %18 = OpLabel
+         %19 = OpAccessChain %_ptr_Uniform_v4float %block %int_0 ; member 0 ("f", v4float)
+               OpStore %19 %16 ; %16 is (0.1, 0.2, 0.3, undef): its last constituent is the OpUndef %15
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/constant-lut-name-aliasing.asm.comp b/shaders-opencl-no-opt/asm/comp/constant-lut-name-aliasing.asm.comp
new file mode 100644
index 000000000..e1dcb0ef8
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/constant-lut-name-aliasing.asm.comp
@@ -0,0 +1,81 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 10
+; Bound: 49
+; Schema: 0
+               OpCapability Shader
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main" %gl_GlobalInvocationID %gl_LocalInvocationID
+               OpExecutionMode %main LocalSize 4 4 1
+               OpSource GLSL 450
+               OpName %main "main"
+               OpName %SSBO "SSBO"
+               OpMemberName %SSBO 0 "values"
+               OpName %_ ""
+               OpName %gl_GlobalInvocationID "gl_GlobalInvocationID"
+               OpName %gl_LocalInvocationID "gl_LocalInvocationID"
+               OpName %indexable "indexable"
+               OpName %indexable_0 "indexable"
+			   OpName %25 "indexable"
+			   OpName %38 "indexable"
+               OpDecorate %_runtimearr_int ArrayStride 4
+               OpMemberDecorate %SSBO 0 Offset 0
+               OpDecorate %SSBO BufferBlock
+               OpDecorate %_ DescriptorSet 0
+               OpDecorate %_ Binding 0
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_LocalInvocationID BuiltIn LocalInvocationId
+               OpDecorate %gl_WorkGroupSize BuiltIn WorkgroupSize
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+        %int = OpTypeInt 32 1
+%_runtimearr_int = OpTypeRuntimeArray %int
+       %SSBO = OpTypeStruct %_runtimearr_int
+%_ptr_Uniform_SSBO = OpTypePointer Uniform %SSBO
+          %_ = OpVariable %_ptr_Uniform_SSBO Uniform
+      %int_0 = OpConstant %int 0
+       %uint = OpTypeInt 32 0
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3uint Input
+     %uint_0 = OpConstant %uint 0
+%_ptr_Input_uint = OpTypePointer Input %uint
+     %uint_4 = OpConstant %uint 4
+%_arr_int_uint_4 = OpTypeArray %int %uint_4
+      %int_1 = OpConstant %int 1
+      %int_2 = OpConstant %int 2
+      %int_3 = OpConstant %int 3
+         %25 = OpConstantComposite %_arr_int_uint_4 %int_0 %int_1 %int_2 %int_3
+%gl_LocalInvocationID = OpVariable %_ptr_Input_v3uint Input
+%_ptr_Function__arr_int_uint_4 = OpTypePointer Function %_arr_int_uint_4
+%_ptr_Function_int = OpTypePointer Function %int
+      %int_4 = OpConstant %int 4
+      %int_5 = OpConstant %int 5
+      %int_6 = OpConstant %int 6
+      %int_7 = OpConstant %int 7
+         %38 = OpConstantComposite %_arr_int_uint_4 %int_4 %int_5 %int_6 %int_7
+     %uint_1 = OpConstant %uint 1
+%_ptr_Uniform_int = OpTypePointer Uniform %int
+%gl_WorkGroupSize = OpConstantComposite %v3uint %uint_4 %uint_4 %uint_1
+       %main = OpFunction %void None %3 ; Indexes two constant arrays by local ID and writes the sum to values[global ID x].
+          %5 = OpLabel
+  %indexable = OpVariable %_ptr_Function__arr_int_uint_4 Function ; shares the debug name "indexable" with %indexable_0, %25 and %38
+%indexable_0 = OpVariable %_ptr_Function__arr_int_uint_4 Function
+         %18 = OpAccessChain %_ptr_Input_uint %gl_GlobalInvocationID %uint_0
+         %19 = OpLoad %uint %18 ; gl_GlobalInvocationID.x, used as the output index
+         %27 = OpAccessChain %_ptr_Input_uint %gl_LocalInvocationID %uint_0
+         %28 = OpLoad %uint %27 ; gl_LocalInvocationID.x
+               OpStore %indexable %25 ; constant array {0, 1, 2, 3}
+         %32 = OpAccessChain %_ptr_Function_int %indexable %28
+         %33 = OpLoad %int %32
+         %40 = OpAccessChain %_ptr_Input_uint %gl_LocalInvocationID %uint_1
+         %41 = OpLoad %uint %40 ; gl_LocalInvocationID.y
+               OpStore %indexable_0 %38 ; constant array {4, 5, 6, 7}
+         %43 = OpAccessChain %_ptr_Function_int %indexable_0 %41
+         %44 = OpLoad %int %43
+         %45 = OpIAdd %int %33 %44
+         %47 = OpAccessChain %_ptr_Uniform_int %_ %int_0 %19 ; element %19 of SSBO member 0 ("values")
+               OpStore %47 %45
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/copy-logical-2.spv14.asm.comp b/shaders-opencl-no-opt/asm/comp/copy-logical-2.spv14.asm.comp
new file mode 100644
index 000000000..6a7065a6f
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/copy-logical-2.spv14.asm.comp
@@ -0,0 +1,81 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 8
+; Bound: 48
+; Schema: 0
+               OpCapability Shader
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main" %ssbo
+               OpExecutionMode %main LocalSize 1 1 1
+               OpSource GLSL 450
+		OpName %B1 "B1"
+		OpName %A "A"
+		OpName %C "C"
+		OpName %B2 "B2"
+		OpMemberName %A 0 "a"
+		OpMemberName %A 1 "b1"
+		OpMemberName %A 2 "b1_array"
+		OpMemberName %C 0 "c"
+		OpMemberName %C 1 "b2"
+		OpMemberName %C 2 "b2_array"
+		OpMemberName %B1 0 "elem1"
+		OpMemberName %B2 0 "elem2"
+		OpMemberName %SSBO 0 "a_block"
+		OpMemberName %SSBO 1 "c_block"
+		OpDecorate %B1Array ArrayStride 16
+		OpDecorate %B2Array ArrayStride 16
+               OpMemberDecorate %B1 0 Offset 0
+               OpMemberDecorate %A 0 Offset 0
+               OpMemberDecorate %A 1 Offset 16
+               OpMemberDecorate %A 2 Offset 32
+               OpMemberDecorate %A 3 Offset 96
+               OpMemberDecorate %B2 0 Offset 0
+               OpMemberDecorate %C 0 Offset 0
+               OpMemberDecorate %C 1 Offset 16
+               OpMemberDecorate %C 2 Offset 32
+               OpMemberDecorate %C 3 Offset 96
+               OpMemberDecorate %SSBO 0 Offset 0
+               OpMemberDecorate %SSBO 1 Offset 112
+		OpMemberDecorate %A0 0 Offset 0
+		OpMemberDecorate %C0 0 Offset 0
+		OpMemberDecorate %A0 0 RowMajor
+		OpMemberDecorate %A0 0 MatrixStride 8
+		OpMemberDecorate %C0 0 ColMajor
+		OpMemberDecorate %C0 0 MatrixStride 16
+               OpDecorate %SSBO Block
+               OpDecorate %ssbo DescriptorSet 0
+               OpDecorate %ssbo Binding 0
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+      %float = OpTypeFloat 32
+	%uint = OpTypeInt 32 0
+	%uint_4 = OpConstant %uint 4
+    %v4float = OpTypeVector %float 4
+    %v2float = OpTypeVector %float 2
+    %m2float = OpTypeMatrix %v2float 2
+        %A0 = OpTypeStruct %m2float
+        %C0 = OpTypeStruct %m2float
+         %B2 = OpTypeStruct %v4float
+	%B2Array = OpTypeArray %B2 %uint_4
+          %C = OpTypeStruct %v4float %B2 %B2Array %C0
+         %B1 = OpTypeStruct %v4float
+	%B1Array = OpTypeArray %B1 %uint_4
+          %A = OpTypeStruct %v4float %B1 %B1Array %A0
+       %SSBO = OpTypeStruct %A %C
+%_ptr_Uniform_SSBO = OpTypePointer StorageBuffer %SSBO
+       %ssbo = OpVariable %_ptr_Uniform_SSBO StorageBuffer
+        %int = OpTypeInt 32 1
+      %int_1 = OpConstant %int 1
+%_ptr_Uniform_C = OpTypePointer StorageBuffer %C
+      %int_0 = OpConstant %int 0
+%_ptr_Uniform_A = OpTypePointer StorageBuffer %A
+       %main = OpFunction %void None %3 ; Copies SSBO.c_block (type C) into SSBO.a_block (type A) via OpCopyLogical.
+          %5 = OpLabel
+         %22 = OpAccessChain %_ptr_Uniform_C %ssbo %int_1 ; member 1 ("c_block")
+         %39 = OpAccessChain %_ptr_Uniform_A %ssbo %int_0 ; member 0 ("a_block")
+         %23 = OpLoad %C %22
+         %24 = OpCopyLogical %A %23 ; C and A have matching member shapes; their last members (%C0/%A0) differ in matrix layout decorations
+               OpStore %39 %24
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/copy-logical-offset-and-array-stride-diffs.spv14.asm.comp b/shaders-opencl-no-opt/asm/comp/copy-logical-offset-and-array-stride-diffs.spv14.asm.comp
new file mode 100644
index 000000000..026bd1131
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/copy-logical-offset-and-array-stride-diffs.spv14.asm.comp
@@ -0,0 +1,60 @@
+; SPIR-V
+; Version: 1.4
+; Generator: Khronos SPIR-V Tools Assembler; 0
+; Bound: 24
+; Schema: 0
+               OpCapability Shader
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %2 "main" %3 %4
+               OpExecutionMode %2 LocalSize 1 1 1
+               OpDecorate %5 Block
+               OpMemberDecorate %5 0 Offset 0
+               OpMemberDecorate %5 1 Offset 16
+               OpMemberDecorate %5 2 Offset 48
+               OpMemberDecorate %5 3 Offset 64
+               OpMemberDecorate %5 4 Offset 80
+               OpMemberDecorate %5 5 Offset 96
+               OpMemberDecorate %5 6 Offset 112
+               OpDecorate %6 Block
+               OpMemberDecorate %6 0 Offset 0
+               OpMemberDecorate %6 1 Offset 4
+               OpMemberDecorate %6 2 Offset 12
+               OpMemberDecorate %6 3 Offset 16
+               OpMemberDecorate %6 4 Offset 32
+               OpMemberDecorate %6 5 Offset 48
+               OpMemberDecorate %6 6 Offset 64
+               OpDecorate %3 DescriptorSet 0
+               OpDecorate %3 Binding 0
+               OpDecorate %4 DescriptorSet 0
+               OpDecorate %4 Binding 1
+               OpDecorate %7 ArrayStride 4
+               OpDecorate %8 ArrayStride 16
+               OpMemberDecorate %9 0 Offset 4
+               OpMemberDecorate %10 0 Offset 8
+         %11 = OpTypeVoid
+         %12 = OpTypeFloat 32
+         %13 = OpTypeVector %12 2
+         %14 = OpTypeVector %12 3
+         %15 = OpTypeVector %12 4
+         %16 = OpTypeMatrix %15 4
+         %17 = OpTypeInt 32 0
+         %18 = OpConstant %17 2
+          %7 = OpTypeArray %17 %18
+          %8 = OpTypeArray %17 %18
+          %9 = OpTypeStruct %17
+         %10 = OpTypeStruct %17
+          %5 = OpTypeStruct %17 %8 %17 %9 %15 %14 %13
+         %19 = OpTypePointer StorageBuffer %5
+          %6 = OpTypeStruct %17 %7 %17 %10 %15 %14 %13
+         %20 = OpTypePointer StorageBuffer %6
+          %3 = OpVariable %20 StorageBuffer
+          %4 = OpVariable %19 StorageBuffer
+         %21 = OpTypeFunction %11
+          %2 = OpFunction %11 None %21 ; Entry point "main": copies buffer %3 (struct %6) into buffer %4 (struct %5).
+          %1 = OpLabel
+         %22 = OpLoad %6 %3
+         %23 = OpCopyLogical %5 %22 ; %5 and %6 have matching member shapes but different Offset/ArrayStride decorations
+               OpStore %4 %23
+               OpReturn
+               OpFunctionEnd
+
diff --git a/shaders-opencl-no-opt/asm/comp/copy-logical.spv14.asm.comp b/shaders-opencl-no-opt/asm/comp/copy-logical.spv14.asm.comp
new file mode 100644
index 000000000..20fa0b099
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/copy-logical.spv14.asm.comp
@@ -0,0 +1,69 @@
+; SPIR-V
+; Version: 1.4
+; Generator: Khronos Glslang Reference Front End; 8
+; Bound: 48
+; Schema: 0
+               OpCapability Shader
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main" %ssbo
+               OpExecutionMode %main LocalSize 1 1 1
+               OpSource GLSL 450
+		OpName %B1 "B1"
+		OpName %A "A"
+		OpName %C "C"
+		OpName %B2 "B2"
+		OpMemberName %A 0 "a"
+		OpMemberName %A 1 "b1"
+		OpMemberName %A 2 "b1_array"
+		OpMemberName %C 0 "c"
+		OpMemberName %C 1 "b2"
+		OpMemberName %C 2 "b2_array"
+		OpMemberName %B1 0 "elem1"
+		OpMemberName %B2 0 "elem2"
+		OpMemberName %SSBO 0 "a_block"
+		OpMemberName %SSBO 1 "c_block"
+		OpDecorate %B1Array ArrayStride 16
+		OpDecorate %B2Array ArrayStride 16
+               OpMemberDecorate %B1 0 Offset 0
+               OpMemberDecorate %A 0 Offset 0
+               OpMemberDecorate %A 1 Offset 16
+               OpMemberDecorate %A 2 Offset 32
+               OpMemberDecorate %B2 0 Offset 0
+               OpMemberDecorate %C 0 Offset 0
+               OpMemberDecorate %C 1 Offset 16
+               OpMemberDecorate %C 2 Offset 32
+               OpMemberDecorate %SSBO 0 Offset 0
+               OpMemberDecorate %SSBO 1 Offset 96
+               OpDecorate %SSBO Block
+               OpDecorate %ssbo DescriptorSet 0
+               OpDecorate %ssbo Binding 0
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+      %float = OpTypeFloat 32
+	%uint = OpTypeInt 32 0
+	%uint_4 = OpConstant %uint 4
+    %v4float = OpTypeVector %float 4
+         %B2 = OpTypeStruct %v4float
+	%B2Array = OpTypeArray %B2 %uint_4
+          %C = OpTypeStruct %v4float %B2 %B2Array
+         %B1 = OpTypeStruct %v4float
+	%B1Array = OpTypeArray %B1 %uint_4
+          %A = OpTypeStruct %v4float %B1 %B1Array
+       %SSBO = OpTypeStruct %A %C
+%_ptr_Uniform_SSBO = OpTypePointer StorageBuffer %SSBO
+       %ssbo = OpVariable %_ptr_Uniform_SSBO StorageBuffer
+        %int = OpTypeInt 32 1
+      %int_1 = OpConstant %int 1
+%_ptr_Uniform_C = OpTypePointer StorageBuffer %C
+      %int_0 = OpConstant %int 0
+%_ptr_Uniform_A = OpTypePointer StorageBuffer %A
+       %main = OpFunction %void None %3 ; copies ssbo.c_block into ssbo.a_block via OpCopyLogical
+          %5 = OpLabel
+         %22 = OpAccessChain %_ptr_Uniform_C %ssbo %int_1 ; pointer to SSBO member 1 ("c_block", type C)
+         %39 = OpAccessChain %_ptr_Uniform_A %ssbo %int_0 ; pointer to SSBO member 0 ("a_block", type A)
+         %23 = OpLoad %C %22
+         %24 = OpCopyLogical %A %23 ; C -> A: same shape (vec4, struct, 4-element struct array) built from distinct types B2/B1
+               OpStore %39 %24
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/device-array-load-temporary.asm.invalid.comp b/shaders-opencl-no-opt/asm/comp/device-array-load-temporary.asm.invalid.comp
new file mode 100644
index 000000000..d59aad3ce
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/device-array-load-temporary.asm.invalid.comp
@@ -0,0 +1,53 @@
+               OpCapability Shader
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main"
+               OpExecutionMode %main LocalSize 1 1 1
+			   OpName %Block "Block"
+			   OpName %SSBO "SSBO"
+			   OpName %SSBO_Var "ssbo"
+			   OpName %UBO_Var "ubo"
+			   OpDecorate %SSBO_Var Binding 0
+			   OpDecorate %SSBO_Var DescriptorSet 0
+			   OpDecorate %UBO_Var Binding 1
+			   OpDecorate %UBO_Var DescriptorSet 0
+			   OpMemberDecorate %SSBO 0 Offset 0
+			   OpMemberDecorate %Block 0 Offset 0
+			   OpMemberDecorate %Block 1 Offset 16
+			   OpDecorate %BlockArray ArrayStride 32
+			   OpDecorate %arr_uvec2_2 ArrayStride 8
+			   OpDecorate %SSBO Block
+       %uint = OpTypeInt 32 0
+     %uint_0 = OpConstant %uint 0
+     %uint_1 = OpConstant %uint 1
+     %uint_2 = OpConstant %uint 2
+     %uint_3 = OpConstant %uint 3
+     %uvec2 = OpTypeVector %uint 2
+     %arr_uvec2_2 = OpTypeArray %uvec2 %uint_2
+	 %arr_uvec2_2_ptr = OpTypePointer StorageBuffer %arr_uvec2_2
+	 %arr_uvec2_2_ptr_const = OpTypePointer Uniform %arr_uvec2_2
+	 %arr_uvec2_2_ptr_func = OpTypePointer Function %arr_uvec2_2
+	 %arr_uvec2_2_ptr_workgroup = OpTypePointer Workgroup %arr_uvec2_2
+	 %wg = OpVariable %arr_uvec2_2_ptr_workgroup Workgroup
+   %Block = OpTypeStruct %arr_uvec2_2 %arr_uvec2_2
+%BlockArray = OpTypeArray %Block %uint_3
+%SSBO = OpTypeStruct %BlockArray
+%SSBO_Ptr = OpTypePointer StorageBuffer %SSBO
+%SSBO_Var = OpVariable %SSBO_Ptr StorageBuffer
+%UBO_Ptr = OpTypePointer Uniform %SSBO
+%UBO_Var = OpVariable %UBO_Ptr Uniform
+%void = OpTypeVoid
+%func_type = OpTypeFunction %void
+
+    %main = OpFunction %void None %func_type
+         %25 = OpLabel
+		 %func = OpVariable %arr_uvec2_2_ptr_func Function ; declared but not referenced below
+
+		; Copy device (StorageBuffer) array to temporary: one load feeds both stores below.
+		 %ptr_arr_0 = OpAccessChain %arr_uvec2_2_ptr %SSBO_Var %uint_0 %uint_0 %uint_0 ; SSBO member 0, element 0, Block member 0
+		 %ptr_arr_1 = OpAccessChain %arr_uvec2_2_ptr %SSBO_Var %uint_0 %uint_0 %uint_1 ; SSBO member 0, element 0, Block member 1
+		 %loaded_array = OpLoad %arr_uvec2_2 %ptr_arr_1
+		 OpStore %ptr_arr_0 %loaded_array
+		 OpStore %ptr_arr_0 %loaded_array ; second store of the same loaded value
+
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/device-array-load-temporary.force-native-array.asm.invalid.comp b/shaders-opencl-no-opt/asm/comp/device-array-load-temporary.force-native-array.asm.invalid.comp
new file mode 100644
index 000000000..d59aad3ce
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/device-array-load-temporary.force-native-array.asm.invalid.comp
@@ -0,0 +1,53 @@
+               OpCapability Shader
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main"
+               OpExecutionMode %main LocalSize 1 1 1
+			   OpName %Block "Block"
+			   OpName %SSBO "SSBO"
+			   OpName %SSBO_Var "ssbo"
+			   OpName %UBO_Var "ubo"
+			   OpDecorate %SSBO_Var Binding 0
+			   OpDecorate %SSBO_Var DescriptorSet 0
+			   OpDecorate %UBO_Var Binding 1
+			   OpDecorate %UBO_Var DescriptorSet 0
+			   OpMemberDecorate %SSBO 0 Offset 0
+			   OpMemberDecorate %Block 0 Offset 0
+			   OpMemberDecorate %Block 1 Offset 16
+			   OpDecorate %BlockArray ArrayStride 32
+			   OpDecorate %arr_uvec2_2 ArrayStride 8
+			   OpDecorate %SSBO Block
+       %uint = OpTypeInt 32 0
+     %uint_0 = OpConstant %uint 0
+     %uint_1 = OpConstant %uint 1
+     %uint_2 = OpConstant %uint 2
+     %uint_3 = OpConstant %uint 3
+     %uvec2 = OpTypeVector %uint 2
+     %arr_uvec2_2 = OpTypeArray %uvec2 %uint_2
+	 %arr_uvec2_2_ptr = OpTypePointer StorageBuffer %arr_uvec2_2
+	 %arr_uvec2_2_ptr_const = OpTypePointer Uniform %arr_uvec2_2
+	 %arr_uvec2_2_ptr_func = OpTypePointer Function %arr_uvec2_2
+	 %arr_uvec2_2_ptr_workgroup = OpTypePointer Workgroup %arr_uvec2_2
+	 %wg = OpVariable %arr_uvec2_2_ptr_workgroup Workgroup
+   %Block = OpTypeStruct %arr_uvec2_2 %arr_uvec2_2
+%BlockArray = OpTypeArray %Block %uint_3
+%SSBO = OpTypeStruct %BlockArray
+%SSBO_Ptr = OpTypePointer StorageBuffer %SSBO
+%SSBO_Var = OpVariable %SSBO_Ptr StorageBuffer
+%UBO_Ptr = OpTypePointer Uniform %SSBO
+%UBO_Var = OpVariable %UBO_Ptr Uniform
+%void = OpTypeVoid
+%func_type = OpTypeFunction %void
+
+    %main = OpFunction %void None %func_type
+         %25 = OpLabel
+		 %func = OpVariable %arr_uvec2_2_ptr_func Function ; declared but not referenced below
+
+		; Copy device (StorageBuffer) array to temporary: one load feeds both stores below.
+		 %ptr_arr_0 = OpAccessChain %arr_uvec2_2_ptr %SSBO_Var %uint_0 %uint_0 %uint_0 ; SSBO member 0, element 0, Block member 0
+		 %ptr_arr_1 = OpAccessChain %arr_uvec2_2_ptr %SSBO_Var %uint_0 %uint_0 %uint_1 ; SSBO member 0, element 0, Block member 1
+		 %loaded_array = OpLoad %arr_uvec2_2 %ptr_arr_1
+		 OpStore %ptr_arr_0 %loaded_array
+		 OpStore %ptr_arr_0 %loaded_array ; second store of the same loaded value
+
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/device-constant-array-load-store.asm.invalid.comp b/shaders-opencl-no-opt/asm/comp/device-constant-array-load-store.asm.invalid.comp
new file mode 100644
index 000000000..d9d0d51c3
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/device-constant-array-load-store.asm.invalid.comp
@@ -0,0 +1,81 @@
+               OpCapability Shader
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main"
+               OpExecutionMode %main LocalSize 1 1 1
+			   OpName %Block "Block"
+			   OpName %SSBO "SSBO"
+			   OpName %SSBO_Var "ssbo"
+			   OpName %UBO_Var "ubo"
+			   OpDecorate %SSBO_Var Binding 0
+			   OpDecorate %SSBO_Var DescriptorSet 0
+			   OpDecorate %UBO_Var Binding 1
+			   OpDecorate %UBO_Var DescriptorSet 0
+			   OpMemberDecorate %SSBO 0 Offset 0
+			   OpMemberDecorate %Block 0 Offset 0
+			   OpMemberDecorate %Block 1 Offset 16
+			   OpDecorate %BlockArray ArrayStride 32
+			   OpDecorate %arr_uvec2_2 ArrayStride 8
+			   OpDecorate %SSBO Block
+       %uint = OpTypeInt 32 0
+     %uint_0 = OpConstant %uint 0
+     %uint_1 = OpConstant %uint 1
+     %uint_2 = OpConstant %uint 2
+     %uint_3 = OpConstant %uint 3
+     %uvec2 = OpTypeVector %uint 2
+     %arr_uvec2_2 = OpTypeArray %uvec2 %uint_2
+	 %arr_uvec2_2_ptr = OpTypePointer StorageBuffer %arr_uvec2_2
+	 %arr_uvec2_2_ptr_const = OpTypePointer Uniform %arr_uvec2_2
+	 %arr_uvec2_2_ptr_func = OpTypePointer Function %arr_uvec2_2
+	 %arr_uvec2_2_ptr_workgroup = OpTypePointer Workgroup %arr_uvec2_2
+	 %wg = OpVariable %arr_uvec2_2_ptr_workgroup Workgroup
+   %Block = OpTypeStruct %arr_uvec2_2 %arr_uvec2_2
+%BlockArray = OpTypeArray %Block %uint_3
+%SSBO = OpTypeStruct %BlockArray
+%SSBO_Ptr = OpTypePointer StorageBuffer %SSBO
+%SSBO_Var = OpVariable %SSBO_Ptr StorageBuffer
+%UBO_Ptr = OpTypePointer Uniform %SSBO
+%UBO_Var = OpVariable %UBO_Ptr Uniform
+%void = OpTypeVoid
+%func_type = OpTypeFunction %void
+
+    %main = OpFunction %void None %func_type ; array load/store between every storage-class pairing listed below
+         %25 = OpLabel
+		 %func = OpVariable %arr_uvec2_2_ptr_func Function
+
+		 ; DeviceToDevice (StorageBuffer -> StorageBuffer)
+		 %ptr_arr_0 = OpAccessChain %arr_uvec2_2_ptr %SSBO_Var %uint_0 %uint_0 %uint_0 ; SSBO member 0, element 0, Block member 0
+		 %ptr_arr_1 = OpAccessChain %arr_uvec2_2_ptr %SSBO_Var %uint_0 %uint_0 %uint_1 ; SSBO member 0, element 0, Block member 1
+		 %loaded_array = OpLoad %arr_uvec2_2 %ptr_arr_1
+		 OpStore %ptr_arr_0 %loaded_array
+
+		 ; ConstantToDevice (Uniform -> StorageBuffer)
+		 %ptr_arr_1_const = OpAccessChain %arr_uvec2_2_ptr_const %UBO_Var %uint_0 %uint_0 %uint_1 ; UBO member 0, element 0, Block member 1
+		 %loaded_array_const = OpLoad %arr_uvec2_2 %ptr_arr_1_const
+		 OpStore %ptr_arr_0 %loaded_array_const
+
+		 ; StackToDevice (Function -> StorageBuffer)
+		 %loaded_array_func = OpLoad %arr_uvec2_2 %func ; %func is read here before any store to it
+		 OpStore %ptr_arr_0 %loaded_array_func
+
+		 ; ThreadGroupToDevice (Workgroup -> StorageBuffer)
+		 %loaded_array_workgroup = OpLoad %arr_uvec2_2 %wg
+		 OpStore %ptr_arr_0 %loaded_array_workgroup
+
+		 ; DeviceToThreadGroup (StorageBuffer -> Workgroup)
+		 %loaded_array_2 = OpLoad %arr_uvec2_2 %ptr_arr_1
+		 OpStore %wg %loaded_array_2
+
+		 ; DeviceToStack (StorageBuffer -> Function)
+		 %loaded_array_3 = OpLoad %arr_uvec2_2 %ptr_arr_1
+		 OpStore %func %loaded_array_3
+
+		 ; ConstantToThreadGroup (Uniform -> Workgroup)
+		 %loaded_array_const_2 = OpLoad %arr_uvec2_2 %ptr_arr_1_const
+		 OpStore %wg %loaded_array_const_2
+
+		 ; ConstantToStack (Uniform -> Function)
+		 %loaded_array_const_3 = OpLoad %arr_uvec2_2 %ptr_arr_1_const
+		 OpStore %func %loaded_array_const_3
+
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/device-constant-array-load-store.force-native-array.asm.invalid.comp b/shaders-opencl-no-opt/asm/comp/device-constant-array-load-store.force-native-array.asm.invalid.comp
new file mode 100644
index 000000000..d9d0d51c3
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/device-constant-array-load-store.force-native-array.asm.invalid.comp
@@ -0,0 +1,81 @@
+               OpCapability Shader
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main"
+               OpExecutionMode %main LocalSize 1 1 1
+			   OpName %Block "Block"
+			   OpName %SSBO "SSBO"
+			   OpName %SSBO_Var "ssbo"
+			   OpName %UBO_Var "ubo"
+			   OpDecorate %SSBO_Var Binding 0
+			   OpDecorate %SSBO_Var DescriptorSet 0
+			   OpDecorate %UBO_Var Binding 1
+			   OpDecorate %UBO_Var DescriptorSet 0
+			   OpMemberDecorate %SSBO 0 Offset 0
+			   OpMemberDecorate %Block 0 Offset 0
+			   OpMemberDecorate %Block 1 Offset 16
+			   OpDecorate %BlockArray ArrayStride 32
+			   OpDecorate %arr_uvec2_2 ArrayStride 8
+			   OpDecorate %SSBO Block
+       %uint = OpTypeInt 32 0
+     %uint_0 = OpConstant %uint 0
+     %uint_1 = OpConstant %uint 1
+     %uint_2 = OpConstant %uint 2
+     %uint_3 = OpConstant %uint 3
+     %uvec2 = OpTypeVector %uint 2
+     %arr_uvec2_2 = OpTypeArray %uvec2 %uint_2
+	 %arr_uvec2_2_ptr = OpTypePointer StorageBuffer %arr_uvec2_2
+	 %arr_uvec2_2_ptr_const = OpTypePointer Uniform %arr_uvec2_2
+	 %arr_uvec2_2_ptr_func = OpTypePointer Function %arr_uvec2_2
+	 %arr_uvec2_2_ptr_workgroup = OpTypePointer Workgroup %arr_uvec2_2
+	 %wg = OpVariable %arr_uvec2_2_ptr_workgroup Workgroup
+   %Block = OpTypeStruct %arr_uvec2_2 %arr_uvec2_2
+%BlockArray = OpTypeArray %Block %uint_3
+%SSBO = OpTypeStruct %BlockArray
+%SSBO_Ptr = OpTypePointer StorageBuffer %SSBO
+%SSBO_Var = OpVariable %SSBO_Ptr StorageBuffer
+%UBO_Ptr = OpTypePointer Uniform %SSBO
+%UBO_Var = OpVariable %UBO_Ptr Uniform
+%void = OpTypeVoid
+%func_type = OpTypeFunction %void
+
+    %main = OpFunction %void None %func_type ; array load/store between every storage-class pairing listed below
+         %25 = OpLabel
+		 %func = OpVariable %arr_uvec2_2_ptr_func Function
+
+		 ; DeviceToDevice (StorageBuffer -> StorageBuffer)
+		 %ptr_arr_0 = OpAccessChain %arr_uvec2_2_ptr %SSBO_Var %uint_0 %uint_0 %uint_0 ; SSBO member 0, element 0, Block member 0
+		 %ptr_arr_1 = OpAccessChain %arr_uvec2_2_ptr %SSBO_Var %uint_0 %uint_0 %uint_1 ; SSBO member 0, element 0, Block member 1
+		 %loaded_array = OpLoad %arr_uvec2_2 %ptr_arr_1
+		 OpStore %ptr_arr_0 %loaded_array
+
+		 ; ConstantToDevice (Uniform -> StorageBuffer)
+		 %ptr_arr_1_const = OpAccessChain %arr_uvec2_2_ptr_const %UBO_Var %uint_0 %uint_0 %uint_1 ; UBO member 0, element 0, Block member 1
+		 %loaded_array_const = OpLoad %arr_uvec2_2 %ptr_arr_1_const
+		 OpStore %ptr_arr_0 %loaded_array_const
+
+		 ; StackToDevice (Function -> StorageBuffer)
+		 %loaded_array_func = OpLoad %arr_uvec2_2 %func ; %func is read here before any store to it
+		 OpStore %ptr_arr_0 %loaded_array_func
+
+		 ; ThreadGroupToDevice (Workgroup -> StorageBuffer)
+		 %loaded_array_workgroup = OpLoad %arr_uvec2_2 %wg
+		 OpStore %ptr_arr_0 %loaded_array_workgroup
+
+		 ; DeviceToThreadGroup (StorageBuffer -> Workgroup)
+		 %loaded_array_2 = OpLoad %arr_uvec2_2 %ptr_arr_1
+		 OpStore %wg %loaded_array_2
+
+		 ; DeviceToStack (StorageBuffer -> Function)
+		 %loaded_array_3 = OpLoad %arr_uvec2_2 %ptr_arr_1
+		 OpStore %func %loaded_array_3
+
+		 ; ConstantToThreadGroup (Uniform -> Workgroup)
+		 %loaded_array_const_2 = OpLoad %arr_uvec2_2 %ptr_arr_1_const
+		 OpStore %wg %loaded_array_const_2
+
+		 ; ConstantToStack (Uniform -> Function)
+		 %loaded_array_const_3 = OpLoad %arr_uvec2_2 %ptr_arr_1_const
+		 OpStore %func %loaded_array_const_3
+
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/eliminate-globals-not-in-entry-point.noeliminate.spv14.asm.comp b/shaders-opencl-no-opt/asm/comp/eliminate-globals-not-in-entry-point.noeliminate.spv14.asm.comp
new file mode 100644
index 000000000..73f3ceee1
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/eliminate-globals-not-in-entry-point.noeliminate.spv14.asm.comp
@@ -0,0 +1,59 @@
+; SPIR-V
+; Version: 1.5
+; Generator: Khronos Glslang Reference Front End; 10
+; Bound: 26
+; Schema: 0
+               OpCapability Shader
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               ;OpEntryPoint GLCompute %main "main" %Samp %ubo %ssbo %v %w
+               OpEntryPoint GLCompute %main "main"
+               OpExecutionMode %main LocalSize 64 1 1
+               OpSource GLSL 450
+               OpName %main "main"
+               OpName %Samp "Samp"
+               OpName %UBO "UBO"
+               OpMemberName %UBO 0 "v"
+               OpName %ubo "ubo"
+               OpName %SSBO "SSBO"
+               OpMemberName %SSBO 0 "v"
+               OpName %ssbo "ssbo"
+               OpName %v "v"
+               OpName %w "w"
+               OpDecorate %gl_WorkGroupSize BuiltIn WorkgroupSize
+               OpDecorate %Samp DescriptorSet 0
+               OpDecorate %Samp Binding 0
+               OpMemberDecorate %UBO 0 Offset 0
+               OpDecorate %UBO Block
+               OpDecorate %ubo DescriptorSet 0
+               OpDecorate %ubo Binding 1
+               OpMemberDecorate %SSBO 0 Offset 0
+               OpDecorate %SSBO Block
+               OpDecorate %ssbo DescriptorSet 0
+               OpDecorate %ssbo Binding 2
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+       %uint = OpTypeInt 32 0
+     %v3uint = OpTypeVector %uint 3
+    %uint_64 = OpConstant %uint 64
+     %uint_1 = OpConstant %uint 1
+%gl_WorkGroupSize = OpConstantComposite %v3uint %uint_64 %uint_1 %uint_1
+      %float = OpTypeFloat 32
+         %12 = OpTypeImage %float 2D 0 0 0 1 Unknown
+         %13 = OpTypeSampledImage %12
+%_ptr_UniformConstant_13 = OpTypePointer UniformConstant %13
+       %Samp = OpVariable %_ptr_UniformConstant_13 UniformConstant
+        %UBO = OpTypeStruct %float
+%_ptr_Uniform_UBO = OpTypePointer Uniform %UBO
+        %ubo = OpVariable %_ptr_Uniform_UBO Uniform
+       %SSBO = OpTypeStruct %float
+%_ptr_StorageBuffer_SSBO = OpTypePointer StorageBuffer %SSBO
+       %ssbo = OpVariable %_ptr_StorageBuffer_SSBO StorageBuffer
+%_ptr_Private_float = OpTypePointer Private %float
+          %v = OpVariable %_ptr_Private_float Private
+%_ptr_Workgroup_float = OpTypePointer Workgroup %float
+          %w = OpVariable %_ptr_Workgroup_float Workgroup
+       %main = OpFunction %void None %3 ; empty body: none of Samp, ubo, ssbo, v or w is referenced
+          %5 = OpLabel
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/glsl-signed-operations.asm.comp b/shaders-opencl-no-opt/asm/comp/glsl-signed-operations.asm.comp
new file mode 100644
index 000000000..7da9f95b9
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/glsl-signed-operations.asm.comp
@@ -0,0 +1,123 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 7
+; Bound: 26
+; Schema: 0
+               OpCapability Shader
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main"
+               OpExecutionMode %main LocalSize 1 1 1
+               OpSource GLSL 450
+               OpName %main "main"
+               OpName %SSBO "SSBO"
+               OpMemberName %SSBO 0 "ints"
+               OpMemberName %SSBO 1 "uints"
+               OpName %_ ""
+               OpMemberDecorate %SSBO 0 Offset 0
+               OpMemberDecorate %SSBO 1 Offset 16
+               OpDecorate %SSBO BufferBlock
+               OpDecorate %_ DescriptorSet 0
+               OpDecorate %_ Binding 0
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+        %int = OpTypeInt 32 1
+      %v4int = OpTypeVector %int 4
+       %uint = OpTypeInt 32 0
+     %v4uint = OpTypeVector %uint 4
+       %SSBO = OpTypeStruct %v4int %v4uint
+%_ptr_Uniform_SSBO = OpTypePointer Uniform %SSBO
+          %_ = OpVariable %_ptr_Uniform_SSBO Uniform
+      %int_0 = OpConstant %int 0
+%_ptr_Uniform_v4int = OpTypePointer Uniform %v4int
+      %int_1 = OpConstant %int 1
+%_ptr_Uniform_v4uint = OpTypePointer Uniform %v4uint
+       %main = OpFunction %void None %3 ; GLSL.std.450 integer ops with mixed operand/result signedness
+          %5 = OpLabel
+         %ints_ptr = OpAccessChain %_ptr_Uniform_v4int %_ %int_0 ; SSBO member 0 ("ints", 4 x int)
+         %uints_ptr = OpAccessChain %_ptr_Uniform_v4uint %_ %int_1 ; SSBO member 1 ("uints", 4 x uint)
+         %ints = OpLoad %v4int %ints_ptr
+         %uints = OpLoad %v4uint %uints_ptr
+
+         %int_to_int_sabs = OpExtInst %v4int %1 SAbs %ints ; SAbs: signed/unsigned operand x signed/unsigned result
+         %int_to_uint_sabs = OpExtInst %v4uint %1 SAbs %ints
+         %uint_to_int_sabs = OpExtInst %v4int %1 SAbs %uints
+         %uint_to_uint_sabs = OpExtInst %v4uint %1 SAbs %uints
+
+         %int_to_int_ssign = OpExtInst %v4int %1 SSign %ints ; SSign: same four combinations
+         %int_to_uint_ssign = OpExtInst %v4uint %1 SSign %ints
+         %uint_to_int_ssign = OpExtInst %v4int %1 SSign %uints
+         %uint_to_uint_ssign = OpExtInst %v4uint %1 SSign %uints
+
+         %int_to_int_smsb = OpExtInst %v4int %1 FindSMsb %uints ; FindSMsb is fed the unsigned vector ...
+         %int_to_uint_smsb = OpExtInst %v4uint %1 FindSMsb %uints
+         %uint_to_int_umsb = OpExtInst %v4int %1 FindUMsb %ints ; ... and FindUMsb the signed one
+         %uint_to_uint_umsb = OpExtInst %v4uint %1 FindUMsb %ints
+
+         %int_to_int_smin = OpExtInst %v4int %1 SMin %ints %ints ; SMin: operand pairs mix %ints and %uints
+         %int_to_uint_smin = OpExtInst %v4uint %1 SMin %ints %uints
+         %uint_to_int_smin = OpExtInst %v4int %1 SMin %uints %uints
+         %uint_to_uint_smin = OpExtInst %v4uint %1 SMin %uints %ints
+
+         %int_to_int_umin = OpExtInst %v4int %1 UMin %ints %uints ; UMin: operand pairs mix %ints and %uints
+         %int_to_uint_umin = OpExtInst %v4uint %1 UMin %ints %uints
+         %uint_to_int_umin = OpExtInst %v4int %1 UMin %uints %ints
+         %uint_to_uint_umin = OpExtInst %v4uint %1 UMin %uints %ints
+
+         %int_to_int_smax = OpExtInst %v4int %1 SMax %ints %ints ; SMax: operand pairs mix %ints and %uints
+         %int_to_uint_smax = OpExtInst %v4uint %1 SMax %ints %ints
+         %uint_to_int_smax = OpExtInst %v4int %1 SMax %uints %ints
+         %uint_to_uint_smax = OpExtInst %v4uint %1 SMax %uints %ints
+
+         %int_to_int_umax = OpExtInst %v4int %1 UMax %ints %uints ; UMax: operand pairs mix %ints and %uints
+         %int_to_uint_umax = OpExtInst %v4uint %1 UMax %ints %ints
+         %uint_to_int_umax = OpExtInst %v4int %1 UMax %uints %ints
+         %uint_to_uint_umax = OpExtInst %v4uint %1 UMax %uints %ints
+
+         %int_to_int_sclamp = OpExtInst %v4int %1 SClamp %uints %uints %uints ; SClamp on unsigned operands ...
+         %int_to_uint_sclamp = OpExtInst %v4uint %1 SClamp %uints %uints %uints
+         %uint_to_int_uclamp = OpExtInst %v4int %1 UClamp %ints %ints %ints ; ... and UClamp on signed operands
+         %uint_to_uint_uclamp = OpExtInst %v4uint %1 UClamp %ints %ints %ints
+
+               OpStore %ints_ptr %int_to_int_sabs ; each result is stored to the member matching its result type
+               OpStore %uints_ptr %int_to_uint_sabs
+               OpStore %ints_ptr %uint_to_int_sabs
+               OpStore %uints_ptr %uint_to_uint_sabs
+
+               OpStore %ints_ptr %int_to_int_ssign
+               OpStore %uints_ptr %int_to_uint_ssign
+               OpStore %ints_ptr %uint_to_int_ssign
+               OpStore %uints_ptr %uint_to_uint_ssign
+
+               OpStore %ints_ptr %int_to_int_smsb
+               OpStore %uints_ptr %int_to_uint_smsb
+               OpStore %ints_ptr %uint_to_int_umsb
+               OpStore %uints_ptr %uint_to_uint_umsb
+
+               OpStore %ints_ptr %int_to_int_smin
+               OpStore %uints_ptr %int_to_uint_smin
+               OpStore %ints_ptr %uint_to_int_smin
+               OpStore %uints_ptr %uint_to_uint_smin
+
+               OpStore %ints_ptr %int_to_int_umin
+               OpStore %uints_ptr %int_to_uint_umin
+               OpStore %ints_ptr %uint_to_int_umin
+               OpStore %uints_ptr %uint_to_uint_umin
+
+               OpStore %ints_ptr %int_to_int_smax
+               OpStore %uints_ptr %int_to_uint_smax
+               OpStore %ints_ptr %uint_to_int_smax
+               OpStore %uints_ptr %uint_to_uint_smax
+
+               OpStore %ints_ptr %int_to_int_umax
+               OpStore %uints_ptr %int_to_uint_umax
+               OpStore %ints_ptr %uint_to_int_umax
+               OpStore %uints_ptr %uint_to_uint_umax
+
+               OpStore %ints_ptr %int_to_int_sclamp
+               OpStore %uints_ptr %int_to_uint_sclamp
+               OpStore %ints_ptr %uint_to_int_uclamp
+               OpStore %uints_ptr %uint_to_uint_uclamp
+
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/glsl.std450.frexp-modf-struct.asm.comp b/shaders-opencl-no-opt/asm/comp/glsl.std450.frexp-modf-struct.asm.comp
new file mode 100644
index 000000000..30db11d45
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/glsl.std450.frexp-modf-struct.asm.comp
@@ -0,0 +1,55 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 7
+; Bound: 45
+; Schema: 0
+               OpCapability Shader
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main"
+               OpExecutionMode %main LocalSize 1 1 1
+               OpSource GLSL 450
+               OpMemberDecorate %SSBO 0 Offset 0
+               OpMemberDecorate %SSBO 1 Offset 4
+               OpDecorate %SSBO BufferBlock
+               OpDecorate %_ DescriptorSet 0
+               OpDecorate %_ Binding 0
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+      %float = OpTypeFloat 32
+ %ResTypeMod = OpTypeStruct %float %float
+%_ptr_Function_ResTypeMod = OpTypePointer Function %ResTypeMod
+        %int = OpTypeInt 32 1
+      %int_0 = OpConstant %int 0
+   %float_20 = OpConstant %float 20
+      %int_1 = OpConstant %int 1
+%_ptr_Function_float = OpTypePointer Function %float
+%ResTypeFrexp = OpTypeStruct %float %int
+%_ptr_Function_ResTypeFrexp = OpTypePointer Function %ResTypeFrexp
+   %float_40 = OpConstant %float 40
+%_ptr_Function_int = OpTypePointer Function %int
+       %SSBO = OpTypeStruct %float %int
+%_ptr_Uniform_SSBO = OpTypePointer Uniform %SSBO
+          %_ = OpVariable %_ptr_Uniform_SSBO Uniform
+%_ptr_Uniform_float = OpTypePointer Uniform %float
+%_ptr_Uniform_int = OpTypePointer Uniform %int
+       %main = OpFunction %void None %3 ; struct-returning modf/frexp on constants, results written to the SSBO
+          %5 = OpLabel
+         %modres = OpExtInst %ResTypeMod %1 ModfStruct %float_20 ; per GLSL.std.450: member 0 = fractional part, member 1 = whole part
+         %frexpres = OpExtInst %ResTypeFrexp %1 FrexpStruct %float_40 ; per GLSL.std.450: member 0 = significand, member 1 = exponent
+
+		 %modres_f = OpCompositeExtract %float %modres 0
+		 %modres_i = OpCompositeExtract %float %modres 1 ; a float despite the _i suffix (ResTypeMod is {float, float})
+		 %frexpres_f = OpCompositeExtract %float %frexpres 0
+		 %frexpres_i = OpCompositeExtract %int %frexpres 1
+
+         %float_ptr = OpAccessChain %_ptr_Uniform_float %_ %int_0 ; SSBO member 0 (float)
+         %int_ptr = OpAccessChain %_ptr_Uniform_int %_ %int_1 ; SSBO member 1 (int)
+
+               OpStore %float_ptr %modres_f ; the three float results are stored to the same location in turn
+               OpStore %float_ptr %modres_i
+               OpStore %float_ptr %frexpres_f
+               OpStore %int_ptr %frexpres_i
+
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/groupshared-inner-array-of-struct-copy.invalid.asm.comp b/shaders-opencl-no-opt/asm/comp/groupshared-inner-array-of-struct-copy.invalid.asm.comp
new file mode 100644
index 000000000..b9876122a
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/groupshared-inner-array-of-struct-copy.invalid.asm.comp
@@ -0,0 +1,137 @@
+; SPIR-V
+; Version: 1.3
+; Generator: Google spiregg; 0
+; Bound: 91
+; Schema: 0
+               OpCapability Shader
+               OpCapability StorageImageReadWithoutFormat
+               OpExtension "SPV_GOOGLE_hlsl_functionality1"
+               OpExtension "SPV_GOOGLE_user_type"
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %csMain "main" %gl_GlobalInvocationID
+               OpExecutionMode %csMain LocalSize 8 8 1
+               OpSource HLSL 500
+               OpName %Data "Data"
+               OpMemberName %Data 0 "sourceData"
+               OpName %g_data "g_data"
+               OpName %type_2d_image "type.2d.image"
+               OpName %g_inputTexture "g_inputTexture"
+               OpName %type_2d_image_0 "type.2d.image"
+               OpName %g_output "g_output"
+               OpName %csMain "csMain"
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorateString %gl_GlobalInvocationID UserSemantic "SV_DispatchThreadID"
+               OpDecorate %g_inputTexture DescriptorSet 0
+               OpDecorate %g_inputTexture Binding 0
+               OpDecorate %g_output DescriptorSet 0
+               OpDecorate %g_output Binding 0
+               OpDecorateString %g_inputTexture UserTypeGOOGLE "texture2d:<float4>"
+               OpDecorateString %g_output UserTypeGOOGLE "rwtexture2d:<uint>"
+        %int = OpTypeInt 32 1
+      %int_0 = OpConstant %int 0
+       %uint = OpTypeInt 32 0
+     %uint_0 = OpConstant %uint 0
+     %uint_4 = OpConstant %uint 4
+     %uint_1 = OpConstant %uint 1
+     %int_16 = OpConstant %int 16
+      %float = OpTypeFloat 32
+   %float_n1 = OpConstant %float -1
+    %v3float = OpTypeVector %float 3
+         %20 = OpConstantComposite %v3float %float_n1 %float_n1 %float_n1
+    %float_0 = OpConstant %float 0
+    %float_1 = OpConstant %float 1
+      %int_1 = OpConstant %int 1
+    %uint_64 = OpConstant %uint 64
+    %uint_16 = OpConstant %uint 16
+%_arr_v3float_uint_16 = OpTypeArray %v3float %uint_16
+       %Data = OpTypeStruct %_arr_v3float_uint_16
+%_arr_Data_uint_64 = OpTypeArray %Data %uint_64
+%_ptr_Workgroup__arr_Data_uint_64 = OpTypePointer Workgroup %_arr_Data_uint_64
+%type_2d_image = OpTypeImage %float 2D 2 0 0 1 Unknown
+%_ptr_UniformConstant_type_2d_image = OpTypePointer UniformConstant %type_2d_image
+%type_2d_image_0 = OpTypeImage %uint 2D 2 0 0 2 R32ui
+%_ptr_UniformConstant_type_2d_image_0 = OpTypePointer UniformConstant %type_2d_image_0
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+       %void = OpTypeVoid
+         %34 = OpTypeFunction %void
+     %v2uint = OpTypeVector %uint 2
+%_ptr_Function__arr_v3float_uint_16 = OpTypePointer Function %_arr_v3float_uint_16
+%_ptr_Workgroup__arr_v3float_uint_16 = OpTypePointer Workgroup %_arr_v3float_uint_16
+%_ptr_Function_v3float = OpTypePointer Function %v3float
+       %bool = OpTypeBool
+      %v3int = OpTypeVector %int 3
+      %v2int = OpTypeVector %int 2
+    %v4float = OpTypeVector %float 4
+%_ptr_Workgroup_v3float = OpTypePointer Workgroup %v3float
+     %g_data = OpVariable %_ptr_Workgroup__arr_Data_uint_64 Workgroup
+%g_inputTexture = OpVariable %_ptr_UniformConstant_type_2d_image UniformConstant
+   %g_output = OpVariable %_ptr_UniformConstant_type_2d_image_0 UniformConstant
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3uint Input
+     %csMain = OpFunction %void None %34 ; fills one g_data entry from the texture, copies its array locally, reduces it
+         %44 = OpLabel
+         %45 = OpVariable %_ptr_Function__arr_v3float_uint_16 Function ; function-local copy of one sourceData array
+         %46 = OpLoad %v3uint %gl_GlobalInvocationID
+         %47 = OpCompositeExtract %uint %46 0 ; gl_GlobalInvocationID.x, used as the g_data index
+               OpBranch %48
+         %48 = OpLabel ; outer loop header: %49 counts 0..3
+         %49 = OpPhi %uint %uint_0 %44 %50 %51
+         %52 = OpULessThan %bool %49 %uint_4
+               OpLoopMerge %53 %51 None
+               OpBranchConditional %52 %54 %53
+         %54 = OpLabel
+               OpBranch %55
+         %55 = OpLabel ; inner loop header: %56 counts 0..3
+         %56 = OpPhi %uint %uint_0 %54 %57 %58
+         %59 = OpULessThan %bool %56 %uint_4
+               OpLoopMerge %60 %58 None
+               OpBranchConditional %59 %58 %60
+         %58 = OpLabel ; inner body, also the inner loop's continue target
+         %61 = OpBitcast %v3int %46
+         %62 = OpBitcast %int %56
+         %63 = OpBitcast %int %49
+         %64 = OpCompositeConstruct %v3int %62 %63 %int_0 ; (inner, outer, 0) offset
+         %65 = OpIAdd %v3int %61 %64
+         %66 = OpVectorShuffle %v2int %65 %65 0 1
+         %67 = OpCompositeExtract %int %65 2
+         %68 = OpLoad %type_2d_image %g_inputTexture
+         %69 = OpImageFetch %v4float %68 %66 Lod %67 ; fetch at the xy of %65, with its z as the Lod
+         %70 = OpVectorShuffle %v3float %69 %69 0 1 2
+         %71 = OpIMul %uint %49 %uint_4
+         %72 = OpIAdd %uint %71 %56 ; outer * 4 + inner: flat index into the 16-element array
+         %73 = OpAccessChain %_ptr_Workgroup_v3float %g_data %47 %int_0 %72 ; g_data[%47].sourceData[%72]
+               OpStore %73 %70
+         %57 = OpIAdd %uint %56 %uint_1
+               OpBranch %55
+         %60 = OpLabel
+               OpBranch %51
+         %51 = OpLabel
+         %50 = OpIAdd %uint %49 %uint_1
+               OpBranch %48
+         %53 = OpLabel
+         %74 = OpAccessChain %_ptr_Workgroup__arr_v3float_uint_16 %g_data %47 %int_0 ; g_data[%47].sourceData as a whole array
+         %75 = OpLoad %_arr_v3float_uint_16 %74 ; load the whole Workgroup array ...
+               OpStore %45 %75 ; ... and copy it into the Function-storage array
+               OpBranch %76
+         %76 = OpLabel ; reduction loop header: %80 counts 0..15, %77 accumulates
+         %77 = OpPhi %uint %uint_0 %53 %78 %79
+         %80 = OpPhi %int %int_0 %53 %81 %79
+         %82 = OpSLessThan %bool %80 %int_16
+               OpLoopMerge %83 %79 None
+               OpBranchConditional %82 %79 %83
+         %79 = OpLabel
+         %84 = OpAccessChain %_ptr_Function_v3float %45 %80
+         %85 = OpLoad %v3float %84
+         %86 = OpDot %float %85 %20 ; %20 is the constant vector (-1, -1, -1)
+         %87 = OpExtInst %float %1 FClamp %86 %float_0 %float_1
+         %88 = OpConvertFToU %uint %87
+         %78 = OpBitwiseOr %uint %77 %88
+         %81 = OpIAdd %int %80 %int_1
+               OpBranch %76
+         %83 = OpLabel
+         %89 = OpVectorShuffle %v2uint %46 %46 0 1
+         %90 = OpLoad %type_2d_image_0 %g_output
+               OpImageWrite %90 %89 %77 None ; write the OR-accumulated value at gl_GlobalInvocationID.xy
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/image-atomic-mismatch-sign.asm.invalid.comp b/shaders-opencl-no-opt/asm/comp/image-atomic-mismatch-sign.asm.invalid.comp
new file mode 100644
index 000000000..3817a6152
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/image-atomic-mismatch-sign.asm.invalid.comp
@@ -0,0 +1,71 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 11
+; Bound: 45
+; Schema: 0
+               OpCapability Shader
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main"
+               OpExecutionMode %main LocalSize 1 1 1
+               OpSource ESSL 310
+               OpSourceExtension "GL_OES_shader_image_atomic"
+               OpName %main "main"
+               OpName %uImage "uImage"
+               OpName %uImageArray "uImageArray"
+               OpName %iImage "iImage"
+               OpName %iImageArray "iImageArray"
+               OpDecorate %uImage DescriptorSet 0
+               OpDecorate %uImage Binding 0
+               OpDecorate %uImageArray DescriptorSet 0
+               OpDecorate %uImageArray Binding 2
+               OpDecorate %iImage DescriptorSet 0
+               OpDecorate %iImage Binding 1
+               OpDecorate %iImageArray DescriptorSet 0
+               OpDecorate %iImageArray Binding 3
+               OpDecorate %gl_WorkGroupSize BuiltIn WorkgroupSize
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+       %uint = OpTypeInt 32 0
+          %7 = OpTypeImage %uint 2D 0 0 0 2 R32ui
+%_ptr_UniformConstant_7 = OpTypePointer UniformConstant %7
+     %uImage = OpVariable %_ptr_UniformConstant_7 UniformConstant
+        %int = OpTypeInt 32 1
+      %v2int = OpTypeVector %int 2
+      %int_1 = OpConstant %int 1
+      %int_5 = OpConstant %int 5
+         %14 = OpConstantComposite %v2int %int_1 %int_5
+     %uint_1 = OpConstant %uint 1
+     %uint_0 = OpConstant %uint 0
+%_ptr_Image_uint = OpTypePointer Image %uint
+         %20 = OpTypeImage %uint 2D 0 1 0 2 R32ui
+%_ptr_UniformConstant_20 = OpTypePointer UniformConstant %20
+%uImageArray = OpVariable %_ptr_UniformConstant_20 UniformConstant
+      %v3int = OpTypeVector %int 3
+      %int_4 = OpConstant %int 4
+         %25 = OpConstantComposite %v3int %int_1 %int_5 %int_4
+         %28 = OpTypeImage %int 2D 0 0 0 2 R32i
+%_ptr_UniformConstant_28 = OpTypePointer UniformConstant %28
+     %iImage = OpVariable %_ptr_UniformConstant_28 UniformConstant
+      %int_6 = OpConstant %int 6
+         %32 = OpConstantComposite %v2int %int_1 %int_6
+%_ptr_Image_int = OpTypePointer Image %int
+         %36 = OpTypeImage %int 2D 0 1 0 2 R32i
+%_ptr_UniformConstant_36 = OpTypePointer UniformConstant %36
+%iImageArray = OpVariable %_ptr_UniformConstant_36 UniformConstant
+      %int_9 = OpConstant %int 9
+         %40 = OpConstantComposite %v3int %int_1 %int_6 %int_9
+     %v3uint = OpTypeVector %uint 3
+%gl_WorkGroupSize = OpConstantComposite %v3uint %uint_1 %uint_1 %uint_1
+       %main = OpFunction %void None %3 ; entry point: four image atomics whose opcode signedness differs from the texel type
+          %5 = OpLabel
+         %18 = OpImageTexelPointer %_ptr_Image_uint %uImage %14 %uint_0 ; texel of uImage (R32ui) at constant coord %14 = (1, 5)
+         %19 = OpAtomicSMin %uint %18 %uint_1 %uint_0 %uint_1 ; signed min applied to a uint texel
+         %26 = OpImageTexelPointer %_ptr_Image_uint %uImageArray %25 %uint_0 ; texel of uImageArray (R32ui, arrayed) at %25 = (1, 5, 4)
+         %27 = OpAtomicSMax %uint %26 %uint_1 %uint_0 %uint_1 ; signed max applied to a uint texel
+         %34 = OpImageTexelPointer %_ptr_Image_int %iImage %32 %uint_0 ; texel of iImage (R32i) at %32 = (1, 6)
+         %35 = OpAtomicUMin %int %34 %uint_1 %uint_0 %int_1 ; unsigned min applied to an int texel
+         %41 = OpImageTexelPointer %_ptr_Image_int %iImageArray %40 %uint_0 ; texel of iImageArray (R32i, arrayed) at %40 = (1, 6, 9)
+         %42 = OpAtomicUMax %int %41 %uint_1 %uint_0 %int_1 ; unsigned max applied to an int texel
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/local-size-id-override.asm.comp b/shaders-opencl-no-opt/asm/comp/local-size-id-override.asm.comp
new file mode 100644
index 000000000..2eaef4bdb
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/local-size-id-override.asm.comp
@@ -0,0 +1,60 @@
+               OpCapability Shader
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main" %gl_GlobalInvocationID
+               OpExecutionModeId %main LocalSizeId %spec_3 %spec_4 %uint_2
+               OpSource GLSL 450
+               OpName %main "main"
+               OpName %SSBO "SSBO"
+               OpMemberName %SSBO 0 "values"
+               OpName %_ ""
+               OpName %gl_GlobalInvocationID "gl_GlobalInvocationID"
+               OpDecorate %_runtimearr_v4float ArrayStride 16
+               OpMemberDecorate %SSBO 0 Offset 0
+               OpDecorate %SSBO Block
+               OpDecorate %_ DescriptorSet 0
+               OpDecorate %_ Binding 0
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %spec_1 SpecId 1
+               OpDecorate %spec_2 SpecId 2
+               OpDecorate %spec_3 SpecId 3
+               OpDecorate %spec_4 SpecId 4
+               OpDecorate %gl_WorkGroupSize BuiltIn WorkgroupSize
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+      %float = OpTypeFloat 32
+    %v4float = OpTypeVector %float 4
+%_runtimearr_v4float = OpTypeRuntimeArray %v4float
+       %SSBO = OpTypeStruct %_runtimearr_v4float
+%_ptr_Uniform_SSBO = OpTypePointer StorageBuffer %SSBO
+          %_ = OpVariable %_ptr_Uniform_SSBO StorageBuffer
+        %int = OpTypeInt 32 1
+      %int_0 = OpConstant %int 0
+       %uint = OpTypeInt 32 0
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3uint Input
+     %uint_0 = OpConstant %uint 0
+%_ptr_Input_uint = OpTypePointer Input %uint
+    %float_2 = OpConstant %float 2
+%_ptr_Uniform_v4float = OpTypePointer StorageBuffer %v4float
+         %spec_1 = OpSpecConstant %uint 11
+         %spec_2 = OpSpecConstant %uint 12
+         %spec_3 = OpSpecConstant %uint 13
+         %spec_4 = OpSpecConstant %uint 14
+     %uint_1 = OpConstant %uint 1
+     %uint_2 = OpConstant %uint 2
+     %uint_3 = OpConstant %uint 3
+%gl_WorkGroupSize = OpSpecConstantComposite %v3uint %uint_3 %spec_1 %spec_2
+       %main = OpFunction %void None %3 ; entry point: adds 2.0 to values[gl_GlobalInvocationID.x] in place
+          %5 = OpLabel
+         %20 = OpAccessChain %_ptr_Input_uint %gl_GlobalInvocationID %uint_0 ; pointer to component 0 (x) of the invocation id
+         %21 = OpLoad %uint %20 ; element index used for both the load and the store below
+         %24 = OpAccessChain %_ptr_Uniform_v4float %_ %int_0 %21 ; SSBO member 0 ("values"), element %21
+         %25 = OpLoad %v4float %24
+         %26 = OpCompositeConstruct %v4float %float_2 %float_2 %float_2 %float_2 ; (2, 2, 2, 2)
+         %27 = OpFAdd %v4float %25 %26
+         %28 = OpAccessChain %_ptr_Uniform_v4float %_ %int_0 %21 ; same element as %24
+               OpStore %28 %27
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/local-size-id.asm.invalid.comp b/shaders-opencl-no-opt/asm/comp/local-size-id.asm.invalid.comp
new file mode 100644
index 000000000..3031f4bb8
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/local-size-id.asm.invalid.comp
@@ -0,0 +1,76 @@
+               OpCapability Shader
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main" %gl_GlobalInvocationID
+               OpExecutionModeId %main LocalSizeId %spec_3 %spec_4 %uint_2
+               OpSource GLSL 450
+               OpName %main "main"
+               OpName %SSBO "SSBO"
+               OpMemberName %SSBO 0 "values"
+               OpName %_ ""
+               OpName %gl_GlobalInvocationID "gl_GlobalInvocationID"
+               OpDecorate %_runtimearr_v4float ArrayStride 16
+               OpMemberDecorate %SSBO 0 Offset 0
+               OpDecorate %SSBO Block
+               OpDecorate %_ DescriptorSet 0
+               OpDecorate %_ Binding 0
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %spec_1 SpecId 1
+               OpDecorate %spec_2 SpecId 2
+               OpDecorate %spec_3 SpecId 3
+               OpDecorate %spec_4 SpecId 4
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+      %float = OpTypeFloat 32
+     %v3float = OpTypeVector %float 3
+    %v4float = OpTypeVector %float 4
+%_runtimearr_v4float = OpTypeRuntimeArray %v4float
+       %SSBO = OpTypeStruct %_runtimearr_v4float
+%_ptr_Uniform_SSBO = OpTypePointer StorageBuffer %SSBO
+          %_ = OpVariable %_ptr_Uniform_SSBO StorageBuffer
+        %int = OpTypeInt 32 1
+      %int_0 = OpConstant %int 0
+       %uint = OpTypeInt 32 0
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3uint Input
+     %uint_0 = OpConstant %uint 0
+%_ptr_Input_uint = OpTypePointer Input %uint
+    %float_2 = OpConstant %float 2
+%_ptr_Uniform_v4float = OpTypePointer StorageBuffer %v4float
+		; Test that we can declare the spec constant as signed.
+		; Needs implicit bitcast since WorkGroupSize is uint.
+         %spec_1 = OpSpecConstant %int 11
+         %spec_2 = OpSpecConstant %int 12
+         %spec_3 = OpSpecConstant %int 13
+         %spec_4 = OpSpecConstant %int 14
+     %uint_1 = OpConstant %uint 1
+     %uint_2 = OpConstant %uint 2
+     %uint_3 = OpConstant %uint 3
+	 ; Test that we can build spec constant composites out of local size id values.
+	 ; Needs special case handling.
+	 %spec_3_op = OpSpecConstantOp %uint IAdd %spec_3 %uint_3
+%WorkGroupSize = OpSpecConstantComposite %v3uint %spec_3_op %spec_4 %uint_2
+       %main = OpFunction %void None %3 ; entry point: values[gl_GlobalInvocationID.x] = (values[x] + 2 + WorkGroupSize.xyzz) * spec_3 * spec_4 * 2
+          %5 = OpLabel
+         %20 = OpAccessChain %_ptr_Input_uint %gl_GlobalInvocationID %uint_0 ; pointer to component 0 (x) of the invocation id
+         %21 = OpLoad %uint %20
+         %24 = OpAccessChain %_ptr_Uniform_v4float %_ %int_0 %21 ; SSBO member 0 ("values"), element %21
+         %25 = OpLoad %v4float %24
+         %26 = OpCompositeConstruct %v4float %float_2 %float_2 %float_2 %float_2
+         %27 = OpFAdd %v4float %25 %26
+		 %wg_f = OpConvertUToF %v3float %WorkGroupSize ; the whole spec-constant composite, converted to float3
+		 %wg_f4 = OpVectorShuffle %v4float %wg_f %wg_f 0 1 2 2 ; widened to float4 as (x, y, z, z)
+	 ; Test that we can use the LocalSizeId spec constants directly (%f0..%f2 below), which needs to translate to gl_WorkGroupSize.elem.
+	 ; Needs special case handling.
+		 %res = OpFAdd %v4float %27 %wg_f4
+		 %f0 = OpConvertSToF %float %spec_3 ; %spec_3 is the first LocalSizeId operand (x)
+		 %f1 = OpConvertSToF %float %spec_4 ; %spec_4 is the second LocalSizeId operand (y)
+		 %f2 = OpConvertSToF %float %uint_2 ; %uint_2 is the third LocalSizeId operand (z); a uint constant fed to a signed convert
+		 %res1 = OpVectorTimesScalar %v4float %res %f0
+		 %res2 = OpVectorTimesScalar %v4float %res1 %f1
+		 %res3 = OpVectorTimesScalar %v4float %res2 %f2
+         %28 = OpAccessChain %_ptr_Uniform_v4float %_ %int_0 %21 ; same element as %24
+               OpStore %28 %res3
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/modf-storage-class.asm.comp b/shaders-opencl-no-opt/asm/comp/modf-storage-class.asm.comp
new file mode 100644
index 000000000..126b01e46
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/modf-storage-class.asm.comp
@@ -0,0 +1,116 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos SPIR-V Tools Assembler; 0
+; Bound: 91
+; Schema: 0
+               OpCapability Shader
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %2 "main"
+               OpExecutionMode %2 LocalSize 1 1 1
+               OpDecorate %_arr_v2uint_uint_324 ArrayStride 8
+               OpMemberDecorate %_struct_6 0 NonWritable
+               OpMemberDecorate %_struct_6 0 Offset 0
+               OpDecorate %_struct_6 BufferBlock
+               OpDecorate %7 DescriptorSet 0
+               OpDecorate %7 Binding 0
+               OpDecorate %_arr_v2float_uint_648 ArrayStride 8
+               OpMemberDecorate %_struct_9 0 Offset 0
+               OpDecorate %_struct_9 BufferBlock
+               OpDecorate %11 DescriptorSet 0
+               OpDecorate %11 Binding 1
+               OpDecorate %_arr_v2float_uint_648_0 ArrayStride 8
+               OpMemberDecorate %_struct_13 0 Offset 0
+               OpDecorate %_struct_13 BufferBlock
+               OpDecorate %14 DescriptorSet 0
+               OpDecorate %14 Binding 2
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+      %float = OpTypeFloat 32
+    %v2float = OpTypeVector %float 2
+%_ptr_Function_v2float = OpTypePointer Function %v2float
+ %_struct_19 = OpTypeStruct %v2float %v2float
+         %10 = OpTypeFunction %_struct_19 %_ptr_Function_v2float
+%_ptr_Function__struct_19 = OpTypePointer Function %_struct_19
+       %uint = OpTypeInt 32 0
+%_ptr_Function_uint = OpTypePointer Function %uint
+     %uint_0 = OpConstant %uint 0
+   %uint_648 = OpConstant %uint 648
+       %bool = OpTypeBool
+     %v2uint = OpTypeVector %uint 2
+%_ptr_Function_v2uint = OpTypePointer Function %v2uint
+   %uint_324 = OpConstant %uint 324
+%_arr_v2uint_uint_324 = OpTypeArray %v2uint %uint_324
+  %_struct_6 = OpTypeStruct %_arr_v2uint_uint_324
+%_ptr_Uniform__struct_6 = OpTypePointer Uniform %_struct_6
+          %7 = OpVariable %_ptr_Uniform__struct_6 Uniform
+        %int = OpTypeInt 32 1
+      %int_0 = OpConstant %int 0
+     %uint_2 = OpConstant %uint 2
+%_ptr_Uniform_v2uint = OpTypePointer Uniform %v2uint
+%_arr_v2float_uint_648 = OpTypeArray %v2float %uint_648
+  %_struct_9 = OpTypeStruct %_arr_v2float_uint_648
+%_ptr_Uniform__struct_9 = OpTypePointer Uniform %_struct_9
+         %11 = OpVariable %_ptr_Uniform__struct_9 Uniform
+     %uint_1 = OpConstant %uint 1
+%_ptr_Uniform_v2float = OpTypePointer Uniform %v2float
+%_arr_v2float_uint_648_0 = OpTypeArray %v2float %uint_648
+ %_struct_13 = OpTypeStruct %_arr_v2float_uint_648_0
+%_ptr_Uniform__struct_13 = OpTypePointer Uniform %_struct_13
+         %14 = OpVariable %_ptr_Uniform__struct_13 Uniform
+      %int_1 = OpConstant %int 1
+          %2 = OpFunction %void None %3 ; entry point: loop with counter %46 = 0, 2, ... < 648 exercising Modf and ModfStruct
+          %5 = OpLabel
+         %46 = OpVariable %_ptr_Function_uint Function ; loop counter
+         %47 = OpVariable %_ptr_Function_v2uint Function ; raw uint2 input for this iteration
+         %48 = OpVariable %_ptr_Function_v2float Function ; the same bits viewed as float2
+         %50 = OpVariable %_ptr_Function__struct_19 Function ; holds the ModfStruct result
+               OpStore %46 %uint_0
+               OpBranch %30
+         %30 = OpLabel
+               OpLoopMerge %32 %33 None ; merge block %32, continue block %33
+               OpBranch %34
+         %34 = OpLabel
+         %35 = OpLoad %uint %46
+         %38 = OpULessThan %bool %35 %uint_648 ; keep looping while counter < 648
+               OpBranchConditional %38 %31 %32
+         %31 = OpLabel
+         %49 = OpLoad %uint %46
+         %51 = OpUDiv %uint %49 %uint_2 ; counter / 2 indexes the 324-element uint2 array
+         %53 = OpAccessChain %_ptr_Uniform_v2uint %7 %int_0 %51 ; input buffer %7 (member 0 is NonWritable)
+         %54 = OpLoad %v2uint %53
+               OpStore %47 %54
+         %56 = OpLoad %v2uint %47
+         %57 = OpBitcast %v2float %56 ; reinterpret the uint2 bits as float2
+               OpStore %48 %57
+         %62 = OpLoad %uint %46
+         %64 = OpIAdd %uint %62 %uint_1 ; counter + 1
+         %65 = OpLoad %v2float %48
+         %66 = OpLoad %uint %46
+         %68 = OpAccessChain %_ptr_Uniform_v2float %11 %int_0 %66 ; Uniform-storage pointer into buffer %11, element [counter]
+         %69 = OpExtInst %v2float %1 Modf %65 %68 ; Modf's pointer operand is %68, i.e. a buffer pointer rather than a Function variable
+         %70 = OpAccessChain %_ptr_Uniform_v2float %11 %int_0 %64 ; buffer %11, element [counter + 1]
+               OpStore %70 %69 ; the Modf return value goes to element [counter + 1]
+         %73 = OpLoad %v2float %48
+         %74 = OpExtInst %_struct_19 %1 ModfStruct %73 ; struct-returning variant, no pointer operand
+               OpStore %50 %74
+         %79 = OpLoad %uint %46
+         %81 = OpAccessChain %_ptr_Function_v2float %50 %int_1 ; member 1 of the ModfStruct result
+         %82 = OpLoad %v2float %81
+         %83 = OpAccessChain %_ptr_Uniform_v2float %14 %int_0 %79 ; buffer %14, element [counter]
+               OpStore %83 %82
+         %84 = OpLoad %uint %46
+         %85 = OpIAdd %uint %84 %uint_1
+         %86 = OpAccessChain %_ptr_Function_v2float %50 %int_0 ; member 0 of the ModfStruct result
+         %87 = OpLoad %v2float %86
+         %88 = OpAccessChain %_ptr_Uniform_v2float %14 %int_0 %85 ; buffer %14, element [counter + 1]
+               OpStore %88 %87
+               OpBranch %33
+         %33 = OpLabel
+         %89 = OpLoad %uint %46
+         %90 = OpIAdd %uint %89 %uint_2 ; counter += 2
+               OpStore %46 %90
+               OpBranch %30
+         %32 = OpLabel
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/opptrdiff-basic.spv14.invalid.asm.comp b/shaders-opencl-no-opt/asm/comp/opptrdiff-basic.spv14.invalid.asm.comp
new file mode 100644
index 000000000..8319dfdb6
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/opptrdiff-basic.spv14.invalid.asm.comp
@@ -0,0 +1,98 @@
+; SPIR-V
+; Version: 1.4
+; Generator: Khronos SPIR-V Tools Assembler; 0
+; Bound: 59
+; Schema: 0
+               OpCapability Shader
+               OpCapability VariablePointersStorageBuffer
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %1 "main" %2 %3 %4 %5 %6
+               OpExecutionMode %1 LocalSize 4 1 1
+               OpDecorate %7 Block
+               OpMemberDecorate %7 0 Offset 0
+               OpDecorate %8 ArrayStride 16
+               OpDecorate %9 Block
+               OpMemberDecorate %9 0 Offset 0
+               OpDecorate %10 ArrayStride 68
+               OpDecorate %11 Block
+               OpMemberDecorate %11 0 Offset 0
+               OpDecorate %12 ArrayStride 4
+               OpDecorate %13 ArrayStride 4
+               OpDecorate %2 DescriptorSet 0
+               OpDecorate %2 Binding 0
+               OpDecorate %3 DescriptorSet 0
+               OpDecorate %3 Binding 1
+               OpDecorate %4 DescriptorSet 0
+               OpDecorate %4 Binding 2
+               OpDecorate %5 BuiltIn LocalInvocationId
+               OpDecorate %6 BuiltIn WorkgroupId
+         %14 = OpTypeVoid
+         %15 = OpTypeBool
+         %16 = OpTypeInt 32 1
+         %17 = OpConstant %16 0
+         %18 = OpConstant %16 1
+         %19 = OpConstant %16 4
+         %20 = OpConstant %16 16
+         %21 = OpConstant %16 17
+         %22 = OpTypeVector %16 3
+         %23 = OpTypePointer Input %22
+         %12 = OpTypeArray %16 %19
+          %8 = OpTypeRuntimeArray %12
+          %7 = OpTypeStruct %8
+         %24 = OpTypePointer StorageBuffer %7
+         %25 = OpTypePointer StorageBuffer %12
+         %13 = OpTypeArray %16 %21
+         %10 = OpTypeRuntimeArray %13
+          %9 = OpTypeStruct %10
+         %26 = OpTypePointer StorageBuffer %9
+         %27 = OpTypePointer StorageBuffer %13
+         %28 = OpTypePointer StorageBuffer %16
+         %11 = OpTypeStruct %16
+         %29 = OpTypePointer Uniform %11
+         %30 = OpTypePointer Uniform %16
+          %2 = OpVariable %24 StorageBuffer
+          %3 = OpVariable %26 StorageBuffer
+          %4 = OpVariable %29 Uniform
+          %5 = OpVariable %23 Input
+          %6 = OpVariable %23 Input
+         %31 = OpTypeFunction %14
+          %1 = OpFunction %14 None %31 ; entry point: writes OpPtrDiff results for pointers into buffer %2 out to buffer %3
+         %32 = OpLabel
+         %33 = OpAccessChain %30 %4 %17
+         %34 = OpLoad %16 %33 ; the single int member of Uniform block %4
+         %35 = OpLoad %22 %6
+         %36 = OpCompositeExtract %16 %35 0 ; WorkgroupId.x
+         %37 = OpLoad %22 %5
+         %38 = OpCompositeExtract %16 %37 0 ; LocalInvocationId.x
+         %39 = OpAccessChain %25 %2 %17 %17 ; pointer to int[4] element 0 of buffer %2
+         %40 = OpAccessChain %25 %2 %17 %36 ; pointer to int[4] element [WorkgroupId.x] of buffer %2
+         %41 = OpSGreaterThanEqual %15 %36 %34 ; WorkgroupId.x >= uniform value
+               OpSelectionMerge %42 None
+               OpBranchConditional %41 %43 %42 ; early return when the comparison holds
+         %43 = OpLabel
+               OpReturn
+         %42 = OpLabel
+         %44 = OpIEqual %15 %38 %18 ; LocalInvocationId.x == 1
+               OpSelectionMerge %45 None
+               OpBranchConditional %44 %46 %45
+         %46 = OpLabel
+         %47 = OpPtrDiff %16 %40 %39 ; difference between two int[4] pointers (array stride 16)
+         %48 = OpAccessChain %28 %3 %17 %36 %20 ; buffer %3, row [WorkgroupId.x], element 16 of the int[17]
+               OpStore %48 %47
+               OpBranch %45
+         %45 = OpLabel ; single-block loop: this block is also its own continue target
+         %49 = OpPhi %16 %17 %42 %17 %46 %50 %45 ; loop index: 0 from %42 or %46, %50 on the back edge
+         %50 = OpIAdd %16 %49 %18 ; index + 1
+         %51 = OpIEqual %15 %50 %19 ; exit once index + 1 == 4
+         %52 = OpIMul %16 %38 %19
+         %53 = OpIAdd %16 %52 %49 ; LocalInvocationId.x * 4 + index
+         %54 = OpAccessChain %28 %40 %38 ; int [LocalInvocationId.x] inside the int[4] at %40
+         %55 = OpAccessChain %28 %40 %49 ; int [index] inside the same int[4]
+         %56 = OpPtrDiff %16 %54 %55 ; difference between two int pointers into the same array
+         %57 = OpAccessChain %28 %3 %17 %36 %53 ; buffer %3, row [WorkgroupId.x], element %53
+               OpStore %57 %56
+               OpLoopMerge %58 %45 None
+               OpBranchConditional %51 %58 %45
+         %58 = OpLabel
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/opptrdiff-opptraccesschain-elem-offset.spv14.asm.comp b/shaders-opencl-no-opt/asm/comp/opptrdiff-opptraccesschain-elem-offset.spv14.asm.comp
new file mode 100644
index 000000000..856649195
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/opptrdiff-opptraccesschain-elem-offset.spv14.asm.comp
@@ -0,0 +1,79 @@
+; SPIR-V
+; Version: 1.4
+; Generator: Khronos SPIR-V Tools Assembler; 0
+; Bound: 46
+; Schema: 0
+               OpCapability Shader
+               OpCapability VariablePointersStorageBuffer
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %1 "main" %2 %3 %4 %5
+               OpExecutionMode %1 LocalSize 1 1 1
+               OpDecorate %6 ArrayStride 4
+               OpDecorate %7 Block
+               OpMemberDecorate %7 0 Offset 0
+               OpMemberDecorate %7 1 Offset 4
+               OpDecorate %2 DescriptorSet 0
+               OpDecorate %2 Binding 0
+               OpDecorate %8 ArrayStride 8
+               OpDecorate %9 Block
+               OpMemberDecorate %9 0 Offset 0
+               OpDecorate %3 DescriptorSet 0
+               OpDecorate %3 Binding 1
+               OpDecorate %10 ArrayStride 4
+         %11 = OpTypeVoid
+         %12 = OpTypeBool
+         %13 = OpTypeInt 32 1
+         %14 = OpConstant %13 -1
+         %15 = OpConstant %13 0
+         %16 = OpConstant %13 1
+         %17 = OpConstant %13 2
+         %18 = OpConstant %13 3
+         %19 = OpTypeVector %13 2
+          %6 = OpTypeRuntimeArray %13
+          %7 = OpTypeStruct %13 %6
+         %20 = OpTypePointer StorageBuffer %7
+          %2 = OpVariable %20 StorageBuffer
+          %8 = OpTypeRuntimeArray %19
+          %9 = OpTypeStruct %8
+         %21 = OpTypePointer StorageBuffer %9
+          %3 = OpVariable %21 StorageBuffer
+         %10 = OpTypePointer StorageBuffer %13
+         %22 = OpTypePointer Private %10
+          %4 = OpVariable %22 Private
+          %5 = OpVariable %22 Private
+         %23 = OpTypePointer StorageBuffer %13
+         %24 = OpTypePointer StorageBuffer %19
+         %25 = OpTypeFunction %11
+          %1 = OpFunction %11 None %25 ; entry point: walks two Private pointer variables toward each other and records OpPtrDiff both ways
+         %26 = OpLabel
+         %27 = OpAccessChain %23 %2 %15
+         %28 = OpLoad %13 %27 ; member 0 of buffer %2, used as element count and loop bound
+         %29 = OpAccessChain %10 %2 %16 %15 ; pointer to element 0 of member 1 (the runtime int array)
+               OpStore %4 %29 ; Private pointer %4 starts at element 0
+         %30 = OpPtrAccessChain %10 %29 %28 ; %29 advanced by %28 elements
+               OpStore %5 %30 ; Private pointer %5 starts %28 elements further on
+         %31 = OpSLessThanEqual %12 %28 %15 ; %28 <= 0
+               OpSelectionMerge %32 None
+               OpBranchConditional %31 %32 %33 ; skip the loop entirely when %28 <= 0
+         %33 = OpLabel ; single-block loop: this block is also its own continue target
+         %34 = OpPhi %13 %15 %26 %35 %33 ; iteration counter: 0 on entry, %35 on the back edge
+         %36 = OpLoad %10 %4
+         %37 = OpLoad %10 %5
+         %38 = OpPtrAccessChain %10 %36 %16 ; %4's pointer advanced by +1 element
+         %39 = OpPtrAccessChain %10 %37 %14 ; %5's pointer moved by -1 element
+         %35 = OpIAdd %13 %34 %16
+               OpStore %4 %38
+               OpStore %5 %39
+         %40 = OpPtrDiff %13 %36 %37 ; uses the pointers loaded before this iteration's update
+         %41 = OpPtrDiff %13 %37 %36 ; same pair with the operands swapped
+         %42 = OpCompositeConstruct %19 %40 %41
+         %43 = OpAccessChain %24 %3 %15 %34 ; output buffer %3, element [counter]
+               OpStore %43 %42
+         %44 = OpSGreaterThanEqual %12 %34 %28 ; exit when counter >= %28
+               OpLoopMerge %45 %33 None
+               OpBranchConditional %44 %45 %33
+         %45 = OpLabel
+               OpBranch %32
+         %32 = OpLabel
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/opptrequal-basic.spv14.asm.comp b/shaders-opencl-no-opt/asm/comp/opptrequal-basic.spv14.asm.comp
new file mode 100644
index 000000000..5a97976ce
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/opptrequal-basic.spv14.asm.comp
@@ -0,0 +1,96 @@
+; SPIR-V
+; Version: 1.4
+; Generator: Khronos SPIR-V Tools Assembler; 0
+; Bound: 64
+; Schema: 0
+               OpCapability Shader
+               OpCapability VariablePointers
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %1 "main" %2 %3 %4 %5
+               OpExecutionMode %1 LocalSize 1 1 1
+               OpDecorate %6 ArrayStride 4
+               OpDecorate %7 Block
+               OpMemberDecorate %7 0 Offset 0
+               OpDecorate %2 DescriptorSet 0
+               OpDecorate %2 Binding 0
+               OpDecorate %3 DescriptorSet 0
+               OpDecorate %3 Binding 1
+               OpDecorate %4 DescriptorSet 0
+               OpDecorate %4 Binding 2
+               OpDecorate %5 DescriptorSet 0
+               OpDecorate %5 Binding 3
+          %8 = OpTypeVoid
+          %9 = OpTypeBool
+         %10 = OpTypeInt 32 0
+         %11 = OpConstant %10 0
+         %12 = OpConstant %10 1
+          %6 = OpTypeRuntimeArray %10
+          %7 = OpTypeStruct %6
+         %13 = OpTypePointer StorageBuffer %7
+         %14 = OpTypePointer StorageBuffer %6
+         %15 = OpTypePointer StorageBuffer %10
+          %2 = OpVariable %13 StorageBuffer
+          %3 = OpVariable %13 StorageBuffer
+          %4 = OpVariable %13 StorageBuffer
+          %5 = OpVariable %13 StorageBuffer
+         %16 = OpTypeFunction %8
+          %1 = OpFunction %8 None %16 ; entry point: ten OpPtrEqual comparisons, each written as 1 (equal) or 0 to consecutive slots of buffer %5
+         %17 = OpLabel
+         %18 = OpCopyObject %10 %11 ; output index, starts at 0
+         %19 = OpAccessChain %14 %2 %11 ; runtime array of %2
+         %20 = OpAccessChain %15 %2 %11 %11 ; element 0 of %2
+         %21 = OpAccessChain %14 %3 %11 ; runtime array of %3
+         %22 = OpAccessChain %15 %3 %11 %11 ; element 0 of %3
+         %23 = OpAccessChain %14 %4 %11 ; runtime array of %4
+         %24 = OpAccessChain %15 %4 %11 %11 ; element 0 of %4
+         %25 = OpPtrEqual %9 %2 %3 ; variables %2 vs %3
+         %26 = OpSelect %10 %25 %12 %11 ; 1 if equal, else 0
+         %27 = OpAccessChain %15 %5 %11 %18
+         %28 = OpIAdd %10 %18 %12 ; next output index
+               OpStore %27 %26
+         %29 = OpPtrEqual %9 %19 %21 ; arrays of %2 vs %3
+         %30 = OpSelect %10 %29 %12 %11
+         %31 = OpAccessChain %15 %5 %11 %28
+         %32 = OpIAdd %10 %28 %12
+               OpStore %31 %30
+         %33 = OpPtrEqual %9 %20 %22 ; element 0 of %2 vs %3
+         %34 = OpSelect %10 %33 %12 %11
+         %35 = OpAccessChain %15 %5 %11 %32
+         %36 = OpIAdd %10 %32 %12
+               OpStore %35 %34
+         %37 = OpPtrEqual %9 %2 %4 ; variables %2 vs %4
+         %38 = OpSelect %10 %37 %12 %11
+         %39 = OpAccessChain %15 %5 %11 %36
+         %40 = OpIAdd %10 %36 %12
+               OpStore %39 %38
+         %41 = OpPtrEqual %9 %19 %23 ; arrays of %2 vs %4
+         %42 = OpSelect %10 %41 %12 %11
+         %43 = OpAccessChain %15 %5 %11 %40
+         %44 = OpIAdd %10 %40 %12
+               OpStore %43 %42
+         %45 = OpPtrEqual %9 %20 %24 ; element 0 of %2 vs %4
+         %46 = OpSelect %10 %45 %12 %11
+         %47 = OpAccessChain %15 %5 %11 %44
+         %48 = OpIAdd %10 %44 %12
+               OpStore %47 %46
+         %49 = OpPtrEqual %9 %3 %4 ; variables %3 vs %4
+         %50 = OpSelect %10 %49 %12 %11
+         %51 = OpAccessChain %15 %5 %11 %48
+         %52 = OpIAdd %10 %48 %12
+               OpStore %51 %50
+         %53 = OpPtrEqual %9 %21 %23 ; arrays of %3 vs %4
+         %54 = OpSelect %10 %53 %12 %11
+         %55 = OpAccessChain %15 %5 %11 %52
+         %56 = OpIAdd %10 %52 %12
+               OpStore %55 %54
+         %57 = OpPtrEqual %9 %22 %24 ; element 0 of %3 vs %4
+         %58 = OpSelect %10 %57 %12 %11
+         %59 = OpAccessChain %15 %5 %11 %56
+         %60 = OpIAdd %10 %56 %12
+               OpStore %59 %58
+         %61 = OpPtrEqual %9 %2 %2 ; variable %2 compared with itself
+         %62 = OpSelect %10 %61 %12 %11
+         %63 = OpAccessChain %15 %5 %11 %60
+               OpStore %63 %62
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/opptrequal-row-maj-mtx-bypass-transpose.spv14.asm.invalid.comp b/shaders-opencl-no-opt/asm/comp/opptrequal-row-maj-mtx-bypass-transpose.spv14.asm.invalid.comp
new file mode 100644
index 000000000..89813b226
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/opptrequal-row-maj-mtx-bypass-transpose.spv14.asm.invalid.comp
@@ -0,0 +1,98 @@
+; SPIR-V
+; Version: 1.4
+; Generator: Khronos SPIR-V Tools Assembler; 0
+; Bound: 63
+; Schema: 0
+               OpCapability Shader
+               OpCapability VariablePointersStorageBuffer
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %1 "main" %2 %3 %4
+               OpExecutionMode %1 LocalSize 1 1 1
+               OpDecorate %5 ArrayStride 4
+               OpDecorate %6 Block
+               OpDecorate %7 Block
+               OpMemberDecorate %6 0 ColMajor
+               OpMemberDecorate %6 0 Offset 0
+               OpMemberDecorate %6 0 MatrixStride 16
+               OpMemberDecorate %6 1 RowMajor
+               OpMemberDecorate %6 1 Offset 64
+               OpMemberDecorate %6 1 MatrixStride 16
+               OpMemberDecorate %6 2 Offset 128
+               OpMemberDecorate %6 3 Offset 132
+               OpMemberDecorate %7 0 Offset 0
+               OpDecorate %2 DescriptorSet 0
+               OpDecorate %2 Binding 0
+               OpDecorate %3 DescriptorSet 0
+               OpDecorate %3 Binding 1
+               OpDecorate %4 DescriptorSet 0
+               OpDecorate %4 Binding 2
+          %8 = OpTypeVoid
+          %9 = OpTypeBool
+         %10 = OpTypeInt 32 0
+         %11 = OpConstant %10 0
+         %12 = OpConstant %10 1
+         %13 = OpConstant %10 2
+         %14 = OpConstant %10 3
+         %15 = OpTypeFloat 32
+          %5 = OpTypeRuntimeArray %10
+         %16 = OpTypeVector %15 4
+         %17 = OpTypeMatrix %16 4
+          %6 = OpTypeStruct %17 %17 %15 %15
+          %7 = OpTypeStruct %5
+         %18 = OpTypePointer StorageBuffer %6
+         %19 = OpTypePointer StorageBuffer %7
+         %20 = OpTypePointer StorageBuffer %17
+         %21 = OpTypePointer StorageBuffer %10
+         %22 = OpTypePointer StorageBuffer %15
+         %23 = OpTypePointer StorageBuffer %16
+          %2 = OpVariable %18 StorageBuffer
+          %3 = OpVariable %18 StorageBuffer
+          %4 = OpVariable %19 StorageBuffer
+         %24 = OpTypeFunction %8
+          %1 = OpFunction %8 None %24 ; entry point: OpPtrEqual on struct members, matrices, columns and selected pointers; results go to buffer %4
+         %25 = OpLabel
+         %26 = OpCopyObject %10 %11 ; output index, starts at 0
+         %27 = OpAccessChain %22 %2 %13 ; float member 2 of %2
+         %28 = OpAccessChain %22 %2 %14 ; float member 3 of %2
+         %29 = OpAccessChain %22 %3 %13 ; float member 2 of %3
+         %30 = OpAccessChain %22 %3 %14 ; float member 3 of %3
+         %31 = OpAccessChain %20 %2 %11 ; member 0 of %2 (the ColMajor matrix)
+         %32 = OpAccessChain %20 %2 %12 ; member 1 of %2 (the RowMajor matrix)
+         %33 = OpAccessChain %23 %2 %11 %11 ; member 0, index 0 (a vec4)
+         %34 = OpAccessChain %23 %2 %11 %12 ; member 0, index 1 (a vec4)
+         %35 = OpAccessChain %22 %2 %11 %11 %11 ; member 0, [0][0] (a float)
+         %36 = OpPtrEqual %9 %27 %28 ; two different members of the same buffer
+         %37 = OpSelect %10 %36 %11 %12 ; note the encoding in this shader: 0 if equal, else 1
+         %38 = OpAccessChain %21 %4 %11 %26
+         %39 = OpIAdd %10 %26 %12 ; next output index
+               OpStore %38 %37
+         %40 = OpPtrEqual %9 %27 %29 ; same member index in two different buffers
+         %41 = OpSelect %10 %40 %11 %12
+         %42 = OpAccessChain %21 %4 %11 %39
+         %43 = OpIAdd %10 %39 %12
+               OpStore %42 %41
+         %44 = OpSelect %22 %40 %27 %28 ; pointer chosen at run time between members of %2
+         %45 = OpSelect %22 %40 %29 %30 ; pointer chosen at run time between members of %3
+         %46 = OpPtrEqual %9 %44 %45 ; compares the two selected pointers
+         %47 = OpSelect %10 %46 %11 %12
+         %48 = OpAccessChain %21 %4 %11 %43
+         %49 = OpIAdd %10 %43 %12
+               OpStore %48 %47
+         %50 = OpSelect %22 %46 %27 %28
+         %51 = OpPtrEqual %9 %50 %35 ; selected float-member pointer vs a matrix-element pointer
+         %52 = OpSelect %10 %51 %11 %12
+         %53 = OpAccessChain %21 %4 %11 %49
+         %54 = OpIAdd %10 %49 %12
+               OpStore %53 %52
+         %55 = OpPtrEqual %9 %31 %32 ; ColMajor matrix member vs RowMajor matrix member
+         %56 = OpSelect %10 %55 %11 %12
+         %57 = OpAccessChain %21 %4 %11 %54
+         %58 = OpIAdd %10 %54 %12
+               OpStore %57 %56
+         %59 = OpPtrEqual %9 %33 %34 ; index 0 vs index 1 of the ColMajor matrix
+         %60 = OpSelect %10 %59 %11 %12
+         %61 = OpAccessChain %21 %4 %11 %58
+         %62 = OpIAdd %10 %58 %12
+               OpStore %61 %56 ; NOTE(review): stores %56 (the %31/%32 result) rather than %60, leaving %59/%60 unused -- confirm whether this is intentional
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/opptrnotequal-basic.spv14.asm.comp b/shaders-opencl-no-opt/asm/comp/opptrnotequal-basic.spv14.asm.comp
new file mode 100644
index 000000000..1cbf8045c
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/opptrnotequal-basic.spv14.asm.comp
@@ -0,0 +1,96 @@
+; SPIR-V
+; Version: 1.4
+; Generator: Khronos SPIR-V Tools Assembler; 0
+; Bound: 64
+; Schema: 0
+               OpCapability Shader
+               OpCapability VariablePointers
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %1 "main" %2 %3 %4 %5 ; interface names all four StorageBuffer variables
+               OpExecutionMode %1 LocalSize 1 1 1
+               OpDecorate %6 ArrayStride 4
+               OpDecorate %7 Block
+               OpMemberDecorate %7 0 Offset 0
+               OpDecorate %2 DescriptorSet 0
+               OpDecorate %2 Binding 0
+               OpDecorate %3 DescriptorSet 0
+               OpDecorate %3 Binding 1
+               OpDecorate %4 DescriptorSet 0
+               OpDecorate %4 Binding 2
+               OpDecorate %5 DescriptorSet 0
+               OpDecorate %5 Binding 3 ; %5 is the only buffer the function stores into
+          %8 = OpTypeVoid
+          %9 = OpTypeBool
+         %10 = OpTypeInt 32 0
+         %11 = OpConstant %10 0
+         %12 = OpConstant %10 1
+          %6 = OpTypeRuntimeArray %10 ; runtime array of 32-bit unsigned ints
+          %7 = OpTypeStruct %6 ; Block struct whose single member is that array
+         %13 = OpTypePointer StorageBuffer %7 ; pointer to the whole block
+         %14 = OpTypePointer StorageBuffer %6 ; pointer to the array member
+         %15 = OpTypePointer StorageBuffer %10 ; pointer to one array element
+          %2 = OpVariable %13 StorageBuffer
+          %3 = OpVariable %13 StorageBuffer
+          %4 = OpVariable %13 StorageBuffer
+          %5 = OpVariable %13 StorageBuffer
+         %16 = OpTypeFunction %8
+          %1 = OpFunction %8 None %16
+         %17 = OpLabel
+         %18 = OpCopyObject %10 %11 ; running output index into %5's array, starts at 0
+         %19 = OpAccessChain %14 %2 %11 ; pointer to %2's array member
+         %20 = OpAccessChain %15 %2 %11 %11 ; pointer to element 0 of %2's array
+         %21 = OpAccessChain %14 %3 %11 ; same two pointers for %3
+         %22 = OpAccessChain %15 %3 %11 %11
+         %23 = OpAccessChain %14 %4 %11 ; same two pointers for %4
+         %24 = OpAccessChain %15 %4 %11 %11
+         %25 = OpPtrNotEqual %9 %2 %3 ; %2 vs %3, whole-block pointers
+         %26 = OpSelect %10 %25 %12 %11 ; 1 when the pointers differ, 0 otherwise
+         %27 = OpAccessChain %15 %5 %11 %18
+         %28 = OpIAdd %10 %18 %12 ; next output index
+               OpStore %27 %26
+         %29 = OpPtrNotEqual %9 %19 %21 ; %2 vs %3, array-member pointers
+         %30 = OpSelect %10 %29 %12 %11
+         %31 = OpAccessChain %15 %5 %11 %28
+         %32 = OpIAdd %10 %28 %12
+               OpStore %31 %30
+         %33 = OpPtrNotEqual %9 %20 %22 ; %2 vs %3, element-0 pointers
+         %34 = OpSelect %10 %33 %12 %11
+         %35 = OpAccessChain %15 %5 %11 %32
+         %36 = OpIAdd %10 %32 %12
+               OpStore %35 %34
+         %37 = OpPtrNotEqual %9 %2 %4 ; %2 vs %4, whole-block pointers
+         %38 = OpSelect %10 %37 %12 %11
+         %39 = OpAccessChain %15 %5 %11 %36
+         %40 = OpIAdd %10 %36 %12
+               OpStore %39 %38
+         %41 = OpPtrNotEqual %9 %19 %23 ; %2 vs %4, array-member pointers
+         %42 = OpSelect %10 %41 %12 %11
+         %43 = OpAccessChain %15 %5 %11 %40
+         %44 = OpIAdd %10 %40 %12
+               OpStore %43 %42
+         %45 = OpPtrNotEqual %9 %20 %24 ; %2 vs %4, element-0 pointers
+         %46 = OpSelect %10 %45 %12 %11
+         %47 = OpAccessChain %15 %5 %11 %44
+         %48 = OpIAdd %10 %44 %12
+               OpStore %47 %46
+         %49 = OpPtrNotEqual %9 %3 %4 ; %3 vs %4, whole-block pointers
+         %50 = OpSelect %10 %49 %12 %11
+         %51 = OpAccessChain %15 %5 %11 %48
+         %52 = OpIAdd %10 %48 %12
+               OpStore %51 %50
+         %53 = OpPtrNotEqual %9 %21 %23 ; %3 vs %4, array-member pointers
+         %54 = OpSelect %10 %53 %12 %11
+         %55 = OpAccessChain %15 %5 %11 %52
+         %56 = OpIAdd %10 %52 %12
+               OpStore %55 %54
+         %57 = OpPtrNotEqual %9 %22 %24 ; %3 vs %4, element-0 pointers
+         %58 = OpSelect %10 %57 %12 %11
+         %59 = OpAccessChain %15 %5 %11 %56
+         %60 = OpIAdd %10 %56 %12
+               OpStore %59 %58
+         %61 = OpPtrNotEqual %9 %2 %2 ; same pointer on both sides
+         %62 = OpSelect %10 %61 %12 %11
+         %63 = OpAccessChain %15 %5 %11 %60
+               OpStore %63 %62
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/ptr-access-chain-custom-array-stride.asm.comp b/shaders-opencl-no-opt/asm/comp/ptr-access-chain-custom-array-stride.asm.comp
new file mode 100644
index 000000000..298b4e750
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/ptr-access-chain-custom-array-stride.asm.comp
@@ -0,0 +1,98 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 11
+; Bound: 66
+; Schema: 0
+               OpCapability Shader
+               OpCapability PhysicalStorageBufferAddresses
+               OpExtension "SPV_KHR_physical_storage_buffer"
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel PhysicalStorageBuffer64 GLSL450
+               OpEntryPoint GLCompute %main "main" %gl_GlobalInvocationID
+               OpExecutionMode %main LocalSize 64 1 1
+               OpSource GLSL 450
+               OpSourceExtension "GL_EXT_buffer_reference"
+               OpSourceExtension "GL_EXT_buffer_reference_uvec2"
+               OpSourceExtension "GL_EXT_scalar_block_layout"
+               OpName %main "main"
+               OpName %Registers "Registers"
+               OpMemberName %Registers 0 "a"
+               OpMemberName %Registers 1 "b"
+               OpMemberName %Registers 2 "c"
+               OpMemberName %Registers 3 "d"
+               OpName %_ ""
+               OpName %gl_GlobalInvocationID "gl_GlobalInvocationID"
+               OpMemberDecorate %Registers 0 Offset 0
+               OpMemberDecorate %Registers 1 Offset 8
+               OpMemberDecorate %Registers 2 Offset 16
+               OpMemberDecorate %Registers 3 Offset 24
+               OpDecorate %Registers Block
+               OpDecorate %v3float_stride12_ptr ArrayStride 12 ; stride decoration sits on the pointer type itself
+               OpDecorate %v3float_stride16_ptr ArrayStride 16 ; same pointee type, different stride
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_WorkGroupSize BuiltIn WorkgroupSize
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+       %uint = OpTypeInt 32 0
+     %v2uint = OpTypeVector %uint 2
+      %float = OpTypeFloat 32
+    %v3float = OpTypeVector %float 3
+%_ptr_PhysicalStorageBuffer_v3float = OpTypePointer PhysicalStorageBuffer %v3float ; undecorated pointer type, not referenced elsewhere in this module
+%v3float_stride12_ptr = OpTypePointer PhysicalStorageBuffer %v3float ; decorated ArrayStride 12 above
+%v3float_stride16_ptr = OpTypePointer PhysicalStorageBuffer %v3float ; decorated ArrayStride 16 above
+%v3float_stride12_ptr_push = OpTypePointer PushConstant %v3float_stride12_ptr
+%v3float_stride16_ptr_push = OpTypePointer PushConstant %v3float_stride16_ptr
+%v2uint_ptr = OpTypePointer PushConstant %v2uint
+  %Registers = OpTypeStruct %v3float_stride12_ptr %v3float_stride16_ptr %v2uint %v2uint ; a, b are pointers; c, d are uvec2 values that main bitcasts to pointers
+%_ptr_PushConstant_Registers = OpTypePointer PushConstant %Registers
+          %_ = OpVariable %_ptr_PushConstant_Registers PushConstant
+        %int = OpTypeInt 32 1
+      %int_0 = OpConstant %int 0
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3uint Input
+     %uint_0 = OpConstant %uint 0
+%_ptr_Input_uint = OpTypePointer Input %uint
+      %int_1 = OpConstant %int 1
+      %int_2 = OpConstant %int 2
+%_ptr_PushConstant_v2uint = OpTypePointer PushConstant %v2uint ; same operands as %v2uint_ptr; main uses %v2uint_ptr
+      %int_3 = OpConstant %int 3
+    %uint_64 = OpConstant %uint 64
+     %uint_1 = OpConstant %uint 1
+%gl_WorkGroupSize = OpConstantComposite %v3uint %uint_64 %uint_1 %uint_1
+       %main = OpFunction %void None %3
+          %5 = OpLabel
+         %29 = OpAccessChain %_ptr_Input_uint %gl_GlobalInvocationID %uint_0
+         %index = OpLoad %uint %29 ; gl_GlobalInvocationID.x, used as the element index below
+
+         %ptr_member_0 = OpAccessChain %v3float_stride12_ptr_push %_ %int_0
+         %ptr0 = OpLoad %v3float_stride12_ptr %ptr_member_0 ; member a, loaded directly as a stride-12 pointer
+
+         %ptr_member_1 = OpAccessChain %v3float_stride16_ptr_push %_ %int_1
+         %ptr1 = OpLoad %v3float_stride16_ptr %ptr_member_1 ; member b, loaded directly as a stride-16 pointer
+
+         %ptr_member_2 = OpAccessChain %v2uint_ptr %_ %int_2
+         %ptr2v = OpLoad %v2uint %ptr_member_2
+		 %ptr2 = OpBitcast %v3float_stride12_ptr %ptr2v
+
+         %ptr_member_3 = OpAccessChain %v2uint_ptr %_ %int_3
+         %ptr3v = OpLoad %v2uint %ptr_member_3
+		 %ptr3 = OpBitcast %v3float_stride16_ptr %ptr3v
+
+		%ptr0_chain = OpPtrAccessChain %v3float_stride12_ptr %ptr0 %index
+		%ptr1_chain = OpPtrAccessChain %v3float_stride16_ptr %ptr1 %index
+		%ptr2_chain = OpPtrAccessChain %v3float_stride12_ptr %ptr2 %index
+		%ptr3_chain = OpPtrAccessChain %v3float_stride16_ptr %ptr3 %index
+
+		%loaded0 = OpLoad %v3float %ptr0_chain Aligned 4
+		%loaded1 = OpLoad %v3float %ptr1_chain Aligned 16
+		%loaded2 = OpLoad %v3float %ptr2_chain Aligned 4
+		%loaded3 = OpLoad %v3float %ptr3_chain Aligned 16
+
+		%added0 = OpFAdd %v3float %loaded0 %loaded1
+		%added1 = OpFAdd %v3float %loaded2 %loaded3
+		OpStore %ptr0_chain %added0 Aligned 4
+		OpStore %ptr2_chain %added1 Aligned 4
+
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/spec-constant-name-aliasing.asm.comp b/shaders-opencl-no-opt/asm/comp/spec-constant-name-aliasing.asm.comp
new file mode 100644
index 000000000..b4e622bac
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/spec-constant-name-aliasing.asm.comp
@@ -0,0 +1,78 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 10
+; Bound: 35
+; Schema: 0
+               OpCapability Shader
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main" %gl_GlobalInvocationID
+               OpExecutionMode %main LocalSize 1 1 1
+               OpSource GLSL 450
+               OpName %main "main"
+               OpName %SSBO "SSBO"
+               OpMemberName %SSBO 0 "values"
+               OpName %_ ""
+               OpName %gl_GlobalInvocationID "gl_GlobalInvocationID"
+               OpName %A "A" ; %A through %L all carry the same debug name "A"
+               OpName %B "A"
+               OpName %C "A"
+               OpName %D "A"
+               OpName %E "A"
+               OpName %F "A"
+               OpName %G "A"
+               OpName %H "A"
+               OpName %I "A"
+               OpName %J "A"
+               OpName %K "A"
+               OpName %L "A"
+               OpDecorate %_runtimearr_int ArrayStride 4
+               OpMemberDecorate %SSBO 0 Offset 0
+               OpDecorate %SSBO BufferBlock
+               OpDecorate %_ DescriptorSet 0
+               OpDecorate %_ Binding 0
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %A SpecId 0 ; only %A..%F get a SpecId; %G..%L are OpSpecConstantOp results
+               OpDecorate %B SpecId 1
+               OpDecorate %C SpecId 2
+               OpDecorate %D SpecId 3
+               OpDecorate %E SpecId 4
+               OpDecorate %F SpecId 5
+               OpDecorate %gl_WorkGroupSize BuiltIn WorkgroupSize
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+        %int = OpTypeInt 32 1
+%_runtimearr_int = OpTypeRuntimeArray %int
+       %SSBO = OpTypeStruct %_runtimearr_int
+%_ptr_Uniform_SSBO = OpTypePointer Uniform %SSBO
+          %_ = OpVariable %_ptr_Uniform_SSBO Uniform
+      %int_0 = OpConstant %int 0
+       %uint = OpTypeInt 32 0
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3uint Input
+     %uint_0 = OpConstant %uint 0
+%_ptr_Input_uint = OpTypePointer Input %uint
+          %A = OpSpecConstant %int 0
+          %B = OpSpecConstant %int 1
+          %C = OpSpecConstant %int 2
+          %D = OpSpecConstant %int 3
+          %E = OpSpecConstant %int 4
+          %F = OpSpecConstant %int 5
+          %G = OpSpecConstantOp %int ISub %A %B ; G = A - B
+          %H = OpSpecConstantOp %int ISub %G %C ; H = G - C
+          %I = OpSpecConstantOp %int ISub %H %D ; I = H - D
+          %J = OpSpecConstantOp %int ISub %I %E ; J = I - E
+          %K = OpSpecConstantOp %int ISub %J %F ; K = J - F
+		  %L = OpSpecConstantOp %int IAdd %K %F
+%_ptr_Uniform_int = OpTypePointer Uniform %int
+     %uint_1 = OpConstant %uint 1
+%gl_WorkGroupSize = OpConstantComposite %v3uint %uint_1 %uint_1 %uint_1
+       %main = OpFunction %void None %3
+          %5 = OpLabel
+         %18 = OpAccessChain %_ptr_Input_uint %gl_GlobalInvocationID %uint_0
+         %19 = OpLoad %uint %18 ; gl_GlobalInvocationID.x
+         %32 = OpAccessChain %_ptr_Uniform_int %_ %int_0 %19 ; pointer to values[gl_GlobalInvocationID.x]
+               OpStore %32 %L ; writes the last spec constant in the chain (%L = %K + %F)
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/storage-buffer-basic.invalid.asm.comp b/shaders-opencl-no-opt/asm/comp/storage-buffer-basic.invalid.asm.comp
new file mode 100644
index 000000000..bdf2027a8
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/storage-buffer-basic.invalid.asm.comp
@@ -0,0 +1,58 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Codeplay; 0
+; Bound: 31
+; Schema: 0
+               OpCapability Shader
+               OpCapability VariablePointers
+               OpExtension "SPV_KHR_storage_buffer_storage_class"
+               OpExtension "SPV_KHR_variable_pointers"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %22 "main" %gl_WorkGroupID
+               OpSource OpenCL_C 120
+               OpDecorate %15 SpecId 0
+               ;OpDecorate %16 SpecId 1 ; left disabled: %16 is declared with OpConstant below, not OpSpecConstant
+               OpDecorate %17 SpecId 2
+               OpDecorate %_runtimearr_float ArrayStride 4
+               OpMemberDecorate %_struct_4 0 Offset 0
+               OpDecorate %_struct_4 Block
+               OpDecorate %gl_WorkGroupID BuiltIn WorkgroupId
+               OpDecorate %gl_WorkGroupSize BuiltIn WorkgroupSize
+               OpDecorate %20 DescriptorSet 0
+               OpDecorate %20 Binding 0
+               OpDecorate %21 DescriptorSet 0
+               OpDecorate %21 Binding 1
+      %float = OpTypeFloat 32
+       %uint = OpTypeInt 32 0
+	   %size1 = OpConstant %uint 1
+%_ptr_StorageBuffer_float = OpTypePointer StorageBuffer %float
+%_runtimearr_float = OpTypeArray %float %size1 ; Runtime arrays do not work yet in MSL. Despite its name this ID is a fixed-size array of length %size1. NOTE(review): the remark mentions MSL but this file is under shaders-opencl-no-opt; confirm it still applies.
+  %_struct_4 = OpTypeStruct %_runtimearr_float
+%_ptr_StorageBuffer__struct_4 = OpTypePointer StorageBuffer %_struct_4
+       %void = OpTypeVoid
+          %8 = OpTypeFunction %void
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+%_ptr_Input_uint = OpTypePointer Input %uint
+%_ptr_Private_v3uint = OpTypePointer Private %v3uint
+     %uint_0 = OpConstant %uint 0
+%gl_WorkGroupID = OpVariable %_ptr_Input_v3uint Input
+         %15 = OpSpecConstant %uint 1
+         %16 = OpConstant %uint 2
+         %17 = OpSpecConstant %uint 3
+%gl_WorkGroupSize = OpSpecConstantComposite %v3uint %15 %16 %17 ; x and z are spec constants, y is the plain constant %16
+         %19 = OpVariable %_ptr_Private_v3uint Private %gl_WorkGroupSize ; initialized from gl_WorkGroupSize; not referenced by function %22
+         %20 = OpVariable %_ptr_StorageBuffer__struct_4 StorageBuffer
+         %21 = OpVariable %_ptr_StorageBuffer__struct_4 StorageBuffer
+         %22 = OpFunction %void None %8
+         %23 = OpLabel
+         %24 = OpAccessChain %_ptr_Input_uint %gl_WorkGroupID %uint_0
+         %25 = OpLoad %uint %24 ; gl_WorkGroupID.x, used as the array index for both buffers
+         %26 = OpAccessChain %_ptr_StorageBuffer_float %21 %uint_0 %25
+         %27 = OpLoad %float %26 ; element from the binding-1 buffer
+         %28 = OpAccessChain %_ptr_StorageBuffer_float %20 %uint_0 %25
+         %29 = OpLoad %float %28 ; element from the binding-0 buffer
+         %30 = OpFAdd %float %27 %29
+               OpStore %28 %30 ; sum is written back to the binding-0 buffer
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/storage-buffer-pointer-argument.asm.comp b/shaders-opencl-no-opt/asm/comp/storage-buffer-pointer-argument.asm.comp
new file mode 100644
index 000000000..010d17c20
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/storage-buffer-pointer-argument.asm.comp
@@ -0,0 +1,63 @@
+; SPIR-V
+; Version: 1.3
+; Generator: Khronos Glslang Reference Front End; 7
+; Bound: 30
+; Schema: 0
+               OpCapability Shader
+               OpCapability VariablePointersStorageBuffer
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main"
+               OpExecutionMode %main LocalSize 1 1 1
+               OpSource GLSL 450
+               OpName %main "main"
+               OpName %copy_out_f1_f1_ "copy_out(f1;f1;"
+               OpName %A "A"
+               OpName %B "B"
+               OpName %SSBO "SSBO"
+               OpMemberName %SSBO 0 "a"
+               OpName %_ ""
+               OpName %SSBORead "SSBORead"
+               OpMemberName %SSBORead 0 "b"
+               OpName %__0 ""
+               OpMemberDecorate %SSBO 0 NonReadable ; SSBO.a is the member copy_out stores into
+               OpMemberDecorate %SSBO 0 Offset 0
+               OpDecorate %SSBO Block
+               OpDecorate %_ DescriptorSet 0
+               OpDecorate %_ Binding 0
+               OpMemberDecorate %SSBORead 0 NonWritable ; SSBORead.b is the member copy_out loads from
+               OpMemberDecorate %SSBORead 0 Offset 0
+               OpDecorate %SSBORead Block
+               OpDecorate %__0 DescriptorSet 0
+               OpDecorate %__0 Binding 1
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+      %float = OpTypeFloat 32
+%_ptr_Function_float = OpTypePointer Function %float
+%_ptr_StorageBuffer_float = OpTypePointer StorageBuffer %float
+          %8 = OpTypeFunction %void %_ptr_StorageBuffer_float %_ptr_StorageBuffer_float ; signature of copy_out: two StorageBuffer float pointers
+       %SSBO = OpTypeStruct %float
+%_ptr_StorageBuffer_SSBO = OpTypePointer StorageBuffer %SSBO
+          %_ = OpVariable %_ptr_StorageBuffer_SSBO StorageBuffer
+        %int = OpTypeInt 32 1
+      %int_0 = OpConstant %int 0
+   %SSBORead = OpTypeStruct %float
+%_ptr_StorageBuffer_SSBORead = OpTypePointer StorageBuffer %SSBORead
+        %__0 = OpVariable %_ptr_StorageBuffer_SSBORead StorageBuffer
+       %main = OpFunction %void None %3
+          %5 = OpLabel
+      %param = OpVariable %_ptr_Function_float Function ; declared but never used in this function
+    %param_0 = OpVariable %_ptr_Function_float Function ; declared but never used in this function
+         %25 = OpAccessChain %_ptr_StorageBuffer_float %_ %int_0 ; pointer to SSBO.a
+         %26 = OpAccessChain %_ptr_StorageBuffer_float %__0 %int_0 ; pointer to SSBORead.b
+         %27 = OpFunctionCall %void %copy_out_f1_f1_ %25 %26 ; both buffer pointers are passed straight through as arguments
+               OpReturn
+               OpFunctionEnd
+%copy_out_f1_f1_ = OpFunction %void None %8
+          %A = OpFunctionParameter %_ptr_StorageBuffer_float ; destination pointer
+          %B = OpFunctionParameter %_ptr_StorageBuffer_float ; source pointer
+         %12 = OpLabel
+         %13 = OpLoad %float %B
+               OpStore %A %13 ; *A = *B
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/trancendentals-float-controls-2-fp16.fp16.asm.comp b/shaders-opencl-no-opt/asm/comp/trancendentals-float-controls-2-fp16.fp16.asm.comp
new file mode 100644
index 000000000..fca4fff77
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/trancendentals-float-controls-2-fp16.fp16.asm.comp
@@ -0,0 +1,225 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 11
+; Bound: 173
+; Schema: 0
+               OpCapability Shader
+               OpCapability Float16
+               OpCapability StorageBuffer16BitAccess
+               OpCapability FloatControls2
+               OpExtension "SPV_KHR_16bit_storage"
+               OpExtension "SPV_KHR_float_controls2"
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main" %gl_LocalInvocationIndex
+               OpExecutionMode %main LocalSize 4 1 1
+               OpExecutionModeId %main FPFastMathDefault %half %fp32_modes ; default applies to the 16-bit float type in this variant
+               OpSource GLSL 450
+               OpName %main "main"
+               OpName %SSBO "SSBO"
+               OpMemberName %SSBO 0 "v"
+               OpMemberName %SSBO 1 "f16"
+               OpName %_ ""
+               OpName %gl_LocalInvocationIndex "gl_LocalInvocationIndex"
+               OpDecorate %_arr_float_uint_4 ArrayStride 4
+               OpDecorate %_arr_half_uint_4 ArrayStride 2
+               OpDecorate %SSBO BufferBlock
+               OpMemberDecorate %SSBO 0 Offset 0
+               OpMemberDecorate %SSBO 1 Offset 16
+               OpDecorate %_ Binding 0
+               OpDecorate %_ DescriptorSet 0
+               OpDecorate %gl_LocalInvocationIndex BuiltIn LocalInvocationIndex
+               OpDecorate %gl_WorkGroupSize BuiltIn WorkgroupSize
+			   OpDecorate %24 FPFastMathMode NotNaN|NotInf
+			   OpDecorate %30 FPFastMathMode NotNaN ; This isn't enough to avoid precise:: (%30 is the fp32 Sin result in main)
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+      %float = OpTypeFloat 32
+       %uint = OpTypeInt 32 0
+	   %fp32_modes = OpConstant %uint 0x7000f
+     %uint_4 = OpConstant %uint 4
+%_arr_float_uint_4 = OpTypeArray %float %uint_4
+       %half = OpTypeFloat 16
+%_arr_half_uint_4 = OpTypeArray %half %uint_4
+       %SSBO = OpTypeStruct %_arr_float_uint_4 %_arr_half_uint_4 ; member 0 "v" is float[4], member 1 "f16" is half[4]
+%_ptr_Uniform_SSBO = OpTypePointer Uniform %SSBO
+          %_ = OpVariable %_ptr_Uniform_SSBO Uniform
+        %int = OpTypeInt 32 1
+      %int_0 = OpConstant %int 0
+%_ptr_Input_uint = OpTypePointer Input %uint
+%gl_LocalInvocationIndex = OpVariable %_ptr_Input_uint Input
+%_ptr_Uniform_float = OpTypePointer Uniform %float
+    %float_4 = OpConstant %float 4 ; exponent operand of the Pow call in main
+      %int_1 = OpConstant %int 1
+%_ptr_Uniform_half = OpTypePointer Uniform %half
+     %v3uint = OpTypeVector %uint 3
+     %uint_1 = OpConstant %uint 1
+%gl_WorkGroupSize = OpConstantComposite %v3uint %uint_4 %uint_1 %uint_1
+       %main = OpFunction %void None %3
+          %5 = OpLabel
+         %19 = OpLoad %uint %gl_LocalInvocationIndex ; index used for the store
+         %20 = OpLoad %uint %gl_LocalInvocationIndex ; index used for the load
+         %22 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %20
+         %23 = OpLoad %float %22
+         %24 = OpExtInst %float %1 Cos %23 ; fp32 cos; decorated NotNaN|NotInf in the preamble
+         %25 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %19
+               OpStore %25 %24 ; first fp32 result overwrites v[i]; the later ones are added to it
+         %26 = OpLoad %uint %gl_LocalInvocationIndex
+         %27 = OpLoad %uint %gl_LocalInvocationIndex
+         %28 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %27
+         %29 = OpLoad %float %28
+         %30 = OpExtInst %float %1 Sin %29 ; fp32 sin; decorated NotNaN in the preamble
+         %31 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %26
+         %32 = OpLoad %float %31
+         %33 = OpFAdd %float %32 %30
+         %34 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %26
+               OpStore %34 %33
+         %35 = OpLoad %uint %gl_LocalInvocationIndex
+         %36 = OpLoad %uint %gl_LocalInvocationIndex
+         %37 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %36
+         %38 = OpLoad %float %37
+         %39 = OpExtInst %float %1 Tan %38 ; fp32 tan
+         %40 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %35
+         %41 = OpLoad %float %40
+         %42 = OpFAdd %float %41 %39
+         %43 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %35
+               OpStore %43 %42
+         %44 = OpLoad %uint %gl_LocalInvocationIndex
+         %45 = OpLoad %uint %gl_LocalInvocationIndex
+         %46 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %45
+         %47 = OpLoad %float %46
+         %48 = OpExtInst %float %1 Acos %47 ; fp32 acos
+         %49 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %44
+         %50 = OpLoad %float %49
+         %51 = OpFAdd %float %50 %48
+         %52 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %44
+               OpStore %52 %51
+         %53 = OpLoad %uint %gl_LocalInvocationIndex
+         %54 = OpLoad %uint %gl_LocalInvocationIndex
+         %55 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %54
+         %56 = OpLoad %float %55
+         %57 = OpExtInst %float %1 Asin %56 ; fp32 asin
+         %58 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %53
+         %59 = OpLoad %float %58
+         %60 = OpFAdd %float %59 %57
+         %61 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %53
+               OpStore %61 %60
+         %62 = OpLoad %uint %gl_LocalInvocationIndex
+         %63 = OpLoad %uint %gl_LocalInvocationIndex
+         %64 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %63
+         %65 = OpLoad %float %64
+         %66 = OpExtInst %float %1 Atan %65 ; fp32 atan
+         %67 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %62
+         %68 = OpLoad %float %67
+         %69 = OpFAdd %float %68 %66
+         %70 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %62
+               OpStore %70 %69
+         %71 = OpLoad %uint %gl_LocalInvocationIndex
+         %72 = OpLoad %uint %gl_LocalInvocationIndex
+         %73 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %72
+         %74 = OpLoad %float %73
+         %75 = OpExtInst %float %1 Exp %74 ; fp32 exp
+         %76 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %71
+         %77 = OpLoad %float %76
+         %78 = OpFAdd %float %77 %75
+         %79 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %71
+               OpStore %79 %78
+         %80 = OpLoad %uint %gl_LocalInvocationIndex
+         %81 = OpLoad %uint %gl_LocalInvocationIndex
+         %82 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %81
+         %83 = OpLoad %float %82
+         %84 = OpExtInst %float %1 Exp2 %83 ; fp32 exp2
+         %85 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %80
+         %86 = OpLoad %float %85
+         %87 = OpFAdd %float %86 %84
+         %88 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %80
+               OpStore %88 %87
+         %89 = OpLoad %uint %gl_LocalInvocationIndex
+         %90 = OpLoad %uint %gl_LocalInvocationIndex
+         %91 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %90
+         %92 = OpLoad %float %91
+         %93 = OpExtInst %float %1 Log %92 ; fp32 log
+         %94 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %89
+         %95 = OpLoad %float %94
+         %96 = OpFAdd %float %95 %93
+         %97 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %89
+               OpStore %97 %96
+         %98 = OpLoad %uint %gl_LocalInvocationIndex
+         %99 = OpLoad %uint %gl_LocalInvocationIndex
+        %100 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %99
+        %101 = OpLoad %float %100
+        %102 = OpExtInst %float %1 Log2 %101 ; fp32 log2
+        %103 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %98
+        %104 = OpLoad %float %103
+        %105 = OpFAdd %float %104 %102
+        %106 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %98
+               OpStore %106 %105
+        %107 = OpLoad %uint %gl_LocalInvocationIndex
+        %108 = OpLoad %uint %gl_LocalInvocationIndex
+        %109 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %108
+        %110 = OpLoad %float %109
+        %111 = OpExtInst %float %1 Sqrt %110 ; fp32 sqrt
+        %112 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %107
+        %113 = OpLoad %float %112
+        %114 = OpFAdd %float %113 %111
+        %115 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %107
+               OpStore %115 %114
+        %116 = OpLoad %uint %gl_LocalInvocationIndex
+        %117 = OpLoad %uint %gl_LocalInvocationIndex
+        %118 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %117
+        %119 = OpLoad %float %118
+        %120 = OpExtInst %float %1 InverseSqrt %119 ; fp32 inverse sqrt
+        %121 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %116
+        %122 = OpLoad %float %121
+        %123 = OpFAdd %float %122 %120
+        %124 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %116
+               OpStore %124 %123
+        %125 = OpLoad %uint %gl_LocalInvocationIndex
+        %126 = OpLoad %uint %gl_LocalInvocationIndex
+        %127 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %126
+        %128 = OpLoad %float %127
+        %130 = OpExtInst %float %1 Pow %128 %float_4 ; fp32 pow with constant exponent 4
+        %131 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %125
+        %132 = OpLoad %float %131
+        %133 = OpFAdd %float %132 %130
+        %134 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %125
+               OpStore %134 %133
+        %136 = OpLoad %uint %gl_LocalInvocationIndex
+        %137 = OpLoad %uint %gl_LocalInvocationIndex
+        %139 = OpAccessChain %_ptr_Uniform_half %_ %int_1 %137 ; from here on the f16 member (index 1) is used
+        %140 = OpLoad %half %139
+        %141 = OpExtInst %half %1 Cos %140 ; fp16 cos
+        %142 = OpAccessChain %_ptr_Uniform_half %_ %int_1 %136
+               OpStore %142 %141 ; first fp16 result overwrites f16[i]; the later ones are added to it
+        %143 = OpLoad %uint %gl_LocalInvocationIndex
+        %144 = OpLoad %uint %gl_LocalInvocationIndex
+        %145 = OpAccessChain %_ptr_Uniform_half %_ %int_1 %144
+        %146 = OpLoad %half %145
+        %147 = OpExtInst %half %1 Sin %146 ; fp16 sin
+        %148 = OpAccessChain %_ptr_Uniform_half %_ %int_1 %143
+        %149 = OpLoad %half %148
+        %150 = OpFAdd %half %149 %147
+        %151 = OpAccessChain %_ptr_Uniform_half %_ %int_1 %143
+               OpStore %151 %150
+        %152 = OpLoad %uint %gl_LocalInvocationIndex
+        %153 = OpLoad %uint %gl_LocalInvocationIndex
+        %154 = OpAccessChain %_ptr_Uniform_half %_ %int_1 %153
+        %155 = OpLoad %half %154
+        %156 = OpExtInst %half %1 Cosh %155 ; fp16 cosh
+        %157 = OpAccessChain %_ptr_Uniform_half %_ %int_1 %152
+        %158 = OpLoad %half %157
+        %159 = OpFAdd %half %158 %156
+        %160 = OpAccessChain %_ptr_Uniform_half %_ %int_1 %152
+               OpStore %160 %159
+        %161 = OpLoad %uint %gl_LocalInvocationIndex
+        %162 = OpLoad %uint %gl_LocalInvocationIndex
+        %163 = OpAccessChain %_ptr_Uniform_half %_ %int_1 %162
+        %164 = OpLoad %half %163
+        %165 = OpExtInst %half %1 Sinh %164 ; fp16 sinh
+        %166 = OpAccessChain %_ptr_Uniform_half %_ %int_1 %161
+        %167 = OpLoad %half %166
+        %168 = OpFAdd %half %167 %165
+        %169 = OpAccessChain %_ptr_Uniform_half %_ %int_1 %161
+               OpStore %169 %168
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/trancendentals-float-controls-2-fp32.fp16.asm.comp b/shaders-opencl-no-opt/asm/comp/trancendentals-float-controls-2-fp32.fp16.asm.comp
new file mode 100644
index 000000000..c95c72ddb
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/trancendentals-float-controls-2-fp32.fp16.asm.comp
@@ -0,0 +1,224 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 11
+; Bound: 173
+; Schema: 0
+               OpCapability Shader
+               OpCapability Float16
+               OpCapability StorageBuffer16BitAccess
+               OpCapability FloatControls2
+               OpExtension "SPV_KHR_16bit_storage"
+               OpExtension "SPV_KHR_float_controls2"
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main" %gl_LocalInvocationIndex
+               OpExecutionMode %main LocalSize 4 1 1
+               OpExecutionModeId %main FPFastMathDefault %float %fp32_modes ; FPFastMathDefault for the %float type only; mask comes from %fp32_modes (no entry for %half)
+               OpSource GLSL 450
+               OpName %main "main"
+               OpName %SSBO "SSBO"
+               OpMemberName %SSBO 0 "v"
+               OpMemberName %SSBO 1 "f16"
+               OpName %_ ""
+               OpName %gl_LocalInvocationIndex "gl_LocalInvocationIndex"
+               OpDecorate %_arr_float_uint_4 ArrayStride 4
+               OpDecorate %_arr_half_uint_4 ArrayStride 2
+               OpDecorate %SSBO BufferBlock
+               OpMemberDecorate %SSBO 0 Offset 0
+               OpMemberDecorate %SSBO 1 Offset 16
+               OpDecorate %_ Binding 0
+               OpDecorate %_ DescriptorSet 0
+               OpDecorate %gl_LocalInvocationIndex BuiltIn LocalInvocationIndex
+               OpDecorate %gl_WorkGroupSize BuiltIn WorkgroupSize
+			   OpDecorate %24 FPFastMathMode None ; %24 is the fp32 Cos result in %main; it is the only ID given an explicit FPFastMathMode
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+      %float = OpTypeFloat 32
+       %uint = OpTypeInt 32 0
+	   %fp32_modes = OpConstant %uint 0x7000f ; bits 0-3 and 16-18 set; NOTE(review): flag names per SPV_KHR_float_controls2 - confirm
+     %uint_4 = OpConstant %uint 4
+%_arr_float_uint_4 = OpTypeArray %float %uint_4
+       %half = OpTypeFloat 16
+%_arr_half_uint_4 = OpTypeArray %half %uint_4
+       %SSBO = OpTypeStruct %_arr_float_uint_4 %_arr_half_uint_4
+%_ptr_Uniform_SSBO = OpTypePointer Uniform %SSBO
+          %_ = OpVariable %_ptr_Uniform_SSBO Uniform
+        %int = OpTypeInt 32 1
+      %int_0 = OpConstant %int 0
+%_ptr_Input_uint = OpTypePointer Input %uint
+%gl_LocalInvocationIndex = OpVariable %_ptr_Input_uint Input
+%_ptr_Uniform_float = OpTypePointer Uniform %float
+    %float_4 = OpConstant %float 4
+      %int_1 = OpConstant %int 1
+%_ptr_Uniform_half = OpTypePointer Uniform %half
+     %v3uint = OpTypeVector %uint 3
+     %uint_1 = OpConstant %uint 1
+%gl_WorkGroupSize = OpConstantComposite %v3uint %uint_4 %uint_1 %uint_1
+       %main = OpFunction %void None %3 ; with idx = gl_LocalInvocationIndex: runs GLSL.std.450 ops on SSBO.v[idx] (fp32), then on SSBO.f16[idx] (fp16)
+          %5 = OpLabel
+         %19 = OpLoad %uint %gl_LocalInvocationIndex
+         %20 = OpLoad %uint %gl_LocalInvocationIndex
+         %22 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %20
+         %23 = OpLoad %float %22
+         %24 = OpExtInst %float %1 Cos %23 ; the result that carries the FPFastMathMode None decoration
+         %25 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %19
+               OpStore %25 %24 ; v[idx] = cos(v[idx]) - plain overwrite; each later fp32 step adds its result with OpFAdd
+         %26 = OpLoad %uint %gl_LocalInvocationIndex
+         %27 = OpLoad %uint %gl_LocalInvocationIndex
+         %28 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %27
+         %29 = OpLoad %float %28
+         %30 = OpExtInst %float %1 Sin %29
+         %31 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %26
+         %32 = OpLoad %float %31
+         %33 = OpFAdd %float %32 %30 ; v[idx] + sin(v[idx]); the same load/op/load/add/store shape repeats below
+         %34 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %26
+               OpStore %34 %33
+         %35 = OpLoad %uint %gl_LocalInvocationIndex
+         %36 = OpLoad %uint %gl_LocalInvocationIndex
+         %37 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %36
+         %38 = OpLoad %float %37
+         %39 = OpExtInst %float %1 Tan %38
+         %40 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %35
+         %41 = OpLoad %float %40
+         %42 = OpFAdd %float %41 %39
+         %43 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %35
+               OpStore %43 %42
+         %44 = OpLoad %uint %gl_LocalInvocationIndex
+         %45 = OpLoad %uint %gl_LocalInvocationIndex
+         %46 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %45
+         %47 = OpLoad %float %46
+         %48 = OpExtInst %float %1 Acos %47
+         %49 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %44
+         %50 = OpLoad %float %49
+         %51 = OpFAdd %float %50 %48
+         %52 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %44
+               OpStore %52 %51
+         %53 = OpLoad %uint %gl_LocalInvocationIndex
+         %54 = OpLoad %uint %gl_LocalInvocationIndex
+         %55 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %54
+         %56 = OpLoad %float %55
+         %57 = OpExtInst %float %1 Asin %56
+         %58 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %53
+         %59 = OpLoad %float %58
+         %60 = OpFAdd %float %59 %57
+         %61 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %53
+               OpStore %61 %60
+         %62 = OpLoad %uint %gl_LocalInvocationIndex
+         %63 = OpLoad %uint %gl_LocalInvocationIndex
+         %64 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %63
+         %65 = OpLoad %float %64
+         %66 = OpExtInst %float %1 Atan %65
+         %67 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %62
+         %68 = OpLoad %float %67
+         %69 = OpFAdd %float %68 %66
+         %70 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %62
+               OpStore %70 %69
+         %71 = OpLoad %uint %gl_LocalInvocationIndex
+         %72 = OpLoad %uint %gl_LocalInvocationIndex
+         %73 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %72
+         %74 = OpLoad %float %73
+         %75 = OpExtInst %float %1 Exp %74
+         %76 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %71
+         %77 = OpLoad %float %76
+         %78 = OpFAdd %float %77 %75
+         %79 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %71
+               OpStore %79 %78
+         %80 = OpLoad %uint %gl_LocalInvocationIndex
+         %81 = OpLoad %uint %gl_LocalInvocationIndex
+         %82 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %81
+         %83 = OpLoad %float %82
+         %84 = OpExtInst %float %1 Exp2 %83
+         %85 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %80
+         %86 = OpLoad %float %85
+         %87 = OpFAdd %float %86 %84
+         %88 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %80
+               OpStore %88 %87
+         %89 = OpLoad %uint %gl_LocalInvocationIndex
+         %90 = OpLoad %uint %gl_LocalInvocationIndex
+         %91 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %90
+         %92 = OpLoad %float %91
+         %93 = OpExtInst %float %1 Log %92
+         %94 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %89
+         %95 = OpLoad %float %94
+         %96 = OpFAdd %float %95 %93
+         %97 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %89
+               OpStore %97 %96
+         %98 = OpLoad %uint %gl_LocalInvocationIndex
+         %99 = OpLoad %uint %gl_LocalInvocationIndex
+        %100 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %99
+        %101 = OpLoad %float %100
+        %102 = OpExtInst %float %1 Log2 %101
+        %103 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %98
+        %104 = OpLoad %float %103
+        %105 = OpFAdd %float %104 %102
+        %106 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %98
+               OpStore %106 %105
+        %107 = OpLoad %uint %gl_LocalInvocationIndex
+        %108 = OpLoad %uint %gl_LocalInvocationIndex
+        %109 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %108
+        %110 = OpLoad %float %109
+        %111 = OpExtInst %float %1 Sqrt %110
+        %112 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %107
+        %113 = OpLoad %float %112
+        %114 = OpFAdd %float %113 %111
+        %115 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %107
+               OpStore %115 %114
+        %116 = OpLoad %uint %gl_LocalInvocationIndex
+        %117 = OpLoad %uint %gl_LocalInvocationIndex
+        %118 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %117
+        %119 = OpLoad %float %118
+        %120 = OpExtInst %float %1 InverseSqrt %119
+        %121 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %116
+        %122 = OpLoad %float %121
+        %123 = OpFAdd %float %122 %120
+        %124 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %116
+               OpStore %124 %123
+        %125 = OpLoad %uint %gl_LocalInvocationIndex
+        %126 = OpLoad %uint %gl_LocalInvocationIndex
+        %127 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %126
+        %128 = OpLoad %float %127
+        %130 = OpExtInst %float %1 Pow %128 %float_4 ; pow(v[idx], 4.0) - the only two-operand op in this chain
+        %131 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %125
+        %132 = OpLoad %float %131
+        %133 = OpFAdd %float %132 %130
+        %134 = OpAccessChain %_ptr_Uniform_float %_ %int_0 %125
+               OpStore %134 %133
+        %136 = OpLoad %uint %gl_LocalInvocationIndex
+        %137 = OpLoad %uint %gl_LocalInvocationIndex
+        %139 = OpAccessChain %_ptr_Uniform_half %_ %int_1 %137 ; from here on the chain targets member 1 (f16[], 16-bit float)
+        %140 = OpLoad %half %139
+        %141 = OpExtInst %half %1 Cos %140
+        %142 = OpAccessChain %_ptr_Uniform_half %_ %int_1 %136
+               OpStore %142 %141 ; f16[idx] = cos(f16[idx]); Sin, Cosh and Sinh results are then added in turn
+        %143 = OpLoad %uint %gl_LocalInvocationIndex
+        %144 = OpLoad %uint %gl_LocalInvocationIndex
+        %145 = OpAccessChain %_ptr_Uniform_half %_ %int_1 %144
+        %146 = OpLoad %half %145
+        %147 = OpExtInst %half %1 Sin %146
+        %148 = OpAccessChain %_ptr_Uniform_half %_ %int_1 %143
+        %149 = OpLoad %half %148
+        %150 = OpFAdd %half %149 %147
+        %151 = OpAccessChain %_ptr_Uniform_half %_ %int_1 %143
+               OpStore %151 %150
+        %152 = OpLoad %uint %gl_LocalInvocationIndex
+        %153 = OpLoad %uint %gl_LocalInvocationIndex
+        %154 = OpAccessChain %_ptr_Uniform_half %_ %int_1 %153
+        %155 = OpLoad %half %154
+        %156 = OpExtInst %half %1 Cosh %155
+        %157 = OpAccessChain %_ptr_Uniform_half %_ %int_1 %152
+        %158 = OpLoad %half %157
+        %159 = OpFAdd %half %158 %156
+        %160 = OpAccessChain %_ptr_Uniform_half %_ %int_1 %152
+               OpStore %160 %159
+        %161 = OpLoad %uint %gl_LocalInvocationIndex
+        %162 = OpLoad %uint %gl_LocalInvocationIndex
+        %163 = OpAccessChain %_ptr_Uniform_half %_ %int_1 %162
+        %164 = OpLoad %half %163
+        %165 = OpExtInst %half %1 Sinh %164
+        %166 = OpAccessChain %_ptr_Uniform_half %_ %int_1 %161
+        %167 = OpLoad %half %166
+        %168 = OpFAdd %half %167 %165
+        %169 = OpAccessChain %_ptr_Uniform_half %_ %int_1 %161
+               OpStore %169 %168
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/variable-pointers-2.asm.comp b/shaders-opencl-no-opt/asm/comp/variable-pointers-2.asm.comp
new file mode 100644
index 000000000..9c2afe393
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/variable-pointers-2.asm.comp
@@ -0,0 +1,71 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 11
+; Bound: 26
+; Schema: 0
+               OpCapability Shader
+			   OpCapability VariablePointers
+			   OpExtension "SPV_KHR_variable_pointers"
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main" %gl_LocalInvocationIndex %gl_GlobalInvocationID
+               OpExecutionMode %main LocalSize 64 1 1
+               OpSource GLSL 450
+               OpName %main "main"
+               OpName %test "test"
+               OpName %gl_LocalInvocationIndex "gl_LocalInvocationIndex"
+               OpName %gl_GlobalInvocationID "gl_GlobalInvocationID"
+               OpDecorate %gl_LocalInvocationIndex BuiltIn LocalInvocationIndex
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_WorkGroupSize BuiltIn WorkgroupSize
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+      %float = OpTypeFloat 32
+      %v2float = OpTypeVector %float 2
+       %uint = OpTypeInt 32 0
+    %uint_64 = OpConstant %uint 64
+%_arr_v2float_uint_64 = OpTypeArray %v2float %uint_64
+%_ptr_Workgroup__arr_v2float_uint_64 = OpTypePointer Workgroup %_arr_v2float_uint_64
+       %test = OpVariable %_ptr_Workgroup__arr_v2float_uint_64 Workgroup ; vec2[64] in Workgroup storage; every pointer chain in %main starts from it
+%_ptr_Input_uint = OpTypePointer Input %uint
+%gl_LocalInvocationIndex = OpVariable %_ptr_Input_uint Input
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3uint Input
+     %uint_0 = OpConstant %uint 0
+     %uint_1 = OpConstant %uint 1
+     %uint_2 = OpConstant %uint 2
+     %uint_3 = OpConstant %uint 3
+%_ptr_Workgroup_float = OpTypePointer Workgroup %float
+%_ptr_Workgroup_v2float = OpTypePointer Workgroup %v2float
+%gl_WorkGroupSize = OpConstantComposite %v3uint %uint_64 %uint_1 %uint_1 ; (64, 1, 1), same values as the LocalSize execution mode
+       %main = OpFunction %void None %3 ; stores through several differently built OpPtrAccessChain pointers into %test
+          %5 = OpLabel
+         %14 = OpLoad %uint %gl_LocalInvocationIndex
+         %19 = OpAccessChain %_ptr_Input_uint %gl_GlobalInvocationID %uint_0
+         %20 = OpLoad %uint %19
+         %21 = OpConvertUToF %float %20
+		 %22 = OpCompositeConstruct %v2float %21 %21 ; vec2(float(gl_GlobalInvocationID.x)), the value stored through the v2float pointers
+
+		   ; Dummy expression. *(&test + 0)
+		   %ptr0 = OpPtrAccessChain %_ptr_Workgroup__arr_v2float_uint_64 %test %uint_0
+		   %ptr1 = OpPtrAccessChain %_ptr_Workgroup_v2float %ptr0 %uint_0 %uint_1
+		   %ptr2 = OpPtrAccessChain %_ptr_Workgroup_v2float %ptr1 %uint_2 ; %ptr1 advanced by 2; presumably aliases the %ptr5 target (see "Same semantics" below)
+		   OpStore %ptr2 %22
+
+		   ; Chain PtrAccessChain while keeping pointer type.
+		   %ptr3 = OpPtrAccessChain %_ptr_Workgroup_v2float %test %uint_0 %uint_1
+		   %ptr4 = OpPtrAccessChain %_ptr_Workgroup_v2float %ptr3 %uint_2
+		   OpStore %ptr4 %22
+
+			; Same semantics.
+		   %ptr5 = OpPtrAccessChain %_ptr_Workgroup_v2float %test %uint_0 %uint_3 ; single chain with the constant index 3
+		   OpStore %ptr5 %22
+
+			; Scalar shenanigans.
+		   %ptr6 = OpPtrAccessChain %_ptr_Workgroup_float %test %uint_0 %uint_2 %uint_0 ; float pointer formed inside one vec2 element
+		   %ptr7 = OpPtrAccessChain %_ptr_Workgroup_float %ptr6 %uint_1 ; steps the scalar pointer itself by 1; the scalar %21 is stored here
+		   OpStore %ptr7 %21
+
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/variable-pointers-3.invalid.asm.comp b/shaders-opencl-no-opt/asm/comp/variable-pointers-3.invalid.asm.comp
new file mode 100644
index 000000000..79ff08edc
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/variable-pointers-3.invalid.asm.comp
@@ -0,0 +1,60 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 11
+; Bound: 26
+; Schema: 0
+               OpCapability Shader
+			   OpCapability VariablePointers
+			   OpExtension "SPV_KHR_variable_pointers"
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main" %gl_LocalInvocationIndex %gl_GlobalInvocationID
+               OpExecutionMode %main LocalSize 64 1 1
+               OpSource GLSL 450
+               OpName %main "main"
+               OpName %test "test"
+               OpName %gl_LocalInvocationIndex "gl_LocalInvocationIndex"
+               OpName %gl_GlobalInvocationID "gl_GlobalInvocationID"
+               OpDecorate %gl_LocalInvocationIndex BuiltIn LocalInvocationIndex
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_WorkGroupSize BuiltIn WorkgroupSize
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+      %float = OpTypeFloat 32
+	  %bool = OpTypeBool
+	  %true = OpConstantTrue %bool ; constant condition fed to the OpSelect in %main
+      %v2float = OpTypeVector %float 2
+       %uint = OpTypeInt 32 0
+    %uint_64 = OpConstant %uint 64
+%_arr_v2float_uint_64 = OpTypeArray %v2float %uint_64
+%_ptr_Workgroup__arr_v2float_uint_64 = OpTypePointer Workgroup %_arr_v2float_uint_64
+       %test = OpVariable %_ptr_Workgroup__arr_v2float_uint_64 Workgroup ; vec2[64] in Workgroup storage, base of the pointer chains in %main
+%_ptr_Input_uint = OpTypePointer Input %uint
+%gl_LocalInvocationIndex = OpVariable %_ptr_Input_uint Input
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3uint Input
+     %uint_0 = OpConstant %uint 0
+     %uint_1 = OpConstant %uint 1
+     %uint_2 = OpConstant %uint 2
+     %uint_3 = OpConstant %uint 3
+%_ptr_Workgroup_float = OpTypePointer Workgroup %float
+%_ptr_Workgroup_v2float = OpTypePointer Workgroup %v2float
+%gl_WorkGroupSize = OpConstantComposite %v3uint %uint_64 %uint_1 %uint_1
+       %main = OpFunction %void None %3 ; applies OpPtrAccessChain to a scalar pointer that comes out of an OpSelect
+          %5 = OpLabel
+         %14 = OpLoad %uint %gl_LocalInvocationIndex
+         %19 = OpAccessChain %_ptr_Input_uint %gl_GlobalInvocationID %uint_0
+         %20 = OpLoad %uint %19
+         %21 = OpConvertUToF %float %20
+		 %22 = OpCompositeConstruct %v2float %21 %21
+
+			; Scalar shenanigans.
+		   %ptr6 = OpPtrAccessChain %_ptr_Workgroup_float %test %uint_0 %uint_2 %uint_0
+		   %ptr6_alt = OpPtrAccessChain %_ptr_Workgroup_float %test %uint_0 %uint_2 %uint_0 ; same operands as %ptr6
+		   %ptr6_sel = OpSelect %_ptr_Workgroup_float %true %ptr6 %ptr6_alt ; constant-true condition, both candidates built identically
+		   %ptr7 = OpPtrAccessChain %_ptr_Workgroup_float %ptr6_sel %uint_1 ; base is the selected pointer, not a direct chain from %test
+		   OpStore %ptr7 %21
+
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/variable-pointers-vector-to-scalar.asm.comp b/shaders-opencl-no-opt/asm/comp/variable-pointers-vector-to-scalar.asm.comp
new file mode 100644
index 000000000..c4512858a
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/variable-pointers-vector-to-scalar.asm.comp
@@ -0,0 +1,60 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 11
+; Bound: 26
+; Schema: 0
+               OpCapability Shader
+			   OpCapability VariablePointers
+			   OpExtension "SPV_KHR_variable_pointers"
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main" %gl_LocalInvocationIndex %gl_GlobalInvocationID
+               OpExecutionMode %main LocalSize 64 1 1
+               OpSource GLSL 450
+               OpName %main "main"
+               OpName %test "test"
+               OpName %gl_LocalInvocationIndex "gl_LocalInvocationIndex"
+               OpName %gl_GlobalInvocationID "gl_GlobalInvocationID"
+               OpDecorate %gl_LocalInvocationIndex BuiltIn LocalInvocationIndex
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_WorkGroupSize BuiltIn WorkgroupSize
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+      %float = OpTypeFloat 32
+	  %bool = OpTypeBool
+	  %true = OpConstantTrue %bool ; constant condition fed to the OpSelect in %main
+      %v2float = OpTypeVector %float 2
+       %uint = OpTypeInt 32 0
+    %uint_64 = OpConstant %uint 64
+%_arr_v2float_uint_64 = OpTypeArray %v2float %uint_64
+%_ptr_Workgroup__arr_v2float_uint_64 = OpTypePointer Workgroup %_arr_v2float_uint_64
+       %test = OpVariable %_ptr_Workgroup__arr_v2float_uint_64 Workgroup ; vec2[64] in Workgroup storage indexed by %main
+%_ptr_Input_uint = OpTypePointer Input %uint
+%gl_LocalInvocationIndex = OpVariable %_ptr_Input_uint Input
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3uint Input
+     %uint_0 = OpConstant %uint 0
+     %uint_1 = OpConstant %uint 1
+     %uint_2 = OpConstant %uint 2
+     %uint_3 = OpConstant %uint 3
+%_ptr_Workgroup_float = OpTypePointer Workgroup %float
+%_ptr_Workgroup_v2float = OpTypePointer Workgroup %v2float
+%gl_WorkGroupSize = OpConstantComposite %v3uint %uint_64 %uint_1 %uint_1
+       %main = OpFunction %void None %3 ; selects a vec2 pointer, then derives a scalar pointer from it and stores a float
+          %5 = OpLabel
+         %14 = OpLoad %uint %gl_LocalInvocationIndex
+         %19 = OpAccessChain %_ptr_Input_uint %gl_GlobalInvocationID %uint_0
+         %20 = OpLoad %uint %19
+         %21 = OpConvertUToF %float %20
+		 %22 = OpCompositeConstruct %v2float %21 %21
+
+		   %a = OpAccessChain %_ptr_Workgroup_v2float %test %uint_1
+		   %b = OpAccessChain %_ptr_Workgroup_v2float %test %uint_2
+		   %c = OpSelect %_ptr_Workgroup_v2float %true %a %b ; picks between &test[1] and &test[2]; the condition is the constant %true
+
+		   %d = OpAccessChain %_ptr_Workgroup_float %c %uint_1 ; component 1 of the selected vec2: vector pointer -> scalar pointer
+		   OpStore %d %21
+
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/variable-pointers.asm.invalid.comp b/shaders-opencl-no-opt/asm/comp/variable-pointers.asm.invalid.comp
new file mode 100644
index 000000000..ba6267cc0
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/variable-pointers.asm.invalid.comp
@@ -0,0 +1,152 @@
+; SPIR-V
+; Version: 1.3
+; Generator: Khronos SPIR-V Tools Assembler; 0
+; Bound: 89
+; Schema: 0
+               OpCapability Shader
+               OpCapability VariablePointers
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main" %gl_GlobalInvocationID
+               OpExecutionMode %main LocalSize 1 1 1
+               OpSource GLSL 450
+               OpName %main "main"
+               OpName %foo "foo"
+               OpMemberName %foo 0 "a"
+               OpMemberName %foo 1 "b"
+               OpMemberName %foo 2 "c"
+               OpName %bar "bar"
+               OpMemberName %bar 0 "d"
+               OpName %baz "baz"
+               OpMemberName %baz 0 "e"
+               OpName %buf "buf"
+               OpName %buf2 "buf2"
+               OpName %cb "cb"
+               OpName %tgsm "tgsm"
+               OpName %sbuf "sbuf"
+               OpName %sbuf2 "sbuf2"
+               OpName %stgsm "stgsm"
+               OpName %select_buffer "select_buffer"
+               OpName %select_buffer_null "select_buffer_null"
+               OpName %select_tgsm "select_tgsm"
+               OpName %cur "cur"
+               OpMemberDecorate %foo 0 Offset 0
+               OpMemberDecorate %foo 1 Offset 512
+               OpMemberDecorate %foo 2 Offset 520
+               OpMemberDecorate %bar 0 Offset 0
+               OpMemberDecorate %baz 0 Offset 0
+               OpDecorate %foo Block
+               OpDecorate %bar Block
+               OpDecorate %baz Block
+               OpDecorate %buf DescriptorSet 0
+               OpDecorate %buf Binding 0
+               OpDecorate %cb DescriptorSet 0
+               OpDecorate %cb Binding 3
+               OpDecorate %buf2 DescriptorSet 0
+               OpDecorate %buf2 Binding 4
+               OpDecorate %_ptr_Workgroup_int ArrayStride 4 ; ArrayStride is placed on the pointer types here, not only on the array type
+               OpDecorate %_ptr_StorageBuffer_int ArrayStride 4
+               OpDecorate %_arr_int_uint_128 ArrayStride 4
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+       %void = OpTypeVoid
+         %22 = OpTypeFunction %void
+        %int = OpTypeInt 32 1
+       %uint = OpTypeInt 32 0
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3uint Input
+   %uint_128 = OpConstant %uint 128
+%_arr_int_uint_128 = OpTypeArray %int %uint_128
+      %float = OpTypeFloat 32
+    %v2float = OpTypeVector %float 2
+        %foo = OpTypeStruct %_arr_int_uint_128 %uint %v2float
+%_ptr_StorageBuffer_foo = OpTypePointer StorageBuffer %foo
+        %buf = OpVariable %_ptr_StorageBuffer_foo StorageBuffer
+        %bar = OpTypeStruct %int
+%_ptr_Uniform_bar = OpTypePointer Uniform %bar
+         %cb = OpVariable %_ptr_Uniform_bar Uniform
+        %baz = OpTypeStruct %_arr_int_uint_128
+%_ptr_StorageBuffer_baz = OpTypePointer StorageBuffer %baz
+       %buf2 = OpVariable %_ptr_StorageBuffer_baz StorageBuffer
+%_ptr_Workgroup__arr_int_uint_128 = OpTypePointer Workgroup %_arr_int_uint_128
+       %tgsm = OpVariable %_ptr_Workgroup__arr_int_uint_128 Workgroup
+%_ptr_StorageBuffer_int = OpTypePointer StorageBuffer %int
+%_ptr_Private__ptr_StorageBuffer_int = OpTypePointer Private %_ptr_StorageBuffer_int
+       %sbuf = OpVariable %_ptr_Private__ptr_StorageBuffer_int Private ; Private variable whose stored value is itself a StorageBuffer int pointer
+      %sbuf2 = OpVariable %_ptr_Private__ptr_StorageBuffer_int Private
+%_ptr_Workgroup_int = OpTypePointer Workgroup %int
+%_ptr_Private__ptr_Workgroup_int = OpTypePointer Private %_ptr_Workgroup_int
+      %stgsm = OpVariable %_ptr_Private__ptr_Workgroup_int Private ; likewise, but holding a Workgroup int pointer
+     %uint_0 = OpConstant %uint 0
+       %bool = OpTypeBool
+%_ptr_Uniform_int = OpTypePointer Uniform %int
+         %44 = OpTypeFunction %_ptr_StorageBuffer_int
+      %int_0 = OpConstant %int 0
+     %uint_1 = OpConstant %uint 1
+         %47 = OpConstantNull %_ptr_StorageBuffer_int ; null pointer used as the false operand in %select_buffer_null
+         %48 = OpTypeFunction %_ptr_Workgroup_int
+         %49 = OpConstantNull %_ptr_Workgroup_int ; null pointer used as the false operand in %select_tgsm
+%_ptr_Function__ptr_Workgroup_int = OpTypePointer Function %_ptr_Workgroup_int
+%select_buffer = OpFunction %_ptr_StorageBuffer_int None %44 ; returns cb.d != 0 ? &buf.a[0] : &buf2.e[0]
+         %51 = OpLabel
+         %52 = OpAccessChain %_ptr_Uniform_int %cb %uint_0
+         %53 = OpLoad %int %52
+         %54 = OpINotEqual %bool %53 %int_0
+         %55 = OpAccessChain %_ptr_StorageBuffer_int %buf %uint_0 %uint_0
+         %56 = OpAccessChain %_ptr_StorageBuffer_int %buf2 %uint_0 %uint_0
+         %57 = OpSelect %_ptr_StorageBuffer_int %54 %55 %56 ; pointer select between two different buffer variables
+               OpReturnValue %57
+               OpFunctionEnd
+%select_buffer_null = OpFunction %_ptr_StorageBuffer_int None %44 ; returns cb.d != 0 ? &buf.a[0] : null
+         %58 = OpLabel
+         %59 = OpAccessChain %_ptr_Uniform_int %cb %uint_0
+         %60 = OpLoad %int %59
+         %61 = OpINotEqual %bool %60 %int_0
+         %62 = OpAccessChain %_ptr_StorageBuffer_int %buf %uint_0 %uint_0
+         %63 = OpSelect %_ptr_StorageBuffer_int %61 %62 %47 ; %47 is the OpConstantNull StorageBuffer pointer
+               OpReturnValue %63
+               OpFunctionEnd
+%select_tgsm = OpFunction %_ptr_Workgroup_int None %48 ; returns cb.d != 0 ? &tgsm[0] : null
+         %64 = OpLabel
+         %65 = OpAccessChain %_ptr_Uniform_int %cb %uint_0
+         %66 = OpLoad %int %65
+         %67 = OpINotEqual %bool %66 %int_0
+         %68 = OpAccessChain %_ptr_Workgroup_int %tgsm %uint_0
+         %69 = OpSelect %_ptr_Workgroup_int %67 %68 %49 ; %49 is the OpConstantNull Workgroup pointer
+               OpReturnValue %69
+               OpFunctionEnd
+       %main = OpFunction %void None %22 ; stores the three helper results, then walks a buffer pointer and a Workgroup pointer in a loop
+         %70 = OpLabel
+        %cur = OpVariable %_ptr_Function__ptr_Workgroup_int Function
+         %71 = OpFunctionCall %_ptr_StorageBuffer_int %select_buffer
+               OpStore %sbuf %71
+         %72 = OpFunctionCall %_ptr_StorageBuffer_int %select_buffer_null
+               OpStore %sbuf2 %72
+         %73 = OpFunctionCall %_ptr_Workgroup_int %select_tgsm
+               OpStore %stgsm %73
+         %74 = OpAccessChain %_ptr_StorageBuffer_int %buf %uint_0 %uint_0
+         %75 = OpLoad %_ptr_Workgroup_int %stgsm
+         %76 = OpCopyObject %_ptr_Workgroup_int %75
+               OpStore %cur %76 ; %cur starts as the pointer just stored in %stgsm
+               OpBranch %77
+         %77 = OpLabel
+         %78 = OpPhi %_ptr_StorageBuffer_int %74 %70 %79 %80 ; loop-carried buffer pointer: &buf.a[0] on entry, %79 from the continue block
+         %81 = OpLoad %_ptr_Workgroup_int %cur
+         %82 = OpLoad %int %78
+         %83 = OpINotEqual %bool %82 %int_0 ; loop runs while the int read through %78 is non-zero
+               OpLoopMerge %85 %80 None
+               OpBranchConditional %83 %84 %85
+         %84 = OpLabel
+         %86 = OpLoad %int %81
+         %87 = OpIAdd %int %82 %86 ; sum of the two pointees, written back through both pointers
+               OpStore %78 %87
+               OpStore %81 %87
+               OpBranch %80
+         %80 = OpLabel
+         %79 = OpPtrAccessChain %_ptr_StorageBuffer_int %78 %uint_1 ; advance the buffer pointer by one element
+         %88 = OpPtrAccessChain %_ptr_Workgroup_int %81 %uint_1 ; advance the Workgroup pointer by one element; kept in %cur
+               OpStore %cur %88
+               OpBranch %77
+         %85 = OpLabel
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/variable-ssbo-argument.spv16.asm.comp b/shaders-opencl-no-opt/asm/comp/variable-ssbo-argument.spv16.asm.comp
new file mode 100644
index 000000000..0cb791703
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/variable-ssbo-argument.spv16.asm.comp
@@ -0,0 +1,44 @@
+; SPIR-V
+; Version: 1.6
+; Generator: Khronos SPIR-V Tools Assembler; 0
+; Bound: 24
+; Schema: 0
+               OpCapability VariablePointersStorageBuffer
+               OpCapability Int8
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %1 "main" %2
+               OpExecutionMode %1 LocalSize 16 1 1
+               OpDecorate %2 DescriptorSet 0
+               OpDecorate %2 Binding 0
+               OpDecorate %_struct_3 Block
+               OpMemberDecorate %_struct_3 0 Offset 0
+               OpDecorate %_runtimearr_uchar ArrayStride 1
+               OpDecorate %_ptr_StorageBuffer_uchar ArrayStride 1 ; stride on the pointer type that function %20 receives and offsets
+       %void = OpTypeVoid
+       %uint = OpTypeInt 32 0
+     %uint_0 = OpConstant %uint 0
+     %uint_1 = OpConstant %uint 1
+     %uint_2 = OpConstant %uint 2
+     %uint_3 = OpConstant %uint 3
+      %uchar = OpTypeInt 8 0
+    %uchar_0 = OpConstant %uchar 0
+%_runtimearr_uchar = OpTypeRuntimeArray %uchar
+  %_struct_3 = OpTypeStruct %_runtimearr_uchar
+%_ptr_StorageBuffer_uchar = OpTypePointer StorageBuffer %uchar
+%_ptr_StorageBuffer__struct_3 = OpTypePointer StorageBuffer %_struct_3
+          %2 = OpVariable %_ptr_StorageBuffer__struct_3 StorageBuffer
+         %15 = OpTypeFunction %void %_ptr_StorageBuffer_uchar ; signature of %20: one StorageBuffer uchar pointer parameter
+         %16 = OpTypeFunction %void
+          %1 = OpFunction %void None %16 ; entry point "main": passes a pointer into the SSBO's runtime array to %20
+         %17 = OpLabel
+         %18 = OpAccessChain %_ptr_StorageBuffer_uchar %2 %uint_0 %uint_1 ; pointer to element 1 of member 0 (the uchar runtime array)
+         %19 = OpFunctionCall %void %20 %18
+               OpReturn
+               OpFunctionEnd
+         %20 = OpFunction %void None %15 ; callee: offsets its pointer argument and writes a zero byte
+         %21 = OpFunctionParameter %_ptr_StorageBuffer_uchar
+         %22 = OpLabel
+         %23 = OpPtrAccessChain %_ptr_StorageBuffer_uchar %21 %uint_2 ; incoming pointer advanced by 2 elements
+               OpStore %23 %uchar_0
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/variable-ssbo-array-argument.spv16.invalid.asm.comp b/shaders-opencl-no-opt/asm/comp/variable-ssbo-array-argument.spv16.invalid.asm.comp
new file mode 100644
index 000000000..8dd687ca9
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/variable-ssbo-array-argument.spv16.invalid.asm.comp
@@ -0,0 +1,45 @@
+; SPIR-V
+; Version: 1.6
+; Generator: Khronos SPIR-V Tools Assembler; 0
+; Bound: 24
+; Schema: 0
+               OpCapability VariablePointersStorageBuffer
+               OpCapability Int8
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %1 "main" %2
+               OpExecutionMode %1 LocalSize 16 1 1
+               OpDecorate %2 DescriptorSet 0
+               OpDecorate %2 Binding 0
+               OpDecorate %_struct_3 Block
+               OpMemberDecorate %_struct_3 0 Offset 0
+               OpDecorate %uchar_array ArrayStride 1
+       %void = OpTypeVoid
+       %uint = OpTypeInt 32 0
+     %uint_0 = OpConstant %uint 0
+     %uint_1 = OpConstant %uint 1
+     %uint_2 = OpConstant %uint 2
+     %uint_3 = OpConstant %uint 3
+     %uint_16 = OpConstant %uint 16
+      %uchar = OpTypeInt 8 0
+    %uchar_0 = OpConstant %uchar 0
+%uchar_array = OpTypeArray %uchar %uint_16
+  %_struct_3 = OpTypeStruct %uchar_array
+%_ptr_StorageBuffer_uchar = OpTypePointer StorageBuffer %uchar
+%_ptr_StorageBuffer_uchar_array = OpTypePointer StorageBuffer %uchar_array
+%_ptr_StorageBuffer__struct_3 = OpTypePointer StorageBuffer %_struct_3
+          %2 = OpVariable %_ptr_StorageBuffer__struct_3 StorageBuffer
+         %15 = OpTypeFunction %void %_ptr_StorageBuffer_uchar_array ; signature of %20: pointer to the whole uchar[16] array
+         %16 = OpTypeFunction %void
+          %1 = OpFunction %void None %16 ; entry point "main": passes a pointer to the SSBO's array member to %20
+         %17 = OpLabel
+         %18 = OpAccessChain %_ptr_StorageBuffer_uchar_array %2 %uint_0 ; pointer to member 0, the uchar[16] array
+         %19 = OpFunctionCall %void %20 %18
+               OpReturn
+               OpFunctionEnd
+         %20 = OpFunction %void None %15 ; callee: indexes the array it receives by pointer and writes a zero byte
+         %21 = OpFunctionParameter %_ptr_StorageBuffer_uchar_array
+         %22 = OpLabel
+         %23 = OpAccessChain %_ptr_StorageBuffer_uchar %21 %uint_2 ; element 2 of the array parameter
+               OpStore %23 %uchar_0
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/asm/comp/workgroup-uint-to-uchar-alias-ptr-access-chain.asm.invalid.comp b/shaders-opencl-no-opt/asm/comp/workgroup-uint-to-uchar-alias-ptr-access-chain.asm.invalid.comp
new file mode 100644
index 000000000..ed4e10446
--- /dev/null
+++ b/shaders-opencl-no-opt/asm/comp/workgroup-uint-to-uchar-alias-ptr-access-chain.asm.invalid.comp
@@ -0,0 +1,214 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Google Clspv; 0
+; Bound: 175
+; Schema: 0
+               OpCapability Shader
+               OpCapability Int8
+               OpCapability VariablePointers
+               OpExtension "SPV_KHR_storage_buffer_storage_class"
+               OpExtension "SPV_KHR_variable_pointers"
+               OpExtension "SPV_KHR_non_semantic_info"
+        %163 = OpExtInstImport "NonSemantic.ClspvReflection.5"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %32 "main" %gl_LocalInvocationID %gl_WorkGroupID ; single compute entry point, implemented by function %32
+               OpSource OpenCL_C 120
+        %164 = OpString "main"
+        %165 = OpString " __kernel"
+        %167 = OpString "out_data"
+        %170 = OpString "pix_in_block"
+               OpDecorate %gl_LocalInvocationID BuiltIn LocalInvocationId
+               OpDecorate %gl_WorkGroupID BuiltIn WorkgroupId
+               OpDecorate %gl_WorkGroupSize BuiltIn WorkgroupSize
+               OpDecorate %_runtimearr_v4uint ArrayStride 16
+               OpMemberDecorate %_struct_23 0 Offset 0
+               OpDecorate %_struct_23 Block
+               OpMemberDecorate %_struct_26 0 Offset 0
+               OpMemberDecorate %_struct_27 0 Offset 0
+               OpDecorate %_struct_27 Block
+               OpDecorate %25 DescriptorSet 0
+               OpDecorate %25 Binding 0
+               OpDecorate %_arr_uint_uint_256 ArrayStride 4
+               OpDecorate %_arr_uchar_uint_1024 ArrayStride 1
+               OpDecorate %15 SpecId 0 ; %15/%16/%17 are the three components of %gl_WorkGroupSize (spec ids 0, 1, 2)
+               OpDecorate %16 SpecId 1
+               OpDecorate %17 SpecId 2
+       %uint = OpTypeInt 32 0
+   %uint_256 = OpConstant %uint 256
+%_arr_uint_uint_256 = OpTypeArray %uint %uint_256
+%_ptr_Workgroup__arr_uint_uint_256 = OpTypePointer Workgroup %_arr_uint_uint_256
+      %uchar = OpTypeInt 8 0
+  %uint_1024 = OpConstant %uint 1024
+%_arr_uchar_uint_1024 = OpTypeArray %uchar %uint_1024
+%_ptr_Workgroup__arr_uchar_uint_1024 = OpTypePointer Workgroup %_arr_uchar_uint_1024
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+         %15 = OpSpecConstant %uint 1
+         %16 = OpSpecConstant %uint 1
+         %17 = OpSpecConstant %uint 1
+%gl_WorkGroupSize = OpSpecConstantComposite %v3uint %15 %16 %17
+%_ptr_Private_v3uint = OpTypePointer Private %v3uint
+     %v4uint = OpTypeVector %uint 4
+%_runtimearr_v4uint = OpTypeRuntimeArray %v4uint
+ %_struct_23 = OpTypeStruct %_runtimearr_v4uint
+%_ptr_StorageBuffer__struct_23 = OpTypePointer StorageBuffer %_struct_23
+ %_struct_26 = OpTypeStruct %uint
+ %_struct_27 = OpTypeStruct %_struct_26
+%_ptr_PushConstant__struct_27 = OpTypePointer PushConstant %_struct_27
+       %void = OpTypeVoid
+         %31 = OpTypeFunction %void
+%_ptr_PushConstant__struct_26 = OpTypePointer PushConstant %_struct_26
+     %uint_0 = OpConstant %uint 0
+       %bool = OpTypeBool
+%_ptr_Input_uint = OpTypePointer Input %uint
+     %uint_1 = OpConstant %uint 1
+   %uint_255 = OpConstant %uint 255
+%_ptr_Workgroup_uint = OpTypePointer Workgroup %uint
+     %uint_2 = OpConstant %uint 2
+    %uint_10 = OpConstant %uint 10
+  %uint_1020 = OpConstant %uint 1020
+    %v4uchar = OpTypeVector %uchar 4
+%_ptr_Workgroup_uchar = OpTypePointer Workgroup %uchar
+     %uint_3 = OpConstant %uint 3
+   %uint_264 = OpConstant %uint 264
+%_ptr_StorageBuffer_v4uint = OpTypePointer StorageBuffer %v4uint
+        %137 = OpUndef %v4uchar
+     %uint_4 = OpConstant %uint 4
+          %5 = OpVariable %_ptr_Workgroup__arr_uint_uint_256 Workgroup ; workgroup array of 256 uint (ArrayStride 4)
+         %10 = OpVariable %_ptr_Workgroup__arr_uchar_uint_1024 Workgroup ; workgroup array of 1024 uchar (ArrayStride 1)
+%gl_LocalInvocationID = OpVariable %_ptr_Input_v3uint Input
+%gl_WorkGroupID = OpVariable %_ptr_Input_v3uint Input
+         %20 = OpVariable %_ptr_Private_v3uint Private %gl_WorkGroupSize
+         %25 = OpVariable %_ptr_StorageBuffer__struct_23 StorageBuffer ; set 0 / binding 0: struct wrapping a runtime array of v4uint
+         %29 = OpVariable %_ptr_PushConstant__struct_27 PushConstant ; push-constant block holding a single uint
+         %32 = OpFunction %void None %31 ; entry point: fill workgroup arrays %5 and %10, barrier, then read them back into storage buffer %25
+         %33 = OpLabel
+         %36 = OpAccessChain %_ptr_PushConstant__struct_26 %29 %uint_0
+         %37 = OpLoad %_struct_26 %36
+         %38 = OpCompositeExtract %uint %37 0 ; %38 = the push-constant uint; used below as the trip count of every loop
+         %40 = OpINotEqual %bool %38 %uint_0 ; both loop nests are skipped when %38 is zero
+               OpSelectionMerge %105 None
+               OpBranchConditional %40 %43 %105
+         %43 = OpLabel
+         %45 = OpAccessChain %_ptr_Input_uint %gl_WorkGroupID %uint_0
+         %46 = OpLoad %uint %45
+         %48 = OpAccessChain %_ptr_Input_uint %gl_WorkGroupID %uint_1
+         %49 = OpLoad %uint %48
+         %50 = OpAccessChain %_ptr_Input_uint %gl_LocalInvocationID %uint_0
+         %51 = OpLoad %uint %50
+         %52 = OpAccessChain %_ptr_Input_uint %gl_LocalInvocationID %uint_1
+         %53 = OpLoad %uint %52
+         %54 = OpIMul %uint %53 %51
+         %55 = OpUDiv %uint %54 %49
+         %57 = OpUMod %uint %55 %uint_255
+         %58 = OpUConvert %uchar %57
+               OpBranch %60
+         %60 = OpLabel ; outer write loop header; %61 is the outer counter
+         %61 = OpPhi %uint %100 %99 %uint_0 %43
+         %62 = OpIMul %uint %61 %38
+               OpLoopMerge %103 %99 None
+               OpBranch %65
+         %65 = OpLabel ; inner write loop (single block); %66 is the inner counter
+         %66 = OpPhi %uint %93 %65 %uint_0 %60
+         %67 = OpIAdd %uint %66 %62
+         %68 = OpIMul %uint %66 %61
+         %69 = OpIAdd %uint %46 %68
+         %71 = OpAccessChain %_ptr_Workgroup_uint %5 %67
+               OpStore %71 %69
+         %72 = OpIAdd %uint %49 %68
+         %74 = OpShiftLeftLogical %uint %67 %uint_2 ; %74 = %67 * 4
+         %76 = OpShiftRightLogical %uint %74 %uint_10 ; %76 = %74 / 1024, used as the Element operand of OpPtrAccessChain
+         %78 = OpBitwiseAnd %uint %74 %uint_1020 ; %78 = %74 & 1020, index into the uchar array
+         %80 = OpBitcast %v4uchar %72 ; split the uint %72 into four bytes
+         %81 = OpCompositeExtract %uchar %80 1
+         %82 = OpCompositeExtract %uchar %80 2
+         %83 = OpCompositeExtract %uchar %80 3
+         %85 = OpPtrAccessChain %_ptr_Workgroup_uchar %10 %76 %78
+         %86 = OpBitwiseOr %uint %78 %uint_1
+         %87 = OpPtrAccessChain %_ptr_Workgroup_uchar %10 %76 %86
+               OpStore %87 %81
+         %88 = OpBitwiseOr %uint %78 %uint_2
+         %89 = OpPtrAccessChain %_ptr_Workgroup_uchar %10 %76 %88
+               OpStore %89 %82
+         %91 = OpBitwiseOr %uint %78 %uint_3
+         %92 = OpPtrAccessChain %_ptr_Workgroup_uchar %10 %76 %91
+               OpStore %92 %83
+               OpStore %85 %58 ; byte 0 receives %58 (computed before the loops), not component 0 of %80
+         %93 = OpIAdd %uint %66 %uint_1
+         %94 = OpUGreaterThanEqual %bool %93 %38
+               OpLoopMerge %97 %65 None
+               OpBranchConditional %94 %97 %65
+         %97 = OpLabel
+               OpBranch %99
+         %99 = OpLabel
+        %100 = OpIAdd %uint %61 %uint_1
+        %101 = OpUGreaterThanEqual %bool %100 %38
+               OpBranchConditional %101 %103 %60
+        %103 = OpLabel
+               OpBranch %105
+        %105 = OpLabel
+               OpBranch %107
+        %107 = OpLabel
+               OpControlBarrier %uint_2 %uint_2 %uint_264 ; barrier between the write phase above and the read phase below
+               OpSelectionMerge %162 None
+               OpBranchConditional %40 %111 %162
+        %111 = OpLabel ; outer read loop header; %112 is the outer counter
+        %112 = OpPhi %uint %157 %156 %uint_0 %107
+        %113 = OpIMul %uint %112 %38
+               OpLoopMerge %160 %156 None
+               OpBranch %116
+        %116 = OpLabel ; inner read loop (single block); %117 is the inner counter
+        %117 = OpPhi %uint %150 %116 %uint_0 %111
+        %118 = OpIAdd %uint %117 %113
+        %120 = OpAccessChain %_ptr_StorageBuffer_v4uint %25 %uint_0 %118
+        %121 = OpAccessChain %_ptr_Workgroup_uint %5 %118
+        %122 = OpLoad %uint %121
+        %123 = OpShiftLeftLogical %uint %118 %uint_2
+        %124 = OpShiftRightLogical %uint %123 %uint_10
+        %125 = OpBitwiseAnd %uint %123 %uint_1020
+        %126 = OpPtrAccessChain %_ptr_Workgroup_uchar %10 %124 %125
+        %127 = OpLoad %uchar %126
+        %128 = OpBitwiseOr %uint %125 %uint_1
+        %129 = OpPtrAccessChain %_ptr_Workgroup_uchar %10 %124 %128
+        %130 = OpLoad %uchar %129
+        %131 = OpBitwiseOr %uint %125 %uint_2
+        %132 = OpPtrAccessChain %_ptr_Workgroup_uchar %10 %124 %131
+        %133 = OpLoad %uchar %132
+        %134 = OpBitwiseOr %uint %125 %uint_3
+        %135 = OpPtrAccessChain %_ptr_Workgroup_uchar %10 %124 %134
+        %136 = OpLoad %uchar %135
+        %138 = OpCompositeInsert %v4uchar %127 %137 0 ; rebuild a v4uchar from the four loaded bytes, starting from undef %137
+        %139 = OpCompositeInsert %v4uchar %130 %138 1
+        %140 = OpCompositeInsert %v4uchar %133 %139 2
+        %141 = OpCompositeInsert %v4uchar %136 %140 3
+        %142 = OpBitcast %uint %141 ; reinterpret the four bytes as one uint
+        %143 = OpIAdd %uint %122 %142
+        %144 = OpLoad %v4uint %120
+        %145 = OpCompositeInsert %v4uint %143 %144 0
+        %146 = OpShiftRightLogical %uint %143 %uint_2
+        %147 = OpCompositeInsert %v4uint %146 %145 1
+        %148 = OpShiftRightLogical %uint %143 %uint_3
+        %149 = OpCompositeInsert %v4uint %148 %147 3 ; components 0, 1 and 3 are replaced; component 2 keeps the loaded value
+               OpStore %120 %149
+        %150 = OpIAdd %uint %117 %uint_1
+        %151 = OpUGreaterThanEqual %bool %150 %38
+               OpLoopMerge %154 %116 None
+               OpBranchConditional %151 %154 %116
+        %154 = OpLabel
+               OpBranch %156
+        %156 = OpLabel
+        %157 = OpIAdd %uint %112 %uint_1
+        %158 = OpUGreaterThanEqual %bool %157 %38
+               OpBranchConditional %158 %160 %111
+        %160 = OpLabel
+               OpBranch %162
+        %162 = OpLabel
+               OpControlBarrier %uint_2 %uint_2 %uint_264
+               OpReturn
+               OpFunctionEnd
+        %166 = OpExtInst %void %163 Kernel %32 %164 %uint_2 %uint_0 %165 ; non-semantic Clspv reflection record for function %32, named by string %164 ("main")
+        %168 = OpExtInst %void %163 ArgumentInfo %167 ; argument name string %167 ("out_data")
+        %169 = OpExtInst %void %163 ArgumentStorageBuffer %166 %uint_0 %uint_0 %uint_0 %168
+        %171 = OpExtInst %void %163 ArgumentInfo %170 ; argument name string %170 ("pix_in_block")
+        %173 = OpExtInst %void %163 ArgumentPodPushConstant %166 %uint_1 %uint_0 %uint_4 %171
+        %174 = OpExtInst %void %163 SpecConstantWorkgroupSize %uint_0 %uint_1 %uint_2 ; operands are the constants 0, 1, 2 -- the same values as the SpecId decorations on %15/%16/%17
diff --git a/shaders-opencl-no-opt/comp/argument-buffer-readonly-writeonly-alias.argument.argument-tier-1.device-argument-buffer.invalid.comp b/shaders-opencl-no-opt/comp/argument-buffer-readonly-writeonly-alias.argument.argument-tier-1.device-argument-buffer.invalid.comp
new file mode 100644
index 000000000..f5f05a1ae
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/argument-buffer-readonly-writeonly-alias.argument.argument-tier-1.device-argument-buffer.invalid.comp
@@ -0,0 +1,9 @@
+#version 450
+#extension GL_EXT_nonuniform_qualifier : require
+// Two unsized arrays of buffer blocks are declared at the same binding: one readonly, one writeonly.
+layout (binding = 0) readonly buffer A {float data_a[];} a[];
+layout (binding = 0) writeonly buffer D {float data_d[];} d[];
+
+void main() {
+	d[gl_WorkGroupID.x].data_d[0] = a[gl_WorkGroupID.x].data_a[0]; // copy element 0 from the readonly block to the writeonly block, both indexed by workgroup x
+}
diff --git a/shaders-opencl-no-opt/comp/array-copy-threadgroup-memory.invalid.comp b/shaders-opencl-no-opt/comp/array-copy-threadgroup-memory.invalid.comp
new file mode 100644
index 000000000..081c39626
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/array-copy-threadgroup-memory.invalid.comp
@@ -0,0 +1,18 @@
+#version 450
+layout(local_size_x = 8) in;
+// Whole-array assignments between a function-local float[8] and rows of shared float[8][8] arrays.
+shared float shared_group[8][8];
+shared float shared_group_alt[8][8];
+
+void main()
+{
+	float blob[8];
+	for (int i = 0; i < 8; i++)
+		blob[i] = float(i);
+	shared_group[gl_LocalInvocationIndex] = blob; // local array -> shared row owned by this invocation
+
+	barrier();
+
+	float copied_blob[8] = shared_group[gl_LocalInvocationIndex ^ 1u]; // shared row (index with lowest bit flipped) -> local array
+	shared_group_alt[gl_LocalInvocationIndex] = shared_group[gl_LocalInvocationIndex]; // shared row -> shared row
+}
diff --git a/shaders-opencl-no-opt/comp/atomic-cmpxchg-packed-vector.invalid.comp b/shaders-opencl-no-opt/comp/atomic-cmpxchg-packed-vector.invalid.comp
new file mode 100644
index 000000000..f54fc5f52
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/atomic-cmpxchg-packed-vector.invalid.comp
@@ -0,0 +1,17 @@
+#version 460
+#extension GL_EXT_nonuniform_qualifier : enable
+#extension GL_EXT_scalar_block_layout : require
+// Compare-and-swap on a single component of a uvec3 element stored in a scalar-layout runtime array.
+layout(scalar, binding=1) restrict buffer AttData0 {
+    uvec3 att0[];
+};
+
+void main() {
+    uint newVal = 432;
+    uint prevVal = 0;
+    uint curVal = 0;
+    // Retry until the value returned by atomicCompSwap equals prevVal.
+    while ( (curVal = atomicCompSwap(att0[0].x, prevVal, newVal)) != prevVal)
+    {
+    }
+}; // NOTE(review): stray ';' after the function body -- left untouched; confirm it is intentional.
diff --git a/shaders-opencl-no-opt/comp/basic.invalid.comp b/shaders-opencl-no-opt/comp/basic.invalid.comp
new file mode 100644
index 000000000..abf100f49
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/basic.invalid.comp
@@ -0,0 +1,27 @@
+#version 450
+layout(local_size_x = 3, local_size_y = 3, local_size_z = 2) in;
+// Reads four ints from two uniform blocks in set 0 and writes results into an array of storage blocks in set 1.
+layout(set = 0, binding = 0) uniform Foo
+{
+	int a;
+	int b;
+};
+
+layout(set = 0, binding = 1) uniform Bar
+{
+	int c;
+	int d;
+};
+
+layout(set = 1, binding = 2) buffer Baz
+{
+	int e;
+	int f;
+} baz[3 * 3 * 2]; // 18 block instances: the same product as the local size declared above
+
+void main()
+{
+	uvec3 coords = gl_GlobalInvocationID;
+	baz[coords.x + coords.y + coords.z].e = a + c; // block instance selected by the sum of the three global id components
+	baz[coords.x + coords.y + coords.z].f = b * d;
+}
diff --git a/shaders-opencl-no-opt/comp/bda-atomics.invalid.comp b/shaders-opencl-no-opt/comp/bda-atomics.invalid.comp
new file mode 100644
index 000000000..eb7ae42fa
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/bda-atomics.invalid.comp
@@ -0,0 +1,34 @@
+#version 450
+#extension GL_EXT_buffer_reference : require
+#extension GL_EXT_buffer_reference_uvec2 : require
+// atomicAdd through buffer_reference pointers that come from four different sources.
+layout(local_size_x = 1) in;
+
+layout(buffer_reference) buffer Ptr
+{
+	uint i;
+	uvec2 i2;
+};
+
+layout(push_constant, std430) uniform Registers
+{
+	Ptr ptr;
+};
+
+layout(set = 0, binding = 0) uniform UBO
+{
+	Ptr ptr_ubo;
+};
+
+layout(set = 0, binding = 1) readonly buffer SSBO
+{
+	Ptr ptr_ssbo;
+};
+
+void main()
+{
+	atomicAdd(ptr.i, 10u); // pointer stored in the push-constant block
+	atomicAdd(ptr_ubo.i, 11u); // pointer stored in the uniform block
+	atomicAdd(ptr_ssbo.i, 12u); // pointer stored in the readonly storage block
+	atomicAdd(Ptr(ptr.i2).i, 13u); // pointer constructed from the uvec2 member read through ptr
+}
diff --git a/shaders-opencl-no-opt/comp/bda-load-std140-arrayed-pointer.invalid.comp b/shaders-opencl-no-opt/comp/bda-load-std140-arrayed-pointer.invalid.comp
new file mode 100644
index 000000000..82f069249
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/bda-load-std140-arrayed-pointer.invalid.comp
@@ -0,0 +1,20 @@
+#version 460
+#extension GL_EXT_buffer_reference_uvec2 : require
+// Loads a buffer reference out of a two-element array in a uniform block and read-modify-writes through it.
+layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
+
+layout(buffer_reference) buffer SSBO
+{
+	float data[];
+};
+
+layout(set = 0, binding = 0) uniform UBO
+{
+	SSBO ptrs[2]; // array of buffer references inside a uniform block
+};
+
+void main()
+{
+	SSBO s0 = ptrs[0]; // only element 0 is ever read
+	s0.data[gl_GlobalInvocationID.x] += 1.0;
+}
diff --git a/shaders-opencl-no-opt/comp/bda-nonwritable-glslang-workaround.comp b/shaders-opencl-no-opt/comp/bda-nonwritable-glslang-workaround.comp
new file mode 100644
index 000000000..61a6585fb
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/bda-nonwritable-glslang-workaround.comp
@@ -0,0 +1,22 @@
+#version 460
+#extension GL_EXT_buffer_reference_uvec2 : require
+// Stores through a buffer reference whose block is declared readonly (see the note on SSBO below).
+layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
+
+
+layout(push_constant) uniform Registers
+{
+	uvec2 bda; // address as a uvec2; converted to a buffer reference in main()
+};
+
+// glslang emits NonWritable on the member, but forgets to actually validate that,
+// meaning we cannot trust NonWritable on BDA.
+layout(buffer_reference) readonly buffer SSBO
+{
+	float data[];
+};
+
+void main()
+{
+	SSBO(bda).data[gl_GlobalInvocationID.x] = 0.0; // write through the readonly-declared reference
+}
diff --git a/shaders-opencl-no-opt/comp/bda-restrict-pointer-variable.invalid.comp b/shaders-opencl-no-opt/comp/bda-restrict-pointer-variable.invalid.comp
new file mode 100644
index 000000000..8f1d97861
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/bda-restrict-pointer-variable.invalid.comp
@@ -0,0 +1,18 @@
+#version 450
+#extension GL_EXT_buffer_reference : require
+// Copies a push-constant buffer reference into a restrict-qualified local and stores through it.
+layout(buffer_reference) buffer Ref
+{
+	vec4 v;
+};
+
+layout(push_constant) uniform Registers
+{
+	Ref foo;
+};
+
+void main()
+{
+	restrict Ref ref = foo; // restrict qualifier applied to a local buffer-reference variable
+	ref.v = vec4(1.0);
+}
diff --git a/shaders-opencl-no-opt/comp/bitcast-16bit-1.invalid.comp b/shaders-opencl-no-opt/comp/bitcast-16bit-1.invalid.comp
new file mode 100644
index 000000000..0c21cda30
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/bitcast-16bit-1.invalid.comp
@@ -0,0 +1,23 @@
+#version 450 core
+#extension GL_AMD_gpu_shader_half_float : require
+#extension GL_AMD_gpu_shader_int16 : require
+layout(local_size_x = 1) in;
+// Reinterprets and packs 16-bit vector data read from SSBO0 into 32-bit ints written to SSBO1.
+layout(binding = 0, std430) buffer SSBO0
+{
+   i16vec4 inputs[];
+};
+
+layout(binding = 1, std430) buffer SSBO1
+{
+   ivec4 outputs[];
+};
+
+void main()
+{
+   uint ident = gl_GlobalInvocationID.x;
+   f16vec2 a = int16BitsToFloat16(inputs[ident].xy); // i16vec2 bits viewed as f16vec2
+   outputs[ident].x = int(packFloat2x16(a + f16vec2(1, 1))); // two halfs packed into one 32-bit value
+   outputs[ident].y = packInt2x16(inputs[ident].zw); // two int16 packed into one int
+   outputs[ident].z = int(packUint2x16(u16vec2(inputs[ident].xy))); // .w of the output is never written
+}
diff --git a/shaders-opencl-no-opt/comp/bitcast-16bit-2.invalid.comp b/shaders-opencl-no-opt/comp/bitcast-16bit-2.invalid.comp
new file mode 100644
index 000000000..6bb662412
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/bitcast-16bit-2.invalid.comp
@@ -0,0 +1,26 @@
+#version 450 core
+#extension GL_AMD_gpu_shader_half_float : require
+#extension GL_AMD_gpu_shader_int16 : require
+layout(local_size_x = 1) in;
+// Unpacks 32-bit ints from SSBO0 into 16-bit pairs and combines them with bit-reinterpreted halfs from a UBO.
+layout(binding = 0, std430) buffer SSBO0
+{
+   ivec4 inputs[];
+};
+
+layout(binding = 1, std430) buffer SSBO1
+{
+   i16vec4 outputs[];
+};
+
+layout(binding = 2) uniform UBO
+{
+   f16vec4 const0;
+};
+
+void main()
+{
+   uint ident = gl_GlobalInvocationID.x;
+   outputs[ident].xy = unpackInt2x16(inputs[ident].x) + float16BitsToInt16(const0.xy); // signed path
+   outputs[ident].zw = i16vec2(unpackUint2x16(uint(inputs[ident].y)) - float16BitsToUint16(const0.zw)); // unsigned path, converted to i16vec2 for the store
+}
diff --git a/shaders-opencl-no-opt/comp/bitfield.comp b/shaders-opencl-no-opt/comp/bitfield.comp
new file mode 100644
index 000000000..0cac0b257
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/bitfield.comp
@@ -0,0 +1,23 @@
+#version 310 es
+// Calls each integer bit-manipulation built-in once with a signed and once with an unsigned operand; results stay in locals.
+void main()
+{
+   int signed_value = 0;
+   uint unsigned_value = 0u;
+
+   int s = bitfieldExtract(signed_value, 5, 20);
+   uint u = bitfieldExtract(unsigned_value, 6, 21);
+   s = bitfieldInsert(s, 40, 5, 4);
+   u = bitfieldInsert(u, 60u, 5, 4);
+
+   u = bitfieldReverse(u);
+   s = bitfieldReverse(s);
+
+   int v0 = bitCount(u); // results are assigned to int for both operand signednesses
+   int v1 = bitCount(s);
+
+   int v2 = findMSB(u);
+   int v3 = findMSB(s);
+   int v4 = findLSB(u);
+   int v5 = findLSB(s);
+}
diff --git a/shaders-opencl-no-opt/comp/buffer-device-address-from-pointer-complex-chain.comp b/shaders-opencl-no-opt/comp/buffer-device-address-from-pointer-complex-chain.comp
new file mode 100644
index 000000000..56c11bbb7
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/buffer-device-address-from-pointer-complex-chain.comp
@@ -0,0 +1,21 @@
+#version 460
+// Builds a buffer reference from a uvec2 push constant and stores through a reference -> array -> struct -> member chain.
+#extension GL_EXT_buffer_reference: enable
+#extension GL_EXT_buffer_reference_uvec2: enable
+
+struct S {
+    vec3 v;
+};
+
+layout(buffer_reference) buffer SSBO{
+    S s[];
+};
+
+layout(push_constant) uniform PC {
+    uvec2 ptr;
+} pc;
+
+void main(){
+    SSBO ssbo = SSBO(pc.ptr); // uvec2 -> buffer reference conversion
+    ssbo.s[0].v = vec3(1.0);
+}
diff --git a/shaders-opencl-no-opt/comp/extract-atomics-from-function.invalid.comp b/shaders-opencl-no-opt/comp/extract-atomics-from-function.invalid.comp
new file mode 100644
index 000000000..ce730ba64
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/extract-atomics-from-function.invalid.comp
@@ -0,0 +1,69 @@
+#version 460
+// Each atomic operation on the shared variable lives in its own helper function, reached from main() via foo().
+#extension GL_KHR_memory_scope_semantics : enable
+
+layout(local_size_x = 64) in;
+
+shared uint var; // the single atomic target; accessed directly as a global by every helper
+
+void testAdd()
+{
+  atomicAdd(var, 1);
+}
+
+void testMin()
+{
+  atomicMin(var, 2);
+}
+
+void testMax()
+{
+  atomicMax(var, 3);
+}
+
+void testAnd()
+{
+  atomicAnd(var, 4);
+}
+
+void testOr()
+{
+  atomicOr(var, 5);
+}
+
+void testXor()
+{
+  atomicXor(var, 6);
+}
+
+void testExchange()
+{
+  atomicExchange(var, 7);
+}
+
+void testCompSwap()
+{
+  atomicCompSwap(var, 8, 9);
+}
+
+void testStore()
+{
+  atomicStore(var, 10u, gl_ScopeDevice, 0, gl_SemanticsRelaxed); // explicit scope/semantics form enabled by GL_KHR_memory_scope_semantics
+}
+
+void foo() // NOTE(review): testAnd() is defined above but is not called here -- confirm the omission is intentional.
+{
+  testAdd();
+  testMin();
+  testMax();
+  testOr();
+  testXor();
+  testExchange();
+  testCompSwap();
+  testStore();
+}
+
+void main()
+{
+  foo();
+}
diff --git a/shaders-opencl-no-opt/comp/global-invocation-id-writable-ssbo-in-function.comp b/shaders-opencl-no-opt/comp/global-invocation-id-writable-ssbo-in-function.comp
new file mode 100644
index 000000000..2fe074df7
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/global-invocation-id-writable-ssbo-in-function.comp
@@ -0,0 +1,12 @@
+#version 450
+layout(set = 0, binding = 0) buffer myBlock { // writable storage block, also read from the helper function below
+    int a;
+    float b[1];
+} myStorage;
+float getB() { // reads the storage block and gl_GlobalInvocationID from outside main()
+    return myStorage.b[gl_GlobalInvocationID.x];
+}
+void main() {
+    myStorage.a = (myStorage.a + 1) % 256;
+    myStorage.b[gl_GlobalInvocationID.x] = mod((getB() + 0.02), 1.0); // same element is read via getB() and then written here
+}
diff --git a/shaders-opencl-no-opt/comp/glsl.std450.comp b/shaders-opencl-no-opt/comp/glsl.std450.comp
new file mode 100644
index 000000000..a17a82b82
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/glsl.std450.comp
@@ -0,0 +1,129 @@
+#version 450
+layout(local_size_x = 1) in;
+// Calls a long list of GLSL built-in functions, storing each result into the SSBO or a local.
+layout(binding = 0, std430) buffer SSBO
+{
+	float res;
+	int ires;
+	uint ures;
+
+	vec4 f32;
+	ivec4 s32;
+	uvec4 u32;
+
+	mat2 m2;
+	mat3 m3;
+	mat4 m4;
+};
+
+void main()
+{
+	float tmp;
+	vec2 v2;
+	vec3 v3;
+	vec4 v4;
+	int itmp;
+
+	res = round(f32.x); // scalar rounding, sign, trig, hyperbolic and exponential functions
+	res = roundEven(f32.x);
+	res = trunc(f32.x);
+	res = abs(f32.x);
+	ires = abs(s32.x);
+	res = sign(f32.x);
+	ires = sign(s32.x);
+	res = floor(f32.x);
+	res = ceil(f32.x);
+	res = fract(f32.x);
+	res = radians(f32.x);
+	res = degrees(f32.x);
+	res = sin(f32.x);
+	res = cos(f32.x);
+	res = tan(f32.x);
+	res = asin(f32.x);
+	res = acos(f32.x);
+	res = atan(f32.x);
+	res = sinh(f32.x);
+	res = cosh(f32.x);
+	res = tanh(f32.x);
+	res = asinh(f32.x);
+	res = acosh(f32.x);
+	res = atanh(f32.x);
+	res = atan(f32.x, f32.y);
+	res = pow(f32.x, f32.y);
+	res = exp(f32.x);
+	res = log(f32.x);
+	res = exp2(f32.x);
+	res = log2(f32.x);
+	res = sqrt(f32.x);
+	res = inversesqrt(f32.x);
+
+	res = length(f32.x); // geometric functions called with scalar arguments
+	res = distance(f32.x, f32.y);
+	res = normalize(f32.x);
+	res = faceforward(f32.x, f32.y, f32.z);
+	res = reflect(f32.x, f32.y);
+	res = refract(f32.x, f32.y, f32.z);
+
+	res = length(f32.xy); // the same geometric functions with vec2 arguments
+	res = distance(f32.xy, f32.zw);
+	v2 = normalize(f32.xy);
+	v2 = faceforward(f32.xy, f32.yz, f32.zw);
+	v2 = reflect(f32.xy, f32.zw);
+	v2 = refract(f32.xy, f32.yz, f32.w);
+
+	v3 = cross(f32.xyz, f32.yzw);
+
+	res = determinant(m2); // determinant and inverse for 2x2, 3x3 and 4x4 matrices
+	res = determinant(m3);
+	res = determinant(m4);
+	m2 = inverse(m2);
+	m3 = inverse(m3);
+	m4 = inverse(m4);
+
+	res = modf(f32.x, tmp);
+	// ModfStruct
+
+	res = min(f32.x, f32.y); // min/max/clamp for float, unsigned and signed operands
+	ures = min(u32.x, u32.y);
+	ires = min(s32.x, s32.y);
+	res = max(f32.x, f32.y);
+	ures = max(u32.x, u32.y);
+	ires = max(s32.x, s32.y);
+
+	res = clamp(f32.x, f32.y, f32.z);
+	ures = clamp(u32.x, u32.y, u32.z);
+	ires = clamp(s32.x, s32.y, s32.z);
+
+	res = mix(f32.x, f32.y, f32.z);
+	res = step(f32.x, f32.y);
+	res = smoothstep(f32.x, f32.y, f32.z);
+	res = fma(f32.x, f32.y, f32.z);
+
+	res = frexp(f32.x, itmp); // itmp is written here and then read by ldexp below
+	// FrexpStruct
+	res = ldexp(f32.x, itmp);
+
+	ures = packSnorm4x8(f32);
+	ures = packUnorm4x8(f32);
+	ures = packSnorm2x16(f32.xy);
+	ures = packUnorm2x16(f32.xy);
+	ures = packHalf2x16(f32.xy);
+	// packDouble2x32
+
+	v2 = unpackSnorm2x16(u32.x);
+	v2 = unpackUnorm2x16(u32.x);
+	v2 = unpackHalf2x16(u32.x);
+	v4 = unpackSnorm4x8(u32.x);
+	v4 = unpackUnorm4x8(u32.x);
+	// unpackDouble2x32
+
+	s32 = findLSB(s32); // vector forms; results are stored to the ivec4 for signed and unsigned inputs alike
+	s32 = findLSB(u32);
+	s32 = findMSB(s32);
+	s32 = findMSB(u32);
+
+	// interpolateAtSample
+	// interpolateAtOffset
+
+	// NMin, NMax, NClamp
+}
diff --git a/shaders-opencl-no-opt/comp/illegal-struct-name.asm.comp b/shaders-opencl-no-opt/comp/illegal-struct-name.asm.comp
new file mode 100644
index 000000000..f7a8787d3
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/illegal-struct-name.asm.comp
@@ -0,0 +1,62 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos Glslang Reference Front End; 8
+; Bound: 31
+; Schema: 0
+               OpCapability Shader
+          %1 = OpExtInstImport "GLSL.std.450"
+               OpMemoryModel Logical GLSL450
+               OpEntryPoint GLCompute %main "main"
+               OpExecutionMode %main LocalSize 1 1 1
+               OpSource GLSL 450
+               OpName %main "main"
+               OpName %Foo "Foo"
+               OpMemberName %Foo 0 "abs" ; presumably the point of the test: "abs" is also a built-in function name in the target language -- confirm
+               OpName %f "f"
+               OpName %Foo_0 "Foo" ; second struct type that carries the same debug name "Foo"
+               OpMemberName %Foo_0 0 "abs"
+               OpName %SSBO "SSBO"
+               OpMemberName %SSBO 0 "foo"
+               OpMemberName %SSBO 1 "foo2"
+               OpName %_ ""
+               OpName %linear "abs" ; the function-local int variable is also named "abs"
+               OpMemberDecorate %Foo_0 0 Offset 0
+               OpMemberDecorate %SSBO 0 Offset 0
+               OpMemberDecorate %SSBO 1 Offset 4
+               OpDecorate %SSBO BufferBlock
+               OpDecorate %_ DescriptorSet 0
+               OpDecorate %_ Binding 0
+       %void = OpTypeVoid
+          %3 = OpTypeFunction %void
+      %float = OpTypeFloat 32
+        %Foo = OpTypeStruct %float
+%_ptr_Function_Foo = OpTypePointer Function %Foo
+      %Foo_0 = OpTypeStruct %float
+       %SSBO = OpTypeStruct %Foo_0 %Foo_0
+%_ptr_Uniform_SSBO = OpTypePointer Uniform %SSBO
+          %_ = OpVariable %_ptr_Uniform_SSBO Uniform
+        %int = OpTypeInt 32 1
+      %int_0 = OpConstant %int 0
+%_ptr_Uniform_Foo_0 = OpTypePointer Uniform %Foo_0
+%_ptr_Function_float = OpTypePointer Function %float
+%_ptr_Function_int = OpTypePointer Function %int
+     %int_10 = OpConstant %int 10
+      %int_1 = OpConstant %int 1
+%_ptr_Uniform_float = OpTypePointer Uniform %float
+       %main = OpFunction %void None %3 ; copies SSBO member 0 into local %f, then writes %f into SSBO member 1
+          %5 = OpLabel
+          %f = OpVariable %_ptr_Function_Foo Function
+     %linear = OpVariable %_ptr_Function_int Function
+         %17 = OpAccessChain %_ptr_Uniform_Foo_0 %_ %int_0
+         %18 = OpLoad %Foo_0 %17
+         %19 = OpCompositeExtract %float %18 0
+         %21 = OpAccessChain %_ptr_Function_float %f %int_0
+               OpStore %21 %19
+               OpStore %linear %int_10
+         %26 = OpLoad %Foo %f
+         %27 = OpAccessChain %_ptr_Uniform_Foo_0 %_ %int_1
+         %28 = OpCompositeExtract %float %26 0
+         %30 = OpAccessChain %_ptr_Uniform_float %27 %int_0
+               OpStore %30 %28
+               OpReturn
+               OpFunctionEnd
diff --git a/shaders-opencl-no-opt/comp/implicit-integer-promotion.fp16.invalid.comp b/shaders-opencl-no-opt/comp/implicit-integer-promotion.fp16.invalid.comp
new file mode 100644
index 000000000..a0ee95b3a
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/implicit-integer-promotion.fp16.invalid.comp
@@ -0,0 +1,85 @@
+#version 450
+#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
+// Applies every arithmetic, shift and bitwise operator to 16-bit integer operands and bitcasts each result to float16.
+layout(set = 0, binding = 0) buffer BUF0
+{
+	f16vec2 f16s;
+	u16vec2 u16;
+	i16vec2 i16;
+	u16vec4 u16s;
+	i16vec4 i16s;
+	float16_t f16;
+};
+
+void test_i16() // signed scalar operands (i16.x, i16.y), accumulated into f16
+{
+	f16 += int16BitsToFloat16(i16.x + i16.y);
+	f16 += int16BitsToFloat16(i16.x - i16.y);
+	f16 += int16BitsToFloat16(i16.x * i16.y);
+	f16 += int16BitsToFloat16(i16.x / i16.y);
+	f16 += int16BitsToFloat16(i16.x % i16.y);
+	f16 += int16BitsToFloat16(i16.x << i16.y);
+	f16 += int16BitsToFloat16(i16.x >> i16.y);
+	f16 += int16BitsToFloat16(~i16.x);
+	f16 += int16BitsToFloat16(-i16.x);
+	f16 += int16BitsToFloat16(i16.x ^ i16.y);
+	f16 += int16BitsToFloat16(i16.x & i16.y);
+	f16 += int16BitsToFloat16(i16.x | i16.y);
+}
+
+void test_u16() // unsigned scalar operands (u16.x, u16.y), accumulated into f16
+{
+	f16 += uint16BitsToFloat16(u16.x + u16.y);
+	f16 += uint16BitsToFloat16(u16.x - u16.y);
+	f16 += uint16BitsToFloat16(u16.x * u16.y);
+	f16 += uint16BitsToFloat16(u16.x / u16.y);
+	f16 += uint16BitsToFloat16(u16.x % u16.y);
+	f16 += uint16BitsToFloat16(u16.x << u16.y);
+	f16 += uint16BitsToFloat16(u16.x >> u16.y);
+	f16 += uint16BitsToFloat16(~u16.x);
+	f16 += uint16BitsToFloat16(-u16.x);
+	f16 += uint16BitsToFloat16(u16.x ^ u16.y);
+	f16 += uint16BitsToFloat16(u16.x & u16.y);
+	f16 += uint16BitsToFloat16(u16.x | u16.y);
+}
+
+void test_u16s() // unsigned two-component vector operands (u16s.xy, u16s.zw), accumulated into f16s
+{
+	f16s += uint16BitsToFloat16(u16s.xy + u16s.zw);
+	f16s += uint16BitsToFloat16(u16s.xy - u16s.zw);
+	f16s += uint16BitsToFloat16(u16s.xy * u16s.zw);
+	f16s += uint16BitsToFloat16(u16s.xy / u16s.zw);
+	f16s += uint16BitsToFloat16(u16s.xy % u16s.zw);
+	f16s += uint16BitsToFloat16(u16s.xy << u16s.zw);
+	f16s += uint16BitsToFloat16(u16s.xy >> u16s.zw);
+	f16s += uint16BitsToFloat16(~u16s.xy);
+	f16s += uint16BitsToFloat16(-u16s.xy);
+	f16s += uint16BitsToFloat16(u16s.xy ^ u16s.zw);
+	f16s += uint16BitsToFloat16(u16s.xy & u16s.zw);
+	f16s += uint16BitsToFloat16(u16s.xy | u16s.zw);
+}
+
+void test_i16s() // signed two-component vector operands (i16s.xy, i16s.zw), accumulated into f16s
+{
+	f16s += int16BitsToFloat16(i16s.xy + i16s.zw);
+	f16s += int16BitsToFloat16(i16s.xy - i16s.zw);
+	f16s += int16BitsToFloat16(i16s.xy * i16s.zw);
+	f16s += int16BitsToFloat16(i16s.xy / i16s.zw);
+	f16s += int16BitsToFloat16(i16s.xy % i16s.zw);
+	f16s += int16BitsToFloat16(i16s.xy << i16s.zw);
+	f16s += int16BitsToFloat16(i16s.xy >> i16s.zw);
+	f16s += int16BitsToFloat16(~i16s.xy);
+	f16s += int16BitsToFloat16(-i16s.xy);
+	f16s += int16BitsToFloat16(i16s.xy ^ i16s.zw);
+	f16s += int16BitsToFloat16(i16s.xy & i16s.zw);
+	f16s += int16BitsToFloat16(i16s.xy | i16s.zw);
+}
+
+void main()
+{
+	test_u16();
+	test_i16();
+	test_u16s();
+	test_i16s();
+}
diff --git a/shaders-opencl-no-opt/comp/int16min-literal.fp16.invalid.comp b/shaders-opencl-no-opt/comp/int16min-literal.fp16.invalid.comp
new file mode 100644
index 000000000..c1b345266
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/int16min-literal.fp16.invalid.comp
@@ -0,0 +1,22 @@
+#version 450
+#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
+// XORs the bits of a float16 with the 16-bit literal 0x8000s and writes the result back as a float16.
+layout(local_size_x = 1) in;
+
+layout(set = 0, binding = 1) buffer SSBO
+{
+	float16_t a;
+};
+
+layout(set = 0, binding = 0) uniform UBO
+{
+	float16_t b;
+};
+
+void main()
+{
+	int16_t v = float16BitsToInt16(b);
+	v ^= 0x8000s; // only the top bit of the 16-bit value is set in this literal
+	a = int16BitsToFloat16(v);
+}
diff --git a/shaders-opencl-no-opt/comp/int64.invalid.comp b/shaders-opencl-no-opt/comp/int64.invalid.comp
new file mode 100644
index 000000000..965bed4ae
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/int64.invalid.comp
@@ -0,0 +1,65 @@
+#version 450
+#extension GL_ARB_gpu_shader_int64 : require
+layout(local_size_x = 1) in;
+// 64-bit integer scalar and vector arithmetic on function-local structs; two 32-bit results go to the SSBO.
+struct M0
+{
+	int64_t v;
+	i64vec2 b[2];
+	uint64_t c;
+	uint64_t d[5];
+};
+
+struct SSBO0_Type
+{
+	i64vec4 a;
+	M0 m0;
+};
+
+struct SSBO1_Type
+{
+	u64vec4 b;
+	M0 m0;
+};
+
+struct SSBO2_Type
+{
+	int64_t a[4];
+	i64vec2 b[4];
+};
+
+struct SSBO3_Type
+{
+	int64_t a[4];
+	i64vec2 b[4];
+};
+
+layout(set = 0, binding = 0, std430) buffer SSBO
+{
+	int s32;
+	uint u32;
+};
+
+void main()
+{
+	SSBO0_Type ssbo_0; // despite the type names, these four are plain function-local variables, not buffer blocks
+	SSBO1_Type ssbo_1;
+	SSBO2_Type ssbo_2;
+	SSBO3_Type ssbo_3;
+
+	ssbo_0.a += i64vec4(10, 20, 30, 40);
+	ssbo_1.b += u64vec4(999999999999999999ul, 8888888888888888ul, 77777777777777777ul, 6666666666666666ul);
+	ssbo_0.a += 20;
+	ssbo_0.a = abs(ssbo_0.a + i64vec4(ssbo_1.b));
+
+	ssbo_0.a++;
+	ssbo_1.b++;
+	ssbo_0.a--;
+	ssbo_1.b--;
+
+	ssbo_2.a[0] += 1l;
+	ssbo_3.a[0] += 2l;
+
+	s32 = int(ssbo_0.a.x + ssbo_1.b.y + ssbo_2.a[1] + ssbo_3.a[2]); // mixed signed/unsigned 64-bit sum narrowed to int
+	u32 = uint(ssbo_0.a.y + ssbo_1.b.z + ssbo_2.a[0] + ssbo_3.a[1]); // the only two values that reach the buffer
+}
diff --git a/shaders-opencl-no-opt/comp/int64min-literal.comp b/shaders-opencl-no-opt/comp/int64min-literal.comp
new file mode 100644
index 000000000..792960544
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/int64min-literal.comp
@@ -0,0 +1,21 @@
+#version 450
+#extension GL_ARB_gpu_shader_int64 : require
+// XORs a 64-bit integer with the literal 0x8000000000000000L and stores the narrowed result as float bits.
+layout(local_size_x = 1) in;
+
+layout(set = 0, binding = 1) buffer SSBO
+{
+	float a;
+};
+
+layout(set = 0, binding = 0) uniform UBO
+{
+	float b;
+};
+
+void main()
+{
+	int64_t v = int64_t(floatBitsToInt(b));
+	v ^= 0x8000000000000000L; // only the top bit of the 64-bit value is set in this literal
+	a = intBitsToFloat(int(v)); // v is narrowed to int before being reinterpreted as a float
+}
diff --git a/shaders-opencl-no-opt/comp/integer-dot-product.comp b/shaders-opencl-no-opt/comp/integer-dot-product.comp
new file mode 100644
index 000000000..8b6630922
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/integer-dot-product.comp
@@ -0,0 +1,114 @@
+#version 450
+#extension GL_EXT_shader_8bit_storage : require
+#extension GL_EXT_shader_16bit_storage : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+#extension GL_EXT_spirv_intrinsics : require
+
+layout(local_size_x = 1) in;
+
+layout(std430, binding = 0) buffer InOut { // NOTE(review): `comp` is never referenced by main() in this shader.
+    uvec4 x;
+    uvec4 y;
+    int result;
+} comp;
+
+layout(std430, binding = 1) buffer InOut2 { // Packed 32-bit operands for the packed dot-product calls in main().
+    uint x;
+    uint y;
+    uint result;
+} comp2;
+
+layout(std430, binding = 1) buffer InOut3 { // NOTE(review): reuses binding = 1, same as InOut2 — confirm this aliasing is intended.
+    u16vec4 x;
+    u16vec4 y;
+    int acc;
+    int result;
+} comp3;
+
+// Signed integer dot (id = 4450, OpSDot in SPIR-V) declared with unsigned 16-bit vector inputs, once per result signedness.
+spirv_instruction (extensions = ["SPV_KHR_integer_dot_product"], capabilities = [6019], id = 4450)
+int sdot_int_result(u16vec4 x, u16vec4 y);
+spirv_instruction (extensions = ["SPV_KHR_integer_dot_product"], capabilities = [6019], id = 4450)
+uint sdot_uint_result(u16vec4 x, u16vec4 y);
+
+// Unsigned integer dot (id = 4451, OpUDot) declared with unsigned inputs. Only unsigned result is allowed in SPIR-V.
+spirv_instruction (extensions = ["SPV_KHR_integer_dot_product"], capabilities = [6019], id = 4451)
+uint udot_uint_result(u16vec4 x, u16vec4 y);
+
+// Mixed-signedness integer dot (id = 4452, OpSUDot) declared with unsigned inputs, once per result signedness.
+spirv_instruction (extensions = ["SPV_KHR_integer_dot_product"], capabilities = [6019], id = 4452)
+int sudot_int_result(u16vec4 x, u16vec4 y);
+spirv_instruction (extensions = ["SPV_KHR_integer_dot_product"], capabilities = [6019], id = 4452)
+uint sudot_uint_result(u16vec4 x, u16vec4 y);
+
+// Signed packed dot product (id = 4450 again; operands are packed into uints) with different output widths.
+spirv_instruction (extensions = ["SPV_KHR_integer_dot_product"], capabilities = [6019], id = 4450)
+uint8_t spdot_to_8(uint x, uint y, spirv_literal uint packedFormat);
+spirv_instruction (extensions = ["SPV_KHR_integer_dot_product"], capabilities = [6019], id = 4450)
+uint16_t spdot_to_16(uint x, uint y, spirv_literal uint packedFormat);
+spirv_instruction (extensions = ["SPV_KHR_integer_dot_product"], capabilities = [6019], id = 4450)
+uint spdot_to_32(uint x, uint y, spirv_literal uint packedFormat);
+spirv_instruction (extensions = ["SPV_KHR_integer_dot_product"], capabilities = [6019], id = 4450)
+int spdot_to_i32(uint x, uint y, spirv_literal uint packedFormat);
+
+// Unsigned packed dot product (id = 4451) with different output widths; no signed-result variant is declared.
+spirv_instruction (extensions = ["SPV_KHR_integer_dot_product"], capabilities = [6019], id = 4451)
+uint8_t updot_to_8(uint x, uint y, spirv_literal uint packedFormat);
+spirv_instruction (extensions = ["SPV_KHR_integer_dot_product"], capabilities = [6019], id = 4451)
+uint16_t updot_to_16(uint x, uint y, spirv_literal uint packedFormat);
+spirv_instruction (extensions = ["SPV_KHR_integer_dot_product"], capabilities = [6019], id = 4451)
+uint updot_to_32(uint x, uint y, spirv_literal uint packedFormat);
+
+// Mixed packed dot product (id = 4452) with different output widths.
+spirv_instruction (extensions = ["SPV_KHR_integer_dot_product"], capabilities = [6019], id = 4452)
+uint8_t supdot_to_8(uint x, uint y, spirv_literal uint packedFormat);
+spirv_instruction (extensions = ["SPV_KHR_integer_dot_product"], capabilities = [6019], id = 4452)
+uint16_t supdot_to_16(uint x, uint y, spirv_literal uint packedFormat);
+spirv_instruction (extensions = ["SPV_KHR_integer_dot_product"], capabilities = [6019], id = 4452)
+uint supdot_to_32(uint x, uint y, spirv_literal uint packedFormat);
+spirv_instruction (extensions = ["SPV_KHR_integer_dot_product"], capabilities = [6019], id = 4452)
+int supdot_to_i32(uint x, uint y, spirv_literal uint packedFormat);
+
+// SDotAccSat (id = 4453) with unsigned input and result type
+spirv_instruction (extensions = ["SPV_KHR_integer_dot_product"], capabilities = [6019], id = 4453)
+int sdotaddsat_int_result(u16vec4 x, u16vec4 y, int acc);
+spirv_instruction (extensions = ["SPV_KHR_integer_dot_product"], capabilities = [6019], id = 4453)
+uint sdotaddsat_uint_result(u16vec4 x, u16vec4 y, int acc);
+
+// UDotAccSat (id = 4454). Result type must be unsigned in SPIR-V.
+spirv_instruction (extensions = ["SPV_KHR_integer_dot_product"], capabilities = [6019], id = 4454)
+uint udotaddsat(u16vec4 x, u16vec4 y, int acc); // NOTE(review): acc is declared int while the result is uint — confirm intended.
+
+// SUDotAccSat (id = 4455), once per result signedness.
+spirv_instruction (extensions = ["SPV_KHR_integer_dot_product"], capabilities = [6019], id = 4455)
+int sudotaddsat_int_result(u16vec4 x, u16vec4 y, int acc);
+spirv_instruction (extensions = ["SPV_KHR_integer_dot_product"], capabilities = [6019], id = 4455)
+uint sudotaddsat_uint_result(u16vec4 x, u16vec4 y, int acc);
+
+void main() { // Calls every declared dot-product wrapper once; results stay in locals and nothing is written back to a buffer.
+    int sdot_int = sdot_int_result(comp3.x, comp3.y); // vector forms take the 16-bit vectors from comp3
+    uint sdot_uint = sdot_uint_result(comp3.x, comp3.y);
+    uint udot_uint = udot_uint_result(comp3.x, comp3.y);
+    int sudot_int = sudot_int_result(comp3.x, comp3.y);
+    uint sudot_uint = sudot_uint_result(comp3.x, comp3.y);
+
+    uint8_t spdot8 = spdot_to_8(comp2.x, comp2.y, 0x0); // PackedVectorFormat4x8Bit
+    uint16_t spdot16 = spdot_to_16(comp2.x, comp2.y, 0x0); // PackedVectorFormat4x8Bit
+    uint spdot32 = spdot_to_32(comp2.x, comp2.y, 0x0); // PackedVectorFormat4x8Bit
+    int spdoti32 = spdot_to_i32(comp2.x, comp2.y, 0x0); // PackedVectorFormat4x8Bit
+
+    uint8_t updot8 = updot_to_8(comp2.x, comp2.y, 0x0); // PackedVectorFormat4x8Bit
+    uint16_t updot16 = updot_to_16(comp2.x, comp2.y, 0x0); // PackedVectorFormat4x8Bit
+    uint updot32 = updot_to_32(comp2.x, comp2.y, 0x0); // PackedVectorFormat4x8Bit
+
+    uint8_t supdot8 = supdot_to_8(comp2.x, comp2.y, 0x0); // PackedVectorFormat4x8Bit
+    uint16_t supdot16 = supdot_to_16(comp2.x, comp2.y, 0x0); // PackedVectorFormat4x8Bit
+    uint supdot32 = supdot_to_32(comp2.x, comp2.y, 0x0); // PackedVectorFormat4x8Bit
+    int supdoti32 = supdot_to_i32(comp2.x, comp2.y, 0x0); // PackedVectorFormat4x8Bit
+
+    int sdotaddsat_int = sdotaddsat_int_result(comp3.x, comp3.y, comp3.acc); // accumulating forms add comp3.acc as the third operand
+    uint sdotaddsat_uint = sdotaddsat_uint_result(comp3.x, comp3.y, comp3.acc);
+    uint udotaddsat_uint = udotaddsat(comp3.x, comp3.y, comp3.acc);
+    int sudotaddsat_int = sudotaddsat_int_result(comp3.x, comp3.y, comp3.acc);
+    uint sudotaddsat_uint = sudotaddsat_uint_result(comp3.x, comp3.y, comp3.acc);
+}
diff --git a/shaders-opencl-no-opt/comp/intmin-literal.comp b/shaders-opencl-no-opt/comp/intmin-literal.comp
new file mode 100644
index 000000000..ee35cedab
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/intmin-literal.comp
@@ -0,0 +1,18 @@
+#version 450
+
+layout(local_size_x = 1) in;
+
+layout(set = 0, binding = 1) buffer SSBO // Output: main() writes `a`.
+{
+	float a;
+};
+
+layout(set = 0, binding = 0) uniform UBO // Input: main() reads `b`.
+{
+	float b;
+};
+
+void main() // Stores b with bit 31 of its bit pattern toggled.
+{
+	a = intBitsToFloat(floatBitsToInt(b) ^ 0x80000000); // 0x80000000 has only bit 31 set; as a 32-bit int this is the "intmin" of the file name
+}
diff --git a/shaders-opencl-no-opt/comp/loop.comp b/shaders-opencl-no-opt/comp/loop.comp
new file mode 100644
index 000000000..6d6c32424
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/loop.comp
@@ -0,0 +1,98 @@
+#version 310 es
+layout(local_size_x = 1) in;
+
+layout(std430, binding = 0) readonly buffer SSBO // Input: a matrix followed by a runtime-sized vec4 array.
+{
+    mat4 mvp;
+    vec4 in_data[];
+};
+
+layout(std430, binding = 1) writeonly buffer SSBO2 // Output: runtime-sized vec4 array written by main().
+{
+    vec4 out_data[];
+};
+
+void main() // Chains do/while, switch-with-loops, while, nested for and for(;;) forms around one in_data element.
+{
+    uint ident = gl_GlobalInvocationID.x; // index used for both the load and the final store
+    vec4 idat = in_data[ident];
+
+    int k = 0;
+    uint i = 0u;
+
+    if (idat.y == 20.0)
+    {
+        do // body runs at least once before i is compared with ident
+        {
+            k = k * 2;
+            i++;
+        } while (i < ident);
+    }
+
+    switch (k) // both arms contain a for(;;) that exits through an inner break
+    {
+        case 10:
+            for (;;)
+            {
+                i++;
+                if (i > 10u)
+                    break; // leaves the for(;;), not the switch
+            }
+            break;
+
+        default:
+            for (;;)
+            {
+               i += 2u;
+               if (i > 20u)
+                  break;
+            }
+            break;
+    }
+
+    while (k < 10)
+    {
+        idat *= 2.0;
+        k++;
+    }
+
+    for (uint i = 0u; i < 16u; i++, k++) // this i shadows the outer i; k is advanced in the same increment clause
+        for (uint j = 0u; j < 30u; j++)
+            idat = mvp * idat;
+
+    k = 0;
+    for (;;) // NOTE(review): contains no break or return, so this loop has no exit as written
+    {
+        k++;
+        if (k > 10)
+        {
+            k += 2;
+        }
+        else
+        {
+            k += 3;
+            continue; // skips the k += 10 below
+        }
+
+        k += 10;
+    }
+
+    k = 0;
+    do
+    {
+        k++;
+    } while (k > 10);
+
+    int l = 0;
+    for (;; l++) // NOTE(review): no condition and no break, so this loop has no exit either
+    {
+        if (l == 5)
+        {
+            continue; // l++ still runs before the next iteration
+        }
+        
+        idat += 1.0;
+    }
+    out_data[ident] = idat; // only reachable if the loops above terminate
+}
+
diff --git a/shaders-opencl-no-opt/comp/read-only-coherent-image.invalid.comp b/shaders-opencl-no-opt/comp/read-only-coherent-image.invalid.comp
new file mode 100644
index 000000000..39d4c38a8
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/read-only-coherent-image.invalid.comp
@@ -0,0 +1,17 @@
+#version 450
+#extension GL_EXT_buffer_reference : require
+layout(local_size_x = 1) in;
+
+layout(set = 0, binding = 1, r32ui) volatile uniform uimage2D img; // NOTE(review): qualified `volatile` only, while the file name says read-only coherent — confirm intended qualifiers.
+
+layout(set = 0, binding = 0) buffer SSBO // Output: main() writes `val`.
+{
+	uint val;
+};
+
+void main() // Loads one texel from img and stores its x component to the SSBO.
+{
+	//imageAtomicAdd(img, ivec2(10), 40);
+	val = imageLoad(img, ivec2(10)).x; // ivec2(10) is the texel coordinate (10, 10)
+}
+
diff --git a/shaders-opencl-no-opt/comp/return.comp b/shaders-opencl-no-opt/comp/return.comp
new file mode 100644
index 000000000..617f43718
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/return.comp
@@ -0,0 +1,33 @@
+#version 310 es
+layout(local_size_x = 1) in;
+
+layout(std430, binding = 1) writeonly buffer SSBO2 // Output: runtime-sized vec4 array written by main().
+{
+    vec4 out_data[];
+};
+
+void main() // Covers return statements placed in an else-if branch and inside a loop body.
+{
+    uint ident = gl_GlobalInvocationID.x;
+
+    if (ident == 2u)
+    {
+        out_data[ident] = vec4(20.0); // falls through to the loop below
+    }
+    else if (ident == 4u)
+    {
+        out_data[ident] = vec4(10.0);
+        return; // early exit straight from the branch
+    }
+
+    for (int i = 0; i < 20; i++)
+    {
+        if (i == 10)
+            break;
+
+        return; // NOTE(review): i starts at 0, so this return is hit on the first iteration and the break is never taken
+    }
+
+    out_data[ident] = vec4(10.0); // NOTE(review): not reached at run time because the loop above always returns
+}
+
diff --git a/shaders-opencl-no-opt/comp/simple-bindless-ssbo.argument.argument-tier-1.device-argument-buffer.invalid.comp b/shaders-opencl-no-opt/comp/simple-bindless-ssbo.argument.argument-tier-1.device-argument-buffer.invalid.comp
new file mode 100644
index 000000000..0db56342c
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/simple-bindless-ssbo.argument.argument-tier-1.device-argument-buffer.invalid.comp
@@ -0,0 +1,13 @@
+#version 450
+#extension GL_EXT_nonuniform_qualifier : require
+layout(local_size_x = 1) in;
+
+layout(set = 0, binding = 0) buffer SSBO // Declared as an unsized array of buffer blocks (`ssbos[]`).
+{
+	vec4 a;
+} ssbos[];
+
+void main() // Adds 2.0 to every component of `a` in the block selected by the workgroup index.
+{
+	ssbos[gl_WorkGroupID.x].a += 2.0; // block index is dynamic; it is not wrapped in nonuniformEXT here
+}
diff --git a/shaders-opencl-no-opt/comp/std140-array-load-composite-construct.comp b/shaders-opencl-no-opt/comp/std140-array-load-composite-construct.comp
new file mode 100644
index 000000000..af1c47b32
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/std140-array-load-composite-construct.comp
@@ -0,0 +1,13 @@
+#version 450
+layout(local_size_x = 1) in;
+
+layout(std140, binding = 0) buffer SSBO // std140 block holding a float array next to a vec4 array.
+{
+	float a[16]; // presumably padded to a 16-byte element stride under std140 — confirm against the layout rules
+	vec4 b[16];
+};
+
+void main() // Splats one float loaded from a[] into a vec4 and stores it at the same index of b[].
+{
+	b[gl_GlobalInvocationID.x] = vec4(a[gl_GlobalInvocationID.x]); // no bounds check against the 16-element arrays
+}
diff --git a/shaders-opencl-no-opt/comp/struct-packing-scalar.nocompat.invalid.vk.comp b/shaders-opencl-no-opt/comp/struct-packing-scalar.nocompat.invalid.vk.comp
new file mode 100644
index 000000000..47d88912f
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/struct-packing-scalar.nocompat.invalid.vk.comp
@@ -0,0 +1,100 @@
+#version 310 es
+#extension GL_EXT_scalar_block_layout : require
+
+layout(local_size_x = 1) in;
+
+struct S0 // vec2 array followed by a float
+{
+    vec2 a[1];
+    float b;
+};
+
+struct S1 // vec3 followed by a float
+{
+    vec3 a;
+    float b;
+};
+
+struct S2 // vec3 array followed by a float
+{
+    vec3 a[1];
+    float b;
+};
+
+struct S3 // vec2 followed by a float
+{
+    vec2 a;
+    float b;
+};
+
+struct S4 // only referenced by the commented-out member at the end of Content
+{
+    vec2 c;
+};
+
+struct Content // Aggregates S0..S3 both as one-element arrays and as plain members.
+{
+    S0 m0s[1];
+    S1 m1s[1];
+    S2 m2s[1];
+    S0 m0;
+    S1 m1;
+    S2 m2;
+    S3 m3;
+    float m4;
+
+    // glslang currently seems to miscompile this into an ArrayStride of 16 even in scalar layout.
+    //S4 m3s[8];
+};
+
+layout(binding = 2, scalar) restrict buffer SSBO2 // Small scalar-layout block used as the peer for matrix copies in main().
+{
+    float m0;
+    mat2 m1;
+    layout(row_major) mat3x2 m2;
+} ssbo_scalar2;
+
+layout(binding = 1, scalar) restrict buffer SSBO1 // Scalar-layout block; member list matches SSBO0 below.
+{
+    Content content;
+    Content content1[2];
+    Content content2;
+
+    layout(column_major) mat2 m0;
+    layout(column_major) mat2 m1;
+    layout(column_major) mat2x3 m2[4];
+    layout(column_major) mat3x2 m3;
+    layout(row_major) mat2 m4;
+    layout(row_major) mat2 m5[9];
+    layout(row_major) mat2x3 m6[4][2];
+    layout(row_major) mat3x2 m7;
+    float array[];
+} ssbo_scalar;
+
+layout(binding = 0, std140) restrict buffer SSBO0 // std140 block with the same members as SSBO1; main() reads `content` from it.
+{
+    Content content;
+    Content content1[2];
+    Content content2;
+
+    layout(column_major) mat2 m0;
+    layout(column_major) mat2 m1;
+    layout(column_major) mat2x3 m2[4];
+    layout(column_major) mat3x2 m3;
+    layout(row_major) mat2 m4;
+    layout(row_major) mat2 m5[9];
+    layout(row_major) mat2x3 m6[4][2];
+    layout(row_major) mat3x2 m7;
+
+    float array[];
+} ssbo_140;
+
+void main() // Copies a struct and several matrices between the std140 block and the two scalar-layout blocks.
+{
+    ssbo_scalar.content = ssbo_140.content; // whole-struct copy from the std140 block into the scalar block
+    ssbo_scalar.content.m1.a = ssbo_scalar.m2[1] * ssbo_scalar.content.m0.a[0];	// test packed matrix access
+    ssbo_scalar.m0 = ssbo_scalar2.m1; // mat2 copy between the two scalar blocks
+    ssbo_scalar2.m1 = ssbo_scalar.m4; // source member is declared row_major
+    ssbo_scalar2.m2 = ssbo_scalar.m3; // row_major destination, column_major source
+}
+
diff --git a/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.opencl12.emulate-subgroup.invalid.comp b/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.opencl12.emulate-subgroup.invalid.comp
new file mode 100644
index 000000000..8a0be2269
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.opencl12.emulate-subgroup.invalid.comp
@@ -0,0 +1,25 @@
+#version 450
+#extension GL_KHR_shader_subgroup_basic : require
+layout(local_size_x = 1) in;
+
+layout(std430, binding = 0) buffer SSBO // Output: main() repeatedly overwrites `FragColor`.
+{
+	float FragColor;
+};
+
+// Reduced test for emulated functionality.
+
+void main() // Touches only the GL_KHR_shader_subgroup_basic builtins, barriers and subgroupElect().
+{
+	// basic
+	FragColor = float(gl_NumSubgroups); // each store here is overwritten by the next one
+	FragColor = float(gl_SubgroupID);
+	FragColor = float(gl_SubgroupSize);
+	FragColor = float(gl_SubgroupInvocationID); // last store; the only value left in FragColor
+	subgroupBarrier();
+	subgroupMemoryBarrier();
+	subgroupMemoryBarrierBuffer();
+	subgroupMemoryBarrierShared();
+	subgroupMemoryBarrierImage();
+	bool elected = subgroupElect(); // result is not used afterwards
+}
diff --git a/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.subgroup.fixed-subgroup.invalid.comp b/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.subgroup.fixed-subgroup.invalid.comp
new file mode 100644
index 000000000..c8172fd95
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.subgroup.fixed-subgroup.invalid.comp
@@ -0,0 +1,211 @@
+#version 450
+#extension GL_KHR_shader_subgroup_basic : require
+#extension GL_KHR_shader_subgroup_ballot : require
+#extension GL_KHR_shader_subgroup_vote : require
+#extension GL_KHR_shader_subgroup_shuffle : require
+#extension GL_KHR_shader_subgroup_shuffle_relative : require
+#extension GL_KHR_shader_subgroup_arithmetic : require
+#extension GL_KHR_shader_subgroup_clustered : require
+#extension GL_KHR_shader_subgroup_quad : require
+#extension GL_KHR_shader_subgroup_rotate : require
+layout(local_size_x = 1) in;
+
+layout(std430, binding = 0) buffer SSBO // Output: main() repeatedly overwrites `FragColor`.
+{
+	float FragColor;
+};
+
+void doClusteredRotate() // Calls subgroupClusteredRotate once with a uint and once with a bool; invoked once from main().
+{
+	uint rotated_clustered = subgroupClusteredRotate(20u, 4u, 8u); // presumably (value, delta, clusterSize) — confirm against GL_KHR_shader_subgroup_rotate
+	bool rotated_clustered_bool = subgroupClusteredRotate(false, 4u, 8u); // both results are unused locals
+}
+
+void main() // Exercises the subgroup builtins family by family (see the section comments); most results only live in locals.
+{
+	// basic
+	FragColor = float(gl_NumSubgroups); // each FragColor store in this function is overwritten by the next one
+	FragColor = float(gl_SubgroupID);
+	FragColor = float(gl_SubgroupSize);
+	FragColor = float(gl_SubgroupInvocationID);
+	subgroupBarrier();
+	subgroupMemoryBarrier();
+	subgroupMemoryBarrierBuffer();
+	subgroupMemoryBarrierShared();
+	subgroupMemoryBarrierImage();
+	bool elected = subgroupElect();
+
+	// ballot
+	FragColor = float(gl_SubgroupEqMask);
+	FragColor = float(gl_SubgroupGeMask);
+	FragColor = float(gl_SubgroupGtMask);
+	FragColor = float(gl_SubgroupLeMask);
+	FragColor = float(gl_SubgroupLtMask);
+	vec4 broadcasted = subgroupBroadcast(vec4(10.0), 8u);
+	bvec2 broadcasted_bool = subgroupBroadcast(bvec2(true), 8u);
+	vec3 first = subgroupBroadcastFirst(vec3(20.0));
+	bvec4 first_bool = subgroupBroadcastFirst(bvec4(false));
+	uvec4 ballot_value = subgroupBallot(true); // reused as the operand of the ballot queries and bitwise reductions below
+	bool inverse_ballot_value = subgroupInverseBallot(ballot_value);
+	bool bit_extracted = subgroupBallotBitExtract(uvec4(10u), 8u);
+	uint bit_count = subgroupBallotBitCount(ballot_value);
+	uint inclusive_bit_count = subgroupBallotInclusiveBitCount(ballot_value);
+	uint exclusive_bit_count = subgroupBallotExclusiveBitCount(ballot_value);
+	uint lsb = subgroupBallotFindLSB(ballot_value);
+	uint msb = subgroupBallotFindMSB(ballot_value);
+
+	// shuffle
+	uint shuffled = subgroupShuffle(10u, 8u);
+	bool shuffled_bool = subgroupShuffle(true, 9u);
+	uint shuffled_xor = subgroupShuffleXor(30u, 8u);
+	bool shuffled_xor_bool = subgroupShuffleXor(false, 9u);
+
+	// shuffle relative
+	uint shuffled_up = subgroupShuffleUp(20u, 4u);
+	bool shuffled_up_bool = subgroupShuffleUp(true, 4u);
+	uint shuffled_down = subgroupShuffleDown(20u, 4u);
+	bool shuffled_down_bool = subgroupShuffleDown(false, 4u);
+
+	// rotate
+	uint rotated = subgroupRotate(20u, 4u);
+	bool rotated_bool = subgroupRotate(false, 4u);
+	doClusteredRotate(); // the clustered rotate variants live in their own function
+
+	// vote
+	bool has_all = subgroupAll(true);
+	bool has_any = subgroupAny(true);
+	bool has_equal = subgroupAllEqual(0); // subgroupAllEqual is called with int, bool, vec3 and bvec4 arguments in turn
+	has_equal = subgroupAllEqual(true);
+	has_equal = subgroupAllEqual(vec3(0.0, 1.0, 2.0));
+	has_equal = subgroupAllEqual(bvec4(true, true, false, true));
+
+	// arithmetic
+	vec4 added = subgroupAdd(vec4(20.0));
+	ivec4 iadded = subgroupAdd(ivec4(20));
+	vec4 multiplied = subgroupMul(vec4(20.0));
+	ivec4 imultiplied = subgroupMul(ivec4(20));
+	vec4 lo = subgroupMin(vec4(20.0));
+	vec4 hi = subgroupMax(vec4(20.0));
+	ivec4 slo = subgroupMin(ivec4(20));
+	ivec4 shi = subgroupMax(ivec4(20));
+	uvec4 ulo = subgroupMin(uvec4(20));
+	uvec4 uhi = subgroupMax(uvec4(20));
+	uvec4 anded = subgroupAnd(ballot_value);
+	uvec4 ored = subgroupOr(ballot_value);
+	uvec4 xored = subgroupXor(ballot_value);
+	bvec4 anded_b = subgroupAnd(equal(ballot_value, uvec4(42)));
+	bvec4 ored_b = subgroupOr(equal(ballot_value, uvec4(42)));
+	bvec4 xored_b = subgroupXor(equal(ballot_value, uvec4(42)));
+
+	added = subgroupInclusiveAdd(added); // only the Add/Mul scans are enabled; the rest are commented out below
+	iadded = subgroupInclusiveAdd(iadded);
+	multiplied = subgroupInclusiveMul(multiplied);
+	imultiplied = subgroupInclusiveMul(imultiplied);
+	//lo = subgroupInclusiveMin(lo);  // FIXME: Unsupported by Metal (NOTE(review): wording presumably carried over from the MSL tests; confirm whether OpenCL needs these disabled)
+	//hi = subgroupInclusiveMax(hi);
+	//slo = subgroupInclusiveMin(slo);
+	//shi = subgroupInclusiveMax(shi);
+	//ulo = subgroupInclusiveMin(ulo);
+	//uhi = subgroupInclusiveMax(uhi);
+	//anded = subgroupInclusiveAnd(anded);
+	//ored = subgroupInclusiveOr(ored);
+	//xored = subgroupInclusiveXor(ored);  // NOTE(review): argument is ored, not xored — confirm before re-enabling
+	//added = subgroupExclusiveAdd(lo);
+
+	added = subgroupExclusiveAdd(multiplied);
+	multiplied = subgroupExclusiveMul(multiplied);
+	iadded = subgroupExclusiveAdd(imultiplied);
+	imultiplied = subgroupExclusiveMul(imultiplied);
+	//lo = subgroupExclusiveMin(lo);  // FIXME: Unsupported by Metal (NOTE(review): same question as for the inclusive scans)
+	//hi = subgroupExclusiveMax(hi);
+	//ulo = subgroupExclusiveMin(ulo);
+	//uhi = subgroupExclusiveMax(uhi);
+	//slo = subgroupExclusiveMin(slo);
+	//shi = subgroupExclusiveMax(shi);
+	//anded = subgroupExclusiveAnd(anded);
+	//ored = subgroupExclusiveOr(ored);
+	//xored = subgroupExclusiveXor(ored);  // NOTE(review): argument is ored, not xored — confirm before re-enabling
+
+	// clustered (the same sequence is repeated for cluster sizes 1, 2, 4 and 16)
+	added = subgroupClusteredAdd(added, 1u);
+	multiplied = subgroupClusteredMul(multiplied, 1u);
+	iadded = subgroupClusteredAdd(iadded, 1u);
+	imultiplied = subgroupClusteredMul(imultiplied, 1u);
+	lo = subgroupClusteredMin(lo, 1u);
+	hi = subgroupClusteredMax(hi, 1u);
+	ulo = subgroupClusteredMin(ulo, 1u);
+	uhi = subgroupClusteredMax(uhi, 1u);
+	slo = subgroupClusteredMin(slo, 1u);
+	shi = subgroupClusteredMax(shi, 1u);
+	anded = subgroupClusteredAnd(anded, 1u);
+	ored = subgroupClusteredOr(ored, 1u);
+	xored = subgroupClusteredXor(xored, 1u);
+
+	anded_b = subgroupClusteredAnd(equal(anded, uvec4(2u)), 1u);
+	ored_b = subgroupClusteredOr(equal(ored, uvec4(3u)), 1u);
+	xored_b = subgroupClusteredXor(equal(xored, uvec4(4u)), 1u);
+
+	added = subgroupClusteredAdd(added, 2u);
+	multiplied = subgroupClusteredMul(multiplied, 2u);
+	iadded = subgroupClusteredAdd(iadded, 2u);
+	imultiplied = subgroupClusteredMul(imultiplied, 2u);
+	lo = subgroupClusteredMin(lo, 2u);
+	hi = subgroupClusteredMax(hi, 2u);
+	ulo = subgroupClusteredMin(ulo, 2u);
+	uhi = subgroupClusteredMax(uhi, 2u);
+	slo = subgroupClusteredMin(slo, 2u);
+	shi = subgroupClusteredMax(shi, 2u);
+	anded = subgroupClusteredAnd(anded, 2u);
+	ored = subgroupClusteredOr(ored, 2u);
+	xored = subgroupClusteredXor(xored, 2u);
+
+	anded_b = subgroupClusteredAnd(equal(anded, uvec4(2u)), 2u);
+	ored_b = subgroupClusteredOr(equal(ored, uvec4(3u)), 2u);
+	xored_b = subgroupClusteredXor(equal(xored, uvec4(4u)), 2u);
+
+	added = subgroupClusteredAdd(added, 4u);
+	multiplied = subgroupClusteredMul(multiplied, 4u);
+	iadded = subgroupClusteredAdd(iadded, 4u);
+	imultiplied = subgroupClusteredMul(imultiplied, 4u);
+	lo = subgroupClusteredMin(lo, 4u);
+	hi = subgroupClusteredMax(hi, 4u);
+	ulo = subgroupClusteredMin(ulo, 4u);
+	uhi = subgroupClusteredMax(uhi, 4u);
+	slo = subgroupClusteredMin(slo, 4u);
+	shi = subgroupClusteredMax(shi, 4u);
+	anded = subgroupClusteredAnd(anded, 4u);
+	ored = subgroupClusteredOr(ored, 4u);
+	xored = subgroupClusteredXor(xored, 4u);
+
+	anded_b = subgroupClusteredAnd(equal(anded, uvec4(2u)), 4u);
+	ored_b = subgroupClusteredOr(equal(ored, uvec4(3u)), 4u);
+	xored_b = subgroupClusteredXor(equal(xored, uvec4(4u)), 4u);
+
+	added = subgroupClusteredAdd(added, 16u);
+	multiplied = subgroupClusteredMul(multiplied, 16u);
+	iadded = subgroupClusteredAdd(iadded, 16u);
+	imultiplied = subgroupClusteredMul(imultiplied, 16u);
+	lo = subgroupClusteredMin(lo, 16u);
+	hi = subgroupClusteredMax(hi, 16u);
+	ulo = subgroupClusteredMin(ulo, 16u);
+	uhi = subgroupClusteredMax(uhi, 16u);
+	slo = subgroupClusteredMin(slo, 16u);
+	shi = subgroupClusteredMax(shi, 16u);
+	anded = subgroupClusteredAnd(anded, 16u);
+	ored = subgroupClusteredOr(ored, 16u);
+	xored = subgroupClusteredXor(xored, 16u);
+
+	anded_b = subgroupClusteredAnd(equal(anded, uvec4(2u)), 16u);
+	ored_b = subgroupClusteredOr(equal(ored, uvec4(3u)), 16u);
+	xored_b = subgroupClusteredXor(equal(xored, uvec4(4u)), 16u);
+
+	// quad
+	vec4 swap_horiz = subgroupQuadSwapHorizontal(vec4(20.0));
+	bvec4 swap_horiz_bool = subgroupQuadSwapHorizontal(bvec4(true));
+	vec4 swap_vertical = subgroupQuadSwapVertical(vec4(20.0));
+	bvec4 swap_vertical_bool = subgroupQuadSwapVertical(bvec4(true));
+	vec4 swap_diagonal = subgroupQuadSwapDiagonal(vec4(20.0));
+	bvec4 swap_diagonal_bool = subgroupQuadSwapDiagonal(bvec4(true));
+	vec4 quad_broadcast = subgroupQuadBroadcast(vec4(20.0), 3u);
+	bvec4 quad_broadcast_bool = subgroupQuadBroadcast(bvec4(true), 3u);
+}
diff --git a/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.subgroup.invalid.comp b/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.subgroup.invalid.comp
new file mode 100644
index 000000000..c8172fd95
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.subgroup.invalid.comp
@@ -0,0 +1,211 @@
+#version 450
+#extension GL_KHR_shader_subgroup_basic : require
+#extension GL_KHR_shader_subgroup_ballot : require
+#extension GL_KHR_shader_subgroup_vote : require
+#extension GL_KHR_shader_subgroup_shuffle : require
+#extension GL_KHR_shader_subgroup_shuffle_relative : require
+#extension GL_KHR_shader_subgroup_arithmetic : require
+#extension GL_KHR_shader_subgroup_clustered : require
+#extension GL_KHR_shader_subgroup_quad : require
+#extension GL_KHR_shader_subgroup_rotate : require
+layout(local_size_x = 1) in;
+
+layout(std430, binding = 0) buffer SSBO // Output: main() repeatedly overwrites `FragColor`.
+{
+	float FragColor;
+};
+
+void doClusteredRotate() // Calls subgroupClusteredRotate once with a uint and once with a bool; invoked once from main().
+{
+	uint rotated_clustered = subgroupClusteredRotate(20u, 4u, 8u); // presumably (value, delta, clusterSize) — confirm against GL_KHR_shader_subgroup_rotate
+	bool rotated_clustered_bool = subgroupClusteredRotate(false, 4u, 8u); // both results are unused locals
+}
+
+void main() // Exercises the subgroup builtins family by family (see the section comments); most results only live in locals.
+{
+	// basic
+	FragColor = float(gl_NumSubgroups); // each FragColor store in this function is overwritten by the next one
+	FragColor = float(gl_SubgroupID);
+	FragColor = float(gl_SubgroupSize);
+	FragColor = float(gl_SubgroupInvocationID);
+	subgroupBarrier();
+	subgroupMemoryBarrier();
+	subgroupMemoryBarrierBuffer();
+	subgroupMemoryBarrierShared();
+	subgroupMemoryBarrierImage();
+	bool elected = subgroupElect();
+
+	// ballot
+	FragColor = float(gl_SubgroupEqMask);
+	FragColor = float(gl_SubgroupGeMask);
+	FragColor = float(gl_SubgroupGtMask);
+	FragColor = float(gl_SubgroupLeMask);
+	FragColor = float(gl_SubgroupLtMask);
+	vec4 broadcasted = subgroupBroadcast(vec4(10.0), 8u);
+	bvec2 broadcasted_bool = subgroupBroadcast(bvec2(true), 8u);
+	vec3 first = subgroupBroadcastFirst(vec3(20.0));
+	bvec4 first_bool = subgroupBroadcastFirst(bvec4(false));
+	uvec4 ballot_value = subgroupBallot(true); // reused as the operand of the ballot queries and bitwise reductions below
+	bool inverse_ballot_value = subgroupInverseBallot(ballot_value);
+	bool bit_extracted = subgroupBallotBitExtract(uvec4(10u), 8u);
+	uint bit_count = subgroupBallotBitCount(ballot_value);
+	uint inclusive_bit_count = subgroupBallotInclusiveBitCount(ballot_value);
+	uint exclusive_bit_count = subgroupBallotExclusiveBitCount(ballot_value);
+	uint lsb = subgroupBallotFindLSB(ballot_value);
+	uint msb = subgroupBallotFindMSB(ballot_value);
+
+	// shuffle
+	uint shuffled = subgroupShuffle(10u, 8u);
+	bool shuffled_bool = subgroupShuffle(true, 9u);
+	uint shuffled_xor = subgroupShuffleXor(30u, 8u);
+	bool shuffled_xor_bool = subgroupShuffleXor(false, 9u);
+
+	// shuffle relative
+	uint shuffled_up = subgroupShuffleUp(20u, 4u);
+	bool shuffled_up_bool = subgroupShuffleUp(true, 4u);
+	uint shuffled_down = subgroupShuffleDown(20u, 4u);
+	bool shuffled_down_bool = subgroupShuffleDown(false, 4u);
+
+	// rotate
+	uint rotated = subgroupRotate(20u, 4u);
+	bool rotated_bool = subgroupRotate(false, 4u);
+	doClusteredRotate(); // the clustered rotate variants live in their own function
+
+	// vote
+	bool has_all = subgroupAll(true);
+	bool has_any = subgroupAny(true);
+	bool has_equal = subgroupAllEqual(0); // subgroupAllEqual is called with int, bool, vec3 and bvec4 arguments in turn
+	has_equal = subgroupAllEqual(true);
+	has_equal = subgroupAllEqual(vec3(0.0, 1.0, 2.0));
+	has_equal = subgroupAllEqual(bvec4(true, true, false, true));
+
+	// arithmetic
+	vec4 added = subgroupAdd(vec4(20.0));
+	ivec4 iadded = subgroupAdd(ivec4(20));
+	vec4 multiplied = subgroupMul(vec4(20.0));
+	ivec4 imultiplied = subgroupMul(ivec4(20));
+	vec4 lo = subgroupMin(vec4(20.0));
+	vec4 hi = subgroupMax(vec4(20.0));
+	ivec4 slo = subgroupMin(ivec4(20));
+	ivec4 shi = subgroupMax(ivec4(20));
+	uvec4 ulo = subgroupMin(uvec4(20));
+	uvec4 uhi = subgroupMax(uvec4(20));
+	uvec4 anded = subgroupAnd(ballot_value);
+	uvec4 ored = subgroupOr(ballot_value);
+	uvec4 xored = subgroupXor(ballot_value);
+	bvec4 anded_b = subgroupAnd(equal(ballot_value, uvec4(42)));
+	bvec4 ored_b = subgroupOr(equal(ballot_value, uvec4(42)));
+	bvec4 xored_b = subgroupXor(equal(ballot_value, uvec4(42)));
+
+	added = subgroupInclusiveAdd(added); // only the Add/Mul scans are enabled; the rest are commented out below
+	iadded = subgroupInclusiveAdd(iadded);
+	multiplied = subgroupInclusiveMul(multiplied);
+	imultiplied = subgroupInclusiveMul(imultiplied);
+	//lo = subgroupInclusiveMin(lo);  // FIXME: Unsupported by Metal (NOTE(review): wording presumably carried over from the MSL tests; confirm whether OpenCL needs these disabled)
+	//hi = subgroupInclusiveMax(hi);
+	//slo = subgroupInclusiveMin(slo);
+	//shi = subgroupInclusiveMax(shi);
+	//ulo = subgroupInclusiveMin(ulo);
+	//uhi = subgroupInclusiveMax(uhi);
+	//anded = subgroupInclusiveAnd(anded);
+	//ored = subgroupInclusiveOr(ored);
+	//xored = subgroupInclusiveXor(ored);  // NOTE(review): argument is ored, not xored — confirm before re-enabling
+	//added = subgroupExclusiveAdd(lo);
+
+	added = subgroupExclusiveAdd(multiplied);
+	multiplied = subgroupExclusiveMul(multiplied);
+	iadded = subgroupExclusiveAdd(imultiplied);
+	imultiplied = subgroupExclusiveMul(imultiplied);
+	//lo = subgroupExclusiveMin(lo);  // FIXME: Unsupported by Metal (NOTE(review): same question as for the inclusive scans)
+	//hi = subgroupExclusiveMax(hi);
+	//ulo = subgroupExclusiveMin(ulo);
+	//uhi = subgroupExclusiveMax(uhi);
+	//slo = subgroupExclusiveMin(slo);
+	//shi = subgroupExclusiveMax(shi);
+	//anded = subgroupExclusiveAnd(anded);
+	//ored = subgroupExclusiveOr(ored);
+	//xored = subgroupExclusiveXor(ored);  // NOTE(review): argument is ored, not xored — confirm before re-enabling
+
+	// clustered (the same sequence is repeated for cluster sizes 1, 2, 4 and 16)
+	added = subgroupClusteredAdd(added, 1u);
+	multiplied = subgroupClusteredMul(multiplied, 1u);
+	iadded = subgroupClusteredAdd(iadded, 1u);
+	imultiplied = subgroupClusteredMul(imultiplied, 1u);
+	lo = subgroupClusteredMin(lo, 1u);
+	hi = subgroupClusteredMax(hi, 1u);
+	ulo = subgroupClusteredMin(ulo, 1u);
+	uhi = subgroupClusteredMax(uhi, 1u);
+	slo = subgroupClusteredMin(slo, 1u);
+	shi = subgroupClusteredMax(shi, 1u);
+	anded = subgroupClusteredAnd(anded, 1u);
+	ored = subgroupClusteredOr(ored, 1u);
+	xored = subgroupClusteredXor(xored, 1u);
+
+	anded_b = subgroupClusteredAnd(equal(anded, uvec4(2u)), 1u);
+	ored_b = subgroupClusteredOr(equal(ored, uvec4(3u)), 1u);
+	xored_b = subgroupClusteredXor(equal(xored, uvec4(4u)), 1u);
+
+	added = subgroupClusteredAdd(added, 2u);
+	multiplied = subgroupClusteredMul(multiplied, 2u);
+	iadded = subgroupClusteredAdd(iadded, 2u);
+	imultiplied = subgroupClusteredMul(imultiplied, 2u);
+	lo = subgroupClusteredMin(lo, 2u);
+	hi = subgroupClusteredMax(hi, 2u);
+	ulo = subgroupClusteredMin(ulo, 2u);
+	uhi = subgroupClusteredMax(uhi, 2u);
+	slo = subgroupClusteredMin(slo, 2u);
+	shi = subgroupClusteredMax(shi, 2u);
+	anded = subgroupClusteredAnd(anded, 2u);
+	ored = subgroupClusteredOr(ored, 2u);
+	xored = subgroupClusteredXor(xored, 2u);
+
+	anded_b = subgroupClusteredAnd(equal(anded, uvec4(2u)), 2u);
+	ored_b = subgroupClusteredOr(equal(ored, uvec4(3u)), 2u);
+	xored_b = subgroupClusteredXor(equal(xored, uvec4(4u)), 2u);
+
+	added = subgroupClusteredAdd(added, 4u);
+	multiplied = subgroupClusteredMul(multiplied, 4u);
+	iadded = subgroupClusteredAdd(iadded, 4u);
+	imultiplied = subgroupClusteredMul(imultiplied, 4u);
+	lo = subgroupClusteredMin(lo, 4u);
+	hi = subgroupClusteredMax(hi, 4u);
+	ulo = subgroupClusteredMin(ulo, 4u);
+	uhi = subgroupClusteredMax(uhi, 4u);
+	slo = subgroupClusteredMin(slo, 4u);
+	shi = subgroupClusteredMax(shi, 4u);
+	anded = subgroupClusteredAnd(anded, 4u);
+	ored = subgroupClusteredOr(ored, 4u);
+	xored = subgroupClusteredXor(xored, 4u);
+
+	anded_b = subgroupClusteredAnd(equal(anded, uvec4(2u)), 4u);
+	ored_b = subgroupClusteredOr(equal(ored, uvec4(3u)), 4u);
+	xored_b = subgroupClusteredXor(equal(xored, uvec4(4u)), 4u);
+
+	added = subgroupClusteredAdd(added, 16u);
+	multiplied = subgroupClusteredMul(multiplied, 16u);
+	iadded = subgroupClusteredAdd(iadded, 16u);
+	imultiplied = subgroupClusteredMul(imultiplied, 16u);
+	lo = subgroupClusteredMin(lo, 16u);
+	hi = subgroupClusteredMax(hi, 16u);
+	ulo = subgroupClusteredMin(ulo, 16u);
+	uhi = subgroupClusteredMax(uhi, 16u);
+	slo = subgroupClusteredMin(slo, 16u);
+	shi = subgroupClusteredMax(shi, 16u);
+	anded = subgroupClusteredAnd(anded, 16u);
+	ored = subgroupClusteredOr(ored, 16u);
+	xored = subgroupClusteredXor(xored, 16u);
+
+	anded_b = subgroupClusteredAnd(equal(anded, uvec4(2u)), 16u);
+	ored_b = subgroupClusteredOr(equal(ored, uvec4(3u)), 16u);
+	xored_b = subgroupClusteredXor(equal(xored, uvec4(4u)), 16u);
+
+	// quad
+	vec4 swap_horiz = subgroupQuadSwapHorizontal(vec4(20.0));
+	bvec4 swap_horiz_bool = subgroupQuadSwapHorizontal(bvec4(true));
+	vec4 swap_vertical = subgroupQuadSwapVertical(vec4(20.0));
+	bvec4 swap_vertical_bool = subgroupQuadSwapVertical(bvec4(true));
+	vec4 swap_diagonal = subgroupQuadSwapDiagonal(vec4(20.0));
+	bvec4 swap_diagonal_bool = subgroupQuadSwapDiagonal(bvec4(true));
+	vec4 quad_broadcast = subgroupQuadBroadcast(vec4(20.0), 3u);
+	bvec4 quad_broadcast_bool = subgroupQuadBroadcast(bvec4(true), 3u);
+}
diff --git a/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.subgroup.swizzle.invalid.comp b/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.subgroup.swizzle.invalid.comp
new file mode 100644
index 000000000..c8172fd95
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.subgroup.swizzle.invalid.comp
@@ -0,0 +1,211 @@
+#version 450
+#extension GL_KHR_shader_subgroup_basic : require
+#extension GL_KHR_shader_subgroup_ballot : require
+#extension GL_KHR_shader_subgroup_vote : require
+#extension GL_KHR_shader_subgroup_shuffle : require
+#extension GL_KHR_shader_subgroup_shuffle_relative : require
+#extension GL_KHR_shader_subgroup_arithmetic : require
+#extension GL_KHR_shader_subgroup_clustered : require
+#extension GL_KHR_shader_subgroup_quad : require
+#extension GL_KHR_shader_subgroup_rotate : require
+layout(local_size_x = 1) in;
+
+layout(std430, binding = 0) buffer SSBO
+{
+	float FragColor;
+};
+
+void doClusteredRotate()
+{
+	uint rotated_clustered = subgroupClusteredRotate(20u, 4u, 8u);
+	bool rotated_clustered_bool = subgroupClusteredRotate(false, 4u, 8u);
+}
+
+void main()
+{
+	// basic
+	FragColor = float(gl_NumSubgroups);
+	FragColor = float(gl_SubgroupID);
+	FragColor = float(gl_SubgroupSize);
+	FragColor = float(gl_SubgroupInvocationID);
+	subgroupBarrier();
+	subgroupMemoryBarrier();
+	subgroupMemoryBarrierBuffer();
+	subgroupMemoryBarrierShared();
+	subgroupMemoryBarrierImage();
+	bool elected = subgroupElect();
+
+	// ballot
+	FragColor = float(gl_SubgroupEqMask);
+	FragColor = float(gl_SubgroupGeMask);
+	FragColor = float(gl_SubgroupGtMask);
+	FragColor = float(gl_SubgroupLeMask);
+	FragColor = float(gl_SubgroupLtMask);
+	vec4 broadcasted = subgroupBroadcast(vec4(10.0), 8u);
+	bvec2 broadcasted_bool = subgroupBroadcast(bvec2(true), 8u);
+	vec3 first = subgroupBroadcastFirst(vec3(20.0));
+	bvec4 first_bool = subgroupBroadcastFirst(bvec4(false));
+	uvec4 ballot_value = subgroupBallot(true);
+	bool inverse_ballot_value = subgroupInverseBallot(ballot_value);
+	bool bit_extracted = subgroupBallotBitExtract(uvec4(10u), 8u);
+	uint bit_count = subgroupBallotBitCount(ballot_value);
+	uint inclusive_bit_count = subgroupBallotInclusiveBitCount(ballot_value);
+	uint exclusive_bit_count = subgroupBallotExclusiveBitCount(ballot_value);
+	uint lsb = subgroupBallotFindLSB(ballot_value);
+	uint msb = subgroupBallotFindMSB(ballot_value);
+
+	// shuffle
+	uint shuffled = subgroupShuffle(10u, 8u);
+	bool shuffled_bool = subgroupShuffle(true, 9u);
+	uint shuffled_xor = subgroupShuffleXor(30u, 8u);
+	bool shuffled_xor_bool = subgroupShuffleXor(false, 9u);
+
+	// shuffle relative 
+	uint shuffled_up = subgroupShuffleUp(20u, 4u);
+	bool shuffled_up_bool = subgroupShuffleUp(true, 4u);
+	uint shuffled_down = subgroupShuffleDown(20u, 4u);
+	bool shuffled_down_bool = subgroupShuffleDown(false, 4u);
+
+	// rotate
+	uint rotated = subgroupRotate(20u, 4u);
+	bool rotated_bool = subgroupRotate(false, 4u);
+	doClusteredRotate();
+
+	// vote
+	bool has_all = subgroupAll(true);
+	bool has_any = subgroupAny(true);
+	bool has_equal = subgroupAllEqual(0);
+	has_equal = subgroupAllEqual(true);
+	has_equal = subgroupAllEqual(vec3(0.0, 1.0, 2.0));
+	has_equal = subgroupAllEqual(bvec4(true, true, false, true));
+
+	// arithmetic
+	vec4 added = subgroupAdd(vec4(20.0));
+	ivec4 iadded = subgroupAdd(ivec4(20));
+	vec4 multiplied = subgroupMul(vec4(20.0));
+	ivec4 imultiplied = subgroupMul(ivec4(20));
+	vec4 lo = subgroupMin(vec4(20.0));
+	vec4 hi = subgroupMax(vec4(20.0));
+	ivec4 slo = subgroupMin(ivec4(20));
+	ivec4 shi = subgroupMax(ivec4(20));
+	uvec4 ulo = subgroupMin(uvec4(20));
+	uvec4 uhi = subgroupMax(uvec4(20));
+	uvec4 anded = subgroupAnd(ballot_value);
+	uvec4 ored = subgroupOr(ballot_value);
+	uvec4 xored = subgroupXor(ballot_value);
+	bvec4 anded_b = subgroupAnd(equal(ballot_value, uvec4(42)));
+	bvec4 ored_b = subgroupOr(equal(ballot_value, uvec4(42)));
+	bvec4 xored_b = subgroupXor(equal(ballot_value, uvec4(42)));
+
+	added = subgroupInclusiveAdd(added);
+	iadded = subgroupInclusiveAdd(iadded);
+	multiplied = subgroupInclusiveMul(multiplied);
+	imultiplied = subgroupInclusiveMul(imultiplied);
+	//lo = subgroupInclusiveMin(lo);  // FIXME: Unsupported by Metal
+	//hi = subgroupInclusiveMax(hi);
+	//slo = subgroupInclusiveMin(slo);
+	//shi = subgroupInclusiveMax(shi);
+	//ulo = subgroupInclusiveMin(ulo);
+	//uhi = subgroupInclusiveMax(uhi);
+	//anded = subgroupInclusiveAnd(anded);
+	//ored = subgroupInclusiveOr(ored);
+	//xored = subgroupInclusiveXor(ored);
+	//added = subgroupExclusiveAdd(lo);
+
+	added = subgroupExclusiveAdd(multiplied);
+	multiplied = subgroupExclusiveMul(multiplied);
+	iadded = subgroupExclusiveAdd(imultiplied);
+	imultiplied = subgroupExclusiveMul(imultiplied);
+	//lo = subgroupExclusiveMin(lo);  // FIXME: Unsupported by Metal
+	//hi = subgroupExclusiveMax(hi);
+	//ulo = subgroupExclusiveMin(ulo);
+	//uhi = subgroupExclusiveMax(uhi);
+	//slo = subgroupExclusiveMin(slo);
+	//shi = subgroupExclusiveMax(shi);
+	//anded = subgroupExclusiveAnd(anded);
+	//ored = subgroupExclusiveOr(ored);
+	//xored = subgroupExclusiveXor(ored);
+
+	// clustered
+	added = subgroupClusteredAdd(added, 1u);
+	multiplied = subgroupClusteredMul(multiplied, 1u);
+	iadded = subgroupClusteredAdd(iadded, 1u);
+	imultiplied = subgroupClusteredMul(imultiplied, 1u);
+	lo = subgroupClusteredMin(lo, 1u);
+	hi = subgroupClusteredMax(hi, 1u);
+	ulo = subgroupClusteredMin(ulo, 1u);
+	uhi = subgroupClusteredMax(uhi, 1u);
+	slo = subgroupClusteredMin(slo, 1u);
+	shi = subgroupClusteredMax(shi, 1u);
+	anded = subgroupClusteredAnd(anded, 1u);
+	ored = subgroupClusteredOr(ored, 1u);
+	xored = subgroupClusteredXor(xored, 1u);
+
+	anded_b = subgroupClusteredAnd(equal(anded, uvec4(2u)), 1u);
+	ored_b = subgroupClusteredOr(equal(ored, uvec4(3u)), 1u);
+	xored_b = subgroupClusteredXor(equal(xored, uvec4(4u)), 1u);
+
+	added = subgroupClusteredAdd(added, 2u);
+	multiplied = subgroupClusteredMul(multiplied, 2u);
+	iadded = subgroupClusteredAdd(iadded, 2u);
+	imultiplied = subgroupClusteredMul(imultiplied, 2u);
+	lo = subgroupClusteredMin(lo, 2u);
+	hi = subgroupClusteredMax(hi, 2u);
+	ulo = subgroupClusteredMin(ulo, 2u);
+	uhi = subgroupClusteredMax(uhi, 2u);
+	slo = subgroupClusteredMin(slo, 2u);
+	shi = subgroupClusteredMax(shi, 2u);
+	anded = subgroupClusteredAnd(anded, 2u);
+	ored = subgroupClusteredOr(ored, 2u);
+	xored = subgroupClusteredXor(xored, 2u);
+
+	anded_b = subgroupClusteredAnd(equal(anded, uvec4(2u)), 2u);
+	ored_b = subgroupClusteredOr(equal(ored, uvec4(3u)), 2u);
+	xored_b = subgroupClusteredXor(equal(xored, uvec4(4u)), 2u);
+
+	added = subgroupClusteredAdd(added, 4u);
+	multiplied = subgroupClusteredMul(multiplied, 4u);
+	iadded = subgroupClusteredAdd(iadded, 4u);
+	imultiplied = subgroupClusteredMul(imultiplied, 4u);
+	lo = subgroupClusteredMin(lo, 4u);
+	hi = subgroupClusteredMax(hi, 4u);
+	ulo = subgroupClusteredMin(ulo, 4u);
+	uhi = subgroupClusteredMax(uhi, 4u);
+	slo = subgroupClusteredMin(slo, 4u);
+	shi = subgroupClusteredMax(shi, 4u);
+	anded = subgroupClusteredAnd(anded, 4u);
+	ored = subgroupClusteredOr(ored, 4u);
+	xored = subgroupClusteredXor(xored, 4u);
+
+	anded_b = subgroupClusteredAnd(equal(anded, uvec4(2u)), 4u);
+	ored_b = subgroupClusteredOr(equal(ored, uvec4(3u)), 4u);
+	xored_b = subgroupClusteredXor(equal(xored, uvec4(4u)), 4u);
+
+	added = subgroupClusteredAdd(added, 16u);
+	multiplied = subgroupClusteredMul(multiplied, 16u);
+	iadded = subgroupClusteredAdd(iadded, 16u);
+	imultiplied = subgroupClusteredMul(imultiplied, 16u);
+	lo = subgroupClusteredMin(lo, 16u);
+	hi = subgroupClusteredMax(hi, 16u);
+	ulo = subgroupClusteredMin(ulo, 16u);
+	uhi = subgroupClusteredMax(uhi, 16u);
+	slo = subgroupClusteredMin(slo, 16u);
+	shi = subgroupClusteredMax(shi, 16u);
+	anded = subgroupClusteredAnd(anded, 16u);
+	ored = subgroupClusteredOr(ored, 16u);
+	xored = subgroupClusteredXor(xored, 16u);
+
+	anded_b = subgroupClusteredAnd(equal(anded, uvec4(2u)), 16u);
+	ored_b = subgroupClusteredOr(equal(ored, uvec4(3u)), 16u);
+	xored_b = subgroupClusteredXor(equal(xored, uvec4(4u)), 16u);
+
+	// quad
+	vec4 swap_horiz = subgroupQuadSwapHorizontal(vec4(20.0));
+	bvec4 swap_horiz_bool = subgroupQuadSwapHorizontal(bvec4(true));
+	vec4 swap_vertical = subgroupQuadSwapVertical(vec4(20.0));
+	bvec4 swap_vertical_bool = subgroupQuadSwapVertical(bvec4(true));
+	vec4 swap_diagonal = subgroupQuadSwapDiagonal(vec4(20.0));
+	bvec4 swap_diagonal_bool = subgroupQuadSwapDiagonal(bvec4(true));
+	vec4 quad_broadcast = subgroupQuadBroadcast(vec4(20.0), 3u);
+	bvec4 quad_broadcast_bool = subgroupQuadBroadcast(bvec4(true), 3u);
+}
diff --git a/shaders-opencl-no-opt/comp/trancendental-float-controls-1-fp16.fp16.comp b/shaders-opencl-no-opt/comp/trancendental-float-controls-1-fp16.fp16.comp
new file mode 100644
index 000000000..4f9e82f37
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/trancendental-float-controls-1-fp16.fp16.comp
@@ -0,0 +1,35 @@
+#version 450
+#extension GL_EXT_spirv_intrinsics : require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
+layout(local_size_x = 4) in;
+
+layout(set = 0, binding = 0) buffer SSBO
+{
+	float v[4];
+	float16_t f16[4];
+};
+
+// SignedZeroInfNanPreserve 16
+spirv_execution_mode(capabilities = [4466], extensions = ["SPV_KHR_float_controls"], 4461, 16);
+
+void main ()
+{
+	v[gl_LocalInvocationIndex] = cos(v[gl_LocalInvocationIndex]);
+	v[gl_LocalInvocationIndex] += sin(v[gl_LocalInvocationIndex]);
+	v[gl_LocalInvocationIndex] += tan(v[gl_LocalInvocationIndex]);
+	v[gl_LocalInvocationIndex] += acos(v[gl_LocalInvocationIndex]);
+	v[gl_LocalInvocationIndex] += asin(v[gl_LocalInvocationIndex]);
+	v[gl_LocalInvocationIndex] += atan(v[gl_LocalInvocationIndex]);
+	v[gl_LocalInvocationIndex] += exp(v[gl_LocalInvocationIndex]);
+	v[gl_LocalInvocationIndex] += exp2(v[gl_LocalInvocationIndex]);
+	v[gl_LocalInvocationIndex] += log(v[gl_LocalInvocationIndex]);
+	v[gl_LocalInvocationIndex] += log2(v[gl_LocalInvocationIndex]);
+	v[gl_LocalInvocationIndex] += sqrt(v[gl_LocalInvocationIndex]);
+	v[gl_LocalInvocationIndex] += inversesqrt(v[gl_LocalInvocationIndex]);
+	v[gl_LocalInvocationIndex] += pow(v[gl_LocalInvocationIndex], 4.0);
+
+	f16[gl_LocalInvocationIndex] = cos(f16[gl_LocalInvocationIndex]);
+	f16[gl_LocalInvocationIndex] += sin(f16[gl_LocalInvocationIndex]);
+	f16[gl_LocalInvocationIndex] += cosh(f16[gl_LocalInvocationIndex]);
+	f16[gl_LocalInvocationIndex] += sinh(f16[gl_LocalInvocationIndex]);
+}
diff --git a/shaders-opencl-no-opt/comp/trancendental-float-controls-1-fp32.fp16.comp b/shaders-opencl-no-opt/comp/trancendental-float-controls-1-fp32.fp16.comp
new file mode 100644
index 000000000..8995457c9
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/trancendental-float-controls-1-fp32.fp16.comp
@@ -0,0 +1,35 @@
+#version 450
+#extension GL_EXT_spirv_intrinsics : require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
+layout(local_size_x = 4) in;
+
+layout(set = 0, binding = 0) buffer SSBO
+{
+	float v[4];
+	float16_t f16[4];
+};
+
+// SignedZeroInfNanPreserve 32
+spirv_execution_mode(capabilities = [4466], extensions = ["SPV_KHR_float_controls"], 4461, 32);
+
+void main ()
+{
+	v[gl_LocalInvocationIndex] = cos(v[gl_LocalInvocationIndex]);
+	v[gl_LocalInvocationIndex] += sin(v[gl_LocalInvocationIndex]);
+	v[gl_LocalInvocationIndex] += tan(v[gl_LocalInvocationIndex]);
+	v[gl_LocalInvocationIndex] += acos(v[gl_LocalInvocationIndex]);
+	v[gl_LocalInvocationIndex] += asin(v[gl_LocalInvocationIndex]);
+	v[gl_LocalInvocationIndex] += atan(v[gl_LocalInvocationIndex]);
+	v[gl_LocalInvocationIndex] += exp(v[gl_LocalInvocationIndex]);
+	v[gl_LocalInvocationIndex] += exp2(v[gl_LocalInvocationIndex]);
+	v[gl_LocalInvocationIndex] += log(v[gl_LocalInvocationIndex]);
+	v[gl_LocalInvocationIndex] += log2(v[gl_LocalInvocationIndex]);
+	v[gl_LocalInvocationIndex] += sqrt(v[gl_LocalInvocationIndex]);
+	v[gl_LocalInvocationIndex] += inversesqrt(v[gl_LocalInvocationIndex]);
+	v[gl_LocalInvocationIndex] += pow(v[gl_LocalInvocationIndex], 4.0);
+
+	f16[gl_LocalInvocationIndex] = cos(f16[gl_LocalInvocationIndex]);
+	f16[gl_LocalInvocationIndex] += sin(f16[gl_LocalInvocationIndex]);
+	f16[gl_LocalInvocationIndex] += cosh(f16[gl_LocalInvocationIndex]);
+	f16[gl_LocalInvocationIndex] += sinh(f16[gl_LocalInvocationIndex]);
+}
diff --git a/shaders-opencl-no-opt/comp/transposed-temporary-expression-2.comp b/shaders-opencl-no-opt/comp/transposed-temporary-expression-2.comp
new file mode 100644
index 000000000..3f526942c
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/transposed-temporary-expression-2.comp
@@ -0,0 +1,24 @@
+#version 450
+layout(local_size_x = 64) in;
+
+layout(set = 0, binding = 0) buffer SSBO
+{
+	layout(row_major) mat4x3 A;
+	layout(row_major) mat4x3 B;
+	layout(row_major) mat4x3 C;
+	vec4 D;
+	float w0;
+	float w1;
+};
+
+void main()
+{
+	mat4x3 Anew;
+	mat4x3 Bnew;
+	do
+	{
+		Anew = A * w0;
+		Bnew = B * w1;
+	} while(false);
+	D = vec4((Anew + Bnew) * D, 1.0);
+}
diff --git a/shaders-opencl-no-opt/comp/transposed-temporary-expression.comp b/shaders-opencl-no-opt/comp/transposed-temporary-expression.comp
new file mode 100644
index 000000000..ab56bd039
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/transposed-temporary-expression.comp
@@ -0,0 +1,17 @@
+#version 450
+layout(local_size_x = 64) in;
+
+layout(set = 0, binding = 0) buffer SSBO
+{
+	layout(row_major) mat4x3 A;
+	layout(row_major) mat4x3 B;
+	layout(row_major) mat4x3 C;
+	vec4 D;
+	float w0;
+	float w1;
+};
+
+void main()
+{
+	D = vec4((A * w0 + B * w1) * D, 1.0);
+}
diff --git a/shaders-opencl-no-opt/comp/trivial-select-cast-vector.comp b/shaders-opencl-no-opt/comp/trivial-select-cast-vector.comp
new file mode 100644
index 000000000..c3e0922a1
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/trivial-select-cast-vector.comp
@@ -0,0 +1,14 @@
+#version 450
+layout(local_size_x = 1) in;
+
+layout(set = 0, binding = 0) buffer A
+{
+	vec3 a;
+	vec3 b;
+};
+
+void main()
+{
+	bvec3 c = lessThan(b, vec3(1.0));
+	a = mix(vec3(1, 0, 0), vec3(0, 0, 1), c);
+}
diff --git a/shaders-opencl-no-opt/comp/trivial-select-matrix.spv14.comp b/shaders-opencl-no-opt/comp/trivial-select-matrix.spv14.comp
new file mode 100644
index 000000000..5ffcc3f3a
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/trivial-select-matrix.spv14.comp
@@ -0,0 +1,16 @@
+#version 450
+layout(local_size_x = 1) in;
+
+layout(set = 0, binding = 0) buffer A
+{
+	mat3 a;
+	float b;
+};
+
+void main()
+{
+	// Scalar to Matrix
+	bool c = b < 1.0;
+	a = c ? mat3(vec3(1), vec3(1), vec3(1)) : mat3(vec3(0), vec3(0), vec3(0));
+	a = c ? mat3(1) : mat3(0);
+}
diff --git a/shaders-opencl-no-opt/comp/workgroup-size-spec-constant-array.invalid.comp b/shaders-opencl-no-opt/comp/workgroup-size-spec-constant-array.invalid.comp
new file mode 100644
index 000000000..d29e08005
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/workgroup-size-spec-constant-array.invalid.comp
@@ -0,0 +1,21 @@
+#version 450
+layout(local_size_x_id = 0, local_size_y_id = 2, local_size_z_id = 3) in;
+
+layout(constant_id = 1) const int A = 2;
+float D[A * gl_WorkGroupSize.x];
+float E[A * gl_WorkGroupSize.z];
+
+layout(set = 0, binding = 0) buffer SSBO
+{
+	int I;
+	float V;
+};
+
+void main ()
+{
+	for (int i = 0; i < A * gl_WorkGroupSize.x; i++)
+		D[i] = 1.0 + float(i + gl_WorkGroupSize.y);
+	for (int i = 0; i < A * gl_WorkGroupSize.z; i++)
+		D[i] = 1.0 + float(i + gl_WorkGroupSize.y);
+	V = D[I] + D[I ^ 1] + E[I];
+}
diff --git a/shaders-opencl-no-opt/comp/workgroup-size-spec-constant-array.spv16.invalid.comp b/shaders-opencl-no-opt/comp/workgroup-size-spec-constant-array.spv16.invalid.comp
new file mode 100644
index 000000000..d29e08005
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/workgroup-size-spec-constant-array.spv16.invalid.comp
@@ -0,0 +1,21 @@
+#version 450
+layout(local_size_x_id = 0, local_size_y_id = 2, local_size_z_id = 3) in;
+
+layout(constant_id = 1) const int A = 2;
+float D[A * gl_WorkGroupSize.x];
+float E[A * gl_WorkGroupSize.z];
+
+layout(set = 0, binding = 0) buffer SSBO
+{
+	int I;
+	float V;
+};
+
+void main ()
+{
+	for (int i = 0; i < A * gl_WorkGroupSize.x; i++)
+		D[i] = 1.0 + float(i + gl_WorkGroupSize.y);
+	for (int i = 0; i < A * gl_WorkGroupSize.z; i++)
+		D[i] = 1.0 + float(i + gl_WorkGroupSize.y);
+	V = D[I] + D[I ^ 1] + E[I];
+}
diff --git a/spirv_opencl.cpp b/spirv_opencl.cpp
index 3aaaa6150..b366ec9c6 100644
--- a/spirv_opencl.cpp
+++ b/spirv_opencl.cpp
@@ -1137,9 +1137,10 @@ const char *CompilerOpenCL::to_restrict(uint32_t id, bool space)
 	else
 		flags = get_decoration_bitset(id);
 
-	return flags.get(DecorationRestrict) || flags.get(DecorationRestrictPointerEXT) ?
-	           (space ? "__restrict " : "__restrict") :
-	           "";
+	// Only check DecorationRestrict here. DecorationRestrictPointerEXT is handled by
+	// flags_to_qualifiers_glsl in the GLSL base (emits "restrict " prefix), so we
+	// don't duplicate it as "__restrict" after the pointer star.
+	return flags.get(DecorationRestrict) ? (space ? "__restrict " : "__restrict") : "";
 }
 
 string CompilerOpenCL::type_to_glsl(const SPIRType &type, uint32_t id, bool member)
@@ -1822,6 +1823,26 @@ std::string CompilerOpenCL::constant_expression(const SPIRConstant &c, bool insi
 	return CompilerGLSL::constant_expression(c, inside_block_like_struct_scope, inside_struct_scope);
 }
 
+std::string CompilerOpenCL::to_initializer_expression(const SPIRVariable &var)
+{
+	// OpenCL C does not support initializing arrays from non-constant expressions
+	// (e.g., `float a[5] = ssbo->b;` is not valid C).
+	// For array variables with non-constant initializers, emit zero init `{ 0 }` and
+	// schedule element-by-element copy after the declaration.
+	auto &type = get_variable_data_type(var);
+	if (is_array(type) && var.initializer)
+	{
+		// Check if the initializer is a constant — those are fine as-is.
+		if (ir.ids[var.initializer].get_type() != TypeConstant)
+		{
+			// Queue the initializer for post-declaration element-by-element copy.
+			pending_array_copies.push_back({ var.self, var.initializer });
+			return "{ 0 }";
+		}
+	}
+	return CompilerGLSL::to_initializer_expression(var);
+}
+
 // OpenCL C requires cast syntax for vector construction: (float4)(1.0, 2.0, 3.0, 4.0)
 // The GLSL base emits: float4(1.0, 2.0, 3.0, 4.0) which is invalid in OpenCL C.
 std::string CompilerOpenCL::constant_expression_vector(const SPIRConstant &c, uint32_t vector)
@@ -1831,15 +1852,17 @@ std::string CompilerOpenCL::constant_expression_vector(const SPIRConstant &c, ui
 	auto type = get<SPIRType>(c.constant_type);
 	type.columns = 1;
 
-	if (type.vecsize > 1)
+	// The base class emits GLSL constructor-style casts: typename(args).
+	// OpenCL C requires C-style casts: (typename)(args).
+	// This applies to both vector types (e.g. float4(x)) and scalar casts
+	// (e.g. int(0x80000000), long(0x8000000000000000ul), uchar(0)).
+	auto scalar_type = type;
+	scalar_type.vecsize = 1;
+	auto type_name = (type.vecsize > 1) ? type_to_glsl(type) : type_to_glsl(scalar_type);
+	if (!type_name.empty() && res.size() > type_name.size() + 1 && res.substr(0, type_name.size()) == type_name &&
+	    res[type_name.size()] == '(')
 	{
-		// The base class emits: typename(args). OpenCL needs: (typename)(args).
-		auto type_name = type_to_glsl(type);
-		if (res.size() > type_name.size() + 1 && res.substr(0, type_name.size()) == type_name &&
-		    res[type_name.size()] == '(')
-		{
-			res = "(" + type_name + ")(" + res.substr(type_name.size() + 1);
-		}
+		res = "(" + type_name + ")(" + res.substr(type_name.size() + 1);
 	}
 
 	return res;
@@ -2413,23 +2436,41 @@ void CompilerOpenCL::emit_glsl_op(uint32_t result_type, uint32_t result_id, uint
 		emit_trinary_func_op(result_type, result_id, args[0], args[1], args[2], "clamp");
 		break;
 	case GLSLstd450SMin:
-		emit_binary_func_op(result_type, result_id, args[0], args[1], "min");
+	{
+		auto int_type = to_signed_basetype(get_integer_width_for_glsl_instruction(glsl_op, args, count));
+		emit_binary_func_op_cast(result_type, result_id, args[0], args[1], "min", int_type, false);
 		break;
+	}
 	case GLSLstd450SMax:
-		emit_binary_func_op(result_type, result_id, args[0], args[1], "max");
+	{
+		auto int_type = to_signed_basetype(get_integer_width_for_glsl_instruction(glsl_op, args, count));
+		emit_binary_func_op_cast(result_type, result_id, args[0], args[1], "max", int_type, false);
 		break;
+	}
 	case GLSLstd450UMin:
-		emit_binary_func_op(result_type, result_id, args[0], args[1], "min");
+	{
+		auto uint_type = to_unsigned_basetype(get_integer_width_for_glsl_instruction(glsl_op, args, count));
+		emit_binary_func_op_cast(result_type, result_id, args[0], args[1], "min", uint_type, false);
 		break;
+	}
 	case GLSLstd450UMax:
-		emit_binary_func_op(result_type, result_id, args[0], args[1], "max");
+	{
+		auto uint_type = to_unsigned_basetype(get_integer_width_for_glsl_instruction(glsl_op, args, count));
+		emit_binary_func_op_cast(result_type, result_id, args[0], args[1], "max", uint_type, false);
 		break;
+	}
 	case GLSLstd450SClamp:
-		emit_trinary_func_op(result_type, result_id, args[0], args[1], args[2], "clamp");
+	{
+		auto int_type = to_signed_basetype(get_integer_width_for_glsl_instruction(glsl_op, args, count));
+		emit_trinary_func_op_cast(result_type, result_id, args[0], args[1], args[2], "clamp", int_type);
 		break;
+	}
 	case GLSLstd450UClamp:
-		emit_trinary_func_op(result_type, result_id, args[0], args[1], args[2], "clamp");
+	{
+		auto uint_type = to_unsigned_basetype(get_integer_width_for_glsl_instruction(glsl_op, args, count));
+		emit_trinary_func_op_cast(result_type, result_id, args[0], args[1], args[2], "clamp", uint_type);
 		break;
+	}
 
 	case GLSLstd450FMix:
 	case GLSLstd450IMix:
@@ -2544,7 +2585,8 @@ std::string CompilerOpenCL::to_member_reference(uint32_t base, const SPIRType &t
 			// so is_pointer() above is false — we only reach here with actual pointers.
 			// Note: StorageClassWorkgroup is excluded because __local variables are emitted
 			// as value types in OpenCL C, so member access uses '.'.
-			if (sc == StorageClassStorageBuffer || sc == StorageClassCrossWorkgroup)
+			if (sc == StorageClassStorageBuffer || sc == StorageClassCrossWorkgroup ||
+			    sc == StorageClassPhysicalStorageBuffer)
 			{
 				return join("->", to_member_name(type, index));
 			}
@@ -3200,7 +3242,9 @@ bool CompilerOpenCL::emit_array_copy(const char *expr, uint32_t lhs_id, uint32_t
 		lhs = to_expression(lhs_id);
 
 	auto rhs_expr = to_expression(rhs_id);
-	auto &type = expression_type(rhs_id);
+	auto &raw_type = expression_type(rhs_id);
+	// If the RHS is a pointer (e.g., from OpLoad source), use the pointee type.
+	auto &type = is_pointer(raw_type) ? get_pointee_type(raw_type) : raw_type;
 
 	// Get the array size
 	if (!is_array(type) || type.array.empty())
@@ -3975,6 +4019,35 @@ void CompilerOpenCL::emit_instruction(const Instruction &instruction)
 		break;
 	}
 
+	// BDA pointer casts: emit C-style casts instead of GLSL constructor-style.
+	case OpConvertUToPtr:
+	{
+		auto &type = get<SPIRType>(ops[0]);
+		auto &in_type = expression_type(ops[2]);
+		auto ptr_type_str = type_to_glsl(type);
+		string expr;
+		if (in_type.vecsize > 1)
+			expr = join("((", ptr_type_str, ")as_ulong(", to_expression(ops[2]), "))");
+		else
+			expr = join("((", ptr_type_str, ")(", to_expression(ops[2]), "))");
+		emit_op(ops[0], ops[1], std::move(expr), should_forward(ops[2]));
+		inherit_expression_dependencies(ops[1], ops[2]);
+		break;
+	}
+
+	case OpConvertPtrToU:
+	{
+		auto &type = get<SPIRType>(ops[0]);
+		string expr;
+		if (type.vecsize > 1)
+			expr = join("as_", type_to_glsl(type), "((ulong)(", to_expression(ops[2]), "))");
+		else
+			expr = join("(", type_to_glsl(type), ")(", to_expression(ops[2]), ")");
+		emit_op(ops[0], ops[1], std::move(expr), should_forward(ops[2]));
+		inherit_expression_dependencies(ops[1], ops[2]);
+		break;
+	}
+
 	case OpBitcast:
 	{
 		auto &out_type = get<SPIRType>(ops[0]);
diff --git a/spirv_opencl.hpp b/spirv_opencl.hpp
index 82e551be9..0303e55d9 100644
--- a/spirv_opencl.hpp
+++ b/spirv_opencl.hpp
@@ -122,6 +122,7 @@ class CompilerOpenCL : public CompilerGLSL
 	                     StorageClass rhs_storage) override;
 	std::string constant_expression(const SPIRConstant &c, bool inside_block_like_struct_scope = false,
 	                                bool inside_struct_scope = false) override;
+	std::string to_initializer_expression(const SPIRVariable &var) override;
 	std::string constant_expression_vector(const SPIRConstant &c, uint32_t vector) override;
 	std::string bitcast_glsl_op(const SPIRType &result_type, const SPIRType &argument_type) override;
 	std::string to_atomic_ptr_expression(uint32_t id) override;
@@ -156,6 +157,10 @@ class CompilerOpenCL : public CompilerGLSL
 	// These are C values (not pointers), so subsequent member accesses must use '.' not '->'.
 	std::unordered_set<uint32_t> subscripted_deref_exprs;
 
+	// Pending array copies from to_initializer_expression: { var_id, initializer_id }
+	// These are emitted as element-by-element copies after the variable declaration.
+	SmallVector<std::pair<uint32_t, uint32_t>> pending_array_copies;
+
 	// Set when packHalf2x16/unpackHalf2x16 polyfill helpers are needed.
 	bool needs_half_pack_polyfill = false;
 	bool needs_half_unpack_polyfill = false;

From d1371b35571e5545a4c5ee425d830c08b8f1f66e Mon Sep 17 00:00:00 2001
From: Garrick Meeker <gmeeker@gmail.com>
Date: Sat, 14 Mar 2026 14:00:52 -0700
Subject: [PATCH 08/16] OpenCL: Fixes for legal C code

---
 ...e-load-store-short-vector.invalid.asm.comp |  14 +-
 ...ray-copy-physical-layout-mismatch.asm.comp |  37 +++
 ...-physical-layout-mismatch.invalid.asm.comp |  20 --
 ...m.comp => bitcast-fp16-fp32.fp16.asm.comp} |   4 +-
 ... block-like-array-type-construct.asm.comp} |  10 +-
 ...vice-array-load-temporary.asm.invalid.comp |   6 +-
 ...porary.force-native-array.asm.invalid.comp |   6 +-
 ...constant-array-load-store.asm.invalid.comp |  24 +-
 ...-store.force-native-array.asm.invalid.comp |  24 +-
 ...shared-inner-array-of-struct-copy.asm.comp |  53 ++++
 ...nner-array-of-struct-copy.invalid.asm.comp |  38 ---
 ...asm.comp => storage-buffer-basic.asm.comp} |   2 +-
 ...ar-alias-ptr-access-chain.asm.invalid.comp |   2 +-
 ...tier-1.device-argument-buffer.invalid.comp |  23 ++
 .../comp/array-copy-threadgroup-memory.comp   |  42 +++
 ...array-copy-threadgroup-memory.invalid.comp |  20 --
 .../shaders-opencl-no-opt/comp/basic.comp     |  36 +++
 .../comp/basic.invalid.comp                   |   0
 ...-atomics.invalid.comp => bda-atomics.comp} |   2 +-
 ...p => bda-load-std140-arrayed-pointer.comp} |   2 +-
 ...omp => bda-restrict-pointer-variable.comp} |   2 +-
 ...omp => extract-atomics-from-function.comp} |   2 +-
 ...p => implicit-integer-promotion.fp16.comp} |  48 ++--
 ...nvalid.comp => int16min-literal.fp16.comp} |   2 +-
 ...lid.comp => read-only-coherent-image.comp} |   2 +-
 ...tier-1.device-argument-buffer.invalid.comp |   0
 ...e-load-store-short-vector.invalid.asm.comp |  14 +-
 ...ay-copy-physical-layout-mismatch.asm.comp} |   0
 ...m.comp => bitcast-fp16-fp32.fp16.asm.comp} |   0
 ... block-like-array-type-construct.asm.comp} |   0
 ...hared-inner-array-of-struct-copy.asm.comp} |   0
 ...asm.comp => storage-buffer-basic.asm.comp} |   0
 ...omp => array-copy-threadgroup-memory.comp} |   0
 .../comp/{basic.invalid.comp => basic.comp}   |   0
 ...-atomics.invalid.comp => bda-atomics.comp} |   0
 ...p => bda-load-std140-arrayed-pointer.comp} |   0
 ...omp => bda-restrict-pointer-variable.comp} |   0
 ...omp => extract-atomics-from-function.comp} |   0
 ...p => implicit-integer-promotion.fp16.comp} |   0
 ...nvalid.comp => int16min-literal.fp16.comp} |   0
 ...lid.comp => read-only-coherent-image.comp} |   0
 ...tier-1.device-argument-buffer.invalid.comp |  13 -
 spirv_glsl.cpp                                |  18 +-
 spirv_glsl.hpp                                |   3 +
 spirv_opencl.cpp                              | 245 ++++++++++++++----
 spirv_opencl.hpp                              |   5 +-
 46 files changed, 499 insertions(+), 220 deletions(-)
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/array-copy-physical-layout-mismatch.asm.comp
 delete mode 100644 reference/shaders-opencl-no-opt/asm/comp/array-copy-physical-layout-mismatch.invalid.asm.comp
 rename reference/shaders-opencl-no-opt/asm/comp/{bitcast-fp16-fp32.fp16.invalid.asm.comp => bitcast-fp16-fp32.fp16.asm.comp} (79%)
 rename reference/shaders-opencl-no-opt/asm/comp/{block-like-array-type-construct.invalid.asm.comp => block-like-array-type-construct.asm.comp} (83%)
 create mode 100644 reference/shaders-opencl-no-opt/asm/comp/groupshared-inner-array-of-struct-copy.asm.comp
 delete mode 100644 reference/shaders-opencl-no-opt/asm/comp/groupshared-inner-array-of-struct-copy.invalid.asm.comp
 rename reference/shaders-opencl-no-opt/asm/comp/{storage-buffer-basic.invalid.asm.comp => storage-buffer-basic.asm.comp} (93%)
 create mode 100644 reference/shaders-opencl-no-opt/comp/array-copy-threadgroup-memory.comp
 delete mode 100644 reference/shaders-opencl-no-opt/comp/array-copy-threadgroup-memory.invalid.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/basic.comp
 delete mode 100644 reference/shaders-opencl-no-opt/comp/basic.invalid.comp
 rename reference/shaders-opencl-no-opt/comp/{bda-atomics.invalid.comp => bda-atomics.comp} (90%)
 rename reference/shaders-opencl-no-opt/comp/{bda-load-std140-arrayed-pointer.invalid.comp => bda-load-std140-arrayed-pointer.comp} (94%)
 rename reference/shaders-opencl-no-opt/comp/{bda-restrict-pointer-variable.invalid.comp => bda-restrict-pointer-variable.comp} (81%)
 rename reference/shaders-opencl-no-opt/comp/{extract-atomics-from-function.invalid.comp => extract-atomics-from-function.comp} (100%)
 rename reference/shaders-opencl-no-opt/comp/{implicit-integer-promotion.fp16.invalid.comp => implicit-integer-promotion.fp16.comp} (55%)
 rename reference/shaders-opencl-no-opt/comp/{int16min-literal.fp16.invalid.comp => int16min-literal.fp16.comp} (92%)
 rename reference/shaders-opencl-no-opt/comp/{read-only-coherent-image.invalid.comp => read-only-coherent-image.comp} (81%)
 delete mode 100644 reference/shaders-opencl-no-opt/comp/simple-bindless-ssbo.argument.argument-tier-1.device-argument-buffer.invalid.comp
 rename shaders-opencl-no-opt/asm/comp/{array-copy-physical-layout-mismatch.invalid.asm.comp => array-copy-physical-layout-mismatch.asm.comp} (100%)
 rename shaders-opencl-no-opt/asm/comp/{bitcast-fp16-fp32.fp16.invalid.asm.comp => bitcast-fp16-fp32.fp16.asm.comp} (100%)
 rename shaders-opencl-no-opt/asm/comp/{block-like-array-type-construct.invalid.asm.comp => block-like-array-type-construct.asm.comp} (100%)
 rename shaders-opencl-no-opt/asm/comp/{groupshared-inner-array-of-struct-copy.invalid.asm.comp => groupshared-inner-array-of-struct-copy.asm.comp} (100%)
 rename shaders-opencl-no-opt/asm/comp/{storage-buffer-basic.invalid.asm.comp => storage-buffer-basic.asm.comp} (100%)
 rename shaders-opencl-no-opt/comp/{array-copy-threadgroup-memory.invalid.comp => array-copy-threadgroup-memory.comp} (100%)
 rename shaders-opencl-no-opt/comp/{basic.invalid.comp => basic.comp} (100%)
 rename shaders-opencl-no-opt/comp/{bda-atomics.invalid.comp => bda-atomics.comp} (100%)
 rename shaders-opencl-no-opt/comp/{bda-load-std140-arrayed-pointer.invalid.comp => bda-load-std140-arrayed-pointer.comp} (100%)
 rename shaders-opencl-no-opt/comp/{bda-restrict-pointer-variable.invalid.comp => bda-restrict-pointer-variable.comp} (100%)
 rename shaders-opencl-no-opt/comp/{extract-atomics-from-function.invalid.comp => extract-atomics-from-function.comp} (100%)
 rename shaders-opencl-no-opt/comp/{implicit-integer-promotion.fp16.invalid.comp => implicit-integer-promotion.fp16.comp} (100%)
 rename shaders-opencl-no-opt/comp/{int16min-literal.fp16.invalid.comp => int16min-literal.fp16.comp} (100%)
 rename shaders-opencl-no-opt/comp/{read-only-coherent-image.invalid.comp => read-only-coherent-image.comp} (100%)
 delete mode 100644 shaders-opencl-no-opt/comp/simple-bindless-ssbo.argument.argument-tier-1.device-argument-buffer.invalid.comp

diff --git a/reference/opt/shaders-opencl/asm/comp/image-load-store-short-vector.invalid.asm.comp b/reference/opt/shaders-opencl/asm/comp/image-load-store-short-vector.invalid.asm.comp
index f7d65805e..4ab76df67 100644
--- a/reference/opt/shaders-opencl/asm/comp/image-load-store-short-vector.invalid.asm.comp
+++ b/reference/opt/shaders-opencl/asm/comp/image-load-store-short-vector.invalid.asm.comp
@@ -1,18 +1,18 @@
 // Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
 
 
-void _main( uint3* id)
+void _main( uint3* id_2)
 {
-    float2 loaded = read_imagef(TargetTexture, as_int2((*id).xy)).xy;
-    float2 storeTemp = loaded + (float2)(1.0f);
-    write_imagef(TargetTexture, as_int2((*id).xy + (uint2)(1u)), (float4)(storeTemp));
+    float2 loaded_1 = read_imagef(TargetTexture, as_int2((*id_2).xy)).xy;
+    float2 storeTemp_1 = loaded_1 + (float2)(1.0f);
+    write_imagef(TargetTexture, as_int2((*id_2).xy + (uint2)(1u)), (float4)(storeTemp_1));
 }
 
 __attribute__((reqd_work_group_size(1, 1, 1)))
 __kernel void comp_main(write_only image2d_t TargetTexture)
 {
-    uint3 id_1 = ((uint3)(get_group_id(0), get_group_id(1), get_group_id(2)));
-    uint3 param = id_1;
-    _main(&param);
+    uint3 id_1_1 = ((uint3)(get_group_id(0), get_group_id(1), get_group_id(2)));
+    uint3 param_1 = id_1_1;
+    _main(&param_1);
 }
 
diff --git a/reference/shaders-opencl-no-opt/asm/comp/array-copy-physical-layout-mismatch.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/array-copy-physical-layout-mismatch.asm.comp
new file mode 100644
index 000000000..2cb996c09
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/array-copy-physical-layout-mismatch.asm.comp
@@ -0,0 +1,37 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO
+{
+    float b[5];
+    float c[5];
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global SSBO* _7)
+{
+    float a[5];
+    a[0] = _7->b[0];
+    a[1] = _7->b[1];
+    a[2] = _7->b[2];
+    a[3] = _7->b[3];
+    a[4] = _7->b[4];
+    a[0] = _7->b[0];
+    a[1] = _7->b[1];
+    a[2] = _7->b[2];
+    a[3] = _7->b[3];
+    a[4] = _7->b[4];
+    _7->b[0] = a[0];
+    _7->b[1] = a[1];
+    _7->b[2] = a[2];
+    _7->b[3] = a[3];
+    _7->b[4] = a[4];
+    _7->c[0] = a[0];
+    _7->c[1] = a[1];
+    _7->c[2] = a[2];
+    _7->c[3] = a[3];
+    _7->c[4] = a[4];
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/array-copy-physical-layout-mismatch.invalid.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/array-copy-physical-layout-mismatch.invalid.asm.comp
deleted file mode 100644
index 45aaa65c5..000000000
--- a/reference/shaders-opencl-no-opt/asm/comp/array-copy-physical-layout-mismatch.invalid.asm.comp
+++ /dev/null
@@ -1,20 +0,0 @@
-// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
-
-
-struct SSBO
-{
-    float b[5];
-    float c[5];
-};
-
-typedef struct SSBO SSBO;
-
-__attribute__((reqd_work_group_size(1, 1, 1)))
-__kernel void comp_main(__global SSBO* _7)
-{
-    float a[5] = _7->b;
-    a = _7->b;
-    _7->b = a;
-    _7->c = a;
-}
-
diff --git a/reference/shaders-opencl-no-opt/asm/comp/bitcast-fp16-fp32.fp16.invalid.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/bitcast-fp16-fp32.fp16.asm.comp
similarity index 79%
rename from reference/shaders-opencl-no-opt/asm/comp/bitcast-fp16-fp32.fp16.invalid.asm.comp
rename to reference/shaders-opencl-no-opt/asm/comp/bitcast-fp16-fp32.fp16.asm.comp
index b02f295d9..3b38a72fe 100644
--- a/reference/shaders-opencl-no-opt/asm/comp/bitcast-fp16-fp32.fp16.invalid.asm.comp
+++ b/reference/shaders-opencl-no-opt/asm/comp/bitcast-fp16-fp32.fp16.asm.comp
@@ -16,7 +16,7 @@ __attribute__((reqd_work_group_size(1, 1, 1)))
 __kernel void comp_main(__global SSBO* _6)
 {
     uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
-    _6->b = uintBitsToFloat(packFloat2x16(_6->a));
-    _6->d = unpackFloat2x16(floatBitsToUint(_6->c));
+    _6->b = as_float(_6->a);
+    _6->d = as_half2(_6->c);
 }
 
diff --git a/reference/shaders-opencl-no-opt/asm/comp/block-like-array-type-construct.invalid.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/block-like-array-type-construct.asm.comp
similarity index 83%
rename from reference/shaders-opencl-no-opt/asm/comp/block-like-array-type-construct.invalid.asm.comp
rename to reference/shaders-opencl-no-opt/asm/comp/block-like-array-type-construct.asm.comp
index 421377b4d..3744e3e11 100644
--- a/reference/shaders-opencl-no-opt/asm/comp/block-like-array-type-construct.invalid.asm.comp
+++ b/reference/shaders-opencl-no-opt/asm/comp/block-like-array-type-construct.asm.comp
@@ -27,11 +27,17 @@ __kernel void comp_main(__global SSBO* _8)
     float foo[4];
     float foo2[4];
     foo[0] = 1.0f;
-    foo = { 1.0f, 2.0f, 3.0f, 4.0f };
+    foo[0] = 1.0f;
+    foo[1] = 2.0f;
+    foo[2] = 3.0f;
+    foo[3] = 4.0f;
     foo[1] = 2.0f;
     foo[2] = 3.0f;
     foo[3] = 4.0f;
-    foo2 = foo;
+    foo2[0] = foo[0];
+    foo2[1] = foo[1];
+    foo2[2] = foo[2];
+    foo2[3] = foo[3];
     _12 _41 = (_12){ { foo[0], foo[1], foo[2], foo[3] }, { foo2[0], foo2[1], foo2[2], foo2[3] } };
 }
 
diff --git a/reference/shaders-opencl-no-opt/asm/comp/device-array-load-temporary.asm.invalid.comp b/reference/shaders-opencl-no-opt/asm/comp/device-array-load-temporary.asm.invalid.comp
index e4387c0c9..38ec524f8 100644
--- a/reference/shaders-opencl-no-opt/asm/comp/device-array-load-temporary.asm.invalid.comp
+++ b/reference/shaders-opencl-no-opt/asm/comp/device-array-load-temporary.asm.invalid.comp
@@ -23,7 +23,9 @@ __kernel void comp_main(__global Block* ssbo)
     uint2 _27[2];
     _27[0] = ssbo[0u]._m1[0];
     _27[1] = ssbo[0u]._m1[1];
-    ssbo[0u]._m0 = _27;
-    ssbo[0u]._m0 = _27;
+    ssbo[0u]._m0[0] = _27[0];
+    ssbo[0u]._m0[1] = _27[1];
+    ssbo[0u]._m0[0] = _27[0];
+    ssbo[0u]._m0[1] = _27[1];
 }
 
diff --git a/reference/shaders-opencl-no-opt/asm/comp/device-array-load-temporary.force-native-array.asm.invalid.comp b/reference/shaders-opencl-no-opt/asm/comp/device-array-load-temporary.force-native-array.asm.invalid.comp
index e4387c0c9..38ec524f8 100644
--- a/reference/shaders-opencl-no-opt/asm/comp/device-array-load-temporary.force-native-array.asm.invalid.comp
+++ b/reference/shaders-opencl-no-opt/asm/comp/device-array-load-temporary.force-native-array.asm.invalid.comp
@@ -23,7 +23,9 @@ __kernel void comp_main(__global Block* ssbo)
     uint2 _27[2];
     _27[0] = ssbo[0u]._m1[0];
     _27[1] = ssbo[0u]._m1[1];
-    ssbo[0u]._m0 = _27;
-    ssbo[0u]._m0 = _27;
+    ssbo[0u]._m0[0] = _27[0];
+    ssbo[0u]._m0[1] = _27[1];
+    ssbo[0u]._m0[0] = _27[0];
+    ssbo[0u]._m0[1] = _27[1];
 }
 
diff --git a/reference/shaders-opencl-no-opt/asm/comp/device-constant-array-load-store.asm.invalid.comp b/reference/shaders-opencl-no-opt/asm/comp/device-constant-array-load-store.asm.invalid.comp
index f8a5f221b..48f285a9a 100644
--- a/reference/shaders-opencl-no-opt/asm/comp/device-constant-array-load-store.asm.invalid.comp
+++ b/reference/shaders-opencl-no-opt/asm/comp/device-constant-array-load-store.asm.invalid.comp
@@ -20,14 +20,22 @@ __attribute__((reqd_work_group_size(1, 1, 1)))
 __kernel void comp_main(__global Block* ssbo, SSBO ubo)
 {
     __local uint2 _18[2];
-    ssbo[0u]._m0 = ssbo[0u]._m1;
-    ssbo[0u]._m0 = ubo._m0[0u]._m1;
+    ssbo[0u]._m0[0] = ssbo[0u]._m1[0];
+    ssbo[0u]._m0[1] = ssbo[0u]._m1[1];
+    ssbo[0u]._m0[0] = ubo._m0[0u]._m1[0];
+    ssbo[0u]._m0[1] = ubo._m0[0u]._m1[1];
     uint2 _23[2];
-    ssbo[0u]._m0 = _23;
-    ssbo[0u]._m0 = _18;
-    _18 = ssbo[0u]._m1;
-    _23 = ssbo[0u]._m1;
-    _18 = ubo._m0[0u]._m1;
-    _23 = ubo._m0[0u]._m1;
+    ssbo[0u]._m0[0] = _23[0];
+    ssbo[0u]._m0[1] = _23[1];
+    ssbo[0u]._m0[0] = _18[0];
+    ssbo[0u]._m0[1] = _18[1];
+    _18[0] = ssbo[0u]._m1[0];
+    _18[1] = ssbo[0u]._m1[1];
+    _23[0] = ssbo[0u]._m1[0];
+    _23[1] = ssbo[0u]._m1[1];
+    _18[0] = ubo._m0[0u]._m1[0];
+    _18[1] = ubo._m0[0u]._m1[1];
+    _23[0] = ubo._m0[0u]._m1[0];
+    _23[1] = ubo._m0[0u]._m1[1];
 }
 
diff --git a/reference/shaders-opencl-no-opt/asm/comp/device-constant-array-load-store.force-native-array.asm.invalid.comp b/reference/shaders-opencl-no-opt/asm/comp/device-constant-array-load-store.force-native-array.asm.invalid.comp
index f8a5f221b..48f285a9a 100644
--- a/reference/shaders-opencl-no-opt/asm/comp/device-constant-array-load-store.force-native-array.asm.invalid.comp
+++ b/reference/shaders-opencl-no-opt/asm/comp/device-constant-array-load-store.force-native-array.asm.invalid.comp
@@ -20,14 +20,22 @@ __attribute__((reqd_work_group_size(1, 1, 1)))
 __kernel void comp_main(__global Block* ssbo, SSBO ubo)
 {
     __local uint2 _18[2];
-    ssbo[0u]._m0 = ssbo[0u]._m1;
-    ssbo[0u]._m0 = ubo._m0[0u]._m1;
+    ssbo[0u]._m0[0] = ssbo[0u]._m1[0];
+    ssbo[0u]._m0[1] = ssbo[0u]._m1[1];
+    ssbo[0u]._m0[0] = ubo._m0[0u]._m1[0];
+    ssbo[0u]._m0[1] = ubo._m0[0u]._m1[1];
     uint2 _23[2];
-    ssbo[0u]._m0 = _23;
-    ssbo[0u]._m0 = _18;
-    _18 = ssbo[0u]._m1;
-    _23 = ssbo[0u]._m1;
-    _18 = ubo._m0[0u]._m1;
-    _23 = ubo._m0[0u]._m1;
+    ssbo[0u]._m0[0] = _23[0];
+    ssbo[0u]._m0[1] = _23[1];
+    ssbo[0u]._m0[0] = _18[0];
+    ssbo[0u]._m0[1] = _18[1];
+    _18[0] = ssbo[0u]._m1[0];
+    _18[1] = ssbo[0u]._m1[1];
+    _23[0] = ssbo[0u]._m1[0];
+    _23[1] = ssbo[0u]._m1[1];
+    _18[0] = ubo._m0[0u]._m1[0];
+    _18[1] = ubo._m0[0u]._m1[1];
+    _23[0] = ubo._m0[0u]._m1[0];
+    _23[1] = ubo._m0[0u]._m1[1];
 }
 
diff --git a/reference/shaders-opencl-no-opt/asm/comp/groupshared-inner-array-of-struct-copy.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/groupshared-inner-array-of-struct-copy.asm.comp
new file mode 100644
index 000000000..d0c505a04
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/asm/comp/groupshared-inner-array-of-struct-copy.asm.comp
@@ -0,0 +1,53 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct Data
+{
+    float3 sourceData[16];
+};
+
+typedef struct Data Data;
+
+__attribute__((reqd_work_group_size(8, 8, 1)))
+__kernel void comp_main(read_only image2d_t g_inputTexture, write_only image2d_t g_output)
+{
+    __local Data g_data[64];
+    uint _49;
+    _49 = 0u;
+    for (; _49 < 4u; _49++)
+    {
+        for (uint _56 = 0u; _56 < 4u; )
+        {
+            g_data[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].sourceData[(_49 * 4u) + _56] = read_imagef(g_inputTexture, (as_int3(((uint3)(get_global_id(0), get_global_id(1), get_global_id(2)))) + (int3)(as_int(_56), as_int(_49), 0)).xy).xyz;
+            _56++;
+            continue;
+        }
+    }
+    float3 _45[16];
+    _45[0] = g_data[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].sourceData[0];
+    _45[1] = g_data[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].sourceData[1];
+    _45[2] = g_data[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].sourceData[2];
+    _45[3] = g_data[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].sourceData[3];
+    _45[4] = g_data[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].sourceData[4];
+    _45[5] = g_data[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].sourceData[5];
+    _45[6] = g_data[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].sourceData[6];
+    _45[7] = g_data[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].sourceData[7];
+    _45[8] = g_data[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].sourceData[8];
+    _45[9] = g_data[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].sourceData[9];
+    _45[10] = g_data[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].sourceData[10];
+    _45[11] = g_data[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].sourceData[11];
+    _45[12] = g_data[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].sourceData[12];
+    _45[13] = g_data[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].sourceData[13];
+    _45[14] = g_data[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].sourceData[14];
+    _45[15] = g_data[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].sourceData[15];
+    uint _77;
+    _77 = 0u;
+    for (int _80 = 0; _80 < 16; )
+    {
+        _77 |= convert_uint(clamp(dot(_45[_80], (float3)(-1.0f)), 0.0f, 1.0f));
+        _80++;
+        continue;
+    }
+    write_imageui(g_output, as_int2(((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).xy), (uint4)(_77));
+}
+
diff --git a/reference/shaders-opencl-no-opt/asm/comp/groupshared-inner-array-of-struct-copy.invalid.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/groupshared-inner-array-of-struct-copy.invalid.asm.comp
deleted file mode 100644
index cedb4d5d6..000000000
--- a/reference/shaders-opencl-no-opt/asm/comp/groupshared-inner-array-of-struct-copy.invalid.asm.comp
+++ /dev/null
@@ -1,38 +0,0 @@
-// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
-
-
-struct Data
-{
-    float3 sourceData[16];
-};
-
-typedef struct Data Data;
-
-__attribute__((reqd_work_group_size(8, 8, 1)))
-__kernel void comp_main(read_only image2d_t g_inputTexture, write_only image2d_t g_output)
-{
-    __local Data g_data[64];
-    uint _49;
-    _49 = 0u;
-    for (; _49 < 4u; _49++)
-    {
-        for (uint _56 = 0u; _56 < 4u; )
-        {
-            int3 _65 = as_int3(((uint3)(get_global_id(0), get_global_id(1), get_global_id(2)))) + (int3)(as_int(_56), as_int(_49), 0);
-            g_data[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].sourceData[(_49 * 4u) + _56] = texelFetch(g_inputTexture, _65.xy, _65.z).xyz;
-            _56++;
-            continue;
-        }
-    }
-    float3 _45[16] = g_data[((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x].sourceData;
-    uint _77;
-    _77 = 0u;
-    for (int _80 = 0; _80 < 16; )
-    {
-        _77 |= convert_uint(clamp(dot(_45[_80], (float3)(-1.0f)), 0.0f, 1.0f));
-        _80++;
-        continue;
-    }
-    write_imageui(g_output, as_int2(((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).xy), (uint4)(_77));
-}
-
diff --git a/reference/shaders-opencl-no-opt/asm/comp/storage-buffer-basic.invalid.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/storage-buffer-basic.asm.comp
similarity index 93%
rename from reference/shaders-opencl-no-opt/asm/comp/storage-buffer-basic.invalid.asm.comp
rename to reference/shaders-opencl-no-opt/asm/comp/storage-buffer-basic.asm.comp
index 952585e08..4641626ae 100644
--- a/reference/shaders-opencl-no-opt/asm/comp/storage-buffer-basic.invalid.asm.comp
+++ b/reference/shaders-opencl-no-opt/asm/comp/storage-buffer-basic.asm.comp
@@ -21,7 +21,7 @@ constant uint3 spvWorkgroupSize = (uint3)(_15, 2u, _17);
 __attribute__((reqd_work_group_size(1, 2, 3)))
 __kernel void comp_main(__global float* _20, __global float* _21)
 {
-    uint3 _19 = spvWorkgroupSize = spvWorkgroupSize;
+    uint3 _19 = spvWorkgroupSize;
     _20[((uint3)(get_group_id(0), get_group_id(1), get_group_id(2))).x] = _21[((uint3)(get_group_id(0), get_group_id(1), get_group_id(2))).x] + _20[((uint3)(get_group_id(0), get_group_id(1), get_group_id(2))).x];
 }
 
diff --git a/reference/shaders-opencl-no-opt/asm/comp/workgroup-uint-to-uchar-alias-ptr-access-chain.asm.invalid.comp b/reference/shaders-opencl-no-opt/asm/comp/workgroup-uint-to-uchar-alias-ptr-access-chain.asm.invalid.comp
index c6c583b90..2f3e9217c 100644
--- a/reference/shaders-opencl-no-opt/asm/comp/workgroup-uint-to-uchar-alias-ptr-access-chain.asm.invalid.comp
+++ b/reference/shaders-opencl-no-opt/asm/comp/workgroup-uint-to-uchar-alias-ptr-access-chain.asm.invalid.comp
@@ -43,7 +43,7 @@ __kernel void comp_main(__global uint4* _25, _8 _29)
 {
     __local uint _5[256];
     __local uchar _10[1024];
-    uint3 _20 = spvWorkgroupSize = spvWorkgroupSize;
+    uint3 _20 = spvWorkgroupSize;
     bool _40 = _29._m0._m0 != 0u;
     if (_40)
     {
diff --git a/reference/shaders-opencl-no-opt/comp/argument-buffer-readonly-writeonly-alias.argument.argument-tier-1.device-argument-buffer.invalid.comp b/reference/shaders-opencl-no-opt/comp/argument-buffer-readonly-writeonly-alias.argument.argument-tier-1.device-argument-buffer.invalid.comp
index e69de29bb..497606109 100644
--- a/reference/shaders-opencl-no-opt/comp/argument-buffer-readonly-writeonly-alias.argument.argument-tier-1.device-argument-buffer.invalid.comp
+++ b/reference/shaders-opencl-no-opt/comp/argument-buffer-readonly-writeonly-alias.argument.argument-tier-1.device-argument-buffer.invalid.comp
@@ -0,0 +1,23 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct D
+{
+    float data_d[1];
+};
+
+typedef struct D D;
+
+struct A
+{
+    float data_a[1];
+};
+
+typedef struct A A;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global float* d, __global const float* a)
+{
+    d[0][0] = a[0][0];
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/array-copy-threadgroup-memory.comp b/reference/shaders-opencl-no-opt/comp/array-copy-threadgroup-memory.comp
new file mode 100644
index 000000000..3ceaf1450
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/array-copy-threadgroup-memory.comp
@@ -0,0 +1,42 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+__attribute__((reqd_work_group_size(8, 1, 1)))
+__kernel void comp_main()
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    __local float shared_group[8][8];
+    __local float shared_group_alt[8][8];
+    float blob[8];
+    for (int i = 0; i < 8; i++)
+    {
+        blob[i] = convert_float(i);
+    }
+    shared_group[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))][0] = blob[0];
+    shared_group[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))][1] = blob[1];
+    shared_group[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))][2] = blob[2];
+    shared_group[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))][3] = blob[3];
+    shared_group[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))][4] = blob[4];
+    shared_group[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))][5] = blob[5];
+    shared_group[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))][6] = blob[6];
+    shared_group[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))][7] = blob[7];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    float copied_blob[8];
+    copied_blob[0] = shared_group[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0))) ^ 1u][0];
+    copied_blob[1] = shared_group[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0))) ^ 1u][1];
+    copied_blob[2] = shared_group[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0))) ^ 1u][2];
+    copied_blob[3] = shared_group[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0))) ^ 1u][3];
+    copied_blob[4] = shared_group[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0))) ^ 1u][4];
+    copied_blob[5] = shared_group[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0))) ^ 1u][5];
+    copied_blob[6] = shared_group[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0))) ^ 1u][6];
+    copied_blob[7] = shared_group[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0))) ^ 1u][7];
+    shared_group_alt[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))][0] = shared_group[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))][0];
+    shared_group_alt[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))][1] = shared_group[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))][1];
+    shared_group_alt[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))][2] = shared_group[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))][2];
+    shared_group_alt[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))][3] = shared_group[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))][3];
+    shared_group_alt[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))][4] = shared_group[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))][4];
+    shared_group_alt[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))][5] = shared_group[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))][5];
+    shared_group_alt[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))][6] = shared_group[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))][6];
+    shared_group_alt[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))][7] = shared_group[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))][7];
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/array-copy-threadgroup-memory.invalid.comp b/reference/shaders-opencl-no-opt/comp/array-copy-threadgroup-memory.invalid.comp
deleted file mode 100644
index 33bdcbea5..000000000
--- a/reference/shaders-opencl-no-opt/comp/array-copy-threadgroup-memory.invalid.comp
+++ /dev/null
@@ -1,20 +0,0 @@
-// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
-
-
-__attribute__((reqd_work_group_size(8, 1, 1)))
-__kernel void comp_main()
-{
-    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
-    __local float shared_group[8][8];
-    __local float shared_group_alt[8][8];
-    float blob[8];
-    for (int i = 0; i < 8; i++)
-    {
-        blob[i] = convert_float(i);
-    }
-    shared_group[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] = blob;
-    barrier(CLK_LOCAL_MEM_FENCE);
-    float copied_blob[8] = shared_group[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0))) ^ 1u];
-    shared_group_alt[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))] = shared_group[((uint)(get_local_id(2) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0)))];
-}
-
diff --git a/reference/shaders-opencl-no-opt/comp/basic.comp b/reference/shaders-opencl-no-opt/comp/basic.comp
new file mode 100644
index 000000000..35091de9a
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/basic.comp
@@ -0,0 +1,36 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct Baz
+{
+    int e;
+    int f;
+};
+
+typedef struct Baz Baz;
+
+struct Foo
+{
+    int a;
+    int b;
+};
+
+typedef struct Foo Foo;
+
+struct Bar
+{
+    int c;
+    int d;
+};
+
+typedef struct Bar Bar;
+
+__attribute__((reqd_work_group_size(3, 3, 2)))
+__kernel void comp_main(__global Baz* baz, Foo _34, Bar _40)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    uint3 coords = ((uint3)(get_global_id(0), get_global_id(1), get_global_id(2)));
+    baz[(coords.x + coords.y) + coords.z].e = _34.a + _40.c;
+    baz[(coords.x + coords.y) + coords.z].f = _34.b * _40.d;
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/basic.invalid.comp b/reference/shaders-opencl-no-opt/comp/basic.invalid.comp
deleted file mode 100644
index e69de29bb..000000000
diff --git a/reference/shaders-opencl-no-opt/comp/bda-atomics.invalid.comp b/reference/shaders-opencl-no-opt/comp/bda-atomics.comp
similarity index 90%
rename from reference/shaders-opencl-no-opt/comp/bda-atomics.invalid.comp
rename to reference/shaders-opencl-no-opt/comp/bda-atomics.comp
index 5b9a08f9b..a3250fb5d 100644
--- a/reference/shaders-opencl-no-opt/comp/bda-atomics.invalid.comp
+++ b/reference/shaders-opencl-no-opt/comp/bda-atomics.comp
@@ -33,7 +33,7 @@ struct SSBO
 typedef struct SSBO SSBO;
 
 __attribute__((reqd_work_group_size(1, 1, 1)))
-__kernel void comp_main(Registers _12, UBO _26, __global const __global Ptr** _35)
+__kernel void comp_main(Registers _12, UBO _26, __global const ulong* _35)
 {
     uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
     uint _23 = atomic_add(&((__global Ptr*)(_12.ptr))->i, 10u);
diff --git a/reference/shaders-opencl-no-opt/comp/bda-load-std140-arrayed-pointer.invalid.comp b/reference/shaders-opencl-no-opt/comp/bda-load-std140-arrayed-pointer.comp
similarity index 94%
rename from reference/shaders-opencl-no-opt/comp/bda-load-std140-arrayed-pointer.invalid.comp
rename to reference/shaders-opencl-no-opt/comp/bda-load-std140-arrayed-pointer.comp
index cb3ef0331..381ef3667 100644
--- a/reference/shaders-opencl-no-opt/comp/bda-load-std140-arrayed-pointer.invalid.comp
+++ b/reference/shaders-opencl-no-opt/comp/bda-load-std140-arrayed-pointer.comp
@@ -12,7 +12,7 @@ typedef struct SSBO SSBO;
 
 struct UBO
 {
-    __global SSBO* ptrs[2];
+    ulong ptrs[2];
 };
 
 typedef struct UBO UBO;
diff --git a/reference/shaders-opencl-no-opt/comp/bda-restrict-pointer-variable.invalid.comp b/reference/shaders-opencl-no-opt/comp/bda-restrict-pointer-variable.comp
similarity index 81%
rename from reference/shaders-opencl-no-opt/comp/bda-restrict-pointer-variable.invalid.comp
rename to reference/shaders-opencl-no-opt/comp/bda-restrict-pointer-variable.comp
index 5d82fb4d5..098f8fd5c 100644
--- a/reference/shaders-opencl-no-opt/comp/bda-restrict-pointer-variable.invalid.comp
+++ b/reference/shaders-opencl-no-opt/comp/bda-restrict-pointer-variable.comp
@@ -20,7 +20,7 @@ typedef struct Registers Registers;
 __attribute__((reqd_work_group_size(1, 1, 1)))
 __kernel void comp_main(Registers _14)
 {
-    restrict __global Ref* __restrict ref = ((__global Ref*)(_14.foo));
+    __global Ref* __restrict ref = ((__global Ref*)(_14.foo));
     ref->v = (float4)(1.0f);
 }
 
diff --git a/reference/shaders-opencl-no-opt/comp/extract-atomics-from-function.invalid.comp b/reference/shaders-opencl-no-opt/comp/extract-atomics-from-function.comp
similarity index 100%
rename from reference/shaders-opencl-no-opt/comp/extract-atomics-from-function.invalid.comp
rename to reference/shaders-opencl-no-opt/comp/extract-atomics-from-function.comp
index 016fbcd95..f1da5e503 100644
--- a/reference/shaders-opencl-no-opt/comp/extract-atomics-from-function.invalid.comp
+++ b/reference/shaders-opencl-no-opt/comp/extract-atomics-from-function.comp
@@ -1,7 +1,6 @@
 // Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
 
 
-#define var (*var_ptr)
 #define var (*var_ptr)
 void testAdd(__local uint* var_ptr)
 {
@@ -66,6 +65,7 @@ void testStore(__local uint* var_ptr)
 
 #undef var
 
+#define var (*var_ptr)
 void foo(__local uint* var_ptr)
 {
     testAdd(&var);
diff --git a/reference/shaders-opencl-no-opt/comp/implicit-integer-promotion.fp16.invalid.comp b/reference/shaders-opencl-no-opt/comp/implicit-integer-promotion.fp16.comp
similarity index 55%
rename from reference/shaders-opencl-no-opt/comp/implicit-integer-promotion.fp16.invalid.comp
rename to reference/shaders-opencl-no-opt/comp/implicit-integer-promotion.fp16.comp
index 7ee5a5f89..5191bd38c 100644
--- a/reference/shaders-opencl-no-opt/comp/implicit-integer-promotion.fp16.invalid.comp
+++ b/reference/shaders-opencl-no-opt/comp/implicit-integer-promotion.fp16.comp
@@ -16,34 +16,34 @@ typedef struct BUF0 BUF0;
 
 void test_u16(__global BUF0* _24)
 {
-    _24->f16 += as_half(ushort(_24->u16[0u] + _24->u16[1u]));
-    _24->f16 += as_half(ushort(_24->u16[0u] - _24->u16[1u]));
-    _24->f16 += as_half(ushort(_24->u16[0u] * _24->u16[1u]));
-    _24->f16 += as_half(ushort(_24->u16[0u] / _24->u16[1u]));
-    _24->f16 += as_half(ushort(_24->u16[0u] % _24->u16[1u]));
-    _24->f16 += as_half(ushort(_24->u16[0u] << _24->u16[1u]));
-    _24->f16 += as_half(ushort(_24->u16[0u] >> _24->u16[1u]));
-    _24->f16 += as_half(ushort(~_24->u16[0u]));
-    _24->f16 += as_half(ushort(-_24->u16[0u]));
-    _24->f16 += as_half(ushort(_24->u16[0u] ^ _24->u16[1u]));
-    _24->f16 += as_half(ushort(_24->u16[0u] & _24->u16[1u]));
-    _24->f16 += as_half(ushort(_24->u16[0u] | _24->u16[1u]));
+    _24->f16 += as_half((ushort)(_24->u16[0u] + _24->u16[1u]));
+    _24->f16 += as_half((ushort)(_24->u16[0u] - _24->u16[1u]));
+    _24->f16 += as_half((ushort)(_24->u16[0u] * _24->u16[1u]));
+    _24->f16 += as_half((ushort)(_24->u16[0u] / _24->u16[1u]));
+    _24->f16 += as_half((ushort)(_24->u16[0u] % _24->u16[1u]));
+    _24->f16 += as_half((ushort)(_24->u16[0u] << _24->u16[1u]));
+    _24->f16 += as_half((ushort)(_24->u16[0u] >> _24->u16[1u]));
+    _24->f16 += as_half((ushort)(~_24->u16[0u]));
+    _24->f16 += as_half((ushort)(-_24->u16[0u]));
+    _24->f16 += as_half((ushort)(_24->u16[0u] ^ _24->u16[1u]));
+    _24->f16 += as_half((ushort)(_24->u16[0u] & _24->u16[1u]));
+    _24->f16 += as_half((ushort)(_24->u16[0u] | _24->u16[1u]));
 }
 
 void test_i16(__global BUF0* _24)
 {
-    _24->f16 += as_half(short(_24->i16[0u] + _24->i16[1u]));
-    _24->f16 += as_half(short(_24->i16[0u] - _24->i16[1u]));
-    _24->f16 += as_half(short(_24->i16[0u] * _24->i16[1u]));
-    _24->f16 += as_half(short(_24->i16[0u] / _24->i16[1u]));
-    _24->f16 += as_half(short(_24->i16[0u] % _24->i16[1u]));
-    _24->f16 += as_half(short(_24->i16[0u] << _24->i16[1u]));
-    _24->f16 += as_half(short(_24->i16[0u] >> _24->i16[1u]));
-    _24->f16 += as_half(short(~_24->i16[0u]));
-    _24->f16 += as_half(short(-_24->i16[0u]));
-    _24->f16 += as_half(short(_24->i16[0u] ^ _24->i16[1u]));
-    _24->f16 += as_half(short(_24->i16[0u] & _24->i16[1u]));
-    _24->f16 += as_half(short(_24->i16[0u] | _24->i16[1u]));
+    _24->f16 += as_half((short)(_24->i16[0u] + _24->i16[1u]));
+    _24->f16 += as_half((short)(_24->i16[0u] - _24->i16[1u]));
+    _24->f16 += as_half((short)(_24->i16[0u] * _24->i16[1u]));
+    _24->f16 += as_half((short)(_24->i16[0u] / _24->i16[1u]));
+    _24->f16 += as_half((short)(_24->i16[0u] % _24->i16[1u]));
+    _24->f16 += as_half((short)(_24->i16[0u] << _24->i16[1u]));
+    _24->f16 += as_half((short)(_24->i16[0u] >> _24->i16[1u]));
+    _24->f16 += as_half((short)(~_24->i16[0u]));
+    _24->f16 += as_half((short)(-_24->i16[0u]));
+    _24->f16 += as_half((short)(_24->i16[0u] ^ _24->i16[1u]));
+    _24->f16 += as_half((short)(_24->i16[0u] & _24->i16[1u]));
+    _24->f16 += as_half((short)(_24->i16[0u] | _24->i16[1u]));
 }
 
 void test_u16s(__global BUF0* _24)
diff --git a/reference/shaders-opencl-no-opt/comp/int16min-literal.fp16.invalid.comp b/reference/shaders-opencl-no-opt/comp/int16min-literal.fp16.comp
similarity index 92%
rename from reference/shaders-opencl-no-opt/comp/int16min-literal.fp16.invalid.comp
rename to reference/shaders-opencl-no-opt/comp/int16min-literal.fp16.comp
index 4d3324d9e..ee9e25b09 100644
--- a/reference/shaders-opencl-no-opt/comp/int16min-literal.fp16.invalid.comp
+++ b/reference/shaders-opencl-no-opt/comp/int16min-literal.fp16.comp
@@ -21,7 +21,7 @@ __kernel void comp_main(UBO _12, __global half* _24)
 {
     uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
     short v = as_short(_12.b);
-    v = short(v ^ (-32768s));
+    v = (short)(v ^ (short)(-32768));
     _24[0] = as_half(v);
 }
 
diff --git a/reference/shaders-opencl-no-opt/comp/read-only-coherent-image.invalid.comp b/reference/shaders-opencl-no-opt/comp/read-only-coherent-image.comp
similarity index 81%
rename from reference/shaders-opencl-no-opt/comp/read-only-coherent-image.invalid.comp
rename to reference/shaders-opencl-no-opt/comp/read-only-coherent-image.comp
index 0e8f8174f..4fe8fbb98 100644
--- a/reference/shaders-opencl-no-opt/comp/read-only-coherent-image.invalid.comp
+++ b/reference/shaders-opencl-no-opt/comp/read-only-coherent-image.comp
@@ -9,7 +9,7 @@ struct SSBO
 typedef struct SSBO SSBO;
 
 __attribute__((reqd_work_group_size(1, 1, 1)))
-__kernel void comp_main(__global uint* _9, write_only image2d_t img)
+__kernel void comp_main(__global uint* _9, read_only image2d_t img)
 {
     uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
     _9[0] = read_imageui(img, (int2)(10)).x;
diff --git a/reference/shaders-opencl-no-opt/comp/simple-bindless-ssbo.argument.argument-tier-1.device-argument-buffer.invalid.comp b/reference/shaders-opencl-no-opt/comp/simple-bindless-ssbo.argument.argument-tier-1.device-argument-buffer.invalid.comp
deleted file mode 100644
index e69de29bb..000000000
diff --git a/reference/shaders-opencl/asm/comp/image-load-store-short-vector.invalid.asm.comp b/reference/shaders-opencl/asm/comp/image-load-store-short-vector.invalid.asm.comp
index f7d65805e..4ab76df67 100644
--- a/reference/shaders-opencl/asm/comp/image-load-store-short-vector.invalid.asm.comp
+++ b/reference/shaders-opencl/asm/comp/image-load-store-short-vector.invalid.asm.comp
@@ -1,18 +1,18 @@
 // Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
 
 
-void _main( uint3* id)
+void _main( uint3* id_2)
 {
-    float2 loaded = read_imagef(TargetTexture, as_int2((*id).xy)).xy;
-    float2 storeTemp = loaded + (float2)(1.0f);
-    write_imagef(TargetTexture, as_int2((*id).xy + (uint2)(1u)), (float4)(storeTemp));
+    float2 loaded_1 = read_imagef(TargetTexture, as_int2((*id_2).xy)).xy;
+    float2 storeTemp_1 = loaded_1 + (float2)(1.0f);
+    write_imagef(TargetTexture, as_int2((*id_2).xy + (uint2)(1u)), (float4)(storeTemp_1));
 }
 
 __attribute__((reqd_work_group_size(1, 1, 1)))
 __kernel void comp_main(write_only image2d_t TargetTexture)
 {
-    uint3 id_1 = ((uint3)(get_group_id(0), get_group_id(1), get_group_id(2)));
-    uint3 param = id_1;
-    _main(&param);
+    uint3 id_1_1 = ((uint3)(get_group_id(0), get_group_id(1), get_group_id(2)));
+    uint3 param_1 = id_1_1;
+    _main(&param_1);
 }
 
diff --git a/shaders-opencl-no-opt/asm/comp/array-copy-physical-layout-mismatch.invalid.asm.comp b/shaders-opencl-no-opt/asm/comp/array-copy-physical-layout-mismatch.asm.comp
similarity index 100%
rename from shaders-opencl-no-opt/asm/comp/array-copy-physical-layout-mismatch.invalid.asm.comp
rename to shaders-opencl-no-opt/asm/comp/array-copy-physical-layout-mismatch.asm.comp
diff --git a/shaders-opencl-no-opt/asm/comp/bitcast-fp16-fp32.fp16.invalid.asm.comp b/shaders-opencl-no-opt/asm/comp/bitcast-fp16-fp32.fp16.asm.comp
similarity index 100%
rename from shaders-opencl-no-opt/asm/comp/bitcast-fp16-fp32.fp16.invalid.asm.comp
rename to shaders-opencl-no-opt/asm/comp/bitcast-fp16-fp32.fp16.asm.comp
diff --git a/shaders-opencl-no-opt/asm/comp/block-like-array-type-construct.invalid.asm.comp b/shaders-opencl-no-opt/asm/comp/block-like-array-type-construct.asm.comp
similarity index 100%
rename from shaders-opencl-no-opt/asm/comp/block-like-array-type-construct.invalid.asm.comp
rename to shaders-opencl-no-opt/asm/comp/block-like-array-type-construct.asm.comp
diff --git a/shaders-opencl-no-opt/asm/comp/groupshared-inner-array-of-struct-copy.invalid.asm.comp b/shaders-opencl-no-opt/asm/comp/groupshared-inner-array-of-struct-copy.asm.comp
similarity index 100%
rename from shaders-opencl-no-opt/asm/comp/groupshared-inner-array-of-struct-copy.invalid.asm.comp
rename to shaders-opencl-no-opt/asm/comp/groupshared-inner-array-of-struct-copy.asm.comp
diff --git a/shaders-opencl-no-opt/asm/comp/storage-buffer-basic.invalid.asm.comp b/shaders-opencl-no-opt/asm/comp/storage-buffer-basic.asm.comp
similarity index 100%
rename from shaders-opencl-no-opt/asm/comp/storage-buffer-basic.invalid.asm.comp
rename to shaders-opencl-no-opt/asm/comp/storage-buffer-basic.asm.comp
diff --git a/shaders-opencl-no-opt/comp/array-copy-threadgroup-memory.invalid.comp b/shaders-opencl-no-opt/comp/array-copy-threadgroup-memory.comp
similarity index 100%
rename from shaders-opencl-no-opt/comp/array-copy-threadgroup-memory.invalid.comp
rename to shaders-opencl-no-opt/comp/array-copy-threadgroup-memory.comp
diff --git a/shaders-opencl-no-opt/comp/basic.invalid.comp b/shaders-opencl-no-opt/comp/basic.comp
similarity index 100%
rename from shaders-opencl-no-opt/comp/basic.invalid.comp
rename to shaders-opencl-no-opt/comp/basic.comp
diff --git a/shaders-opencl-no-opt/comp/bda-atomics.invalid.comp b/shaders-opencl-no-opt/comp/bda-atomics.comp
similarity index 100%
rename from shaders-opencl-no-opt/comp/bda-atomics.invalid.comp
rename to shaders-opencl-no-opt/comp/bda-atomics.comp
diff --git a/shaders-opencl-no-opt/comp/bda-load-std140-arrayed-pointer.invalid.comp b/shaders-opencl-no-opt/comp/bda-load-std140-arrayed-pointer.comp
similarity index 100%
rename from shaders-opencl-no-opt/comp/bda-load-std140-arrayed-pointer.invalid.comp
rename to shaders-opencl-no-opt/comp/bda-load-std140-arrayed-pointer.comp
diff --git a/shaders-opencl-no-opt/comp/bda-restrict-pointer-variable.invalid.comp b/shaders-opencl-no-opt/comp/bda-restrict-pointer-variable.comp
similarity index 100%
rename from shaders-opencl-no-opt/comp/bda-restrict-pointer-variable.invalid.comp
rename to shaders-opencl-no-opt/comp/bda-restrict-pointer-variable.comp
diff --git a/shaders-opencl-no-opt/comp/extract-atomics-from-function.invalid.comp b/shaders-opencl-no-opt/comp/extract-atomics-from-function.comp
similarity index 100%
rename from shaders-opencl-no-opt/comp/extract-atomics-from-function.invalid.comp
rename to shaders-opencl-no-opt/comp/extract-atomics-from-function.comp
diff --git a/shaders-opencl-no-opt/comp/implicit-integer-promotion.fp16.invalid.comp b/shaders-opencl-no-opt/comp/implicit-integer-promotion.fp16.comp
similarity index 100%
rename from shaders-opencl-no-opt/comp/implicit-integer-promotion.fp16.invalid.comp
rename to shaders-opencl-no-opt/comp/implicit-integer-promotion.fp16.comp
diff --git a/shaders-opencl-no-opt/comp/int16min-literal.fp16.invalid.comp b/shaders-opencl-no-opt/comp/int16min-literal.fp16.comp
similarity index 100%
rename from shaders-opencl-no-opt/comp/int16min-literal.fp16.invalid.comp
rename to shaders-opencl-no-opt/comp/int16min-literal.fp16.comp
diff --git a/shaders-opencl-no-opt/comp/read-only-coherent-image.invalid.comp b/shaders-opencl-no-opt/comp/read-only-coherent-image.comp
similarity index 100%
rename from shaders-opencl-no-opt/comp/read-only-coherent-image.invalid.comp
rename to shaders-opencl-no-opt/comp/read-only-coherent-image.comp
diff --git a/shaders-opencl-no-opt/comp/simple-bindless-ssbo.argument.argument-tier-1.device-argument-buffer.invalid.comp b/shaders-opencl-no-opt/comp/simple-bindless-ssbo.argument.argument-tier-1.device-argument-buffer.invalid.comp
deleted file mode 100644
index 0db56342c..000000000
--- a/shaders-opencl-no-opt/comp/simple-bindless-ssbo.argument.argument-tier-1.device-argument-buffer.invalid.comp
+++ /dev/null
@@ -1,13 +0,0 @@
-#version 450
-#extension GL_EXT_nonuniform_qualifier : require
-layout(local_size_x = 1) in;
-
-layout(set = 0, binding = 0) buffer SSBO
-{
-	vec4 a;
-} ssbos[];
-
-void main()
-{
-	ssbos[gl_WorkGroupID.x].a += 2.0;
-}
diff --git a/spirv_glsl.cpp b/spirv_glsl.cpp
index 66ab8c560..e782463c7 100644
--- a/spirv_glsl.cpp
+++ b/spirv_glsl.cpp
@@ -7122,8 +7122,12 @@ void CompilerGLSL::emit_unary_op_cast(uint32_t result_type, uint32_t result_id,
 {
 	auto &type = get<SPIRType>(result_type);
 	bool forward = should_forward(op0);
-	emit_op(result_type, result_id, join(type_to_glsl(type), "(", op, to_enclosed_unpacked_expression(op0), ")"),
-	        forward);
+	if (backend.c_style_casts)
+		emit_op(result_type, result_id,
+		        join("(", type_to_glsl(type), ")(", op, to_enclosed_unpacked_expression(op0), ")"), forward);
+	else
+		emit_op(result_type, result_id, join(type_to_glsl(type), "(", op, to_enclosed_unpacked_expression(op0), ")"),
+		        forward);
 	inherit_expression_dependencies(result_id, op0);
 }
 
@@ -7291,7 +7295,10 @@ void CompilerGLSL::emit_binary_op_cast(uint32_t result_type, uint32_t result_id,
 	if (implicit_integer_promotion)
 	{
 		// Simple value cast.
-		expr = join(type_to_glsl(out_type), '(', bitop, ')');
+		if (backend.c_style_casts)
+			expr = join("(", type_to_glsl(out_type), ")(", bitop, ")");
+		else
+			expr = join(type_to_glsl(out_type), '(', bitop, ')');
 	}
 	else if (out_type.basetype != input_type && out_type.basetype != SPIRType::Boolean)
 	{
@@ -13779,7 +13786,10 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction)
 
 		if (implicit_integer_promotion)
 		{
-			expr = join(type_to_glsl(get<SPIRType>(result_type)), '(', expr, ')');
+			if (backend.c_style_casts)
+				expr = join("(", type_to_glsl(get<SPIRType>(result_type)), ")(", expr, ")");
+			else
+				expr = join(type_to_glsl(get<SPIRType>(result_type)), '(', expr, ')');
 		}
 		else if (out_type.basetype != int_type)
 		{
diff --git a/spirv_glsl.hpp b/spirv_glsl.hpp
index 70c93fcd4..a6b564461 100644
--- a/spirv_glsl.hpp
+++ b/spirv_glsl.hpp
@@ -667,6 +667,9 @@ class CompilerGLSL : public Compiler
 		bool workgroup_size_is_hidden = false;
 		bool requires_relaxed_precision_analysis = false;
 		bool implicit_c_integer_promotion_rules = false;
+		// When true, emit C-style casts "(type)(expr)" instead of GLSL constructor-style "type(expr)"
+		// for value casts (e.g., implicit integer promotion).
+		bool c_style_casts = false;
 		bool supports_spec_constant_array_size = true;
 		// When non-empty, matrix column access uses this member name instead of raw array indexing.
 		// e.g., "columns" -> m.columns[i] instead of m[i].
diff --git a/spirv_opencl.cpp b/spirv_opencl.cpp
index b366ec9c6..cb2be10aa 100644
--- a/spirv_opencl.cpp
+++ b/spirv_opencl.cpp
@@ -78,8 +78,8 @@ string CompilerOpenCL::compile()
 	backend.float_literal_suffix = true;
 	backend.double_literal_suffix = true;
 	backend.uint32_t_literal_suffix = true;
-	backend.int16_t_literal_suffix = "s";
-	backend.uint16_t_literal_suffix = "us";
+	backend.int16_t_literal_suffix = "";
+	backend.uint16_t_literal_suffix = "";
 	backend.basic_int_type = "int";
 	backend.basic_uint_type = "uint";
 	backend.basic_int8_type = "char";
@@ -111,12 +111,14 @@ string CompilerOpenCL::compile()
 	backend.array_is_value_type_in_buffer_blocks = false;
 	backend.support_pointer_to_pointer = true;
 	backend.implicit_c_integer_promotion_rules = true;
+	backend.c_style_casts = true;
 	backend.supports_spec_constant_array_size = false;
 	backend.matrix_column_accessor = "columns";
 
 	fixup_anonymous_struct_names();
 	fixup_type_alias();
 	replace_illegal_names();
+	fixup_image_load_store_access();
 	build_function_control_flow_graphs_and_analyze();
 	update_active_builtins();
 	analyze_image_and_sampler_usage();
@@ -216,6 +218,21 @@ const char *CompilerOpenCL::to_storage_qualifiers_glsl(const SPIRVariable &)
 
 void CompilerOpenCL::compute_kernel_resources()
 {
+	// OpenCL C uses __restrict (after *) instead of GLSL's restrict (before type).
+	// Convert DecorationRestrictPointerEXT → DecorationRestrict so the base class
+	// flags_to_qualifiers_glsl does not emit "restrict " prefix, and our to_restrict
+	// emits "__restrict" after the pointer star instead.
+	ir.for_each_typed_id<SPIRVariable>(
+	    [&](uint32_t, SPIRVariable &var)
+	    {
+		    auto &flags = get_decoration_bitset(var.self);
+		    if (flags.get(DecorationRestrictPointerEXT))
+		    {
+			    unset_decoration(var.self, DecorationRestrictPointerEXT);
+			    set_decoration(var.self, DecorationRestrict);
+		    }
+	    });
+
 	// Collect all SSBOs/BufferBlocks that get flattened to __global T* kernel parameters.
 	flattened_buffer_vars.clear();
 	flattened_var_type_decl.clear();
@@ -234,7 +251,11 @@ void CompilerOpenCL::compute_kernel_resources()
 			    if (type.basetype == SPIRType::Struct && type.member_types.size() == 1)
 			    {
 				    const auto &member0_type = get<SPIRType>(type.member_types.front());
-				    subtype = type_to_glsl(member0_type);
+				    // BDA pointer members are stored as ulong in structs.
+				    if (is_pointer(member0_type) && member0_type.storage == StorageClassPhysicalStorageBuffer)
+					    subtype = "ulong";
+				    else
+					    subtype = type_to_glsl(member0_type);
 			    }
 			    else
 			    {
@@ -951,10 +972,9 @@ void CompilerOpenCL::emit_entry_point_declarations()
 		if (var.storage == StorageClassPrivate && !is_hidden_variable(var, true))
 		{
 			add_local_variable_name(var.self);
-			string initializer;
-			if (var.initializer)
-				initializer = join(" = ", to_expression(var.initializer));
-			statement(CompilerGLSL::variable_decl(var), initializer, ";");
+			// CompilerGLSL::variable_decl(var) already includes the initializer
+			// expression (via to_initializer_expression), so no extra initializer needed.
+			statement(CompilerGLSL::variable_decl(var), ";");
 		}
 	}
 
@@ -1137,9 +1157,8 @@ const char *CompilerOpenCL::to_restrict(uint32_t id, bool space)
 	else
 		flags = get_decoration_bitset(id);
 
-	// Only check DecorationRestrict here. DecorationRestrictPointerEXT is handled by
-	// flags_to_qualifiers_glsl in the GLSL base (emits "restrict " prefix), so we
-	// don't duplicate it as "__restrict" after the pointer star.
+	// DecorationRestrictPointerEXT is converted to DecorationRestrict in
+	// compute_kernel_resources(), so only check DecorationRestrict here.
 	return flags.get(DecorationRestrict) ? (space ? "__restrict " : "__restrict") : "";
 }
 
@@ -1829,17 +1848,9 @@ std::string CompilerOpenCL::to_initializer_expression(const SPIRVariable &var)
 	// (e.g., `float a[5] = ssbo->b;` is not valid C).
 	// For array variables with non-constant initializers, emit zero init `{ 0 }` and
 	// schedule element-by-element copy after the declaration.
-	auto &type = get_variable_data_type(var);
-	if (is_array(type) && var.initializer)
-	{
-		// Check if the initializer is a constant — those are fine as-is.
-		if (ir.ids[var.initializer].get_type() != TypeConstant)
-		{
-			// Queue the initializer for post-declaration element-by-element copy.
-			pending_array_copies.push_back({ var.self, var.initializer });
-			return "{ 0 }";
-		}
-	}
+	// SPIR-V spec only allows constant initializers on OpVariable, so array
+	// initializers are always constants and valid as-is in OpenCL C.
+	// Non-constant array copies are handled by emit_store_statement (OpStore).
 	return CompilerGLSL::to_initializer_expression(var);
 }
 
@@ -2513,6 +2524,26 @@ std::string CompilerOpenCL::bitcast_glsl_op(const SPIRType &out_type, const SPIR
 	return "as_" + out_name;
 }
 
+bool CompilerOpenCL::emit_complex_bitcast(uint32_t result_type, uint32_t id, uint32_t op0)
+{
+	auto &output_type = get<SPIRType>(result_type);
+	auto &input_type = expression_type(op0);
+	string expr;
+
+	// float → half2 bitcast: as_half2(expr)
+	if (output_type.basetype == SPIRType::Half && input_type.basetype == SPIRType::Float && input_type.vecsize == 1)
+		expr = join("as_half2(", to_unpacked_expression(op0), ")");
+	// half2 → float bitcast: as_float(expr)
+	else if (output_type.basetype == SPIRType::Float && input_type.basetype == SPIRType::Half &&
+	         input_type.vecsize == 2)
+		expr = join("as_float(", to_unpacked_expression(op0), ")");
+	else
+		return false;
+
+	emit_op(result_type, id, expr, should_forward(op0));
+	return true;
+}
+
 // Task #7: In OpenCL C, atomic functions take a pointer argument.
 // Access chain expressions (access_chain = true) may be C lvalues (e.g. ssbo->u32) → need &.
 // But single-member flattened SSBOs emit the raw pointer itself (e.g. _48 is __global uint*)
@@ -2728,6 +2759,12 @@ std::string CompilerOpenCL::entry_point_args(bool append_comma)
 				    {
 					    // Use type of first struct member as a StructuredBuffer will have only one '._m0' field in SPIR-V
 					    const auto &member0_type = this->get<SPIRType>(parent_type.member_types.front());
+					    // If the sole member is a BDA pointer, type_to_glsl would return
+					    // `__global Ptr*` which, wrapped in `__global const X*`, yields
+					    // double `__global` and pointer-to-pointer. Flatten to `ulong`
+					    // instead, matching how emit_struct_member stores BDA pointers.
+					    if (is_pointer(member0_type) && member0_type.storage == StorageClassPhysicalStorageBuffer)
+						    return std::string("ulong");
 					    return this->type_to_glsl(member0_type);
 				    }
 				    else
@@ -2896,6 +2933,24 @@ void CompilerOpenCL::emit_function_prototype(SPIRFunction &func, const Bitset &r
 	}
 
 	decl += ")";
+
+	// Emit #define macros right before the function prototype for workgroup scalar pointer aliasing.
+	// This must happen here (not in emit_function) because CompilerGLSL::emit_function recursively
+	// emits callee functions before reaching emit_function_prototype, so #define in emit_function
+	// would be undone by callee #undef before this function's body is emitted.
+	auto wg_it = func_workgroup_args.find(func.self);
+	if (wg_it != func_workgroup_args.end())
+	{
+		for (auto var_id : wg_it->second)
+		{
+			if (workgroup_scalar_vars.count(var_id))
+			{
+				auto var_name = to_name(var_id);
+				statement("#define ", var_name, " (*", var_name, "_ptr)");
+			}
+		}
+	}
+
 	statement(decl);
 }
 
@@ -2933,33 +2988,24 @@ void CompilerOpenCL::append_global_func_args(const SPIRFunction &func, uint32_t
 
 void CompilerOpenCL::emit_function(SPIRFunction &func, const Bitset &return_flags)
 {
-	// Emit #define macros before the function for workgroup scalar pointer aliasing.
+	CompilerGLSL::emit_function(func, return_flags);
+
+	// Emit #undef after the function body.
+	// The matching #define is emitted in emit_function_prototype.
 	auto wg_it = func_workgroup_args.find(func.self);
-	bool has_defines = false;
 	if (wg_it != func_workgroup_args.end())
 	{
+		bool has_defines = false;
 		for (auto var_id : wg_it->second)
 		{
 			if (workgroup_scalar_vars.count(var_id))
 			{
-				auto var_name = to_name(var_id);
-				statement("#define ", var_name, " (*", var_name, "_ptr)");
+				statement("#undef ", to_name(var_id));
 				has_defines = true;
 			}
 		}
-	}
-
-	CompilerGLSL::emit_function(func, return_flags);
-
-	// Emit #undef after the function.
-	if (has_defines)
-	{
-		for (auto var_id : wg_it->second)
-		{
-			if (workgroup_scalar_vars.count(var_id))
-				statement("#undef ", to_name(var_id));
-		}
-		statement("");
+		if (has_defines)
+			statement("");
 	}
 }
 
@@ -3020,6 +3066,35 @@ void CompilerOpenCL::emit_store_statement(uint32_t lhs_expression, uint32_t rhs_
 	}
 	else
 	{
+		// Check if storing an array type — C does not allow `array = expr;`.
+		auto &rhs_type_raw = expression_type(rhs_expression);
+		auto &rhs_type = is_pointer(rhs_type_raw) ? get_pointee_type(rhs_type_raw) : rhs_type_raw;
+		if (is_array(rhs_type))
+		{
+			auto *var = maybe_get<SPIRVariable>(lhs_expression);
+			// For deferred declarations where the RHS is a composite construct
+			// (not loaded from memory), C99 allows `T arr[N] = { ... };`.
+			// Let the base class handle that case — it merges decl + init correctly.
+			auto *rhs_expr_node = maybe_get<SPIRExpression>(rhs_expression);
+			bool rhs_from_memory = rhs_expr_node && rhs_expr_node->loaded_from;
+			if (var && var->deferred_declaration && !rhs_from_memory)
+			{
+				// Base class will emit `T arr[N] = { ... };`
+				CompilerGLSL::emit_store_statement(lhs_expression, rhs_expression);
+				return;
+			}
+
+			// Flush deferred declaration so we don't get "float a[5] = rhs".
+			if (var && var->deferred_declaration)
+			{
+				var->deferred_declaration = false;
+				statement(variable_decl_function_local(*var), ";");
+			}
+			auto lhs_expr = to_dereferenced_expression(lhs_expression);
+			emit_array_copy(lhs_expr.c_str(), 0, rhs_expression, StorageClassFunction, StorageClassFunction);
+			register_write(lhs_expression);
+			return;
+		}
 		CompilerGLSL::emit_store_statement(lhs_expression, rhs_expression);
 	}
 }
@@ -3031,19 +3106,20 @@ void CompilerOpenCL::emit_struct_member(const SPIRType &type, uint32_t member_ty
 	// OpenCL C does not use GLSL layout qualifiers or interpolation qualifiers.
 	// PhysicalStorageBuffer pointers in structs must be emitted as ulong since
 	// OpenCL C does not allow pointer types in kernel parameter structs.
-	if (is_pointer(membertype) && membertype.storage == StorageClassPhysicalStorageBuffer)
+	// Walk through array dimensions to find the inner element type, so that
+	// array-of-pointer members (e.g. `Ptr* ptrs[2]`) are also caught.
+	auto *inner = &membertype;
+	while (is_array(*inner))
+		inner = &get<SPIRType>(inner->parent_type);
+	if (is_pointer(*inner) && inner->storage == StorageClassPhysicalStorageBuffer)
 	{
-		statement(qualifier, "ulong ", to_member_name(type, index), ";");
+		statement(qualifier, "ulong ", to_member_name(type, index), type_to_array_glsl(membertype, 0), ";");
 	}
 	else if (has_member_decoration(type.self, index, DecorationRowMajor))
 	{
 		// Row-major matrix: the physical layout has transposed dimensions.
 		// Emit the member with the physical (transposed) type so struct layout matches buffer.
-		// Walk through array nesting to find the inner matrix type.
-		const auto *inner = &membertype;
-		while (is_array(*inner))
-			inner = &get<SPIRType>(inner->parent_type);
-
+		// `inner` already points to the innermost non-array type from the BDA check above.
 		if (inner->columns > 1)
 		{
 			auto phys_type_name = opencl_matrix_type_name(inner->basetype, inner->columns, inner->vecsize);
@@ -3262,9 +3338,19 @@ bool CompilerOpenCL::emit_array_copy(const char *expr, uint32_t lhs_id, uint32_t
 		return true;
 	}
 
-	// Emit element-by-element copy
-	for (uint32_t i = 0; i < array_size; i++)
-		statement(lhs, "[", i, "] = ", rhs_expr, "[", i, "];");
+	// For constant RHS, `to_expression` returns `{ 1.0f, 2.0f, ... }` and
+	// subscripting that (`{ ... }[0]`) is not valid C. Extract sub-constants.
+	auto *constant = maybe_get<SPIRConstant>(rhs_id);
+	if (constant && !constant->subconstants.empty())
+	{
+		for (uint32_t i = 0; i < array_size; i++)
+			statement(lhs, "[", i, "] = ", to_expression(constant->subconstants[i]), ";");
+	}
+	else
+	{
+		for (uint32_t i = 0; i < array_size; i++)
+			statement(lhs, "[", i, "] = ", rhs_expr, "[", i, "];");
+	}
 
 	return true;
 }
@@ -4164,6 +4250,7 @@ void CompilerOpenCL::emit_instruction(const Instruction &instruction)
 
 			string expr;
 			bool handled = false;
+			int32_t row_major_mbr_idx = -1; // member index for row-major check, -1 if N/A
 
 			bool is_subscript_deref = false; // result is a C value (subscripted), not a pointer
 
@@ -4175,6 +4262,7 @@ void CompilerOpenCL::emit_instruction(const Instruction &instruction)
 				// ops[5+] = optional sub-member indices
 				expr = join(to_name(base_id), "[", to_expression(ops[4]), "]");
 				is_subscript_deref = true;
+				row_major_mbr_idx = 0; // single member, always index 0
 				// Walk additional sub-member indices using type info.
 				if (length >= 6 && struct_type)
 				{
@@ -4204,6 +4292,7 @@ void CompilerOpenCL::emit_instruction(const Instruction &instruction)
 				auto mbr_name = to_member_name(*struct_type, mbr_idx);
 				expr = join(to_name(base_id), "[", to_expression(ops[3]), "].", mbr_name);
 				is_subscript_deref = true;
+				row_major_mbr_idx = int32_t(mbr_idx);
 				handled = true;
 			}
 			else if (length == 5 && !is_single_member && struct_type)
@@ -4214,6 +4303,7 @@ void CompilerOpenCL::emit_instruction(const Instruction &instruction)
 				auto mbr_name = to_member_name(*struct_type, mbr_idx);
 				expr = join(to_name(base_id), "->", mbr_name, "[", to_expression(ops[4]), "]");
 				is_subscript_deref = true;
+				row_major_mbr_idx = int32_t(mbr_idx);
 				handled = true;
 			}
 			else if (length == 4 && is_single_member)
@@ -4221,6 +4311,7 @@ void CompilerOpenCL::emit_instruction(const Instruction &instruction)
 				// Single-member SSBO flattened to T*: accessing the one member gives element 0.
 				expr = join(to_name(base_id), "[0]");
 				is_subscript_deref = true;
+				row_major_mbr_idx = 0; // single member, always index 0
 				handled = true;
 			}
 			else if (length == 4 && !is_single_member && struct_type && !struct_type->array.empty())
@@ -4237,6 +4328,7 @@ void CompilerOpenCL::emit_instruction(const Instruction &instruction)
 				auto mbr_name = to_member_name(*struct_type, mbr_idx);
 				expr = join(to_name(base_id), "->", mbr_name);
 				is_subscript_deref = true; // result is a struct value (accessed through ->), use . for children
+				row_major_mbr_idx = int32_t(mbr_idx);
 				handled = true;
 			}
 
@@ -4250,10 +4342,9 @@ void CompilerOpenCL::emit_instruction(const Instruction &instruction)
 					subscripted_deref_exprs.insert(result_id);
 
 				// Propagate row-major transpose flag for matrix members.
-				if (struct_type && length >= 4)
+				if (struct_type && row_major_mbr_idx >= 0)
 				{
-					uint32_t mbr_idx = get<SPIRConstant>(ops[3]).scalar();
-					if (member_is_non_native_row_major_matrix(*struct_type, mbr_idx))
+					if (member_is_non_native_row_major_matrix(*struct_type, uint32_t(row_major_mbr_idx)))
 						e.need_transpose = true;
 				}
 
@@ -4554,6 +4645,46 @@ void CompilerOpenCL::emit_instruction(const Instruction &instruction)
 		break;
 	}
 
+	// OpImageFetch (texelFetch in GLSL) maps to read_image* in OpenCL C,
+	// same as OpImageRead but may carry a Lod operand (which we ignore
+	// since OpenCL images don't support LOD on read).
+	case OpImageFetch:
+	{
+		uint32_t result_type = ops[0];
+		uint32_t result_id = ops[1];
+		uint32_t image_id = ops[2];
+		uint32_t coord_id = ops[3];
+
+		auto &result_spirtype = get<SPIRType>(result_type);
+		const char *read_func;
+		switch (result_spirtype.basetype)
+		{
+		case SPIRType::UInt:
+			read_func = "read_imageui";
+			break;
+		case SPIRType::Int:
+			read_func = "read_imagei";
+			break;
+		default:
+			read_func = "read_imagef";
+			break;
+		}
+
+		// Convert coordinate to int.
+		auto coord_type = expression_type(coord_id);
+		coord_type.basetype = SPIRType::Int;
+		auto coord_expr = bitcast_expression(coord_type, expression_type(coord_id).basetype, to_expression(coord_id));
+
+		auto raw_expr = join(read_func, "(", to_expression(image_id), ", ", coord_expr, ")");
+		auto swizzled = remap_swizzle(result_spirtype, 4, raw_expr);
+
+		bool forward = should_forward(image_id) && should_forward(coord_id);
+		emit_op(result_type, result_id, swizzled, forward);
+		inherit_expression_dependencies(result_id, image_id);
+		inherit_expression_dependencies(result_id, coord_id);
+		break;
+	}
+
 	// Task #10: Map image read/write/query ops to OpenCL C equivalents.
 	case OpImageRead:
 	{
@@ -4562,6 +4693,18 @@ void CompilerOpenCL::emit_instruction(const Instruction &instruction)
 		uint32_t image_id = ops[2];
 		uint32_t coord_id = ops[3];
 
+		// Unset NonReadable so image access qualifier deduction works correctly.
+		auto *image_var = maybe_get_backing_variable(image_id);
+		if (image_var)
+		{
+			auto &flags = get_decoration_bitset(image_var->self);
+			if (flags.get(DecorationNonReadable))
+			{
+				unset_decoration(image_var->self, DecorationNonReadable);
+				force_recompile();
+			}
+		}
+
 		auto &img_type = expression_type(image_id);
 		// SubpassData is not supported; fall through to base class.
 		if (img_type.image.dim == DimSubpassData)
diff --git a/spirv_opencl.hpp b/spirv_opencl.hpp
index 0303e55d9..94a7c5d35 100644
--- a/spirv_opencl.hpp
+++ b/spirv_opencl.hpp
@@ -125,6 +125,7 @@ class CompilerOpenCL : public CompilerGLSL
 	std::string to_initializer_expression(const SPIRVariable &var) override;
 	std::string constant_expression_vector(const SPIRConstant &c, uint32_t vector) override;
 	std::string bitcast_glsl_op(const SPIRType &result_type, const SPIRType &argument_type) override;
+	bool emit_complex_bitcast(uint32_t result_type, uint32_t id, uint32_t op0) override;
 	std::string to_atomic_ptr_expression(uint32_t id) override;
 	void emit_glsl_op(uint32_t result_type, uint32_t result_id, uint32_t op, const uint32_t *args,
 	                  uint32_t count) override;
@@ -157,10 +158,6 @@ class CompilerOpenCL : public CompilerGLSL
 	// These are C values (not pointers), so subsequent member accesses must use '.' not '->'.
 	std::unordered_set<uint32_t> subscripted_deref_exprs;
 
-	// Pending array copies from to_initializer_expression: { var_id, initializer_id }
-	// These are emitted as element-by-element copies after the variable declaration.
-	SmallVector<std::pair<uint32_t, uint32_t>> pending_array_copies;
-
 	// Set when packHalf2x16/unpackHalf2x16 polyfill helpers are needed.
 	bool needs_half_pack_polyfill = false;
 	bool needs_half_unpack_polyfill = false;

From 82f29d2abeeb605b1521f920bad846c8f1749bbd Mon Sep 17 00:00:00 2001
From: Garrick Meeker <gmeeker@gmail.com>
Date: Sat, 14 Mar 2026 14:42:10 -0700
Subject: [PATCH 09/16] OpenCL: Fixing another test

---
 ...w-maj-mtx-bypass-transpose.spv14.asm.comp} | 16 +----
 .../asm/comp/variable-pointers-2.asm.comp     |  2 +-
 .../comp/variable-pointers-3.invalid.asm.comp |  2 +-
 ...ariable-pointers-vector-to-scalar.asm.comp |  2 +-
 ...w-maj-mtx-bypass-transpose.spv14.asm.comp} |  0
 spirv_opencl.cpp                              | 62 +++++++++++++------
 spirv_opencl.hpp                              |  4 ++
 7 files changed, 54 insertions(+), 34 deletions(-)
 rename reference/shaders-opencl-no-opt/asm/comp/{opptrequal-row-maj-mtx-bypass-transpose.spv14.asm.invalid.comp => opptrequal-row-maj-mtx-bypass-transpose.spv14.asm.comp} (51%)
 rename shaders-opencl-no-opt/asm/comp/{opptrequal-row-maj-mtx-bypass-transpose.spv14.asm.invalid.comp => opptrequal-row-maj-mtx-bypass-transpose.spv14.asm.comp} (100%)

diff --git a/reference/shaders-opencl-no-opt/asm/comp/opptrequal-row-maj-mtx-bypass-transpose.spv14.asm.invalid.comp b/reference/shaders-opencl-no-opt/asm/comp/opptrequal-row-maj-mtx-bypass-transpose.spv14.asm.comp
similarity index 51%
rename from reference/shaders-opencl-no-opt/asm/comp/opptrequal-row-maj-mtx-bypass-transpose.spv14.asm.invalid.comp
rename to reference/shaders-opencl-no-opt/asm/comp/opptrequal-row-maj-mtx-bypass-transpose.spv14.asm.comp
index 0f41e332f..0cd1801d4 100644
--- a/reference/shaders-opencl-no-opt/asm/comp/opptrequal-row-maj-mtx-bypass-transpose.spv14.asm.invalid.comp
+++ b/reference/shaders-opencl-no-opt/asm/comp/opptrequal-row-maj-mtx-bypass-transpose.spv14.asm.comp
@@ -20,16 +20,6 @@ struct _7
 
 typedef struct _7 _7;
 
-static spvMat4 spvTransposeMat4(spvMat4 m)
-{
-    spvMat4 r;
-    r.columns[0] = (float4)(m.columns[0].x, m.columns[1].x, m.columns[2].x, m.columns[3].x);
-    r.columns[1] = (float4)(m.columns[0].y, m.columns[1].y, m.columns[2].y, m.columns[3].y);
-    r.columns[2] = (float4)(m.columns[0].z, m.columns[1].z, m.columns[2].z, m.columns[3].z);
-    r.columns[3] = (float4)(m.columns[0].w, m.columns[1].w, m.columns[2].w, m.columns[3].w);
-    return r;
-}
-
 __attribute__((reqd_work_group_size(1, 1, 1)))
 __kernel void comp_main(__global _6* _2, __global _6* _3, __global uint* _4)
 {
@@ -39,12 +29,12 @@ __kernel void comp_main(__global _6* _2, __global _6* _3, __global uint* _4)
     bool _40 = &_2->_m2 == &_3->_m2;
     uint _43 = _39 + 1u;
     _4[_39] = _40 ? 0u : 1u;
-    bool _46 = _40 ? &_2->_m2 : &_2->_m3 == _40 ? &_3->_m2 : &_3->_m3;
+    bool _46 = (_40 ? &_2->_m2 : &_2->_m3) == (_40 ? &_3->_m2 : &_3->_m3);
     uint _49 = _43 + 1u;
     _4[_43] = _46 ? 0u : 1u;
     uint _54 = _49 + 1u;
-    _4[_49] = (_46 ? &_2->_m2 : &_2->_m3 == &_2->_m0.columns[0u].x) ? 0u : 1u;
-    uint _56 = (&_2->_m0 == &spvTransposeMat4(_2->_m1)) ? 0u : 1u;
+    _4[_49] = ((_46 ? &_2->_m2 : &_2->_m3) == &((__global float*)&_2->_m0.columns[0u])[0u]) ? 0u : 1u;
+    uint _56 = (&_2->_m0 == &_2->_m1) ? 0u : 1u;
     uint _58 = _54 + 1u;
     _4[_54] = _56;
     _4[_58] = _56;
diff --git a/reference/shaders-opencl-no-opt/asm/comp/variable-pointers-2.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/variable-pointers-2.asm.comp
index dfbbef692..de0cfa47c 100644
--- a/reference/shaders-opencl-no-opt/asm/comp/variable-pointers-2.asm.comp
+++ b/reference/shaders-opencl-no-opt/asm/comp/variable-pointers-2.asm.comp
@@ -11,6 +11,6 @@ __kernel void comp_main()
     ((&((&test)[0u]))[0u])[1u + 2u] = _22;
     ((&test)[0u])[1u + 2u] = _22;
     ((&test)[0u])[3u] = _22;
-    ((&test)[0u])[2u + 1u].x = _21;
+    ((__local float*)&((&test)[0u])[2u])[0u + 1u] = _21;
 }
 
diff --git a/reference/shaders-opencl-no-opt/asm/comp/variable-pointers-3.invalid.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/variable-pointers-3.invalid.asm.comp
index 99ce6ceff..6c35b4042 100644
--- a/reference/shaders-opencl-no-opt/asm/comp/variable-pointers-3.invalid.asm.comp
+++ b/reference/shaders-opencl-no-opt/asm/comp/variable-pointers-3.invalid.asm.comp
@@ -7,6 +7,6 @@ __kernel void comp_main()
     uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
     __local float2 test[64];
     float _21 = convert_float(((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x);
-    (true ? &((&test)[0u])[2u].x : &((&test)[0u])[2u].x)[1u] = _21;
+    (true ? &((__local float*)&((&test)[0u])[2u])[0u] : &((__local float*)&((&test)[0u])[2u])[0u])[1u] = _21;
 }
 
diff --git a/reference/shaders-opencl-no-opt/asm/comp/variable-pointers-vector-to-scalar.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/variable-pointers-vector-to-scalar.asm.comp
index 5af73eb76..e925200e9 100644
--- a/reference/shaders-opencl-no-opt/asm/comp/variable-pointers-vector-to-scalar.asm.comp
+++ b/reference/shaders-opencl-no-opt/asm/comp/variable-pointers-vector-to-scalar.asm.comp
@@ -7,6 +7,6 @@ __kernel void comp_main()
     uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
     __local float2 test[64];
     float _21 = convert_float(((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x);
-    (*(true ? &test[1u] : &test[2u])).y = _21;
+    ((__local float*)&(*(true ? &test[1u] : &test[2u])))[1u] = _21;
 }
 
diff --git a/shaders-opencl-no-opt/asm/comp/opptrequal-row-maj-mtx-bypass-transpose.spv14.asm.invalid.comp b/shaders-opencl-no-opt/asm/comp/opptrequal-row-maj-mtx-bypass-transpose.spv14.asm.comp
similarity index 100%
rename from shaders-opencl-no-opt/asm/comp/opptrequal-row-maj-mtx-bypass-transpose.spv14.asm.invalid.comp
rename to shaders-opencl-no-opt/asm/comp/opptrequal-row-maj-mtx-bypass-transpose.spv14.asm.comp
diff --git a/spirv_opencl.cpp b/spirv_opencl.cpp
index cb2be10aa..ed7cfe633 100644
--- a/spirv_opencl.cpp
+++ b/spirv_opencl.cpp
@@ -2574,6 +2574,43 @@ std::string CompilerOpenCL::to_atomic_ptr_expression(uint32_t id)
 	return to_expression(id);
 }
 
+bool CompilerOpenCL::prepare_access_chain_for_scalar_access(std::string &expr, const SPIRType &type,
+                                                            StorageClass storage, bool &is_packed)
+{
+	// OpenCL C cannot take the address of a vector component (e.g. &vec.x is invalid). For StorageBuffer and
+	// Workgroup storage, rewrite expr in place as "((<addr_space> <type>*)&<expr>)" so elements are indexed.
+	if (storage == StorageClassStorageBuffer || storage == StorageClassWorkgroup)
+	{
+		const char *addr_space = storage == StorageClassWorkgroup ? "__local" : "__global";
+		expr = join("((", addr_space, " ", type_to_glsl(type), "*)&", enclose_expression(expr), ")");
+		is_packed = true; // Presumably switches later component access to [] indexing - confirm in caller.
+		return true;
+	}
+	else
+		return false; // Other storage classes: expr and is_packed are left unchanged.
+}
+
+void CompilerOpenCL::emit_binary_ptr_op(uint32_t result_type, uint32_t result_id, uint32_t op0, uint32_t op1,
+                                        const char *op) // Emits "<ptr(op0)> <op> <ptr(op1)>" for result_id.
+{
+	bool forward = should_forward(op0) && should_forward(op1); // True only if both operands may be forwarded.
+	emit_op(result_type, result_id, join(to_ptr_expression(op0), " ", op, " ", to_ptr_expression(op1)), forward);
+	inherit_expression_dependencies(result_id, op0); // Register both operands as dependencies of result_id.
+	inherit_expression_dependencies(result_id, op1);
+}
+
+string CompilerOpenCL::to_ptr_expression(uint32_t id, bool register_expression_read)
+{
+	auto *e = maybe_get<SPIRExpression>(id); // May be null; the check below then falls back to to_expression().
+	// Builds the pointer-valued text for id used by the pointer ==, != and - ops. If need_transpose is set, use
+	// the raw stored expression instead of the transpose wrapper: we want the object's address, not its value.
+	auto expr =
+	    enclose_expression(e && e->need_transpose ? e->expression : to_expression(id, register_expression_read));
+	if (!should_dereference(id)) // Presumably id then names the object itself, so take its address - confirm.
+		expr = address_of_expression(expr);
+	return expr;
+}
+
 // Task #3: In OpenCL C, pointer-to-struct member access uses -> instead of .
 // ptr_chain_is_resolved == false means this is the first member access from the base.
 bool CompilerOpenCL::should_dereference(uint32_t id)
@@ -4843,27 +4880,16 @@ void CompilerOpenCL::emit_instruction(const Instruction &instruction)
 	}
 
 	case OpPtrEqual:
+		emit_binary_ptr_op(ops[0], ops[1], ops[2], ops[3], "==");
+		break;
+
 	case OpPtrNotEqual:
+		emit_binary_ptr_op(ops[0], ops[1], ops[2], ops[3], "!=");
+		break;
+
 	case OpPtrDiff:
-	{
-		uint32_t result_type = ops[0];
-		uint32_t result_id = ops[1];
-		uint32_t op0 = ops[2];
-		uint32_t op1 = ops[3];
-		const char *op = "";
-		if (opcode == OpPtrEqual)
-			op = "==";
-		else if (opcode == OpPtrNotEqual)
-			op = "!=";
-		else if (opcode == OpPtrDiff)
-			op = "-";
-		bool forward = should_forward(op0) && should_forward(op1);
-		emit_op(result_type, result_id, join(to_pointer_expression(op0), " ", op, " ", to_pointer_expression(op1)),
-		        forward);
-		inherit_expression_dependencies(result_id, op0);
-		inherit_expression_dependencies(result_id, op1);
+		emit_binary_ptr_op(ops[0], ops[1], ops[2], ops[3], "-");
 		break;
-	}
 
 	case OpSDot:
 	case OpUDot:
diff --git a/spirv_opencl.hpp b/spirv_opencl.hpp
index 94a7c5d35..febc62bc9 100644
--- a/spirv_opencl.hpp
+++ b/spirv_opencl.hpp
@@ -144,6 +144,10 @@ class CompilerOpenCL : public CompilerGLSL
 	void emit_store_statement(uint32_t lhs_expression, uint32_t rhs_expression) override;
 	void emit_struct_member(const SPIRType &type, uint32_t member_type_id, uint32_t index,
 	                        const std::string &qualifier = "", uint32_t base_offset = 0) override;
+	void emit_binary_ptr_op(uint32_t result_type, uint32_t result_id, uint32_t op0, uint32_t op1, const char *op);
+	std::string to_ptr_expression(uint32_t id, bool register_expression_read = true);
+	bool prepare_access_chain_for_scalar_access(std::string &expr, const SPIRType &type, StorageClass storage,
+	                                            bool &is_packed) override;
 
 	Options opencl_options;
 

From 6c9166411e26105fd6bf9b81a79918e59e29f744 Mon Sep 17 00:00:00 2001
From: Garrick Meeker <gmeeker@gmail.com>
Date: Mon, 16 Mar 2026 10:44:11 -0700
Subject: [PATCH 10/16] OpenCL: subgroup implementation

---
 ...oups-arithmetic.nocompat.vk.subgroups.comp |  47 ++
 ...ubgroups-ballot.nocompat.vk.subgroups.comp |  37 ++
 ...subgroups-basic.nocompat.vk.subgroups.comp |  57 +++
 ...roups-clustered.nocompat.vk.subgroups.comp |  99 ++++
 ...ubgroups-rotate.nocompat.vk.subgroups.comp |  24 +
 ...huffle-relative.nocompat.vk.subgroups.comp |  24 +
 ...bgroups-shuffle.nocompat.vk.subgroups.comp |  24 +
 .../subgroups-vote.nocompat.vk.subgroups.comp |  28 ++
 ....vk.opencl12.emulate-subgroup.invalid.comp |   0
 ...at.vk.subgroup.fixed-subgroup.invalid.comp |   0
 ...ubgroups.nocompat.vk.subgroup.invalid.comp |   0
 ....nocompat.vk.subgroup.swizzle.invalid.comp |   0
 ...oups-arithmetic.nocompat.vk.subgroups.comp |  47 ++
 ...ubgroups-ballot.nocompat.vk.subgroups.comp |  36 ++
 ...subgroups-basic.nocompat.vk.subgroups.comp |  72 +++
 ...roups-clustered.nocompat.vk.subgroups.comp |  97 ++++
 ...ubgroups-rotate.nocompat.vk.subgroups.comp |  20 +
 ...huffle-relative.nocompat.vk.subgroups.comp |  20 +
 ...bgroups-shuffle.nocompat.vk.subgroups.comp |  19 +
 .../subgroups-vote.nocompat.vk.subgroups.comp |  24 +
 ....vk.opencl12.emulate-subgroup.invalid.comp |  25 -
 ...at.vk.subgroup.fixed-subgroup.invalid.comp | 211 --------
 ...ubgroups.nocompat.vk.subgroup.invalid.comp | 211 --------
 ....nocompat.vk.subgroup.swizzle.invalid.comp | 211 --------
 spirv_opencl.cpp                              | 470 +++++++++++++++++-
 spirv_opencl.hpp                              |  14 +
 test_shaders.py                               |  42 +-
 27 files changed, 1171 insertions(+), 688 deletions(-)
 create mode 100644 reference/shaders-opencl-no-opt/comp/subgroups-arithmetic.nocompat.vk.subgroups.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/subgroups-ballot.nocompat.vk.subgroups.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/subgroups-basic.nocompat.vk.subgroups.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/subgroups-clustered.nocompat.vk.subgroups.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/subgroups-rotate.nocompat.vk.subgroups.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/subgroups-shuffle-relative.nocompat.vk.subgroups.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/subgroups-shuffle.nocompat.vk.subgroups.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/subgroups-vote.nocompat.vk.subgroups.comp
 delete mode 100644 reference/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.opencl12.emulate-subgroup.invalid.comp
 delete mode 100644 reference/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.subgroup.fixed-subgroup.invalid.comp
 delete mode 100644 reference/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.subgroup.invalid.comp
 delete mode 100644 reference/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.subgroup.swizzle.invalid.comp
 create mode 100644 shaders-opencl-no-opt/comp/subgroups-arithmetic.nocompat.vk.subgroups.comp
 create mode 100644 shaders-opencl-no-opt/comp/subgroups-ballot.nocompat.vk.subgroups.comp
 create mode 100644 shaders-opencl-no-opt/comp/subgroups-basic.nocompat.vk.subgroups.comp
 create mode 100644 shaders-opencl-no-opt/comp/subgroups-clustered.nocompat.vk.subgroups.comp
 create mode 100644 shaders-opencl-no-opt/comp/subgroups-rotate.nocompat.vk.subgroups.comp
 create mode 100644 shaders-opencl-no-opt/comp/subgroups-shuffle-relative.nocompat.vk.subgroups.comp
 create mode 100644 shaders-opencl-no-opt/comp/subgroups-shuffle.nocompat.vk.subgroups.comp
 create mode 100644 shaders-opencl-no-opt/comp/subgroups-vote.nocompat.vk.subgroups.comp
 delete mode 100644 shaders-opencl-no-opt/comp/subgroups.nocompat.vk.opencl12.emulate-subgroup.invalid.comp
 delete mode 100644 shaders-opencl-no-opt/comp/subgroups.nocompat.vk.subgroup.fixed-subgroup.invalid.comp
 delete mode 100644 shaders-opencl-no-opt/comp/subgroups.nocompat.vk.subgroup.invalid.comp
 delete mode 100644 shaders-opencl-no-opt/comp/subgroups.nocompat.vk.subgroup.swizzle.invalid.comp

diff --git a/reference/shaders-opencl-no-opt/comp/subgroups-arithmetic.nocompat.vk.subgroups.comp b/reference/shaders-opencl-no-opt/comp/subgroups-arithmetic.nocompat.vk.subgroups.comp
new file mode 100644
index 000000000..916168719
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/subgroups-arithmetic.nocompat.vk.subgroups.comp
@@ -0,0 +1,47 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#pragma OPENCL EXTENSION cl_khr_subgroup_non_uniform_arithmetic : enable
+
+struct SSBO
+{
+    float fdat;
+    int idat;
+    uint udat;
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(256, 1, 1)))
+__kernel void comp_main(__global SSBO* _13)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    float fadd_1 = sub_group_reduce_add(_13->fdat);
+    float fmul_1 = sub_group_non_uniform_reduce_mul(_13->fdat);
+    int iadd_1 = sub_group_reduce_add(_13->idat);
+    int imul_1 = sub_group_non_uniform_reduce_mul(_13->idat);
+    float fmin_v_1 = sub_group_reduce_min(_13->fdat);
+    float fmax_v_1 = sub_group_reduce_max(_13->fdat);
+    int smin_v_1 = sub_group_reduce_min(_13->idat);
+    int smax_v_1 = sub_group_reduce_max(_13->idat);
+    uint umin_v_1 = sub_group_reduce_min(_13->udat);
+    uint umax_v_1 = sub_group_reduce_max(_13->udat);
+    uint anded_1 = sub_group_non_uniform_reduce_and(_13->udat);
+    uint ored_1 = sub_group_non_uniform_reduce_or(_13->udat);
+    uint xored_1 = sub_group_non_uniform_reduce_xor(_13->udat);
+    int4 bval_1 = (uint4)(_13->udat) == (uint4)(42u);
+    int4 anded_b_1 = (int4)(sub_group_non_uniform_reduce_logical_and(bval_1.x), sub_group_non_uniform_reduce_logical_and(bval_1.y), sub_group_non_uniform_reduce_logical_and(bval_1.z), sub_group_non_uniform_reduce_logical_and(bval_1.w));
+    int4 ored_b_1 = (int4)(sub_group_non_uniform_reduce_logical_or(bval_1.x), sub_group_non_uniform_reduce_logical_or(bval_1.y), sub_group_non_uniform_reduce_logical_or(bval_1.z), sub_group_non_uniform_reduce_logical_or(bval_1.w));
+    int4 xored_b_1 = (int4)(sub_group_non_uniform_reduce_logical_xor(bval_1.x), sub_group_non_uniform_reduce_logical_xor(bval_1.y), sub_group_non_uniform_reduce_logical_xor(bval_1.z), sub_group_non_uniform_reduce_logical_xor(bval_1.w));
+    float finc_add_1 = sub_group_scan_inclusive_add(_13->fdat);
+    float finc_mul_1 = sub_group_non_uniform_scan_inclusive_mul(_13->fdat);
+    int iinc_add_1 = sub_group_scan_inclusive_add(_13->idat);
+    int iinc_mul_1 = sub_group_non_uniform_scan_inclusive_mul(_13->idat);
+    float fexc_add_1 = sub_group_scan_exclusive_add(_13->fdat);
+    float fexc_mul_1 = sub_group_non_uniform_scan_exclusive_mul(_13->fdat);
+    int iexc_add_1 = sub_group_scan_exclusive_add(_13->idat);
+    int iexc_mul_1 = sub_group_non_uniform_scan_exclusive_mul(_13->idat);
+    _13->fdat = (((((((((((fadd_1 + fmul_1) + fmin_v_1) + fmax_v_1) + finc_add_1) + finc_mul_1) + fexc_add_1) + fexc_mul_1) + convert_float(((((((iadd_1 + imul_1) + smin_v_1) + smax_v_1) + iinc_add_1) + iinc_mul_1) + iexc_add_1) + iexc_mul_1)) + convert_float((((umin_v_1 + umax_v_1) + anded_1) + ored_1) + xored_1)) + (float)(anded_b_1.x)) + (float)(ored_b_1.x)) + (float)(xored_b_1.x);
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/subgroups-ballot.nocompat.vk.subgroups.comp b/reference/shaders-opencl-no-opt/comp/subgroups-ballot.nocompat.vk.subgroups.comp
new file mode 100644
index 000000000..c7d53554c
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/subgroups-ballot.nocompat.vk.subgroups.comp
@@ -0,0 +1,37 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#pragma OPENCL EXTENSION cl_khr_subgroup_ballot : enable
+
+struct SSBO
+{
+    float FragColor;
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(256, 1, 1)))
+__kernel void comp_main(__global float* _9)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    _9[0] = convert_float4(get_sub_group_eq_mask()).x;
+    _9[0] = convert_float4(get_sub_group_ge_mask()).x;
+    _9[0] = convert_float4(get_sub_group_gt_mask()).x;
+    _9[0] = convert_float4(get_sub_group_le_mask()).x;
+    _9[0] = convert_float4(get_sub_group_lt_mask()).x;
+    float4 broadcasted_1 = (float4)(sub_group_broadcast((float4)(10.0f).x, 8u), sub_group_broadcast((float4)(10.0f).y, 8u), sub_group_broadcast((float4)(10.0f).z, 8u), sub_group_broadcast((float4)(10.0f).w, 8u));
+    int2 broadcasted_bool_1 = (int2)(sub_group_broadcast((int2)(true).x, 8u), sub_group_broadcast((int2)(true).y, 8u));
+    float3 first_1 = (float3)(sub_group_broadcast_first((float3)(20.0f).x), sub_group_broadcast_first((float3)(20.0f).y), sub_group_broadcast_first((float3)(20.0f).z));
+    int4 first_bool_1 = (int4)(sub_group_broadcast_first((int4)(false).x), sub_group_broadcast_first((int4)(false).y), sub_group_broadcast_first((int4)(false).z), sub_group_broadcast_first((int4)(false).w));
+    uint4 ballot_value_1 = sub_group_ballot(true);
+    bool inverse_ballot_value_1 = sub_group_inverse_ballot(ballot_value_1);
+    bool bit_extracted_1 = sub_group_ballot_bit_extract((uint4)(10u), 8u);
+    uint bit_count_1 = sub_group_ballot_bit_count(ballot_value_1);
+    uint inclusive_bit_count_1 = sub_group_ballot_inclusive_scan(ballot_value_1);
+    uint exclusive_bit_count_1 = sub_group_ballot_exclusive_scan(ballot_value_1);
+    uint lsb_1 = sub_group_ballot_find_lsb(ballot_value_1);
+    uint msb_1 = sub_group_ballot_find_msb(ballot_value_1);
+    _9[0] = (((((((((broadcasted_1.x + (float)(broadcasted_bool_1.x)) + first_1.x) + (float)(first_bool_1.x)) + (float)(inverse_ballot_value_1)) + (float)(bit_extracted_1)) + convert_float(bit_count_1)) + convert_float(inclusive_bit_count_1)) + convert_float(exclusive_bit_count_1)) + convert_float(lsb_1)) + convert_float(msb_1);
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/subgroups-basic.nocompat.vk.subgroups.comp b/reference/shaders-opencl-no-opt/comp/subgroups-basic.nocompat.vk.subgroups.comp
new file mode 100644
index 000000000..e4921be88
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/subgroups-basic.nocompat.vk.subgroups.comp
@@ -0,0 +1,57 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+
+struct SSBO
+{
+    float FragColor;
+    int idat;
+    uint udat;
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(256, 1, 1)))
+__kernel void comp_main(__global SSBO* _11)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    _11->FragColor = convert_float(get_num_sub_groups());
+    _11->FragColor = convert_float(get_sub_group_id());
+    _11->FragColor = convert_float(get_sub_group_size());
+    _11->FragColor = convert_float(get_sub_group_local_id());
+    sub_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
+    sub_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
+    sub_group_barrier(CLK_GLOBAL_MEM_FENCE);
+    sub_group_barrier(CLK_LOCAL_MEM_FENCE);
+    sub_group_barrier(CLK_GLOBAL_MEM_FENCE);
+    bool has_all = sub_group_all(true);
+    bool has_any = sub_group_any(true);
+    uint broadcasted = sub_group_broadcast(42u, 0u);
+    float fadd = sub_group_reduce_add(_11->FragColor);
+    int iadd = sub_group_reduce_add(_11->idat);
+    float fmin = sub_group_reduce_min(_11->FragColor);
+    float fmax = sub_group_reduce_max(_11->FragColor);
+    int smin = sub_group_reduce_min(_11->idat);
+    int smax = sub_group_reduce_max(_11->idat);
+    uint umin = sub_group_reduce_min(_11->udat);
+    uint umax = sub_group_reduce_max(_11->udat);
+    float finc_add = sub_group_scan_inclusive_add(_11->FragColor);
+    float finc_min = sub_group_scan_inclusive_min(_11->FragColor);
+    float finc_max = sub_group_scan_inclusive_max(_11->FragColor);
+    int iinc_add = sub_group_scan_inclusive_add(_11->idat);
+    int iinc_min = sub_group_scan_inclusive_min(_11->idat);
+    int iinc_max = sub_group_scan_inclusive_max(_11->idat);
+    uint uinc_min = sub_group_scan_inclusive_min(_11->udat);
+    uint uinc_max = sub_group_scan_inclusive_max(_11->udat);
+    float fexc_add = sub_group_scan_exclusive_add(_11->FragColor);
+    float fexc_min = sub_group_scan_exclusive_min(_11->FragColor);
+    float fexc_max = sub_group_scan_exclusive_max(_11->FragColor);
+    int iexc_add = sub_group_scan_exclusive_add(_11->idat);
+    int iexc_min = sub_group_scan_exclusive_min(_11->idat);
+    int iexc_max = sub_group_scan_exclusive_max(_11->idat);
+    uint uexc_min = sub_group_scan_exclusive_min(_11->udat);
+    uint uexc_max = sub_group_scan_exclusive_max(_11->udat);
+    _11->FragColor = (((((((((((fadd + fmin) + fmax) + finc_add) + finc_min) + finc_max) + fexc_add) + fexc_min) + fexc_max) + convert_float((((((((iadd + smin) + smax) + iinc_add) + iinc_min) + iinc_max) + iexc_add) + iexc_min) + iexc_max)) + convert_float((((((umin + umax) + uinc_min) + uinc_max) + uexc_min) + uexc_max) + broadcasted)) + (float)(has_all)) + (float)(has_any);
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/subgroups-clustered.nocompat.vk.subgroups.comp b/reference/shaders-opencl-no-opt/comp/subgroups-clustered.nocompat.vk.subgroups.comp
new file mode 100644
index 000000000..10a67ecce
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/subgroups-clustered.nocompat.vk.subgroups.comp
@@ -0,0 +1,99 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#pragma OPENCL EXTENSION cl_khr_subgroup_clustered_reduce : enable
+
+struct SSBO
+{
+    float fdat;
+    int idat;
+    uint udat;
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(256, 1, 1)))
+__kernel void comp_main(__global SSBO* _14)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    float4 fv_1 = (float4)(_14->fdat);
+    int4 iv_1 = (int4)(_14->idat);
+    uint4 uv_1 = (uint4)(_14->udat);
+    float4 added_1 = (float4)(sub_group_clustered_reduce_add(fv_1.x, 1u), sub_group_clustered_reduce_add(fv_1.y, 1u), sub_group_clustered_reduce_add(fv_1.z, 1u), sub_group_clustered_reduce_add(fv_1.w, 1u));
+    float4 multiplied_1 = (float4)(sub_group_clustered_reduce_mul(fv_1.x, 1u), sub_group_clustered_reduce_mul(fv_1.y, 1u), sub_group_clustered_reduce_mul(fv_1.z, 1u), sub_group_clustered_reduce_mul(fv_1.w, 1u));
+    int4 iadded_1 = (int4)(sub_group_clustered_reduce_add(iv_1.x, 1u), sub_group_clustered_reduce_add(iv_1.y, 1u), sub_group_clustered_reduce_add(iv_1.z, 1u), sub_group_clustered_reduce_add(iv_1.w, 1u));
+    int4 imultiplied_1 = (int4)(sub_group_clustered_reduce_mul(iv_1.x, 1u), sub_group_clustered_reduce_mul(iv_1.y, 1u), sub_group_clustered_reduce_mul(iv_1.z, 1u), sub_group_clustered_reduce_mul(iv_1.w, 1u));
+    float4 lo_1 = (float4)(sub_group_clustered_reduce_min(fv_1.x, 1u), sub_group_clustered_reduce_min(fv_1.y, 1u), sub_group_clustered_reduce_min(fv_1.z, 1u), sub_group_clustered_reduce_min(fv_1.w, 1u));
+    float4 hi_1 = (float4)(sub_group_clustered_reduce_max(fv_1.x, 1u), sub_group_clustered_reduce_max(fv_1.y, 1u), sub_group_clustered_reduce_max(fv_1.z, 1u), sub_group_clustered_reduce_max(fv_1.w, 1u));
+    int4 slo_1 = (int4)(sub_group_clustered_reduce_min(iv_1.x, 1u), sub_group_clustered_reduce_min(iv_1.y, 1u), sub_group_clustered_reduce_min(iv_1.z, 1u), sub_group_clustered_reduce_min(iv_1.w, 1u));
+    int4 shi_1 = (int4)(sub_group_clustered_reduce_max(iv_1.x, 1u), sub_group_clustered_reduce_max(iv_1.y, 1u), sub_group_clustered_reduce_max(iv_1.z, 1u), sub_group_clustered_reduce_max(iv_1.w, 1u));
+    uint4 ulo_1 = (uint4)(sub_group_clustered_reduce_min(uv_1.x, 1u), sub_group_clustered_reduce_min(uv_1.y, 1u), sub_group_clustered_reduce_min(uv_1.z, 1u), sub_group_clustered_reduce_min(uv_1.w, 1u));
+    uint4 uhi_1 = (uint4)(sub_group_clustered_reduce_max(uv_1.x, 1u), sub_group_clustered_reduce_max(uv_1.y, 1u), sub_group_clustered_reduce_max(uv_1.z, 1u), sub_group_clustered_reduce_max(uv_1.w, 1u));
+    uint4 anded_1 = (uint4)(sub_group_clustered_reduce_and(uv_1.x, 1u), sub_group_clustered_reduce_and(uv_1.y, 1u), sub_group_clustered_reduce_and(uv_1.z, 1u), sub_group_clustered_reduce_and(uv_1.w, 1u));
+    uint4 ored_1 = (uint4)(sub_group_clustered_reduce_or(uv_1.x, 1u), sub_group_clustered_reduce_or(uv_1.y, 1u), sub_group_clustered_reduce_or(uv_1.z, 1u), sub_group_clustered_reduce_or(uv_1.w, 1u));
+    uint4 xored_1 = (uint4)(sub_group_clustered_reduce_xor(uv_1.x, 1u), sub_group_clustered_reduce_xor(uv_1.y, 1u), sub_group_clustered_reduce_xor(uv_1.z, 1u), sub_group_clustered_reduce_xor(uv_1.w, 1u));
+    int4 bval_1 = uv_1 == (uint4)(42u);
+    int4 anded_b_1 = (int4)(sub_group_clustered_reduce_logical_and(bval_1.x, 1u), sub_group_clustered_reduce_logical_and(bval_1.y, 1u), sub_group_clustered_reduce_logical_and(bval_1.z, 1u), sub_group_clustered_reduce_logical_and(bval_1.w, 1u));
+    int4 ored_b_1 = (int4)(sub_group_clustered_reduce_logical_or(bval_1.x, 1u), sub_group_clustered_reduce_logical_or(bval_1.y, 1u), sub_group_clustered_reduce_logical_or(bval_1.z, 1u), sub_group_clustered_reduce_logical_or(bval_1.w, 1u));
+    int4 xored_b_1 = (int4)(sub_group_clustered_reduce_logical_xor(bval_1.x, 1u), sub_group_clustered_reduce_logical_xor(bval_1.y, 1u), sub_group_clustered_reduce_logical_xor(bval_1.z, 1u), sub_group_clustered_reduce_logical_xor(bval_1.w, 1u));
+    added_1 = (float4)(sub_group_clustered_reduce_add(added_1.x, 2u), sub_group_clustered_reduce_add(added_1.y, 2u), sub_group_clustered_reduce_add(added_1.z, 2u), sub_group_clustered_reduce_add(added_1.w, 2u));
+    multiplied_1 = (float4)(sub_group_clustered_reduce_mul(multiplied_1.x, 2u), sub_group_clustered_reduce_mul(multiplied_1.y, 2u), sub_group_clustered_reduce_mul(multiplied_1.z, 2u), sub_group_clustered_reduce_mul(multiplied_1.w, 2u));
+    iadded_1 = (int4)(sub_group_clustered_reduce_add(iadded_1.x, 2u), sub_group_clustered_reduce_add(iadded_1.y, 2u), sub_group_clustered_reduce_add(iadded_1.z, 2u), sub_group_clustered_reduce_add(iadded_1.w, 2u));
+    imultiplied_1 = (int4)(sub_group_clustered_reduce_mul(imultiplied_1.x, 2u), sub_group_clustered_reduce_mul(imultiplied_1.y, 2u), sub_group_clustered_reduce_mul(imultiplied_1.z, 2u), sub_group_clustered_reduce_mul(imultiplied_1.w, 2u));
+    lo_1 = (float4)(sub_group_clustered_reduce_min(lo_1.x, 2u), sub_group_clustered_reduce_min(lo_1.y, 2u), sub_group_clustered_reduce_min(lo_1.z, 2u), sub_group_clustered_reduce_min(lo_1.w, 2u));
+    hi_1 = (float4)(sub_group_clustered_reduce_max(hi_1.x, 2u), sub_group_clustered_reduce_max(hi_1.y, 2u), sub_group_clustered_reduce_max(hi_1.z, 2u), sub_group_clustered_reduce_max(hi_1.w, 2u));
+    slo_1 = (int4)(sub_group_clustered_reduce_min(slo_1.x, 2u), sub_group_clustered_reduce_min(slo_1.y, 2u), sub_group_clustered_reduce_min(slo_1.z, 2u), sub_group_clustered_reduce_min(slo_1.w, 2u));
+    shi_1 = (int4)(sub_group_clustered_reduce_max(shi_1.x, 2u), sub_group_clustered_reduce_max(shi_1.y, 2u), sub_group_clustered_reduce_max(shi_1.z, 2u), sub_group_clustered_reduce_max(shi_1.w, 2u));
+    ulo_1 = (uint4)(sub_group_clustered_reduce_min(ulo_1.x, 2u), sub_group_clustered_reduce_min(ulo_1.y, 2u), sub_group_clustered_reduce_min(ulo_1.z, 2u), sub_group_clustered_reduce_min(ulo_1.w, 2u));
+    uhi_1 = (uint4)(sub_group_clustered_reduce_max(uhi_1.x, 2u), sub_group_clustered_reduce_max(uhi_1.y, 2u), sub_group_clustered_reduce_max(uhi_1.z, 2u), sub_group_clustered_reduce_max(uhi_1.w, 2u));
+    anded_1 = (uint4)(sub_group_clustered_reduce_and(anded_1.x, 2u), sub_group_clustered_reduce_and(anded_1.y, 2u), sub_group_clustered_reduce_and(anded_1.z, 2u), sub_group_clustered_reduce_and(anded_1.w, 2u));
+    ored_1 = (uint4)(sub_group_clustered_reduce_or(ored_1.x, 2u), sub_group_clustered_reduce_or(ored_1.y, 2u), sub_group_clustered_reduce_or(ored_1.z, 2u), sub_group_clustered_reduce_or(ored_1.w, 2u));
+    xored_1 = (uint4)(sub_group_clustered_reduce_xor(xored_1.x, 2u), sub_group_clustered_reduce_xor(xored_1.y, 2u), sub_group_clustered_reduce_xor(xored_1.z, 2u), sub_group_clustered_reduce_xor(xored_1.w, 2u));
+    int4 _123 = anded_1 == (uint4)(2u);
+    anded_b_1 = (int4)(sub_group_clustered_reduce_logical_and(_123.x, 2u), sub_group_clustered_reduce_logical_and(_123.y, 2u), sub_group_clustered_reduce_logical_and(_123.z, 2u), sub_group_clustered_reduce_logical_and(_123.w, 2u));
+    int4 _127 = ored_1 == (uint4)(3u);
+    ored_b_1 = (int4)(sub_group_clustered_reduce_logical_or(_127.x, 2u), sub_group_clustered_reduce_logical_or(_127.y, 2u), sub_group_clustered_reduce_logical_or(_127.z, 2u), sub_group_clustered_reduce_logical_or(_127.w, 2u));
+    int4 _132 = xored_1 == (uint4)(4u);
+    xored_b_1 = (int4)(sub_group_clustered_reduce_logical_xor(_132.x, 2u), sub_group_clustered_reduce_logical_xor(_132.y, 2u), sub_group_clustered_reduce_logical_xor(_132.z, 2u), sub_group_clustered_reduce_logical_xor(_132.w, 2u));
+    added_1 = (float4)(sub_group_clustered_reduce_add(added_1.x, 4u), sub_group_clustered_reduce_add(added_1.y, 4u), sub_group_clustered_reduce_add(added_1.z, 4u), sub_group_clustered_reduce_add(added_1.w, 4u));
+    multiplied_1 = (float4)(sub_group_clustered_reduce_mul(multiplied_1.x, 4u), sub_group_clustered_reduce_mul(multiplied_1.y, 4u), sub_group_clustered_reduce_mul(multiplied_1.z, 4u), sub_group_clustered_reduce_mul(multiplied_1.w, 4u));
+    iadded_1 = (int4)(sub_group_clustered_reduce_add(iadded_1.x, 4u), sub_group_clustered_reduce_add(iadded_1.y, 4u), sub_group_clustered_reduce_add(iadded_1.z, 4u), sub_group_clustered_reduce_add(iadded_1.w, 4u));
+    imultiplied_1 = (int4)(sub_group_clustered_reduce_mul(imultiplied_1.x, 4u), sub_group_clustered_reduce_mul(imultiplied_1.y, 4u), sub_group_clustered_reduce_mul(imultiplied_1.z, 4u), sub_group_clustered_reduce_mul(imultiplied_1.w, 4u));
+    lo_1 = (float4)(sub_group_clustered_reduce_min(lo_1.x, 4u), sub_group_clustered_reduce_min(lo_1.y, 4u), sub_group_clustered_reduce_min(lo_1.z, 4u), sub_group_clustered_reduce_min(lo_1.w, 4u));
+    hi_1 = (float4)(sub_group_clustered_reduce_max(hi_1.x, 4u), sub_group_clustered_reduce_max(hi_1.y, 4u), sub_group_clustered_reduce_max(hi_1.z, 4u), sub_group_clustered_reduce_max(hi_1.w, 4u));
+    slo_1 = (int4)(sub_group_clustered_reduce_min(slo_1.x, 4u), sub_group_clustered_reduce_min(slo_1.y, 4u), sub_group_clustered_reduce_min(slo_1.z, 4u), sub_group_clustered_reduce_min(slo_1.w, 4u));
+    shi_1 = (int4)(sub_group_clustered_reduce_max(shi_1.x, 4u), sub_group_clustered_reduce_max(shi_1.y, 4u), sub_group_clustered_reduce_max(shi_1.z, 4u), sub_group_clustered_reduce_max(shi_1.w, 4u));
+    ulo_1 = (uint4)(sub_group_clustered_reduce_min(ulo_1.x, 4u), sub_group_clustered_reduce_min(ulo_1.y, 4u), sub_group_clustered_reduce_min(ulo_1.z, 4u), sub_group_clustered_reduce_min(ulo_1.w, 4u));
+    uhi_1 = (uint4)(sub_group_clustered_reduce_max(uhi_1.x, 4u), sub_group_clustered_reduce_max(uhi_1.y, 4u), sub_group_clustered_reduce_max(uhi_1.z, 4u), sub_group_clustered_reduce_max(uhi_1.w, 4u));
+    anded_1 = (uint4)(sub_group_clustered_reduce_and(anded_1.x, 4u), sub_group_clustered_reduce_and(anded_1.y, 4u), sub_group_clustered_reduce_and(anded_1.z, 4u), sub_group_clustered_reduce_and(anded_1.w, 4u));
+    ored_1 = (uint4)(sub_group_clustered_reduce_or(ored_1.x, 4u), sub_group_clustered_reduce_or(ored_1.y, 4u), sub_group_clustered_reduce_or(ored_1.z, 4u), sub_group_clustered_reduce_or(ored_1.w, 4u));
+    xored_1 = (uint4)(sub_group_clustered_reduce_xor(xored_1.x, 4u), sub_group_clustered_reduce_xor(xored_1.y, 4u), sub_group_clustered_reduce_xor(xored_1.z, 4u), sub_group_clustered_reduce_xor(xored_1.w, 4u));
+    int4 _161 = anded_1 == (uint4)(2u);
+    anded_b_1 = (int4)(sub_group_clustered_reduce_logical_and(_161.x, 4u), sub_group_clustered_reduce_logical_and(_161.y, 4u), sub_group_clustered_reduce_logical_and(_161.z, 4u), sub_group_clustered_reduce_logical_and(_161.w, 4u));
+    int4 _164 = ored_1 == (uint4)(3u);
+    ored_b_1 = (int4)(sub_group_clustered_reduce_logical_or(_164.x, 4u), sub_group_clustered_reduce_logical_or(_164.y, 4u), sub_group_clustered_reduce_logical_or(_164.z, 4u), sub_group_clustered_reduce_logical_or(_164.w, 4u));
+    int4 _167 = xored_1 == (uint4)(4u);
+    xored_b_1 = (int4)(sub_group_clustered_reduce_logical_xor(_167.x, 4u), sub_group_clustered_reduce_logical_xor(_167.y, 4u), sub_group_clustered_reduce_logical_xor(_167.z, 4u), sub_group_clustered_reduce_logical_xor(_167.w, 4u));
+    added_1 = (float4)(sub_group_clustered_reduce_add(added_1.x, 16u), sub_group_clustered_reduce_add(added_1.y, 16u), sub_group_clustered_reduce_add(added_1.z, 16u), sub_group_clustered_reduce_add(added_1.w, 16u));
+    multiplied_1 = (float4)(sub_group_clustered_reduce_mul(multiplied_1.x, 16u), sub_group_clustered_reduce_mul(multiplied_1.y, 16u), sub_group_clustered_reduce_mul(multiplied_1.z, 16u), sub_group_clustered_reduce_mul(multiplied_1.w, 16u));
+    iadded_1 = (int4)(sub_group_clustered_reduce_add(iadded_1.x, 16u), sub_group_clustered_reduce_add(iadded_1.y, 16u), sub_group_clustered_reduce_add(iadded_1.z, 16u), sub_group_clustered_reduce_add(iadded_1.w, 16u));
+    imultiplied_1 = (int4)(sub_group_clustered_reduce_mul(imultiplied_1.x, 16u), sub_group_clustered_reduce_mul(imultiplied_1.y, 16u), sub_group_clustered_reduce_mul(imultiplied_1.z, 16u), sub_group_clustered_reduce_mul(imultiplied_1.w, 16u));
+    lo_1 = (float4)(sub_group_clustered_reduce_min(lo_1.x, 16u), sub_group_clustered_reduce_min(lo_1.y, 16u), sub_group_clustered_reduce_min(lo_1.z, 16u), sub_group_clustered_reduce_min(lo_1.w, 16u));
+    hi_1 = (float4)(sub_group_clustered_reduce_max(hi_1.x, 16u), sub_group_clustered_reduce_max(hi_1.y, 16u), sub_group_clustered_reduce_max(hi_1.z, 16u), sub_group_clustered_reduce_max(hi_1.w, 16u));
+    slo_1 = (int4)(sub_group_clustered_reduce_min(slo_1.x, 16u), sub_group_clustered_reduce_min(slo_1.y, 16u), sub_group_clustered_reduce_min(slo_1.z, 16u), sub_group_clustered_reduce_min(slo_1.w, 16u));
+    shi_1 = (int4)(sub_group_clustered_reduce_max(shi_1.x, 16u), sub_group_clustered_reduce_max(shi_1.y, 16u), sub_group_clustered_reduce_max(shi_1.z, 16u), sub_group_clustered_reduce_max(shi_1.w, 16u));
+    ulo_1 = (uint4)(sub_group_clustered_reduce_min(ulo_1.x, 16u), sub_group_clustered_reduce_min(ulo_1.y, 16u), sub_group_clustered_reduce_min(ulo_1.z, 16u), sub_group_clustered_reduce_min(ulo_1.w, 16u));
+    uhi_1 = (uint4)(sub_group_clustered_reduce_max(uhi_1.x, 16u), sub_group_clustered_reduce_max(uhi_1.y, 16u), sub_group_clustered_reduce_max(uhi_1.z, 16u), sub_group_clustered_reduce_max(uhi_1.w, 16u));
+    anded_1 = (uint4)(sub_group_clustered_reduce_and(anded_1.x, 16u), sub_group_clustered_reduce_and(anded_1.y, 16u), sub_group_clustered_reduce_and(anded_1.z, 16u), sub_group_clustered_reduce_and(anded_1.w, 16u));
+    ored_1 = (uint4)(sub_group_clustered_reduce_or(ored_1.x, 16u), sub_group_clustered_reduce_or(ored_1.y, 16u), sub_group_clustered_reduce_or(ored_1.z, 16u), sub_group_clustered_reduce_or(ored_1.w, 16u));
+    xored_1 = (uint4)(sub_group_clustered_reduce_xor(xored_1.x, 16u), sub_group_clustered_reduce_xor(xored_1.y, 16u), sub_group_clustered_reduce_xor(xored_1.z, 16u), sub_group_clustered_reduce_xor(xored_1.w, 16u));
+    int4 _197 = anded_1 == (uint4)(2u);
+    anded_b_1 = (int4)(sub_group_clustered_reduce_logical_and(_197.x, 16u), sub_group_clustered_reduce_logical_and(_197.y, 16u), sub_group_clustered_reduce_logical_and(_197.z, 16u), sub_group_clustered_reduce_logical_and(_197.w, 16u));
+    int4 _200 = ored_1 == (uint4)(3u);
+    ored_b_1 = (int4)(sub_group_clustered_reduce_logical_or(_200.x, 16u), sub_group_clustered_reduce_logical_or(_200.y, 16u), sub_group_clustered_reduce_logical_or(_200.z, 16u), sub_group_clustered_reduce_logical_or(_200.w, 16u));
+    int4 _203 = xored_1 == (uint4)(4u);
+    xored_b_1 = (int4)(sub_group_clustered_reduce_logical_xor(_203.x, 16u), sub_group_clustered_reduce_logical_xor(_203.y, 16u), sub_group_clustered_reduce_logical_xor(_203.z, 16u), sub_group_clustered_reduce_logical_xor(_203.w, 16u));
+    _14->fdat = (((((((added_1.x + multiplied_1.x) + lo_1.x) + hi_1.x) + convert_float(((iadded_1.x + imultiplied_1.x) + slo_1.x) + shi_1.x)) + convert_float((((ulo_1.x + uhi_1.x) + anded_1.x) + ored_1.x) + xored_1.x)) + (float)(anded_b_1.x)) + (float)(ored_b_1.x)) + (float)(xored_b_1.x);
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/subgroups-rotate.nocompat.vk.subgroups.comp b/reference/shaders-opencl-no-opt/comp/subgroups-rotate.nocompat.vk.subgroups.comp
new file mode 100644
index 000000000..d97431603
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/subgroups-rotate.nocompat.vk.subgroups.comp
@@ -0,0 +1,24 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#pragma OPENCL EXTENSION cl_khr_subgroup_rotate : enable
+
+struct SSBO
+{
+    float FragColor;
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(256, 1, 1)))
+__kernel void comp_main(__global float* _26)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    uint rotated_1 = sub_group_rotate(20u, 4u);
+    bool rotated_bool_1 = sub_group_rotate(false, 4u);
+    uint rotated_clustered_1 = sub_group_clustered_rotate(20u, 4u, 8u);
+    bool rotated_clustered_bool_1 = sub_group_clustered_rotate(false, 4u, 8u);
+    _26[0] = ((convert_float(rotated_1) + (float)(rotated_bool_1)) + convert_float(rotated_clustered_1)) + (float)(rotated_clustered_bool_1);
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/subgroups-shuffle-relative.nocompat.vk.subgroups.comp b/reference/shaders-opencl-no-opt/comp/subgroups-shuffle-relative.nocompat.vk.subgroups.comp
new file mode 100644
index 000000000..7c076e911
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/subgroups-shuffle-relative.nocompat.vk.subgroups.comp
@@ -0,0 +1,24 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#pragma OPENCL EXTENSION cl_khr_subgroup_shuffle_relative : enable
+
+struct SSBO
+{
+    float FragColor;
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(256, 1, 1)))
+__kernel void comp_main(__global float* _26)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    uint shuffled_up_1 = sub_group_shuffle_up(20u, 4u);
+    bool shuffled_up_bool_1 = sub_group_shuffle_up(true, 4u);
+    uint shuffled_down_1 = sub_group_shuffle_down(20u, 4u);
+    bool shuffled_down_bool_1 = sub_group_shuffle_down(false, 4u);
+    _26[0] = ((convert_float(shuffled_up_1) + (float)(shuffled_up_bool_1)) + convert_float(shuffled_down_1)) + (float)(shuffled_down_bool_1);
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/subgroups-shuffle.nocompat.vk.subgroups.comp b/reference/shaders-opencl-no-opt/comp/subgroups-shuffle.nocompat.vk.subgroups.comp
new file mode 100644
index 000000000..5c032dda2
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/subgroups-shuffle.nocompat.vk.subgroups.comp
@@ -0,0 +1,24 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#pragma OPENCL EXTENSION cl_khr_subgroup_shuffle : enable
+
+struct SSBO
+{
+    float FragColor;
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(256, 1, 1)))
+__kernel void comp_main(__global float* _28)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    uint shuffled_1 = sub_group_shuffle(10u, 8u);
+    bool shuffled_bool_1 = sub_group_shuffle(true, 9u);
+    uint shuffled_xor_1 = sub_group_shuffle_xor(30u, 8u);
+    bool shuffled_xor_bool_1 = sub_group_shuffle_xor(false, 9u);
+    _28[0] = ((convert_float(shuffled_1) + (float)(shuffled_bool_1)) + convert_float(shuffled_xor_1)) + (float)(shuffled_xor_bool_1);
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/subgroups-vote.nocompat.vk.subgroups.comp b/reference/shaders-opencl-no-opt/comp/subgroups-vote.nocompat.vk.subgroups.comp
new file mode 100644
index 000000000..63276058c
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/subgroups-vote.nocompat.vk.subgroups.comp
@@ -0,0 +1,28 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#pragma OPENCL EXTENSION cl_khr_subgroup_non_uniform_vote : enable
+
+struct SSBO
+{
+    float FragColor;
+    int idat;
+};
+
+typedef struct SSBO SSBO;
+
+__attribute__((reqd_work_group_size(256, 1, 1)))
+__kernel void comp_main(__global SSBO* _29)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    bool elected_1 = sub_group_elect();
+    bool has_all_1 = sub_group_all(get_sub_group_local_id() < 10u);
+    bool has_any_1 = sub_group_any(get_sub_group_local_id() == 0u);
+    bool eq_int_1 = sub_group_non_uniform_all_equal(_29->idat);
+    bool eq_bool_1 = sub_group_non_uniform_all_equal(true);
+    bool eq_vec3_1 = sub_group_non_uniform_all_equal((float3)(0.0f, 1.0f, 2.0f).x) && sub_group_non_uniform_all_equal((float3)(0.0f, 1.0f, 2.0f).y) && sub_group_non_uniform_all_equal((float3)(0.0f, 1.0f, 2.0f).z);
+    bool eq_bvec4_1 = sub_group_non_uniform_all_equal((int4)(true, true, false, true).x) && sub_group_non_uniform_all_equal((int4)(true, true, false, true).y) && sub_group_non_uniform_all_equal((int4)(true, true, false, true).z) && sub_group_non_uniform_all_equal((int4)(true, true, false, true).w);
+    _29->FragColor = ((((((float)(elected_1) + (float)(has_all_1)) + (float)(has_any_1)) + (float)(eq_int_1)) + (float)(eq_bool_1)) + (float)(eq_vec3_1)) + (float)(eq_bvec4_1);
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.opencl12.emulate-subgroup.invalid.comp b/reference/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.opencl12.emulate-subgroup.invalid.comp
deleted file mode 100644
index e69de29bb..000000000
diff --git a/reference/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.subgroup.fixed-subgroup.invalid.comp b/reference/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.subgroup.fixed-subgroup.invalid.comp
deleted file mode 100644
index e69de29bb..000000000
diff --git a/reference/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.subgroup.invalid.comp b/reference/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.subgroup.invalid.comp
deleted file mode 100644
index e69de29bb..000000000
diff --git a/reference/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.subgroup.swizzle.invalid.comp b/reference/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.subgroup.swizzle.invalid.comp
deleted file mode 100644
index e69de29bb..000000000
diff --git a/shaders-opencl-no-opt/comp/subgroups-arithmetic.nocompat.vk.subgroups.comp b/shaders-opencl-no-opt/comp/subgroups-arithmetic.nocompat.vk.subgroups.comp
new file mode 100644
index 000000000..23bf10ec5
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/subgroups-arithmetic.nocompat.vk.subgroups.comp
@@ -0,0 +1,47 @@
+#version 450
+#extension GL_KHR_shader_subgroup_basic : require
+#extension GL_KHR_shader_subgroup_arithmetic : require
+layout(local_size_x = 256) in;
+
+layout(std430, binding = 0) buffer SSBO
+{
+	float fdat;
+	int idat;
+	uint udat;
+};
+
+void main()
+{
+	float fadd = subgroupAdd(fdat);
+	float fmul = subgroupMul(fdat);
+	int iadd = subgroupAdd(idat);
+	int imul = subgroupMul(idat);
+	float fmin_v = subgroupMin(fdat);
+	float fmax_v = subgroupMax(fdat);
+	int smin_v = subgroupMin(idat);
+	int smax_v = subgroupMax(idat);
+	uint umin_v = subgroupMin(udat);
+	uint umax_v = subgroupMax(udat);
+	uint anded = subgroupAnd(udat);
+	uint ored = subgroupOr(udat);
+	uint xored = subgroupXor(udat);
+	bvec4 bval = equal(uvec4(udat), uvec4(42u));
+	bvec4 anded_b = subgroupAnd(bval);
+	bvec4 ored_b = subgroupOr(bval);
+	bvec4 xored_b = subgroupXor(bval);
+
+	float finc_add = subgroupInclusiveAdd(fdat);
+	float finc_mul = subgroupInclusiveMul(fdat);
+	int iinc_add = subgroupInclusiveAdd(idat);
+	int iinc_mul = subgroupInclusiveMul(idat);
+
+	float fexc_add = subgroupExclusiveAdd(fdat);
+	float fexc_mul = subgroupExclusiveMul(fdat);
+	int iexc_add = subgroupExclusiveAdd(idat);
+	int iexc_mul = subgroupExclusiveMul(idat);
+
+	fdat = fadd + fmul + fmin_v + fmax_v + finc_add + finc_mul + fexc_add + fexc_mul
+		+ float(iadd + imul + smin_v + smax_v + iinc_add + iinc_mul + iexc_add + iexc_mul)
+		+ float(umin_v + umax_v + anded + ored + xored)
+		+ float(anded_b.x) + float(ored_b.x) + float(xored_b.x);
+}
diff --git a/shaders-opencl-no-opt/comp/subgroups-ballot.nocompat.vk.subgroups.comp b/shaders-opencl-no-opt/comp/subgroups-ballot.nocompat.vk.subgroups.comp
new file mode 100644
index 000000000..f65334709
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/subgroups-ballot.nocompat.vk.subgroups.comp
@@ -0,0 +1,36 @@
+#version 450
+#extension GL_KHR_shader_subgroup_basic : require
+#extension GL_KHR_shader_subgroup_ballot : require
+layout(local_size_x = 256) in;
+
+layout(std430, binding = 0) buffer SSBO
+{
+	float FragColor;
+};
+
+void main()
+{
+	FragColor = float(gl_SubgroupEqMask);
+	FragColor = float(gl_SubgroupGeMask);
+	FragColor = float(gl_SubgroupGtMask);
+	FragColor = float(gl_SubgroupLeMask);
+	FragColor = float(gl_SubgroupLtMask);
+
+	vec4 broadcasted = subgroupBroadcast(vec4(10.0), 8u);
+	bvec2 broadcasted_bool = subgroupBroadcast(bvec2(true), 8u);
+	vec3 first = subgroupBroadcastFirst(vec3(20.0));
+	bvec4 first_bool = subgroupBroadcastFirst(bvec4(false));
+	uvec4 ballot_value = subgroupBallot(true);
+	bool inverse_ballot_value = subgroupInverseBallot(ballot_value);
+	bool bit_extracted = subgroupBallotBitExtract(uvec4(10u), 8u);
+	uint bit_count = subgroupBallotBitCount(ballot_value);
+	uint inclusive_bit_count = subgroupBallotInclusiveBitCount(ballot_value);
+	uint exclusive_bit_count = subgroupBallotExclusiveBitCount(ballot_value);
+	uint lsb = subgroupBallotFindLSB(ballot_value);
+	uint msb = subgroupBallotFindMSB(ballot_value);
+
+	FragColor = broadcasted.x + float(broadcasted_bool.x) + first.x + float(first_bool.x)
+		+ float(inverse_ballot_value) + float(bit_extracted)
+		+ float(bit_count) + float(inclusive_bit_count) + float(exclusive_bit_count)
+		+ float(lsb) + float(msb);
+}
diff --git a/shaders-opencl-no-opt/comp/subgroups-basic.nocompat.vk.subgroups.comp b/shaders-opencl-no-opt/comp/subgroups-basic.nocompat.vk.subgroups.comp
new file mode 100644
index 000000000..da981bccf
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/subgroups-basic.nocompat.vk.subgroups.comp
@@ -0,0 +1,72 @@
+#version 450
+#extension GL_KHR_shader_subgroup_basic : require
+#extension GL_KHR_shader_subgroup_vote : require
+#extension GL_KHR_shader_subgroup_ballot : require
+#extension GL_KHR_shader_subgroup_arithmetic : require
+layout(local_size_x = 256) in;
+
+layout(std430, binding = 0) buffer SSBO
+{
+	float FragColor;
+	int idat;
+	uint udat;
+};
+
+void main()
+{
+	// Builtins
+	FragColor = float(gl_NumSubgroups);
+	FragColor = float(gl_SubgroupID);
+	FragColor = float(gl_SubgroupSize);
+	FragColor = float(gl_SubgroupInvocationID);
+
+	// Barriers
+	subgroupBarrier();
+	subgroupMemoryBarrier();
+	subgroupMemoryBarrierBuffer();
+	subgroupMemoryBarrierShared();
+	subgroupMemoryBarrierImage();
+
+	// Vote (uniform) — OpGroupNonUniformAll/Any map to sub_group_all/any in cl_khr_subgroups
+	bool has_all = subgroupAll(true);
+	bool has_any = subgroupAny(true);
+
+	// Broadcast (uniform) — OpGroupNonUniformBroadcast maps to sub_group_broadcast in cl_khr_subgroups
+	uint broadcasted = subgroupBroadcast(42u, 0u);
+
+	// Reduce (uniform) — OpGroupNonUniform{I,F}Add/Min/Max with Reduce
+	float fadd = subgroupAdd(FragColor);
+	int iadd = subgroupAdd(idat);
+	float fmin = subgroupMin(FragColor);
+	float fmax = subgroupMax(FragColor);
+	int smin = subgroupMin(idat);
+	int smax = subgroupMax(idat);
+	uint umin = subgroupMin(udat);
+	uint umax = subgroupMax(udat);
+
+	// Inclusive scan
+	float finc_add = subgroupInclusiveAdd(FragColor);
+	float finc_min = subgroupInclusiveMin(FragColor);
+	float finc_max = subgroupInclusiveMax(FragColor);
+	int iinc_add = subgroupInclusiveAdd(idat);
+	int iinc_min = subgroupInclusiveMin(idat);
+	int iinc_max = subgroupInclusiveMax(idat);
+	uint uinc_min = subgroupInclusiveMin(udat);
+	uint uinc_max = subgroupInclusiveMax(udat);
+
+	// Exclusive scan
+	float fexc_add = subgroupExclusiveAdd(FragColor);
+	float fexc_min = subgroupExclusiveMin(FragColor);
+	float fexc_max = subgroupExclusiveMax(FragColor);
+	int iexc_add = subgroupExclusiveAdd(idat);
+	int iexc_min = subgroupExclusiveMin(idat);
+	int iexc_max = subgroupExclusiveMax(idat);
+	uint uexc_min = subgroupExclusiveMin(udat);
+	uint uexc_max = subgroupExclusiveMax(udat);
+
+	// Write results to prevent dead-code elimination
+	FragColor = fadd + fmin + fmax + finc_add + finc_min + finc_max + fexc_add + fexc_min + fexc_max
+		+ float(iadd + smin + smax + iinc_add + iinc_min + iinc_max + iexc_add + iexc_min + iexc_max)
+		+ float(umin + umax + uinc_min + uinc_max + uexc_min + uexc_max + broadcasted)
+		+ float(has_all) + float(has_any);
+}
diff --git a/shaders-opencl-no-opt/comp/subgroups-clustered.nocompat.vk.subgroups.comp b/shaders-opencl-no-opt/comp/subgroups-clustered.nocompat.vk.subgroups.comp
new file mode 100644
index 000000000..c52b5ab00
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/subgroups-clustered.nocompat.vk.subgroups.comp
@@ -0,0 +1,97 @@
+#version 450
+#extension GL_KHR_shader_subgroup_basic : require
+#extension GL_KHR_shader_subgroup_clustered : require
+layout(local_size_x = 256) in;
+
+layout(std430, binding = 0) buffer SSBO
+{
+	float fdat;
+	int idat;
+	uint udat;
+};
+
+void main()
+{
+	vec4 fv = vec4(fdat);
+	ivec4 iv = ivec4(idat);
+	uvec4 uv = uvec4(udat);
+
+	// Cluster size 1
+	vec4 added = subgroupClusteredAdd(fv, 1u);
+	vec4 multiplied = subgroupClusteredMul(fv, 1u);
+	ivec4 iadded = subgroupClusteredAdd(iv, 1u);
+	ivec4 imultiplied = subgroupClusteredMul(iv, 1u);
+	vec4 lo = subgroupClusteredMin(fv, 1u);
+	vec4 hi = subgroupClusteredMax(fv, 1u);
+	ivec4 slo = subgroupClusteredMin(iv, 1u);
+	ivec4 shi = subgroupClusteredMax(iv, 1u);
+	uvec4 ulo = subgroupClusteredMin(uv, 1u);
+	uvec4 uhi = subgroupClusteredMax(uv, 1u);
+	uvec4 anded = subgroupClusteredAnd(uv, 1u);
+	uvec4 ored = subgroupClusteredOr(uv, 1u);
+	uvec4 xored = subgroupClusteredXor(uv, 1u);
+	bvec4 bval = equal(uv, uvec4(42u));
+	bvec4 anded_b = subgroupClusteredAnd(bval, 1u);
+	bvec4 ored_b = subgroupClusteredOr(bval, 1u);
+	bvec4 xored_b = subgroupClusteredXor(bval, 1u);
+
+	// Cluster size 2
+	added = subgroupClusteredAdd(added, 2u);
+	multiplied = subgroupClusteredMul(multiplied, 2u);
+	iadded = subgroupClusteredAdd(iadded, 2u);
+	imultiplied = subgroupClusteredMul(imultiplied, 2u);
+	lo = subgroupClusteredMin(lo, 2u);
+	hi = subgroupClusteredMax(hi, 2u);
+	slo = subgroupClusteredMin(slo, 2u);
+	shi = subgroupClusteredMax(shi, 2u);
+	ulo = subgroupClusteredMin(ulo, 2u);
+	uhi = subgroupClusteredMax(uhi, 2u);
+	anded = subgroupClusteredAnd(anded, 2u);
+	ored = subgroupClusteredOr(ored, 2u);
+	xored = subgroupClusteredXor(xored, 2u);
+	anded_b = subgroupClusteredAnd(equal(anded, uvec4(2u)), 2u);
+	ored_b = subgroupClusteredOr(equal(ored, uvec4(3u)), 2u);
+	xored_b = subgroupClusteredXor(equal(xored, uvec4(4u)), 2u);
+
+	// Cluster size 4
+	added = subgroupClusteredAdd(added, 4u);
+	multiplied = subgroupClusteredMul(multiplied, 4u);
+	iadded = subgroupClusteredAdd(iadded, 4u);
+	imultiplied = subgroupClusteredMul(imultiplied, 4u);
+	lo = subgroupClusteredMin(lo, 4u);
+	hi = subgroupClusteredMax(hi, 4u);
+	slo = subgroupClusteredMin(slo, 4u);
+	shi = subgroupClusteredMax(shi, 4u);
+	ulo = subgroupClusteredMin(ulo, 4u);
+	uhi = subgroupClusteredMax(uhi, 4u);
+	anded = subgroupClusteredAnd(anded, 4u);
+	ored = subgroupClusteredOr(ored, 4u);
+	xored = subgroupClusteredXor(xored, 4u);
+	anded_b = subgroupClusteredAnd(equal(anded, uvec4(2u)), 4u);
+	ored_b = subgroupClusteredOr(equal(ored, uvec4(3u)), 4u);
+	xored_b = subgroupClusteredXor(equal(xored, uvec4(4u)), 4u);
+
+	// Cluster size 16
+	added = subgroupClusteredAdd(added, 16u);
+	multiplied = subgroupClusteredMul(multiplied, 16u);
+	iadded = subgroupClusteredAdd(iadded, 16u);
+	imultiplied = subgroupClusteredMul(imultiplied, 16u);
+	lo = subgroupClusteredMin(lo, 16u);
+	hi = subgroupClusteredMax(hi, 16u);
+	slo = subgroupClusteredMin(slo, 16u);
+	shi = subgroupClusteredMax(shi, 16u);
+	ulo = subgroupClusteredMin(ulo, 16u);
+	uhi = subgroupClusteredMax(uhi, 16u);
+	anded = subgroupClusteredAnd(anded, 16u);
+	ored = subgroupClusteredOr(ored, 16u);
+	xored = subgroupClusteredXor(xored, 16u);
+	anded_b = subgroupClusteredAnd(equal(anded, uvec4(2u)), 16u);
+	ored_b = subgroupClusteredOr(equal(ored, uvec4(3u)), 16u);
+	xored_b = subgroupClusteredXor(equal(xored, uvec4(4u)), 16u);
+
+	// Write results
+	fdat = added.x + multiplied.x + lo.x + hi.x
+		+ float(iadded.x + imultiplied.x + slo.x + shi.x)
+		+ float(ulo.x + uhi.x + anded.x + ored.x + xored.x)
+		+ float(anded_b.x) + float(ored_b.x) + float(xored_b.x);
+}
diff --git a/shaders-opencl-no-opt/comp/subgroups-rotate.nocompat.vk.subgroups.comp b/shaders-opencl-no-opt/comp/subgroups-rotate.nocompat.vk.subgroups.comp
new file mode 100644
index 000000000..0df5d8330
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/subgroups-rotate.nocompat.vk.subgroups.comp
@@ -0,0 +1,20 @@
+#version 450
+#extension GL_KHR_shader_subgroup_basic : require
+#extension GL_KHR_shader_subgroup_rotate : require
+layout(local_size_x = 256) in;
+
+layout(std430, binding = 0) buffer SSBO
+{
+	float FragColor;
+};
+
+void main()
+{
+	uint rotated = subgroupRotate(20u, 4u);
+	bool rotated_bool = subgroupRotate(false, 4u);
+	uint rotated_clustered = subgroupClusteredRotate(20u, 4u, 8u);
+	bool rotated_clustered_bool = subgroupClusteredRotate(false, 4u, 8u);
+
+	FragColor = float(rotated) + float(rotated_bool)
+		+ float(rotated_clustered) + float(rotated_clustered_bool);
+}
diff --git a/shaders-opencl-no-opt/comp/subgroups-shuffle-relative.nocompat.vk.subgroups.comp b/shaders-opencl-no-opt/comp/subgroups-shuffle-relative.nocompat.vk.subgroups.comp
new file mode 100644
index 000000000..b026695ce
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/subgroups-shuffle-relative.nocompat.vk.subgroups.comp
@@ -0,0 +1,20 @@
+#version 450
+#extension GL_KHR_shader_subgroup_basic : require
+#extension GL_KHR_shader_subgroup_shuffle_relative : require
+layout(local_size_x = 256) in;
+
+layout(std430, binding = 0) buffer SSBO
+{
+	float FragColor;
+};
+
+void main()
+{
+	uint shuffled_up = subgroupShuffleUp(20u, 4u);
+	bool shuffled_up_bool = subgroupShuffleUp(true, 4u);
+	uint shuffled_down = subgroupShuffleDown(20u, 4u);
+	bool shuffled_down_bool = subgroupShuffleDown(false, 4u);
+
+	FragColor = float(shuffled_up) + float(shuffled_up_bool)
+		+ float(shuffled_down) + float(shuffled_down_bool);
+}
diff --git a/shaders-opencl-no-opt/comp/subgroups-shuffle.nocompat.vk.subgroups.comp b/shaders-opencl-no-opt/comp/subgroups-shuffle.nocompat.vk.subgroups.comp
new file mode 100644
index 000000000..8e2a433ac
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/subgroups-shuffle.nocompat.vk.subgroups.comp
@@ -0,0 +1,19 @@
+#version 450
+#extension GL_KHR_shader_subgroup_basic : require
+#extension GL_KHR_shader_subgroup_shuffle : require
+layout(local_size_x = 256) in;
+
+layout(std430, binding = 0) buffer SSBO
+{
+	float FragColor;
+};
+
+void main()
+{
+	uint shuffled = subgroupShuffle(10u, 8u);
+	bool shuffled_bool = subgroupShuffle(true, 9u);
+	uint shuffled_xor = subgroupShuffleXor(30u, 8u);
+	bool shuffled_xor_bool = subgroupShuffleXor(false, 9u);
+
+	FragColor = float(shuffled) + float(shuffled_bool) + float(shuffled_xor) + float(shuffled_xor_bool);
+}
diff --git a/shaders-opencl-no-opt/comp/subgroups-vote.nocompat.vk.subgroups.comp b/shaders-opencl-no-opt/comp/subgroups-vote.nocompat.vk.subgroups.comp
new file mode 100644
index 000000000..b0995a600
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/subgroups-vote.nocompat.vk.subgroups.comp
@@ -0,0 +1,24 @@
+#version 450
+#extension GL_KHR_shader_subgroup_basic : require
+#extension GL_KHR_shader_subgroup_vote : require
+layout(local_size_x = 256) in;
+
+layout(std430, binding = 0) buffer SSBO
+{
+	float FragColor;
+	int idat;
+};
+
+void main()
+{
+	bool elected = subgroupElect();
+	bool has_all = subgroupAll(gl_SubgroupInvocationID < 10u);
+	bool has_any = subgroupAny(gl_SubgroupInvocationID == 0u);
+	bool eq_int = subgroupAllEqual(idat);
+	bool eq_bool = subgroupAllEqual(true);
+	bool eq_vec3 = subgroupAllEqual(vec3(0.0, 1.0, 2.0));
+	bool eq_bvec4 = subgroupAllEqual(bvec4(true, true, false, true));
+
+	FragColor = float(elected) + float(has_all) + float(has_any)
+		+ float(eq_int) + float(eq_bool) + float(eq_vec3) + float(eq_bvec4);
+}
diff --git a/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.opencl12.emulate-subgroup.invalid.comp b/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.opencl12.emulate-subgroup.invalid.comp
deleted file mode 100644
index 8a0be2269..000000000
--- a/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.opencl12.emulate-subgroup.invalid.comp
+++ /dev/null
@@ -1,25 +0,0 @@
-#version 450
-#extension GL_KHR_shader_subgroup_basic : require
-layout(local_size_x = 1) in;
-
-layout(std430, binding = 0) buffer SSBO
-{
-	float FragColor;
-};
-
-// Reduced test for emulated functionality.
-
-void main()
-{
-	// basic
-	FragColor = float(gl_NumSubgroups);
-	FragColor = float(gl_SubgroupID);
-	FragColor = float(gl_SubgroupSize);
-	FragColor = float(gl_SubgroupInvocationID);
-	subgroupBarrier();
-	subgroupMemoryBarrier();
-	subgroupMemoryBarrierBuffer();
-	subgroupMemoryBarrierShared();
-	subgroupMemoryBarrierImage();
-	bool elected = subgroupElect();
-}
diff --git a/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.subgroup.fixed-subgroup.invalid.comp b/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.subgroup.fixed-subgroup.invalid.comp
deleted file mode 100644
index c8172fd95..000000000
--- a/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.subgroup.fixed-subgroup.invalid.comp
+++ /dev/null
@@ -1,211 +0,0 @@
-#version 450
-#extension GL_KHR_shader_subgroup_basic : require
-#extension GL_KHR_shader_subgroup_ballot : require
-#extension GL_KHR_shader_subgroup_vote : require
-#extension GL_KHR_shader_subgroup_shuffle : require
-#extension GL_KHR_shader_subgroup_shuffle_relative : require
-#extension GL_KHR_shader_subgroup_arithmetic : require
-#extension GL_KHR_shader_subgroup_clustered : require
-#extension GL_KHR_shader_subgroup_quad : require
-#extension GL_KHR_shader_subgroup_rotate : require
-layout(local_size_x = 1) in;
-
-layout(std430, binding = 0) buffer SSBO
-{
-	float FragColor;
-};
-
-void doClusteredRotate()
-{
-	uint rotated_clustered = subgroupClusteredRotate(20u, 4u, 8u);
-	bool rotated_clustered_bool = subgroupClusteredRotate(false, 4u, 8u);
-}
-
-void main()
-{
-	// basic
-	FragColor = float(gl_NumSubgroups);
-	FragColor = float(gl_SubgroupID);
-	FragColor = float(gl_SubgroupSize);
-	FragColor = float(gl_SubgroupInvocationID);
-	subgroupBarrier();
-	subgroupMemoryBarrier();
-	subgroupMemoryBarrierBuffer();
-	subgroupMemoryBarrierShared();
-	subgroupMemoryBarrierImage();
-	bool elected = subgroupElect();
-
-	// ballot
-	FragColor = float(gl_SubgroupEqMask);
-	FragColor = float(gl_SubgroupGeMask);
-	FragColor = float(gl_SubgroupGtMask);
-	FragColor = float(gl_SubgroupLeMask);
-	FragColor = float(gl_SubgroupLtMask);
-	vec4 broadcasted = subgroupBroadcast(vec4(10.0), 8u);
-	bvec2 broadcasted_bool = subgroupBroadcast(bvec2(true), 8u);
-	vec3 first = subgroupBroadcastFirst(vec3(20.0));
-	bvec4 first_bool = subgroupBroadcastFirst(bvec4(false));
-	uvec4 ballot_value = subgroupBallot(true);
-	bool inverse_ballot_value = subgroupInverseBallot(ballot_value);
-	bool bit_extracted = subgroupBallotBitExtract(uvec4(10u), 8u);
-	uint bit_count = subgroupBallotBitCount(ballot_value);
-	uint inclusive_bit_count = subgroupBallotInclusiveBitCount(ballot_value);
-	uint exclusive_bit_count = subgroupBallotExclusiveBitCount(ballot_value);
-	uint lsb = subgroupBallotFindLSB(ballot_value);
-	uint msb = subgroupBallotFindMSB(ballot_value);
-
-	// shuffle
-	uint shuffled = subgroupShuffle(10u, 8u);
-	bool shuffled_bool = subgroupShuffle(true, 9u);
-	uint shuffled_xor = subgroupShuffleXor(30u, 8u);
-	bool shuffled_xor_bool = subgroupShuffleXor(false, 9u);
-
-	// shuffle relative 
-	uint shuffled_up = subgroupShuffleUp(20u, 4u);
-	bool shuffled_up_bool = subgroupShuffleUp(true, 4u);
-	uint shuffled_down = subgroupShuffleDown(20u, 4u);
-	bool shuffled_down_bool = subgroupShuffleDown(false, 4u);
-
-	// rotate
-	uint rotated = subgroupRotate(20u, 4u);
-	bool rotated_bool = subgroupRotate(false, 4u);
-	doClusteredRotate();
-
-	// vote
-	bool has_all = subgroupAll(true);
-	bool has_any = subgroupAny(true);
-	bool has_equal = subgroupAllEqual(0);
-	has_equal = subgroupAllEqual(true);
-	has_equal = subgroupAllEqual(vec3(0.0, 1.0, 2.0));
-	has_equal = subgroupAllEqual(bvec4(true, true, false, true));
-
-	// arithmetic
-	vec4 added = subgroupAdd(vec4(20.0));
-	ivec4 iadded = subgroupAdd(ivec4(20));
-	vec4 multiplied = subgroupMul(vec4(20.0));
-	ivec4 imultiplied = subgroupMul(ivec4(20));
-	vec4 lo = subgroupMin(vec4(20.0));
-	vec4 hi = subgroupMax(vec4(20.0));
-	ivec4 slo = subgroupMin(ivec4(20));
-	ivec4 shi = subgroupMax(ivec4(20));
-	uvec4 ulo = subgroupMin(uvec4(20));
-	uvec4 uhi = subgroupMax(uvec4(20));
-	uvec4 anded = subgroupAnd(ballot_value);
-	uvec4 ored = subgroupOr(ballot_value);
-	uvec4 xored = subgroupXor(ballot_value);
-	bvec4 anded_b = subgroupAnd(equal(ballot_value, uvec4(42)));
-	bvec4 ored_b = subgroupOr(equal(ballot_value, uvec4(42)));
-	bvec4 xored_b = subgroupXor(equal(ballot_value, uvec4(42)));
-
-	added = subgroupInclusiveAdd(added);
-	iadded = subgroupInclusiveAdd(iadded);
-	multiplied = subgroupInclusiveMul(multiplied);
-	imultiplied = subgroupInclusiveMul(imultiplied);
-	//lo = subgroupInclusiveMin(lo);  // FIXME: Unsupported by Metal
-	//hi = subgroupInclusiveMax(hi);
-	//slo = subgroupInclusiveMin(slo);
-	//shi = subgroupInclusiveMax(shi);
-	//ulo = subgroupInclusiveMin(ulo);
-	//uhi = subgroupInclusiveMax(uhi);
-	//anded = subgroupInclusiveAnd(anded);
-	//ored = subgroupInclusiveOr(ored);
-	//xored = subgroupInclusiveXor(ored);
-	//added = subgroupExclusiveAdd(lo);
-
-	added = subgroupExclusiveAdd(multiplied);
-	multiplied = subgroupExclusiveMul(multiplied);
-	iadded = subgroupExclusiveAdd(imultiplied);
-	imultiplied = subgroupExclusiveMul(imultiplied);
-	//lo = subgroupExclusiveMin(lo);  // FIXME: Unsupported by Metal
-	//hi = subgroupExclusiveMax(hi);
-	//ulo = subgroupExclusiveMin(ulo);
-	//uhi = subgroupExclusiveMax(uhi);
-	//slo = subgroupExclusiveMin(slo);
-	//shi = subgroupExclusiveMax(shi);
-	//anded = subgroupExclusiveAnd(anded);
-	//ored = subgroupExclusiveOr(ored);
-	//xored = subgroupExclusiveXor(ored);
-
-	// clustered
-	added = subgroupClusteredAdd(added, 1u);
-	multiplied = subgroupClusteredMul(multiplied, 1u);
-	iadded = subgroupClusteredAdd(iadded, 1u);
-	imultiplied = subgroupClusteredMul(imultiplied, 1u);
-	lo = subgroupClusteredMin(lo, 1u);
-	hi = subgroupClusteredMax(hi, 1u);
-	ulo = subgroupClusteredMin(ulo, 1u);
-	uhi = subgroupClusteredMax(uhi, 1u);
-	slo = subgroupClusteredMin(slo, 1u);
-	shi = subgroupClusteredMax(shi, 1u);
-	anded = subgroupClusteredAnd(anded, 1u);
-	ored = subgroupClusteredOr(ored, 1u);
-	xored = subgroupClusteredXor(xored, 1u);
-
-	anded_b = subgroupClusteredAnd(equal(anded, uvec4(2u)), 1u);
-	ored_b = subgroupClusteredOr(equal(ored, uvec4(3u)), 1u);
-	xored_b = subgroupClusteredXor(equal(xored, uvec4(4u)), 1u);
-
-	added = subgroupClusteredAdd(added, 2u);
-	multiplied = subgroupClusteredMul(multiplied, 2u);
-	iadded = subgroupClusteredAdd(iadded, 2u);
-	imultiplied = subgroupClusteredMul(imultiplied, 2u);
-	lo = subgroupClusteredMin(lo, 2u);
-	hi = subgroupClusteredMax(hi, 2u);
-	ulo = subgroupClusteredMin(ulo, 2u);
-	uhi = subgroupClusteredMax(uhi, 2u);
-	slo = subgroupClusteredMin(slo, 2u);
-	shi = subgroupClusteredMax(shi, 2u);
-	anded = subgroupClusteredAnd(anded, 2u);
-	ored = subgroupClusteredOr(ored, 2u);
-	xored = subgroupClusteredXor(xored, 2u);
-
-	anded_b = subgroupClusteredAnd(equal(anded, uvec4(2u)), 2u);
-	ored_b = subgroupClusteredOr(equal(ored, uvec4(3u)), 2u);
-	xored_b = subgroupClusteredXor(equal(xored, uvec4(4u)), 2u);
-
-	added = subgroupClusteredAdd(added, 4u);
-	multiplied = subgroupClusteredMul(multiplied, 4u);
-	iadded = subgroupClusteredAdd(iadded, 4u);
-	imultiplied = subgroupClusteredMul(imultiplied, 4u);
-	lo = subgroupClusteredMin(lo, 4u);
-	hi = subgroupClusteredMax(hi, 4u);
-	ulo = subgroupClusteredMin(ulo, 4u);
-	uhi = subgroupClusteredMax(uhi, 4u);
-	slo = subgroupClusteredMin(slo, 4u);
-	shi = subgroupClusteredMax(shi, 4u);
-	anded = subgroupClusteredAnd(anded, 4u);
-	ored = subgroupClusteredOr(ored, 4u);
-	xored = subgroupClusteredXor(xored, 4u);
-
-	anded_b = subgroupClusteredAnd(equal(anded, uvec4(2u)), 4u);
-	ored_b = subgroupClusteredOr(equal(ored, uvec4(3u)), 4u);
-	xored_b = subgroupClusteredXor(equal(xored, uvec4(4u)), 4u);
-
-	added = subgroupClusteredAdd(added, 16u);
-	multiplied = subgroupClusteredMul(multiplied, 16u);
-	iadded = subgroupClusteredAdd(iadded, 16u);
-	imultiplied = subgroupClusteredMul(imultiplied, 16u);
-	lo = subgroupClusteredMin(lo, 16u);
-	hi = subgroupClusteredMax(hi, 16u);
-	ulo = subgroupClusteredMin(ulo, 16u);
-	uhi = subgroupClusteredMax(uhi, 16u);
-	slo = subgroupClusteredMin(slo, 16u);
-	shi = subgroupClusteredMax(shi, 16u);
-	anded = subgroupClusteredAnd(anded, 16u);
-	ored = subgroupClusteredOr(ored, 16u);
-	xored = subgroupClusteredXor(xored, 16u);
-
-	anded_b = subgroupClusteredAnd(equal(anded, uvec4(2u)), 16u);
-	ored_b = subgroupClusteredOr(equal(ored, uvec4(3u)), 16u);
-	xored_b = subgroupClusteredXor(equal(xored, uvec4(4u)), 16u);
-
-	// quad
-	vec4 swap_horiz = subgroupQuadSwapHorizontal(vec4(20.0));
-	bvec4 swap_horiz_bool = subgroupQuadSwapHorizontal(bvec4(true));
-	vec4 swap_vertical = subgroupQuadSwapVertical(vec4(20.0));
-	bvec4 swap_vertical_bool = subgroupQuadSwapVertical(bvec4(true));
-	vec4 swap_diagonal = subgroupQuadSwapDiagonal(vec4(20.0));
-	bvec4 swap_diagonal_bool = subgroupQuadSwapDiagonal(bvec4(true));
-	vec4 quad_broadcast = subgroupQuadBroadcast(vec4(20.0), 3u);
-	bvec4 quad_broadcast_bool = subgroupQuadBroadcast(bvec4(true), 3u);
-}
diff --git a/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.subgroup.invalid.comp b/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.subgroup.invalid.comp
deleted file mode 100644
index c8172fd95..000000000
--- a/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.subgroup.invalid.comp
+++ /dev/null
@@ -1,211 +0,0 @@
-#version 450
-#extension GL_KHR_shader_subgroup_basic : require
-#extension GL_KHR_shader_subgroup_ballot : require
-#extension GL_KHR_shader_subgroup_vote : require
-#extension GL_KHR_shader_subgroup_shuffle : require
-#extension GL_KHR_shader_subgroup_shuffle_relative : require
-#extension GL_KHR_shader_subgroup_arithmetic : require
-#extension GL_KHR_shader_subgroup_clustered : require
-#extension GL_KHR_shader_subgroup_quad : require
-#extension GL_KHR_shader_subgroup_rotate : require
-layout(local_size_x = 1) in;
-
-layout(std430, binding = 0) buffer SSBO
-{
-	float FragColor;
-};
-
-void doClusteredRotate()
-{
-	uint rotated_clustered = subgroupClusteredRotate(20u, 4u, 8u);
-	bool rotated_clustered_bool = subgroupClusteredRotate(false, 4u, 8u);
-}
-
-void main()
-{
-	// basic
-	FragColor = float(gl_NumSubgroups);
-	FragColor = float(gl_SubgroupID);
-	FragColor = float(gl_SubgroupSize);
-	FragColor = float(gl_SubgroupInvocationID);
-	subgroupBarrier();
-	subgroupMemoryBarrier();
-	subgroupMemoryBarrierBuffer();
-	subgroupMemoryBarrierShared();
-	subgroupMemoryBarrierImage();
-	bool elected = subgroupElect();
-
-	// ballot
-	FragColor = float(gl_SubgroupEqMask);
-	FragColor = float(gl_SubgroupGeMask);
-	FragColor = float(gl_SubgroupGtMask);
-	FragColor = float(gl_SubgroupLeMask);
-	FragColor = float(gl_SubgroupLtMask);
-	vec4 broadcasted = subgroupBroadcast(vec4(10.0), 8u);
-	bvec2 broadcasted_bool = subgroupBroadcast(bvec2(true), 8u);
-	vec3 first = subgroupBroadcastFirst(vec3(20.0));
-	bvec4 first_bool = subgroupBroadcastFirst(bvec4(false));
-	uvec4 ballot_value = subgroupBallot(true);
-	bool inverse_ballot_value = subgroupInverseBallot(ballot_value);
-	bool bit_extracted = subgroupBallotBitExtract(uvec4(10u), 8u);
-	uint bit_count = subgroupBallotBitCount(ballot_value);
-	uint inclusive_bit_count = subgroupBallotInclusiveBitCount(ballot_value);
-	uint exclusive_bit_count = subgroupBallotExclusiveBitCount(ballot_value);
-	uint lsb = subgroupBallotFindLSB(ballot_value);
-	uint msb = subgroupBallotFindMSB(ballot_value);
-
-	// shuffle
-	uint shuffled = subgroupShuffle(10u, 8u);
-	bool shuffled_bool = subgroupShuffle(true, 9u);
-	uint shuffled_xor = subgroupShuffleXor(30u, 8u);
-	bool shuffled_xor_bool = subgroupShuffleXor(false, 9u);
-
-	// shuffle relative 
-	uint shuffled_up = subgroupShuffleUp(20u, 4u);
-	bool shuffled_up_bool = subgroupShuffleUp(true, 4u);
-	uint shuffled_down = subgroupShuffleDown(20u, 4u);
-	bool shuffled_down_bool = subgroupShuffleDown(false, 4u);
-
-	// rotate
-	uint rotated = subgroupRotate(20u, 4u);
-	bool rotated_bool = subgroupRotate(false, 4u);
-	doClusteredRotate();
-
-	// vote
-	bool has_all = subgroupAll(true);
-	bool has_any = subgroupAny(true);
-	bool has_equal = subgroupAllEqual(0);
-	has_equal = subgroupAllEqual(true);
-	has_equal = subgroupAllEqual(vec3(0.0, 1.0, 2.0));
-	has_equal = subgroupAllEqual(bvec4(true, true, false, true));
-
-	// arithmetic
-	vec4 added = subgroupAdd(vec4(20.0));
-	ivec4 iadded = subgroupAdd(ivec4(20));
-	vec4 multiplied = subgroupMul(vec4(20.0));
-	ivec4 imultiplied = subgroupMul(ivec4(20));
-	vec4 lo = subgroupMin(vec4(20.0));
-	vec4 hi = subgroupMax(vec4(20.0));
-	ivec4 slo = subgroupMin(ivec4(20));
-	ivec4 shi = subgroupMax(ivec4(20));
-	uvec4 ulo = subgroupMin(uvec4(20));
-	uvec4 uhi = subgroupMax(uvec4(20));
-	uvec4 anded = subgroupAnd(ballot_value);
-	uvec4 ored = subgroupOr(ballot_value);
-	uvec4 xored = subgroupXor(ballot_value);
-	bvec4 anded_b = subgroupAnd(equal(ballot_value, uvec4(42)));
-	bvec4 ored_b = subgroupOr(equal(ballot_value, uvec4(42)));
-	bvec4 xored_b = subgroupXor(equal(ballot_value, uvec4(42)));
-
-	added = subgroupInclusiveAdd(added);
-	iadded = subgroupInclusiveAdd(iadded);
-	multiplied = subgroupInclusiveMul(multiplied);
-	imultiplied = subgroupInclusiveMul(imultiplied);
-	//lo = subgroupInclusiveMin(lo);  // FIXME: Unsupported by Metal
-	//hi = subgroupInclusiveMax(hi);
-	//slo = subgroupInclusiveMin(slo);
-	//shi = subgroupInclusiveMax(shi);
-	//ulo = subgroupInclusiveMin(ulo);
-	//uhi = subgroupInclusiveMax(uhi);
-	//anded = subgroupInclusiveAnd(anded);
-	//ored = subgroupInclusiveOr(ored);
-	//xored = subgroupInclusiveXor(ored);
-	//added = subgroupExclusiveAdd(lo);
-
-	added = subgroupExclusiveAdd(multiplied);
-	multiplied = subgroupExclusiveMul(multiplied);
-	iadded = subgroupExclusiveAdd(imultiplied);
-	imultiplied = subgroupExclusiveMul(imultiplied);
-	//lo = subgroupExclusiveMin(lo);  // FIXME: Unsupported by Metal
-	//hi = subgroupExclusiveMax(hi);
-	//ulo = subgroupExclusiveMin(ulo);
-	//uhi = subgroupExclusiveMax(uhi);
-	//slo = subgroupExclusiveMin(slo);
-	//shi = subgroupExclusiveMax(shi);
-	//anded = subgroupExclusiveAnd(anded);
-	//ored = subgroupExclusiveOr(ored);
-	//xored = subgroupExclusiveXor(ored);
-
-	// clustered
-	added = subgroupClusteredAdd(added, 1u);
-	multiplied = subgroupClusteredMul(multiplied, 1u);
-	iadded = subgroupClusteredAdd(iadded, 1u);
-	imultiplied = subgroupClusteredMul(imultiplied, 1u);
-	lo = subgroupClusteredMin(lo, 1u);
-	hi = subgroupClusteredMax(hi, 1u);
-	ulo = subgroupClusteredMin(ulo, 1u);
-	uhi = subgroupClusteredMax(uhi, 1u);
-	slo = subgroupClusteredMin(slo, 1u);
-	shi = subgroupClusteredMax(shi, 1u);
-	anded = subgroupClusteredAnd(anded, 1u);
-	ored = subgroupClusteredOr(ored, 1u);
-	xored = subgroupClusteredXor(xored, 1u);
-
-	anded_b = subgroupClusteredAnd(equal(anded, uvec4(2u)), 1u);
-	ored_b = subgroupClusteredOr(equal(ored, uvec4(3u)), 1u);
-	xored_b = subgroupClusteredXor(equal(xored, uvec4(4u)), 1u);
-
-	added = subgroupClusteredAdd(added, 2u);
-	multiplied = subgroupClusteredMul(multiplied, 2u);
-	iadded = subgroupClusteredAdd(iadded, 2u);
-	imultiplied = subgroupClusteredMul(imultiplied, 2u);
-	lo = subgroupClusteredMin(lo, 2u);
-	hi = subgroupClusteredMax(hi, 2u);
-	ulo = subgroupClusteredMin(ulo, 2u);
-	uhi = subgroupClusteredMax(uhi, 2u);
-	slo = subgroupClusteredMin(slo, 2u);
-	shi = subgroupClusteredMax(shi, 2u);
-	anded = subgroupClusteredAnd(anded, 2u);
-	ored = subgroupClusteredOr(ored, 2u);
-	xored = subgroupClusteredXor(xored, 2u);
-
-	anded_b = subgroupClusteredAnd(equal(anded, uvec4(2u)), 2u);
-	ored_b = subgroupClusteredOr(equal(ored, uvec4(3u)), 2u);
-	xored_b = subgroupClusteredXor(equal(xored, uvec4(4u)), 2u);
-
-	added = subgroupClusteredAdd(added, 4u);
-	multiplied = subgroupClusteredMul(multiplied, 4u);
-	iadded = subgroupClusteredAdd(iadded, 4u);
-	imultiplied = subgroupClusteredMul(imultiplied, 4u);
-	lo = subgroupClusteredMin(lo, 4u);
-	hi = subgroupClusteredMax(hi, 4u);
-	ulo = subgroupClusteredMin(ulo, 4u);
-	uhi = subgroupClusteredMax(uhi, 4u);
-	slo = subgroupClusteredMin(slo, 4u);
-	shi = subgroupClusteredMax(shi, 4u);
-	anded = subgroupClusteredAnd(anded, 4u);
-	ored = subgroupClusteredOr(ored, 4u);
-	xored = subgroupClusteredXor(xored, 4u);
-
-	anded_b = subgroupClusteredAnd(equal(anded, uvec4(2u)), 4u);
-	ored_b = subgroupClusteredOr(equal(ored, uvec4(3u)), 4u);
-	xored_b = subgroupClusteredXor(equal(xored, uvec4(4u)), 4u);
-
-	added = subgroupClusteredAdd(added, 16u);
-	multiplied = subgroupClusteredMul(multiplied, 16u);
-	iadded = subgroupClusteredAdd(iadded, 16u);
-	imultiplied = subgroupClusteredMul(imultiplied, 16u);
-	lo = subgroupClusteredMin(lo, 16u);
-	hi = subgroupClusteredMax(hi, 16u);
-	ulo = subgroupClusteredMin(ulo, 16u);
-	uhi = subgroupClusteredMax(uhi, 16u);
-	slo = subgroupClusteredMin(slo, 16u);
-	shi = subgroupClusteredMax(shi, 16u);
-	anded = subgroupClusteredAnd(anded, 16u);
-	ored = subgroupClusteredOr(ored, 16u);
-	xored = subgroupClusteredXor(xored, 16u);
-
-	anded_b = subgroupClusteredAnd(equal(anded, uvec4(2u)), 16u);
-	ored_b = subgroupClusteredOr(equal(ored, uvec4(3u)), 16u);
-	xored_b = subgroupClusteredXor(equal(xored, uvec4(4u)), 16u);
-
-	// quad
-	vec4 swap_horiz = subgroupQuadSwapHorizontal(vec4(20.0));
-	bvec4 swap_horiz_bool = subgroupQuadSwapHorizontal(bvec4(true));
-	vec4 swap_vertical = subgroupQuadSwapVertical(vec4(20.0));
-	bvec4 swap_vertical_bool = subgroupQuadSwapVertical(bvec4(true));
-	vec4 swap_diagonal = subgroupQuadSwapDiagonal(vec4(20.0));
-	bvec4 swap_diagonal_bool = subgroupQuadSwapDiagonal(bvec4(true));
-	vec4 quad_broadcast = subgroupQuadBroadcast(vec4(20.0), 3u);
-	bvec4 quad_broadcast_bool = subgroupQuadBroadcast(bvec4(true), 3u);
-}
diff --git a/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.subgroup.swizzle.invalid.comp b/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.subgroup.swizzle.invalid.comp
deleted file mode 100644
index c8172fd95..000000000
--- a/shaders-opencl-no-opt/comp/subgroups.nocompat.vk.subgroup.swizzle.invalid.comp
+++ /dev/null
@@ -1,211 +0,0 @@
-#version 450
-#extension GL_KHR_shader_subgroup_basic : require
-#extension GL_KHR_shader_subgroup_ballot : require
-#extension GL_KHR_shader_subgroup_vote : require
-#extension GL_KHR_shader_subgroup_shuffle : require
-#extension GL_KHR_shader_subgroup_shuffle_relative : require
-#extension GL_KHR_shader_subgroup_arithmetic : require
-#extension GL_KHR_shader_subgroup_clustered : require
-#extension GL_KHR_shader_subgroup_quad : require
-#extension GL_KHR_shader_subgroup_rotate : require
-layout(local_size_x = 1) in;
-
-layout(std430, binding = 0) buffer SSBO
-{
-	float FragColor;
-};
-
-void doClusteredRotate()
-{
-	uint rotated_clustered = subgroupClusteredRotate(20u, 4u, 8u);
-	bool rotated_clustered_bool = subgroupClusteredRotate(false, 4u, 8u);
-}
-
-void main()
-{
-	// basic
-	FragColor = float(gl_NumSubgroups);
-	FragColor = float(gl_SubgroupID);
-	FragColor = float(gl_SubgroupSize);
-	FragColor = float(gl_SubgroupInvocationID);
-	subgroupBarrier();
-	subgroupMemoryBarrier();
-	subgroupMemoryBarrierBuffer();
-	subgroupMemoryBarrierShared();
-	subgroupMemoryBarrierImage();
-	bool elected = subgroupElect();
-
-	// ballot
-	FragColor = float(gl_SubgroupEqMask);
-	FragColor = float(gl_SubgroupGeMask);
-	FragColor = float(gl_SubgroupGtMask);
-	FragColor = float(gl_SubgroupLeMask);
-	FragColor = float(gl_SubgroupLtMask);
-	vec4 broadcasted = subgroupBroadcast(vec4(10.0), 8u);
-	bvec2 broadcasted_bool = subgroupBroadcast(bvec2(true), 8u);
-	vec3 first = subgroupBroadcastFirst(vec3(20.0));
-	bvec4 first_bool = subgroupBroadcastFirst(bvec4(false));
-	uvec4 ballot_value = subgroupBallot(true);
-	bool inverse_ballot_value = subgroupInverseBallot(ballot_value);
-	bool bit_extracted = subgroupBallotBitExtract(uvec4(10u), 8u);
-	uint bit_count = subgroupBallotBitCount(ballot_value);
-	uint inclusive_bit_count = subgroupBallotInclusiveBitCount(ballot_value);
-	uint exclusive_bit_count = subgroupBallotExclusiveBitCount(ballot_value);
-	uint lsb = subgroupBallotFindLSB(ballot_value);
-	uint msb = subgroupBallotFindMSB(ballot_value);
-
-	// shuffle
-	uint shuffled = subgroupShuffle(10u, 8u);
-	bool shuffled_bool = subgroupShuffle(true, 9u);
-	uint shuffled_xor = subgroupShuffleXor(30u, 8u);
-	bool shuffled_xor_bool = subgroupShuffleXor(false, 9u);
-
-	// shuffle relative 
-	uint shuffled_up = subgroupShuffleUp(20u, 4u);
-	bool shuffled_up_bool = subgroupShuffleUp(true, 4u);
-	uint shuffled_down = subgroupShuffleDown(20u, 4u);
-	bool shuffled_down_bool = subgroupShuffleDown(false, 4u);
-
-	// rotate
-	uint rotated = subgroupRotate(20u, 4u);
-	bool rotated_bool = subgroupRotate(false, 4u);
-	doClusteredRotate();
-
-	// vote
-	bool has_all = subgroupAll(true);
-	bool has_any = subgroupAny(true);
-	bool has_equal = subgroupAllEqual(0);
-	has_equal = subgroupAllEqual(true);
-	has_equal = subgroupAllEqual(vec3(0.0, 1.0, 2.0));
-	has_equal = subgroupAllEqual(bvec4(true, true, false, true));
-
-	// arithmetic
-	vec4 added = subgroupAdd(vec4(20.0));
-	ivec4 iadded = subgroupAdd(ivec4(20));
-	vec4 multiplied = subgroupMul(vec4(20.0));
-	ivec4 imultiplied = subgroupMul(ivec4(20));
-	vec4 lo = subgroupMin(vec4(20.0));
-	vec4 hi = subgroupMax(vec4(20.0));
-	ivec4 slo = subgroupMin(ivec4(20));
-	ivec4 shi = subgroupMax(ivec4(20));
-	uvec4 ulo = subgroupMin(uvec4(20));
-	uvec4 uhi = subgroupMax(uvec4(20));
-	uvec4 anded = subgroupAnd(ballot_value);
-	uvec4 ored = subgroupOr(ballot_value);
-	uvec4 xored = subgroupXor(ballot_value);
-	bvec4 anded_b = subgroupAnd(equal(ballot_value, uvec4(42)));
-	bvec4 ored_b = subgroupOr(equal(ballot_value, uvec4(42)));
-	bvec4 xored_b = subgroupXor(equal(ballot_value, uvec4(42)));
-
-	added = subgroupInclusiveAdd(added);
-	iadded = subgroupInclusiveAdd(iadded);
-	multiplied = subgroupInclusiveMul(multiplied);
-	imultiplied = subgroupInclusiveMul(imultiplied);
-	//lo = subgroupInclusiveMin(lo);  // FIXME: Unsupported by Metal
-	//hi = subgroupInclusiveMax(hi);
-	//slo = subgroupInclusiveMin(slo);
-	//shi = subgroupInclusiveMax(shi);
-	//ulo = subgroupInclusiveMin(ulo);
-	//uhi = subgroupInclusiveMax(uhi);
-	//anded = subgroupInclusiveAnd(anded);
-	//ored = subgroupInclusiveOr(ored);
-	//xored = subgroupInclusiveXor(ored);
-	//added = subgroupExclusiveAdd(lo);
-
-	added = subgroupExclusiveAdd(multiplied);
-	multiplied = subgroupExclusiveMul(multiplied);
-	iadded = subgroupExclusiveAdd(imultiplied);
-	imultiplied = subgroupExclusiveMul(imultiplied);
-	//lo = subgroupExclusiveMin(lo);  // FIXME: Unsupported by Metal
-	//hi = subgroupExclusiveMax(hi);
-	//ulo = subgroupExclusiveMin(ulo);
-	//uhi = subgroupExclusiveMax(uhi);
-	//slo = subgroupExclusiveMin(slo);
-	//shi = subgroupExclusiveMax(shi);
-	//anded = subgroupExclusiveAnd(anded);
-	//ored = subgroupExclusiveOr(ored);
-	//xored = subgroupExclusiveXor(ored);
-
-	// clustered
-	added = subgroupClusteredAdd(added, 1u);
-	multiplied = subgroupClusteredMul(multiplied, 1u);
-	iadded = subgroupClusteredAdd(iadded, 1u);
-	imultiplied = subgroupClusteredMul(imultiplied, 1u);
-	lo = subgroupClusteredMin(lo, 1u);
-	hi = subgroupClusteredMax(hi, 1u);
-	ulo = subgroupClusteredMin(ulo, 1u);
-	uhi = subgroupClusteredMax(uhi, 1u);
-	slo = subgroupClusteredMin(slo, 1u);
-	shi = subgroupClusteredMax(shi, 1u);
-	anded = subgroupClusteredAnd(anded, 1u);
-	ored = subgroupClusteredOr(ored, 1u);
-	xored = subgroupClusteredXor(xored, 1u);
-
-	anded_b = subgroupClusteredAnd(equal(anded, uvec4(2u)), 1u);
-	ored_b = subgroupClusteredOr(equal(ored, uvec4(3u)), 1u);
-	xored_b = subgroupClusteredXor(equal(xored, uvec4(4u)), 1u);
-
-	added = subgroupClusteredAdd(added, 2u);
-	multiplied = subgroupClusteredMul(multiplied, 2u);
-	iadded = subgroupClusteredAdd(iadded, 2u);
-	imultiplied = subgroupClusteredMul(imultiplied, 2u);
-	lo = subgroupClusteredMin(lo, 2u);
-	hi = subgroupClusteredMax(hi, 2u);
-	ulo = subgroupClusteredMin(ulo, 2u);
-	uhi = subgroupClusteredMax(uhi, 2u);
-	slo = subgroupClusteredMin(slo, 2u);
-	shi = subgroupClusteredMax(shi, 2u);
-	anded = subgroupClusteredAnd(anded, 2u);
-	ored = subgroupClusteredOr(ored, 2u);
-	xored = subgroupClusteredXor(xored, 2u);
-
-	anded_b = subgroupClusteredAnd(equal(anded, uvec4(2u)), 2u);
-	ored_b = subgroupClusteredOr(equal(ored, uvec4(3u)), 2u);
-	xored_b = subgroupClusteredXor(equal(xored, uvec4(4u)), 2u);
-
-	added = subgroupClusteredAdd(added, 4u);
-	multiplied = subgroupClusteredMul(multiplied, 4u);
-	iadded = subgroupClusteredAdd(iadded, 4u);
-	imultiplied = subgroupClusteredMul(imultiplied, 4u);
-	lo = subgroupClusteredMin(lo, 4u);
-	hi = subgroupClusteredMax(hi, 4u);
-	ulo = subgroupClusteredMin(ulo, 4u);
-	uhi = subgroupClusteredMax(uhi, 4u);
-	slo = subgroupClusteredMin(slo, 4u);
-	shi = subgroupClusteredMax(shi, 4u);
-	anded = subgroupClusteredAnd(anded, 4u);
-	ored = subgroupClusteredOr(ored, 4u);
-	xored = subgroupClusteredXor(xored, 4u);
-
-	anded_b = subgroupClusteredAnd(equal(anded, uvec4(2u)), 4u);
-	ored_b = subgroupClusteredOr(equal(ored, uvec4(3u)), 4u);
-	xored_b = subgroupClusteredXor(equal(xored, uvec4(4u)), 4u);
-
-	added = subgroupClusteredAdd(added, 16u);
-	multiplied = subgroupClusteredMul(multiplied, 16u);
-	iadded = subgroupClusteredAdd(iadded, 16u);
-	imultiplied = subgroupClusteredMul(imultiplied, 16u);
-	lo = subgroupClusteredMin(lo, 16u);
-	hi = subgroupClusteredMax(hi, 16u);
-	ulo = subgroupClusteredMin(ulo, 16u);
-	uhi = subgroupClusteredMax(uhi, 16u);
-	slo = subgroupClusteredMin(slo, 16u);
-	shi = subgroupClusteredMax(shi, 16u);
-	anded = subgroupClusteredAnd(anded, 16u);
-	ored = subgroupClusteredOr(ored, 16u);
-	xored = subgroupClusteredXor(xored, 16u);
-
-	anded_b = subgroupClusteredAnd(equal(anded, uvec4(2u)), 16u);
-	ored_b = subgroupClusteredOr(equal(ored, uvec4(3u)), 16u);
-	xored_b = subgroupClusteredXor(equal(xored, uvec4(4u)), 16u);
-
-	// quad
-	vec4 swap_horiz = subgroupQuadSwapHorizontal(vec4(20.0));
-	bvec4 swap_horiz_bool = subgroupQuadSwapHorizontal(bvec4(true));
-	vec4 swap_vertical = subgroupQuadSwapVertical(vec4(20.0));
-	bvec4 swap_vertical_bool = subgroupQuadSwapVertical(bvec4(true));
-	vec4 swap_diagonal = subgroupQuadSwapDiagonal(vec4(20.0));
-	bvec4 swap_diagonal_bool = subgroupQuadSwapDiagonal(bvec4(true));
-	vec4 quad_broadcast = subgroupQuadBroadcast(vec4(20.0), 3u);
-	bvec4 quad_broadcast_bool = subgroupQuadBroadcast(bvec4(true), 3u);
-}
diff --git a/spirv_opencl.cpp b/spirv_opencl.cpp
index ed7cfe633..806bf7497 100644
--- a/spirv_opencl.cpp
+++ b/spirv_opencl.cpp
@@ -177,6 +177,22 @@ void CompilerOpenCL::emit_header()
 		statement("#pragma OPENCL EXTENSION cl_khr_fp64 : enable");
 	if (opencl_options.enable_64bit_atomics && opencl_options.opencl_version >= 200)
 		statement("#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable");
+	if (opencl_options.enable_subgroups)
+		statement("#pragma OPENCL EXTENSION cl_khr_subgroups : enable");
+	if (needs_subgroup_vote)
+		statement("#pragma OPENCL EXTENSION cl_khr_subgroup_non_uniform_vote : enable");
+	if (needs_subgroup_ballot)
+		statement("#pragma OPENCL EXTENSION cl_khr_subgroup_ballot : enable");
+	if (needs_subgroup_arithmetic)
+		statement("#pragma OPENCL EXTENSION cl_khr_subgroup_non_uniform_arithmetic : enable");
+	if (needs_subgroup_shuffle)
+		statement("#pragma OPENCL EXTENSION cl_khr_subgroup_shuffle : enable");
+	if (needs_subgroup_shuffle_relative)
+		statement("#pragma OPENCL EXTENSION cl_khr_subgroup_shuffle_relative : enable");
+	if (needs_subgroup_clustered)
+		statement("#pragma OPENCL EXTENSION cl_khr_subgroup_clustered_reduce : enable");
+	if (needs_subgroup_rotate)
+		statement("#pragma OPENCL EXTENSION cl_khr_subgroup_rotate : enable");
 	statement("");
 
 	// Emit FP_CONTRACT pragma based on ContractionOff execution mode and FPFastMathDefault.
@@ -1071,10 +1087,66 @@ string CompilerOpenCL::builtin_to_glsl(BuiltIn builtin, StorageClass storage)
 	case BuiltInGlobalSize:
 		return "((uint3)(get_global_size(0), get_global_size(1), get_global_size(2)))";
 	case BuiltInNumSubgroups:
+		if (!opencl_options.enable_subgroups)
+			SPIRV_CROSS_THROW("Subgroup builtins require enable_subgroups option.");
+		return "get_num_sub_groups()";
 	case BuiltInSubgroupId:
+		if (!opencl_options.enable_subgroups)
+			SPIRV_CROSS_THROW("Subgroup builtins require enable_subgroups option.");
+		return "get_sub_group_id()";
 	case BuiltInSubgroupSize:
+		if (!opencl_options.enable_subgroups)
+			SPIRV_CROSS_THROW("Subgroup builtins require enable_subgroups option.");
+		return "get_sub_group_size()";
 	case BuiltInSubgroupLocalInvocationId:
-		SPIRV_CROSS_THROW("OpenCL subgroup builtins not yet implemented.");
+		if (!opencl_options.enable_subgroups)
+			SPIRV_CROSS_THROW("Subgroup builtins require enable_subgroups option.");
+		return "get_sub_group_local_id()";
+	case BuiltInSubgroupEqMask:
+		if (!opencl_options.enable_subgroups)
+			SPIRV_CROSS_THROW("Subgroup builtins require enable_subgroups option.");
+		if (!needs_subgroup_ballot)
+		{
+			needs_subgroup_ballot = true;
+			force_recompile();
+		}
+		return "get_sub_group_eq_mask()";
+	case BuiltInSubgroupGeMask:
+		if (!opencl_options.enable_subgroups)
+			SPIRV_CROSS_THROW("Subgroup builtins require enable_subgroups option.");
+		if (!needs_subgroup_ballot)
+		{
+			needs_subgroup_ballot = true;
+			force_recompile();
+		}
+		return "get_sub_group_ge_mask()";
+	case BuiltInSubgroupGtMask:
+		if (!opencl_options.enable_subgroups)
+			SPIRV_CROSS_THROW("Subgroup builtins require enable_subgroups option.");
+		if (!needs_subgroup_ballot)
+		{
+			needs_subgroup_ballot = true;
+			force_recompile();
+		}
+		return "get_sub_group_gt_mask()";
+	case BuiltInSubgroupLeMask:
+		if (!opencl_options.enable_subgroups)
+			SPIRV_CROSS_THROW("Subgroup builtins require enable_subgroups option.");
+		if (!needs_subgroup_ballot)
+		{
+			needs_subgroup_ballot = true;
+			force_recompile();
+		}
+		return "get_sub_group_le_mask()";
+	case BuiltInSubgroupLtMask:
+		if (!opencl_options.enable_subgroups)
+			SPIRV_CROSS_THROW("Subgroup builtins require enable_subgroups option.");
+		if (!needs_subgroup_ballot)
+		{
+			needs_subgroup_ballot = true;
+			force_recompile();
+		}
+		return "get_sub_group_lt_mask()";
 	default:
 		SPIRV_CROSS_THROW("Unsupported builtin for OpenCL compute shader.");
 	}
@@ -3183,6 +3255,333 @@ void CompilerOpenCL::emit_block_hints(const SPIRBlock &)
 	// OpenCL C has no control-flow hint attributes; suppress SPIRV_CROSS_BRANCH/FLATTEN etc.
 }
 
+// Emits a single-operand subgroup builtin call on value_id.
+// Scalar operands go straight through emit_unary_func_op as func(val).
+// Vector operands become one call per component: (vectype)(func(val.x), func(val.y), ...)
+void CompilerOpenCL::emit_subgroup_op_vec(uint32_t result_type, uint32_t id, uint32_t value_id, const char *func_name)
+{
+	const uint32_t component_count = expression_type(value_id).vecsize;
+	if (component_count <= 1)
+	{
+		emit_unary_func_op(result_type, id, value_id, func_name);
+		return;
+	}
+
+	// Start with a vector literal of the result type, then append one call per component.
+	string call_list = join("(", type_to_glsl(get<SPIRType>(result_type)), ")(");
+	for (uint32_t comp = 0; comp < component_count; comp++)
+	{
+		// The operand expression is requested once per component, i.e. once per emitted call.
+		const char *separator = comp > 0 ? ", " : "";
+		call_list += join(separator, func_name, "(", to_enclosed_expression(value_id), ".", "xyzw"[comp], ")");
+	}
+	call_list += ")";
+
+	emit_op(result_type, id, call_list, should_forward(value_id));
+	inherit_expression_dependencies(id, value_id);
+}
+
+// Emit a two-operand subgroup op (value + extra arg such as a lane id or cluster size), decomposing vectors.
+// For scalars, defers to emit_binary_func_op, i.e. func(val, extra).
+// For vectors, emits: (vectype)(func(val.x, extra), func(val.y, extra), ...)
+void CompilerOpenCL::emit_subgroup_op_vec_binary(uint32_t result_type, uint32_t id, uint32_t value_id,
+                                                 uint32_t extra_id, const char *func_name)
+{
+	auto &type = expression_type(value_id);
+	if (type.vecsize > 1)
+	{
+		auto &out_type = get<SPIRType>(result_type);
+		string extra_expr = to_expression(extra_id);
+		string expr = "(" + type_to_glsl(out_type) + ")(";
+		for (uint32_t c = 0; c < type.vecsize; c++)
+		{
+			if (c > 0)
+				expr += ", ";
+			expr += join(func_name, "(", to_enclosed_expression(value_id), ".", "xyzw"[c], ", ", extra_expr, ")");
+		}
+		expr += ")";
+		// The expression reads both operands: forward only if both can be, and depend on both.
+		emit_op(result_type, id, expr, should_forward(value_id) && should_forward(extra_id));
+		inherit_expression_dependencies(id, value_id);
+		inherit_expression_dependencies(id, extra_id);
+	}
+	else
+		emit_binary_func_op(result_type, id, value_id, extra_id, func_name);
+}
+
+// Translate an OpGroupNonUniform* instruction into the corresponding OpenCL sub_group_* builtin call.
+// Sets the needs_subgroup_* flag for any extension the call relies on; throws on unsupported ops or scopes.
+void CompilerOpenCL::emit_subgroup_op(const Instruction &i)
+{
+	const uint32_t *ops = stream(i);
+	auto op = static_cast<Op>(i.op);
+
+	if (!opencl_options.enable_subgroups)
+		SPIRV_CROSS_THROW("Subgroup operations require enable_subgroups option.");
+
+	// Validate scope is Subgroup. The KHR quad all/any ops are exempt from this check.
+	if (op != OpGroupNonUniformQuadAllKHR && op != OpGroupNonUniformQuadAnyKHR)
+	{
+		auto scope = static_cast<Scope>(evaluate_constant_u32(ops[2]));
+		if (scope != ScopeSubgroup)
+			SPIRV_CROSS_THROW("Only subgroup scope is supported.");
+	}
+
+	uint32_t result_type = ops[0];
+	uint32_t id = ops[1];
+
+	// If we need to do implicit bitcasts, make sure we do it with the correct type.
+	uint32_t integer_width = get_integer_width_for_instruction(i);
+	auto int_type = to_signed_basetype(integer_width);
+	auto uint_type = to_unsigned_basetype(integer_width);
+
+	// Helper to set an extension flag and trigger recompile if newly needed.
+	auto require_extension = [this](bool &flag)
+	{
+		if (!flag)
+		{
+			flag = true;
+			force_recompile();
+		}
+	};
+
+	switch (op)
+	{
+		// === cl_khr_subgroup_non_uniform_vote ===
+
+	case OpGroupNonUniformElect:
+		require_extension(needs_subgroup_vote);
+		emit_op(result_type, id, "sub_group_elect()", true);
+		break;
+
+	case OpGroupNonUniformAllEqual:
+	{
+		require_extension(needs_subgroup_vote);
+		auto &type = expression_type(ops[3]);
+		if (type.vecsize > 1)
+		{
+			// OpenCL sub_group_non_uniform_all_equal only accepts scalars.
+			// For vectors, decompose into per-component calls combined with &&.
+			string expr;
+			for (uint32_t c = 0; c < type.vecsize; c++)
+			{
+				if (c > 0)
+					expr += " && ";
+				string component = join(to_enclosed_expression(ops[3]), ".", "xyzw"[c]);
+				expr += join("sub_group_non_uniform_all_equal(", component, ")");
+			}
+			emit_op(result_type, id, expr, should_forward(ops[3]));
+			inherit_expression_dependencies(id, ops[3]);
+		}
+		else
+			emit_unary_func_op(result_type, id, ops[3], "sub_group_non_uniform_all_equal");
+		break;
+	}
+
+		// === cl_khr_subgroups (base): vote/broadcast, no extra extension flag ===
+
+	case OpGroupNonUniformAll:
+		emit_unary_func_op(result_type, id, ops[3], "sub_group_all");
+		break;
+
+	case OpGroupNonUniformAny:
+		emit_unary_func_op(result_type, id, ops[3], "sub_group_any");
+		break;
+
+	case OpGroupNonUniformBroadcast:
+		emit_subgroup_op_vec_binary(result_type, id, ops[3], ops[4], "sub_group_broadcast");
+		break;
+
+		// === cl_khr_subgroup_ballot ===
+
+	case OpGroupNonUniformBroadcastFirst:
+		require_extension(needs_subgroup_ballot);
+		emit_subgroup_op_vec(result_type, id, ops[3], "sub_group_broadcast_first");
+		break;
+
+	case OpGroupNonUniformBallot:
+		require_extension(needs_subgroup_ballot);
+		emit_unary_func_op(result_type, id, ops[3], "sub_group_ballot");
+		break;
+
+	case OpGroupNonUniformInverseBallot:
+		require_extension(needs_subgroup_ballot);
+		emit_unary_func_op(result_type, id, ops[3], "sub_group_inverse_ballot");
+		break;
+
+	case OpGroupNonUniformBallotBitExtract:
+		require_extension(needs_subgroup_ballot);
+		emit_binary_func_op(result_type, id, ops[3], ops[4], "sub_group_ballot_bit_extract");
+		break;
+
+	case OpGroupNonUniformBallotFindLSB:
+		require_extension(needs_subgroup_ballot);
+		emit_unary_func_op(result_type, id, ops[3], "sub_group_ballot_find_lsb");
+		break;
+
+	case OpGroupNonUniformBallotFindMSB:
+		require_extension(needs_subgroup_ballot);
+		emit_unary_func_op(result_type, id, ops[3], "sub_group_ballot_find_msb");
+		break;
+
+	case OpGroupNonUniformBallotBitCount:
+	{
+		require_extension(needs_subgroup_ballot);
+		auto operation = static_cast<GroupOperation>(ops[3]);
+		if (operation == GroupOperationReduce)
+			emit_unary_func_op(result_type, id, ops[4], "sub_group_ballot_bit_count");
+		else if (operation == GroupOperationInclusiveScan)
+			emit_unary_func_op(result_type, id, ops[4], "sub_group_ballot_inclusive_scan");
+		else if (operation == GroupOperationExclusiveScan)
+			emit_unary_func_op(result_type, id, ops[4], "sub_group_ballot_exclusive_scan");
+		else
+			SPIRV_CROSS_THROW("Invalid group operation for BallotBitCount.");
+		break;
+	}
+
+	// === Arithmetic ops (Reduce/Scan/Clustered) ===
+	// The same SPIR-V opcodes are used for base cl_khr_subgroups (Reduce/InclusiveScan/ExclusiveScan
+	// with add/min/max), cl_khr_subgroup_non_uniform_arithmetic (all ops with Reduce/Scan),
+	// and cl_khr_subgroup_clustered_reduce (ClusteredReduce).
+
+	// clang-format off
+	// OpenCL subgroup functions are scalar-only; vectors are decomposed per-component
+	// via emit_subgroup_op_vec / emit_subgroup_op_vec_binary.
+	// NOTE(review): the _CAST Reduce/Scan paths pass ops[4] whole and appear not to decompose vectors - confirm.
+#define OPENCL_SUBGROUP_ARITH(spirv_op, base_name) \
+	case OpGroupNonUniform##spirv_op: \
+	{ \
+		auto operation = static_cast<GroupOperation>(ops[3]); \
+		if (operation == GroupOperationReduce) \
+			emit_subgroup_op_vec(result_type, id, ops[4], "sub_group_reduce_" base_name); \
+		else if (operation == GroupOperationInclusiveScan) \
+			emit_subgroup_op_vec(result_type, id, ops[4], "sub_group_scan_inclusive_" base_name); \
+		else if (operation == GroupOperationExclusiveScan) \
+			emit_subgroup_op_vec(result_type, id, ops[4], "sub_group_scan_exclusive_" base_name); \
+		else if (operation == GroupOperationClusteredReduce) \
+		{ \
+			require_extension(needs_subgroup_clustered); \
+			emit_subgroup_op_vec_binary(result_type, id, ops[4], ops[5], "sub_group_clustered_reduce_" base_name); \
+		} \
+		else \
+			SPIRV_CROSS_THROW("Unsupported group operation."); \
+		break; \
+	}
+
+#define OPENCL_SUBGROUP_ARITH_CAST(spirv_op, base_name, cast_type) \
+	case OpGroupNonUniform##spirv_op: \
+	{ \
+		auto operation = static_cast<GroupOperation>(ops[3]); \
+		if (operation == GroupOperationReduce) \
+			emit_unary_func_op_cast(result_type, id, ops[4], "sub_group_reduce_" base_name, cast_type, cast_type); \
+		else if (operation == GroupOperationInclusiveScan) \
+			emit_unary_func_op_cast(result_type, id, ops[4], "sub_group_scan_inclusive_" base_name, cast_type, cast_type); \
+		else if (operation == GroupOperationExclusiveScan) \
+			emit_unary_func_op_cast(result_type, id, ops[4], "sub_group_scan_exclusive_" base_name, cast_type, cast_type); \
+		else if (operation == GroupOperationClusteredReduce) \
+		{ \
+			require_extension(needs_subgroup_clustered); \
+			emit_subgroup_op_vec_binary(result_type, id, ops[4], ops[5], "sub_group_clustered_reduce_" base_name); \
+		} \
+		else \
+			SPIRV_CROSS_THROW("Unsupported group operation."); \
+		break; \
+	}
+
+	// Non-uniform arithmetic extension ops (mul, bitwise, logical) — always require the extension
+#define OPENCL_SUBGROUP_ARITH_NU(spirv_op, nu_name) \
+	case OpGroupNonUniform##spirv_op: \
+	{ \
+		auto operation = static_cast<GroupOperation>(ops[3]); \
+		if (operation == GroupOperationReduce) \
+		{ \
+			require_extension(needs_subgroup_arithmetic); \
+			emit_subgroup_op_vec(result_type, id, ops[4], "sub_group_non_uniform_reduce_" nu_name); \
+		} \
+		else if (operation == GroupOperationInclusiveScan) \
+		{ \
+			require_extension(needs_subgroup_arithmetic); \
+			emit_subgroup_op_vec(result_type, id, ops[4], "sub_group_non_uniform_scan_inclusive_" nu_name); \
+		} \
+		else if (operation == GroupOperationExclusiveScan) \
+		{ \
+			require_extension(needs_subgroup_arithmetic); \
+			emit_subgroup_op_vec(result_type, id, ops[4], "sub_group_non_uniform_scan_exclusive_" nu_name); \
+		} \
+		else if (operation == GroupOperationClusteredReduce) \
+		{ \
+			require_extension(needs_subgroup_clustered); \
+			emit_subgroup_op_vec_binary(result_type, id, ops[4], ops[5], "sub_group_clustered_reduce_" nu_name); \
+		} \
+		else \
+			SPIRV_CROSS_THROW("Unsupported group operation."); \
+		break; \
+	}
+
+	// add/min/max: base cl_khr_subgroups for Reduce/Scan, clustered for ClusteredReduce
+	OPENCL_SUBGROUP_ARITH(FAdd, "add")
+	OPENCL_SUBGROUP_ARITH(IAdd, "add")
+	OPENCL_SUBGROUP_ARITH(FMin, "min")
+	OPENCL_SUBGROUP_ARITH(FMax, "max")
+	OPENCL_SUBGROUP_ARITH_CAST(SMin, "min", int_type)
+	OPENCL_SUBGROUP_ARITH_CAST(SMax, "max", int_type)
+	OPENCL_SUBGROUP_ARITH_CAST(UMin, "min", uint_type)
+	OPENCL_SUBGROUP_ARITH_CAST(UMax, "max", uint_type)
+
+	// mul/bitwise/logical: always require cl_khr_subgroup_non_uniform_arithmetic (or clustered)
+	OPENCL_SUBGROUP_ARITH_NU(FMul, "mul")
+	OPENCL_SUBGROUP_ARITH_NU(IMul, "mul")
+	OPENCL_SUBGROUP_ARITH_NU(BitwiseAnd, "and")
+	OPENCL_SUBGROUP_ARITH_NU(BitwiseOr, "or")
+	OPENCL_SUBGROUP_ARITH_NU(BitwiseXor, "xor")
+	OPENCL_SUBGROUP_ARITH_NU(LogicalAnd, "logical_and")
+	OPENCL_SUBGROUP_ARITH_NU(LogicalOr, "logical_or")
+	OPENCL_SUBGROUP_ARITH_NU(LogicalXor, "logical_xor")
+
+#undef OPENCL_SUBGROUP_ARITH
+#undef OPENCL_SUBGROUP_ARITH_CAST
+#undef OPENCL_SUBGROUP_ARITH_NU
+		// clang-format on
+
+		// === cl_khr_subgroup_shuffle (vector values are decomposed per component) ===
+
+	case OpGroupNonUniformShuffle:
+		require_extension(needs_subgroup_shuffle);
+		emit_subgroup_op_vec_binary(result_type, id, ops[3], ops[4], "sub_group_shuffle");
+		break;
+
+	case OpGroupNonUniformShuffleXor:
+		require_extension(needs_subgroup_shuffle);
+		emit_subgroup_op_vec_binary(result_type, id, ops[3], ops[4], "sub_group_shuffle_xor");
+		break;
+
+		// === cl_khr_subgroup_shuffle_relative (vector values are decomposed per component) ===
+
+	case OpGroupNonUniformShuffleUp:
+		require_extension(needs_subgroup_shuffle_relative);
+		emit_subgroup_op_vec_binary(result_type, id, ops[3], ops[4], "sub_group_shuffle_up");
+		break;
+
+	case OpGroupNonUniformShuffleDown:
+		require_extension(needs_subgroup_shuffle_relative);
+		emit_subgroup_op_vec_binary(result_type, id, ops[3], ops[4], "sub_group_shuffle_down");
+		break;
+
+		// === cl_khr_subgroup_rotate: a sixth instruction word (ops[5]) selects the clustered form ===
+
+	case OpGroupNonUniformRotateKHR:
+		require_extension(needs_subgroup_rotate);
+		if (i.length > 5)
+			emit_trinary_func_op(result_type, id, ops[3], ops[4], ops[5], "sub_group_clustered_rotate");
+		else
+			emit_subgroup_op_vec_binary(result_type, id, ops[3], ops[4], "sub_group_rotate");
+		break;
+
+	default:
+		SPIRV_CROSS_THROW("Unsupported subgroup op for OpenCL.");
+	}
+}
+
 void CompilerOpenCL::emit_specialization_constants_and_structs()
 {
 	bool emitted = false;
@@ -3851,28 +4250,50 @@ void CompilerOpenCL::emit_instruction(const Instruction &instruction)
 	case OpControlBarrier:
 	{
 		// ops[0]=execution_scope, ops[1]=memory_scope, ops[2]=semantics
+		uint32_t execution_scope = evaluate_constant_u32(ops[0]);
 		uint32_t semantics = evaluate_constant_u32(ops[2]);
 		semantics = mask_relevant_memory_semantics(semantics);
 
 		flush_control_dependent_expressions(current_emitting_block->self);
 		flush_all_active_variables();
 
-		// Emit memory fence before the execution barrier if needed
-		string fence_flags = opencl_mem_fence_flags(semantics);
-		if (semantics != 0)
+		if (execution_scope == ScopeSubgroup)
 		{
-			if (opencl_options.supports_opencl_version(2, 0))
-				statement("work_group_barrier(", fence_flags, ");");
+			if (!opencl_options.enable_subgroups)
+				SPIRV_CROSS_THROW("Subgroup barriers require enable_subgroups option.");
+
+			// Subgroup barrier with memory fence flags
+			const uint32_t all_barriers =
+			    MemorySemanticsWorkgroupMemoryMask | MemorySemanticsUniformMemoryMask | MemorySemanticsImageMemoryMask;
+
+			if (semantics == 0 || (semantics & all_barriers) == all_barriers)
+			{
+				statement("sub_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);");
+			}
 			else
-				statement("barrier(", fence_flags, ");");
+			{
+				string fence_flags = opencl_mem_fence_flags(semantics);
+				statement("sub_group_barrier(", fence_flags, ");");
+			}
 		}
 		else
 		{
-			// Execution barrier with default local fence
-			if (opencl_options.supports_opencl_version(2, 0))
-				statement("work_group_barrier(CLK_LOCAL_MEM_FENCE);");
+			// Workgroup barrier
+			string fence_flags = opencl_mem_fence_flags(semantics);
+			if (semantics != 0)
+			{
+				if (opencl_options.supports_opencl_version(2, 0))
+					statement("work_group_barrier(", fence_flags, ");");
+				else
+					statement("barrier(", fence_flags, ");");
+			}
 			else
-				statement("barrier(CLK_LOCAL_MEM_FENCE);");
+			{
+				if (opencl_options.supports_opencl_version(2, 0))
+					statement("work_group_barrier(CLK_LOCAL_MEM_FENCE);");
+				else
+					statement("barrier(CLK_LOCAL_MEM_FENCE);");
+			}
 		}
 		break;
 	}
@@ -3880,6 +4301,7 @@ void CompilerOpenCL::emit_instruction(const Instruction &instruction)
 	case OpMemoryBarrier:
 	{
 		// ops[0]=memory_scope, ops[1]=semantics
+		uint32_t memory_scope = evaluate_constant_u32(ops[0]);
 		uint32_t semantics = evaluate_constant_u32(ops[1]);
 		semantics = mask_relevant_memory_semantics(semantics);
 
@@ -3888,8 +4310,30 @@ void CompilerOpenCL::emit_instruction(const Instruction &instruction)
 
 		if (semantics != 0)
 		{
-			string fence_flags = opencl_mem_fence_flags(semantics);
-			statement("mem_fence(", fence_flags, ");");
+			if (memory_scope == ScopeSubgroup)
+			{
+				if (!opencl_options.enable_subgroups)
+					SPIRV_CROSS_THROW("Subgroup memory barriers require enable_subgroups option.");
+
+				const uint32_t all_barriers = MemorySemanticsWorkgroupMemoryMask | MemorySemanticsUniformMemoryMask |
+				                              MemorySemanticsImageMemoryMask;
+
+				if ((semantics & all_barriers) == all_barriers ||
+				    (semantics & (MemorySemanticsCrossWorkgroupMemoryMask | MemorySemanticsSubgroupMemoryMask)))
+				{
+					statement("sub_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);");
+				}
+				else
+				{
+					string fence_flags = opencl_mem_fence_flags(semantics);
+					statement("sub_group_barrier(", fence_flags, ");");
+				}
+			}
+			else
+			{
+				string fence_flags = opencl_mem_fence_flags(semantics);
+				statement("mem_fence(", fence_flags, ");");
+			}
 		}
 		break;
 	}
diff --git a/spirv_opencl.hpp b/spirv_opencl.hpp
index febc62bc9..4010997fe 100644
--- a/spirv_opencl.hpp
+++ b/spirv_opencl.hpp
@@ -141,6 +141,10 @@ class CompilerOpenCL : public CompilerGLSL
 	void replace_illegal_names() override;
 	void emit_function(SPIRFunction &func, const Bitset &return_flags) override;
 	void emit_block_hints(const SPIRBlock &block) override;
+	void emit_subgroup_op(const Instruction &i) override;
+	void emit_subgroup_op_vec(uint32_t result_type, uint32_t id, uint32_t value_id, const char *func_name);
+	void emit_subgroup_op_vec_binary(uint32_t result_type, uint32_t id, uint32_t value_id, uint32_t extra_id,
+	                                 const char *func_name);
 	void emit_store_statement(uint32_t lhs_expression, uint32_t rhs_expression) override;
 	void emit_struct_member(const SPIRType &type, uint32_t member_type_id, uint32_t index,
 	                        const std::string &qualifier = "", uint32_t base_offset = 0) override;
@@ -188,6 +192,16 @@ class CompilerOpenCL : public CompilerGLSL
 	bool needs_inverse_3 = false;
 	bool needs_inverse_4 = false;
 
+	// Subgroup extension requirements discovered during emit_subgroup_op / builtin_to_glsl.
+	// These trigger force_recompile() so emit_header() can emit the correct pragmas.
+	bool needs_subgroup_vote = false;
+	bool needs_subgroup_ballot = false;
+	bool needs_subgroup_arithmetic = false;
+	bool needs_subgroup_shuffle = false;
+	bool needs_subgroup_shuffle_relative = false;
+	bool needs_subgroup_clustered = false;
+	bool needs_subgroup_rotate = false;
+
 	// Matrix type support: tracks which matrix signatures (basetype, vecsize, columns) are needed.
 	struct MatrixTypeKey
 	{
diff --git a/test_shaders.py b/test_shaders.py
index 9343d9a9d..5278a1b4a 100755
--- a/test_shaders.py
+++ b/test_shaders.py
@@ -585,30 +585,30 @@ def cross_compile_hlsl(shader, spirv, opt, force_no_external_validation, iterati
 
     return (spirv_path, hlsl_path)
 
-def path_to_opencl_standard(shader):
-    if '.cl30.' in shader:
-        return '-cl-std=CL3.0'
-    elif '.cl22.' in shader:
-        return '-cl-std=CL2.2'
-    elif '.cl21.' in shader:
-        return '-cl-std=CL2.1'
-    elif '.cl20.' in shader:
-        return '-cl-std=CL2.0'
-    else:
-        return '-cl-std=CL1.2'
-
 def path_to_opencl_standard_cli(shader):
+    # clang seems to warn about cl_khr_subgroups unless CL2.0 is specified.
+    # Revisit when OpenCL 3.0 support is no longer experimental.
+    if '.subgroups.' in shader:
+        return '200'
+    # OpenCL 3.0 support in clang is experimental and 2.1 and 2.2 seem unsupported.
     if '.cl30.' in shader:
-        return '300'
+        # return '300'
+        return '120'
     elif '.cl22.' in shader:
-        return '220'
+        # return '220'
+        return '200'
     elif '.cl21.' in shader:
-        return '210'
+        # return '210'
+        return '200'
     elif '.cl20.' in shader:
         return '200'
     else:
         return '120'
 
+def path_to_opencl_standard(shader):
+    version = path_to_opencl_standard_cli(shader)
+    return f'-cl-std=CL{version[0]}.{version[1]}'
+
 ignore_clang = False
 def validate_shader_opencl(shader, opt, paths):
     shader = reference_path(shader[0], shader[1], opt)
@@ -633,14 +633,22 @@ def validate_shader_opencl(shader, opt, paths):
     global ignore_clang
     try:
         defines = ['-D' + ext for ext in extensions]
+        if extensions:
+            exts = ['-cl-ext=' + ','.join(['+' + ext for ext in extensions])]
+        else:
+            exts = []
+        defines = ['-D' + ext for ext in extensions]
         version = path_to_opencl_standard_cli(shader)
         subprocess.check_call([paths.clang, '-Xclang',
                                path_to_opencl_standard(shader),
                                '-D__OPENCL_C_VERSION__=' + version,
-                               '-D__OPENCL_VERSION__=' + version] + defines +
+                               '-D__OPENCL_VERSION__=' + version] + defines + exts +
                               [
                                '-emit-llvm', '-target', 'spir64-unknown-unknown',
-                               '-Xclang', '-finclude-default-header', '-x', 'cl', '-c', shader])
+                               # clang may incorrectly claim that some extension pragmas are unnecessary
+                               '-Wignored-pragmas',
+                               '-Xclang', '-finclude-default-header', '-x', 'cl', '-c', shader,
+                               '-o', os.devnull])
 
     except OSError as oe:
         if (oe.errno != errno.ENOENT):   # Ignore clang not found error

From f19809203d5f3254805cdd60f4d2d5ec832dcdeb Mon Sep 17 00:00:00 2001
From: Garrick Meeker <gmeeker@gmail.com>
Date: Mon, 16 Mar 2026 15:48:18 -0700
Subject: [PATCH 11/16] OpenCL: more test fixes

---
 .../asm/comp/bda-arguments.asm.comp           |  2 +-
 .../storage-buffer-pointer-argument.asm.comp  |  2 +-
 ...riable-ssbo-array-argument.spv16.asm.comp} |  4 +-
 ...tier-1.device-argument-buffer.invalid.comp | 23 ------
 .../comp/bitcast-16bit-1.fp16.invalid.comp    | 29 ++++++++
 .../comp/bitcast-16bit-1.invalid.comp         |  0
 .../comp/bitcast-16bit-2.fp16.invalid.comp    | 40 +++++++++++
 .../comp/bitcast-16bit-2.invalid.comp         |  0
 ...riable-ssbo-array-argument.spv16.asm.comp} |  0
 ...tier-1.device-argument-buffer.invalid.comp |  9 ---
 ...comp => bitcast-16bit-1.fp16.invalid.comp} |  0
 ...comp => bitcast-16bit-2.fp16.invalid.comp} |  0
 spirv_opencl.cpp                              | 70 ++++++++++++++++++-
 test_shaders.py                               |  2 +-
 14 files changed, 141 insertions(+), 40 deletions(-)
 rename reference/shaders-opencl-no-opt/asm/comp/{variable-ssbo-array-argument.spv16.invalid.asm.comp => variable-ssbo-array-argument.spv16.asm.comp} (76%)
 delete mode 100644 reference/shaders-opencl-no-opt/comp/argument-buffer-readonly-writeonly-alias.argument.argument-tier-1.device-argument-buffer.invalid.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/bitcast-16bit-1.fp16.invalid.comp
 delete mode 100644 reference/shaders-opencl-no-opt/comp/bitcast-16bit-1.invalid.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/bitcast-16bit-2.fp16.invalid.comp
 delete mode 100644 reference/shaders-opencl-no-opt/comp/bitcast-16bit-2.invalid.comp
 rename shaders-opencl-no-opt/asm/comp/{variable-ssbo-array-argument.spv16.invalid.asm.comp => variable-ssbo-array-argument.spv16.asm.comp} (100%)
 delete mode 100644 shaders-opencl-no-opt/comp/argument-buffer-readonly-writeonly-alias.argument.argument-tier-1.device-argument-buffer.invalid.comp
 rename shaders-opencl-no-opt/comp/{bitcast-16bit-1.invalid.comp => bitcast-16bit-1.fp16.invalid.comp} (100%)
 rename shaders-opencl-no-opt/comp/{bitcast-16bit-2.invalid.comp => bitcast-16bit-2.fp16.invalid.comp} (100%)

diff --git a/reference/shaders-opencl-no-opt/asm/comp/bda-arguments.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/bda-arguments.asm.comp
index e927b1917..ee615ca3d 100644
--- a/reference/shaders-opencl-no-opt/asm/comp/bda-arguments.asm.comp
+++ b/reference/shaders-opencl-no-opt/asm/comp/bda-arguments.asm.comp
@@ -36,6 +36,6 @@ __kernel void comp_main(_16 _32)
 {
     uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
     __global _4* _28 = ((__global _4*)(_32._m0));
-    _40(_28, 40, &_28->_m0, &_28->_m1, ((__global int*)(_28->_m1)));
+    _40(_28, 40, &_28->_m0, (__global int* __global *)&_28->_m1, ((__global int*)(_28->_m1)));
 }
 
diff --git a/reference/shaders-opencl-no-opt/asm/comp/storage-buffer-pointer-argument.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/storage-buffer-pointer-argument.asm.comp
index 9a56784a5..1ea3ba362 100644
--- a/reference/shaders-opencl-no-opt/asm/comp/storage-buffer-pointer-argument.asm.comp
+++ b/reference/shaders-opencl-no-opt/asm/comp/storage-buffer-pointer-argument.asm.comp
@@ -15,7 +15,7 @@ struct SSBORead
 
 typedef struct SSBORead SSBORead;
 
-void copy_out(__global float* A_1, __global float* B_1)
+void copy_out(__global float* A_1, const __global float* B_1)
 {
     *A_1 = *B_1;
 }
diff --git a/reference/shaders-opencl-no-opt/asm/comp/variable-ssbo-array-argument.spv16.invalid.asm.comp b/reference/shaders-opencl-no-opt/asm/comp/variable-ssbo-array-argument.spv16.asm.comp
similarity index 76%
rename from reference/shaders-opencl-no-opt/asm/comp/variable-ssbo-array-argument.spv16.invalid.asm.comp
rename to reference/shaders-opencl-no-opt/asm/comp/variable-ssbo-array-argument.spv16.asm.comp
index 2b20027cd..e4d6b5107 100644
--- a/reference/shaders-opencl-no-opt/asm/comp/variable-ssbo-array-argument.spv16.invalid.asm.comp
+++ b/reference/shaders-opencl-no-opt/asm/comp/variable-ssbo-array-argument.spv16.asm.comp
@@ -8,7 +8,7 @@ struct _3
 
 typedef struct _3 _3;
 
-void _20(__global uchar* _21[16])
+void _20(__global uchar (*_21)[16])
 {
     (*_21)[2u] = (uchar)(0);
 }
@@ -16,6 +16,6 @@ void _20(__global uchar* _21[16])
 __attribute__((reqd_work_group_size(16, 1, 1)))
 __kernel void comp_main(__global uchar* _2)
 {
-    _20(&_2[0]);
+    _20((__global uchar (*)[16])&_2[0]);
 }
 
diff --git a/reference/shaders-opencl-no-opt/comp/argument-buffer-readonly-writeonly-alias.argument.argument-tier-1.device-argument-buffer.invalid.comp b/reference/shaders-opencl-no-opt/comp/argument-buffer-readonly-writeonly-alias.argument.argument-tier-1.device-argument-buffer.invalid.comp
deleted file mode 100644
index 497606109..000000000
--- a/reference/shaders-opencl-no-opt/comp/argument-buffer-readonly-writeonly-alias.argument.argument-tier-1.device-argument-buffer.invalid.comp
+++ /dev/null
@@ -1,23 +0,0 @@
-// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
-
-
-struct D
-{
-    float data_d[1];
-};
-
-typedef struct D D;
-
-struct A
-{
-    float data_a[1];
-};
-
-typedef struct A A;
-
-__attribute__((reqd_work_group_size(1, 1, 1)))
-__kernel void comp_main(__global float* d, __global const float* a)
-{
-    d[0][0] = a[0][0];
-}
-
diff --git a/reference/shaders-opencl-no-opt/comp/bitcast-16bit-1.fp16.invalid.comp b/reference/shaders-opencl-no-opt/comp/bitcast-16bit-1.fp16.invalid.comp
new file mode 100644
index 000000000..c01ac818c
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/bitcast-16bit-1.fp16.invalid.comp
@@ -0,0 +1,29 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+struct SSBO0
+{
+    short4 inputs[1];
+};
+
+typedef struct SSBO0 SSBO0;
+
+struct SSBO1
+{
+    int4 outputs[1];
+};
+
+typedef struct SSBO1 SSBO1;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global short4* _25, __global int4* _39)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    uint ident = ((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x;
+    half2 a = as_half2(_25[ident].xy);
+    _39[ident][0u] = as_int(as_uint(a + (half2)(half(1.0))));
+    _39[ident][1u] = as_int(_25[ident].zw);
+    _39[ident][2u] = as_int(as_uint(as_ushort2(_25[ident].xy)));
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/bitcast-16bit-1.invalid.comp b/reference/shaders-opencl-no-opt/comp/bitcast-16bit-1.invalid.comp
deleted file mode 100644
index e69de29bb..000000000
diff --git a/reference/shaders-opencl-no-opt/comp/bitcast-16bit-2.fp16.invalid.comp b/reference/shaders-opencl-no-opt/comp/bitcast-16bit-2.fp16.invalid.comp
new file mode 100644
index 000000000..bca3c2996
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/bitcast-16bit-2.fp16.invalid.comp
@@ -0,0 +1,40 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+struct SSBO1
+{
+    short4 outputs[1];
+};
+
+typedef struct SSBO1 SSBO1;
+
+struct SSBO0
+{
+    int4 inputs[1];
+};
+
+typedef struct SSBO0 SSBO0;
+
+struct UBO
+{
+    half4 const0;
+};
+
+typedef struct UBO UBO;
+
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__kernel void comp_main(__global short4* _21, __global int4* _29, UBO _40)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    uint ident_1 = ((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x;
+    int _33 = _29[ident_1][0u];
+    short2 _47 = as_short2(_33) + as_short2(_40.const0.xy);
+    _21[ident_1][0u] = _47.x;
+    _21[ident_1][1u] = _47.y;
+    int _57 = _29[ident_1][1u];
+    short2 _67 = as_short2(as_ushort2(as_uint(_57)) - as_ushort2(_40.const0.zw));
+    _21[ident_1][2u] = _67.x;
+    _21[ident_1][3u] = _67.y;
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/bitcast-16bit-2.invalid.comp b/reference/shaders-opencl-no-opt/comp/bitcast-16bit-2.invalid.comp
deleted file mode 100644
index e69de29bb..000000000
diff --git a/shaders-opencl-no-opt/asm/comp/variable-ssbo-array-argument.spv16.invalid.asm.comp b/shaders-opencl-no-opt/asm/comp/variable-ssbo-array-argument.spv16.asm.comp
similarity index 100%
rename from shaders-opencl-no-opt/asm/comp/variable-ssbo-array-argument.spv16.invalid.asm.comp
rename to shaders-opencl-no-opt/asm/comp/variable-ssbo-array-argument.spv16.asm.comp
diff --git a/shaders-opencl-no-opt/comp/argument-buffer-readonly-writeonly-alias.argument.argument-tier-1.device-argument-buffer.invalid.comp b/shaders-opencl-no-opt/comp/argument-buffer-readonly-writeonly-alias.argument.argument-tier-1.device-argument-buffer.invalid.comp
deleted file mode 100644
index f5f05a1ae..000000000
--- a/shaders-opencl-no-opt/comp/argument-buffer-readonly-writeonly-alias.argument.argument-tier-1.device-argument-buffer.invalid.comp
+++ /dev/null
@@ -1,9 +0,0 @@
-#version 450
-#extension GL_EXT_nonuniform_qualifier : require
-
-layout (binding = 0) readonly buffer A {float data_a[];} a[];
-layout (binding = 0) writeonly buffer D {float data_d[];} d[];
-
-void main() {
-	d[gl_WorkGroupID.x].data_d[0] = a[gl_WorkGroupID.x].data_a[0];
-}
diff --git a/shaders-opencl-no-opt/comp/bitcast-16bit-1.invalid.comp b/shaders-opencl-no-opt/comp/bitcast-16bit-1.fp16.invalid.comp
similarity index 100%
rename from shaders-opencl-no-opt/comp/bitcast-16bit-1.invalid.comp
rename to shaders-opencl-no-opt/comp/bitcast-16bit-1.fp16.invalid.comp
diff --git a/shaders-opencl-no-opt/comp/bitcast-16bit-2.invalid.comp b/shaders-opencl-no-opt/comp/bitcast-16bit-2.fp16.invalid.comp
similarity index 100%
rename from shaders-opencl-no-opt/comp/bitcast-16bit-2.invalid.comp
rename to shaders-opencl-no-opt/comp/bitcast-16bit-2.fp16.invalid.comp
diff --git a/spirv_opencl.cpp b/spirv_opencl.cpp
index 806bf7497..6b902517d 100644
--- a/spirv_opencl.cpp
+++ b/spirv_opencl.cpp
@@ -2822,7 +2822,43 @@ std::string CompilerOpenCL::to_func_call_arg(const SPIRFunction::Parameter &call
 		}
 	}
 
-	return CompilerGLSL::to_func_call_arg(callee_param, id);
+	// If callee expects a pointer-to-array (e.g., __global uchar (*)[16]) but we have a flat
+	// pointer (e.g., from a flattened SSBO), cast the argument to the expected type.
+	auto &callee_type = expression_type(callee_param.id);
+	if (is_pointer(callee_type) && !callee_type.array.empty())
+	{
+		auto addr_space = get_type_address_space(callee_type, callee_param.id);
+		const auto *pointee = &get<SPIRType>(callee_type.parent_type);
+		while (is_pointer(*pointee))
+			pointee = &get<SPIRType>(pointee->parent_type);
+		string base = type_to_glsl(*pointee, callee_param.id);
+		string array_dims = type_to_array_glsl(callee_type, callee_param.id);
+		string cast_type = (!addr_space.empty() ? addr_space + " " : "") + base + " (*)" + array_dims;
+		return join("(", cast_type, ")", to_pointer_expression(id));
+	}
+
+	// Get the base class result (handles to_pointer_expression for buffer/physical pointers).
+	auto result = CompilerGLSL::to_func_call_arg(callee_param, id);
+
+	// BDA pointer-to-pointer mismatch: struct members store BDA pointers as ulong
+	// (emit_struct_member), so taking &member gives ulong* in C, not the expected
+	// pointer-to-pointer type. Add a cast to the callee's parameter type.
+	// Skip function parameters — they already have the correct pointer type.
+	if (is_pointer(callee_type) && callee_type.storage == StorageClassPhysicalStorageBuffer)
+	{
+		auto &pointee = get<SPIRType>(callee_type.parent_type);
+		if (is_pointer(pointee) && pointee.storage == StorageClassPhysicalStorageBuffer)
+		{
+			auto *var = maybe_get<SPIRVariable>(id);
+			if (!var || !var->parameter)
+			{
+				auto cast_type = type_to_glsl(callee_type, callee_param.id);
+				return join("(", cast_type, ")", result);
+			}
+		}
+	}
+
+	return result;
 }
 
 std::string CompilerOpenCL::entry_point_args(bool append_comma)
@@ -2986,8 +3022,36 @@ void CompilerOpenCL::emit_function_prototype(SPIRFunction &func, const Bitset &r
 
 		// OpenCL C has no in/out/inout qualifiers — skip direction prefix from argument_decl.
 		auto &arg_type = expression_type(arg.id);
-		decl += to_qualifiers_glsl(arg.id);
-		decl += variable_decl(arg_type, to_name(arg.id), arg.id);
+
+		// For StorageBuffer/Uniform pointer params that are never written, add const
+		// to match the constness of NonWritable kernel parameters at call sites.
+		bool is_readonly_ptr =
+		    is_pointer(arg_type) && arg.write_count == 0 &&
+		    (arg_type.storage == StorageClassStorageBuffer || arg_type.storage == StorageClassUniform);
+
+		// Pointer-to-array parameters need special C syntax: "T (*name)[N]" not "T* name[N]".
+		// "T* name[N]" in C means "array of N pointers to T", which is wrong.
+		if (is_pointer(arg_type) && !arg_type.array.empty())
+		{
+			auto addr_space = get_type_address_space(arg_type, arg.id);
+			const auto *pointee = &get<SPIRType>(arg_type.parent_type);
+			while (is_pointer(*pointee))
+				pointee = &get<SPIRType>(pointee->parent_type);
+			string base = type_to_glsl(*pointee, arg.id);
+			string restrict_kw = to_restrict(arg.id, true);
+			if (!addr_space.empty())
+				decl += addr_space + " ";
+			if (is_readonly_ptr)
+				decl += "const ";
+			decl += base + " (*" + restrict_kw + to_name(arg.id) + ")" + type_to_array_glsl(arg_type, arg.id);
+		}
+		else
+		{
+			if (is_readonly_ptr)
+				decl += "const ";
+			decl += to_qualifiers_glsl(arg.id);
+			decl += variable_decl(arg_type, to_name(arg.id), arg.id);
+		}
 
 		if (&arg != &func.arguments.back())
 			decl += ", ";
diff --git a/test_shaders.py b/test_shaders.py
index 5278a1b4a..309fefd10 100755
--- a/test_shaders.py
+++ b/test_shaders.py
@@ -646,7 +646,7 @@ def validate_shader_opencl(shader, opt, paths):
                               [
                                '-emit-llvm', '-target', 'spir64-unknown-unknown',
                                # clang may incorrectly claim that some extension pragmas are unnecessary
-                               '-Wignored-pragmas',
+                               '-Wno-ignored-pragmas',
                                '-Xclang', '-finclude-default-header', '-x', 'cl', '-c', shader,
                                '-o', os.devnull])
 

From 0c6ec1ae102266db1cfe27ad757adba5e495352c Mon Sep 17 00:00:00 2001
From: Garrick Meeker <gmeeker@gmail.com>
Date: Mon, 16 Mar 2026 17:19:33 -0700
Subject: [PATCH 12/16] OpenCL: fixing 16-bit float tests; suppress
 cl_khr_3d_image_writes pragma

---
 ...fp16.invalid.comp => bitcast-16bit-1.fp16.comp} |  2 +-
 ...fp16.invalid.comp => bitcast-16bit-2.fp16.comp} |  0
 .../comp/{int64.invalid.comp => int64.comp}        |  0
 ...comp => struct-packing-scalar.nocompat.vk.comp} |  0
 ...subgroups-arithmetic.nocompat.vk.subgroups.comp |  1 -
 .../subgroups-ballot.nocompat.vk.subgroups.comp    |  1 -
 .../subgroups-basic.nocompat.vk.subgroups.comp     |  1 -
 .../subgroups-clustered.nocompat.vk.subgroups.comp |  1 -
 .../subgroups-rotate.nocompat.vk.subgroups.comp    |  1 -
 ...ups-shuffle-relative.nocompat.vk.subgroups.comp |  1 -
 .../subgroups-shuffle.nocompat.vk.subgroups.comp   |  1 -
 .../comp/subgroups-vote.nocompat.vk.subgroups.comp |  1 -
 ...fp16.invalid.comp => bitcast-16bit-1.fp16.comp} |  0
 ...fp16.invalid.comp => bitcast-16bit-2.fp16.comp} |  0
 .../comp/{int64.invalid.comp => int64.comp}        |  0
 ...comp => struct-packing-scalar.nocompat.vk.comp} |  0
 spirv_glsl.cpp                                     | 14 ++++++++++----
 spirv_opencl.cpp                                   | 12 +++++++++++-
 spirv_opencl.hpp                                   |  5 ++++-
 19 files changed, 26 insertions(+), 15 deletions(-)
 rename reference/shaders-opencl-no-opt/comp/{bitcast-16bit-1.fp16.invalid.comp => bitcast-16bit-1.fp16.comp} (91%)
 rename reference/shaders-opencl-no-opt/comp/{bitcast-16bit-2.fp16.invalid.comp => bitcast-16bit-2.fp16.comp} (100%)
 rename reference/shaders-opencl-no-opt/comp/{int64.invalid.comp => int64.comp} (100%)
 rename reference/shaders-opencl-no-opt/comp/{struct-packing-scalar.nocompat.invalid.vk.comp => struct-packing-scalar.nocompat.vk.comp} (100%)
 rename shaders-opencl-no-opt/comp/{bitcast-16bit-1.fp16.invalid.comp => bitcast-16bit-1.fp16.comp} (100%)
 rename shaders-opencl-no-opt/comp/{bitcast-16bit-2.fp16.invalid.comp => bitcast-16bit-2.fp16.comp} (100%)
 rename shaders-opencl-no-opt/comp/{int64.invalid.comp => int64.comp} (100%)
 rename shaders-opencl-no-opt/comp/{struct-packing-scalar.nocompat.invalid.vk.comp => struct-packing-scalar.nocompat.vk.comp} (100%)

diff --git a/reference/shaders-opencl-no-opt/comp/bitcast-16bit-1.fp16.invalid.comp b/reference/shaders-opencl-no-opt/comp/bitcast-16bit-1.fp16.comp
similarity index 91%
rename from reference/shaders-opencl-no-opt/comp/bitcast-16bit-1.fp16.invalid.comp
rename to reference/shaders-opencl-no-opt/comp/bitcast-16bit-1.fp16.comp
index c01ac818c..0c6516642 100644
--- a/reference/shaders-opencl-no-opt/comp/bitcast-16bit-1.fp16.invalid.comp
+++ b/reference/shaders-opencl-no-opt/comp/bitcast-16bit-1.fp16.comp
@@ -22,7 +22,7 @@ __kernel void comp_main(__global short4* _25, __global int4* _39)
     uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
     uint ident = ((uint3)(get_global_id(0), get_global_id(1), get_global_id(2))).x;
     half2 a = as_half2(_25[ident].xy);
-    _39[ident][0u] = as_int(as_uint(a + (half2)(half(1.0))));
+    _39[ident][0u] = as_int(as_uint(a + (half2)((half)(1.0))));
     _39[ident][1u] = as_int(_25[ident].zw);
     _39[ident][2u] = as_int(as_uint(as_ushort2(_25[ident].xy)));
 }
diff --git a/reference/shaders-opencl-no-opt/comp/bitcast-16bit-2.fp16.invalid.comp b/reference/shaders-opencl-no-opt/comp/bitcast-16bit-2.fp16.comp
similarity index 100%
rename from reference/shaders-opencl-no-opt/comp/bitcast-16bit-2.fp16.invalid.comp
rename to reference/shaders-opencl-no-opt/comp/bitcast-16bit-2.fp16.comp
diff --git a/reference/shaders-opencl-no-opt/comp/int64.invalid.comp b/reference/shaders-opencl-no-opt/comp/int64.comp
similarity index 100%
rename from reference/shaders-opencl-no-opt/comp/int64.invalid.comp
rename to reference/shaders-opencl-no-opt/comp/int64.comp
diff --git a/reference/shaders-opencl-no-opt/comp/struct-packing-scalar.nocompat.invalid.vk.comp b/reference/shaders-opencl-no-opt/comp/struct-packing-scalar.nocompat.vk.comp
similarity index 100%
rename from reference/shaders-opencl-no-opt/comp/struct-packing-scalar.nocompat.invalid.vk.comp
rename to reference/shaders-opencl-no-opt/comp/struct-packing-scalar.nocompat.vk.comp
diff --git a/reference/shaders-opencl-no-opt/comp/subgroups-arithmetic.nocompat.vk.subgroups.comp b/reference/shaders-opencl-no-opt/comp/subgroups-arithmetic.nocompat.vk.subgroups.comp
index 916168719..be962897e 100644
--- a/reference/shaders-opencl-no-opt/comp/subgroups-arithmetic.nocompat.vk.subgroups.comp
+++ b/reference/shaders-opencl-no-opt/comp/subgroups-arithmetic.nocompat.vk.subgroups.comp
@@ -1,6 +1,5 @@
 // Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
 
-#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
 #pragma OPENCL EXTENSION cl_khr_subgroups : enable
 #pragma OPENCL EXTENSION cl_khr_subgroup_non_uniform_arithmetic : enable
 
diff --git a/reference/shaders-opencl-no-opt/comp/subgroups-ballot.nocompat.vk.subgroups.comp b/reference/shaders-opencl-no-opt/comp/subgroups-ballot.nocompat.vk.subgroups.comp
index c7d53554c..742c27dc9 100644
--- a/reference/shaders-opencl-no-opt/comp/subgroups-ballot.nocompat.vk.subgroups.comp
+++ b/reference/shaders-opencl-no-opt/comp/subgroups-ballot.nocompat.vk.subgroups.comp
@@ -1,6 +1,5 @@
 // Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
 
-#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
 #pragma OPENCL EXTENSION cl_khr_subgroups : enable
 #pragma OPENCL EXTENSION cl_khr_subgroup_ballot : enable
 
diff --git a/reference/shaders-opencl-no-opt/comp/subgroups-basic.nocompat.vk.subgroups.comp b/reference/shaders-opencl-no-opt/comp/subgroups-basic.nocompat.vk.subgroups.comp
index e4921be88..ab81e408a 100644
--- a/reference/shaders-opencl-no-opt/comp/subgroups-basic.nocompat.vk.subgroups.comp
+++ b/reference/shaders-opencl-no-opt/comp/subgroups-basic.nocompat.vk.subgroups.comp
@@ -1,6 +1,5 @@
 // Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
 
-#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
 #pragma OPENCL EXTENSION cl_khr_subgroups : enable
 
 struct SSBO
diff --git a/reference/shaders-opencl-no-opt/comp/subgroups-clustered.nocompat.vk.subgroups.comp b/reference/shaders-opencl-no-opt/comp/subgroups-clustered.nocompat.vk.subgroups.comp
index 10a67ecce..9f44352d9 100644
--- a/reference/shaders-opencl-no-opt/comp/subgroups-clustered.nocompat.vk.subgroups.comp
+++ b/reference/shaders-opencl-no-opt/comp/subgroups-clustered.nocompat.vk.subgroups.comp
@@ -1,6 +1,5 @@
 // Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
 
-#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
 #pragma OPENCL EXTENSION cl_khr_subgroups : enable
 #pragma OPENCL EXTENSION cl_khr_subgroup_clustered_reduce : enable
 
diff --git a/reference/shaders-opencl-no-opt/comp/subgroups-rotate.nocompat.vk.subgroups.comp b/reference/shaders-opencl-no-opt/comp/subgroups-rotate.nocompat.vk.subgroups.comp
index d97431603..c9462bad4 100644
--- a/reference/shaders-opencl-no-opt/comp/subgroups-rotate.nocompat.vk.subgroups.comp
+++ b/reference/shaders-opencl-no-opt/comp/subgroups-rotate.nocompat.vk.subgroups.comp
@@ -1,6 +1,5 @@
 // Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
 
-#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
 #pragma OPENCL EXTENSION cl_khr_subgroups : enable
 #pragma OPENCL EXTENSION cl_khr_subgroup_rotate : enable
 
diff --git a/reference/shaders-opencl-no-opt/comp/subgroups-shuffle-relative.nocompat.vk.subgroups.comp b/reference/shaders-opencl-no-opt/comp/subgroups-shuffle-relative.nocompat.vk.subgroups.comp
index 7c076e911..ddfb30d8c 100644
--- a/reference/shaders-opencl-no-opt/comp/subgroups-shuffle-relative.nocompat.vk.subgroups.comp
+++ b/reference/shaders-opencl-no-opt/comp/subgroups-shuffle-relative.nocompat.vk.subgroups.comp
@@ -1,6 +1,5 @@
 // Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
 
-#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
 #pragma OPENCL EXTENSION cl_khr_subgroups : enable
 #pragma OPENCL EXTENSION cl_khr_subgroup_shuffle_relative : enable
 
diff --git a/reference/shaders-opencl-no-opt/comp/subgroups-shuffle.nocompat.vk.subgroups.comp b/reference/shaders-opencl-no-opt/comp/subgroups-shuffle.nocompat.vk.subgroups.comp
index 5c032dda2..0910d63ef 100644
--- a/reference/shaders-opencl-no-opt/comp/subgroups-shuffle.nocompat.vk.subgroups.comp
+++ b/reference/shaders-opencl-no-opt/comp/subgroups-shuffle.nocompat.vk.subgroups.comp
@@ -1,6 +1,5 @@
 // Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
 
-#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
 #pragma OPENCL EXTENSION cl_khr_subgroups : enable
 #pragma OPENCL EXTENSION cl_khr_subgroup_shuffle : enable
 
diff --git a/reference/shaders-opencl-no-opt/comp/subgroups-vote.nocompat.vk.subgroups.comp b/reference/shaders-opencl-no-opt/comp/subgroups-vote.nocompat.vk.subgroups.comp
index 63276058c..7c872b2e7 100644
--- a/reference/shaders-opencl-no-opt/comp/subgroups-vote.nocompat.vk.subgroups.comp
+++ b/reference/shaders-opencl-no-opt/comp/subgroups-vote.nocompat.vk.subgroups.comp
@@ -1,6 +1,5 @@
 // Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
 
-#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
 #pragma OPENCL EXTENSION cl_khr_subgroups : enable
 #pragma OPENCL EXTENSION cl_khr_subgroup_non_uniform_vote : enable
 
diff --git a/shaders-opencl-no-opt/comp/bitcast-16bit-1.fp16.invalid.comp b/shaders-opencl-no-opt/comp/bitcast-16bit-1.fp16.comp
similarity index 100%
rename from shaders-opencl-no-opt/comp/bitcast-16bit-1.fp16.invalid.comp
rename to shaders-opencl-no-opt/comp/bitcast-16bit-1.fp16.comp
diff --git a/shaders-opencl-no-opt/comp/bitcast-16bit-2.fp16.invalid.comp b/shaders-opencl-no-opt/comp/bitcast-16bit-2.fp16.comp
similarity index 100%
rename from shaders-opencl-no-opt/comp/bitcast-16bit-2.fp16.invalid.comp
rename to shaders-opencl-no-opt/comp/bitcast-16bit-2.fp16.comp
diff --git a/shaders-opencl-no-opt/comp/int64.invalid.comp b/shaders-opencl-no-opt/comp/int64.comp
similarity index 100%
rename from shaders-opencl-no-opt/comp/int64.invalid.comp
rename to shaders-opencl-no-opt/comp/int64.comp
diff --git a/shaders-opencl-no-opt/comp/struct-packing-scalar.nocompat.invalid.vk.comp b/shaders-opencl-no-opt/comp/struct-packing-scalar.nocompat.vk.comp
similarity index 100%
rename from shaders-opencl-no-opt/comp/struct-packing-scalar.nocompat.invalid.vk.comp
rename to shaders-opencl-no-opt/comp/struct-packing-scalar.nocompat.vk.comp
diff --git a/spirv_glsl.cpp b/spirv_glsl.cpp
index e782463c7..bd2bd67a1 100644
--- a/spirv_glsl.cpp
+++ b/spirv_glsl.cpp
@@ -6350,11 +6350,14 @@ string CompilerGLSL::convert_half_to_string(const SPIRConstant &c, uint32_t col,
 		type.columns = 1;
 
 		if (float_value == numeric_limits<float>::infinity())
-			res = join(type_to_glsl(type), "(1.0 / 0.0)");
+			res = backend.c_style_casts ? join("(", type_to_glsl(type), ")(1.0 / 0.0)") :
+			                              join(type_to_glsl(type), "(1.0 / 0.0)");
 		else if (float_value == -numeric_limits<float>::infinity())
-			res = join(type_to_glsl(type), "(-1.0 / 0.0)");
+			res = backend.c_style_casts ? join("(", type_to_glsl(type), ")(-1.0 / 0.0)") :
+			                              join(type_to_glsl(type), "(-1.0 / 0.0)");
 		else if (std::isnan(float_value))
-			res = join(type_to_glsl(type), "(0.0 / 0.0)");
+			res = backend.c_style_casts ? join("(", type_to_glsl(type), ")(0.0 / 0.0)") :
+			                              join(type_to_glsl(type), "(0.0 / 0.0)");
 		else
 			SPIRV_CROSS_THROW("Cannot represent non-finite floating point constant.");
 	}
@@ -6364,7 +6367,10 @@ string CompilerGLSL::convert_half_to_string(const SPIRConstant &c, uint32_t col,
 		type.basetype = is_bfloat8 ? SPIRType::FloatE5M2 : SPIRType::Half;
 		type.vecsize = 1;
 		type.columns = 1;
-		res = join(type_to_glsl(type), "(", format_float(float_value), ")");
+		if (backend.c_style_casts)
+			res = join("(", type_to_glsl(type), ")(", format_float(float_value), ")");
+		else
+			res = join(type_to_glsl(type), "(", format_float(float_value), ")");
 	}
 
 	return res;
diff --git a/spirv_opencl.cpp b/spirv_opencl.cpp
index 6b902517d..0bab1dcec 100644
--- a/spirv_opencl.cpp
+++ b/spirv_opencl.cpp
@@ -169,7 +169,9 @@ void CompilerOpenCL::emit_header()
 	statement("// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)");
 	statement("");
 
-	if (opencl_options.opencl_version >= 200)
+	// cl_khr_3d_image_writes is a core feature in OpenCL 2.x (no pragma needed).
+	// For OpenCL < 2.0 or >= 3.0, emit the pragma only when the shader writes to a 3D image.
+	if (needs_3d_image_writes && (opencl_options.opencl_version < 200 || opencl_options.opencl_version >= 300))
 		statement("#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable");
 	if (opencl_options.enable_fp16)
 		statement("#pragma OPENCL EXTENSION cl_khr_fp16 : enable");
@@ -5298,6 +5300,14 @@ void CompilerOpenCL::emit_instruction(const Instruction &instruction)
 		uint32_t coord_id = ops[1];
 		uint32_t texel_id = ops[2];
 
+		// Track if we write to a 3D image (needs cl_khr_3d_image_writes pragma).
+		auto &img_type = expression_type(image_id);
+		if (img_type.image.dim == Dim3D && !needs_3d_image_writes)
+		{
+			needs_3d_image_writes = true;
+			force_recompile();
+		}
+
 		// Unset NonWritable so the variable can be written (mirroring GLSL backend).
 		auto *image_var = maybe_get_backing_variable(image_id);
 		if (image_var)
diff --git a/spirv_opencl.hpp b/spirv_opencl.hpp
index 4010997fe..fc3962ddf 100644
--- a/spirv_opencl.hpp
+++ b/spirv_opencl.hpp
@@ -192,8 +192,11 @@ class CompilerOpenCL : public CompilerGLSL
 	bool needs_inverse_3 = false;
 	bool needs_inverse_4 = false;
 
-	// Subgroup extension requirements discovered during emit_subgroup_op / builtin_to_glsl.
+	// Extension requirements discovered during instruction emission.
 	// These trigger force_recompile() so emit_header() can emit the correct pragmas.
+	bool needs_3d_image_writes = false;
+
+	// Subgroup extension requirements discovered during emit_subgroup_op / builtin_to_glsl.
 	bool needs_subgroup_vote = false;
 	bool needs_subgroup_ballot = false;
 	bool needs_subgroup_arithmetic = false;

From 7969b8b90cfba3a07fb26a665d28542e2764dad7 Mon Sep 17 00:00:00 2001
From: Garrick Meeker <gmeeker@gmail.com>
Date: Tue, 17 Mar 2026 08:15:58 -0700
Subject: [PATCH 13/16] OpenCL: Support for subgroups emulation.

---
 main.cpp          |    4 +
 spirv_cross_c.cpp |    3 +
 spirv_cross_c.h   |    1 +
 spirv_opencl.cpp  | 1797 +++++++++++++++++++++++++++++++++++++++++----
 spirv_opencl.hpp  |   16 +
 test_shaders.py   |   16 +-
 6 files changed, 1669 insertions(+), 168 deletions(-)

diff --git a/main.cpp b/main.cpp
index a53f5e758..bb8163b70 100644
--- a/main.cpp
+++ b/main.cpp
@@ -785,6 +785,7 @@ struct CLIArguments
 	bool opencl_enable_subgroups_all = false;
 	bool opencl_emulate_subgroups = false;
 	uint32_t opencl_fixed_subgroup_size = 0;
+	uint32_t opencl_max_workgroup_size = 256;
 };
 
 static void print_version()
@@ -1371,6 +1372,7 @@ static string compile_iteration(const CLIArguments &args, std::vector<uint32_t>
 		ocl_opts.enable_subgroups_all = args.opencl_enable_subgroups_all;
 		ocl_opts.emulate_subgroups = args.opencl_emulate_subgroups;
 		ocl_opts.fixed_subgroup_size = args.opencl_fixed_subgroup_size;
+		ocl_opts.max_workgroup_size = args.opencl_max_workgroup_size;
 		ocl_comp->set_opencl_options(ocl_opts);
 	}
 	else if (args.hlsl)
@@ -2009,6 +2011,8 @@ static int main_inner(int argc, char *argv[])
 	cbs.add("--opencl-emulate-subgroups", [&args](CLIParser &) { args.opencl_emulate_subgroups = true; });
 	cbs.add("--opencl-fixed-subgroup-size",
 	        [&args](CLIParser &parser) { args.opencl_fixed_subgroup_size = parser.next_uint(); });
+	cbs.add("--opencl-max-workgroup-size",
+	        [&args](CLIParser &parser) { args.opencl_max_workgroup_size = parser.next_uint(); });
 	cbs.add("--extension", [&args](CLIParser &parser) { args.extensions.push_back(parser.next_string()); });
 	cbs.add("--rename-entry-point",
 	        [&args](CLIParser &parser)
diff --git a/spirv_cross_c.cpp b/spirv_cross_c.cpp
index 1146f92d0..ede1f3f9a 100644
--- a/spirv_cross_c.cpp
+++ b/spirv_cross_c.cpp
@@ -833,6 +833,9 @@ spvc_result spvc_compiler_options_set_uint(spvc_compiler_options options, spvc_c
 	case SPVC_COMPILER_OPTION_OPENCL_FIXED_SUBGROUP_SIZE:
 		options->opencl.fixed_subgroup_size = value;
 		break;
+	case SPVC_COMPILER_OPTION_OPENCL_MAX_WORKGROUP_SIZE:
+		options->opencl.max_workgroup_size = value;
+		break;
 #endif
 
 	default:
diff --git a/spirv_cross_c.h b/spirv_cross_c.h
index e4d37ce46..b56a5635b 100644
--- a/spirv_cross_c.h
+++ b/spirv_cross_c.h
@@ -766,6 +766,7 @@ extern "C"
 		SPVC_COMPILER_OPTION_OPENCL_ENABLE_SUBGROUPS_ALL = 100 | SPVC_COMPILER_OPTION_OPENCL_BIT,
 		SPVC_COMPILER_OPTION_OPENCL_EMULATE_SUBGROUPS = 101 | SPVC_COMPILER_OPTION_OPENCL_BIT,
 		SPVC_COMPILER_OPTION_OPENCL_FIXED_SUBGROUP_SIZE = 102 | SPVC_COMPILER_OPTION_OPENCL_BIT,
+		SPVC_COMPILER_OPTION_OPENCL_MAX_WORKGROUP_SIZE = 103 | SPVC_COMPILER_OPTION_OPENCL_BIT,
 
 		SPVC_COMPILER_OPTION_INT_MAX = 0x7fffffff
 	} spvc_compiler_option;
diff --git a/spirv_opencl.cpp b/spirv_opencl.cpp
index 0bab1dcec..6bddfa00f 100644
--- a/spirv_opencl.cpp
+++ b/spirv_opencl.cpp
@@ -504,6 +504,9 @@ void CompilerOpenCL::compute_kernel_resources()
 			func_workgroup_args[kv.first] = sorted;
 		}
 	}
+
+	// Scan for subgroup emulation usage (which functions need scratch params threaded).
+	scan_subgroup_emulation_usage();
 }
 
 void CompilerOpenCL::emit_resources()
@@ -768,6 +771,9 @@ void CompilerOpenCL::emit_resources()
 		statement("");
 	}
 
+	// Subgroup emulation helper functions.
+	emit_subgroup_emulation_helpers();
+
 	// Default sampler for combined image+sampler usage (OpenCL C requires file-scope const sampler_t).
 	if (needs_default_sampler)
 	{
@@ -996,6 +1002,10 @@ void CompilerOpenCL::emit_entry_point_declarations()
 		}
 	}
 
+	// Emit subgroup emulation local variables and scratch buffers.
+	if (opencl_options.emulate_subgroups && !opencl_options.enable_subgroups)
+		emit_subgroup_emulation_entry_point_vars();
+
 	// Materialize Input builtin variables as local variables.
 	// In OpenCL C, builtins like get_global_id() are function calls, not variables.
 	// When code needs variable pointers to these builtins (either threaded to non-entry
@@ -1089,22 +1099,39 @@ string CompilerOpenCL::builtin_to_glsl(BuiltIn builtin, StorageClass storage)
 	case BuiltInGlobalSize:
 		return "((uint3)(get_global_size(0), get_global_size(1), get_global_size(2)))";
 	case BuiltInNumSubgroups:
+		if (opencl_options.emulate_subgroups && !opencl_options.enable_subgroups)
+			return "_spv_num_subgroups";
 		if (!opencl_options.enable_subgroups)
 			SPIRV_CROSS_THROW("Subgroup builtins require enable_subgroups option.");
 		return "get_num_sub_groups()";
 	case BuiltInSubgroupId:
+		if (opencl_options.emulate_subgroups && !opencl_options.enable_subgroups)
+			return "_spv_subgroup_id";
 		if (!opencl_options.enable_subgroups)
 			SPIRV_CROSS_THROW("Subgroup builtins require enable_subgroups option.");
 		return "get_sub_group_id()";
 	case BuiltInSubgroupSize:
+		if (opencl_options.emulate_subgroups && !opencl_options.enable_subgroups)
+			return "_spv_subgroup_size";
 		if (!opencl_options.enable_subgroups)
 			SPIRV_CROSS_THROW("Subgroup builtins require enable_subgroups option.");
 		return "get_sub_group_size()";
 	case BuiltInSubgroupLocalInvocationId:
+		if (opencl_options.emulate_subgroups && !opencl_options.enable_subgroups)
+			return "_spv_lane_id";
 		if (!opencl_options.enable_subgroups)
 			SPIRV_CROSS_THROW("Subgroup builtins require enable_subgroups option.");
 		return "get_sub_group_local_id()";
 	case BuiltInSubgroupEqMask:
+		if (opencl_options.emulate_subgroups && !opencl_options.enable_subgroups)
+		{
+			if (!needs_subgroup_emulation_scratch)
+			{
+				needs_subgroup_emulation_scratch = true;
+				force_recompile();
+			}
+			return "spv_subgroup_eq_mask(_spv_lane_id)";
+		}
 		if (!opencl_options.enable_subgroups)
 			SPIRV_CROSS_THROW("Subgroup builtins require enable_subgroups option.");
 		if (!needs_subgroup_ballot)
@@ -1114,6 +1141,15 @@ string CompilerOpenCL::builtin_to_glsl(BuiltIn builtin, StorageClass storage)
 		}
 		return "get_sub_group_eq_mask()";
 	case BuiltInSubgroupGeMask:
+		if (opencl_options.emulate_subgroups && !opencl_options.enable_subgroups)
+		{
+			if (!needs_subgroup_emulation_scratch)
+			{
+				needs_subgroup_emulation_scratch = true;
+				force_recompile();
+			}
+			return "spv_subgroup_ge_mask(_spv_lane_id, _spv_subgroup_size)";
+		}
 		if (!opencl_options.enable_subgroups)
 			SPIRV_CROSS_THROW("Subgroup builtins require enable_subgroups option.");
 		if (!needs_subgroup_ballot)
@@ -1123,6 +1159,15 @@ string CompilerOpenCL::builtin_to_glsl(BuiltIn builtin, StorageClass storage)
 		}
 		return "get_sub_group_ge_mask()";
 	case BuiltInSubgroupGtMask:
+		if (opencl_options.emulate_subgroups && !opencl_options.enable_subgroups)
+		{
+			if (!needs_subgroup_emulation_scratch)
+			{
+				needs_subgroup_emulation_scratch = true;
+				force_recompile();
+			}
+			return "spv_subgroup_gt_mask(_spv_lane_id, _spv_subgroup_size)";
+		}
 		if (!opencl_options.enable_subgroups)
 			SPIRV_CROSS_THROW("Subgroup builtins require enable_subgroups option.");
 		if (!needs_subgroup_ballot)
@@ -1132,6 +1177,15 @@ string CompilerOpenCL::builtin_to_glsl(BuiltIn builtin, StorageClass storage)
 		}
 		return "get_sub_group_gt_mask()";
 	case BuiltInSubgroupLeMask:
+		if (opencl_options.emulate_subgroups && !opencl_options.enable_subgroups)
+		{
+			if (!needs_subgroup_emulation_scratch)
+			{
+				needs_subgroup_emulation_scratch = true;
+				force_recompile();
+			}
+			return "spv_subgroup_le_mask(_spv_lane_id, _spv_subgroup_size)";
+		}
 		if (!opencl_options.enable_subgroups)
 			SPIRV_CROSS_THROW("Subgroup builtins require enable_subgroups option.");
 		if (!needs_subgroup_ballot)
@@ -1141,6 +1195,15 @@ string CompilerOpenCL::builtin_to_glsl(BuiltIn builtin, StorageClass storage)
 		}
 		return "get_sub_group_le_mask()";
 	case BuiltInSubgroupLtMask:
+		if (opencl_options.emulate_subgroups && !opencl_options.enable_subgroups)
+		{
+			if (!needs_subgroup_emulation_scratch)
+			{
+				needs_subgroup_emulation_scratch = true;
+				force_recompile();
+			}
+			return "spv_subgroup_lt_mask(_spv_lane_id)";
+		}
 		if (!opencl_options.enable_subgroups)
 			SPIRV_CROSS_THROW("Subgroup builtins require enable_subgroups option.");
 		if (!needs_subgroup_ballot)
@@ -3105,6 +3168,16 @@ void CompilerOpenCL::emit_function_prototype(SPIRFunction &func, const Bitset &r
 				}
 			}
 		}
+
+		// Thread subgroup emulation scratch buffers and emulation state.
+		if (needs_subgroup_emulation_scratch && funcs_using_subgroup_emulation.count(func.self))
+		{
+			if (!first_resource)
+				decl += ", ";
+			first_resource = false;
+			decl += "__local uint* _spv_subgroup_scratch, uint _spv_linear_id, "
+			        "uint _spv_subgroup_base, uint _spv_subgroup_size, uint _spv_lane_id";
+		}
 	}
 
 	decl += ")";
@@ -3159,6 +3232,16 @@ void CompilerOpenCL::append_global_func_args(const SPIRFunction &func, uint32_t
 			}
 		}
 	}
+
+	// Thread the subgroup emulation scratch buffer and state, in the same order as the prototype parameters.
+	if (needs_subgroup_emulation_scratch && funcs_using_subgroup_emulation.count(func.self))
+	{
+		arglist.push_back("_spv_subgroup_scratch");
+		arglist.push_back("_spv_linear_id");
+		arglist.push_back("_spv_subgroup_base");
+		arglist.push_back("_spv_subgroup_size");
+		arglist.push_back("_spv_lane_id");
+	}
 }
 
 void CompilerOpenCL::emit_function(SPIRFunction &func, const Bitset &return_flags)
@@ -3375,175 +3458,1536 @@ void CompilerOpenCL::emit_subgroup_op_vec_binary(uint32_t result_type, uint32_t
 	}
 }
 
-void CompilerOpenCL::emit_subgroup_op(const Instruction &i)
+uint32_t CompilerOpenCL::get_emulation_max_workgroup_size() const
 {
-	const uint32_t *ops = stream(i);
-	auto op = static_cast<Op>(i.op);
+	auto &ep = get_entry_point();
+	uint32_t x = ep.workgroup_size.x;
+	uint32_t y = ep.workgroup_size.y;
+	uint32_t z = ep.workgroup_size.z;
+	if (x != 0 && y != 0 && z != 0)
+		return x * y * z;
+	return opencl_options.max_workgroup_size;
+}
 
-	if (!opencl_options.enable_subgroups)
-		SPIRV_CROSS_THROW("Subgroup operations require enable_subgroups option.");
+string CompilerOpenCL::get_emulation_subgroup_size_expr() const
+{
+	if (opencl_options.fixed_subgroup_size != 0)
+		return to_string(opencl_options.fixed_subgroup_size) + "u";
+	else
+		return "_spv_linear_workgroup_size";
+}
 
-	// Validate scope is Subgroup
-	if (op != OpGroupNonUniformQuadAllKHR && op != OpGroupNonUniformQuadAnyKHR)
-	{
-		auto scope = static_cast<Scope>(evaluate_constant_u32(ops[2]));
-		if (scope != ScopeSubgroup)
-			SPIRV_CROSS_THROW("Only subgroup scope is supported.");
-	}
+string CompilerOpenCL::subgroup_emulation_scratch_type(bool is_64bit) const
+{
+	return is_64bit ? "ulong" : "uint";
+}
 
-	uint32_t result_type = ops[0];
-	uint32_t id = ops[1];
+void CompilerOpenCL::emit_subgroup_emulation_entry_point_vars()
+{
+	uint32_t fixed = opencl_options.fixed_subgroup_size;
 
-	// If we need to do implicit bitcasts, make sure we do it with the correct type.
-	uint32_t integer_width = get_integer_width_for_instruction(i);
-	auto int_type = to_signed_basetype(integer_width);
-	auto uint_type = to_unsigned_basetype(integer_width);
+	// Linear ID computation
+	statement("uint _spv_linear_workgroup_size = get_local_size(0) * get_local_size(1) * get_local_size(2);");
+	statement("uint _spv_linear_id = (get_local_id(2) * get_local_size(1) * get_local_size(0)) + "
+	          "(get_local_id(1) * get_local_size(0)) + get_local_id(0);");
 
-	// Helper to set an extension flag and trigger recompile if newly needed.
-	auto require_extension = [this](bool &flag)
+	if (fixed == 0)
 	{
-		if (!flag)
-		{
-			flag = true;
-			force_recompile();
-		}
-	};
+		// subgroup_size == workgroup size (one big subgroup)
+		statement("uint _spv_subgroup_size = _spv_linear_workgroup_size;");
+		statement("uint _spv_lane_id = _spv_linear_id;");
+		statement("uint _spv_subgroup_id = 0u;");
+		statement("uint _spv_num_subgroups = 1u;");
+		statement("uint _spv_subgroup_base = 0u;");
+	}
+	else if (fixed == 1)
+	{
+		// Degenerate: each invocation is its own subgroup
+		statement("uint _spv_subgroup_size = 1u;");
+		statement("uint _spv_lane_id = 0u;");
+		statement("uint _spv_subgroup_id = _spv_linear_id;");
+		statement("uint _spv_num_subgroups = _spv_linear_workgroup_size;");
+		statement("uint _spv_subgroup_base = _spv_linear_id;");
+	}
+	else
+	{
+		statement("uint _spv_subgroup_size = ", fixed, "u;");
+		statement("uint _spv_lane_id = _spv_linear_id % ", fixed, "u;");
+		statement("uint _spv_subgroup_id = _spv_linear_id / ", fixed, "u;");
+		statement("uint _spv_num_subgroups = _spv_linear_workgroup_size / ", fixed, "u;");
+		statement("uint _spv_subgroup_base = _spv_subgroup_id * ", fixed, "u;");
+	}
 
-	switch (op)
+	// Scratch buffers (only when needed)
+	if (needs_subgroup_emulation_scratch)
 	{
-		// === Task 5: cl_khr_subgroup_non_uniform_vote ===
+		uint32_t max_wg = get_emulation_max_workgroup_size();
+		statement("__local uint _spv_subgroup_scratch[", max_wg, "];");
+	}
+	if (needs_subgroup_emulation_scratch64)
+	{
+		uint32_t max_wg = get_emulation_max_workgroup_size();
+		statement("__local ulong _spv_subgroup_scratch64[", max_wg, "];");
+	}
+}
 
-	case OpGroupNonUniformElect:
-		require_extension(needs_subgroup_vote);
-		emit_op(result_type, id, "sub_group_elect()", true);
-		break;
+void CompilerOpenCL::scan_subgroup_emulation_usage()
+{
+	if (!opencl_options.emulate_subgroups || opencl_options.enable_subgroups)
+		return;
 
-	case OpGroupNonUniformAllEqual:
+	funcs_using_subgroup_emulation.clear();
+
+	// First pass: find functions that directly use subgroup ops.
+	ir.for_each_typed_id<SPIRFunction>(
+	    [&](uint32_t func_id, SPIRFunction &func)
+	    {
+		    if (func_id == ir.default_entry_point)
+			    return;
+		    for (auto block_id : func.blocks)
+		    {
+			    auto &block = get<SPIRBlock>(block_id);
+			    for (auto &insn : block.ops)
+			    {
+				    auto insn_op = static_cast<Op>(insn.op);
+				    if (insn_op >= OpGroupNonUniformElect && insn_op <= OpGroupNonUniformQuadSwap)
+				    {
+					    funcs_using_subgroup_emulation.insert(func_id);
+					    return;
+				    }
+				    if (insn_op == OpGroupNonUniformRotateKHR || insn_op == OpGroupNonUniformQuadAllKHR ||
+				        insn_op == OpGroupNonUniformQuadAnyKHR)
+				    {
+					    funcs_using_subgroup_emulation.insert(func_id);
+					    return;
+				    }
+			    }
+		    }
+	    });
+
+	// Propagate transitively through call graph.
+	bool changed = true;
+	while (changed)
 	{
-		require_extension(needs_subgroup_vote);
-		auto &type = expression_type(ops[3]);
-		if (type.vecsize > 1)
-		{
-			// OpenCL sub_group_non_uniform_all_equal only accepts scalars.
-			// For vectors, decompose into per-component calls combined with &&.
-			string expr;
-			for (uint32_t c = 0; c < type.vecsize; c++)
-			{
-				if (c > 0)
-					expr += " && ";
-				string component = join(to_enclosed_expression(ops[3]), ".", "xyzw"[c]);
-				expr += join("sub_group_non_uniform_all_equal(", component, ")");
-			}
-			emit_op(result_type, id, expr, should_forward(ops[3]));
-			inherit_expression_dependencies(id, ops[3]);
-		}
-		else
-		{
-			emit_unary_func_op(result_type, id, ops[3], "sub_group_non_uniform_all_equal");
-		}
-		break;
+		changed = false;
+		ir.for_each_typed_id<SPIRFunction>(
+		    [&](uint32_t func_id, SPIRFunction &func)
+		    {
+			    if (func_id == ir.default_entry_point)
+				    return;
+			    if (funcs_using_subgroup_emulation.count(func_id))
+				    return;
+			    for (auto block_id : func.blocks)
+			    {
+				    auto &block = get<SPIRBlock>(block_id);
+				    for (auto &insn : block.ops)
+				    {
+					    if (static_cast<Op>(insn.op) == OpFunctionCall)
+					    {
+						    const uint32_t *insn_ops = stream(insn);
+						    uint32_t callee_id = insn_ops[2];
+						    if (funcs_using_subgroup_emulation.count(callee_id))
+						    {
+							    funcs_using_subgroup_emulation.insert(func_id);
+							    changed = true;
+							    return;
+						    }
+					    }
+				    }
+			    }
+		    });
 	}
+}
 
-		// === Task 4: cl_khr_subgroups (base) — vote/broadcast ===
+void CompilerOpenCL::emit_subgroup_emulation_helpers()
+{
+	if (!opencl_options.emulate_subgroups || opencl_options.enable_subgroups)
+		return;
+	if (!needs_subgroup_emulation_scratch)
+		return;
 
-	case OpGroupNonUniformAll:
-		emit_unary_func_op(result_type, id, ops[3], "sub_group_all");
-		break;
+	// Barrier call to emit: work_group_barrier() when OpenCL 2.0 is supported, barrier() otherwise.
+	const char *barrier_call = opencl_options.supports_opencl_version(2, 0) ?
+	                               "work_group_barrier(CLK_LOCAL_MEM_FENCE)" :
+	                               "barrier(CLK_LOCAL_MEM_FENCE)";
+
+	// --- Broadcast ---
+	statement("static uint spv_emulate_broadcast_uint(__local uint* scratch, uint val, "
+	          "uint src_lane, uint linear_id, uint subgroup_base) {");
+	statement("    scratch[linear_id] = val;");
+	statement("    ", barrier_call, ";");
+	statement("    uint r = scratch[subgroup_base + src_lane];");
+	statement("    ", barrier_call, ";");
+	statement("    return r;");
+	statement("}");
+	statement("");
 
-	case OpGroupNonUniformAny:
-		emit_unary_func_op(result_type, id, ops[3], "sub_group_any");
-		break;
+	// --- BroadcastFirst (lane 0) ---
+	statement("static uint spv_emulate_broadcast_first_uint(__local uint* scratch, uint val, "
+	          "uint linear_id, uint subgroup_base) {");
+	statement("    scratch[linear_id] = val;");
+	statement("    ", barrier_call, ";");
+	statement("    uint r = scratch[subgroup_base];");
+	statement("    ", barrier_call, ";");
+	statement("    return r;");
+	statement("}");
+	statement("");
 
-	case OpGroupNonUniformBroadcast:
-		emit_subgroup_op_vec_binary(result_type, id, ops[3], ops[4], "sub_group_broadcast");
-		break;
+	// --- Shuffle ---
+	statement("static uint spv_emulate_shuffle_uint(__local uint* scratch, uint val, "
+	          "uint index, uint linear_id, uint subgroup_base) {");
+	statement("    scratch[linear_id] = val;");
+	statement("    ", barrier_call, ";");
+	statement("    uint r = scratch[subgroup_base + index];");
+	statement("    ", barrier_call, ";");
+	statement("    return r;");
+	statement("}");
+	statement("");
 
-		// === Task 6: cl_khr_subgroup_ballot ===
+	// --- ShuffleXor ---
+	statement("static uint spv_emulate_shuffle_xor_uint(__local uint* scratch, uint val, "
+	          "uint mask, uint lane_id, uint linear_id, uint subgroup_base) {");
+	statement("    scratch[linear_id] = val;");
+	statement("    ", barrier_call, ";");
+	statement("    uint r = scratch[subgroup_base + (lane_id ^ mask)];");
+	statement("    ", barrier_call, ";");
+	statement("    return r;");
+	statement("}");
+	statement("");
 
-	case OpGroupNonUniformBroadcastFirst:
-		require_extension(needs_subgroup_ballot);
-		emit_subgroup_op_vec(result_type, id, ops[3], "sub_group_broadcast_first");
-		break;
+	// --- ShuffleUp ---
+	statement("static uint spv_emulate_shuffle_up_uint(__local uint* scratch, uint val, "
+	          "uint delta, uint lane_id, uint linear_id, uint subgroup_base) {");
+	statement("    scratch[linear_id] = val;");
+	statement("    ", barrier_call, ";");
+	statement("    uint r = (lane_id >= delta) ? scratch[linear_id - delta] : val;");
+	statement("    ", barrier_call, ";");
+	statement("    return r;");
+	statement("}");
+	statement("");
 
-	case OpGroupNonUniformBallot:
-		require_extension(needs_subgroup_ballot);
-		emit_unary_func_op(result_type, id, ops[3], "sub_group_ballot");
-		break;
+	// --- ShuffleDown ---
+	statement("static uint spv_emulate_shuffle_down_uint(__local uint* scratch, uint val, "
+	          "uint delta, uint lane_id, uint linear_id, uint subgroup_size) {");
+	statement("    scratch[linear_id] = val;");
+	statement("    ", barrier_call, ";");
+	statement("    uint r = (lane_id + delta < subgroup_size) ? scratch[linear_id + delta] : val;");
+	statement("    ", barrier_call, ";");
+	statement("    return r;");
+	statement("}");
+	statement("");
 
-	case OpGroupNonUniformInverseBallot:
-		require_extension(needs_subgroup_ballot);
-		emit_unary_func_op(result_type, id, ops[3], "sub_group_inverse_ballot");
-		break;
+	// --- Rotate ---
+	statement("static uint spv_emulate_rotate_uint(__local uint* scratch, uint val, "
+	          "uint delta, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {");
+	statement("    scratch[linear_id] = val;");
+	statement("    ", barrier_call, ";");
+	statement("    uint r = scratch[subgroup_base + ((lane_id + delta) % subgroup_size)];");
+	statement("    ", barrier_call, ";");
+	statement("    return r;");
+	statement("}");
+	statement("");
 
-	case OpGroupNonUniformBallotBitExtract:
-		require_extension(needs_subgroup_ballot);
-		emit_binary_func_op(result_type, id, ops[3], ops[4], "sub_group_ballot_bit_extract");
-		break;
+	// --- Clustered rotate ---
+	statement("static uint spv_emulate_clustered_rotate_uint(__local uint* scratch, uint val, "
+	          "uint delta, uint lane_id, uint linear_id, uint subgroup_base, uint cluster_size) {");
+	statement("    scratch[linear_id] = val;");
+	statement("    ", barrier_call, ";");
+	statement("    uint cluster_base = (lane_id / cluster_size) * cluster_size;");
+	statement(
+	    "    uint r = scratch[subgroup_base + cluster_base + ((lane_id - cluster_base + delta) % cluster_size)];");
+	statement("    ", barrier_call, ";");
+	statement("    return r;");
+	statement("}");
+	statement("");
 
-	case OpGroupNonUniformBallotFindLSB:
-		require_extension(needs_subgroup_ballot);
-		emit_unary_func_op(result_type, id, ops[3], "sub_group_ballot_find_lsb");
-		break;
+	// --- Vote All ---
+	statement("static bool spv_emulate_all(__local uint* scratch, bool predicate, "
+	          "uint linear_id, uint subgroup_base, uint subgroup_size) {");
+	statement("    scratch[linear_id] = predicate ? 1u : 0u;");
+	statement("    ", barrier_call, ";");
+	statement("    bool r = true;");
+	statement("    for (uint i = 0u; i < subgroup_size; i++)");
+	statement("        r = r && (scratch[subgroup_base + i] != 0u);");
+	statement("    ", barrier_call, ";");
+	statement("    return r;");
+	statement("}");
+	statement("");
 
-	case OpGroupNonUniformBallotFindMSB:
-		require_extension(needs_subgroup_ballot);
-		emit_unary_func_op(result_type, id, ops[3], "sub_group_ballot_find_msb");
-		break;
+	// --- Vote Any ---
+	statement("static bool spv_emulate_any(__local uint* scratch, bool predicate, "
+	          "uint linear_id, uint subgroup_base, uint subgroup_size) {");
+	statement("    scratch[linear_id] = predicate ? 1u : 0u;");
+	statement("    ", barrier_call, ";");
+	statement("    bool r = false;");
+	statement("    for (uint i = 0u; i < subgroup_size; i++)");
+	statement("        r = r || (scratch[subgroup_base + i] != 0u);");
+	statement("    ", barrier_call, ";");
+	statement("    return r;");
+	statement("}");
+	statement("");
 
-	case OpGroupNonUniformBallotBitCount:
-	{
-		require_extension(needs_subgroup_ballot);
-		auto operation = static_cast<GroupOperation>(ops[3]);
-		if (operation == GroupOperationReduce)
-			emit_unary_func_op(result_type, id, ops[4], "sub_group_ballot_bit_count");
-		else if (operation == GroupOperationInclusiveScan)
-			emit_unary_func_op(result_type, id, ops[4], "sub_group_ballot_inclusive_scan");
-		else if (operation == GroupOperationExclusiveScan)
-			emit_unary_func_op(result_type, id, ops[4], "sub_group_ballot_exclusive_scan");
-		else
-			SPIRV_CROSS_THROW("Invalid group operation for BallotBitCount.");
-		break;
-	}
+	// --- AllEqual ---
+	statement("static bool spv_emulate_all_equal_uint(__local uint* scratch, uint val, "
+	          "uint linear_id, uint subgroup_base, uint subgroup_size) {");
+	statement("    scratch[linear_id] = val;");
+	statement("    ", barrier_call, ";");
+	statement("    uint first = scratch[subgroup_base];");
+	statement("    bool r = true;");
+	statement("    for (uint i = 1u; i < subgroup_size; i++)");
+	statement("        r = r && (scratch[subgroup_base + i] == first);");
+	statement("    ", barrier_call, ";");
+	statement("    return r;");
+	statement("}");
+	statement("");
 
-	// === Tasks 4/7/10: Arithmetic ops (Reduce/Scan/Clustered) ===
-	// The same SPIR-V opcodes are used for base cl_khr_subgroups (Reduce/InclusiveScan/ExclusiveScan
-	// with add/min/max), cl_khr_subgroup_non_uniform_arithmetic (all ops with Reduce/Scan),
-	// and cl_khr_subgroup_clustered_reduce (ClusteredReduce).
+	// --- Ballot ---
+	statement("static uint4 spv_emulate_ballot(__local uint* scratch, bool predicate, "
+	          "uint linear_id, uint subgroup_base, uint subgroup_size) {");
+	statement("    scratch[linear_id] = predicate ? 1u : 0u;");
+	statement("    ", barrier_call, ";");
+	statement("    uint4 r = (uint4)(0u);");
+	statement("    for (uint i = 0u; i < subgroup_size; i++) {");
+	statement("        if (scratch[subgroup_base + i] != 0u) {");
+	statement("            uint word = i / 32u;");
+	statement("            uint bit = i % 32u;");
+	statement("            if (word == 0u) r.x |= (1u << bit);");
+	statement("            else if (word == 1u) r.y |= (1u << bit);");
+	statement("            else if (word == 2u) r.z |= (1u << bit);");
+	statement("            else r.w |= (1u << bit);");
+	statement("        }");
+	statement("    }");
+	statement("    ", barrier_call, ";");
+	statement("    return r;");
+	statement("}");
+	statement("");
 
-	// clang-format off
-	// OpenCL subgroup functions are scalar-only; vectors are decomposed per-component
-	// via emit_subgroup_op_vec / emit_subgroup_op_vec_binary.
+	// --- Mask builtins (pure arithmetic; emitted under the same needs_subgroup_emulation_scratch gate as above) ---
+	statement("static uint4 spv_subgroup_eq_mask(uint lane_id) {");
+	statement("    uint4 r = (uint4)(0u);");
+	statement("    uint word = lane_id / 32u;");
+	statement("    uint bit = lane_id % 32u;");
+	statement("    if (word == 0u) r.x = (1u << bit);");
+	statement("    else if (word == 1u) r.y = (1u << bit);");
+	statement("    else if (word == 2u) r.z = (1u << bit);");
+	statement("    else r.w = (1u << bit);");
+	statement("    return r;");
+	statement("}");
+	statement("");
 
-#define OPENCL_SUBGROUP_ARITH(spirv_op, base_name, nu_name) \
-	case OpGroupNonUniform##spirv_op: \
-	{ \
-		auto operation = static_cast<GroupOperation>(ops[3]); \
-		if (operation == GroupOperationReduce) \
-			emit_subgroup_op_vec(result_type, id, ops[4], "sub_group_reduce_" base_name); \
-		else if (operation == GroupOperationInclusiveScan) \
-			emit_subgroup_op_vec(result_type, id, ops[4], "sub_group_scan_inclusive_" base_name); \
-		else if (operation == GroupOperationExclusiveScan) \
-			emit_subgroup_op_vec(result_type, id, ops[4], "sub_group_scan_exclusive_" base_name); \
-		else if (operation == GroupOperationClusteredReduce) \
-		{ \
-			require_extension(needs_subgroup_clustered); \
-			emit_subgroup_op_vec_binary(result_type, id, ops[4], ops[5], "sub_group_clustered_reduce_" base_name); \
-		} \
-		else \
-			SPIRV_CROSS_THROW("Unsupported group operation."); \
-		break; \
-	}
+	statement("static uint4 spv_subgroup_ge_mask(uint lane_id, uint subgroup_size) {");
+	statement("    uint4 r = (uint4)(0u);");
+	statement("    for (uint i = lane_id; i < subgroup_size; i++) {");
+	statement("        uint word = i / 32u;");
+	statement("        uint bit = i % 32u;");
+	statement("        if (word == 0u) r.x |= (1u << bit);");
+	statement("        else if (word == 1u) r.y |= (1u << bit);");
+	statement("        else if (word == 2u) r.z |= (1u << bit);");
+	statement("        else r.w |= (1u << bit);");
+	statement("    }");
+	statement("    return r;");
+	statement("}");
+	statement("");
 
-#define OPENCL_SUBGROUP_ARITH_CAST(spirv_op, base_name, nu_name, cast_type) \
-	case OpGroupNonUniform##spirv_op: \
-	{ \
-		auto operation = static_cast<GroupOperation>(ops[3]); \
-		if (operation == GroupOperationReduce) \
-			emit_unary_func_op_cast(result_type, id, ops[4], "sub_group_reduce_" base_name, cast_type, cast_type); \
-		else if (operation == GroupOperationInclusiveScan) \
-			emit_unary_func_op_cast(result_type, id, ops[4], "sub_group_scan_inclusive_" base_name, cast_type, cast_type); \
-		else if (operation == GroupOperationExclusiveScan) \
-			emit_unary_func_op_cast(result_type, id, ops[4], "sub_group_scan_exclusive_" base_name, cast_type, cast_type); \
+	statement("static uint4 spv_subgroup_gt_mask(uint lane_id, uint subgroup_size) {");
+	statement("    return spv_subgroup_ge_mask(lane_id + 1u, subgroup_size);");
+	statement("}");
+	statement("");
+
+	statement("static uint4 spv_subgroup_le_mask(uint lane_id, uint subgroup_size) {");
+	statement("    return spv_subgroup_ge_mask(0u, lane_id + 1u);");
+	statement("}");
+	statement("");
+
+	statement("static uint4 spv_subgroup_lt_mask(uint lane_id) {");
+	statement("    if (lane_id == 0u) return (uint4)(0u);");
+	statement("    return spv_subgroup_ge_mask(0u, lane_id);");
+	statement("}");
+	statement("");
+
+	// Arithmetic reduce/scan helpers: one set of four functions per element type and operation.
+	// This variant combines values with an infix operator.
+	auto emit_arith_set = [&](const char *type_name, const char *as_cast, const char *suffix, const char *op,
+	                          const char *identity, bool use_cast)
+	{
+		// type_name: OpenCL C type of val and of the result.
+		// as_cast:   type suffix of the as_<type>() call wrapped around scratch reads when use_cast is set.
+		// suffix:    tail of each generated function name.
+		// op:        infix operator token placed between the accumulator and each element read back.
+		// identity:  starting accumulator of the exclusive scan.
+		// use_cast:  write val through as_uint() and read it back through as_<as_cast>().
+
+		// Text that reads one scratch slot in the element type.
+		auto read_slot = [&](const char *slot) -> string
+		{
+			if (use_cast)
+				return join("as_", as_cast, "(", slot, ")");
+			return slot;
+		};
+
+		// Text stored into this invocation's scratch slot.
+		const string stored_value = use_cast ? "as_uint(val)" : "val";
+
+		// Emits one helper: write val to scratch, emit barrier_call, optional prologue line, fold slots
+		// into r starting from init, emit barrier_call again, return r.
+		auto emit_helper = [&](const char *kind, const char *extra_params, const char *prologue, const string &init,
+		                       const char *loop_header, const string &element)
+		{
+			statement("static ", type_name, " spv_emulate_", kind, "_", suffix, "(__local uint* scratch, ", type_name,
+			          " val, ", extra_params, ") {");
+			statement("    scratch[linear_id] = ", stored_value, ";");
+			statement("    ", barrier_call, ";");
+			if (prologue)
+				statement(prologue);
+			statement("    ", type_name, " r = ", init, ";");
+			statement(loop_header);
+			statement("        r = r ", op, " ", element, ";");
+			statement("    ", barrier_call, ";");
+			statement("    return r;");
+			statement("}");
+			statement("");
+		};
+
+		const char *lane_params = "uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size";
+		const string first_slot = read_slot("scratch[subgroup_base]");
+		const string slot_i = read_slot("scratch[subgroup_base + i]");
+
+		// Reduce: every slot of the subgroup.
+		emit_helper("reduce", "uint linear_id, uint subgroup_base, uint subgroup_size", nullptr, first_slot,
+		            "    for (uint i = 1u; i < subgroup_size; i++)", slot_i);
+
+		// Inclusive scan: slots 0 through lane_id.
+		emit_helper("inclusive_scan", lane_params, nullptr, first_slot, "    for (uint i = 1u; i <= lane_id; i++)",
+		            slot_i);
+
+		// Exclusive scan: identity folded with the slots below lane_id.
+		emit_helper("exclusive_scan", lane_params, nullptr, identity, "    for (uint i = 0u; i < lane_id; i++)",
+		            slot_i);
+
+		// Clustered reduce: the cluster_size slots of the cluster containing lane_id.
+		emit_helper("clustered_reduce",
+		            "uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size",
+		            "    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;",
+		            read_slot("scratch[subgroup_base + cluster_base_in_sg]"),
+		            "    for (uint i = 1u; i < cluster_size; i++)",
+		            read_slot("scratch[subgroup_base + cluster_base_in_sg + i]"));
+	};
+
+	// Min/max have no infix operator, so this variant combines values through a function call instead.
+	auto emit_arith_func_set = [&](const char *type_name, const char *as_cast, const char *suffix,
+	                               const char *func_name, const char *identity, bool use_cast)
+	{
+		// type_name: OpenCL C type of val and of the result.
+		// as_cast:   type suffix of the as_<type>() call wrapped around scratch reads when use_cast is set.
+		// suffix:    tail of each generated function name.
+		// func_name: two-argument function called as func_name(accumulator, element).
+		// identity:  starting accumulator of the exclusive scan.
+		// use_cast:  write val through as_uint() and read it back through as_<as_cast>().
+
+		// Text that reads one scratch slot in the element type.
+		auto read_slot = [&](const char *slot) -> string
+		{
+			if (use_cast)
+				return join("as_", as_cast, "(", slot, ")");
+			return slot;
+		};
+
+		// Text stored into this invocation's scratch slot.
+		const string stored_value = use_cast ? "as_uint(val)" : "val";
+
+		// Emits one helper: write val to scratch, emit barrier_call, optional prologue line, fold slots
+		// into r starting from init, emit barrier_call again, return r.
+		auto emit_helper = [&](const char *kind, const char *extra_params, const char *prologue, const string &init,
+		                       const char *loop_header, const string &element)
+		{
+			statement("static ", type_name, " spv_emulate_", kind, "_", suffix, "(__local uint* scratch, ", type_name,
+			          " val, ", extra_params, ") {");
+			statement("    scratch[linear_id] = ", stored_value, ";");
+			statement("    ", barrier_call, ";");
+			if (prologue)
+				statement(prologue);
+			statement("    ", type_name, " r = ", init, ";");
+			statement(loop_header);
+			statement("        r = ", func_name, "(r, ", element, ");");
+			statement("    ", barrier_call, ";");
+			statement("    return r;");
+			statement("}");
+			statement("");
+		};
+
+		const char *lane_params = "uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size";
+		const string first_slot = read_slot("scratch[subgroup_base]");
+		const string slot_i = read_slot("scratch[subgroup_base + i]");
+
+		// Reduce: every slot of the subgroup.
+		emit_helper("reduce", "uint linear_id, uint subgroup_base, uint subgroup_size", nullptr, first_slot,
+		            "    for (uint i = 1u; i < subgroup_size; i++)", slot_i);
+
+		// Inclusive scan: slots 0 through lane_id.
+		emit_helper("inclusive_scan", lane_params, nullptr, first_slot, "    for (uint i = 1u; i <= lane_id; i++)",
+		            slot_i);
+
+		// Exclusive scan: identity folded with the slots below lane_id.
+		emit_helper("exclusive_scan", lane_params, nullptr, identity, "    for (uint i = 0u; i < lane_id; i++)",
+		            slot_i);
+
+		// Clustered reduce: the cluster_size slots of the cluster containing lane_id.
+		emit_helper("clustered_reduce",
+		            "uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size",
+		            "    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;",
+		            read_slot("scratch[subgroup_base + cluster_base_in_sg]"),
+		            "    for (uint i = 1u; i < cluster_size; i++)",
+		            read_slot("scratch[subgroup_base + cluster_base_in_sg + i]"));
+	};
+
+	// Integer arithmetic (uint)
+	emit_arith_set("uint", "uint", "add_uint", "+", "0u", false);
+	emit_arith_set("uint", "uint", "mul_uint", "*", "1u", false);
+	emit_arith_func_set("uint", "uint", "min_uint", "min", "UINT_MAX", false);
+	emit_arith_func_set("uint", "uint", "max_uint", "max", "0u", false);
+	emit_arith_set("uint", "uint", "and_uint", "&", "0xFFFFFFFFu", false);
+	emit_arith_set("uint", "uint", "or_uint", "|", "0u", false);
+	emit_arith_set("uint", "uint", "xor_uint", "^", "0u", false);
+
+	// Integer arithmetic (int) — uses as_int/as_uint bitcasts
+	emit_arith_set("int", "int", "add_int", "+", "0", true);
+	emit_arith_set("int", "int", "mul_int", "*", "1", true);
+	emit_arith_func_set("int", "int", "min_int", "min", "INT_MAX", true);
+	emit_arith_func_set("int", "int", "max_int", "max", "INT_MIN", true);
+	emit_arith_set("int", "int", "and_int", "&", "as_int(0xFFFFFFFFu)", true);
+	emit_arith_set("int", "int", "or_int", "|", "0", true);
+	emit_arith_set("int", "int", "xor_int", "^", "0", true);
+
+	// Float arithmetic — uses as_float/as_uint bitcasts
+	emit_arith_set("float", "float", "add_float", "+", "0.0f", true);
+	emit_arith_set("float", "float", "mul_float", "*", "1.0f", true);
+	emit_arith_func_set("float", "float", "min_float", "fmin", "INFINITY", true);
+	emit_arith_func_set("float", "float", "max_float", "fmax", "-INFINITY", true);
+
+	// Logical operations (bool → uint mapping)
+	statement("static bool spv_emulate_reduce_logical_and(__local uint* scratch, bool val, "
+	          "uint linear_id, uint subgroup_base, uint subgroup_size) {");
+	statement("    scratch[linear_id] = val ? 1u : 0u;");
+	statement("    ", barrier_call, ";");
+	statement("    bool r = true;");
+	statement("    for (uint i = 0u; i < subgroup_size; i++)");
+	statement("        r = r && (scratch[subgroup_base + i] != 0u);");
+	statement("    ", barrier_call, ";");
+	statement("    return r;");
+	statement("}");
+	statement("");
+
+	statement("static bool spv_emulate_inclusive_scan_logical_and(__local uint* scratch, bool val, "
+	          "uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {");
+	statement("    scratch[linear_id] = val ? 1u : 0u;");
+	statement("    ", barrier_call, ";");
+	statement("    bool r = true;");
+	statement("    for (uint i = 0u; i <= lane_id; i++)");
+	statement("        r = r && (scratch[subgroup_base + i] != 0u);");
+	statement("    ", barrier_call, ";");
+	statement("    return r;");
+	statement("}");
+	statement("");
+
+	statement("static bool spv_emulate_exclusive_scan_logical_and(__local uint* scratch, bool val, "
+	          "uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {");
+	statement("    scratch[linear_id] = val ? 1u : 0u;");
+	statement("    ", barrier_call, ";");
+	statement("    bool r = true;");
+	statement("    for (uint i = 0u; i < lane_id; i++)");
+	statement("        r = r && (scratch[subgroup_base + i] != 0u);");
+	statement("    ", barrier_call, ";");
+	statement("    return r;");
+	statement("}");
+	statement("");
+
+	statement("static bool spv_emulate_clustered_reduce_logical_and(__local uint* scratch, bool val, "
+	          "uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {");
+	statement("    scratch[linear_id] = val ? 1u : 0u;");
+	statement("    ", barrier_call, ";");
+	statement("    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;");
+	statement("    bool r = true;");
+	statement("    for (uint i = 0u; i < cluster_size; i++)");
+	statement("        r = r && (scratch[subgroup_base + cluster_base_in_sg + i] != 0u);");
+	statement("    ", barrier_call, ";");
+	statement("    return r;");
+	statement("}");
+	statement("");
+
+	// logical_or
+	statement("static bool spv_emulate_reduce_logical_or(__local uint* scratch, bool val, "
+	          "uint linear_id, uint subgroup_base, uint subgroup_size) {");
+	statement("    scratch[linear_id] = val ? 1u : 0u;");
+	statement("    ", barrier_call, ";");
+	statement("    bool r = false;");
+	statement("    for (uint i = 0u; i < subgroup_size; i++)");
+	statement("        r = r || (scratch[subgroup_base + i] != 0u);");
+	statement("    ", barrier_call, ";");
+	statement("    return r;");
+	statement("}");
+	statement("");
+
+	statement("static bool spv_emulate_inclusive_scan_logical_or(__local uint* scratch, bool val, "
+	          "uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {");
+	statement("    scratch[linear_id] = val ? 1u : 0u;");
+	statement("    ", barrier_call, ";");
+	statement("    bool r = false;");
+	statement("    for (uint i = 0u; i <= lane_id; i++)");
+	statement("        r = r || (scratch[subgroup_base + i] != 0u);");
+	statement("    ", barrier_call, ";");
+	statement("    return r;");
+	statement("}");
+	statement("");
+
+	statement("static bool spv_emulate_exclusive_scan_logical_or(__local uint* scratch, bool val, "
+	          "uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {");
+	statement("    scratch[linear_id] = val ? 1u : 0u;");
+	statement("    ", barrier_call, ";");
+	statement("    bool r = false;");
+	statement("    for (uint i = 0u; i < lane_id; i++)");
+	statement("        r = r || (scratch[subgroup_base + i] != 0u);");
+	statement("    ", barrier_call, ";");
+	statement("    return r;");
+	statement("}");
+	statement("");
+
+	statement("static bool spv_emulate_clustered_reduce_logical_or(__local uint* scratch, bool val, "
+	          "uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {");
+	statement("    scratch[linear_id] = val ? 1u : 0u;");
+	statement("    ", barrier_call, ";");
+	statement("    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;");
+	statement("    bool r = false;");
+	statement("    for (uint i = 0u; i < cluster_size; i++)");
+	statement("        r = r || (scratch[subgroup_base + cluster_base_in_sg + i] != 0u);");
+	statement("    ", barrier_call, ";");
+	statement("    return r;");
+	statement("}");
+	statement("");
+
+	// logical_xor
+	statement("static bool spv_emulate_reduce_logical_xor(__local uint* scratch, bool val, "
+	          "uint linear_id, uint subgroup_base, uint subgroup_size) {");
+	statement("    scratch[linear_id] = val ? 1u : 0u;");
+	statement("    ", barrier_call, ";");
+	statement("    bool r = false;");
+	statement("    for (uint i = 0u; i < subgroup_size; i++)");
+	statement("        r = r != (scratch[subgroup_base + i] != 0u);");
+	statement("    ", barrier_call, ";");
+	statement("    return r;");
+	statement("}");
+	statement("");
+
+	statement("static bool spv_emulate_inclusive_scan_logical_xor(__local uint* scratch, bool val, "
+	          "uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {");
+	statement("    scratch[linear_id] = val ? 1u : 0u;");
+	statement("    ", barrier_call, ";");
+	statement("    bool r = false;");
+	statement("    for (uint i = 0u; i <= lane_id; i++)");
+	statement("        r = r != (scratch[subgroup_base + i] != 0u);");
+	statement("    ", barrier_call, ";");
+	statement("    return r;");
+	statement("}");
+	statement("");
+
+	statement("static bool spv_emulate_exclusive_scan_logical_xor(__local uint* scratch, bool val, "
+	          "uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {");
+	statement("    scratch[linear_id] = val ? 1u : 0u;");
+	statement("    ", barrier_call, ";");
+	statement("    bool r = false;");
+	statement("    for (uint i = 0u; i < lane_id; i++)");
+	statement("        r = r != (scratch[subgroup_base + i] != 0u);");
+	statement("    ", barrier_call, ";");
+	statement("    return r;");
+	statement("}");
+	statement("");
+
+	statement("static bool spv_emulate_clustered_reduce_logical_xor(__local uint* scratch, bool val, "
+	          "uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {");
+	statement("    scratch[linear_id] = val ? 1u : 0u;");
+	statement("    ", barrier_call, ";");
+	statement("    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;");
+	statement("    bool r = false;");
+	statement("    for (uint i = 0u; i < cluster_size; i++)");
+	statement("        r = r != (scratch[subgroup_base + cluster_base_in_sg + i] != 0u);");
+	statement("    ", barrier_call, ";");
+	statement("    return r;");
+	statement("}");
+	statement("");
+
+	// Ballot derived operations (pure arithmetic on uint4, no scratch needed)
+	statement("static bool spv_emulate_inverse_ballot(uint4 ballot, uint lane_id) {");
+	statement("    uint word = lane_id / 32u;");
+	statement("    uint bit = lane_id % 32u;");
+	statement("    uint v = (word == 0u) ? ballot.x : (word == 1u) ? ballot.y : (word == 2u) ? ballot.z : ballot.w;");
+	statement("    return (v & (1u << bit)) != 0u;");
+	statement("}");
+	statement("");
+
+	statement("static bool spv_emulate_ballot_bit_extract(uint4 ballot, uint index) {");
+	statement("    uint word = index / 32u;");
+	statement("    uint bit = index % 32u;");
+	statement("    uint v = (word == 0u) ? ballot.x : (word == 1u) ? ballot.y : (word == 2u) ? ballot.z : ballot.w;");
+	statement("    return (v & (1u << bit)) != 0u;");
+	statement("}");
+	statement("");
+
+	statement("static uint spv_popcount4(uint4 v) {");
+	statement("    return popcount(v.x) + popcount(v.y) + popcount(v.z) + popcount(v.w);");
+	statement("}");
+	statement("");
+
+	statement("static uint spv_emulate_ballot_bit_count(uint4 ballot) {");
+	statement("    return spv_popcount4(ballot);");
+	statement("}");
+	statement("");
+
+	statement("static uint spv_emulate_ballot_inclusive_bit_count(uint4 ballot, uint lane_id) {");
+	statement("    uint4 masked = ballot;");
+	statement("    uint word = lane_id / 32u;");
+	statement("    uint bit = lane_id % 32u;");
+	statement("    uint mask = (bit == 31u) ? 0xFFFFFFFFu : ((1u << (bit + 1u)) - 1u);");
+	statement("    if (word == 0u) { masked.x &= mask; masked.y = 0u; masked.z = 0u; masked.w = 0u; }");
+	statement("    else if (word == 1u) { masked.y &= mask; masked.z = 0u; masked.w = 0u; }");
+	statement("    else if (word == 2u) { masked.z &= mask; masked.w = 0u; }");
+	statement("    else { masked.w &= mask; }");
+	statement("    return spv_popcount4(masked);");
+	statement("}");
+	statement("");
+
+	statement("static uint spv_emulate_ballot_exclusive_bit_count(uint4 ballot, uint lane_id) {");
+	statement("    if (lane_id == 0u) return 0u;");
+	statement("    return spv_emulate_ballot_inclusive_bit_count(ballot, lane_id - 1u);");
+	statement("}");
+	statement("");
+	// Index of the lowest set ballot bit: clz() of the isolated bit (x & -x) is 31 - index within its word.
+	statement("static uint spv_emulate_ballot_find_lsb(uint4 ballot) {");
+	statement("    if (ballot.x != 0u) return 31u - (uint)clz(ballot.x & (0u - ballot.x));");
+	statement("    if (ballot.y != 0u) return 63u - (uint)clz(ballot.y & (0u - ballot.y));");
+	statement("    if (ballot.z != 0u) return 95u - (uint)clz(ballot.z & (0u - ballot.z));");
+	statement("    if (ballot.w != 0u) return 127u - (uint)clz(ballot.w & (0u - ballot.w));");
+	statement("    return ~0u;"); // No bit set in any word.
+	statement("}");
+	statement("");
+
+	statement("static uint spv_emulate_ballot_find_msb(uint4 ballot) {");
+	statement("    if (ballot.w != 0u) return 127u - (uint)clz(ballot.w);");
+	statement("    if (ballot.z != 0u) return 95u - (uint)clz(ballot.z);");
+	statement("    if (ballot.y != 0u) return 63u - (uint)clz(ballot.y);");
+	statement("    if (ballot.x != 0u) return 31u - (uint)clz(ballot.x);");
+	statement("    return ~0u;");
+	statement("}");
+	statement("");
+}
+
+void CompilerOpenCL::emit_subgroup_op_emulated(const Instruction &i)
+{
+	const uint32_t *ops = stream(i);
+	auto op = static_cast<Op>(i.op);
+
+	// Validate scope is Subgroup
+	if (op != OpGroupNonUniformQuadAllKHR && op != OpGroupNonUniformQuadAnyKHR)
+	{
+		auto scope = static_cast<Scope>(evaluate_constant_u32(ops[2]));
+		if (scope != ScopeSubgroup)
+			SPIRV_CROSS_THROW("Only subgroup scope is supported.");
+	}
+
+	uint32_t result_type = ops[0];
+	uint32_t id = ops[1];
+	uint32_t fixed = opencl_options.fixed_subgroup_size;
+
+	// Flags that the emulation scratch buffer is needed; only the first request forces a recompile.
+	auto require_scratch = [this]()
+	{
+		if (needs_subgroup_emulation_scratch)
+			return;
+
+		needs_subgroup_emulation_scratch = true;
+		force_recompile();
+	};
+
+	// Expression for value_id in uint form: Int and Float are bit-cast with as_uint(), Boolean maps to
+	// 1u or 0u, and any other base type is passed through untouched.
+	auto to_uint_cast = [&](uint32_t value_id) -> string
+	{
+		const auto basetype = expression_type(value_id).basetype;
+		const string value_expr = to_expression(value_id);
+
+		if (basetype == SPIRType::Int || basetype == SPIRType::Float)
+			return join("as_uint(", value_expr, ")");
+		if (basetype == SPIRType::Boolean)
+			return join("(", value_expr, " ? 1u : 0u)");
+
+		return value_expr;
+	};
+
+	// Reinterprets a uint expression as the base type of value_id; only Int and Float need a cast.
+	auto from_uint_cast = [&](const string &expr, uint32_t value_id) -> string
+	{
+		const auto basetype = expression_type(value_id).basetype;
+		const char *wrapper = nullptr;
+		if (basetype == SPIRType::Int)
+			wrapper = "as_int(";
+		else if (basetype == SPIRType::Float)
+			wrapper = "as_float(";
+
+		return wrapper ? join(wrapper, expr, ")") : expr;
+	};
+
+	// Emits the result for an emulated op whose helper handles a single uint. The helper call is spelled
+	// scalar_call_prefix + <uint argument> + scalar_call_suffix; a vector value gets one call per component.
+	auto emit_emulated_vec = [&](uint32_t value_id, const string &scalar_call_prefix, const string &scalar_call_suffix)
+	{
+		auto &value_type = expression_type(value_id);
+		string result_expr;
+
+		if (value_type.vecsize <= 1)
+		{
+			// Scalar: a single call, converted on the way in and on the way out.
+			result_expr = from_uint_cast(scalar_call_prefix + to_uint_cast(value_id) + scalar_call_suffix, value_id);
+		}
+		else
+		{
+			const bool is_int = value_type.basetype == SPIRType::Int;
+			const bool is_float = value_type.basetype == SPIRType::Float;
+
+			// Vector: build a constructor of the result type from the per-component calls.
+			result_expr = "(" + type_to_glsl(get<SPIRType>(result_type)) + ")(";
+			for (uint32_t comp = 0; comp < value_type.vecsize; comp++)
+			{
+				if (comp != 0)
+					result_expr += ", ";
+
+				// Int and Float components go through the helper as raw bits.
+				string arg = join(to_enclosed_expression(value_id), ".", "xyzw"[comp]);
+				if (is_int || is_float)
+					arg = join("as_uint(", arg, ")");
+
+				string call = scalar_call_prefix + arg + scalar_call_suffix;
+				if (is_int)
+					call = join("as_int(", call, ")");
+				else if (is_float)
+					call = join("as_float(", call, ")");
+
+				result_expr += call;
+			}
+			result_expr += ")";
+		}
+
+		// Forwarding and dependency tracking follow the value operand only.
+		emit_op(result_type, id, result_expr, should_forward(value_id));
+		inherit_expression_dependencies(id, value_id);
+	};
+
+	switch (op)
+	{
+	case OpGroupNonUniformElect:
+		if (fixed == 1)
+			emit_op(result_type, id, "true", true);
+		else
+			emit_op(result_type, id, "(_spv_lane_id == 0u)", true);
+		break;
+
+	case OpGroupNonUniformAll:
+		require_scratch();
+		if (fixed == 1)
+		{
+			emit_op(result_type, id, to_enclosed_expression(ops[3]), should_forward(ops[3]));
+			inherit_expression_dependencies(id, ops[3]);
+		}
+		else
+		{
+			emit_op(result_type, id,
+			        join("spv_emulate_all(_spv_subgroup_scratch, ", to_expression(ops[3]),
+			             ", _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size)"),
+			        should_forward(ops[3]));
+			inherit_expression_dependencies(id, ops[3]);
+		}
+		break;
+
+	case OpGroupNonUniformAny:
+		require_scratch();
+		if (fixed == 1)
+		{
+			emit_op(result_type, id, to_enclosed_expression(ops[3]), should_forward(ops[3]));
+			inherit_expression_dependencies(id, ops[3]);
+		}
+		else
+		{
+			emit_op(result_type, id,
+			        join("spv_emulate_any(_spv_subgroup_scratch, ", to_expression(ops[3]),
+			             ", _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size)"),
+			        should_forward(ops[3]));
+			inherit_expression_dependencies(id, ops[3]);
+		}
+		break;
+
+	case OpGroupNonUniformAllEqual:
+	{
+		require_scratch();
+		if (fixed == 1)
+		{
+			emit_op(result_type, id, "true", true);
+		}
+		else
+		{
+			emit_op(result_type, id,
+			        join("spv_emulate_all_equal_uint(_spv_subgroup_scratch, ", to_uint_cast(ops[3]),
+			             ", _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size)"),
+			        should_forward(ops[3]));
+			inherit_expression_dependencies(id, ops[3]);
+		}
+		break;
+	}
+
+	case OpGroupNonUniformBroadcast:
+		require_scratch();
+		if (fixed == 1)
+		{
+			emit_op(result_type, id, to_enclosed_expression(ops[3]), should_forward(ops[3]));
+			inherit_expression_dependencies(id, ops[3]);
+		}
+		else
+		{
+			string src_lane = to_expression(ops[4]);
+			emit_emulated_vec(ops[3], "spv_emulate_broadcast_uint(_spv_subgroup_scratch, ",
+			                  join(", ", src_lane, ", _spv_linear_id, _spv_subgroup_base)"));
+		}
+		break;
+
+	case OpGroupNonUniformBroadcastFirst:
+		require_scratch();
+		if (fixed == 1)
+		{
+			emit_op(result_type, id, to_enclosed_expression(ops[3]), should_forward(ops[3]));
+			inherit_expression_dependencies(id, ops[3]);
+		}
+		else
+		{
+			emit_emulated_vec(ops[3], "spv_emulate_broadcast_first_uint(_spv_subgroup_scratch, ",
+			                  ", _spv_linear_id, _spv_subgroup_base)");
+		}
+		break;
+
+	case OpGroupNonUniformBallot:
+		require_scratch();
+		if (fixed == 1)
+		{
+			emit_op(result_type, id, join("(", to_expression(ops[3]), " ? (uint4)(1u, 0u, 0u, 0u) : (uint4)(0u))"),
+			        should_forward(ops[3]));
+			inherit_expression_dependencies(id, ops[3]);
+		}
+		else
+		{
+			emit_op(result_type, id,
+			        join("spv_emulate_ballot(_spv_subgroup_scratch, ", to_expression(ops[3]),
+			             ", _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size)"),
+			        should_forward(ops[3]));
+			inherit_expression_dependencies(id, ops[3]);
+		}
+		break;
+
+	case OpGroupNonUniformInverseBallot:
+		require_scratch();
+		emit_op(result_type, id, join("spv_emulate_inverse_ballot(", to_expression(ops[3]), ", _spv_lane_id)"),
+		        should_forward(ops[3]));
+		inherit_expression_dependencies(id, ops[3]);
+		break;
+
+	case OpGroupNonUniformBallotBitExtract:
+		require_scratch();
+		emit_op(result_type, id,
+		        join("spv_emulate_ballot_bit_extract(", to_expression(ops[3]), ", ", to_expression(ops[4]), ")"),
+		        should_forward(ops[3]));
+		inherit_expression_dependencies(id, ops[3]);
+		break;
+
+	case OpGroupNonUniformBallotFindLSB:
+		require_scratch();
+		emit_op(result_type, id, join("spv_emulate_ballot_find_lsb(", to_expression(ops[3]), ")"),
+		        should_forward(ops[3]));
+		inherit_expression_dependencies(id, ops[3]);
+		break;
+
+	case OpGroupNonUniformBallotFindMSB:
+		require_scratch();
+		emit_op(result_type, id, join("spv_emulate_ballot_find_msb(", to_expression(ops[3]), ")"),
+		        should_forward(ops[3]));
+		inherit_expression_dependencies(id, ops[3]);
+		break;
+
+	case OpGroupNonUniformBallotBitCount:
+	{
+		require_scratch();
+		auto operation = static_cast<GroupOperation>(ops[3]);
+		if (operation == GroupOperationReduce)
+		{
+			emit_op(result_type, id, join("spv_emulate_ballot_bit_count(", to_expression(ops[4]), ")"),
+			        should_forward(ops[4]));
+			inherit_expression_dependencies(id, ops[4]);
+		}
+		else if (operation == GroupOperationInclusiveScan)
+		{
+			emit_op(result_type, id,
+			        join("spv_emulate_ballot_inclusive_bit_count(", to_expression(ops[4]), ", _spv_lane_id)"),
+			        should_forward(ops[4]));
+			inherit_expression_dependencies(id, ops[4]);
+		}
+		else if (operation == GroupOperationExclusiveScan)
+		{
+			emit_op(result_type, id,
+			        join("spv_emulate_ballot_exclusive_bit_count(", to_expression(ops[4]), ", _spv_lane_id)"),
+			        should_forward(ops[4]));
+			inherit_expression_dependencies(id, ops[4]);
+		}
+		else
+			SPIRV_CROSS_THROW("Invalid group operation for BallotBitCount.");
+		break;
+	}
+
+	case OpGroupNonUniformShuffle:
+		require_scratch();
+		if (fixed == 1)
+		{
+			emit_op(result_type, id, to_enclosed_expression(ops[3]), should_forward(ops[3]));
+			inherit_expression_dependencies(id, ops[3]);
+		}
+		else
+		{
+			string idx = to_expression(ops[4]);
+			emit_emulated_vec(ops[3], "spv_emulate_shuffle_uint(_spv_subgroup_scratch, ",
+			                  join(", ", idx, ", _spv_linear_id, _spv_subgroup_base)"));
+		}
+		break;
+
+	case OpGroupNonUniformShuffleXor:
+		require_scratch();
+		if (fixed == 1)
+		{
+			emit_op(result_type, id, to_enclosed_expression(ops[3]), should_forward(ops[3]));
+			inherit_expression_dependencies(id, ops[3]);
+		}
+		else
+		{
+			string mask = to_expression(ops[4]);
+			emit_emulated_vec(ops[3], "spv_emulate_shuffle_xor_uint(_spv_subgroup_scratch, ",
+			                  join(", ", mask, ", _spv_lane_id, _spv_linear_id, _spv_subgroup_base)"));
+		}
+		break;
+
+	case OpGroupNonUniformShuffleUp:
+		require_scratch();
+		if (fixed == 1)
+		{
+			emit_op(result_type, id, to_enclosed_expression(ops[3]), should_forward(ops[3]));
+			inherit_expression_dependencies(id, ops[3]);
+		}
+		else
+		{
+			string delta = to_expression(ops[4]);
+			emit_emulated_vec(ops[3], "spv_emulate_shuffle_up_uint(_spv_subgroup_scratch, ",
+			                  join(", ", delta, ", _spv_lane_id, _spv_linear_id, _spv_subgroup_base)"));
+		}
+		break;
+
+	case OpGroupNonUniformShuffleDown:
+		require_scratch();
+		if (fixed == 1)
+		{
+			emit_op(result_type, id, to_enclosed_expression(ops[3]), should_forward(ops[3]));
+			inherit_expression_dependencies(id, ops[3]);
+		}
+		else
+		{
+			string delta = to_expression(ops[4]);
+			emit_emulated_vec(ops[3], "spv_emulate_shuffle_down_uint(_spv_subgroup_scratch, ",
+			                  join(", ", delta, ", _spv_lane_id, _spv_linear_id, _spv_subgroup_size)"));
+		}
+		break;
+
+	case OpGroupNonUniformRotateKHR:
+		require_scratch();
+		if (fixed == 1)
+		{
+			emit_op(result_type, id, to_enclosed_expression(ops[3]), should_forward(ops[3]));
+			inherit_expression_dependencies(id, ops[3]);
+		}
+		else if (i.length > 5)
+		{
+			// Clustered rotate
+			string delta = to_expression(ops[4]);
+			string cluster_size = to_expression(ops[5]);
+			emit_emulated_vec(
+			    ops[3], "spv_emulate_clustered_rotate_uint(_spv_subgroup_scratch, ",
+			    join(", ", delta, ", _spv_lane_id, _spv_linear_id, _spv_subgroup_base, ", cluster_size, ")"));
+		}
+		else
+		{
+			string delta = to_expression(ops[4]);
+			emit_emulated_vec(
+			    ops[3], "spv_emulate_rotate_uint(_spv_subgroup_scratch, ",
+			    join(", ", delta, ", _spv_lane_id, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size)"));
+		}
+		break;
+
+	// === Arithmetic ops (Reduce / InclusiveScan / ExclusiveScan / ClusteredReduce) ===
+	case OpGroupNonUniformFAdd:
+	case OpGroupNonUniformIAdd:
+	case OpGroupNonUniformFMul:
+	case OpGroupNonUniformIMul:
+	case OpGroupNonUniformFMin:
+	case OpGroupNonUniformFMax:
+	case OpGroupNonUniformSMin:
+	case OpGroupNonUniformSMax:
+	case OpGroupNonUniformUMin:
+	case OpGroupNonUniformUMax:
+	case OpGroupNonUniformBitwiseAnd:
+	case OpGroupNonUniformBitwiseOr:
+	case OpGroupNonUniformBitwiseXor:
+	case OpGroupNonUniformLogicalAnd:
+	case OpGroupNonUniformLogicalOr:
+	case OpGroupNonUniformLogicalXor:
+	{
+		require_scratch();
+		auto operation = static_cast<GroupOperation>(ops[3]);
+		uint32_t value_id = ops[4];
+
+		if (fixed == 1)
+		{
+			// For subgroup_size==1: reduce/inclusive return val; exclusive returns identity.
+			if (operation == GroupOperationExclusiveScan)
+			{
+				// Return the identity value for the operation
+				auto &type = get<SPIRType>(result_type);
+				string identity;
+				switch (op)
+				{
+				case OpGroupNonUniformFAdd:
+				case OpGroupNonUniformIAdd:
+					identity = (type.basetype == SPIRType::Float) ? "0.0f" : "0";
+					break;
+				case OpGroupNonUniformFMul:
+				case OpGroupNonUniformIMul:
+					identity = (type.basetype == SPIRType::Float) ? "1.0f" : "1";
+					break;
+				case OpGroupNonUniformFMin:
+					identity = "INFINITY";
+					break;
+				case OpGroupNonUniformFMax:
+					identity = "-INFINITY";
+					break;
+				case OpGroupNonUniformSMin:
+					identity = "INT_MAX";
+					break;
+				case OpGroupNonUniformSMax:
+					identity = "INT_MIN";
+					break;
+				case OpGroupNonUniformUMin:
+					identity = "UINT_MAX";
+					break;
+				case OpGroupNonUniformUMax:
+					identity = "0u";
+					break;
+				case OpGroupNonUniformBitwiseAnd:
+					identity = (type.basetype == SPIRType::Int) ? "as_int(0xFFFFFFFFu)" : "0xFFFFFFFFu";
+					break;
+				case OpGroupNonUniformBitwiseOr:
+				case OpGroupNonUniformBitwiseXor:
+					identity = (type.basetype == SPIRType::Int) ? "0" : "0u";
+					break;
+				case OpGroupNonUniformLogicalAnd:
+					identity = "true";
+					break;
+				case OpGroupNonUniformLogicalOr:
+				case OpGroupNonUniformLogicalXor:
+					identity = "false";
+					break;
+				default:
+					identity = "0";
+					break;
+				}
+
+				if (type.vecsize > 1)
+					emit_op(result_type, id, join("(", type_to_glsl(type), ")(", identity, ")"), true);
+				else
+					emit_op(result_type, id, identity, true);
+			}
+			else
+			{
+				// Reduce, InclusiveScan, ClusteredReduce all return the value itself
+				emit_op(result_type, id, to_enclosed_expression(value_id), should_forward(value_id));
+				inherit_expression_dependencies(id, value_id);
+			}
+			break;
+		}
+
+		// Determine the suffix for the helper function
+		const char *op_suffix = nullptr;
+		bool is_logical = false;
+		switch (op)
+		{
+		case OpGroupNonUniformFAdd:
+			op_suffix = "add_float";
+			break;
+		case OpGroupNonUniformIAdd:
+		{
+			auto &type = expression_type(value_id);
+			op_suffix = (type.basetype == SPIRType::Int) ? "add_int" : "add_uint";
+			break;
+		}
+		case OpGroupNonUniformFMul:
+			op_suffix = "mul_float";
+			break;
+		case OpGroupNonUniformIMul:
+		{
+			auto &type = expression_type(value_id);
+			op_suffix = (type.basetype == SPIRType::Int) ? "mul_int" : "mul_uint";
+			break;
+		}
+		case OpGroupNonUniformFMin:
+			op_suffix = "min_float";
+			break;
+		case OpGroupNonUniformFMax:
+			op_suffix = "max_float";
+			break;
+		case OpGroupNonUniformSMin:
+			op_suffix = "min_int";
+			break;
+		case OpGroupNonUniformSMax:
+			op_suffix = "max_int";
+			break;
+		case OpGroupNonUniformUMin:
+			op_suffix = "min_uint";
+			break;
+		case OpGroupNonUniformUMax:
+			op_suffix = "max_uint";
+			break;
+		case OpGroupNonUniformBitwiseAnd:
+		{
+			auto &type = expression_type(value_id);
+			op_suffix = (type.basetype == SPIRType::Int) ? "and_int" : "and_uint";
+			break;
+		}
+		case OpGroupNonUniformBitwiseOr:
+		{
+			auto &type = expression_type(value_id);
+			op_suffix = (type.basetype == SPIRType::Int) ? "or_int" : "or_uint";
+			break;
+		}
+		case OpGroupNonUniformBitwiseXor:
+		{
+			auto &type = expression_type(value_id);
+			op_suffix = (type.basetype == SPIRType::Int) ? "xor_int" : "xor_uint";
+			break;
+		}
+		case OpGroupNonUniformLogicalAnd:
+			op_suffix = "logical_and";
+			is_logical = true;
+			break;
+		case OpGroupNonUniformLogicalOr:
+			op_suffix = "logical_or";
+			is_logical = true;
+			break;
+		case OpGroupNonUniformLogicalXor:
+			op_suffix = "logical_xor";
+			is_logical = true;
+			break;
+		default:
+			SPIRV_CROSS_THROW("Unsupported arithmetic op for emulation.");
+			break;
+		}
+
+		// Determine the group operation prefix
+		const char *group_prefix = nullptr;
+		switch (operation)
+		{
+		case GroupOperationReduce:
+			group_prefix = "reduce";
+			break;
+		case GroupOperationInclusiveScan:
+			group_prefix = "inclusive_scan";
+			break;
+		case GroupOperationExclusiveScan:
+			group_prefix = "exclusive_scan";
+			break;
+		case GroupOperationClusteredReduce:
+			group_prefix = "clustered_reduce";
+			break;
+		default:
+			SPIRV_CROSS_THROW("Unsupported group operation for emulation.");
+			break;
+		}
+
+		string helper_name = join("spv_emulate_", group_prefix, "_", op_suffix);
+
+		if (is_logical)
+		{
+			// Logical ops work on bool directly (scalar only)
+			string val_expr = to_expression(value_id);
+			string expr;
+			if (operation == GroupOperationClusteredReduce)
+			{
+				string cluster_size = to_expression(ops[5]);
+				expr = join(helper_name, "(_spv_subgroup_scratch, ", val_expr, ", ", cluster_size,
+				            ", _spv_lane_id, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size)");
+			}
+			else if (operation == GroupOperationReduce)
+			{
+				expr = join(helper_name, "(_spv_subgroup_scratch, ", val_expr,
+				            ", _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size)");
+			}
+			else
+			{
+				// inclusive/exclusive scan
+				expr = join(helper_name, "(_spv_subgroup_scratch, ", val_expr,
+				            ", _spv_lane_id, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size)");
+			}
+			emit_op(result_type, id, expr, should_forward(value_id));
+			inherit_expression_dependencies(id, value_id);
+		}
+		else
+		{
+			// Arithmetic ops: use vector decomposition like native subgroup ops
+			auto &type = expression_type(value_id);
+			if (type.vecsize > 1)
+			{
+				auto &out_type = get<SPIRType>(result_type);
+				string expr = "(" + type_to_glsl(out_type) + ")(";
+				for (uint32_t c = 0; c < type.vecsize; c++)
+				{
+					if (c > 0)
+						expr += ", ";
+					string component = join(to_enclosed_expression(value_id), ".", "xyzw"[c]);
+					string call;
+					if (operation == GroupOperationClusteredReduce)
+					{
+						string cluster_size = to_expression(ops[5]);
+						call = join(helper_name, "(_spv_subgroup_scratch, ", component, ", ", cluster_size,
+						            ", _spv_lane_id, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size)");
+					}
+					else if (operation == GroupOperationReduce)
+					{
+						call = join(helper_name, "(_spv_subgroup_scratch, ", component,
+						            ", _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size)");
+					}
+					else
+					{
+						call = join(helper_name, "(_spv_subgroup_scratch, ", component,
+						            ", _spv_lane_id, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size)");
+					}
+					expr += call;
+				}
+				expr += ")";
+				emit_op(result_type, id, expr, should_forward(value_id));
+				inherit_expression_dependencies(id, value_id);
+			}
+			else
+			{
+				string val_expr = to_expression(value_id);
+				string expr;
+				if (operation == GroupOperationClusteredReduce)
+				{
+					string cluster_size = to_expression(ops[5]);
+					expr = join(helper_name, "(_spv_subgroup_scratch, ", val_expr, ", ", cluster_size,
+					            ", _spv_lane_id, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size)");
+				}
+				else if (operation == GroupOperationReduce)
+				{
+					expr = join(helper_name, "(_spv_subgroup_scratch, ", val_expr,
+					            ", _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size)");
+				}
+				else
+				{
+					expr = join(helper_name, "(_spv_subgroup_scratch, ", val_expr,
+					            ", _spv_lane_id, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size)");
+				}
+				emit_op(result_type, id, expr, should_forward(value_id));
+				inherit_expression_dependencies(id, value_id);
+			}
+		}
+		break;
+	}
+
+	default:
+		SPIRV_CROSS_THROW("Unsupported subgroup op for OpenCL emulation.");
+	}
+}
+
+void CompilerOpenCL::emit_subgroup_op(const Instruction &i)
+{
+	const uint32_t *ops = stream(i);
+	auto op = static_cast<Op>(i.op);
+
+	if (opencl_options.emulate_subgroups && !opencl_options.enable_subgroups)
+	{
+		emit_subgroup_op_emulated(i);
+		return;
+	}
+
+	if (!opencl_options.enable_subgroups)
+		SPIRV_CROSS_THROW("Subgroup operations require enable_subgroups option.");
+
+	// Validate scope is Subgroup
+	if (op != OpGroupNonUniformQuadAllKHR && op != OpGroupNonUniformQuadAnyKHR)
+	{
+		auto scope = static_cast<Scope>(evaluate_constant_u32(ops[2]));
+		if (scope != ScopeSubgroup)
+			SPIRV_CROSS_THROW("Only subgroup scope is supported.");
+	}
+
+	uint32_t result_type = ops[0];
+	uint32_t id = ops[1];
+
+	// If we need to do implicit bitcasts, make sure we do it with the correct type.
+	uint32_t integer_width = get_integer_width_for_instruction(i);
+	auto int_type = to_signed_basetype(integer_width);
+	auto uint_type = to_unsigned_basetype(integer_width);
+
+	// Helper to set an extension flag and trigger recompile if newly needed.
+	auto require_extension = [this](bool &flag)
+	{
+		if (!flag)
+		{
+			flag = true;
+			force_recompile();
+		}
+	};
+
+	switch (op)
+	{
+		// === Task 5: cl_khr_subgroup_non_uniform_vote ===
+
+	case OpGroupNonUniformElect:
+		require_extension(needs_subgroup_vote);
+		emit_op(result_type, id, "sub_group_elect()", true);
+		break;
+
+	case OpGroupNonUniformAllEqual:
+	{
+		require_extension(needs_subgroup_vote);
+		auto &type = expression_type(ops[3]);
+		if (type.vecsize > 1)
+		{
+			// OpenCL sub_group_non_uniform_all_equal only accepts scalars.
+			// For vectors, decompose into per-component calls combined with &&.
+			string expr;
+			for (uint32_t c = 0; c < type.vecsize; c++)
+			{
+				if (c > 0)
+					expr += " && ";
+				string component = join(to_enclosed_expression(ops[3]), ".", "xyzw"[c]);
+				expr += join("sub_group_non_uniform_all_equal(", component, ")");
+			}
+			emit_op(result_type, id, expr, should_forward(ops[3]));
+			inherit_expression_dependencies(id, ops[3]);
+		}
+		else
+		{
+			emit_unary_func_op(result_type, id, ops[3], "sub_group_non_uniform_all_equal");
+		}
+		break;
+	}
+
+		// === Task 4: cl_khr_subgroups (base) — vote/broadcast ===
+
+	case OpGroupNonUniformAll:
+		emit_unary_func_op(result_type, id, ops[3], "sub_group_all");
+		break;
+
+	case OpGroupNonUniformAny:
+		emit_unary_func_op(result_type, id, ops[3], "sub_group_any");
+		break;
+
+	case OpGroupNonUniformBroadcast:
+		emit_subgroup_op_vec_binary(result_type, id, ops[3], ops[4], "sub_group_broadcast");
+		break;
+
+		// === Task 6: cl_khr_subgroup_ballot ===
+
+	case OpGroupNonUniformBroadcastFirst:
+		require_extension(needs_subgroup_ballot);
+		emit_subgroup_op_vec(result_type, id, ops[3], "sub_group_broadcast_first");
+		break;
+
+	case OpGroupNonUniformBallot:
+		require_extension(needs_subgroup_ballot);
+		emit_unary_func_op(result_type, id, ops[3], "sub_group_ballot");
+		break;
+
+	case OpGroupNonUniformInverseBallot:
+		require_extension(needs_subgroup_ballot);
+		emit_unary_func_op(result_type, id, ops[3], "sub_group_inverse_ballot");
+		break;
+
+	case OpGroupNonUniformBallotBitExtract:
+		require_extension(needs_subgroup_ballot);
+		emit_binary_func_op(result_type, id, ops[3], ops[4], "sub_group_ballot_bit_extract");
+		break;
+
+	case OpGroupNonUniformBallotFindLSB:
+		require_extension(needs_subgroup_ballot);
+		emit_unary_func_op(result_type, id, ops[3], "sub_group_ballot_find_lsb");
+		break;
+
+	case OpGroupNonUniformBallotFindMSB:
+		require_extension(needs_subgroup_ballot);
+		emit_unary_func_op(result_type, id, ops[3], "sub_group_ballot_find_msb");
+		break;
+
+	case OpGroupNonUniformBallotBitCount:
+	{
+		require_extension(needs_subgroup_ballot);
+		auto operation = static_cast<GroupOperation>(ops[3]);
+		if (operation == GroupOperationReduce)
+			emit_unary_func_op(result_type, id, ops[4], "sub_group_ballot_bit_count");
+		else if (operation == GroupOperationInclusiveScan)
+			emit_unary_func_op(result_type, id, ops[4], "sub_group_ballot_inclusive_scan");
+		else if (operation == GroupOperationExclusiveScan)
+			emit_unary_func_op(result_type, id, ops[4], "sub_group_ballot_exclusive_scan");
+		else
+			SPIRV_CROSS_THROW("Invalid group operation for BallotBitCount.");
+		break;
+	}
+
+	// === Tasks 4/7/10: Arithmetic ops (Reduce/Scan/Clustered) ===
+	// The same SPIR-V opcodes are used for base cl_khr_subgroups (Reduce/InclusiveScan/ExclusiveScan
+	// with add/min/max), cl_khr_subgroup_non_uniform_arithmetic (all ops with Reduce/Scan),
+	// and cl_khr_subgroup_clustered_reduce (ClusteredReduce).
+
+	// clang-format off
+	// OpenCL subgroup functions are scalar-only; vectors are decomposed per-component
+	// via emit_subgroup_op_vec / emit_subgroup_op_vec_binary.
+
+#define OPENCL_SUBGROUP_ARITH(spirv_op, base_name, nu_name) \
+	case OpGroupNonUniform##spirv_op: \
+	{ \
+		auto operation = static_cast<GroupOperation>(ops[3]); \
+		if (operation == GroupOperationReduce) \
+			emit_subgroup_op_vec(result_type, id, ops[4], "sub_group_reduce_" base_name); \
+		else if (operation == GroupOperationInclusiveScan) \
+			emit_subgroup_op_vec(result_type, id, ops[4], "sub_group_scan_inclusive_" base_name); \
+		else if (operation == GroupOperationExclusiveScan) \
+			emit_subgroup_op_vec(result_type, id, ops[4], "sub_group_scan_exclusive_" base_name); \
+		else if (operation == GroupOperationClusteredReduce) \
+		{ \
+			require_extension(needs_subgroup_clustered); \
+			emit_subgroup_op_vec_binary(result_type, id, ops[4], ops[5], "sub_group_clustered_reduce_" base_name); \
+		} \
+		else \
+			SPIRV_CROSS_THROW("Unsupported group operation."); \
+		break; \
+	}
+
+#define OPENCL_SUBGROUP_ARITH_CAST(spirv_op, base_name, nu_name, cast_type) \
+	case OpGroupNonUniform##spirv_op: \
+	{ \
+		auto operation = static_cast<GroupOperation>(ops[3]); \
+		if (operation == GroupOperationReduce) \
+			emit_unary_func_op_cast(result_type, id, ops[4], "sub_group_reduce_" base_name, cast_type, cast_type); \
+		else if (operation == GroupOperationInclusiveScan) \
+			emit_unary_func_op_cast(result_type, id, ops[4], "sub_group_scan_inclusive_" base_name, cast_type, cast_type); \
+		else if (operation == GroupOperationExclusiveScan) \
+			emit_unary_func_op_cast(result_type, id, ops[4], "sub_group_scan_exclusive_" base_name, cast_type, cast_type); \
 		else if (operation == GroupOperationClusteredReduce) \
 		{ \
 			require_extension(needs_subgroup_clustered); \
@@ -4325,21 +5769,36 @@ void CompilerOpenCL::emit_instruction(const Instruction &instruction)
 
 		if (execution_scope == ScopeSubgroup)
 		{
-			if (!opencl_options.enable_subgroups)
-				SPIRV_CROSS_THROW("Subgroup barriers require enable_subgroups option.");
-
-			// Subgroup barrier with memory fence flags
-			const uint32_t all_barriers =
-			    MemorySemanticsWorkgroupMemoryMask | MemorySemanticsUniformMemoryMask | MemorySemanticsImageMemoryMask;
-
-			if (semantics == 0 || (semantics & all_barriers) == all_barriers)
+			if (opencl_options.emulate_subgroups && !opencl_options.enable_subgroups)
 			{
-				statement("sub_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);");
+				// Emulated subgroup barrier: no-op for subgroup_size==1,
+				// otherwise use work_group_barrier (over-syncs but correct).
+				if (opencl_options.fixed_subgroup_size != 1)
+				{
+					if (opencl_options.supports_opencl_version(2, 0))
+						statement("work_group_barrier(CLK_LOCAL_MEM_FENCE);");
+					else
+						statement("barrier(CLK_LOCAL_MEM_FENCE);");
+				}
 			}
 			else
 			{
-				string fence_flags = opencl_mem_fence_flags(semantics);
-				statement("sub_group_barrier(", fence_flags, ");");
+				if (!opencl_options.enable_subgroups)
+					SPIRV_CROSS_THROW("Subgroup barriers require enable_subgroups option.");
+
+				// Subgroup barrier with memory fence flags
+				const uint32_t all_barriers = MemorySemanticsWorkgroupMemoryMask | MemorySemanticsUniformMemoryMask |
+				                              MemorySemanticsImageMemoryMask;
+
+				if (semantics == 0 || (semantics & all_barriers) == all_barriers)
+				{
+					statement("sub_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);");
+				}
+				else
+				{
+					string fence_flags = opencl_mem_fence_flags(semantics);
+					statement("sub_group_barrier(", fence_flags, ");");
+				}
 			}
 		}
 		else
@@ -4378,21 +5837,35 @@ void CompilerOpenCL::emit_instruction(const Instruction &instruction)
 		{
 			if (memory_scope == ScopeSubgroup)
 			{
-				if (!opencl_options.enable_subgroups)
-					SPIRV_CROSS_THROW("Subgroup memory barriers require enable_subgroups option.");
-
-				const uint32_t all_barriers = MemorySemanticsWorkgroupMemoryMask | MemorySemanticsUniformMemoryMask |
-				                              MemorySemanticsImageMemoryMask;
-
-				if ((semantics & all_barriers) == all_barriers ||
-				    (semantics & (MemorySemanticsCrossWorkgroupMemoryMask | MemorySemanticsSubgroupMemoryMask)))
+				if (opencl_options.emulate_subgroups && !opencl_options.enable_subgroups)
 				{
-					statement("sub_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);");
+					// Emulated: no-op for size==1, otherwise work_group_barrier
+					if (opencl_options.fixed_subgroup_size != 1)
+					{
+						if (opencl_options.supports_opencl_version(2, 0))
+							statement("work_group_barrier(CLK_LOCAL_MEM_FENCE);");
+						else
+							statement("barrier(CLK_LOCAL_MEM_FENCE);");
+					}
 				}
 				else
 				{
-					string fence_flags = opencl_mem_fence_flags(semantics);
-					statement("sub_group_barrier(", fence_flags, ");");
+					if (!opencl_options.enable_subgroups)
+						SPIRV_CROSS_THROW("Subgroup memory barriers require enable_subgroups option.");
+
+					const uint32_t all_barriers = MemorySemanticsWorkgroupMemoryMask |
+					                              MemorySemanticsUniformMemoryMask | MemorySemanticsImageMemoryMask;
+
+					if ((semantics & all_barriers) == all_barriers ||
+					    (semantics & (MemorySemanticsCrossWorkgroupMemoryMask | MemorySemanticsSubgroupMemoryMask)))
+					{
+						statement("sub_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);");
+					}
+					else
+					{
+						string fence_flags = opencl_mem_fence_flags(semantics);
+						statement("sub_group_barrier(", fence_flags, ");");
+					}
 				}
 			}
 			else
diff --git a/spirv_opencl.hpp b/spirv_opencl.hpp
index fc3962ddf..a5e6c4bee 100644
--- a/spirv_opencl.hpp
+++ b/spirv_opencl.hpp
@@ -57,6 +57,8 @@ class CompilerOpenCL : public CompilerGLSL
 		bool emulate_subgroups = false;
 		// Size of subgroup emulation
 		uint32_t fixed_subgroup_size = 0;
+		// Maximum workgroup size (used for scratch buffer sizing when reqd_work_group_size is absent)
+		uint32_t max_workgroup_size = 256;
 
 		void set_opencl_version(uint32_t major, uint32_t minor = 0, uint32_t patch = 0)
 		{
@@ -205,6 +207,20 @@ class CompilerOpenCL : public CompilerGLSL
 	bool needs_subgroup_clustered = false;
 	bool needs_subgroup_rotate = false;
 
+	// Subgroup emulation scratch buffer flags (set during emit, trigger force_recompile).
+	bool needs_subgroup_emulation_scratch = false;
+	bool needs_subgroup_emulation_scratch64 = false;
+	// Set of function IDs that need subgroup emulation scratch parameters threaded.
+	std::unordered_set<uint32_t> funcs_using_subgroup_emulation;
+	// Helpers to emit subgroup emulation polyfills and scratch infrastructure.
+	void emit_subgroup_emulation_helpers();
+	void emit_subgroup_emulation_entry_point_vars();
+	uint32_t get_emulation_max_workgroup_size() const;
+	std::string get_emulation_subgroup_size_expr() const;
+	void emit_subgroup_op_emulated(const Instruction &i);
+	std::string subgroup_emulation_scratch_type(bool is_64bit) const;
+	void scan_subgroup_emulation_usage();
+
 	// Matrix type support: tracks which matrix signatures (basetype, vecsize, columns) are needed.
 	struct MatrixTypeKey
 	{
diff --git a/test_shaders.py b/test_shaders.py
index 309fefd10..fb9ec52c5 100755
--- a/test_shaders.py
+++ b/test_shaders.py
@@ -586,9 +586,9 @@ def cross_compile_hlsl(shader, spirv, opt, force_no_external_validation, iterati
     return (spirv_path, hlsl_path)
 
 def path_to_opencl_standard_cli(shader):
-    # clang seems warn about cl_khr_subgroups unless is specified.
+    # clang seems to warn about cl_khr_subgroups unless 2.0 is specified.
     # Revisit when OpenCL 3.0 support is no longer experimental.
-    if '.subgroups.' in shader:
+    if '.subgroups.' in shader or '.subgroups-core.' in shader:
         return '200'
     # OpenCL 3.0 support in clang is experimental and 2.1 and 2.2 seem unsupported.
     if '.cl30.' in shader:
@@ -618,8 +618,10 @@ def validate_shader_opencl(shader, opt, paths):
     if '.fp64.' in shader:
         extensions.append('cl_khr_fp64')
     if '.subgroups-emulate.' in shader:
-        if '.subgroups.' in shader:
-            extensions.append('cl_khr_subgroups')
+        # Make sure no extensions are included
+        pass
+    elif '.subgroups-core.' in shader:
+        extensions.append('cl_khr_subgroups')
     elif '.subgroups.' in shader:
         extensions.append('cl_khr_subgroups')
         extensions.append('cl_khr_subgroup_ballot')
@@ -703,12 +705,14 @@ def cross_compile_opencl(shader, spirv, opt, iterations, paths):
         opencl_args.append('--opencl-fp16')
     if '.fp64.' in shader:
         opencl_args.append('--opencl-fp64')
-    if '.subgroups.' in shader:
-        opencl_args.append('--opencl-subgroups-all')
     if '.subgroups-emulate.' in shader:
+        if '.subgroups.' in shader:
+            opencl_args.append('--opencl-subgroups-all')
         opencl_args.append('--opencl-emulate-subgroups')
         opencl_args.append('--opencl-fixed-subgroup-size')
         opencl_args.append('32')
+    elif '.subgroups.' in shader:
+        opencl_args.append('--opencl-subgroups-all')
 
     if shader_is_invalid_spirv(shader):
         subprocess.run(opencl_args)

From b9f85fb90f935271d7b3c7eccbf961517d8c0960 Mon Sep 17 00:00:00 2001
From: Garrick Meeker <gmeeker@gmail.com>
Date: Tue, 17 Mar 2026 10:17:15 -0700
Subject: [PATCH 14/16] OpenCL: Support for partial subgroups emulation.

---
 ...subgroups-basic.nocompat.vk.subgroups.comp |  76 +-
 ...subgroups-basic.nocompat.vk.subgroups.comp |  12 +-
 spirv_opencl.cpp                              | 889 +++++++++++++++++-
 spirv_opencl.hpp                              |   3 +
 test_shaders.py                               |   8 +-
 5 files changed, 931 insertions(+), 57 deletions(-)

diff --git a/reference/shaders-opencl-no-opt/comp/subgroups-basic.nocompat.vk.subgroups.comp b/reference/shaders-opencl-no-opt/comp/subgroups-basic.nocompat.vk.subgroups.comp
index ab81e408a..32cba0e4e 100644
--- a/reference/shaders-opencl-no-opt/comp/subgroups-basic.nocompat.vk.subgroups.comp
+++ b/reference/shaders-opencl-no-opt/comp/subgroups-basic.nocompat.vk.subgroups.comp
@@ -1,6 +1,7 @@
 // Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
 
 #pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#pragma OPENCL EXTENSION cl_khr_subgroup_non_uniform_vote : enable
 
 struct SSBO
 {
@@ -11,46 +12,55 @@ struct SSBO
 
 typedef struct SSBO SSBO;
 
+float helper( float* val_1)
+{
+    float reduced_1 = sub_group_reduce_add((*val_1));
+    bool elected_1 = sub_group_elect();
+    return elected_1 ? reduced_1 : 0.0f;
+}
+
 __attribute__((reqd_work_group_size(256, 1, 1)))
-__kernel void comp_main(__global SSBO* _11)
+__kernel void comp_main(__global SSBO* _30)
 {
     uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
-    _11->FragColor = convert_float(get_num_sub_groups());
-    _11->FragColor = convert_float(get_sub_group_id());
-    _11->FragColor = convert_float(get_sub_group_size());
-    _11->FragColor = convert_float(get_sub_group_local_id());
+    _30->FragColor = convert_float(get_num_sub_groups());
+    _30->FragColor = convert_float(get_sub_group_id());
+    _30->FragColor = convert_float(get_sub_group_size());
+    _30->FragColor = convert_float(get_sub_group_local_id());
     sub_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
     sub_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
     sub_group_barrier(CLK_GLOBAL_MEM_FENCE);
     sub_group_barrier(CLK_LOCAL_MEM_FENCE);
     sub_group_barrier(CLK_GLOBAL_MEM_FENCE);
-    bool has_all = sub_group_all(true);
-    bool has_any = sub_group_any(true);
-    uint broadcasted = sub_group_broadcast(42u, 0u);
-    float fadd = sub_group_reduce_add(_11->FragColor);
-    int iadd = sub_group_reduce_add(_11->idat);
-    float fmin = sub_group_reduce_min(_11->FragColor);
-    float fmax = sub_group_reduce_max(_11->FragColor);
-    int smin = sub_group_reduce_min(_11->idat);
-    int smax = sub_group_reduce_max(_11->idat);
-    uint umin = sub_group_reduce_min(_11->udat);
-    uint umax = sub_group_reduce_max(_11->udat);
-    float finc_add = sub_group_scan_inclusive_add(_11->FragColor);
-    float finc_min = sub_group_scan_inclusive_min(_11->FragColor);
-    float finc_max = sub_group_scan_inclusive_max(_11->FragColor);
-    int iinc_add = sub_group_scan_inclusive_add(_11->idat);
-    int iinc_min = sub_group_scan_inclusive_min(_11->idat);
-    int iinc_max = sub_group_scan_inclusive_max(_11->idat);
-    uint uinc_min = sub_group_scan_inclusive_min(_11->udat);
-    uint uinc_max = sub_group_scan_inclusive_max(_11->udat);
-    float fexc_add = sub_group_scan_exclusive_add(_11->FragColor);
-    float fexc_min = sub_group_scan_exclusive_min(_11->FragColor);
-    float fexc_max = sub_group_scan_exclusive_max(_11->FragColor);
-    int iexc_add = sub_group_scan_exclusive_add(_11->idat);
-    int iexc_min = sub_group_scan_exclusive_min(_11->idat);
-    int iexc_max = sub_group_scan_exclusive_max(_11->idat);
-    uint uexc_min = sub_group_scan_exclusive_min(_11->udat);
-    uint uexc_max = sub_group_scan_exclusive_max(_11->udat);
-    _11->FragColor = (((((((((((fadd + fmin) + fmax) + finc_add) + finc_min) + finc_max) + fexc_add) + fexc_min) + fexc_max) + convert_float((((((((iadd + smin) + smax) + iinc_add) + iinc_min) + iinc_max) + iexc_add) + iexc_min) + iexc_max)) + convert_float((((((umin + umax) + uinc_min) + uinc_max) + uexc_min) + uexc_max) + broadcasted)) + (float)(has_all)) + (float)(has_any);
+    bool has_all_1 = sub_group_all(true);
+    bool has_any_1 = sub_group_any(true);
+    uint broadcasted_1 = sub_group_broadcast(42u, 0u);
+    float fadd_1 = sub_group_reduce_add(_30->FragColor);
+    int iadd_1 = sub_group_reduce_add(_30->idat);
+    float fmin_1 = sub_group_reduce_min(_30->FragColor);
+    float fmax_1 = sub_group_reduce_max(_30->FragColor);
+    int smin_1 = sub_group_reduce_min(_30->idat);
+    int smax_1 = sub_group_reduce_max(_30->idat);
+    uint umin_1 = sub_group_reduce_min(_30->udat);
+    uint umax_1 = sub_group_reduce_max(_30->udat);
+    float finc_add_1 = sub_group_scan_inclusive_add(_30->FragColor);
+    float finc_min_1 = sub_group_scan_inclusive_min(_30->FragColor);
+    float finc_max_1 = sub_group_scan_inclusive_max(_30->FragColor);
+    int iinc_add_1 = sub_group_scan_inclusive_add(_30->idat);
+    int iinc_min_1 = sub_group_scan_inclusive_min(_30->idat);
+    int iinc_max_1 = sub_group_scan_inclusive_max(_30->idat);
+    uint uinc_min_1 = sub_group_scan_inclusive_min(_30->udat);
+    uint uinc_max_1 = sub_group_scan_inclusive_max(_30->udat);
+    float fexc_add_1 = sub_group_scan_exclusive_add(_30->FragColor);
+    float fexc_min_1 = sub_group_scan_exclusive_min(_30->FragColor);
+    float fexc_max_1 = sub_group_scan_exclusive_max(_30->FragColor);
+    int iexc_add_1 = sub_group_scan_exclusive_add(_30->idat);
+    int iexc_min_1 = sub_group_scan_exclusive_min(_30->idat);
+    int iexc_max_1 = sub_group_scan_exclusive_max(_30->idat);
+    uint uexc_min_1 = sub_group_scan_exclusive_min(_30->udat);
+    uint uexc_max_1 = sub_group_scan_exclusive_max(_30->udat);
+    float param_1 = _30->FragColor;
+    float from_helper_1 = helper(&param_1);
+    _30->FragColor = ((((((((((((fadd_1 + fmin_1) + fmax_1) + finc_add_1) + finc_min_1) + finc_max_1) + fexc_add_1) + fexc_min_1) + fexc_max_1) + convert_float((((((((iadd_1 + smin_1) + smax_1) + iinc_add_1) + iinc_min_1) + iinc_max_1) + iexc_add_1) + iexc_min_1) + iexc_max_1)) + convert_float((((((umin_1 + umax_1) + uinc_min_1) + uinc_max_1) + uexc_min_1) + uexc_max_1) + broadcasted_1)) + (float)(has_all_1)) + (float)(has_any_1)) + from_helper_1;
 }
 
diff --git a/shaders-opencl-no-opt/comp/subgroups-basic.nocompat.vk.subgroups.comp b/shaders-opencl-no-opt/comp/subgroups-basic.nocompat.vk.subgroups.comp
index da981bccf..927d5e9f4 100644
--- a/shaders-opencl-no-opt/comp/subgroups-basic.nocompat.vk.subgroups.comp
+++ b/shaders-opencl-no-opt/comp/subgroups-basic.nocompat.vk.subgroups.comp
@@ -12,6 +12,13 @@ layout(std430, binding = 0) buffer SSBO
 	uint udat;
 };
 
+float helper(float val)
+{
+	float reduced = subgroupAdd(val);
+	bool elected = subgroupElect();
+	return elected ? reduced : 0.0;
+}
+
 void main()
 {
 	// Builtins
@@ -64,9 +71,12 @@ void main()
 	uint uexc_min = subgroupExclusiveMin(udat);
 	uint uexc_max = subgroupExclusiveMax(udat);
 
+	// Call helper function that uses subgroup ops
+	float from_helper = helper(FragColor);
+
 	// Write results to prevent dead-code elimination
 	FragColor = fadd + fmin + fmax + finc_add + finc_min + finc_max + fexc_add + fexc_min + fexc_max
 		+ float(iadd + smin + smax + iinc_add + iinc_min + iinc_max + iexc_add + iexc_min + iexc_max)
 		+ float(umin + umax + uinc_min + uinc_max + uexc_min + uexc_max + broadcasted)
-		+ float(has_all) + float(has_any);
+		+ float(has_all) + float(has_any) + from_helper;
 }
diff --git a/spirv_opencl.cpp b/spirv_opencl.cpp
index 6bddfa00f..b820f447c 100644
--- a/spirv_opencl.cpp
+++ b/spirv_opencl.cpp
@@ -181,20 +181,26 @@ void CompilerOpenCL::emit_header()
 		statement("#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable");
 	if (opencl_options.enable_subgroups)
 		statement("#pragma OPENCL EXTENSION cl_khr_subgroups : enable");
-	if (needs_subgroup_vote)
-		statement("#pragma OPENCL EXTENSION cl_khr_subgroup_non_uniform_vote : enable");
-	if (needs_subgroup_ballot)
-		statement("#pragma OPENCL EXTENSION cl_khr_subgroup_ballot : enable");
-	if (needs_subgroup_arithmetic)
-		statement("#pragma OPENCL EXTENSION cl_khr_subgroup_non_uniform_arithmetic : enable");
-	if (needs_subgroup_shuffle)
-		statement("#pragma OPENCL EXTENSION cl_khr_subgroup_shuffle : enable");
-	if (needs_subgroup_shuffle_relative)
-		statement("#pragma OPENCL EXTENSION cl_khr_subgroup_shuffle_relative : enable");
-	if (needs_subgroup_clustered)
-		statement("#pragma OPENCL EXTENSION cl_khr_subgroup_clustered_reduce : enable");
-	if (needs_subgroup_rotate)
-		statement("#pragma OPENCL EXTENSION cl_khr_subgroup_rotate : enable");
+	// In combined mode, extension-specific pragmas are emitted inside #ifdef blocks
+	// in the wrapper section, not here.
+	bool combined_subgroup_mode = opencl_options.emulate_subgroups && opencl_options.enable_subgroups;
+	if (!combined_subgroup_mode)
+	{
+		if (needs_subgroup_vote)
+			statement("#pragma OPENCL EXTENSION cl_khr_subgroup_non_uniform_vote : enable");
+		if (needs_subgroup_ballot)
+			statement("#pragma OPENCL EXTENSION cl_khr_subgroup_ballot : enable");
+		if (needs_subgroup_arithmetic)
+			statement("#pragma OPENCL EXTENSION cl_khr_subgroup_non_uniform_arithmetic : enable");
+		if (needs_subgroup_shuffle)
+			statement("#pragma OPENCL EXTENSION cl_khr_subgroup_shuffle : enable");
+		if (needs_subgroup_shuffle_relative)
+			statement("#pragma OPENCL EXTENSION cl_khr_subgroup_shuffle_relative : enable");
+		if (needs_subgroup_clustered)
+			statement("#pragma OPENCL EXTENSION cl_khr_subgroup_clustered_reduce : enable");
+		if (needs_subgroup_rotate)
+			statement("#pragma OPENCL EXTENSION cl_khr_subgroup_rotate : enable");
+	}
 	statement("");
 
 	// Emit FP_CONTRACT pragma based on ContractionOff execution mode and FPFastMathDefault.
@@ -771,8 +777,11 @@ void CompilerOpenCL::emit_resources()
 		statement("");
 	}
 
-	// Subgroup emulation helper functions.
-	emit_subgroup_emulation_helpers();
+	// Subgroup emulation helper functions and combined-mode wrappers.
+	if (opencl_options.emulate_subgroups && opencl_options.enable_subgroups)
+		emit_subgroup_combined_wrappers();
+	else
+		emit_subgroup_emulation_helpers();
 
 	// Default sampler for combined image+sampler usage (OpenCL C requires file-scope const sampler_t).
 	if (needs_default_sampler)
@@ -1003,7 +1012,9 @@ void CompilerOpenCL::emit_entry_point_declarations()
 	}
 
 	// Emit subgroup emulation local variables and scratch buffers.
-	if (opencl_options.emulate_subgroups && !opencl_options.enable_subgroups)
+	if (opencl_options.emulate_subgroups && opencl_options.enable_subgroups)
+		emit_subgroup_combined_entry_point_vars();
+	else if (opencl_options.emulate_subgroups && !opencl_options.enable_subgroups)
 		emit_subgroup_emulation_entry_point_vars();
 
 	// Materialize Input builtin variables as local variables.
@@ -1132,6 +1143,20 @@ string CompilerOpenCL::builtin_to_glsl(BuiltIn builtin, StorageClass storage)
 			}
 			return "spv_subgroup_eq_mask(_spv_lane_id)";
 		}
+		if (opencl_options.emulate_subgroups && opencl_options.enable_subgroups)
+		{
+			if (!needs_subgroup_ballot)
+			{
+				needs_subgroup_ballot = true;
+				force_recompile();
+			}
+			if (!needs_subgroup_emulation_scratch)
+			{
+				needs_subgroup_emulation_scratch = true;
+				force_recompile();
+			}
+			return "spv_get_sub_group_eq_mask(_spv_lane_id)";
+		}
 		if (!opencl_options.enable_subgroups)
 			SPIRV_CROSS_THROW("Subgroup builtins require enable_subgroups option.");
 		if (!needs_subgroup_ballot)
@@ -1150,6 +1175,20 @@ string CompilerOpenCL::builtin_to_glsl(BuiltIn builtin, StorageClass storage)
 			}
 			return "spv_subgroup_ge_mask(_spv_lane_id, _spv_subgroup_size)";
 		}
+		if (opencl_options.emulate_subgroups && opencl_options.enable_subgroups)
+		{
+			if (!needs_subgroup_ballot)
+			{
+				needs_subgroup_ballot = true;
+				force_recompile();
+			}
+			if (!needs_subgroup_emulation_scratch)
+			{
+				needs_subgroup_emulation_scratch = true;
+				force_recompile();
+			}
+			return "spv_get_sub_group_ge_mask(_spv_lane_id, _spv_subgroup_size)";
+		}
 		if (!opencl_options.enable_subgroups)
 			SPIRV_CROSS_THROW("Subgroup builtins require enable_subgroups option.");
 		if (!needs_subgroup_ballot)
@@ -1168,6 +1207,20 @@ string CompilerOpenCL::builtin_to_glsl(BuiltIn builtin, StorageClass storage)
 			}
 			return "spv_subgroup_gt_mask(_spv_lane_id, _spv_subgroup_size)";
 		}
+		if (opencl_options.emulate_subgroups && opencl_options.enable_subgroups)
+		{
+			if (!needs_subgroup_ballot)
+			{
+				needs_subgroup_ballot = true;
+				force_recompile();
+			}
+			if (!needs_subgroup_emulation_scratch)
+			{
+				needs_subgroup_emulation_scratch = true;
+				force_recompile();
+			}
+			return "spv_get_sub_group_gt_mask(_spv_lane_id, _spv_subgroup_size)";
+		}
 		if (!opencl_options.enable_subgroups)
 			SPIRV_CROSS_THROW("Subgroup builtins require enable_subgroups option.");
 		if (!needs_subgroup_ballot)
@@ -1186,6 +1239,20 @@ string CompilerOpenCL::builtin_to_glsl(BuiltIn builtin, StorageClass storage)
 			}
 			return "spv_subgroup_le_mask(_spv_lane_id, _spv_subgroup_size)";
 		}
+		if (opencl_options.emulate_subgroups && opencl_options.enable_subgroups)
+		{
+			if (!needs_subgroup_ballot)
+			{
+				needs_subgroup_ballot = true;
+				force_recompile();
+			}
+			if (!needs_subgroup_emulation_scratch)
+			{
+				needs_subgroup_emulation_scratch = true;
+				force_recompile();
+			}
+			return "spv_get_sub_group_le_mask(_spv_lane_id, _spv_subgroup_size)";
+		}
 		if (!opencl_options.enable_subgroups)
 			SPIRV_CROSS_THROW("Subgroup builtins require enable_subgroups option.");
 		if (!needs_subgroup_ballot)
@@ -1204,6 +1271,20 @@ string CompilerOpenCL::builtin_to_glsl(BuiltIn builtin, StorageClass storage)
 			}
 			return "spv_subgroup_lt_mask(_spv_lane_id)";
 		}
+		if (opencl_options.emulate_subgroups && opencl_options.enable_subgroups)
+		{
+			if (!needs_subgroup_ballot)
+			{
+				needs_subgroup_ballot = true;
+				force_recompile();
+			}
+			if (!needs_subgroup_emulation_scratch)
+			{
+				needs_subgroup_emulation_scratch = true;
+				force_recompile();
+			}
+			return "spv_get_sub_group_lt_mask(_spv_lane_id)";
+		}
 		if (!opencl_options.enable_subgroups)
 			SPIRV_CROSS_THROW("Subgroup builtins require enable_subgroups option.");
 		if (!needs_subgroup_ballot)
@@ -3533,7 +3614,7 @@ void CompilerOpenCL::emit_subgroup_emulation_entry_point_vars()
 
 void CompilerOpenCL::scan_subgroup_emulation_usage()
 {
-	if (!opencl_options.emulate_subgroups || opencl_options.enable_subgroups)
+	if (!opencl_options.emulate_subgroups)
 		return;
 
 	funcs_using_subgroup_emulation.clear();
@@ -4179,6 +4260,739 @@ void CompilerOpenCL::emit_subgroup_emulation_helpers()
 	statement("");
 }
 
+// Declares the subgroup geometry locals and, when requested, the __local scratch arrays
+// used by the emulation fallbacks. Called from emit_entry_point_declarations() in combined mode.
+void CompilerOpenCL::emit_subgroup_combined_entry_point_vars()
+{
+	// Geometry comes straight from the native cl_khr_subgroups builtins.
+	statement("uint _spv_subgroup_size = get_sub_group_size();");
+	statement("uint _spv_lane_id = get_sub_group_local_id();");
+	statement("uint _spv_subgroup_id = get_sub_group_id();");
+	// Stride scratch slots by get_max_sub_group_size(), not get_sub_group_size(): the last subgroup of a
+	// work-group may be partial, and the smaller stride would alias its slots with an earlier subgroup's.
+	statement("uint _spv_linear_id = _spv_subgroup_id * get_max_sub_group_size() + _spv_lane_id;");
+	statement("uint _spv_subgroup_base = _spv_subgroup_id * get_max_sub_group_size();");
+
+	// Each scratch array is declared only if its needs_* flag was raised during compilation,
+	// and is sized by get_emulation_max_workgroup_size().
+	if (needs_subgroup_emulation_scratch)
+		statement("__local uint _spv_subgroup_scratch[", get_emulation_max_workgroup_size(), "];");
+	if (needs_subgroup_emulation_scratch64)
+		statement("__local ulong _spv_subgroup_scratch64[", get_emulation_max_workgroup_size(), "];");
+}
+
+void CompilerOpenCL::emit_subgroup_combined_wrappers()
+{
+	if (!opencl_options.emulate_subgroups || !opencl_options.enable_subgroups)
+		return;
+	if (!needs_subgroup_emulation_scratch)
+		return;
+
+	// Combined mode emits both implementations: first the emulation helpers (same emitter as pure
+	// emulation mode; unused static helpers are left for the OpenCL compiler to discard).
+	emit_subgroup_emulation_helpers();
+
+	// Then, for each extension whose needs_subgroup_* flag is set, one #ifdef/#else/#endif block:
+	//   #ifdef: enable the extension; spv_* macros ignore the emulation-only arguments and call native builtins.
+	//   #else:  spv_* macros forward all arguments to the emulation helpers emitted above.
+
+	// === cl_khr_subgroup_non_uniform_vote ===
+	if (needs_subgroup_vote)
+	{
+		statement("#ifdef cl_khr_subgroup_non_uniform_vote");
+		statement("#pragma OPENCL EXTENSION cl_khr_subgroup_non_uniform_vote : enable");
+		statement("#define spv_sub_group_elect(lane_id) sub_group_elect()");
+		statement("#define spv_all_equal_uint(scratch, val, linear_id, subgroup_base, subgroup_size) "
+		          "sub_group_non_uniform_all_equal((val))");
+		statement("#else");
+		statement("#define spv_sub_group_elect(lane_id) ((lane_id) == 0u)");
+		statement("#define spv_all_equal_uint(scratch, val, linear_id, subgroup_base, subgroup_size) "
+		          "spv_emulate_all_equal_uint((scratch), (val), (linear_id), (subgroup_base), (subgroup_size))");
+		statement("#endif");
+		statement("");
+	}
+
+	// === cl_khr_subgroup_ballot ===
+	if (needs_subgroup_ballot)
+	{
+		statement("#ifdef cl_khr_subgroup_ballot");
+		statement("#pragma OPENCL EXTENSION cl_khr_subgroup_ballot : enable");
+		// Native path: scratch/linear_id/subgroup_base/lane_id parameters are accepted but unused.
+		statement("#define spv_broadcast_first_uint(scratch, val, linear_id, subgroup_base) "
+		          "sub_group_broadcast_first((val))");
+		statement("#define spv_ballot(scratch, pred, linear_id, subgroup_base, subgroup_size) "
+		          "sub_group_ballot((pred))");
+		statement("#define spv_inverse_ballot(ballot, lane_id) "
+		          "sub_group_inverse_ballot((ballot))");
+		statement("#define spv_ballot_bit_extract(ballot, idx) "
+		          "sub_group_ballot_bit_extract((ballot), (idx))");
+		statement("#define spv_ballot_bit_count(ballot) "
+		          "sub_group_ballot_bit_count((ballot))");
+		statement("#define spv_ballot_inclusive_bit_count(ballot, lane_id) "
+		          "sub_group_ballot_inclusive_scan((ballot))");
+		statement("#define spv_ballot_exclusive_bit_count(ballot, lane_id) "
+		          "sub_group_ballot_exclusive_scan((ballot))");
+		statement("#define spv_ballot_find_lsb(ballot) sub_group_ballot_find_lsb((ballot))");
+		statement("#define spv_ballot_find_msb(ballot) sub_group_ballot_find_msb((ballot))");
+		// Mask builtins: the native get_sub_group_*_mask() calls take no arguments.
+		statement("#define spv_get_sub_group_eq_mask(lane_id) get_sub_group_eq_mask()");
+		statement("#define spv_get_sub_group_ge_mask(lane_id, sg_size) get_sub_group_ge_mask()");
+		statement("#define spv_get_sub_group_gt_mask(lane_id, sg_size) get_sub_group_gt_mask()");
+		statement("#define spv_get_sub_group_le_mask(lane_id, sg_size) get_sub_group_le_mask()");
+		statement("#define spv_get_sub_group_lt_mask(lane_id) get_sub_group_lt_mask()");
+		statement("#else");
+		// Emulation path: same macro names, every argument forwarded to the matching helper.
+		statement("#define spv_broadcast_first_uint(scratch, val, linear_id, subgroup_base) "
+		          "spv_emulate_broadcast_first_uint((scratch), (val), (linear_id), (subgroup_base))");
+		statement("#define spv_ballot(scratch, pred, linear_id, subgroup_base, subgroup_size) "
+		          "spv_emulate_ballot((scratch), (pred), (linear_id), (subgroup_base), (subgroup_size))");
+		statement("#define spv_inverse_ballot(ballot, lane_id) "
+		          "spv_emulate_inverse_ballot((ballot), (lane_id))");
+		statement("#define spv_ballot_bit_extract(ballot, idx) "
+		          "spv_emulate_ballot_bit_extract((ballot), (idx))");
+		statement("#define spv_ballot_bit_count(ballot) "
+		          "spv_emulate_ballot_bit_count((ballot))");
+		statement("#define spv_ballot_inclusive_bit_count(ballot, lane_id) "
+		          "spv_emulate_ballot_inclusive_bit_count((ballot), (lane_id))");
+		statement("#define spv_ballot_exclusive_bit_count(ballot, lane_id) "
+		          "spv_emulate_ballot_exclusive_bit_count((ballot), (lane_id))");
+		statement("#define spv_ballot_find_lsb(ballot) spv_emulate_ballot_find_lsb((ballot))");
+		statement("#define spv_ballot_find_msb(ballot) spv_emulate_ballot_find_msb((ballot))");
+		statement("#define spv_get_sub_group_eq_mask(lane_id) spv_subgroup_eq_mask((lane_id))");
+		statement("#define spv_get_sub_group_ge_mask(lane_id, sg_size) spv_subgroup_ge_mask((lane_id), (sg_size))");
+		statement("#define spv_get_sub_group_gt_mask(lane_id, sg_size) spv_subgroup_gt_mask((lane_id), (sg_size))");
+		statement("#define spv_get_sub_group_le_mask(lane_id, sg_size) spv_subgroup_le_mask((lane_id), (sg_size))");
+		statement("#define spv_get_sub_group_lt_mask(lane_id) spv_subgroup_lt_mask((lane_id))");
+		statement("#endif");
+		statement("");
+	}
+
+	// === cl_khr_subgroup_shuffle ===
+	if (needs_subgroup_shuffle)
+	{
+		statement("#ifdef cl_khr_subgroup_shuffle");
+		statement("#pragma OPENCL EXTENSION cl_khr_subgroup_shuffle : enable");
+		statement("#define spv_shuffle_uint(scratch, val, idx, linear_id, subgroup_base) "
+		          "sub_group_shuffle((val), (idx))");
+		statement("#define spv_shuffle_xor_uint(scratch, val, mask, lane_id, linear_id, subgroup_base) "
+		          "sub_group_shuffle_xor((val), (mask))");
+		statement("#else");
+		statement("#define spv_shuffle_uint(scratch, val, idx, linear_id, subgroup_base) "
+		          "spv_emulate_shuffle_uint((scratch), (val), (idx), (linear_id), (subgroup_base))");
+		statement("#define spv_shuffle_xor_uint(scratch, val, mask, lane_id, linear_id, subgroup_base) "
+		          "spv_emulate_shuffle_xor_uint((scratch), (val), (mask), (lane_id), (linear_id), (subgroup_base))");
+		statement("#endif");
+		statement("");
+	}
+
+	// === cl_khr_subgroup_shuffle_relative ===
+	if (needs_subgroup_shuffle_relative)
+	{
+		statement("#ifdef cl_khr_subgroup_shuffle_relative");
+		statement("#pragma OPENCL EXTENSION cl_khr_subgroup_shuffle_relative : enable");
+		statement("#define spv_shuffle_up_uint(scratch, val, delta, lane_id, linear_id, subgroup_base) "
+		          "sub_group_shuffle_up((val), (delta))");
+		statement("#define spv_shuffle_down_uint(scratch, val, delta, lane_id, linear_id, subgroup_size) "
+		          "sub_group_shuffle_down((val), (delta))");
+		statement("#else");
+		statement("#define spv_shuffle_up_uint(scratch, val, delta, lane_id, linear_id, subgroup_base) "
+		          "spv_emulate_shuffle_up_uint((scratch), (val), (delta), (lane_id), (linear_id), (subgroup_base))");
+		statement("#define spv_shuffle_down_uint(scratch, val, delta, lane_id, linear_id, subgroup_size) "
+		          "spv_emulate_shuffle_down_uint((scratch), (val), (delta), (lane_id), (linear_id), (subgroup_size))");
+		statement("#endif");
+		statement("");
+	}
+
+	// === cl_khr_subgroup_rotate ===
+	if (needs_subgroup_rotate)
+	{
+		statement("#ifdef cl_khr_subgroup_rotate");
+		statement("#pragma OPENCL EXTENSION cl_khr_subgroup_rotate : enable");
+		statement("#define spv_rotate_uint(scratch, val, delta, lane_id, linear_id, subgroup_base, subgroup_size) "
+		          "sub_group_rotate((val), (delta))");
+		statement("#define spv_clustered_rotate_uint(scratch, val, delta, lane_id, linear_id, subgroup_base, "
+		          "cluster_size) sub_group_clustered_rotate((val), (delta), (cluster_size))");
+		statement("#else");
+		statement("#define spv_rotate_uint(scratch, val, delta, lane_id, linear_id, subgroup_base, subgroup_size) "
+		          "spv_emulate_rotate_uint((scratch), (val), (delta), (lane_id), (linear_id), (subgroup_base), "
+		          "(subgroup_size))");
+		statement("#define spv_clustered_rotate_uint(scratch, val, delta, lane_id, linear_id, subgroup_base, "
+		          "cluster_size) spv_emulate_clustered_rotate_uint((scratch), (val), (delta), (lane_id), "
+		          "(linear_id), (subgroup_base), (cluster_size))");
+		statement("#endif");
+		statement("");
+	}
+
+	// === cl_khr_subgroup_non_uniform_arithmetic ===
+	// Defines reduce/inclusive/exclusive-scan macros for mul, and, or, xor, logical_*; none for add/min/max.
+	if (needs_subgroup_arithmetic)
+	{
+		statement("#ifdef cl_khr_subgroup_non_uniform_arithmetic");
+		statement("#pragma OPENCL EXTENSION cl_khr_subgroup_non_uniform_arithmetic : enable");
+
+		// Emits the three native macros (reduce, inclusive scan, exclusive scan) for one op/type suffix.
+		// The same native function name is passed for every type suffix of an op; only the macro name differs.
+		auto emit_arith_macros_native = [&](const char *op_suffix, const char *native_reduce,
+		                                    const char *native_inclusive, const char *native_exclusive)
+		{
+			statement("#define spv_reduce_", op_suffix, "(scratch, val, linear_id, subgroup_base, subgroup_size) ",
+			          native_reduce, "((val))");
+			statement("#define spv_inclusive_scan_", op_suffix,
+			          "(scratch, val, lane_id, linear_id, subgroup_base, subgroup_size) ", native_inclusive, "((val))");
+			statement("#define spv_exclusive_scan_", op_suffix,
+			          "(scratch, val, lane_id, linear_id, subgroup_base, subgroup_size) ", native_exclusive, "((val))");
+		};
+
+		emit_arith_macros_native("mul_uint", "sub_group_non_uniform_reduce_mul",
+		                         "sub_group_non_uniform_scan_inclusive_mul",
+		                         "sub_group_non_uniform_scan_exclusive_mul");
+		emit_arith_macros_native("mul_int", "sub_group_non_uniform_reduce_mul",
+		                         "sub_group_non_uniform_scan_inclusive_mul",
+		                         "sub_group_non_uniform_scan_exclusive_mul");
+		emit_arith_macros_native("mul_float", "sub_group_non_uniform_reduce_mul",
+		                         "sub_group_non_uniform_scan_inclusive_mul",
+		                         "sub_group_non_uniform_scan_exclusive_mul");
+		emit_arith_macros_native("and_uint", "sub_group_non_uniform_reduce_and",
+		                         "sub_group_non_uniform_scan_inclusive_and",
+		                         "sub_group_non_uniform_scan_exclusive_and");
+		emit_arith_macros_native("and_int", "sub_group_non_uniform_reduce_and",
+		                         "sub_group_non_uniform_scan_inclusive_and",
+		                         "sub_group_non_uniform_scan_exclusive_and");
+		emit_arith_macros_native("or_uint", "sub_group_non_uniform_reduce_or",
+		                         "sub_group_non_uniform_scan_inclusive_or", "sub_group_non_uniform_scan_exclusive_or");
+		emit_arith_macros_native("or_int", "sub_group_non_uniform_reduce_or", "sub_group_non_uniform_scan_inclusive_or",
+		                         "sub_group_non_uniform_scan_exclusive_or");
+		emit_arith_macros_native("xor_uint", "sub_group_non_uniform_reduce_xor",
+		                         "sub_group_non_uniform_scan_inclusive_xor",
+		                         "sub_group_non_uniform_scan_exclusive_xor");
+		emit_arith_macros_native("xor_int", "sub_group_non_uniform_reduce_xor",
+		                         "sub_group_non_uniform_scan_inclusive_xor",
+		                         "sub_group_non_uniform_scan_exclusive_xor");
+		emit_arith_macros_native("logical_and", "sub_group_non_uniform_reduce_logical_and",
+		                         "sub_group_non_uniform_scan_inclusive_logical_and",
+		                         "sub_group_non_uniform_scan_exclusive_logical_and");
+		emit_arith_macros_native("logical_or", "sub_group_non_uniform_reduce_logical_or",
+		                         "sub_group_non_uniform_scan_inclusive_logical_or",
+		                         "sub_group_non_uniform_scan_exclusive_logical_or");
+		emit_arith_macros_native("logical_xor", "sub_group_non_uniform_reduce_logical_xor",
+		                         "sub_group_non_uniform_scan_inclusive_logical_xor",
+		                         "sub_group_non_uniform_scan_exclusive_logical_xor");
+
+		statement("#else");
+
+		auto emit_arith_macros_emulated = [&](const char *op_suffix)
+		{
+			statement("#define spv_reduce_", op_suffix,
+			          "(scratch, val, linear_id, subgroup_base, subgroup_size) "
+			          "spv_emulate_reduce_",
+			          op_suffix, "((scratch), (val), (linear_id), (subgroup_base), (subgroup_size))");
+			statement("#define spv_inclusive_scan_", op_suffix,
+			          "(scratch, val, lane_id, linear_id, subgroup_base, subgroup_size) "
+			          "spv_emulate_inclusive_scan_",
+			          op_suffix, "((scratch), (val), (lane_id), (linear_id), (subgroup_base), (subgroup_size))");
+			statement("#define spv_exclusive_scan_", op_suffix,
+			          "(scratch, val, lane_id, linear_id, subgroup_base, subgroup_size) "
+			          "spv_emulate_exclusive_scan_",
+			          op_suffix, "((scratch), (val), (lane_id), (linear_id), (subgroup_base), (subgroup_size))");
+		};
+
+		for (const char *suffix : { "mul_uint", "mul_int", "mul_float", "and_uint", "and_int", "or_uint", "or_int",
+		                            "xor_uint", "xor_int", "logical_and", "logical_or", "logical_xor" })
+			emit_arith_macros_emulated(suffix);
+
+		statement("#endif");
+		statement("");
+	}
+
+	// === cl_khr_subgroup_clustered_reduce ===
+	// Defines spv_clustered_reduce_* for every op, including add/min/max (unlike the arithmetic block above).
+	if (needs_subgroup_clustered)
+	{
+		statement("#ifdef cl_khr_subgroup_clustered_reduce");
+		statement("#pragma OPENCL EXTENSION cl_khr_subgroup_clustered_reduce : enable");
+
+		auto emit_clustered_native = [&](const char *op_suffix, const char *native_func)
+		{
+			statement("#define spv_clustered_reduce_", op_suffix,
+			          "(scratch, val, cluster, lane_id, linear_id, subgroup_base, subgroup_size) ", native_func,
+			          "((val), (cluster))");
+		};
+
+		emit_clustered_native("add_uint", "sub_group_clustered_reduce_add");
+		emit_clustered_native("add_int", "sub_group_clustered_reduce_add");
+		emit_clustered_native("add_float", "sub_group_clustered_reduce_add");
+		emit_clustered_native("mul_uint", "sub_group_clustered_reduce_mul");
+		emit_clustered_native("mul_int", "sub_group_clustered_reduce_mul");
+		emit_clustered_native("mul_float", "sub_group_clustered_reduce_mul");
+		emit_clustered_native("min_uint", "sub_group_clustered_reduce_min");
+		emit_clustered_native("min_int", "sub_group_clustered_reduce_min");
+		emit_clustered_native("min_float", "sub_group_clustered_reduce_min");
+		emit_clustered_native("max_uint", "sub_group_clustered_reduce_max");
+		emit_clustered_native("max_int", "sub_group_clustered_reduce_max");
+		emit_clustered_native("max_float", "sub_group_clustered_reduce_max");
+		emit_clustered_native("and_uint", "sub_group_clustered_reduce_and");
+		emit_clustered_native("and_int", "sub_group_clustered_reduce_and");
+		emit_clustered_native("or_uint", "sub_group_clustered_reduce_or");
+		emit_clustered_native("or_int", "sub_group_clustered_reduce_or");
+		emit_clustered_native("xor_uint", "sub_group_clustered_reduce_xor");
+		emit_clustered_native("xor_int", "sub_group_clustered_reduce_xor");
+		emit_clustered_native("logical_and", "sub_group_clustered_reduce_logical_and");
+		emit_clustered_native("logical_or", "sub_group_clustered_reduce_logical_or");
+		emit_clustered_native("logical_xor", "sub_group_clustered_reduce_logical_xor");
+
+		statement("#else");
+
+		auto emit_clustered_emulated = [&](const char *op_suffix)
+		{
+			statement("#define spv_clustered_reduce_", op_suffix,
+			          "(scratch, val, cluster, lane_id, linear_id, subgroup_base, subgroup_size) "
+			          "spv_emulate_clustered_reduce_",
+			          op_suffix,
+			          "((scratch), (val), (cluster), (lane_id), (linear_id), (subgroup_base), (subgroup_size))");
+		};
+
+		for (const char *suffix :
+		     { "add_uint", "add_int",   "add_float", "mul_uint", "mul_int",     "mul_float",  "min_uint",
+		       "min_int",  "min_float", "max_uint",  "max_int",  "max_float",   "and_uint",   "and_int",
+		       "or_uint",  "or_int",    "xor_uint",  "xor_int",  "logical_and", "logical_or", "logical_xor" })
+			emit_clustered_emulated(suffix);
+
+		statement("#endif");
+		statement("");
+	}
+}
+
+void CompilerOpenCL::emit_subgroup_op_combined(const Instruction &i)
+{
+	// Combined mode: emit spv_* wrapper macro calls for non-base ops.
+	// Base ops (all, any, broadcast, add/min/max reduce/scan) are handled by the native path.
+	const uint32_t *ops = stream(i);
+	auto op = static_cast<Op>(i.op);
+
+	auto scope = static_cast<Scope>(evaluate_constant_u32(ops[2]));
+	if (scope != ScopeSubgroup)
+		SPIRV_CROSS_THROW("Only subgroup scope is supported.");
+
+	uint32_t result_type = ops[0];
+	uint32_t id = ops[1];
+
+	auto require_extension = [this](bool &flag)
+	{
+		if (!flag)
+		{
+			flag = true;
+			force_recompile();
+		}
+	};
+
+	auto require_scratch = [this]()
+	{
+		if (!needs_subgroup_emulation_scratch)
+		{
+			needs_subgroup_emulation_scratch = true;
+			force_recompile();
+		}
+	};
+
+	// Helper for vector decomposition with spv_* macros (uint-based, like emulated path)
+	auto to_uint_cast = [&](uint32_t value_id) -> string
+	{
+		auto &type = expression_type(value_id);
+		if (type.basetype == SPIRType::UInt)
+			return to_expression(value_id);
+		else if (type.basetype == SPIRType::Int)
+			return join("as_uint(", to_expression(value_id), ")");
+		else if (type.basetype == SPIRType::Float)
+			return join("as_uint(", to_expression(value_id), ")");
+		else if (type.basetype == SPIRType::Boolean)
+			return join("(", to_expression(value_id), " ? 1u : 0u)");
+		return to_expression(value_id);
+	};
+
+	auto from_uint_cast = [&](const string &expr, uint32_t value_id) -> string
+	{
+		auto &type = expression_type(value_id);
+		if (type.basetype == SPIRType::Int)
+			return join("as_int(", expr, ")");
+		else if (type.basetype == SPIRType::Float)
+			return join("as_float(", expr, ")");
+		return expr;
+	};
+
+	// Vector decomposition calling a spv_* macro per component
+	auto emit_combined_vec = [&](uint32_t value_id, const string &prefix, const string &suffix)
+	{
+		auto &type = expression_type(value_id);
+		if (type.vecsize > 1)
+		{
+			auto &out_type = get<SPIRType>(result_type);
+			string expr = "(" + type_to_glsl(out_type) + ")(";
+			for (uint32_t c = 0; c < type.vecsize; c++)
+			{
+				if (c > 0)
+					expr += ", ";
+				string comp = join(to_enclosed_expression(value_id), ".", "xyzw"[c]);
+				string as_uint_comp;
+				if (type.basetype == SPIRType::UInt)
+					as_uint_comp = comp;
+				else if (type.basetype == SPIRType::Int)
+					as_uint_comp = join("as_uint(", comp, ")");
+				else if (type.basetype == SPIRType::Float)
+					as_uint_comp = join("as_uint(", comp, ")");
+				else
+					as_uint_comp = comp;
+				string result_comp = prefix + as_uint_comp + suffix;
+				if (type.basetype == SPIRType::Int)
+					result_comp = join("as_int(", result_comp, ")");
+				else if (type.basetype == SPIRType::Float)
+					result_comp = join("as_float(", result_comp, ")");
+				expr += result_comp;
+			}
+			expr += ")";
+			emit_op(result_type, id, expr, should_forward(value_id));
+			inherit_expression_dependencies(id, value_id);
+		}
+		else
+		{
+			string result_expr = prefix + to_uint_cast(value_id) + suffix;
+			result_expr = from_uint_cast(result_expr, value_id);
+			emit_op(result_type, id, result_expr, should_forward(value_id));
+			inherit_expression_dependencies(id, value_id);
+		}
+	};
+
+	switch (op)
+	{
+	// === Vote ===
+	case OpGroupNonUniformElect:
+		require_extension(needs_subgroup_vote);
+		require_scratch();
+		emit_op(result_type, id, "spv_sub_group_elect(_spv_lane_id)", true);
+		break;
+
+	case OpGroupNonUniformAllEqual:
+	{
+		require_extension(needs_subgroup_vote);
+		require_scratch();
+		auto &type = expression_type(ops[3]);
+		if (type.vecsize > 1)
+		{
+			string expr;
+			for (uint32_t c = 0; c < type.vecsize; c++)
+			{
+				if (c > 0)
+					expr += " && ";
+				string comp = join(to_enclosed_expression(ops[3]), ".", "xyzw"[c]);
+				string as_uint_comp = (type.basetype == SPIRType::UInt) ? comp : join("as_uint(", comp, ")");
+				expr += join("spv_all_equal_uint(_spv_subgroup_scratch, ", as_uint_comp,
+				             ", _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size)");
+			}
+			emit_op(result_type, id, expr, should_forward(ops[3]));
+			inherit_expression_dependencies(id, ops[3]);
+		}
+		else
+		{
+			emit_op(result_type, id,
+			        join("spv_all_equal_uint(_spv_subgroup_scratch, ", to_uint_cast(ops[3]),
+			             ", _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size)"),
+			        should_forward(ops[3]));
+			inherit_expression_dependencies(id, ops[3]);
+		}
+		break;
+	}
+
+	// === Ballot ===
+	case OpGroupNonUniformBroadcastFirst:
+		require_extension(needs_subgroup_ballot);
+		require_scratch();
+		emit_combined_vec(ops[3], "spv_broadcast_first_uint(_spv_subgroup_scratch, ",
+		                  ", _spv_linear_id, _spv_subgroup_base)");
+		break;
+
+	case OpGroupNonUniformBallot:
+		require_extension(needs_subgroup_ballot);
+		require_scratch();
+		emit_op(result_type, id,
+		        join("spv_ballot(_spv_subgroup_scratch, ", to_expression(ops[3]),
+		             ", _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size)"),
+		        should_forward(ops[3]));
+		inherit_expression_dependencies(id, ops[3]);
+		break;
+
+	case OpGroupNonUniformInverseBallot:
+		require_extension(needs_subgroup_ballot);
+		require_scratch();
+		emit_op(result_type, id, join("spv_inverse_ballot(", to_expression(ops[3]), ", _spv_lane_id)"),
+		        should_forward(ops[3]));
+		inherit_expression_dependencies(id, ops[3]);
+		break;
+
+	case OpGroupNonUniformBallotBitExtract:
+		require_extension(needs_subgroup_ballot);
+		require_scratch();
+		emit_op(result_type, id,
+		        join("spv_ballot_bit_extract(", to_expression(ops[3]), ", ", to_expression(ops[4]), ")"),
+		        should_forward(ops[3]));
+		inherit_expression_dependencies(id, ops[3]);
+		break;
+
+	case OpGroupNonUniformBallotFindLSB:
+		require_extension(needs_subgroup_ballot);
+		require_scratch();
+		emit_op(result_type, id, join("spv_ballot_find_lsb(", to_expression(ops[3]), ")"), should_forward(ops[3]));
+		inherit_expression_dependencies(id, ops[3]);
+		break;
+
+	case OpGroupNonUniformBallotFindMSB:
+		require_extension(needs_subgroup_ballot);
+		require_scratch();
+		emit_op(result_type, id, join("spv_ballot_find_msb(", to_expression(ops[3]), ")"), should_forward(ops[3]));
+		inherit_expression_dependencies(id, ops[3]);
+		break;
+
+	case OpGroupNonUniformBallotBitCount:
+	{
+		require_extension(needs_subgroup_ballot);
+		require_scratch();
+		auto operation = static_cast<GroupOperation>(ops[3]);
+		if (operation == GroupOperationReduce)
+			emit_op(result_type, id, join("spv_ballot_bit_count(", to_expression(ops[4]), ")"), should_forward(ops[4]));
+		else if (operation == GroupOperationInclusiveScan)
+			emit_op(result_type, id, join("spv_ballot_inclusive_bit_count(", to_expression(ops[4]), ", _spv_lane_id)"),
+			        should_forward(ops[4]));
+		else if (operation == GroupOperationExclusiveScan)
+			emit_op(result_type, id, join("spv_ballot_exclusive_bit_count(", to_expression(ops[4]), ", _spv_lane_id)"),
+			        should_forward(ops[4]));
+		else
+			SPIRV_CROSS_THROW("Invalid group operation for BallotBitCount.");
+		inherit_expression_dependencies(id, ops[4]);
+		break;
+	}
+
+	// === Shuffle ===
+	case OpGroupNonUniformShuffle:
+		require_extension(needs_subgroup_shuffle);
+		require_scratch();
+		emit_combined_vec(ops[3], "spv_shuffle_uint(_spv_subgroup_scratch, ",
+		                  join(", ", to_expression(ops[4]), ", _spv_linear_id, _spv_subgroup_base)"));
+		break;
+
+	case OpGroupNonUniformShuffleXor:
+		require_extension(needs_subgroup_shuffle);
+		require_scratch();
+		emit_combined_vec(ops[3], "spv_shuffle_xor_uint(_spv_subgroup_scratch, ",
+		                  join(", ", to_expression(ops[4]), ", _spv_lane_id, _spv_linear_id, _spv_subgroup_base)"));
+		break;
+
+	case OpGroupNonUniformShuffleUp:
+		require_extension(needs_subgroup_shuffle_relative);
+		require_scratch();
+		emit_combined_vec(ops[3], "spv_shuffle_up_uint(_spv_subgroup_scratch, ",
+		                  join(", ", to_expression(ops[4]), ", _spv_lane_id, _spv_linear_id, _spv_subgroup_base)"));
+		break;
+
+	case OpGroupNonUniformShuffleDown:
+		require_extension(needs_subgroup_shuffle_relative);
+		require_scratch();
+		emit_combined_vec(ops[3], "spv_shuffle_down_uint(_spv_subgroup_scratch, ",
+		                  join(", ", to_expression(ops[4]), ", _spv_lane_id, _spv_linear_id, _spv_subgroup_size)"));
+		break;
+
+	// === Rotate ===
+	case OpGroupNonUniformRotateKHR:
+		require_extension(needs_subgroup_rotate);
+		require_scratch();
+		if (i.length > 5)
+		{
+			emit_combined_vec(ops[3], "spv_clustered_rotate_uint(_spv_subgroup_scratch, ",
+			                  join(", ", to_expression(ops[4]), ", _spv_lane_id, _spv_linear_id, _spv_subgroup_base, ",
+			                       to_expression(ops[5]), ")"));
+		}
+		else
+		{
+			emit_combined_vec(ops[3], "spv_rotate_uint(_spv_subgroup_scratch, ",
+			                  join(", ", to_expression(ops[4]),
+			                       ", _spv_lane_id, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size)"));
+		}
+		break;
+
+	// === Arithmetic (non-base: mul, bitwise, logical + clustered reduce for all) ===
+	case OpGroupNonUniformFAdd:
+	case OpGroupNonUniformIAdd:
+	case OpGroupNonUniformFMul:
+	case OpGroupNonUniformIMul:
+	case OpGroupNonUniformFMin:
+	case OpGroupNonUniformFMax:
+	case OpGroupNonUniformSMin:
+	case OpGroupNonUniformSMax:
+	case OpGroupNonUniformUMin:
+	case OpGroupNonUniformUMax:
+	case OpGroupNonUniformBitwiseAnd:
+	case OpGroupNonUniformBitwiseOr:
+	case OpGroupNonUniformBitwiseXor:
+	case OpGroupNonUniformLogicalAnd:
+	case OpGroupNonUniformLogicalOr:
+	case OpGroupNonUniformLogicalXor:
+	{
+		require_scratch();
+		auto operation = static_cast<GroupOperation>(ops[3]);
+		uint32_t value_id = ops[4];
+
+		// Determine the op suffix (matching emulation helper names)
+		const char *op_suffix = nullptr;
+		bool is_logical = false;
+		switch (op)
+		{
+		case OpGroupNonUniformFAdd:
+			op_suffix = "add_float";
+			break;
+		case OpGroupNonUniformIAdd:
+			op_suffix = (expression_type(value_id).basetype == SPIRType::Int) ? "add_int" : "add_uint";
+			break;
+		case OpGroupNonUniformFMul:
+			op_suffix = "mul_float";
+			break;
+		case OpGroupNonUniformIMul:
+			op_suffix = (expression_type(value_id).basetype == SPIRType::Int) ? "mul_int" : "mul_uint";
+			break;
+		case OpGroupNonUniformFMin:
+			op_suffix = "min_float";
+			break;
+		case OpGroupNonUniformFMax:
+			op_suffix = "max_float";
+			break;
+		case OpGroupNonUniformSMin:
+			op_suffix = "min_int";
+			break;
+		case OpGroupNonUniformSMax:
+			op_suffix = "max_int";
+			break;
+		case OpGroupNonUniformUMin:
+			op_suffix = "min_uint";
+			break;
+		case OpGroupNonUniformUMax:
+			op_suffix = "max_uint";
+			break;
+		case OpGroupNonUniformBitwiseAnd:
+			op_suffix = (expression_type(value_id).basetype == SPIRType::Int) ? "and_int" : "and_uint";
+			break;
+		case OpGroupNonUniformBitwiseOr:
+			op_suffix = (expression_type(value_id).basetype == SPIRType::Int) ? "or_int" : "or_uint";
+			break;
+		case OpGroupNonUniformBitwiseXor:
+			op_suffix = (expression_type(value_id).basetype == SPIRType::Int) ? "xor_int" : "xor_uint";
+			break;
+		case OpGroupNonUniformLogicalAnd:
+			op_suffix = "logical_and";
+			is_logical = true;
+			break;
+		case OpGroupNonUniformLogicalOr:
+			op_suffix = "logical_or";
+			is_logical = true;
+			break;
+		case OpGroupNonUniformLogicalXor:
+			op_suffix = "logical_xor";
+			is_logical = true;
+			break;
+		default:
+			break;
+		}
+
+		// Determine group prefix and required extension
+		const char *group_prefix = nullptr;
+		switch (operation)
+		{
+		case GroupOperationReduce:
+			group_prefix = "spv_reduce_";
+			require_extension(needs_subgroup_arithmetic);
+			break;
+		case GroupOperationInclusiveScan:
+			group_prefix = "spv_inclusive_scan_";
+			require_extension(needs_subgroup_arithmetic);
+			break;
+		case GroupOperationExclusiveScan:
+			group_prefix = "spv_exclusive_scan_";
+			require_extension(needs_subgroup_arithmetic);
+			break;
+		case GroupOperationClusteredReduce:
+			group_prefix = "spv_clustered_reduce_";
+			require_extension(needs_subgroup_clustered);
+			break;
+		default:
+			SPIRV_CROSS_THROW("Unsupported group operation.");
+		}
+
+		string macro_name = join(group_prefix, op_suffix);
+
+		if (is_logical)
+		{
+			string val_expr = to_expression(value_id);
+			string expr;
+			if (operation == GroupOperationClusteredReduce)
+				expr = join(macro_name, "(_spv_subgroup_scratch, ", val_expr, ", ", to_expression(ops[5]),
+				            ", _spv_lane_id, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size)");
+			else if (operation == GroupOperationReduce)
+				expr = join(macro_name, "(_spv_subgroup_scratch, ", val_expr,
+				            ", _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size)");
+			else
+				expr = join(macro_name, "(_spv_subgroup_scratch, ", val_expr,
+				            ", _spv_lane_id, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size)");
+			emit_op(result_type, id, expr, should_forward(value_id));
+			inherit_expression_dependencies(id, value_id);
+		}
+		else
+		{
+			auto &type = expression_type(value_id);
+			if (type.vecsize > 1)
+			{
+				auto &out_type = get<SPIRType>(result_type);
+				string expr = "(" + type_to_glsl(out_type) + ")(";
+				for (uint32_t c = 0; c < type.vecsize; c++)
+				{
+					if (c > 0)
+						expr += ", ";
+					string comp = join(to_enclosed_expression(value_id), ".", "xyzw"[c]);
+					string call;
+					if (operation == GroupOperationClusteredReduce)
+						call = join(macro_name, "(_spv_subgroup_scratch, ", comp, ", ", to_expression(ops[5]),
+						            ", _spv_lane_id, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size)");
+					else if (operation == GroupOperationReduce)
+						call = join(macro_name, "(_spv_subgroup_scratch, ", comp,
+						            ", _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size)");
+					else
+						call = join(macro_name, "(_spv_subgroup_scratch, ", comp,
+						            ", _spv_lane_id, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size)");
+					expr += call;
+				}
+				expr += ")";
+				emit_op(result_type, id, expr, should_forward(value_id));
+				inherit_expression_dependencies(id, value_id);
+			}
+			else
+			{
+				string val_expr = to_expression(value_id);
+				string expr;
+				if (operation == GroupOperationClusteredReduce)
+					expr = join(macro_name, "(_spv_subgroup_scratch, ", val_expr, ", ", to_expression(ops[5]),
+					            ", _spv_lane_id, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size)");
+				else if (operation == GroupOperationReduce)
+					expr = join(macro_name, "(_spv_subgroup_scratch, ", val_expr,
+					            ", _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size)");
+				else
+					expr = join(macro_name, "(_spv_subgroup_scratch, ", val_expr,
+					            ", _spv_lane_id, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size)");
+				emit_op(result_type, id, expr, should_forward(value_id));
+				inherit_expression_dependencies(id, value_id);
+			}
+		}
+		break;
+	}
+
+	default:
+		SPIRV_CROSS_THROW("Unsupported subgroup op for OpenCL combined mode.");
+	}
+}
+
 void CompilerOpenCL::emit_subgroup_op_emulated(const Instruction &i)
 {
 	const uint32_t *ops = stream(i);
@@ -4824,6 +5638,45 @@ void CompilerOpenCL::emit_subgroup_op(const Instruction &i)
 		return;
 	}
 
+	// Combined mode (emulate_subgroups and enable_subgroups both set): non-base ops are emitted via emit_subgroup_op_combined(), base ops fall through to the native path.
+	if (opencl_options.emulate_subgroups && opencl_options.enable_subgroups)
+	{
+		// Base cl_khr_subgroups ops: All, Any, Broadcast, add/min/max Reduce/Scan
+		bool is_base = false;
+		switch (op)
+		{
+		case OpGroupNonUniformAll:
+		case OpGroupNonUniformAny:
+		case OpGroupNonUniformBroadcast:
+			is_base = true;
+			break;
+		case OpGroupNonUniformFAdd:
+		case OpGroupNonUniformIAdd:
+		case OpGroupNonUniformFMin:
+		case OpGroupNonUniformFMax:
+		case OpGroupNonUniformSMin:
+		case OpGroupNonUniformSMax:
+		case OpGroupNonUniformUMin:
+		case OpGroupNonUniformUMax:
+		{
+			// Base only for Reduce/InclusiveScan/ExclusiveScan, not ClusteredReduce
+			auto operation = static_cast<GroupOperation>(ops[3]);
+			if (operation != GroupOperationClusteredReduce)
+				is_base = true;
+			break;
+		}
+		default:
+			break;
+		}
+
+		if (!is_base)
+		{
+			emit_subgroup_op_combined(i);
+			return;
+		}
+		// Base ops fall through to native path below.
+	}
+
 	if (!opencl_options.enable_subgroups)
 		SPIRV_CROSS_THROW("Subgroup operations require enable_subgroups option.");
 
diff --git a/spirv_opencl.hpp b/spirv_opencl.hpp
index a5e6c4bee..3dea7039a 100644
--- a/spirv_opencl.hpp
+++ b/spirv_opencl.hpp
@@ -215,6 +215,9 @@ class CompilerOpenCL : public CompilerGLSL
 	// Helpers to emit subgroup emulation polyfills and scratch infrastructure.
 	void emit_subgroup_emulation_helpers();
 	void emit_subgroup_emulation_entry_point_vars();
+	void emit_subgroup_combined_entry_point_vars();
+	void emit_subgroup_combined_wrappers();
+	void emit_subgroup_op_combined(const Instruction &i);
 	uint32_t get_emulation_max_workgroup_size() const;
 	std::string get_emulation_subgroup_size_expr() const;
 	void emit_subgroup_op_emulated(const Instruction &i);
diff --git a/test_shaders.py b/test_shaders.py
index fb9ec52c5..f7818b0a4 100755
--- a/test_shaders.py
+++ b/test_shaders.py
@@ -588,7 +588,7 @@ def cross_compile_hlsl(shader, spirv, opt, force_no_external_validation, iterati
 def path_to_opencl_standard_cli(shader):
     # clang seems warn about cl_khr_subgroups unless 2.0 is specified.
     # Revisit when OpenCL 3.0 support is no longer experimental.
-    if '.subgroups.' in shader or '.subgroups-core.' in shader:
+    if '.subgroups.' in shader:
         return '200'
     # OpenCL 3.0 support in clang is experimental and 2.1 and 2.2 seem unsupported.
     if '.cl30.' in shader:
@@ -618,10 +618,8 @@ def validate_shader_opencl(shader, opt, paths):
     if '.fp64.' in shader:
         extensions.append('cl_khr_fp64')
     if '.subgroups-emulate.' in shader:
-        # Make sure no extensions are included
-        pass
-    elif '.subgroups-core.' in shader:
-        extensions.append('cl_khr_subgroups')
+        if '.subgroups.' in shader:
+            extensions.append('cl_khr_subgroups')
     elif '.subgroups.' in shader:
         extensions.append('cl_khr_subgroups')
         extensions.append('cl_khr_subgroup_ballot')

From 218c1183623e985977d60b24713eca6a753930e0 Mon Sep 17 00:00:00 2001
From: Garrick Meeker <gmeeker@gmail.com>
Date: Wed, 18 Mar 2026 19:53:15 -0700
Subject: [PATCH 15/16] OpenCL: Subgroup emulation tests

---
 ...thmetic.nocompat.vk.subgroups-emulate.comp | 1105 ++++++++++++++++
 ...-ballot.nocompat.vk.subgroups-emulate.comp | 1107 ++++++++++++++++
 ...s-basic.nocompat.vk.subgroups-emulate.comp | 1132 +++++++++++++++++
 ...compat.vk.subgroups.subgroups-emulate.comp |   73 ++
 ...ustered.nocompat.vk.subgroups-emulate.comp | 1105 ++++++++++++++++
 ...-rotate.nocompat.vk.subgroups-emulate.comp | 1094 ++++++++++++++++
 ...shuffle.nocompat.vk.subgroups-emulate.comp | 1097 ++++++++++++++++
 ...ps-vote.nocompat.vk.subgroups-emulate.comp | 1098 ++++++++++++++++
 ...thmetic.nocompat.vk.subgroups-emulate.comp |   39 +
 ...-ballot.nocompat.vk.subgroups-emulate.comp |   43 +
 ...s-basic.nocompat.vk.subgroups-emulate.comp |   77 ++
 ...compat.vk.subgroups.subgroups-emulate.comp |   77 ++
 ...ustered.nocompat.vk.subgroups-emulate.comp |   34 +
 ...-rotate.nocompat.vk.subgroups-emulate.comp |   17 +
 ...shuffle.nocompat.vk.subgroups-emulate.comp |   22 +
 ...ps-vote.nocompat.vk.subgroups-emulate.comp |   27 +
 16 files changed, 8147 insertions(+)
 create mode 100644 reference/shaders-opencl-no-opt/comp/subgroups-arithmetic.nocompat.vk.subgroups-emulate.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/subgroups-ballot.nocompat.vk.subgroups-emulate.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/subgroups-basic.nocompat.vk.subgroups-emulate.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/subgroups-basic.nocompat.vk.subgroups.subgroups-emulate.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/subgroups-clustered.nocompat.vk.subgroups-emulate.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/subgroups-rotate.nocompat.vk.subgroups-emulate.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/subgroups-shuffle.nocompat.vk.subgroups-emulate.comp
 create mode 100644 reference/shaders-opencl-no-opt/comp/subgroups-vote.nocompat.vk.subgroups-emulate.comp
 create mode 100644 shaders-opencl-no-opt/comp/subgroups-arithmetic.nocompat.vk.subgroups-emulate.comp
 create mode 100644 shaders-opencl-no-opt/comp/subgroups-ballot.nocompat.vk.subgroups-emulate.comp
 create mode 100644 shaders-opencl-no-opt/comp/subgroups-basic.nocompat.vk.subgroups-emulate.comp
 create mode 100644 shaders-opencl-no-opt/comp/subgroups-basic.nocompat.vk.subgroups.subgroups-emulate.comp
 create mode 100644 shaders-opencl-no-opt/comp/subgroups-clustered.nocompat.vk.subgroups-emulate.comp
 create mode 100644 shaders-opencl-no-opt/comp/subgroups-rotate.nocompat.vk.subgroups-emulate.comp
 create mode 100644 shaders-opencl-no-opt/comp/subgroups-shuffle.nocompat.vk.subgroups-emulate.comp
 create mode 100644 shaders-opencl-no-opt/comp/subgroups-vote.nocompat.vk.subgroups-emulate.comp

diff --git a/reference/shaders-opencl-no-opt/comp/subgroups-arithmetic.nocompat.vk.subgroups-emulate.comp b/reference/shaders-opencl-no-opt/comp/subgroups-arithmetic.nocompat.vk.subgroups-emulate.comp
new file mode 100644
index 000000000..7cdaa8e90
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/subgroups-arithmetic.nocompat.vk.subgroups-emulate.comp
@@ -0,0 +1,1105 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO
+{
+    float FragColor;
+    int idat;
+    uint udat;
+};
+
+typedef struct SSBO SSBO;
+
+static uint spv_emulate_broadcast_uint(__local uint* scratch, uint val, uint src_lane, uint linear_id, uint subgroup_base) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base + src_lane];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_broadcast_first_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_shuffle_uint(__local uint* scratch, uint val, uint index, uint linear_id, uint subgroup_base) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base + index];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_shuffle_xor_uint(__local uint* scratch, uint val, uint mask, uint lane_id, uint linear_id, uint subgroup_base) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base + (lane_id ^ mask)];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_shuffle_up_uint(__local uint* scratch, uint val, uint delta, uint lane_id, uint linear_id, uint subgroup_base) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = (lane_id >= delta) ? scratch[linear_id - delta] : val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_shuffle_down_uint(__local uint* scratch, uint val, uint delta, uint lane_id, uint linear_id, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = (lane_id + delta < subgroup_size) ? scratch[linear_id + delta] : val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_rotate_uint(__local uint* scratch, uint val, uint delta, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base + ((lane_id + delta) % subgroup_size)];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_clustered_rotate_uint(__local uint* scratch, uint val, uint delta, uint lane_id, uint linear_id, uint subgroup_base, uint cluster_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base = (lane_id / cluster_size) * cluster_size;
+    uint r = scratch[subgroup_base + cluster_base + ((lane_id - cluster_base + delta) % cluster_size)];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static bool spv_emulate_all(__local uint* scratch, bool predicate, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = predicate ? 1u : 0u;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    bool r = true;
+    for (uint i = 0u; i < subgroup_size; i++)
+        r = r && (scratch[subgroup_base + i] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static bool spv_emulate_any(__local uint* scratch, bool predicate, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = predicate ? 1u : 0u;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    bool r = false;
+    for (uint i = 0u; i < subgroup_size; i++)
+        r = r || (scratch[subgroup_base + i] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static bool spv_emulate_all_equal_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint first = scratch[subgroup_base];
+    bool r = true;
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r && (scratch[subgroup_base + i] == first);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint4 spv_emulate_ballot(__local uint* scratch, bool predicate, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = predicate ? 1u : 0u;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint4 r = (uint4)(0u);
+    for (uint i = 0u; i < subgroup_size; i++) {
+        if (scratch[subgroup_base + i] != 0u) {
+            uint word = i / 32u;
+            uint bit = i % 32u;
+            if (word == 0u) r.x |= (1u << bit);
+            else if (word == 1u) r.y |= (1u << bit);
+            else if (word == 2u) r.z |= (1u << bit);
+            else r.w |= (1u << bit);
+        }
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint4 spv_subgroup_eq_mask(uint lane_id) {
+    uint4 r = (uint4)(0u);
+    uint word = lane_id / 32u;
+    uint bit = lane_id % 32u;
+    if (word == 0u) r.x = (1u << bit);
+    else if (word == 1u) r.y = (1u << bit);
+    else if (word == 2u) r.z = (1u << bit);
+    else r.w = (1u << bit);
+    return r;
+}
+
+static uint4 spv_subgroup_ge_mask(uint lane_id, uint subgroup_size) {
+    uint4 r = (uint4)(0u);
+    for (uint i = lane_id; i < subgroup_size; i++) {
+        uint word = i / 32u;
+        uint bit = i % 32u;
+        if (word == 0u) r.x |= (1u << bit);
+        else if (word == 1u) r.y |= (1u << bit);
+        else if (word == 2u) r.z |= (1u << bit);
+        else r.w |= (1u << bit);
+    }
+    return r;
+}
+
+static uint4 spv_subgroup_gt_mask(uint lane_id, uint subgroup_size) {
+    return spv_subgroup_ge_mask(lane_id + 1u, subgroup_size);
+}
+
+static uint4 spv_subgroup_le_mask(uint lane_id, uint subgroup_size) {
+    return spv_subgroup_ge_mask(0u, lane_id + 1u);
+}
+
+static uint4 spv_subgroup_lt_mask(uint lane_id) {
+    if (lane_id == 0u) return (uint4)(0u);
+    return spv_subgroup_ge_mask(0u, lane_id);
+}
+
+static uint spv_emulate_reduce_add_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r + scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_inclusive_scan_add_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r + scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_exclusive_scan_add_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = 0u;
+    for (uint i = 0u; i < lane_id; i++)
+        r = r + scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_add_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    uint r = scratch[subgroup_base + cluster_base_in_sg];
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r + scratch[subgroup_base + cluster_base_in_sg + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_reduce_mul_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r * scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_inclusive_scan_mul_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r * scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_exclusive_scan_mul_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = 1u;
+    for (uint i = 0u; i < lane_id; i++)
+        r = r * scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_mul_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    uint r = scratch[subgroup_base + cluster_base_in_sg];
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r * scratch[subgroup_base + cluster_base_in_sg + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_reduce_min_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = min(r, scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_inclusive_scan_min_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i <= lane_id; i++)
+        r = min(r, scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_exclusive_scan_min_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = UINT_MAX;
+    for (uint i = 0u; i < lane_id; i++)
+        r = min(r, scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_min_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    uint r = scratch[subgroup_base + cluster_base_in_sg];
+    for (uint i = 1u; i < cluster_size; i++)
+        r = min(r, scratch[subgroup_base + cluster_base_in_sg + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_reduce_max_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = max(r, scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_inclusive_scan_max_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i <= lane_id; i++)
+        r = max(r, scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_exclusive_scan_max_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = 0u;
+    for (uint i = 0u; i < lane_id; i++)
+        r = max(r, scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_max_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    uint r = scratch[subgroup_base + cluster_base_in_sg];
+    for (uint i = 1u; i < cluster_size; i++)
+        r = max(r, scratch[subgroup_base + cluster_base_in_sg + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_reduce_and_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r & scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_inclusive_scan_and_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r & scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_exclusive_scan_and_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = 0xFFFFFFFFu;
+    for (uint i = 0u; i < lane_id; i++)
+        r = r & scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_and_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    uint r = scratch[subgroup_base + cluster_base_in_sg];
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r & scratch[subgroup_base + cluster_base_in_sg + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_reduce_or_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r | scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_inclusive_scan_or_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r | scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_exclusive_scan_or_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = 0u;
+    for (uint i = 0u; i < lane_id; i++)
+        r = r | scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_or_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    uint r = scratch[subgroup_base + cluster_base_in_sg];
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r | scratch[subgroup_base + cluster_base_in_sg + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_reduce_xor_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r ^ scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_inclusive_scan_xor_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r ^ scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_exclusive_scan_xor_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = 0u;
+    for (uint i = 0u; i < lane_id; i++)
+        r = r ^ scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_xor_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    uint r = scratch[subgroup_base + cluster_base_in_sg];
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r ^ scratch[subgroup_base + cluster_base_in_sg + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_reduce_add_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(scratch[subgroup_base]);
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r + as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_inclusive_scan_add_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(scratch[subgroup_base]);
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r + as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_exclusive_scan_add_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = 0;
+    for (uint i = 0u; i < lane_id; i++)
+        r = r + as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_clustered_reduce_add_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    int r = as_int(scratch[subgroup_base + cluster_base_in_sg]);
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r + as_int(scratch[subgroup_base + cluster_base_in_sg + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_reduce_mul_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(scratch[subgroup_base]);
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r * as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+// Inclusive integer prefix product: multiplies the scratch slots of lanes 0..lane_id (subgroup_size is not used).
+static int spv_emulate_inclusive_scan_mul_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    int acc = as_int(scratch[subgroup_base]);
+    for (uint lane = 1u; lane <= lane_id; ++lane) acc *= as_int(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return acc;
+}
+
+// Exclusive integer prefix product: multiplies the scratch slots of the lanes below lane_id; lane 0 gets the identity 1.
+static int spv_emulate_exclusive_scan_mul_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    int acc = 1;
+    for (uint lane = 0u; lane < lane_id; ++lane) acc *= as_int(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return acc;
+}
+
+// Integer product over the cluster_size-lane cluster that contains lane_id; cluster_size is used as a divisor, so it must be non-zero.
+static int spv_emulate_clustered_reduce_mul_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    uint first = subgroup_base + (lane_id / cluster_size) * cluster_size; // scratch slot of the cluster's first lane
+    int acc = as_int(scratch[first]);
+    for (uint k = 1u; k < cluster_size; ++k) acc *= as_int(scratch[first + k]);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return acc;
+}
+
+// Signed minimum of the subgroup_size scratch slots starting at subgroup_base, after this lane has published val.
+static int spv_emulate_reduce_min_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    int lowest = as_int(scratch[subgroup_base]);
+    for (uint lane = 1u; lane < subgroup_size; ++lane) lowest = min(lowest, as_int(scratch[subgroup_base + lane]));
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return lowest;
+}
+
+// Inclusive running signed minimum over the scratch slots of lanes 0..lane_id (subgroup_size is not used).
+static int spv_emulate_inclusive_scan_min_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    int lowest = as_int(scratch[subgroup_base]);
+    for (uint lane = 1u; lane <= lane_id; ++lane) lowest = min(lowest, as_int(scratch[subgroup_base + lane]));
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return lowest;
+}
+
+// Exclusive running signed minimum over the lanes below lane_id; lane 0 gets the identity INT_MAX.
+static int spv_emulate_exclusive_scan_min_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    int lowest = INT_MAX;
+    for (uint lane = 0u; lane < lane_id; ++lane) lowest = min(lowest, as_int(scratch[subgroup_base + lane]));
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return lowest;
+}
+
+// Signed minimum over the cluster_size-lane cluster that contains lane_id; cluster_size is used as a divisor, so it must be non-zero.
+static int spv_emulate_clustered_reduce_min_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    uint first = subgroup_base + (lane_id / cluster_size) * cluster_size; // scratch slot of the cluster's first lane
+    int lowest = as_int(scratch[first]);
+    for (uint k = 1u; k < cluster_size; ++k) lowest = min(lowest, as_int(scratch[first + k]));
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return lowest;
+}
+
+// Signed maximum of the subgroup_size scratch slots starting at subgroup_base, after this lane has published val.
+static int spv_emulate_reduce_max_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    int highest = as_int(scratch[subgroup_base]);
+    for (uint lane = 1u; lane < subgroup_size; ++lane) highest = max(highest, as_int(scratch[subgroup_base + lane]));
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return highest;
+}
+
+// Inclusive running signed maximum over the scratch slots of lanes 0..lane_id (subgroup_size is not used).
+static int spv_emulate_inclusive_scan_max_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    int highest = as_int(scratch[subgroup_base]);
+    for (uint lane = 1u; lane <= lane_id; ++lane) highest = max(highest, as_int(scratch[subgroup_base + lane]));
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return highest;
+}
+
+// Exclusive running signed maximum over the lanes below lane_id; lane 0 gets the identity INT_MIN.
+static int spv_emulate_exclusive_scan_max_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    int highest = INT_MIN;
+    for (uint lane = 0u; lane < lane_id; ++lane) highest = max(highest, as_int(scratch[subgroup_base + lane]));
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return highest;
+}
+
+// Signed maximum over the cluster_size-lane cluster that contains lane_id; cluster_size is used as a divisor, so it must be non-zero.
+static int spv_emulate_clustered_reduce_max_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    uint first = subgroup_base + (lane_id / cluster_size) * cluster_size; // scratch slot of the cluster's first lane
+    int highest = as_int(scratch[first]);
+    for (uint k = 1u; k < cluster_size; ++k) highest = max(highest, as_int(scratch[first + k]));
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return highest;
+}
+
+// Bitwise AND of the subgroup_size scratch slots starting at subgroup_base, after this lane has published val.
+static int spv_emulate_reduce_and_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    int bits = as_int(scratch[subgroup_base]);
+    for (uint lane = 1u; lane < subgroup_size; ++lane) bits &= as_int(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return bits;
+}
+
+// Inclusive running bitwise AND over the scratch slots of lanes 0..lane_id (subgroup_size is not used).
+static int spv_emulate_inclusive_scan_and_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    int bits = as_int(scratch[subgroup_base]);
+    for (uint lane = 1u; lane <= lane_id; ++lane) bits &= as_int(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return bits;
+}
+
+// Exclusive running bitwise AND over the lanes below lane_id; lane 0 gets the all-ones identity.
+static int spv_emulate_exclusive_scan_and_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    int bits = as_int(0xFFFFFFFFu);
+    for (uint lane = 0u; lane < lane_id; ++lane) bits &= as_int(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return bits;
+}
+
+// Bitwise AND over the cluster_size-lane cluster that contains lane_id; cluster_size is used as a divisor, so it must be non-zero.
+static int spv_emulate_clustered_reduce_and_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    uint first = subgroup_base + (lane_id / cluster_size) * cluster_size; // scratch slot of the cluster's first lane
+    int bits = as_int(scratch[first]);
+    for (uint k = 1u; k < cluster_size; ++k) bits &= as_int(scratch[first + k]);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return bits;
+}
+
+// Bitwise OR of the subgroup_size scratch slots starting at subgroup_base, after this lane has published val.
+static int spv_emulate_reduce_or_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    int bits = as_int(scratch[subgroup_base]);
+    for (uint lane = 1u; lane < subgroup_size; ++lane) bits |= as_int(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return bits;
+}
+
+// Inclusive running bitwise OR over the scratch slots of lanes 0..lane_id (subgroup_size is not used).
+static int spv_emulate_inclusive_scan_or_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    int bits = as_int(scratch[subgroup_base]);
+    for (uint lane = 1u; lane <= lane_id; ++lane) bits |= as_int(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return bits;
+}
+
+// Exclusive running bitwise OR over the lanes below lane_id; lane 0 gets the identity 0.
+static int spv_emulate_exclusive_scan_or_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    int bits = 0;
+    for (uint lane = 0u; lane < lane_id; ++lane) bits |= as_int(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return bits;
+}
+
+// Bitwise OR over the cluster_size-lane cluster that contains lane_id; cluster_size is used as a divisor, so it must be non-zero.
+static int spv_emulate_clustered_reduce_or_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    uint first = subgroup_base + (lane_id / cluster_size) * cluster_size; // scratch slot of the cluster's first lane
+    int bits = as_int(scratch[first]);
+    for (uint k = 1u; k < cluster_size; ++k) bits |= as_int(scratch[first + k]);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return bits;
+}
+
+// Bitwise XOR of the subgroup_size scratch slots starting at subgroup_base, after this lane has published val.
+static int spv_emulate_reduce_xor_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    int bits = as_int(scratch[subgroup_base]);
+    for (uint lane = 1u; lane < subgroup_size; ++lane) bits ^= as_int(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return bits;
+}
+
+// Inclusive running bitwise XOR over the scratch slots of lanes 0..lane_id (subgroup_size is not used).
+static int spv_emulate_inclusive_scan_xor_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    int bits = as_int(scratch[subgroup_base]);
+    for (uint lane = 1u; lane <= lane_id; ++lane) bits ^= as_int(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return bits;
+}
+
+// Exclusive running bitwise XOR over the lanes below lane_id; lane 0 gets the identity 0.
+static int spv_emulate_exclusive_scan_xor_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    int bits = 0;
+    for (uint lane = 0u; lane < lane_id; ++lane) bits ^= as_int(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return bits;
+}
+
+// Bitwise XOR over the cluster_size-lane cluster that contains lane_id; cluster_size is used as a divisor, so it must be non-zero.
+static int spv_emulate_clustered_reduce_xor_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    uint first = subgroup_base + (lane_id / cluster_size) * cluster_size; // scratch slot of the cluster's first lane
+    int bits = as_int(scratch[first]);
+    for (uint k = 1u; k < cluster_size; ++k) bits ^= as_int(scratch[first + k]);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return bits;
+}
+
+// Float sum of the subgroup_size scratch slots starting at subgroup_base, accumulated in ascending lane order.
+static float spv_emulate_reduce_add_float(__local uint* scratch, float val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // floats travel through the uint scratch as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    float acc = as_float(scratch[subgroup_base]);
+    for (uint lane = 1u; lane < subgroup_size; ++lane) acc += as_float(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return acc;
+}
+
+// Inclusive float prefix sum over the scratch slots of lanes 0..lane_id, in ascending lane order (subgroup_size is not used).
+static float spv_emulate_inclusive_scan_add_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // floats travel through the uint scratch as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    float acc = as_float(scratch[subgroup_base]);
+    for (uint lane = 1u; lane <= lane_id; ++lane) acc += as_float(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return acc;
+}
+
+// Exclusive float prefix sum over the lanes below lane_id; lane 0 gets the identity 0.0f.
+static float spv_emulate_exclusive_scan_add_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // floats travel through the uint scratch as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    float acc = 0.0f;
+    for (uint lane = 0u; lane < lane_id; ++lane) acc += as_float(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return acc;
+}
+
+// Float sum over the cluster_size-lane cluster that contains lane_id; cluster_size is used as a divisor, so it must be non-zero.
+static float spv_emulate_clustered_reduce_add_float(__local uint* scratch, float val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // floats travel through the uint scratch as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    uint first = subgroup_base + (lane_id / cluster_size) * cluster_size; // scratch slot of the cluster's first lane
+    float acc = as_float(scratch[first]);
+    for (uint k = 1u; k < cluster_size; ++k) acc += as_float(scratch[first + k]);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return acc;
+}
+
+// Float product of the subgroup_size scratch slots starting at subgroup_base, accumulated in ascending lane order.
+static float spv_emulate_reduce_mul_float(__local uint* scratch, float val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // floats travel through the uint scratch as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    float acc = as_float(scratch[subgroup_base]);
+    for (uint lane = 1u; lane < subgroup_size; ++lane) acc *= as_float(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return acc;
+}
+
+// Inclusive float prefix product over the scratch slots of lanes 0..lane_id, in ascending lane order (subgroup_size is not used).
+static float spv_emulate_inclusive_scan_mul_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // floats travel through the uint scratch as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    float acc = as_float(scratch[subgroup_base]);
+    for (uint lane = 1u; lane <= lane_id; ++lane) acc *= as_float(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return acc;
+}
+
+// Exclusive float prefix product over the lanes below lane_id; lane 0 gets the identity 1.0f.
+static float spv_emulate_exclusive_scan_mul_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // floats travel through the uint scratch as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    float acc = 1.0f;
+    for (uint lane = 0u; lane < lane_id; ++lane) acc *= as_float(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return acc;
+}
+
+// Float product over the cluster_size-lane cluster that contains lane_id; cluster_size is used as a divisor, so it must be non-zero.
+static float spv_emulate_clustered_reduce_mul_float(__local uint* scratch, float val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // floats travel through the uint scratch as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    uint first = subgroup_base + (lane_id / cluster_size) * cluster_size; // scratch slot of the cluster's first lane
+    float acc = as_float(scratch[first]);
+    for (uint k = 1u; k < cluster_size; ++k) acc *= as_float(scratch[first + k]);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return acc;
+}
+
+// Float minimum (via fmin) of the subgroup_size scratch slots starting at subgroup_base.
+static float spv_emulate_reduce_min_float(__local uint* scratch, float val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // floats travel through the uint scratch as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    float lowest = as_float(scratch[subgroup_base]);
+    for (uint lane = 1u; lane < subgroup_size; ++lane) lowest = fmin(lowest, as_float(scratch[subgroup_base + lane]));
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return lowest;
+}
+
+// Inclusive running float minimum (via fmin) over the scratch slots of lanes 0..lane_id (subgroup_size is not used).
+static float spv_emulate_inclusive_scan_min_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // floats travel through the uint scratch as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    float lowest = as_float(scratch[subgroup_base]);
+    for (uint lane = 1u; lane <= lane_id; ++lane) lowest = fmin(lowest, as_float(scratch[subgroup_base + lane]));
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return lowest;
+}
+
+// Exclusive running float minimum (via fmin) over the lanes below lane_id; lane 0 gets the identity +INFINITY.
+static float spv_emulate_exclusive_scan_min_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // floats travel through the uint scratch as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    float lowest = INFINITY;
+    for (uint lane = 0u; lane < lane_id; ++lane) lowest = fmin(lowest, as_float(scratch[subgroup_base + lane]));
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return lowest;
+}
+
+// Float minimum (via fmin) over the cluster_size-lane cluster that contains lane_id; cluster_size is used as a divisor, so it must be non-zero.
+static float spv_emulate_clustered_reduce_min_float(__local uint* scratch, float val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // floats travel through the uint scratch as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    uint first = subgroup_base + (lane_id / cluster_size) * cluster_size; // scratch slot of the cluster's first lane
+    float lowest = as_float(scratch[first]);
+    for (uint k = 1u; k < cluster_size; ++k) lowest = fmin(lowest, as_float(scratch[first + k]));
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return lowest;
+}
+
+// Float maximum (via fmax) of the subgroup_size scratch slots starting at subgroup_base.
+static float spv_emulate_reduce_max_float(__local uint* scratch, float val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // floats travel through the uint scratch as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    float highest = as_float(scratch[subgroup_base]);
+    for (uint lane = 1u; lane < subgroup_size; ++lane) highest = fmax(highest, as_float(scratch[subgroup_base + lane]));
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return highest;
+}
+
+// Inclusive running float maximum (via fmax) over the scratch slots of lanes 0..lane_id (subgroup_size is not used).
+static float spv_emulate_inclusive_scan_max_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // floats travel through the uint scratch as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    float highest = as_float(scratch[subgroup_base]);
+    for (uint lane = 1u; lane <= lane_id; ++lane) highest = fmax(highest, as_float(scratch[subgroup_base + lane]));
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return highest;
+}
+
+// Exclusive running float maximum (via fmax) over the lanes below lane_id; lane 0 gets the identity -INFINITY.
+static float spv_emulate_exclusive_scan_max_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // floats travel through the uint scratch as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    float highest = -INFINITY;
+    for (uint lane = 0u; lane < lane_id; ++lane) highest = fmax(highest, as_float(scratch[subgroup_base + lane]));
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return highest;
+}
+
+// Float maximum (via fmax) over the cluster_size-lane cluster that contains lane_id; cluster_size is used as a divisor, so it must be non-zero.
+static float spv_emulate_clustered_reduce_max_float(__local uint* scratch, float val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // floats travel through the uint scratch as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    uint first = subgroup_base + (lane_id / cluster_size) * cluster_size; // scratch slot of the cluster's first lane
+    float highest = as_float(scratch[first]);
+    for (uint k = 1u; k < cluster_size; ++k) highest = fmax(highest, as_float(scratch[first + k]));
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return highest;
+}
+
+// True only if all subgroup_size scratch slots starting at subgroup_base hold a non-zero flag.
+static bool spv_emulate_reduce_logical_and(__local uint* scratch, bool val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val ? 1u : 0u; // booleans are published as 0/1
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    bool all_set = true;
+    for (uint lane = 0u; lane < subgroup_size; ++lane) all_set = all_set && (scratch[subgroup_base + lane] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return all_set;
+}
+
+// Inclusive logical-AND scan: true only if the flags of lanes 0..lane_id are all non-zero (subgroup_size is not used).
+static bool spv_emulate_inclusive_scan_logical_and(__local uint* scratch, bool val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val ? 1u : 0u; // booleans are published as 0/1
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    bool all_set = true;
+    for (uint lane = 0u; lane <= lane_id; ++lane) all_set = all_set && (scratch[subgroup_base + lane] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return all_set;
+}
+
+// Exclusive logical-AND scan over the lanes below lane_id; lane 0 gets the identity true.
+static bool spv_emulate_exclusive_scan_logical_and(__local uint* scratch, bool val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val ? 1u : 0u; // booleans are published as 0/1
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    bool all_set = true;
+    for (uint lane = 0u; lane < lane_id; ++lane) all_set = all_set && (scratch[subgroup_base + lane] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return all_set;
+}
+
+// Logical AND over the cluster_size-lane cluster that contains lane_id; cluster_size is used as a divisor, so it must be non-zero.
+static bool spv_emulate_clustered_reduce_logical_and(__local uint* scratch, bool val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val ? 1u : 0u; // booleans are published as 0/1
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    uint first = subgroup_base + (lane_id / cluster_size) * cluster_size; // scratch slot of the cluster's first lane
+    bool all_set = true;
+    for (uint k = 0u; k < cluster_size; ++k) all_set = all_set && (scratch[first + k] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return all_set;
+}
+
+// True if any of the subgroup_size scratch slots starting at subgroup_base holds a non-zero flag.
+static bool spv_emulate_reduce_logical_or(__local uint* scratch, bool val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val ? 1u : 0u; // booleans are published as 0/1
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    bool any_set = false;
+    for (uint lane = 0u; lane < subgroup_size; ++lane) any_set = any_set || (scratch[subgroup_base + lane] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return any_set;
+}
+
+// Inclusive logical-OR scan: true if any flag of lanes 0..lane_id is non-zero (subgroup_size is not used).
+static bool spv_emulate_inclusive_scan_logical_or(__local uint* scratch, bool val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val ? 1u : 0u; // booleans are published as 0/1
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    bool any_set = false;
+    for (uint lane = 0u; lane <= lane_id; ++lane) any_set = any_set || (scratch[subgroup_base + lane] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return any_set;
+}
+
+// Exclusive logical-OR scan over the lanes below lane_id; lane 0 gets the identity false.
+static bool spv_emulate_exclusive_scan_logical_or(__local uint* scratch, bool val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val ? 1u : 0u; // booleans are published as 0/1
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    bool any_set = false;
+    for (uint lane = 0u; lane < lane_id; ++lane) any_set = any_set || (scratch[subgroup_base + lane] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return any_set;
+}
+
+// Logical OR over the cluster_size-lane cluster that contains lane_id; cluster_size is used as a divisor, so it must be non-zero.
+static bool spv_emulate_clustered_reduce_logical_or(__local uint* scratch, bool val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val ? 1u : 0u; // booleans are published as 0/1
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    uint first = subgroup_base + (lane_id / cluster_size) * cluster_size; // scratch slot of the cluster's first lane
+    bool any_set = false;
+    for (uint k = 0u; k < cluster_size; ++k) any_set = any_set || (scratch[first + k] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return any_set;
+}
+
+// Logical XOR (parity) of the flags in the subgroup_size scratch slots starting at subgroup_base.
+static bool spv_emulate_reduce_logical_xor(__local uint* scratch, bool val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val ? 1u : 0u; // booleans are published as 0/1
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    bool odd = false;
+    for (uint lane = 0u; lane < subgroup_size; ++lane) odd = odd != (scratch[subgroup_base + lane] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return odd;
+}
+
+// Inclusive logical-XOR scan: parity of the flags of lanes 0..lane_id (subgroup_size is not used).
+static bool spv_emulate_inclusive_scan_logical_xor(__local uint* scratch, bool val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val ? 1u : 0u; // booleans are published as 0/1
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    bool odd = false;
+    for (uint lane = 0u; lane <= lane_id; ++lane) odd = odd != (scratch[subgroup_base + lane] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return odd;
+}
+
+// Exclusive logical-XOR scan: parity of the flags of the lanes below lane_id; lane 0 gets the identity false.
+static bool spv_emulate_exclusive_scan_logical_xor(__local uint* scratch, bool val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val ? 1u : 0u; // booleans are published as 0/1
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    bool odd = false;
+    for (uint lane = 0u; lane < lane_id; ++lane) odd = odd != (scratch[subgroup_base + lane] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return odd;
+}
+
+// Logical XOR (parity) over the cluster_size-lane cluster that contains lane_id; cluster_size is used as a divisor, so it must be non-zero.
+static bool spv_emulate_clustered_reduce_logical_xor(__local uint* scratch, bool val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val ? 1u : 0u; // booleans are published as 0/1
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's store
+    uint first = subgroup_base + (lane_id / cluster_size) * cluster_size; // scratch slot of the cluster's first lane
+    bool odd = false;
+    for (uint k = 0u; k < cluster_size; ++k) odd = odd != (scratch[first + k] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // wait for every work-item's reads
+    return odd;
+}
+
+// Tests bit lane_id of the 128-bit ballot (x holds bits 0..31, then y, z, w); word indices above 2 all select ballot.w.
+static bool spv_emulate_inverse_ballot(uint4 ballot, uint lane_id) {
+    uint word_index = lane_id >> 5u;
+    uint chosen = (word_index == 0u) ? ballot.x : (word_index == 1u) ? ballot.y : (word_index == 2u) ? ballot.z : ballot.w;
+    return ((chosen >> (lane_id & 31u)) & 1u) != 0u;
+}
+
+// Reports whether bit `index` of the 128-bit ballot is set; indices of 96 and above all read ballot.w.
+static bool spv_emulate_ballot_bit_extract(uint4 ballot, uint index) {
+    uint component = index / 32u;
+    uint bits = (component >= 3u) ? ballot.w : (component == 2u) ? ballot.z : (component == 1u) ? ballot.y : ballot.x;
+    return (bits & (1u << (index % 32u))) != 0u;
+}
+
+static uint spv_popcount4(uint4 v) { // total number of set bits across the four 32-bit components
+    return (popcount(v.x) + popcount(v.y)) + (popcount(v.z) + popcount(v.w));
+}
+
+static uint spv_emulate_ballot_bit_count(uint4 ballot) { // number of set bits in the whole 128-bit ballot
+    return popcount(ballot.x) + popcount(ballot.y) + popcount(ballot.z) + popcount(ballot.w);
+}
+
+// Number of set ballot bits at positions 0..lane_id inclusive: lower words count in full, lane_id's word is masked, higher words are cleared.
+static uint spv_emulate_ballot_inclusive_bit_count(uint4 ballot, uint lane_id) {
+    uint word = lane_id / 32u, bit = lane_id % 32u;
+    uint keep = (bit == 31u) ? 0xFFFFFFFFu : ((1u << (bit + 1u)) - 1u); // bits 0..bit; bit 31 is special-cased to avoid a 32-bit shift
+    uint4 kept = ballot;
+    if (word == 0u) kept = (uint4)(ballot.x & keep, 0u, 0u, 0u);
+    else if (word == 1u) kept = (uint4)(ballot.x, ballot.y & keep, 0u, 0u);
+    else if (word == 2u) kept = (uint4)(ballot.x, ballot.y, ballot.z & keep, 0u);
+    else kept.w &= keep;
+    return spv_popcount4(kept);
+}
+
+// Number of set ballot bits strictly below position lane_id (0 for lane 0).
+static uint spv_emulate_ballot_exclusive_bit_count(uint4 ballot, uint lane_id) {
+    return (lane_id == 0u) ? 0u : spv_emulate_ballot_inclusive_bit_count(ballot, lane_id - 1u);
+}
+
+// Index (0..127) of the lowest set ballot bit, scanning x, y, z, w; ~0u when the ballot is empty. v & (0u - v) isolates the lowest set bit, and 31u - clz() of it is that bit's index (clz alone gives 31 - index).
+static uint spv_emulate_ballot_find_lsb(uint4 ballot) {
+    if (ballot.x != 0u) return 31u - (uint)clz(ballot.x & (0u - ballot.x)); // bits 0..31
+    if (ballot.y != 0u) return 63u - (uint)clz(ballot.y & (0u - ballot.y)); // bits 32..63
+    if (ballot.z != 0u) return 95u - (uint)clz(ballot.z & (0u - ballot.z)); // bits 64..95
+    return (ballot.w != 0u) ? 127u - (uint)clz(ballot.w & (0u - ballot.w)) : ~0u; // bits 96..127, or empty
+}
+
+// Index (0..127) of the highest set ballot bit, scanning w, z, y, x; ~0u when the ballot is empty.
+static uint spv_emulate_ballot_find_msb(uint4 ballot) {
+    if (ballot.w != 0u) return 96u + (31u - (uint)clz(ballot.w));
+    if (ballot.z != 0u) return 64u + (31u - (uint)clz(ballot.z));
+    if (ballot.y != 0u) return 32u + (31u - (uint)clz(ballot.y));
+    return (ballot.x != 0u) ? (31u - (uint)clz(ballot.x)) : ~0u;
+}
+
+// Kernel entry point: derives 32-lane emulated-subgroup coordinates from the local id, runs the emulated reductions/scans through one 256-slot __local scratch array, and stores the combined result in _13->FragColor.
+__attribute__((reqd_work_group_size(256, 1, 1)))
+__kernel void comp_main(__global SSBO* _13)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    uint wg_items = get_local_size(0) * get_local_size(1) * get_local_size(2);
+    uint flat_id = ((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0)) + get_local_id(0); // x-major linear local id
+    uint sg_size = 32u;
+    uint lane = flat_id % 32u, sg_index = flat_id / 32u;
+    uint sg_count = wg_items / 32u;
+    uint sg_base = sg_index * 32u; // scratch slot of this subgroup's lane 0
+    __local uint sg_scratch[256];
+    float f_prod = spv_emulate_reduce_mul_float(sg_scratch, _13->FragColor, flat_id, sg_base, sg_size);
+    int i_prod = spv_emulate_reduce_mul_int(sg_scratch, _13->idat, flat_id, sg_base, sg_size);
+    uint u_prod = spv_emulate_reduce_mul_uint(sg_scratch, _13->udat, flat_id, sg_base, sg_size);
+    uint u_and = spv_emulate_reduce_and_uint(sg_scratch, _13->udat, flat_id, sg_base, sg_size);
+    uint u_or = spv_emulate_reduce_or_uint(sg_scratch, _13->udat, flat_id, sg_base, sg_size);
+    uint u_xor = spv_emulate_reduce_xor_uint(sg_scratch, _13->udat, flat_id, sg_base, sg_size);
+    bool all_nz = spv_emulate_reduce_logical_and(sg_scratch, _13->udat > 0u, flat_id, sg_base, sg_size);
+    bool any_nz = spv_emulate_reduce_logical_or(sg_scratch, _13->udat > 0u, flat_id, sg_base, sg_size);
+    bool odd_nz = spv_emulate_reduce_logical_xor(sg_scratch, _13->udat > 0u, flat_id, sg_base, sg_size);
+    float f_prod_inc = spv_emulate_inclusive_scan_mul_float(sg_scratch, _13->FragColor, lane, flat_id, sg_base, sg_size);
+    float f_prod_exc = spv_emulate_exclusive_scan_mul_float(sg_scratch, _13->FragColor, lane, flat_id, sg_base, sg_size);
+    _13->FragColor = ((((((f_prod + f_prod_inc) + f_prod_exc) + convert_float(i_prod + as_int(u_prod))) + convert_float((u_and + u_or) + u_xor)) + (float)(all_nz)) + (float)(any_nz)) + (float)(odd_nz);
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/subgroups-ballot.nocompat.vk.subgroups-emulate.comp b/reference/shaders-opencl-no-opt/comp/subgroups-ballot.nocompat.vk.subgroups-emulate.comp
new file mode 100644
index 000000000..46c143c9a
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/subgroups-ballot.nocompat.vk.subgroups-emulate.comp
@@ -0,0 +1,1107 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO
+{
+    float FragColor;
+    uint udat;
+};
+
+typedef struct SSBO SSBO;
+
+static uint spv_emulate_broadcast_uint(__local uint* scratch, uint val, uint src_lane, uint linear_id, uint subgroup_base) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base + src_lane];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_broadcast_first_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_shuffle_uint(__local uint* scratch, uint val, uint index, uint linear_id, uint subgroup_base) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base + index];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_shuffle_xor_uint(__local uint* scratch, uint val, uint mask, uint lane_id, uint linear_id, uint subgroup_base) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base + (lane_id ^ mask)];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_shuffle_up_uint(__local uint* scratch, uint val, uint delta, uint lane_id, uint linear_id, uint subgroup_base) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = (lane_id >= delta) ? scratch[linear_id - delta] : val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_shuffle_down_uint(__local uint* scratch, uint val, uint delta, uint lane_id, uint linear_id, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = (lane_id + delta < subgroup_size) ? scratch[linear_id + delta] : val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_rotate_uint(__local uint* scratch, uint val, uint delta, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base + ((lane_id + delta) % subgroup_size)];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_clustered_rotate_uint(__local uint* scratch, uint val, uint delta, uint lane_id, uint linear_id, uint subgroup_base, uint cluster_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base = (lane_id / cluster_size) * cluster_size;
+    uint r = scratch[subgroup_base + cluster_base + ((lane_id - cluster_base + delta) % cluster_size)];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static bool spv_emulate_all(__local uint* scratch, bool predicate, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = predicate ? 1u : 0u;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    bool r = true;
+    for (uint i = 0u; i < subgroup_size; i++)
+        r = r && (scratch[subgroup_base + i] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static bool spv_emulate_any(__local uint* scratch, bool predicate, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = predicate ? 1u : 0u;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    bool r = false;
+    for (uint i = 0u; i < subgroup_size; i++)
+        r = r || (scratch[subgroup_base + i] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static bool spv_emulate_all_equal_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint first = scratch[subgroup_base];
+    bool r = true;
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r && (scratch[subgroup_base + i] == first);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint4 spv_emulate_ballot(__local uint* scratch, bool predicate, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = predicate ? 1u : 0u;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint4 r = (uint4)(0u);
+    for (uint i = 0u; i < subgroup_size; i++) {
+        if (scratch[subgroup_base + i] != 0u) {
+            uint word = i / 32u;
+            uint bit = i % 32u;
+            if (word == 0u) r.x |= (1u << bit);
+            else if (word == 1u) r.y |= (1u << bit);
+            else if (word == 2u) r.z |= (1u << bit);
+            else r.w |= (1u << bit);
+        }
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint4 spv_subgroup_eq_mask(uint lane_id) {
+    uint4 r = (uint4)(0u);
+    uint word = lane_id / 32u;
+    uint bit = lane_id % 32u;
+    if (word == 0u) r.x = (1u << bit);
+    else if (word == 1u) r.y = (1u << bit);
+    else if (word == 2u) r.z = (1u << bit);
+    else r.w = (1u << bit);
+    return r;
+}
+
+static uint4 spv_subgroup_ge_mask(uint lane_id, uint subgroup_size) {
+    uint4 r = (uint4)(0u);
+    for (uint i = lane_id; i < subgroup_size; i++) {
+        uint word = i / 32u;
+        uint bit = i % 32u;
+        if (word == 0u) r.x |= (1u << bit);
+        else if (word == 1u) r.y |= (1u << bit);
+        else if (word == 2u) r.z |= (1u << bit);
+        else r.w |= (1u << bit);
+    }
+    return r;
+}
+
+static uint4 spv_subgroup_gt_mask(uint lane_id, uint subgroup_size) {
+    return spv_subgroup_ge_mask(lane_id + 1u, subgroup_size);
+}
+
+static uint4 spv_subgroup_le_mask(uint lane_id, uint subgroup_size) {
+    return spv_subgroup_ge_mask(0u, lane_id + 1u);
+}
+
+static uint4 spv_subgroup_lt_mask(uint lane_id) {
+    if (lane_id == 0u) return (uint4)(0u);
+    return spv_subgroup_ge_mask(0u, lane_id);
+}
+
+static uint spv_emulate_reduce_add_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r + scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_inclusive_scan_add_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r + scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_exclusive_scan_add_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = 0u;
+    for (uint i = 0u; i < lane_id; i++)
+        r = r + scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_add_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    uint r = scratch[subgroup_base + cluster_base_in_sg];
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r + scratch[subgroup_base + cluster_base_in_sg + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_reduce_mul_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r * scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_inclusive_scan_mul_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r * scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_exclusive_scan_mul_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = 1u;
+    for (uint i = 0u; i < lane_id; i++)
+        r = r * scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_mul_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    uint r = scratch[subgroup_base + cluster_base_in_sg];
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r * scratch[subgroup_base + cluster_base_in_sg + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_reduce_min_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = min(r, scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_inclusive_scan_min_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i <= lane_id; i++)
+        r = min(r, scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_exclusive_scan_min_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = UINT_MAX;
+    for (uint i = 0u; i < lane_id; i++)
+        r = min(r, scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_min_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    uint r = scratch[subgroup_base + cluster_base_in_sg];
+    for (uint i = 1u; i < cluster_size; i++)
+        r = min(r, scratch[subgroup_base + cluster_base_in_sg + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_reduce_max_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = max(r, scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_inclusive_scan_max_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i <= lane_id; i++)
+        r = max(r, scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_exclusive_scan_max_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = 0u;
+    for (uint i = 0u; i < lane_id; i++)
+        r = max(r, scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_max_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    uint r = scratch[subgroup_base + cluster_base_in_sg];
+    for (uint i = 1u; i < cluster_size; i++)
+        r = max(r, scratch[subgroup_base + cluster_base_in_sg + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_reduce_and_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r & scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_inclusive_scan_and_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r & scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_exclusive_scan_and_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = 0xFFFFFFFFu;
+    for (uint i = 0u; i < lane_id; i++)
+        r = r & scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_and_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    uint r = scratch[subgroup_base + cluster_base_in_sg];
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r & scratch[subgroup_base + cluster_base_in_sg + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_reduce_or_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r | scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_inclusive_scan_or_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r | scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_exclusive_scan_or_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = 0u;
+    for (uint i = 0u; i < lane_id; i++)
+        r = r | scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_or_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    uint r = scratch[subgroup_base + cluster_base_in_sg];
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r | scratch[subgroup_base + cluster_base_in_sg + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_reduce_xor_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r ^ scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_inclusive_scan_xor_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r ^ scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_exclusive_scan_xor_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = 0u;
+    for (uint i = 0u; i < lane_id; i++)
+        r = r ^ scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_xor_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    uint r = scratch[subgroup_base + cluster_base_in_sg];
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r ^ scratch[subgroup_base + cluster_base_in_sg + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_reduce_add_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(scratch[subgroup_base]);
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r + as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_inclusive_scan_add_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(scratch[subgroup_base]);
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r + as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_exclusive_scan_add_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = 0;
+    for (uint i = 0u; i < lane_id; i++)
+        r = r + as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_clustered_reduce_add_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    int r = as_int(scratch[subgroup_base + cluster_base_in_sg]);
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r + as_int(scratch[subgroup_base + cluster_base_in_sg + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_reduce_mul_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(scratch[subgroup_base]);
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r * as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_inclusive_scan_mul_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(scratch[subgroup_base]);
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r * as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_exclusive_scan_mul_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = 1;
+    for (uint i = 0u; i < lane_id; i++)
+        r = r * as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_clustered_reduce_mul_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    int r = as_int(scratch[subgroup_base + cluster_base_in_sg]);
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r * as_int(scratch[subgroup_base + cluster_base_in_sg + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_reduce_min_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(scratch[subgroup_base]);
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = min(r, as_int(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_inclusive_scan_min_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(scratch[subgroup_base]);
+    for (uint i = 1u; i <= lane_id; i++)
+        r = min(r, as_int(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_exclusive_scan_min_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = INT_MAX;
+    for (uint i = 0u; i < lane_id; i++)
+        r = min(r, as_int(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_clustered_reduce_min_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    int r = as_int(scratch[subgroup_base + cluster_base_in_sg]);
+    for (uint i = 1u; i < cluster_size; i++)
+        r = min(r, as_int(scratch[subgroup_base + cluster_base_in_sg + i]));
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_reduce_max_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(scratch[subgroup_base]);
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = max(r, as_int(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_inclusive_scan_max_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(scratch[subgroup_base]);
+    for (uint i = 1u; i <= lane_id; i++)
+        r = max(r, as_int(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_exclusive_scan_max_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = INT_MIN;
+    for (uint i = 0u; i < lane_id; i++)
+        r = max(r, as_int(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_clustered_reduce_max_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    int r = as_int(scratch[subgroup_base + cluster_base_in_sg]);
+    for (uint i = 1u; i < cluster_size; i++)
+        r = max(r, as_int(scratch[subgroup_base + cluster_base_in_sg + i]));
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_reduce_and_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(scratch[subgroup_base]);
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r & as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_inclusive_scan_and_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(scratch[subgroup_base]);
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r & as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_exclusive_scan_and_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(0xFFFFFFFFu);
+    for (uint i = 0u; i < lane_id; i++)
+        r = r & as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_clustered_reduce_and_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    int r = as_int(scratch[subgroup_base + cluster_base_in_sg]);
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r & as_int(scratch[subgroup_base + cluster_base_in_sg + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_reduce_or_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(scratch[subgroup_base]);
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r | as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_inclusive_scan_or_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(scratch[subgroup_base]);
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r | as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_exclusive_scan_or_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = 0;
+    for (uint i = 0u; i < lane_id; i++)
+        r = r | as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_clustered_reduce_or_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    int r = as_int(scratch[subgroup_base + cluster_base_in_sg]);
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r | as_int(scratch[subgroup_base + cluster_base_in_sg + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_reduce_xor_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(scratch[subgroup_base]);
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r ^ as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_inclusive_scan_xor_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(scratch[subgroup_base]);
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r ^ as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_exclusive_scan_xor_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = 0;
+    for (uint i = 0u; i < lane_id; i++)
+        r = r ^ as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_clustered_reduce_xor_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    int r = as_int(scratch[subgroup_base + cluster_base_in_sg]);
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r ^ as_int(scratch[subgroup_base + cluster_base_in_sg + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static float spv_emulate_reduce_add_float(__local uint* scratch, float val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    float r = as_float(scratch[subgroup_base]);
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r + as_float(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static float spv_emulate_inclusive_scan_add_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    float r = as_float(scratch[subgroup_base]);
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r + as_float(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static float spv_emulate_exclusive_scan_add_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    float r = 0.0f;
+    for (uint i = 0u; i < lane_id; i++)
+        r = r + as_float(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static float spv_emulate_clustered_reduce_add_float(__local uint* scratch, float val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated clustered add-reduce: sums the cluster_size consecutive slots of the cluster containing lane_id; subgroup_size is unused.
+    scratch[linear_id] = as_uint(val); // publish own value as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size
+    float r = as_float(scratch[subgroup_base + cluster_base_in_sg]);
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r + as_float(scratch[subgroup_base + cluster_base_in_sg + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static float spv_emulate_reduce_mul_float(__local uint* scratch, float val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated subgroup mul-reduce: returns the product of the floats in slots subgroup_base .. subgroup_base + subgroup_size - 1.
+    scratch[linear_id] = as_uint(val); // publish own value as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    float r = as_float(scratch[subgroup_base]);
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r * as_float(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static float spv_emulate_inclusive_scan_mul_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated inclusive mul-scan: multiplies slots subgroup_base + 0 .. subgroup_base + lane_id; subgroup_size is unused.
+    scratch[linear_id] = as_uint(val); // publish own value as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    float r = as_float(scratch[subgroup_base]);
+    for (uint i = 1u; i <= lane_id; i++) // upper bound includes this lane's own slot
+        r = r * as_float(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static float spv_emulate_exclusive_scan_mul_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated exclusive mul-scan: multiplies only the slots of lanes below lane_id; subgroup_size is unused.
+    scratch[linear_id] = as_uint(val); // publish own value as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    float r = 1.0f; // multiplicative identity; lane 0 returns it unchanged
+    for (uint i = 0u; i < lane_id; i++)
+        r = r * as_float(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static float spv_emulate_clustered_reduce_mul_float(__local uint* scratch, float val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated clustered mul-reduce: multiplies the cluster_size consecutive slots of the cluster containing lane_id; subgroup_size is unused.
+    scratch[linear_id] = as_uint(val); // publish own value as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size
+    float r = as_float(scratch[subgroup_base + cluster_base_in_sg]);
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r * as_float(scratch[subgroup_base + cluster_base_in_sg + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static float spv_emulate_reduce_min_float(__local uint* scratch, float val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated subgroup min-reduce: folds fmin over the floats in slots subgroup_base .. subgroup_base + subgroup_size - 1.
+    scratch[linear_id] = as_uint(val); // publish own value as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    float r = as_float(scratch[subgroup_base]);
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = fmin(r, as_float(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static float spv_emulate_inclusive_scan_min_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated inclusive min-scan: folds fmin over slots subgroup_base + 0 .. subgroup_base + lane_id; subgroup_size is unused.
+    scratch[linear_id] = as_uint(val); // publish own value as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    float r = as_float(scratch[subgroup_base]);
+    for (uint i = 1u; i <= lane_id; i++) // upper bound includes this lane's own slot
+        r = fmin(r, as_float(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static float spv_emulate_exclusive_scan_min_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated exclusive min-scan: folds fmin over only the slots of lanes below lane_id; subgroup_size is unused.
+    scratch[linear_id] = as_uint(val); // publish own value as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    float r = INFINITY; // starting value for the fmin fold; lane 0 returns it unchanged
+    for (uint i = 0u; i < lane_id; i++)
+        r = fmin(r, as_float(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static float spv_emulate_clustered_reduce_min_float(__local uint* scratch, float val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated clustered min-reduce: folds fmin over the cluster_size consecutive slots of the cluster containing lane_id; subgroup_size is unused.
+    scratch[linear_id] = as_uint(val); // publish own value as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size
+    float r = as_float(scratch[subgroup_base + cluster_base_in_sg]);
+    for (uint i = 1u; i < cluster_size; i++)
+        r = fmin(r, as_float(scratch[subgroup_base + cluster_base_in_sg + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static float spv_emulate_reduce_max_float(__local uint* scratch, float val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated subgroup max-reduce: folds fmax over the floats in slots subgroup_base .. subgroup_base + subgroup_size - 1.
+    scratch[linear_id] = as_uint(val); // publish own value as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    float r = as_float(scratch[subgroup_base]);
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = fmax(r, as_float(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static float spv_emulate_inclusive_scan_max_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated inclusive max-scan: folds fmax over slots subgroup_base + 0 .. subgroup_base + lane_id; subgroup_size is unused.
+    scratch[linear_id] = as_uint(val); // publish own value as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    float r = as_float(scratch[subgroup_base]);
+    for (uint i = 1u; i <= lane_id; i++) // upper bound includes this lane's own slot
+        r = fmax(r, as_float(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static float spv_emulate_exclusive_scan_max_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated exclusive max-scan: folds fmax over only the slots of lanes below lane_id; subgroup_size is unused.
+    scratch[linear_id] = as_uint(val); // publish own value as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    float r = -INFINITY; // starting value for the fmax fold; lane 0 returns it unchanged
+    for (uint i = 0u; i < lane_id; i++)
+        r = fmax(r, as_float(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static float spv_emulate_clustered_reduce_max_float(__local uint* scratch, float val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated clustered max-reduce: folds fmax over the cluster_size consecutive slots of the cluster containing lane_id; subgroup_size is unused.
+    scratch[linear_id] = as_uint(val); // publish own value as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size
+    float r = as_float(scratch[subgroup_base + cluster_base_in_sg]);
+    for (uint i = 1u; i < cluster_size; i++)
+        r = fmax(r, as_float(scratch[subgroup_base + cluster_base_in_sg + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static bool spv_emulate_reduce_logical_and(__local uint* scratch, bool val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated subgroup logical-AND reduce: true only if every slot subgroup_base .. subgroup_base + subgroup_size - 1 is non-zero.
+    scratch[linear_id] = val ? 1u : 0u; // booleans travel through scratch as 1/0
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    bool r = true; // AND identity
+    for (uint i = 0u; i < subgroup_size; i++)
+        r = r && (scratch[subgroup_base + i] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static bool spv_emulate_inclusive_scan_logical_and(__local uint* scratch, bool val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated inclusive logical-AND scan over lanes 0 .. lane_id; subgroup_size is unused.
+    scratch[linear_id] = val ? 1u : 0u; // booleans travel through scratch as 1/0
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    bool r = true; // AND identity
+    for (uint i = 0u; i <= lane_id; i++) // upper bound includes this lane's own slot
+        r = r && (scratch[subgroup_base + i] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static bool spv_emulate_exclusive_scan_logical_and(__local uint* scratch, bool val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated exclusive logical-AND scan over lanes strictly below lane_id; subgroup_size is unused.
+    scratch[linear_id] = val ? 1u : 0u; // booleans travel through scratch as 1/0
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    bool r = true; // AND identity; lane 0 returns it unchanged
+    for (uint i = 0u; i < lane_id; i++)
+        r = r && (scratch[subgroup_base + i] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static bool spv_emulate_clustered_reduce_logical_and(__local uint* scratch, bool val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated clustered logical-AND reduce over the cluster_size slots of the cluster containing lane_id; subgroup_size is unused.
+    scratch[linear_id] = val ? 1u : 0u; // booleans travel through scratch as 1/0
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size
+    bool r = true; // AND identity
+    for (uint i = 0u; i < cluster_size; i++)
+        r = r && (scratch[subgroup_base + cluster_base_in_sg + i] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static bool spv_emulate_reduce_logical_or(__local uint* scratch, bool val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated subgroup logical-OR reduce: true if any slot subgroup_base .. subgroup_base + subgroup_size - 1 is non-zero.
+    scratch[linear_id] = val ? 1u : 0u; // booleans travel through scratch as 1/0
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    bool r = false; // OR identity
+    for (uint i = 0u; i < subgroup_size; i++)
+        r = r || (scratch[subgroup_base + i] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static bool spv_emulate_inclusive_scan_logical_or(__local uint* scratch, bool val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated inclusive logical-OR scan over lanes 0 .. lane_id; subgroup_size is unused.
+    scratch[linear_id] = val ? 1u : 0u; // booleans travel through scratch as 1/0
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    bool r = false; // OR identity
+    for (uint i = 0u; i <= lane_id; i++) // upper bound includes this lane's own slot
+        r = r || (scratch[subgroup_base + i] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static bool spv_emulate_exclusive_scan_logical_or(__local uint* scratch, bool val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated exclusive logical-OR scan over lanes strictly below lane_id; subgroup_size is unused.
+    scratch[linear_id] = val ? 1u : 0u; // booleans travel through scratch as 1/0
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    bool r = false; // OR identity; lane 0 returns it unchanged
+    for (uint i = 0u; i < lane_id; i++)
+        r = r || (scratch[subgroup_base + i] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static bool spv_emulate_clustered_reduce_logical_or(__local uint* scratch, bool val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated clustered logical-OR reduce over the cluster_size slots of the cluster containing lane_id; subgroup_size is unused.
+    scratch[linear_id] = val ? 1u : 0u; // booleans travel through scratch as 1/0
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size
+    bool r = false; // OR identity
+    for (uint i = 0u; i < cluster_size; i++)
+        r = r || (scratch[subgroup_base + cluster_base_in_sg + i] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static bool spv_emulate_reduce_logical_xor(__local uint* scratch, bool val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated subgroup logical-XOR reduce: true when an odd number of slots subgroup_base .. subgroup_base + subgroup_size - 1 are non-zero.
+    scratch[linear_id] = val ? 1u : 0u; // booleans travel through scratch as 1/0
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    bool r = false; // XOR identity
+    for (uint i = 0u; i < subgroup_size; i++)
+        r = r != (scratch[subgroup_base + i] != 0u); // != on two bools acts as logical XOR
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static bool spv_emulate_inclusive_scan_logical_xor(__local uint* scratch, bool val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated inclusive logical-XOR scan over lanes 0 .. lane_id; subgroup_size is unused.
+    scratch[linear_id] = val ? 1u : 0u; // booleans travel through scratch as 1/0
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    bool r = false; // XOR identity
+    for (uint i = 0u; i <= lane_id; i++) // upper bound includes this lane's own slot
+        r = r != (scratch[subgroup_base + i] != 0u); // != on two bools acts as logical XOR
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static bool spv_emulate_exclusive_scan_logical_xor(__local uint* scratch, bool val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated exclusive logical-XOR scan over lanes strictly below lane_id; subgroup_size is unused.
+    scratch[linear_id] = val ? 1u : 0u; // booleans travel through scratch as 1/0
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    bool r = false; // XOR identity; lane 0 returns it unchanged
+    for (uint i = 0u; i < lane_id; i++)
+        r = r != (scratch[subgroup_base + i] != 0u); // != on two bools acts as logical XOR
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static bool spv_emulate_clustered_reduce_logical_xor(__local uint* scratch, bool val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated clustered logical-XOR reduce over the cluster_size slots of the cluster containing lane_id; subgroup_size is unused.
+    scratch[linear_id] = val ? 1u : 0u; // booleans travel through scratch as 1/0
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size
+    bool r = false; // XOR identity
+    for (uint i = 0u; i < cluster_size; i++)
+        r = r != (scratch[subgroup_base + cluster_base_in_sg + i] != 0u); // != on two bools acts as logical XOR
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static bool spv_emulate_inverse_ballot(uint4 ballot, uint lane_id) { // True when this lane's bit is set in the 128-bit ballot.
+    const uint component = lane_id >> 5; // 32 lanes per ballot component
+    const uint selected = (component == 0u) ? ballot.x : (component == 1u) ? ballot.y : (component == 2u) ? ballot.z : ballot.w; // anything past z reads w
+    const uint shifted = selected >> (lane_id & 31u); // bring the lane's bit down to position 0
+    return (shifted & 1u) == 1u;
+}
+
+static bool spv_emulate_ballot_bit_extract(uint4 ballot, uint index) { // Tests bit `index` of the 128-bit ballot.
+    uint lanes = ballot.w; // default covers every index at or beyond 96
+    if (index < 32u) lanes = ballot.x;
+    else if (index < 64u) lanes = ballot.y;
+    else if (index < 96u) lanes = ballot.z;
+    return ((lanes >> (index % 32u)) & 1u) != 0u; }
+
+static uint spv_popcount4(uint4 v) { // Total number of set bits across all four components.
+    const uint4 n = popcount(v); return n.x + n.y + n.z + n.w; // component-wise popcount, then horizontal sum
+}
+
+static uint spv_emulate_ballot_bit_count(uint4 ballot) { // Number of bits set in the ballot.
+    return popcount(ballot.x) + popcount(ballot.y) + popcount(ballot.z) + popcount(ballot.w); // per-component popcounts summed inline
+}
+
+static uint spv_emulate_ballot_inclusive_bit_count(uint4 ballot, uint lane_id) { // Counts set ballot bits at positions 0 .. lane_id inclusive.
+    const uint all_bits = 0xFFFFFFFFu;
+    const uint component = lane_id / 32u; // component holding this lane's bit
+    const uint top = lane_id % 32u; // highest bit position kept in that component
+    const uint keep = all_bits >> (31u - top); // bits 0..top set; no out-of-range shift when top == 31
+    uint count = popcount(ballot.x & ((component == 0u) ? keep : all_bits)); // x always contributes, fully or partially
+    if (component >= 1u) count += popcount(ballot.y & ((component == 1u) ? keep : all_bits));
+    if (component >= 2u) count += popcount(ballot.z & ((component == 2u) ? keep : all_bits));
+    if (component >= 3u) count += popcount(ballot.w & keep); // lanes past w's range are treated as living in w
+    return count;
+}
+
+static uint spv_emulate_ballot_exclusive_bit_count(uint4 ballot, uint lane_id) { // Counts set ballot bits strictly below lane_id.
+    const bool has_lower_lanes = (lane_id > 0u); // lane 0 has nothing beneath it; also avoids wrapping lane_id - 1u
+    return has_lower_lanes ? spv_emulate_ballot_inclusive_bit_count(ballot, lane_id - 1u) : 0u;
+}
+
+static uint spv_emulate_ballot_find_lsb(uint4 ballot) { // Index (0..127) of the lowest set ballot bit, or ~0u when the ballot is empty.
+    if (ballot.x != 0u) return 31u - (uint)clz(ballot.x & (0u - ballot.x)); // x & -x isolates the lowest set bit; clz of that one-hot word is 31 - index, so subtract from 31
+    if (ballot.y != 0u) return 63u - (uint)clz(ballot.y & (0u - ballot.y)); // 32 + (31 - clz)
+    if (ballot.z != 0u) return 95u - (uint)clz(ballot.z & (0u - ballot.z)); // 64 + (31 - clz)
+    if (ballot.w != 0u) return 127u - (uint)clz(ballot.w & (0u - ballot.w)); // 96 + (31 - clz)
+    return ~0u;
+}
+
+static uint spv_emulate_ballot_find_msb(uint4 ballot) { // Index of the highest set ballot bit, or ~0u for an empty ballot.
+    uint found = (ballot.x != 0u) ? (31u - (uint)clz(ballot.x)) : ~0u; // lowest-priority component doubles as the default
+    if (ballot.y != 0u) found = 63u - (uint)clz(ballot.y); // each higher non-empty component overrides the ones below it
+    if (ballot.z != 0u) found = 95u - (uint)clz(ballot.z);
+    if (ballot.w != 0u) found = 127u - (uint)clz(ballot.w);
+    return found;
+}
+
+__attribute__((reqd_work_group_size(256, 1, 1))) // kernel requires a 256x1x1 work-group
+__kernel void comp_main(__global SSBO* _23) // Exercises the emulated ballot helpers and the subgroup mask helpers, then folds every result into _23->FragColor.
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2)); // not referenced again in this kernel
+    uint _spv_linear_workgroup_size = get_local_size(0) * get_local_size(1) * get_local_size(2);
+    uint _spv_linear_id = (get_local_id(2) * get_local_size(1) * get_local_size(0)) + (get_local_id(1) * get_local_size(0)) + get_local_id(0); // flattened local id, x fastest
+    uint _spv_subgroup_size = 32u; // emulated subgroup width is fixed at 32 lanes
+    uint _spv_lane_id = _spv_linear_id % 32u;
+    uint _spv_subgroup_id = _spv_linear_id / 32u;
+    uint _spv_num_subgroups = _spv_linear_workgroup_size / 32u; // not referenced again in this kernel
+    uint _spv_subgroup_base = _spv_subgroup_id * 32u; // scratch index of lane 0 of this work-item's subgroup
+    __local uint _spv_subgroup_scratch[256]; // one slot per work-item of the 256-wide work-group
+    uint4 ballot_1 = spv_emulate_ballot(_spv_subgroup_scratch, _spv_lane_id < 16u, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size); // predicate holds for the lower 16 lanes of each emulated subgroup
+    float first_1 = as_float(spv_emulate_broadcast_first_uint(_spv_subgroup_scratch, as_uint(_23->FragColor), _spv_linear_id, _spv_subgroup_base)); // float is passed through the uint helper as raw bits
+    bool extracted_1 = spv_emulate_ballot_bit_extract(ballot_1, 5u);
+    uint bit_count_1 = spv_emulate_ballot_bit_count(ballot_1);
+    uint inclusive_count_1 = spv_emulate_ballot_inclusive_bit_count(ballot_1, _spv_lane_id);
+    uint exclusive_count_1 = spv_emulate_ballot_exclusive_bit_count(ballot_1, _spv_lane_id);
+    uint find_lsb_1 = spv_emulate_ballot_find_lsb(ballot_1);
+    uint find_msb_1 = spv_emulate_ballot_find_msb(ballot_1);
+    bool inv_ballot_1 = spv_emulate_inverse_ballot(ballot_1, _spv_lane_id);
+    uint4 eq_mask_1 = spv_subgroup_eq_mask(_spv_lane_id);
+    uint4 ge_mask_1 = spv_subgroup_ge_mask(_spv_lane_id, _spv_subgroup_size);
+    uint4 gt_mask_1 = spv_subgroup_gt_mask(_spv_lane_id, _spv_subgroup_size);
+    uint4 le_mask_1 = spv_subgroup_le_mask(_spv_lane_id, _spv_subgroup_size);
+    uint4 lt_mask_1 = spv_subgroup_lt_mask(_spv_lane_id);
+    _23->FragColor = ((((((((first_1 + convert_float(((ballot_1.x + ballot_1.y) + ballot_1.z) + ballot_1.w)) + convert_float((((bit_count_1 + inclusive_count_1) + exclusive_count_1) + find_lsb_1) + find_msb_1)) + (float)(extracted_1)) + (float)(inv_ballot_1)) + convert_float(eq_mask_1.x)) + convert_float(ge_mask_1.x)) + convert_float(gt_mask_1.x)) + convert_float(le_mask_1.x)) + convert_float(lt_mask_1.x); // every work-item stores to the same field; all intermediate results feed the sum
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/subgroups-basic.nocompat.vk.subgroups-emulate.comp b/reference/shaders-opencl-no-opt/comp/subgroups-basic.nocompat.vk.subgroups-emulate.comp
new file mode 100644
index 000000000..48c743edf
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/subgroups-basic.nocompat.vk.subgroups-emulate.comp
@@ -0,0 +1,1132 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO // Three-scalar record: one float, one int, one uint. NOTE(review): presumably the buffer block the kernel receives; the kernel of this file is outside this view - confirm.
+{
+    float FragColor;
+    int idat;
+    uint udat;
+};
+
+typedef struct SSBO SSBO; // lets the rest of the file write `SSBO` without the struct keyword
+
+static uint spv_emulate_broadcast_uint(__local uint* scratch, uint val, uint src_lane, uint linear_id, uint subgroup_base) { // Emulated subgroup broadcast: returns the value stored in slot subgroup_base + src_lane.
+    scratch[linear_id] = val; // publish own value
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    uint r = scratch[subgroup_base + src_lane];
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static uint spv_emulate_broadcast_first_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base) { // Emulated broadcast-first: returns the value stored in the subgroup's first slot, scratch[subgroup_base].
+    scratch[linear_id] = val; // publish own value
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    uint r = scratch[subgroup_base];
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static uint spv_emulate_shuffle_uint(__local uint* scratch, uint val, uint index, uint linear_id, uint subgroup_base) { // Emulated subgroup shuffle: returns the value stored in slot subgroup_base + index; index is not range-checked.
+    scratch[linear_id] = val; // publish own value
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    uint r = scratch[subgroup_base + index];
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static uint spv_emulate_shuffle_xor_uint(__local uint* scratch, uint val, uint mask, uint lane_id, uint linear_id, uint subgroup_base) { // Emulated shuffle-xor: returns the value stored by lane (lane_id ^ mask); the partner lane is not range-checked.
+    scratch[linear_id] = val; // publish own value
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    uint r = scratch[subgroup_base + (lane_id ^ mask)];
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static uint spv_emulate_shuffle_up_uint(__local uint* scratch, uint val, uint delta, uint lane_id, uint linear_id, uint subgroup_base) { // Emulated shuffle-up: returns the value stored delta slots below this work-item; subgroup_base is unused.
+    scratch[linear_id] = val; // publish own value
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    uint r = (lane_id >= delta) ? scratch[linear_id - delta] : val; // lanes with no source inside the subgroup keep their own value
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static uint spv_emulate_shuffle_down_uint(__local uint* scratch, uint val, uint delta, uint lane_id, uint linear_id, uint subgroup_size) { // Emulated shuffle-down: returns the value stored delta slots above this work-item.
+    scratch[linear_id] = val; // publish own value
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    uint r = (lane_id + delta < subgroup_size) ? scratch[linear_id + delta] : val; // lanes whose source would fall past the subgroup keep their own value
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static uint spv_emulate_rotate_uint(__local uint* scratch, uint val, uint delta, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated subgroup rotate: returns the value stored by lane (lane_id + delta) modulo subgroup_size.
+    scratch[linear_id] = val; // publish own value
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    uint r = scratch[subgroup_base + ((lane_id + delta) % subgroup_size)]; // modulo wraps the source lane back into the subgroup
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static uint spv_emulate_clustered_rotate_uint(__local uint* scratch, uint val, uint delta, uint lane_id, uint linear_id, uint subgroup_base, uint cluster_size) { // Emulated clustered rotate: like rotate, but the source lane wraps inside this lane's cluster_size-wide cluster.
+    scratch[linear_id] = val; // publish own value
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    uint cluster_base = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size
+    uint r = scratch[subgroup_base + cluster_base + ((lane_id - cluster_base + delta) % cluster_size)]; // offset within the cluster, rotated by delta
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static bool spv_emulate_all(__local uint* scratch, bool predicate, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated subgroup "all": true only if every slot subgroup_base .. subgroup_base + subgroup_size - 1 holds a true predicate.
+    scratch[linear_id] = predicate ? 1u : 0u; // booleans travel through scratch as 1/0
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    bool r = true;
+    for (uint i = 0u; i < subgroup_size; i++)
+        r = r && (scratch[subgroup_base + i] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static bool spv_emulate_any(__local uint* scratch, bool predicate, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated subgroup "any": true if at least one slot subgroup_base .. subgroup_base + subgroup_size - 1 holds a true predicate.
+    scratch[linear_id] = predicate ? 1u : 0u; // booleans travel through scratch as 1/0
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    bool r = false;
+    for (uint i = 0u; i < subgroup_size; i++)
+        r = r || (scratch[subgroup_base + i] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static bool spv_emulate_all_equal_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated subgroup "all equal": true when every slot of the subgroup matches the first slot.
+    scratch[linear_id] = val; // publish own value
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    uint first = scratch[subgroup_base]; // reference value every other slot is compared against
+    bool r = true;
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r && (scratch[subgroup_base + i] == first);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static uint4 spv_emulate_ballot(__local uint* scratch, bool predicate, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated subgroup ballot: bit i of the 128-bit result (x = bits 0..31, then y, z, w) is set when slot subgroup_base + i holds a true predicate.
+    scratch[linear_id] = predicate ? 1u : 0u; // booleans travel through scratch as 1/0
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    uint4 r = (uint4)(0u);
+    for (uint i = 0u; i < subgroup_size; i++) {
+        if (scratch[subgroup_base + i] != 0u) {
+            uint word = i / 32u; // which component holds bit i
+            uint bit = i % 32u; // position inside that component
+            if (word == 0u) r.x |= (1u << bit);
+            else if (word == 1u) r.y |= (1u << bit);
+            else if (word == 2u) r.z |= (1u << bit);
+            else r.w |= (1u << bit); // any index from 96 upward lands in w
+        }
+    }
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static uint4 spv_subgroup_eq_mask(uint lane_id) { // Ballot-style mask with only this lane's bit set.
+    const uint component = lane_id >> 5; // 32 lanes per component
+    const uint one_hot = 1u << (lane_id & 31u); // bit position inside the owning component
+    uint4 mask;
+    mask.x = (component == 0u) ? one_hot : 0u;
+    mask.y = (component == 1u) ? one_hot : 0u;
+    mask.z = (component == 2u) ? one_hot : 0u;
+    mask.w = (component >= 3u) ? one_hot : 0u; // everything past z is folded into w
+    return mask;
+}
+
+static uint4 spv_subgroup_ge_mask(uint lane_id, uint subgroup_size) { // Mask with one bit set per lane index in [lane_id, subgroup_size).
+    uint4 mask = (uint4)(0u); // stays zero when the range is empty
+    for (uint lane = lane_id; lane < subgroup_size; ++lane) {
+        const uint one_hot = 1u << (lane & 31u);
+        const uint component = lane >> 5;
+        if (component >= 3u) mask.w |= one_hot; // indices past z all accumulate into w
+        else if (component == 2u) mask.z |= one_hot;
+        else if (component == 1u) mask.y |= one_hot;
+        else mask.x |= one_hot;
+    }
+    return mask;
+}
+
+static uint4 spv_subgroup_gt_mask(uint lane_id, uint subgroup_size) { // Mask of lanes strictly above lane_id: the ge-mask of the next lane.
+    const uint first_lane = 1u + lane_id; return spv_subgroup_ge_mask(first_lane, subgroup_size);
+}
+
+static uint4 spv_subgroup_le_mask(uint lane_id, uint subgroup_size) { // Mask of lanes 0 .. lane_id inclusive; subgroup_size is not consulted.
+    const uint end_lane = 1u + lane_id; return spv_subgroup_ge_mask(0u, end_lane); // reuses the ge helper as a [0, end_lane) range fill
+}
+
+static uint4 spv_subgroup_lt_mask(uint lane_id) { // Mask of lanes strictly below lane_id.
+    const uint4 none = (uint4)(0u); // lane 0 has no lower lanes
+    return (lane_id != 0u) ? spv_subgroup_ge_mask(0u, lane_id) : none; // otherwise fill the range [0, lane_id)
+}
+
+static uint spv_emulate_reduce_add_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated subgroup add-reduce: returns the sum of slots subgroup_base .. subgroup_base + subgroup_size - 1.
+    scratch[linear_id] = val; // publish own value
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r + scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static uint spv_emulate_inclusive_scan_add_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated inclusive add-scan: sums slots subgroup_base + 0 .. subgroup_base + lane_id; subgroup_size is unused.
+    scratch[linear_id] = val; // publish own value
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i <= lane_id; i++) // upper bound includes this lane's own slot
+        r = r + scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static uint spv_emulate_exclusive_scan_add_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated exclusive add-scan: sums only the slots of lanes below lane_id; subgroup_size is unused.
+    scratch[linear_id] = val; // publish own value
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    uint r = 0u; // additive identity; lane 0 returns it unchanged
+    for (uint i = 0u; i < lane_id; i++)
+        r = r + scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_add_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated clustered add-reduce: sums the cluster_size consecutive slots of the cluster containing lane_id; subgroup_size is unused.
+    scratch[linear_id] = val; // publish own value
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size
+    uint r = scratch[subgroup_base + cluster_base_in_sg];
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r + scratch[subgroup_base + cluster_base_in_sg + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static uint spv_emulate_reduce_mul_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated subgroup mul-reduce: returns the product of slots subgroup_base .. subgroup_base + subgroup_size - 1.
+    scratch[linear_id] = val; // publish own value
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r * scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static uint spv_emulate_inclusive_scan_mul_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated inclusive mul-scan: multiplies slots subgroup_base + 0 .. subgroup_base + lane_id; subgroup_size is unused.
+    scratch[linear_id] = val; // publish own value
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i <= lane_id; i++) // upper bound includes this lane's own slot
+        r = r * scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static uint spv_emulate_exclusive_scan_mul_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated exclusive mul-scan: multiplies only the slots of lanes below lane_id; subgroup_size is unused.
+    scratch[linear_id] = val; // publish own value
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    uint r = 1u; // multiplicative identity; lane 0 returns it unchanged
+    for (uint i = 0u; i < lane_id; i++)
+        r = r * scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_mul_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated clustered mul-reduce: multiplies the cluster_size consecutive slots of the cluster containing lane_id; subgroup_size is unused.
+    scratch[linear_id] = val; // publish own value
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size
+    uint r = scratch[subgroup_base + cluster_base_in_sg];
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r * scratch[subgroup_base + cluster_base_in_sg + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static uint spv_emulate_reduce_min_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated subgroup min-reduce: returns the smallest of slots subgroup_base .. subgroup_base + subgroup_size - 1.
+    scratch[linear_id] = val; // publish own value
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = min(r, scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static uint spv_emulate_inclusive_scan_min_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated inclusive min-scan: smallest of slots subgroup_base + 0 .. subgroup_base + lane_id; subgroup_size is unused.
+    scratch[linear_id] = val; // publish own value
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i <= lane_id; i++) // upper bound includes this lane's own slot
+        r = min(r, scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static uint spv_emulate_exclusive_scan_min_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated exclusive min-scan: smallest of the slots of lanes below lane_id only; subgroup_size is unused.
+    scratch[linear_id] = val; // publish own value
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    uint r = UINT_MAX; // identity for unsigned min; lane 0 returns it unchanged
+    for (uint i = 0u; i < lane_id; i++)
+        r = min(r, scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_min_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated clustered min-reduce: smallest of the cluster_size consecutive slots of the cluster containing lane_id; subgroup_size is unused.
+    scratch[linear_id] = val; // publish own value
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size
+    uint r = scratch[subgroup_base + cluster_base_in_sg];
+    for (uint i = 1u; i < cluster_size; i++)
+        r = min(r, scratch[subgroup_base + cluster_base_in_sg + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static uint spv_emulate_reduce_max_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated subgroup max-reduce: returns the largest of slots subgroup_base .. subgroup_base + subgroup_size - 1.
+    scratch[linear_id] = val; // publish own value
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = max(r, scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static uint spv_emulate_inclusive_scan_max_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated inclusive max-scan: largest of slots subgroup_base + 0 .. subgroup_base + lane_id; subgroup_size is unused.
+    scratch[linear_id] = val; // publish own value
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i <= lane_id; i++) // upper bound includes this lane's own slot
+        r = max(r, scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static uint spv_emulate_exclusive_scan_max_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated exclusive max-scan: largest of the slots of lanes below lane_id only; subgroup_size is unused.
+    scratch[linear_id] = val; // publish own value
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    uint r = 0u; // identity for unsigned max; lane 0 returns it unchanged
+    for (uint i = 0u; i < lane_id; i++)
+        r = max(r, scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_max_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated clustered max-reduce: largest of the cluster_size consecutive slots of the cluster containing lane_id; subgroup_size is unused.
+    scratch[linear_id] = val; // publish own value
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size
+    uint r = scratch[subgroup_base + cluster_base_in_sg];
+    for (uint i = 1u; i < cluster_size; i++)
+        r = max(r, scratch[subgroup_base + cluster_base_in_sg + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static uint spv_emulate_reduce_and_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated subgroup bitwise-AND reduce over slots subgroup_base .. subgroup_base + subgroup_size - 1.
+    scratch[linear_id] = val; // publish own value
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r & scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static uint spv_emulate_inclusive_scan_and_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated inclusive bitwise-AND scan over slots subgroup_base + 0 .. subgroup_base + lane_id; subgroup_size is unused.
+    scratch[linear_id] = val; // publish own value
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i <= lane_id; i++) // upper bound includes this lane's own slot
+        r = r & scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static uint spv_emulate_exclusive_scan_and_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated exclusive bitwise-AND scan over the slots of lanes below lane_id only; subgroup_size is unused.
+    scratch[linear_id] = val; // publish own value
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    uint r = 0xFFFFFFFFu; // all-ones is the AND identity; lane 0 returns it unchanged
+    for (uint i = 0u; i < lane_id; i++)
+        r = r & scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_and_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated clustered bitwise-AND reduce over the cluster_size consecutive slots of the cluster containing lane_id; subgroup_size is unused.
+    scratch[linear_id] = val; // publish own value
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size
+    uint r = scratch[subgroup_base + cluster_base_in_sg];
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r & scratch[subgroup_base + cluster_base_in_sg + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static uint spv_emulate_reduce_or_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated subgroup bitwise-OR reduce over slots subgroup_base .. subgroup_base + subgroup_size - 1.
+    scratch[linear_id] = val; // publish own value
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r | scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static uint spv_emulate_inclusive_scan_or_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated inclusive bitwise-OR scan over slots subgroup_base + 0 .. subgroup_base + lane_id; subgroup_size is unused.
+    scratch[linear_id] = val; // publish own value
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i <= lane_id; i++) // upper bound includes this lane's own slot
+        r = r | scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static uint spv_emulate_exclusive_scan_or_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated exclusive bitwise-OR scan over the slots of lanes below lane_id only; subgroup_size is unused.
+    scratch[linear_id] = val; // publish own value
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    uint r = 0u; // OR identity; lane 0 returns it unchanged
+    for (uint i = 0u; i < lane_id; i++)
+        r = r | scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_or_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated clustered bitwise-OR reduce over the cluster_size consecutive slots of the cluster containing lane_id; subgroup_size is unused.
+    scratch[linear_id] = val; // publish own value
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size
+    uint r = scratch[subgroup_base + cluster_base_in_sg];
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r | scratch[subgroup_base + cluster_base_in_sg + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static uint spv_emulate_reduce_xor_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated subgroup bitwise-XOR reduce over slots subgroup_base .. subgroup_base + subgroup_size - 1.
+    scratch[linear_id] = val; // publish own value
+    barrier(CLK_LOCAL_MEM_FENCE); // all stores visible before any slot is read
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r ^ scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch can be rewritten
+    return r;
+}
+
+static uint spv_emulate_inclusive_scan_xor_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated inclusive scan: bitwise XOR of subgroup slots 0..lane_id (own value included); subgroup_size is unused.
+    scratch[linear_id] = val; // publish this work-item's operand in its own slot
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    uint r = scratch[subgroup_base]; // seed with slot 0 of the subgroup
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r ^ scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static uint spv_emulate_exclusive_scan_xor_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated exclusive scan: bitwise XOR of subgroup slots 0..lane_id-1 (own value excluded); subgroup_size is unused.
+    scratch[linear_id] = val; // publish this work-item's operand in its own slot
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    uint r = 0u; // identity of XOR; lane 0 returns it unchanged because its loop runs zero times
+    for (uint i = 0u; i < lane_id; i++)
+        r = r ^ scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_xor_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated clustered reduction: bitwise XOR over the cluster_size slots of the cluster containing lane_id; subgroup_size is unused.
+    scratch[linear_id] = val; // publish this work-item's operand in its own slot
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size; NOTE(review): assumes cluster_size != 0 - confirm callers
+    uint r = scratch[subgroup_base + cluster_base_in_sg]; // seed with the cluster's first slot
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r ^ scratch[subgroup_base + cluster_base_in_sg + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static int spv_emulate_reduce_add_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated subgroup reduction: signed sum of the subgroup_size scratch slots starting at subgroup_base.
+    scratch[linear_id] = as_uint(val); // store the int's raw bits in the shared uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    int r = as_int(scratch[subgroup_base]); // seed with the first slot, so no identity value is needed
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r + as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static int spv_emulate_inclusive_scan_add_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated inclusive scan: signed sum of subgroup slots 0..lane_id (own value included); subgroup_size is unused.
+    scratch[linear_id] = as_uint(val); // store the int's raw bits in the shared uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    int r = as_int(scratch[subgroup_base]); // seed with slot 0 of the subgroup
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r + as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static int spv_emulate_exclusive_scan_add_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated exclusive scan: signed sum of subgroup slots 0..lane_id-1 (own value excluded); subgroup_size is unused.
+    scratch[linear_id] = as_uint(val); // store the int's raw bits in the shared uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    int r = 0; // additive identity; lane 0 returns it unchanged because its loop runs zero times
+    for (uint i = 0u; i < lane_id; i++)
+        r = r + as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static int spv_emulate_clustered_reduce_add_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated clustered reduction: signed sum over the cluster_size slots of the cluster containing lane_id; subgroup_size is unused.
+    scratch[linear_id] = as_uint(val); // store the int's raw bits in the shared uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size; NOTE(review): assumes cluster_size != 0 - confirm callers
+    int r = as_int(scratch[subgroup_base + cluster_base_in_sg]); // seed with the cluster's first slot
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r + as_int(scratch[subgroup_base + cluster_base_in_sg + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static int spv_emulate_reduce_mul_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated subgroup reduction: signed product of the subgroup_size scratch slots starting at subgroup_base.
+    scratch[linear_id] = as_uint(val); // store the int's raw bits in the shared uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    int r = as_int(scratch[subgroup_base]); // seed with the first slot, so no identity value is needed
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r * as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static int spv_emulate_inclusive_scan_mul_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated inclusive scan: signed product of subgroup slots 0..lane_id (own value included); subgroup_size is unused.
+    scratch[linear_id] = as_uint(val); // store the int's raw bits in the shared uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    int r = as_int(scratch[subgroup_base]); // seed with slot 0 of the subgroup
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r * as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static int spv_emulate_exclusive_scan_mul_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated exclusive scan: signed product of subgroup slots 0..lane_id-1 (own value excluded); subgroup_size is unused.
+    scratch[linear_id] = as_uint(val); // store the int's raw bits in the shared uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    int r = 1; // multiplicative identity; lane 0 returns it unchanged because its loop runs zero times
+    for (uint i = 0u; i < lane_id; i++)
+        r = r * as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static int spv_emulate_clustered_reduce_mul_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated clustered reduction: signed product over the cluster_size slots of the cluster containing lane_id; subgroup_size is unused.
+    scratch[linear_id] = as_uint(val); // store the int's raw bits in the shared uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size; NOTE(review): assumes cluster_size != 0 - confirm callers
+    int r = as_int(scratch[subgroup_base + cluster_base_in_sg]); // seed with the cluster's first slot
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r * as_int(scratch[subgroup_base + cluster_base_in_sg + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static int spv_emulate_reduce_min_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated subgroup reduction: signed minimum of the subgroup_size scratch slots starting at subgroup_base.
+    scratch[linear_id] = as_uint(val); // store the int's raw bits in the shared uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    int r = as_int(scratch[subgroup_base]); // seed with the first slot, so no identity value is needed
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = min(r, as_int(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static int spv_emulate_inclusive_scan_min_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated inclusive scan: signed minimum of subgroup slots 0..lane_id (own value included); subgroup_size is unused.
+    scratch[linear_id] = as_uint(val); // store the int's raw bits in the shared uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    int r = as_int(scratch[subgroup_base]); // seed with slot 0 of the subgroup
+    for (uint i = 1u; i <= lane_id; i++)
+        r = min(r, as_int(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static int spv_emulate_exclusive_scan_min_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated exclusive scan: signed minimum of subgroup slots 0..lane_id-1 (own value excluded); subgroup_size is unused.
+    scratch[linear_id] = as_uint(val); // store the int's raw bits in the shared uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    int r = INT_MAX; // identity of signed min; lane 0 returns it unchanged because its loop runs zero times
+    for (uint i = 0u; i < lane_id; i++)
+        r = min(r, as_int(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static int spv_emulate_clustered_reduce_min_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated clustered reduction: signed minimum over the cluster_size slots of the cluster containing lane_id; subgroup_size is unused.
+    scratch[linear_id] = as_uint(val); // store the int's raw bits in the shared uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size; NOTE(review): assumes cluster_size != 0 - confirm callers
+    int r = as_int(scratch[subgroup_base + cluster_base_in_sg]); // seed with the cluster's first slot
+    for (uint i = 1u; i < cluster_size; i++)
+        r = min(r, as_int(scratch[subgroup_base + cluster_base_in_sg + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static int spv_emulate_reduce_max_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated subgroup reduction: signed maximum of the subgroup_size scratch slots starting at subgroup_base.
+    scratch[linear_id] = as_uint(val); // store the int's raw bits in the shared uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    int r = as_int(scratch[subgroup_base]); // seed with the first slot, so no identity value is needed
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = max(r, as_int(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static int spv_emulate_inclusive_scan_max_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated inclusive scan: signed maximum of subgroup slots 0..lane_id (own value included); subgroup_size is unused.
+    scratch[linear_id] = as_uint(val); // store the int's raw bits in the shared uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    int r = as_int(scratch[subgroup_base]); // seed with slot 0 of the subgroup
+    for (uint i = 1u; i <= lane_id; i++)
+        r = max(r, as_int(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static int spv_emulate_exclusive_scan_max_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated exclusive scan: signed maximum of subgroup slots 0..lane_id-1 (own value excluded); subgroup_size is unused.
+    scratch[linear_id] = as_uint(val); // store the int's raw bits in the shared uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    int r = INT_MIN; // identity of signed max; lane 0 returns it unchanged because its loop runs zero times
+    for (uint i = 0u; i < lane_id; i++)
+        r = max(r, as_int(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static int spv_emulate_clustered_reduce_max_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated clustered reduction: signed maximum over the cluster_size slots of the cluster containing lane_id; subgroup_size is unused.
+    scratch[linear_id] = as_uint(val); // store the int's raw bits in the shared uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size; NOTE(review): assumes cluster_size != 0 - confirm callers
+    int r = as_int(scratch[subgroup_base + cluster_base_in_sg]); // seed with the cluster's first slot
+    for (uint i = 1u; i < cluster_size; i++)
+        r = max(r, as_int(scratch[subgroup_base + cluster_base_in_sg + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static int spv_emulate_reduce_and_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated subgroup reduction: bitwise AND of the subgroup_size scratch slots starting at subgroup_base.
+    scratch[linear_id] = as_uint(val); // store the int's raw bits in the shared uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    int r = as_int(scratch[subgroup_base]); // seed with the first slot, so no identity value is needed
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r & as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static int spv_emulate_inclusive_scan_and_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated inclusive scan: bitwise AND of subgroup slots 0..lane_id (own value included); subgroup_size is unused.
+    scratch[linear_id] = as_uint(val); // store the int's raw bits in the shared uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    int r = as_int(scratch[subgroup_base]); // seed with slot 0 of the subgroup
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r & as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static int spv_emulate_exclusive_scan_and_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated exclusive scan: bitwise AND of subgroup slots 0..lane_id-1 (own value excluded); subgroup_size is unused.
+    scratch[linear_id] = as_uint(val); // store the int's raw bits in the shared uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    int r = as_int(0xFFFFFFFFu); // all bits set = identity of AND; lane 0 returns it unchanged because its loop runs zero times
+    for (uint i = 0u; i < lane_id; i++)
+        r = r & as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static int spv_emulate_clustered_reduce_and_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated clustered reduction: bitwise AND over the cluster_size slots of the cluster containing lane_id; subgroup_size is unused.
+    scratch[linear_id] = as_uint(val); // store the int's raw bits in the shared uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size; NOTE(review): assumes cluster_size != 0 - confirm callers
+    int r = as_int(scratch[subgroup_base + cluster_base_in_sg]); // seed with the cluster's first slot
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r & as_int(scratch[subgroup_base + cluster_base_in_sg + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static int spv_emulate_reduce_or_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated subgroup reduction: bitwise OR of the subgroup_size scratch slots starting at subgroup_base.
+    scratch[linear_id] = as_uint(val); // store the int's raw bits in the shared uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    int r = as_int(scratch[subgroup_base]); // seed with the first slot, so no identity value is needed
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r | as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static int spv_emulate_inclusive_scan_or_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated inclusive scan: bitwise OR of subgroup slots 0..lane_id (own value included); subgroup_size is unused.
+    scratch[linear_id] = as_uint(val); // store the int's raw bits in the shared uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    int r = as_int(scratch[subgroup_base]); // seed with slot 0 of the subgroup
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r | as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static int spv_emulate_exclusive_scan_or_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated exclusive scan: bitwise OR of subgroup slots 0..lane_id-1 (own value excluded); subgroup_size is unused.
+    scratch[linear_id] = as_uint(val); // store the int's raw bits in the shared uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    int r = 0; // identity of OR; lane 0 returns it unchanged because its loop runs zero times
+    for (uint i = 0u; i < lane_id; i++)
+        r = r | as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static int spv_emulate_clustered_reduce_or_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated clustered reduction: bitwise OR over the cluster_size slots of the cluster containing lane_id; subgroup_size is unused.
+    scratch[linear_id] = as_uint(val); // store the int's raw bits in the shared uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size; NOTE(review): assumes cluster_size != 0 - confirm callers
+    int r = as_int(scratch[subgroup_base + cluster_base_in_sg]); // seed with the cluster's first slot
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r | as_int(scratch[subgroup_base + cluster_base_in_sg + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static int spv_emulate_reduce_xor_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated subgroup reduction: bitwise XOR of the subgroup_size scratch slots starting at subgroup_base.
+    scratch[linear_id] = as_uint(val); // store the int's raw bits in the shared uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    int r = as_int(scratch[subgroup_base]); // seed with the first slot, so no identity value is needed
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r ^ as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static int spv_emulate_inclusive_scan_xor_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated inclusive scan: bitwise XOR of subgroup slots 0..lane_id (own value included); subgroup_size is unused.
+    scratch[linear_id] = as_uint(val); // store the int's raw bits in the shared uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    int r = as_int(scratch[subgroup_base]); // seed with slot 0 of the subgroup
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r ^ as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static int spv_emulate_exclusive_scan_xor_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated exclusive scan: bitwise XOR of subgroup slots 0..lane_id-1 (own value excluded); subgroup_size is unused.
+    scratch[linear_id] = as_uint(val); // store the int's raw bits in the shared uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    int r = 0; // identity of XOR; lane 0 returns it unchanged because its loop runs zero times
+    for (uint i = 0u; i < lane_id; i++)
+        r = r ^ as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static int spv_emulate_clustered_reduce_xor_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated clustered reduction: bitwise XOR over the cluster_size slots of the cluster containing lane_id; subgroup_size is unused.
+    scratch[linear_id] = as_uint(val); // store the int's raw bits in the shared uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size; NOTE(review): assumes cluster_size != 0 - confirm callers
+    int r = as_int(scratch[subgroup_base + cluster_base_in_sg]); // seed with the cluster's first slot
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r ^ as_int(scratch[subgroup_base + cluster_base_in_sg + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static float spv_emulate_reduce_add_float(__local uint* scratch, float val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated subgroup reduction: float sum of the subgroup_size scratch slots starting at subgroup_base.
+    scratch[linear_id] = as_uint(val); // store the float's raw bits in the shared uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    float r = as_float(scratch[subgroup_base]); // seed with the first slot, so no identity value is needed
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r + as_float(scratch[subgroup_base + i]); // accumulated strictly in ascending slot order
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static float spv_emulate_inclusive_scan_add_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated inclusive scan: float sum of subgroup slots 0..lane_id (own value included); subgroup_size is unused.
+    scratch[linear_id] = as_uint(val); // store the float's raw bits in the shared uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    float r = as_float(scratch[subgroup_base]); // seed with slot 0 of the subgroup
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r + as_float(scratch[subgroup_base + i]); // accumulated strictly in ascending slot order
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static float spv_emulate_exclusive_scan_add_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated exclusive scan: float sum of subgroup slots 0..lane_id-1 (own value excluded); subgroup_size is unused.
+    scratch[linear_id] = as_uint(val); // store the float's raw bits in the shared uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    float r = 0.0f; // additive identity; lane 0 returns it unchanged because its loop runs zero times
+    for (uint i = 0u; i < lane_id; i++)
+        r = r + as_float(scratch[subgroup_base + i]); // accumulated strictly in ascending slot order
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static float spv_emulate_clustered_reduce_add_float(__local uint* scratch, float val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated clustered reduction: float sum over the cluster_size slots of the cluster containing lane_id; subgroup_size is unused.
+    scratch[linear_id] = as_uint(val); // store the float's raw bits in the shared uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size; NOTE(review): assumes cluster_size != 0 - confirm callers
+    float r = as_float(scratch[subgroup_base + cluster_base_in_sg]); // seed with the cluster's first slot
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r + as_float(scratch[subgroup_base + cluster_base_in_sg + i]); // accumulated strictly in ascending slot order
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static float spv_emulate_reduce_mul_float(__local uint* scratch, float val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated subgroup reduction: float product of the subgroup_size scratch slots starting at subgroup_base.
+    scratch[linear_id] = as_uint(val); // store the float's raw bits in the shared uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    float r = as_float(scratch[subgroup_base]); // seed with the first slot, so no identity value is needed
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r * as_float(scratch[subgroup_base + i]); // accumulated strictly in ascending slot order
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static float spv_emulate_inclusive_scan_mul_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated inclusive scan: float product of subgroup slots 0..lane_id (own value included); subgroup_size is unused.
+    scratch[linear_id] = as_uint(val); // store the float's raw bits in the shared uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    float r = as_float(scratch[subgroup_base]); // seed with slot 0 of the subgroup
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r * as_float(scratch[subgroup_base + i]); // accumulated strictly in ascending slot order
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static float spv_emulate_exclusive_scan_mul_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated exclusive scan: float product of subgroup slots 0..lane_id-1 (own value excluded); subgroup_size is unused.
+    scratch[linear_id] = as_uint(val); // store the float's raw bits in the shared uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    float r = 1.0f; // multiplicative identity; lane 0 returns it unchanged because its loop runs zero times
+    for (uint i = 0u; i < lane_id; i++)
+        r = r * as_float(scratch[subgroup_base + i]); // accumulated strictly in ascending slot order
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static float spv_emulate_clustered_reduce_mul_float(__local uint* scratch, float val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated clustered reduction: float product over the cluster_size slots of the cluster containing lane_id; subgroup_size is unused.
+    scratch[linear_id] = as_uint(val); // store the float's raw bits in the shared uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size; NOTE(review): assumes cluster_size != 0 - confirm callers
+    float r = as_float(scratch[subgroup_base + cluster_base_in_sg]); // seed with the cluster's first slot
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r * as_float(scratch[subgroup_base + cluster_base_in_sg + i]); // accumulated strictly in ascending slot order
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static float spv_emulate_reduce_min_float(__local uint* scratch, float val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated subgroup reduction: fmin() over the subgroup_size scratch slots starting at subgroup_base.
+    scratch[linear_id] = as_uint(val); // store the float's raw bits in the shared uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    float r = as_float(scratch[subgroup_base]); // seed with the first slot, so no identity value is needed
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = fmin(r, as_float(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static float spv_emulate_inclusive_scan_min_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated inclusive scan: fmin() over subgroup slots 0..lane_id (own value included); subgroup_size is unused.
+    scratch[linear_id] = as_uint(val); // store the float's raw bits in the shared uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    float r = as_float(scratch[subgroup_base]); // seed with slot 0 of the subgroup
+    for (uint i = 1u; i <= lane_id; i++)
+        r = fmin(r, as_float(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static float spv_emulate_exclusive_scan_min_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated exclusive scan: fmin() over subgroup slots 0..lane_id-1 (own value excluded); subgroup_size is unused.
+    scratch[linear_id] = as_uint(val); // store the float's raw bits in the shared uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    float r = INFINITY; // +inf is the starting value for min; lane 0 returns it unchanged because its loop runs zero times
+    for (uint i = 0u; i < lane_id; i++)
+        r = fmin(r, as_float(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static float spv_emulate_clustered_reduce_min_float(__local uint* scratch, float val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated clustered reduction: fmin() over the cluster_size slots of the cluster containing lane_id; subgroup_size is unused.
+    scratch[linear_id] = as_uint(val); // store the float's raw bits in the shared uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size; NOTE(review): assumes cluster_size != 0 - confirm callers
+    float r = as_float(scratch[subgroup_base + cluster_base_in_sg]); // seed with the cluster's first slot
+    for (uint i = 1u; i < cluster_size; i++)
+        r = fmin(r, as_float(scratch[subgroup_base + cluster_base_in_sg + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static float spv_emulate_reduce_max_float(__local uint* scratch, float val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated subgroup reduction: fmax() over the subgroup_size scratch slots starting at subgroup_base.
+    scratch[linear_id] = as_uint(val); // store the float's raw bits in the shared uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    float r = as_float(scratch[subgroup_base]); // seed with the first slot, so no identity value is needed
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = fmax(r, as_float(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static float spv_emulate_inclusive_scan_max_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated inclusive scan: fmax() over subgroup slots 0..lane_id (own value included); subgroup_size is unused.
+    scratch[linear_id] = as_uint(val); // store the float's raw bits in the shared uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    float r = as_float(scratch[subgroup_base]); // seed with slot 0 of the subgroup
+    for (uint i = 1u; i <= lane_id; i++)
+        r = fmax(r, as_float(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static float spv_emulate_exclusive_scan_max_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated exclusive scan: fmax() over subgroup slots 0..lane_id-1 (own value excluded); subgroup_size is unused.
+    scratch[linear_id] = as_uint(val); // store the float's raw bits in the shared uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    float r = -INFINITY; // -inf is the starting value for max; lane 0 returns it unchanged because its loop runs zero times
+    for (uint i = 0u; i < lane_id; i++)
+        r = fmax(r, as_float(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static float spv_emulate_clustered_reduce_max_float(__local uint* scratch, float val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated clustered reduction: fmax() over the cluster_size slots of the cluster containing lane_id; subgroup_size is unused.
+    scratch[linear_id] = as_uint(val); // store the float's raw bits in the shared uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size; NOTE(review): assumes cluster_size != 0 - confirm callers
+    float r = as_float(scratch[subgroup_base + cluster_base_in_sg]); // seed with the cluster's first slot
+    for (uint i = 1u; i < cluster_size; i++)
+        r = fmax(r, as_float(scratch[subgroup_base + cluster_base_in_sg + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static bool spv_emulate_reduce_logical_and(__local uint* scratch, bool val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated subgroup reduction: true only if every one of the subgroup_size slots starting at subgroup_base is non-zero.
+    scratch[linear_id] = val ? 1u : 0u; // booleans are stored as 1u / 0u in the uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    bool r = true; // identity of logical AND
+    for (uint i = 0u; i < subgroup_size; i++)
+        r = r && (scratch[subgroup_base + i] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static bool spv_emulate_inclusive_scan_logical_and(__local uint* scratch, bool val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated inclusive scan: logical AND of subgroup slots 0..lane_id (own value included); subgroup_size is unused.
+    scratch[linear_id] = val ? 1u : 0u; // booleans are stored as 1u / 0u in the uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    bool r = true; // identity of logical AND
+    for (uint i = 0u; i <= lane_id; i++)
+        r = r && (scratch[subgroup_base + i] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static bool spv_emulate_exclusive_scan_logical_and(__local uint* scratch, bool val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated exclusive scan: logical AND of subgroup slots 0..lane_id-1 (own value excluded); subgroup_size is unused.
+    scratch[linear_id] = val ? 1u : 0u; // booleans are stored as 1u / 0u in the uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    bool r = true; // identity of logical AND; lane 0 returns it unchanged because its loop runs zero times
+    for (uint i = 0u; i < lane_id; i++)
+        r = r && (scratch[subgroup_base + i] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static bool spv_emulate_clustered_reduce_logical_and(__local uint* scratch, bool val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated clustered reduction: logical AND over the cluster_size slots of the cluster containing lane_id; subgroup_size is unused.
+    scratch[linear_id] = val ? 1u : 0u; // booleans are stored as 1u / 0u in the uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size; NOTE(review): assumes cluster_size != 0 - confirm callers
+    bool r = true; // identity of logical AND
+    for (uint i = 0u; i < cluster_size; i++)
+        r = r && (scratch[subgroup_base + cluster_base_in_sg + i] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static bool spv_emulate_reduce_logical_or(__local uint* scratch, bool val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated subgroup reduction: true if any of the subgroup_size slots starting at subgroup_base is non-zero.
+    scratch[linear_id] = val ? 1u : 0u; // booleans are stored as 1u / 0u in the uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    bool r = false; // identity of logical OR
+    for (uint i = 0u; i < subgroup_size; i++)
+        r = r || (scratch[subgroup_base + i] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static bool spv_emulate_inclusive_scan_logical_or(__local uint* scratch, bool val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated inclusive scan: logical OR of subgroup slots 0..lane_id (own value included); subgroup_size is unused.
+    scratch[linear_id] = val ? 1u : 0u; // booleans are stored as 1u / 0u in the uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    bool r = false; // identity of logical OR
+    for (uint i = 0u; i <= lane_id; i++)
+        r = r || (scratch[subgroup_base + i] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static bool spv_emulate_exclusive_scan_logical_or(__local uint* scratch, bool val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated exclusive scan: logical OR of subgroup slots 0..lane_id-1 (own value excluded); subgroup_size is unused.
+    scratch[linear_id] = val ? 1u : 0u; // booleans are stored as 1u / 0u in the uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    bool r = false; // identity of logical OR; lane 0 returns it unchanged because its loop runs zero times
+    for (uint i = 0u; i < lane_id; i++)
+        r = r || (scratch[subgroup_base + i] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static bool spv_emulate_clustered_reduce_logical_or(__local uint* scratch, bool val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated clustered reduction: logical OR over the cluster_size slots of the cluster containing lane_id; subgroup_size is unused.
+    scratch[linear_id] = val ? 1u : 0u; // booleans are stored as 1u / 0u in the uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size; NOTE(review): assumes cluster_size != 0 - confirm callers
+    bool r = false; // identity of logical OR
+    for (uint i = 0u; i < cluster_size; i++)
+        r = r || (scratch[subgroup_base + cluster_base_in_sg + i] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static bool spv_emulate_reduce_logical_xor(__local uint* scratch, bool val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated subgroup reduction: logical XOR (parity) of the subgroup_size slots starting at subgroup_base.
+    scratch[linear_id] = val ? 1u : 0u; // booleans are stored as 1u / 0u in the uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    bool r = false; // identity of logical XOR
+    for (uint i = 0u; i < subgroup_size; i++)
+        r = r != (scratch[subgroup_base + i] != 0u); // `!=` on two bools is logical XOR
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static bool spv_emulate_inclusive_scan_logical_xor(__local uint* scratch, bool val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated inclusive scan: logical XOR of subgroup slots 0..lane_id (own value included); subgroup_size is unused.
+    scratch[linear_id] = val ? 1u : 0u; // booleans are stored as 1u / 0u in the uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    bool r = false; // identity of logical XOR
+    for (uint i = 0u; i <= lane_id; i++)
+        r = r != (scratch[subgroup_base + i] != 0u); // `!=` on two bools is logical XOR
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static bool spv_emulate_exclusive_scan_logical_xor(__local uint* scratch, bool val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated exclusive scan: logical XOR of subgroup slots 0..lane_id-1 (own value excluded); subgroup_size is unused.
+    scratch[linear_id] = val ? 1u : 0u; // booleans are stored as 1u / 0u in the uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    bool r = false; // identity of logical XOR; lane 0 returns it unchanged because its loop runs zero times
+    for (uint i = 0u; i < lane_id; i++)
+        r = r != (scratch[subgroup_base + i] != 0u); // `!=` on two bools is logical XOR
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static bool spv_emulate_clustered_reduce_logical_xor(__local uint* scratch, bool val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Emulated clustered reduction: logical XOR over the cluster_size slots of the cluster containing lane_id; subgroup_size is unused.
+    scratch[linear_id] = val ? 1u : 0u; // booleans are stored as 1u / 0u in the uint scratch buffer
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier: all operands are stored before any slot is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size; NOTE(review): assumes cluster_size != 0 - confirm callers
+    bool r = false; // identity of logical XOR
+    for (uint i = 0u; i < cluster_size; i++)
+        r = r != (scratch[subgroup_base + cluster_base_in_sg + i] != 0u); // `!=` on two bools is logical XOR
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads finish before scratch can be overwritten by a later store
+    return r;
+}
+
+static bool spv_emulate_inverse_ballot(uint4 ballot, uint lane_id) { // True when the ballot bit belonging to lane_id is set (32 lanes per component, x first).
+    const uint component = lane_id >> 5u; // lane_id / 32: which ballot component holds the bit
+    const uint shift = lane_id & 31u; // lane_id % 32: bit position inside that component
+    const uint bits = (component >= 3u) ? ballot.w : (component == 2u) ? ballot.z : (component == 1u) ? ballot.y : ballot.x; // components past 2 all map to w
+    return ((bits >> shift) & 1u) != 0u;
+}
+
+static bool spv_emulate_ballot_bit_extract(uint4 ballot, uint index) { // True when bit `index` of the 128-bit ballot is set (32 bits per component, x first).
+    const uint component = index >> 5u; // index / 32: which ballot component holds the bit
+    const uint shift = index & 31u; // index % 32: bit position inside that component
+    const uint bits = (component >= 3u) ? ballot.w : (component == 2u) ? ballot.z : (component == 1u) ? ballot.y : ballot.x; // components past 2 all map to w
+    return ((bits >> shift) & 1u) != 0u;
+}
+
+static uint spv_popcount4(uint4 v) { // Total number of set bits across the four components of v.
+    return popcount(v.x) + popcount(v.y) + popcount(v.z) + popcount(v.w);
+}
+
+static uint spv_emulate_ballot_bit_count(uint4 ballot) { // Number of set bits in the whole ballot.
+    return spv_popcount4(ballot); // no per-lane masking: all four components are counted in full
+}
+
+static uint spv_emulate_ballot_inclusive_bit_count(uint4 ballot, uint lane_id) { // Number of set ballot bits at positions 0..lane_id (lane_id's own bit included).
+    const uint component = lane_id >> 5u; // lane_id / 32: ballot component that holds lane_id's bit
+    const uint shift = lane_id & 31u; // lane_id % 32: bit position inside that component
+    const uint keep = 0xFFFFFFFFu >> (31u - shift); // bits 0..shift set; shift == 31 keeps the whole word
+    uint4 kept = ballot; // components below `component` stay untouched
+    if (component >= 3u) { kept.w &= keep; }
+    else if (component == 2u) { kept.z &= keep; kept.w = 0u; }
+    else if (component == 1u) { kept.y &= keep; kept.z = 0u; kept.w = 0u; }
+    else { kept.x &= keep; kept.y = 0u; kept.z = 0u; kept.w = 0u; }
+    return spv_popcount4(kept);
+}
+
+static uint spv_emulate_ballot_exclusive_bit_count(uint4 ballot, uint lane_id) { // Number of set ballot bits strictly below lane_id; 0 for lane 0.
+    return (lane_id != 0u)
+        ? spv_emulate_ballot_inclusive_bit_count(ballot, lane_id - 1u) : 0u; // exclusive(lane) == inclusive(lane - 1)
+}
+
+static uint spv_emulate_ballot_find_lsb(uint4 ballot) { // Index (0..127) of the lowest set ballot bit, scanning x, y, z, w; ~0u when no bit is set.
+    if (ballot.x != 0u) return 31u - (uint)clz(ballot.x & (0u - ballot.x)); // v & (0 - v) isolates the lowest set bit; 31 - clz() converts that single bit to its index
+    if (ballot.y != 0u) return 32u + (31u - (uint)clz(ballot.y & (0u - ballot.y)));
+    if (ballot.z != 0u) return 64u + (31u - (uint)clz(ballot.z & (0u - ballot.z)));
+    if (ballot.w != 0u) return 96u + (31u - (uint)clz(ballot.w & (0u - ballot.w)));
+    return ~0u; // empty ballot
+}
+
+static uint spv_emulate_ballot_find_msb(uint4 ballot) { // Index (0..127) of the highest set ballot bit, scanning w, z, y, x; ~0u when no bit is set.
+    if (ballot.w != 0u) return 96u + (31u - (uint)clz(ballot.w)); // 31 - clz() is the top set bit inside one component
+    if (ballot.z != 0u) return 64u + (31u - (uint)clz(ballot.z));
+    if (ballot.y != 0u) return 32u + (31u - (uint)clz(ballot.y));
+    if (ballot.x != 0u) return 31u - (uint)clz(ballot.x);
+    return ~0u; // empty ballot
+}
+
+float helper( float* val_1, __local uint* _spv_subgroup_scratch, uint _spv_linear_id, uint _spv_subgroup_base, uint _spv_subgroup_size, uint _spv_lane_id) // Returns the subgroup sum of *val_1 on lane 0 and 0.0f on every other lane.
+{
+    float reduced_1 = spv_emulate_reduce_add_float(_spv_subgroup_scratch, (*val_1), _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size); // emulated reduction through the scratch buffer passed in by the kernel
+    bool elected_2 = (_spv_lane_id == 0u); // emulated elect: lane 0 is the elected lane
+    return elected_2 ? reduced_1 : 0.0f;
+}
+
+__attribute__((reqd_work_group_size(256, 1, 1)))
+__kernel void comp_main(__global SSBO* _30) // Kernel entry point: exercises the emulated subgroup built-ins and folds every result into _30->FragColor.
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    uint _spv_linear_workgroup_size = get_local_size(0) * get_local_size(1) * get_local_size(2);
+    uint _spv_linear_id = (get_local_id(2) * get_local_size(1) * get_local_size(0)) + (get_local_id(1) * get_local_size(0)) + get_local_id(0); // flattened local id, x fastest
+    uint _spv_subgroup_size = 32u; // emulated subgroups are fixed at 32 consecutive work-items
+    uint _spv_lane_id = _spv_linear_id % 32u;
+    uint _spv_subgroup_id = _spv_linear_id / 32u;
+    uint _spv_num_subgroups = _spv_linear_workgroup_size / 32u; // integer division rounds down; exact here because the required size 256 is a multiple of 32
+    uint _spv_subgroup_base = _spv_subgroup_id * 32u; // linear id of this subgroup's lane 0
+    __local uint _spv_subgroup_scratch[256]; // one slot per work-item of the 256-wide work-group, shared by all emulation helpers
+    _30->FragColor = convert_float(_spv_num_subgroups);
+    _30->FragColor = convert_float(_spv_subgroup_id);
+    _30->FragColor = convert_float(_spv_subgroup_size);
+    _30->FragColor = convert_float(_spv_lane_id);
+    bool elected_1_1 = (_spv_lane_id == 0u); // emulated elect: lane 0 is the elected lane
+    barrier(CLK_LOCAL_MEM_FENCE); // NOTE(review): five back-to-back barriers, presumably one per barrier/memory-barrier call in the source shader - confirm
+    barrier(CLK_LOCAL_MEM_FENCE);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    bool has_all_1 = spv_emulate_all(_spv_subgroup_scratch, true, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size);
+    bool has_any_1 = spv_emulate_any(_spv_subgroup_scratch, true, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size);
+    uint broadcasted_1 = spv_emulate_broadcast_uint(_spv_subgroup_scratch, 42u, 0u, _spv_linear_id, _spv_subgroup_base);
+    float fadd_1 = spv_emulate_reduce_add_float(_spv_subgroup_scratch, _30->FragColor, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size); // every helper call below reuses the same scratch buffer
+    int iadd_1 = spv_emulate_reduce_add_int(_spv_subgroup_scratch, _30->idat, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size);
+    float fmin_1 = spv_emulate_reduce_min_float(_spv_subgroup_scratch, _30->FragColor, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size);
+    float fmax_1 = spv_emulate_reduce_max_float(_spv_subgroup_scratch, _30->FragColor, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size);
+    int smin_1 = spv_emulate_reduce_min_int(_spv_subgroup_scratch, _30->idat, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size);
+    int smax_1 = spv_emulate_reduce_max_int(_spv_subgroup_scratch, _30->idat, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size);
+    uint umin_1 = spv_emulate_reduce_min_uint(_spv_subgroup_scratch, _30->udat, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size);
+    uint umax_1 = spv_emulate_reduce_max_uint(_spv_subgroup_scratch, _30->udat, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size);
+    float finc_add_1 = spv_emulate_inclusive_scan_add_float(_spv_subgroup_scratch, _30->FragColor, _spv_lane_id, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size);
+    int iinc_add_1 = spv_emulate_inclusive_scan_add_int(_spv_subgroup_scratch, _30->idat, _spv_lane_id, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size);
+    uint uinc_min_1 = spv_emulate_inclusive_scan_min_uint(_spv_subgroup_scratch, _30->udat, _spv_lane_id, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size);
+    uint uinc_max_1 = spv_emulate_inclusive_scan_max_uint(_spv_subgroup_scratch, _30->udat, _spv_lane_id, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size);
+    float fexc_add_1 = spv_emulate_exclusive_scan_add_float(_spv_subgroup_scratch, _30->FragColor, _spv_lane_id, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size);
+    int iexc_add_1 = spv_emulate_exclusive_scan_add_int(_spv_subgroup_scratch, _30->idat, _spv_lane_id, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size);
+    uint uexc_min_1 = spv_emulate_exclusive_scan_min_uint(_spv_subgroup_scratch, _30->udat, _spv_lane_id, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size);
+    uint uexc_max_1 = spv_emulate_exclusive_scan_max_uint(_spv_subgroup_scratch, _30->udat, _spv_lane_id, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size);
+    float param_1 = _30->FragColor; // private copy, because helper() takes a plain float* rather than a __global pointer
+    float from_helper_1 = helper(&param_1, _spv_subgroup_scratch, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size, _spv_lane_id);
+    _30->FragColor = (((((((((fadd_1 + fmin_1) + fmax_1) + finc_add_1) + fexc_add_1) + convert_float((((iadd_1 + smin_1) + smax_1) + iinc_add_1) + iexc_add_1)) + convert_float((((((umin_1 + umax_1) + uinc_min_1) + uinc_max_1) + uexc_min_1) + uexc_max_1) + broadcasted_1)) + (float)(has_all_1)) + (float)(has_any_1)) + (float)(elected_1_1)) + from_helper_1;
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/subgroups-basic.nocompat.vk.subgroups.subgroups-emulate.comp b/reference/shaders-opencl-no-opt/comp/subgroups-basic.nocompat.vk.subgroups.subgroups-emulate.comp
new file mode 100644
index 000000000..27a775a32
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/subgroups-basic.nocompat.vk.subgroups.subgroups-emulate.comp
@@ -0,0 +1,73 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+
+struct SSBO
+{
+    float FragColor;
+    int idat;
+    uint udat;
+};
+
+typedef struct SSBO SSBO;
+
+#ifdef cl_khr_subgroup_non_uniform_vote
+#pragma OPENCL EXTENSION cl_khr_subgroup_non_uniform_vote : enable
+#define spv_sub_group_elect(lane_id) sub_group_elect()
+#define spv_all_equal_uint(scratch, val, linear_id, subgroup_base, subgroup_size) sub_group_non_uniform_all_equal((val))
+#else
+#define spv_sub_group_elect(lane_id) ((lane_id) == 0u)
+#define spv_all_equal_uint(scratch, val, linear_id, subgroup_base, subgroup_size) spv_emulate_all_equal_uint((scratch), (val), (linear_id), (subgroup_base), (subgroup_size))
+#endif
+
+float helper( float* val_1, __local uint* _spv_subgroup_scratch, uint _spv_linear_id, uint _spv_subgroup_base, uint _spv_subgroup_size, uint _spv_lane_id)
+{
+    float reduced_1 = sub_group_reduce_add((*val_1));
+    bool elected_2 = spv_sub_group_elect(_spv_lane_id);
+    return elected_2 ? reduced_1 : 0.0f;
+}
+
+__attribute__((reqd_work_group_size(256, 1, 1)))
+__kernel void comp_main(__global SSBO* _30)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2));
+    uint _spv_subgroup_size = get_sub_group_size();
+    uint _spv_lane_id = get_sub_group_local_id();
+    uint _spv_subgroup_id = get_sub_group_id();
+    uint _spv_linear_id = _spv_subgroup_id * _spv_subgroup_size + _spv_lane_id;
+    uint _spv_subgroup_base = _spv_subgroup_id * _spv_subgroup_size;
+    __local uint _spv_subgroup_scratch[256];
+    _30->FragColor = convert_float(get_num_sub_groups());
+    _30->FragColor = convert_float(get_sub_group_id());
+    _30->FragColor = convert_float(get_sub_group_size());
+    _30->FragColor = convert_float(get_sub_group_local_id());
+    bool elected_1_1 = spv_sub_group_elect(_spv_lane_id);
+    sub_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
+    sub_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
+    sub_group_barrier(CLK_GLOBAL_MEM_FENCE);
+    sub_group_barrier(CLK_LOCAL_MEM_FENCE);
+    sub_group_barrier(CLK_GLOBAL_MEM_FENCE);
+    bool has_all_1 = sub_group_all(true);
+    bool has_any_1 = sub_group_any(true);
+    uint broadcasted_1 = sub_group_broadcast(42u, 0u);
+    float fadd_1 = sub_group_reduce_add(_30->FragColor);
+    int iadd_1 = sub_group_reduce_add(_30->idat);
+    float fmin_1 = sub_group_reduce_min(_30->FragColor);
+    float fmax_1 = sub_group_reduce_max(_30->FragColor);
+    int smin_1 = sub_group_reduce_min(_30->idat);
+    int smax_1 = sub_group_reduce_max(_30->idat);
+    uint umin_1 = sub_group_reduce_min(_30->udat);
+    uint umax_1 = sub_group_reduce_max(_30->udat);
+    float finc_add_1 = sub_group_scan_inclusive_add(_30->FragColor);
+    int iinc_add_1 = sub_group_scan_inclusive_add(_30->idat);
+    uint uinc_min_1 = sub_group_scan_inclusive_min(_30->udat);
+    uint uinc_max_1 = sub_group_scan_inclusive_max(_30->udat);
+    float fexc_add_1 = sub_group_scan_exclusive_add(_30->FragColor);
+    int iexc_add_1 = sub_group_scan_exclusive_add(_30->idat);
+    uint uexc_min_1 = sub_group_scan_exclusive_min(_30->udat);
+    uint uexc_max_1 = sub_group_scan_exclusive_max(_30->udat);
+    float param_1 = _30->FragColor;
+    float from_helper_1 = helper(&param_1, _spv_subgroup_scratch, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size, _spv_lane_id);
+    _30->FragColor = (((((((((fadd_1 + fmin_1) + fmax_1) + finc_add_1) + fexc_add_1) + convert_float((((iadd_1 + smin_1) + smax_1) + iinc_add_1) + iexc_add_1)) + convert_float((((((umin_1 + umax_1) + uinc_min_1) + uinc_max_1) + uexc_min_1) + uexc_max_1) + broadcasted_1)) + (float)(has_all_1)) + (float)(has_any_1)) + (float)(elected_1_1)) + from_helper_1;
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/subgroups-clustered.nocompat.vk.subgroups-emulate.comp b/reference/shaders-opencl-no-opt/comp/subgroups-clustered.nocompat.vk.subgroups-emulate.comp
new file mode 100644
index 000000000..4675efa97
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/subgroups-clustered.nocompat.vk.subgroups-emulate.comp
@@ -0,0 +1,1105 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO
+{
+    float FragColor;
+    int idat;
+    uint udat;
+};
+
+typedef struct SSBO SSBO;
+
+static uint spv_emulate_broadcast_uint(__local uint* scratch, uint val, uint src_lane, uint linear_id, uint subgroup_base) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base + src_lane];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_broadcast_first_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_shuffle_uint(__local uint* scratch, uint val, uint index, uint linear_id, uint subgroup_base) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base + index];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_shuffle_xor_uint(__local uint* scratch, uint val, uint mask, uint lane_id, uint linear_id, uint subgroup_base) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base + (lane_id ^ mask)];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_shuffle_up_uint(__local uint* scratch, uint val, uint delta, uint lane_id, uint linear_id, uint subgroup_base) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = (lane_id >= delta) ? scratch[linear_id - delta] : val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_shuffle_down_uint(__local uint* scratch, uint val, uint delta, uint lane_id, uint linear_id, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = (lane_id + delta < subgroup_size) ? scratch[linear_id + delta] : val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_rotate_uint(__local uint* scratch, uint val, uint delta, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base + ((lane_id + delta) % subgroup_size)];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_clustered_rotate_uint(__local uint* scratch, uint val, uint delta, uint lane_id, uint linear_id, uint subgroup_base, uint cluster_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base = (lane_id / cluster_size) * cluster_size;
+    uint r = scratch[subgroup_base + cluster_base + ((lane_id - cluster_base + delta) % cluster_size)];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static bool spv_emulate_all(__local uint* scratch, bool predicate, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = predicate ? 1u : 0u;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    bool r = true;
+    for (uint i = 0u; i < subgroup_size; i++)
+        r = r && (scratch[subgroup_base + i] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static bool spv_emulate_any(__local uint* scratch, bool predicate, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = predicate ? 1u : 0u;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    bool r = false;
+    for (uint i = 0u; i < subgroup_size; i++)
+        r = r || (scratch[subgroup_base + i] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static bool spv_emulate_all_equal_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint first = scratch[subgroup_base];
+    bool r = true;
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r && (scratch[subgroup_base + i] == first);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint4 spv_emulate_ballot(__local uint* scratch, bool predicate, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = predicate ? 1u : 0u;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint4 r = (uint4)(0u);
+    for (uint i = 0u; i < subgroup_size; i++) {
+        if (scratch[subgroup_base + i] != 0u) {
+            uint word = i / 32u;
+            uint bit = i % 32u;
+            if (word == 0u) r.x |= (1u << bit);
+            else if (word == 1u) r.y |= (1u << bit);
+            else if (word == 2u) r.z |= (1u << bit);
+            else r.w |= (1u << bit);
+        }
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint4 spv_subgroup_eq_mask(uint lane_id) {
+    uint4 r = (uint4)(0u);
+    uint word = lane_id / 32u;
+    uint bit = lane_id % 32u;
+    if (word == 0u) r.x = (1u << bit);
+    else if (word == 1u) r.y = (1u << bit);
+    else if (word == 2u) r.z = (1u << bit);
+    else r.w = (1u << bit);
+    return r;
+}
+
+static uint4 spv_subgroup_ge_mask(uint lane_id, uint subgroup_size) {
+    uint4 r = (uint4)(0u);
+    for (uint i = lane_id; i < subgroup_size; i++) {
+        uint word = i / 32u;
+        uint bit = i % 32u;
+        if (word == 0u) r.x |= (1u << bit);
+        else if (word == 1u) r.y |= (1u << bit);
+        else if (word == 2u) r.z |= (1u << bit);
+        else r.w |= (1u << bit);
+    }
+    return r;
+}
+
+static uint4 spv_subgroup_gt_mask(uint lane_id, uint subgroup_size) {
+    return spv_subgroup_ge_mask(lane_id + 1u, subgroup_size);
+}
+
+static uint4 spv_subgroup_le_mask(uint lane_id, uint subgroup_size) {
+    return spv_subgroup_ge_mask(0u, lane_id + 1u);
+}
+
+static uint4 spv_subgroup_lt_mask(uint lane_id) {
+    if (lane_id == 0u) return (uint4)(0u);
+    return spv_subgroup_ge_mask(0u, lane_id);
+}
+
+static uint spv_emulate_reduce_add_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r + scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_inclusive_scan_add_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r + scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_exclusive_scan_add_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = 0u;
+    for (uint i = 0u; i < lane_id; i++)
+        r = r + scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_add_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    uint r = scratch[subgroup_base + cluster_base_in_sg];
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r + scratch[subgroup_base + cluster_base_in_sg + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_reduce_mul_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r * scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_inclusive_scan_mul_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r * scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_exclusive_scan_mul_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = 1u;
+    for (uint i = 0u; i < lane_id; i++)
+        r = r * scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_mul_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    uint r = scratch[subgroup_base + cluster_base_in_sg];
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r * scratch[subgroup_base + cluster_base_in_sg + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_reduce_min_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = min(r, scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_inclusive_scan_min_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i <= lane_id; i++)
+        r = min(r, scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_exclusive_scan_min_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = UINT_MAX;
+    for (uint i = 0u; i < lane_id; i++)
+        r = min(r, scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_min_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    uint r = scratch[subgroup_base + cluster_base_in_sg];
+    for (uint i = 1u; i < cluster_size; i++)
+        r = min(r, scratch[subgroup_base + cluster_base_in_sg + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_reduce_max_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = max(r, scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_inclusive_scan_max_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i <= lane_id; i++)
+        r = max(r, scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_exclusive_scan_max_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = 0u;
+    for (uint i = 0u; i < lane_id; i++)
+        r = max(r, scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_max_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    uint r = scratch[subgroup_base + cluster_base_in_sg];
+    for (uint i = 1u; i < cluster_size; i++)
+        r = max(r, scratch[subgroup_base + cluster_base_in_sg + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_reduce_and_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r & scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_inclusive_scan_and_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r & scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_exclusive_scan_and_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = 0xFFFFFFFFu;
+    for (uint i = 0u; i < lane_id; i++)
+        r = r & scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_and_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    uint r = scratch[subgroup_base + cluster_base_in_sg];
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r & scratch[subgroup_base + cluster_base_in_sg + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_reduce_or_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r | scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_inclusive_scan_or_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r | scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_exclusive_scan_or_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = 0u;
+    for (uint i = 0u; i < lane_id; i++)
+        r = r | scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_or_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    uint r = scratch[subgroup_base + cluster_base_in_sg];
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r | scratch[subgroup_base + cluster_base_in_sg + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_reduce_xor_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r ^ scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_inclusive_scan_xor_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r ^ scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_exclusive_scan_xor_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = 0u;
+    for (uint i = 0u; i < lane_id; i++)
+        r = r ^ scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_xor_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    uint r = scratch[subgroup_base + cluster_base_in_sg];
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r ^ scratch[subgroup_base + cluster_base_in_sg + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_reduce_add_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(scratch[subgroup_base]);
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r + as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_inclusive_scan_add_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(scratch[subgroup_base]);
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r + as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_exclusive_scan_add_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = 0;
+    for (uint i = 0u; i < lane_id; i++)
+        r = r + as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_clustered_reduce_add_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    int r = as_int(scratch[subgroup_base + cluster_base_in_sg]);
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r + as_int(scratch[subgroup_base + cluster_base_in_sg + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_reduce_mul_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(scratch[subgroup_base]);
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r * as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_inclusive_scan_mul_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(scratch[subgroup_base]);
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r * as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_exclusive_scan_mul_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = 1;
+    for (uint i = 0u; i < lane_id; i++)
+        r = r * as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_clustered_reduce_mul_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    int r = as_int(scratch[subgroup_base + cluster_base_in_sg]);
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r * as_int(scratch[subgroup_base + cluster_base_in_sg + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_reduce_min_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(scratch[subgroup_base]);
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = min(r, as_int(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_inclusive_scan_min_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(scratch[subgroup_base]);
+    for (uint i = 1u; i <= lane_id; i++)
+        r = min(r, as_int(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_exclusive_scan_min_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = INT_MAX;
+    for (uint i = 0u; i < lane_id; i++)
+        r = min(r, as_int(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_clustered_reduce_min_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    int r = as_int(scratch[subgroup_base + cluster_base_in_sg]);
+    for (uint i = 1u; i < cluster_size; i++)
+        r = min(r, as_int(scratch[subgroup_base + cluster_base_in_sg + i]));
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_reduce_max_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(scratch[subgroup_base]);
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = max(r, as_int(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_inclusive_scan_max_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(scratch[subgroup_base]);
+    for (uint i = 1u; i <= lane_id; i++)
+        r = max(r, as_int(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_exclusive_scan_max_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = INT_MIN;
+    for (uint i = 0u; i < lane_id; i++)
+        r = max(r, as_int(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_clustered_reduce_max_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    int r = as_int(scratch[subgroup_base + cluster_base_in_sg]);
+    for (uint i = 1u; i < cluster_size; i++)
+        r = max(r, as_int(scratch[subgroup_base + cluster_base_in_sg + i]));
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_reduce_and_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(scratch[subgroup_base]);
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r & as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_inclusive_scan_and_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(scratch[subgroup_base]);
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r & as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_exclusive_scan_and_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(0xFFFFFFFFu);
+    for (uint i = 0u; i < lane_id; i++)
+        r = r & as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_clustered_reduce_and_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    int r = as_int(scratch[subgroup_base + cluster_base_in_sg]);
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r & as_int(scratch[subgroup_base + cluster_base_in_sg + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_reduce_or_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(scratch[subgroup_base]);
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r | as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_inclusive_scan_or_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(scratch[subgroup_base]);
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r | as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_exclusive_scan_or_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // OR of the lane_id slots below this lane; 0 for lane 0 (subgroup_size unused)
+    scratch[linear_id] = as_uint(val); // share this invocation's value as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    int acc = 0; // OR identity
+    for (uint lane = 0u; lane < lane_id; ++lane)
+        acc |= as_int(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return acc;
+}
+
+static int spv_emulate_clustered_reduce_or_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // OR over the cluster_size-wide cluster containing lane_id (subgroup_size unused)
+    scratch[linear_id] = as_uint(val); // share this invocation's value as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    const uint first_slot = subgroup_base + (lane_id - lane_id % cluster_size); // slot of the cluster's first lane
+    int acc = as_int(scratch[first_slot]);
+    for (uint k = 1u; k < cluster_size; ++k)
+        acc |= as_int(scratch[first_slot + k]);
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return acc;
+}
+
+static int spv_emulate_reduce_xor_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) { // XOR of all subgroup_size slots starting at subgroup_base
+    scratch[linear_id] = as_uint(val); // share this invocation's value as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    int acc = as_int(scratch[subgroup_base]);
+    for (uint lane = 1u; lane < subgroup_size; ++lane)
+        acc ^= as_int(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return acc;
+}
+
+static int spv_emulate_inclusive_scan_xor_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // XOR of slots for lanes 0..lane_id (subgroup_size unused)
+    scratch[linear_id] = as_uint(val); // share this invocation's value as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    int acc = as_int(scratch[subgroup_base]);
+    for (uint lane = 1u; lane <= lane_id; ++lane)
+        acc ^= as_int(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return acc;
+}
+
+static int spv_emulate_exclusive_scan_xor_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // XOR of the lane_id slots below this lane; 0 for lane 0 (subgroup_size unused)
+    scratch[linear_id] = as_uint(val); // share this invocation's value as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    int acc = 0; // XOR identity
+    for (uint lane = 0u; lane < lane_id; ++lane)
+        acc ^= as_int(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return acc;
+}
+
+static int spv_emulate_clustered_reduce_xor_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // XOR over the cluster_size-wide cluster containing lane_id (subgroup_size unused)
+    scratch[linear_id] = as_uint(val); // share this invocation's value as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    const uint first_slot = subgroup_base + (lane_id - lane_id % cluster_size); // slot of the cluster's first lane
+    int acc = as_int(scratch[first_slot]);
+    for (uint k = 1u; k < cluster_size; ++k)
+        acc ^= as_int(scratch[first_slot + k]);
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return acc;
+}
+
+static float spv_emulate_reduce_add_float(__local uint* scratch, float val, uint linear_id, uint subgroup_base, uint subgroup_size) { // sum of all subgroup_size slots, accumulated in lane order
+    scratch[linear_id] = as_uint(val); // share this invocation's value as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    float acc = as_float(scratch[subgroup_base]);
+    for (uint lane = 1u; lane < subgroup_size; ++lane)
+        acc += as_float(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return acc;
+}
+
+static float spv_emulate_inclusive_scan_add_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // sum of slots for lanes 0..lane_id (subgroup_size unused)
+    scratch[linear_id] = as_uint(val); // share this invocation's value as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    float acc = as_float(scratch[subgroup_base]);
+    for (uint lane = 1u; lane <= lane_id; ++lane)
+        acc += as_float(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return acc;
+}
+
+static float spv_emulate_exclusive_scan_add_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // sum of the lane_id slots below this lane; 0.0f for lane 0 (subgroup_size unused)
+    scratch[linear_id] = as_uint(val); // share this invocation's value as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    float acc = 0.0f; // additive identity
+    for (uint lane = 0u; lane < lane_id; ++lane)
+        acc += as_float(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return acc;
+}
+
+static float spv_emulate_clustered_reduce_add_float(__local uint* scratch, float val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // sum over the cluster_size-wide cluster containing lane_id (subgroup_size unused)
+    scratch[linear_id] = as_uint(val); // share this invocation's value as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    const uint first_slot = subgroup_base + (lane_id - lane_id % cluster_size); // slot of the cluster's first lane
+    float acc = as_float(scratch[first_slot]);
+    for (uint k = 1u; k < cluster_size; ++k)
+        acc += as_float(scratch[first_slot + k]);
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return acc;
+}
+
+static float spv_emulate_reduce_mul_float(__local uint* scratch, float val, uint linear_id, uint subgroup_base, uint subgroup_size) { // product of all subgroup_size slots, accumulated in lane order
+    scratch[linear_id] = as_uint(val); // share this invocation's value as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    float acc = as_float(scratch[subgroup_base]);
+    for (uint lane = 1u; lane < subgroup_size; ++lane)
+        acc *= as_float(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return acc;
+}
+
+static float spv_emulate_inclusive_scan_mul_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // product of slots for lanes 0..lane_id (subgroup_size unused)
+    scratch[linear_id] = as_uint(val); // share this invocation's value as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    float acc = as_float(scratch[subgroup_base]);
+    for (uint lane = 1u; lane <= lane_id; ++lane)
+        acc *= as_float(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return acc;
+}
+
+static float spv_emulate_exclusive_scan_mul_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // product of the lane_id slots below this lane; 1.0f for lane 0 (subgroup_size unused)
+    scratch[linear_id] = as_uint(val); // share this invocation's value as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    float acc = 1.0f; // multiplicative identity
+    for (uint lane = 0u; lane < lane_id; ++lane)
+        acc *= as_float(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return acc;
+}
+
+static float spv_emulate_clustered_reduce_mul_float(__local uint* scratch, float val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // product over the cluster_size-wide cluster containing lane_id (subgroup_size unused)
+    scratch[linear_id] = as_uint(val); // share this invocation's value as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    const uint first_slot = subgroup_base + (lane_id - lane_id % cluster_size); // slot of the cluster's first lane
+    float acc = as_float(scratch[first_slot]);
+    for (uint k = 1u; k < cluster_size; ++k)
+        acc *= as_float(scratch[first_slot + k]);
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return acc;
+}
+
+static float spv_emulate_reduce_min_float(__local uint* scratch, float val, uint linear_id, uint subgroup_base, uint subgroup_size) { // fmin over all subgroup_size slots starting at subgroup_base
+    scratch[linear_id] = as_uint(val); // share this invocation's value as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    float lowest = as_float(scratch[subgroup_base]);
+    for (uint lane = 1u; lane < subgroup_size; ++lane)
+        lowest = fmin(lowest, as_float(scratch[subgroup_base + lane]));
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return lowest;
+}
+
+static float spv_emulate_inclusive_scan_min_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // fmin over slots for lanes 0..lane_id (subgroup_size unused)
+    scratch[linear_id] = as_uint(val); // share this invocation's value as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    float lowest = as_float(scratch[subgroup_base]);
+    for (uint lane = 1u; lane <= lane_id; ++lane)
+        lowest = fmin(lowest, as_float(scratch[subgroup_base + lane]));
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return lowest;
+}
+
+static float spv_emulate_exclusive_scan_min_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // fmin over the lane_id slots below this lane; INFINITY for lane 0 (subgroup_size unused)
+    scratch[linear_id] = as_uint(val); // share this invocation's value as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    float lowest = INFINITY; // identity for min
+    for (uint lane = 0u; lane < lane_id; ++lane)
+        lowest = fmin(lowest, as_float(scratch[subgroup_base + lane]));
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return lowest;
+}
+
+static float spv_emulate_clustered_reduce_min_float(__local uint* scratch, float val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // fmin over the cluster_size-wide cluster containing lane_id (subgroup_size unused)
+    scratch[linear_id] = as_uint(val); // share this invocation's value as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    const uint first_slot = subgroup_base + (lane_id - lane_id % cluster_size); // slot of the cluster's first lane
+    float lowest = as_float(scratch[first_slot]);
+    for (uint k = 1u; k < cluster_size; ++k)
+        lowest = fmin(lowest, as_float(scratch[first_slot + k]));
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return lowest;
+}
+
+static float spv_emulate_reduce_max_float(__local uint* scratch, float val, uint linear_id, uint subgroup_base, uint subgroup_size) { // fmax over all subgroup_size slots starting at subgroup_base
+    scratch[linear_id] = as_uint(val); // share this invocation's value as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    float highest = as_float(scratch[subgroup_base]);
+    for (uint lane = 1u; lane < subgroup_size; ++lane)
+        highest = fmax(highest, as_float(scratch[subgroup_base + lane]));
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return highest;
+}
+
+static float spv_emulate_inclusive_scan_max_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // fmax over slots for lanes 0..lane_id (subgroup_size unused)
+    scratch[linear_id] = as_uint(val); // share this invocation's value as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    float highest = as_float(scratch[subgroup_base]);
+    for (uint lane = 1u; lane <= lane_id; ++lane)
+        highest = fmax(highest, as_float(scratch[subgroup_base + lane]));
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return highest;
+}
+
+static float spv_emulate_exclusive_scan_max_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // fmax over the lane_id slots below this lane; -INFINITY for lane 0 (subgroup_size unused)
+    scratch[linear_id] = as_uint(val); // share this invocation's value as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    float highest = -INFINITY; // identity for max
+    for (uint lane = 0u; lane < lane_id; ++lane)
+        highest = fmax(highest, as_float(scratch[subgroup_base + lane]));
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return highest;
+}
+
+static float spv_emulate_clustered_reduce_max_float(__local uint* scratch, float val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // fmax over the cluster_size-wide cluster containing lane_id (subgroup_size unused)
+    scratch[linear_id] = as_uint(val); // share this invocation's value as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    const uint first_slot = subgroup_base + (lane_id - lane_id % cluster_size); // slot of the cluster's first lane
+    float highest = as_float(scratch[first_slot]);
+    for (uint k = 1u; k < cluster_size; ++k)
+        highest = fmax(highest, as_float(scratch[first_slot + k]));
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return highest;
+}
+
+static bool spv_emulate_reduce_logical_and(__local uint* scratch, bool val, uint linear_id, uint subgroup_base, uint subgroup_size) { // true only if all subgroup_size slots are non-zero
+    scratch[linear_id] = val ? 1u : 0u; // share this invocation's flag as 0/1
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    bool all_set = true;
+    for (uint lane = 0u; lane < subgroup_size; ++lane)
+        if (scratch[subgroup_base + lane] == 0u) all_set = false;
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return all_set;
+}
+
+static bool spv_emulate_inclusive_scan_logical_and(__local uint* scratch, bool val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // true only if slots for lanes 0..lane_id are all non-zero (subgroup_size unused)
+    scratch[linear_id] = val ? 1u : 0u; // share this invocation's flag as 0/1
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    bool all_set = true;
+    for (uint lane = 0u; lane <= lane_id; ++lane)
+        if (scratch[subgroup_base + lane] == 0u) all_set = false;
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return all_set;
+}
+
+static bool spv_emulate_exclusive_scan_logical_and(__local uint* scratch, bool val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // true only if the lane_id slots below this lane are all non-zero; true for lane 0 (subgroup_size unused)
+    scratch[linear_id] = val ? 1u : 0u; // share this invocation's flag as 0/1
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    bool all_set = true;
+    for (uint lane = 0u; lane < lane_id; ++lane)
+        if (scratch[subgroup_base + lane] == 0u) all_set = false;
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return all_set;
+}
+
+static bool spv_emulate_clustered_reduce_logical_and(__local uint* scratch, bool val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // true only if every slot in lane_id's cluster is non-zero (subgroup_size unused)
+    scratch[linear_id] = val ? 1u : 0u; // share this invocation's flag as 0/1
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    const uint first_slot = subgroup_base + (lane_id - lane_id % cluster_size); // slot of the cluster's first lane
+    bool all_set = true;
+    for (uint k = 0u; k < cluster_size; ++k)
+        if (scratch[first_slot + k] == 0u) all_set = false;
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return all_set;
+}
+
+static bool spv_emulate_reduce_logical_or(__local uint* scratch, bool val, uint linear_id, uint subgroup_base, uint subgroup_size) { // true if any of the subgroup_size slots is non-zero
+    scratch[linear_id] = val ? 1u : 0u; // share this invocation's flag as 0/1
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    bool any_set = false;
+    for (uint lane = 0u; lane < subgroup_size; ++lane)
+        if (scratch[subgroup_base + lane] != 0u) any_set = true;
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return any_set;
+}
+
+static bool spv_emulate_inclusive_scan_logical_or(__local uint* scratch, bool val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // true if any slot for lanes 0..lane_id is non-zero (subgroup_size unused)
+    scratch[linear_id] = val ? 1u : 0u; // share this invocation's flag as 0/1
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    bool any_set = false;
+    for (uint lane = 0u; lane <= lane_id; ++lane)
+        if (scratch[subgroup_base + lane] != 0u) any_set = true;
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return any_set;
+}
+
+static bool spv_emulate_exclusive_scan_logical_or(__local uint* scratch, bool val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // true if any of the lane_id slots below this lane is non-zero; false for lane 0 (subgroup_size unused)
+    scratch[linear_id] = val ? 1u : 0u; // share this invocation's flag as 0/1
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    bool any_set = false;
+    for (uint lane = 0u; lane < lane_id; ++lane)
+        if (scratch[subgroup_base + lane] != 0u) any_set = true;
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return any_set;
+}
+
+static bool spv_emulate_clustered_reduce_logical_or(__local uint* scratch, bool val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // true if any slot in lane_id's cluster is non-zero (subgroup_size unused)
+    scratch[linear_id] = val ? 1u : 0u; // share this invocation's flag as 0/1
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    const uint first_slot = subgroup_base + (lane_id - lane_id % cluster_size); // slot of the cluster's first lane
+    bool any_set = false;
+    for (uint k = 0u; k < cluster_size; ++k)
+        if (scratch[first_slot + k] != 0u) any_set = true;
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return any_set;
+}
+
+static bool spv_emulate_reduce_logical_xor(__local uint* scratch, bool val, uint linear_id, uint subgroup_base, uint subgroup_size) { // true if an odd number of the subgroup_size slots are non-zero
+    scratch[linear_id] = val ? 1u : 0u; // share this invocation's flag as 0/1
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    bool odd = false;
+    for (uint lane = 0u; lane < subgroup_size; ++lane)
+        if (scratch[subgroup_base + lane] != 0u) odd = !odd; // each set flag flips the parity
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return odd;
+}
+
+static bool spv_emulate_inclusive_scan_logical_xor(__local uint* scratch, bool val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // parity of the non-zero slots for lanes 0..lane_id (subgroup_size unused)
+    scratch[linear_id] = val ? 1u : 0u; // share this invocation's flag as 0/1
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    bool odd = false;
+    for (uint lane = 0u; lane <= lane_id; ++lane)
+        if (scratch[subgroup_base + lane] != 0u) odd = !odd; // each set flag flips the parity
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return odd;
+}
+
+static bool spv_emulate_exclusive_scan_logical_xor(__local uint* scratch, bool val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // parity of the non-zero slots below this lane; false for lane 0 (subgroup_size unused)
+    scratch[linear_id] = val ? 1u : 0u; // share this invocation's flag as 0/1
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    bool odd = false;
+    for (uint lane = 0u; lane < lane_id; ++lane)
+        if (scratch[subgroup_base + lane] != 0u) odd = !odd; // each set flag flips the parity
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return odd;
+}
+
+static bool spv_emulate_clustered_reduce_logical_xor(__local uint* scratch, bool val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // parity of the non-zero slots in lane_id's cluster (subgroup_size unused)
+    scratch[linear_id] = val ? 1u : 0u; // share this invocation's flag as 0/1
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    const uint first_slot = subgroup_base + (lane_id - lane_id % cluster_size); // slot of the cluster's first lane
+    bool odd = false;
+    for (uint k = 0u; k < cluster_size; ++k)
+        if (scratch[first_slot + k] != 0u) odd = !odd; // each set flag flips the parity
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return odd;
+}
+
+static bool spv_emulate_inverse_ballot(uint4 ballot, uint lane_id) { // true when lane_id's bit is set in the 128-bit ballot
+    const uint component = lane_id >> 5u; // lane_id / 32: which 32-bit word holds the lane
+    const uint shift = lane_id & 31u; // lane_id % 32: bit position inside that word
+    const uint bits = (component >= 3u) ? ballot.w : (component == 2u) ? ballot.z : (component == 1u) ? ballot.y : ballot.x; // words past index 2 read .w
+    return ((bits >> shift) & 1u) != 0u;
+}
+
+static bool spv_emulate_ballot_bit_extract(uint4 ballot, uint index) { // true when bit `index` of the 128-bit ballot is set
+    const uint component = index >> 5u; // index / 32: which 32-bit word holds the bit
+    const uint shift = index & 31u; // index % 32: bit position inside that word
+    const uint bits = (component >= 3u) ? ballot.w : (component == 2u) ? ballot.z : (component == 1u) ? ballot.y : ballot.x; // words past index 2 read .w
+    return ((bits >> shift) & 1u) != 0u;
+}
+
+static uint spv_popcount4(uint4 v) { // total number of set bits across the four components
+    return popcount(v.s0) + popcount(v.s1) + popcount(v.s2) + popcount(v.s3); // .s0-.s3 name the same components as .x-.w
+}
+
+static uint spv_emulate_ballot_bit_count(uint4 ballot) { // number of set bits in the whole 128-bit ballot
+    return popcount(ballot.x) + popcount(ballot.y) + popcount(ballot.z) + popcount(ballot.w);
+}
+
+static uint spv_emulate_ballot_inclusive_bit_count(uint4 ballot, uint lane_id) { // number of set ballot bits at positions 0..lane_id
+    const uint keep = 0xFFFFFFFFu >> (31u - (lane_id & 31u)); // low (lane_id % 32) + 1 bits set; all ones when that bit is 31
+    uint4 kept = ballot;
+    switch (lane_id >> 5u) { // word holding lane_id: mask it, clear every higher word
+    case 0u: kept.x &= keep; kept.y = 0u; kept.z = 0u; kept.w = 0u; break;
+    case 1u: kept.y &= keep; kept.z = 0u; kept.w = 0u; break;
+    case 2u: kept.z &= keep; kept.w = 0u; break;
+    default: kept.w &= keep; break;
+    }
+    return spv_popcount4(kept);
+}
+
+static uint spv_emulate_ballot_exclusive_bit_count(uint4 ballot, uint lane_id) { // number of set ballot bits strictly below lane_id
+    const bool has_lower_lanes = (lane_id != 0u); // lane 0 has nothing below it
+    return has_lower_lanes ? spv_emulate_ballot_inclusive_bit_count(ballot, lane_id - 1u) : 0u;
+}
+
+static uint spv_emulate_ballot_find_lsb(uint4 ballot) { // index (0..127) of the lowest set ballot bit, ~0u when no bit is set
+    if (ballot.x != 0u) return 31u - (uint)clz(ballot.x & (0u - ballot.x)); // x & -x isolates the lowest set bit; clz of a single bit b is 31 - b, so 31 - clz recovers b
+    if (ballot.y != 0u) return 63u - (uint)clz(ballot.y & (0u - ballot.y)); // 32 + bit index within y
+    if (ballot.z != 0u) return 95u - (uint)clz(ballot.z & (0u - ballot.z)); // 64 + bit index within z
+    if (ballot.w != 0u) return 127u - (uint)clz(ballot.w & (0u - ballot.w)); // 96 + bit index within w
+    return ~0u;
+}
+
+static uint spv_emulate_ballot_find_msb(uint4 ballot) { // index (0..127) of the highest set ballot bit, all ones when no bit is set
+    if (ballot.w) return 96u + (31u - clz(ballot.w)); // w covers bits 96..127
+    if (ballot.z) return 64u + (31u - clz(ballot.z)); // z covers bits 64..95
+    if (ballot.y) return 32u + (31u - clz(ballot.y)); // y covers bits 32..63
+    if (ballot.x) return 31u - clz(ballot.x); // x covers bits 0..31
+    return 0xFFFFFFFFu;
+}
+
+__attribute__((reqd_work_group_size(256, 1, 1))) // kernel must be launched with a 256x1x1 work-group
+__kernel void comp_main(__global SSBO* _13)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2)); // not referenced again in this kernel
+    uint _spv_linear_workgroup_size = get_local_size(0) * get_local_size(1) * get_local_size(2);
+    uint _spv_linear_id = (get_local_id(2) * get_local_size(1) * get_local_size(0)) + (get_local_id(1) * get_local_size(0)) + get_local_id(0); // flattened local id, x varying fastest
+    uint _spv_subgroup_size = 32u; // emulated subgroups are fixed at 32 lanes
+    uint _spv_lane_id = _spv_linear_id % 32u;
+    uint _spv_subgroup_id = _spv_linear_id / 32u;
+    uint _spv_num_subgroups = _spv_linear_workgroup_size / 32u; // not referenced again in this kernel
+    uint _spv_subgroup_base = _spv_subgroup_id * 32u; // scratch index of this subgroup's lane 0
+    __local uint _spv_subgroup_scratch[256]; // one slot per work-item of the required 256-wide group; shared by every helper call below
+    float cred_add_1 = spv_emulate_clustered_reduce_add_float(_spv_subgroup_scratch, _13->FragColor, 4u, _spv_lane_id, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size); // all clustered reductions below use clusters of 4 lanes
+    float cred_mul_1 = spv_emulate_clustered_reduce_mul_float(_spv_subgroup_scratch, _13->FragColor, 4u, _spv_lane_id, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size);
+    float cred_min_1 = spv_emulate_clustered_reduce_min_float(_spv_subgroup_scratch, _13->FragColor, 4u, _spv_lane_id, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size);
+    float cred_max_1 = spv_emulate_clustered_reduce_max_float(_spv_subgroup_scratch, _13->FragColor, 4u, _spv_lane_id, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size);
+    int cred_iadd_1 = spv_emulate_clustered_reduce_add_int(_spv_subgroup_scratch, _13->idat, 4u, _spv_lane_id, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size);
+    uint cred_umin_1 = spv_emulate_clustered_reduce_min_uint(_spv_subgroup_scratch, _13->udat, 4u, _spv_lane_id, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size);
+    uint cred_and_1 = spv_emulate_clustered_reduce_and_uint(_spv_subgroup_scratch, _13->udat, 4u, _spv_lane_id, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size);
+    uint cred_or_1 = spv_emulate_clustered_reduce_or_uint(_spv_subgroup_scratch, _13->udat, 4u, _spv_lane_id, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size);
+    uint cred_xor_1 = spv_emulate_clustered_reduce_xor_uint(_spv_subgroup_scratch, _13->udat, 4u, _spv_lane_id, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size);
+    bool cred_land_1 = spv_emulate_clustered_reduce_logical_and(_spv_subgroup_scratch, _13->udat > 0u, 4u, _spv_lane_id, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size);
+    bool cred_lor_1 = spv_emulate_clustered_reduce_logical_or(_spv_subgroup_scratch, _13->udat > 0u, 4u, _spv_lane_id, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size);
+    _13->FragColor = (((((((cred_add_1 + cred_mul_1) + cred_min_1) + cred_max_1) + convert_float(cred_iadd_1)) + convert_float(cred_umin_1)) + convert_float((cred_and_1 + cred_or_1) + cred_xor_1)) + (float)(cred_land_1)) + (float)(cred_lor_1); // fold every reduction result into the single output value
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/subgroups-rotate.nocompat.vk.subgroups-emulate.comp b/reference/shaders-opencl-no-opt/comp/subgroups-rotate.nocompat.vk.subgroups-emulate.comp
new file mode 100644
index 000000000..784532d51
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/subgroups-rotate.nocompat.vk.subgroups-emulate.comp
@@ -0,0 +1,1094 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO // layout with a single float member
+{
+    float FragColor;
+};
+
+typedef struct SSBO SSBO; // lets the type be spelled SSBO without the struct keyword
+
+static uint spv_emulate_broadcast_uint(__local uint* scratch, uint val, uint src_lane, uint linear_id, uint subgroup_base) { // returns the value stored by lane src_lane of this subgroup
+    scratch[linear_id] = val; // share this invocation's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    const uint fetched = scratch[subgroup_base + src_lane];
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return fetched;
+}
+
+static uint spv_emulate_broadcast_first_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base) { // returns the value stored in the subgroup's first slot
+    scratch[linear_id] = val; // share this invocation's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    const uint fetched = scratch[subgroup_base];
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return fetched;
+}
+
+static uint spv_emulate_shuffle_uint(__local uint* scratch, uint val, uint index, uint linear_id, uint subgroup_base) { // returns the value stored by lane `index` of this subgroup
+    scratch[linear_id] = val; // share this invocation's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    const uint fetched = scratch[subgroup_base + index];
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return fetched;
+}
+
+static uint spv_emulate_shuffle_xor_uint(__local uint* scratch, uint val, uint mask, uint lane_id, uint linear_id, uint subgroup_base) { // returns the value stored by lane (lane_id ^ mask)
+    scratch[linear_id] = val; // share this invocation's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    const uint fetched = scratch[subgroup_base + (lane_id ^ mask)];
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return fetched;
+}
+
+static uint spv_emulate_shuffle_up_uint(__local uint* scratch, uint val, uint delta, uint lane_id, uint linear_id, uint subgroup_base) { // value from delta lanes below, or own val when that lane would be below lane 0 (subgroup_base unused)
+    scratch[linear_id] = val; // share this invocation's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    const uint fetched = (delta > lane_id) ? val : scratch[linear_id - delta];
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return fetched;
+}
+
+static uint spv_emulate_shuffle_down_uint(__local uint* scratch, uint val, uint delta, uint lane_id, uint linear_id, uint subgroup_size) { // value from delta lanes above, or own val when that lane is past the subgroup end
+    scratch[linear_id] = val; // share this invocation's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    const uint fetched = (lane_id + delta >= subgroup_size) ? val : scratch[linear_id + delta];
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return fetched;
+}
+
+static uint spv_emulate_rotate_uint(__local uint* scratch, uint val, uint delta, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // value from lane (lane_id + delta), wrapping modulo subgroup_size
+    scratch[linear_id] = val; // share this invocation's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    const uint fetched = scratch[subgroup_base + ((lane_id + delta) % subgroup_size)];
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return fetched;
+}
+
+static uint spv_emulate_clustered_rotate_uint(__local uint* scratch, uint val, uint delta, uint lane_id, uint linear_id, uint subgroup_base, uint cluster_size) { // rotate by delta inside the cluster_size-wide cluster containing lane_id
+    scratch[linear_id] = val; // share this invocation's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    const uint lane_in_cluster = lane_id % cluster_size; // lane_id minus this is the cluster's first lane
+    const uint fetched = scratch[subgroup_base + (lane_id - lane_in_cluster) + ((lane_in_cluster + delta) % cluster_size)];
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return fetched;
+}
+
+static bool spv_emulate_all(__local uint* scratch, bool predicate, uint linear_id, uint subgroup_base, uint subgroup_size) { // true only if all subgroup_size slots hold a true predicate
+    scratch[linear_id] = predicate ? 1u : 0u; // share this invocation's vote as 0/1
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    bool unanimous = true;
+    for (uint lane = 0u; lane < subgroup_size; ++lane)
+        if (scratch[subgroup_base + lane] == 0u) unanimous = false;
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return unanimous;
+}
+
+static bool spv_emulate_any(__local uint* scratch, bool predicate, uint linear_id, uint subgroup_base, uint subgroup_size) { // true if any of the subgroup_size slots holds a true predicate
+    scratch[linear_id] = predicate ? 1u : 0u; // share this invocation's vote as 0/1
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    bool seen = false;
+    for (uint lane = 0u; lane < subgroup_size; ++lane)
+        if (scratch[subgroup_base + lane] != 0u) seen = true;
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return seen;
+}
+
+static bool spv_emulate_all_equal_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) { // true if every slot of the subgroup equals the first slot
+    scratch[linear_id] = val; // share this invocation's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    const uint reference = scratch[subgroup_base];
+    bool uniform = true;
+    for (uint lane = 1u; lane < subgroup_size; ++lane)
+        if (scratch[subgroup_base + lane] != reference) uniform = false;
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return uniform;
+}
+
+static uint4 spv_emulate_ballot(__local uint* scratch, bool predicate, uint linear_id, uint subgroup_base, uint subgroup_size) { // 128-bit mask with bit i set when lane i's predicate is true
+    scratch[linear_id] = predicate ? 1u : 0u; // share this invocation's vote as 0/1
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    uint4 votes = (uint4)(0u);
+    for (uint lane = 0u; lane < subgroup_size; ++lane) {
+        if (scratch[subgroup_base + lane] == 0u) continue; // lane voted false
+        const uint one_hot = 1u << (lane & 31u); // bit within its 32-bit word
+        switch (lane >> 5u) { // word index; anything past 2 lands in .w
+        case 0u: votes.x |= one_hot; break;
+        case 1u: votes.y |= one_hot; break;
+        case 2u: votes.z |= one_hot; break;
+        default: votes.w |= one_hot; break;
+        }
+    }
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return votes;
+}
+
+static uint4 spv_subgroup_eq_mask(uint lane_id) { // 128-bit mask with only lane_id's bit set
+    // Split the lane into a 32-bit word index and a one-hot bit within that word.
+    const uint word = lane_id >> 5u;
+    const uint one_hot = 1u << (lane_id & 31u);
+    // Word indices past 2 all land in .w.
+    return (uint4)(word == 0u ? one_hot : 0u,
+                   word == 1u ? one_hot : 0u,
+                   word == 2u ? one_hot : 0u,
+                   word >= 3u ? one_hot : 0u);
+}
+
+static uint4 spv_subgroup_ge_mask(uint lane_id, uint subgroup_size) { // 128-bit mask with bits lane_id..subgroup_size-1 set
+    uint4 mask = (uint4)(0u);
+    for (uint lane = lane_id; lane < subgroup_size; ++lane) {
+        const uint one_hot = 1u << (lane & 31u); // bit within its 32-bit word
+        const uint word = lane >> 5u; // word index; anything past 2 lands in .w
+        mask.x |= (word == 0u) ? one_hot : 0u;
+        mask.y |= (word == 1u) ? one_hot : 0u;
+        mask.z |= (word == 2u) ? one_hot : 0u;
+        mask.w |= (word >= 3u) ? one_hot : 0u;
+    }
+    return mask;
+}
+
+static uint4 spv_subgroup_gt_mask(uint lane_id, uint subgroup_size) { // 128-bit mask of lanes strictly above lane_id
+    return spv_subgroup_ge_mask(lane_id + 1u, subgroup_size); // "greater than lane_id" is "greater or equal to lane_id + 1"
+}
+
+static uint4 spv_subgroup_le_mask(uint lane_id, uint subgroup_size) { // 128-bit mask of lanes 0..lane_id; subgroup_size is not used
+    return spv_subgroup_ge_mask(0u, lane_id + 1u); // ge_mask(0, n) sets the low n bits, here n = lane_id + 1
+}
+
+static uint4 spv_subgroup_lt_mask(uint lane_id) { // 128-bit mask of lanes strictly below lane_id
+    if (lane_id != 0u) return spv_subgroup_ge_mask(0u, lane_id); // ge_mask(0, n) sets the low n bits
+    return (uint4)(0u); // lane 0 has no lower lanes
+}
+
+static uint spv_emulate_reduce_add_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) { // sum of all subgroup_size slots starting at subgroup_base
+    scratch[linear_id] = val; // share this invocation's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    uint acc = scratch[subgroup_base];
+    for (uint lane = 1u; lane < subgroup_size; ++lane)
+        acc += scratch[subgroup_base + lane];
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return acc;
+}
+
+static uint spv_emulate_inclusive_scan_add_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // sum of slots for lanes 0..lane_id (subgroup_size unused)
+    scratch[linear_id] = val; // share this invocation's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    uint acc = scratch[subgroup_base];
+    for (uint lane = 1u; lane <= lane_id; ++lane)
+        acc += scratch[subgroup_base + lane];
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return acc;
+}
+
+static uint spv_emulate_exclusive_scan_add_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // sum of the lane_id slots below this lane; 0 for lane 0 (subgroup_size unused)
+    scratch[linear_id] = val; // share this invocation's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    uint acc = 0u; // additive identity
+    for (uint lane = 0u; lane < lane_id; ++lane)
+        acc += scratch[subgroup_base + lane];
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return acc;
+}
+
+static uint spv_emulate_clustered_reduce_add_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // sum over the cluster_size-wide cluster containing lane_id (subgroup_size unused)
+    scratch[linear_id] = val; // share this invocation's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    const uint first_slot = subgroup_base + (lane_id - lane_id % cluster_size); // slot of the cluster's first lane
+    uint acc = scratch[first_slot];
+    for (uint k = 1u; k < cluster_size; ++k)
+        acc += scratch[first_slot + k];
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return acc;
+}
+
+static uint spv_emulate_reduce_mul_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) { // product of all subgroup_size slots starting at subgroup_base
+    scratch[linear_id] = val; // share this invocation's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    uint acc = scratch[subgroup_base];
+    for (uint lane = 1u; lane < subgroup_size; ++lane)
+        acc *= scratch[subgroup_base + lane];
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return acc;
+}
+
+static uint spv_emulate_inclusive_scan_mul_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // product of slots for lanes 0..lane_id (subgroup_size unused)
+    scratch[linear_id] = val; // share this invocation's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    uint acc = scratch[subgroup_base];
+    for (uint lane = 1u; lane <= lane_id; ++lane)
+        acc *= scratch[subgroup_base + lane];
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return acc;
+}
+
+static uint spv_emulate_exclusive_scan_mul_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // product of the lane_id slots below this lane; 1 for lane 0 (subgroup_size unused)
+    scratch[linear_id] = val; // share this invocation's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    uint acc = 1u; // multiplicative identity
+    for (uint lane = 0u; lane < lane_id; ++lane)
+        acc *= scratch[subgroup_base + lane];
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return acc;
+}
+
+static uint spv_emulate_clustered_reduce_mul_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // product over the cluster_size-wide cluster containing lane_id (subgroup_size unused)
+    scratch[linear_id] = val; // share this invocation's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    const uint first_slot = subgroup_base + (lane_id - lane_id % cluster_size); // slot of the cluster's first lane
+    uint acc = scratch[first_slot];
+    for (uint k = 1u; k < cluster_size; ++k)
+        acc *= scratch[first_slot + k];
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return acc;
+}
+
+static uint spv_emulate_reduce_min_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) { // minimum of all subgroup_size slots starting at subgroup_base
+    scratch[linear_id] = val; // share this invocation's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    uint lowest = scratch[subgroup_base];
+    for (uint lane = 1u; lane < subgroup_size; ++lane)
+        lowest = min(lowest, scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return lowest;
+}
+
+static uint spv_emulate_inclusive_scan_min_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // minimum of slots for lanes 0..lane_id (subgroup_size unused)
+    scratch[linear_id] = val; // share this invocation's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    uint lowest = scratch[subgroup_base];
+    for (uint lane = 1u; lane <= lane_id; ++lane)
+        lowest = min(lowest, scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return lowest;
+}
+
+static uint spv_emulate_exclusive_scan_min_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // minimum of the lane_id slots below this lane; UINT_MAX for lane 0 (subgroup_size unused)
+    scratch[linear_id] = val; // share this invocation's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    uint lowest = UINT_MAX; // identity for min
+    for (uint lane = 0u; lane < lane_id; ++lane)
+        lowest = min(lowest, scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return lowest;
+}
+
+static uint spv_emulate_clustered_reduce_min_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // minimum over the cluster_size-wide cluster containing lane_id (subgroup_size unused)
+    scratch[linear_id] = val; // share this invocation's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    const uint first_slot = subgroup_base + (lane_id - lane_id % cluster_size); // slot of the cluster's first lane
+    uint lowest = scratch[first_slot];
+    for (uint k = 1u; k < cluster_size; ++k)
+        lowest = min(lowest, scratch[first_slot + k]);
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return lowest;
+}
+
+static uint spv_emulate_reduce_max_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) { // maximum of all subgroup_size slots starting at subgroup_base
+    scratch[linear_id] = val; // share this invocation's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    uint highest = scratch[subgroup_base];
+    for (uint lane = 1u; lane < subgroup_size; ++lane)
+        highest = max(highest, scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return highest;
+}
+
+static uint spv_emulate_inclusive_scan_max_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // maximum of slots for lanes 0..lane_id (subgroup_size unused)
+    scratch[linear_id] = val; // share this invocation's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is stored before any is read
+    uint highest = scratch[subgroup_base];
+    for (uint lane = 1u; lane <= lane_id; ++lane)
+        highest = max(highest, scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // every read is done before scratch is rewritten
+    return highest;
+}
+
+static uint spv_emulate_exclusive_scan_max_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Exclusive scan: maximum of lanes 0..lane_id-1; subgroup_size is not read.
+    scratch[linear_id] = val; // publish this invocation's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    uint r = 0u; // identity for unsigned max, returned as-is by lane 0
+    for (uint i = 0u; i < lane_id; i++)
+        r = max(r, scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_max_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Maximum over the cluster_size-lane cluster containing lane_id; subgroup_size is not read.
+    scratch[linear_id] = val; // publish this invocation's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size
+    uint r = scratch[subgroup_base + cluster_base_in_sg]; // seed with the cluster's first lane
+    for (uint i = 1u; i < cluster_size; i++)
+        r = max(r, scratch[subgroup_base + cluster_base_in_sg + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static uint spv_emulate_reduce_and_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Bitwise AND of lanes 0..subgroup_size-1 of the slice starting at scratch[subgroup_base].
+    scratch[linear_id] = val; // publish this invocation's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    uint r = scratch[subgroup_base]; // seed with the first lane
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r & scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static uint spv_emulate_inclusive_scan_and_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Inclusive scan: bitwise AND of lanes 0..lane_id; subgroup_size is not read.
+    scratch[linear_id] = val; // publish this invocation's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    uint r = scratch[subgroup_base]; // seed with the first lane
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r & scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static uint spv_emulate_exclusive_scan_and_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Exclusive scan: bitwise AND of lanes 0..lane_id-1; subgroup_size is not read.
+    scratch[linear_id] = val; // publish this invocation's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    uint r = 0xFFFFFFFFu; // identity for AND (all bits set), returned as-is by lane 0
+    for (uint i = 0u; i < lane_id; i++)
+        r = r & scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_and_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Bitwise AND over the cluster_size-lane cluster containing lane_id; subgroup_size is not read.
+    scratch[linear_id] = val; // publish this invocation's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size
+    uint r = scratch[subgroup_base + cluster_base_in_sg]; // seed with the cluster's first lane
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r & scratch[subgroup_base + cluster_base_in_sg + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static uint spv_emulate_reduce_or_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Bitwise OR of lanes 0..subgroup_size-1 of the slice starting at scratch[subgroup_base].
+    scratch[linear_id] = val; // publish this invocation's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    uint r = scratch[subgroup_base]; // seed with the first lane
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r | scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static uint spv_emulate_inclusive_scan_or_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Inclusive scan: bitwise OR of lanes 0..lane_id; subgroup_size is not read.
+    scratch[linear_id] = val; // publish this invocation's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    uint r = scratch[subgroup_base]; // seed with the first lane
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r | scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static uint spv_emulate_exclusive_scan_or_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Exclusive scan: bitwise OR of lanes 0..lane_id-1; subgroup_size is not read.
+    scratch[linear_id] = val; // publish this invocation's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    uint r = 0u; // identity for OR, returned as-is by lane 0
+    for (uint i = 0u; i < lane_id; i++)
+        r = r | scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_or_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Bitwise OR over the cluster_size-lane cluster containing lane_id; subgroup_size is not read.
+    scratch[linear_id] = val; // publish this invocation's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size
+    uint r = scratch[subgroup_base + cluster_base_in_sg]; // seed with the cluster's first lane
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r | scratch[subgroup_base + cluster_base_in_sg + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static uint spv_emulate_reduce_xor_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Bitwise XOR of lanes 0..subgroup_size-1 of the slice starting at scratch[subgroup_base].
+    scratch[linear_id] = val; // publish this invocation's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    uint r = scratch[subgroup_base]; // seed with the first lane
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r ^ scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static uint spv_emulate_inclusive_scan_xor_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Inclusive scan: bitwise XOR of lanes 0..lane_id; subgroup_size is not read.
+    scratch[linear_id] = val; // publish this invocation's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    uint r = scratch[subgroup_base]; // seed with the first lane
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r ^ scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static uint spv_emulate_exclusive_scan_xor_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Exclusive scan: bitwise XOR of lanes 0..lane_id-1; subgroup_size is not read.
+    scratch[linear_id] = val; // publish this invocation's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    uint r = 0u; // identity for XOR, returned as-is by lane 0
+    for (uint i = 0u; i < lane_id; i++)
+        r = r ^ scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_xor_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Bitwise XOR over the cluster_size-lane cluster containing lane_id; subgroup_size is not read.
+    scratch[linear_id] = val; // publish this invocation's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size
+    uint r = scratch[subgroup_base + cluster_base_in_sg]; // seed with the cluster's first lane
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r ^ scratch[subgroup_base + cluster_base_in_sg + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static int spv_emulate_reduce_add_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Sum of lanes 0..subgroup_size-1 of the slice starting at scratch[subgroup_base], wrapping modulo 2^32.
+    scratch[linear_id] = as_uint(val); // publish this invocation's value as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    uint r = scratch[subgroup_base]; // accumulate as uint: wraps instead of hitting signed-overflow undefined behaviour
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r + scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return as_int(r); // same bit pattern as a two's-complement wrapping signed sum
+}
+
+static int spv_emulate_inclusive_scan_add_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Inclusive scan: sum of lanes 0..lane_id, wrapping modulo 2^32; subgroup_size is not read.
+    scratch[linear_id] = as_uint(val); // publish this invocation's value as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    uint r = scratch[subgroup_base]; // accumulate as uint: wraps instead of hitting signed-overflow undefined behaviour
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r + scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return as_int(r); // same bit pattern as a two's-complement wrapping signed sum
+}
+
+static int spv_emulate_exclusive_scan_add_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Exclusive scan: sum of lanes 0..lane_id-1, wrapping modulo 2^32; subgroup_size is not read.
+    scratch[linear_id] = as_uint(val); // publish this invocation's value as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    uint r = 0u; // additive identity (lane 0's result); uint accumulation wraps instead of hitting signed-overflow undefined behaviour
+    for (uint i = 0u; i < lane_id; i++)
+        r = r + scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return as_int(r); // same bit pattern as a two's-complement wrapping signed sum
+}
+
+static int spv_emulate_clustered_reduce_add_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Sum over the cluster_size-lane cluster containing lane_id, wrapping modulo 2^32; subgroup_size is not read.
+    scratch[linear_id] = as_uint(val); // publish this invocation's value as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size
+    uint r = scratch[subgroup_base + cluster_base_in_sg]; // accumulate as uint: wraps instead of hitting signed-overflow undefined behaviour
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r + scratch[subgroup_base + cluster_base_in_sg + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return as_int(r); // same bit pattern as a two's-complement wrapping signed sum
+}
+
+static int spv_emulate_reduce_mul_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Product of lanes 0..subgroup_size-1 of the slice starting at scratch[subgroup_base], wrapping modulo 2^32.
+    scratch[linear_id] = as_uint(val); // publish this invocation's value as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    uint r = scratch[subgroup_base]; // multiply as uint: the low 32 bits match a signed product, without signed-overflow undefined behaviour
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r * scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return as_int(r); // same bit pattern as a two's-complement wrapping signed product
+}
+
+static int spv_emulate_inclusive_scan_mul_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Inclusive scan: product of lanes 0..lane_id, wrapping modulo 2^32; subgroup_size is not read.
+    scratch[linear_id] = as_uint(val); // publish this invocation's value as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    uint r = scratch[subgroup_base]; // multiply as uint: the low 32 bits match a signed product, without signed-overflow undefined behaviour
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r * scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return as_int(r); // same bit pattern as a two's-complement wrapping signed product
+}
+
+static int spv_emulate_exclusive_scan_mul_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Exclusive scan: product of lanes 0..lane_id-1, wrapping modulo 2^32; subgroup_size is not read.
+    scratch[linear_id] = as_uint(val); // publish this invocation's value as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    uint r = 1u; // multiplicative identity (lane 0's result); uint accumulation wraps instead of hitting signed-overflow undefined behaviour
+    for (uint i = 0u; i < lane_id; i++)
+        r = r * scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return as_int(r); // same bit pattern as a two's-complement wrapping signed product
+}
+
+static int spv_emulate_clustered_reduce_mul_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Product over the cluster_size-lane cluster containing lane_id, wrapping modulo 2^32; subgroup_size is not read.
+    scratch[linear_id] = as_uint(val); // publish this invocation's value as raw bits
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size
+    uint r = scratch[subgroup_base + cluster_base_in_sg]; // multiply as uint: the low 32 bits match a signed product, without signed-overflow undefined behaviour
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r * scratch[subgroup_base + cluster_base_in_sg + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return as_int(r); // same bit pattern as a two's-complement wrapping signed product
+}
+
+static int spv_emulate_reduce_min_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Signed minimum of lanes 0..subgroup_size-1 of the slice starting at scratch[subgroup_base].
+    scratch[linear_id] = as_uint(val); // stored as raw bits; scratch is a uint array
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    int r = as_int(scratch[subgroup_base]); // seed with the first lane
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = min(r, as_int(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static int spv_emulate_inclusive_scan_min_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Inclusive scan: signed minimum of lanes 0..lane_id; subgroup_size is not read.
+    scratch[linear_id] = as_uint(val); // stored as raw bits; scratch is a uint array
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    int r = as_int(scratch[subgroup_base]); // seed with the first lane
+    for (uint i = 1u; i <= lane_id; i++)
+        r = min(r, as_int(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static int spv_emulate_exclusive_scan_min_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Exclusive scan: signed minimum of lanes 0..lane_id-1; subgroup_size is not read.
+    scratch[linear_id] = as_uint(val); // stored as raw bits; scratch is a uint array
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    int r = INT_MAX; // identity for signed min, returned as-is by lane 0
+    for (uint i = 0u; i < lane_id; i++)
+        r = min(r, as_int(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static int spv_emulate_clustered_reduce_min_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Signed minimum over the cluster_size-lane cluster containing lane_id; subgroup_size is not read.
+    scratch[linear_id] = as_uint(val); // stored as raw bits; scratch is a uint array
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size
+    int r = as_int(scratch[subgroup_base + cluster_base_in_sg]); // seed with the cluster's first lane
+    for (uint i = 1u; i < cluster_size; i++)
+        r = min(r, as_int(scratch[subgroup_base + cluster_base_in_sg + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static int spv_emulate_reduce_max_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Signed maximum of lanes 0..subgroup_size-1 of the slice starting at scratch[subgroup_base].
+    scratch[linear_id] = as_uint(val); // stored as raw bits; scratch is a uint array
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    int r = as_int(scratch[subgroup_base]); // seed with the first lane
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = max(r, as_int(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static int spv_emulate_inclusive_scan_max_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Inclusive scan: signed maximum of lanes 0..lane_id; subgroup_size is not read.
+    scratch[linear_id] = as_uint(val); // stored as raw bits; scratch is a uint array
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    int r = as_int(scratch[subgroup_base]); // seed with the first lane
+    for (uint i = 1u; i <= lane_id; i++)
+        r = max(r, as_int(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static int spv_emulate_exclusive_scan_max_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Exclusive scan: signed maximum of lanes 0..lane_id-1; subgroup_size is not read.
+    scratch[linear_id] = as_uint(val); // stored as raw bits; scratch is a uint array
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    int r = INT_MIN; // identity for signed max, returned as-is by lane 0
+    for (uint i = 0u; i < lane_id; i++)
+        r = max(r, as_int(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static int spv_emulate_clustered_reduce_max_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Signed maximum over the cluster_size-lane cluster containing lane_id; subgroup_size is not read.
+    scratch[linear_id] = as_uint(val); // stored as raw bits; scratch is a uint array
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size
+    int r = as_int(scratch[subgroup_base + cluster_base_in_sg]); // seed with the cluster's first lane
+    for (uint i = 1u; i < cluster_size; i++)
+        r = max(r, as_int(scratch[subgroup_base + cluster_base_in_sg + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static int spv_emulate_reduce_and_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Bitwise AND of lanes 0..subgroup_size-1 of the slice starting at scratch[subgroup_base].
+    scratch[linear_id] = as_uint(val); // stored as raw bits; scratch is a uint array
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    int r = as_int(scratch[subgroup_base]); // seed with the first lane
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r & as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static int spv_emulate_inclusive_scan_and_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Inclusive scan: bitwise AND of lanes 0..lane_id; subgroup_size is not read.
+    scratch[linear_id] = as_uint(val); // stored as raw bits; scratch is a uint array
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    int r = as_int(scratch[subgroup_base]); // seed with the first lane
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r & as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static int spv_emulate_exclusive_scan_and_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Exclusive scan: bitwise AND of lanes 0..lane_id-1; subgroup_size is not read.
+    scratch[linear_id] = as_uint(val); // stored as raw bits; scratch is a uint array
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    int r = as_int(0xFFFFFFFFu); // identity for AND (all bits set), returned as-is by lane 0
+    for (uint i = 0u; i < lane_id; i++)
+        r = r & as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static int spv_emulate_clustered_reduce_and_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Bitwise AND over the cluster_size-lane cluster containing lane_id; subgroup_size is not read.
+    scratch[linear_id] = as_uint(val); // stored as raw bits; scratch is a uint array
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size
+    int r = as_int(scratch[subgroup_base + cluster_base_in_sg]); // seed with the cluster's first lane
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r & as_int(scratch[subgroup_base + cluster_base_in_sg + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static int spv_emulate_reduce_or_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Bitwise OR of lanes 0..subgroup_size-1 of the slice starting at scratch[subgroup_base].
+    scratch[linear_id] = as_uint(val); // stored as raw bits; scratch is a uint array
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    int r = as_int(scratch[subgroup_base]); // seed with the first lane
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r | as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static int spv_emulate_inclusive_scan_or_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Inclusive scan: bitwise OR of lanes 0..lane_id; subgroup_size is not read.
+    scratch[linear_id] = as_uint(val); // stored as raw bits; scratch is a uint array
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    int r = as_int(scratch[subgroup_base]); // seed with the first lane
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r | as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static int spv_emulate_exclusive_scan_or_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Exclusive scan: bitwise OR of lanes 0..lane_id-1; subgroup_size is not read.
+    scratch[linear_id] = as_uint(val); // stored as raw bits; scratch is a uint array
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    int r = 0; // identity for OR, returned as-is by lane 0
+    for (uint i = 0u; i < lane_id; i++)
+        r = r | as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static int spv_emulate_clustered_reduce_or_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Bitwise OR over the cluster_size-lane cluster containing lane_id; subgroup_size is not read.
+    scratch[linear_id] = as_uint(val); // stored as raw bits; scratch is a uint array
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size
+    int r = as_int(scratch[subgroup_base + cluster_base_in_sg]); // seed with the cluster's first lane
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r | as_int(scratch[subgroup_base + cluster_base_in_sg + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static int spv_emulate_reduce_xor_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Bitwise XOR of lanes 0..subgroup_size-1 of the slice starting at scratch[subgroup_base].
+    scratch[linear_id] = as_uint(val); // stored as raw bits; scratch is a uint array
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    int r = as_int(scratch[subgroup_base]); // seed with the first lane
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r ^ as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static int spv_emulate_inclusive_scan_xor_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Inclusive scan: bitwise XOR of lanes 0..lane_id; subgroup_size is not read.
+    scratch[linear_id] = as_uint(val); // stored as raw bits; scratch is a uint array
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    int r = as_int(scratch[subgroup_base]); // seed with the first lane
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r ^ as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static int spv_emulate_exclusive_scan_xor_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Exclusive scan: bitwise XOR of lanes 0..lane_id-1; subgroup_size is not read.
+    scratch[linear_id] = as_uint(val); // stored as raw bits; scratch is a uint array
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    int r = 0; // identity for XOR, returned as-is by lane 0
+    for (uint i = 0u; i < lane_id; i++)
+        r = r ^ as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static int spv_emulate_clustered_reduce_xor_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Bitwise XOR over the cluster_size-lane cluster containing lane_id; subgroup_size is not read.
+    scratch[linear_id] = as_uint(val); // stored as raw bits; scratch is a uint array
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size
+    int r = as_int(scratch[subgroup_base + cluster_base_in_sg]); // seed with the cluster's first lane
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r ^ as_int(scratch[subgroup_base + cluster_base_in_sg + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static float spv_emulate_reduce_add_float(__local uint* scratch, float val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Sum of lanes 0..subgroup_size-1 of the slice starting at scratch[subgroup_base], accumulated in ascending lane order.
+    scratch[linear_id] = as_uint(val); // stored as raw bits; scratch is a uint array
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    float r = as_float(scratch[subgroup_base]); // seed with the first lane
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r + as_float(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static float spv_emulate_inclusive_scan_add_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Inclusive scan: sum of lanes 0..lane_id in ascending lane order; subgroup_size is not read.
+    scratch[linear_id] = as_uint(val); // stored as raw bits; scratch is a uint array
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    float r = as_float(scratch[subgroup_base]); // seed with the first lane
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r + as_float(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static float spv_emulate_exclusive_scan_add_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Exclusive scan: sum of lanes 0..lane_id-1 in ascending lane order; subgroup_size is not read.
+    scratch[linear_id] = as_uint(val); // stored as raw bits; scratch is a uint array
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    float r = 0.0f; // additive identity, returned as-is by lane 0
+    for (uint i = 0u; i < lane_id; i++)
+        r = r + as_float(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static float spv_emulate_clustered_reduce_add_float(__local uint* scratch, float val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Sum over the cluster_size-lane cluster containing lane_id, in ascending lane order; subgroup_size is not read.
+    scratch[linear_id] = as_uint(val); // stored as raw bits; scratch is a uint array
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size
+    float r = as_float(scratch[subgroup_base + cluster_base_in_sg]); // seed with the cluster's first lane
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r + as_float(scratch[subgroup_base + cluster_base_in_sg + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static float spv_emulate_reduce_mul_float(__local uint* scratch, float val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Product of lanes 0..subgroup_size-1 of the slice starting at scratch[subgroup_base], accumulated in ascending lane order.
+    scratch[linear_id] = as_uint(val); // stored as raw bits; scratch is a uint array
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    float r = as_float(scratch[subgroup_base]); // seed with the first lane
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r * as_float(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static float spv_emulate_inclusive_scan_mul_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Inclusive scan: product of lanes 0..lane_id in ascending lane order; subgroup_size is not read.
+    scratch[linear_id] = as_uint(val); // stored as raw bits; scratch is a uint array
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    float r = as_float(scratch[subgroup_base]); // seed with the first lane
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r * as_float(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static float spv_emulate_exclusive_scan_mul_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Exclusive scan: product of lanes 0..lane_id-1 in ascending lane order; subgroup_size is not read.
+    scratch[linear_id] = as_uint(val); // stored as raw bits; scratch is a uint array
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    float r = 1.0f; // multiplicative identity, returned as-is by lane 0
+    for (uint i = 0u; i < lane_id; i++)
+        r = r * as_float(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static float spv_emulate_clustered_reduce_mul_float(__local uint* scratch, float val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Product over the cluster_size-lane cluster containing lane_id, in ascending lane order; subgroup_size is not read.
+    scratch[linear_id] = as_uint(val); // stored as raw bits; scratch is a uint array
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size
+    float r = as_float(scratch[subgroup_base + cluster_base_in_sg]); // seed with the cluster's first lane
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r * as_float(scratch[subgroup_base + cluster_base_in_sg + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static float spv_emulate_reduce_min_float(__local uint* scratch, float val, uint linear_id, uint subgroup_base, uint subgroup_size) { // fmin of lanes 0..subgroup_size-1 of the slice starting at scratch[subgroup_base].
+    scratch[linear_id] = as_uint(val); // stored as raw bits; scratch is a uint array
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    float r = as_float(scratch[subgroup_base]); // seed with the first lane
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = fmin(r, as_float(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static float spv_emulate_inclusive_scan_min_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Inclusive scan: fmin of lanes 0..lane_id; subgroup_size is not read.
+    scratch[linear_id] = as_uint(val); // stored as raw bits; scratch is a uint array
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    float r = as_float(scratch[subgroup_base]); // seed with the first lane
+    for (uint i = 1u; i <= lane_id; i++)
+        r = fmin(r, as_float(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static float spv_emulate_exclusive_scan_min_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Exclusive scan: fmin of lanes 0..lane_id-1; subgroup_size is not read.
+    scratch[linear_id] = as_uint(val); // stored as raw bits; scratch is a uint array
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    float r = INFINITY; // identity for min, returned as-is by lane 0
+    for (uint i = 0u; i < lane_id; i++)
+        r = fmin(r, as_float(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static float spv_emulate_clustered_reduce_min_float(__local uint* scratch, float val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // fmin over the cluster_size-lane cluster containing lane_id; subgroup_size is not read.
+    scratch[linear_id] = as_uint(val); // stored as raw bits; scratch is a uint array
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size
+    float r = as_float(scratch[subgroup_base + cluster_base_in_sg]); // seed with the cluster's first lane
+    for (uint i = 1u; i < cluster_size; i++)
+        r = fmin(r, as_float(scratch[subgroup_base + cluster_base_in_sg + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static float spv_emulate_reduce_max_float(__local uint* scratch, float val, uint linear_id, uint subgroup_base, uint subgroup_size) { // fmax of lanes 0..subgroup_size-1 of the slice starting at scratch[subgroup_base].
+    scratch[linear_id] = as_uint(val); // stored as raw bits; scratch is a uint array
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    float r = as_float(scratch[subgroup_base]); // seed with the first lane
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = fmax(r, as_float(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static float spv_emulate_inclusive_scan_max_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Inclusive scan: fmax of lanes 0..lane_id; subgroup_size is not read.
+    scratch[linear_id] = as_uint(val); // stored as raw bits; scratch is a uint array
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    float r = as_float(scratch[subgroup_base]); // seed with the first lane
+    for (uint i = 1u; i <= lane_id; i++)
+        r = fmax(r, as_float(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static float spv_emulate_exclusive_scan_max_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Exclusive scan: fmax of lanes 0..lane_id-1; subgroup_size is not read.
+    scratch[linear_id] = as_uint(val); // stored as raw bits; scratch is a uint array
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    float r = -INFINITY; // identity for max, returned as-is by lane 0
+    for (uint i = 0u; i < lane_id; i++)
+        r = fmax(r, as_float(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static float spv_emulate_clustered_reduce_max_float(__local uint* scratch, float val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // fmax over the cluster_size-lane cluster containing lane_id; subgroup_size is not read.
+    scratch[linear_id] = as_uint(val); // stored as raw bits; scratch is a uint array
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size
+    float r = as_float(scratch[subgroup_base + cluster_base_in_sg]); // seed with the cluster's first lane
+    for (uint i = 1u; i < cluster_size; i++)
+        r = fmax(r, as_float(scratch[subgroup_base + cluster_base_in_sg + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static bool spv_emulate_reduce_logical_and(__local uint* scratch, bool val, uint linear_id, uint subgroup_base, uint subgroup_size) { // True only if every lane 0..subgroup_size-1 published a true value.
+    scratch[linear_id] = val ? 1u : 0u; // booleans are staged as 1u / 0u
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    bool r = true; // identity for logical AND; the loop therefore starts at lane 0
+    for (uint i = 0u; i < subgroup_size; i++)
+        r = r && (scratch[subgroup_base + i] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static bool spv_emulate_inclusive_scan_logical_and(__local uint* scratch, bool val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Inclusive scan: logical AND of lanes 0..lane_id; subgroup_size is not read.
+    scratch[linear_id] = val ? 1u : 0u; // booleans are staged as 1u / 0u
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    bool r = true; // identity for logical AND; the loop therefore starts at lane 0
+    for (uint i = 0u; i <= lane_id; i++)
+        r = r && (scratch[subgroup_base + i] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static bool spv_emulate_exclusive_scan_logical_and(__local uint* scratch, bool val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Exclusive scan: logical AND of lanes 0..lane_id-1; subgroup_size is not read.
+    scratch[linear_id] = val ? 1u : 0u; // booleans are staged as 1u / 0u
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    bool r = true; // identity for logical AND, returned as-is by lane 0
+    for (uint i = 0u; i < lane_id; i++)
+        r = r && (scratch[subgroup_base + i] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static bool spv_emulate_clustered_reduce_logical_and(__local uint* scratch, bool val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Logical AND over the cluster_size-lane cluster containing lane_id; subgroup_size is not read.
+    scratch[linear_id] = val ? 1u : 0u; // booleans are staged as 1u / 0u
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size
+    bool r = true; // identity for logical AND; the loop therefore starts at the cluster's first lane
+    for (uint i = 0u; i < cluster_size; i++)
+        r = r && (scratch[subgroup_base + cluster_base_in_sg + i] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static bool spv_emulate_reduce_logical_or(__local uint* scratch, bool val, uint linear_id, uint subgroup_base, uint subgroup_size) { // True if any lane 0..subgroup_size-1 published a true value.
+    scratch[linear_id] = val ? 1u : 0u; // booleans are staged as 1u / 0u
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    bool r = false; // identity for logical OR; the loop therefore starts at lane 0
+    for (uint i = 0u; i < subgroup_size; i++)
+        r = r || (scratch[subgroup_base + i] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static bool spv_emulate_inclusive_scan_logical_or(__local uint* scratch, bool val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Inclusive scan: logical OR of lanes 0..lane_id; subgroup_size is not read.
+    scratch[linear_id] = val ? 1u : 0u; // booleans are staged as 1u / 0u
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    bool r = false; // identity for logical OR; the loop therefore starts at lane 0
+    for (uint i = 0u; i <= lane_id; i++)
+        r = r || (scratch[subgroup_base + i] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static bool spv_emulate_exclusive_scan_logical_or(__local uint* scratch, bool val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Exclusive scan: logical OR of lanes 0..lane_id-1; subgroup_size is not read.
+    scratch[linear_id] = val ? 1u : 0u; // booleans are staged as 1u / 0u
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    bool r = false; // identity for logical OR, returned as-is by lane 0
+    for (uint i = 0u; i < lane_id; i++)
+        r = r || (scratch[subgroup_base + i] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static bool spv_emulate_clustered_reduce_logical_or(__local uint* scratch, bool val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Logical OR over the cluster_size-lane cluster containing lane_id; subgroup_size is not read.
+    scratch[linear_id] = val ? 1u : 0u; // booleans are staged as 1u / 0u
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size
+    bool r = false; // identity for logical OR; the loop therefore starts at the cluster's first lane
+    for (uint i = 0u; i < cluster_size; i++)
+        r = r || (scratch[subgroup_base + cluster_base_in_sg + i] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static bool spv_emulate_reduce_logical_xor(__local uint* scratch, bool val, uint linear_id, uint subgroup_base, uint subgroup_size) { // True if an odd number of lanes 0..subgroup_size-1 published a true value.
+    scratch[linear_id] = val ? 1u : 0u; // booleans are staged as 1u / 0u
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    bool r = false; // identity for logical XOR; the loop therefore starts at lane 0
+    for (uint i = 0u; i < subgroup_size; i++)
+        r = r != (scratch[subgroup_base + i] != 0u); // != on two booleans acts as logical XOR
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static bool spv_emulate_inclusive_scan_logical_xor(__local uint* scratch, bool val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Inclusive scan: logical XOR of lanes 0..lane_id; subgroup_size is not read.
+    scratch[linear_id] = val ? 1u : 0u; // booleans are staged as 1u / 0u
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    bool r = false; // identity for logical XOR; the loop therefore starts at lane 0
+    for (uint i = 0u; i <= lane_id; i++)
+        r = r != (scratch[subgroup_base + i] != 0u); // != on two booleans acts as logical XOR
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static bool spv_emulate_exclusive_scan_logical_xor(__local uint* scratch, bool val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Exclusive scan: logical XOR of lanes 0..lane_id-1; subgroup_size is not read.
+    scratch[linear_id] = val ? 1u : 0u; // booleans are staged as 1u / 0u
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    bool r = false; // identity for logical XOR, returned as-is by lane 0
+    for (uint i = 0u; i < lane_id; i++)
+        r = r != (scratch[subgroup_base + i] != 0u); // != on two booleans acts as logical XOR
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static bool spv_emulate_clustered_reduce_logical_xor(__local uint* scratch, bool val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Logical XOR over the cluster_size-lane cluster containing lane_id; subgroup_size is not read.
+    scratch[linear_id] = val ? 1u : 0u; // booleans are staged as 1u / 0u
+    barrier(CLK_LOCAL_MEM_FENCE); // every slot is written before any is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size
+    bool r = false; // identity for logical XOR; the loop therefore starts at the cluster's first lane
+    for (uint i = 0u; i < cluster_size; i++)
+        r = r != (scratch[subgroup_base + cluster_base_in_sg + i] != 0u); // != on two booleans acts as logical XOR
+    barrier(CLK_LOCAL_MEM_FENCE); // no lane leaves while others may still be reading
+    return r;
+}
+
+static bool spv_emulate_inverse_ballot(uint4 ballot, uint lane_id) { // True when lane_id's bit is set in the 128-bit ballot (x holds bits 0..31, w the top word).
+    const uint word_idx = lane_id >> 5u; // 32 lanes per component
+    const uint bit_idx = lane_id & 31u;
+    const uint bits = (word_idx >= 3u) ? ballot.w : (word_idx == 2u) ? ballot.z : (word_idx == 1u) ? ballot.y : ballot.x; // indices past the third word fall back to w
+    return ((bits >> bit_idx) & 1u) != 0u;
+}
+
+static bool spv_emulate_ballot_bit_extract(uint4 ballot, uint index) { // True when bit number index is set in the 128-bit ballot (x holds bits 0..31, w the top word).
+    const uint word_idx = index >> 5u; // 32 bits per component
+    const uint shift = index & 31u;
+    const uint bits = (word_idx >= 3u) ? ballot.w : (word_idx == 2u) ? ballot.z : (word_idx == 1u) ? ballot.y : ballot.x; // indices past the third word fall back to w
+    return ((bits >> shift) & 1u) != 0u;
+}
+
+static uint spv_popcount4(uint4 v) { // Total number of set bits across the four components of v.
+    return (popcount(v.s0) + popcount(v.s1)) + (popcount(v.s2) + popcount(v.s3)); // s0..s3 are the numeric names of x, y, z, w
+}
+
+static uint spv_emulate_ballot_bit_count(uint4 ballot) { // Number of set bits in the whole 128-bit ballot.
+    return popcount(ballot.x) + popcount(ballot.y) + popcount(ballot.z) + popcount(ballot.w); // per-component population counts, summed
+}
+
+static uint spv_emulate_ballot_inclusive_bit_count(uint4 ballot, uint lane_id) { // Number of set ballot bits at positions 0..lane_id, lane_id's own bit included.
+    const uint word_idx = lane_id >> 5u; // component holding lane_id's bit
+    const uint bit_idx = lane_id & 31u; // position inside that component
+    const uint keep = 0xFFFFFFFFu >> (31u - bit_idx); // bits 0..bit_idx set; never shifts by 32
+    switch (word_idx) { // lower components count in full, the selected one is masked, higher ones are ignored
+    case 0u: return popcount(ballot.x & keep);
+    case 1u: return popcount(ballot.x) + popcount(ballot.y & keep);
+    case 2u: return popcount(ballot.x) + popcount(ballot.y) + popcount(ballot.z & keep);
+    default: return popcount(ballot.x) + popcount(ballot.y) + popcount(ballot.z) + popcount(ballot.w & keep);
+    }
+}
+
+static uint spv_emulate_ballot_exclusive_bit_count(uint4 ballot, uint lane_id) { // Number of set ballot bits strictly below lane_id.
+    const bool has_lower_lanes = lane_id != 0u; // lane 0 has nothing below it
+    return has_lower_lanes ? spv_emulate_ballot_inclusive_bit_count(ballot, lane_id - 1u) : 0u; // otherwise the inclusive count of the previous lane
+}
+
+static uint spv_emulate_ballot_find_lsb(uint4 ballot) { // Index (0..127) of the lowest set bit in the ballot, or ~0u when no bit is set.
+    if (ballot.x != 0u) return 31u - (uint)clz(ballot.x & (0u - ballot.x)); // v & -v isolates the lowest set bit; 31 - clz turns that single bit into its index
+    if (ballot.y != 0u) return 63u - (uint)clz(ballot.y & (0u - ballot.y)); // 32 + (31 - clz)
+    if (ballot.z != 0u) return 95u - (uint)clz(ballot.z & (0u - ballot.z)); // 64 + (31 - clz)
+    if (ballot.w != 0u) return 127u - (uint)clz(ballot.w & (0u - ballot.w)); // 96 + (31 - clz)
+    return ~0u; // empty ballot
+}
+
+static uint spv_emulate_ballot_find_msb(uint4 ballot) { // Index (0..127) of the highest set bit in the ballot, or ~0u when no bit is set.
+    if ((ballot.x | ballot.y | ballot.z | ballot.w) == 0u) return ~0u; // empty ballot
+    if (ballot.w != 0u) return 96u + (31u - (uint)clz(ballot.w)); // search from the top word down
+    if (ballot.z != 0u) return 64u + (31u - (uint)clz(ballot.z));
+    if (ballot.y != 0u) return 32u + (31u - (uint)clz(ballot.y));
+    return 31u - (uint)clz(ballot.x); // x is non-zero here because the ballot is not empty
+}
+
+__attribute__((reqd_work_group_size(256, 1, 1))) // the work-group size is fixed at 256 x 1 x 1
+__kernel void comp_main(__global float* _19) // kernel entry point; its only output is a store to _19[0]
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2)); // not referenced again in this kernel
+    uint _spv_linear_workgroup_size = get_local_size(0) * get_local_size(1) * get_local_size(2); // invocations per work-group
+    uint _spv_linear_id = (get_local_id(2) * get_local_size(1) * get_local_size(0)) + (get_local_id(1) * get_local_size(0)) + get_local_id(0); // flattened local id, x varying fastest
+    uint _spv_subgroup_size = 32u; // emulated subgroups are 32 invocations wide
+    uint _spv_lane_id = _spv_linear_id % 32u; // position inside the emulated subgroup
+    uint _spv_subgroup_id = _spv_linear_id / 32u; // index of the emulated subgroup
+    uint _spv_num_subgroups = _spv_linear_workgroup_size / 32u; // not referenced again in this kernel
+    uint _spv_subgroup_base = _spv_subgroup_id * 32u; // scratch index of this subgroup's lane 0
+    __local uint _spv_subgroup_scratch[256]; // one slot per invocation of the 256-wide work-group
+    uint rotated_1 = spv_emulate_rotate_uint(_spv_subgroup_scratch, 20u, 4u, _spv_lane_id, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size); // helper is defined outside this excerpt
+    uint rotated_clustered_1 = spv_emulate_clustered_rotate_uint(_spv_subgroup_scratch, 20u, 4u, _spv_lane_id, _spv_linear_id, _spv_subgroup_base, 8u); // NOTE(review): last argument is 8u, presumably the cluster size - helper not visible here, confirm
+    _19[0] = convert_float(rotated_1) + convert_float(rotated_clustered_1); // every invocation stores to the same element
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/subgroups-shuffle.nocompat.vk.subgroups-emulate.comp b/reference/shaders-opencl-no-opt/comp/subgroups-shuffle.nocompat.vk.subgroups-emulate.comp
new file mode 100644
index 000000000..a9a397230
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/subgroups-shuffle.nocompat.vk.subgroups-emulate.comp
@@ -0,0 +1,1097 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO
+{
+    float FragColor;
+    uint udat;
+};
+
+typedef struct SSBO SSBO;
+
+static uint spv_emulate_broadcast_uint(__local uint* scratch, uint val, uint src_lane, uint linear_id, uint subgroup_base) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base + src_lane];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_broadcast_first_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_shuffle_uint(__local uint* scratch, uint val, uint index, uint linear_id, uint subgroup_base) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base + index];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_shuffle_xor_uint(__local uint* scratch, uint val, uint mask, uint lane_id, uint linear_id, uint subgroup_base) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base + (lane_id ^ mask)];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_shuffle_up_uint(__local uint* scratch, uint val, uint delta, uint lane_id, uint linear_id, uint subgroup_base) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = (lane_id >= delta) ? scratch[linear_id - delta] : val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_shuffle_down_uint(__local uint* scratch, uint val, uint delta, uint lane_id, uint linear_id, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = (lane_id + delta < subgroup_size) ? scratch[linear_id + delta] : val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_rotate_uint(__local uint* scratch, uint val, uint delta, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base + ((lane_id + delta) % subgroup_size)];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_clustered_rotate_uint(__local uint* scratch, uint val, uint delta, uint lane_id, uint linear_id, uint subgroup_base, uint cluster_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base = (lane_id / cluster_size) * cluster_size;
+    uint r = scratch[subgroup_base + cluster_base + ((lane_id - cluster_base + delta) % cluster_size)];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static bool spv_emulate_all(__local uint* scratch, bool predicate, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = predicate ? 1u : 0u;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    bool r = true;
+    for (uint i = 0u; i < subgroup_size; i++)
+        r = r && (scratch[subgroup_base + i] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static bool spv_emulate_any(__local uint* scratch, bool predicate, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = predicate ? 1u : 0u;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    bool r = false;
+    for (uint i = 0u; i < subgroup_size; i++)
+        r = r || (scratch[subgroup_base + i] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static bool spv_emulate_all_equal_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint first = scratch[subgroup_base];
+    bool r = true;
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r && (scratch[subgroup_base + i] == first);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint4 spv_emulate_ballot(__local uint* scratch, bool predicate, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = predicate ? 1u : 0u;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint4 r = (uint4)(0u);
+    for (uint i = 0u; i < subgroup_size; i++) {
+        if (scratch[subgroup_base + i] != 0u) {
+            uint word = i / 32u;
+            uint bit = i % 32u;
+            if (word == 0u) r.x |= (1u << bit);
+            else if (word == 1u) r.y |= (1u << bit);
+            else if (word == 2u) r.z |= (1u << bit);
+            else r.w |= (1u << bit);
+        }
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint4 spv_subgroup_eq_mask(uint lane_id) {
+    uint4 r = (uint4)(0u);
+    uint word = lane_id / 32u;
+    uint bit = lane_id % 32u;
+    if (word == 0u) r.x = (1u << bit);
+    else if (word == 1u) r.y = (1u << bit);
+    else if (word == 2u) r.z = (1u << bit);
+    else r.w = (1u << bit);
+    return r;
+}
+
+static uint4 spv_subgroup_ge_mask(uint lane_id, uint subgroup_size) {
+    uint4 r = (uint4)(0u);
+    for (uint i = lane_id; i < subgroup_size; i++) {
+        uint word = i / 32u;
+        uint bit = i % 32u;
+        if (word == 0u) r.x |= (1u << bit);
+        else if (word == 1u) r.y |= (1u << bit);
+        else if (word == 2u) r.z |= (1u << bit);
+        else r.w |= (1u << bit);
+    }
+    return r;
+}
+
+static uint4 spv_subgroup_gt_mask(uint lane_id, uint subgroup_size) {
+    return spv_subgroup_ge_mask(lane_id + 1u, subgroup_size);
+}
+
+static uint4 spv_subgroup_le_mask(uint lane_id, uint subgroup_size) {
+    return spv_subgroup_ge_mask(0u, lane_id + 1u);
+}
+
+static uint4 spv_subgroup_lt_mask(uint lane_id) {
+    if (lane_id == 0u) return (uint4)(0u);
+    return spv_subgroup_ge_mask(0u, lane_id);
+}
+
+static uint spv_emulate_reduce_add_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r + scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_inclusive_scan_add_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r + scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_exclusive_scan_add_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = 0u;
+    for (uint i = 0u; i < lane_id; i++)
+        r = r + scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_add_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    uint r = scratch[subgroup_base + cluster_base_in_sg];
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r + scratch[subgroup_base + cluster_base_in_sg + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_reduce_mul_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r * scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_inclusive_scan_mul_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r * scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_exclusive_scan_mul_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = 1u;
+    for (uint i = 0u; i < lane_id; i++)
+        r = r * scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_mul_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    uint r = scratch[subgroup_base + cluster_base_in_sg];
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r * scratch[subgroup_base + cluster_base_in_sg + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_reduce_min_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = min(r, scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_inclusive_scan_min_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i <= lane_id; i++)
+        r = min(r, scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_exclusive_scan_min_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = UINT_MAX;
+    for (uint i = 0u; i < lane_id; i++)
+        r = min(r, scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_min_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    uint r = scratch[subgroup_base + cluster_base_in_sg];
+    for (uint i = 1u; i < cluster_size; i++)
+        r = min(r, scratch[subgroup_base + cluster_base_in_sg + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_reduce_max_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = max(r, scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_inclusive_scan_max_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i <= lane_id; i++)
+        r = max(r, scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_exclusive_scan_max_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = 0u;
+    for (uint i = 0u; i < lane_id; i++)
+        r = max(r, scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_max_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    uint r = scratch[subgroup_base + cluster_base_in_sg];
+    for (uint i = 1u; i < cluster_size; i++)
+        r = max(r, scratch[subgroup_base + cluster_base_in_sg + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_reduce_and_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r & scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_inclusive_scan_and_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r & scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_exclusive_scan_and_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = 0xFFFFFFFFu;
+    for (uint i = 0u; i < lane_id; i++)
+        r = r & scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_and_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    uint r = scratch[subgroup_base + cluster_base_in_sg];
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r & scratch[subgroup_base + cluster_base_in_sg + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_reduce_or_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r | scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_inclusive_scan_or_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r | scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_exclusive_scan_or_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = 0u;
+    for (uint i = 0u; i < lane_id; i++)
+        r = r | scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_or_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    uint r = scratch[subgroup_base + cluster_base_in_sg];
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r | scratch[subgroup_base + cluster_base_in_sg + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_reduce_xor_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r ^ scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_inclusive_scan_xor_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r ^ scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_exclusive_scan_xor_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint r = 0u;
+    for (uint i = 0u; i < lane_id; i++)
+        r = r ^ scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_xor_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    uint r = scratch[subgroup_base + cluster_base_in_sg];
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r ^ scratch[subgroup_base + cluster_base_in_sg + i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_reduce_add_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(scratch[subgroup_base]);
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r + as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_inclusive_scan_add_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(scratch[subgroup_base]);
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r + as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_exclusive_scan_add_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = 0;
+    for (uint i = 0u; i < lane_id; i++)
+        r = r + as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_clustered_reduce_add_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    int r = as_int(scratch[subgroup_base + cluster_base_in_sg]);
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r + as_int(scratch[subgroup_base + cluster_base_in_sg + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_reduce_mul_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(scratch[subgroup_base]);
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r * as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_inclusive_scan_mul_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(scratch[subgroup_base]);
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r * as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_exclusive_scan_mul_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = 1;
+    for (uint i = 0u; i < lane_id; i++)
+        r = r * as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_clustered_reduce_mul_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    int r = as_int(scratch[subgroup_base + cluster_base_in_sg]);
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r * as_int(scratch[subgroup_base + cluster_base_in_sg + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_reduce_min_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(scratch[subgroup_base]);
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = min(r, as_int(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_inclusive_scan_min_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(scratch[subgroup_base]);
+    for (uint i = 1u; i <= lane_id; i++)
+        r = min(r, as_int(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_exclusive_scan_min_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = INT_MAX;
+    for (uint i = 0u; i < lane_id; i++)
+        r = min(r, as_int(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_clustered_reduce_min_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    int r = as_int(scratch[subgroup_base + cluster_base_in_sg]);
+    for (uint i = 1u; i < cluster_size; i++)
+        r = min(r, as_int(scratch[subgroup_base + cluster_base_in_sg + i]));
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_reduce_max_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(scratch[subgroup_base]);
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = max(r, as_int(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_inclusive_scan_max_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(scratch[subgroup_base]);
+    for (uint i = 1u; i <= lane_id; i++)
+        r = max(r, as_int(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_exclusive_scan_max_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = INT_MIN;
+    for (uint i = 0u; i < lane_id; i++)
+        r = max(r, as_int(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_clustered_reduce_max_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    int r = as_int(scratch[subgroup_base + cluster_base_in_sg]);
+    for (uint i = 1u; i < cluster_size; i++)
+        r = max(r, as_int(scratch[subgroup_base + cluster_base_in_sg + i]));
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_reduce_and_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(scratch[subgroup_base]);
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r & as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_inclusive_scan_and_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(scratch[subgroup_base]);
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r & as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_exclusive_scan_and_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(0xFFFFFFFFu);
+    for (uint i = 0u; i < lane_id; i++)
+        r = r & as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_clustered_reduce_and_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    int r = as_int(scratch[subgroup_base + cluster_base_in_sg]);
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r & as_int(scratch[subgroup_base + cluster_base_in_sg + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_reduce_or_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(scratch[subgroup_base]);
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r | as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_inclusive_scan_or_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(scratch[subgroup_base]);
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r | as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_exclusive_scan_or_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = 0;
+    for (uint i = 0u; i < lane_id; i++)
+        r = r | as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_clustered_reduce_or_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    int r = as_int(scratch[subgroup_base + cluster_base_in_sg]);
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r | as_int(scratch[subgroup_base + cluster_base_in_sg + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_reduce_xor_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(scratch[subgroup_base]);
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r ^ as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_inclusive_scan_xor_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = as_int(scratch[subgroup_base]);
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r ^ as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_exclusive_scan_xor_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int r = 0;
+    for (uint i = 0u; i < lane_id; i++)
+        r = r ^ as_int(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static int spv_emulate_clustered_reduce_xor_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size;
+    int r = as_int(scratch[subgroup_base + cluster_base_in_sg]);
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r ^ as_int(scratch[subgroup_base + cluster_base_in_sg + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static float spv_emulate_reduce_add_float(__local uint* scratch, float val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    float r = as_float(scratch[subgroup_base]);
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r + as_float(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static float spv_emulate_inclusive_scan_add_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    float r = as_float(scratch[subgroup_base]);
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r + as_float(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static float spv_emulate_exclusive_scan_add_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    float r = 0.0f;
+    for (uint i = 0u; i < lane_id; i++)
+        r = r + as_float(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return r;
+}
+
+static float spv_emulate_clustered_reduce_add_float(__local uint* scratch, float val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Sums the cluster_size slots of the cluster that contains lane_id; subgroup_size is not read.
+    scratch[linear_id] = as_uint(val); // float kept as raw bits in the uint scratch array
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size; cluster_size == 0 would divide by zero
+    float r = as_float(scratch[subgroup_base + cluster_base_in_sg]);
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r + as_float(scratch[subgroup_base + cluster_base_in_sg + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static float spv_emulate_reduce_mul_float(__local uint* scratch, float val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Multiplies together the floats in scratch slots subgroup_base .. subgroup_base+subgroup_size-1.
+    scratch[linear_id] = as_uint(val); // float kept as raw bits in the uint scratch array
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    float r = as_float(scratch[subgroup_base]);
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r * as_float(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static float spv_emulate_inclusive_scan_mul_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Product of slots subgroup_base .. subgroup_base+lane_id (own slot included); subgroup_size is not read.
+    scratch[linear_id] = as_uint(val); // float kept as raw bits in the uint scratch array
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    float r = as_float(scratch[subgroup_base]);
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r * as_float(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static float spv_emulate_exclusive_scan_mul_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Product of the lane_id slots before this lane's slot; subgroup_size is not read.
+    scratch[linear_id] = as_uint(val); // float kept as raw bits in the uint scratch array
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    float r = 1.0f; // multiplicative identity: result for lane 0, whose loop runs zero times
+    for (uint i = 0u; i < lane_id; i++)
+        r = r * as_float(scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static float spv_emulate_clustered_reduce_mul_float(__local uint* scratch, float val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Product of the cluster_size slots of the cluster that contains lane_id; subgroup_size is not read.
+    scratch[linear_id] = as_uint(val); // float kept as raw bits in the uint scratch array
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size; cluster_size == 0 would divide by zero
+    float r = as_float(scratch[subgroup_base + cluster_base_in_sg]);
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r * as_float(scratch[subgroup_base + cluster_base_in_sg + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static float spv_emulate_reduce_min_float(__local uint* scratch, float val, uint linear_id, uint subgroup_base, uint subgroup_size) { // fmin over the floats in scratch slots subgroup_base .. subgroup_base+subgroup_size-1.
+    scratch[linear_id] = as_uint(val); // float kept as raw bits in the uint scratch array
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    float r = as_float(scratch[subgroup_base]);
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = fmin(r, as_float(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static float spv_emulate_inclusive_scan_min_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // fmin over slots subgroup_base .. subgroup_base+lane_id (own slot included); subgroup_size is not read.
+    scratch[linear_id] = as_uint(val); // float kept as raw bits in the uint scratch array
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    float r = as_float(scratch[subgroup_base]);
+    for (uint i = 1u; i <= lane_id; i++)
+        r = fmin(r, as_float(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static float spv_emulate_exclusive_scan_min_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // fmin over the lane_id slots before this lane's slot; subgroup_size is not read.
+    scratch[linear_id] = as_uint(val); // float kept as raw bits in the uint scratch array
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    float r = INFINITY; // starting value; also the result for lane 0, whose loop runs zero times
+    for (uint i = 0u; i < lane_id; i++)
+        r = fmin(r, as_float(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static float spv_emulate_clustered_reduce_min_float(__local uint* scratch, float val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // fmin over the cluster_size slots of the cluster that contains lane_id; subgroup_size is not read.
+    scratch[linear_id] = as_uint(val); // float kept as raw bits in the uint scratch array
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size; cluster_size == 0 would divide by zero
+    float r = as_float(scratch[subgroup_base + cluster_base_in_sg]);
+    for (uint i = 1u; i < cluster_size; i++)
+        r = fmin(r, as_float(scratch[subgroup_base + cluster_base_in_sg + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static float spv_emulate_reduce_max_float(__local uint* scratch, float val, uint linear_id, uint subgroup_base, uint subgroup_size) { // fmax over the floats in scratch slots subgroup_base .. subgroup_base+subgroup_size-1.
+    scratch[linear_id] = as_uint(val); // float kept as raw bits in the uint scratch array
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    float r = as_float(scratch[subgroup_base]);
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = fmax(r, as_float(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static float spv_emulate_inclusive_scan_max_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // fmax over slots subgroup_base .. subgroup_base+lane_id (own slot included); subgroup_size is not read.
+    scratch[linear_id] = as_uint(val); // float kept as raw bits in the uint scratch array
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    float r = as_float(scratch[subgroup_base]);
+    for (uint i = 1u; i <= lane_id; i++)
+        r = fmax(r, as_float(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static float spv_emulate_exclusive_scan_max_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // fmax over the lane_id slots before this lane's slot; subgroup_size is not read.
+    scratch[linear_id] = as_uint(val); // float kept as raw bits in the uint scratch array
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    float r = -INFINITY; // starting value; also the result for lane 0, whose loop runs zero times
+    for (uint i = 0u; i < lane_id; i++)
+        r = fmax(r, as_float(scratch[subgroup_base + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static float spv_emulate_clustered_reduce_max_float(__local uint* scratch, float val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // fmax over the cluster_size slots of the cluster that contains lane_id; subgroup_size is not read.
+    scratch[linear_id] = as_uint(val); // float kept as raw bits in the uint scratch array
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size; cluster_size == 0 would divide by zero
+    float r = as_float(scratch[subgroup_base + cluster_base_in_sg]);
+    for (uint i = 1u; i < cluster_size; i++)
+        r = fmax(r, as_float(scratch[subgroup_base + cluster_base_in_sg + i]));
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static bool spv_emulate_reduce_logical_and(__local uint* scratch, bool val, uint linear_id, uint subgroup_base, uint subgroup_size) { // True only if every slot subgroup_base .. subgroup_base+subgroup_size-1 is nonzero.
+    scratch[linear_id] = val ? 1u : 0u; // bool stored as 1u / 0u
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    bool r = true;
+    for (uint i = 0u; i < subgroup_size; i++)
+        r = r && (scratch[subgroup_base + i] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static bool spv_emulate_inclusive_scan_logical_and(__local uint* scratch, bool val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // True only if slots subgroup_base .. subgroup_base+lane_id are all nonzero; subgroup_size is not read.
+    scratch[linear_id] = val ? 1u : 0u; // bool stored as 1u / 0u
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    bool r = true;
+    for (uint i = 0u; i <= lane_id; i++)
+        r = r && (scratch[subgroup_base + i] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static bool spv_emulate_exclusive_scan_logical_and(__local uint* scratch, bool val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // True only if the lane_id slots before this lane's slot are all nonzero; subgroup_size is not read.
+    scratch[linear_id] = val ? 1u : 0u; // bool stored as 1u / 0u
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    bool r = true; // result for lane 0, whose loop runs zero times
+    for (uint i = 0u; i < lane_id; i++)
+        r = r && (scratch[subgroup_base + i] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static bool spv_emulate_clustered_reduce_logical_and(__local uint* scratch, bool val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // True only if all cluster_size slots of lane_id's cluster are nonzero; subgroup_size is not read.
+    scratch[linear_id] = val ? 1u : 0u; // bool stored as 1u / 0u
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size; cluster_size == 0 would divide by zero
+    bool r = true;
+    for (uint i = 0u; i < cluster_size; i++)
+        r = r && (scratch[subgroup_base + cluster_base_in_sg + i] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static bool spv_emulate_reduce_logical_or(__local uint* scratch, bool val, uint linear_id, uint subgroup_base, uint subgroup_size) { // True if any slot subgroup_base .. subgroup_base+subgroup_size-1 is nonzero.
+    scratch[linear_id] = val ? 1u : 0u; // bool stored as 1u / 0u
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    bool r = false;
+    for (uint i = 0u; i < subgroup_size; i++)
+        r = r || (scratch[subgroup_base + i] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static bool spv_emulate_inclusive_scan_logical_or(__local uint* scratch, bool val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // True if any of slots subgroup_base .. subgroup_base+lane_id is nonzero; subgroup_size is not read.
+    scratch[linear_id] = val ? 1u : 0u; // bool stored as 1u / 0u
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    bool r = false;
+    for (uint i = 0u; i <= lane_id; i++)
+        r = r || (scratch[subgroup_base + i] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static bool spv_emulate_exclusive_scan_logical_or(__local uint* scratch, bool val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // True if any of the lane_id slots before this lane's slot is nonzero; subgroup_size is not read.
+    scratch[linear_id] = val ? 1u : 0u; // bool stored as 1u / 0u
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    bool r = false; // result for lane 0, whose loop runs zero times
+    for (uint i = 0u; i < lane_id; i++)
+        r = r || (scratch[subgroup_base + i] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static bool spv_emulate_clustered_reduce_logical_or(__local uint* scratch, bool val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // True if any of the cluster_size slots of lane_id's cluster is nonzero; subgroup_size is not read.
+    scratch[linear_id] = val ? 1u : 0u; // bool stored as 1u / 0u
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size; cluster_size == 0 would divide by zero
+    bool r = false;
+    for (uint i = 0u; i < cluster_size; i++)
+        r = r || (scratch[subgroup_base + cluster_base_in_sg + i] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static bool spv_emulate_reduce_logical_xor(__local uint* scratch, bool val, uint linear_id, uint subgroup_base, uint subgroup_size) { // True if an odd number of slots subgroup_base .. subgroup_base+subgroup_size-1 are nonzero.
+    scratch[linear_id] = val ? 1u : 0u; // bool stored as 1u / 0u
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    bool r = false;
+    for (uint i = 0u; i < subgroup_size; i++)
+        r = r != (scratch[subgroup_base + i] != 0u); // != on two bools acts as logical xor
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static bool spv_emulate_inclusive_scan_logical_xor(__local uint* scratch, bool val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // True if an odd number of slots subgroup_base .. subgroup_base+lane_id are nonzero; subgroup_size is not read.
+    scratch[linear_id] = val ? 1u : 0u; // bool stored as 1u / 0u
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    bool r = false;
+    for (uint i = 0u; i <= lane_id; i++)
+        r = r != (scratch[subgroup_base + i] != 0u); // != on two bools acts as logical xor
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static bool spv_emulate_exclusive_scan_logical_xor(__local uint* scratch, bool val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // True if an odd number of the lane_id slots before this lane's slot are nonzero; subgroup_size is not read.
+    scratch[linear_id] = val ? 1u : 0u; // bool stored as 1u / 0u
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    bool r = false; // result for lane 0, whose loop runs zero times
+    for (uint i = 0u; i < lane_id; i++)
+        r = r != (scratch[subgroup_base + i] != 0u); // != on two bools acts as logical xor
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static bool spv_emulate_clustered_reduce_logical_xor(__local uint* scratch, bool val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // True if an odd number of the cluster_size slots of lane_id's cluster are nonzero; subgroup_size is not read.
+    scratch[linear_id] = val ? 1u : 0u; // bool stored as 1u / 0u
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size; cluster_size == 0 would divide by zero
+    bool r = false;
+    for (uint i = 0u; i < cluster_size; i++)
+        r = r != (scratch[subgroup_base + cluster_base_in_sg + i] != 0u); // != on two bools acts as logical xor
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static bool spv_emulate_inverse_ballot(uint4 ballot, uint lane_id) { // Tests bit number lane_id of the 128-bit mask held in ballot.x (bits 0-31) through ballot.w.
+    uint word = lane_id / 32u; // which 32-bit component
+    uint bit = lane_id % 32u; // bit position inside that component
+    uint v = (word == 0u) ? ballot.x : (word == 1u) ? ballot.y : (word == 2u) ? ballot.z : ballot.w; // any word >= 3 selects ballot.w
+    return (v & (1u << bit)) != 0u;
+}
+
+static bool spv_emulate_ballot_bit_extract(uint4 ballot, uint index) { // Tests bit number index of the 128-bit mask held in ballot.x (bits 0-31) through ballot.w.
+    uint word = index / 32u; // which 32-bit component
+    uint bit = index % 32u; // bit position inside that component
+    uint v = (word == 0u) ? ballot.x : (word == 1u) ? ballot.y : (word == 2u) ? ballot.z : ballot.w; // any word >= 3 selects ballot.w
+    return (v & (1u << bit)) != 0u;
+}
+
+static uint spv_popcount4(uint4 v) { // Total number of set bits across the four components of v.
+    return popcount(v.x) + popcount(v.y) + popcount(v.z) + popcount(v.w);
+}
+
+static uint spv_emulate_ballot_bit_count(uint4 ballot) { // Number of set bits in the whole ballot; forwards to spv_popcount4.
+    return spv_popcount4(ballot);
+}
+
+static uint spv_emulate_ballot_inclusive_bit_count(uint4 ballot, uint lane_id) { // Counts set ballot bits at positions 0 .. lane_id, inclusive.
+    uint4 masked = ballot;
+    uint word = lane_id / 32u; // component that holds bit lane_id
+    uint bit = lane_id % 32u;
+    uint mask = (bit == 31u) ? 0xFFFFFFFFu : ((1u << (bit + 1u)) - 1u); // low bit+1 bits set; bit 31 is special-cased so the shift amount never reaches 32
+    if (word == 0u) { masked.x &= mask; masked.y = 0u; masked.z = 0u; masked.w = 0u; } // lower components stay whole, the selected one is masked, higher ones are cleared
+    else if (word == 1u) { masked.y &= mask; masked.z = 0u; masked.w = 0u; }
+    else if (word == 2u) { masked.z &= mask; masked.w = 0u; }
+    else { masked.w &= mask; }
+    return spv_popcount4(masked);
+}
+
+static uint spv_emulate_ballot_exclusive_bit_count(uint4 ballot, uint lane_id) { // Counts set ballot bits at positions strictly below lane_id.
+    if (lane_id == 0u) return 0u; // nothing lies below bit 0; also avoids lane_id - 1u wrapping around
+    return spv_emulate_ballot_inclusive_bit_count(ballot, lane_id - 1u);
+}
+
+static uint spv_emulate_ballot_find_lsb(uint4 ballot) { // Index (0..127) of the lowest set bit, scanning ballot.x (bits 0-31) up to ballot.w (bits 96-127); ~0u when no bit is set.
+    if (ballot.x != 0u) return 31u - (uint)clz(ballot.x & (0u - ballot.x)); // x & (0 - x) keeps only the lowest set bit 2^k; clz(2^k) is 31 - k, so 31 - clz gives k
+    if (ballot.y != 0u) return 63u - (uint)clz(ballot.y & (0u - ballot.y)); // 32 + (31 - clz)
+    if (ballot.z != 0u) return 95u - (uint)clz(ballot.z & (0u - ballot.z)); // 64 + (31 - clz)
+    if (ballot.w != 0u) return 127u - (uint)clz(ballot.w & (0u - ballot.w)); // 96 + (31 - clz)
+    return ~0u; // NOTE(review): if this text is generated reference output, the emitting code needs the same correction - confirm
+}
+
+static uint spv_emulate_ballot_find_msb(uint4 ballot) { // Index (0..127) of the highest set bit, scanning ballot.w (bits 96-127) down to ballot.x; ~0u when no bit is set.
+    if (ballot.w != 0u) return 127u - (uint)clz(ballot.w); // 96 + (31 - leading zero count)
+    if (ballot.z != 0u) return 95u - (uint)clz(ballot.z);
+    if (ballot.y != 0u) return 63u - (uint)clz(ballot.y);
+    if (ballot.x != 0u) return 31u - (uint)clz(ballot.x);
+    return ~0u;
+}
+
+__attribute__((reqd_work_group_size(256, 1, 1))) // work-group size is fixed at 256 x 1 x 1
+__kernel void comp_main(__global SSBO* _12) // Kernel entry: applies four emulated subgroup shuffles to _12->FragColor and stores their sum back.
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2)); // not read again in this kernel body
+    uint _spv_linear_workgroup_size = get_local_size(0) * get_local_size(1) * get_local_size(2); // work-items per work-group
+    uint _spv_linear_id = (get_local_id(2) * get_local_size(1) * get_local_size(0)) + (get_local_id(1) * get_local_size(0)) + get_local_id(0); // flattened local id, dimension 0 varying fastest
+    uint _spv_subgroup_size = 32u; // emulated subgroup width is the constant 32
+    uint _spv_lane_id = _spv_linear_id % 32u; // position within the emulated subgroup
+    uint _spv_subgroup_id = _spv_linear_id / 32u;
+    uint _spv_num_subgroups = _spv_linear_workgroup_size / 32u; // truncating division; not read again in this kernel body
+    uint _spv_subgroup_base = _spv_subgroup_id * 32u; // scratch index of this subgroup's lane 0
+    __local uint _spv_subgroup_scratch[256]; // one uint slot per work-item; 256 equals the required work-group size
+    float shuffled_1 = as_float(spv_emulate_shuffle_uint(_spv_subgroup_scratch, as_uint(_12->FragColor), 3u, _spv_linear_id, _spv_subgroup_base)); // float travels through the uint helper as raw bits
+    float xored_1 = as_float(spv_emulate_shuffle_xor_uint(_spv_subgroup_scratch, as_uint(_12->FragColor), 1u, _spv_lane_id, _spv_linear_id, _spv_subgroup_base));
+    float up_1 = as_float(spv_emulate_shuffle_up_uint(_spv_subgroup_scratch, as_uint(_12->FragColor), 1u, _spv_lane_id, _spv_linear_id, _spv_subgroup_base));
+    float down_1 = as_float(spv_emulate_shuffle_down_uint(_spv_subgroup_scratch, as_uint(_12->FragColor), 1u, _spv_lane_id, _spv_linear_id, _spv_subgroup_size)); // last argument is the subgroup size here, not the base
+    _12->FragColor = ((shuffled_1 + xored_1) + up_1) + down_1; // every work-item stores to the same FragColor member
+}
+
diff --git a/reference/shaders-opencl-no-opt/comp/subgroups-vote.nocompat.vk.subgroups-emulate.comp b/reference/shaders-opencl-no-opt/comp/subgroups-vote.nocompat.vk.subgroups-emulate.comp
new file mode 100644
index 000000000..2f3475ed6
--- /dev/null
+++ b/reference/shaders-opencl-no-opt/comp/subgroups-vote.nocompat.vk.subgroups-emulate.comp
@@ -0,0 +1,1098 @@
+// Generated from SPIR-V by SPIRV-Cross (OpenCL backend)
+
+
+struct SSBO // record with one float member and one uint member
+{
+    float FragColor;
+    uint udat;
+};
+
+typedef struct SSBO SSBO; // lets later code name the type without the struct keyword
+
+static uint spv_emulate_broadcast_uint(__local uint* scratch, uint val, uint src_lane, uint linear_id, uint subgroup_base) { // Returns the value stored at scratch[subgroup_base + src_lane].
+    scratch[linear_id] = val; // publish this work-item's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    uint r = scratch[subgroup_base + src_lane]; // src_lane is not range-checked here
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static uint spv_emulate_broadcast_first_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base) { // Returns the value stored at scratch[subgroup_base], i.e. the subgroup's first slot.
+    scratch[linear_id] = val; // publish this work-item's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    uint r = scratch[subgroup_base];
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static uint spv_emulate_shuffle_uint(__local uint* scratch, uint val, uint index, uint linear_id, uint subgroup_base) { // Returns the value stored at scratch[subgroup_base + index].
+    scratch[linear_id] = val; // publish this work-item's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    uint r = scratch[subgroup_base + index]; // index is not range-checked here
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static uint spv_emulate_shuffle_xor_uint(__local uint* scratch, uint val, uint mask, uint lane_id, uint linear_id, uint subgroup_base) { // Returns the value stored at scratch[subgroup_base + (lane_id ^ mask)].
+    scratch[linear_id] = val; // publish this work-item's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    uint r = scratch[subgroup_base + (lane_id ^ mask)]; // the xor result is not range-checked here
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static uint spv_emulate_shuffle_up_uint(__local uint* scratch, uint val, uint delta, uint lane_id, uint linear_id, uint subgroup_base) { // Returns the value stored delta slots below this work-item's slot; subgroup_base is not read.
+    scratch[linear_id] = val; // publish this work-item's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    uint r = (lane_id >= delta) ? scratch[linear_id - delta] : val; // falls back to the caller's own val when the source lane would be below lane 0
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static uint spv_emulate_shuffle_down_uint(__local uint* scratch, uint val, uint delta, uint lane_id, uint linear_id, uint subgroup_size) { // Returns the value stored delta slots above this work-item's slot.
+    scratch[linear_id] = val; // publish this work-item's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    uint r = (lane_id + delta < subgroup_size) ? scratch[linear_id + delta] : val; // falls back to the caller's own val when the source lane would be past the subgroup end
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static uint spv_emulate_rotate_uint(__local uint* scratch, uint val, uint delta, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Returns the value of the lane delta positions ahead, wrapping around at subgroup_size.
+    scratch[linear_id] = val; // publish this work-item's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    uint r = scratch[subgroup_base + ((lane_id + delta) % subgroup_size)]; // modulo keeps the source lane inside the subgroup
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static uint spv_emulate_clustered_rotate_uint(__local uint* scratch, uint val, uint delta, uint lane_id, uint linear_id, uint subgroup_base, uint cluster_size) { // Like a rotate, but the wrap-around happens inside the cluster_size-wide cluster containing lane_id.
+    scratch[linear_id] = val; // publish this work-item's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    uint cluster_base = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size; cluster_size == 0 would divide by zero
+    uint r = scratch[subgroup_base + cluster_base + ((lane_id - cluster_base + delta) % cluster_size)]; // offset within the cluster, advanced by delta, modulo cluster_size
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static bool spv_emulate_all(__local uint* scratch, bool predicate, uint linear_id, uint subgroup_base, uint subgroup_size) { // True only if every slot subgroup_base .. subgroup_base+subgroup_size-1 holds a nonzero flag.
+    scratch[linear_id] = predicate ? 1u : 0u; // bool stored as 1u / 0u
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    bool r = true;
+    for (uint i = 0u; i < subgroup_size; i++)
+        r = r && (scratch[subgroup_base + i] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static bool spv_emulate_any(__local uint* scratch, bool predicate, uint linear_id, uint subgroup_base, uint subgroup_size) { // True if at least one slot subgroup_base .. subgroup_base+subgroup_size-1 holds a nonzero flag.
+    scratch[linear_id] = predicate ? 1u : 0u; // bool stored as 1u / 0u
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    bool r = false;
+    for (uint i = 0u; i < subgroup_size; i++)
+        r = r || (scratch[subgroup_base + i] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static bool spv_emulate_all_equal_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) { // True only if every slot in the subgroup range equals the first slot.
+    scratch[linear_id] = val; // publish this work-item's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    uint first = scratch[subgroup_base]; // reference value the others are compared against
+    bool r = true;
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r && (scratch[subgroup_base + i] == first);
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static uint4 spv_emulate_ballot(__local uint* scratch, bool predicate, uint linear_id, uint subgroup_base, uint subgroup_size) { // Builds a 128-bit mask with bit i set when slot subgroup_base + i holds a nonzero flag.
+    scratch[linear_id] = predicate ? 1u : 0u; // bool stored as 1u / 0u
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    uint4 r = (uint4)(0u);
+    for (uint i = 0u; i < subgroup_size; i++) {
+        if (scratch[subgroup_base + i] != 0u) {
+            uint word = i / 32u; // component index: x, y, z, w
+            uint bit = i % 32u; // bit inside that component
+            if (word == 0u) r.x |= (1u << bit);
+            else if (word == 1u) r.y |= (1u << bit);
+            else if (word == 2u) r.z |= (1u << bit);
+            else r.w |= (1u << bit); // any word >= 3 lands in w
+        }
+    }
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static uint4 spv_subgroup_eq_mask(uint lane_id) { // 128-bit mask with only bit lane_id set.
+    uint4 r = (uint4)(0u);
+    uint word = lane_id / 32u; // component index: x, y, z, w
+    uint bit = lane_id % 32u; // bit inside that component
+    if (word == 0u) r.x = (1u << bit);
+    else if (word == 1u) r.y = (1u << bit);
+    else if (word == 2u) r.z = (1u << bit);
+    else r.w = (1u << bit); // any word >= 3 lands in w
+    return r;
+}
+
+static uint4 spv_subgroup_ge_mask(uint lane_id, uint subgroup_size) { // 128-bit mask with bits lane_id .. subgroup_size-1 set; all zero when lane_id >= subgroup_size.
+    uint4 r = (uint4)(0u);
+    for (uint i = lane_id; i < subgroup_size; i++) { // one iteration per bit being set
+        uint word = i / 32u; // component index: x, y, z, w
+        uint bit = i % 32u; // bit inside that component
+        if (word == 0u) r.x |= (1u << bit);
+        else if (word == 1u) r.y |= (1u << bit);
+        else if (word == 2u) r.z |= (1u << bit);
+        else r.w |= (1u << bit); // any word >= 3 lands in w
+    }
+    return r;
+}
+
+static uint4 spv_subgroup_gt_mask(uint lane_id, uint subgroup_size) { // Mask of bits strictly above lane_id, up to subgroup_size-1.
+    return spv_subgroup_ge_mask(lane_id + 1u, subgroup_size); // "greater than lane" expressed as "greater or equal to lane + 1"
+}
+
+static uint4 spv_subgroup_le_mask(uint lane_id, uint subgroup_size) { // Mask of bits 0 .. lane_id; the subgroup_size parameter is not read.
+    return spv_subgroup_ge_mask(0u, lane_id + 1u); // reuses the ge helper with range [0, lane_id + 1)
+}
+
+static uint4 spv_subgroup_lt_mask(uint lane_id) { // Mask of bits strictly below lane_id.
+    if (lane_id == 0u) return (uint4)(0u); // no bits lie below lane 0
+    return spv_subgroup_ge_mask(0u, lane_id); // reuses the ge helper with range [0, lane_id)
+}
+
+static uint spv_emulate_reduce_add_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Sums scratch slots subgroup_base .. subgroup_base+subgroup_size-1.
+    scratch[linear_id] = val; // publish this work-item's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r + scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static uint spv_emulate_inclusive_scan_add_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Sums slots subgroup_base .. subgroup_base+lane_id (own slot included); subgroup_size is not read.
+    scratch[linear_id] = val; // publish this work-item's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r + scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static uint spv_emulate_exclusive_scan_add_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Sums the lane_id slots before this lane's slot; subgroup_size is not read.
+    scratch[linear_id] = val; // publish this work-item's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    uint r = 0u; // additive identity: result for lane 0, whose loop runs zero times
+    for (uint i = 0u; i < lane_id; i++)
+        r = r + scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_add_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Sums the cluster_size slots of the cluster that contains lane_id; subgroup_size is not read.
+    scratch[linear_id] = val; // publish this work-item's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size; cluster_size == 0 would divide by zero
+    uint r = scratch[subgroup_base + cluster_base_in_sg];
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r + scratch[subgroup_base + cluster_base_in_sg + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static uint spv_emulate_reduce_mul_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Multiplies together scratch slots subgroup_base .. subgroup_base+subgroup_size-1.
+    scratch[linear_id] = val; // publish this work-item's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r * scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static uint spv_emulate_inclusive_scan_mul_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Product of slots subgroup_base .. subgroup_base+lane_id (own slot included); subgroup_size is not read.
+    scratch[linear_id] = val; // publish this work-item's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r * scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static uint spv_emulate_exclusive_scan_mul_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Product of the lane_id slots before this lane's slot; subgroup_size is not read.
+    scratch[linear_id] = val; // publish this work-item's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    uint r = 1u; // multiplicative identity: result for lane 0, whose loop runs zero times
+    for (uint i = 0u; i < lane_id; i++)
+        r = r * scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_mul_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Product of the cluster_size slots of the cluster that contains lane_id; subgroup_size is not read.
+    scratch[linear_id] = val; // publish this work-item's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size; cluster_size == 0 would divide by zero
+    uint r = scratch[subgroup_base + cluster_base_in_sg];
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r * scratch[subgroup_base + cluster_base_in_sg + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static uint spv_emulate_reduce_min_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Smallest value among scratch slots subgroup_base .. subgroup_base+subgroup_size-1.
+    scratch[linear_id] = val; // publish this work-item's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = min(r, scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static uint spv_emulate_inclusive_scan_min_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Smallest value among slots subgroup_base .. subgroup_base+lane_id (own slot included); subgroup_size is not read.
+    scratch[linear_id] = val; // publish this work-item's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i <= lane_id; i++)
+        r = min(r, scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static uint spv_emulate_exclusive_scan_min_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Smallest value among the lane_id slots before this lane's slot; subgroup_size is not read.
+    scratch[linear_id] = val; // publish this work-item's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    uint r = UINT_MAX; // starting value; also the result for lane 0, whose loop runs zero times
+    for (uint i = 0u; i < lane_id; i++)
+        r = min(r, scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_min_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Smallest value among the cluster_size slots of lane_id's cluster; subgroup_size is not read.
+    scratch[linear_id] = val; // publish this work-item's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size; cluster_size == 0 would divide by zero
+    uint r = scratch[subgroup_base + cluster_base_in_sg];
+    for (uint i = 1u; i < cluster_size; i++)
+        r = min(r, scratch[subgroup_base + cluster_base_in_sg + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static uint spv_emulate_reduce_max_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Largest value among scratch slots subgroup_base .. subgroup_base+subgroup_size-1.
+    scratch[linear_id] = val; // publish this work-item's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = max(r, scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static uint spv_emulate_inclusive_scan_max_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Largest value among slots subgroup_base .. subgroup_base+lane_id (own slot included); subgroup_size is not read.
+    scratch[linear_id] = val; // publish this work-item's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i <= lane_id; i++)
+        r = max(r, scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static uint spv_emulate_exclusive_scan_max_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Largest value among the lane_id slots before this lane's slot; subgroup_size is not read.
+    scratch[linear_id] = val; // publish this work-item's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    uint r = 0u; // smallest uint as starting value; also the result for lane 0, whose loop runs zero times
+    for (uint i = 0u; i < lane_id; i++)
+        r = max(r, scratch[subgroup_base + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_max_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Largest value among the cluster_size slots of lane_id's cluster; subgroup_size is not read.
+    scratch[linear_id] = val; // publish this work-item's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size; cluster_size == 0 would divide by zero
+    uint r = scratch[subgroup_base + cluster_base_in_sg];
+    for (uint i = 1u; i < cluster_size; i++)
+        r = max(r, scratch[subgroup_base + cluster_base_in_sg + i]);
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static uint spv_emulate_reduce_and_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Bitwise AND of scratch slots subgroup_base .. subgroup_base+subgroup_size-1.
+    scratch[linear_id] = val; // publish this work-item's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r & scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static uint spv_emulate_inclusive_scan_and_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Bitwise AND of slots subgroup_base .. subgroup_base+lane_id (own slot included); subgroup_size is not read.
+    scratch[linear_id] = val; // publish this work-item's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r & scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static uint spv_emulate_exclusive_scan_and_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Bitwise AND of the lane_id slots before this lane's slot; subgroup_size is not read.
+    scratch[linear_id] = val; // publish this work-item's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    uint r = 0xFFFFFFFFu; // all ones, the AND identity; also the result for lane 0, whose loop runs zero times
+    for (uint i = 0u; i < lane_id; i++)
+        r = r & scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_and_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Bitwise AND of the cluster_size slots of lane_id's cluster; subgroup_size is not read.
+    scratch[linear_id] = val; // publish this work-item's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size; cluster_size == 0 would divide by zero
+    uint r = scratch[subgroup_base + cluster_base_in_sg];
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r & scratch[subgroup_base + cluster_base_in_sg + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static uint spv_emulate_reduce_or_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Bitwise OR of scratch slots subgroup_base .. subgroup_base+subgroup_size-1.
+    scratch[linear_id] = val; // publish this work-item's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r | scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static uint spv_emulate_inclusive_scan_or_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Bitwise OR of slots subgroup_base .. subgroup_base+lane_id (own slot included); subgroup_size is not read.
+    scratch[linear_id] = val; // publish this work-item's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r | scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static uint spv_emulate_exclusive_scan_or_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Bitwise OR of the lane_id slots before this lane's slot; subgroup_size is not read.
+    scratch[linear_id] = val; // publish this work-item's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    uint r = 0u; // OR identity; also the result for lane 0, whose loop runs zero times
+    for (uint i = 0u; i < lane_id; i++)
+        r = r | scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_or_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Bitwise OR of the cluster_size slots of lane_id's cluster; subgroup_size is not read.
+    scratch[linear_id] = val; // publish this work-item's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size; cluster_size == 0 would divide by zero
+    uint r = scratch[subgroup_base + cluster_base_in_sg];
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r | scratch[subgroup_base + cluster_base_in_sg + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static uint spv_emulate_reduce_xor_uint(__local uint* scratch, uint val, uint linear_id, uint subgroup_base, uint subgroup_size) { // Bitwise XOR of scratch slots subgroup_base .. subgroup_base+subgroup_size-1.
+    scratch[linear_id] = val; // publish this work-item's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i < subgroup_size; i++)
+        r = r ^ scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static uint spv_emulate_inclusive_scan_xor_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Bitwise XOR of slots subgroup_base .. subgroup_base+lane_id (own slot included); subgroup_size is not read.
+    scratch[linear_id] = val; // publish this work-item's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    uint r = scratch[subgroup_base];
+    for (uint i = 1u; i <= lane_id; i++)
+        r = r ^ scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static uint spv_emulate_exclusive_scan_xor_uint(__local uint* scratch, uint val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Bitwise XOR of the lane_id slots before this lane's slot; subgroup_size is not read.
+    scratch[linear_id] = val; // publish this work-item's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    uint r = 0u; // XOR identity; also the result for lane 0, whose loop runs zero times
+    for (uint i = 0u; i < lane_id; i++)
+        r = r ^ scratch[subgroup_base + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+static uint spv_emulate_clustered_reduce_xor_uint(__local uint* scratch, uint val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) { // Bitwise XOR of the cluster_size slots of lane_id's cluster; subgroup_size is not read.
+    scratch[linear_id] = val; // publish this work-item's value
+    barrier(CLK_LOCAL_MEM_FENCE); // every work-item has stored before any slot is read
+    uint cluster_base_in_sg = (lane_id / cluster_size) * cluster_size; // lane_id rounded down to a multiple of cluster_size; cluster_size == 0 would divide by zero
+    uint r = scratch[subgroup_base + cluster_base_in_sg];
+    for (uint i = 1u; i < cluster_size; i++)
+        r = r ^ scratch[subgroup_base + cluster_base_in_sg + i];
+    barrier(CLK_LOCAL_MEM_FENCE); // presumably protects scratch until all reads finish (confirm)
+    return r;
+}
+
+// Emulated subgroup sum reduction (int): sum of the operands of all subgroup_size lanes starting at subgroup_base.
+static int spv_emulate_reduce_add_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // bit-cast: scratch is typed uint
+    barrier(CLK_LOCAL_MEM_FENCE); // all operands stored before any is read
+    int acc = as_int(scratch[subgroup_base]);
+    for (uint lane = 1u; lane < subgroup_size; ++lane) acc += as_int(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return acc;
+}
+
+// Emulated inclusive sum scan (int): sum of the operands of lanes 0..lane_id of the caller's subgroup.
+static int spv_emulate_inclusive_scan_add_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // bit-cast: scratch is typed uint
+    barrier(CLK_LOCAL_MEM_FENCE); // all operands stored before any is read
+    int acc = as_int(scratch[subgroup_base]);
+    for (uint lane = 1u; lane <= lane_id; ++lane) acc += as_int(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return acc;
+}
+
+// Emulated exclusive sum scan (int): sum of the operands of lanes below lane_id; lane 0 gets the identity 0.
+static int spv_emulate_exclusive_scan_add_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // bit-cast: scratch is typed uint
+    barrier(CLK_LOCAL_MEM_FENCE); // all operands stored before any is read
+    int acc = 0;
+    for (uint lane = 0u; lane < lane_id; ++lane) acc += as_int(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return acc;
+}
+
+// Emulated clustered sum reduction (int): sums the operands of the cluster_size-aligned lane group containing lane_id.
+static int spv_emulate_clustered_reduce_add_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // bit-cast: scratch is typed uint
+    barrier(CLK_LOCAL_MEM_FENCE); // all operands stored before any is read
+    const uint first = subgroup_base + (lane_id / cluster_size) * cluster_size; // scratch slot of the cluster's first lane
+    int acc = as_int(scratch[first]);
+    for (uint k = 1u; k < cluster_size; ++k) acc += as_int(scratch[first + k]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return acc;
+}
+
+// Emulated subgroup product reduction (int): product of the operands of all subgroup_size lanes starting at subgroup_base.
+static int spv_emulate_reduce_mul_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // bit-cast: scratch is typed uint
+    barrier(CLK_LOCAL_MEM_FENCE); // all operands stored before any is read
+    int acc = as_int(scratch[subgroup_base]);
+    for (uint lane = 1u; lane < subgroup_size; ++lane) acc *= as_int(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return acc;
+}
+
+// Emulated inclusive product scan (int): product of the operands of lanes 0..lane_id of the caller's subgroup.
+static int spv_emulate_inclusive_scan_mul_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // bit-cast: scratch is typed uint
+    barrier(CLK_LOCAL_MEM_FENCE); // all operands stored before any is read
+    int acc = as_int(scratch[subgroup_base]);
+    for (uint lane = 1u; lane <= lane_id; ++lane) acc *= as_int(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return acc;
+}
+
+// Emulated exclusive product scan (int): product of the operands of lanes below lane_id; lane 0 gets the identity 1.
+static int spv_emulate_exclusive_scan_mul_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // bit-cast: scratch is typed uint
+    barrier(CLK_LOCAL_MEM_FENCE); // all operands stored before any is read
+    int acc = 1;
+    for (uint lane = 0u; lane < lane_id; ++lane) acc *= as_int(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return acc;
+}
+
+// Emulated clustered product reduction (int): multiplies the operands of the cluster_size-aligned lane group containing lane_id.
+static int spv_emulate_clustered_reduce_mul_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // bit-cast: scratch is typed uint
+    barrier(CLK_LOCAL_MEM_FENCE); // all operands stored before any is read
+    const uint first = subgroup_base + (lane_id / cluster_size) * cluster_size; // scratch slot of the cluster's first lane
+    int acc = as_int(scratch[first]);
+    for (uint k = 1u; k < cluster_size; ++k) acc *= as_int(scratch[first + k]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return acc;
+}
+
+// Emulated subgroup minimum reduction (int): smallest operand among all subgroup_size lanes starting at subgroup_base.
+static int spv_emulate_reduce_min_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // bit-cast: scratch is typed uint
+    barrier(CLK_LOCAL_MEM_FENCE); // all operands stored before any is read
+    int lowest = as_int(scratch[subgroup_base]);
+    for (uint lane = 1u; lane < subgroup_size; ++lane) lowest = min(lowest, as_int(scratch[subgroup_base + lane]));
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return lowest;
+}
+
+// Emulated inclusive minimum scan (int): smallest operand among lanes 0..lane_id of the caller's subgroup.
+static int spv_emulate_inclusive_scan_min_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // bit-cast: scratch is typed uint
+    barrier(CLK_LOCAL_MEM_FENCE); // all operands stored before any is read
+    int lowest = as_int(scratch[subgroup_base]);
+    for (uint lane = 1u; lane <= lane_id; ++lane) lowest = min(lowest, as_int(scratch[subgroup_base + lane]));
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return lowest;
+}
+
+// Emulated exclusive minimum scan (int): smallest operand among lanes below lane_id; lane 0 gets the identity INT_MAX.
+static int spv_emulate_exclusive_scan_min_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // bit-cast: scratch is typed uint
+    barrier(CLK_LOCAL_MEM_FENCE); // all operands stored before any is read
+    int lowest = INT_MAX;
+    for (uint lane = 0u; lane < lane_id; ++lane) lowest = min(lowest, as_int(scratch[subgroup_base + lane]));
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return lowest;
+}
+
+// Emulated clustered minimum reduction (int): smallest operand in the cluster_size-aligned lane group containing lane_id.
+static int spv_emulate_clustered_reduce_min_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // bit-cast: scratch is typed uint
+    barrier(CLK_LOCAL_MEM_FENCE); // all operands stored before any is read
+    const uint first = subgroup_base + (lane_id / cluster_size) * cluster_size; // scratch slot of the cluster's first lane
+    int lowest = as_int(scratch[first]);
+    for (uint k = 1u; k < cluster_size; ++k) lowest = min(lowest, as_int(scratch[first + k]));
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return lowest;
+}
+
+// Emulated subgroup maximum reduction (int): largest operand among all subgroup_size lanes starting at subgroup_base.
+static int spv_emulate_reduce_max_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // bit-cast: scratch is typed uint
+    barrier(CLK_LOCAL_MEM_FENCE); // all operands stored before any is read
+    int highest = as_int(scratch[subgroup_base]);
+    for (uint lane = 1u; lane < subgroup_size; ++lane) highest = max(highest, as_int(scratch[subgroup_base + lane]));
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return highest;
+}
+
+// Emulated inclusive maximum scan (int): largest operand among lanes 0..lane_id of the caller's subgroup.
+static int spv_emulate_inclusive_scan_max_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // bit-cast: scratch is typed uint
+    barrier(CLK_LOCAL_MEM_FENCE); // all operands stored before any is read
+    int highest = as_int(scratch[subgroup_base]);
+    for (uint lane = 1u; lane <= lane_id; ++lane) highest = max(highest, as_int(scratch[subgroup_base + lane]));
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return highest;
+}
+
+// Emulated exclusive maximum scan (int): largest operand among lanes below lane_id; lane 0 gets the identity INT_MIN.
+static int spv_emulate_exclusive_scan_max_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // bit-cast: scratch is typed uint
+    barrier(CLK_LOCAL_MEM_FENCE); // all operands stored before any is read
+    int highest = INT_MIN;
+    for (uint lane = 0u; lane < lane_id; ++lane) highest = max(highest, as_int(scratch[subgroup_base + lane]));
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return highest;
+}
+
+// Emulated clustered maximum reduction (int): largest operand in the cluster_size-aligned lane group containing lane_id.
+static int spv_emulate_clustered_reduce_max_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // bit-cast: scratch is typed uint
+    barrier(CLK_LOCAL_MEM_FENCE); // all operands stored before any is read
+    const uint first = subgroup_base + (lane_id / cluster_size) * cluster_size; // scratch slot of the cluster's first lane
+    int highest = as_int(scratch[first]);
+    for (uint k = 1u; k < cluster_size; ++k) highest = max(highest, as_int(scratch[first + k]));
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return highest;
+}
+
+// Emulated subgroup bitwise-AND reduction (int): AND of the operands of all subgroup_size lanes starting at subgroup_base.
+static int spv_emulate_reduce_and_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // bit-cast: scratch is typed uint
+    barrier(CLK_LOCAL_MEM_FENCE); // all operands stored before any is read
+    int acc = as_int(scratch[subgroup_base]);
+    for (uint lane = 1u; lane < subgroup_size; ++lane) acc &= as_int(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return acc;
+}
+
+// Emulated inclusive bitwise-AND scan (int): AND of the operands of lanes 0..lane_id of the caller's subgroup.
+static int spv_emulate_inclusive_scan_and_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // bit-cast: scratch is typed uint
+    barrier(CLK_LOCAL_MEM_FENCE); // all operands stored before any is read
+    int acc = as_int(scratch[subgroup_base]);
+    for (uint lane = 1u; lane <= lane_id; ++lane) acc &= as_int(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return acc;
+}
+
+// Emulated exclusive bitwise-AND scan (int): AND of the operands of lanes below lane_id; lane 0 gets the all-ones identity.
+static int spv_emulate_exclusive_scan_and_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // bit-cast: scratch is typed uint
+    barrier(CLK_LOCAL_MEM_FENCE); // all operands stored before any is read
+    int acc = ~0; // same bit pattern as 0xFFFFFFFF
+    for (uint lane = 0u; lane < lane_id; ++lane) acc &= as_int(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return acc;
+}
+
+// Emulated clustered bitwise-AND reduction (int): ANDs the operands of the cluster_size-aligned lane group containing lane_id.
+static int spv_emulate_clustered_reduce_and_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // bit-cast: scratch is typed uint
+    barrier(CLK_LOCAL_MEM_FENCE); // all operands stored before any is read
+    const uint first = subgroup_base + (lane_id / cluster_size) * cluster_size; // scratch slot of the cluster's first lane
+    int acc = as_int(scratch[first]);
+    for (uint k = 1u; k < cluster_size; ++k) acc &= as_int(scratch[first + k]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return acc;
+}
+
+// Emulated subgroup bitwise-OR reduction (int): OR of the operands of all subgroup_size lanes starting at subgroup_base.
+static int spv_emulate_reduce_or_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // bit-cast: scratch is typed uint
+    barrier(CLK_LOCAL_MEM_FENCE); // all operands stored before any is read
+    int acc = as_int(scratch[subgroup_base]);
+    for (uint lane = 1u; lane < subgroup_size; ++lane) acc |= as_int(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return acc;
+}
+
+// Emulated inclusive bitwise-OR scan (int): OR of the operands of lanes 0..lane_id of the caller's subgroup.
+static int spv_emulate_inclusive_scan_or_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // bit-cast: scratch is typed uint
+    barrier(CLK_LOCAL_MEM_FENCE); // all operands stored before any is read
+    int acc = as_int(scratch[subgroup_base]);
+    for (uint lane = 1u; lane <= lane_id; ++lane) acc |= as_int(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return acc;
+}
+
+// Emulated exclusive bitwise-OR scan (int): OR of the operands of lanes below lane_id; lane 0 gets the identity 0.
+static int spv_emulate_exclusive_scan_or_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // bit-cast: scratch is typed uint
+    barrier(CLK_LOCAL_MEM_FENCE); // all operands stored before any is read
+    int acc = 0;
+    for (uint lane = 0u; lane < lane_id; ++lane) acc |= as_int(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return acc;
+}
+
+// Emulated clustered bitwise-OR reduction (int): ORs the operands of the cluster_size-aligned lane group containing lane_id.
+static int spv_emulate_clustered_reduce_or_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // bit-cast: scratch is typed uint
+    barrier(CLK_LOCAL_MEM_FENCE); // all operands stored before any is read
+    const uint first = subgroup_base + (lane_id / cluster_size) * cluster_size; // scratch slot of the cluster's first lane
+    int acc = as_int(scratch[first]);
+    for (uint k = 1u; k < cluster_size; ++k) acc |= as_int(scratch[first + k]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return acc;
+}
+
+// Emulated subgroup bitwise-XOR reduction (int): XOR of the operands of all subgroup_size lanes starting at subgroup_base.
+static int spv_emulate_reduce_xor_int(__local uint* scratch, int val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // bit-cast: scratch is typed uint
+    barrier(CLK_LOCAL_MEM_FENCE); // all operands stored before any is read
+    int acc = as_int(scratch[subgroup_base]);
+    for (uint lane = 1u; lane < subgroup_size; ++lane) acc ^= as_int(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return acc;
+}
+
+// Emulated inclusive bitwise-XOR scan (int): XOR of the operands of lanes 0..lane_id of the caller's subgroup.
+static int spv_emulate_inclusive_scan_xor_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // bit-cast: scratch is typed uint
+    barrier(CLK_LOCAL_MEM_FENCE); // all operands stored before any is read
+    int acc = as_int(scratch[subgroup_base]);
+    for (uint lane = 1u; lane <= lane_id; ++lane) acc ^= as_int(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return acc;
+}
+
+// Emulated exclusive bitwise-XOR scan (int): XOR of the operands of lanes below lane_id; lane 0 gets the identity 0.
+static int spv_emulate_exclusive_scan_xor_int(__local uint* scratch, int val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // bit-cast: scratch is typed uint
+    barrier(CLK_LOCAL_MEM_FENCE); // all operands stored before any is read
+    int acc = 0;
+    for (uint lane = 0u; lane < lane_id; ++lane) acc ^= as_int(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return acc;
+}
+
+// Emulated clustered bitwise-XOR reduction (int): XORs the operands of the cluster_size-aligned lane group containing lane_id.
+static int spv_emulate_clustered_reduce_xor_int(__local uint* scratch, int val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // bit-cast: scratch is typed uint
+    barrier(CLK_LOCAL_MEM_FENCE); // all operands stored before any is read
+    const uint first = subgroup_base + (lane_id / cluster_size) * cluster_size; // scratch slot of the cluster's first lane
+    int acc = as_int(scratch[first]);
+    for (uint k = 1u; k < cluster_size; ++k) acc ^= as_int(scratch[first + k]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return acc;
+}
+
+// Emulated subgroup sum reduction (float): operands are added in lane order, starting from the lane at subgroup_base.
+static float spv_emulate_reduce_add_float(__local uint* scratch, float val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // bit-cast: scratch is typed uint
+    barrier(CLK_LOCAL_MEM_FENCE); // all operands stored before any is read
+    float acc = as_float(scratch[subgroup_base]);
+    for (uint lane = 1u; lane < subgroup_size; ++lane) acc += as_float(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return acc;
+}
+
+// Emulated inclusive sum scan (float): adds the operands of lanes 0..lane_id of the caller's subgroup in lane order.
+static float spv_emulate_inclusive_scan_add_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // bit-cast: scratch is typed uint
+    barrier(CLK_LOCAL_MEM_FENCE); // all operands stored before any is read
+    float acc = as_float(scratch[subgroup_base]);
+    for (uint lane = 1u; lane <= lane_id; ++lane) acc += as_float(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return acc;
+}
+
+// Emulated exclusive sum scan (float): adds the operands of lanes below lane_id; lane 0 gets the identity 0.0f.
+static float spv_emulate_exclusive_scan_add_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // bit-cast: scratch is typed uint
+    barrier(CLK_LOCAL_MEM_FENCE); // all operands stored before any is read
+    float acc = 0.0f;
+    for (uint lane = 0u; lane < lane_id; ++lane) acc += as_float(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return acc;
+}
+
+// Emulated clustered sum reduction (float): adds the operands of the cluster_size-aligned lane group containing lane_id.
+static float spv_emulate_clustered_reduce_add_float(__local uint* scratch, float val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // bit-cast: scratch is typed uint
+    barrier(CLK_LOCAL_MEM_FENCE); // all operands stored before any is read
+    const uint first = subgroup_base + (lane_id / cluster_size) * cluster_size; // scratch slot of the cluster's first lane
+    float acc = as_float(scratch[first]);
+    for (uint k = 1u; k < cluster_size; ++k) acc += as_float(scratch[first + k]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return acc;
+}
+
+// Emulated subgroup product reduction (float): operands are multiplied in lane order, starting from the lane at subgroup_base.
+static float spv_emulate_reduce_mul_float(__local uint* scratch, float val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // bit-cast: scratch is typed uint
+    barrier(CLK_LOCAL_MEM_FENCE); // all operands stored before any is read
+    float acc = as_float(scratch[subgroup_base]);
+    for (uint lane = 1u; lane < subgroup_size; ++lane) acc *= as_float(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return acc;
+}
+
+// Emulated inclusive product scan (float): multiplies the operands of lanes 0..lane_id of the caller's subgroup in lane order.
+static float spv_emulate_inclusive_scan_mul_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // bit-cast: scratch is typed uint
+    barrier(CLK_LOCAL_MEM_FENCE); // all operands stored before any is read
+    float acc = as_float(scratch[subgroup_base]);
+    for (uint lane = 1u; lane <= lane_id; ++lane) acc *= as_float(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return acc;
+}
+
+// Emulated exclusive product scan (float): multiplies the operands of lanes below lane_id; lane 0 gets the identity 1.0f.
+static float spv_emulate_exclusive_scan_mul_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // bit-cast: scratch is typed uint
+    barrier(CLK_LOCAL_MEM_FENCE); // all operands stored before any is read
+    float acc = 1.0f;
+    for (uint lane = 0u; lane < lane_id; ++lane) acc *= as_float(scratch[subgroup_base + lane]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return acc;
+}
+
+// Emulated clustered product reduction (float): multiplies the operands of the cluster_size-aligned lane group containing lane_id.
+static float spv_emulate_clustered_reduce_mul_float(__local uint* scratch, float val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // bit-cast: scratch is typed uint
+    barrier(CLK_LOCAL_MEM_FENCE); // all operands stored before any is read
+    const uint first = subgroup_base + (lane_id / cluster_size) * cluster_size; // scratch slot of the cluster's first lane
+    float acc = as_float(scratch[first]);
+    for (uint k = 1u; k < cluster_size; ++k) acc *= as_float(scratch[first + k]);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return acc;
+}
+
+// Emulated subgroup minimum reduction (float): fmin over the operands of all subgroup_size lanes starting at subgroup_base.
+static float spv_emulate_reduce_min_float(__local uint* scratch, float val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // bit-cast: scratch is typed uint
+    barrier(CLK_LOCAL_MEM_FENCE); // all operands stored before any is read
+    float lowest = as_float(scratch[subgroup_base]);
+    for (uint lane = 1u; lane < subgroup_size; ++lane) lowest = fmin(lowest, as_float(scratch[subgroup_base + lane]));
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return lowest;
+}
+
+// Emulated inclusive minimum scan (float): fmin over the operands of lanes 0..lane_id of the caller's subgroup.
+static float spv_emulate_inclusive_scan_min_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // bit-cast: scratch is typed uint
+    barrier(CLK_LOCAL_MEM_FENCE); // all operands stored before any is read
+    float lowest = as_float(scratch[subgroup_base]);
+    for (uint lane = 1u; lane <= lane_id; ++lane) lowest = fmin(lowest, as_float(scratch[subgroup_base + lane]));
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return lowest;
+}
+
+// Emulated exclusive minimum scan (float): fmin over the operands of lanes below lane_id; lane 0 gets the identity +INFINITY.
+static float spv_emulate_exclusive_scan_min_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // bit-cast: scratch is typed uint
+    barrier(CLK_LOCAL_MEM_FENCE); // all operands stored before any is read
+    float lowest = INFINITY;
+    for (uint lane = 0u; lane < lane_id; ++lane) lowest = fmin(lowest, as_float(scratch[subgroup_base + lane]));
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return lowest;
+}
+
+// Emulated clustered minimum reduction (float): fmin over the cluster_size-aligned lane group containing lane_id.
+static float spv_emulate_clustered_reduce_min_float(__local uint* scratch, float val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // bit-cast: scratch is typed uint
+    barrier(CLK_LOCAL_MEM_FENCE); // all operands stored before any is read
+    const uint first = subgroup_base + (lane_id / cluster_size) * cluster_size; // scratch slot of the cluster's first lane
+    float lowest = as_float(scratch[first]);
+    for (uint k = 1u; k < cluster_size; ++k) lowest = fmin(lowest, as_float(scratch[first + k]));
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return lowest;
+}
+
+// Emulated subgroup maximum reduction (float): fmax over the operands of all subgroup_size lanes starting at subgroup_base.
+static float spv_emulate_reduce_max_float(__local uint* scratch, float val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // bit-cast: scratch is typed uint
+    barrier(CLK_LOCAL_MEM_FENCE); // all operands stored before any is read
+    float highest = as_float(scratch[subgroup_base]);
+    for (uint lane = 1u; lane < subgroup_size; ++lane) highest = fmax(highest, as_float(scratch[subgroup_base + lane]));
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return highest;
+}
+
+// Emulated inclusive maximum scan (float): fmax over the operands of lanes 0..lane_id of the caller's subgroup.
+static float spv_emulate_inclusive_scan_max_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // bit-cast: scratch is typed uint
+    barrier(CLK_LOCAL_MEM_FENCE); // all operands stored before any is read
+    float highest = as_float(scratch[subgroup_base]);
+    for (uint lane = 1u; lane <= lane_id; ++lane) highest = fmax(highest, as_float(scratch[subgroup_base + lane]));
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return highest;
+}
+
+// Emulated exclusive maximum scan (float): fmax over the operands of lanes below lane_id; lane 0 gets the identity -INFINITY.
+static float spv_emulate_exclusive_scan_max_float(__local uint* scratch, float val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // bit-cast: scratch is typed uint
+    barrier(CLK_LOCAL_MEM_FENCE); // all operands stored before any is read
+    float highest = -INFINITY;
+    for (uint lane = 0u; lane < lane_id; ++lane) highest = fmax(highest, as_float(scratch[subgroup_base + lane]));
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return highest;
+}
+
+// Emulated clustered maximum reduction (float): fmax over the cluster_size-aligned lane group containing lane_id.
+static float spv_emulate_clustered_reduce_max_float(__local uint* scratch, float val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = as_uint(val); // bit-cast: scratch is typed uint
+    barrier(CLK_LOCAL_MEM_FENCE); // all operands stored before any is read
+    const uint first = subgroup_base + (lane_id / cluster_size) * cluster_size; // scratch slot of the cluster's first lane
+    float highest = as_float(scratch[first]);
+    for (uint k = 1u; k < cluster_size; ++k) highest = fmax(highest, as_float(scratch[first + k]));
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return highest;
+}
+
+// Emulated subgroup logical-AND reduction: true only when all subgroup_size lanes starting at subgroup_base stored a true flag.
+static bool spv_emulate_reduce_logical_and(__local uint* scratch, bool val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val ? 1u : 0u; // flags travel through scratch as 0/1
+    barrier(CLK_LOCAL_MEM_FENCE); // all flags stored before any is read
+    bool acc = true;
+    for (uint lane = 0u; lane < subgroup_size; ++lane) acc &= (scratch[subgroup_base + lane] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return acc;
+}
+
+// Emulated inclusive logical-AND scan: true only when lanes 0..lane_id of the caller's subgroup all stored a true flag.
+static bool spv_emulate_inclusive_scan_logical_and(__local uint* scratch, bool val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val ? 1u : 0u; // flags travel through scratch as 0/1
+    barrier(CLK_LOCAL_MEM_FENCE); // all flags stored before any is read
+    bool acc = true;
+    for (uint lane = 0u; lane <= lane_id; ++lane) acc &= (scratch[subgroup_base + lane] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return acc;
+}
+
+// Emulated exclusive logical-AND scan: true only when every lane below lane_id stored a true flag; lane 0 gets the identity true.
+static bool spv_emulate_exclusive_scan_logical_and(__local uint* scratch, bool val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val ? 1u : 0u; // flags travel through scratch as 0/1
+    barrier(CLK_LOCAL_MEM_FENCE); // all flags stored before any is read
+    bool acc = true;
+    for (uint lane = 0u; lane < lane_id; ++lane) acc &= (scratch[subgroup_base + lane] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return acc;
+}
+
+// Emulated clustered logical-AND reduction: true only when every lane of the cluster_size-aligned group containing lane_id stored true.
+static bool spv_emulate_clustered_reduce_logical_and(__local uint* scratch, bool val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val ? 1u : 0u; // flags travel through scratch as 0/1
+    barrier(CLK_LOCAL_MEM_FENCE); // all flags stored before any is read
+    const uint first = subgroup_base + (lane_id / cluster_size) * cluster_size; // scratch slot of the cluster's first lane
+    bool acc = true;
+    for (uint k = 0u; k < cluster_size; ++k) acc &= (scratch[first + k] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return acc;
+}
+
+// Emulated subgroup logical-OR reduction: true when any of the subgroup_size lanes starting at subgroup_base stored a true flag.
+static bool spv_emulate_reduce_logical_or(__local uint* scratch, bool val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val ? 1u : 0u; // flags travel through scratch as 0/1
+    barrier(CLK_LOCAL_MEM_FENCE); // all flags stored before any is read
+    bool acc = false;
+    for (uint lane = 0u; lane < subgroup_size; ++lane) acc |= (scratch[subgroup_base + lane] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return acc;
+}
+
+// Emulated inclusive logical-OR scan: true when any of lanes 0..lane_id of the caller's subgroup stored a true flag.
+static bool spv_emulate_inclusive_scan_logical_or(__local uint* scratch, bool val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val ? 1u : 0u; // flags travel through scratch as 0/1
+    barrier(CLK_LOCAL_MEM_FENCE); // all flags stored before any is read
+    bool acc = false;
+    for (uint lane = 0u; lane <= lane_id; ++lane) acc |= (scratch[subgroup_base + lane] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return acc;
+}
+
+// Emulated exclusive logical-OR scan: true when any lane below lane_id stored a true flag; lane 0 gets the identity false.
+static bool spv_emulate_exclusive_scan_logical_or(__local uint* scratch, bool val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val ? 1u : 0u; // flags travel through scratch as 0/1
+    barrier(CLK_LOCAL_MEM_FENCE); // all flags stored before any is read
+    bool acc = false;
+    for (uint lane = 0u; lane < lane_id; ++lane) acc |= (scratch[subgroup_base + lane] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return acc;
+}
+
+// Emulated clustered logical-OR reduction: true when any lane of the cluster_size-aligned group containing lane_id stored true.
+static bool spv_emulate_clustered_reduce_logical_or(__local uint* scratch, bool val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val ? 1u : 0u; // flags travel through scratch as 0/1
+    barrier(CLK_LOCAL_MEM_FENCE); // all flags stored before any is read
+    const uint first = subgroup_base + (lane_id / cluster_size) * cluster_size; // scratch slot of the cluster's first lane
+    bool acc = false;
+    for (uint k = 0u; k < cluster_size; ++k) acc |= (scratch[first + k] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return acc;
+}
+
+// Emulated subgroup logical-XOR reduction: true when an odd number of the subgroup_size lanes starting at subgroup_base stored true.
+static bool spv_emulate_reduce_logical_xor(__local uint* scratch, bool val, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val ? 1u : 0u; // flags travel through scratch as 0/1
+    barrier(CLK_LOCAL_MEM_FENCE); // all flags stored before any is read
+    bool parity = false;
+    for (uint lane = 0u; lane < subgroup_size; ++lane) parity ^= (scratch[subgroup_base + lane] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return parity;
+}
+
+// Emulated inclusive logical-XOR scan: true when an odd number of lanes 0..lane_id of the caller's subgroup stored true.
+static bool spv_emulate_inclusive_scan_logical_xor(__local uint* scratch, bool val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val ? 1u : 0u; // flags travel through scratch as 0/1
+    barrier(CLK_LOCAL_MEM_FENCE); // all flags stored before any is read
+    bool parity = false;
+    for (uint lane = 0u; lane <= lane_id; ++lane) parity ^= (scratch[subgroup_base + lane] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return parity;
+}
+
+// Emulated exclusive logical-XOR scan: true when an odd number of lanes below lane_id stored true; lane 0 gets the identity false.
+static bool spv_emulate_exclusive_scan_logical_xor(__local uint* scratch, bool val, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val ? 1u : 0u; // flags travel through scratch as 0/1
+    barrier(CLK_LOCAL_MEM_FENCE); // all flags stored before any is read
+    bool parity = false;
+    for (uint lane = 0u; lane < lane_id; ++lane) parity ^= (scratch[subgroup_base + lane] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return parity;
+}
+
+// Emulated clustered logical-XOR reduction: true when an odd number of lanes in the cluster_size-aligned group containing lane_id stored true.
+static bool spv_emulate_clustered_reduce_logical_xor(__local uint* scratch, bool val, uint cluster_size, uint lane_id, uint linear_id, uint subgroup_base, uint subgroup_size) {
+    scratch[linear_id] = val ? 1u : 0u; // flags travel through scratch as 0/1
+    barrier(CLK_LOCAL_MEM_FENCE); // all flags stored before any is read
+    const uint first = subgroup_base + (lane_id / cluster_size) * cluster_size; // scratch slot of the cluster's first lane
+    bool parity = false;
+    for (uint k = 0u; k < cluster_size; ++k) parity ^= (scratch[first + k] != 0u);
+    barrier(CLK_LOCAL_MEM_FENCE); // all reads done before scratch is reused
+    return parity;
+}
+
+// Returns whether bit lane_id of the 128-bit ballot (x holds bits 0..31, y 32..63, z 64..95, w the rest) is set.
+static bool spv_emulate_inverse_ballot(uint4 ballot, uint lane_id) {
+    const uint slot = lane_id >> 5; // 32 lanes per ballot component
+    const uint bits = (slot == 0u) ? ballot.x : (slot == 1u) ? ballot.y : (slot == 2u) ? ballot.z : ballot.w;
+    return ((bits >> (lane_id & 31u)) & 1u) != 0u;
+}
+
+// Returns whether bit `index` of the 128-bit ballot is set; indices of 96 and above all read component w.
+static bool spv_emulate_ballot_bit_extract(uint4 ballot, uint index) {
+    const uint component = index >> 5; // which 32-bit component holds the bit
+    const uint chosen = (component == 0u) ? ballot.x : (component == 1u) ? ballot.y : (component == 2u) ? ballot.z : ballot.w;
+    return ((chosen >> (index & 31u)) & 1u) != 0u;
+}
+
+static uint spv_popcount4(uint4 v) { // number of set bits across all four components of v
+    return (popcount(v.x) + popcount(v.y)) + (popcount(v.z) + popcount(v.w));
+}
+
+static uint spv_emulate_ballot_bit_count(uint4 ballot) { // number of set bits in the whole ballot
+    return popcount(ballot.x) + popcount(ballot.y) + popcount(ballot.z) + popcount(ballot.w);
+}
+
+// Counts the set ballot bits at positions 0..lane_id (inclusive).
+static uint spv_emulate_ballot_inclusive_bit_count(uint4 ballot, uint lane_id) {
+    const uint slot = lane_id >> 5; // component that contains bit lane_id
+    const uint keep = 0xFFFFFFFFu >> (31u - (lane_id & 31u)); // bits 0..(lane_id % 32) of that component
+    uint4 low; // ballot with every bit above lane_id cleared
+    if (slot == 0u) low = (uint4)(ballot.x & keep, 0u, 0u, 0u);
+    else if (slot == 1u) low = (uint4)(ballot.x, ballot.y & keep, 0u, 0u);
+    else if (slot == 2u) low = (uint4)(ballot.x, ballot.y, ballot.z & keep, 0u);
+    else low = (uint4)(ballot.xyz, ballot.w & keep);
+    return spv_popcount4(low);
+}
+
+static uint spv_emulate_ballot_exclusive_bit_count(uint4 ballot, uint lane_id) {
+    // Set bits strictly below lane_id: none for lane 0, otherwise the inclusive count taken at lane_id - 1.
+    return (lane_id == 0u) ? 0u : spv_emulate_ballot_inclusive_bit_count(ballot, lane_id - 1u);
+}
+
+static uint spv_emulate_ballot_find_lsb(uint4 ballot) { // index (0..127) of the lowest set ballot bit, ~0u when the ballot is empty
+    // v & (0u - v) isolates the lowest set bit; clz() of that single bit is 31 - index, so the index is 31 - clz().
+    if (ballot.x != 0u) return 31u - (uint)clz(ballot.x & (0u - ballot.x));
+    if (ballot.y != 0u) return 63u - (uint)clz(ballot.y & (0u - ballot.y)); // 32 + (31 - clz)
+    if (ballot.z != 0u) return 95u - (uint)clz(ballot.z & (0u - ballot.z)); // 64 + (31 - clz)
+    return (ballot.w != 0u) ? 127u - (uint)clz(ballot.w & (0u - ballot.w)) : ~0u; // 96 + (31 - clz)
+}
+
+static uint spv_emulate_ballot_find_msb(uint4 ballot) { // index (0..127) of the highest set ballot bit, ~0u when the ballot is empty
+    // Scan from the most significant component down; 31 - clz(v) is the top set bit of a non-zero v.
+    if (ballot.w != 0u) return 96u + (31u - (uint)clz(ballot.w));
+    if (ballot.z != 0u) return 64u + (31u - (uint)clz(ballot.z));
+    if (ballot.y != 0u) return 32u + (31u - (uint)clz(ballot.y));
+    return (ballot.x != 0u) ? 31u - (uint)clz(ballot.x) : ~0u;
+}
+
+__attribute__((reqd_work_group_size(256, 1, 1))) // kernel entry point; work-group size is pinned to 256x1x1
+__kernel void comp_main(__global SSBO* _16)
+{
+    uint3 spvWorkgroupSize = (uint3)(get_local_size(0), get_local_size(1), get_local_size(2)); // not referenced again in this kernel
+    uint _spv_linear_workgroup_size = get_local_size(0) * get_local_size(1) * get_local_size(2); // work-items per work-group
+    uint _spv_linear_id = (get_local_id(2) * get_local_size(1) * get_local_size(0)) + (get_local_id(1) * get_local_size(0)) + get_local_id(0); // flattened local id, x varies fastest
+    uint _spv_subgroup_size = 32u; // emulated subgroups are 32 consecutive linear ids
+    uint _spv_lane_id = _spv_linear_id % 32u; // position inside the emulated subgroup
+    uint _spv_subgroup_id = _spv_linear_id / 32u; // which emulated subgroup this work-item belongs to
+    uint _spv_num_subgroups = _spv_linear_workgroup_size / 32u; // truncating division; exact for the required size of 256
+    uint _spv_subgroup_base = _spv_subgroup_id * 32u; // scratch index of this subgroup's lane 0
+    __local uint _spv_subgroup_scratch[256]; // one slot per work-item, reused by every emulated op below
+    bool elected_1 = (_spv_lane_id == 0u); // lane 0 of each emulated subgroup counts as elected
+    bool all_eq_float_1 = spv_emulate_all_equal_uint(_spv_subgroup_scratch, as_uint(_16->FragColor), _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size); // float compared through its bit pattern
+    bool all_eq_uint_1 = spv_emulate_all_equal_uint(_spv_subgroup_scratch, _16->udat, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size);
+    bool all_result_1 = spv_emulate_all(_spv_subgroup_scratch, _16->udat > 0u, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size);
+    bool any_result_1 = spv_emulate_any(_spv_subgroup_scratch, _16->udat > 0u, _spv_linear_id, _spv_subgroup_base, _spv_subgroup_size);
+    _16->FragColor = ((((float)(elected_1) + (float)(all_eq_float_1)) + (float)(all_eq_uint_1)) + (float)(all_result_1)) + (float)(any_result_1); // sum of the five flags, each converted to float
+}
+
diff --git a/shaders-opencl-no-opt/comp/subgroups-arithmetic.nocompat.vk.subgroups-emulate.comp b/shaders-opencl-no-opt/comp/subgroups-arithmetic.nocompat.vk.subgroups-emulate.comp
new file mode 100644
index 000000000..31e7e9c89
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/subgroups-arithmetic.nocompat.vk.subgroups-emulate.comp
@@ -0,0 +1,39 @@
+#version 450
+#extension GL_KHR_shader_subgroup_basic : require
+#extension GL_KHR_shader_subgroup_arithmetic : require
+layout(local_size_x = 256) in;
+
+layout(std430, binding = 0) buffer SSBO
+{
+	float FragColor;
+	int idat;
+	uint udat;
+};
+
+void main() // exercises the emulated subgroup arithmetic ops: mul, bitwise, logical and mul scans
+{
+	// Multiplicative reductions for float, int and uint (GL_KHR_shader_subgroup_arithmetic)
+	float fmul = subgroupMul(FragColor);
+	int imul = subgroupMul(idat);
+	uint umul = subgroupMul(udat);
+
+	// Bitwise AND / OR / XOR reductions on the uint input
+	uint band = subgroupAnd(udat);
+	uint bor = subgroupOr(udat);
+	uint bxor = subgroupXor(udat);
+
+	// Logical (bool) AND / OR / XOR reductions on the predicate udat > 0
+	bool land = subgroupAnd(udat > 0u);
+	bool lor = subgroupOr(udat > 0u);
+	bool lxor = subgroupXor(udat > 0u);
+
+	// Inclusive multiplicative scan
+	float fmul_inc = subgroupInclusiveMul(FragColor);
+	// Exclusive multiplicative scan
+	float fmul_exc = subgroupExclusiveMul(FragColor);
+
+	FragColor = fmul + fmul_inc + fmul_exc
+		+ float(imul + int(umul))
+		+ float(band + bor + bxor)
+		+ float(land) + float(lor) + float(lxor); // every result feeds the store, presumably so none is optimized away
+}
diff --git a/shaders-opencl-no-opt/comp/subgroups-ballot.nocompat.vk.subgroups-emulate.comp b/shaders-opencl-no-opt/comp/subgroups-ballot.nocompat.vk.subgroups-emulate.comp
new file mode 100644
index 000000000..a2ad427a7
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/subgroups-ballot.nocompat.vk.subgroups-emulate.comp
@@ -0,0 +1,43 @@
+#version 450
+#extension GL_KHR_shader_subgroup_basic : require
+#extension GL_KHR_shader_subgroup_ballot : require
+layout(local_size_x = 256) in;
+
+layout(std430, binding = 0) buffer SSBO
+{
+	float FragColor;
+	uint udat;
+};
+
+void main() // exercises the emulated subgroup ballot ops and the subgroup mask builtins
+{
+	// Ballot of the predicate gl_SubgroupInvocationID < 16u
+	uvec4 ballot = subgroupBallot(gl_SubgroupInvocationID < 16u);
+
+	// BroadcastFirst of the float input
+	float first = subgroupBroadcastFirst(FragColor);
+
+	// Queries on the ballot: bit 5, total / inclusive / exclusive set-bit counts, lowest and highest set bit
+	bool extracted = subgroupBallotBitExtract(ballot, 5u);
+	uint bit_count = subgroupBallotBitCount(ballot);
+	uint inclusive_count = subgroupBallotInclusiveBitCount(ballot);
+	uint exclusive_count = subgroupBallotExclusiveBitCount(ballot);
+	uint find_lsb = subgroupBallotFindLSB(ballot);
+	uint find_msb = subgroupBallotFindMSB(ballot);
+
+	// InverseBallot applied to the same ballot value
+	bool inv_ballot = subgroupInverseBallot(ballot);
+
+	// Read each gl_Subgroup*Mask builtin (Eq, Ge, Gt, Le, Lt)
+	uvec4 eq_mask = gl_SubgroupEqMask;
+	uvec4 ge_mask = gl_SubgroupGeMask;
+	uvec4 gt_mask = gl_SubgroupGtMask;
+	uvec4 le_mask = gl_SubgroupLeMask;
+	uvec4 lt_mask = gl_SubgroupLtMask;
+
+	FragColor = first
+		+ float(ballot.x + ballot.y + ballot.z + ballot.w)
+		+ float(bit_count + inclusive_count + exclusive_count + find_lsb + find_msb)
+		+ float(extracted) + float(inv_ballot)
+		+ float(eq_mask.x) + float(ge_mask.x) + float(gt_mask.x) + float(le_mask.x) + float(lt_mask.x); // only the .x word of each mask is consumed
+}
diff --git a/shaders-opencl-no-opt/comp/subgroups-basic.nocompat.vk.subgroups-emulate.comp b/shaders-opencl-no-opt/comp/subgroups-basic.nocompat.vk.subgroups-emulate.comp
new file mode 100644
index 000000000..213043519
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/subgroups-basic.nocompat.vk.subgroups-emulate.comp
@@ -0,0 +1,77 @@
+#version 450
+#extension GL_KHR_shader_subgroup_basic : require
+#extension GL_KHR_shader_subgroup_vote : require
+#extension GL_KHR_shader_subgroup_ballot : require
+#extension GL_KHR_shader_subgroup_arithmetic : require
+layout(local_size_x = 256) in;
+
+layout(std430, binding = 0) buffer SSBO
+{
+	float FragColor;
+	int idat;
+	uint udat;
+};
+
+float helper(float val) // non-entry-point function that itself uses subgroup ops
+{
+	float reduced = subgroupAdd(val);
+	bool elected = subgroupElect();
+	return elected ? reduced : 0.0; // subgroupAdd result for the elected invocation, 0.0 otherwise
+}
+
+void main()
+{
+	// Builtins: read each subgroup builtin once; each store overwrites FragColor.
+	FragColor = float(gl_NumSubgroups);
+	FragColor = float(gl_SubgroupID);
+	FragColor = float(gl_SubgroupSize);
+	FragColor = float(gl_SubgroupInvocationID);
+
+	// Elect (the result is folded into the final sum below)
+	bool elected = subgroupElect();
+
+	// Barriers: subgroupBarrier plus the generic, buffer, shared and image memory barriers.
+	subgroupBarrier();
+	subgroupMemoryBarrier();
+	subgroupMemoryBarrierBuffer();
+	subgroupMemoryBarrierShared();
+	subgroupMemoryBarrierImage();
+
+	// Vote (uniform): both calls are passed the constant true.
+	bool has_all = subgroupAll(true);
+	bool has_any = subgroupAny(true);
+
+	// Broadcast: constant value 42u with constant id 0u.
+	uint broadcasted = subgroupBroadcast(42u, 0u);
+
+	// Reduce: add over float/int; min/max over float, int and uint.
+	float fadd = subgroupAdd(FragColor);
+	int iadd = subgroupAdd(idat);
+	float fmin = subgroupMin(FragColor);
+	float fmax = subgroupMax(FragColor);
+	int smin = subgroupMin(idat);
+	int smax = subgroupMax(idat);
+	uint umin = subgroupMin(udat);
+	uint umax = subgroupMax(udat);
+
+	// Inclusive scan: add over float/int; min/max over uint.
+	float finc_add = subgroupInclusiveAdd(FragColor);
+	int iinc_add = subgroupInclusiveAdd(idat);
+	uint uinc_min = subgroupInclusiveMin(udat);
+	uint uinc_max = subgroupInclusiveMax(udat);
+
+	// Exclusive scan: same op/type combinations as the inclusive scan.
+	float fexc_add = subgroupExclusiveAdd(FragColor);
+	int iexc_add = subgroupExclusiveAdd(idat);
+	uint uexc_min = subgroupExclusiveMin(udat);
+	uint uexc_max = subgroupExclusiveMax(udat);
+
+	// Call a non-entry-point function that uses subgroup ops (subgroupAdd, subgroupElect).
+	float from_helper = helper(FragColor);
+
+	// Fold every result into the stored value to prevent dead-code elimination.
+	FragColor = fadd + fmin + fmax + finc_add + fexc_add
+		+ float(iadd + smin + smax + iinc_add + iexc_add)
+		+ float(umin + umax + uinc_min + uinc_max + uexc_min + uexc_max + broadcasted)
+		+ float(has_all) + float(has_any) + float(elected) + from_helper;
+}
diff --git a/shaders-opencl-no-opt/comp/subgroups-basic.nocompat.vk.subgroups.subgroups-emulate.comp b/shaders-opencl-no-opt/comp/subgroups-basic.nocompat.vk.subgroups.subgroups-emulate.comp
new file mode 100644
index 000000000..213043519
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/subgroups-basic.nocompat.vk.subgroups.subgroups-emulate.comp
@@ -0,0 +1,77 @@
+#version 450
+#extension GL_KHR_shader_subgroup_basic : require
+#extension GL_KHR_shader_subgroup_vote : require
+#extension GL_KHR_shader_subgroup_ballot : require
+#extension GL_KHR_shader_subgroup_arithmetic : require
+layout(local_size_x = 256) in;
+
+layout(std430, binding = 0) buffer SSBO
+{
+	float FragColor;
+	int idat;
+	uint udat;
+};
+
+float helper(float val) // non-entry-point function that itself uses subgroup ops
+{
+	float reduced = subgroupAdd(val);
+	bool elected = subgroupElect();
+	return elected ? reduced : 0.0; // subgroupAdd result for the elected invocation, 0.0 otherwise
+}
+
+void main()
+{
+	// Builtins: read each subgroup builtin once; each store overwrites FragColor.
+	FragColor = float(gl_NumSubgroups);
+	FragColor = float(gl_SubgroupID);
+	FragColor = float(gl_SubgroupSize);
+	FragColor = float(gl_SubgroupInvocationID);
+
+	// Elect (the result is folded into the final sum below)
+	bool elected = subgroupElect();
+
+	// Barriers: subgroupBarrier plus the generic, buffer, shared and image memory barriers.
+	subgroupBarrier();
+	subgroupMemoryBarrier();
+	subgroupMemoryBarrierBuffer();
+	subgroupMemoryBarrierShared();
+	subgroupMemoryBarrierImage();
+
+	// Vote (uniform): both calls are passed the constant true.
+	bool has_all = subgroupAll(true);
+	bool has_any = subgroupAny(true);
+
+	// Broadcast: constant value 42u with constant id 0u.
+	uint broadcasted = subgroupBroadcast(42u, 0u);
+
+	// Reduce: add over float/int; min/max over float, int and uint.
+	float fadd = subgroupAdd(FragColor);
+	int iadd = subgroupAdd(idat);
+	float fmin = subgroupMin(FragColor);
+	float fmax = subgroupMax(FragColor);
+	int smin = subgroupMin(idat);
+	int smax = subgroupMax(idat);
+	uint umin = subgroupMin(udat);
+	uint umax = subgroupMax(udat);
+
+	// Inclusive scan: add over float/int; min/max over uint.
+	float finc_add = subgroupInclusiveAdd(FragColor);
+	int iinc_add = subgroupInclusiveAdd(idat);
+	uint uinc_min = subgroupInclusiveMin(udat);
+	uint uinc_max = subgroupInclusiveMax(udat);
+
+	// Exclusive scan: same op/type combinations as the inclusive scan.
+	float fexc_add = subgroupExclusiveAdd(FragColor);
+	int iexc_add = subgroupExclusiveAdd(idat);
+	uint uexc_min = subgroupExclusiveMin(udat);
+	uint uexc_max = subgroupExclusiveMax(udat);
+
+	// Call a non-entry-point function that uses subgroup ops (subgroupAdd, subgroupElect).
+	float from_helper = helper(FragColor);
+
+	// Fold every result into the stored value to prevent dead-code elimination.
+	FragColor = fadd + fmin + fmax + finc_add + fexc_add
+		+ float(iadd + smin + smax + iinc_add + iexc_add)
+		+ float(umin + umax + uinc_min + uinc_max + uexc_min + uexc_max + broadcasted)
+		+ float(has_all) + float(has_any) + float(elected) + from_helper;
+}
diff --git a/shaders-opencl-no-opt/comp/subgroups-clustered.nocompat.vk.subgroups-emulate.comp b/shaders-opencl-no-opt/comp/subgroups-clustered.nocompat.vk.subgroups-emulate.comp
new file mode 100644
index 000000000..7161ed2d4
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/subgroups-clustered.nocompat.vk.subgroups-emulate.comp
@@ -0,0 +1,34 @@
+#version 450
+#extension GL_KHR_shader_subgroup_basic : require
+#extension GL_KHR_shader_subgroup_clustered : require
+layout(local_size_x = 256) in;
+
+layout(std430, binding = 0) buffer SSBO
+{
+	float FragColor;
+	int idat;
+	uint udat;
+};
+
+void main()
+{
+	float cred_add = subgroupClusteredAdd(FragColor, 4u); // float add/mul/min/max; every call here passes 4u as the second argument
+	float cred_mul = subgroupClusteredMul(FragColor, 4u);
+	float cred_min = subgroupClusteredMin(FragColor, 4u);
+	float cred_max = subgroupClusteredMax(FragColor, 4u);
+
+	int cred_iadd = subgroupClusteredAdd(idat, 4u); // int add and uint min
+	uint cred_umin = subgroupClusteredMin(udat, 4u);
+
+	uint cred_and = subgroupClusteredAnd(udat, 4u); // And/Or/Xor on a uint operand
+	uint cred_or = subgroupClusteredOr(udat, 4u);
+	uint cred_xor = subgroupClusteredXor(udat, 4u);
+
+	bool cred_land = subgroupClusteredAnd(udat > 0u, 4u); // And/Or again, this time on a bool operand
+	bool cred_lor = subgroupClusteredOr(udat > 0u, 4u);
+
+	FragColor = cred_add + cred_mul + cred_min + cred_max // every result above feeds the stored sum
+		+ float(cred_iadd) + float(cred_umin)
+		+ float(cred_and + cred_or + cred_xor)
+		+ float(cred_land) + float(cred_lor);
+}
diff --git a/shaders-opencl-no-opt/comp/subgroups-rotate.nocompat.vk.subgroups-emulate.comp b/shaders-opencl-no-opt/comp/subgroups-rotate.nocompat.vk.subgroups-emulate.comp
new file mode 100644
index 000000000..bd225cf23
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/subgroups-rotate.nocompat.vk.subgroups-emulate.comp
@@ -0,0 +1,17 @@
+#version 450
+#extension GL_KHR_shader_subgroup_basic : require
+#extension GL_KHR_shader_subgroup_rotate : require
+layout(local_size_x = 256) in;
+
+layout(std430, binding = 0) buffer SSBO
+{
+	float FragColor;
+};
+
+void main()
+{
+	uint rotated = subgroupRotate(20u, 4u); // constant operands: value 20u, delta 4u
+	uint rotated_clustered = subgroupClusteredRotate(20u, 4u, 8u); // same value/delta plus a third constant argument, 8u
+
+	FragColor = float(rotated) + float(rotated_clustered); // both results are consumed by the store
+}
diff --git a/shaders-opencl-no-opt/comp/subgroups-shuffle.nocompat.vk.subgroups-emulate.comp b/shaders-opencl-no-opt/comp/subgroups-shuffle.nocompat.vk.subgroups-emulate.comp
new file mode 100644
index 000000000..5afa50288
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/subgroups-shuffle.nocompat.vk.subgroups-emulate.comp
@@ -0,0 +1,22 @@
+#version 450
+#extension GL_KHR_shader_subgroup_basic : require
+#extension GL_KHR_shader_subgroup_shuffle : require
+#extension GL_KHR_shader_subgroup_shuffle_relative : require
+layout(local_size_x = 256) in;
+
+layout(std430, binding = 0) buffer SSBO
+{
+	float FragColor;
+	uint udat;
+};
+
+void main()
+{
+	// Shuffle, ShuffleXor, ShuffleUp and ShuffleDown, each with a constant second argument.
+	float shuffled = subgroupShuffle(FragColor, 3u);
+	float xored = subgroupShuffleXor(FragColor, 1u);
+	float up = subgroupShuffleUp(FragColor, 1u);
+	float down = subgroupShuffleDown(FragColor, 1u);
+
+	FragColor = shuffled + xored + up + down; // all four results are consumed by the store
+}
diff --git a/shaders-opencl-no-opt/comp/subgroups-vote.nocompat.vk.subgroups-emulate.comp b/shaders-opencl-no-opt/comp/subgroups-vote.nocompat.vk.subgroups-emulate.comp
new file mode 100644
index 000000000..18664539b
--- /dev/null
+++ b/shaders-opencl-no-opt/comp/subgroups-vote.nocompat.vk.subgroups-emulate.comp
@@ -0,0 +1,27 @@
+#version 450
+#extension GL_KHR_shader_subgroup_basic : require
+#extension GL_KHR_shader_subgroup_vote : require
+layout(local_size_x = 256) in;
+
+layout(std430, binding = 0) buffer SSBO
+{
+	float FragColor;
+	uint udat;
+};
+
+void main()
+{
+	// Elect
+	bool elected = subgroupElect();
+
+	// AllEqual, once on the float member and once on the uint member of the SSBO.
+	bool all_eq_float = subgroupAllEqual(FragColor);
+	bool all_eq_uint = subgroupAllEqual(udat);
+
+	// All / Any on a predicate computed from buffer data (udat > 0u).
+	bool all_result = subgroupAll(udat > 0u);
+	bool any_result = subgroupAny(udat > 0u);
+
+	FragColor = float(elected) + float(all_eq_float) + float(all_eq_uint) // every result above feeds the stored sum
+		+ float(all_result) + float(any_result);
+}

From ee8cceb21a9cea424f9af39f18cee4bdde1c1e82 Mon Sep 17 00:00:00 2001
From: Garrick Meeker <gmeeker@gmail.com>
Date: Thu, 19 Mar 2026 07:33:14 -0700
Subject: [PATCH 16/16] OpenCL: Fix --opencl-subgroups-all

---
 main.cpp         |  2 +-
 spirv_opencl.cpp | 20 --------------------
 spirv_opencl.hpp |  3 +++
 3 files changed, 4 insertions(+), 21 deletions(-)

diff --git a/main.cpp b/main.cpp
index bb8163b70..7e7b308ec 100644
--- a/main.cpp
+++ b/main.cpp
@@ -2007,7 +2007,7 @@ static int main_inner(int argc, char *argv[])
 	cbs.add("--opencl-fp64", [&args](CLIParser &) { args.opencl_enable_fp64 = true; });
 	cbs.add("--opencl-64bit-atomics", [&args](CLIParser &) { args.opencl_enable_64bit_atomics = true; });
 	cbs.add("--opencl-subgroups", [&args](CLIParser &) { args.opencl_enable_subgroups = true; });
-	cbs.add("--opencl-subgroups-all", [&args](CLIParser &) { args.opencl_enable_subgroups = true; });
+	cbs.add("--opencl-subgroups-all", [&args](CLIParser &) { args.opencl_enable_subgroups_all = true; });
 	cbs.add("--opencl-emulate-subgroups", [&args](CLIParser &) { args.opencl_emulate_subgroups = true; });
 	cbs.add("--opencl-fixed-subgroup-size",
 	        [&args](CLIParser &parser) { args.opencl_fixed_subgroup_size = parser.next_uint(); });
diff --git a/spirv_opencl.cpp b/spirv_opencl.cpp
index b820f447c..38ff7f075 100644
--- a/spirv_opencl.cpp
+++ b/spirv_opencl.cpp
@@ -1626,25 +1626,6 @@ void CompilerOpenCL::prepass_discover_matrix_types()
 
 	// Scan all instructions for matrix operations to discover helpers needed.
 	// We can resolve the matrix type from the SPIR-V type of operands at pre-scan time.
-	auto get_id_type = [&](uint32_t id) -> const SPIRType &
-	{
-		// For value IDs, look up the type from variable, constant, or the instruction result.
-		auto *var = maybe_get<SPIRVariable>(id);
-		if (var)
-			return get_variable_data_type(*var);
-		auto *c = maybe_get<SPIRConstant>(id);
-		if (c)
-			return get<SPIRType>(c->constant_type);
-		// For instruction results, the type is stored in the expression or type_id.
-		if (ir.ids[id].get_type() == TypeExpression)
-			return get<SPIRType>(get<SPIRExpression>(id).expression_type);
-		// For types themselves
-		if (ir.ids[id].get_type() == TypeType)
-			return get<SPIRType>(id);
-		// Fallback: check if there's a result type mapping
-		return get<SPIRType>(id);
-	};
-
 	ir.for_each_typed_id<SPIRFunction>(
 	    [&](uint32_t, SPIRFunction &f)
 	    {
@@ -7058,7 +7039,6 @@ void CompilerOpenCL::emit_instruction(const Instruction &instruction)
 		uint32_t result_id = ops[1];
 		uint32_t base_id = ops[2];
 
-		auto &base_type = expression_type(base_id);
 		TypeID base_type_id = expression_type_id(base_id);
 
 		// Check if custom stride pointer arithmetic is needed.
diff --git a/spirv_opencl.hpp b/spirv_opencl.hpp
index 3dea7039a..ed6c4ff9c 100644
--- a/spirv_opencl.hpp
+++ b/spirv_opencl.hpp
@@ -90,6 +90,9 @@ class CompilerOpenCL : public CompilerGLSL
 	void set_opencl_options(const Options &opts)
 	{
 		opencl_options = opts;
+		// Requesting enable_subgroups_all also forces enable_subgroups on in the stored copy.
+		if (opencl_options.enable_subgroups_all)
+			opencl_options.enable_subgroups = true;
 	}
 
 	std::string compile() override;