diff --git a/test/WaveOps/QuadReadAcrossDiagonal.32.test b/test/WaveOps/QuadReadAcrossDiagonal.32.test
new file mode 100644
index 000000000..3261d4b64
--- /dev/null
+++ b/test/WaveOps/QuadReadAcrossDiagonal.32.test
@@ -0,0 +1,352 @@
+#--- source.hlsl
+// ints
+StructuredBuffer<int4> In : register(t0);
+RWStructuredBuffer<int4> Out1 : register(u1);
+RWStructuredBuffer<int4> Out2 : register(u2);
+RWStructuredBuffer<int4> Out3 : register(u3);
+RWStructuredBuffer<int4> Out4 : register(u4);
+
+// uints
+StructuredBuffer<uint4> UIn : register(t5);
+RWStructuredBuffer<uint4> UOut1 : register(u6);
+RWStructuredBuffer<uint4> UOut2 : register(u7);
+RWStructuredBuffer<uint4> UOut3 : register(u8);
+RWStructuredBuffer<uint4> UOut4 : register(u9);
+
+// floats
+StructuredBuffer<float4> FIn : register(t10);
+RWStructuredBuffer<float4> FOut1 : register(u11);
+RWStructuredBuffer<float4> FOut2 : register(u12);
+RWStructuredBuffer<float4> FOut3 : register(u13);
+RWStructuredBuffer<float4> FOut4 : register(u14);
+
+[numthreads(2, 2, 1)]
+void main(uint3 dtid : SV_DispatchThreadID) {
+  uint slot = dtid.x + 2 * dtid.y; // row-major slot in the 2x2 group
+
+  // Signed 32-bit: read the diagonal lane's value at widths 1 through 4.
+  int4 src = In[slot];
+  int diag1 = QuadReadAcrossDiagonal(src.x);
+  int2 diag2 = QuadReadAcrossDiagonal(src.xy);
+  int3 diag3 = QuadReadAcrossDiagonal(src.xyz);
+  int4 diag4 = QuadReadAcrossDiagonal(src);
+  // Narrow results overwrite only their own components of the output element.
+  Out1[slot].x = diag1;
+  Out2[slot].xy = diag2;
+  Out3[slot].xyz = diag3;
+  Out4[slot] = diag4;
+
+  // Unsigned 32-bit: same sequence against the U* buffers.
+  uint4 usrc = UIn[slot];
+  uint udiag1 = QuadReadAcrossDiagonal(usrc.x);
+  uint2 udiag2 = QuadReadAcrossDiagonal(usrc.xy);
+  uint3 udiag3 = QuadReadAcrossDiagonal(usrc.xyz);
+  uint4 udiag4 = QuadReadAcrossDiagonal(usrc);
+
+  UOut1[slot].x = udiag1;
+  UOut2[slot].xy = udiag2;
+  UOut3[slot].xyz = udiag3;
+  UOut4[slot] = udiag4;
+
+  // 32-bit float: same sequence against the F* buffers.
+  float4 fsrc = FIn[slot];
+  float fdiag1 = QuadReadAcrossDiagonal(fsrc.x);
+  float2 fdiag2 = QuadReadAcrossDiagonal(fsrc.xy);
+  float3 fdiag3 = QuadReadAcrossDiagonal(fsrc.xyz);
+  float4 fdiag4 = QuadReadAcrossDiagonal(fsrc);
+
+  FOut1[slot].x = fdiag1;
+  FOut2[slot].xy = fdiag2;
+  FOut3[slot].xyz = fdiag3;
+  FOut4[slot] = fdiag4;
+}
+
+//--- pipeline.yaml
+
+---
+Shaders:
+  - Stage: Compute
+    Entry: main
+    DispatchSize: [ 1, 1, 1 ]
+Buffers:
+  - Name: In
+    Format: Int32
+    Stride: 16
+    Data: [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 ]
+  - Name: Out1
+    Format: Int32
+    Stride: 16
+    FillSize: 64
+  - Name: Out2
+    Format: Int32
+    Stride: 16
+    FillSize: 64
+  - Name: Out3
+    Format: Int32
+    Stride: 16
+    FillSize: 64
+  - Name: Out4
+    Format: Int32
+    Stride: 16
+    FillSize: 64
+  - Name: ExpectedOut1
+    Format: Int32
+    Stride: 16
+    Data: [ 13, 0, 0, 0, 9, 0, 0, 0, 5, 0, 0, 0, 1, 0, 0, 0 ]
+  - Name: ExpectedOut2
+    Format: Int32
+    Stride: 16
+    Data: [ 13, 14, 0, 0, 9, 10, 0, 0, 5, 6, 0, 0, 1, 2, 0, 0 ]
+  - Name: ExpectedOut3
+    Format: Int32
+    Stride: 16
+    Data: [ 13, 14, 15, 0, 9, 10, 11, 0, 5, 6, 7, 0, 1, 2, 3, 0 ]
+  - Name: ExpectedOut4
+    Format: Int32
+    Stride: 16
+    Data: [ 13, 14, 15, 16, 9, 10, 11, 12, 5, 6, 7, 8, 1, 2, 3, 4 ]
+  - Name: UIn
+    Format: UInt32
+    Stride: 16
+    Data: [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 ]
+  - Name: UOut1
+    Format: UInt32
+    Stride: 16
+    FillSize: 64
+  - Name: UOut2
+    Format: UInt32
+    Stride: 16
+    FillSize: 64
+  - Name: UOut3
+    Format: UInt32
+    Stride: 16
+    FillSize: 64
+  - Name: UOut4
+    Format: UInt32
+    Stride: 16
+    FillSize: 64
+  - Name: UExpectedOut1
+    Format: UInt32
+    Stride: 16
+    Data: [ 13, 0, 0, 0, 9, 0, 0, 0, 5, 0, 0, 0, 1, 0, 0, 0 ]
+  - Name: UExpectedOut2
+    Format: UInt32
+    Stride: 16
+    Data: [ 13, 14, 0, 0, 9, 10, 0, 0, 5, 6, 0, 0, 1, 2, 0, 0 ]
+  - Name: UExpectedOut3
+    Format: UInt32
+    Stride: 16
+    Data: [ 13, 14, 15, 0, 9, 10, 11, 0, 5, 6, 7, 0, 1, 2, 3, 0 ]
+  - Name: UExpectedOut4
+    Format: UInt32
+    Stride: 16
+    Data: [ 13, 14, 15, 16, 9, 10, 11, 12, 5, 6, 7, 8, 1, 2, 3, 4 ]
+  - Name: FIn
+    Format: Float32
+    Stride: 16
+    Data: [ 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0 ]
+  - Name: FOut1
+    Format: Float32
+    Stride: 16
+    FillSize: 64
+  - Name: FOut2
+    Format: Float32
+    Stride: 16
+    FillSize: 64
+  - Name: FOut3
+    Format: Float32
+    Stride: 16
+    FillSize: 64
+  - Name: FOut4
+    Format: Float32
+    Stride: 16
+    FillSize: 64
+  - Name: FExpectedOut1
+    Format: Float32
+    Stride: 16
+    Data: [ 13.0, 0.0, 0.0, 0.0, 9.0, 0.0, 0.0, 0.0, 5.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0 ]
+  - Name: FExpectedOut2
+    Format: Float32
+    Stride: 16
+    Data: [ 13.0, 14.0, 0.0, 0.0, 9.0, 10.0, 0.0, 0.0, 5.0, 6.0, 0.0, 0.0, 1.0, 2.0, 0.0, 0.0 ]
+  - Name: FExpectedOut3
+    Format: Float32
+    Stride: 16
+    Data: [ 13.0, 14.0, 15.0, 0.0, 9.0, 10.0, 11.0, 0.0, 5.0, 6.0, 7.0, 0.0, 1.0, 2.0, 3.0, 0.0 ]
+  - Name: FExpectedOut4
+    Format: Float32
+    Stride: 16
+    Data: [ 13.0, 14.0, 15.0, 16.0, 9.0, 10.0, 11.0, 12.0, 5.0, 6.0, 7.0, 8.0, 1.0, 2.0, 3.0, 4.0 ]
+Results:
+  - Result: ExpectedOut1
+    Rule: BufferExact
+    Actual: Out1
+    Expected: ExpectedOut1
+  - Result: ExpectedOut2
+    Rule: BufferExact
+    Actual: Out2
+    Expected: ExpectedOut2
+  - Result: ExpectedOut3
+    Rule: BufferExact
+    Actual: Out3
+    Expected: ExpectedOut3
+  - Result: ExpectedOut4
+    Rule: BufferExact
+    Actual: Out4
+    Expected: ExpectedOut4
+  - Result: UExpectedOut1
+    Rule: BufferExact
+    Actual: UOut1
+    Expected: UExpectedOut1
+  - Result: UExpectedOut2
+    Rule: BufferExact
+    Actual: UOut2
+    Expected: UExpectedOut2
+  - Result: UExpectedOut3
+    Rule: BufferExact
+    Actual: UOut3
+    Expected: UExpectedOut3
+  - Result: UExpectedOut4
+    Rule: BufferExact
+    Actual: UOut4
+    Expected: UExpectedOut4
+  - Result: FExpectedOut1
+    Rule: BufferExact
+    Actual: FOut1
+    Expected: FExpectedOut1
+  - Result: FExpectedOut2
+    Rule: BufferExact
+    Actual: FOut2
+    Expected: FExpectedOut2
+  - Result: FExpectedOut3
+    Rule: BufferExact
+    Actual: FOut3
+    Expected: FExpectedOut3
+  - Result: FExpectedOut4
+    Rule: BufferExact
+    Actual: FOut4
+    Expected: FExpectedOut4
+DescriptorSets:
+  - Resources:
+    - Name: In
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 0
+        Space: 0
+      VulkanBinding:
+        Binding: 0
+    - Name: Out1
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 1
+        Space: 0
+      VulkanBinding:
+        Binding: 1
+    - Name: Out2
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 2
+        Space: 0
+      VulkanBinding:
+        Binding: 2
+    - Name: Out3
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 3
+        Space: 0
+      VulkanBinding:
+        Binding: 3
+    - Name: Out4
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 4
+        Space: 0
+      VulkanBinding:
+        Binding: 4
+    - Name: UIn
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 5
+        Space: 0
+      VulkanBinding:
+        Binding: 5
+    - Name: UOut1
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 6
+        Space: 0
+      VulkanBinding:
+        Binding: 6
+    - Name: UOut2
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 7
+        Space: 0
+      VulkanBinding:
+        Binding: 7
+    - Name: UOut3
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 8
+        Space: 0
+      VulkanBinding:
+        Binding: 8
+    - Name: UOut4
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 9
+        Space: 0
+      VulkanBinding:
+        Binding: 9
+    - Name: FIn
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 10
+        Space: 0
+      VulkanBinding:
+        Binding: 10
+    - Name: FOut1
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 11
+        Space: 0
+      VulkanBinding:
+        Binding: 11
+    - Name: FOut2
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 12
+        Space: 0
+      VulkanBinding:
+        Binding: 12
+    - Name: FOut3
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 13
+        Space: 0
+      VulkanBinding:
+        Binding: 13
+    - Name: FOut4
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 14
+        Space: 0
+      VulkanBinding:
+        Binding: 14
+
+...
+#--- end
+
+# QuadReadAcrossDiagonal is not yet supported in Clang. A branch implementing the
+# intrinsic exists; its PR is blocked until https://github.com/llvm/llvm-project/pull/187440 merges.
+# XFAIL: Clang
+
+# Bug: https://github.com/llvm/offload-test-suite/issues/986
+# XFAIL: Intel && Vulkan && DXC
+
+# Bug: https://github.com/llvm/offload-test-suite/issues/989
+# XFAIL: Metal
+
+# RUN: split-file %s %t
+# RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl
+# RUN: %offloader %t/pipeline.yaml %t.o
diff --git a/test/WaveOps/QuadReadAcrossDiagonal.convergence.test b/test/WaveOps/QuadReadAcrossDiagonal.convergence.test
new file mode 100644
index 000000000..ed651122b
--- /dev/null
+++ b/test/WaveOps/QuadReadAcrossDiagonal.convergence.test
@@ -0,0 +1,75 @@
+#--- source.hlsl
+StructuredBuffer<float> In : register(t0);
+RWStructuredBuffer<float> Out : register(u1);
+
+[numthreads(2, 2, 1)]
+void main(uint3 dtid : SV_DispatchThreadID) {
+  uint index = dtid.y * 2 + dtid.x; // row-major position in the 2x2 group
+  float value = In[index];
+  // Exercises the intrinsic under divergent control flow. Each branch holds both
+  // lanes of one diagonal pair, so every read targets an active lane (avoids UB).
+  if(index == 0 || index == 3) {
+    // Taken by threads (0, 0) and (1, 1), which read each other's value.
+    float value_quad_d = QuadReadAcrossDiagonal(value);
+    Out[index] = value - value_quad_d; // subtract here, add in the other branch
+  } else {
+    // Taken by threads (1, 0) and (0, 1), which read each other's value.
+    float value_quad_d = QuadReadAcrossDiagonal(value);
+    Out[index] = value + value_quad_d;
+  }
+}
+
+//--- pipeline.yaml
+
+---
+Shaders:
+  - Stage: Compute
+    Entry: main
+    DispatchSize: [ 1, 1, 1 ]
+Buffers:
+  - Name: In
+    Format: Float32
+    Stride: 4
+    Data: [ 1.0, 10.0, 2.0, 20.0 ]
+  - Name: Out
+    Format: Float32
+    Stride: 4
+    FillSize: 16
+  - Name: ExpectedOut
+    Format: Float32
+    Stride: 4
+    Data: [ -19.0, 12.0, 12.0, 19.0 ]
+Results:
+  - Result: ExpectedOut
+    Rule: BufferExact
+    Actual: Out
+    Expected: ExpectedOut
+DescriptorSets:
+  - Resources:
+    - Name: In
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 0
+        Space: 0
+      VulkanBinding:
+        Binding: 0
+    - Name: Out
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 1
+        Space: 0
+      VulkanBinding:
+        Binding: 1
+...
+#--- end
+
+# QuadReadAcrossDiagonal is not yet supported in Clang. A branch implementing the
+# intrinsic exists; its PR is blocked until https://github.com/llvm/llvm-project/pull/187440 merges.
+# XFAIL: Clang
+
+# Bug: https://github.com/llvm/offload-test-suite/issues/986
+# XFAIL: Intel && Vulkan && DXC
+
+# RUN: split-file %s %t
+# RUN: %dxc_target -T cs_6_0 -Fo %t.o %t/source.hlsl
+# RUN: %offloader %t/pipeline.yaml %t.o
diff --git a/test/WaveOps/QuadReadAcrossDiagonal.fp16.test b/test/WaveOps/QuadReadAcrossDiagonal.fp16.test
new file mode 100644
index 000000000..70958b40b
--- /dev/null
+++ b/test/WaveOps/QuadReadAcrossDiagonal.fp16.test
@@ -0,0 +1,137 @@
+#--- source.hlsl
+StructuredBuffer<half4> In: register(t0);
+RWStructuredBuffer<half4> Out1 : register(u1); // test scalar
+RWStructuredBuffer<half4> Out2 : register(u2); // test half2
+RWStructuredBuffer<half4> Out3 : register(u3); // test half3
+RWStructuredBuffer<half4> Out4 : register(u4); // test half4
+
+[numthreads(2, 2, 1)]
+void main(uint3 dtid : SV_DispatchThreadID) {
+  uint slot = dtid.x + 2 * dtid.y; // row-major slot in the 2x2 group
+  half4 src = In[slot];
+  // Read the diagonal lane's value at every width: scalar, half2, half3, half4.
+  half diag1 = QuadReadAcrossDiagonal(src.x);
+  half2 diag2 = QuadReadAcrossDiagonal(src.xy);
+  half3 diag3 = QuadReadAcrossDiagonal(src.xyz);
+  half4 diag4 = QuadReadAcrossDiagonal(src);
+  // Narrow results overwrite only their own components of the output element.
+  Out1[slot].x = diag1;
+  Out2[slot].xy = diag2;
+  Out3[slot].xyz = diag3;
+  Out4[slot] = diag4;
+}
+
+//--- pipeline.yaml
+
+---
+Shaders:
+  - Stage: Compute
+    Entry: main
+    DispatchSize: [ 1, 1, 1 ]
+Buffers:
+  - Name: In
+    Format: Float16
+    Stride: 8
+    Data: [ 0x3c00, 0x4000, 0x4200, 0x4400, 0x4500, 0x4600, 0x4700, 0x4800, 0x4880, 0x4900, 0x4980, 0x4a00, 0x4a80, 0x4b00, 0x4b80, 0x4c00 ]
+  - Name: Out1
+    Format: Float16
+    Stride: 8
+    FillSize: 32
+  - Name: Out2
+    Format: Float16
+    Stride: 8
+    FillSize: 32
+  - Name: Out3
+    Format: Float16
+    Stride: 8
+    FillSize: 32
+  - Name: Out4
+    Format: Float16
+    Stride: 8
+    FillSize: 32
+  - Name: ExpectedOut1
+    Format: Float16
+    Stride: 8
+    Data: [ 0x4a80, 0x0, 0x0, 0x0, 0x4880, 0x0, 0x0, 0x0, 0x4500, 0x0, 0x0, 0x0, 0x3c00, 0x0, 0x0, 0x0 ]
+  - Name: ExpectedOut2
+    Format: Float16
+    Stride: 8
+    Data: [ 0x4a80, 0x4b00, 0x0, 0x0, 0x4880, 0x4900, 0x0, 0x0, 0x4500, 0x4600, 0x0, 0x0, 0x3c00, 0x4000, 0x0, 0x0 ]
+  - Name: ExpectedOut3
+    Format: Float16
+    Stride: 8
+    Data: [ 0x4a80, 0x4b00, 0x4b80, 0x0, 0x4880, 0x4900, 0x4980, 0x0, 0x4500, 0x4600, 0x4700, 0x0, 0x3c00, 0x4000, 0x4200, 0x0 ]
+  - Name: ExpectedOut4
+    Format: Float16
+    Stride: 8
+    Data: [ 0x4a80, 0x4b00, 0x4b80, 0x4c00, 0x4880, 0x4900, 0x4980, 0x4a00, 0x4500, 0x4600, 0x4700, 0x4800, 0x3c00, 0x4000, 0x4200, 0x4400 ]
+Results:
+  - Result: ExpectedOut1
+    Rule: BufferExact
+    Actual: Out1
+    Expected: ExpectedOut1
+  - Result: ExpectedOut2
+    Rule: BufferExact
+    Actual: Out2
+    Expected: ExpectedOut2
+  - Result: ExpectedOut3
+    Rule: BufferExact
+    Actual: Out3
+    Expected: ExpectedOut3
+  - Result: ExpectedOut4
+    Rule: BufferExact
+    Actual: Out4
+    Expected: ExpectedOut4
+DescriptorSets:
+  - Resources:
+    - Name: In
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 0
+        Space: 0
+      VulkanBinding:
+        Binding: 0
+    - Name: Out1
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 1
+        Space: 0
+      VulkanBinding:
+        Binding: 1
+    - Name: Out2
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 2
+        Space: 0
+      VulkanBinding:
+        Binding: 2
+    - Name: Out3
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 3
+        Space: 0
+      VulkanBinding:
+        Binding: 3
+    - Name: Out4
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 4
+        Space: 0
+      VulkanBinding:
+        Binding: 4
+
+...
+#--- end
+
+# REQUIRES: Half
+
+# QuadReadAcrossDiagonal is not yet supported in Clang. A branch implementing the
+# intrinsic exists; its PR is blocked until https://github.com/llvm/llvm-project/pull/187440 merges.
+# XFAIL: Clang
+
+# Bug: https://github.com/llvm/offload-test-suite/issues/986
+# XFAIL: Intel && Vulkan && DXC
+
+# RUN: split-file %s %t
+# RUN: %dxc_target -enable-16bit-types -T cs_6_5 -Fo %t.o %t/source.hlsl
+# RUN: %offloader %t/pipeline.yaml %t.o
diff --git a/test/WaveOps/QuadReadAcrossDiagonal.fp64.test b/test/WaveOps/QuadReadAcrossDiagonal.fp64.test
new file mode 100644
index 000000000..b3203481e
--- /dev/null
+++ b/test/WaveOps/QuadReadAcrossDiagonal.fp64.test
@@ -0,0 +1,137 @@
+#--- source.hlsl
+StructuredBuffer<double4> In: register(t0);
+RWStructuredBuffer<double4> Out1 : register(u1); // test scalar
+RWStructuredBuffer<double4> Out2 : register(u2); // test double2
+RWStructuredBuffer<double4> Out3 : register(u3); // test double3
+RWStructuredBuffer<double4> Out4 : register(u4); // test double4
+
+[numthreads(2, 2, 1)]
+void main(uint3 dtid : SV_DispatchThreadID) {
+  uint slot = dtid.x + 2 * dtid.y; // row-major slot in the 2x2 group
+  double4 src = In[slot];
+  // Read the diagonal lane's value at every width: scalar, double2, double3, double4.
+  double diag1 = QuadReadAcrossDiagonal(src.x);
+  double2 diag2 = QuadReadAcrossDiagonal(src.xy);
+  double3 diag3 = QuadReadAcrossDiagonal(src.xyz);
+  double4 diag4 = QuadReadAcrossDiagonal(src);
+  // Narrow results overwrite only their own components of the output element.
+  Out1[slot].x = diag1;
+  Out2[slot].xy = diag2;
+  Out3[slot].xyz = diag3;
+  Out4[slot] = diag4;
+}
+
+//--- pipeline.yaml
+
+---
+Shaders:
+  - Stage: Compute
+    Entry: main
+    DispatchSize: [ 1, 1, 1 ]
+Buffers:
+  - Name: In
+    Format: Float64
+    Stride: 32
+    Data: [ 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0 ]
+  - Name: Out1
+    Format: Float64
+    Stride: 32
+    FillSize: 128
+  - Name: Out2
+    Format: Float64
+    Stride: 32
+    FillSize: 128
+  - Name: Out3
+    Format: Float64
+    Stride: 32
+    FillSize: 128
+  - Name: Out4
+    Format: Float64
+    Stride: 32
+    FillSize: 128
+  - Name: ExpectedOut1
+    Format: Float64
+    Stride: 32
+    Data: [ 13.0, 0.0, 0.0, 0.0, 9.0, 0.0, 0.0, 0.0, 5.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0 ]
+  - Name: ExpectedOut2
+    Format: Float64
+    Stride: 32
+    Data: [ 13.0, 14.0, 0.0, 0.0, 9.0, 10.0, 0.0, 0.0, 5.0, 6.0, 0.0, 0.0, 1.0, 2.0, 0.0, 0.0 ]
+  - Name: ExpectedOut3
+    Format: Float64
+    Stride: 32
+    Data: [ 13.0, 14.0, 15.0, 0.0, 9.0, 10.0, 11.0, 0.0, 5.0, 6.0, 7.0, 0.0, 1.0, 2.0, 3.0, 0.0 ]
+  - Name: ExpectedOut4
+    Format: Float64
+    Stride: 32
+    Data: [ 13.0, 14.0, 15.0, 16.0, 9.0, 10.0, 11.0, 12.0, 5.0, 6.0, 7.0, 8.0, 1.0, 2.0, 3.0, 4.0 ]
+Results:
+  - Result: ExpectedOut1
+    Rule: BufferExact
+    Actual: Out1
+    Expected: ExpectedOut1
+  - Result: ExpectedOut2
+    Rule: BufferExact
+    Actual: Out2
+    Expected: ExpectedOut2
+  - Result: ExpectedOut3
+    Rule: BufferExact
+    Actual: Out3
+    Expected: ExpectedOut3
+  - Result: ExpectedOut4
+    Rule: BufferExact
+    Actual: Out4
+    Expected: ExpectedOut4
+DescriptorSets:
+  - Resources:
+    - Name: In
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 0
+        Space: 0
+      VulkanBinding:
+        Binding: 0
+    - Name: Out1
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 1
+        Space: 0
+      VulkanBinding:
+        Binding: 1
+    - Name: Out2
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 2
+        Space: 0
+      VulkanBinding:
+        Binding: 2
+    - Name: Out3
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 3
+        Space: 0
+      VulkanBinding:
+        Binding: 3
+    - Name: Out4
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 4
+        Space: 0
+      VulkanBinding:
+        Binding: 4
+
+...
+#--- end
+
+# REQUIRES: Double
+
+# QuadReadAcrossDiagonal is not yet supported in Clang. A branch implementing the
+# intrinsic exists; its PR is blocked until https://github.com/llvm/llvm-project/pull/187440 merges.
+# XFAIL: Clang
+
+# Bug: https://github.com/llvm/offload-test-suite/issues/986
+# XFAIL: Intel && Vulkan && DXC
+
+# RUN: split-file %s %t
+# RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl
+# RUN: %offloader %t/pipeline.yaml %t.o
diff --git a/test/WaveOps/QuadReadAcrossDiagonal.int16.test b/test/WaveOps/QuadReadAcrossDiagonal.int16.test
new file mode 100644
index 000000000..18b34e08e
--- /dev/null
+++ b/test/WaveOps/QuadReadAcrossDiagonal.int16.test
@@ -0,0 +1,248 @@
+#--- source.hlsl
+// ints
+StructuredBuffer<int16_t4> In: register(t0);
+RWStructuredBuffer<int16_t4> Out1 : register(u1); // test scalar
+RWStructuredBuffer<int16_t4> Out2 : register(u2); // test int16_t2
+RWStructuredBuffer<int16_t4> Out3 : register(u3); // test int16_t3
+RWStructuredBuffer<int16_t4> Out4 : register(u4); // test int16_t4
+
+// uints
+StructuredBuffer<uint16_t4> UIn: register(t5);
+RWStructuredBuffer<uint16_t4> UOut1 : register(u6); // test scalar
+RWStructuredBuffer<uint16_t4> UOut2 : register(u7); // test uint16_t2
+RWStructuredBuffer<uint16_t4> UOut3 : register(u8); // test uint16_t3
+RWStructuredBuffer<uint16_t4> UOut4 : register(u9); // test uint16_t4
+
+[numthreads(2, 2, 1)]
+void main(uint3 dtid : SV_DispatchThreadID) {
+  uint slot = dtid.x + 2 * dtid.y; // row-major slot in the 2x2 group
+
+  // Signed 16-bit: read the diagonal lane's value at widths 1 through 4.
+  int16_t4 src = In[slot];
+  int16_t diag1 = QuadReadAcrossDiagonal(src.x);
+  int16_t2 diag2 = QuadReadAcrossDiagonal(src.xy);
+  int16_t3 diag3 = QuadReadAcrossDiagonal(src.xyz);
+  int16_t4 diag4 = QuadReadAcrossDiagonal(src);
+  // Narrow results overwrite only their own components of the output element.
+  Out1[slot].x = diag1;
+  Out2[slot].xy = diag2;
+  Out3[slot].xyz = diag3;
+  Out4[slot] = diag4;
+
+  // Unsigned 16-bit: same sequence against the U* buffers.
+  uint16_t4 usrc = UIn[slot];
+  uint16_t udiag1 = QuadReadAcrossDiagonal(usrc.x);
+  uint16_t2 udiag2 = QuadReadAcrossDiagonal(usrc.xy);
+  uint16_t3 udiag3 = QuadReadAcrossDiagonal(usrc.xyz);
+  uint16_t4 udiag4 = QuadReadAcrossDiagonal(usrc);
+
+  UOut1[slot].x = udiag1;
+  UOut2[slot].xy = udiag2;
+  UOut3[slot].xyz = udiag3;
+  UOut4[slot] = udiag4;
+}
+
+//--- pipeline.yaml
+
+---
+Shaders:
+  - Stage: Compute
+    Entry: main
+    DispatchSize: [ 1, 1, 1 ]
+Buffers:
+  - Name: In
+    Format: Int16
+    Stride: 8
+    Data: [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 ]
+  - Name: Out1
+    Format: Int16
+    Stride: 8
+    FillSize: 32
+  - Name: Out2
+    Format: Int16
+    Stride: 8
+    FillSize: 32
+  - Name: Out3
+    Format: Int16
+    Stride: 8
+    FillSize: 32
+  - Name: Out4
+    Format: Int16
+    Stride: 8
+    FillSize: 32
+  - Name: ExpectedOut1
+    Format: Int16
+    Stride: 8
+    Data: [ 13, 0, 0, 0, 9, 0, 0, 0, 5, 0, 0, 0, 1, 0, 0, 0 ]
+  - Name: ExpectedOut2
+    Format: Int16
+    Stride: 8
+    Data: [ 13, 14, 0, 0, 9, 10, 0, 0, 5, 6, 0, 0, 1, 2, 0, 0 ]
+  - Name: ExpectedOut3
+    Format: Int16
+    Stride: 8
+    Data: [ 13, 14, 15, 0, 9, 10, 11, 0, 5, 6, 7, 0, 1, 2, 3, 0 ]
+  - Name: ExpectedOut4
+    Format: Int16
+    Stride: 8
+    Data: [ 13, 14, 15, 16, 9, 10, 11, 12, 5, 6, 7, 8, 1, 2, 3, 4 ]
+  - Name: UIn
+    Format: UInt16
+    Stride: 8
+    Data: [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 ]
+  - Name: UOut1
+    Format: UInt16
+    Stride: 8
+    FillSize: 32
+  - Name: UOut2
+    Format: UInt16
+    Stride: 8
+    FillSize: 32
+  - Name: UOut3
+    Format: UInt16
+    Stride: 8
+    FillSize: 32
+  - Name: UOut4
+    Format: UInt16
+    Stride: 8
+    FillSize: 32
+  - Name: UExpectedOut1
+    Format: UInt16
+    Stride: 8
+    Data: [ 13, 0, 0, 0, 9, 0, 0, 0, 5, 0, 0, 0, 1, 0, 0, 0 ]
+  - Name: UExpectedOut2
+    Format: UInt16
+    Stride: 8
+    Data: [ 13, 14, 0, 0, 9, 10, 0, 0, 5, 6, 0, 0, 1, 2, 0, 0 ]
+  - Name: UExpectedOut3
+    Format: UInt16
+    Stride: 8
+    Data: [ 13, 14, 15, 0, 9, 10, 11, 0, 5, 6, 7, 0, 1, 2, 3, 0 ]
+  - Name: UExpectedOut4
+    Format: UInt16
+    Stride: 8
+    Data: [ 13, 14, 15, 16, 9, 10, 11, 12, 5, 6, 7, 8, 1, 2, 3, 4 ]
+Results:
+  - Result: ExpectedOut1
+    Rule: BufferExact
+    Actual: Out1
+    Expected: ExpectedOut1
+  - Result: ExpectedOut2
+    Rule: BufferExact
+    Actual: Out2
+    Expected: ExpectedOut2
+  - Result: ExpectedOut3
+    Rule: BufferExact
+    Actual: Out3
+    Expected: ExpectedOut3
+  - Result: ExpectedOut4
+    Rule: BufferExact
+    Actual: Out4
+    Expected: ExpectedOut4
+  - Result: UExpectedOut1
+    Rule: BufferExact
+    Actual: UOut1
+    Expected: UExpectedOut1
+  - Result: UExpectedOut2
+    Rule: BufferExact
+    Actual: UOut2
+    Expected: UExpectedOut2
+  - Result: UExpectedOut3
+    Rule: BufferExact
+    Actual: UOut3
+    Expected: UExpectedOut3
+  - Result: UExpectedOut4
+    Rule: BufferExact
+    Actual: UOut4
+    Expected: UExpectedOut4
+DescriptorSets:
+  - Resources:
+    - Name: In
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 0
+        Space: 0
+      VulkanBinding:
+        Binding: 0
+    - Name: Out1
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 1
+        Space: 0
+      VulkanBinding:
+        Binding: 1
+    - Name: Out2
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 2
+        Space: 0
+      VulkanBinding:
+        Binding: 2
+    - Name: Out3
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 3
+        Space: 0
+      VulkanBinding:
+        Binding: 3
+    - Name: Out4
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 4
+        Space: 0
+      VulkanBinding:
+        Binding: 4
+    - Name: UIn
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 5
+        Space: 0
+      VulkanBinding:
+        Binding: 5
+    - Name: UOut1
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 6
+        Space: 0
+      VulkanBinding:
+        Binding: 6
+    - Name: UOut2
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 7
+        Space: 0
+      VulkanBinding:
+        Binding: 7
+    - Name: UOut3
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 8
+        Space: 0
+      VulkanBinding:
+        Binding: 8
+    - Name: UOut4
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 9
+        Space: 0
+      VulkanBinding:
+        Binding: 9
+
+...
+#--- end
+
+# REQUIRES: Int16
+
+# QuadReadAcrossDiagonal is not yet supported in Clang. A branch implementing the
+# intrinsic exists; its PR is blocked until https://github.com/llvm/llvm-project/pull/187440 merges.
+# XFAIL: Clang
+
+# Bug: https://github.com/llvm/offload-test-suite/issues/986
+# XFAIL: Intel && Vulkan && DXC
+
+# Bug: https://github.com/llvm/offload-test-suite/issues/989
+# XFAIL: Metal
+
+# RUN: split-file %s %t
+# RUN: %dxc_target -enable-16bit-types -T cs_6_5 -Fo %t.o %t/source.hlsl
+# RUN: %offloader %t/pipeline.yaml %t.o
diff --git a/test/WaveOps/QuadReadAcrossDiagonal.int64.test b/test/WaveOps/QuadReadAcrossDiagonal.int64.test
new file mode 100644
index 000000000..f6819ea34
--- /dev/null
+++ b/test/WaveOps/QuadReadAcrossDiagonal.int64.test
@@ -0,0 +1,248 @@
+#--- source.hlsl
+// ints
+StructuredBuffer<int64_t4> In: register(t0);
+RWStructuredBuffer<int64_t4> Out1 : register(u1); // test scalar
+RWStructuredBuffer<int64_t4> Out2 : register(u2); // test int64_t2
+RWStructuredBuffer<int64_t4> Out3 : register(u3); // test int64_t3
+RWStructuredBuffer<int64_t4> Out4 : register(u4); // test int64_t4
+
+// uints
+StructuredBuffer<uint64_t4> UIn: register(t5);
+RWStructuredBuffer<uint64_t4> UOut1 : register(u6); // test scalar
+RWStructuredBuffer<uint64_t4> UOut2 : register(u7); // test uint64_t2
+RWStructuredBuffer<uint64_t4> UOut3 : register(u8); // test uint64_t3
+RWStructuredBuffer<uint64_t4> UOut4 : register(u9); // test uint64_t4
+
+[numthreads(2, 2, 1)]
+void main(uint3 dtid : SV_DispatchThreadID) {
+  uint slot = dtid.x + 2 * dtid.y; // row-major slot in the 2x2 group
+
+  // Signed 64-bit: read the diagonal lane's value at widths 1 through 4.
+  int64_t4 src = In[slot];
+  int64_t diag1 = QuadReadAcrossDiagonal(src.x);
+  int64_t2 diag2 = QuadReadAcrossDiagonal(src.xy);
+  int64_t3 diag3 = QuadReadAcrossDiagonal(src.xyz);
+  int64_t4 diag4 = QuadReadAcrossDiagonal(src);
+  // Narrow results overwrite only their own components of the output element.
+  Out1[slot].x = diag1;
+  Out2[slot].xy = diag2;
+  Out3[slot].xyz = diag3;
+  Out4[slot] = diag4;
+
+  // Unsigned 64-bit: same sequence against the U* buffers.
+  uint64_t4 usrc = UIn[slot];
+  uint64_t udiag1 = QuadReadAcrossDiagonal(usrc.x);
+  uint64_t2 udiag2 = QuadReadAcrossDiagonal(usrc.xy);
+  uint64_t3 udiag3 = QuadReadAcrossDiagonal(usrc.xyz);
+  uint64_t4 udiag4 = QuadReadAcrossDiagonal(usrc);
+
+  UOut1[slot].x = udiag1;
+  UOut2[slot].xy = udiag2;
+  UOut3[slot].xyz = udiag3;
+  UOut4[slot] = udiag4;
+}
+
+//--- pipeline.yaml
+
+---
+Shaders:
+  - Stage: Compute
+    Entry: main
+    DispatchSize: [ 1, 1, 1 ]
+Buffers:
+  - Name: In
+    Format: Int64
+    Stride: 32
+    Data: [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 ]
+  - Name: Out1
+    Format: Int64
+    Stride: 32
+    FillSize: 128
+  - Name: Out2
+    Format: Int64
+    Stride: 32
+    FillSize: 128
+  - Name: Out3
+    Format: Int64
+    Stride: 32
+    FillSize: 128
+  - Name: Out4
+    Format: Int64
+    Stride: 32
+    FillSize: 128
+  - Name: ExpectedOut1
+    Format: Int64
+    Stride: 32
+    Data: [ 13, 0, 0, 0, 9, 0, 0, 0, 5, 0, 0, 0, 1, 0, 0, 0 ]
+  - Name: ExpectedOut2
+    Format: Int64
+    Stride: 32
+    Data: [ 13, 14, 0, 0, 9, 10, 0, 0, 5, 6, 0, 0, 1, 2, 0, 0 ]
+  - Name: ExpectedOut3
+    Format: Int64
+    Stride: 32
+    Data: [ 13, 14, 15, 0, 9, 10, 11, 0, 5, 6, 7, 0, 1, 2, 3, 0 ]
+  - Name: ExpectedOut4
+    Format: Int64
+    Stride: 32
+    Data: [ 13, 14, 15, 16, 9, 10, 11, 12, 5, 6, 7, 8, 1, 2, 3, 4 ]
+  - Name: UIn
+    Format: UInt64
+    Stride: 32
+    Data: [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 ]
+  - Name: UOut1
+    Format: UInt64
+    Stride: 32
+    FillSize: 128
+  - Name: UOut2
+    Format: UInt64
+    Stride: 32
+    FillSize: 128
+  - Name: UOut3
+    Format: UInt64
+    Stride: 32
+    FillSize: 128
+  - Name: UOut4
+    Format: UInt64
+    Stride: 32
+    FillSize: 128
+  - Name: UExpectedOut1
+    Format: UInt64
+    Stride: 32
+    Data: [ 13, 0, 0, 0, 9, 0, 0, 0, 5, 0, 0, 0, 1, 0, 0, 0 ]
+  - Name: UExpectedOut2
+    Format: UInt64
+    Stride: 32
+    Data: [ 13, 14, 0, 0, 9, 10, 0, 0, 5, 6, 0, 0, 1, 2, 0, 0 ]
+  - Name: UExpectedOut3
+    Format: UInt64
+    Stride: 32
+    Data: [ 13, 14, 15, 0, 9, 10, 11, 0, 5, 6, 7, 0, 1, 2, 3, 0 ]
+  - Name: UExpectedOut4
+    Format: UInt64
+    Stride: 32
+    Data: [ 13, 14, 15, 16, 9, 10, 11, 12, 5, 6, 7, 8, 1, 2, 3, 4 ]
+Results:
+  - Result: ExpectedOut1
+    Rule: BufferExact
+    Actual: Out1
+    Expected: ExpectedOut1
+  - Result: ExpectedOut2
+    Rule: BufferExact
+    Actual: Out2
+    Expected: ExpectedOut2
+  - Result: ExpectedOut3
+    Rule: BufferExact
+    Actual: Out3
+    Expected: ExpectedOut3
+  - Result: ExpectedOut4
+    Rule: BufferExact
+    Actual: Out4
+    Expected: ExpectedOut4
+  - Result: UExpectedOut1
+    Rule: BufferExact
+    Actual: UOut1
+    Expected: UExpectedOut1
+  - Result: UExpectedOut2
+    Rule: BufferExact
+    Actual: UOut2
+    Expected: UExpectedOut2
+  - Result: UExpectedOut3
+    Rule: BufferExact
+    Actual: UOut3
+    Expected: UExpectedOut3
+  - Result: UExpectedOut4
+    Rule: BufferExact
+    Actual: UOut4
+    Expected: UExpectedOut4
+DescriptorSets:
+  - Resources:
+    - Name: In
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 0
+        Space: 0
+      VulkanBinding:
+        Binding: 0
+    - Name: Out1
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 1
+        Space: 0
+      VulkanBinding:
+        Binding: 1
+    - Name: Out2
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 2
+        Space: 0
+      VulkanBinding:
+        Binding: 2
+    - Name: Out3
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 3
+        Space: 0
+      VulkanBinding:
+        Binding: 3
+    - Name: Out4
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 4
+        Space: 0
+      VulkanBinding:
+        Binding: 4
+    - Name: UIn
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 5
+        Space: 0
+      VulkanBinding:
+        Binding: 5
+    - Name: UOut1
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 6
+        Space: 0
+      VulkanBinding:
+        Binding: 6
+    - Name: UOut2
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 7
+        Space: 0
+      VulkanBinding:
+        Binding: 7
+    - Name: UOut3
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 8
+        Space: 0
+      VulkanBinding:
+        Binding: 8
+    - Name: UOut4
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 9
+        Space: 0
+      VulkanBinding:
+        Binding: 9
+
+...
+#--- end
+
+# REQUIRES: Int64
+
+# QuadReadAcrossDiagonal is not yet supported in Clang. A branch implementing the
+# intrinsic exists; its PR is blocked until https://github.com/llvm/llvm-project/pull/187440 merges.
+# XFAIL: Clang
+
+# Bug: https://github.com/llvm/offload-test-suite/issues/986
+# XFAIL: Intel && Vulkan && DXC
+
+# Bug: https://github.com/llvm/offload-test-suite/issues/988
+# XFAIL: Metal
+
+# RUN: split-file %s %t
+# RUN: %dxc_target -T cs_6_0 -Fo %t.o %t/source.hlsl
+# RUN: %offloader %t/pipeline.yaml %t.o