diff --git a/test/WaveOps/WavePrefixProduct.32.test b/test/WaveOps/WavePrefixProduct.32.test
new file mode 100644
index 000000000..2494a1ec3
--- /dev/null
+++ b/test/WaveOps/WavePrefixProduct.32.test
@@ -0,0 +1,471 @@
+#--- source.hlsl
+// Signed 32-bit integer resources
+StructuredBuffer<int4> In  : register(t0);      // input; main reads element 0 only
+RWStructuredBuffer<int4> Out1 : register(u1);   // scalar result, written to .x
+RWStructuredBuffer<int4> Out2 : register(u2);   // int2 result, written to .xy
+RWStructuredBuffer<int4> Out3 : register(u3);   // int3 result, written to .xyz
+RWStructuredBuffer<int4> Out4 : register(u4);   // int4 result
+RWStructuredBuffer<int4> Out5 : register(u5);   // result for a literal int4(1,2,3,4) argument
+
+// Unsigned 32-bit integer resources (same layout as the signed set)
+StructuredBuffer<uint4> UIn  : register(t6);    // input; main reads element 0 only
+RWStructuredBuffer<uint4> UOut1 : register(u7);   // scalar result, written to .x
+RWStructuredBuffer<uint4> UOut2 : register(u8);   // uint2 result, written to .xy
+RWStructuredBuffer<uint4> UOut3 : register(u9);   // uint3 result, written to .xyz
+RWStructuredBuffer<uint4> UOut4 : register(u10);  // uint4 result
+RWStructuredBuffer<uint4> UOut5 : register(u11);  // result for a literal uint4(1,2,3,4) argument
+
+// 32-bit float resources (same layout as the integer sets)
+StructuredBuffer<float4> FIn : register(t12);   // input; main reads element 0 only
+RWStructuredBuffer<float4> FOut1 : register(u13);  // scalar result, written to .x
+RWStructuredBuffer<float4> FOut2 : register(u14);  // float2 result, written to .xy
+RWStructuredBuffer<float4> FOut3 : register(u15);  // float3 result, written to .xyz
+RWStructuredBuffer<float4> FOut4 : register(u16);  // float4 result
+RWStructuredBuffer<float4> FOut5 : register(u17);  // result for a literal float4(1,2,3,4) argument
+
+[numthreads(4,1,1)]
+void main(uint3 tid : SV_GroupThreadID)
+{
+    int4 v = In[0];    // constant index: every thread feeds the same values
+    // Variant N (s<N>, v2_<N>, ...) guards the call with tid.x <= N-1; thread t later stores
+    // variant t+1 (array index t), i.e. the one whose guard passes for threads 0..t.
+    int s1 = tid.x <= 0 ? WavePrefixProduct( v.x ) : 0;
+    int s2 = tid.x <= 1 ? WavePrefixProduct( v.x ) : 0;
+    int s3 = tid.x <= 2 ? WavePrefixProduct( v.x ) : 0;
+    int s4 = tid.x <= 3 ? WavePrefixProduct( v.x ) : 0;
+
+    int2 v2_1 = tid.x <= 0 ? WavePrefixProduct( v.xy ) : int2(0,0);
+    int2 v2_2 = tid.x <= 1 ? WavePrefixProduct( v.xy ) : int2(0,0);
+    int2 v2_3 = tid.x <= 2 ? WavePrefixProduct( v.xy ) : int2(0,0);
+    int2 v2_4 = tid.x <= 3 ? WavePrefixProduct( v.xy ) : int2(0,0);
+
+    int3 v3_1 = tid.x <= 0 ? WavePrefixProduct( v.xyz ) : int3(0,0,0);
+    int3 v3_2 = tid.x <= 1 ? WavePrefixProduct( v.xyz ) : int3(0,0,0);
+    int3 v3_3 = tid.x <= 2 ? WavePrefixProduct( v.xyz ) : int3(0,0,0);
+    int3 v3_4 = tid.x <= 3 ? WavePrefixProduct( v.xyz ) : int3(0,0,0);
+
+    int4 v4_1 = tid.x <= 0 ? WavePrefixProduct( v ) : int4(0,0,0,0);
+    int4 v4_2 = tid.x <= 1 ? WavePrefixProduct( v ) : int4(0,0,0,0);
+    int4 v4_3 = tid.x <= 2 ? WavePrefixProduct( v ) : int4(0,0,0,0);
+    int4 v4_4 = tid.x <= 3 ? WavePrefixProduct( v ) : int4(0,0,0,0);
+
+    int scalars[4] = { s1, s2, s3, s4 };    // arrays are indexed by tid.x below
+    int2 vec2s [4] = { v2_1, v2_2, v2_3, v2_4 };
+    int3 vec3s [4] = { v3_1, v3_2, v3_3, v3_4 };
+    int4 vec4s [4] = { v4_1, v4_2, v4_3, v4_4 };
+
+    Out1[tid.x].x   = scalars[tid.x];    // only .x is written by the shader
+    Out2[tid.x].xy  = vec2s[tid.x];      // only .xy is written by the shader
+    Out3[tid.x].xyz = vec3s[tid.x];      // only .xyz is written by the shader
+    Out4[tid.x]     = vec4s[tid.x];
+    Out5[tid.x]     = WavePrefixProduct(int4(1,2,3,4));    // unguarded call on a literal argument
+
+    // uint case: identical sequence on UIn, results go to UOut1..UOut5
+
+    uint4 uv = UIn[0];    // constant index: every thread feeds the same values
+
+    // Same guarded variants as the int case: variant N uses the guard tid.x <= N-1
+    uint us1 = tid.x <= 0 ? WavePrefixProduct( uv.x ) : 0;
+    uint us2 = tid.x <= 1 ? WavePrefixProduct( uv.x ) : 0;
+    uint us3 = tid.x <= 2 ? WavePrefixProduct( uv.x ) : 0;
+    uint us4 = tid.x <= 3 ? WavePrefixProduct( uv.x ) : 0;
+
+    uint2 uv2_1 = tid.x <= 0 ? WavePrefixProduct( uv.xy ) : uint2(0,0);
+    uint2 uv2_2 = tid.x <= 1 ? WavePrefixProduct( uv.xy ) : uint2(0,0);
+    uint2 uv2_3 = tid.x <= 2 ? WavePrefixProduct( uv.xy ) : uint2(0,0);
+    uint2 uv2_4 = tid.x <= 3 ? WavePrefixProduct( uv.xy ) : uint2(0,0);
+
+    uint3 uv3_1 = tid.x <= 0 ? WavePrefixProduct( uv.xyz ) : uint3(0,0,0);
+    uint3 uv3_2 = tid.x <= 1 ? WavePrefixProduct( uv.xyz ) : uint3(0,0,0);
+    uint3 uv3_3 = tid.x <= 2 ? WavePrefixProduct( uv.xyz ) : uint3(0,0,0);
+    uint3 uv3_4 = tid.x <= 3 ? WavePrefixProduct( uv.xyz ) : uint3(0,0,0);
+
+    uint4 uv4_1 = tid.x <= 0 ? WavePrefixProduct( uv ) : uint4(0,0,0,0);
+    uint4 uv4_2 = tid.x <= 1 ? WavePrefixProduct( uv ) : uint4(0,0,0,0);
+    uint4 uv4_3 = tid.x <= 2 ? WavePrefixProduct( uv ) : uint4(0,0,0,0);
+    uint4 uv4_4 = tid.x <= 3 ? WavePrefixProduct( uv ) : uint4(0,0,0,0);
+
+    uint uscalars[4] = { us1, us2, us3, us4 };    // arrays are indexed by tid.x below
+    uint2 uvec2s [4] = { uv2_1, uv2_2, uv2_3, uv2_4 };
+    uint3 uvec3s [4] = { uv3_1, uv3_2, uv3_3, uv3_4 };
+    uint4 uvec4s [4] = { uv4_1, uv4_2, uv4_3, uv4_4 };
+
+    UOut1[tid.x].x   = uscalars[tid.x];    // only .x is written by the shader
+    UOut2[tid.x].xy  = uvec2s[tid.x];      // only .xy is written by the shader
+    UOut3[tid.x].xyz = uvec3s[tid.x];      // only .xyz is written by the shader
+    UOut4[tid.x]     = uvec4s[tid.x];
+    UOut5[tid.x]     = WavePrefixProduct(uint4(1,2,3,4));    // unguarded call on a literal argument
+    
+    // float case: identical sequence on FIn, results go to FOut1..FOut5
+
+    float4 fv = FIn[0];    // constant index: every thread feeds the same values
+
+    // Same guarded variants as the int case: variant N uses the guard tid.x <= N-1
+    float fs1 = tid.x <= 0 ? WavePrefixProduct( fv.x ) : 0;
+    float fs2 = tid.x <= 1 ? WavePrefixProduct( fv.x ) : 0;
+    float fs3 = tid.x <= 2 ? WavePrefixProduct( fv.x ) : 0;
+    float fs4 = tid.x <= 3 ? WavePrefixProduct( fv.x ) : 0;
+    
+    float2 fv2_1 = tid.x <= 0 ? WavePrefixProduct( fv.xy ) : float2(0,0);
+    float2 fv2_2 = tid.x <= 1 ? WavePrefixProduct( fv.xy ) : float2(0,0);
+    float2 fv2_3 = tid.x <= 2 ? WavePrefixProduct( fv.xy ) : float2(0,0);
+    float2 fv2_4 = tid.x <= 3 ? WavePrefixProduct( fv.xy ) : float2(0,0);
+
+    float3 fv3_1 = tid.x <= 0 ? WavePrefixProduct( fv.xyz ) : float3(0,0,0);
+    float3 fv3_2 = tid.x <= 1 ? WavePrefixProduct( fv.xyz ) : float3(0,0,0);
+    float3 fv3_3 = tid.x <= 2 ? WavePrefixProduct( fv.xyz ) : float3(0,0,0);
+    float3 fv3_4 = tid.x <= 3 ? WavePrefixProduct( fv.xyz ) : float3(0,0,0);
+
+    float4 fv4_1 = tid.x <= 0 ? WavePrefixProduct( fv ) : float4(0,0,0,0);
+    float4 fv4_2 = tid.x <= 1 ? WavePrefixProduct( fv ) : float4(0,0,0,0);
+    float4 fv4_3 = tid.x <= 2 ? WavePrefixProduct( fv ) : float4(0,0,0,0);
+    float4 fv4_4 = tid.x <= 3 ? WavePrefixProduct( fv ) : float4(0,0,0,0);
+
+    float fscalars[4] = { fs1, fs2, fs3, fs4 };    // arrays are indexed by tid.x below
+    float2 fvec2s [4] = { fv2_1, fv2_2, fv2_3, fv2_4 };
+    float3 fvec3s [4] = { fv3_1, fv3_2, fv3_3, fv3_4 };
+    float4 fvec4s [4] = { fv4_1, fv4_2, fv4_3, fv4_4 };
+
+    FOut1[tid.x].x   = fscalars[tid.x];    // only .x is written by the shader
+    FOut2[tid.x].xy  = fvec2s[tid.x];      // only .xy is written by the shader
+    FOut3[tid.x].xyz = fvec3s[tid.x];      // only .xyz is written by the shader
+    FOut4[tid.x]     = fvec4s[tid.x];
+    FOut5[tid.x]     = WavePrefixProduct(float4(1,2,3,4));    // unguarded call on a literal argument
+}
+
+//--- pipeline.yaml
+
+---
+Shaders:
+  - Stage: Compute
+    Entry: main
+    DispatchSize: [ 1, 1, 1 ]
+Buffers:
+  - Name: In
+    Format: Int32
+    Stride: 16
+    Data: [ 2, 4, 8, 16 ]
+  - Name: Out1
+    Format: Int32
+    Stride: 16
+    FillSize: 64  
+  - Name: Out2
+    Format: Int32
+    Stride: 16
+    FillSize: 64
+  - Name: Out3
+    Format: Int32
+    Stride: 16
+    FillSize: 64
+  - Name: Out4
+    Format: Int32
+    Stride: 16
+    FillSize: 64
+  - Name: Out5
+    Format: Int32
+    Stride: 16
+    FillSize: 64
+  - Name: ExpectedOut1
+    Format: Int32
+    Stride: 16
+    Data: [ 1, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, 8, 0, 0, 0 ]
+  - Name: ExpectedOut2
+    Format: Int32
+    Stride: 16
+    Data: [ 1, 1, 0, 0, 2, 4, 0, 0, 4, 16, 0, 0, 8, 64, 0, 0 ]
+  - Name: ExpectedOut3
+    Format: Int32
+    Stride: 16
+    Data: [ 1, 1, 1, 0, 2, 4, 8, 0, 4, 16, 64, 0, 8, 64, 512, 0 ]
+  - Name: ExpectedOut4
+    Format: Int32
+    Stride: 16
+    Data: [ 1, 1, 1, 1, 2, 4, 8, 16, 4, 16, 64, 256, 8, 64, 512, 4096 ]
+  - Name: ExpectedOut5
+    Format: Int32
+    Stride: 16
+    Data: [ 1, 1, 1, 1, 1, 2, 3, 4, 1, 4, 9, 16, 1, 8, 27, 64 ]
+  - Name: UIn
+    Format: UInt32
+    Stride: 16
+    Data: [ 2, 4, 8, 16 ]
+  - Name: UOut1
+    Format: UInt32
+    Stride: 16
+    FillSize: 64 
+  - Name: UOut2
+    Format: UInt32
+    Stride: 16
+    FillSize: 64
+  - Name: UOut3
+    Format: UInt32
+    Stride: 16
+    FillSize: 64
+  - Name: UOut4
+    Format: UInt32
+    Stride: 16
+    FillSize: 64
+  - Name: UOut5
+    Format: UInt32
+    Stride: 16
+    FillSize: 64
+  - Name: UExpectedOut1
+    Format: UInt32
+    Stride: 16
+    Data: [ 1, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, 8, 0, 0, 0 ]
+  - Name: UExpectedOut2
+    Format: UInt32
+    Stride: 16
+    Data: [ 1, 1, 0, 0, 2, 4, 0, 0, 4, 16, 0, 0, 8, 64, 0, 0 ]
+  - Name: UExpectedOut3
+    Format: UInt32
+    Stride: 16
+    Data: [ 1, 1, 1, 0, 2, 4, 8, 0, 4, 16, 64, 0, 8, 64, 512, 0 ]
+  - Name: UExpectedOut4
+    Format: UInt32
+    Stride: 16
+    Data: [ 1, 1, 1, 1, 2, 4, 8, 16, 4, 16, 64, 256, 8, 64, 512, 4096 ]
+  - Name: UExpectedOut5
+    Format: UInt32
+    Stride: 16
+    Data: [ 1, 1, 1, 1, 1, 2, 3, 4, 1, 4, 9, 16, 1, 8, 27, 64 ]
+  - Name: FIn
+    Format: Float32
+    Stride: 16
+    Data: [ 2.0, 4.0, 8.0, 16.0 ]
+  - Name: FOut1
+    Format: Float32
+    Stride: 16
+    FillSize: 64
+  - Name: FOut2
+    Format: Float32
+    Stride: 16
+    FillSize: 64
+  - Name: FOut3
+    Format: Float32
+    Stride: 16
+    FillSize: 64
+  - Name: FOut4
+    Format: Float32
+    Stride: 16
+    FillSize: 64
+  - Name: FOut5
+    Format: Float32
+    Stride: 16
+    FillSize: 64
+  - Name: FExpectedOut1
+    Format: Float32
+    Stride: 16
+    Data: [ 1.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 4.0, 0.0, 0.0, 0.0, 8.0, 0.0, 0.0, 0.0 ]
+  - Name: FExpectedOut2
+    Format: Float32
+    Stride: 16
+    Data: [ 1.0, 1.0, 0.0, 0.0, 2.0, 4.0, 0.0, 0.0, 4.0, 16.0, 0.0, 0.0, 8.0, 64.0, 0.0, 0.0 ]
+  - Name: FExpectedOut3
+    Format: Float32
+    Stride: 16
+    Data: [ 1.0, 1.0, 1.0, 0.0, 2.0, 4.0, 8.0, 0.0, 4.0, 16.0, 64.0, 0.0, 8.0, 64.0, 512.0, 0.0 ]
+  - Name: FExpectedOut4
+    Format: Float32
+    Stride: 16
+    Data: [ 1.0, 1.0, 1.0, 1.0, 2.0, 4.0, 8.0, 16.0, 4.0, 16.0, 64.0, 256.0, 8.0, 64.0, 512.0, 4096.0 ]
+  - Name: FExpectedOut5
+    Format: Float32
+    Stride: 16
+    Data: [ 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 1.0, 4.0, 9.0, 16.0, 1.0, 8.0, 27.0, 64.0 ]
+Results:
+  - Result: ExpectedOut1
+    Rule: BufferExact
+    Actual: Out1
+    Expected: ExpectedOut1
+  - Result: ExpectedOut2
+    Rule: BufferExact
+    Actual: Out2
+    Expected: ExpectedOut2
+  - Result: ExpectedOut3
+    Rule: BufferExact
+    Actual: Out3
+    Expected: ExpectedOut3
+  - Result: ExpectedOut4
+    Rule: BufferExact
+    Actual: Out4
+    Expected: ExpectedOut4
+  - Result: ExpectedOut5
+    Rule: BufferExact
+    Actual: Out5
+    Expected: ExpectedOut5
+  - Result: UExpectedOut1
+    Rule: BufferExact
+    Actual: UOut1
+    Expected: UExpectedOut1
+  - Result: UExpectedOut2
+    Rule: BufferExact
+    Actual: UOut2
+    Expected: UExpectedOut2
+  - Result: UExpectedOut3
+    Rule: BufferExact
+    Actual: UOut3
+    Expected: UExpectedOut3
+  - Result: UExpectedOut4
+    Rule: BufferExact
+    Actual: UOut4
+    Expected: UExpectedOut4
+  - Result: UExpectedOut5
+    Rule: BufferExact
+    Actual: UOut5
+    Expected: UExpectedOut5
+  - Result: FExpectedOut1
+    Rule: BufferExact
+    Actual: FOut1
+    Expected: FExpectedOut1
+  - Result: FExpectedOut2
+    Rule: BufferExact
+    Actual: FOut2
+    Expected: FExpectedOut2
+  - Result: FExpectedOut3
+    Rule: BufferExact
+    Actual: FOut3
+    Expected: FExpectedOut3
+  - Result: FExpectedOut4
+    Rule: BufferExact
+    Actual: FOut4
+    Expected: FExpectedOut4
+  - Result: FExpectedOut5
+    Rule: BufferExact
+    Actual: FOut5
+    Expected: FExpectedOut5
+DescriptorSets:
+  - Resources:
+    - Name: In
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 0
+        Space: 0
+      VulkanBinding:
+        Binding: 0
+    - Name: Out1
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 1
+        Space: 0
+      VulkanBinding:
+        Binding: 1
+    - Name: Out2
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 2
+        Space: 0
+      VulkanBinding:
+        Binding: 2
+    - Name: Out3
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 3
+        Space: 0
+      VulkanBinding:
+        Binding: 3
+    - Name: Out4
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 4
+        Space: 0
+      VulkanBinding:
+        Binding: 4
+    - Name: Out5
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 5
+        Space: 0
+      VulkanBinding:
+        Binding: 5
+    - Name: UIn
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 6
+        Space: 0
+      VulkanBinding:
+        Binding: 6
+    - Name: UOut1
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 7
+        Space: 0
+      VulkanBinding:
+        Binding: 7
+    - Name: UOut2
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 8
+        Space: 0
+      VulkanBinding:
+        Binding: 8
+    - Name: UOut3
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 9
+        Space: 0
+      VulkanBinding:
+        Binding: 9
+    - Name: UOut4
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 10
+        Space: 0
+      VulkanBinding:
+        Binding: 10
+    - Name: UOut5
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 11
+        Space: 0
+      VulkanBinding:
+        Binding: 11
+    - Name: FIn
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 12
+        Space: 0
+      VulkanBinding:
+        Binding: 12
+    - Name: FOut1
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 13
+        Space: 0
+      VulkanBinding:
+        Binding: 13
+    - Name: FOut2
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 14
+        Space: 0
+      VulkanBinding:
+        Binding: 14
+    - Name: FOut3
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 15
+        Space: 0
+      VulkanBinding:
+        Binding: 15
+    - Name: FOut4
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 16
+        Space: 0
+      VulkanBinding:
+        Binding: 16
+    - Name: FOut5
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 17
+        Space: 0
+      VulkanBinding:
+        Binding: 17
+
+...
+#--- end
+
+# RUN: split-file %s %t
+# RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl
+# RUN: %offloader %t/pipeline.yaml %t.o 
diff --git a/test/WaveOps/WavePrefixProduct.fp16.test b/test/WaveOps/WavePrefixProduct.fp16.test
new file mode 100644
index 000000000..d8cd5110b
--- /dev/null
+++ b/test/WaveOps/WavePrefixProduct.fp16.test
@@ -0,0 +1,173 @@
+#--- source.hlsl
+StructuredBuffer<half4> In  : register(t0);    // input; main reads element 0 only
+RWStructuredBuffer<half4> Out1 : register(u1); // scalar result, written to .x
+RWStructuredBuffer<half4> Out2 : register(u2); // half2 result, written to .xy
+RWStructuredBuffer<half4> Out3 : register(u3); // half3 result, written to .xyz
+RWStructuredBuffer<half4> Out4 : register(u4); // half4 result
+RWStructuredBuffer<half4> Out5 : register(u5); // literal half4(1,2,3,4) argument (constant-folding case)
+
+[numthreads(4,1,1)]
+void main(uint3 tid : SV_GroupThreadID)
+{
+    half4 v = In[0];    // constant index: every thread feeds the same values
+    // Variant N (s<N>, v2_<N>, ...) guards the call with tid.x <= N-1; thread t later stores
+    // variant t+1 (array index t), i.e. the one whose guard passes for threads 0..t.
+    half s1 = tid.x <= 0 ? WavePrefixProduct( v.x ) : 0;
+    half s2 = tid.x <= 1 ? WavePrefixProduct( v.x ) : 0;
+    half s3 = tid.x <= 2 ? WavePrefixProduct( v.x ) : 0;
+    half s4 = tid.x <= 3 ? WavePrefixProduct( v.x ) : 0;
+
+    half2 v2_1 = tid.x <= 0 ? WavePrefixProduct( v.xy ) : half2(0,0);
+    half2 v2_2 = tid.x <= 1 ? WavePrefixProduct( v.xy ) : half2(0,0);
+    half2 v2_3 = tid.x <= 2 ? WavePrefixProduct( v.xy ) : half2(0,0);
+    half2 v2_4 = tid.x <= 3 ? WavePrefixProduct( v.xy ) : half2(0,0);
+
+    half3 v3_1 = tid.x <= 0 ? WavePrefixProduct( v.xyz ) : half3(0,0,0);
+    half3 v3_2 = tid.x <= 1 ? WavePrefixProduct( v.xyz ) : half3(0,0,0);
+    half3 v3_3 = tid.x <= 2 ? WavePrefixProduct( v.xyz ) : half3(0,0,0);
+    half3 v3_4 = tid.x <= 3 ? WavePrefixProduct( v.xyz ) : half3(0,0,0);
+
+    half4 v4_1 = tid.x <= 0 ? WavePrefixProduct( v ) : half4(0,0,0,0);
+    half4 v4_2 = tid.x <= 1 ? WavePrefixProduct( v ) : half4(0,0,0,0);
+    half4 v4_3 = tid.x <= 2 ? WavePrefixProduct( v ) : half4(0,0,0,0);
+    half4 v4_4 = tid.x <= 3 ? WavePrefixProduct( v ) : half4(0,0,0,0);
+
+    half scalars[4] = { s1, s2, s3, s4 };    // arrays are indexed by tid.x below
+    half2 vec2s [4] = { v2_1, v2_2, v2_3, v2_4 };
+    half3 vec3s [4] = { v3_1, v3_2, v3_3, v3_4 };
+    half4 vec4s [4] = { v4_1, v4_2, v4_3, v4_4 };
+
+    Out1[tid.x].x   = scalars[tid.x];    // only .x is written by the shader
+    Out2[tid.x].xy  = vec2s[tid.x];      // only .xy is written by the shader
+    Out3[tid.x].xyz = vec3s[tid.x];      // only .xyz is written by the shader
+    Out4[tid.x]     = vec4s[tid.x];
+    Out5[tid.x]     = WavePrefixProduct(half4(1,2,3,4));    // unguarded call on a literal argument
+}
+
+//--- pipeline.yaml
+
+---
+Shaders:
+  - Stage: Compute
+    Entry: main
+    DispatchSize: [ 1, 1, 1 ]
+Buffers:
+  - Name: In
+    Format: Float16
+    Stride: 8
+    # Half-precision bit patterns for [ 2.0, 4.0, 8.0, 16.0 ]
+    Data: [ 0x4000, 0x4400, 0x4800, 0x4c00 ]
+  - Name: Out1
+    Format: Float16
+    Stride: 8
+    FillSize: 32
+  - Name: Out2
+    Format: Float16
+    Stride: 8
+    FillSize: 32
+  - Name: Out3
+    Format: Float16
+    Stride: 8
+    FillSize: 32
+  - Name: Out4
+    Format: Float16
+    Stride: 8
+    FillSize: 32
+  - Name: Out5
+    Format: Float16
+    Stride: 8
+    FillSize: 32
+  - Name: ExpectedOut1
+    Format: Float16
+    Stride: 8
+    Data: [ 0x3c00, 0x0, 0x0, 0x0, 0x4000, 0x0, 0x0, 0x0, 0x4400, 0x0, 0x0, 0x0, 0x4800, 0x0, 0x0, 0x0 ]
+  - Name: ExpectedOut2
+    Format: Float16
+    Stride: 8
+    Data: [ 0x3c00, 0x3c00, 0x0, 0x0, 0x4000, 0x4400, 0x0, 0x0, 0x4400, 0x4c00, 0x0, 0x0, 0x4800, 0x5400, 0x0, 0x0 ]
+  - Name: ExpectedOut3
+    Format: Float16
+    Stride: 8
+    Data: [ 0x3c00, 0x3c00, 0x3c00, 0x0, 0x4000, 0x4400, 0x4800, 0x0, 0x4400, 0x4c00, 0x5400, 0x0, 0x4800, 0x5400, 0x6000, 0x0 ]
+  - Name: ExpectedOut4
+    Format: Float16
+    Stride: 8
+    Data: [ 0x3c00, 0x3c00, 0x3c00, 0x3c00, 0x4000, 0x4400, 0x4800, 0x4c00, 0x4400, 0x4c00, 0x5400, 0x5c00, 0x4800, 0x5400, 0x6000, 0x6c00 ]
+  - Name: ExpectedOut5
+    Format: Float16
+    Stride: 8
+    Data: [ 0x3c00, 0x3c00, 0x3c00, 0x3c00, 0x3c00, 0x4000, 0x4200, 0x4400, 0x3c00, 0x4400, 0x4880, 0x4c00, 0x3c00, 0x4800, 0x4ec0, 0x5400 ]
+Results:
+  - Result: ExpectedOut1
+    Rule: BufferExact
+    Actual: Out1
+    Expected: ExpectedOut1
+  - Result: ExpectedOut2
+    Rule: BufferExact
+    Actual: Out2
+    Expected: ExpectedOut2
+  - Result: ExpectedOut3
+    Rule: BufferExact
+    Actual: Out3
+    Expected: ExpectedOut3
+  - Result: ExpectedOut4
+    Rule: BufferExact
+    Actual: Out4
+    Expected: ExpectedOut4
+  - Result: ExpectedOut5
+    Rule: BufferExact
+    Actual: Out5
+    Expected: ExpectedOut5
+DescriptorSets:
+  - Resources:
+    - Name: In
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 0
+        Space: 0
+      VulkanBinding:
+        Binding: 0
+    - Name: Out1
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 1
+        Space: 0
+      VulkanBinding:
+        Binding: 1
+    - Name: Out2
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 2
+        Space: 0
+      VulkanBinding:
+        Binding: 2
+    - Name: Out3
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 3
+        Space: 0
+      VulkanBinding:
+        Binding: 3
+    - Name: Out4
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 4
+        Space: 0
+      VulkanBinding:
+        Binding: 4
+    - Name: Out5
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 5
+        Space: 0
+      VulkanBinding:
+        Binding: 5
+
+...
+#--- end
+
+# REQUIRES: Half
+
+# RUN: split-file %s %t
+# RUN: %dxc_target -enable-16bit-types -T cs_6_5 -Fo %t.o %t/source.hlsl
+# RUN: %offloader %t/pipeline.yaml %t.o
diff --git a/test/WaveOps/WavePrefixProduct.fp64.test b/test/WaveOps/WavePrefixProduct.fp64.test
new file mode 100644
index 000000000..03c0dd890
--- /dev/null
+++ b/test/WaveOps/WavePrefixProduct.fp64.test
@@ -0,0 +1,172 @@
+#--- source.hlsl
+StructuredBuffer<double4> In  : register(t0);    // input; main reads element 0 only
+RWStructuredBuffer<double4> Out1 : register(u1); // scalar result, written to .x
+RWStructuredBuffer<double4> Out2 : register(u2); // double2 result, written to .xy
+RWStructuredBuffer<double4> Out3 : register(u3); // double3 result, written to .xyz
+RWStructuredBuffer<double4> Out4 : register(u4); // double4 result
+RWStructuredBuffer<double4> Out5 : register(u5); // literal double4(1,2,3,4) argument (constant-folding case)
+
+[numthreads(4,1,1)]
+void main(uint3 tid : SV_GroupThreadID)
+{
+    double4 v = In[0];    // constant index: every thread feeds the same values
+    // Variant N (s<N>, v2_<N>, ...) guards the call with tid.x <= N-1; thread t later stores
+    // variant t+1 (array index t), i.e. the one whose guard passes for threads 0..t.
+    double s1 = tid.x <= 0 ? WavePrefixProduct( v.x ) : 0;
+    double s2 = tid.x <= 1 ? WavePrefixProduct( v.x ) : 0;
+    double s3 = tid.x <= 2 ? WavePrefixProduct( v.x ) : 0;
+    double s4 = tid.x <= 3 ? WavePrefixProduct( v.x ) : 0;
+
+    double2 v2_1 = tid.x <= 0 ? WavePrefixProduct( v.xy ) : double2(0,0);
+    double2 v2_2 = tid.x <= 1 ? WavePrefixProduct( v.xy ) : double2(0,0);
+    double2 v2_3 = tid.x <= 2 ? WavePrefixProduct( v.xy ) : double2(0,0);
+    double2 v2_4 = tid.x <= 3 ? WavePrefixProduct( v.xy ) : double2(0,0);
+
+    double3 v3_1 = tid.x <= 0 ? WavePrefixProduct( v.xyz ) : double3(0,0,0);
+    double3 v3_2 = tid.x <= 1 ? WavePrefixProduct( v.xyz ) : double3(0,0,0);
+    double3 v3_3 = tid.x <= 2 ? WavePrefixProduct( v.xyz ) : double3(0,0,0);
+    double3 v3_4 = tid.x <= 3 ? WavePrefixProduct( v.xyz ) : double3(0,0,0);
+
+    double4 v4_1 = tid.x <= 0 ? WavePrefixProduct( v ) : double4(0,0,0,0);
+    double4 v4_2 = tid.x <= 1 ? WavePrefixProduct( v ) : double4(0,0,0,0);
+    double4 v4_3 = tid.x <= 2 ? WavePrefixProduct( v ) : double4(0,0,0,0);
+    double4 v4_4 = tid.x <= 3 ? WavePrefixProduct( v ) : double4(0,0,0,0);
+
+    double scalars[4] = { s1, s2, s3, s4 };    // arrays are indexed by tid.x below
+    double2 vec2s [4] = { v2_1, v2_2, v2_3, v2_4 };
+    double3 vec3s [4] = { v3_1, v3_2, v3_3, v3_4 };
+    double4 vec4s [4] = { v4_1, v4_2, v4_3, v4_4 };
+
+    Out1[tid.x].x   = scalars[tid.x];    // only .x is written by the shader
+    Out2[tid.x].xy  = vec2s[tid.x];      // only .xy is written by the shader
+    Out3[tid.x].xyz = vec3s[tid.x];      // only .xyz is written by the shader
+    Out4[tid.x]     = vec4s[tid.x];
+    Out5[tid.x]     = WavePrefixProduct(double4(1,2,3,4));    // unguarded call on a literal argument
+}
+
+//--- pipeline.yaml
+
+---
+Shaders:
+  - Stage: Compute
+    Entry: main
+    DispatchSize: [ 1, 1, 1 ]
+Buffers:
+  - Name: In
+    Format: Float64
+    Stride: 32
+    Data: [ 2.0, 4.0, 8.0, 16.0 ]
+  - Name: Out1
+    Format: Float64
+    Stride: 32
+    FillSize: 128
+  - Name: Out2
+    Format: Float64
+    Stride: 32
+    FillSize: 128
+  - Name: Out3
+    Format: Float64
+    Stride: 32
+    FillSize: 128
+  - Name: Out4
+    Format: Float64
+    Stride: 32
+    FillSize: 128
+  - Name: Out5
+    Format: Float64
+    Stride: 32
+    FillSize: 128
+  - Name: ExpectedOut1
+    Format: Float64
+    Stride: 32
+    Data: [ 1.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 4.0, 0.0, 0.0, 0.0, 8.0, 0.0, 0.0, 0.0 ]
+  - Name: ExpectedOut2
+    Format: Float64
+    Stride: 32
+    Data: [ 1.0, 1.0, 0.0, 0.0, 2.0, 4.0, 0.0, 0.0, 4.0, 16.0, 0.0, 0.0, 8.0, 64.0, 0.0, 0.0 ]
+  - Name: ExpectedOut3
+    Format: Float64
+    Stride: 32
+    Data: [ 1.0, 1.0, 1.0, 0.0, 2.0, 4.0, 8.0, 0.0, 4.0, 16.0, 64.0, 0.0, 8.0, 64.0, 512.0, 0.0 ]
+  - Name: ExpectedOut4
+    Format: Float64
+    Stride: 32
+    Data: [ 1.0, 1.0, 1.0, 1.0, 2.0, 4.0, 8.0, 16.0, 4.0, 16.0, 64.0, 256.0, 8.0, 64.0, 512.0, 4096.0 ]
+  - Name: ExpectedOut5
+    Format: Float64
+    Stride: 32
+    Data: [ 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 1.0, 4.0, 9.0, 16.0, 1.0, 8.0, 27.0, 64.0 ]
+Results:
+  - Result: ExpectedOut1
+    Rule: BufferExact
+    Actual: Out1
+    Expected: ExpectedOut1
+  - Result: ExpectedOut2
+    Rule: BufferExact
+    Actual: Out2
+    Expected: ExpectedOut2
+  - Result: ExpectedOut3
+    Rule: BufferExact
+    Actual: Out3
+    Expected: ExpectedOut3
+  - Result: ExpectedOut4
+    Rule: BufferExact
+    Actual: Out4
+    Expected: ExpectedOut4
+  - Result: ExpectedOut5
+    Rule: BufferExact
+    Actual: Out5
+    Expected: ExpectedOut5
+DescriptorSets:
+  - Resources:
+    - Name: In
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 0
+        Space: 0
+      VulkanBinding:
+        Binding: 0
+    - Name: Out1
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 1
+        Space: 0
+      VulkanBinding:
+        Binding: 1
+    - Name: Out2
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 2
+        Space: 0
+      VulkanBinding:
+        Binding: 2
+    - Name: Out3
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 3
+        Space: 0
+      VulkanBinding:
+        Binding: 3
+    - Name: Out4
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 4
+        Space: 0
+      VulkanBinding:
+        Binding: 4
+    - Name: Out5
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 5
+        Space: 0
+      VulkanBinding:
+        Binding: 5
+
+...
+#--- end
+
+# REQUIRES: Double
+
+# RUN: split-file %s %t
+# RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl
+# RUN: %offloader %t/pipeline.yaml %t.o
diff --git a/test/WaveOps/WavePrefixProduct.int16.test b/test/WaveOps/WavePrefixProduct.int16.test
new file mode 100644
index 000000000..c6a592d5d
--- /dev/null
+++ b/test/WaveOps/WavePrefixProduct.int16.test
@@ -0,0 +1,323 @@
+#--- source.hlsl
+// Signed 16-bit integer resources
+StructuredBuffer<int16_t4> In  : register(t0);      // input; main reads element 0 only
+RWStructuredBuffer<int16_t4> Out1 : register(u1);   // scalar result, written to .x
+RWStructuredBuffer<int16_t4> Out2 : register(u2);   // int16_t2 result, written to .xy
+RWStructuredBuffer<int16_t4> Out3 : register(u3);   // int16_t3 result, written to .xyz
+RWStructuredBuffer<int16_t4> Out4 : register(u4);   // int16_t4 result
+RWStructuredBuffer<int16_t4> Out5 : register(u5);   // result for a literal int16_t4(1,2,3,4) argument
+
+// Unsigned 16-bit integer resources (same layout as the signed set)
+StructuredBuffer<uint16_t4> UIn  : register(t6);    // input; main reads element 0 only
+RWStructuredBuffer<uint16_t4> UOut1 : register(u7);   // scalar result, written to .x
+RWStructuredBuffer<uint16_t4> UOut2 : register(u8);   // uint16_t2 result, written to .xy
+RWStructuredBuffer<uint16_t4> UOut3 : register(u9);   // uint16_t3 result, written to .xyz
+RWStructuredBuffer<uint16_t4> UOut4 : register(u10);  // uint16_t4 result
+RWStructuredBuffer<uint16_t4> UOut5 : register(u11);  // result for a literal uint16_t4(1,2,3,4) argument
+
+[numthreads(4,1,1)]
+void main(uint3 tid : SV_GroupThreadID)
+{
+    int16_t4 v = In[0];    // constant index: every thread feeds the same values
+    // Variant N (s<N>, v2_<N>, ...) guards the call with tid.x <= N-1; thread t later stores
+    // variant t+1 (array index t), i.e. the one whose guard passes for threads 0..t.
+    int16_t s1 = tid.x <= 0 ? WavePrefixProduct( v.x ) : 0;
+    int16_t s2 = tid.x <= 1 ? WavePrefixProduct( v.x ) : 0;
+    int16_t s3 = tid.x <= 2 ? WavePrefixProduct( v.x ) : 0;
+    int16_t s4 = tid.x <= 3 ? WavePrefixProduct( v.x ) : 0;
+
+    int16_t2 v2_1 = tid.x <= 0 ? WavePrefixProduct( v.xy ) : int16_t2(0,0);
+    int16_t2 v2_2 = tid.x <= 1 ? WavePrefixProduct( v.xy ) : int16_t2(0,0);
+    int16_t2 v2_3 = tid.x <= 2 ? WavePrefixProduct( v.xy ) : int16_t2(0,0);
+    int16_t2 v2_4 = tid.x <= 3 ? WavePrefixProduct( v.xy ) : int16_t2(0,0);
+
+    int16_t3 v3_1 = tid.x <= 0 ? WavePrefixProduct( v.xyz ) : int16_t3(0,0,0);
+    int16_t3 v3_2 = tid.x <= 1 ? WavePrefixProduct( v.xyz ) : int16_t3(0,0,0);
+    int16_t3 v3_3 = tid.x <= 2 ? WavePrefixProduct( v.xyz ) : int16_t3(0,0,0);
+    int16_t3 v3_4 = tid.x <= 3 ? WavePrefixProduct( v.xyz ) : int16_t3(0,0,0);
+
+    int16_t4 v4_1 = tid.x <= 0 ? WavePrefixProduct( v ) : int16_t4(0,0,0,0);
+    int16_t4 v4_2 = tid.x <= 1 ? WavePrefixProduct( v ) : int16_t4(0,0,0,0);
+    int16_t4 v4_3 = tid.x <= 2 ? WavePrefixProduct( v ) : int16_t4(0,0,0,0);
+    int16_t4 v4_4 = tid.x <= 3 ? WavePrefixProduct( v ) : int16_t4(0,0,0,0);
+
+    int16_t scalars[4] = { s1, s2, s3, s4 };    // arrays are indexed by tid.x below
+    int16_t2 vec2s [4] = { v2_1, v2_2, v2_3, v2_4 };
+    int16_t3 vec3s [4] = { v3_1, v3_2, v3_3, v3_4 };
+    int16_t4 vec4s [4] = { v4_1, v4_2, v4_3, v4_4 };
+
+    Out1[tid.x].x   = scalars[tid.x];    // only .x is written by the shader
+    Out2[tid.x].xy  = vec2s[tid.x];      // only .xy is written by the shader
+    Out3[tid.x].xyz = vec3s[tid.x];      // only .xyz is written by the shader
+    Out4[tid.x]     = vec4s[tid.x];
+    Out5[tid.x]     = WavePrefixProduct(int16_t4(1,2,3,4));    // unguarded call on a literal argument
+
+    // uint16_t case: identical sequence on UIn, results go to UOut1..UOut5
+
+    uint16_t4 uv = UIn[0];    // constant index: every thread feeds the same values
+
+    // Same guarded variants as the int16_t case: variant N uses the guard tid.x <= N-1
+    uint16_t us1 = tid.x <= 0 ? WavePrefixProduct( uv.x ) : 0;
+    uint16_t us2 = tid.x <= 1 ? WavePrefixProduct( uv.x ) : 0;
+    uint16_t us3 = tid.x <= 2 ? WavePrefixProduct( uv.x ) : 0;
+    uint16_t us4 = tid.x <= 3 ? WavePrefixProduct( uv.x ) : 0;
+
+    uint16_t2 uv2_1 = tid.x <= 0 ? WavePrefixProduct( uv.xy ) : uint16_t2(0,0);
+    uint16_t2 uv2_2 = tid.x <= 1 ? WavePrefixProduct( uv.xy ) : uint16_t2(0,0);
+    uint16_t2 uv2_3 = tid.x <= 2 ? WavePrefixProduct( uv.xy ) : uint16_t2(0,0);
+    uint16_t2 uv2_4 = tid.x <= 3 ? WavePrefixProduct( uv.xy ) : uint16_t2(0,0);
+
+    uint16_t3 uv3_1 = tid.x <= 0 ? WavePrefixProduct( uv.xyz ) : uint16_t3(0,0,0);
+    uint16_t3 uv3_2 = tid.x <= 1 ? WavePrefixProduct( uv.xyz ) : uint16_t3(0,0,0);
+    uint16_t3 uv3_3 = tid.x <= 2 ? WavePrefixProduct( uv.xyz ) : uint16_t3(0,0,0);
+    uint16_t3 uv3_4 = tid.x <= 3 ? WavePrefixProduct( uv.xyz ) : uint16_t3(0,0,0);
+
+    uint16_t4 uv4_1 = tid.x <= 0 ? WavePrefixProduct( uv ) : uint16_t4(0,0,0,0);
+    uint16_t4 uv4_2 = tid.x <= 1 ? WavePrefixProduct( uv ) : uint16_t4(0,0,0,0);
+    uint16_t4 uv4_3 = tid.x <= 2 ? WavePrefixProduct( uv ) : uint16_t4(0,0,0,0);
+    uint16_t4 uv4_4 = tid.x <= 3 ? WavePrefixProduct( uv ) : uint16_t4(0,0,0,0);
+
+    uint16_t uscalars[4] = { us1, us2, us3, us4 };    // arrays are indexed by tid.x below
+    uint16_t2 uvec2s [4] = { uv2_1, uv2_2, uv2_3, uv2_4 };
+    uint16_t3 uvec3s [4] = { uv3_1, uv3_2, uv3_3, uv3_4 };
+    uint16_t4 uvec4s [4] = { uv4_1, uv4_2, uv4_3, uv4_4 };
+
+    UOut1[tid.x].x   = uscalars[tid.x];    // only .x is written by the shader
+    UOut2[tid.x].xy  = uvec2s[tid.x];      // only .xy is written by the shader
+    UOut3[tid.x].xyz = uvec3s[tid.x];      // only .xyz is written by the shader
+    UOut4[tid.x]     = uvec4s[tid.x];
+    UOut5[tid.x]     = WavePrefixProduct(uint16_t4(1,2,3,4));    // unguarded call on a literal argument
+}
+
+//--- pipeline.yaml
+
+---
+Shaders:
+  - Stage: Compute
+    Entry: main
+    DispatchSize: [ 1, 1, 1 ]
+Buffers:
+  - Name: In
+    Format: Int16
+    Stride: 8
+    Data: [ 2, 4, 8, 16 ]
+  - Name: Out1
+    Format: Int16
+    Stride: 8
+    FillSize: 32  
+  - Name: Out2
+    Format: Int16
+    Stride: 8
+    FillSize: 32
+  - Name: Out3
+    Format: Int16
+    Stride: 8
+    FillSize: 32
+  - Name: Out4
+    Format: Int16
+    Stride: 8
+    FillSize: 32
+  - Name: Out5
+    Format: Int16
+    Stride: 8
+    FillSize: 32
+  - Name: ExpectedOut1
+    Format: Int16
+    Stride: 8
+    Data: [ 1, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, 8, 0, 0, 0 ]
+  - Name: ExpectedOut2
+    Format: Int16
+    Stride: 8
+    Data: [ 1, 1, 0, 0, 2, 4, 0, 0, 4, 16, 0, 0, 8, 64, 0, 0 ]
+  - Name: ExpectedOut3
+    Format: Int16
+    Stride: 8
+    Data: [ 1, 1, 1, 0, 2, 4, 8, 0, 4, 16, 64, 0, 8, 64, 512, 0 ]
+  - Name: ExpectedOut4
+    Format: Int16
+    Stride: 8
+    Data: [ 1, 1, 1, 1, 2, 4, 8, 16, 4, 16, 64, 256, 8, 64, 512, 4096 ]
+  - Name: ExpectedOut5
+    Format: Int16
+    Stride: 8
+    Data: [ 1, 1, 1, 1, 1, 2, 3, 4, 1, 4, 9, 16, 1, 8, 27, 64 ]
+  - Name: UIn
+    Format: UInt16
+    Stride: 8
+    Data: [ 2, 4, 8, 16 ]
+  - Name: UOut1
+    Format: UInt16
+    Stride: 8
+    FillSize: 32  
+  - Name: UOut2
+    Format: UInt16
+    Stride: 8
+    FillSize: 32
+  - Name: UOut3
+    Format: UInt16
+    Stride: 8
+    FillSize: 32
+  - Name: UOut4
+    Format: UInt16
+    Stride: 8
+    FillSize: 32
+  - Name: UOut5
+    Format: UInt16
+    Stride: 8
+    FillSize: 32
+  - Name: UExpectedOut1
+    Format: UInt16
+    Stride: 8
+    Data: [ 1, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, 8, 0, 0, 0 ]
+  - Name: UExpectedOut2
+    Format: UInt16
+    Stride: 8
+    Data: [ 1, 1, 0, 0, 2, 4, 0, 0, 4, 16, 0, 0, 8, 64, 0, 0 ]
+  - Name: UExpectedOut3
+    Format: UInt16
+    Stride: 8
+    Data: [ 1, 1, 1, 0, 2, 4, 8, 0, 4, 16, 64, 0, 8, 64, 512, 0 ]
+  - Name: UExpectedOut4
+    Format: UInt16
+    Stride: 8
+    Data: [ 1, 1, 1, 1, 2, 4, 8, 16, 4, 16, 64, 256, 8, 64, 512, 4096 ]
+  - Name: UExpectedOut5
+    Format: UInt16
+    Stride: 8
+    Data: [ 1, 1, 1, 1, 1, 2, 3, 4, 1, 4, 9, 16, 1, 8, 27, 64 ]
+Results:
+  - Result: ExpectedOut1
+    Rule: BufferExact
+    Actual: Out1
+    Expected: ExpectedOut1
+  - Result: ExpectedOut2
+    Rule: BufferExact
+    Actual: Out2
+    Expected: ExpectedOut2
+  - Result: ExpectedOut3
+    Rule: BufferExact
+    Actual: Out3
+    Expected: ExpectedOut3
+  - Result: ExpectedOut4
+    Rule: BufferExact
+    Actual: Out4
+    Expected: ExpectedOut4
+  - Result: ExpectedOut5
+    Rule: BufferExact
+    Actual: Out5
+    Expected: ExpectedOut5
+  - Result: UExpectedOut1
+    Rule: BufferExact
+    Actual: UOut1
+    Expected: UExpectedOut1
+  - Result: UExpectedOut2
+    Rule: BufferExact
+    Actual: UOut2
+    Expected: UExpectedOut2
+  - Result: UExpectedOut3
+    Rule: BufferExact
+    Actual: UOut3
+    Expected: UExpectedOut3
+  - Result: UExpectedOut4
+    Rule: BufferExact
+    Actual: UOut4
+    Expected: UExpectedOut4
+  - Result: UExpectedOut5
+    Rule: BufferExact
+    Actual: UOut5
+    Expected: UExpectedOut5
+DescriptorSets:
+  - Resources:
+    - Name: In
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 0
+        Space: 0
+      VulkanBinding:
+        Binding: 0
+    - Name: Out1
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 1
+        Space: 0
+      VulkanBinding:
+        Binding: 1
+    - Name: Out2
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 2
+        Space: 0
+      VulkanBinding:
+        Binding: 2
+    - Name: Out3
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 3
+        Space: 0
+      VulkanBinding:
+        Binding: 3
+    - Name: Out4
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 4
+        Space: 0
+      VulkanBinding:
+        Binding: 4
+    - Name: Out5
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 5
+        Space: 0
+      VulkanBinding:
+        Binding: 5
+    - Name: UIn
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 6
+        Space: 0
+      VulkanBinding:
+        Binding: 6
+    - Name: UOut1
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 7
+        Space: 0
+      VulkanBinding:
+        Binding: 7
+    - Name: UOut2
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 8
+        Space: 0
+      VulkanBinding:
+        Binding: 8
+    - Name: UOut3
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 9
+        Space: 0
+      VulkanBinding:
+        Binding: 9
+    - Name: UOut4
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 10
+        Space: 0
+      VulkanBinding:
+        Binding: 10
+    - Name: UOut5
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 11
+        Space: 0
+      VulkanBinding:
+        Binding: 11
+
+...
+#--- end
+
+# REQUIRES: Int16
+
+# RUN: split-file %s %t
+# RUN: %dxc_target -enable-16bit-types -T cs_6_5 -Fo %t.o %t/source.hlsl
+# RUN: %offloader %t/pipeline.yaml %t.o 
diff --git a/test/WaveOps/WavePrefixProduct.int64.test b/test/WaveOps/WavePrefixProduct.int64.test
new file mode 100644
index 000000000..319c186b6
--- /dev/null
+++ b/test/WaveOps/WavePrefixProduct.int64.test
@@ -0,0 +1,322 @@
+#--- source.hlsl
+StructuredBuffer<int64_t4> In  : register(t0);       // signed input; main reads In[0] only
+RWStructuredBuffer<int64_t4> Out1 : register(u1); // scalar result, written to .x
+RWStructuredBuffer<int64_t4> Out2 : register(u2); // int64_t2 result, written to .xy
+RWStructuredBuffer<int64_t4> Out3 : register(u3); // int64_t3 result, written to .xyz
+RWStructuredBuffer<int64_t4> Out4 : register(u4); // int64_t4 result, all components
+RWStructuredBuffer<int64_t4> Out5 : register(u5); // literal operand (constant folding)
+
+// Unsigned (uint64_t) counterparts: UIn/UOutN are used the same way as In/OutN.
+StructuredBuffer<uint64_t4> UIn  : register(t6);      // unsigned input; main reads UIn[0] only
+RWStructuredBuffer<uint64_t4> UOut1 : register(u7);  // scalar result, written to .x
+RWStructuredBuffer<uint64_t4> UOut2 : register(u8);  // uint64_t2 result, written to .xy
+RWStructuredBuffer<uint64_t4> UOut3 : register(u9);  // uint64_t3 result, written to .xyz
+RWStructuredBuffer<uint64_t4> UOut4 : register(u10); // uint64_t4 result, all components
+RWStructuredBuffer<uint64_t4> UOut5 : register(u11); // literal operand (constant folding)
+
+[numthreads(4,1,1)]
+void main(uint3 tid : SV_GroupThreadID)
+{
+    int64_t4 v = In[0];
+
+    // Slot k of each array below is guarded by tid.x <= k (0 otherwise); thread k later reads slot k, so it always stores the wave-op result.
+    int64_t s1 = tid.x <= 0 ? WavePrefixProduct( v.x ) : 0;
+    int64_t s2 = tid.x <= 1 ? WavePrefixProduct( v.x ) : 0;
+    int64_t s3 = tid.x <= 2 ? WavePrefixProduct( v.x ) : 0;
+    int64_t s4 = tid.x <= 3 ? WavePrefixProduct( v.x ) : 0;
+
+    int64_t2 v2_1 = tid.x <= 0 ? WavePrefixProduct( v.xy ) : int64_t2(0,0);
+    int64_t2 v2_2 = tid.x <= 1 ? WavePrefixProduct( v.xy ) : int64_t2(0,0);
+    int64_t2 v2_3 = tid.x <= 2 ? WavePrefixProduct( v.xy ) : int64_t2(0,0);
+    int64_t2 v2_4 = tid.x <= 3 ? WavePrefixProduct( v.xy ) : int64_t2(0,0);
+
+    int64_t3 v3_1 = tid.x <= 0 ? WavePrefixProduct( v.xyz ) : int64_t3(0,0,0);
+    int64_t3 v3_2 = tid.x <= 1 ? WavePrefixProduct( v.xyz ) : int64_t3(0,0,0);
+    int64_t3 v3_3 = tid.x <= 2 ? WavePrefixProduct( v.xyz ) : int64_t3(0,0,0);
+    int64_t3 v3_4 = tid.x <= 3 ? WavePrefixProduct( v.xyz ) : int64_t3(0,0,0);
+
+    int64_t4 v4_1 = tid.x <= 0 ? WavePrefixProduct( v ) : int64_t4(0,0,0,0);
+    int64_t4 v4_2 = tid.x <= 1 ? WavePrefixProduct( v ) : int64_t4(0,0,0,0);
+    int64_t4 v4_3 = tid.x <= 2 ? WavePrefixProduct( v ) : int64_t4(0,0,0,0);
+    int64_t4 v4_4 = tid.x <= 3 ? WavePrefixProduct( v ) : int64_t4(0,0,0,0);
+
+    int64_t scalars[4] = { s1, s2, s3, s4 };
+    int64_t2 vec2s [4] = { v2_1, v2_2, v2_3, v2_4 };
+    int64_t3 vec3s [4] = { v3_1, v3_2, v3_3, v3_4 };
+    int64_t4 vec4s [4] = { v4_1, v4_2, v4_3, v4_4 };
+
+    Out1[tid.x].x   = scalars[tid.x];   // only .x is written
+    Out2[tid.x].xy  = vec2s[tid.x];     // only .xy is written
+    Out3[tid.x].xyz = vec3s[tid.x];     // only .xyz is written
+    Out4[tid.x]     = vec4s[tid.x];
+    Out5[tid.x]     = WavePrefixProduct(int64_t4(1,2,3,4)); // unguarded call on a literal vector
+
+    // Same sequence again for the uint64_t input.
+
+    uint64_t4 uv = UIn[0];
+
+    // Slot k of each array below is guarded by tid.x <= k (0 otherwise); thread k later reads slot k, so it always stores the wave-op result.
+    uint64_t us1 = tid.x <= 0 ? WavePrefixProduct( uv.x ) : 0;
+    uint64_t us2 = tid.x <= 1 ? WavePrefixProduct( uv.x ) : 0;
+    uint64_t us3 = tid.x <= 2 ? WavePrefixProduct( uv.x ) : 0;
+    uint64_t us4 = tid.x <= 3 ? WavePrefixProduct( uv.x ) : 0;
+
+    uint64_t2 uv2_1 = tid.x <= 0 ? WavePrefixProduct( uv.xy ) : uint64_t2(0,0);
+    uint64_t2 uv2_2 = tid.x <= 1 ? WavePrefixProduct( uv.xy ) : uint64_t2(0,0);
+    uint64_t2 uv2_3 = tid.x <= 2 ? WavePrefixProduct( uv.xy ) : uint64_t2(0,0);
+    uint64_t2 uv2_4 = tid.x <= 3 ? WavePrefixProduct( uv.xy ) : uint64_t2(0,0);
+
+    uint64_t3 uv3_1 = tid.x <= 0 ? WavePrefixProduct( uv.xyz ) : uint64_t3(0,0,0);
+    uint64_t3 uv3_2 = tid.x <= 1 ? WavePrefixProduct( uv.xyz ) : uint64_t3(0,0,0);
+    uint64_t3 uv3_3 = tid.x <= 2 ? WavePrefixProduct( uv.xyz ) : uint64_t3(0,0,0);
+    uint64_t3 uv3_4 = tid.x <= 3 ? WavePrefixProduct( uv.xyz ) : uint64_t3(0,0,0);
+
+    uint64_t4 uv4_1 = tid.x <= 0 ? WavePrefixProduct( uv ) : uint64_t4(0,0,0,0);
+    uint64_t4 uv4_2 = tid.x <= 1 ? WavePrefixProduct( uv ) : uint64_t4(0,0,0,0);
+    uint64_t4 uv4_3 = tid.x <= 2 ? WavePrefixProduct( uv ) : uint64_t4(0,0,0,0);
+    uint64_t4 uv4_4 = tid.x <= 3 ? WavePrefixProduct( uv ) : uint64_t4(0,0,0,0);
+
+    uint64_t uscalars[4] = { us1, us2, us3, us4 };
+    uint64_t2 uvec2s [4] = { uv2_1, uv2_2, uv2_3, uv2_4 };
+    uint64_t3 uvec3s [4] = { uv3_1, uv3_2, uv3_3, uv3_4 };
+    uint64_t4 uvec4s [4] = { uv4_1, uv4_2, uv4_3, uv4_4 };
+
+    UOut1[tid.x].x   = uscalars[tid.x];   // only .x is written
+    UOut2[tid.x].xy  = uvec2s[tid.x];     // only .xy is written
+    UOut3[tid.x].xyz = uvec3s[tid.x];     // only .xyz is written
+    UOut4[tid.x]     = uvec4s[tid.x];
+    UOut5[tid.x]     = WavePrefixProduct(uint64_t4(1,2,3,4)); // unguarded call on a literal vector
+}
+
+//--- pipeline.yaml
+
+---
+Shaders:
+  - Stage: Compute
+    Entry: main
+    DispatchSize: [ 1, 1, 1 ]
+Buffers:
+  - Name: In
+    Format: Int64
+    Stride: 32
+    Data: [ 2, 4, 8, 16 ]
+  - Name: Out1
+    Format: Int64
+    Stride: 32
+    FillSize: 128
+  - Name: Out2
+    Format: Int64
+    Stride: 32
+    FillSize: 128
+  - Name: Out3
+    Format: Int64
+    Stride: 32
+    FillSize: 128
+  - Name: Out4
+    Format: Int64
+    Stride: 32
+    FillSize: 128
+  - Name: Out5
+    Format: Int64
+    Stride: 32
+    FillSize: 128
+  - Name: ExpectedOut1
+    Format: Int64
+    Stride: 32
+    Data: [ 1, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, 8, 0, 0, 0 ]
+  - Name: ExpectedOut2
+    Format: Int64
+    Stride: 32
+    Data: [ 1, 1, 0, 0, 2, 4, 0, 0, 4, 16, 0, 0, 8, 64, 0, 0 ]
+  - Name: ExpectedOut3
+    Format: Int64
+    Stride: 32
+    Data: [ 1, 1, 1, 0, 2, 4, 8, 0, 4, 16, 64, 0, 8, 64, 512, 0 ]
+  - Name: ExpectedOut4
+    Format: Int64
+    Stride: 32
+    Data: [ 1, 1, 1, 1, 2, 4, 8, 16, 4, 16, 64, 256, 8, 64, 512, 4096 ]
+  - Name: ExpectedOut5
+    Format: Int64
+    Stride: 32
+    Data: [ 1, 1, 1, 1, 1, 2, 3, 4, 1, 4, 9, 16, 1, 8, 27, 64 ]
+  - Name: UIn
+    Format: UInt64
+    Stride: 32
+    Data: [ 2, 4, 8, 16 ]
+  - Name: UOut1
+    Format: UInt64
+    Stride: 32
+    FillSize: 128
+  - Name: UOut2
+    Format: UInt64
+    Stride: 32
+    FillSize: 128
+  - Name: UOut3
+    Format: UInt64
+    Stride: 32
+    FillSize: 128
+  - Name: UOut4
+    Format: UInt64
+    Stride: 32
+    FillSize: 128
+  - Name: UOut5
+    Format: UInt64
+    Stride: 32
+    FillSize: 128
+  - Name: UExpectedOut1
+    Format: UInt64
+    Stride: 32
+    Data: [ 1, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, 8, 0, 0, 0 ]
+  - Name: UExpectedOut2
+    Format: UInt64
+    Stride: 32
+    Data: [ 1, 1, 0, 0, 2, 4, 0, 0, 4, 16, 0, 0, 8, 64, 0, 0 ]
+  - Name: UExpectedOut3
+    Format: UInt64
+    Stride: 32
+    Data: [ 1, 1, 1, 0, 2, 4, 8, 0, 4, 16, 64, 0, 8, 64, 512, 0 ]
+  - Name: UExpectedOut4
+    Format: UInt64
+    Stride: 32
+    Data: [ 1, 1, 1, 1, 2, 4, 8, 16, 4, 16, 64, 256, 8, 64, 512, 4096 ]
+  - Name: UExpectedOut5
+    Format: UInt64
+    Stride: 32
+    Data: [ 1, 1, 1, 1, 1, 2, 3, 4, 1, 4, 9, 16, 1, 8, 27, 64 ]
+Results:
+  - Result: ExpectedOut1
+    Rule: BufferExact
+    Actual: Out1
+    Expected: ExpectedOut1
+  - Result: ExpectedOut2
+    Rule: BufferExact
+    Actual: Out2
+    Expected: ExpectedOut2
+  - Result: ExpectedOut3
+    Rule: BufferExact
+    Actual: Out3
+    Expected: ExpectedOut3
+  - Result: ExpectedOut4
+    Rule: BufferExact
+    Actual: Out4
+    Expected: ExpectedOut4
+  - Result: ExpectedOut5
+    Rule: BufferExact
+    Actual: Out5
+    Expected: ExpectedOut5
+  - Result: UExpectedOut1
+    Rule: BufferExact
+    Actual: UOut1
+    Expected: UExpectedOut1
+  - Result: UExpectedOut2
+    Rule: BufferExact
+    Actual: UOut2
+    Expected: UExpectedOut2
+  - Result: UExpectedOut3
+    Rule: BufferExact
+    Actual: UOut3
+    Expected: UExpectedOut3
+  - Result: UExpectedOut4
+    Rule: BufferExact
+    Actual: UOut4
+    Expected: UExpectedOut4
+  - Result: UExpectedOut5
+    Rule: BufferExact
+    Actual: UOut5
+    Expected: UExpectedOut5
+DescriptorSets:
+  - Resources:
+    - Name: In
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 0
+        Space: 0
+      VulkanBinding:
+        Binding: 0
+    - Name: Out1
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 1
+        Space: 0
+      VulkanBinding:
+        Binding: 1
+    - Name: Out2
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 2
+        Space: 0
+      VulkanBinding:
+        Binding: 2
+    - Name: Out3
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 3
+        Space: 0
+      VulkanBinding:
+        Binding: 3
+    - Name: Out4
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 4
+        Space: 0
+      VulkanBinding:
+        Binding: 4
+    - Name: Out5
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 5
+        Space: 0
+      VulkanBinding:
+        Binding: 5
+    - Name: UIn
+      Kind: StructuredBuffer
+      DirectXBinding:
+        Register: 6
+        Space: 0
+      VulkanBinding:
+        Binding: 6
+    - Name: UOut1
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 7
+        Space: 0
+      VulkanBinding:
+        Binding: 7
+    - Name: UOut2
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 8
+        Space: 0
+      VulkanBinding:
+        Binding: 8
+    - Name: UOut3
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 9
+        Space: 0
+      VulkanBinding:
+        Binding: 9
+    - Name: UOut4
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 10
+        Space: 0
+      VulkanBinding:
+        Binding: 10
+    - Name: UOut5
+      Kind: RWStructuredBuffer
+      DirectXBinding:
+        Register: 11
+        Space: 0
+      VulkanBinding:
+        Binding: 11
+
+...
+#--- end
+
+# REQUIRES: Int64
+
+# RUN: split-file %s %t
+# RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl
+# RUN: %offloader %t/pipeline.yaml %t.o
diff --git a/test/WaveOps/WavePrefixProduct.test b/test/WaveOps/WavePrefixProduct.test
new file mode 100644
index 000000000..15a7e5533
--- /dev/null
+++ b/test/WaveOps/WavePrefixProduct.test
@@ -0,0 +1,57 @@
+#--- source.hlsl
+RWBuffer<int> value;  // in/out: each element is read as the switch selector, then overwritten with the product
+
+[numthreads(4, 1, 1)]
+void main(uint3 threadID : SV_DispatchThreadID) {
+  // Lane | case 0 path   | WavePrefixProduct(2) | WavePrefixProduct(10) | Final
+  // -----|---------------|----------------------|-----------------------|----------------
+  // 0    | active        | 1                    | 1                     | 1 * 1 * 1 = 1
+  // 1    | active        | 1 * 2                | 1 * 10                | 1 * 2 * 10 = 20
+  // 2    | inactive      | n/a                  | 1 * 10 * 10           | 1 * 100 = 100
+  // 3    | inactive      | n/a                  | 1 * 10 * 10 * 10      | 1 * 1000 = 1000
+  uint product = 1;  // NOTE(review): uint here, but value is RWBuffer<int>; the final store converts implicitly
+  switch (value[threadID.x]) {
+    case 0:  // taken only by threads whose input element is 0
+      product *= WavePrefixProduct(2);
+      break;
+    default:
+      break;
+  }
+  product *= WavePrefixProduct(10);  // outside the switch: every thread executes this call
+  value[threadID.x] = product;
+}
+
+//--- pipeline.yaml
+
+---
+Shaders:
+  - Stage: Compute
+    Entry: main
+    DispatchSize: [ 1, 1, 1 ]
+Buffers:
+  - Name: value
+    Format: Int32
+    Data: [ 0, 0, 1, 2 ]
+  - Name: Expected
+    Format: Int32
+    Data: [ 1, 20, 100, 1000 ]
+Results:
+  - Result: Expected
+    Rule: BufferExact
+    Actual: value
+    Expected: Expected
+DescriptorSets:
+  - Resources:
+    - Name: value
+      Kind: RWBuffer
+      DirectXBinding:
+        Register: 0
+        Space: 0
+      VulkanBinding:
+        Binding: 0
+...
+#--- end
+
+# RUN: split-file %s %t
+# RUN: %dxc_target -T cs_6_0 -Fo %t.o %t/source.hlsl
+# RUN: %offloader %t/pipeline.yaml %t.o