diff --git a/test/WaveOps/WavePrefixProduct.32.test b/test/WaveOps/WavePrefixProduct.32.test new file mode 100644 index 000000000..2494a1ec3 --- /dev/null +++ b/test/WaveOps/WavePrefixProduct.32.test @@ -0,0 +1,471 @@ +#--- source.hlsl +// ints: In = input, Out1..Out4 = scalar/int2/int3/int4 results, Out5 = literal-operand case +StructuredBuffer<int4> In : register(t0); +RWStructuredBuffer<int4> Out1 : register(u1); +RWStructuredBuffer<int4> Out2 : register(u2); +RWStructuredBuffer<int4> Out3 : register(u3); +RWStructuredBuffer<int4> Out4 : register(u4); +RWStructuredBuffer<int4> Out5 : register(u5); + +// uints: same layout as the int buffers +StructuredBuffer<uint4> UIn : register(t6); +RWStructuredBuffer<uint4> UOut1 : register(u7); +RWStructuredBuffer<uint4> UOut2 : register(u8); +RWStructuredBuffer<uint4> UOut3 : register(u9); +RWStructuredBuffer<uint4> UOut4 : register(u10); +RWStructuredBuffer<uint4> UOut5 : register(u11); + +// floats: same layout as the int buffers +StructuredBuffer<float4> FIn : register(t12); +RWStructuredBuffer<float4> FOut1 : register(u13); +RWStructuredBuffer<float4> FOut2 : register(u14); +RWStructuredBuffer<float4> FOut3 : register(u15); +RWStructuredBuffer<float4> FOut4 : register(u16); +RWStructuredBuffer<float4> FOut5 : register(u17); + +[numthreads(4,1,1)] +void main(uint3 tid : SV_GroupThreadID) +{ + int4 v = In[0]; + + // Mask per "active lane set": the op for sN / vK_N is executed only by lanes with tid.x < N. NOTE(review): the masked-off lanes are always the higher ones, so (assuming lane order follows tid.x) each stored value equals the unmasked prefix product - confirm this is the intended coverage. + int s1 = tid.x <= 0 ? WavePrefixProduct( v.x ) : 0; + int s2 = tid.x <= 1 ? WavePrefixProduct( v.x ) : 0; + int s3 = tid.x <= 2 ? WavePrefixProduct( v.x ) : 0; + int s4 = tid.x <= 3 ? WavePrefixProduct( v.x ) : 0; + + int2 v2_1 = tid.x <= 0 ? WavePrefixProduct( v.xy ) : int2(0,0); + int2 v2_2 = tid.x <= 1 ? WavePrefixProduct( v.xy ) : int2(0,0); + int2 v2_3 = tid.x <= 2 ? WavePrefixProduct( v.xy ) : int2(0,0); + int2 v2_4 = tid.x <= 3 ? WavePrefixProduct( v.xy ) : int2(0,0); + + int3 v3_1 = tid.x <= 0 ? WavePrefixProduct( v.xyz ) : int3(0,0,0); + int3 v3_2 = tid.x <= 1 ? WavePrefixProduct( v.xyz ) : int3(0,0,0); + int3 v3_3 = tid.x <= 2 ? WavePrefixProduct( v.xyz ) : int3(0,0,0); + int3 v3_4 = tid.x <= 3 ? WavePrefixProduct( v.xyz ) : int3(0,0,0); + + int4 v4_1 = tid.x <= 0 ? 
WavePrefixProduct( v ) : int4(0,0,0,0); + int4 v4_2 = tid.x <= 1 ? WavePrefixProduct( v ) : int4(0,0,0,0); + int4 v4_3 = tid.x <= 2 ? WavePrefixProduct( v ) : int4(0,0,0,0); + int4 v4_4 = tid.x <= 3 ? WavePrefixProduct( v ) : int4(0,0,0,0); + + int scalars[4] = { s1, s2, s3, s4 }; + int2 vec2s [4] = { v2_1, v2_2, v2_3, v2_4 }; + int3 vec3s [4] = { v3_1, v3_2, v3_3, v3_4 }; + int4 vec4s [4] = { v4_1, v4_2, v4_3, v4_4 }; + + Out1[tid.x].x = scalars[tid.x]; // lane i keeps the value computed while only lanes with tid.x <= i ran the op + Out2[tid.x].xy = vec2s[tid.x]; + Out3[tid.x].xyz = vec3s[tid.x]; + Out4[tid.x] = vec4s[tid.x]; + Out5[tid.x] = WavePrefixProduct(int4(1,2,3,4)); // unmasked op on a literal operand (same value on every lane) + + // UINT case + + uint4 uv = UIn[0]; + + // Mask per "active lane set": only <=N lanes contribute + uint us1 = tid.x <= 0 ? WavePrefixProduct( uv.x ) : 0; + uint us2 = tid.x <= 1 ? WavePrefixProduct( uv.x ) : 0; + uint us3 = tid.x <= 2 ? WavePrefixProduct( uv.x ) : 0; + uint us4 = tid.x <= 3 ? WavePrefixProduct( uv.x ) : 0; + + uint2 uv2_1 = tid.x <= 0 ? WavePrefixProduct( uv.xy ) : uint2(0,0); + uint2 uv2_2 = tid.x <= 1 ? WavePrefixProduct( uv.xy ) : uint2(0,0); + uint2 uv2_3 = tid.x <= 2 ? WavePrefixProduct( uv.xy ) : uint2(0,0); + uint2 uv2_4 = tid.x <= 3 ? WavePrefixProduct( uv.xy ) : uint2(0,0); + + uint3 uv3_1 = tid.x <= 0 ? WavePrefixProduct( uv.xyz ) : uint3(0,0,0); + uint3 uv3_2 = tid.x <= 1 ? WavePrefixProduct( uv.xyz ) : uint3(0,0,0); + uint3 uv3_3 = tid.x <= 2 ? WavePrefixProduct( uv.xyz ) : uint3(0,0,0); + uint3 uv3_4 = tid.x <= 3 ? WavePrefixProduct( uv.xyz ) : uint3(0,0,0); + + uint4 uv4_1 = tid.x <= 0 ? WavePrefixProduct( uv ) : uint4(0,0,0,0); + uint4 uv4_2 = tid.x <= 1 ? WavePrefixProduct( uv ) : uint4(0,0,0,0); + uint4 uv4_3 = tid.x <= 2 ? WavePrefixProduct( uv ) : uint4(0,0,0,0); + uint4 uv4_4 = tid.x <= 3 ? 
WavePrefixProduct( uv ) : uint4(0,0,0,0); + + uint uscalars[4] = { us1, us2, us3, us4 }; + uint2 uvec2s [4] = { uv2_1, uv2_2, uv2_3, uv2_4 }; + uint3 uvec3s [4] = { uv3_1, uv3_2, uv3_3, uv3_4 }; + uint4 uvec4s [4] = { uv4_1, uv4_2, uv4_3, uv4_4 }; + + UOut1[tid.x].x = uscalars[tid.x]; // lane i keeps the value computed while only lanes with tid.x <= i ran the op + UOut2[tid.x].xy = uvec2s[tid.x]; + UOut3[tid.x].xyz = uvec3s[tid.x]; + UOut4[tid.x] = uvec4s[tid.x]; + UOut5[tid.x] = WavePrefixProduct(uint4(1,2,3,4)); // unmasked op on a literal operand (same value on every lane) + + // Float case + + float4 fv = FIn[0]; + + // Mask per "active lane set": only <=N lanes contribute + float fs1 = tid.x <= 0 ? WavePrefixProduct( fv.x ) : 0; + float fs2 = tid.x <= 1 ? WavePrefixProduct( fv.x ) : 0; + float fs3 = tid.x <= 2 ? WavePrefixProduct( fv.x ) : 0; + float fs4 = tid.x <= 3 ? WavePrefixProduct( fv.x ) : 0; + + float2 fv2_1 = tid.x <= 0 ? WavePrefixProduct( fv.xy ) : float2(0,0); + float2 fv2_2 = tid.x <= 1 ? WavePrefixProduct( fv.xy ) : float2(0,0); + float2 fv2_3 = tid.x <= 2 ? WavePrefixProduct( fv.xy ) : float2(0,0); + float2 fv2_4 = tid.x <= 3 ? WavePrefixProduct( fv.xy ) : float2(0,0); + + float3 fv3_1 = tid.x <= 0 ? WavePrefixProduct( fv.xyz ) : float3(0,0,0); + float3 fv3_2 = tid.x <= 1 ? WavePrefixProduct( fv.xyz ) : float3(0,0,0); + float3 fv3_3 = tid.x <= 2 ? WavePrefixProduct( fv.xyz ) : float3(0,0,0); + float3 fv3_4 = tid.x <= 3 ? WavePrefixProduct( fv.xyz ) : float3(0,0,0); + + float4 fv4_1 = tid.x <= 0 ? WavePrefixProduct( fv ) : float4(0,0,0,0); + float4 fv4_2 = tid.x <= 1 ? WavePrefixProduct( fv ) : float4(0,0,0,0); + float4 fv4_3 = tid.x <= 2 ? WavePrefixProduct( fv ) : float4(0,0,0,0); + float4 fv4_4 = tid.x <= 3 ? 
WavePrefixProduct( fv ) : float4(0,0,0,0); + + float fscalars[4] = { fs1, fs2, fs3, fs4 }; + float2 fvec2s [4] = { fv2_1, fv2_2, fv2_3, fv2_4 }; + float3 fvec3s [4] = { fv3_1, fv3_2, fv3_3, fv3_4 }; + float4 fvec4s [4] = { fv4_1, fv4_2, fv4_3, fv4_4 }; + + FOut1[tid.x].x = fscalars[tid.x]; // lane i keeps the value computed while only lanes with tid.x <= i ran the op + FOut2[tid.x].xy = fvec2s[tid.x]; + FOut3[tid.x].xyz = fvec3s[tid.x]; + FOut4[tid.x] = fvec4s[tid.x]; + FOut5[tid.x] = WavePrefixProduct(float4(1,2,3,4)); // unmasked op on a literal operand (same value on every lane) +} + +//--- pipeline.yaml + +--- +Shaders: + - Stage: Compute + Entry: main + DispatchSize: [ 1, 1, 1 ] +Buffers: + - Name: In + Format: Int32 + Stride: 16 + Data: [ 2, 4, 8, 16 ] # single int4 element; every lane reads In[0] + - Name: Out1 + Format: Int32 + Stride: 16 + FillSize: 64 + - Name: Out2 + Format: Int32 + Stride: 16 + FillSize: 64 + - Name: Out3 + Format: Int32 + Stride: 16 + FillSize: 64 + - Name: Out4 + Format: Int32 + Stride: 16 + FillSize: 64 + - Name: Out5 + Format: Int32 + Stride: 16 + FillSize: 64 + - Name: ExpectedOut1 + Format: Int32 + Stride: 16 + Data: [ 1, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, 8, 0, 0, 0 ] + - Name: ExpectedOut2 + Format: Int32 + Stride: 16 + Data: [ 1, 1, 0, 0, 2, 4, 0, 0, 4, 16, 0, 0, 8, 64, 0, 0 ] + - Name: ExpectedOut3 + Format: Int32 + Stride: 16 + Data: [ 1, 1, 1, 0, 2, 4, 8, 0, 4, 16, 64, 0, 8, 64, 512, 0 ] + - Name: ExpectedOut4 + Format: Int32 + Stride: 16 + Data: [ 1, 1, 1, 1, 2, 4, 8, 16, 4, 16, 64, 256, 8, 64, 512, 4096 ] + - Name: ExpectedOut5 + Format: Int32 + Stride: 16 + Data: [ 1, 1, 1, 1, 1, 2, 3, 4, 1, 4, 9, 16, 1, 8, 27, 64 ] + - Name: UIn + Format: UInt32 + Stride: 16 + Data: [ 2, 4, 8, 16 ] + - Name: UOut1 + Format: UInt32 + Stride: 16 + FillSize: 64 + - Name: UOut2 + Format: UInt32 + Stride: 16 + FillSize: 64 + - Name: UOut3 + Format: UInt32 + Stride: 16 + FillSize: 64 + - Name: UOut4 + Format: UInt32 + Stride: 16 + FillSize: 64 + - Name: UOut5 + Format: UInt32 + Stride: 16 + FillSize: 64 + - Name: UExpectedOut1 + Format: UInt32 + Stride: 16 + Data: [ 1, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, 8, 0, 0, 0 ] + - Name: 
UExpectedOut2 + Format: UInt32 + Stride: 16 + Data: [ 1, 1, 0, 0, 2, 4, 0, 0, 4, 16, 0, 0, 8, 64, 0, 0 ] + - Name: UExpectedOut3 + Format: UInt32 + Stride: 16 + Data: [ 1, 1, 1, 0, 2, 4, 8, 0, 4, 16, 64, 0, 8, 64, 512, 0 ] + - Name: UExpectedOut4 + Format: UInt32 + Stride: 16 + Data: [ 1, 1, 1, 1, 2, 4, 8, 16, 4, 16, 64, 256, 8, 64, 512, 4096 ] + - Name: UExpectedOut5 + Format: UInt32 + Stride: 16 + Data: [ 1, 1, 1, 1, 1, 2, 3, 4, 1, 4, 9, 16, 1, 8, 27, 64 ] + - Name: FIn + Format: Float32 + Stride: 16 + Data: [ 2.0, 4.0, 8.0, 16.0 ] + - Name: FOut1 + Format: Float32 + Stride: 16 + FillSize: 64 + - Name: FOut2 + Format: Float32 + Stride: 16 + FillSize: 64 + - Name: FOut3 + Format: Float32 + Stride: 16 + FillSize: 64 + - Name: FOut4 + Format: Float32 + Stride: 16 + FillSize: 64 + - Name: FOut5 + Format: Float32 + Stride: 16 + FillSize: 64 + - Name: FExpectedOut1 + Format: Float32 + Stride: 16 + Data: [ 1.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 4.0, 0.0, 0.0, 0.0, 8.0, 0.0, 0.0, 0.0 ] + - Name: FExpectedOut2 + Format: Float32 + Stride: 16 + Data: [ 1.0, 1.0, 0.0, 0.0, 2.0, 4.0, 0.0, 0.0, 4.0, 16.0, 0.0, 0.0, 8.0, 64.0, 0.0, 0.0 ] + - Name: FExpectedOut3 + Format: Float32 + Stride: 16 + Data: [ 1.0, 1.0, 1.0, 0.0, 2.0, 4.0, 8.0, 0.0, 4.0, 16.0, 64.0, 0.0, 8.0, 64.0, 512.0, 0.0 ] + - Name: FExpectedOut4 + Format: Float32 + Stride: 16 + Data: [ 1.0, 1.0, 1.0, 1.0, 2.0, 4.0, 8.0, 16.0, 4.0, 16.0, 64.0, 256.0, 8.0, 64.0, 512.0, 4096.0 ] + - Name: FExpectedOut5 + Format: Float32 + Stride: 16 + Data: [ 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 1.0, 4.0, 9.0, 16.0, 1.0, 8.0, 27.0, 64.0 ] +Results: + - Result: ExpectedOut1 + Rule: BufferExact + Actual: Out1 + Expected: ExpectedOut1 + - Result: ExpectedOut2 + Rule: BufferExact + Actual: Out2 + Expected: ExpectedOut2 + - Result: ExpectedOut3 + Rule: BufferExact + Actual: Out3 + Expected: ExpectedOut3 + - Result: ExpectedOut4 + Rule: BufferExact + Actual: Out4 + Expected: ExpectedOut4 + - Result: ExpectedOut5 + Rule: BufferExact + 
Actual: Out5 + Expected: ExpectedOut5 + - Result: UExpectedOut1 + Rule: BufferExact + Actual: UOut1 + Expected: UExpectedOut1 + - Result: UExpectedOut2 + Rule: BufferExact + Actual: UOut2 + Expected: UExpectedOut2 + - Result: UExpectedOut3 + Rule: BufferExact + Actual: UOut3 + Expected: UExpectedOut3 + - Result: UExpectedOut4 + Rule: BufferExact + Actual: UOut4 + Expected: UExpectedOut4 + - Result: UExpectedOut5 + Rule: BufferExact + Actual: UOut5 + Expected: UExpectedOut5 + - Result: FExpectedOut1 + Rule: BufferExact + Actual: FOut1 + Expected: FExpectedOut1 + - Result: FExpectedOut2 + Rule: BufferExact + Actual: FOut2 + Expected: FExpectedOut2 + - Result: FExpectedOut3 + Rule: BufferExact + Actual: FOut3 + Expected: FExpectedOut3 + - Result: FExpectedOut4 + Rule: BufferExact + Actual: FOut4 + Expected: FExpectedOut4 + - Result: FExpectedOut5 + Rule: BufferExact + Actual: FOut5 + Expected: FExpectedOut5 +DescriptorSets: + - Resources: + - Name: In + Kind: StructuredBuffer + DirectXBinding: + Register: 0 + Space: 0 + VulkanBinding: + Binding: 0 + - Name: Out1 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 1 + Space: 0 + VulkanBinding: + Binding: 1 + - Name: Out2 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 2 + Space: 0 + VulkanBinding: + Binding: 2 + - Name: Out3 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 3 + Space: 0 + VulkanBinding: + Binding: 3 + - Name: Out4 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 4 + Space: 0 + VulkanBinding: + Binding: 4 + - Name: Out5 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 5 + Space: 0 + VulkanBinding: + Binding: 5 + - Name: UIn + Kind: StructuredBuffer + DirectXBinding: + Register: 6 + Space: 0 + VulkanBinding: + Binding: 6 + - Name: UOut1 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 7 + Space: 0 + VulkanBinding: + Binding: 7 + - Name: UOut2 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 8 + Space: 0 + VulkanBinding: + Binding: 8 + - Name: UOut3 + 
Kind: RWStructuredBuffer + DirectXBinding: + Register: 9 + Space: 0 + VulkanBinding: + Binding: 9 + - Name: UOut4 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 10 + Space: 0 + VulkanBinding: + Binding: 10 + - Name: UOut5 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 11 + Space: 0 + VulkanBinding: + Binding: 11 + - Name: FIn + Kind: StructuredBuffer + DirectXBinding: + Register: 12 + Space: 0 + VulkanBinding: + Binding: 12 + - Name: FOut1 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 13 + Space: 0 + VulkanBinding: + Binding: 13 + - Name: FOut2 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 14 + Space: 0 + VulkanBinding: + Binding: 14 + - Name: FOut3 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 15 + Space: 0 + VulkanBinding: + Binding: 15 + - Name: FOut4 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 16 + Space: 0 + VulkanBinding: + Binding: 16 + - Name: FOut5 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 17 + Space: 0 + VulkanBinding: + Binding: 17 + +... +#--- end + +# RUN: split-file %s %t +# RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl +# RUN: %offloader %t/pipeline.yaml %t.o diff --git a/test/WaveOps/WavePrefixProduct.fp16.test b/test/WaveOps/WavePrefixProduct.fp16.test new file mode 100644 index 000000000..d8cd5110b --- /dev/null +++ b/test/WaveOps/WavePrefixProduct.fp16.test @@ -0,0 +1,173 @@ +#--- source.hlsl +StructuredBuffer<half4> In : register(t0); +RWStructuredBuffer<half4> Out1 : register(u1); // test scalar +RWStructuredBuffer<half4> Out2 : register(u2); // test half2 +RWStructuredBuffer<half4> Out3 : register(u3); // test half3 +RWStructuredBuffer<half4> Out4 : register(u4); // test half4 +RWStructuredBuffer<half4> Out5 : register(u5); // constant folding + +[numthreads(4,1,1)] +void main(uint3 tid : SV_GroupThreadID) +{ + half4 v = In[0]; + + // Mask per "active lane set": the op for sN / vK_N is executed only by lanes with tid.x < N. NOTE(review): the masked-off lanes are always the higher ones, so (assuming lane order follows tid.x) each stored value equals the unmasked prefix product - confirm this is the intended coverage. + half s1 = tid.x <= 0 ? WavePrefixProduct( v.x ) : 0; + half s2 = tid.x <= 1 ? 
WavePrefixProduct( v.x ) : 0; + half s3 = tid.x <= 2 ? WavePrefixProduct( v.x ) : 0; + half s4 = tid.x <= 3 ? WavePrefixProduct( v.x ) : 0; + + half2 v2_1 = tid.x <= 0 ? WavePrefixProduct( v.xy ) : half2(0,0); + half2 v2_2 = tid.x <= 1 ? WavePrefixProduct( v.xy ) : half2(0,0); + half2 v2_3 = tid.x <= 2 ? WavePrefixProduct( v.xy ) : half2(0,0); + half2 v2_4 = tid.x <= 3 ? WavePrefixProduct( v.xy ) : half2(0,0); + + half3 v3_1 = tid.x <= 0 ? WavePrefixProduct( v.xyz ) : half3(0,0,0); + half3 v3_2 = tid.x <= 1 ? WavePrefixProduct( v.xyz ) : half3(0,0,0); + half3 v3_3 = tid.x <= 2 ? WavePrefixProduct( v.xyz ) : half3(0,0,0); + half3 v3_4 = tid.x <= 3 ? WavePrefixProduct( v.xyz ) : half3(0,0,0); + + half4 v4_1 = tid.x <= 0 ? WavePrefixProduct( v ) : half4(0,0,0,0); + half4 v4_2 = tid.x <= 1 ? WavePrefixProduct( v ) : half4(0,0,0,0); + half4 v4_3 = tid.x <= 2 ? WavePrefixProduct( v ) : half4(0,0,0,0); + half4 v4_4 = tid.x <= 3 ? WavePrefixProduct( v ) : half4(0,0,0,0); + + half scalars[4] = { s1, s2, s3, s4 }; + half2 vec2s [4] = { v2_1, v2_2, v2_3, v2_4 }; + half3 vec3s [4] = { v3_1, v3_2, v3_3, v3_4 }; + half4 vec4s [4] = { v4_1, v4_2, v4_3, v4_4 }; + + Out1[tid.x].x = scalars[tid.x]; // lane i keeps the value computed while only lanes with tid.x <= i ran the op + Out2[tid.x].xy = vec2s[tid.x]; + Out3[tid.x].xyz = vec3s[tid.x]; + Out4[tid.x] = vec4s[tid.x]; + Out5[tid.x] = WavePrefixProduct(half4(1,2,3,4)); // unmasked op on a literal operand (same value on every lane) +} + +//--- pipeline.yaml + +--- +Shaders: + - Stage: Compute + Entry: main + DispatchSize: [ 1, 1, 1 ] +Buffers: + - Name: In + Format: Float16 + Stride: 8 + # Raw IEEE half bit patterns for half4(2.0, 4.0, 8.0, 16.0) + Data: [ 0x4000, 0x4400, 0x4800, 0x4c00 ] + - Name: Out1 + Format: Float16 + Stride: 8 + FillSize: 32 + - Name: Out2 + Format: Float16 + Stride: 8 + FillSize: 32 + - Name: Out3 + Format: Float16 + Stride: 8 + FillSize: 32 + - Name: Out4 + Format: Float16 + Stride: 8 + FillSize: 32 + - Name: Out5 + Format: Float16 + Stride: 8 + FillSize: 32 + - Name: ExpectedOut1 + Format: Float16 + Stride: 8 + Data: [ 0x3c00, 0x0, 0x0, 0x0, 0x4000, 0x0, 0x0, 
0x0, 0x4400, 0x0, 0x0, 0x0, 0x4800, 0x0, 0x0, 0x0 ] + - Name: ExpectedOut2 + Format: Float16 + Stride: 8 + Data: [ 0x3c00, 0x3c00, 0x0, 0x0, 0x4000, 0x4400, 0x0, 0x0, 0x4400, 0x4c00, 0x0, 0x0, 0x4800, 0x5400, 0x0, 0x0 ] + - Name: ExpectedOut3 + Format: Float16 + Stride: 8 + Data: [ 0x3c00, 0x3c00, 0x3c00, 0x0, 0x4000, 0x4400, 0x4800, 0x0, 0x4400, 0x4c00, 0x5400, 0x0, 0x4800, 0x5400, 0x6000, 0x0 ] + - Name: ExpectedOut4 + Format: Float16 + Stride: 8 + Data: [ 0x3c00, 0x3c00, 0x3c00, 0x3c00, 0x4000, 0x4400, 0x4800, 0x4c00, 0x4400, 0x4c00, 0x5400, 0x5c00, 0x4800, 0x5400, 0x6000, 0x6c00 ] + - Name: ExpectedOut5 + Format: Float16 + Stride: 8 + Data: [ 0x3c00, 0x3c00, 0x3c00, 0x3c00, 0x3c00, 0x4000, 0x4200, 0x4400, 0x3c00, 0x4400, 0x4880, 0x4c00, 0x3c00, 0x4800, 0x4ec0, 0x5400 ] +Results: + - Result: ExpectedOut1 + Rule: BufferExact + Actual: Out1 + Expected: ExpectedOut1 + - Result: ExpectedOut2 + Rule: BufferExact + Actual: Out2 + Expected: ExpectedOut2 + - Result: ExpectedOut3 + Rule: BufferExact + Actual: Out3 + Expected: ExpectedOut3 + - Result: ExpectedOut4 + Rule: BufferExact + Actual: Out4 + Expected: ExpectedOut4 + - Result: ExpectedOut5 + Rule: BufferExact + Actual: Out5 + Expected: ExpectedOut5 +DescriptorSets: + - Resources: + - Name: In + Kind: StructuredBuffer + DirectXBinding: + Register: 0 + Space: 0 + VulkanBinding: + Binding: 0 + - Name: Out1 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 1 + Space: 0 + VulkanBinding: + Binding: 1 + - Name: Out2 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 2 + Space: 0 + VulkanBinding: + Binding: 2 + - Name: Out3 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 3 + Space: 0 + VulkanBinding: + Binding: 3 + - Name: Out4 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 4 + Space: 0 + VulkanBinding: + Binding: 4 + - Name: Out5 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 5 + Space: 0 + VulkanBinding: + Binding: 5 + +... 
+#--- end + +# REQUIRES: Half + +# RUN: split-file %s %t +# RUN: %dxc_target -enable-16bit-types -T cs_6_5 -Fo %t.o %t/source.hlsl +# RUN: %offloader %t/pipeline.yaml %t.o diff --git a/test/WaveOps/WavePrefixProduct.fp64.test b/test/WaveOps/WavePrefixProduct.fp64.test new file mode 100644 index 000000000..03c0dd890 --- /dev/null +++ b/test/WaveOps/WavePrefixProduct.fp64.test @@ -0,0 +1,172 @@ +#--- source.hlsl +StructuredBuffer<double4> In : register(t0); +RWStructuredBuffer<double4> Out1 : register(u1); // test scalar +RWStructuredBuffer<double4> Out2 : register(u2); // test double2 +RWStructuredBuffer<double4> Out3 : register(u3); // test double3 +RWStructuredBuffer<double4> Out4 : register(u4); // test double4 +RWStructuredBuffer<double4> Out5 : register(u5); // constant folding + +[numthreads(4,1,1)] +void main(uint3 tid : SV_GroupThreadID) +{ + double4 v = In[0]; + + // Mask per "active lane set": the op for sN / vK_N is executed only by lanes with tid.x < N. NOTE(review): the masked-off lanes are always the higher ones, so (assuming lane order follows tid.x) each stored value equals the unmasked prefix product - confirm this is the intended coverage. + double s1 = tid.x <= 0 ? WavePrefixProduct( v.x ) : 0; + double s2 = tid.x <= 1 ? WavePrefixProduct( v.x ) : 0; + double s3 = tid.x <= 2 ? WavePrefixProduct( v.x ) : 0; + double s4 = tid.x <= 3 ? WavePrefixProduct( v.x ) : 0; + + double2 v2_1 = tid.x <= 0 ? WavePrefixProduct( v.xy ) : double2(0,0); + double2 v2_2 = tid.x <= 1 ? WavePrefixProduct( v.xy ) : double2(0,0); + double2 v2_3 = tid.x <= 2 ? WavePrefixProduct( v.xy ) : double2(0,0); + double2 v2_4 = tid.x <= 3 ? WavePrefixProduct( v.xy ) : double2(0,0); + + double3 v3_1 = tid.x <= 0 ? WavePrefixProduct( v.xyz ) : double3(0,0,0); + double3 v3_2 = tid.x <= 1 ? WavePrefixProduct( v.xyz ) : double3(0,0,0); + double3 v3_3 = tid.x <= 2 ? WavePrefixProduct( v.xyz ) : double3(0,0,0); + double3 v3_4 = tid.x <= 3 ? WavePrefixProduct( v.xyz ) : double3(0,0,0); + + double4 v4_1 = tid.x <= 0 ? WavePrefixProduct( v ) : double4(0,0,0,0); + double4 v4_2 = tid.x <= 1 ? WavePrefixProduct( v ) : double4(0,0,0,0); + double4 v4_3 = tid.x <= 2 ? WavePrefixProduct( v ) : double4(0,0,0,0); + double4 v4_4 = tid.x <= 3 ? 
WavePrefixProduct( v ) : double4(0,0,0,0); + + double scalars[4] = { s1, s2, s3, s4 }; + double2 vec2s [4] = { v2_1, v2_2, v2_3, v2_4 }; + double3 vec3s [4] = { v3_1, v3_2, v3_3, v3_4 }; + double4 vec4s [4] = { v4_1, v4_2, v4_3, v4_4 }; + + Out1[tid.x].x = scalars[tid.x]; // lane i keeps the value computed while only lanes with tid.x <= i ran the op + Out2[tid.x].xy = vec2s[tid.x]; + Out3[tid.x].xyz = vec3s[tid.x]; + Out4[tid.x] = vec4s[tid.x]; + Out5[tid.x] = WavePrefixProduct(double4(1,2,3,4)); // unmasked op on a literal operand (same value on every lane) +} + +//--- pipeline.yaml + +--- +Shaders: + - Stage: Compute + Entry: main + DispatchSize: [ 1, 1, 1 ] +Buffers: + - Name: In + Format: Float64 + Stride: 32 + Data: [ 2.0, 4.0, 8.0, 16.0 ] # single double4 element; every lane reads In[0] + - Name: Out1 + Format: Float64 + Stride: 32 + FillSize: 128 + - Name: Out2 + Format: Float64 + Stride: 32 + FillSize: 128 + - Name: Out3 + Format: Float64 + Stride: 32 + FillSize: 128 + - Name: Out4 + Format: Float64 + Stride: 32 + FillSize: 128 + - Name: Out5 + Format: Float64 + Stride: 32 + FillSize: 128 + - Name: ExpectedOut1 + Format: Float64 + Stride: 32 + Data: [ 1.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 4.0, 0.0, 0.0, 0.0, 8.0, 0.0, 0.0, 0.0 ] + - Name: ExpectedOut2 + Format: Float64 + Stride: 32 + Data: [ 1.0, 1.0, 0.0, 0.0, 2.0, 4.0, 0.0, 0.0, 4.0, 16.0, 0.0, 0.0, 8.0, 64.0, 0.0, 0.0 ] + - Name: ExpectedOut3 + Format: Float64 + Stride: 32 + Data: [ 1.0, 1.0, 1.0, 0.0, 2.0, 4.0, 8.0, 0.0, 4.0, 16.0, 64.0, 0.0, 8.0, 64.0, 512.0, 0.0 ] + - Name: ExpectedOut4 + Format: Float64 + Stride: 32 + Data: [ 1.0, 1.0, 1.0, 1.0, 2.0, 4.0, 8.0, 16.0, 4.0, 16.0, 64.0, 256.0, 8.0, 64.0, 512.0, 4096.0 ] + - Name: ExpectedOut5 + Format: Float64 + Stride: 32 + Data: [ 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 1.0, 4.0, 9.0, 16.0, 1.0, 8.0, 27.0, 64.0 ] +Results: + - Result: ExpectedOut1 + Rule: BufferExact + Actual: Out1 + Expected: ExpectedOut1 + - Result: ExpectedOut2 + Rule: BufferExact + Actual: Out2 + Expected: ExpectedOut2 + - Result: ExpectedOut3 + Rule: BufferExact + Actual: Out3 + Expected: ExpectedOut3 + - Result: ExpectedOut4 + Rule: BufferExact + Actual: Out4 + 
Expected: ExpectedOut4 + - Result: ExpectedOut5 + Rule: BufferExact + Actual: Out5 + Expected: ExpectedOut5 +DescriptorSets: + - Resources: + - Name: In + Kind: StructuredBuffer + DirectXBinding: + Register: 0 + Space: 0 + VulkanBinding: + Binding: 0 + - Name: Out1 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 1 + Space: 0 + VulkanBinding: + Binding: 1 + - Name: Out2 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 2 + Space: 0 + VulkanBinding: + Binding: 2 + - Name: Out3 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 3 + Space: 0 + VulkanBinding: + Binding: 3 + - Name: Out4 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 4 + Space: 0 + VulkanBinding: + Binding: 4 + - Name: Out5 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 5 + Space: 0 + VulkanBinding: + Binding: 5 + +... +#--- end + +# REQUIRES: Double + +# RUN: split-file %s %t +# RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl +# RUN: %offloader %t/pipeline.yaml %t.o diff --git a/test/WaveOps/WavePrefixProduct.int16.test b/test/WaveOps/WavePrefixProduct.int16.test new file mode 100644 index 000000000..c6a592d5d --- /dev/null +++ b/test/WaveOps/WavePrefixProduct.int16.test @@ -0,0 +1,323 @@ +#--- source.hlsl +// ints: In = input, Out1..Out4 = scalar/int16_t2/int16_t3/int16_t4 results, Out5 = literal-operand case +StructuredBuffer<int16_t4> In : register(t0); +RWStructuredBuffer<int16_t4> Out1 : register(u1); +RWStructuredBuffer<int16_t4> Out2 : register(u2); +RWStructuredBuffer<int16_t4> Out3 : register(u3); +RWStructuredBuffer<int16_t4> Out4 : register(u4); +RWStructuredBuffer<int16_t4> Out5 : register(u5); + +// uints: same layout as the int buffers +StructuredBuffer<uint16_t4> UIn : register(t6); +RWStructuredBuffer<uint16_t4> UOut1 : register(u7); +RWStructuredBuffer<uint16_t4> UOut2 : register(u8); +RWStructuredBuffer<uint16_t4> UOut3 : register(u9); +RWStructuredBuffer<uint16_t4> UOut4 : register(u10); +RWStructuredBuffer<uint16_t4> UOut5 : register(u11); + +[numthreads(4,1,1)] +void main(uint3 tid : SV_GroupThreadID) +{ + int16_t4 v = In[0]; + + // Mask per "active lane set": the op for sN / vK_N is executed only by lanes with tid.x < N. NOTE(review): the masked-off lanes are always the higher ones, so (assuming lane order follows tid.x) each stored value equals the unmasked prefix product - confirm this is the intended coverage. + int16_t s1 = tid.x <= 0 ? WavePrefixProduct( v.x ) : 0; + int16_t s2 = tid.x <= 1 ? 
WavePrefixProduct( v.x ) : 0; + int16_t s3 = tid.x <= 2 ? WavePrefixProduct( v.x ) : 0; + int16_t s4 = tid.x <= 3 ? WavePrefixProduct( v.x ) : 0; + + int16_t2 v2_1 = tid.x <= 0 ? WavePrefixProduct( v.xy ) : int16_t2(0,0); + int16_t2 v2_2 = tid.x <= 1 ? WavePrefixProduct( v.xy ) : int16_t2(0,0); + int16_t2 v2_3 = tid.x <= 2 ? WavePrefixProduct( v.xy ) : int16_t2(0,0); + int16_t2 v2_4 = tid.x <= 3 ? WavePrefixProduct( v.xy ) : int16_t2(0,0); + + int16_t3 v3_1 = tid.x <= 0 ? WavePrefixProduct( v.xyz ) : int16_t3(0,0,0); + int16_t3 v3_2 = tid.x <= 1 ? WavePrefixProduct( v.xyz ) : int16_t3(0,0,0); + int16_t3 v3_3 = tid.x <= 2 ? WavePrefixProduct( v.xyz ) : int16_t3(0,0,0); + int16_t3 v3_4 = tid.x <= 3 ? WavePrefixProduct( v.xyz ) : int16_t3(0,0,0); + + int16_t4 v4_1 = tid.x <= 0 ? WavePrefixProduct( v ) : int16_t4(0,0,0,0); + int16_t4 v4_2 = tid.x <= 1 ? WavePrefixProduct( v ) : int16_t4(0,0,0,0); + int16_t4 v4_3 = tid.x <= 2 ? WavePrefixProduct( v ) : int16_t4(0,0,0,0); + int16_t4 v4_4 = tid.x <= 3 ? WavePrefixProduct( v ) : int16_t4(0,0,0,0); + + int16_t scalars[4] = { s1, s2, s3, s4 }; + int16_t2 vec2s [4] = { v2_1, v2_2, v2_3, v2_4 }; + int16_t3 vec3s [4] = { v3_1, v3_2, v3_3, v3_4 }; + int16_t4 vec4s [4] = { v4_1, v4_2, v4_3, v4_4 }; + + Out1[tid.x].x = scalars[tid.x]; // lane i keeps the value computed while only lanes with tid.x <= i ran the op + Out2[tid.x].xy = vec2s[tid.x]; + Out3[tid.x].xyz = vec3s[tid.x]; + Out4[tid.x] = vec4s[tid.x]; + Out5[tid.x] = WavePrefixProduct(int16_t4(1,2,3,4)); // unmasked op on a literal operand (same value on every lane) + + // UINT case + + uint16_t4 uv = UIn[0]; + + // Mask per "active lane set": only <=N lanes contribute + uint16_t us1 = tid.x <= 0 ? WavePrefixProduct( uv.x ) : 0; + uint16_t us2 = tid.x <= 1 ? WavePrefixProduct( uv.x ) : 0; + uint16_t us3 = tid.x <= 2 ? WavePrefixProduct( uv.x ) : 0; + uint16_t us4 = tid.x <= 3 ? WavePrefixProduct( uv.x ) : 0; + + uint16_t2 uv2_1 = tid.x <= 0 ? WavePrefixProduct( uv.xy ) : uint16_t2(0,0); + uint16_t2 uv2_2 = tid.x <= 1 ? WavePrefixProduct( uv.xy ) : uint16_t2(0,0); + uint16_t2 uv2_3 = tid.x <= 2 ? 
WavePrefixProduct( uv.xy ) : uint16_t2(0,0); + uint16_t2 uv2_4 = tid.x <= 3 ? WavePrefixProduct( uv.xy ) : uint16_t2(0,0); + + uint16_t3 uv3_1 = tid.x <= 0 ? WavePrefixProduct( uv.xyz ) : uint16_t3(0,0,0); + uint16_t3 uv3_2 = tid.x <= 1 ? WavePrefixProduct( uv.xyz ) : uint16_t3(0,0,0); + uint16_t3 uv3_3 = tid.x <= 2 ? WavePrefixProduct( uv.xyz ) : uint16_t3(0,0,0); + uint16_t3 uv3_4 = tid.x <= 3 ? WavePrefixProduct( uv.xyz ) : uint16_t3(0,0,0); + + uint16_t4 uv4_1 = tid.x <= 0 ? WavePrefixProduct( uv ) : uint16_t4(0,0,0,0); + uint16_t4 uv4_2 = tid.x <= 1 ? WavePrefixProduct( uv ) : uint16_t4(0,0,0,0); + uint16_t4 uv4_3 = tid.x <= 2 ? WavePrefixProduct( uv ) : uint16_t4(0,0,0,0); + uint16_t4 uv4_4 = tid.x <= 3 ? WavePrefixProduct( uv ) : uint16_t4(0,0,0,0); + + uint16_t uscalars[4] = { us1, us2, us3, us4 }; + uint16_t2 uvec2s [4] = { uv2_1, uv2_2, uv2_3, uv2_4 }; + uint16_t3 uvec3s [4] = { uv3_1, uv3_2, uv3_3, uv3_4 }; + uint16_t4 uvec4s [4] = { uv4_1, uv4_2, uv4_3, uv4_4 }; + + UOut1[tid.x].x = uscalars[tid.x]; // lane i keeps the value computed while only lanes with tid.x <= i ran the op + UOut2[tid.x].xy = uvec2s[tid.x]; + UOut3[tid.x].xyz = uvec3s[tid.x]; + UOut4[tid.x] = uvec4s[tid.x]; + UOut5[tid.x] = WavePrefixProduct(uint16_t4(1,2,3,4)); // unmasked op on a literal operand (same value on every lane) +} + +//--- pipeline.yaml + +--- +Shaders: + - Stage: Compute + Entry: main + DispatchSize: [ 1, 1, 1 ] +Buffers: + - Name: In + Format: Int16 + Stride: 8 + Data: [ 2, 4, 8, 16 ] # single int16_t4 element; every lane reads In[0] + - Name: Out1 + Format: Int16 + Stride: 8 + FillSize: 32 + - Name: Out2 + Format: Int16 + Stride: 8 + FillSize: 32 + - Name: Out3 + Format: Int16 + Stride: 8 + FillSize: 32 + - Name: Out4 + Format: Int16 + Stride: 8 + FillSize: 32 + - Name: Out5 + Format: Int16 + Stride: 8 + FillSize: 32 + - Name: ExpectedOut1 + Format: Int16 + Stride: 8 + Data: [ 1, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, 8, 0, 0, 0 ] + - Name: ExpectedOut2 + Format: Int16 + Stride: 8 + Data: [ 1, 1, 0, 0, 2, 4, 0, 0, 4, 16, 0, 0, 8, 64, 0, 0 ] + - Name: ExpectedOut3 + Format: Int16 + Stride: 8 + Data: [ 1, 1, 1, 0, 2, 4, 8, 0, 4, 16, 64, 0, 8, 64, 512, 0 ] + - 
Name: ExpectedOut4 + Format: Int16 + Stride: 8 + Data: [ 1, 1, 1, 1, 2, 4, 8, 16, 4, 16, 64, 256, 8, 64, 512, 4096 ] + - Name: ExpectedOut5 + Format: Int16 + Stride: 8 + Data: [ 1, 1, 1, 1, 1, 2, 3, 4, 1, 4, 9, 16, 1, 8, 27, 64 ] + - Name: UIn + Format: UInt16 + Stride: 8 + Data: [ 2, 4, 8, 16 ] + - Name: UOut1 + Format: UInt16 + Stride: 8 + FillSize: 32 + - Name: UOut2 + Format: UInt16 + Stride: 8 + FillSize: 32 + - Name: UOut3 + Format: UInt16 + Stride: 8 + FillSize: 32 + - Name: UOut4 + Format: UInt16 + Stride: 8 + FillSize: 32 + - Name: UOut5 + Format: UInt16 + Stride: 8 + FillSize: 32 + - Name: UExpectedOut1 + Format: UInt16 + Stride: 8 + Data: [ 1, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, 8, 0, 0, 0 ] + - Name: UExpectedOut2 + Format: UInt16 + Stride: 8 + Data: [ 1, 1, 0, 0, 2, 4, 0, 0, 4, 16, 0, 0, 8, 64, 0, 0 ] + - Name: UExpectedOut3 + Format: UInt16 + Stride: 8 + Data: [ 1, 1, 1, 0, 2, 4, 8, 0, 4, 16, 64, 0, 8, 64, 512, 0 ] + - Name: UExpectedOut4 + Format: UInt16 + Stride: 8 + Data: [ 1, 1, 1, 1, 2, 4, 8, 16, 4, 16, 64, 256, 8, 64, 512, 4096 ] + - Name: UExpectedOut5 + Format: UInt16 + Stride: 8 + Data: [ 1, 1, 1, 1, 1, 2, 3, 4, 1, 4, 9, 16, 1, 8, 27, 64 ] +Results: + - Result: ExpectedOut1 + Rule: BufferExact + Actual: Out1 + Expected: ExpectedOut1 + - Result: ExpectedOut2 + Rule: BufferExact + Actual: Out2 + Expected: ExpectedOut2 + - Result: ExpectedOut3 + Rule: BufferExact + Actual: Out3 + Expected: ExpectedOut3 + - Result: ExpectedOut4 + Rule: BufferExact + Actual: Out4 + Expected: ExpectedOut4 + - Result: ExpectedOut5 + Rule: BufferExact + Actual: Out5 + Expected: ExpectedOut5 + - Result: UExpectedOut1 + Rule: BufferExact + Actual: UOut1 + Expected: UExpectedOut1 + - Result: UExpectedOut2 + Rule: BufferExact + Actual: UOut2 + Expected: UExpectedOut2 + - Result: UExpectedOut3 + Rule: BufferExact + Actual: UOut3 + Expected: UExpectedOut3 + - Result: UExpectedOut4 + Rule: BufferExact + Actual: UOut4 + Expected: UExpectedOut4 + - Result: UExpectedOut5 + Rule: 
BufferExact + Actual: UOut5 + Expected: UExpectedOut5 +DescriptorSets: + - Resources: + - Name: In + Kind: StructuredBuffer + DirectXBinding: + Register: 0 + Space: 0 + VulkanBinding: + Binding: 0 + - Name: Out1 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 1 + Space: 0 + VulkanBinding: + Binding: 1 + - Name: Out2 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 2 + Space: 0 + VulkanBinding: + Binding: 2 + - Name: Out3 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 3 + Space: 0 + VulkanBinding: + Binding: 3 + - Name: Out4 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 4 + Space: 0 + VulkanBinding: + Binding: 4 + - Name: Out5 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 5 + Space: 0 + VulkanBinding: + Binding: 5 + - Name: UIn + Kind: StructuredBuffer + DirectXBinding: + Register: 6 + Space: 0 + VulkanBinding: + Binding: 6 + - Name: UOut1 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 7 + Space: 0 + VulkanBinding: + Binding: 7 + - Name: UOut2 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 8 + Space: 0 + VulkanBinding: + Binding: 8 + - Name: UOut3 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 9 + Space: 0 + VulkanBinding: + Binding: 9 + - Name: UOut4 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 10 + Space: 0 + VulkanBinding: + Binding: 10 + - Name: UOut5 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 11 + Space: 0 + VulkanBinding: + Binding: 11 + +... 
+#--- end + +# REQUIRES: Int16 + +# RUN: split-file %s %t +# RUN: %dxc_target -enable-16bit-types -T cs_6_5 -Fo %t.o %t/source.hlsl +# RUN: %offloader %t/pipeline.yaml %t.o diff --git a/test/WaveOps/WavePrefixProduct.int64.test b/test/WaveOps/WavePrefixProduct.int64.test new file mode 100644 index 000000000..319c186b6 --- /dev/null +++ b/test/WaveOps/WavePrefixProduct.int64.test @@ -0,0 +1,322 @@ +#--- source.hlsl +StructuredBuffer<int64_t4> In : register(t0); +RWStructuredBuffer<int64_t4> Out1 : register(u1); // test scalar +RWStructuredBuffer<int64_t4> Out2 : register(u2); // test int64_t2 +RWStructuredBuffer<int64_t4> Out3 : register(u3); // test int64_t3 +RWStructuredBuffer<int64_t4> Out4 : register(u4); // test int64_t4 +RWStructuredBuffer<int64_t4> Out5 : register(u5); // constant folding + +// uints: same layout as the int buffers +StructuredBuffer<uint64_t4> UIn : register(t6); +RWStructuredBuffer<uint64_t4> UOut1 : register(u7); +RWStructuredBuffer<uint64_t4> UOut2 : register(u8); +RWStructuredBuffer<uint64_t4> UOut3 : register(u9); +RWStructuredBuffer<uint64_t4> UOut4 : register(u10); +RWStructuredBuffer<uint64_t4> UOut5 : register(u11); + +[numthreads(4,1,1)] +void main(uint3 tid : SV_GroupThreadID) +{ + int64_t4 v = In[0]; + + // Mask per "active lane set": the op for sN / vK_N is executed only by lanes with tid.x < N. NOTE(review): the masked-off lanes are always the higher ones, so (assuming lane order follows tid.x) each stored value equals the unmasked prefix product - confirm this is the intended coverage. + int64_t s1 = tid.x <= 0 ? WavePrefixProduct( v.x ) : 0; + int64_t s2 = tid.x <= 1 ? WavePrefixProduct( v.x ) : 0; + int64_t s3 = tid.x <= 2 ? WavePrefixProduct( v.x ) : 0; + int64_t s4 = tid.x <= 3 ? WavePrefixProduct( v.x ) : 0; + + int64_t2 v2_1 = tid.x <= 0 ? WavePrefixProduct( v.xy ) : int64_t2(0,0); + int64_t2 v2_2 = tid.x <= 1 ? WavePrefixProduct( v.xy ) : int64_t2(0,0); + int64_t2 v2_3 = tid.x <= 2 ? WavePrefixProduct( v.xy ) : int64_t2(0,0); + int64_t2 v2_4 = tid.x <= 3 ? WavePrefixProduct( v.xy ) : int64_t2(0,0); + + int64_t3 v3_1 = tid.x <= 0 ? WavePrefixProduct( v.xyz ) : int64_t3(0,0,0); + int64_t3 v3_2 = tid.x <= 1 ? WavePrefixProduct( v.xyz ) : int64_t3(0,0,0); + int64_t3 v3_3 = tid.x <= 2 ? WavePrefixProduct( v.xyz ) : int64_t3(0,0,0); + int64_t3 v3_4 = tid.x <= 3 ? 
WavePrefixProduct( v.xyz ) : int64_t3(0,0,0); + + int64_t4 v4_1 = tid.x <= 0 ? WavePrefixProduct( v ) : int64_t4(0,0,0,0); + int64_t4 v4_2 = tid.x <= 1 ? WavePrefixProduct( v ) : int64_t4(0,0,0,0); + int64_t4 v4_3 = tid.x <= 2 ? WavePrefixProduct( v ) : int64_t4(0,0,0,0); + int64_t4 v4_4 = tid.x <= 3 ? WavePrefixProduct( v ) : int64_t4(0,0,0,0); + + int64_t scalars[4] = { s1, s2, s3, s4 }; + int64_t2 vec2s [4] = { v2_1, v2_2, v2_3, v2_4 }; + int64_t3 vec3s [4] = { v3_1, v3_2, v3_3, v3_4 }; + int64_t4 vec4s [4] = { v4_1, v4_2, v4_3, v4_4 }; + + Out1[tid.x].x = scalars[tid.x]; + Out2[tid.x].xy = vec2s[tid.x]; + Out3[tid.x].xyz = vec3s[tid.x]; + Out4[tid.x] = vec4s[tid.x]; + Out5[tid.x] = WavePrefixProduct(int64_t4(1,2,3,4)); + + // UINT case + + uint64_t4 uv = UIn[0]; + + // Mask per "active lane set": only <=N lanes contribute + uint64_t us1 = tid.x <= 0 ? WavePrefixProduct( uv.x ) : 0; + uint64_t us2 = tid.x <= 1 ? WavePrefixProduct( uv.x ) : 0; + uint64_t us3 = tid.x <= 2 ? WavePrefixProduct( uv.x ) : 0; + uint64_t us4 = tid.x <= 3 ? WavePrefixProduct( uv.x ) : 0; + + uint64_t2 uv2_1 = tid.x <= 0 ? WavePrefixProduct( uv.xy ) : uint64_t2(0,0); + uint64_t2 uv2_2 = tid.x <= 1 ? WavePrefixProduct( uv.xy ) : uint64_t2(0,0); + uint64_t2 uv2_3 = tid.x <= 2 ? WavePrefixProduct( uv.xy ) : uint64_t2(0,0); + uint64_t2 uv2_4 = tid.x <= 3 ? WavePrefixProduct( uv.xy ) : uint64_t2(0,0); + + uint64_t3 uv3_1 = tid.x <= 0 ? WavePrefixProduct( uv.xyz ) : uint64_t3(0,0,0); + uint64_t3 uv3_2 = tid.x <= 1 ? WavePrefixProduct( uv.xyz ) : uint64_t3(0,0,0); + uint64_t3 uv3_3 = tid.x <= 2 ? WavePrefixProduct( uv.xyz ) : uint64_t3(0,0,0); + uint64_t3 uv3_4 = tid.x <= 3 ? WavePrefixProduct( uv.xyz ) : uint64_t3(0,0,0); + + uint64_t4 uv4_1 = tid.x <= 0 ? WavePrefixProduct( uv ) : uint64_t4(0,0,0,0); + uint64_t4 uv4_2 = tid.x <= 1 ? WavePrefixProduct( uv ) : uint64_t4(0,0,0,0); + uint64_t4 uv4_3 = tid.x <= 2 ? WavePrefixProduct( uv ) : uint64_t4(0,0,0,0); + uint64_t4 uv4_4 = tid.x <= 3 ? 
WavePrefixProduct( uv ) : uint64_t4(0,0,0,0); + + uint64_t uscalars[4] = { us1, us2, us3, us4 }; + uint64_t2 uvec2s [4] = { uv2_1, uv2_2, uv2_3, uv2_4 }; + uint64_t3 uvec3s [4] = { uv3_1, uv3_2, uv3_3, uv3_4 }; + uint64_t4 uvec4s [4] = { uv4_1, uv4_2, uv4_3, uv4_4 }; + + UOut1[tid.x].x = uscalars[tid.x]; + UOut2[tid.x].xy = uvec2s[tid.x]; + UOut3[tid.x].xyz = uvec3s[tid.x]; + UOut4[tid.x] = uvec4s[tid.x]; + UOut5[tid.x] = WavePrefixProduct(uint64_t4(1,2,3,4)); +} + +//--- pipeline.yaml + +--- +Shaders: + - Stage: Compute + Entry: main + DispatchSize: [ 1, 1, 1 ] +Buffers: + - Name: In + Format: Int64 + Stride: 32 + Data: [ 2, 4, 8, 16 ] + - Name: Out1 + Format: Int64 + Stride: 32 + FillSize: 128 + - Name: Out2 + Format: Int64 + Stride: 32 + FillSize: 128 + - Name: Out3 + Format: Int64 + Stride: 32 + FillSize: 128 + - Name: Out4 + Format: Int64 + Stride: 32 + FillSize: 128 + - Name: Out5 + Format: Int64 + Stride: 32 + FillSize: 128 + - Name: ExpectedOut1 + Format: Int64 + Stride: 32 + Data: [ 1, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, 8, 0, 0, 0 ] + - Name: ExpectedOut2 + Format: Int64 + Stride: 32 + Data: [ 1, 1, 0, 0, 2, 4, 0, 0, 4, 16, 0, 0, 8, 64, 0, 0 ] + - Name: ExpectedOut3 + Format: Int64 + Stride: 32 + Data: [ 1, 1, 1, 0, 2, 4, 8, 0, 4, 16, 64, 0, 8, 64, 512, 0 ] + - Name: ExpectedOut4 + Format: Int64 + Stride: 32 + Data: [ 1, 1, 1, 1, 2, 4, 8, 16, 4, 16, 64, 256, 8, 64, 512, 4096 ] + - Name: ExpectedOut5 + Format: Int64 + Stride: 32 + Data: [ 1, 1, 1, 1, 1, 2, 3, 4, 1, 4, 9, 16, 1, 8, 27, 64 ] + - Name: UIn + Format: UInt64 + Stride: 32 + Data: [ 2, 4, 8, 16 ] + - Name: UOut1 + Format: UInt64 + Stride: 32 + FillSize: 128 + - Name: UOut2 + Format: UInt64 + Stride: 32 + FillSize: 128 + - Name: UOut3 + Format: UInt64 + Stride: 32 + FillSize: 128 + - Name: UOut4 + Format: UInt64 + Stride: 32 + FillSize: 128 + - Name: UOut5 + Format: UInt64 + Stride: 32 + FillSize: 128 + - Name: UExpectedOut1 + Format: UInt64 + Stride: 32 + Data: [ 1, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, 
8, 0, 0, 0 ] + - Name: UExpectedOut2 + Format: UInt64 + Stride: 32 + Data: [ 1, 1, 0, 0, 2, 4, 0, 0, 4, 16, 0, 0, 8, 64, 0, 0 ] + - Name: UExpectedOut3 + Format: UInt64 + Stride: 32 + Data: [ 1, 1, 1, 0, 2, 4, 8, 0, 4, 16, 64, 0, 8, 64, 512, 0 ] + - Name: UExpectedOut4 + Format: UInt64 + Stride: 32 + Data: [ 1, 1, 1, 1, 2, 4, 8, 16, 4, 16, 64, 256, 8, 64, 512, 4096 ] + - Name: UExpectedOut5 + Format: UInt64 + Stride: 32 + Data: [ 1, 1, 1, 1, 1, 2, 3, 4, 1, 4, 9, 16, 1, 8, 27, 64 ] +Results: + - Result: ExpectedOut1 + Rule: BufferExact + Actual: Out1 + Expected: ExpectedOut1 + - Result: ExpectedOut2 + Rule: BufferExact + Actual: Out2 + Expected: ExpectedOut2 + - Result: ExpectedOut3 + Rule: BufferExact + Actual: Out3 + Expected: ExpectedOut3 + - Result: ExpectedOut4 + Rule: BufferExact + Actual: Out4 + Expected: ExpectedOut4 + - Result: ExpectedOut5 + Rule: BufferExact + Actual: Out5 + Expected: ExpectedOut5 + - Result: UExpectedOut1 + Rule: BufferExact + Actual: UOut1 + Expected: UExpectedOut1 + - Result: UExpectedOut2 + Rule: BufferExact + Actual: UOut2 + Expected: UExpectedOut2 + - Result: UExpectedOut3 + Rule: BufferExact + Actual: UOut3 + Expected: UExpectedOut3 + - Result: UExpectedOut4 + Rule: BufferExact + Actual: UOut4 + Expected: UExpectedOut4 + - Result: UExpectedOut5 + Rule: BufferExact + Actual: UOut5 + Expected: UExpectedOut5 +DescriptorSets: + - Resources: + - Name: In + Kind: StructuredBuffer + DirectXBinding: + Register: 0 + Space: 0 + VulkanBinding: + Binding: 0 + - Name: Out1 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 1 + Space: 0 + VulkanBinding: + Binding: 1 + - Name: Out2 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 2 + Space: 0 + VulkanBinding: + Binding: 2 + - Name: Out3 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 3 + Space: 0 + VulkanBinding: + Binding: 3 + - Name: Out4 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 4 + Space: 0 + VulkanBinding: + Binding: 4 + - Name: Out5 + Kind: 
RWStructuredBuffer + DirectXBinding: + Register: 5 + Space: 0 + VulkanBinding: + Binding: 5 + - Name: UIn + Kind: StructuredBuffer + DirectXBinding: + Register: 6 + Space: 0 + VulkanBinding: + Binding: 6 + - Name: UOut1 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 7 + Space: 0 + VulkanBinding: + Binding: 7 + - Name: UOut2 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 8 + Space: 0 + VulkanBinding: + Binding: 8 + - Name: UOut3 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 9 + Space: 0 + VulkanBinding: + Binding: 9 + - Name: UOut4 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 10 + Space: 0 + VulkanBinding: + Binding: 10 + - Name: UOut5 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 11 + Space: 0 + VulkanBinding: + Binding: 11 + +... +#--- end + +# REQUIRES: Int64 + +# RUN: split-file %s %t +# RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl +# RUN: %offloader %t/pipeline.yaml %t.o diff --git a/test/WaveOps/WavePrefixProduct.test b/test/WaveOps/WavePrefixProduct.test new file mode 100644 index 000000000..15a7e5533 --- /dev/null +++ b/test/WaveOps/WavePrefixProduct.test @@ -0,0 +1,57 @@ +#--- source.hlsl +RWBuffer value; + +[numthreads(4, 1, 1)] +void main(uint3 threadID : SV_DispatchThreadID) { + // Lane | switch status | WavePrefixProduct(2) | WavePrefixProduct(10) | Final + // -----|---------------|----------------------|-----------------------|---------------- + // 0 | active | 1 | 1 | 1 * 1 * 1 = 1 + // 1 | active | 1 * 2 | 1 * 10 | 1 * 2 * 10 = 20 + // 2 | inactive | n/a | 1 * 10 * 10 | 1 * 100 = 100 + // 3 | inactive | n/a | 1 * 10 * 10 * 10 | 1 * 1000 = 1000 + uint product = 1; + switch (value[threadID.x]) { + case 0: + product *= WavePrefixProduct(2); + break; + default: + break; + } + product *= WavePrefixProduct(10); + value[threadID.x] = product; +} + +//--- pipeline.yaml + +--- +Shaders: + - Stage: Compute + Entry: main + DispatchSize: [ 1, 1, 1 ] +Buffers: + - Name: value + Format: Int32 + Data: [ 0, 
0, 1, 2 ] + - Name: Expected + Format: Int32 + Data: [ 1, 20, 100, 1000 ] +Results: + - Result: Expected + Rule: BufferExact + Actual: value + Expected: Expected +DescriptorSets: + - Resources: + - Name: value + Kind: RWBuffer + DirectXBinding: + Register: 0 + Space: 0 + VulkanBinding: + Binding: 0 +... +#--- end + +# RUN: split-file %s %t +# RUN: %dxc_target -T cs_6_0 -Fo %t.o %t/source.hlsl +# RUN: %offloader %t/pipeline.yaml %t.o