diff --git a/test/WaveOps/QuadReadAcrossDiagonal.32.test b/test/WaveOps/QuadReadAcrossDiagonal.32.test new file mode 100644 index 000000000..3261d4b64 --- /dev/null +++ b/test/WaveOps/QuadReadAcrossDiagonal.32.test @@ -0,0 +1,352 @@ +#--- source.hlsl +// ints +StructuredBuffer<int4> In : register(t0); +RWStructuredBuffer<int4> Out1 : register(u1); +RWStructuredBuffer<int4> Out2 : register(u2); +RWStructuredBuffer<int4> Out3 : register(u3); +RWStructuredBuffer<int4> Out4 : register(u4); + +// uints +StructuredBuffer<uint4> UIn : register(t5); +RWStructuredBuffer<uint4> UOut1 : register(u6); +RWStructuredBuffer<uint4> UOut2 : register(u7); +RWStructuredBuffer<uint4> UOut3 : register(u8); +RWStructuredBuffer<uint4> UOut4 : register(u9); + +// floats +StructuredBuffer<float4> FIn : register(t10); +RWStructuredBuffer<float4> FOut1 : register(u11); +RWStructuredBuffer<float4> FOut2 : register(u12); +RWStructuredBuffer<float4> FOut3 : register(u13); +RWStructuredBuffer<float4> FOut4 : register(u14); + +[numthreads(2,2,1)] +void main(uint3 dtid : SV_DispatchThreadID) { + uint index = dtid.y * 2 + dtid.x; + + // int case + int4 v = In[index]; + int scalar = QuadReadAcrossDiagonal(v.x); + int2 vec2 = QuadReadAcrossDiagonal(v.xy); + int3 vec3 = QuadReadAcrossDiagonal(v.xyz); + int4 vec4 = QuadReadAcrossDiagonal(v); + + Out1[index].x = scalar; + Out2[index].xy = vec2; + Out3[index].xyz = vec3; + Out4[index] = vec4; + + // uint case + uint4 uv = UIn[index]; + uint uscalar = QuadReadAcrossDiagonal(uv.x); + uint2 uvec2 = QuadReadAcrossDiagonal(uv.xy); + uint3 uvec3 = QuadReadAcrossDiagonal(uv.xyz); + uint4 uvec4 = QuadReadAcrossDiagonal(uv); + + UOut1[index].x = uscalar; + UOut2[index].xy = uvec2; + UOut3[index].xyz = uvec3; + UOut4[index] = uvec4; + + // float case + float4 fv = FIn[index]; + float fscalar = QuadReadAcrossDiagonal(fv.x); + float2 fvec2 = QuadReadAcrossDiagonal(fv.xy); + float3 fvec3 = QuadReadAcrossDiagonal(fv.xyz); + float4 fvec4 = QuadReadAcrossDiagonal(fv); + + FOut1[index].x = fscalar; + FOut2[index].xy = fvec2; + FOut3[index].xyz = fvec3; + FOut4[index] = 
fvec4; +} + +//--- pipeline.yaml + +--- +Shaders: + - Stage: Compute + Entry: main + DispatchSize: [ 1, 1, 1 ] +Buffers: + - Name: In + Format: Int32 + Stride: 16 + Data: [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 ] + - Name: Out1 + Format: Int32 + Stride: 16 + FillSize: 64 + - Name: Out2 + Format: Int32 + Stride: 16 + FillSize: 64 + - Name: Out3 + Format: Int32 + Stride: 16 + FillSize: 64 + - Name: Out4 + Format: Int32 + Stride: 16 + FillSize: 64 + - Name: ExpectedOut1 + Format: Int32 + Stride: 16 + Data: [ 13, 0, 0, 0, 9, 0, 0, 0, 5, 0, 0, 0, 1, 0, 0, 0 ] + - Name: ExpectedOut2 + Format: Int32 + Stride: 16 + Data: [ 13, 14, 0, 0, 9, 10, 0, 0, 5, 6, 0, 0, 1, 2, 0, 0 ] + - Name: ExpectedOut3 + Format: Int32 + Stride: 16 + Data: [ 13, 14, 15, 0, 9, 10, 11, 0, 5, 6, 7, 0, 1, 2, 3, 0 ] + - Name: ExpectedOut4 + Format: Int32 + Stride: 16 + Data: [ 13, 14, 15, 16, 9, 10, 11, 12, 5, 6, 7, 8, 1, 2, 3, 4 ] + - Name: UIn + Format: UInt32 + Stride: 16 + Data: [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 ] + - Name: UOut1 + Format: UInt32 + Stride: 16 + FillSize: 64 + - Name: UOut2 + Format: UInt32 + Stride: 16 + FillSize: 64 + - Name: UOut3 + Format: UInt32 + Stride: 16 + FillSize: 64 + - Name: UOut4 + Format: UInt32 + Stride: 16 + FillSize: 64 + - Name: UExpectedOut1 + Format: UInt32 + Stride: 16 + Data: [ 13, 0, 0, 0, 9, 0, 0, 0, 5, 0, 0, 0, 1, 0, 0, 0 ] + - Name: UExpectedOut2 + Format: UInt32 + Stride: 16 + Data: [ 13, 14, 0, 0, 9, 10, 0, 0, 5, 6, 0, 0, 1, 2, 0, 0 ] + - Name: UExpectedOut3 + Format: UInt32 + Stride: 16 + Data: [ 13, 14, 15, 0, 9, 10, 11, 0, 5, 6, 7, 0, 1, 2, 3, 0 ] + - Name: UExpectedOut4 + Format: UInt32 + Stride: 16 + Data: [ 13, 14, 15, 16, 9, 10, 11, 12, 5, 6, 7, 8, 1, 2, 3, 4 ] + - Name: FIn + Format: Float32 + Stride: 16 + Data: [ 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0 ] + - Name: FOut1 + Format: Float32 + Stride: 16 + FillSize: 64 + - Name: FOut2 + Format: Float32 + Stride: 16 
+ FillSize: 64 + - Name: FOut3 + Format: Float32 + Stride: 16 + FillSize: 64 + - Name: FOut4 + Format: Float32 + Stride: 16 + FillSize: 64 + - Name: FExpectedOut1 + Format: Float32 + Stride: 16 + Data: [ 13.0, 0.0, 0.0, 0.0, 9.0, 0.0, 0.0, 0.0, 5.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0 ] + - Name: FExpectedOut2 + Format: Float32 + Stride: 16 + Data: [ 13.0, 14.0, 0.0, 0.0, 9.0, 10.0, 0.0, 0.0, 5.0, 6.0, 0.0, 0.0, 1.0, 2.0, 0.0, 0.0 ] + - Name: FExpectedOut3 + Format: Float32 + Stride: 16 + Data: [ 13.0, 14.0, 15.0, 0.0, 9.0, 10.0, 11.0, 0.0, 5.0, 6.0, 7.0, 0.0, 1.0, 2.0, 3.0, 0.0 ] + - Name: FExpectedOut4 + Format: Float32 + Stride: 16 + Data: [ 13.0, 14.0, 15.0, 16.0, 9.0, 10.0, 11.0, 12.0, 5.0, 6.0, 7.0, 8.0, 1.0, 2.0, 3.0, 4.0 ] +Results: + - Result: ExpectedOut1 + Rule: BufferExact + Actual: Out1 + Expected: ExpectedOut1 + - Result: ExpectedOut2 + Rule: BufferExact + Actual: Out2 + Expected: ExpectedOut2 + - Result: ExpectedOut3 + Rule: BufferExact + Actual: Out3 + Expected: ExpectedOut3 + - Result: ExpectedOut4 + Rule: BufferExact + Actual: Out4 + Expected: ExpectedOut4 + - Result: UExpectedOut1 + Rule: BufferExact + Actual: UOut1 + Expected: UExpectedOut1 + - Result: UExpectedOut2 + Rule: BufferExact + Actual: UOut2 + Expected: UExpectedOut2 + - Result: UExpectedOut3 + Rule: BufferExact + Actual: UOut3 + Expected: UExpectedOut3 + - Result: UExpectedOut4 + Rule: BufferExact + Actual: UOut4 + Expected: UExpectedOut4 + - Result: FExpectedOut1 + Rule: BufferExact + Actual: FOut1 + Expected: FExpectedOut1 + - Result: FExpectedOut2 + Rule: BufferExact + Actual: FOut2 + Expected: FExpectedOut2 + - Result: FExpectedOut3 + Rule: BufferExact + Actual: FOut3 + Expected: FExpectedOut3 + - Result: FExpectedOut4 + Rule: BufferExact + Actual: FOut4 + Expected: FExpectedOut4 +DescriptorSets: + - Resources: + - Name: In + Kind: StructuredBuffer + DirectXBinding: + Register: 0 + Space: 0 + VulkanBinding: + Binding: 0 + - Name: Out1 + Kind: RWStructuredBuffer + DirectXBinding: + 
Register: 1 + Space: 0 + VulkanBinding: + Binding: 1 + - Name: Out2 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 2 + Space: 0 + VulkanBinding: + Binding: 2 + - Name: Out3 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 3 + Space: 0 + VulkanBinding: + Binding: 3 + - Name: Out4 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 4 + Space: 0 + VulkanBinding: + Binding: 4 + - Name: UIn + Kind: StructuredBuffer + DirectXBinding: + Register: 5 + Space: 0 + VulkanBinding: + Binding: 5 + - Name: UOut1 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 6 + Space: 0 + VulkanBinding: + Binding: 6 + - Name: UOut2 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 7 + Space: 0 + VulkanBinding: + Binding: 7 + - Name: UOut3 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 8 + Space: 0 + VulkanBinding: + Binding: 8 + - Name: UOut4 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 9 + Space: 0 + VulkanBinding: + Binding: 9 + - Name: FIn + Kind: StructuredBuffer + DirectXBinding: + Register: 10 + Space: 0 + VulkanBinding: + Binding: 10 + - Name: FOut1 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 11 + Space: 0 + VulkanBinding: + Binding: 11 + - Name: FOut2 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 12 + Space: 0 + VulkanBinding: + Binding: 12 + - Name: FOut3 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 13 + Space: 0 + VulkanBinding: + Binding: 13 + - Name: FOut4 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 14 + Space: 0 + VulkanBinding: + Binding: 14 + +... 
+#--- end + +# QuadReadAcrossDiagonal is not yet supported by Clang. A branch adding the intrinsic exists; its PR can be +# opened once https://github.com/llvm/llvm-project/pull/187440 is merged. +# XFAIL: Clang + +# Bug: https://github.com/llvm/offload-test-suite/issues/986 +# XFAIL: Intel && Vulkan && DXC + +# Bug: https://github.com/llvm/offload-test-suite/issues/989 +# XFAIL: Metal + +# RUN: split-file %s %t +# RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl +# RUN: %offloader %t/pipeline.yaml %t.o diff --git a/test/WaveOps/QuadReadAcrossDiagonal.convergence.test b/test/WaveOps/QuadReadAcrossDiagonal.convergence.test new file mode 100644 index 000000000..ed651122b --- /dev/null +++ b/test/WaveOps/QuadReadAcrossDiagonal.convergence.test @@ -0,0 +1,75 @@ +#--- source.hlsl +StructuredBuffer<float> In : register(t0); +RWStructuredBuffer<float> Out : register(u1); + +[numthreads(2, 2, 1)] +void main(uint3 dtid : SV_DispatchThreadID) { + uint index = dtid.y * 2 + dtid.x; + float value = In[index]; + // Tests divergent control flow within the quad while keeping each lane's diagonal partner active in the same branch to avoid UB. 
+ + if(index == 0 || index == 3) { + // This reads (0, 0) and (1, 1) + float value_quad_d = QuadReadAcrossDiagonal(value); + Out[index] = value - value_quad_d; + } else { + // This reads (1, 0) and (0, 1) + float value_quad_d = QuadReadAcrossDiagonal(value); + Out[index] = value + value_quad_d; + } +} + +//--- pipeline.yaml + +--- +Shaders: + - Stage: Compute + Entry: main + DispatchSize: [ 1, 1, 1 ] +Buffers: + - Name: In + Format: Float32 + Stride: 4 + Data: [ 1.0, 10.0, 2.0, 20.0 ] + - Name: Out + Format: Float32 + Stride: 4 + FillSize: 16 + - Name: ExpectedOut + Format: Float32 + Stride: 4 + Data: [ -19.0, 12.0, 12.0, 19.0 ] +Results: + - Result: ExpectedOut + Rule: BufferExact + Actual: Out + Expected: ExpectedOut +DescriptorSets: + - Resources: + - Name: In + Kind: StructuredBuffer + DirectXBinding: + Register: 0 + Space: 0 + VulkanBinding: + Binding: 0 + - Name: Out + Kind: RWStructuredBuffer + DirectXBinding: + Register: 1 + Space: 0 + VulkanBinding: + Binding: 1 +... +#--- end + +# QuadReadAcrossDiagonal is not yet supported by Clang. A branch adding the intrinsic exists; its PR can be +# opened once https://github.com/llvm/llvm-project/pull/187440 is merged. +# XFAIL: Clang + +# Bug: https://github.com/llvm/offload-test-suite/issues/986 +# XFAIL: Intel && Vulkan && DXC + +# RUN: split-file %s %t +# RUN: %dxc_target -T cs_6_0 -Fo %t.o %t/source.hlsl +# RUN: %offloader %t/pipeline.yaml %t.o diff --git a/test/WaveOps/QuadReadAcrossDiagonal.fp16.test b/test/WaveOps/QuadReadAcrossDiagonal.fp16.test new file mode 100644 index 000000000..70958b40b --- /dev/null +++ b/test/WaveOps/QuadReadAcrossDiagonal.fp16.test @@ -0,0 +1,137 @@ +#--- source.hlsl +StructuredBuffer<half4> In: register(t0); +RWStructuredBuffer<half4> Out1 : register(u1); // test scalar +RWStructuredBuffer<half4> Out2 : register(u2); // test half2 +RWStructuredBuffer<half4> Out3 : register(u3); // test half3 +RWStructuredBuffer<half4> Out4 : register(u4); // test half4 + +[numthreads(2,2,1)] +void main(uint3 dtid : 
SV_DispatchThreadID) { + uint index = dtid.y * 2 + dtid.x; + half4 v = In[index]; + + half scalar = QuadReadAcrossDiagonal(v.x); + half2 vec2 = QuadReadAcrossDiagonal(v.xy); + half3 vec3 = QuadReadAcrossDiagonal(v.xyz); + half4 vec4 = QuadReadAcrossDiagonal(v); + + Out1[index].x = scalar; + Out2[index].xy = vec2; + Out3[index].xyz = vec3; + Out4[index] = vec4; +} + +//--- pipeline.yaml + +--- +Shaders: + - Stage: Compute + Entry: main + DispatchSize: [ 1, 1, 1 ] +Buffers: + - Name: In + Format: Float16 + Stride: 8 + Data: [ 0x3c00, 0x4000, 0x4200, 0x4400, 0x4500, 0x4600, 0x4700, 0x4800, 0x4880, 0x4900, 0x4980, 0x4a00, 0x4a80, 0x4b00, 0x4b80, 0x4c00 ] + - Name: Out1 + Format: Float16 + Stride: 8 + FillSize: 32 + - Name: Out2 + Format: Float16 + Stride: 8 + FillSize: 32 + - Name: Out3 + Format: Float16 + Stride: 8 + FillSize: 32 + - Name: Out4 + Format: Float16 + Stride: 8 + FillSize: 32 + - Name: ExpectedOut1 + Format: Float16 + Stride: 8 + Data: [ 0x4a80, 0x0, 0x0, 0x0, 0x4880, 0x0, 0x0, 0x0, 0x4500, 0x0, 0x0, 0x0, 0x3c00, 0x0, 0x0, 0x0 ] + - Name: ExpectedOut2 + Format: Float16 + Stride: 8 + Data: [ 0x4a80, 0x4b00, 0x0, 0x0, 0x4880, 0x4900, 0x0, 0x0, 0x4500, 0x4600, 0x0, 0x0, 0x3c00, 0x4000, 0x0, 0x0 ] + - Name: ExpectedOut3 + Format: Float16 + Stride: 8 + Data: [ 0x4a80, 0x4b00, 0x4b80, 0x0, 0x4880, 0x4900, 0x4980, 0x0, 0x4500, 0x4600, 0x4700, 0x0, 0x3c00, 0x4000, 0x4200, 0x0 ] + - Name: ExpectedOut4 + Format: Float16 + Stride: 8 + Data: [ 0x4a80, 0x4b00, 0x4b80, 0x4c00, 0x4880, 0x4900, 0x4980, 0x4a00, 0x4500, 0x4600, 0x4700, 0x4800, 0x3c00, 0x4000, 0x4200, 0x4400 ] +Results: + - Result: ExpectedOut1 + Rule: BufferExact + Actual: Out1 + Expected: ExpectedOut1 + - Result: ExpectedOut2 + Rule: BufferExact + Actual: Out2 + Expected: ExpectedOut2 + - Result: ExpectedOut3 + Rule: BufferExact + Actual: Out3 + Expected: ExpectedOut3 + - Result: ExpectedOut4 + Rule: BufferExact + Actual: Out4 + Expected: ExpectedOut4 +DescriptorSets: + - Resources: + - Name: In + Kind: 
StructuredBuffer + DirectXBinding: + Register: 0 + Space: 0 + VulkanBinding: + Binding: 0 + - Name: Out1 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 1 + Space: 0 + VulkanBinding: + Binding: 1 + - Name: Out2 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 2 + Space: 0 + VulkanBinding: + Binding: 2 + - Name: Out3 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 3 + Space: 0 + VulkanBinding: + Binding: 3 + - Name: Out4 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 4 + Space: 0 + VulkanBinding: + Binding: 4 + +... +#--- end + +# REQUIRES: Half + +# QuadReadAcrossDiagonal is not yet supported by Clang. A branch adding the intrinsic exists; its PR can be +# opened once https://github.com/llvm/llvm-project/pull/187440 is merged. +# XFAIL: Clang + +# Bug: https://github.com/llvm/offload-test-suite/issues/986 +# XFAIL: Intel && Vulkan && DXC + +# RUN: split-file %s %t +# RUN: %dxc_target -enable-16bit-types -T cs_6_5 -Fo %t.o %t/source.hlsl +# RUN: %offloader %t/pipeline.yaml %t.o diff --git a/test/WaveOps/QuadReadAcrossDiagonal.fp64.test b/test/WaveOps/QuadReadAcrossDiagonal.fp64.test new file mode 100644 index 000000000..b3203481e --- /dev/null +++ b/test/WaveOps/QuadReadAcrossDiagonal.fp64.test @@ -0,0 +1,137 @@ +#--- source.hlsl +StructuredBuffer<double4> In: register(t0); +RWStructuredBuffer<double4> Out1 : register(u1); // test scalar +RWStructuredBuffer<double4> Out2 : register(u2); // test double2 +RWStructuredBuffer<double4> Out3 : register(u3); // test double3 +RWStructuredBuffer<double4> Out4 : register(u4); // test double4 + +[numthreads(2,2,1)] +void main(uint3 dtid : SV_DispatchThreadID) { + uint index = dtid.y * 2 + dtid.x; + double4 v = In[index]; + + double scalar = QuadReadAcrossDiagonal(v.x); + double2 vec2 = QuadReadAcrossDiagonal(v.xy); + double3 vec3 = QuadReadAcrossDiagonal(v.xyz); + double4 vec4 = QuadReadAcrossDiagonal(v); + + Out1[index].x = scalar; + Out2[index].xy = vec2; + Out3[index].xyz = vec3; + Out4[index] = vec4; +} + +//--- 
pipeline.yaml + +--- +Shaders: + - Stage: Compute + Entry: main + DispatchSize: [ 1, 1, 1 ] +Buffers: + - Name: In + Format: Float64 + Stride: 32 + Data: [ 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0 ] + - Name: Out1 + Format: Float64 + Stride: 32 + FillSize: 128 + - Name: Out2 + Format: Float64 + Stride: 32 + FillSize: 128 + - Name: Out3 + Format: Float64 + Stride: 32 + FillSize: 128 + - Name: Out4 + Format: Float64 + Stride: 32 + FillSize: 128 + - Name: ExpectedOut1 + Format: Float64 + Stride: 32 + Data: [ 13.0, 0.0, 0.0, 0.0, 9.0, 0.0, 0.0, 0.0, 5.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0 ] + - Name: ExpectedOut2 + Format: Float64 + Stride: 32 + Data: [ 13.0, 14.0, 0.0, 0.0, 9.0, 10.0, 0.0, 0.0, 5.0, 6.0, 0.0, 0.0, 1.0, 2.0, 0.0, 0.0 ] + - Name: ExpectedOut3 + Format: Float64 + Stride: 32 + Data: [ 13.0, 14.0, 15.0, 0.0, 9.0, 10.0, 11.0, 0.0, 5.0, 6.0, 7.0, 0.0, 1.0, 2.0, 3.0, 0.0 ] + - Name: ExpectedOut4 + Format: Float64 + Stride: 32 + Data: [ 13.0, 14.0, 15.0, 16.0, 9.0, 10.0, 11.0, 12.0, 5.0, 6.0, 7.0, 8.0, 1.0, 2.0, 3.0, 4.0 ] +Results: + - Result: ExpectedOut1 + Rule: BufferExact + Actual: Out1 + Expected: ExpectedOut1 + - Result: ExpectedOut2 + Rule: BufferExact + Actual: Out2 + Expected: ExpectedOut2 + - Result: ExpectedOut3 + Rule: BufferExact + Actual: Out3 + Expected: ExpectedOut3 + - Result: ExpectedOut4 + Rule: BufferExact + Actual: Out4 + Expected: ExpectedOut4 +DescriptorSets: + - Resources: + - Name: In + Kind: StructuredBuffer + DirectXBinding: + Register: 0 + Space: 0 + VulkanBinding: + Binding: 0 + - Name: Out1 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 1 + Space: 0 + VulkanBinding: + Binding: 1 + - Name: Out2 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 2 + Space: 0 + VulkanBinding: + Binding: 2 + - Name: Out3 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 3 + Space: 0 + VulkanBinding: + Binding: 3 + - Name: Out4 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 4 
+ Space: 0 + VulkanBinding: + Binding: 4 + +... +#--- end + +# REQUIRES: Double + +# QuadReadAcrossDiagonal is not yet supported by Clang. A branch adding the intrinsic exists; its PR can be +# opened once https://github.com/llvm/llvm-project/pull/187440 is merged. +# XFAIL: Clang + +# Bug: https://github.com/llvm/offload-test-suite/issues/986 +# XFAIL: Intel && Vulkan && DXC + +# RUN: split-file %s %t +# RUN: %dxc_target -T cs_6_5 -Fo %t.o %t/source.hlsl +# RUN: %offloader %t/pipeline.yaml %t.o diff --git a/test/WaveOps/QuadReadAcrossDiagonal.int16.test b/test/WaveOps/QuadReadAcrossDiagonal.int16.test new file mode 100644 index 000000000..18b34e08e --- /dev/null +++ b/test/WaveOps/QuadReadAcrossDiagonal.int16.test @@ -0,0 +1,248 @@ +#--- source.hlsl +// ints +StructuredBuffer<int16_t4> In: register(t0); +RWStructuredBuffer<int16_t4> Out1 : register(u1); // test scalar +RWStructuredBuffer<int16_t4> Out2 : register(u2); // test int16_t2 +RWStructuredBuffer<int16_t4> Out3 : register(u3); // test int16_t3 +RWStructuredBuffer<int16_t4> Out4 : register(u4); // test int16_t4 + +// uints +StructuredBuffer<uint16_t4> UIn: register(t5); +RWStructuredBuffer<uint16_t4> UOut1 : register(u6); // test scalar +RWStructuredBuffer<uint16_t4> UOut2 : register(u7); // test uint16_t2 +RWStructuredBuffer<uint16_t4> UOut3 : register(u8); // test uint16_t3 +RWStructuredBuffer<uint16_t4> UOut4 : register(u9); // test uint16_t4 + +[numthreads(2,2,1)] +void main(uint3 dtid : SV_DispatchThreadID) { + uint index = dtid.y * 2 + dtid.x; + + // int case + int16_t4 v = In[index]; + int16_t scalar = QuadReadAcrossDiagonal(v.x); + int16_t2 vec2 = QuadReadAcrossDiagonal(v.xy); + int16_t3 vec3 = QuadReadAcrossDiagonal(v.xyz); + int16_t4 vec4 = QuadReadAcrossDiagonal(v); + + Out1[index].x = scalar; + Out2[index].xy = vec2; + Out3[index].xyz = vec3; + Out4[index] = vec4; + + // uint case + uint16_t4 uv = UIn[index]; + uint16_t uscalar = QuadReadAcrossDiagonal(uv.x); + uint16_t2 uvec2 = QuadReadAcrossDiagonal(uv.xy); + uint16_t3 uvec3 = QuadReadAcrossDiagonal(uv.xyz); + uint16_t4 uvec4 = 
QuadReadAcrossDiagonal(uv); + + UOut1[index].x = uscalar; + UOut2[index].xy = uvec2; + UOut3[index].xyz = uvec3; + UOut4[index] = uvec4; +} + +//--- pipeline.yaml + +--- +Shaders: + - Stage: Compute + Entry: main + DispatchSize: [ 1, 1, 1 ] +Buffers: + - Name: In + Format: Int16 + Stride: 8 + Data: [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 ] + - Name: Out1 + Format: Int16 + Stride: 8 + FillSize: 32 + - Name: Out2 + Format: Int16 + Stride: 8 + FillSize: 32 + - Name: Out3 + Format: Int16 + Stride: 8 + FillSize: 32 + - Name: Out4 + Format: Int16 + Stride: 8 + FillSize: 32 + - Name: ExpectedOut1 + Format: Int16 + Stride: 8 + Data: [ 13, 0, 0, 0, 9, 0, 0, 0, 5, 0, 0, 0, 1, 0, 0, 0 ] + - Name: ExpectedOut2 + Format: Int16 + Stride: 8 + Data: [ 13, 14, 0, 0, 9, 10, 0, 0, 5, 6, 0, 0, 1, 2, 0, 0 ] + - Name: ExpectedOut3 + Format: Int16 + Stride: 8 + Data: [ 13, 14, 15, 0, 9, 10, 11, 0, 5, 6, 7, 0, 1, 2, 3, 0 ] + - Name: ExpectedOut4 + Format: Int16 + Stride: 8 + Data: [ 13, 14, 15, 16, 9, 10, 11, 12, 5, 6, 7, 8, 1, 2, 3, 4 ] + - Name: UIn + Format: UInt16 + Stride: 8 + Data: [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 ] + - Name: UOut1 + Format: UInt16 + Stride: 8 + FillSize: 32 + - Name: UOut2 + Format: UInt16 + Stride: 8 + FillSize: 32 + - Name: UOut3 + Format: UInt16 + Stride: 8 + FillSize: 32 + - Name: UOut4 + Format: UInt16 + Stride: 8 + FillSize: 32 + - Name: UExpectedOut1 + Format: UInt16 + Stride: 8 + Data: [ 13, 0, 0, 0, 9, 0, 0, 0, 5, 0, 0, 0, 1, 0, 0, 0 ] + - Name: UExpectedOut2 + Format: UInt16 + Stride: 8 + Data: [ 13, 14, 0, 0, 9, 10, 0, 0, 5, 6, 0, 0, 1, 2, 0, 0 ] + - Name: UExpectedOut3 + Format: UInt16 + Stride: 8 + Data: [ 13, 14, 15, 0, 9, 10, 11, 0, 5, 6, 7, 0, 1, 2, 3, 0 ] + - Name: UExpectedOut4 + Format: UInt16 + Stride: 8 + Data: [ 13, 14, 15, 16, 9, 10, 11, 12, 5, 6, 7, 8, 1, 2, 3, 4 ] +Results: + - Result: ExpectedOut1 + Rule: BufferExact + Actual: Out1 + Expected: ExpectedOut1 + - Result: ExpectedOut2 + Rule: BufferExact 
+ Actual: Out2 + Expected: ExpectedOut2 + - Result: ExpectedOut3 + Rule: BufferExact + Actual: Out3 + Expected: ExpectedOut3 + - Result: ExpectedOut4 + Rule: BufferExact + Actual: Out4 + Expected: ExpectedOut4 + - Result: UExpectedOut1 + Rule: BufferExact + Actual: UOut1 + Expected: UExpectedOut1 + - Result: UExpectedOut2 + Rule: BufferExact + Actual: UOut2 + Expected: UExpectedOut2 + - Result: UExpectedOut3 + Rule: BufferExact + Actual: UOut3 + Expected: UExpectedOut3 + - Result: UExpectedOut4 + Rule: BufferExact + Actual: UOut4 + Expected: UExpectedOut4 +DescriptorSets: + - Resources: + - Name: In + Kind: StructuredBuffer + DirectXBinding: + Register: 0 + Space: 0 + VulkanBinding: + Binding: 0 + - Name: Out1 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 1 + Space: 0 + VulkanBinding: + Binding: 1 + - Name: Out2 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 2 + Space: 0 + VulkanBinding: + Binding: 2 + - Name: Out3 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 3 + Space: 0 + VulkanBinding: + Binding: 3 + - Name: Out4 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 4 + Space: 0 + VulkanBinding: + Binding: 4 + - Name: UIn + Kind: StructuredBuffer + DirectXBinding: + Register: 5 + Space: 0 + VulkanBinding: + Binding: 5 + - Name: UOut1 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 6 + Space: 0 + VulkanBinding: + Binding: 6 + - Name: UOut2 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 7 + Space: 0 + VulkanBinding: + Binding: 7 + - Name: UOut3 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 8 + Space: 0 + VulkanBinding: + Binding: 8 + - Name: UOut4 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 9 + Space: 0 + VulkanBinding: + Binding: 9 + +... 
+#--- end + +# REQUIRES: Int16 + +# QuadReadAcrossDiagonal is not yet supported by Clang. A branch adding the intrinsic exists; its PR can be +# opened once https://github.com/llvm/llvm-project/pull/187440 is merged. +# XFAIL: Clang + +# Bug: https://github.com/llvm/offload-test-suite/issues/986 +# XFAIL: Intel && Vulkan && DXC + +# Bug: https://github.com/llvm/offload-test-suite/issues/989 +# XFAIL: Metal + +# RUN: split-file %s %t +# RUN: %dxc_target -enable-16bit-types -T cs_6_5 -Fo %t.o %t/source.hlsl +# RUN: %offloader %t/pipeline.yaml %t.o diff --git a/test/WaveOps/QuadReadAcrossDiagonal.int64.test b/test/WaveOps/QuadReadAcrossDiagonal.int64.test new file mode 100644 index 000000000..f6819ea34 --- /dev/null +++ b/test/WaveOps/QuadReadAcrossDiagonal.int64.test @@ -0,0 +1,248 @@ +#--- source.hlsl +// ints +StructuredBuffer<int64_t4> In: register(t0); +RWStructuredBuffer<int64_t4> Out1 : register(u1); // test scalar +RWStructuredBuffer<int64_t4> Out2 : register(u2); // test int64_t2 +RWStructuredBuffer<int64_t4> Out3 : register(u3); // test int64_t3 +RWStructuredBuffer<int64_t4> Out4 : register(u4); // test int64_t4 + +// uints +StructuredBuffer<uint64_t4> UIn: register(t5); +RWStructuredBuffer<uint64_t4> UOut1 : register(u6); // test scalar +RWStructuredBuffer<uint64_t4> UOut2 : register(u7); // test uint64_t2 +RWStructuredBuffer<uint64_t4> UOut3 : register(u8); // test uint64_t3 +RWStructuredBuffer<uint64_t4> UOut4 : register(u9); // test uint64_t4 + +[numthreads(2,2,1)] +void main(uint3 dtid : SV_DispatchThreadID) { + uint index = dtid.y * 2 + dtid.x; + + // int case + int64_t4 v = In[index]; + int64_t scalar = QuadReadAcrossDiagonal(v.x); + int64_t2 vec2 = QuadReadAcrossDiagonal(v.xy); + int64_t3 vec3 = QuadReadAcrossDiagonal(v.xyz); + int64_t4 vec4 = QuadReadAcrossDiagonal(v); + + Out1[index].x = scalar; + Out2[index].xy = vec2; + Out3[index].xyz = vec3; + Out4[index] = vec4; + + // uint case + uint64_t4 uv = UIn[index]; + uint64_t uscalar = QuadReadAcrossDiagonal(uv.x); + uint64_t2 uvec2 = QuadReadAcrossDiagonal(uv.xy); + uint64_t3 uvec3 = 
QuadReadAcrossDiagonal(uv.xyz); + uint64_t4 uvec4 = QuadReadAcrossDiagonal(uv); + + UOut1[index].x = uscalar; + UOut2[index].xy = uvec2; + UOut3[index].xyz = uvec3; + UOut4[index] = uvec4; +} + +//--- pipeline.yaml + +--- +Shaders: + - Stage: Compute + Entry: main + DispatchSize: [ 1, 1, 1 ] +Buffers: + - Name: In + Format: Int64 + Stride: 32 + Data: [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 ] + - Name: Out1 + Format: Int64 + Stride: 32 + FillSize: 128 + - Name: Out2 + Format: Int64 + Stride: 32 + FillSize: 128 + - Name: Out3 + Format: Int64 + Stride: 32 + FillSize: 128 + - Name: Out4 + Format: Int64 + Stride: 32 + FillSize: 128 + - Name: ExpectedOut1 + Format: Int64 + Stride: 32 + Data: [ 13, 0, 0, 0, 9, 0, 0, 0, 5, 0, 0, 0, 1, 0, 0, 0 ] + - Name: ExpectedOut2 + Format: Int64 + Stride: 32 + Data: [ 13, 14, 0, 0, 9, 10, 0, 0, 5, 6, 0, 0, 1, 2, 0, 0 ] + - Name: ExpectedOut3 + Format: Int64 + Stride: 32 + Data: [ 13, 14, 15, 0, 9, 10, 11, 0, 5, 6, 7, 0, 1, 2, 3, 0 ] + - Name: ExpectedOut4 + Format: Int64 + Stride: 32 + Data: [ 13, 14, 15, 16, 9, 10, 11, 12, 5, 6, 7, 8, 1, 2, 3, 4 ] + - Name: UIn + Format: UInt64 + Stride: 32 + Data: [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 ] + - Name: UOut1 + Format: UInt64 + Stride: 32 + FillSize: 128 + - Name: UOut2 + Format: UInt64 + Stride: 32 + FillSize: 128 + - Name: UOut3 + Format: UInt64 + Stride: 32 + FillSize: 128 + - Name: UOut4 + Format: UInt64 + Stride: 32 + FillSize: 128 + - Name: UExpectedOut1 + Format: UInt64 + Stride: 32 + Data: [ 13, 0, 0, 0, 9, 0, 0, 0, 5, 0, 0, 0, 1, 0, 0, 0 ] + - Name: UExpectedOut2 + Format: UInt64 + Stride: 32 + Data: [ 13, 14, 0, 0, 9, 10, 0, 0, 5, 6, 0, 0, 1, 2, 0, 0 ] + - Name: UExpectedOut3 + Format: UInt64 + Stride: 32 + Data: [ 13, 14, 15, 0, 9, 10, 11, 0, 5, 6, 7, 0, 1, 2, 3, 0 ] + - Name: UExpectedOut4 + Format: UInt64 + Stride: 32 + Data: [ 13, 14, 15, 16, 9, 10, 11, 12, 5, 6, 7, 8, 1, 2, 3, 4 ] +Results: + - Result: ExpectedOut1 + Rule: BufferExact + 
Actual: Out1 + Expected: ExpectedOut1 + - Result: ExpectedOut2 + Rule: BufferExact + Actual: Out2 + Expected: ExpectedOut2 + - Result: ExpectedOut3 + Rule: BufferExact + Actual: Out3 + Expected: ExpectedOut3 + - Result: ExpectedOut4 + Rule: BufferExact + Actual: Out4 + Expected: ExpectedOut4 + - Result: UExpectedOut1 + Rule: BufferExact + Actual: UOut1 + Expected: UExpectedOut1 + - Result: UExpectedOut2 + Rule: BufferExact + Actual: UOut2 + Expected: UExpectedOut2 + - Result: UExpectedOut3 + Rule: BufferExact + Actual: UOut3 + Expected: UExpectedOut3 + - Result: UExpectedOut4 + Rule: BufferExact + Actual: UOut4 + Expected: UExpectedOut4 +DescriptorSets: + - Resources: + - Name: In + Kind: StructuredBuffer + DirectXBinding: + Register: 0 + Space: 0 + VulkanBinding: + Binding: 0 + - Name: Out1 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 1 + Space: 0 + VulkanBinding: + Binding: 1 + - Name: Out2 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 2 + Space: 0 + VulkanBinding: + Binding: 2 + - Name: Out3 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 3 + Space: 0 + VulkanBinding: + Binding: 3 + - Name: Out4 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 4 + Space: 0 + VulkanBinding: + Binding: 4 + - Name: UIn + Kind: StructuredBuffer + DirectXBinding: + Register: 5 + Space: 0 + VulkanBinding: + Binding: 5 + - Name: UOut1 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 6 + Space: 0 + VulkanBinding: + Binding: 6 + - Name: UOut2 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 7 + Space: 0 + VulkanBinding: + Binding: 7 + - Name: UOut3 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 8 + Space: 0 + VulkanBinding: + Binding: 8 + - Name: UOut4 + Kind: RWStructuredBuffer + DirectXBinding: + Register: 9 + Space: 0 + VulkanBinding: + Binding: 9 + +... 
+#--- end + +# REQUIRES: Int64 + +# QuadReadAcrossDiagonal is not yet supported by Clang. A branch adding the intrinsic exists; its PR can be +# opened once https://github.com/llvm/llvm-project/pull/187440 is merged. +# XFAIL: Clang + +# Bug: https://github.com/llvm/offload-test-suite/issues/986 +# XFAIL: Intel && Vulkan && DXC + +# Bug: https://github.com/llvm/offload-test-suite/issues/988 +# XFAIL: Metal + +# RUN: split-file %s %t +# RUN: %dxc_target -T cs_6_0 -Fo %t.o %t/source.hlsl +# RUN: %offloader %t/pipeline.yaml %t.o