-
Notifications
You must be signed in to change notification settings - Fork 96
Expand file tree
/
Copy path: sync_api_noneusm.cu
More file actions
121 lines (107 loc) · 5.95 KB
/
sync_api_noneusm.cu
File metadata and controls
121 lines (107 loc) · 5.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
// UNSUPPORTED: cuda-8.0
// UNSUPPORTED: v8.0
// RUN: dpct --format-range=none -out-root %T/sync_api_noneusm %s --cuda-include-path="%cuda-path/include" --usm-level=none --use-experimental-features=root-group -- -x cuda --cuda-host-only
// RUN: FileCheck %s --match-full-lines --input-file %T/sync_api_noneusm/sync_api_noneusm.dp.cpp
// RUN: %if build_lit %{icpx -c -fsycl %T/sync_api_noneusm/sync_api_noneusm.dp.cpp -o %T/sync_api_noneusm/sync_api_noneusm.dp.o %}
// CHECK: #include <sycl/sycl.hpp>
// CHECK-NEXT: #include <dpct/dpct.hpp>
#include "cooperative_groups.h"
namespace cg = cooperative_groups;
using namespace cooperative_groups;
// TB(b) declares a cooperative-groups thread_block; migration must rewrite the
// macro body to the SYCL free-function work-group query.
// CHECK: #define TB(b) auto b = sycl::ext::oneapi::this_work_item::get_work_group<3>();
#define TB(b) cg::thread_block b = cg::this_thread_block();
// Empty device helper: lets the test place sync builtins in call-argument position.
__device__ void foo(int i) {}
// Function-like macro wrapper around foo: exercises migration of sync builtins
// when they are expanded inside a macro argument.
#define FOO(x) foo(x)
// Kernel covering migration of the block-level sync APIs: cooperative-groups
// thread_block construction and sync (direct, via cg::sync, via temporary, and
// via the TB macro), __syncthreads and its aggregate variants
// (__syncthreads_and/_or/_count) in statement, initializer, and call-argument
// positions (including macro expansion via FOO), __syncwarp, and the
// __threadfence* memory fences.
// CHECK: void k() {
__global__ void k() {
// CHECK: sycl::group<3> cta = sycl::ext::oneapi::this_work_item::get_work_group<3>();
cg::thread_block cta = cg::this_thread_block();
// CHECK: item_ct1.barrier();
cg::sync(cta);
// CHECK: sycl::group<3> block = sycl::ext::oneapi::this_work_item::get_work_group<3>();
cg::thread_block block = cg::this_thread_block();
// CHECK: item_ct1.barrier(sycl::access::fence_space::local_space);
__syncthreads();
// CHECK: item_ct1.barrier();
block.sync();
// CHECK: item_ct1.barrier();
cg::sync(block);
// CHECK: item_ct1.barrier();
cg::this_thread_block().sync();
// CHECK: item_ct1.barrier();
cg::sync(cg::this_thread_block());
// CHECK: sycl::group<3> b0 = sycl::ext::oneapi::this_work_item::get_work_group<3>(), b1 = sycl::ext::oneapi::this_work_item::get_work_group<3>();
cg::thread_block b0 = cg::this_thread_block(), b1 = cg::this_thread_block();
TB(blk);
int p;
// Memory fences: each __threadfence* maps to sycl::atomic_fence at the
// corresponding scope (work_group/device/system) plus a DPCT1078 advisory.
// CHECK: /*
// CHECK-NEXT: DPCT1078:{{[0-9]+}}: Consider replacing memory_order::acq_rel with memory_order::seq_cst for correctness if strong memory order restrictions are needed.
// CHECK-NEXT: */
// CHECK-NEXT: sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::work_group);
__threadfence_block();
// CHECK: /*
// CHECK-NEXT: DPCT1078:{{[0-9]+}}: Consider replacing memory_order::acq_rel with memory_order::seq_cst for correctness if strong memory order restrictions are needed.
// CHECK-NEXT: */
// CHECK-NEXT: sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::device);
__threadfence();
// CHECK: /*
// CHECK-NEXT: DPCT1078:{{[0-9]+}}: Consider replacing memory_order::acq_rel with memory_order::seq_cst for correctness if strong memory order restrictions are needed.
// CHECK-NEXT: */
// CHECK-NEXT: sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::system);
__threadfence_system();
// CHECK: item_ct1.barrier();
// CHECK-NEXT: sycl::all_of_group(sycl::ext::oneapi::this_work_item::get_work_group<3>(), p);
__syncthreads_and(p);
// CHECK: item_ct1.barrier();
// CHECK-NEXT: sycl::any_of_group(sycl::ext::oneapi::this_work_item::get_work_group<3>(), p);
__syncthreads_or(p);
// CHECK: item_ct1.barrier();
// CHECK-NEXT: sycl::reduce_over_group(sycl::ext::oneapi::this_work_item::get_work_group<3>(), p == 0 ? 0 : 1, sycl::ext::oneapi::plus<>());
__syncthreads_count(p);
// CHECK: sycl::group_barrier(sycl::ext::oneapi::this_work_item::get_sub_group());
__syncwarp(0xffffffff);
// When the result of an aggregate __syncthreads* builtin is consumed (as an
// initializer or a call argument), the migration must fold the barrier and the
// group operation into a single comma expression.
// CHECK: int a = (item_ct1.barrier(), sycl::all_of_group(sycl::ext::oneapi::this_work_item::get_work_group<3>(), p));
int a = __syncthreads_and(p);
// CHECK: int b = (item_ct1.barrier(), sycl::any_of_group(sycl::ext::oneapi::this_work_item::get_work_group<3>(), p));
int b = __syncthreads_or(p);
// CHECK: int c = (item_ct1.barrier(), sycl::reduce_over_group(sycl::ext::oneapi::this_work_item::get_work_group<3>(), p == 0 ? 0 : 1, sycl::ext::oneapi::plus<>()));
int c = __syncthreads_count(p);
// CHECK: foo((item_ct1.barrier(), sycl::all_of_group(sycl::ext::oneapi::this_work_item::get_work_group<3>(), p)));
foo(__syncthreads_and(p));
// CHECK: foo((item_ct1.barrier(), sycl::any_of_group(sycl::ext::oneapi::this_work_item::get_work_group<3>(), p)));
foo(__syncthreads_or(p));
// CHECK: foo((item_ct1.barrier(), sycl::reduce_over_group(sycl::ext::oneapi::this_work_item::get_work_group<3>(), p == 0 ? 0 : 1, sycl::ext::oneapi::plus<>())));
foo(__syncthreads_count(p));
// Inside a macro argument the migration cannot rely on an item_ct1 parameter
// being in scope, so it uses the free-function get_nd_item<3>() query instead.
// CHECK: FOO((sycl::ext::oneapi::this_work_item::get_nd_item<3>().barrier(), sycl::all_of_group(sycl::ext::oneapi::this_work_item::get_work_group<3>(), p)));
FOO(__syncthreads_and(p));
// CHECK: FOO((sycl::ext::oneapi::this_work_item::get_nd_item<3>().barrier(), sycl::any_of_group(sycl::ext::oneapi::this_work_item::get_work_group<3>(), p)));
FOO(__syncthreads_or(p));
// CHECK: FOO((sycl::ext::oneapi::this_work_item::get_nd_item<3>().barrier(), sycl::reduce_over_group(sycl::ext::oneapi::this_work_item::get_work_group<3>(), p == 0 ? 0 : 1, sycl::ext::oneapi::plus<>())));
FOO(__syncthreads_count(p));
}
// Grid-wide synchronization: cg::this_grid()/grid.sync() migrates to the SYCL
// experimental root-group barrier (enabled via --use-experimental-features=root-group
// on the RUN line). No comments may be inserted inside the body: the CHECK-NEXT
// chain below asserts the migrated function line by line.
// CHECK: void kernel() {
// CHECK-NEXT: sycl::ext::oneapi::experimental::root_group grid = sycl::ext::oneapi::this_work_item::get_nd_item<3>().ext_oneapi_get_root_group();
// CHECK-NEXT: sycl::group_barrier(grid);
// CHECK-NEXT: }
__global__ void kernel() {
cg::grid_group grid = cg::this_grid();
grid.sync();
}
// Host side: with --usm-level=none, the <<<2,2>>> launch of a kernel that uses
// grid.sync() must migrate to an experimental nd_launch carrying the
// use_root_sync property, and cudaDeviceSynchronize() must become
// dpct::get_current_device().queues_wait_and_throw(). The CHECK-NEXT chain
// asserts the full generated launch block, so no comments may be placed
// between the two statements below.
int main() {
// CHECK: {
// CHECK-NEXT: auto exp_props = sycl::ext::oneapi::experimental::properties{sycl::ext::oneapi::experimental::use_root_sync};
// CHECK-EMPTY:
// CHECK-NEXT: sycl::ext::oneapi::experimental::nd_launch(
// CHECK-NEXT: dpct::get_out_of_order_queue(),
// CHECK-NEXT: sycl::ext::oneapi::experimental::launch_config(
// CHECK-NEXT: sycl::nd_range<3>(sycl::range<3>(1, 1, 2) * sycl::range<3>(1, 1, 2), sycl::range<3>(1, 1, 2)),
// CHECK-NEXT: exp_props),
// CHECK-NEXT: [=](sycl::nd_item<3> item_ct1) {
// CHECK-NEXT: kernel();
// CHECK-NEXT: });
// CHECK-NEXT: }
// CHECK-NEXT: dpct::get_current_device().queues_wait_and_throw();
kernel<<<2, 2>>>();
cudaDeviceSynchronize();
return 0;
}