-
Notifications
You must be signed in to change notification settings - Fork 96
Expand file tree
/
Copy path: sync_api_noneusm.cu
More file actions
121 lines (107 loc) · 5.95 KB
/
sync_api_noneusm.cu
File metadata and controls
121 lines (107 loc) · 5.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
// UNSUPPORTED: cuda-8.0
// UNSUPPORTED: v8.0
// RUN: dpct --format-range=none -out-root %T/sync_api_noneusm %s --cuda-include-path="%cuda-path/include" --usm-level=none --use-experimental-features=root-group -- -x cuda --cuda-host-only
// RUN: FileCheck %s --match-full-lines --input-file %T/sync_api_noneusm/sync_api_noneusm.dp.cpp
// RUN: %if build_lit %{icpx -c -fsycl %T/sync_api_noneusm/sync_api_noneusm.dp.cpp -o %T/sync_api_noneusm/sync_api_noneusm.dp.o %}
// CHECK: #include <sycl/sycl.hpp>
// CHECK-NEXT: #include <dpct/dpct.hpp>
#include "cooperative_groups.h"
namespace cg = cooperative_groups;
using namespace cooperative_groups;
// TB(b) declares a cooperative-groups thread_block; migration must rewrite the
// macro body to the SYCL free-function work-group query.
// CHECK: #define TB(b) auto b = sycl::ext::oneapi::this_work_item::get_work_group<3>();
#define TB(b) cg::thread_block b = cg::this_thread_block();
// Empty device helper: lets the test place sync builtins in call-argument position.
__device__ void foo(int i) {}
// Function-like macro wrapper around foo: exercises migration of sync builtins
// when they are expanded inside a macro argument.
#define FOO(x) foo(x)
// Kernel covering migration of the block-level sync APIs: cooperative-groups
// thread_block construction and sync (direct, via cg::sync, via temporary, and
// via the TB macro), __syncthreads and its aggregate variants
// (__syncthreads_and/_or/_count) in statement, initializer, and call-argument
// positions (including macro expansion via FOO), __syncwarp, and the
// __threadfence* memory fences.
// CHECK: void k() {
__global__ void k() {
// CHECK: sycl::group<3> cta = sycl::ext::oneapi::this_work_item::get_work_group<3>();
cg::thread_block cta = cg::this_thread_block();
// CHECK: item_ct1.barrier();
cg::sync(cta);
// CHECK: sycl::group<3> block = sycl::ext::oneapi::this_work_item::get_work_group<3>();
cg::thread_block block = cg::this_thread_block();
// CHECK: item_ct1.barrier(sycl::access::fence_space::local_space);
__syncthreads();
// CHECK: item_ct1.barrier();
block.sync();
// CHECK: item_ct1.barrier();
cg::sync(block);
// CHECK: item_ct1.barrier();
cg::this_thread_block().sync();
// CHECK: item_ct1.barrier();
cg::sync(cg::this_thread_block());
// CHECK: sycl::group<3> b0 = sycl::ext::oneapi::this_work_item::get_work_group<3>(), b1 = sycl::ext::oneapi::this_work_item::get_work_group<3>();
cg::thread_block b0 = cg::this_thread_block(), b1 = cg::this_thread_block();
TB(blk);
int p;
// Memory fences: each __threadfence* maps to sycl::atomic_fence at the
// corresponding scope (work_group/device/system) plus a DPCT1078 advisory.
// CHECK: /*
// CHECK-NEXT: DPCT1078:{{[0-9]+}}: Consider replacing memory_order::acq_rel with memory_order::seq_cst for correctness if strong memory order restrictions are needed.
// CHECK-NEXT: */
// CHECK-NEXT: sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::work_group);
__threadfence_block();
// CHECK: /*
// CHECK-NEXT: DPCT1078:{{[0-9]+}}: Consider replacing memory_order::acq_rel with memory_order::seq_cst for correctness if strong memory order restrictions are needed.
// CHECK-NEXT: */
// CHECK-NEXT: sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::device);
__threadfence();
// CHECK: /*
// CHECK-NEXT: DPCT1078:{{[0-9]+}}: Consider replacing memory_order::acq_rel with memory_order::seq_cst for correctness if strong memory order restrictions are needed.
// CHECK-NEXT: */
// CHECK-NEXT: sycl::atomic_fence(sycl::memory_order::acq_rel, sycl::memory_scope::system);
__threadfence_system();
// CHECK: item_ct1.barrier();
// CHECK-NEXT: sycl::all_of_group(sycl::ext::oneapi::this_work_item::get_work_group<3>(), p);
__syncthreads_and(p);
// CHECK: item_ct1.barrier();
// CHECK-NEXT: sycl::any_of_group(sycl::ext::oneapi::this_work_item::get_work_group<3>(), p);
__syncthreads_or(p);
// CHECK: item_ct1.barrier();
// CHECK-NEXT: sycl::reduce_over_group(sycl::ext::oneapi::this_work_item::get_work_group<3>(), p == 0 ? 0 : 1, sycl::ext::oneapi::plus<>());
__syncthreads_count(p);
// CHECK: sycl::group_barrier(sycl::ext::oneapi::this_work_item::get_sub_group());
__syncwarp(0xffffffff);
// When the result of an aggregate __syncthreads* builtin is consumed (as an
// initializer or a call argument), the migration must fold the barrier and the
// group operation into a single comma expression.
// CHECK: int a = (item_ct1.barrier(), sycl::all_of_group(sycl::ext::oneapi::this_work_item::get_work_group<3>(), p));
int a = __syncthreads_and(p);
// CHECK: int b = (item_ct1.barrier(), sycl::any_of_group(sycl::ext::oneapi::this_work_item::get_work_group<3>(), p));
int b = __syncthreads_or(p);
// CHECK: int c = (item_ct1.barrier(), sycl::reduce_over_group(sycl::ext::oneapi::this_work_item::get_work_group<3>(), p == 0 ? 0 : 1, sycl::ext::oneapi::plus<>()));
int c = __syncthreads_count(p);
// CHECK: foo((item_ct1.barrier(), sycl::all_of_group(sycl::ext::oneapi::this_work_item::get_work_group<3>(), p)));
foo(__syncthreads_and(p));
// CHECK: foo((item_ct1.barrier(), sycl::any_of_group(sycl::ext::oneapi::this_work_item::get_work_group<3>(), p)));
foo(__syncthreads_or(p));
// CHECK: foo((item_ct1.barrier(), sycl::reduce_over_group(sycl::ext::oneapi::this_work_item::get_work_group<3>(), p == 0 ? 0 : 1, sycl::ext::oneapi::plus<>())));
foo(__syncthreads_count(p));
// Inside a macro argument the migration cannot rely on an item_ct1 parameter
// being in scope, so it uses the free-function get_nd_item<3>() query instead.
// CHECK: FOO((sycl::ext::oneapi::this_work_item::get_nd_item<3>().barrier(), sycl::all_of_group(sycl::ext::oneapi::this_work_item::get_work_group<3>(), p)));
FOO(__syncthreads_and(p));
// CHECK: FOO((sycl::ext::oneapi::this_work_item::get_nd_item<3>().barrier(), sycl::any_of_group(sycl::ext::oneapi::this_work_item::get_work_group<3>(), p)));
FOO(__syncthreads_or(p));
// CHECK: FOO((sycl::ext::oneapi::this_work_item::get_nd_item<3>().barrier(), sycl::reduce_over_group(sycl::ext::oneapi::this_work_item::get_work_group<3>(), p == 0 ? 0 : 1, sycl::ext::oneapi::plus<>())));
FOO(__syncthreads_count(p));
}
// Grid-wide synchronization: cg::this_grid()/grid.sync() migrates to the SYCL
// experimental root-group barrier (enabled via --use-experimental-features=root-group
// on the RUN line). No comments may be inserted inside the body: the CHECK-NEXT
// chain below asserts the migrated function line by line.
// CHECK: void kernel() {
// CHECK-NEXT: sycl::ext::oneapi::experimental::root_group grid = sycl::ext::oneapi::this_work_item::get_nd_item<3>().ext_oneapi_get_root_group();
// CHECK-NEXT: sycl::group_barrier(grid);
// CHECK-NEXT: }
__global__ void kernel() {
cg::grid_group grid = cg::this_grid();
grid.sync();
}
// Host side: with --usm-level=none, the <<<2,2>>> launch of a kernel that uses
// grid.sync() must migrate to an experimental nd_launch carrying the
// use_root_sync property, and cudaDeviceSynchronize() must become
// dpct::get_current_device().queues_wait_and_throw(). The CHECK-NEXT chain
// asserts the full generated launch block, so no comments may be placed
// between the two statements below.
int main() {
// CHECK: {
// CHECK-NEXT: auto exp_props = sycl::ext::oneapi::experimental::properties{sycl::ext::oneapi::experimental::use_root_sync};
// CHECK-EMPTY:
// CHECK-NEXT: sycl::ext::oneapi::experimental::nd_launch(
// CHECK-NEXT: dpct::get_out_of_order_queue(),
// CHECK-NEXT: sycl::ext::oneapi::experimental::launch_config(
// CHECK-NEXT: sycl::nd_range<3>(sycl::range<3>(1, 1, 2) * sycl::range<3>(1, 1, 2), sycl::range<3>(1, 1, 2)),
// CHECK-NEXT: exp_props),
// CHECK-NEXT: [=](sycl::nd_item<3> item_ct1) {
// CHECK-NEXT: kernel();
// CHECK-NEXT: });
// CHECK-NEXT: }
// CHECK-NEXT: dpct::get_current_device().queues_wait_and_throw();
kernel<<<2, 2>>>();
cudaDeviceSynchronize();
return 0;
}