diff --git a/.github/workflows/examples_other.yml b/.github/workflows/examples_other.yml new file mode 100644 index 00000000..a2cc3a76 --- /dev/null +++ b/.github/workflows/examples_other.yml @@ -0,0 +1,49 @@ +name: examples/other + +on: + push: + branches: + - main + pull_request: + branches: + - main + +jobs: + test_pykokkos: + strategy: + matrix: + platform: [ubuntu-latest] + python-version: ["3.13"] + runs-on: ${{ matrix.platform }} + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install --upgrade numpy mypy==1.0.1 cmake pytest pybind11 scikit-build patchelf + - name: Install pykokkos-base + run: | + python install_base.py install -- -DENABLE_LAYOUTS=ON -DENABLE_MEMORY_TRAITS=OFF -DENABLE_VIEW_RANKS=3 + - name: Install pykokkos + run: | + python -m pip install . + - name: mypy check + run: | + mypy pykokkos + - name: Run all other examples + run: | + for f in examples/graphs/*.py examples/pykokkos/*.py; do + echo "Running $f..." + output=$(python "$f" 2>&1) || { + if ! echo "$output" | grep -q "ModuleNotFoundError"; then + echo "$output" + exit 1 + fi + echo "Ignoring ModuleNotFoundError in $f" + } + echo "$output" + done diff --git a/examples/graphs/bfs_bottomup.py b/examples/graphs/bfs_bottomup.py index 46f1c27c..9ca29f5a 100644 --- a/examples/graphs/bfs_bottomup.py +++ b/examples/graphs/bfs_bottomup.py @@ -1,122 +1,174 @@ -from typing import Tuple - -import pykokkos as pk - import argparse +from collections import deque +import numpy as np_cpu -@pk.workload -class Workload: - def __init__(self, N: int, M: int): - self.N: int = N - self.M: int = M - - self.val: pk.View1D[pk.double] = pk.View([N * M], pk.double) - self.visited: pk.View1D[int] = pk.View([N * M], int) - self.mat: pk.View2D[pk.double] = pk.View([N, M], pk.double) - self.max_arr: pk.View1D[pk.double] = pk.View([N], pk.double) - self.max_arr2D: pk.View2D[pk.double] = pk.View([N, N], pk.double) - - self.element: float = N * M - - self.val.fill(N + M) - - self.visited.fill(0) - - # Initialize the input matrix, can be design to be any binary matrix - # In this example, mat[0][1] & mat[0][3] will be 0, others will be 1 - self.mat.fill(1) - self.mat[0][1] = 0 - self.mat[0][3] = 0 - - @pk.main - def run(self): - # do the bfs - for i in range(self.N + self.M): - pk.parallel_for("bfs_bottomup", self.element, self.check_vis) - - # after bfs, find maximum value in each row - pk.parallel_for("bfs_bottomup", self.N, self.findmax) - - # find the maximum value of all cell - pk.parallel_for("bfs_bottomup", self.N, self.extend2D) - pk.parallel_for("bfs_bottomup", self.N, self.reduce1D) +import pykokkos as pk - @pk.callback - def results(self): - print(f"\ndistance of every cell:\n") - for i in range(self.element): - print(f"val ({self.val[i]}) ", end="") - if (i + 1) % self.M == 0: - print(f"\n") - print(f"The farthest distance is {self.max_arr[0]}") +if pk.get_default_space() in pk.DeviceExecutionSpace: + import cupy as np +else: + import numpy as np + + +def _view_to_numpy_host(x): + """Host NumPy array for comparisons (handles Cupy-backed views).""" + if hasattr(x, "get"): + return np_cpu.asarray(x.get()) + return np_cpu.asarray(x) + + +def reference_grid_bfs_distances(N: int, M: int, mat) -> np_cpu.ndarray: + """ + Shortest hop count (4-neighbor grid) from every cell to any cell with mat==0. + Same graph as the PyKokkos workunits: vertices are grid cells, edges to N/E/S/W + neighbors. Multi-source BFS using a queue, following the standard pattern in + https://www.geeksforgeeks.org/python/python-program-for-breadth-first-search-or-bfs-for-a-graph/ + """ + mat_h = _view_to_numpy_host(mat) + dist = np_cpu.full(N * M, -1, dtype=np_cpu.int32) + q = deque() + for r in range(N): + for c in range(M): + if mat_h[r, c] == 0: + idx = r * M + c + dist[idx] = 0 + q.append((r, c)) + while q: + r, c = q.popleft() + d = int(dist[r * M + c]) + for dr, dc in ((-1, 0), (1, 0), (0, -1), (0, 1)): + nr, nc = r + dr, c + dc + if 0 <= nr < N and 0 <= nc < M: + ni = nr * M + nc + if dist[ni] == -1: + dist[ni] = d + 1 + q.append((nr, nc)) + return dist.astype(np_cpu.float64) + + +def assert_bfs_matches_pykokkos(N: int, M: int, mat, val, max_arr) -> None: + ref = reference_grid_bfs_distances(N, M, mat) + val_h = _view_to_numpy_host(val) + max_h = float(_view_to_numpy_host(max_arr)[0]) + if not np_cpu.allclose(val_h, ref, rtol=0.0, atol=1e-9): + bad = np_cpu.where(np_cpu.abs(val_h - ref) > 1e-9)[0][:16] + raise AssertionError( + f"distance mismatch at linear indices (first few): {bad.tolist()}" + ) + ref_max = float(np_cpu.max(ref)) + if not np_cpu.isclose(max_h, ref_max, rtol=0.0, atol=1e-9): + raise AssertionError(f"max distance mismatch: got {max_h}, expected {ref_max}") + print("BFS correctness check: OK (matches queue-based NumPy reference)") + + +def main(N: int, M: int): + element: int = N * M + + val = np.full(N * M, float(N + M), dtype=np.double) + visited = np.zeros(N * M, dtype=np.int32) + + mat = np.ones((N, M), dtype=np.double) + mat[0][1] = 0 + mat[0][3] = 0 + + max_arr = np.zeros(N, dtype=np.double) + max_arr2D = np.zeros((N, N), dtype=np.double) + + for i in range(N + M): + pk.parallel_for(element, check_vis, N=N, M=M, mat=mat, val=val, visited=visited) + + pk.parallel_for(N, findmax, M=M, val=val, max_arr=max_arr) + + pk.parallel_for(N, extend2D, N=N, max_arr=max_arr, max_arr2D=max_arr2D) + pk.parallel_for(N, reduce1D, N=N, max_arr=max_arr, max_arr2D=max_arr2D) + + assert_bfs_matches_pykokkos(N, M, mat, val, max_arr) + + print(f"\ndistance of every cell:\n") + for i in range(element): + print(f"val ({val[i]}) ", end="") + if (i + 1) % M == 0: + print(f"\n") + print(f"The farthest distance is {max_arr[0]}") ################################ # check_vis will operate breadth-first search # self.visited[i] will be 1 if self.val[i] = 0 or if self.visited[j] = 1 # where j is one of the neighbor of i ################################ - @pk.workunit - def check_vis(self, i: int): - var_row: int = i // self.M - var_col: int = i % self.M - min_val: float = self.val[i] - - flag: int = 0 - - # if the value of the current index is 0, then the distance is 0, - # and the node is marked as visited - # otherwise, check whether the neighbors were visited, - # if visited, the value of the current index can be decided - if self.mat[var_row][var_col] == 0 and self.visited[i] == 0: - self.visited[i] = 1 - self.val[i] = 0 - else: - # check the neighbor on the previous row - if i >= self.M: - if self.visited[i - self.M] == 1: - flag = 1 - if min_val > self.val[i - self.M]: - min_val = self.val[i - self.M] - - # check the neighbor on the next row - if i // self.M < (self.N - 1): - if self.visited[i + self.M] == 1: - flag = 1 - if min_val > self.val[i + self.M]: - min_val = self.val[i + self.M] - - # check the neighbor on the left - if i % self.M > 0: - if self.visited[i - 1] == 1: - flag = 1 - if min_val > self.val[i - 1]: - min_val = self.val[i - 1] - - # check the neighbor on the right - if i % self.M < (self.M - 1): - if self.visited[i + 1] == 1: - flag = 1 - if min_val > self.val[i + 1]: - min_val = self.val[i + 1] + + +@pk.workunit +def check_vis( + i: int, + N: int, + M: int, + mat: pk.View2D[pk.double], + val: pk.View1D[pk.double], + visited: pk.View1D[int], +): + var_row: int = i // M + var_col: int = i % M + min_val: float = val[i] + + flag: int = 0 + + # if the value of the current index is 0, then the distance is 0, + # and the node is marked as visited + # otherwise, check whether the neighbors were visited, + # if visited, the value of the current index can be decided + if mat[var_row][var_col] == 0 and visited[i] == 0: + visited[i] = 1 + val[i] = 0 + else: + # check the neighbor on the previous row + if i >= M: + if visited[i - M] == 1: + flag = 1 + if min_val > val[i - M]: + min_val = val[i - M] + + # check the neighbor on the next row + if i // M < (N - 1): + if visited[i + M] == 1: + flag = 1 + if min_val > val[i + M]: + min_val = val[i + M] + + # check the neighbor on the left + if i % M > 0: + if visited[i - 1] == 1: + flag = 1 + if min_val > val[i - 1]: + min_val = val[i - 1] + + # check the neighbor on the right + if i % M < (M - 1): + if visited[i + 1] == 1: + flag = 1 + if min_val > val[i + 1]: + min_val = val[i + 1] # if there is at least one neighbor visited, the value of # the current index can be updated and should be marked as visited if flag == 1: - if self.val[i] > min_val: - self.val[i] = min_val + 1 - self.visited[i] = 1 + if val[i] > min_val: + val[i] = min_val + 1 + visited[i] = 1 ################################ # findmax will find the maximum value of cell in each row ################################ - @pk.workunit - def findmax(self, j: int): - tmp_max: float = 0 - for i in range(self.M): - if tmp_max < self.val[j * self.M + i]: - tmp_max = self.val[j * self.M + i] - self.max_arr[j] = tmp_max + + +@pk.workunit +def findmax(j: int, M: int, val: pk.View1D[pk.double], max_arr: pk.View1D[pk.double]): + tmp_max: float = 0 + for i in range(M): + if tmp_max < val[j * M + i]: + tmp_max = val[j * M + i] + max_arr[j] = tmp_max ################################ # extend2D and reduce1D are for finding the maximum value of all cell @@ -134,51 +186,40 @@ def findmax(self, j: int): # [5, 5, 5] # then self.max_arr[0] will be the maximum distance ################################ - @pk.workunit - def extend2D(self, j: int): - for i in range(self.N): - self.max_arr2D[i][j] = self.max_arr[j] - @pk.workunit - def reduce1D(self, j: int): - tmp_max: float = 0 - for i in range(self.N): - if tmp_max < self.max_arr2D[j][i]: - tmp_max = self.max_arr2D[j][i] - self.max_arr[j] = tmp_max + +@pk.workunit +def extend2D( + j: int, N: int, max_arr: pk.View1D[pk.double], max_arr2D: pk.View2D[pk.double] +): + for i in range(N): + max_arr2D[i][j] = max_arr[j] + + +@pk.workunit +def reduce1D( + j: int, N: int, max_arr: pk.View1D[pk.double], max_arr2D: pk.View2D[pk.double] +): + tmp_max: float = 0 + for i in range(N): + if tmp_max < max_arr2D[j][i]: + tmp_max = max_arr2D[j][i] + max_arr[j] = tmp_max if __name__ == "__main__": N: int = -1 M: int = -1 - space: str = "" parser = argparse.ArgumentParser() parser.add_argument("-N", "--rows", type=int) parser.add_argument("-M", "--columns", type=int) - parser.add_argument("-space", "--execution_space", type=str) args = parser.parse_args() - if args.rows: - N = 2**args.rows - else: - N = 2**3 - - if args.columns: - M = 2**args.columns - else: - M = 2**3 - - if args.execution_space: - space = args.execution_space - - if space == "": - space = pk.ExecutionSpace.OpenMP - else: - space = pk.ExecutionSpace(space) + N = 2**args.rows if args.rows else 2**3 + M = 2**args.columns if args.columns else 2**3 print(f"Total size: {N*M}, N={N}, M={M}") - pk.set_default_space(space) - pk.execute(pk.get_default_space(), Workload(N, M)) + main(N, M) diff --git a/examples/pykokkos/binsort.py b/examples/pykokkos/binsort.py index d6d0f810..0876f41b 100644 --- a/examples/pykokkos/binsort.py +++ b/examples/pykokkos/binsort.py @@ -1,60 +1,64 @@ -from typing import List - import pykokkos as pk +if pk.get_default_space() in pk.DeviceExecutionSpace: + import cupy as np +else: + import numpy as np -class MyView(pk.View): - def __init__(self, x: int, data_type: pk.DataTypeClass = pk.int32): - super().__init__([x], data_type) - - -@pk.workload -class Workload: - def __init__(self, total_threads: int): - self.total_threads: int = total_threads - self.view: pk.View1D[pk.int32] = MyView(total_threads, data_type=pk.int32) - - self.x_0: int = 4 - self.permute_vector: pk.View1D[pk.int32] = pk.View([total_threads], pk.int32) - self.bin_offsets: pk.View1D[pk.int32] = pk.View([6], pk.int32) - self.bin_count: pk.View1D[pk.int32] = pk.View([6], pk.int32) - @pk.main - def run(self) -> None: - x: List[int] = [self.x_0, 2, 3] - pk.parallel_for(self.total_threads, self.work) - bin_op = pk.BinOp1D( - self.view, - (self.total_threads // 2), - self.total_threads, - self.total_threads * 2 - 1, - ) - bin_sort = pk.BinSort(self.view, bin_op) - bin_sort.create_permute_vector() - self.permute_vector = bin_sort.get_permute_vector() - self.bin_offsets = bin_sort.get_bin_offsets() - self.bin_count = bin_sort.get_bin_count() +def main(): + total_threads: int = 10 - bin_sort.sort(self.view) + view = np.zeros(total_threads, dtype=np.int32) - @pk.workunit - def work(self, i: int) -> None: - self.view[i] = 2 * i + self.total_threads - i + pk.parallel_for(total_threads, work, total_threads=total_threads, view=view) + max_bins = total_threads // 2 + min_key = total_threads + max_key = total_threads * 2 - 1 + bin_op = pk.BinOp1D( + view, + max_bins, + min_key, + max_key, + ) + bin_sort = pk.BinSort(view, bin_op) + bin_sort.create_permute_vector() + permute_vector = bin_sort.get_permute_vector() + bin_offsets = bin_sort.get_bin_offsets() + bin_count = bin_sort.get_bin_count() + bin_sort.sort(view) - @pk.callback - def results(self) -> None: - for i in range(self.total_threads): - print(f"{self.view[i]} ") + print( + "PyKokkos BinSort demo: fill a 1D key view on the device, bucket keys into " + f"{max_bins} bins over [{min_key}, {max_key}], then sort.\n" + f" Initial keys: view[i] = i + {total_threads} (see work unit).\n" + ) + print( + "Sorted keys (same 1D view, after bin_sort.sort(view) — in-place reorder):\n", + view, + "\n", + ) + print( + "Permute vector from create_permute_vector / get_permute_vector — " + "indices describing how elements were reordered:\n", + permute_vector, + "\n", + ) + print( + "Bin offsets (get_bin_offsets) — start index of each bin in the sorted layout:\n", + bin_offsets, + "\n", + ) + print( + "Bin counts (get_bin_count) — number of keys in each bin:\n", + bin_count, + ) -def run() -> None: - workload = Workload(10) - pk.execute(pk.ExecutionSpace.Default, workload) - print(workload.view) - print(workload.permute_vector) - print(workload.bin_offsets) - print(workload.bin_count) +@pk.workunit +def work(i: int, total_threads: int, view: pk.View1D[pk.int32]): + view[i] = 2 * i + total_threads - i if __name__ == "__main__": - run() + main() diff --git a/examples/pykokkos/classtypes.py b/examples/pykokkos/classtypes.py index aa2a56e2..4b4a8978 100644 --- a/examples/pykokkos/classtypes.py +++ b/examples/pykokkos/classtypes.py @@ -10,28 +10,18 @@ def test(self) -> float: return self.x * 2 -@pk.workload -class Workload: - def __init__(self, total_threads: int): - self.total_threads: int = total_threads +@pk.workunit +def work(tid: int, acc: pk.Acc[pk.double]) -> None: + tc: TestClass = TestClass(float(tid)) + acc += tc.test() - @pk.main - def run(self) -> None: - pk.parallel_for(self.total_threads, self.work) - @pk.workunit - def work(self, tid: int) -> None: - pk.printf("%d\n", tid) - - @pk.function - def fun(self, f: TestClass) -> None: - f.x = 3 - x: float = f.x + 5 - - @pk.function - def test(self) -> TestClass: - return TestClass(3.5) +def main(): + total_threads: int = 10 + result: float = pk.parallel_reduce(total_threads, work) + expected: float = sum(2.0 * float(i) for i in range(total_threads)) + print(f"parallel_reduce: {result} (expected {expected})") if __name__ == "__main__": - pk.execute(pk.ExecutionSpace.Default, Workload(10)) + main() diff --git a/examples/pykokkos/subviews.py b/examples/pykokkos/subviews.py index d08120bf..ea5499e9 100644 --- a/examples/pykokkos/subviews.py +++ b/examples/pykokkos/subviews.py @@ -1,28 +1,31 @@ import pykokkos as pk +if pk.get_default_space() in pk.DeviceExecutionSpace: + import cupy as np +else: + import numpy as np -@pk.workload(subview=pk.ViewTypeInfo(trait=pk.Unmanaged)) -class Workload: - def __init__(self): - self.view: pk.View2D[pk.int32] = pk.View([10, 10], pk.int32) - self.subview: pk.View1D[pk.int32] = self.view[3, 2:5] - @pk.main - def run(self): - pk.parallel_for(10, self.work) +def main(): + n: int = 10 + # 2D view; each thread takes a row subview and a column-range subview inside the workunit. + view = np.zeros((n, n), dtype=np.int32) + pk.parallel_for(n, work, view=view) + print( + "PyKokkos subviews: each iteration builds Kokkos::subview row = view[i, :] and " + "band = view[i, 2:5], then writes through those 1D subviews (not view[i][j] alone).\n" + ) + print(view) - @pk.workunit - def work(self, i: int): - self.view[i][i] = 1 - @pk.callback - def callback(self) -> None: - print(self.view) - - -def run() -> None: - pk.execute(pk.ExecutionSpace.Default, Workload()) +@pk.workunit +def work(i: int, view: pk.View2D[pk.int32]): + # Plain `name = view[...]` becomes Kokkos::subview (annotated assign does not). + row = view[i, :] + band = view[i, 2:5] + row[i] = 1 + band[i % 3] = i + 1 if __name__ == "__main__": - run() + main() diff --git a/examples/pykokkos/team_thread_mdrange.py b/examples/pykokkos/team_thread_mdrange.py index a0b9ee92..804f436a 100644 --- a/examples/pykokkos/team_thread_mdrange.py +++ b/examples/pykokkos/team_thread_mdrange.py @@ -28,7 +28,7 @@ def run(): print(N0) print(N1 * N2) - policy = pk.TeamPolicy(N0, N1 * N2) + policy = pk.TeamPolicy(N0, pk.AUTO) pk.parallel_for(policy, kernel, A=A, B=B, C=C, N1=N1, N2=N2) print(A) diff --git a/pykokkos/core/translators/members.py b/pykokkos/core/translators/members.py index 3d8e8e64..960c8558 100644 --- a/pykokkos/core/translators/members.py +++ b/pykokkos/core/translators/members.py @@ -82,7 +82,7 @@ def extract(self, entity: PyKokkosEntity, classtypes: List[PyKokkosEntity]) -> N param_begin = i + 1 # handle last_pass param for parallel_scan if ( - i + 1 <= len(args) + i + 1 < len(args) and isinstance(args[i + 1].annotation, ast.Name) and args[i + 1].annotation.id == "bool" ): diff --git a/pykokkos/interface/__init__.py b/pykokkos/interface/__init__.py index a14c2ffa..970d7e41 100644 --- a/pykokkos/interface/__init__.py +++ b/pykokkos/interface/__init__.py @@ -59,6 +59,9 @@ ) from .execution_space import ( ExecutionSpace, + DeviceExecutionSpace, + HostParallelExecutionSpace, + HostSerialExecutionSpace, ExecutionSpaceInstance, is_host_execution_space, )