-
Notifications
You must be signed in to change notification settings - Fork 568
Coyote accelerator backend #1347
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 9 commits
72aa3ea
896665b
4be45d1
ecde593
be28016
2c600a6
3055a0b
9554271
e0714b7
d58bf73
789e093
9a6b0fe
d35820f
d4a6a2f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,150 @@ | ||
| import os | ||
| import subprocess | ||
| from hls4ml.model.flow import get_flow, register_flow | ||
| from hls4ml.backends import VitisBackend, VivadoBackend | ||
|
|
||
| class CoyoteAcceleratorBackend(VitisBackend): | ||
| """ | ||
| The CoyoteAccelerator backend, which deploys hls4ml models on a PCIe-attached Alveo FPGA | ||
| Underneath it uses the Coyote shell: https://github.com/fpgasystems/Coyote, | ||
| which offers high-performance data movement, networking capabilities, multi-tenancy, | ||
| partial reconfiguration etc. This backend has some similarities with the VitisAccelerator | ||
| backend, but the underlying platforms are different. The implementation of this backend | ||
| remains mostly simple, inheriting most of the functionality from the Vitis backend and | ||
| providing the necessary infrastructure to run model inference on Alveo boards. | ||
|
|
||
| Currently, this backend supports batched inference of a single model on hardware. | ||
| In the future, it can easily be extended with the following capabilities, leveraging | ||
| Coyote's features: | ||
| - Distributed inference | ||
| - Multiple parallel instances of hls4ml models (same or distinct models) | ||
| - Dynamic, run-time reconfiguration of models | ||
|
|
||
| Generic examples of Coyote can be found at the above-mentioned repository, under examples/ | ||
| """ | ||
|
|
||
def __init__(self):
    """Create the backend under the name 'CoyoteAccelerator' and register its layer attributes and flows."""
    backend_name = 'CoyoteAccelerator'
    # super() is anchored at VivadoBackend, so the __init__ lookup starts *after* VivadoBackend in the MRO
    super(VivadoBackend, self).__init__(name=backend_name)
    self._register_layer_attributes()
    self._register_flows()
|
|
||
def _register_flows(self):
    """
    Register the flows of this backend: the 'write' flow, which stamps the project and
    emits the HLS sources, and the 'ip' flow, which is stored as the default flow.
    """
    self._writer_flow = register_flow(
        'write',
        ['make_stamp', 'coyoteaccelerator:write_hls'],
        requires=['vitis:ip'],
        backend=self.name,
    )

    # The 'ip' flow has no passes of its own; it takes a copy of the requirements of 'vitis:ip'
    vitis_ip_requirements = get_flow('vitis:ip').requires.copy()
    self._default_flow = register_flow('ip', None, requires=vitis_ip_requirements, backend=self.name)
|
|
||
def compile(self, model):
    """
    Compiles the hls4ml model for software emulation by running ./build_lib.sh
    inside the output directory of the model

    Args:
        model (ModelGraph): hls4ml model to compile

    Return:
        lib_name (str): The name of the compiled library

    Raises:
        Exception: If build_lib.sh exits with a non-zero code; its combined
            stdout/stderr is printed before raising
    """
    output_dir = model.config.get_output_dir()
    result = subprocess.run(
        ['./build_lib.sh'],
        shell=True,
        text=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        cwd=output_dir,
    )

    if result.returncode != 0:
        print(result.stdout)
        raise Exception(f'Failed to compile project "{model.config.get_project_name()}"')

    # The library file name carries the project name and the project stamp
    project_name = model.config.get_project_name()
    stamp = model.config.get_config_value('Stamp')
    return f'{output_dir}/build/{project_name}-{stamp}.so'
|
|
||
def build(
    self,
    model,
    device: str = 'u55c',
    reset: bool = False,
    csim: bool = True,
    synth: bool = True,
    cosim: bool = False,
    validation: bool = False,
    csynth: bool = False,
    bitfile: bool = False,
    timing_opt: bool = False,
    hls_clock_period: float = 4,
    hls_clock_uncertainty: float = 27,
):
    """
    Synthesizes the hls4ml model bitstream as part of the Coyote shell
    and compiles the host-side software to control the FPGA and run model inference

    Args:
        model (ModelGraph): hls4ml model to synthesize
        device (str, optional): Target Alveo FPGA card; currently supported u55c, u280 and u250
        reset (bool, optional): Reset HLS project, if a previous one is found
        csim (bool, optional): Run C-Simulation of the HLS project
        synth (bool, optional): Run HLS synthesis
        cosim (bool, optional): Run HLS co-simulation
        validation (bool, optional): Validate results between C-Sim and Co-Sim
        csynth (bool, optional): Run Coyote synthesis using Vivado, which will synthesize the model in a vFPGA
        bitfile (bool, optional): Generate Coyote bitstream; takes precedence over csynth
        timing_opt (bool, optional): Run additional optimizations when running PnR during bitstream generation
        hls_clock_period (float, optional): Clock period to be used for HLS synthesis
        hls_clock_uncertainty (float, optional): Clock uncertainty (in percent) to be used for HLS synthesis

    Raises:
        RuntimeError: If any of the cmake / make steps exits with a non-zero code

    NOTE: Currently, the hardware will synthesize with a default clock period of 4ns / 250 MHz frequency,
    since this is the default frequency of Coyote (since the XDMA core defaults to 250 MHz). Coyote allows
    one to specify a different clock period for the model and use a clock-domain crossing (CDC) between the
    XDMA region and the model. This option is currently not exposed as part of the hls4ml backend, but advanced
    users can easily set it in the CMake configuration of Coyote.

    NOTE: While the hardware will synthesize at 250 MHz, users can optionally pass a different HLS clock period.
    This is primarily a work-around when HLS synthesizes a kernel that doesn't meet timing during PnR.
    The "trick" is to run HLS synthesis at a higher clock frequency (or provide a higher uncertainty).

    TODO: Add functionality to parse synthesis reports
    """

    def run_steps(steps, cwd):
        # Run the commands in order inside cwd (no chdir of this process); stop at the first failure
        for step in steps:
            if subprocess.run(step, cwd=cwd).returncode != 0:
                command = ' '.join(step)
                raise RuntimeError(f'Command "{command}" failed in {cwd}')

    build_dir = f'{model.config.get_output_dir()}/build'
    project_name = model.config.get_project_name()

    # Synthesize hardware
    hw_dir = f'{build_dir}/{project_name}_cyt_hw'
    # makedirs also creates <output_dir>/build when it does not exist yet
    os.makedirs(hw_dir, exist_ok=True)

    configure_hw = [
        'cmake',
        '../../',
        '-DFLOW=hw',
        f'-DFDEV_NAME={device}',
        f'-DBUILD_OPT={int(timing_opt)}',
        f'-DEN_HLS_RESET={int(reset)}',
        f'-DEN_HLS_CSIM={int(csim)}',
        f'-DEN_HLS_SYNTH={int(synth)}',
        f'-DEN_HLS_COSIM={int(cosim)}',
        f'-DEN_HLS_VALIDATION={int(validation)}',
        f'-DHLS_CLOCK_PERIOD={hls_clock_period}',
        f'-DHLS_CLOCK_UNCERTAINTY={hls_clock_uncertainty}%',
    ]

    hw_steps = [configure_hw, ['make', 'project']]
    if bitfile:
        hw_steps.append(['make', 'bitgen'])
    elif csynth:
        hw_steps.append(['make', 'synth'])
    run_steps(hw_steps, hw_dir)

    # Compile host software
    sw_dir = f'{build_dir}/{project_name}_cyt_sw'
    os.makedirs(sw_dir, exist_ok=True)
    run_steps([['cmake', '../../', '-DFLOW=sw'], ['make']], sw_dir)
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,104 @@ | ||
| import os | ||
| import time | ||
| import ctypes | ||
| import logging | ||
| import numpy as np | ||
|
|
||
| class CoyoteOverlay: | ||
| """ | ||
| CoyoteOverlay class, similar to NeuralNetworkOverlay for the VivadoAccelerator backend | ||
| This class can be used to run model inference on the FPGA with the CoyoteAccelerator backend | ||
| """ | ||
def __init__(self, path: str, project_name: str = 'myproject'):
    """
    Load the CoyoteInference shared library of a built project and declare
    the C signatures of the functions used by this class

    Args:
        path (str): Path to the hls4ml folder, as specified in convert_model(...)
        project_name (str, optional): hls4ml model name, if different than myproject
    """
    self.path = path
    self.project_name = project_name

    # Set up dynamic C library
    lib = ctypes.cdll.LoadLibrary(f'{self.path}/build/{self.project_name}_cyt_sw/lib/libCoyoteInference.so')
    self.coyote_lib = lib

    # Type of the model handle that is returned by init_model_inference and passed to every other call
    handle_type = ctypes.POINTER(ctypes.c_void_p)

    lib.init_model_inference.argtypes = [ctypes.c_uint] * 3
    lib.init_model_inference.restype = handle_type

    for func in (lib.flush, lib.predict):
        func.argtypes = [handle_type]

    lib.get_inference_predictions.argtypes = [handle_type, ctypes.c_uint]
    lib.get_inference_predictions.restype = ctypes.POINTER(ctypes.c_float)

    lib.free_model_inference.argtypes = [handle_type]
|
|
||
def program_hacc_fpga(self):
    """
    Utility function for loading the Coyote-hls4ml bitstream and driver
    on the ETH Zurich Heterogeneous Accelerated Compute Cluster (HACC).
    On other clusters, users would need to manually load the bitstream and driver.
    Guidance on this is specified in the Coyote docs.

    The driver is first compiled with make in <path>/Coyote/driver; then
    program_hacc_local.sh is run from <path>/Coyote/util. The second step is
    skipped if the first one fails; failures are logged, not raised.
    """
    # Local import: subprocess is not among the top-level imports of this module
    import subprocess

    steps = (
        # Compile the Coyote driver
        (['make'], f'{self.path}/Coyote/driver'),
        # Load the bitstream and insert the driver; arguments are relative to Coyote/util
        (
            [
                'bash',
                'program_hacc_local.sh',
                f'../../build/{self.project_name}_cyt_hw/bitstreams/cyt_top.bit',
                '../driver/build/coyote_driver.ko',
            ],
            f'{self.path}/Coyote/util',
        ),
    )

    # Argument lists + cwd (instead of one shell string) keep paths with spaces or shell characters intact
    for command, cwd in steps:
        try:
            return_code = subprocess.run(command, cwd=cwd).returncode
        except OSError as err:
            logging.error('Could not run "%s" in %s: %s', ' '.join(command), cwd, err)
            return
        if return_code != 0:
            logging.error('"%s" failed in %s with exit code %d', ' '.join(command), cwd, return_code)
            return
|
|
||
def predict(self, X: np.array, y_shape: tuple, batch_size: int = 1, verbose: bool = True):
    """
    Run model inference

    Samples are processed in full batches of batch_size. Trailing samples that
    do not fill a complete batch are not run; their rows in the output are NaN.

    Args:
        X (np.array): Input data; a 1D input is treated as a single sample
        y_shape (tuple): Shape of the output; used for allocating sufficient memory for the output
        batch_size (int, optional): Inference batch size
        verbose (bool, optional): Print batch count, mean latency and mean throughput

    Returns:
        np.ndarray: Predictions, of shape (len(X), *y_shape)

    Raises:
        ValueError: If batch_size is smaller than 1
    """
    if batch_size < 1:
        raise ValueError(f'batch_size must be a positive integer, got {batch_size}')

    X = np.asarray(X)
    if X.ndim == 1:
        X = X[np.newaxis, :]
    # X.dtype is a np.dtype instance, so it must be checked with issubdtype rather than isinstance
    if not np.issubdtype(X.dtype, np.floating):
        logging.warning('CoyoteOverlay only supports (for now) floating-point inputs; casting input data to float')
    # The sample pointer type below requires C-contiguous float32 data
    X = np.ascontiguousarray(X, dtype=np.float32)

    num_samples = len(X)
    sample_shape = X.shape[1:]
    total_batches = num_samples // batch_size
    leftover = num_samples - total_batches * batch_size
    if leftover:
        logging.warning(
            'Number of samples (%d) is not a multiple of the batch size (%d); the last %d sample(s) are not processed',
            num_samples,
            batch_size,
            leftover,
        )

    # NaN-initialised, so rows of unprocessed samples are recognisable instead of uninitialised memory
    y = np.full((num_samples, *y_shape), np.nan)

    np_pointer_nd = np.ctypeslib.ndpointer(dtype=np.float32, ndim=len(sample_shape), flags='C')
    self.coyote_lib.set_inference_data.argtypes = [ctypes.POINTER(ctypes.c_void_p), np_pointer_nd, ctypes.c_uint]

    model = self.coyote_lib.init_model_inference(batch_size, int(np.prod(sample_shape)), int(np.prod(y_shape)))

    latency_sum_us = 0.0
    throughput_sum = 0.0
    timed_batches = 0
    try:
        for batch in range(total_batches):
            offset = batch * batch_size
            for j in range(batch_size):
                self.coyote_lib.set_inference_data(model, X[offset + j], j)
            self.coyote_lib.flush(model)

            # Only the predict call itself is timed
            ts = time.perf_counter_ns()
            self.coyote_lib.predict(model)
            time_taken = time.perf_counter_ns() - ts

            # A zero reading (clock resolution) would otherwise divide by zero in the throughput
            if time_taken > 0:
                latency_sum_us += time_taken / 1e3
                throughput_sum += batch_size / (time_taken * 1e-9)
                timed_batches += 1

            for j in range(batch_size):
                tmp = self.coyote_lib.get_inference_predictions(model, j)
                y[offset + j] = np.ctypeslib.as_array(tmp, shape=y_shape)
    finally:
        # Release the model handle even if inference raised
        self.coyote_lib.free_model_inference(model)

    if verbose:
        print(f'Batch size: {batch_size}; batches processed: {total_batches}')
        if timed_batches:
            print(f'Mean latency: {round(latency_sum_us / timed_batches, 3)}us (inference only)')
            print(f'Mean throughput: {round(throughput_sum / timed_batches, 1)} samples/s (inference only)')

    return y
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is several months old now, do you want to update it?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, once I open-source and merge the V80 code with upstream (hopefully next week), I can update the submodule. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,44 @@ | ||
# NOTE(review): target_link_directories(), used below, was added in CMake 3.13,
# while this file declares 3.5 as its minimum -- confirm the intended minimum version.
cmake_minimum_required(VERSION 3.5)

# Coyote is expected in the Coyote/ folder next to this file; its cmake/ folder
# is added to the module path so that the two find_package() calls can resolve
set(CYT_DIR ${CMAKE_SOURCE_DIR}/Coyote/)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CYT_DIR}/cmake)
find_package(CoyoteHW REQUIRED)
find_package(CoyoteSW REQUIRED)

# FLOW selects which of the two sections below is configured; defaults to "hw"
set(FLOW "hw" CACHE STRING "Synthesize hardware (hw) or host software (sw)")

# Hardware flow
if(FLOW STREQUAL "hw")
    project(myproject)
    # Coyote shell settings -- presumably: host streams enabled, one AXI stream, one vFPGA region (confirm in Coyote docs)
    set(EN_STRM 1)
    set(N_STRM_AXI 1)
    set(N_REGIONS 1)

    # validation_checks_hw(), load_apps() and create_hw() are not defined in this file;
    # presumably they come from the CoyoteHW package
    validation_checks_hw()
    load_apps (
        VFPGA_C0_0 "src"
    )
    create_hw()
endif()

# Software flow: builds the CoyoteInference shared library and a test executable
if(FLOW STREQUAL "sw")
    project(
        CoyoteInference
        VERSION 1.0.0
        DESCRIPTION "CoyoteInference library"
    )
    set(CYT_INCLUDE_PATH ${CYT_DIR}/sw/include)
    add_library(CoyoteInference SHARED "${CMAKE_SOURCE_DIR}/src/host_libs.cpp" "${CMAKE_SOURCE_DIR}/src/host_libs.hpp")
    target_include_directories(CoyoteInference PUBLIC ${CYT_INCLUDE_PATH})
    target_link_libraries(CoyoteInference PUBLIC Coyote)
    target_link_directories(CoyoteInference PUBLIC /usr/local/lib)

    # Executable named "test", built from src/myproject_host.cpp and linked against Coyote and CoyoteInference
    project(myproject)
    set(EXEC test)
    set(TARGET_DIR "${CMAKE_SOURCE_DIR}/src/")
    add_executable(${EXEC} ${TARGET_DIR}/myproject_host.cpp)
    target_link_libraries(${EXEC} PUBLIC Coyote)
    target_link_libraries(${EXEC} PUBLIC CoyoteInference)
    target_link_directories(${EXEC} PUBLIC /usr/local/lib)
    target_include_directories(${EXEC} PUBLIC src/hls/model_wrapper/firmware/)
    target_include_directories(${EXEC} PUBLIC src/hls/model_wrapper/firmware/ap_types)

endif()
Uh oh!
There was an error while loading. Please reload this page.