diff --git a/tools/onnx-subgraph/3rd_files_download.sh b/tools/onnx-subgraph/3rd_files_download.sh new file mode 100644 index 00000000000..3adf4cbda4e --- /dev/null +++ b/tools/onnx-subgraph/3rd_files_download.sh @@ -0,0 +1,8 @@ +mkdir -p 3rd +cd 3rd || exit 1 +git clone https://github.com/ekg/glia.git +cp -r glia/json ../include +cp glia/json-forwards.h ../include +cp glia/jsoncpp.cpp ../src/lib +cd .. +rm -rf 3rd diff --git a/tools/onnx-subgraph/CMakeLists.txt b/tools/onnx-subgraph/CMakeLists.txt new file mode 100644 index 00000000000..28bd9cbc058 --- /dev/null +++ b/tools/onnx-subgraph/CMakeLists.txt @@ -0,0 +1,64 @@ +# cmake version dependency: FindPython3 (used below) requires CMake 3.12 or newer +cmake_minimum_required(VERSION 3.12) +# Default to Debug; a cache entry without FORCE keeps any -DCMAKE_BUILD_TYPE given by the user +set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type") +set(CMAKE_CXX_FLAGS_DEBUG "$ENV{CXXFLAGS} -O0 -Wall -g2 -ggdb") +set(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -O3 -Wall") +set(CMAKE_CXX_STANDARD 17) + +project(onnx-subgraph-parser) + +find_package(Protobuf REQUIRED) +find_package(jsoncpp REQUIRED) +find_package(Python3 COMPONENTS Interpreter Development REQUIRED) + +set(PROTO_FILES onnx.proto) +protobuf_generate_cpp(PROTO_SRCS PROTO_HDRS ${PROTO_FILES}) + +include_directories(${CMAKE_CURRENT_BINARY_DIR}) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) +include_directories(${Python3_INCLUDE_DIRS}) + +file(GLOB SOURCES "src/lib/*.cpp") + +add_library(onnx-subgraph-parser STATIC ${SOURCES} ${PROTO_SRCS} ${PROTO_FILES}) +target_link_libraries(onnx-subgraph-parser PUBLIC protobuf::libprotobuf jsoncpp) + +add_executable(onnx-subgraph src/main.cpp) +target_link_libraries(onnx-subgraph PRIVATE onnx-subgraph-parser ${Python3_LIBRARIES}) +# Helper scripts and sample configs: copied to <build>/scripts at build time and installed to bin + set(ONNX_SUGRAPH_FILES + extract_onnx_lib.py + extract_onnx.py + single_vs_multiple_onnx.py + quant.py + model_inference.py + model_inference_multiple_output.py + onnx_subgraph_ut.py + test_model_download.sh + config.json + config-sample-1.json + config-sample-2.json + ) +# One copy rule plus one always-built target per file, so edits in the source tree are re-copied + foreach(ONNX_SUGRAPH IN ITEMS ${ONNX_SUGRAPH_FILES}) + set(ONNX_SUGRAPH_FILE ${ONNX_SUGRAPH}) +
set(ONNX_SUGRAPH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/${ONNX_SUGRAPH_FILE}") + set(ONNX_SUGRAPH_BIN "${CMAKE_CURRENT_BINARY_DIR}/scripts/${ONNX_SUGRAPH_FILE}") + set(ONNX_SUGRAPH_TARGET "${ONNX_SUGRAPH}_target") + + add_custom_command(OUTPUT ${ONNX_SUGRAPH_BIN} + COMMAND ${CMAKE_COMMAND} -E copy "${ONNX_SUGRAPH_SRC}" "${ONNX_SUGRAPH_BIN}" + DEPENDS ${ONNX_SUGRAPH_SRC} + COMMENT "Generate ${ONNX_SUGRAPH_BIN}" VERBATIM + ) + + add_custom_target(${ONNX_SUGRAPH_TARGET} ALL DEPENDS ${ONNX_SUGRAPH_BIN}) + + install(FILES ${ONNX_SUGRAPH_BIN} + PERMISSIONS OWNER_WRITE OWNER_READ OWNER_EXECUTE + GROUP_READ GROUP_EXECUTE + WORLD_READ WORLD_EXECUTE + DESTINATION bin) + + endforeach() diff --git a/tools/onnx-subgraph/Readme.md b/tools/onnx-subgraph/Readme.md new file mode 100644 index 00000000000..acb9236c8e2 --- /dev/null +++ b/tools/onnx-subgraph/Readme.md @@ -0,0 +1,89 @@ +# onnx_autosubgraph +The onnx-subgraph tool automatically partitions an ONNX model into several sub-models based on +operator, performance and model size limitations, and records the order and input / output names of +the sub-models + +# How to build the onnx-subgraph +## OS environment dependence + 1. ubuntu >=20.04 + 2. GCC >= 9.4.0 + 3. cmake >= 3.12 + 4. python >= 3.8 + 5. apt-get install libprotobuf-dev protobuf-compiler libjsoncpp-dev + +## Python packages dependence + onnx 1.16.0 + onnxruntime 1.18.1 + onnxsim 0.4.36 + torch 2.3.1 + scikit-image + scikit-learn + pandas + tqdm + +## building the onnx-subgraph + 1. cd onnx-subgraph + 2. bash 3rd_files_download.sh + 3. mkdir build && cd build + 4. cmake .. && make + 5. 
the following output is generated in ./build + ├── onnx-subgraph + └── scripts + ├── config.json + ├── config-sample-1.json + ├── config-sample-2.json + ├── extract_onnx_lib.py + ├── extract_onnx.py + ├── model_inference_multiple_output.py + ├── model_inference.py + ├── onnx_subgraph_ut.py + ├── quant.py + ├── single_vs_multiple_onnx.py + └── test_model_download.sh +# How to use the onnx-subgraph +## Pre-steps +### Download the test AI models + 1. run "bash scripts/test_model_download.sh"; "resnet-test.onnx" will then be available in ./build + 2. you can use any other onnx file to suit your needs, or edit the download link in + "scripts/test_model_download.sh" +### Prepare the config.json + 1. edit the config.json + . you can edit operators in "NPU_supported_ops" and "CPU_supported_ops"; + . you can edit performance data in "performance_data" to match the real HW status; + . you can edit "max_subgraph_size" when "NPU_supported_ops" is [] + 2. you can also check more examples in "config-sample-1.json" and "config-sample-2.json" + + +## Parse the onnx model + ./onnx-subgraph --onnx=resnet-test.onnx + after parsing is done, subgraphs_ios.txt will be generated in the current path + +## Split the onnx model to subgraphs + 1. edit the config path and model file path in ./scripts/extract_onnx.py + e.g.: extract_onnx_lib.split_onnx_ios('./subgraphs_ios.txt','./resnet-test.onnx') + 2. run "python scripts/extract_onnx.py"; after extraction is done, the subgraphs will be saved + at './subgraphs' + subgraphs + ├── CPU + │   ├── CPUsubgraph0.onnx + │   └── CPUsubgraph1.onnx + └── NPU + ├── NPUsubgraph0.onnx + └── NPUsubgraph1.onnx + +## Verify the subgraphs inference with original model file + 1. edit the model path, subgraph path and config path in ./scripts/single_vs_multiple_onnx.py + single_onnx_model_path = './resnet-test.onnx' + model_path = './subgraphs/' + subgraphsiostxt_path = './subgraphs_ios.txt' + 2. 
edit the input shape and name of onnx model in ./scripts/single_vs_multiple_onnx.py + default_input_data = { + "x": np.random.rand(1, 3, 256, 256).astype(np.float32), + } + 3. compare the MSE of original inference result and subgraphs inference result + python ./scripts/single_vs_multiple_onnx.py + output: + Single model inference completed! + Multiple subgraph inference completed! + Comparing inference results between single ONNX model and multiple subgraphs... + Output '316' MSE: 5.125894080395578e-14 diff --git a/tools/onnx-subgraph/config-sample-1.json b/tools/onnx-subgraph/config-sample-1.json new file mode 100644 index 00000000000..3e083ca5b64 --- /dev/null +++ b/tools/onnx-subgraph/config-sample-1.json @@ -0,0 +1,10 @@ +{ + "NPU_supported_ops": [], + "CPU_supported_ops": ["Sub", "Pow", "ReduceMean", "Add", "Sqrt", "Div","Transpose", "Gather", "MatMul", "Mul", "Softmax", "Erf", "Gemm", "Conv", "Reshape", + "Sin", "Where", "ConstantOfShape", "Cast", "Sigmoid", "Cos", "Expand", "Slice", "Unsqueeze"], + "performance_data": [], + "hardware_limits": { + "max_subgraph_size": 10240.0, + "max_subgraphs": 5 + } +} diff --git a/tools/onnx-subgraph/config-sample-2.json b/tools/onnx-subgraph/config-sample-2.json new file mode 100644 index 00000000000..02e840a723b --- /dev/null +++ b/tools/onnx-subgraph/config-sample-2.json @@ -0,0 +1,15 @@ +{ + "NPU_supported_ops": ["Conv", "Reshape", "Transpose", "Add", "ReduceMean", "Sub", "Div", "Mul", "Sigmoid","MatMul"], + "CPU_supported_ops": ["Sub", "Pow", "ReduceMean", "Add", "Sqrt", "Div","Transpose", "Gather", "MatMul", "Mul", "Softmax", "Erf", "Gemm", "Conv", "Reshape", + "Sin", "Where", "ConstantOfShape", "Cast", "Sigmoid", "Cos", "Expand", "Slice", "Unsqueeze"], + "performance_data": [ + {"name":"Conv","CPU_time": 0.1, "NPU_time": 0.05}, + {"name":"Mul", "CPU_time": 0.15, "NPU_time": 0.07}, + {"name":"Add", "CPU_time": 0.15, "NPU_time": 0.07}, + {"name":"Sub", "CPU_time": 0.15, "NPU_time": 0.07} + ], + "hardware_limits": { + 
"max_subgraph_size": 60024.0, + "max_subgraphs": 5 + } +} diff --git a/tools/onnx-subgraph/config.json b/tools/onnx-subgraph/config.json new file mode 100644 index 00000000000..6d0b7ce5ace --- /dev/null +++ b/tools/onnx-subgraph/config.json @@ -0,0 +1,13 @@ +{ + "NPU_supported_ops": ["Conv", "Reshape", "Transpose", "Add", "ReduceMean", "Sub", "Div", "Mul", "Sigmoid","MatMul"], + "CPU_supported_ops": ["Sub", "Pow", "ReduceMean", "Add", "Sqrt", "Div","Transpose", "Gather", "MatMul", "Mul", "Softmax", "Erf", "Gemm", "Conv", "Reshape", + "Sin", "Where", "ConstantOfShape", "Cast", "Sigmoid", "Cos", "Expand", "Slice", "Unsqueeze"], + "performance_data": [ + {"name":"Conv","CPU_time": 0.1, "NPU_time": 0.05}, + {"name":"Mul", "CPU_time": 0.15, "NPU_time": 0.07} + ], + "hardware_limits": { + "max_subgraph_size": 60024.0, + "max_subgraphs": 5 + } +} diff --git a/tools/onnx-subgraph/extract_onnx.py b/tools/onnx-subgraph/extract_onnx.py new file mode 100644 index 00000000000..fed080c78d1 --- /dev/null +++ b/tools/onnx-subgraph/extract_onnx.py @@ -0,0 +1,21 @@ +# Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import extract_onnx_lib +import torch +import onnx +import re + +print("python executed") +extract_onnx_lib.split_onnx_ios('./subgraphs_ios.txt', './resnet-test.onnx') diff --git a/tools/onnx-subgraph/extract_onnx_lib.py b/tools/onnx-subgraph/extract_onnx_lib.py new file mode 100644 index 00000000000..17df7ecada1 --- /dev/null +++ b/tools/onnx-subgraph/extract_onnx_lib.py @@ -0,0 +1,218 @@ +# Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import torch +import onnx +import re +import os + + +def splitinstruction(instr): + iolist = re.split('--input-name \"|\" --output-name \"|\" --input-shape \"', instr) + del iolist[0] + del iolist[-1] + in_ = iolist[0].split(';') + out_ = iolist[1].split(';') + return in_, out_ + + +def splitsubgraph_ios(iofile): + iolist = re.split('--input-name |;--output-name ', iofile) + in_ = iolist[1].split(';') + out_ = iolist[2].split(';') + del out_[-1] + type = iolist[0].split('subgraph')[0] + return in_, out_, type + + +def split_onnx(instrfile, type): + print("module found") + f1 = open(instrfile, "r") + lines = f1.readlines() + count = 0 + for line in lines: + input_names, output_names = splitinstruction(line) + input_path = 'net/diffusion_model_fp32_with_shape.onnx' + output_path = 'diffusion_model_fp32_subgraphs_' + type + '/' + type + 'subgraph' + str( + count) + '.onnx' + count = count + 1 + if ((input_names != ['']) and (output_names != [''])): + onnx.utils.extract_model(input_path, output_path, input_names, output_names) + f1.close() + + +def split_onnx_ios(instrfile, + input_path='net/generation_model_simplify.onnx', + out_folder='subgraphs/'): + if not os.path.exists(input_path): + print(input_path + " not exist") + return + + model = onnx.load(input_path) + onnx.checker.check_model(input_path) + for output in model.graph.output: + model.graph.value_info.append(output) + onnx.save(model, input_path) + f1 = open(instrfile, "r") + lines = f1.readlines() + cpu_count = 0 + npu_count = 0 + count = 0 + if not os.path.exists(out_folder): + os.makedirs(out_folder) + for line in lines: + input_names, output_names, type = splitsubgraph_ios(line) + if (type == 'CPU'): + count = cpu_count + cpu_count = cpu_count + 1 + else: + count = npu_count + npu_count = npu_count + 1 + output_path_folder = out_folder + if not os.path.exists(output_path_folder): + os.makedirs(output_path_folder) + output_path = output_path_folder + type + 'subgraph' + str(count) + '.onnx' + if 
((input_names != ['']) and (output_names != [''])): + onnx.utils.extract_model(input_path, output_path, input_names, output_names) + print("succeed", count) + count = count + 1 + f1.close() + + +def rename_node_io(file_path): + model = onnx.load(file_path) + graph = model.graph + for inputs in graph.input: + inputs.name = re.sub(r'[/.]', '', inputs.name) + for outputs in graph.output: + outputs.name = re.sub(r'[/.]', '', outputs.name) + for value_infos in graph.value_info: + value_infos.name = re.sub(r'[/.]', '', value_infos.name) + for initializers in graph.initializer: + initializers.name = re.sub(r'[/.]', '', initializers.name) + for node in graph.node: + node.name = re.sub(r'[/.]', '', node.name) + for i in range(len(node.input)): + node.input[i] = re.sub(r'[/.]', '', node.input[i]) + for i in range(len(node.output)): + node.output[i] = re.sub(r'[/.]', '', node.output[i]) + return model + + +def rename_subgraph_node_ios(in_file_path, out_file_path): + file_names = os.listdir(in_file_path) + for filename in file_names: + filename_ = in_file_path + '/' + filename + model = rename_node_io(filename_) + output_file_path = out_file_path + '/' + filename + onnx.save(model, output_file_path) + print(f'Modified model saved to {output_file_path}') + + +def print_model(file_path): + model = onnx.load(file_path) + graph = model.graph + size = 0 + for node in graph.node: + size = size + 1 + print(size) + + +def sort(ifile_path, ofile_path): + finished_flag = 0 + sort_count = 0 + f1 = open(ifile_path, "r") + lines = f1.readlines() + graphs_inputs = {} + graphs_outputs = {} + order_Subgraphs = {} + issort_Subgraphs = {} + TYPE = {} + index = 0 + for line in lines: + input_names, output_names, type = splitsubgraph_ios(line) + graphs_inputs[index] = input_names + graphs_outputs[index] = output_names + TYPE[index] = type + index = index + 1 + graph_num = index + f1.close() + while finished_flag == 0: + finished_flag = 1 + if (sort_count) == 0: + for i in range(graph_num): + 
find_flag = 0 + for g_input in graphs_inputs[i]: + for j in range(graph_num): + if g_input in graphs_outputs[j]: + find_flag = 1 + break + if find_flag == 1: + break + if find_flag == 0: + order_Subgraphs[i] = 0 + issort_Subgraphs[i] = 1 + else: + order_Subgraphs[i] = 1 + issort_Subgraphs[i] = 0 + finished_flag = 0 + else: + for i in range(graph_num): + find_flag = 0 + if issort_Subgraphs[i] == 1: + continue + for g_input in graphs_inputs[i]: + for j in range(graph_num): + if g_input in graphs_outputs[j]: + if issort_Subgraphs[j] == 0: + find_flag = 1 + break + if find_flag == 1: + break + if find_flag == 0: + order_Subgraphs[i] = sort_count + issort_Subgraphs[i] = 1 + else: + order_Subgraphs[i] = sort_count + 1 + issort_Subgraphs[i] = 0 + finished_flag = 0 + if i == graph_num - 1: + for j in range(graph_num): + if order_Subgraphs[j] == sort_count: + issort_Subgraphs[j] = 1 + print(order_Subgraphs) + print(issort_Subgraphs) + sort_count = sort_count + 1 + f2 = open(ofile_path, "w") + count_cpu = 0 + count_npu = 0 + for i in range(graph_num): + content = "" + if TYPE[i] == 'CPU': + content = "CPUsubgraph" + str(count_cpu) + ": order" + str( + order_Subgraphs[i]) + "--input-name " + count_cpu = count_cpu + 1 + if TYPE[i] == 'NPU': + content = "NPUsubgraph" + str(count_npu) + ": order" + str( + order_Subgraphs[i]) + "--input-name " + count_npu = count_npu + 1 + for graph_input in graphs_inputs[i]: + content = content + graph_input + ";" + content = content + "--output-name " + for graph_output in graphs_outputs[i]: + content = content + graph_output + ";" + content = content + "\n" + print(content) + f2.write(content) + f2.close() diff --git a/tools/onnx-subgraph/include/device.h b/tools/onnx-subgraph/include/device.h new file mode 100644 index 00000000000..72c73a07059 --- /dev/null +++ b/tools/onnx-subgraph/include/device.h @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2025 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef DEVICE_H +#define DEVICE_H + +#include +#include +#include +#include +#include "onnx.pb.h" +#include "graph.h" +#include + +enum class DeviceType +{ + Target_NPU +}; + +class Device +{ +private: + std::string onnxFile; + +public: + Device(/* args */) + { + NPUPreferOp = {}; + CPUSupportOp = {}; + NPUSupportOp = {}; + max_subgraph_size = 0; + } + + ~Device() {} + + std::vector NPUPreferOp; + std::vector CPUSupportOp; + std::vector NPUSupportOp; + + float max_subgraph_size; + + DeviceType getType() { return DeviceType::Target_NPU; } + + std::vector> getCPUStructure() + { + return {{"Concat"}, + {"Sub", "Pow", "ReduceMean", "Add", "Sqrt", "Div"}, + {"Transpose", "Gather", "Gather", "Gather", "Transpose", "MatMul", "Mul", "Softmax", + "MatMul"}}; + } + + std::vector> getNPUStructure() + { + return {{"Reshape", "Transpose", "Reshape"}, + {"Reshape", "Sigmoid", "Mul", "Transpose", "Conv", "Add", "Transpose"}, + {"Reshape", "Transpose", "Conv", "Transpose", "Reshape"}, + {"Reshape", "Conv", "Transpose"}, + {"Reshape", "Add", "Add", "Reshape", "Transpose", "Conv", "Add"}, + {"Conv"}}; + } + + std::vector getNPUSupportOp() { return NPUSupportOp; } + std::vector getCPUSupportOp() { return CPUSupportOp; } + std::vector getNPUPreferOp() { return NPUPreferOp; } + + /** + * @brief Generate cut instructions for subgraphs based on the given device type. 
+ * + * @param [in] Subgraphs A reference to a vector of ONNX GraphProto objects representing + * subgraphs. + * @param [in] device A string indicating the device type (e.g., "npu" or "c920"). + * @param [in] subgraphs_inputs A reference to a vector of unordered sets containing input + * information for subgraphs. + * @param [in] subgraphs_outputs A reference to a vector of unordered sets containing output + * information for subgraphs. + * + * @pre The function assumes that the `Subgraphs`, `subgraphs_inputs`, and + * `subgraphs_outputs` vectors are properly initialized and have the same size. + * @post A file named ` CutInstruction.txt` is created or overwritten with the + * generated cut instructions. + * @exception If the output file cannot be opened, an error message is printed, and the program + * exits. + * + * @return None + */ + void GenerateCutInstruction(std::vector &Subgraphs, std::string device, + std::vector> &subgraphs_inputs, + std::vector> &subgraphs_outputs); + + /** + * @brief Reads and parses a JSON file containing device information. + * + * This function reads a JSON file from the specified path, parses it, and extracts relevant + * device information. It updates global variables with hardware limits, preferred NPU operations, + * and supported operations for both NPU and CPU. + * + * @param json_path The file path to the JSON file containing device information. 
+ */ + void GetDeviceJson(std::string json_path) + { + Json::Reader reader; + Json::Value root; + + // Open the JSON file in binary mode + std::ifstream in(json_path, std::ios::binary); + if (!in.is_open()) + { + std::cout << "Error opening file\n"; + return; + } + + if (reader.parse(in, root)) + { + // Extract and set the maximum subgraph size from hardware limits + float max_subgraph_size_json = root["hardware_limits"]["max_subgraph_size"].asFloat(); + max_subgraph_size = max_subgraph_size_json; + // Iterate through performance data to identify operations where NPU outperforms CPU + + for (unsigned int i = 0; i < root["performance_data"].size(); i++) + { + if (root["performance_data"][i]["CPU_time"].asFloat() > + root["performance_data"][i]["NPU_time"].asFloat()) + { + NPUPreferOp.push_back(root["performance_data"][i]["name"].asString()); + } + } + + // Iterate through and store supported NPU operations + for (int i = 0; i < int(root["NPU_supported_ops"].size()); i++) + { + if (std::find(NPUSupportOp.begin(), NPUSupportOp.end(), + root["NPU_supported_ops"][i].asString()) == NPUSupportOp.end()) + { + NPUSupportOp.push_back(root["NPU_supported_ops"][i].asString()); + } + } + + // Iterate through and store supported CPU operations + for (int i = 0; i < int(root["CPU_supported_ops"].size()); i++) + { + if (std::find(CPUSupportOp.begin(), CPUSupportOp.end(), + root["CPU_supported_ops"][i].asString()) == CPUSupportOp.end()) + { + CPUSupportOp.push_back(root["CPU_supported_ops"][i].asString()); + } + } + } + + in.close(); + } + + void updateOnnxFile(std::string &path) { onnxFile = path; } + + std::string getOnnxFile() { return onnxFile; } +}; + +#endif diff --git a/tools/onnx-subgraph/include/graph.h b/tools/onnx-subgraph/include/graph.h new file mode 100644 index 00000000000..33bd6e02038 --- /dev/null +++ b/tools/onnx-subgraph/include/graph.h @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2025 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef GRAPH_H +#define GRAPH_H + +#include "onnx.pb.h" +#include +#include +#include +#include +// save the size of each node's inputs and outputs +struct NodeIOSize +{ + std::vector> inputSizes; + std::vector> outputSizes; +}; + +struct NodeTensor +{ + std::string name; + std::vector shape; + + // Default constructor + NodeTensor() = default; + + // Constructor with parameters + NodeTensor(const std::string &n, const std::vector &s) : name(n), shape(s) {} + + // Equality comparison operator + bool operator==(const NodeTensor &other) const + { + return name == other.name && shape == other.shape; + } +}; + +namespace std +{ +template <> struct hash +{ + size_t operator()(const NodeTensor &tensor) const + { + size_t hashValue = hash()(tensor.name); + for (auto &val : tensor.shape) + { + hashValue ^= hash()(val) + 0x9e3779b9 + (hashValue << 6) + (hashValue >> 2); + } + return hashValue; + } +}; +} // namespace std +/** + * @brief Extracts the names and shapes of initializers from the ONNX graph. + * + * @param [in] graph The ONNX graph from which to extract initializers. + * @pre The ONNX graph should be valid and contain initializers. + * @post The names and shapes of the initializers are stored in an unordered set of NodeTensor + * objects. + * @exception None + * @return An unordered set of NodeTensor objects containing the names and shapes of the + * initializers. 
+ */ +std::unordered_set getInitializer(const onnx::GraphProto &graph); +/** + * @brief Extracts the names and shapes of inputs, outputs, and value_info from the ONNX graph. + * + * @param [in] graph The ONNX graph from which to extract inputs, outputs, and value_info. + * @pre The ONNX graph should be valid and contain inputs, outputs, and value_info. + * @post The names and shapes of the inputs, outputs, and value_info are stored in an unordered + * set of NodeTensor objects. + * @exception None + * @return An unordered set of NodeTensor objects containing the names and shapes of the inputs, + * outputs, and value_info. + */ +std::unordered_set getIOvalue(const onnx::GraphProto &graph); +/** + * @brief Determines the input tensors of the graph that are not produced by any node in the + * graph. + * + * @param [in] g The ONNX GraphProto object representing the graph. + * @param [in] initializerNames A set of NodeTensor objects representing the initializers in the + * graph. + * @param [out] graphInputs A set of NodeTensor objects representing the input tensors of the + * graph. + * @pre The GraphProto object g should be valid and contain nodes with proper input and output + * lists. + * @post The graphInputs set will be populated with NodeTensor objects that are inputs to the + * graph. + * @exception None + * @return None + */ +void determineGraphInput(const onnx::GraphProto &g, + const std::unordered_set &initializerNames, + std::unordered_set &graphInputs); +/** + * @brief Determines the output tensors of the graph that are either outputs of the original + * graph or are used as inputs in other parts of the graph. + * + * @param [in] originalGraph The original ONNX GraphProto object representing the graph. + * @param [in] g The ONNX GraphProto object representing the graph to analyze. + * @param [in] allgraphInputs_1 A vector of sets of NodeTensor objects representing the first + * set of inputs to the graph. 
+ * @param [in] allgraphInputs_2 A vector of sets of NodeTensor objects representing the second + * set of inputs to the graph. + * @param [out] graphOutputs A set of NodeTensor objects representing the output tensors of the + * graph. + * @pre The GraphProto objects originalGraph and g should be valid and contain nodes with + * proper input and output lists. + * @post The graphOutputs set will be populated with NodeTensor objects that are outputs of the + * graph. + * @exception None + * @return None + */ +void determineGraphOutput(const onnx::GraphProto &originalGraph, const onnx::GraphProto &g, + std::vector> &allgraphInputs_1, + std::vector> &allgraphInputs_2, + std::unordered_set &graphOutputs); +/** + * @brief Finds the name of the node that produces a specified output tensor in the given ONNX + * graph. + * + * @param [in] g The ONNX GraphProto object representing the graph. + * @param [in] outputTensorName The name of the output tensor to find the producing node for. + * @pre The GraphProto object g should be valid and contain nodes with proper input and output + * lists. + * @post None + * @exception None + * @return The name of the node that produces the specified output tensor, or an empty string if + * no such node is found. + */ +std::string findInputNode(const onnx::GraphProto &g, const std::string &outputTensorName); +/** + * @brief Collects the names of all nodes in the given ONNX graph. + * + * @param [in] graph The ONNX GraphProto object representing the graph. + * @pre The GraphProto object graph should be valid and contain nodes with proper names. + * @post None + * @exception None + * @return An unordered set containing the names of all nodes in the graph. + */ +std::unordered_set collectNodeNames(const onnx::GraphProto &graph); +/** + * @brief Merges nodes from the source graph into the target graph. + * + * @param [in,out] targetGraph The ONNX GraphProto object to which nodes will be added. 
+ * @param [in] sourceGraph The ONNX GraphProto object from which nodes will be copied. + * @pre Both GraphProto objects should be valid. + * @post Nodes from sourceGraph are added to targetGraph. + * @exception Exits the program with an error message if the number of nodes in targetGraph does not + * match the expected size after merging. + * @return None + */ +void mergeGraphs(onnx::GraphProto &targetGraph, onnx::GraphProto &sourceGraph); + +class Graph +{ +private: + /* data */ +public: + Graph() {} + ~Graph() {} + /** + * @brief Loads an ONNX model from a file and returns the graph contained within. + * + * @param [in] path The file path to the ONNX model. + * @pre The file specified by path should exist and be a valid ONNX model. + * @post The ONNX model is parsed and its graph is returned. + * @exception Exits the program with an error message if the file cannot be opened. + * @return The ONNX GraphProto object representing the graph from the model. + */ + onnx::GraphProto GetGraphFromOnnx(std::string &path); +}; +struct graph_adjacency_node +{ + std::vector output_node_index; + int rank; + std::string name; + int index; +}; +#endif diff --git a/tools/onnx-subgraph/include/partition.h b/tools/onnx-subgraph/include/partition.h new file mode 100644 index 00000000000..48ac51e9328 --- /dev/null +++ b/tools/onnx-subgraph/include/partition.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef PARTITION_H +#define PARTITION_H + +#include "onnx.pb.h" +#include +#include +#include +#include +#include +#include "device.h" +#include "graph.h" + +// deprecated +enum PartitionStrategy +{ + SPILTE_CPU_STRUCTURE_FIRST, + SPILTE_NPU_STRUCTURE_FIRST, + AUTOMATIC_SEARCH +}; + +class Partition +{ +private: + /* data */ +public: + Partition() {} + ~Partition() {} + /** + * @brief Partition the ONNX graph into subgraphs and produce cutting instructions. + * + * @param [in] g The ONNX graph to be partitioned. + * @param [in] d The device information for partitioning. + * @param [in] strategy The partition strategy to be used (deprecated). + * @param [in] node_io_size The input/output size information for each node. + * @pre The ONNX graph should be valid and the device information should be properly set. + * @post The graph is partitioned into subgraphs, and the results are stored in Subgraphs and + * otherSubgraphs. + * @exception None + * @return None + */ + void PartitionGraph(const onnx::GraphProto &g, Device &d, PartitionStrategy strategy, + const std::unordered_map &node_io_size); +}; +#endif diff --git a/tools/onnx-subgraph/model_inference.py b/tools/onnx-subgraph/model_inference.py new file mode 100644 index 00000000000..7e41f114ef4 --- /dev/null +++ b/tools/onnx-subgraph/model_inference.py @@ -0,0 +1,352 @@ +# Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
class ModelInference:
    """
    Run a model that has been split into several ONNX sub-models, one after another.

    Parameters:
        model_path: Directory that contains the sub-model ``.onnx`` files.
        subgraphsiostxt_path: Path to the txt file describing the sub-graphs. Every
            ``<type>subgraph<N>: order<M>`` entry found in it names one sub-model file
            (``<type>subgraph<N>.onnx``) and its execution order ``M``.
    Attributes:
        sessions: One onnxruntime session per sub-model, in execution order.
        sorted_file_paths: Sub-model file paths, in the same order as ``sessions``.
    """
    def __init__(self, model_path, subgraphsiostxt_path):
        self.model_path = model_path
        self.subgraphsiostxt_path = subgraphsiostxt_path
        self.sessions, self.sorted_file_paths = self.load_sessions()

    def load_sessions(self):
        """
        Parse the sub-graph description file and open one session per sub-model.

        Sub-models are ordered by their ``order`` number; entries sharing the same order
        keep the sequence in which they appear in the description file.

        Returns:
            (sessions, sorted_file_paths): two parallel lists in execution order.
        """
        with open(self.subgraphsiostxt_path, 'r') as file:
            content = file.read()

        subgraph_order_map = {}
        for subgraph_type, subgraph_number, order in re.findall(
                r'(\w+)subgraph(\d+): order(\d+)', content):
            file_path = os.path.join(self.model_path,
                                     f"{subgraph_type}subgraph{subgraph_number}.onnx")
            subgraph_order_map.setdefault(int(order), []).append(file_path)

        sorted_file_paths = []
        for order in sorted(subgraph_order_map):
            sorted_file_paths.extend(subgraph_order_map[order])

        sessions = [ort.InferenceSession(model) for model in sorted_file_paths]
        return sessions, sorted_file_paths

    def inference(self, initial_input_data):
        """
        Feed ``initial_input_data`` through every sub-model in order.

        Parameters:
            initial_input_data: Mapping from tensor name to value. It must provide every
                input that is not produced by an earlier sub-model. The mapping itself is
                not modified.
        Returns:
            The first output of the last sub-model.
        Raises:
            RuntimeError: If no sub-model session was loaded.
            KeyError: If an input required by a sub-model is not available.
        """
        if not self.sessions:
            raise RuntimeError("No sub-model sessions are loaded; nothing to infer.")

        # Work on a shallow copy so intermediate tensors do not leak into the caller's dict.
        input_data = dict(initial_input_data)
        last_index = len(self.sessions) - 1

        for i, session in enumerate(self.sessions):
            input_names = [inp.name for inp in session.get_inputs()]
            model_input_data = {name: input_data[name] for name in input_names}
            outputs = session.run(None, model_input_data)

            # Outputs of the last sub-model are the final result; nothing consumes them.
            if i < last_index:
                output_names = [out.name for out in session.get_outputs()]
                input_data.update(zip(output_names, outputs))
        return outputs[0]

    @staticmethod
    def infer_single_onnx_model(model_file, input_data):
        """
        Run a single (unsplit) ONNX model and return its first output.

        Declared as a static method so that it works both as
        ``ModelInference.infer_single_onnx_model(...)`` and when called on an instance.
        """
        session = ort.InferenceSession(model_file)
        outputs = session.run(None, input_data)
        return outputs[0]
class PcaInference:
    """
    Run split ONNX sub-models in order, quantizing the outputs of convolution nodes.

    Parameters:
        model_path: Directory that contains the sub-model ``.onnx`` files.
        subgraphsiostxt_path: Path to the txt file describing the sub-graphs
            (``<type>subgraph<N>: order<M>`` entries).
        endwithconv_path: Path to a txt file listing the numbers of the NPU sub-graphs
            that end with a convolution.
        output_dir: Root directory handed to ``quant_conv_forward_save_output`` for saving
            results.
    Attributes:
        sessions: One onnxruntime session per sub-model, in execution order.
        conv_output_layer_map: Per model file, a map from Conv output name to the
            1-based index of the producing node.
        sorted_file_paths: Sub-model file paths, in the same order as ``sessions``.
    """
    def __init__(self, model_path, subgraphsiostxt_path, endwithconv_path, output_dir):
        self.model_path = model_path
        self.subgraphsiostxt_path = subgraphsiostxt_path
        self.endwithconv_path = endwithconv_path
        self.output_dir = output_dir
        (
            self.sessions,
            self.conv_output_layer_map,
            self.sorted_file_paths,
        ) = self.load_sessions()

    def load_sessions(self):
        """
        Open one session per sub-model and record the Conv outputs of the sub-models
        listed in the end-with-conv file.

        Returns:
            (sessions, conv_output_layer_map, sorted_file_paths)
        """
        with open(self.subgraphsiostxt_path, 'r') as file:
            content = file.read()

        subgraph_order_map = {}
        for subgraph_type, subgraph_number, order in re.findall(
                r'(\w+)subgraph(\d+): order(\d+)', content):
            file_path = os.path.join(self.model_path,
                                     f"{subgraph_type}subgraph{subgraph_number}.onnx")
            subgraph_order_map.setdefault(int(order), []).append(file_path)

        sorted_file_paths = []
        for order in sorted(subgraph_order_map):
            sorted_file_paths.extend(subgraph_order_map[order])

        # Read the end-with-conv list once instead of once per model file.
        end_conv_paths = set(self.load_onnx_dict())

        sessions = []
        conv_output_layer_map = {}
        for model_file in sorted_file_paths:
            sessions.append(ort.InferenceSession(model_file))

            conv_outputs = {}
            if model_file in end_conv_paths:
                model = onnx.load(model_file)
                for idx, node in enumerate(model.graph.node):
                    if node.op_type == 'Conv':
                        for output_name in node.output:
                            # Keep the first producer if a name shows up more than once.
                            conv_outputs.setdefault(output_name, idx + 1)
            conv_output_layer_map[model_file] = conv_outputs

        return sessions, conv_output_layer_map, sorted_file_paths

    def load_onnx_dict(self):
        """
        Return the paths of the sub-models listed in the end-with-conv file.

        Every standalone integer ``N`` in the file maps to
        ``<model_path>/NPUsubgraph<N>.onnx``. Despite the name, the result is a list.
        """
        with open(self.endwithconv_path, 'r') as file:
            content = file.read()
        return [
            os.path.join(self.model_path, f"NPUsubgraph{number}.onnx")
            for number in re.findall(r'\b\d+\b', content)
        ]

    def onnx_end_conv(self, model_file):
        """Return True when ``model_file`` is listed in the end-with-conv file."""
        return model_file in self.load_onnx_dict()

    def check_and_convert_inputs(self, model_input_data):
        """
        Convert torch tensors in ``model_input_data`` to NumPy arrays, in place.

        Returns:
            The same dict, for convenience.
        Raises:
            TypeError: If a value is neither a torch tensor nor a NumPy array.
        """
        for key, value in model_input_data.items():
            if isinstance(value, torch.Tensor):
                # detach()/cpu() make the conversion valid for tensors that track
                # gradients or live on another device; plain .numpy() raises for those.
                model_input_data[key] = value.detach().cpu().numpy()
            elif not isinstance(value, np.ndarray):
                raise TypeError(
                    f"Input data for '{key}' is not a NumPy array. Got type: {type(value)}"
                )
        return model_input_data

    def decomp(self, compressed_tensor, ru, rbits, num_bits=8):
        """
        Dequantize ``compressed_tensor`` and return it as a NumPy array.

        ``ru``, ``rbits`` and ``num_bits`` are accepted for interface compatibility but
        are not used by this implementation.
        """
        return torch.dequantize(compressed_tensor).numpy()

    def inference(self, initial_input_data, num):
        """
        Feed ``initial_input_data`` through every sub-model in order.

        Outputs that are recorded as Conv outputs of a sub-model are passed through
        ``quant_conv_forward_save_output`` before being handed to later sub-models.

        Parameters:
            initial_input_data: Mapping from tensor name to value; not modified.
            num: Run index forwarded to ``quant_conv_forward_save_output`` as ``i``.
        Returns:
            The first (raw) output of the last sub-model.
        Raises:
            RuntimeError: If no sub-model session was loaded.
        """
        if not self.sessions:
            raise RuntimeError("No sub-model sessions are loaded; nothing to infer.")

        # Shallow copy: intermediate tensors must not leak into the caller's dict.
        input_data = dict(initial_input_data)
        # NOTE(review): aux_data is never filled in this class, so the decompression
        # branch below is currently never taken -- confirm whether that is intended.
        aux_data = {}
        end_conv_paths = set(self.load_onnx_dict())
        record_model_name = None

        for session, model_file in zip(self.sessions, self.sorted_file_paths):
            input_names = [inp.name for inp in session.get_inputs()]

            # Inputs produced by a conv-terminated predecessor may need decompression.
            if record_model_name in end_conv_paths:
                for name in input_names:
                    if name in input_data and name in aux_data:
                        ru, rbits = aux_data[name]
                        input_data[name] = self.decomp(input_data[name], ru, rbits)

            model_input_data = {name: input_data[name] for name in input_names}
            self.check_and_convert_inputs(model_input_data)
            outputs = session.run(None, model_input_data)
            output_names = [out.name for out in session.get_outputs()]
            conv_outputs = self.conv_output_layer_map.get(model_file, {})

            for output_name, output in zip(output_names, outputs):
                if output_name in conv_outputs:
                    output_tensor = torch.tensor(output)
                    layer = conv_outputs[output_name]
                    output_tensor = quant_conv_forward_save_output(
                        output_tensor,
                        layer,
                        count=1,
                        bit=8,
                        i=num,
                        output_dir=self.output_dir)
                    input_data[output_name] = output_tensor
                else:
                    input_data[output_name] = output
            record_model_name = model_file

        return outputs[0]
class ImageMetricsEvaluator:
    """
    Evaluate image quality (MSE, PSNR, SSIM) of generated images against originals.

    Parameters:
        original_dir (str): Directory containing the original images.
        generated_dir (str): Directory containing the generated images.
        compression_dir (str): Directory containing the compression information text files.
        output_file (str): Path of the Excel file the results are written to.
    """
    def __init__(self, original_dir, generated_dir, compression_dir, output_file):
        self.original_dir = original_dir
        self.generated_dir = generated_dir
        self.compression_dir = compression_dir
        self.output_file = output_file

    def calculate_image_metrics(self, original_image_path, generated_image_path):
        """
        Calculate MSE, PSNR, and SSIM between the given original and generated images.

        Returns:
            (mse, psnr, ssim)
        Raises:
            ValueError: If the two images do not have the same shape.
        """
        original_image = imread(original_image_path)
        generated_image = imread(generated_image_path)

        if original_image.shape != generated_image.shape:
            # Message reads: "The two images must have the same dimensions".
            raise ValueError('两个图像的尺寸必须相同')

        mse = mean_squared_error(original_image, generated_image)
        psnr = peak_signal_noise_ratio(original_image, generated_image)

        # SSIM needs an odd window no larger than the smaller image side, at least 3.
        min_dim = min(original_image.shape[:2])
        win_size = min(7, min_dim)
        if win_size % 2 == 0:
            win_size -= 1
        if win_size < 3:
            win_size = 3

        # Only images with a third axis have a channel axis; a 2-D grayscale image
        # must not have its last spatial axis treated as channels.
        channel_axis = -1 if original_image.ndim == 3 else None
        ssim = structural_similarity(original_image,
                                     generated_image,
                                     win_size=win_size,
                                     channel_axis=channel_axis)

        return mse, psnr, ssim

    def calculate_compression_rate(self, file_path):
        """
        Read a compression info file and calculate the weighted average compression rate.

        Each non-blank line is expected to start with ``<rate>,<weight>``; the result is
        ``sum(rate * weight) / sum(weight)``.

        Returns:
            The weighted average, or None when the total weight is zero (e.g. empty file).
        """
        rate_all = 0.0
        all_ = 0.0
        with open(file_path) as f:
            for line in f:
                if not line.strip():
                    # Tolerate blank lines such as a trailing newline at end of file.
                    continue
                fields = line.split(',')
                rate, weight = float(fields[0]), float(fields[1])
                rate_all += rate * weight
                all_ += weight
        return rate_all / all_ if all_ != 0 else None

    def find_matching_compression_file(self, image_name):
        """
        Find the compression info file that belongs to ``image_name``.

        The first ``_<digits>`` group in the image name is the image number; a matching
        file is named ``result_<number>*.txt`` where the number is not followed by another
        digit (so image 1 does not pick up ``result_10.txt``). When several files match,
        the alphabetically first one is returned.

        Returns:
            The path of the matching file, or None when there is none.
        """
        base_name, _ = os.path.splitext(image_name)
        number_match = re.search(r'_(\d+)', base_name)
        if not number_match:
            return None

        prefix = f'result_{number_match.group(1)}'

        def belongs_to_image(file_name):
            if not (file_name.startswith(prefix) and file_name.endswith('.txt')):
                return False
            # The character right after the number must not extend the number.
            return not file_name[len(prefix):len(prefix) + 1].isdigit()

        compression_files = sorted(
            f for f in os.listdir(self.compression_dir) if belongs_to_image(f))
        if compression_files:
            return os.path.join(self.compression_dir, compression_files[0])
        return None

    def compare_images_in_directories(self):
        """
        Compare all images in two directories and save the results to an Excel file.

        ``.png`` files of both directories are sorted by the number after the first
        underscore in their name and paired positionally. Pairs that fail to process are
        reported on stdout and skipped.
        """
        def sort_key(filename):
            parts = filename.split('_')
            try:
                return int(parts[1].split('.')[0]) if len(parts) > 1 else 0
            except (ValueError, IndexError):
                print(f"Warning: Could not parse number from filename {filename}")
                return 0

        original_images = sorted(
            [f for f in os.listdir(self.original_dir) if f.endswith('.png')],
            key=sort_key)
        generated_images = sorted(
            [f for f in os.listdir(self.generated_dir) if f.endswith('.png')],
            key=sort_key)

        results = []

        for orig_img_name, gen_img_name in zip(original_images, generated_images):
            orig_img_path = os.path.join(self.original_dir, orig_img_name)
            gen_img_path = os.path.join(self.generated_dir, gen_img_name)

            try:
                mse, psnr, ssim = self.calculate_image_metrics(orig_img_path,
                                                               gen_img_path)
                compression_file_path = self.find_matching_compression_file(orig_img_name)
                compression_rate = self.calculate_compression_rate(
                    compression_file_path) if compression_file_path else None
                results.append({
                    'Original Image': orig_img_name,
                    'Generated Image': gen_img_name,
                    'MSE': mse,
                    'PSNR': psnr,
                    'SSIM': ssim,
                    'Compression Rate': compression_rate
                })
            except Exception as e:
                print(f"Error processing images {orig_img_name} and {gen_img_name}: {e}")

        df = pd.DataFrame(results)

        # A bare file name has no directory part; os.makedirs('') would raise.
        output_dir = os.path.dirname(self.output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        try:
            df.to_excel(self.output_file, index=False)
            print(f'Results have been saved to {self.output_file}')
        except PermissionError:
            print(
                f"Permission denied: Unable to write to {self.output_file}. Please check file permissions or close the file if it is open in another program."
            )
        except Exception as e:
            print(f"An error occurred while saving the results: {e}")
class ModelInference:
    """
    Run a model that has been split into several ONNX sub-models and collect named outputs.

    Parameters:
        model_path: Directory that contains the sub-model ``.onnx`` files.
        subgraphsiostxt_path: Path to the txt file describing the sub-graphs. Every
            ``<type>subgraph<N>: order<M>`` entry found in it names one sub-model file
            (``<type>subgraph<N>.onnx``) and its execution order ``M``.
    Attributes:
        sessions: One onnxruntime session per sub-model, in execution order.
        sorted_file_paths: Sub-model file paths, in the same order as ``sessions``.
    """
    def __init__(self, model_path, subgraphsiostxt_path):
        self.model_path = model_path
        self.subgraphsiostxt_path = subgraphsiostxt_path
        self.sessions, self.sorted_file_paths = self.load_sessions()

    def load_sessions(self):
        """
        Parse the sub-graph description file and open one session per sub-model.

        Sub-models are ordered by their ``order`` number; entries sharing the same order
        keep the sequence in which they appear in the description file.

        Returns:
            (sessions, sorted_file_paths): two parallel lists in execution order.
        """
        with open(self.subgraphsiostxt_path, 'r') as file:
            content = file.read()

        subgraph_order_map = {}
        for subgraph_type, subgraph_number, order in re.findall(
                r'(\w+)subgraph(\d+): order(\d+)', content):
            file_path = os.path.join(self.model_path,
                                     f"{subgraph_type}subgraph{subgraph_number}.onnx")
            subgraph_order_map.setdefault(int(order), []).append(file_path)

        sorted_file_paths = []
        for order in sorted(subgraph_order_map):
            sorted_file_paths.extend(subgraph_order_map[order])

        sessions = [ort.InferenceSession(model) for model in sorted_file_paths]
        return sessions, sorted_file_paths

    def inference(self, initial_input_data, output_names_to_collect=None):
        """
        Feed ``initial_input_data`` through every sub-model and collect selected outputs.

        Parameters:
            initial_input_data: Mapping from tensor name to value. It must provide every
                input that is not produced by an earlier sub-model. The mapping itself is
                not modified.
            output_names_to_collect: Iterable of output names to gather from any
                sub-model. When None, nothing is collected.
        Returns:
            Dict mapping each requested name that some sub-model produced to its value
            (empty when ``output_names_to_collect`` is None).
        """
        # Shallow copy: intermediate tensors must not leak into the caller's dict.
        input_data = dict(initial_input_data)
        collected_outputs = {}
        last_index = len(self.sessions) - 1

        for i, session in enumerate(self.sessions):
            input_names = [inp.name for inp in session.get_inputs()]
            output_names = [out.name for out in session.get_outputs()]
            model_input_data = {name: input_data[name] for name in input_names}
            outputs = session.run(None, model_input_data)
            current_model_outputs = dict(zip(output_names, outputs))

            if output_names_to_collect is not None:
                for output_name in output_names_to_collect:
                    if output_name in current_model_outputs:
                        collected_outputs[output_name] = current_model_outputs[
                            output_name]

            # Outputs of the last sub-model have no consumer, so they are not fed back.
            if i < last_index:
                input_data.update(current_model_outputs)
        return collected_outputs

    @staticmethod
    def infer_single_onnx_model(model_file, input_data):
        """
        Run a single (unsplit) ONNX model and return all outputs keyed by output name.

        Declared as a static method so that it works both as
        ``ModelInference.infer_single_onnx_model(...)`` and when called on an instance.
        """
        session = ort.InferenceSession(model_file)
        outputs = session.run(None, input_data)
        output_names = [output.name for output in session.get_outputs()]
        return dict(zip(output_names, outputs))
class PcaInference:
    """
    Run split ONNX sub-models in order, quantizing the outputs of convolution nodes.

    Parameters:
        model_path: Directory that contains the sub-model ``.onnx`` files.
        subgraphsiostxt_path: Path to the txt file describing the sub-graphs
            (``<type>subgraph<N>: order<M>`` entries).
        endwithconv_path: Path to a txt file listing the numbers of the NPU sub-graphs
            that end with a convolution.
        output_dir: Root directory handed to ``quant_conv_forward_save_output`` for saving
            results.
    Attributes:
        sessions: One onnxruntime session per sub-model, in execution order.
        conv_output_layer_map: Per model file, a map from Conv output name to the
            1-based index of the producing node.
        sorted_file_paths: Sub-model file paths, in the same order as ``sessions``.
    """
    def __init__(self, model_path, subgraphsiostxt_path, endwithconv_path, output_dir):
        self.model_path = model_path
        self.subgraphsiostxt_path = subgraphsiostxt_path
        self.endwithconv_path = endwithconv_path
        self.output_dir = output_dir
        (
            self.sessions,
            self.conv_output_layer_map,
            self.sorted_file_paths,
        ) = self.load_sessions()

    def load_sessions(self):
        """
        Open one session per sub-model and record the Conv outputs of the sub-models
        listed in the end-with-conv file.

        Returns:
            (sessions, conv_output_layer_map, sorted_file_paths)
        """
        with open(self.subgraphsiostxt_path, 'r') as file:
            content = file.read()

        subgraph_order_map = {}
        for subgraph_type, subgraph_number, order in re.findall(
                r'(\w+)subgraph(\d+): order(\d+)', content):
            file_path = os.path.join(self.model_path,
                                     f"{subgraph_type}subgraph{subgraph_number}.onnx")
            subgraph_order_map.setdefault(int(order), []).append(file_path)

        sorted_file_paths = []
        for order in sorted(subgraph_order_map):
            sorted_file_paths.extend(subgraph_order_map[order])

        # Read the end-with-conv list once instead of once per model file.
        end_conv_paths = set(self.load_onnx_dict())

        sessions = []
        conv_output_layer_map = {}
        for model_file in sorted_file_paths:
            sessions.append(ort.InferenceSession(model_file))

            conv_outputs = {}
            if model_file in end_conv_paths:
                model = onnx.load(model_file)
                for idx, node in enumerate(model.graph.node):
                    if node.op_type == 'Conv':
                        for output_name in node.output:
                            # Keep the first producer if a name shows up more than once.
                            conv_outputs.setdefault(output_name, idx + 1)
            conv_output_layer_map[model_file] = conv_outputs

        return sessions, conv_output_layer_map, sorted_file_paths

    def load_onnx_dict(self):
        """
        Return the paths of the sub-models listed in the end-with-conv file.

        Every standalone integer ``N`` in the file maps to
        ``<model_path>/NPUsubgraph<N>.onnx``. Despite the name, the result is a list.
        """
        with open(self.endwithconv_path, 'r') as file:
            content = file.read()
        return [
            os.path.join(self.model_path, f"NPUsubgraph{number}.onnx")
            for number in re.findall(r'\b\d+\b', content)
        ]

    def onnx_end_conv(self, model_file):
        """Return True when ``model_file`` is listed in the end-with-conv file."""
        return model_file in self.load_onnx_dict()

    def check_and_convert_inputs(self, model_input_data):
        """
        Convert torch tensors in ``model_input_data`` to NumPy arrays, in place.

        Returns:
            The same dict, for convenience.
        Raises:
            TypeError: If a value is neither a torch tensor nor a NumPy array.
        """
        for key, value in model_input_data.items():
            if isinstance(value, torch.Tensor):
                # detach()/cpu() make the conversion valid for tensors that track
                # gradients or live on another device; plain .numpy() raises for those.
                model_input_data[key] = value.detach().cpu().numpy()
            elif not isinstance(value, np.ndarray):
                raise TypeError(
                    f"Input data for '{key}' is not a NumPy array. Got type: {type(value)}"
                )
        return model_input_data

    def decomp(self, compressed_tensor, ru, rbits, num_bits=8):
        """
        Dequantize ``compressed_tensor`` and return it as a NumPy array.

        ``ru``, ``rbits`` and ``num_bits`` are accepted for interface compatibility but
        are not used by this implementation.
        """
        return torch.dequantize(compressed_tensor).numpy()

    def inference(self, initial_input_data, num):
        """
        Feed ``initial_input_data`` through every sub-model in order.

        Outputs that are recorded as Conv outputs of a sub-model are passed through
        ``quant_conv_forward_save_output`` before being handed to later sub-models.

        Parameters:
            initial_input_data: Mapping from tensor name to value; not modified.
            num: Run index forwarded to ``quant_conv_forward_save_output`` as ``i``.
        Returns:
            The first (raw) output of the last sub-model.
        Raises:
            RuntimeError: If no sub-model session was loaded.
        """
        if not self.sessions:
            raise RuntimeError("No sub-model sessions are loaded; nothing to infer.")

        # Shallow copy: intermediate tensors must not leak into the caller's dict.
        input_data = dict(initial_input_data)
        # NOTE(review): aux_data is never filled in this class, so the decompression
        # branch below is currently never taken -- confirm whether that is intended.
        aux_data = {}
        end_conv_paths = set(self.load_onnx_dict())
        record_model_name = None

        for session, model_file in zip(self.sessions, self.sorted_file_paths):
            input_names = [inp.name for inp in session.get_inputs()]

            # Inputs produced by a conv-terminated predecessor may need decompression.
            if record_model_name in end_conv_paths:
                for name in input_names:
                    if name in input_data and name in aux_data:
                        ru, rbits = aux_data[name]
                        input_data[name] = self.decomp(input_data[name], ru, rbits)

            model_input_data = {name: input_data[name] for name in input_names}
            self.check_and_convert_inputs(model_input_data)
            outputs = session.run(None, model_input_data)
            output_names = [out.name for out in session.get_outputs()]
            conv_outputs = self.conv_output_layer_map.get(model_file, {})

            for output_name, output in zip(output_names, outputs):
                if output_name in conv_outputs:
                    output_tensor = torch.tensor(output)
                    layer = conv_outputs[output_name]
                    output_tensor = quant_conv_forward_save_output(
                        output_tensor,
                        layer,
                        count=1,
                        bit=8,
                        i=num,
                        output_dir=self.output_dir)
                    input_data[output_name] = output_tensor
                else:
                    input_data[output_name] = output
            record_model_name = model_file

        return outputs[0]
class ImageMetricsEvaluator:
    """
    Evaluate image quality (MSE, PSNR, SSIM) of generated images against originals.

    Parameters:
        original_dir (str): Directory containing the original images.
        generated_dir (str): Directory containing the generated images.
        compression_dir (str): Directory containing the compression information text files.
        output_file (str): Path of the Excel file the results are written to.
    """
    def __init__(self, original_dir, generated_dir, compression_dir, output_file):
        self.original_dir = original_dir
        self.generated_dir = generated_dir
        self.compression_dir = compression_dir
        self.output_file = output_file

    def calculate_image_metrics(self, original_image_path, generated_image_path):
        """
        Calculate MSE, PSNR, and SSIM between the given original and generated images.

        Returns:
            (mse, psnr, ssim)
        Raises:
            ValueError: If the two images do not have the same shape.
        """
        original_image = imread(original_image_path)
        generated_image = imread(generated_image_path)

        if original_image.shape != generated_image.shape:
            # Message reads: "The two images must have the same dimensions".
            raise ValueError('两个图像的尺寸必须相同')

        mse = mean_squared_error(original_image, generated_image)
        psnr = peak_signal_noise_ratio(original_image, generated_image)

        # SSIM needs an odd window no larger than the smaller image side, at least 3.
        min_dim = min(original_image.shape[:2])
        win_size = min(7, min_dim)
        if win_size % 2 == 0:
            win_size -= 1
        if win_size < 3:
            win_size = 3

        # Only images with a third axis have a channel axis; a 2-D grayscale image
        # must not have its last spatial axis treated as channels.
        channel_axis = -1 if original_image.ndim == 3 else None
        ssim = structural_similarity(original_image,
                                     generated_image,
                                     win_size=win_size,
                                     channel_axis=channel_axis)

        return mse, psnr, ssim

    def calculate_compression_rate(self, file_path):
        """
        Read a compression info file and calculate the weighted average compression rate.

        Each non-blank line is expected to start with ``<rate>,<weight>``; the result is
        ``sum(rate * weight) / sum(weight)``.

        Returns:
            The weighted average, or None when the total weight is zero (e.g. empty file).
        """
        rate_all = 0.0
        all_ = 0.0
        with open(file_path) as f:
            for line in f:
                if not line.strip():
                    # Tolerate blank lines such as a trailing newline at end of file.
                    continue
                fields = line.split(',')
                rate, weight = float(fields[0]), float(fields[1])
                rate_all += rate * weight
                all_ += weight
        return rate_all / all_ if all_ != 0 else None

    def find_matching_compression_file(self, image_name):
        """
        Find the compression info file that belongs to ``image_name``.

        The first ``_<digits>`` group in the image name is the image number; a matching
        file is named ``result_<number>*.txt`` where the number is not followed by another
        digit (so image 1 does not pick up ``result_10.txt``). When several files match,
        the alphabetically first one is returned.

        Returns:
            The path of the matching file, or None when there is none.
        """
        base_name, _ = os.path.splitext(image_name)
        number_match = re.search(r'_(\d+)', base_name)
        if not number_match:
            return None

        prefix = f'result_{number_match.group(1)}'

        def belongs_to_image(file_name):
            if not (file_name.startswith(prefix) and file_name.endswith('.txt')):
                return False
            # The character right after the number must not extend the number.
            return not file_name[len(prefix):len(prefix) + 1].isdigit()

        compression_files = sorted(
            f for f in os.listdir(self.compression_dir) if belongs_to_image(f))
        if compression_files:
            return os.path.join(self.compression_dir, compression_files[0])
        return None

    def compare_images_in_directories(self):
        """
        Compare all images in two directories and save the results to an Excel file.

        ``.png`` files of both directories are sorted by the number after the first
        underscore in their name and paired positionally. Pairs that fail to process are
        reported on stdout and skipped.
        """
        def sort_key(filename):
            parts = filename.split('_')
            try:
                return int(parts[1].split('.')[0]) if len(parts) > 1 else 0
            except (ValueError, IndexError):
                print(f"Warning: Could not parse number from filename {filename}")
                return 0

        original_images = sorted(
            [f for f in os.listdir(self.original_dir) if f.endswith('.png')],
            key=sort_key)
        generated_images = sorted(
            [f for f in os.listdir(self.generated_dir) if f.endswith('.png')],
            key=sort_key)

        results = []

        for orig_img_name, gen_img_name in zip(original_images, generated_images):
            orig_img_path = os.path.join(self.original_dir, orig_img_name)
            gen_img_path = os.path.join(self.generated_dir, gen_img_name)

            try:
                mse, psnr, ssim = self.calculate_image_metrics(orig_img_path,
                                                               gen_img_path)
                compression_file_path = self.find_matching_compression_file(orig_img_name)
                compression_rate = self.calculate_compression_rate(
                    compression_file_path) if compression_file_path else None
                results.append({
                    'Original Image': orig_img_name,
                    'Generated Image': gen_img_name,
                    'MSE': mse,
                    'PSNR': psnr,
                    'SSIM': ssim,
                    'Compression Rate': compression_rate
                })
            except Exception as e:
                print(f"Error processing images {orig_img_name} and {gen_img_name}: {e}")

        df = pd.DataFrame(results)

        # A bare file name has no directory part; os.makedirs('') would raise.
        output_dir = os.path.dirname(self.output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        try:
            df.to_excel(self.output_file, index=False)
            print(f'Results have been saved to {self.output_file}')
        except PermissionError:
            print(
                f"Permission denied: Unable to write to {self.output_file}. Please check file permissions or close the file if it is open in another program."
            )
        except Exception as e:
            print(f"An error occurred while saving the results: {e}")
enum Version {
  // proto3 requires the first enum value to be zero.
  // We add this just to appease the compiler.
  _START_VERSION = 0;
  // The version field is always serialized and we will use it to store the
  // version that the graph is generated from. This helps us set up version
  // control.
  // For the IR, we are using simple numbers starting with 0x00000001,
  // which was the version we published on Oct 10, 2017.
  IR_VERSION_2017_10_10 = 0x0000000000000001;

  // IR_VERSION 2 published on Oct 30, 2017
  // - Added type discriminator to AttributeProto to support proto3 users
  IR_VERSION_2017_10_30 = 0x0000000000000002;

  // IR VERSION 3 published on Nov 3, 2017
  // - For operator versioning:
  //    - Added new message OperatorSetIdProto
  //    - Added opset_import in ModelProto
  // - For vendor extensions, added domain in NodeProto
  IR_VERSION_2017_11_3 = 0x0000000000000003;

  // IR VERSION 4 published on Jan 22, 2019
  // - Relax constraint that initializers should be a subset of graph inputs
  // - Add type BFLOAT16
  IR_VERSION_2019_1_22 = 0x0000000000000004;

  // IR VERSION 5 published on March 18, 2019
  // - Add message TensorAnnotation.
  // - Add quantization annotation in GraphProto to map tensor with its scale and zero point quantization parameters.
  IR_VERSION_2019_3_18 = 0x0000000000000005;

  // IR VERSION 6 published on Sep 19, 2019
  // - Add support for sparse tensor constants stored in model.
  //   - Add message SparseTensorProto
  //   - Add sparse initializers
  IR_VERSION_2019_9_19 = 0x0000000000000006;

  // IR VERSION 7 published on May 8, 2020
  // - Add support to allow function body graph to rely on multiple external operator sets.
  // - Add a list to promote inference graph's initializers to global and
  //   mutable variables. Global variables are visible in all graphs of the
  //   stored models.
  // - Add message TrainingInfoProto to store initialization
  //   method and training algorithm. The execution of TrainingInfoProto
  //   can modify the values of mutable variables.
  // - Implicitly add inference graph into each TrainingInfoProto's algorithm.
  IR_VERSION_2020_5_8 = 0x0000000000000007;

  // IR VERSION 8 published on July 30, 2021
  // Introduce TypeProto.SparseTensor
  // Introduce TypeProto.Optional
  // Added a list of FunctionProtos local to the model
  // Deprecated since_version and operator status from FunctionProto
  IR_VERSION_2021_7_30 = 0x0000000000000008;

  // IR VERSION 9 published on May 5, 2023
  // Added AttributeProto to FunctionProto so that default attribute values can be set.
  // Added FLOAT8E4M3FN, FLOAT8E4M3FNUZ, FLOAT8E5M2, FLOAT8E5M2FNUZ.
  IR_VERSION_2023_5_5 = 0x0000000000000009;

  // IR VERSION 10 published on TBD
  // Added UINT4, INT4.
  IR_VERSION = 0x000000000000000A;
}

// Attributes
//
// A named attribute containing either singular float, integer, string, graph,
// and tensor values, or repeated float, integer, string, graph, and tensor values.
// An AttributeProto MUST contain the name field, and *only one* of the
// following content fields, effectively enforcing a C/C++ union equivalent.
message AttributeProto {
  reserved 12, 16 to 19;
  reserved "v";

  // Note: this enum is structurally identical to the OpSchema::AttrType
  // enum defined in schema.h.  If you rev one, you likely need to rev the other.
  enum AttributeType {
    UNDEFINED = 0;
    FLOAT = 1;
    INT = 2;
    STRING = 3;
    TENSOR = 4;
    GRAPH = 5;
    SPARSE_TENSOR = 11;
    TYPE_PROTO = 13;

    FLOATS = 6;
    INTS = 7;
    STRINGS = 8;
    TENSORS = 9;
    GRAPHS = 10;
    SPARSE_TENSORS = 12;
    TYPE_PROTOS = 14;
  }

  // The name field MUST be present for this version of the IR.
  optional string name = 1;           // namespace Attribute

  // if ref_attr_name is not empty, ref_attr_name is the attribute name in parent function.
  // In this case, this AttributeProto does not contain data, and it's a reference of attribute
  // in parent scope.
  // NOTE: This should ONLY be used in function (sub-graph). It's invalid to be used in main graph.
  optional string ref_attr_name = 21;

  // A human-readable documentation for this attribute. Markdown is allowed.
  optional string doc_string = 13;

  // The type field MUST be present for this version of the IR.
  // For 0.0.1 versions of the IR, this field was not defined, and
  // implementations needed to use has_field heuristics to determine
  // which value field was in use.  For IR_VERSION 0.0.2 or later, this
  // field MUST be set and match the f|i|s|t|... field in use.  This
  // change was made to accommodate proto3 implementations.
  optional AttributeType type = 20;   // discriminator that indicates which field below is in use

  // Exactly ONE of the following fields must be present for this version of the IR
  optional float f = 2;               // float
  optional int64 i = 3;               // int
  optional bytes s = 4;               // UTF-8 string
  optional TensorProto t = 5;         // tensor value
  optional GraphProto g = 6;          // graph
  optional SparseTensorProto sparse_tensor = 22;  // sparse tensor value
  // Do not use field below, it's deprecated.
  // optional ValueProto v = 12;         // value - subsumes everything but graph
  optional TypeProto tp = 14;          // type proto

  repeated float floats = 7;          // list of floats
  repeated int64 ints = 8;            // list of ints
  repeated bytes strings = 9;         // list of UTF-8 strings
  repeated TensorProto tensors = 10;  // list of tensors
  repeated GraphProto graphs = 11;    // list of graph
  repeated SparseTensorProto sparse_tensors = 23; // list of sparse tensors
  repeated TypeProto type_protos = 15;// list of type protos
}

// Defines information on value, including the name, the type, and
// the shape of the value.
message ValueInfoProto {
  // This field MUST be present in this version of the IR.
  optional string name = 1;     // namespace Value
  // This field MUST be present in this version of the IR for
  // inputs and outputs of the top-level graph.
  optional TypeProto type = 2;
  // A human-readable documentation for this value. Markdown is allowed.
  optional string doc_string = 3;
  // Named metadata values; keys should be distinct.
  repeated StringStringEntryProto metadata_props = 4;
}

// Nodes
//
// Computation graphs are made up of a DAG of nodes, which represent what is
// commonly called a "layer" or "pipeline stage" in machine learning frameworks.
//
// For example, it can be a node of type "Conv" that takes in an image, a filter
// tensor and a bias tensor, and produces the convolved output.
message NodeProto {
  repeated string input = 1;    // namespace Value
  repeated string output = 2;   // namespace Value

  // An optional identifier for this node in a graph.
  // This field MAY be absent in this version of the IR.
  optional string name = 3;     // namespace Node

  // The symbolic identifier of the Operator to execute.
  optional string op_type = 4;  // namespace Operator
  // The domain of the OperatorSet that specifies the operator named by op_type.
  optional string domain = 7;   // namespace Domain
  // Overload identifier, used only to map this to a model-local function.
  optional string overload = 8;

  // Additional named attributes.
  repeated AttributeProto attribute = 5;

  // A human-readable documentation for this node. Markdown is allowed.
  optional string doc_string = 6;

  // Named metadata values; keys should be distinct.
  repeated StringStringEntryProto metadata_props = 9;
}
+// +// The semantics of the initialization-step is that the initializers +// in ModelProto.graph and in TrainingInfoProto.algorithm are first +// initialized as specified by the initializers in the graph, and then +// updated by the "initialization_binding" in every instance in +// ModelProto.training_info. +// +// The field "algorithm" defines a computation graph which represents a +// training algorithm's step. After the execution of a +// TrainingInfoProto.algorithm, the initializers specified by "update_binding" +// may be immediately updated. If the targeted training algorithm contains +// consecutive update steps (such as block coordinate descent methods), +// the user needs to create a TrainingInfoProto for each step. +message TrainingInfoProto { + // This field describes a graph to compute the initial tensors + // upon starting the training process. Initialization graph has no input + // and can have multiple outputs. Usually, trainable tensors in neural + // networks are randomly initialized. To achieve that, for each tensor, + // the user can put a random number operator such as RandomNormal or + // RandomUniform in TrainingInfoProto.initialization.node and assign its + // random output to the specific tensor using "initialization_binding". + // This graph can also set the initializers in "algorithm" in the same + // TrainingInfoProto; a use case is resetting the number of training + // iteration to zero. + // + // By default, this field is an empty graph and its evaluation does not + // produce any output. Thus, no initializer would be changed by default. + optional GraphProto initialization = 1; + + // This field represents a training algorithm step. Given required inputs, + // it computes outputs to update initializers in its own or inference graph's + // initializer lists. In general, this field contains loss node, gradient node, + // optimizer node, increment of iteration count. 
+ // + // An execution of the training algorithm step is performed by executing the + // graph obtained by combining the inference graph (namely "ModelProto.graph") + // and the "algorithm" graph. That is, the actual + // input/initializer/output/node/value_info/sparse_initializer list of + // the training graph is the concatenation of + // "ModelProto.graph.input/initializer/output/node/value_info/sparse_initializer" + // and "algorithm.input/initializer/output/node/value_info/sparse_initializer" + // in that order. This combined graph must satisfy the normal ONNX conditions. + // Now, let's provide a visualization of graph combination for clarity. + // Let the inference graph (i.e., "ModelProto.graph") be + // tensor_a, tensor_b -> MatMul -> tensor_c -> Sigmoid -> tensor_d + // and the "algorithm" graph be + // tensor_d -> Add -> tensor_e + // The combination process results + // tensor_a, tensor_b -> MatMul -> tensor_c -> Sigmoid -> tensor_d -> Add -> tensor_e + // + // Notice that an input of a node in the "algorithm" graph may reference the + // output of a node in the inference graph (but not the other way round). Also, inference + // node cannot reference inputs of "algorithm". With these restrictions, inference graph + // can always be run independently without training information. + // + // By default, this field is an empty graph and its evaluation does not + // produce any output. Evaluating the default training step never + // update any initializers. + optional GraphProto algorithm = 2; + + // This field specifies the bindings from the outputs of "initialization" to + // some initializers in "ModelProto.graph.initializer" and + // the "algorithm.initializer" in the same TrainingInfoProto. + // See "update_binding" below for details. + // + // By default, this field is empty and no initializer would be changed + // by the execution of "initialization". 
+ repeated StringStringEntryProto initialization_binding = 3; + + // Gradient-based training is usually an iterative procedure. In one gradient + // descent iteration, we apply + // + // x = x - r * g + // + // where "x" is the optimized tensor, "r" stands for learning rate, and "g" is + // gradient of "x" with respect to a chosen loss. To avoid adding assignments + // into the training graph, we split the update equation into + // + // y = x - r * g + // x = y + // + // The user needs to save "y = x - r * g" into TrainingInfoProto.algorithm. To + // tell that "y" should be assigned to "x", the field "update_binding" may + // contain a key-value pair of strings, "x" (key of StringStringEntryProto) + // and "y" (value of StringStringEntryProto). + // For a neural network with multiple trainable (mutable) tensors, there can + // be multiple key-value pairs in "update_binding". + // + // The initializers that appear as keys in "update_binding" are considered + // mutable variables. This implies some behaviors + // as described below. + // + // 1. We have only unique keys in all "update_binding"s so that two + // variables may not have the same name. This ensures that one + // variable is assigned at most once. + // 2. The keys must appear in names of "ModelProto.graph.initializer" or + // "TrainingInfoProto.algorithm.initializer". + // 3. The values must be output names of "algorithm" or "ModelProto.graph.output". + // 4. Mutable variables are initialized to the value specified by the + // corresponding initializer, and then potentially updated by + // "initialization_binding"s and "update_binding"s in "TrainingInfoProto"s. + // + // This field usually contains names of trainable tensors + // (in ModelProto.graph), optimizer states such as momentums in advanced + // stochastic gradient methods (in TrainingInfoProto.algorithm), + // and number of training iterations (in TrainingInfoProto.algorithm). 
+ // + // By default, this field is empty and no initializer would be changed + // by the execution of "algorithm". + repeated StringStringEntryProto update_binding = 4; +} + +// Models +// +// ModelProto is a top-level file/container format for bundling a ML model and +// associating its computation graph with metadata. +// +// The semantics of the model are described by the associated GraphProto's. +message ModelProto { + // The version of the IR this model targets. See Version enum above. + // This field MUST be present. + optional int64 ir_version = 1; + + // The OperatorSets this model relies on. + // All ModelProtos MUST have at least one entry that + // specifies which version of the ONNX OperatorSet is + // being imported. + // + // All nodes in the ModelProto's graph will bind against the operator + // with the same-domain/same-op_type operator with the HIGHEST version + // in the referenced operator sets. + repeated OperatorSetIdProto opset_import = 8; + + // The name of the framework or tool used to generate this model. + // This field SHOULD be present to indicate which implementation/tool/framework + // emitted the model. + optional string producer_name = 2; + + // The version of the framework or tool used to generate this model. + // This field SHOULD be present to indicate which implementation/tool/framework + // emitted the model. + optional string producer_version = 3; + + // Domain name of the model. + // We use reverse domain names as name space indicators. For example: + // `com.facebook.fair` or `com.microsoft.cognitiveservices` + // + // Together with `model_version` and GraphProto.name, this forms the unique identity of + // the graph. + optional string domain = 4; + + // The version of the graph encoded. See Version enum below. + optional int64 model_version = 5; + + // A human-readable documentation for this model. Markdown is allowed. + optional string doc_string = 6; + + // The parameterized graph that is evaluated to execute the model. 
+ optional GraphProto graph = 7; + + // Named metadata values; keys should be distinct. + repeated StringStringEntryProto metadata_props = 14; + + // Training-specific information. Sequentially executing all stored + // `TrainingInfoProto.algorithm`s and assigning their outputs following + // the corresponding `TrainingInfoProto.update_binding`s is one training + // iteration. Similarly, to initialize the model + // (as if training hasn't happened), the user should sequentially execute + // all stored `TrainingInfoProto.initialization`s and assign their outputs + // using `TrainingInfoProto.initialization_binding`s. + // + // If this field is empty, the training behavior of the model is undefined. + repeated TrainingInfoProto training_info = 20; + + // A list of function protos local to the model. + // + // The (domain, name, overload) tuple must be unique across the function protos in this list. + // In case of any conflicts the behavior (whether the model local functions are given higher priority, + // or standard operator sets are given higher priority or this is treated as error) is defined by + // the runtimes. + // + // The operator sets imported by FunctionProto should be compatible with the ones + // imported by ModelProto and other model local FunctionProtos. + // Example, if same operator set say 'A' is imported by a FunctionProto and ModelProto + // or by 2 FunctionProtos then versions for the operator set may be different but, + // the operator schema returned for op_type, domain, version combination + // for both the versions should be same for every node in the function body. + // + // One FunctionProto can reference other FunctionProto in the model, however, recursive reference + // is not allowed. + repeated FunctionProto functions = 25; +}; + +// StringStringEntryProto follows the pattern for cross-proto-version maps. 
+// See https://developers.google.com/protocol-buffers/docs/proto3#maps +message StringStringEntryProto { + optional string key = 1; + optional string value = 2; +}; + +message TensorAnnotation { + optional string tensor_name = 1; + // pairs to annotate tensor specified by above. + // The keys used in the mapping below must be pre-defined in ONNX spec. + // For example, for 8-bit linear quantization case, 'SCALE_TENSOR', 'ZERO_POINT_TENSOR' will be pre-defined as + // quantization parameter keys. + repeated StringStringEntryProto quant_parameter_tensor_names = 2; +} + + + +// Graphs +// +// A graph defines the computational logic of a model and is comprised of a parameterized +// list of nodes that form a directed acyclic graph based on their inputs and outputs. +// This is the equivalent of the "network" or "graph" in many deep learning +// frameworks. +message GraphProto { + // The nodes in the graph, sorted topologically. + repeated NodeProto node = 1; + + // The name of the graph. + optional string name = 2; // namespace Graph + + // A list of named tensor values, used to specify constant inputs of the graph. + // Each initializer (both TensorProto as well SparseTensorProto) MUST have a name. + // The name MUST be unique across both initializer and sparse_initializer, + // but the name MAY also appear in the input list. + repeated TensorProto initializer = 5; + + // Initializers (see above) stored in sparse format. + repeated SparseTensorProto sparse_initializer = 15; + + // A human-readable documentation for this graph. Markdown is allowed. + optional string doc_string = 10; + + // The inputs and outputs of the graph. + repeated ValueInfoProto input = 11; + repeated ValueInfoProto output = 12; + + // Information for the values in the graph. The ValueInfoProto.name's + // must be distinct. It is optional for a value to appear in value_info list. 
+ repeated ValueInfoProto value_info = 13; + + // This field carries information to indicate the mapping among a tensor and its + // quantization parameter tensors. For example: + // For tensor 'a', it may have {'SCALE_TENSOR', 'a_scale'} and {'ZERO_POINT_TENSOR', 'a_zero_point'} annotated, + // which means, tensor 'a_scale' and tensor 'a_zero_point' are scale and zero point of tensor 'a' in the model. + repeated TensorAnnotation quantization_annotation = 14; + + // Named metadata values; keys should be distinct. + repeated StringStringEntryProto metadata_props = 16; + + reserved 3, 4, 6 to 9; + reserved "ir_version", "producer_version", "producer_tag", "domain"; +} + +// Tensors +// +// A serialized tensor value. +message TensorProto { + enum DataType { + UNDEFINED = 0; + // Basic types. + FLOAT = 1; // float + UINT8 = 2; // uint8_t + INT8 = 3; // int8_t + UINT16 = 4; // uint16_t + INT16 = 5; // int16_t + INT32 = 6; // int32_t + INT64 = 7; // int64_t + STRING = 8; // string + BOOL = 9; // bool + + // IEEE754 half-precision floating-point format (16 bits wide). + // This format has 1 sign bit, 5 exponent bits, and 10 mantissa bits. + FLOAT16 = 10; + + DOUBLE = 11; + UINT32 = 12; + UINT64 = 13; + COMPLEX64 = 14; // complex with float32 real and imaginary components + COMPLEX128 = 15; // complex with float64 real and imaginary components + + // Non-IEEE floating-point format based on IEEE754 single-precision + // floating-point number truncated to 16 bits. + // This format has 1 sign bit, 8 exponent bits, and 7 mantissa bits. + BFLOAT16 = 16; + + // Non-IEEE floating-point format based on papers + // FP8 Formats for Deep Learning, https://arxiv.org/abs/2209.05433, + // 8-bit Numerical Formats For Deep Neural Networks, https://arxiv.org/pdf/2206.02915.pdf. + // Operators supported FP8 are Cast, CastLike, QuantizeLinear, DequantizeLinear. + // The computation usually happens inside a block quantize / dequantize + // fused by the runtime. 
+ FLOAT8E4M3FN = 17; // float 8, mostly used for coefficients, supports nan, not inf + FLOAT8E4M3FNUZ = 18; // float 8, mostly used for coefficients, supports nan, not inf, no negative zero + FLOAT8E5M2 = 19; // follows IEEE 754, supports nan, inf, mostly used for gradients + FLOAT8E5M2FNUZ = 20; // follows IEEE 754, supports nan, not inf, mostly used for gradients, no negative zero + + // 4-bit data-types + UINT4 = 21; // Unsigned integer in range [0, 15] + INT4 = 22; // Signed integer in range [-8, 7], using two's-complement representation + + // Future extensions go here. + } + + // The shape of the tensor. + repeated int64 dims = 1; + + // The data type of the tensor. + // This field MUST have a valid TensorProto.DataType value + optional int32 data_type = 2; + + // For very large tensors, we may want to store them in chunks, in which + // case the following fields will specify the segment that is stored in + // the current TensorProto. + message Segment { + optional int64 begin = 1; + optional int64 end = 2; + } + optional Segment segment = 3; + + // Tensor content must be organized in row-major order. + // + // Depending on the data_type field, exactly one of the fields below with + // name ending in _data is used to store the elements of the tensor. + + // For float and complex64 values + // Complex64 tensors are encoded as a single array of floats, + // with the real components appearing in odd numbered positions, + // and the corresponding imaginary component appearing in the + // subsequent even numbered position. (e.g., [1.0 + 2.0i, 3.0 + 4.0i] + // is encoded as [1.0, 2.0 ,3.0 ,4.0] + // When this field is present, the data_type field MUST be FLOAT or COMPLEX64. + repeated float float_data = 4 [packed = true]; + + // For int32, uint8, int8, uint16, int16, uint4, int4, bool, float8 and float16 values + // float16 and float8 values must be bit-wise converted to an uint16_t prior + // to writing to the buffer. 
+ // uint4 and int4 values must be packed to 4bitx2 prior to writing to the buffer, the first element is stored in + // the 4 LSB and the second element is stored in the 4 MSB. + // When this field is present, the data_type field MUST be + // INT32, INT16, INT8, INT4, UINT16, UINT8, UINT4, BOOL, FLOAT16, BFLOAT16, FLOAT8E4M3FN, FLOAT8E4M3FNUZ, FLOAT8E5M2, FLOAT8E5M2FNUZ + repeated int32 int32_data = 5 [packed = true]; + + // For strings. + // Each element of string_data is a UTF-8 encoded Unicode + // string. No trailing null, no leading BOM. The protobuf "string" + // scalar type is not used to match ML community conventions. + // When this field is present, the data_type field MUST be STRING + repeated bytes string_data = 6; + + // For int64. + // When this field is present, the data_type field MUST be INT64 + repeated int64 int64_data = 7 [packed = true]; + + // Optionally, a name for the tensor. + optional string name = 8; // namespace Value + + // A human-readable documentation for this tensor. Markdown is allowed. + optional string doc_string = 12; + + // Serializations can either use one of the fields above, or use this + // raw bytes field. The only exception is the string case, where one is + // required to store the content in the repeated bytes string_data field. + // + // When this raw_data field is used to store tensor value, elements MUST + // be stored in as fixed-width, little-endian order. + // Floating-point data types MUST be stored in IEEE 754 format. + // Complex64 elements must be written as two consecutive FLOAT values, real component first. + // Complex128 elements must be written as two consecutive DOUBLE values, real component first. + // Boolean type MUST be written one byte per tensor element (00000001 for true, 00000000 for false). + // uint4 and int4 values must be packed to 4bitx2, the first element is stored in the 4 LSB and the second element is stored in the 4 MSB. 
+ // + // Note: the advantage of specific field rather than the raw_data field is + // that in some cases (e.g. int data), protobuf does a better packing via + // variable length storage, and may lead to smaller binary footprint. + // When this field is present, the data_type field MUST NOT be STRING or UNDEFINED + optional bytes raw_data = 9; + + // Data can be stored inside the protobuf file using type-specific fields or raw_data. + // Alternatively, raw bytes data can be stored in an external file, using the external_data field. + // external_data stores key-value pairs describing data location. Recognized keys are: + // - "location" (required) - POSIX filesystem path relative to the directory where the ONNX + // protobuf model was stored + // - "offset" (optional) - position of byte at which stored data begins. Integer stored as string. + // Offset values SHOULD be multiples of 4096 (page size) to enable mmap support. + // - "length" (optional) - number of bytes containing data. Integer stored as string. + // - "checksum" (optional) - SHA1 digest of file specified under the 'location' key. + repeated StringStringEntryProto external_data = 13; + + // Location of the data for this tensor. MUST be one of: + // - DEFAULT - data stored inside the protobuf message. Data is stored in raw_data (if set) otherwise in type-specified field. + // - EXTERNAL - data stored in an external location as described by external_data field. + enum DataLocation { + DEFAULT = 0; + EXTERNAL = 1; + } + + // If value not set, data is stored in raw_data (if set) otherwise in type-specified field. + optional DataLocation data_location = 14; + + // For double + // Complex128 tensors are encoded as a single array of doubles, + // with the real components appearing in odd numbered positions, + // and the corresponding imaginary component appearing in the + // subsequent even numbered position. 
(e.g., [1.0 + 2.0i, 3.0 + 4.0i] + // is encoded as [1.0, 2.0 ,3.0 ,4.0] + // When this field is present, the data_type field MUST be DOUBLE or COMPLEX128 + repeated double double_data = 10 [packed = true]; + + // For uint64 and uint32 values + // When this field is present, the data_type field MUST be + // UINT32 or UINT64 + repeated uint64 uint64_data = 11 [packed = true]; + + // Named metadata values; keys should be distinct. + repeated StringStringEntryProto metadata_props = 16; +} + +// A serialized sparse-tensor value +message SparseTensorProto { + // The sequence of non-default values are encoded as a tensor of shape [NNZ]. + // The default-value is zero for numeric tensors, and empty-string for string tensors. + // values must have a non-empty name present which serves as a name for SparseTensorProto + // when used in sparse_initializer list. + optional TensorProto values = 1; + + // The indices of the non-default values, which may be stored in one of two formats. + // (a) Indices can be a tensor of shape [NNZ, rank] with the [i,j]-th value + // corresponding to the j-th index of the i-th value (in the values tensor). + // (b) Indices can be a tensor of shape [NNZ], in which case the i-th value + // must be the linearized-index of the i-th value (in the values tensor). + // The linearized-index can be converted into an index tuple (k_1,...,k_rank) + // using the shape provided below. + // The indices must appear in ascending order without duplication. + // In the first format, the ordering is lexicographic-ordering: + // e.g., index-value [1,4] must appear before [2,1] + optional TensorProto indices = 2; + + // The shape of the underlying dense-tensor: [dim_1, dim_2, ... dim_rank] + repeated int64 dims = 3; +} + +// Defines a tensor shape. A dimension can be either an integer value +// or a symbolic variable. A symbolic variable represents an unknown +// dimension. 
+message TensorShapeProto { + message Dimension { + oneof value { + int64 dim_value = 1; + string dim_param = 2; // namespace Shape + }; + // Standard denotation can optionally be used to denote tensor + // dimensions with standard semantic descriptions to ensure + // that operations are applied to the correct axis of a tensor. + // Refer to https://github.com/onnx/onnx/blob/main/docs/DimensionDenotation.md#denotation-definition + // for pre-defined dimension denotations. + optional string denotation = 3; + }; + repeated Dimension dim = 1; +} + +// Types +// +// The standard ONNX data types. +message TypeProto { + + message Tensor { + // This field MUST NOT have the value of UNDEFINED + // This field MUST have a valid TensorProto.DataType value + // This field MUST be present for this version of the IR. + optional int32 elem_type = 1; + optional TensorShapeProto shape = 2; + } + + // repeated T + message Sequence { + // The type and optional shape of each element of the sequence. + // This field MUST be present for this version of the IR. + optional TypeProto elem_type = 1; + }; + + // map + message Map { + // This field MUST have a valid TensorProto.DataType value + // This field MUST be present for this version of the IR. + // This field MUST refer to an integral type ([U]INT{8|16|32|64}) or STRING + optional int32 key_type = 1; + // This field MUST be present for this version of the IR. + optional TypeProto value_type = 2; + }; + + // wrapper for Tensor, Sequence, or Map + message Optional { + // The type and optional shape of the element wrapped. + // This field MUST be present for this version of the IR. + // Possible values correspond to OptionalProto.DataType enum + optional TypeProto elem_type = 1; + }; + + + message SparseTensor { + // This field MUST NOT have the value of UNDEFINED + // This field MUST have a valid TensorProto.DataType value + // This field MUST be present for this version of the IR. 
+ optional int32 elem_type = 1; + optional TensorShapeProto shape = 2; + } + + + oneof value { + // The type of a tensor. + Tensor tensor_type = 1; + + // NOTE: DNN-only implementations of ONNX MAY elect to not support non-tensor values + // as input and output to graphs and nodes. These types are needed to naturally + // support classical ML operators. DNN operators SHOULD restrict their input + // and output types to tensors. + + // The type of a sequence. + Sequence sequence_type = 4; + + // The type of a map. + Map map_type = 5; + + // The type of an optional. + Optional optional_type = 9; + + + // Type of the sparse tensor + SparseTensor sparse_tensor_type = 8; + + } + + // An optional denotation can be used to denote the whole + // type with a standard semantic description as to what is + // stored inside. Refer to https://github.com/onnx/onnx/blob/main/docs/TypeDenotation.md#type-denotation-definition + // for pre-defined type denotations. + optional string denotation = 6; +} + +// Operator Sets +// +// OperatorSets are uniquely identified by a (domain, opset_version) pair. +message OperatorSetIdProto { + // The domain of the operator set being identified. + // The empty string ("") or absence of this field implies the operator + // set that is defined as part of the ONNX specification. + // This field MUST be present in this version of the IR when referring to any other operator set. + optional string domain = 1; + + // The version of the operator set being identified. + // This field MUST be present in this version of the IR. + optional int64 version = 2; +} + +// Operator/function status. +enum OperatorStatus { + EXPERIMENTAL = 0; + STABLE = 1; +} + +message FunctionProto { + // The name of the function, similar to op_type in NodeProto. + // This is part of the unique-id (domain, name, overload) of FunctionProtos in a model. 
+ optional string name = 1; + + // Deprecated since IR Version 8 + // optional int64 since_version = 2; + reserved 2; + reserved "since_version"; + + // Deprecated since IR Version 8 + // optional OperatorStatus status = 3; + reserved 3; + reserved "status"; + + // The inputs and outputs of the function. + repeated string input = 4; + repeated string output = 5; + + // The attribute parameters of the function. + // It is for function parameters without default values. + repeated string attribute = 6; + + // The attribute protos of the function. + // It is for function attributes with default values. + // A function attribute shall be represented either as + // a string attribute or an AttributeProto, not both. + repeated AttributeProto attribute_proto = 11; + + // The nodes in the function. + repeated NodeProto node = 7; + // A human-readable documentation for this function. Markdown is allowed. + optional string doc_string = 8; + + // The OperatorSets this function body (graph) relies on. + // + // All nodes in the function body (graph) will bind against the operator + // with the same-domain/same-op_type operator with the HIGHEST version + // in the referenced operator sets. This means at most one version can be relied + // for one domain. + // + // The operator sets imported by FunctionProto should be compatible with the ones + // imported by ModelProto. Example, if same operator set say 'A' is imported by FunctionProto + // and ModelProto then versions for the operator set may be different but, + // the operator schema returned for op_type, domain, version combination + // for both the versions should be same. + + repeated OperatorSetIdProto opset_import = 9; + + // The domain which this function belongs to. + // This is part of the unique-id (domain, name, overload) of FunctionProtos in a model. + optional string domain = 10; + + // The overload identifier of the function. 
+ // This is part of the unique-id (domain, name, overload) of FunctionProtos in a model. + optional string overload = 13; + + // Information for the values in the function. The ValueInfoProto.name's + // must be distinct and refer to names in the function (including inputs, + // outputs, and intermediate values). It is optional for a value to appear + // in value_info list. + repeated ValueInfoProto value_info = 12; + + // Named metadata values; keys should be distinct. + repeated StringStringEntryProto metadata_props = 14; +} + +// For using protobuf-lite +option optimize_for = LITE_RUNTIME; diff --git a/tools/onnx-subgraph/onnx_subgraph_ut.py b/tools/onnx-subgraph/onnx_subgraph_ut.py new file mode 100644 index 00000000000..26daf8dd245 --- /dev/null +++ b/tools/onnx-subgraph/onnx_subgraph_ut.py @@ -0,0 +1,66 @@ +# Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved + +import unittest +import os +import sys +import extract_onnx_lib +import shutil + + +def onnx_parser_test(args): + #exe = './onnx-subgraph ' + '--onnx=test.onnx' + exe = './onnx-subgraph ' + args + rec = os.system(exe) + + +class ONNX_Parser_Test(unittest.TestCase): + def test_parse_result_exception(self): + ret = os.path.exists('./subgraphs_ios.txt') + if ret: + os.remove('./subgraphs_ios.txt') + onnx_parser_test('--onnx=no_file.onnx') + ret = os.path.exists('./subgraphs_ios.txt') + self.assertEqual(ret, False) + + def test_parse_result_normal(self): + ret = os.path.exists('./subgraphs_ios.txt') + if ret: + os.remove('./subgraphs_ios.txt') + + onnx_parser_test('--onnx=test.onnx') + ret = os.path.exists('./subgraphs_ios.txt') + self.assertEqual(ret, True) + + def test_subgraph_normal(self): + ret = os.path.exists('./subgraphs') + if ret: + shutil.rmtree(path='./subgraphs') + + extract_onnx_lib.split_onnx_ios('./subgraphs_ios.txt', './test.onnx') + ret = os.path.exists('./subgraphs') + self.assertEqual(ret, True) + + ret = os.path.exists('./subgraphs/CPU') + self.assertEqual(ret, 
True) + + ret = os.path.exists('./subgraphs/NPU') + self.assertEqual(ret, True) + + ret = os.path.exists('./subgraphs/CPU/CPUsubgraph15.onnx') + self.assertEqual(ret, True) + + ret = os.path.exists('./subgraphs/NPU/NPUsubgraph15.onnx') + self.assertEqual(ret, True) + + def test_subgraph_exception(self): + ret = os.path.exists('./subgraphs') + if ret: + shutil.rmtree(path='./subgraphs') + + extract_onnx_lib.split_onnx_ios('./subgraphs_ios.txt', './fake.onnx') + ret = os.path.exists('./subgraphs') + self.assertEqual(ret, False) + + +if __name__ == '__main__': + unittest.main() diff --git a/tools/onnx-subgraph/quant.py b/tools/onnx-subgraph/quant.py new file mode 100644 index 00000000000..5ca672709d4 --- /dev/null +++ b/tools/onnx-subgraph/quant.py @@ -0,0 +1,427 @@ +# Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +import os +import time +from types import MethodType + +import torch +import torch.nn as nn +from torch import Tensor +from torch.nn import functional as F +from tqdm import tqdm +from sklearn.cluster import KMeans +from torch.nn import functional as F +import numpy as np +import pdb + + +def quant_transmartix(x, bits=8): + # Quantizes the input tensor and returns the quantized tensor and its integer representation. 
+ if (x.max() == x.min()): + return x, 0 + n = 2**(bits - 1) - 1 + act_scale = (x.max() - x.min()) / 2 / n + zero_point = (x.min() + x.max()) / 2 + aint = ((x - zero_point) / act_scale).round().clamp(-n - 1, n) + xq = aint * act_scale + zero_point + return xq, aint + + +def quant_transmartix1(x, bits=8): + # Quantizes the input tensor and returns its integer representation, scale and zero point. + if (x.max() == x.min()): + # Constant input: unit scale with the constant as zero point, so that + # aint * act_scale + zero_point reproduces x and callers can unpack three values. + return torch.zeros_like(x), torch.ones_like(x.min()), x.min() + n = 2**(bits - 1) - 1 + act_scale = (x.max() - x.min()) / 2 / n + zero_point = (x.min() + x.max()) / 2 + aint = ((x - zero_point) / act_scale).round().clamp(-n - 1, n) + return aint, act_scale, zero_point + + +def get_projection_matrix(im, eigenVar, num_bits=8): + # covariance matrix + cov = torch.matmul(im, im.t()) / im.shape[1] + # svd + u, s, _ = torch.svd(cov) + u, _ = quant_transmartix(u, 16) + return u, s + + +def comp(x, rate, output_dir, count, transu, inb, num_bits, layer): + # Compresses the input tensor using a transformation matrix and quantizes the result. 
+ if (len(x.shape) == 2): + B, C = x.shape + x_reshape = x + elif (len(x.shape) == 3): + B, C, H = x.shape + x_reshape = x.permute(1, 0, 2).reshape(C, -1) + elif (len(x.shape) == 4): + B, C, H, W = x.shape + x_reshape = x.permute(1, 0, 2, 3).reshape(C, -1) + else: + raise NotImplementedError + if (count == 1): + u, s = get_projection_matrix(x_reshape, rate, num_bits) + x_trans = torch.matmul(u.t(), x_reshape) + x_trans, x_trans_int = quant_transmartix(x_trans, num_bits) + channel_max = x_trans_int.max(-1)[0].reshape(1, -1) + channel_min = x_trans_int.min(-1)[0].reshape(1, -1) + channel_dif = channel_max - channel_min + channel_dif[torch.where(channel_dif == 0)] = 1 + bits = torch.ceil(torch.log2(channel_dif)) + max_min = torch.cat([channel_max, channel_min], dim=0) + x_return = torch.matmul(u, x_trans) + x_return, x_return_int = quant_transmartix(x_return, num_bits) + ru = u + rbits = max_min + elif (count <= 100): + x_trans = torch.matmul(transu.t(), x_reshape) + x_trans, x_trans_int = quant_transmartix(x_trans, num_bits) + channel_max = x_trans_int.max(-1)[0].reshape(1, -1) + channel_min = x_trans_int.min(-1)[0].reshape(1, -1) + max_min = torch.cat([channel_max, channel_min], dim=0) + x_return = torch.matmul(transu, x_trans) + x_return, x_return_int = quant_transmartix(x_return, num_bits) + ru = None + rbits = max_min + else: + x_trans = torch.matmul(transu.t(), x_reshape) + x_trans_int, act_scale, zero_point = quant_transmartix1(x_trans, num_bits) + inb_expend = inb[:, :, None].repeat(1, 1, H * W) + mask_clip_max = torch.where(x_trans_int > inb_expend[0]) + mask_clip_min = torch.where(x_trans_int < inb_expend[1]) + x_trans_int[mask_clip_max] = inb_expend[0][mask_clip_max] + x_trans_int[mask_clip_min] = inb_expend[1][mask_clip_min] + x_trans = x_trans_int * act_scale + zero_point + channel_max = x_trans_int.max(-1)[0].reshape(1, -1) + channel_min = x_trans_int.min(-1)[0].reshape(1, -1) + max_min = torch.cat([channel_max, channel_min], dim=0) + x_return = 
def quant_linear_weight(w, bit, mode="channel_wise", symmetric=True):
    """Fake-quantize linear-layer weights symmetrically around zero.

    Args:
        w: floating-point weight tensor; for channel-wise mode the scale is
            taken per row (``dim=1`` is reduced).
        bit: bit width of the integer grid.
        mode: ``"channel_wise"`` selects one scale per row when ``symmetric`` is
            true; any other combination falls back to a single tensor-wide scale.
        symmetric: see ``mode``.

    Returns:
        Tensor of the same shape as ``w`` holding the dequantized weights
        (integer codes multiplied back by their scale).
    """
    n = 2**(bit - 1) - 1
    if mode == "channel_wise" and symmetric:
        scale = w.abs().max(dim=1, keepdim=True)[0] / n
    else:
        scale = w.abs().max() / n
    # An all-zero row (or tensor) has scale 0 and 0 / 0 would turn the weights
    # into NaN; the smallest positive normal keeps such weights at exactly 0
    # and leaves every non-degenerate scale unchanged.
    scale = scale.clamp_min(torch.finfo(w.dtype).tiny)
    wint = (w / scale).round().clamp(-n - 1, n)
    return wint * scale
def quant_conv_weight(w, bit, mode="channel_wise", symmetric=True):
    """Fake-quantize convolution weights with one symmetric scale per output channel.

    Args:
        w: floating-point weight tensor whose first dimension is the output channel.
        bit: bit width of the integer grid.
        mode: only ``"channel_wise"`` is implemented.
        symmetric: only ``True`` is implemented.

    Returns:
        Tensor of the same shape as ``w`` holding the dequantized weights.

    Raises:
        NotImplementedError: for any mode other than symmetric channel-wise.
    """
    if not (mode == "channel_wise" and symmetric):
        raise NotImplementedError
    n = 2**(bit - 1) - 1
    # reshape (not view) so non-contiguous weights, e.g. channels_last, work too.
    per_channel_max = w.reshape(w.shape[0], -1).abs().max(dim=-1)[0]
    # A channel that is entirely zero would get scale 0 and become NaN after the
    # division; clamp to the smallest positive normal so it stays exactly 0.
    scale = (per_channel_max / n).clamp_min(torch.finfo(w.dtype).tiny)
    # One scale per output channel, broadcast over all remaining dimensions
    # ((C, 1, 1, 1) for the usual 4-D Conv2d weight).
    scale = scale.reshape(w.shape[0], *([1] * (w.dim() - 1)))
    wint = (w / scale).round().clamp(-n - 1, n)
    return wint * scale
def quant_conv_forward_optimization(self, x):
    """Forward pass of a quantized conv layer with one-off activation calibration.

    While ``self.enable_calib_act_min_max`` is set, 200 shrinking clip ranges
    (from ``|x|.max()`` down in equal steps) are tried; the scale, zero point and
    clip value giving the lowest MSE against the unquantized, bias-free
    convolution output are stored on the module and calibration is switched off.
    If no scale is known afterwards, plain min/max statistics of ``x`` are used.

    Args:
        x: input activation tensor.

    Returns:
        Result of ``self._conv_forward`` on the fake-quantized input, using the
        module's weight and bias.
    """
    qmax = 2**(self.bit - 1) - 1

    if self.enable_calib_act_min_max:
        reference = self._conv_forward(x, self.weight, None)
        xmax = x.abs().max()
        best_scale = None
        best_mse = 1e10
        steps = 200
        progress = tqdm(range(steps), desc=self.own_name)
        for step in progress:
            clip_value = xmax * (1 / steps * (steps - step))
            clipped = x.clip(-clip_value, clip_value)
            hi, lo = clipped.max(), clipped.min()
            zero_point = (hi + lo) / 2
            act_scale = (hi - lo) / 2 / qmax
            quantized = quant_activation(clipped,
                                         bit=self.bit,
                                         act_scale=act_scale,
                                         zero_point=zero_point)
            candidate = self._conv_forward(quantized, self.weight, None)
            mse = ((reference - candidate)**2).mean().item()
            if mse < best_mse:
                best_mse, best_scale = mse, act_scale
                best_zero_point, best_clip_value = zero_point, clip_value
            progress.set_postfix(
                dict(
                    best_mse=f"{best_mse:.1e}",
                    best_scale=best_scale.data.item(),
                    xmax=xmax.data.item(),
                    best_clip_value=best_clip_value.data.item(),
                ))
        assert best_scale is not None
        # Drop the calibration temporaries before asking for memory back.
        del reference, clipped, quantized, candidate, mse
        gc.collect()
        torch.cuda.empty_cache()
        self.act_scale = best_scale
        self.zero_point = best_zero_point
        self.clip_value = best_clip_value
        self.enable_calib_act_min_max = False

    if self.act_scale is None:
        # No calibrated scale available: derive it from this batch's min/max.
        self.zero_point = (x.max() + x.min()) / 2
        self.act_scale = (x.max() - x.min()) / 2 / qmax

    x = quant_activation(x,
                         bit=self.bit,
                         act_scale=self.act_scale,
                         zero_point=self.zero_point)
    return self._conv_forward(x, self.weight, self.bias)
def quant_linear_forward_save_output(self, x):
    """Forward pass of a quantized linear layer using a stored clip value.

    The input is clipped to ``[-self.clip_value, self.clip_value]``, fake-quantized
    with min/max statistics of the clipped tensor, and fed through ``F.linear``
    with the module's weight and bias.

    Args:
        x: input activation tensor.

    Returns:
        Output of the linear layer on the fake-quantized input.
    """
    clipped = x.clip(-self.clip_value, self.clip_value)
    lo, hi = clipped.min(), clipped.max()
    levels = 2**(self.bit - 1) - 1
    act_scale = (hi - lo) / 2 / levels
    zero_point = (lo + hi) / 2
    quantized = quant_activation(clipped,
                                 bit=self.bit,
                                 act_scale=act_scale,
                                 zero_point=zero_point)
    return F.linear(quantized, self.weight, self.bias)
def fast_quant(
    model,
    comp,
    bit=8,
    fp=False,
    enable_calib_act_min_max=False,
    enable_calib_weight_min_max=False,
    optimization=False,
    load_min_max_from_json=False,
    min_max_dict=None,
):
    """Quantize the weights of every ``nn.Linear`` / ``nn.Conv2d`` in ``model`` in place.

    Each matching module gets its weights replaced by fake-quantized values
    (tensor-wise for linear, channel-wise for conv), bookkeeping attributes
    (``own_name``, ``bit``, ``layer``, ``act_scale`` ...) attached, and optionally
    its ``forward`` rebound to one of the quantized forward functions.

    Args:
        model: module tree to modify.
        comp: object stored as ``module.comp`` on every ``nn.Conv2d``.
        bit: bit width for weights and activations.
        fp: when true the model is returned untouched.
        enable_calib_act_min_max: stored on each quantized module.
        enable_calib_weight_min_max: stored on each ``nn.Linear``.
        optimization: rebind ``forward`` to the ``*_forward_optimization`` variant.
        load_min_max_from_json: when ``optimization`` is false, rebind ``forward``
            to the ``*_forward_save_output`` variant and load ``clip_value`` from
            ``min_max_dict``.
        min_max_dict: mapping from module name to clip value; required when
            ``load_min_max_from_json`` takes effect.

    Returns:
        The same ``model`` object.
    """
    if fp:
        return model
    layer = 0
    convlayer = 0
    for name, module in tqdm(model.named_modules(), desc="Quantize weights"):
        module.own_name = name
        if isinstance(module, nn.Linear):
            module.bit = bit
            w = module.weight.data.clone()
            wq = quant_linear_weight(w, bit, mode="tensor_wise", symmetric=True)
            module.weight.data = wq.data
            module.act_scale = None
            if optimization:
                module.forward = MethodType(quant_linear_forward_optimization, module)
            elif load_min_max_from_json:
                # Keep the clip value on the module's own device instead of a
                # hard-coded "cuda", so CPU and non-default-GPU models work.
                module.clip_value = torch.tensor(min_max_dict[name],
                                                 device=module.weight.device)
                module.forward = MethodType(quant_linear_forward_save_output, module)
            module.enable_calib_act_min_max = enable_calib_act_min_max
            module.enable_calib_weight_min_max = enable_calib_weight_min_max
            module.layer = layer
            layer += 1
        if isinstance(module, nn.Conv2d):
            module.layer = layer
            module.convlayer = convlayer
            convlayer += 1
            layer += 1
            # Per-module state initialised for the compression path.
            module.count = 0
            module.u = 0
            module.rb = 0
            module.comp = comp

            module.bit = bit
            w = module.weight.data.clone()
            wq = quant_conv_weight(w, bit, mode="channel_wise", symmetric=True)
            module.weight.data = wq.data
            module.act_scale = None
            if optimization:
                module.forward = MethodType(quant_conv_forward_optimization, module)
            elif load_min_max_from_json:
                module.clip_value = torch.tensor(min_max_dict[name],
                                                 device=module.weight.device)
                # NOTE(review): the bound function is declared as
                # (x, layer, count, bit, i, output_dir); bound this way it is
                # called as (module, input) - confirm the intended signature.
                module.forward = MethodType(quant_conv_forward_save_output, module)
            module.enable_calib_act_min_max = enable_calib_act_min_max
    return model
def compare_results(output_single, output_multiple):
    """
    Print the mean squared error between same-named outputs of two inference runs.

    Every output name found in either dictionary is visited once, in sorted
    order. Names present in both get an MSE line; names present in only one
    are reported as missing.
    """
    for name in sorted(output_single.keys() | output_multiple.keys()):
        if name not in output_single or name not in output_multiple:
            print(f"Output '{name}' is missing in one of the result sets.")
            continue
        delta = np.array(output_single[name]) - np.array(output_multiple[name])
        print(f"Output '{name}' MSE: {np.mean(delta**2)}")
def prepare_initial_input_data(onnx_model_path, default_input_data):
    """
    Interactively builds random input tensors for every input of an ONNX model.

    For each model input the user is asked for a shape and a data type; an empty
    answer selects the shape / dtype of the matching entry in
    ``default_input_data``. An input whose answer cannot be parsed is reported
    and left out of the result.

    Args:
        onnx_model_path (str): Path to the ONNX model file.
        default_input_data (dict): Maps input names to arrays supplying the
            default shape and dtype.

    Returns:
        dict: Maps input names to ``np.random.rand`` data cast to the chosen dtype.
    """
    session = ort.InferenceSession(onnx_model_path)
    dtype_by_code = {'f': np.float32, 'i': np.int64}
    prepared = {}

    # dict.fromkeys keeps first-seen order and collapses duplicate names.
    for input_name in dict.fromkeys(node.name for node in session.get_inputs()):
        custom_shape_str = input(
            f"Enter new shape for input '{input_name}' (comma-separated integers), or press Enter to use default: "
        )
        custom_dtype_str = input(
            f"Enter data type for input '{input_name}' ('f' for float32, 'i' for int64), or press Enter to use default: "
        )

        if custom_shape_str:
            try:
                new_shape = [int(dim) for dim in custom_shape_str.split(',')]
            except ValueError:
                print("Invalid input, please ensure you enter comma-separated integers.")
                continue
        else:
            new_shape = default_input_data[input_name].shape

        if custom_dtype_str:
            dtype = dtype_by_code.get(custom_dtype_str.strip())
            if dtype is None:
                print("Invalid data type, please enter 'f' or 'i'.")
                continue
        else:
            dtype = default_input_data[input_name].dtype

        prepared[input_name] = np.random.rand(*new_shape).astype(dtype)

    return prepared
# Driver script: runs the same random input through the original ONNX model and
# through the split subgraph models, then prints the per-output MSE.
# NOTE(review): these statements sit at module level without an
# `if __name__ == "__main__":` guard, so they execute on import as well.

# Define paths for single ONNX model and split subgraph models
single_onnx_model_path = './resnet-test.onnx'
model_path = './subgraphs/'
subgraphsiostxt_path = './subgraphs_ios.txt'

# Initialize ModelInference instance for inference
model_inference = ModelInference(model_path, subgraphsiostxt_path)

# Default input data dictionary: one float32 tensor named "x" of shape (1, 3, 256, 256)
default_input_data = {
    "x": np.random.rand(1, 3, 256, 256).astype(np.float32),
}

# Interactive alternative, currently disabled in favour of the defaults:
#initial_input_data = prepare_initial_input_data(single_onnx_model_path, default_input_data)
initial_input_data = default_input_data

# Perform inference using a single ONNX model
output_single = ModelInference.infer_single_onnx_model(single_onnx_model_path,
                                                       initial_input_data)
print("Single model inference completed!")

# Retrieve all output names from the single model
output_names_list = list(output_single.keys())

# Perform inference using multiple split subgraph models
output_multiple = model_inference.inference(initial_input_data, output_names_list)
print("Multiple subgraph inference completed!")

print("Comparing inference results between single ONNX model and multiple subgraphs...")
compare_results(output_single, output_multiple)
/**
 * @brief Builds the per-subgraph parameters of a cut instruction and opens
 *        "<device>CutInstruction.txt" for output.
 *
 * For every subgraph the input names, output names and input shapes are joined
 * into quoted, ';'-separated strings (shape dimensions are space-separated).
 *
 * NOTE(review): in the code visible here none of the assembled strings is
 * written to outFile; the file is created and closed empty. Confirm whether the
 * write step is still to be added.
 * NOTE(review): the template arguments of the container types were lost in this
 * view of the file; the declarations are reproduced as found.
 *
 * @param [in] Subgraphs          Subgraphs to describe; only its size is used.
 * @param [in] device             Prefix for the output file and dataset names.
 * @param [in] subgraphs_inputs   Per-subgraph input tensors (name and shape are read).
 * @param [in] subgraphs_outputs  Per-subgraph output tensors (name is read).
 */
void Device::GenerateCutInstruction(std::vector &Subgraphs, std::string device,
                                    std::vector> &subgraphs_inputs,
                                    std::vector> &subgraphs_outputs)
{
  std::cout << "Generate Cut Instruction for Target_NPU" << std::endl;
  // open file
  std::string file_name = device + "CutInstruction.txt";
  std::ofstream outFile(file_name);
  if (!outFile.is_open())
  {
    std::cerr << "Error opening file." << std::endl;
    // NOTE(review): terminates with status 0 although this is a failure path.
    exit(0);
  }
  for (size_t i = 0; i < Subgraphs.size(); i++)
  {
    // default parameters
    // onnxFile is not declared in this function - presumably a Device member; confirm.
    std::string modelFile = onnxFile;
    std::string dataScaleDiv = "255";
    std::string postprocess = "save_and_top5";

    // Copies of the i-th input / output sets.
    std::unordered_set graphInputs = subgraphs_inputs[i];
    std::unordered_set graphOutputs = subgraphs_outputs[i];

    // "name1;name2;..." wrapped in double quotes
    std::string inputName = "\"";
    for (const auto &input : graphInputs)
    {
      inputName = inputName + input.name + ";";
    }
    // delete last semicolon
    if (!inputName.empty() && inputName.back() == ';')
    {
      inputName.pop_back();
    }
    inputName = inputName + "\"";
    std::string outputName = "\"";
    for (const auto &output : graphOutputs)
    {
      outputName = outputName + output.name + ";";
    }
    // delete last semicolon
    if (!outputName.empty() && outputName.back() == ';')
    {
      outputName.pop_back();
    }
    outputName = outputName + "\"";

    // "d0 d1 ...;d0 d1 ..." wrapped in double quotes; iterates the same set as
    // inputName, so shapes appear in the same order as the names.
    std::string inputShape = "\"";
    for (const auto &input : graphInputs)
    {
      for (const auto &dim : input.shape)
      {
        inputShape = inputShape + std::to_string(dim) + " ";
      }
      // delete last space
      if (!inputShape.empty() && inputShape.back() == ' ')
      {
        inputShape.pop_back();
      }
      inputShape = inputShape + ";";
    }
    // delete last semicolon
    if (!inputShape.empty() && inputShape.back() == ';')
    {
      inputShape.pop_back();
    }
    inputShape = inputShape + "\"";

    std::string calibrateDataset = device + "_Subgraphs_" + std::to_string(i) + ".npz";
    std::string quantizationScheme = "int8_asym";
  }

  outFile.close();
}
/**
 * @brief Collects every initializer of the graph as a NodeTensor (name and dims).
 *
 * NOTE(review): the template arguments of the container types were lost in this
 * view of the file; the declarations are reproduced as found.
 *
 * @param [in] graph ONNX graph whose initializer list is read.
 * @return Set with one NodeTensor per initializer; shape holds the initializer's
 *         dims in declaration order.
 */
std::unordered_set getInitializer(const onnx::GraphProto &graph)
{
  std::unordered_set initializerNames;
  for (const auto &initializer : graph.initializer())
  {
    NodeTensor nt;
    nt.name = initializer.name();
    // Copy the dims one by one into the tensor's shape.
    std::vector shape;
    for (const auto &dim : initializer.dims())
    {
      shape.push_back(dim);
    }
    nt.shape = shape;
    initializerNames.insert(nt);
  }
  return initializerNames;
}
/**
 * @brief Finds a NodeTensor with the specified name in the given set of NodeTensors.
 *
 * The lookup is a linear scan comparing only the name field; it works on any
 * tensor set, not just initializers.
 *
 * @param [in] name The name of the NodeTensor to find.
 * @param [in] tensors The set of NodeTensors to search within.
 * @pre The tensors set should be valid and contain NodeTensor objects.
 * @post None
 * @exception None
 * @return An iterator to the found NodeTensor if it exists, otherwise an iterator to the end of
 * the set. The iterator refers to tensors, so compare it against tensors.end().
 */
std::unordered_set::const_iterator
isInputFromInitializer(const std::string &name, const std::unordered_set &tensors)
{
  return std::find_if(tensors.begin(), tensors.end(),
                      [&](const NodeTensor &tensor) { return tensor.name == name; });
}
allnodeOutputs.end(), input) == allnodeOutputs.end()) + { + auto iter = isInputFromInitializer(input, initializerNames); + NodeTensor nt; + nt.name = input; + if (iter != initializerNames.end()) + { + graphInputs.insert(*iter); + } + } + } + } +} + +void determineGraphOutput(const onnx::GraphProto &originalGraph, const onnx::GraphProto &g, + std::vector> &allgraphInputs_1, + std::vector> &allgraphInputs_2, + std::unordered_set &graphOutputs) +{ + auto allgraphInputs = allgraphInputs_1; + allgraphInputs.insert(allgraphInputs.end(), allgraphInputs_2.begin(), allgraphInputs_2.end()); + for (const auto &node : g.node()) + { + const auto &outputs = node.output(); + for (const auto &output : outputs) + { + int flag = 0; + for (auto value_info : originalGraph.output()) + { + if (value_info.name() == output) + { + NodeTensor nt; + nt.name = value_info.name(); + std::cout << nt.name << std::endl; + std::vector shape; + for (const auto &dim : value_info.type().tensor_type().shape().dim()) + { + shape.push_back(dim.dim_value()); + } + nt.shape = shape; + graphOutputs.insert(nt); + flag = 1; + break; + } + } + if (flag) + { + continue; + } + for (size_t i = 0; i < allgraphInputs.size(); i++) + { + for (auto &input : allgraphInputs[i]) + { + if (input.name == output) + { + graphOutputs.insert(input); + flag = 1; + break; + } + } + if (flag) + { + break; + } + } + } + } +} +std::string findInputNode(const onnx::GraphProto &g, const std::string &outputTensorName) +{ + std::string node_name = ""; + for (const auto &node : g.node()) + { + for (const auto &output : node.output()) + { + if (output == outputTensorName) + { + node_name = node.name(); + } + } + } + return node_name; +} + +std::unordered_set collectNodeNames(const onnx::GraphProto &graph) +{ + std::unordered_set nodeNames; + for (const auto &node : graph.node()) + { + nodeNames.insert(node.name()); + } + return nodeNames; +} + +void mergeGraphs(onnx::GraphProto &targetGraph, onnx::GraphProto &sourceGraph) +{ + std::cout 
<< "size before merged: " << targetGraph.node_size() << "+" << sourceGraph.node_size() + << std::endl; + int size_before = targetGraph.node_size() + sourceGraph.node_size(); + for (const auto &node : sourceGraph.node()) + { + *targetGraph.add_node() = node; + } + std::cout << "size after merged: " << targetGraph.node_size() << std::endl; + if (size_before != targetGraph.node_size()) + { + std::cout << "error in mergeGraphs" << std::endl; + std::exit(0); + } +} + +onnx::GraphProto Graph::GetGraphFromOnnx(std::string &path) +{ + std::ifstream input(path, std::ios::ate | std::ios::binary); + if (!input.is_open()) + { + std::cout << "Error: Failed to open file: " << path << std::endl; + exit(0); + } + onnx::ModelProto model; + // get current position in file + std::streamsize size = input.tellg(); + // move to start of file + input.seekg(0, std::ios::beg); + // read raw data + std::vector buffer(size); + input.read(buffer.data(), size); + model.ParseFromArray(buffer.data(), size); // parse protobuf + return model.graph(); +} diff --git a/tools/onnx-subgraph/src/lib/partition.cpp b/tools/onnx-subgraph/src/lib/partition.cpp new file mode 100644 index 00000000000..e91c86401c5 --- /dev/null +++ b/tools/onnx-subgraph/src/lib/partition.cpp @@ -0,0 +1,2977 @@ +/* + * Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * Prints the subgraph information of an ONNX model to specified files.
 *
 * Each subgraph becomes one line of the form " subgraph<id>:" followed by its
 * node names separated by spaces. The id keeps counting across both files, so
 * the first entry of the second file continues after the last id of the first.
 *
 * NOTE(review): node_sum is accumulated but never read, and "before:" is
 * printed to stdout between the two files - both look like debugging leftovers.
 * NOTE(review): the template arguments of the vector types were lost in this
 * view of the file; the declarations are reproduced as found.
 *
 * @param Subgraphs A vector containing subgraph information.
 * @param subgraph_file_name The filename for the output of subgraph information.
 * @param otherSubgraphs A vector containing other subgraph information.
 * @param other_subgraph_file_name The filename for the output of other subgraph information.
 */
void print_subgraphs(std::vector Subgraphs, char *subgraph_file_name,
                     std::vector otherSubgraphs, char *other_subgraph_file_name)
{
  int node_sum = 0;
  std::ofstream outFile(subgraph_file_name);
  if (!outFile.is_open())
  {
    std::cerr << "Error opening file." << std::endl;
    // NOTE(review): terminates with status 0 although this is a failure path.
    exit(0);
  }
  // Running subgraph id shared by both output files.
  int id = 0;
  for (const auto &vec : Subgraphs)
  {
    outFile << " subgraph" << id << ":";
    for (const auto &node : vec.node())
    {
      outFile << node.name() << " ";
    }
    id++;
    outFile << std::endl;
    node_sum += vec.node_size();
  }
  std::ofstream outFile_2(other_subgraph_file_name);
  if (!outFile_2.is_open())
  {
    std::cerr << "Error opening file." << std::endl;
    exit(0);
  }
  std::cout << "before:" << std::endl;
  for (const auto &vec : otherSubgraphs)
  {
    outFile_2 << " subgraph" << id << ":";
    for (const auto &node : vec.node())
    {
      outFile_2 << node.name() << " ";
    }
    id++;
    outFile_2 << std::endl;
    node_sum += vec.node_size();
  }
}
/**
 * @brief Constructs an adjacency list representation of the ONNX graph.
 *
 * For every node, the indices of all nodes that consume at least one of its
 * outputs are recorded once each, in ascending index order per output. Every
 * output is compared against the inputs of every node, so the cost grows with
 * the square of the node count.
 *
 * NOTE(review): names are compared as plain strings. If the model uses empty
 * names for omitted optional inputs/outputs, an empty output would match an
 * empty input and create a false edge - confirm whether such models occur.
 * NOTE(review): the template argument of the returned vector type was lost in
 * this view of the file; the declarations are reproduced as found.
 *
 * @param [in] g A const reference to an ONNX GraphProto object that contains the graph
 * structure.
 * @param [in,out] visited A pointer to an integer array used to mark whether nodes have been
 * visited.
 * @pre The 'visited' array should be pre-allocated with a size at least equal to the number
 * of nodes in the graph.
 * @post The 'visited' array will be initialized to 0 for all nodes.
 * @exception None
 * @return A vector of graph_adjacency_node objects representing the adjacency list of the graph.
 */
std::vector get_adjancency_list(const onnx::GraphProto &g, int *visited)
{
  std::vector adjacency_list;
  int node_index = 0;
  for (const auto &node : g.node())
  {
    // Reset the visited flag as a side effect of building the list.
    visited[node_index] = 0;
    graph_adjacency_node ad_node;
    ad_node.index = node_index;
    ad_node.name = node.name();
    const auto &outputs = node.output();
    for (const auto &output : outputs)
    {
      // Index of the candidate consumer in g.node().
      int output_node_index = 0;
      for (const auto &output_node : g.node())
      {
        // 1 when output_node reads `output` through any of its inputs.
        int find_flag = 0;
        const auto &inputs = output_node.input();
        for (const auto &input : inputs)
        {
          if (output == input)
          {
            find_flag = 1;
            break;
          }
        }
        if (find_flag == 1)
        {
          // Record each consumer only once, even if it reads several outputs.
          if (std::find(ad_node.output_node_index.begin(), ad_node.output_node_index.end(),
                        output_node_index) == ad_node.output_node_index.end())
          {
            ad_node.output_node_index.push_back(output_node_index);
          }
        }
        output_node_index++;
      }
    }
    node_index++;
    adjacency_list.push_back(ad_node);
  }
  return adjacency_list;
}
+ */ +float calculate_node_size(const onnx::GraphProto &g, int node_index) // unit : KB +{ + int64_t node_size = 0; + for (int i = 0; i < g.node(node_index).input_size(); i++) + { + std::string input_name = g.node(node_index).input(i); + for (int j = 0; j < g.initializer_size(); j++) + { + if (g.initializer(j).name() == input_name) + { + int64_t node_init_size = 4; + for (int k = 0; k < g.initializer(j).dims().size(); k++) + { + node_init_size = g.initializer(j).dims(k) * node_init_size; + } + node_size += node_init_size; + break; + } + } + } + return float(node_size * 1.0 / 1024.0); +} +/** + * @brief Depth-First Search (DFS) to build a NPU subgraph. + * + * @param [in] onnx_graph Input ONNX graph structure. + * @param [out] onnx_subgraph Output subgraph. + * @param [in,out] subgraph_node_indices Vector storing indices of nodes in the subgraph. + * @param [in,out] visited Array recording whether nodes have been visited. + * @param [in] start_node Current starting node for the search. + * @param [in] current_node_index Index of the current node. + * @param [in] adjacency_list Adjacency list representing connections between nodes in the + * graph. + * @param [in] supported_op_types List of supported operation types. + * @param [in] preferred_op_types List of preferred operation types (not used in the code). + * @param [in] current_depth Current depth of the search. + * @param [in,out] current_graph_size Current size of the subgraph. + * @param [in] max_graph_size Maximum allowed size of the subgraph. + * @pre `current_node_index` should be a valid node index. + * @post If the subgraph size exceeds `max_graph_size`, a warning message is printed. 
/**
 * @brief Depth-First Search (DFS) to build a NPU subgraph.
 *
 * The start node is appended to the subgraph unconditionally; recursion then
 * continues into each unvisited successor whose op type is in support_op,
 * as long as the depth stays below MAX_DEPTH and the accumulated size is below
 * max_graph_size.
 *
 * NOTE(review): the template arguments of the vector parameters were lost in
 * this view of the file; the declarations are reproduced as found.
 *
 * @param [in] g Input ONNX graph structure.
 * @param [out] subgraph Output subgraph; visited nodes are appended to it.
 * @param [in,out] sugraph_node_index Vector storing indices of nodes in the subgraph.
 * @param [in,out] visited Array recording whether nodes have been visited.
 * @param [in] start_node Node to add in this call.
 * @param [in] node_index Index of start_node in g.
 * @param [in] adjacency_list Adjacency list representing connections between nodes in the
 * graph.
 * @param [in] support_op List of supported operation types.
 * @param [in] prefer_op List of preferred operation types (only passed through to the
 * recursive call).
 * @param [in] depth_in Current depth of the search.
 * @param [in,out] graph_size Current size of the subgraph; the node's size is added.
 * @param [in] max_graph_size Maximum allowed size of the subgraph.
 * @pre `node_index` should be a valid node index.
 * @post If the subgraph size exceeds `max_graph_size`, a warning message is printed.
 * @exception None
 */
void DFS(const onnx::GraphProto &g, onnx::GraphProto &subgraph,
         std::vector &sugraph_node_index, int *visited, const onnx::NodeProto &start_node,
         int node_index, std::vector &adjacency_list,
         const std::vector &support_op, const std::vector &prefer_op,
         int depth_in, float &graph_size, float max_graph_size)
{
  int depth_out = depth_in + 1;
  *subgraph.add_node() = start_node;
  visited[node_index] = 1;
  sugraph_node_index.push_back(node_index);
  float node_size = calculate_node_size(g, node_index);
  graph_size += node_size;
  if (graph_size > max_graph_size)
  {
    // Warning only: the node has already been added to the subgraph.
    std::cout << "graph size exceed max size!" << graph_size << " " << max_graph_size << std::endl;
  }
  for (int i = 0; i < int(adjacency_list[node_index].output_node_index.size()); i++)
  {
    // NOTE(review): prints the index of the third and later successors followed
    // by "->" with no newline - looks like a debugging trace; confirm it is wanted.
    if (i > 1)
    {
      std::cout << adjacency_list[node_index].output_node_index[i] << "->";
    }
    //
    int next_node_index = adjacency_list[node_index].output_node_index[i];
    const auto &next_node = g.node(next_node_index);
    if (!visited[next_node_index] &&
        (std::find(support_op.begin(), support_op.end(), next_node.op_type()) !=
         support_op.end()) &&
        (depth_out < MAX_DEPTH) && (graph_size < max_graph_size)) // adjacent node not yet visited and with a supported op_type
      DFS(g, subgraph, sugraph_node_index, visited, next_node, next_node_index, adjacency_list,
          support_op, prefer_op, depth_out, graph_size, max_graph_size);
  }
}
+ * @param [in] adjacency_list The adjacency list representing the graph's structure. + * @param [in] depth_in The current depth of the DFS. + * @param [in,out] graph_size The cumulative size of the nodes in the subgraph. + * @param [in] max_graph_size The maximum allowed size for the subgraph. + * + * @pre The graph `g` and `adjacency_list` should be properly initialized. + * @pre The `visited` array should be initialized to zero. + * @pre `graph_size` should be initialized to zero before the first call to this function. + * + * @post The `subgraph` will contain the nodes visited during the DFS. + * @post The `subgraph_node_indices` will contain the indices of the nodes in the subgraph. + * @post The `visited` array will reflect the nodes that have been visited. + * @post The `graph_size` will reflect the cumulative size of the nodes in the subgraph. + * + * @exception None + * + * @return None + */ +void DFS_other(const onnx::GraphProto &g, onnx::GraphProto &subgraph, + std::vector &sugraph_node_index, int *visited, + const onnx::NodeProto &start_node, int node_index, + std::vector &adjacency_list, int depth_in, float &graph_size, + float max_graph_size) +{ + int depth_out = depth_in + 1; + *subgraph.add_node() = start_node; + visited[node_index] = 1; + sugraph_node_index.push_back(node_index); + float node_size = calculate_node_size(g, node_index); + graph_size += node_size; + if (graph_size > max_graph_size) + { + std::cout << "graph size exceed max size!" 
<< graph_size << " " << max_graph_size << std::endl; + } + for (int i = 0; i < int(adjacency_list[node_index].output_node_index.size()); i++) + { + int next_node_index = adjacency_list[node_index].output_node_index[i]; + const auto &next_node = g.node(next_node_index); + if (!visited[next_node_index] && (depth_out < MAX_DEPTH) && + (graph_size < max_graph_size)) // do deep first search for each successor node + DFS_other(g, subgraph, sugraph_node_index, visited, next_node, next_node_index, + adjacency_list, depth_out, graph_size, max_graph_size); + } +} + +/** + * @brief Determine and partition subgraphs from the given ONNX graph based on DFS strategy. + * Compared with determine_subgraphs_v2, this function is more stable but may produce more subgraphs + * + * @param [in] g The original ONNX graph to be partitioned. + * @param [out] otherSubgraphs A vector to store the subgraphs that do not meet the preferred + * operation criteria. + * @param [in] d The device object containing information about supported and preferred + * operations. + * @param [in,out] visited An array to keep track of visited nodes. + * @param [in] adjacency_list The adjacency list representing the graph's structure. + * @param [in] strategy The partitioning strategy to be used (e.g., SPILTE_CPU_STRUCTURE_FIRST, + * SPILTE_NPU_STRUCTURE_FIRST). + * + * @pre The graph `g` and `adjacency_list` should be properly initialized. + * @pre The `visited` array should be initialized to zero. + * @pre The device object `d` should be properly initialized with support and preferred + * operations. + * + * @post The `otherSubgraphs` vector will contain subgraphs that do not meet the preferred + * operation criteria. + * @post The `visited` array will reflect the nodes that have been visited. 
+ * + * @exception None + * + * @return None + */ +void determine_subgraphs(const onnx::GraphProto &g, std::vector &otherSubgraphs, + Device &d, int *visited, std::vector &adjacency_list, + PartitionStrategy strategy) +{ + int max_subgraph_size = d.max_subgraph_size; + std::vector support_op; + std::vector prefer_op; + switch (strategy) + { + case SPILTE_CPU_STRUCTURE_FIRST: + { + support_op = d.getCPUSupportOp(); + break; + } + case SPILTE_NPU_STRUCTURE_FIRST: + { + support_op = d.getNPUSupportOp(); + prefer_op = d.getNPUPreferOp(); + break; + } + default: + break; + } + for (int i = 0; i < g.node_size(); i++) + { + if (!visited[i] && + (std::find(support_op.begin(), support_op.end(), g.node(i).op_type()) != support_op.end())) + { + onnx::GraphProto subgraph; + std::vector sugraph_node_index; + const auto &node = g.node(i); + int depth = 0; + float graph_size = 0; + DFS(g, subgraph, sugraph_node_index, visited, node, i, adjacency_list, support_op, prefer_op, + depth, graph_size, max_subgraph_size); + std::cout << "graph_size: " << graph_size << std::endl; + int find_prefer_op = 0; + for (const auto &node : subgraph.node()) + { + if (std::find(prefer_op.begin(), prefer_op.end(), node.op_type()) != prefer_op.end()) + { + find_prefer_op = 1; + } + } + if (find_prefer_op) + { + Subgraphs.push_back(subgraph); + } + else + { + for (const auto &index : sugraph_node_index) + { + visited[index] = 0; + } + } + } + } + for (int i = 0; i < g.node_size(); i++) + { + if (!visited[i]) + { + int depth = 0; + float graph_size = 0; + onnx::GraphProto subgraph; + std::vector sugraph_node_index; + const auto &node = g.node(i); + DFS_other(g, subgraph, sugraph_node_index, visited, node, i, adjacency_list, depth, + graph_size, max_subgraph_size); + std::cout << "graph_size:" << graph_size << std::endl; + otherSubgraphs.push_back(subgraph); + } + } +} + +/** + * @brief Determine and partition subgraphs from the given ONNX graph using the index of nodes, + * compared with 
determine_subgraphs, this function may produce less subgraphs but some of them may + * be not fully connected(connectivity of subgrpahs will not affect the inference procedure of + * subgraphs) This function specifically handles the partitioning logic for NPU support and + * preferred operations. + * + * @param [in] g The original ONNX graph to be partitioned. + * @param [out] otherSubgraphs A vector to store the subgraphs that do not meet the preferred + * operation criteria. + * @param [in] d The device object containing information about supported and preferred + * operations. + * @param [in,out] visited An array to keep track of visited nodes. + * @param [in] adjacency_list The adjacency list representing the graph's structure. + * @param [in] strategy The partitioning strategy to be used (e.g., SPILTE_CPU_STRUCTURE_FIRST, + * SPILTE_NPU_STRUCTURE_FIRST). + * + * @pre The graph `g` and `adjacency_list` should be properly initialized. + * @pre The `visited` array should be initialized to zero. + * @pre The device object `d` should be properly initialized with support and preferred + * operations. + * + * @post The `otherSubgraphs` vector will contain subgraphs that do not meet the preferred + * operation criteria. + * @post The `visited` array will reflect the nodes that have been visited. 
+ * + * @exception None + * + * @return None + */ +void determine_subgraphs_v2(const onnx::GraphProto &g, + std::vector &otherSubgraphs, Device &d, int *visited, + std::vector &adjacency_list, + PartitionStrategy strategy) +{ + float max_subgraph_size = d.max_subgraph_size; + std::vector support_op; + std::vector prefer_op; + support_op = d.getNPUSupportOp(); + prefer_op = d.getNPUPreferOp(); + onnx::GraphProto temp_graph; + int end_flag = 0; + int node_count = 0; + float temp_graph_size = 0; + while (!end_flag) + { + float node_size = calculate_node_size(g, node_count); + if (temp_graph.node_size() != 0) + { + if ((std::find(support_op.begin(), support_op.end(), g.node(node_count).op_type()) != + support_op.end()) && + temp_graph.node_size() <= max_subgraph_size) + { + *temp_graph.add_node() = g.node(node_count); + temp_graph_size += node_size; + if (temp_graph_size > max_subgraph_size) + { + std::cout << "graph size exceed max size!" << temp_graph_size << " " << max_subgraph_size + << std::endl; + } + visited[node_count] = 1; + } + else + { + int find_preferop = 0; + for (int j = 0; j < temp_graph.node_size(); j++) + { + if (std::find(prefer_op.begin(), prefer_op.end(), temp_graph.node(j).op_type()) != + prefer_op.end()) + { + find_preferop = 1; + break; + } + } + if (find_preferop == 1) + { + Subgraphs.push_back(temp_graph); + } + else + { + for (int k = 1; k <= temp_graph.node_size(); k++) + { + visited[node_count - k] = 0; + } + } + temp_graph.Clear(); + temp_graph_size = 0; + if (std::find(support_op.begin(), support_op.end(), g.node(node_count).op_type()) != + support_op.end()) + { + *temp_graph.add_node() = g.node(node_count); + temp_graph_size += node_size; + visited[node_count] = 1; + continue; + } + } + } + else + { + if (std::find(support_op.begin(), support_op.end(), g.node(node_count).op_type()) != + support_op.end()) + { + *temp_graph.add_node() = g.node(node_count); + temp_graph_size += node_size; + if (temp_graph_size > max_subgraph_size) + { + 
std::cout << "graph size exceed max size!" << temp_graph_size << " " << max_subgraph_size + << std::endl; + } + visited[node_count] = 1; + } + } + node_count++; + if (node_count == g.node_size()) + { + end_flag = 1; + if (temp_graph.node_size() != 0) + { + Subgraphs.push_back(temp_graph); + } + } + } + onnx::GraphProto temp_graph2; + float temp_graph_size2 = 0; + for (int i = 0; i < g.node_size(); i++) + { + float node_size = calculate_node_size(g, i); + if (visited[i] == 0 && temp_graph_size2 < max_subgraph_size) + { + *temp_graph2.add_node() = g.node(i); + temp_graph_size2 += node_size; + } + else + { + std::cout << "i = " << i << " temp_graph_size2: " << temp_graph_size2 << std::endl; + if (temp_graph2.node_size() != 0) + { + otherSubgraphs.push_back(temp_graph2); + temp_graph_size2 = 0; + temp_graph2.Clear(); + } + if (visited[i] == 0) + { + *temp_graph2.add_node() = g.node(i); + temp_graph_size2 += node_size; + continue; + } + } + if (i == g.node_size() - 1) + { + if (temp_graph2.node_size() != 0) + { + otherSubgraphs.push_back(temp_graph2); + temp_graph2.Clear(); + } + } + } +} +/** + * @brief Perform Tarjan's algorithm to find all strongly connected components in a directed + * graph. This function uses depth-first search (DFS) to identify and group nodes into strongly + * connected components. + * + * @param [in] index The current node index being visited. + * @param [in] depth The current depth in the DFS traversal. + * @param [out] strongly_connected_subgraphs A vector to store the identified strongly connected + * components. + * @param [in,out] DFN An array to store the discovery time of each node. + * @param [in,out] LOW An array to store the lowest discovery time reachable from each node. + * @param [in,out] stack_subgraphs A stack to keep track of nodes in the current DFS path. + * @param [in] successors_Subgraphs A vector of vectors representing the adjacency list of the + * graph. 
+ * + * @pre The `DFN` and `LOW` arrays should be initialized to zero. + * @pre The `stack_subgraphs` should be empty before the first call to this function. + * @pre The `successors_Subgraphs` should be properly initialized with the graph's adjacency + * list. + * + * @post The `strongly_connected_subgraphs` vector will contain all the strongly connected + * components found in the graph. + * @post The `DFN` and `LOW` arrays will reflect the discovery times and lowest reachable + * discovery times for each node. + * @post The `stack_subgraphs` will be empty after the function completes. + * + * @exception None + * + * @return None + */ +void Tarjan(int index, int depth, std::vector> &strongly_connected_subgraphs, + int *DFN, int *LOW, std::vector &stack_subgraphs, + std::vector> &successors_Subgraphs) +{ + int rank = depth + 1; + DFN[index] = LOW[index] = rank; // initialize DFN and LOW to 0 + stack_subgraphs.push_back(index); + for (const auto &successor : successors_Subgraphs[index]) + { + if (DFN[successor] == 0) // the successor is not visited + { + Tarjan(successor, rank, strongly_connected_subgraphs, DFN, LOW, stack_subgraphs, + successors_Subgraphs); // visit successor + LOW[index] = std::min(LOW[index], LOW[successor]); + } + else if (std::find(stack_subgraphs.begin(), stack_subgraphs.end(), successor) != + stack_subgraphs.end()) + { + LOW[index] = std::min(LOW[index], DFN[successor]); + } + } + if (LOW[index] == DFN[index]) // if this node is the smallest root of the strongly connected + // component subtree, then subsequent nodes are popped out of the + // stack and the obtained strongly connected components are saved. 
+ { + auto it = stack_subgraphs.end() - 1; + std::vector strongly_connected; + while (*it != index) + { + strongly_connected.insert(strongly_connected.begin(), *it); + stack_subgraphs.pop_back(); + it = stack_subgraphs.end() - 1; + } + strongly_connected.insert(strongly_connected.begin(), *it); + + if (strongly_connected.size() > 1) + { + strongly_connected_subgraphs.push_back(strongly_connected); + } + stack_subgraphs.pop_back(); // pop + } +} +/** + * @brief Calculate the rank of each node in the merged graph formed by the given strongly + * connected components. The rank is determined based on the topological order of the nodes. + * + * @param [in] strongly_connected A vector containing indices of strongly connected components. + * @param [in] Subgraphs A vector of ONNX GraphProtos representing the main subgraphs. + * @param [in] otherSubgraphs A vector of ONNX GraphProtos representing additional subgraphs. + * + * @pre The `strongly_connected` vector should contain valid indices for `Subgraphs` and + * `otherSubgraphs`. + * @pre The `Subgraphs` and `otherSubgraphs` vectors should be properly initialized with ONNX + * GraphProtos. + * + * @post The `node_rank_list` vector will contain the nodes of the merged graph with their + * respective ranks. + * + * @exception None + * + * @return A vector of `graph_adjacency_node` structures containing the nodes and their ranks. 
+ */ +std::vector calculate_node_rank(std::vector &strongly_connected, + std::vector &Subgraphs, + std::vector &otherSubgraphs) +{ + onnx::GraphProto merged_graph; + std::vector node_rank_list; + for (const auto &index : strongly_connected) + { + if (index < int(Subgraphs.size())) + { + mergeGraphs(merged_graph, Subgraphs[index]); + } + else + { + mergeGraphs(merged_graph, otherSubgraphs[index - Subgraphs.size()]); + } + } + int index = 0; + for (const auto &node : merged_graph.node()) + { + graph_adjacency_node node_rank; + node_rank.name = node.name(); + node_rank.index = index; + node_rank.rank = -1; + node_rank_list.push_back(node_rank); + index++; + } + int sort_count = 0; + int finished_flag = 0; + while (!finished_flag) + { + finished_flag = 1; + if (sort_count == 0) + { + for (int i = 0; i < merged_graph.node_size(); i++) // Traverse all nodes + { + int find_flag = 0; + for (const auto &input : merged_graph.node(i).input()) + { + for (int j = 0; j < merged_graph.node_size(); j++) + { + for (const auto &output : merged_graph.node(j).output()) + { + if (input == output) + { + find_flag = 1; + break; + } + } + if (find_flag) + { + break; + } + } + if (find_flag) + { + break; + } + } + if (!find_flag) + { + node_rank_list[i].rank = sort_count; + } + } + finished_flag = 0; + } + else + { + for (int i = 0; i < merged_graph.node_size(); i++) + { + int find_flag = 0; + if (node_rank_list[i].rank >= 0 && node_rank_list[i].rank < sort_count) + { + continue; + } ////If it has already been sorted, skip this subgraph + for (const auto &input : + merged_graph.node(i).input()) ////traveres all inputs of this subgraph + { + for (int j = 0; j < merged_graph.node_size(); + j++) ////examint if the input is the output of j th subgraph + { + for (const auto &output : merged_graph.node(j).output()) + { + if (output == input) + { + if ((node_rank_list[j].rank < 0 || + node_rank_list[j].rank >= sort_count)) // the j th subgraph has not been sorted + { + find_flag = 1; + break; + } 
+ } + } + if (find_flag) + { + break; + } + } + if (find_flag) + { + break; + } + } + if (!find_flag) + { + node_rank_list[i].rank = sort_count; + } + else + { + node_rank_list[i].rank = sort_count + 1; + finished_flag = 0; + } + } + } + sort_count++; + } + return node_rank_list; +} +/** + * @brief Calculate the rank of each node in the merged graph formed by the given strongly + * connected components. The rank is determined based on the topological order of the nodes. + * Compared with calculate_node_rank, this function has different input parameters. + * + * @param [in] strongly_connected A vector containing indices of strongly connected components. + * @param [in] Subgraphs A vector of ONNX GraphProtos representing the main subgraphs. + * @param [in] otherSubgraphs A vector of ONNX GraphProtos representing additional subgraphs. + * @param [in] subgraph_size The size of the Subgraphs vector. + * @param [in] other_subgraph_size The size of the otherSubgraphs vector. + * + * @pre The `strongly_connected` vector should contain valid indices for `Subgraphs` and + * `otherSubgraphs`. + * @pre The `Subgraphs` and `otherSubgraphs` vectors should be properly initialized with ONNX + * GraphProtos. + * @pre `subgraph_size` should be equal to the size of the `Subgraphs` vector. + * @pre `other_subgraph_size` should be equal to the size of the `otherSubgraphs` vector. + * + * @post The `node_rank_list` vector will contain the nodes of the merged graph with their + * respective ranks. + * + * @exception None + * + * @return A vector of `graph_adjacency_node` structures containing the nodes and their ranks. 
+ */ +std::vector calculate_node_rank_v2( + std::vector &strongly_connected, std::vector &Subgraphs, + std::vector &otherSubgraphs, int subgraph_size, int other_subgraph_size) +{ + onnx::GraphProto merged_graph; + std::vector node_rank_list; + for (const auto &index : strongly_connected) + { + if (index < subgraph_size) + { + mergeGraphs(merged_graph, Subgraphs[index]); + } + else + { + mergeGraphs(merged_graph, otherSubgraphs[index - subgraph_size]); + } + } + int index = 0; + for (const auto &node : merged_graph.node()) + { + graph_adjacency_node node_rank; + node_rank.name = node.name(); + node_rank.index = index; + node_rank.rank = -1; + node_rank_list.push_back(node_rank); + index++; + } + int sort_count = 0; + int finished_flag = 0; + while (!finished_flag) + { + finished_flag = 1; + if (sort_count == 0) + { + for (int i = 0; i < merged_graph.node_size(); i++) // traverse all nodes + { + int find_flag = 0; + for (const auto &input : merged_graph.node(i).input()) + { + for (int j = 0; j < merged_graph.node_size(); j++) + { + for (const auto &output : merged_graph.node(j).output()) + { + if (input == output) + { + find_flag = 1; + break; + } + } + if (find_flag) + { + break; + } + } + if (find_flag) + { + break; + } + } + if (!find_flag) + { + node_rank_list[i].rank = sort_count; + } + } + finished_flag = 0; + } + else + { + for (int i = 0; i < merged_graph.node_size(); i++) + { + int find_flag = 0; + if (node_rank_list[i].rank >= 0 && node_rank_list[i].rank < sort_count) + { + continue; + } + for (const auto &input : + merged_graph.node(i).input()) ////traverses all inputs of this subgraph + { + for (int j = 0; j < merged_graph.node_size(); + j++) /// examint if the input is the output of j th subgraph + { + for (const auto &output : merged_graph.node(j).output()) + { + if (output == input) + { + if ((node_rank_list[j].rank < 0 || + node_rank_list[j].rank >= sort_count)) // the j th subgraph has not been sorted + { + find_flag = 1; + break; + } + } + } + if 
(find_flag) + { + break; + } + } + if (find_flag) + { + break; + } + } + if (!find_flag) + { + node_rank_list[i].rank = sort_count; + } + else + { + node_rank_list[i].rank = sort_count + 1; + finished_flag = 0; + } + } + } + sort_count++; + } + return node_rank_list; +} +/** + * @brief Calculate the rank of each node in the given merged ONNX graph. + * The rank is determined based on the topological order of the nodes. + * This function is only used to calculate the rank of the nodes in a single graph, + * especially the original graph + * + * @param [in] merged_graph The ONNX GraphProto representing the merged graph. + * @param [out] node_rank_list A vector of `graph_adjacency_node` structures to store the nodes + * and their ranks. + * + * @pre The `merged_graph` should be a valid ONNX GraphProto. + * @pre The `node_rank_list` should be an empty vector or properly initialized. + * + * @post The `node_rank_list` vector will contain the nodes of the merged graph with their + * respective ranks. 
+ * + * @exception None + * + * @return None + */ +void calculate_node_rank_v3(const onnx::GraphProto &merged_graph, + std::vector &node_rank_list) +{ + int index = 0; + for (const auto &node : merged_graph.node()) + { + graph_adjacency_node node_rank; + node_rank.name = node.name(); + node_rank.index = index; + node_rank.rank = -1; + node_rank_list.push_back(node_rank); + index++; + } + int sort_count = 0; + int finished_flag = 0; + while (!finished_flag) + { + finished_flag = 1; + if (sort_count == 0) + { + for (int i = 0; i < merged_graph.node_size(); i++) // traverse all nodes + { + int find_flag = 0; + for (const auto &input : merged_graph.node(i).input()) + { + for (int j = 0; j < merged_graph.node_size(); j++) + { + for (const auto &output : merged_graph.node(j).output()) + { + if (input == output) + { + find_flag = 1; + break; + } + } + if (find_flag) + { + break; + } + } + if (find_flag) + { + break; + } + } + if (!find_flag) + { + node_rank_list[i].rank = sort_count; + } + } + finished_flag = 0; + } + else + { + for (int i = 0; i < merged_graph.node_size(); i++) + { + int find_flag = 0; + if (node_rank_list[i].rank >= 0 && node_rank_list[i].rank < sort_count) + { + continue; + } + for (const auto &input : + merged_graph.node(i).input()) ////traverses all inputs of this subgraph + { + for (int j = 0; j < merged_graph.node_size(); + j++) /// examint if the input is the output of j th subgraph + { + for (const auto &output : merged_graph.node(j).output()) + { + if (output == input) + { + if ((node_rank_list[j].rank < 0 || + node_rank_list[j].rank >= sort_count)) // the j th subgraph has not been sorted + { + find_flag = 1; + break; + } + } + } + if (find_flag) + { + break; + } + } + if (find_flag) + { + break; + } + } + if (!find_flag) + { + node_rank_list[i].rank = sort_count; + } + else + { + node_rank_list[i].rank = sort_count + 1; + finished_flag = 0; + } + } + } + sort_count++; + } +} +/** + * @brief Determine the cut ranks in the given list of SCC 
(Strongly Connected Component) node + * ranks. A cut rank is defined as a rank where no node exists, but there is at least one node at + * the next rank. + * + * @param [in] scc_node_rank A vector of `graph_adjacency_node` structures representing the + * nodes and their ranks. + * + * @pre The `scc_node_rank` vector should be properly initialized and contain valid node + * ranks. + * + * @post The function does not modify the `scc_node_rank` vector. + * + * @exception None + * + * @return A vector of integers representing the cut ranks. + */ +std::vector get_cut_rank_v2(std::vector &scc_node_rank) +{ + std::vector cut_rank_list; + int min_cut_rank = -1; + int max_rank = 0; + // get min + for (int i = 0; i < int(scc_node_rank.size()); i++) + { + if (scc_node_rank[i].rank < min_cut_rank || min_cut_rank < 0) + { + min_cut_rank = scc_node_rank[i].rank; + } + if (scc_node_rank[i].rank > max_rank) + { + max_rank = scc_node_rank[i].rank; + } + } + int find_flag = 1; + while (find_flag) + { + min_cut_rank++; + int temp_find_flag = 0; + for (int i = 0; i < int(scc_node_rank.size()); i++) + { + if (scc_node_rank[i].rank == min_cut_rank) + { + temp_find_flag = 1; + break; + } + } + find_flag = temp_find_flag; + } + cut_rank_list.push_back(min_cut_rank); + int cut_rank = min_cut_rank; + while (cut_rank < max_rank) + { + cut_rank = cut_rank + 1; + int rank_flag = 0; + int rank_plus_flag = 0; + for (int i = 0; i < int(scc_node_rank.size()); i++) + { + if (scc_node_rank[i].rank == cut_rank) + { + rank_flag = 1; + } + else if (scc_node_rank[i].rank == cut_rank + 1) + { + rank_plus_flag = 1; + } + } + if (rank_flag == 0 && rank_plus_flag == 1) + { + cut_rank_list.push_back(cut_rank + 1); + } + } + + return cut_rank_list; +} +/** + * @brief Eliminate strongly connected components in the graph and partition them into subgraphs + * based on node ranks. + * + * @param [in] strongly_connected_subgraphs List of indices representing strongly connected + * components. 
+ * @param [in,out] Subgraphs List of subgraphs that will be updated. + * @param [in,out] otherSubgraphs List of other subgraphs that will be updated. + * @param [in] g The original graph from which strongly connected components are derived. + * @pre The input graph `g` should be properly initialized and contain nodes. + * @post The `Subgraphs` and `otherSubgraphs` lists may be modified with new partitions based + * on node ranks. + * @exception None + * @return None + */ +void eliminate_scc_v2(std::vector> &strongly_connected_subgraphs, + std::vector &Subgraphs, + std::vector &otherSubgraphs, const onnx::GraphProto &g) +{ + int subgraph_size = Subgraphs.size(); + std::vector node_rank_list; + calculate_node_rank_v3(g, node_rank_list); + for (auto &strongly_connected : strongly_connected_subgraphs) + for (const auto scc_index : strongly_connected) + { + onnx::GraphProto scc_graph; + if (scc_index < subgraph_size) + { + scc_graph = Subgraphs[scc_index]; + } + else + { + scc_graph = otherSubgraphs[scc_index - subgraph_size]; + } + std::vector scc_node_rank; + for (int i = 0; i < scc_graph.node_size(); i++) + { + for (int j = 0; j < int(node_rank_list.size()); j++) + { + if (scc_graph.node(i).name() == node_rank_list[j].name) + { + scc_node_rank.push_back(node_rank_list[j]); + break; + } + } + } + std::vector cut_rank = get_cut_rank_v2(scc_node_rank); + onnx::GraphProto temp_graph_upper; + int node_in_upper = 0; + for (int i = 0; i < scc_graph.node_size(); i++) + { + if (scc_node_rank[i].rank < cut_rank[0]) + { + node_in_upper++; + } + } + int node_in_upper_added = 0; + std::vector temp_graph_upper_adder_list; + int record_i = 0; + std::cout << "node size: " << scc_graph.node_size() << std::endl; + std::cout << "node in upper: " << node_in_upper << std::endl; + while (node_in_upper_added < node_in_upper) + { + onnx::GraphProto temp_graph_upper_adder; + for (int i = record_i; i < scc_graph.node_size(); i++) + { + int i_minus_1 = 0; + if (i == 0) + { + i_minus_1 = 0; + 
} + else + { + i_minus_1 = i - 1; + } + if (scc_node_rank[i].rank < cut_rank[0] && + (i == record_i || (scc_node_rank[i].rank == scc_node_rank[i_minus_1].rank + 1))) + { + *temp_graph_upper_adder.add_node() = scc_graph.node(i); + node_in_upper_added++; + } + else + { + if (scc_node_rank[i].rank >= cut_rank[0]) + { + record_i = i + 1; + } + else + { + record_i = i; + } + if (temp_graph_upper_adder.node_size() > 0) + { + temp_graph_upper_adder_list.push_back(temp_graph_upper_adder); + temp_graph_upper_adder.clear_node(); + } + break; + } + if (i == scc_graph.node_size() - 1 && temp_graph_upper_adder.node_size() > 0) + { + temp_graph_upper_adder_list.push_back(temp_graph_upper_adder); + temp_graph_upper_adder.clear_node(); + } + } + std::cout << "loop ended:temp graph upper adder size: " + << temp_graph_upper_adder.node_size() << " " << record_i << "/" + << scc_graph.node_size() << " node_in_upper_added:" << node_in_upper_added + << std::endl; + } + if (scc_index < subgraph_size) + { + Subgraphs[scc_index] = temp_graph_upper_adder_list[0]; + } + else + { + otherSubgraphs[scc_index - subgraph_size] = temp_graph_upper_adder_list[0]; + } + + if (temp_graph_upper_adder_list.size() > 1) + { + for (int i = 1; i < int(temp_graph_upper_adder_list.size()); i++) + { + if (scc_index < subgraph_size) + { + Subgraphs.push_back(temp_graph_upper_adder_list[i]); + } + else + { + otherSubgraphs.push_back(temp_graph_upper_adder_list[i]); + } + } + } + std::cout << "scc index" << scc_index << " scc size: " << scc_graph.node_size() << std::endl; + std::cout << "scc node rank: "; + for (int i = 0; i < scc_graph.node_size(); i++) + { + std::cout << scc_node_rank[i].name << " " << scc_node_rank[i].rank << " "; + } + std::cout << std::endl; + for (int i = 0; i < int(cut_rank.size()) - 1; i++) + { + onnx::GraphProto temp_graph_lower; + for (int j = 0; j < scc_graph.node_size(); j++) + { + if (scc_node_rank[j].rank >= cut_rank[i] && scc_node_rank[j].rank < cut_rank[i + 1]) + { + 
*temp_graph_lower.add_node() = scc_graph.node(j); + } + } + if (scc_index < subgraph_size) + { + if (temp_graph_lower.node_size() > 0) + { + Subgraphs.push_back(temp_graph_lower); + } + } + else + { + if (temp_graph_lower.node_size() > 0) + { + otherSubgraphs.push_back(temp_graph_lower); + } + } + } + onnx::GraphProto temp_graph_lower; + for (int j = 0; j < scc_graph.node_size(); j++) + { + if (scc_node_rank[j].rank >= cut_rank[cut_rank.size() - 1]) + { + *temp_graph_lower.add_node() = scc_graph.node(j); + } + } + if (scc_index < subgraph_size) + { + if (temp_graph_lower.node_size() > 0) + { + Subgraphs.push_back(temp_graph_lower); + } + } + else + { + if (temp_graph_lower.node_size() > 0) + { + otherSubgraphs.push_back(temp_graph_lower); + } + } + } + for (int i = Subgraphs.size() - 1; i >= 0; i--) + { + if (Subgraphs[i].node_size() == 0) + { + Subgraphs.erase(Subgraphs.begin() + i); + } + } + for (int i = otherSubgraphs.size() - 1; i >= 0; i--) + { + if (otherSubgraphs[i].node_size() == 0) + { + otherSubgraphs.erase(otherSubgraphs.begin() + i); + } + } +} +/** + * @brief Eliminate strongly connected components in the graph and partition them into + * individual subgraphs. + * + * @param [in] strongly_connected_subgraphs List of indices representing strongly connected + * components. + * @param [in,out] Subgraphs List of subgraphs that will be updated. + * @param [in,out] otherSubgraphs List of other subgraphs that will be updated. + * @param [in] g The original graph from which strongly connected components are derived. + * @pre The input graph `g` should be properly initialized and contain nodes. + * @post The `Subgraphs` and `otherSubgraphs` lists will be updated with individual nodes from + * each strongly connected component. 
+ * @exception None + * @return None + */ +void eliminate_scc_v3(std::vector> &strongly_connected_subgraphs, + std::vector &Subgraphs, + std::vector &otherSubgraphs, const onnx::GraphProto &g) +{ + int subgraph_size = Subgraphs.size(); + for (int i = 0; i < int(strongly_connected_subgraphs.size()); i++) + { + for (const auto scc_index : strongly_connected_subgraphs[i]) + { + std::cout << "scc index: " << scc_index << std::endl; + onnx::GraphProto scc_graph; + if (scc_index < subgraph_size) + { + scc_graph = Subgraphs[scc_index]; + } + else + { + scc_graph = otherSubgraphs[scc_index - subgraph_size]; + } + for (int j = 0; j < scc_graph.node_size(); j++) + { + onnx::GraphProto graph_temp; + *graph_temp.add_node() = scc_graph.node(j); + if (j == 0) + { + if (scc_index < subgraph_size) + { + Subgraphs[scc_index] = graph_temp; + } + else + { + otherSubgraphs[scc_index - subgraph_size] = graph_temp; + } + } + else + { + if (scc_index < subgraph_size) + { + Subgraphs.push_back(graph_temp); + } + else + { + otherSubgraphs.push_back(graph_temp); + } + } + } + } + } + for (int i = Subgraphs.size() - 1; i >= 0; i--) + { + if (Subgraphs[i].node_size() == 0) + { + Subgraphs.erase(Subgraphs.begin() + i); + } + } + for (int i = otherSubgraphs.size() - 1; i >= 0; i--) + { + if (otherSubgraphs[i].node_size() == 0) + { + otherSubgraphs.erase(otherSubgraphs.begin() + i); + } + } +} +/** + * @brief Determine the graph type based on the given index and return the corresponding graph. + * + * @param [in] index The index of the graph to determine. + * @param [in] Subgraphs List of subgraphs. + * @param [in] otherSubgraphs List of other subgraphs. + * @param [in] subgraph_size The size of the Subgraphs list. + * @pre The `index` should be a valid index within the combined range of `Subgraphs` and + * `otherSubgraphs`. + * @post None + * @exception None + * @return The graph corresponding to the given index. 
+ */ +onnx::GraphProto determinegraphtype_v2(int index, std::vector &Subgraphs, + std::vector &otherSubgraphs, + int subgraph_size) +{ + if (index < subgraph_size) + { + return Subgraphs[index]; + } + else + { + return otherSubgraphs[index - subgraph_size]; + } +} +/** + * @brief Find pairs of strongly connected subgraphs based on input and output tensors. + * + * @param [in] strongly_connected_subgraphs List of strongly connected subgraphs. + * @param [in] Subgraphs List of subgraphs. + * @param [in] otherSubgraphs List of other subgraphs. + * @param [in] graphs_inputs List of input tensors for each graph. + * @param [in] graphs_outputs List of output tensors for each graph. + * @param [out] sccs_pairs List of pairs of strongly connected subgraphs. + * @pre The input lists should be properly initialized and contain valid data. + * @post The `sccs_pairs` list will contain pairs of indices representing connected subgraphs. + * @exception None + * @return None + */ +void find_subgraph_pair_v2(std::vector> &strongly_connected_subgraphs, + std::vector &Subgraphs, + std::vector &otherSubgraphs, + std::vector> &graphs_inputs, + std::vector> &graphs_outputs, + std::vector>> &sccs_pairs) +{ + int count = 0; + for (const auto &strongly_connected : strongly_connected_subgraphs) + { + std::vector scc_graphs; + std::vector> scc_graphs_inputs; + std::vector> scc_graphs_outputs; + for (const auto &index : strongly_connected) + { + std::unordered_set graph_inputs = graphs_inputs[index]; + std::unordered_set graph_outputs = graphs_outputs[index]; + scc_graphs_inputs.push_back(graph_inputs); + scc_graphs_outputs.push_back(graph_outputs); + } + std::vector> scc_pairs; + std::vector is_pushed; + for (int j = 0; j < int(strongly_connected.size()); j++) + { + is_pushed.push_back(0); + } + for (int i = 0; i < int(strongly_connected.size()); i++) + { + for (const auto &graph_input : scc_graphs_inputs[i]) + { + for (int j = i + 1; j < int(strongly_connected.size()); j++) + { + std::vector 
scc_pair; + if (scc_graphs_outputs[j].find(graph_input) != scc_graphs_outputs[j].end() && + is_pushed[j] == 0) + { + for (const auto &graph_output : scc_graphs_outputs[i]) + { + if (scc_graphs_inputs[j].find(graph_output) != scc_graphs_inputs[j].end()) + { + scc_pair.push_back(strongly_connected[i]); + scc_pair.push_back(strongly_connected[j]); + scc_pairs.push_back(scc_pair); + is_pushed[j] = 1; + is_pushed[i] = 1; + break; + } + } + } + if (is_pushed[i] == 1) + { + break; + } + } + if (is_pushed[i] == 1) + { + break; + } + } + } + if (scc_pairs.size() != 0) + { + sccs_pairs.push_back(scc_pairs); + } + count++; + } + for (const auto &scc_pairs : sccs_pairs) + { + std::cout << "scc pair:"; + for (const auto &scc_pair : scc_pairs) + { + + for (const auto &scc_id : scc_pair) + { + std::cout << scc_id << " "; + } + std::cout << ";"; + } + std::cout << std::endl; + } +} +/** + * @brief Cut a pair of subgraphs into upper and lower parts based on node rank. + * + * @param [in] Subgraphs List of subgraphs. + * @param [in] otherSubgraphs List of other subgraphs. + * @param [in] graphs_inputs List of input tensors for each graph. + * @param [in] graphs_outputs List of output tensors for each graph. + * @param [in] scc_pair Pair of subgraph indices to be cut. + * @param [out] scc_pair_cut List of cut subgraphs (upper and lower parts of master graph and + * slave graph). + * @param [in] subgraph_size Size of subgraph. + * @pre The input lists should be properly initialized and contain valid data. + * @post The `scc_pair_cut` list will contain the cut subgraphs. + * @exception None + * @return A vector containing the index of the master graph and the cut rank. 
+ */ +std::vector cut_pair(std::vector &Subgraphs, + std::vector &otherSubgraphs, + std::vector> &graphs_inputs, + std::vector> &graphs_outputs, + std::vector &scc_pair, std::vector &scc_pair_cut, + int subgraph_size) +{ + std::vector pair_node_list = + calculate_node_rank(scc_pair, Subgraphs, otherSubgraphs); + int master_graph = 0; + for (const auto &node : pair_node_list) + { + if (node.rank == 0) + { + int find_flag = -1; + onnx::GraphProto graph_temp = + determinegraphtype_v2(scc_pair[0], Subgraphs, otherSubgraphs, subgraph_size); + for (const auto &graph_node : graph_temp.node()) + { + if (graph_node.name() == node.name) + { + find_flag = 1; + break; + } + } + if (find_flag == 1) + { + master_graph = 0; + break; + } + else + { + master_graph = 1; + break; + } + } + } + int slave_graph = 1 - master_graph; + // find the position where master and slave graph connect + int cut_rank = -1; + for (const auto &output : graphs_outputs[scc_pair[slave_graph]]) + { + for (const auto &input : graphs_inputs[scc_pair[master_graph]]) + { + + if (input.name == output.name) + { + int node_index = 0; + onnx::GraphProto graph_temp = + determinegraphtype_v2(scc_pair[slave_graph], Subgraphs, otherSubgraphs, subgraph_size); + for (const auto &graph_node : graph_temp.node()) + { + int update_node_rank = 0; + for (const auto &output_node : graph_node.output()) + { + if (output_node == output.name) + { + if (slave_graph == 0) + { + if (cut_rank == -1 || cut_rank > pair_node_list[node_index].rank) + { + cut_rank = pair_node_list[node_index].rank; + } + } + else + { + onnx::GraphProto graph_temp_1 = determinegraphtype_v2( + scc_pair[master_graph], Subgraphs, otherSubgraphs, subgraph_size); + if (cut_rank == -1 || + cut_rank > pair_node_list[node_index + graph_temp_1.node_size()].rank) + { + cut_rank = pair_node_list[node_index + graph_temp_1.node_size()].rank; + } + } + update_node_rank = 1; + break; + } + } + if (update_node_rank == 1) + { + break; + } + node_index++; + } + break; + } 
+ } + } + // cut master graph according to the rank + onnx::GraphProto master_upper; + onnx::GraphProto master_lower; + int node_index = 0; + onnx::GraphProto graph_temp = + determinegraphtype_v2(scc_pair[master_graph], Subgraphs, otherSubgraphs, subgraph_size); + for (const auto &node : graph_temp.node()) + { + int node_rank; + if (master_graph == 0) + { + node_rank = pair_node_list[node_index].rank; + } + else + { + onnx::GraphProto graph_temp_2 = + determinegraphtype_v2(scc_pair[slave_graph], Subgraphs, otherSubgraphs, subgraph_size); + node_rank = pair_node_list[node_index + graph_temp_2.node_size()].rank; + } + if (node_rank < cut_rank) + { + *master_upper.add_node() = node; + } + else + { + *master_lower.add_node() = node; + } + node_index++; + } + scc_pair_cut.push_back(master_upper); + scc_pair_cut.push_back(master_lower); + scc_pair_cut.push_back( + determinegraphtype_v2(scc_pair[slave_graph], Subgraphs, otherSubgraphs, subgraph_size)); + if (master_graph == 1) + { + int temp = scc_pair[0]; + scc_pair[0] = scc_pair[1]; + scc_pair[1] = temp; + master_graph = 0; + } // assure the first graph is master + std::vector return_value; + return_value.push_back(master_graph); + return_value.push_back(cut_rank); + return return_value; +} +/** + * @brief Eliminate pairs of subgraphs by cutting them and updating the subgraph lists. + * + * @param [in,out] Subgraphs List of subgraphs to be processed and updated. + * @param [in,out] otherSubgraphs List of other subgraphs to be processed and updated. + * @param [in] graphs_inputs List of input tensors for each graph. + * @param [in] graphs_outputs List of output tensors for each graph. + * @param [in] strongly_connected_subgraphs List of strongly connected subgraphs. + * @param [in] subgraph_size Size of subgraph. + * @pre The input lists should be properly initialized and contain valid data. + * @post The `Subgraphs` and `otherSubgraphs` lists will be updated with cut subgraphs. 
+ * @exception None + * @return None + */ +void eliminate_pair_v2(std::vector &Subgraphs, + std::vector &otherSubgraphs, + std::vector> &graphs_inputs, + std::vector> &graphs_outputs, + std::vector> &strongly_connected_subgraphs, + int subgraph_size) +{ + int original_node_size = 0; + for (auto &subgraph : Subgraphs) + { + original_node_size += subgraph.node_size(); + } + for (auto &subgraph : otherSubgraphs) + { + original_node_size += subgraph.node_size(); + } + std::vector>> sccs_pairs; + find_subgraph_pair_v2(strongly_connected_subgraphs, Subgraphs, otherSubgraphs, graphs_inputs, + graphs_outputs, sccs_pairs); + for (auto &scc_pairs : sccs_pairs) + { + for (auto &scc_pair : scc_pairs) + { + std::vector scc_pair_cut; + cut_pair(Subgraphs, otherSubgraphs, graphs_inputs, graphs_outputs, scc_pair, scc_pair_cut, + subgraph_size); + if (scc_pair[0] < subgraph_size) + { + Subgraphs[scc_pair[0]] = scc_pair_cut[0]; + Subgraphs.push_back(scc_pair_cut[1]); + } + else + { + otherSubgraphs[scc_pair[0] - subgraph_size] = scc_pair_cut[0]; + otherSubgraphs.push_back(scc_pair_cut[1]); + } + + if (scc_pair[1] < subgraph_size) + { + Subgraphs[scc_pair[1]] = scc_pair_cut[2]; + } + else + { + otherSubgraphs[scc_pair[1] - subgraph_size] = scc_pair_cut[2]; + } + } + } + for (int i = Subgraphs.size() - 1; i >= 0; i--) + { + if (Subgraphs[i].node_size() == 0) + { + Subgraphs.erase(Subgraphs.begin() + i); + } + } + for (int i = otherSubgraphs.size() - 1; i >= 0; i--) + { + if (otherSubgraphs[i].node_size() == 0) + { + otherSubgraphs.erase(otherSubgraphs.begin() + i); + } + } +} +/** + * @brief Find the successor or predecessor subgraph with the least number of nodes. + * + * @param [in] index Index of the current subgraph. + * @param [in] successor List of successor indices. + * @param [in] predecessor List of predecessor indices. + * @param [in] Subgraphs List of subgraphs. + * @param [in] otherSubgraphs List of other subgraphs. 
+ * @pre The input lists should be properly initialized and contain valid data. + * @post None + * @exception None + * @return Index of the successor or predecessor subgraph with the least number of nodes, or -1 + * if no such subgraph exists. + */ +int find_min_size(int index, std::vector &successor, std::vector &predecessor, + std::vector &Subgraphs, + std::vector + &otherSubgraphs) // find the successor or predecessor with the least nodes +{ + std::vector size_list; + int min_index = -1; + int min_size = 10000; + for (int i = 0; i < int(successor.size()); i++) + { + std::cout << "successor: " << successor[i]; + onnx::GraphProto tempgraph; + if ((successor[i] < int(Subgraphs.size()) && index < int(Subgraphs.size())) || + (successor[i] >= int(Subgraphs.size()) && index >= int(Subgraphs.size()))) + { + if (successor[i] < int(Subgraphs.size())) + { + tempgraph = Subgraphs[successor[i]]; + } + else + { + tempgraph = otherSubgraphs[successor[i] - int(Subgraphs.size())]; + } + } + else + { + continue; + } + int size = int(tempgraph.node_size()); + std::cout << " size:" << size << " min:" << min_size; + if (size < min_size && size != 1) + { + min_size = size; + min_index = successor[i]; + std::cout << " update min index:" << min_index; + } + std::cout << std::endl; + } + for (int i = 0; i < int(predecessor.size()); i++) + { + std::cout << "predecessor: " << predecessor[i]; + onnx::GraphProto tempgraph; + if ((predecessor[i] < int(Subgraphs.size()) && index < int(Subgraphs.size())) || + (predecessor[i] >= int(Subgraphs.size()) && index >= int(Subgraphs.size()))) + { + if (predecessor[i] < int(Subgraphs.size())) + { + tempgraph = Subgraphs[predecessor[i]]; + } + else + { + tempgraph = otherSubgraphs[predecessor[i] - int(Subgraphs.size())]; + } + } + else + { + continue; + } + int size = int(tempgraph.node_size()); + std::cout << " size:" << size << " min:" << min_size; + if (size < min_size && size != 1) + { + min_size = size; + min_index = predecessor[i]; + std::cout << 
" update min index:" << min_index; + } + std::cout << std::endl; + } + return min_index; +} +void Partition::PartitionGraph(const onnx::GraphProto &g, Device &d, PartitionStrategy strategy, + const std::unordered_map &node_io_size) +{ + std::unordered_set IOvalueNames = getIOvalue(g); + int *visited = (int *)malloc(g.node_size() * sizeof(int)); + std::vector adjacency_list = get_adjancency_list(g, visited); + std::vector otherSubgraphs; + determine_subgraphs_v2(g, otherSubgraphs, d, visited, adjacency_list, strategy); + std::cout << "Partition Done" << std::endl; + free(visited); + std::vector().swap(adjacency_list); + int node_sum = 0; + // traverse the structures and print each element + std::ofstream outFile("./subgraphs_1.txt"); + if (!outFile.is_open()) + { + std::cerr << "Error opening file." << std::endl; + exit(0); + } + int id = 0; + for (const auto &vec : Subgraphs) + { + outFile << " subgraph" << id << ":"; + for (const auto &node : vec.node()) + { + outFile << node.name() << " "; + } + id++; + outFile << std::endl; + node_sum += vec.node_size(); + } + int id_record = id; + std::ofstream outFile_2("./subgraphs_2.txt"); + if (!outFile_2.is_open()) + { + std::cerr << "Error opening file." 
<< std::endl; + exit(0); + } + std::cout << "before:" << std::endl; + for (const auto &vec : otherSubgraphs) + { + outFile_2 << " subgraph" << id << ":"; + for (const auto &node : vec.node()) + { + outFile_2 << node.name() << " "; + } + id++; + outFile_2 << std::endl; + node_sum += vec.node_size(); + } + std::vector> subgraphs_2_input_nodes_; + std::vector> subgraphs_2_nodes_; + for (const auto &sg : otherSubgraphs) + { + std::unordered_set graphInputs; + determineGraphInput(sg, IOvalueNames, graphInputs); + std::unordered_set graphInputsNodes; + for (const auto &input : graphInputs) + { + auto nodename = findInputNode(g, input.name); + if (nodename != "") + { + graphInputsNodes.insert(nodename); + } + } + subgraphs_2_input_nodes_.push_back(graphInputsNodes); + subgraphs_2_nodes_.push_back(collectNodeNames(sg)); + } + int *is_merged = (int *)malloc(otherSubgraphs.size() * sizeof(int)); + for (int i = 0; i < int(otherSubgraphs.size()); i++) + { + is_merged[i] = 0; + } + std::cout << "graph size after merging:" << otherSubgraphs.size() << std::endl; + free(is_merged); + std::ofstream outFile_3("./subgraphs_3.txt"); + if (!outFile_3.is_open()) + { + std::cerr << "Error opening file." 
<< std::endl; + exit(0); + } + ////othersubgraphs after merged + for (const auto &vec : otherSubgraphs) + { + outFile_3 << " subgraph" << id_record << ":"; + for (const auto &node : vec.node()) + { + outFile_3 << node.name() << " "; + } + id_record++; + outFile_3 << std::endl; + } + std::cout << "sub node size:" << node_sum << std::endl; + + std::vector> subgraphs_1_inputs; + std::vector> subgraphs_1_input_nodes; + std::vector> subgraphs_1_nodes; + for (const auto &sg : Subgraphs) + { + std::unordered_set graphInputs; + determineGraphInput(sg, IOvalueNames, graphInputs); + subgraphs_1_inputs.push_back(graphInputs); + std::unordered_set graphInputsNodes; + for (const auto &input : graphInputs) + { + auto nodename = findInputNode(g, input.name); + if (nodename != "") + { + graphInputsNodes.insert(nodename); + } + } + subgraphs_1_input_nodes.push_back(graphInputsNodes); + subgraphs_1_nodes.push_back(collectNodeNames(sg)); + } + + std::vector> subgraphs_2_inputs; + std::vector> subgraphs_2_input_nodes; + std::vector> subgraphs_2_nodes; + for (const auto &sg : otherSubgraphs) + { + std::unordered_set graphInputs; + determineGraphInput(sg, IOvalueNames, graphInputs); + subgraphs_2_inputs.push_back(graphInputs); + std::unordered_set graphInputsNodes; + for (const auto &input : graphInputs) + { + auto nodename = findInputNode(g, input.name); + if (nodename != "") + { + graphInputsNodes.insert(nodename); + } + } + subgraphs_2_input_nodes.push_back(graphInputsNodes); + subgraphs_2_nodes.push_back(collectNodeNames(sg)); + } + std::vector> subgraphs_1_outputs; + + int node_number = 0; + + for (const auto &sg : Subgraphs) + { + std::unordered_set graphOutputs; + node_number += sg.node_size(); + determineGraphOutput(g, sg, subgraphs_1_inputs, subgraphs_2_inputs, graphOutputs); + subgraphs_1_outputs.push_back(graphOutputs); + } + std::vector> subgraphs_2_outputs; + for (const auto &sg : otherSubgraphs) + { + std::unordered_set graphOutputs; + node_number += sg.node_size(); + 
determineGraphOutput(g, sg, subgraphs_1_inputs, subgraphs_2_inputs, graphOutputs); + subgraphs_2_outputs.push_back(graphOutputs); + } + int graph_node_size_minus_constant = g.node_size(); + for (const auto &node : g.node()) + { + if (node.op_type() == "Constant") + { + graph_node_size_minus_constant--; + } + } + std::cout << "total number of nodes in subgraphs:" << node_number << std::endl; + std::cout << "total number of nodes in origional graph:" << graph_node_size_minus_constant + << std::endl; + std::vector> graphs_inputs; + graphs_inputs.insert(graphs_inputs.end(), subgraphs_1_inputs.begin(), subgraphs_1_inputs.end()); + graphs_inputs.insert(graphs_inputs.end(), subgraphs_2_inputs.begin(), subgraphs_2_inputs.end()); + std::vector> graphs_outputs; + graphs_outputs.insert(graphs_outputs.end(), subgraphs_1_outputs.begin(), + subgraphs_1_outputs.end()); + graphs_outputs.insert(graphs_outputs.end(), subgraphs_2_outputs.begin(), + subgraphs_2_outputs.end()); + + std::vector> predecessors_Subgraphs(graphs_inputs.size()); + std::vector> successors_Subgraphs(graphs_inputs.size()); + for (int i = 0; i < int(graphs_inputs.size()); i++) // traversal all subgraphs + { + std::vector predecessors; + for (const auto &g_input : graphs_inputs[i]) + { + for (int j = 0; j < int(graphs_outputs.size()); j++) + { + if ((graphs_outputs[j].find(g_input) != graphs_outputs[j].end())) + { + predecessors.push_back(j); + } + } + } + if (predecessors.size() == 0) + { + std::cout << "subgraph " << i << " has no predecessors" << std::endl; + } + predecessors_Subgraphs[i].insert(predecessors_Subgraphs[i].end(), predecessors.begin(), + predecessors.end()); + } + for (int i = 0; i < int(graphs_inputs.size()); i++) + { + for (int j = 0; j < int(graphs_inputs.size()); j++) + { + if (find(predecessors_Subgraphs[j].begin(), predecessors_Subgraphs[j].end(), i) != + predecessors_Subgraphs[j].end()) + { + successors_Subgraphs[i].push_back(j); + } + } + } + std::vector> strongly_connected_subgraphs; + 
int *DFN = (int *)malloc(graphs_inputs.size() * sizeof(int)); + int *LOW = (int *)malloc(graphs_inputs.size() * sizeof(int)); + for (int i = 0; i < int(graphs_inputs.size()); i++) + { + DFN[i] = 0; + LOW[i] = 0; + } + for (int temp_count = 0; temp_count < int(predecessors_Subgraphs.size()); temp_count++) + { + if (DFN[temp_count] == 0) + { + std::vector stack_subgraphs; + int depth = 0; + Tarjan(temp_count, depth, strongly_connected_subgraphs, DFN, LOW, stack_subgraphs, + successors_Subgraphs); + } + } + + std::string file_name_scc = "scc.txt"; + std::ofstream outfile_scc(file_name_scc); + outfile_scc << strongly_connected_subgraphs.size() << std::endl; + for (const auto &scc : strongly_connected_subgraphs) + { + std::cout << "scc:"; + outfile_scc << "scc: "; + for (const auto &scc_id : scc) + { + outfile_scc << scc_id << " "; + } + outfile_scc << std::endl; + for (const auto &scc_id : scc) + { + std::cout << scc_id << " "; + outfile_scc << "subgraph" << scc_id << " input:"; + for (const auto &scc_input : graphs_inputs[scc_id]) + { + outfile_scc << scc_input.name << ";"; + } + outfile_scc << " output:"; + for (const auto &scc_output : graphs_outputs[scc_id]) + { + outfile_scc << scc_output.name << ";"; + } + outfile_scc << std::endl; + } + + std::cout << std::endl; + } + outfile_scc.close(); + free(DFN); + free(LOW); + int node_num_all = 0; + for (const auto &sg : Subgraphs) + { + node_num_all += sg.node_size(); + } + for (const auto &sg : otherSubgraphs) + { + node_num_all += sg.node_size(); + } + std::cout << "node num in original graph: " << g.node_size() << std::endl; + std::cout << "node_num after cut " << node_num_all << std::endl; + ///////////////////////+++ + int *DFN_ = (int *)malloc(graphs_inputs.size() * sizeof(int)); + int *LOW_ = (int *)malloc(graphs_inputs.size() * sizeof(int)); + for (int i = 0; i < int(graphs_inputs.size()); i++) + { + DFN_[i] = 0; + LOW_[i] = 0; + } + for (int temp_count = 0; temp_count < int(predecessors_Subgraphs.size()); 
temp_count++) + { + if (DFN_[temp_count] == 0) + { + std::vector stack_subgraphs; + int depth = 0; + Tarjan(temp_count, depth, strongly_connected_subgraphs, DFN_, LOW_, stack_subgraphs, + successors_Subgraphs); + } + } + free(DFN_); + free(LOW_); + eliminate_scc_v2(strongly_connected_subgraphs, Subgraphs, otherSubgraphs, g); + ///////////////////// + strongly_connected_subgraphs.clear(); + predecessors_Subgraphs.clear(); + successors_Subgraphs.clear(); + std::vector>().swap(subgraphs_2_inputs); + std::vector>().swap(subgraphs_1_inputs); + std::vector>().swap(subgraphs_2_outputs); + std::vector>().swap(subgraphs_1_outputs); + std::vector>().swap(graphs_inputs); + std::vector>().swap(graphs_outputs); + for (const auto &sg : Subgraphs) + { + std::unordered_set graphInputs; + determineGraphInput(sg, IOvalueNames, graphInputs); + subgraphs_1_inputs.push_back(graphInputs); + } + for (const auto &sg : otherSubgraphs) + { + std::unordered_set graphInputs; + determineGraphInput(sg, IOvalueNames, graphInputs); + subgraphs_2_inputs.push_back(graphInputs); + } + node_number = 0; + for (const auto &sg : Subgraphs) + { + std::unordered_set graphOutputs; + node_number += sg.node_size(); + determineGraphOutput(g, sg, subgraphs_1_inputs, subgraphs_2_inputs, graphOutputs); + subgraphs_1_outputs.push_back(graphOutputs); + } + for (const auto &sg : otherSubgraphs) + { + std::unordered_set graphOutputs; + node_number += sg.node_size(); + determineGraphOutput(g, sg, subgraphs_1_inputs, subgraphs_2_inputs, graphOutputs); + subgraphs_2_outputs.push_back(graphOutputs); + } + graphs_inputs.insert(graphs_inputs.end(), subgraphs_1_inputs.begin(), subgraphs_1_inputs.end()); + graphs_inputs.insert(graphs_inputs.end(), subgraphs_2_inputs.begin(), subgraphs_2_inputs.end()); + graphs_outputs.insert(graphs_outputs.end(), subgraphs_1_outputs.begin(), + subgraphs_1_outputs.end()); + graphs_outputs.insert(graphs_outputs.end(), subgraphs_2_outputs.begin(), + subgraphs_2_outputs.end()); + for (int i = 
0; i < int(graphs_inputs.size()); i++) + { + std::vector predecessors; + for (const auto &g_input : graphs_inputs[i]) + { + for (int j = 0; j < int(graphs_outputs.size()); j++) + { + if ((graphs_outputs[j].find(g_input) != graphs_outputs[j].end())) + { + predecessors.push_back(j); + } + } + } + predecessors_Subgraphs.push_back(predecessors); + } + for (int i = 0; i < int(graphs_inputs.size()); i++) + { + std::vector temp; + for (int j = 0; j < int(graphs_inputs.size()); j++) + { + if (find(predecessors_Subgraphs[j].begin(), predecessors_Subgraphs[j].end(), i) != + predecessors_Subgraphs[j].end()) + { + temp.push_back(j); + } + } + successors_Subgraphs.push_back(temp); + } + std::string file_name_predecessor_2 = "predecessor_final_2.txt"; + std::string file_name_successor_2 = "successor_final_2.txt"; + std::ofstream outfile_predecessor_2(file_name_predecessor_2); + std::ofstream outfile_successor_2(file_name_successor_2); + if (!(outfile_predecessor_2.is_open() && outfile_successor_2.is_open())) + { + std::cerr << "Error opening file." 
<< std::endl; + exit(0); + } + for (int i = 0; i < int(graphs_inputs.size()); i++) + { + outfile_predecessor_2 << "predecessor of subgraph " << i << ":"; + for (const auto &predecessor : predecessors_Subgraphs[i]) + { + outfile_predecessor_2 << predecessor << ";"; + } + outfile_predecessor_2 << std::endl; + outfile_successor_2 << "successor of subgraph " << i << ":"; + for (const auto &successor : successors_Subgraphs[i]) + { + outfile_successor_2 << successor << ";"; + } + outfile_successor_2 << std::endl; + } + outfile_predecessor_2.close(); + outfile_successor_2.close(); + print_subgraphs(Subgraphs, (char *)"./subgraphs_final_2.txt", otherSubgraphs, + (char *)"./other_subgraphs_final_2.txt"); + int *DFN_2 = (int *)malloc(graphs_inputs.size() * sizeof(int)); + int *LOW_2 = (int *)malloc(graphs_inputs.size() * sizeof(int)); + for (int i = 0; i < int(graphs_inputs.size()); i++) + { + DFN_2[i] = 0; + LOW_2[i] = 0; + } + for (int temp_count = 0; temp_count < int(predecessors_Subgraphs.size()); temp_count++) + { + if (DFN_[temp_count] == 0) + { + std::vector stack_subgraphs; + int depth = 0; + Tarjan(temp_count, depth, strongly_connected_subgraphs, DFN_2, LOW_2, stack_subgraphs, + successors_Subgraphs); + } + } + std::string file_name_scc2 = "scc2.txt"; + std::ofstream outfile_scc2(file_name_scc2); + for (const auto &scc : strongly_connected_subgraphs) + { + std::cout << "scc:"; + outfile_scc2 << "scc: "; + for (const auto &scc_id : scc) + { + outfile_scc2 << scc_id << " "; + } + outfile_scc2 << std::endl; + for (const auto &scc_id : scc) + { + std::cout << scc_id << " "; + outfile_scc2 << "subgraph" << scc_id << " input:"; + for (const auto &scc_input : graphs_inputs[scc_id]) + { + outfile_scc2 << scc_input.name << ";"; + } + outfile_scc2 << " output:"; + for (const auto &scc_output : graphs_outputs[scc_id]) + { + outfile_scc2 << scc_output.name << ";"; + } + outfile_scc2 << std::endl; + } + + std::cout << std::endl; + } + outfile_scc.close(); + free(DFN_2); + 
free(LOW_2); + // eliminate_scc_v2(strongly_connected_subgraphs, Subgraphs, otherSubgraphs, g); + int subgraph_size_2 = Subgraphs.size(); + int other_subgraph_size_2 = otherSubgraphs.size(); + std::vector eliminated_small_graph_id; + std::vector eliminated_small_graph_size; + std::vector eliminated_small_graph_size_2; + std::vector unmerged_graph_id; + for (int i = 0; i < subgraph_size_2 + other_subgraph_size_2; i++) + { + std::cout << "i:" << i << std::endl; + if (i < subgraph_size_2) + { + if (Subgraphs[i].node_size() < 2) + { + int merge_id = find_min_size(i, successors_Subgraphs[i], predecessors_Subgraphs[i], + Subgraphs, otherSubgraphs); + if (merge_id < subgraph_size_2 && merge_id >= 0) + { + mergeGraphs(Subgraphs[merge_id], Subgraphs[i]); + eliminated_small_graph_id.push_back(i); + eliminated_small_graph_size.push_back(Subgraphs[i].node_size()); + std::cout << "eliminating small graph " << i << "and merged to " << merge_id << std::endl; + } + else if (merge_id >= 0) + { + mergeGraphs(otherSubgraphs[merge_id - subgraph_size_2], Subgraphs[i]); + eliminated_small_graph_id.push_back(i); + eliminated_small_graph_size.push_back(Subgraphs[i].node_size()); + std::cout << "eliminating small graph " << i << "and merged to " << merge_id << std::endl; + } + else + { + unmerged_graph_id.push_back(i); + } + } + } + else + { + if (otherSubgraphs[i - subgraph_size_2].node_size() < 2) + { + int merge_id = find_min_size(i, successors_Subgraphs[i], predecessors_Subgraphs[i], + Subgraphs, otherSubgraphs); + if (merge_id < subgraph_size_2 && merge_id >= 0) + { + mergeGraphs(Subgraphs[merge_id], otherSubgraphs[i - subgraph_size_2]); + eliminated_small_graph_id.push_back(i); + eliminated_small_graph_size.push_back(otherSubgraphs[i - subgraph_size_2].node_size()); + std::cout << "eliminating small graph " << i << "and merged to " << merge_id << std::endl; + } + else if (merge_id >= 0) + { + mergeGraphs(otherSubgraphs[merge_id - subgraph_size_2], + otherSubgraphs[i - 
subgraph_size_2]); + eliminated_small_graph_id.push_back(i); + eliminated_small_graph_size.push_back(otherSubgraphs[i - subgraph_size_2].node_size()); + std::cout << "eliminating small graph " << i << "and merged to " << merge_id << std::endl; + } + else + { + unmerged_graph_id.push_back(i); + } + } + } + } + std::cout << "succeed in reaching here" << std::endl; + for (int i = eliminated_small_graph_id.size() - 1; i >= 0; i--) + { + if (std::find(unmerged_graph_id.begin(), unmerged_graph_id.end(), + eliminated_small_graph_id[i]) != unmerged_graph_id.end()) + { + continue; + } + std::cout << eliminated_small_graph_id[i] << " "; + int index = eliminated_small_graph_id[i]; + if (index < subgraph_size_2) + { + if (Subgraphs[index].node_size() > 1) + { + std::cout << "eliminate Subgraphs" << index << " "; + for (auto node : Subgraphs[index].node()) + { + std::cout << node.name() << " "; + } + } + eliminated_small_graph_size_2.push_back(Subgraphs[index].node_size()); + Subgraphs.erase(Subgraphs.begin() + index); + } + else + { + if (otherSubgraphs[index - subgraph_size_2].node_size() > 1) + { + std::cout << "eliminate otherSubgraphs" << index - subgraph_size_2 << " "; + for (auto node : otherSubgraphs[index - subgraph_size_2].node()) + { + std::cout << node.name() << " "; + } + } + eliminated_small_graph_size_2.push_back(otherSubgraphs[index - subgraph_size_2].node_size()); + otherSubgraphs.erase(otherSubgraphs.begin() + index - subgraph_size_2); + } + } + std::cout << std::endl; + std::cout << "eliminated_small_graph_size_1: "; + for (const auto &size : eliminated_small_graph_size) + { + std::cout << size << " "; + } + std::cout << std::endl; + std::cout << "eliminated_small_graph_size_2: "; + for (const auto &size : eliminated_small_graph_size_2) + { + std::cout << size << " "; + } + std::cout << std::endl; + /////////clear + strongly_connected_subgraphs.clear(); + predecessors_Subgraphs.clear(); + successors_Subgraphs.clear(); + 
std::vector>().swap(subgraphs_2_inputs); + std::vector>().swap(subgraphs_1_inputs); + std::vector>().swap(subgraphs_2_outputs); + std::vector>().swap(subgraphs_1_outputs); + std::vector>().swap(graphs_inputs); + std::vector>().swap(graphs_outputs); + for (const auto &sg : Subgraphs) + { + std::unordered_set graphInputs; + determineGraphInput(sg, IOvalueNames, graphInputs); + subgraphs_1_inputs.push_back(graphInputs); + } + for (const auto &sg : otherSubgraphs) + { + std::unordered_set graphInputs; + determineGraphInput(sg, IOvalueNames, graphInputs); + subgraphs_2_inputs.push_back(graphInputs); + } + node_number = 0; + for (const auto &sg : Subgraphs) + { + std::unordered_set graphOutputs; + node_number += sg.node_size(); + determineGraphOutput(g, sg, subgraphs_1_inputs, subgraphs_2_inputs, graphOutputs); + subgraphs_1_outputs.push_back(graphOutputs); + } + for (const auto &sg : otherSubgraphs) + { + std::unordered_set graphOutputs; + node_number += sg.node_size(); + determineGraphOutput(g, sg, subgraphs_1_inputs, subgraphs_2_inputs, graphOutputs); + subgraphs_2_outputs.push_back(graphOutputs); + } + graphs_inputs.insert(graphs_inputs.end(), subgraphs_1_inputs.begin(), subgraphs_1_inputs.end()); + graphs_inputs.insert(graphs_inputs.end(), subgraphs_2_inputs.begin(), subgraphs_2_inputs.end()); + graphs_outputs.insert(graphs_outputs.end(), subgraphs_1_outputs.begin(), + subgraphs_1_outputs.end()); + graphs_outputs.insert(graphs_outputs.end(), subgraphs_2_outputs.begin(), + subgraphs_2_outputs.end()); + for (int i = 0; i < int(graphs_inputs.size()); i++) + { + std::vector predecessors; + for (const auto &g_input : graphs_inputs[i]) + { + for (int j = 0; j < int(graphs_outputs.size()); j++) + { + if ((graphs_outputs[j].find(g_input) != graphs_outputs[j].end())) + { + predecessors.push_back(j); + } + } + } + predecessors_Subgraphs.push_back(predecessors); + } + for (int i = 0; i < int(graphs_inputs.size()); i++) + { + std::vector temp; + for (int j = 0; j < 
int(graphs_inputs.size()); j++) + { + if (find(predecessors_Subgraphs[j].begin(), predecessors_Subgraphs[j].end(), i) != + predecessors_Subgraphs[j].end()) + { + temp.push_back(j); + } + } + successors_Subgraphs.push_back(temp); + } + std::string file_name_predecessor_3 = "predecessor_final_3.txt"; + std::string file_name_successor_3 = "successor_final_3.txt"; + std::ofstream outfile_predecessor_3(file_name_predecessor_3); + std::ofstream outfile_successor_3(file_name_successor_3); + if (!(outfile_predecessor_3.is_open() && outfile_successor_3.is_open())) + { + std::cerr << "Error opening file." << std::endl; + exit(0); + } + for (int i = 0; i < int(graphs_inputs.size()); i++) + { + outfile_predecessor_3 << "predecessor of subgraph " << i << ":"; + for (const auto &predecessor : predecessors_Subgraphs[i]) + { + outfile_predecessor_3 << predecessor << ";"; + } + outfile_predecessor_3 << std::endl; + outfile_successor_3 << "successor of subgraph " << i << ":"; + for (const auto &successor : successors_Subgraphs[i]) + { + outfile_successor_3 << successor << ";"; + } + outfile_successor_3 << std::endl; + } + outfile_predecessor_3.close(); + outfile_successor_3.close(); + print_subgraphs(Subgraphs, (char *)"./subgraphs_final_3.txt", otherSubgraphs, + (char *)"./other_subgraphs_final_3.txt"); + node_num_all = 0; + for (const auto &sg : Subgraphs) + { + node_num_all += sg.node_size(); + } + for (const auto &sg : otherSubgraphs) + { + node_num_all += sg.node_size(); + } + int *DFN_3 = (int *)malloc(graphs_inputs.size() * sizeof(int)); + int *LOW_3 = (int *)malloc(graphs_inputs.size() * sizeof(int)); + for (int i = 0; i < int(graphs_inputs.size()); i++) + { + DFN_3[i] = 0; + LOW_3[i] = 0; + } + for (int temp_count = 0; temp_count < int(predecessors_Subgraphs.size()); temp_count++) + { + if (DFN_[temp_count] == 0) + { + std::vector stack_subgraphs; + int depth = 0; + Tarjan(temp_count, depth, strongly_connected_subgraphs, DFN_3, LOW_3, stack_subgraphs, + 
successors_Subgraphs); + } + } + std::string file_name_scc3 = "scc3.txt"; + std::ofstream outfile_scc3(file_name_scc3); + for (const auto &scc : strongly_connected_subgraphs) + { + std::cout << "scc:"; + outfile_scc3 << "scc: "; + for (const auto &scc_id : scc) + { + outfile_scc3 << scc_id << " "; + } + outfile_scc3 << std::endl; + for (const auto &scc_id : scc) + { + std::cout << scc_id << " "; + outfile_scc3 << "subgraph" << scc_id << " input:"; + for (const auto &scc_input : graphs_inputs[scc_id]) + { + outfile_scc3 << scc_input.name << ";"; + } + outfile_scc3 << " output:"; + for (const auto &scc_output : graphs_outputs[scc_id]) + { + outfile_scc3 << scc_output.name << ";"; + } + outfile_scc3 << std::endl; + } + + std::cout << std::endl; + } + outfile_scc.close(); + free(DFN_3); + free(LOW_3); + std::cout << "node_num after cut " << node_num_all << std::endl; + if (node_num_all != g.node_size()) + { + std::cout << "num error!" << std::endl; + exit(0); + } + int count_cut_pair = 0; + while (1) + { + count_cut_pair++; + if (count_cut_pair > 15) + { + std::cout << "cut pair error! So many times!" 
<< std::endl; + exit(0); + break; + } + int subgraph_size = Subgraphs.size(); + std::vector> strongly_connected_subgraphs_all; + std::vector scc_all; + for (int i = 0; i < int(Subgraphs.size()) + int(otherSubgraphs.size()); i++) + { + scc_all.push_back(i); + } + strongly_connected_subgraphs_all.push_back(scc_all); + if (((count_cut_pair > 1 && count_cut_pair < 5) || + (count_cut_pair > 10 && count_cut_pair < 13)) && + strongly_connected_subgraphs.size() != 0) + { + std::cout << count_cut_pair << " eliminate scc v2 executed" << std::endl; + eliminate_scc_v2(strongly_connected_subgraphs, Subgraphs, otherSubgraphs, g); + // eliminate_pair_v2(Subgraphs, otherSubgraphs, graphs_inputs, graphs_outputs, + // strongly_connected_subgraphs_all, subgraph_size); + } + else if (((count_cut_pair == 15)) && strongly_connected_subgraphs.size() != 0) + { + std::cout << count_cut_pair << " eliminate scc v3 executed" << std::endl; + eliminate_scc_v3(strongly_connected_subgraphs, Subgraphs, otherSubgraphs, g); + } + else + { + std::cout << count_cut_pair << " eliminate pair v2 executed" << std::endl; + eliminate_pair_v2(Subgraphs, otherSubgraphs, graphs_inputs, graphs_outputs, + strongly_connected_subgraphs_all, subgraph_size); + } + strongly_connected_subgraphs.clear(); + predecessors_Subgraphs.clear(); + successors_Subgraphs.clear(); + std::vector>().swap(subgraphs_2_inputs); + std::vector>().swap(subgraphs_1_inputs); + std::vector>().swap(subgraphs_2_outputs); + std::vector>().swap(subgraphs_1_outputs); + std::vector>().swap(graphs_inputs); + std::vector>().swap(graphs_outputs); + for (const auto &sg : Subgraphs) + { + std::unordered_set graphInputs; + determineGraphInput(sg, IOvalueNames, graphInputs); + subgraphs_1_inputs.push_back(graphInputs); + } + for (const auto &sg : otherSubgraphs) + { + std::unordered_set graphInputs; + determineGraphInput(sg, IOvalueNames, graphInputs); + subgraphs_2_inputs.push_back(graphInputs); + } + node_number = 0; + for (const auto &sg : Subgraphs) 
+ { + std::unordered_set graphOutputs; + node_number += sg.node_size(); + determineGraphOutput(g, sg, subgraphs_1_inputs, subgraphs_2_inputs, graphOutputs); + subgraphs_1_outputs.push_back(graphOutputs); + } + for (const auto &sg : otherSubgraphs) + { + std::unordered_set graphOutputs; + node_number += sg.node_size(); + determineGraphOutput(g, sg, subgraphs_1_inputs, subgraphs_2_inputs, graphOutputs); + subgraphs_2_outputs.push_back(graphOutputs); + } + graphs_inputs.insert(graphs_inputs.end(), subgraphs_1_inputs.begin(), subgraphs_1_inputs.end()); + graphs_inputs.insert(graphs_inputs.end(), subgraphs_2_inputs.begin(), subgraphs_2_inputs.end()); + graphs_outputs.insert(graphs_outputs.end(), subgraphs_1_outputs.begin(), + subgraphs_1_outputs.end()); + graphs_outputs.insert(graphs_outputs.end(), subgraphs_2_outputs.begin(), + subgraphs_2_outputs.end()); + for (int i = 0; i < int(graphs_inputs.size()); i++) + { + std::vector predecessors; + for (const auto &g_input : graphs_inputs[i]) + { + for (int j = 0; j < int(graphs_outputs.size()); j++) + { + if ((graphs_outputs[j].find(g_input) != graphs_outputs[j].end())) + { + predecessors.push_back(j); + } + } + } + predecessors_Subgraphs.push_back(predecessors); + } + for (int i = 0; i < int(graphs_inputs.size()); i++) + { + std::vector temp; + for (int j = 0; j < int(graphs_inputs.size()); j++) + { + if (find(predecessors_Subgraphs[j].begin(), predecessors_Subgraphs[j].end(), i) != + predecessors_Subgraphs[j].end()) + { + temp.push_back(j); + } + } + successors_Subgraphs.push_back(temp); + } + node_num_all = 0; + for (const auto &sg : Subgraphs) + { + node_num_all += sg.node_size(); + } + for (const auto &sg : otherSubgraphs) + { + node_num_all += sg.node_size(); + } + int *DFN_4 = (int *)malloc(graphs_inputs.size() * sizeof(int)); + int *LOW_4 = (int *)malloc(graphs_inputs.size() * sizeof(int)); + for (int i = 0; i < int(graphs_inputs.size()); i++) + { + DFN_4[i] = 0; + LOW_4[i] = 0; + } + for (int temp_count = 0; 
temp_count < int(predecessors_Subgraphs.size()); temp_count++) + { + if (DFN_[temp_count] == 0) + { + std::vector stack_subgraphs; + int depth = 0; + Tarjan(temp_count, depth, strongly_connected_subgraphs, DFN_4, LOW_4, stack_subgraphs, + successors_Subgraphs); + } + } + std::string file_name_scc4 = "scc4.txt"; + std::ofstream outfile_scc4(file_name_scc4); + for (const auto &scc : strongly_connected_subgraphs) + { + std::cout << "scc4:"; + for (const auto &scc_id : scc) + { + std::cout << scc_id << " "; + outfile_scc4 << "subgraph" << scc_id << " input:"; + for (const auto &scc_input : graphs_inputs[scc_id]) + { + outfile_scc4 << scc_input.name << ";"; + } + outfile_scc4 << " output:"; + for (const auto &scc_output : graphs_outputs[scc_id]) + { + outfile_scc4 << scc_output.name << ";"; + } + outfile_scc4 << std::endl; + } + + std::cout << std::endl; + } + outfile_scc.close(); + free(DFN_4); + free(LOW_4); + std::cout << "node num in original graph: " << g.node_size() << std::endl; + std::cout << "node_num after cut " << node_num_all << std::endl; + if (node_num_all != g.node_size()) + { + std::cout << "num error!, time" << count_cut_pair << std::endl; + exit(0); + } + if (count_cut_pair == 15) + { + if (strongly_connected_subgraphs.size() == 0) + { + break; + } + else + { + std::cout << "error!" << std::endl; + exit(0); + } + } + std::cout << "graph number after " << count_cut_pair + << "loops: " << Subgraphs.size() + otherSubgraphs.size() << std::endl; + } // end of while + std::string file_name_predecessor_4 = "predecessor_final_4.txt"; + std::string file_name_successor_4 = "successor_final_4.txt"; + std::ofstream outfile_predecessor_4(file_name_predecessor_4); + std::ofstream outfile_successor_4(file_name_successor_4); + if (!(outfile_predecessor_4.is_open() && outfile_successor_4.is_open())) + { + std::cerr << "Error opening file." 
<< std::endl; + exit(0); + } + for (int i = 0; i < int(graphs_inputs.size()); i++) + { + outfile_predecessor_4 << "predecessor of subgraph " << i << ":"; + for (const auto &predecessor : predecessors_Subgraphs[i]) + { + outfile_predecessor_4 << predecessor << ";"; + } + outfile_predecessor_4 << std::endl; + outfile_successor_4 << "successor of subgraph " << i << ":"; + for (const auto &successor : successors_Subgraphs[i]) + { + outfile_successor_4 << successor << ";"; + } + outfile_successor_4 << std::endl; + } + outfile_predecessor_4.close(); + outfile_successor_4.close(); + print_subgraphs(Subgraphs, (char *)"./subgraphs_final_4.txt", otherSubgraphs, + (char *)"./other_subgraphs_final_4.txt"); + ////* + int temp_count_subgraph = 0; + + std::ofstream outfile_conv_flag("end_with_conv.txt"); + for (const auto &graph_outputs : subgraphs_1_outputs) + { + int find_flag = 0; + for (const auto &graph_output : graph_outputs) + { + for (const auto &node : Subgraphs[temp_count_subgraph].node()) + { + for (const auto &output : node.output()) + { + if (graph_output.name == output && node.op_type() == "Conv") + { + outfile_conv_flag << temp_count_subgraph << " "; + find_flag = 1; + break; + } + } + if (find_flag) + { + break; + } + } + if (find_flag) + { + break; + } + } + temp_count_subgraph++; + } + outfile_conv_flag.close(); + std::cout << "succeeded in reaching sorting" << std::endl; + int finished_flag = 0; + int sort_count = 0; + std::vector order_Subgraphs(graphs_inputs.size()); + std::vector issort_Subgraphs(graphs_inputs.size()); + while (!finished_flag) + { + finished_flag = 1; + int changed_sort_flag = 0; + if (sort_count == 0) + { + changed_sort_flag = 1; + for (int i = 0; i < int(graphs_inputs.size()); i++) + { + int find_flag = 0; + for (const auto &g_input : graphs_inputs[i]) + { + for (int j = 0; j < int(graphs_outputs.size()); j++) + { + if (graphs_outputs[j].find(g_input) != graphs_outputs[j].end()) + { + find_flag = 1; + break; + } + } + if (find_flag) + { + 
break; + } + } + if (!find_flag) + { + order_Subgraphs[i] = 0; + issort_Subgraphs[i] = 1; + } + else + { + order_Subgraphs[i] = 1; + issort_Subgraphs[i] = 0; + finished_flag = 0; + } + } + } + else + { + std::cout << "sort count:" << sort_count << std::endl; + for (int i = 0; i < int(graphs_inputs.size()); i++) + { + int find_flag = 0; + if (issort_Subgraphs[i] == 1 && i != int(graphs_inputs.size()) - 1) + { + continue; + } + for (const auto &g_input : graphs_inputs[i]) + { + for (int j = 0; j < int(graphs_outputs.size()); j++) + { + if ((graphs_outputs[j].find(g_input) != graphs_outputs[j].end())) + { + if ((issort_Subgraphs[j] == 0)) + { + std::cout << "graph " << i << "is after graph " << j << std::endl; + find_flag = 1; + break; + } + } + } + if (find_flag) + { + break; + } + } + if (!find_flag) + { + if (!(issort_Subgraphs[i] == 1)) + { + order_Subgraphs[i] = sort_count; + } + } + else + { + order_Subgraphs[i] = sort_count + 1; + issort_Subgraphs[i] = 0; + finished_flag = 0; + } + if (i == int(graphs_inputs.size()) - + 1) // add the subgraph to the queue only when cycle is completed to prevent the + // newly added subgraph in this cycle from being the predecessor of the + // subsequent sub-graph. + { + for (int j = 0; j < int(graphs_inputs.size()); j++) + { + if (order_Subgraphs[j] == sort_count) + { + issort_Subgraphs[j] = 1; + changed_sort_flag = 1; + std::cout << "graph " << j << " is in the " << sort_count << "th sort" << std::endl; + } + } + } + } + if (changed_sort_flag == 0) + { + std::cout << "error: endless loop!" 
<< std::endl; + std::cout << "sort count:" << sort_count << std::endl; + std::cout << "count_cut_pair: " << count_cut_pair << std::endl; + for (int i = 0; i < int(graphs_inputs.size()); i++) + { + std::cout << "order_Subgraphs[" << i << "]:" << order_Subgraphs[i] << " "; + } + std::cout << std::endl; + std::exit(1); + break; + } + } + sort_count++; + } + char *sub1_type, *sub2_type; + if (strategy == SPILTE_CPU_STRUCTURE_FIRST) + { + sub1_type = (char *)"CPU"; + sub2_type = (char *)"NPU"; + } + else + { + sub1_type = (char *)"NPU"; + sub2_type = (char *)"CPU"; + } + std::cout << " order" << std::endl; + for (auto element : order_Subgraphs) + { + std::cout << element << " "; + } + std::cout << std::endl; + + std::string file_name = "subgraphs_ios.txt"; + std::ofstream outfile1(file_name); + if (!outfile1.is_open()) + { + std::cerr << "Error opening file." << std::endl; + exit(0); + } + int sub1_size = subgraphs_1_inputs.size(); + for (int i = 0; i < int(graphs_inputs.size()); i++) + { + outfile1 << (i >= sub1_size ? sub2_type : sub1_type) << "subgraph" + << (i >= sub1_size ? (i - sub1_size) : i) << ": order" << order_Subgraphs[i]; + outfile1 << "--input-name "; + std::cout << (i >= sub1_size ? sub2_type : sub1_type) << "subgraph" + << (i >= sub1_size ? (i - sub1_size) : i) << ": order" << order_Subgraphs[i] + << std::endl; + std::cout << "Inputs:"; + for (auto element : graphs_inputs[i]) + { + std::cout << element.name << "; size:"; + for (auto Size : element.shape) + { + std::cout << Size << " "; + } + outfile1 << element.name << ";"; + } + std::cout << std::endl; + std::cout << "Outputs:"; + outfile1 << "--output-name "; + for (auto element : graphs_outputs[i]) + { + std::cout << element.name << "; size:"; + for (auto Size : element.shape) + { + std::cout << Size << " "; + } + outfile1 << element.name << ";"; + } + outfile1 << std::endl; + std::cout << std::endl; + std::cout << " The predecessors of " << (i >= sub1_size ? 
sub2_type : sub1_type) << "subgraph" + << (i >= sub1_size ? (i - sub1_size) : i) << ": "; + for (auto element : predecessors_Subgraphs[i]) + { + std::cout << (element >= sub1_size ? sub2_type : sub1_type) << "subgraph" + << (element >= sub1_size ? (element - sub1_size) : element) << "; "; + } + std::cout << std::endl; + std::cout << " The successors of " << (i >= sub1_size ? sub2_type : sub1_type) << "subgraph" + << (i >= sub1_size ? (i - sub1_size) : i) << ": "; + for (auto element : successors_Subgraphs[i]) + { + std::cout << (element >= sub1_size ? sub2_type : sub1_type) << "subgraph" + << (element >= sub1_size ? (element - sub1_size) : element) << "; "; + } + std::cout << std::endl; + } + outfile1.close(); + for (const auto &tensor : IOvalueNames) + { + std::cout << "Name: " << tensor.name << ", Shape: ["; + for (size_t i = 0; i < tensor.shape.size(); ++i) + { + std::cout << tensor.shape[i]; + if (i < tensor.shape.size() - 1) + { + std::cout << ", "; + } + } + std::cout << "]" << std::endl; + } + + switch (d.getType()) + { + case DeviceType::Target_NPU: + { + if (strategy == SPILTE_CPU_STRUCTURE_FIRST) + { + d.GenerateCutInstruction(Subgraphs, "cpu", subgraphs_1_inputs, subgraphs_1_outputs); + d.GenerateCutInstruction(otherSubgraphs, "npu", subgraphs_2_inputs, subgraphs_2_outputs); + } + else if (strategy == SPILTE_NPU_STRUCTURE_FIRST) + { + d.GenerateCutInstruction(Subgraphs, "npu", subgraphs_1_inputs, subgraphs_1_outputs); + d.GenerateCutInstruction(otherSubgraphs, "cpu", subgraphs_2_inputs, subgraphs_2_outputs); + } + break; + } + default: + std::cout << "Unknown device type" << std::endl; + exit(0); + } + std::cout << "node num in original graph: " << g.node_size() << std::endl; + std::cout << "node_num after cut " << node_num_all << std::endl; +} diff --git a/tools/onnx-subgraph/src/lib/structures.cpp b/tools/onnx-subgraph/src/lib/structures.cpp new file mode 100644 index 00000000000..5ddcf81dc7e --- /dev/null +++ 
b/tools/onnx-subgraph/src/lib/structures.cpp @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "partition.h" +#include +int DetermineStructure(const onnx::GraphProto &graph, Device &d, PartitionStrategy strategy) +{ + int node_index = 0; + std::vector> enabled_structure; + std::vector structure_temp; + while (node_index < graph.node_size()) + { + std::vector support_op; + const auto &node = graph.node(node_index); + switch (strategy) + { + case SPILTE_CPU_STRUCTURE_FIRST: + { + support_op = d.getCPUSupportOp(); + break; + } + case SPILTE_NPU_STRUCTURE_FIRST: + { + support_op = d.getNPUSupportOp(); + break; + } + default: + { + break; + } + } + if (std::find(support_op.begin(), support_op.end(), node.op_type()) != support_op.end()) + { + auto op_index = std::find(support_op.begin(), support_op.end(), node.op_type()); + structure_temp.push_back(*op_index); + } + else + { + if (structure_temp.size() >= 3) + { + bool isequal = 0; + for (const auto &structure : enabled_structure) + + { + if (std::equal(structure.begin(), structure.end(), structure_temp.begin(), + structure_temp.end())) + { + isequal = 1; + break; + } + } + if (isequal == 0) + { + enabled_structure.push_back(structure_temp); + } + } + if (structure_temp.size() != 0) + { + structure_temp.clear(); + } + } + node_index++; + } + + for (const auto &structure : enabled_structure) + { + std::cout 
// Entry point of the onnx-subgraph tool.
//
// Usage: ./onnx-subgraph --onnx=xxx.onnx
//
// Scans the command line for an argument starting with "--onnx=", loads that ONNX file,
// reads the device description from "./scripts/config.json" and runs PartitionGraph with the
// SPILTE_NPU_STRUCTURE_FIRST strategy. Afterwards the embedded Python interpreter is
// initialized, '.' is appended to sys.path, and the interpreter is finalized again.
//
// Returns -1 when no argument is given or no "--onnx=" argument is found, 0 otherwise.
int main(int argc, char *argv[])
{
  std::string onnxFile;
  if (argc > 1)
  {
    // Arguments that do not start with "--onnx=" are ignored; if the option is given more
    // than once, the last occurrence wins because onnxFile is simply overwritten.
    for (int i = 1; i < argc; ++i)
    {
      std::string arg = argv[i];
      if (arg.substr(0, 7) == "--onnx=")
      {
        // Everything after the 7-character "--onnx=" prefix is the file path.
        onnxFile = arg.substr(7);
        std::cout << "ONNX file: " << onnxFile << std::endl;
      }
    }
    if (onnxFile.empty())
    {
      std::cout << "No ONNX file provided." << std::endl;
      return -1;
    }
  }
  else
  {
    printf("Please set valide args: ./onnx-subgraph --onnx=xxx.onnx\n");
    return -1;
  }

  Graph graph;
  auto g = graph.GetGraphFromOnnx(onnxFile);
  // This map is default-constructed and handed to PartitionGraph without being filled here.
  // NOTE(review): the declaration is shown without template arguments; presumably they were
  // lost in extraction - confirm the key/value types against PartitionGraph's parameter.
  std::unordered_map node_io_size;
  Partition p;
  Device target;
  target.updateOnnxFile(onnxFile);
  // The config path is relative, so it is resolved against the current working directory.
  target.GetDeviceJson("./scripts/config.json");
  p.PartitionGraph(g, target, PartitionStrategy::SPILTE_NPU_STRUCTURE_FIRST, node_io_size);

  Py_Initialize();
  if (!Py_IsInitialized())
  {
    std::cout << "python init fail" << std::endl;
    // NOTE(review): this failure path returns 0, the same value as the success path below -
    // confirm whether a non-zero exit code is intended.
    return 0;
  }
  // Only sys.path is extended; no other Python code is run before the interpreter is
  // finalized.
  PyRun_SimpleString("import sys");
  PyRun_SimpleString("sys.path.append('.')");
  Py_Finalize();

  return 0;
}
#!/bin/bash
# Download the resnext26ts (opset 16) test model into ./models and write a simplified copy
# to ./resnet-test.onnx using onnxsim.
#
# Meant to be executed (e.g. `bash test_model_download.sh`), not sourced: the script aborts
# on the first failing command so that a failed install, cd or download does not let the
# later steps run against a missing file or from the wrong directory.
set -e

pip install onnx onnxsim

if [ ! -d "./models/" ]; then
  mkdir ./models/
else
  echo "./models path existing"
fi

cd ./models
# NOTE(review): --no-check-certificate turns off TLS certificate verification for the
# download; drop it if the build environment has a working CA store.
wget https://media.githubusercontent.com/media/onnx/models/refs/heads/main/Computer_Vision/resnext26ts_Opset16_timm/resnext26ts_Opset16.onnx --no-check-certificate
#wget https://media.githubusercontent.com/media/onnx/models/refs/heads/main/Natural_Language_Processing/xmod_Opset16_transformers/xmod_Opset16.onnx --no-check-certificate

# Simplify the downloaded model; the result is written one level up, next to this script.
onnxsim resnext26ts_Opset16.onnx ../resnet-test.onnx
#onnxsim xmod_Opset16.onnx ../xmod-transformer-test.onnx

cd ..