diff --git a/tools/onnx-subgraph/3rd_files_download.sh b/tools/onnx-subgraph/3rd_files_download.sh
new file mode 100644
index 00000000000..3adf4cbda4e
--- /dev/null
+++ b/tools/onnx-subgraph/3rd_files_download.sh
@@ -0,0 +1,8 @@
+# Fetch the jsoncpp sources (json/, json-forwards.h, jsoncpp.cpp) from ekg/glia.
+# Run from the onnx-subgraph directory: files land in ./include and ./src/lib.
+set -e     # stop at the first failing step instead of copying from a bad clone
+rm -rf 3rd # clear leftovers of an aborted run so the clone starts clean
+git clone --depth 1 https://github.com/ekg/glia.git 3rd/glia
+cp -r 3rd/glia/json 3rd/glia/json-forwards.h include
+cp 3rd/glia/jsoncpp.cpp src/lib
+rm -rf 3rd
diff --git a/tools/onnx-subgraph/CMakeLists.txt b/tools/onnx-subgraph/CMakeLists.txt
new file mode 100644
index 00000000000..28bd9cbc058
--- /dev/null
+++ b/tools/onnx-subgraph/CMakeLists.txt
@@ -0,0 +1,64 @@
+# FindPython3 (used below) exists only in CMake >= 3.12, hence the minimum version.
+cmake_minimum_required(VERSION 3.12)
+project(onnx-subgraph-parser)
+if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
+  set(CMAKE_BUILD_TYPE "Debug") # default only; -DCMAKE_BUILD_TYPE=... still wins
+endif()
+# The per-config flags are set after project() so its defaults cannot replace them.
+set(CMAKE_CXX_FLAGS_DEBUG "$ENV{CXXFLAGS} -O0 -Wall -g2 -ggdb")
+set(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -O3 -Wall")
+
+# External dependencies: protobuf (onnx.proto), jsoncpp (config files) and Python3.
+find_package(Protobuf REQUIRED)
+find_package(jsoncpp REQUIRED)
+find_package(Python3 COMPONENTS Interpreter Development REQUIRED)
+
+# Generate the C++ protobuf sources and headers for onnx.proto in the build directory.
+set(PROTO_FILES onnx.proto)
+protobuf_generate_cpp(PROTO_SRCS PROTO_HDRS ${PROTO_FILES})
+
+# Parser library: every source under src/lib plus the generated protobuf code.
+file(GLOB SOURCES "src/lib/*.cpp")
+add_library(onnx-subgraph-parser STATIC ${SOURCES} ${PROTO_SRCS} ${PROTO_HDRS})
+target_include_directories(onnx-subgraph-parser PUBLIC ${CMAKE_CURRENT_BINARY_DIR}
+  ${CMAKE_CURRENT_SOURCE_DIR}/include ${Python3_INCLUDE_DIRS})
+target_link_libraries(onnx-subgraph-parser PUBLIC protobuf::libprotobuf jsoncpp)
+
+add_executable(onnx-subgraph src/main.cpp)
+target_link_libraries(onnx-subgraph PRIVATE onnx-subgraph-parser ${Python3_LIBRARIES})
+
+# Helper scripts and sample configs copied next to the onnx-subgraph binary.
+set(ONNX_SUBGRAPH_FILES
+    extract_onnx_lib.py
+    extract_onnx.py
+    single_vs_multiple_onnx.py
+    quant.py
+    model_inference.py
+    model_inference_multiple_output.py
+    onnx_subgraph_ut.py
+    test_model_download.sh
+    config.json
+    config-sample-1.json
+    config-sample-2.json
+)
+
+# Copy each file into <build>/scripts at build time and install it into bin.
+foreach(ONNX_SUBGRAPH_FILE IN ITEMS ${ONNX_SUBGRAPH_FILES})
+  set(ONNX_SUBGRAPH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/${ONNX_SUBGRAPH_FILE}")
+  set(ONNX_SUBGRAPH_BIN "${CMAKE_CURRENT_BINARY_DIR}/scripts/${ONNX_SUBGRAPH_FILE}")
+  add_custom_command(OUTPUT "${ONNX_SUBGRAPH_BIN}"
+    COMMAND ${CMAKE_COMMAND} -E copy "${ONNX_SUBGRAPH_SRC}" "${ONNX_SUBGRAPH_BIN}"
+    DEPENDS "${ONNX_SUBGRAPH_SRC}"
+    COMMENT "Generate ${ONNX_SUBGRAPH_BIN}"
+    VERBATIM
+  )
+
+  # The ALL target depends on the copied file, so the rule above runs in a normal build.
+  add_custom_target(${ONNX_SUBGRAPH_FILE}_target ALL DEPENDS "${ONNX_SUBGRAPH_BIN}")
+
+  install(FILES "${ONNX_SUBGRAPH_BIN}"
+          PERMISSIONS OWNER_WRITE OWNER_READ OWNER_EXECUTE
+                      GROUP_READ GROUP_EXECUTE
+                      WORLD_READ WORLD_EXECUTE
+          DESTINATION bin)
+endforeach()
diff --git a/tools/onnx-subgraph/Readme.md b/tools/onnx-subgraph/Readme.md
new file mode 100644
index 00000000000..acb9236c8e2
--- /dev/null
+++ b/tools/onnx-subgraph/Readme.md
@@ -0,0 +1,89 @@
+# onnx_autosubgraph
+The onnx-subgraph tool automatically partitions an ONNX model into several sub-models
+according to operator support, performance data and model size limits, and reports the
+order and the input / output names of the sub-models.
+
+# How to build the onnx-subgraph
+## OS environment dependence
+     1. ubuntu >=20.04
+     2. GCC >= 9.4.0
+     3. cmake >= 3.12
+     4. python >= 3.8
+     5. apt-get install libprotobuf-dev protobuf-compiler libjsoncpp-dev
+
+## Python packages dependence
+    onnx                         1.16.0
+    onnxruntime                  1.18.1
+    onnxsim                      0.4.36
+    torch                        2.3.1
+    scikit-image
+    scikit-learn
+    pandas
+    tqdm
+    
+## building the onnx-subgraph
+    1. cd onnx-subgraph
+    2. bash 3rd_files_download.sh
+    3. mkdir build && cd build
+    4. cmake .. && make
+    5. we can get following output at ./build
+          ├── onnx-subgraph
+          └── scripts
+              ├── config.json
+              ├── config-sample-1.json
+              ├── config-sample-2.json
+              ├── extract_onnx_lib.py
+              ├── extract_onnx.py
+              ├── model_inference_multiple_output.py
+              ├── model_inference.py
+              ├── onnx_subgraph_ut.py
+              ├── quant.py
+              ├── single_vs_multiple_onnx.py
+              └── test_model_download.sh
+# How to use the onnx-subgraph
+## Pre-steps
+### Download the test AI models
+    1. bash scripts/test_model_download.sh, then "resnet-test.onnx" will be downloaded to ./build
+    2. you can change to any other onnx files as your needs, or edit the download link in 
+	   "scripts/test_model_download.sh"
+### Prepare the config.json
+    1. edit the config.json
+       . you can edit operators in "NPU_supported_ops" and "CPU_supported_ops";
+       . you can edit performance data in "performance_data" to match the real HW status;
+       . you can edit "max_subgraph_size" for the case where "NPU_supported_ops" is []
+    2. you can also check more examples in "config-sample-1.json" and "config-sample-2.json"
+
+  
+## Parse the onnx model
+     ./onnx-subgraph --onnx=resnet-test.onnx
+       after parsing done, subgraphs_ios.txt will be generated at current path
+       
+## Split the onnx model to subgraphs
+    1. edit the config path and model file path at ./scripts/extract_onnx.py 
+       e.g.: extract_onnx_lib.split_onnx_ios('./subgraphs_ios.txt','./resnet-test.onnx') 
+    2. python scripts/extract_onnx.py, after extraction done, the subgraphs will be saved 
+	   at './subgraphs'
+       subgraphs
+       ├── CPU
+       │   ├── CPUsubgraph0.onnx
+       │   └── CPUsubgraph1.onnx
+       └── NPU
+           ├── NPUsubgraph0.onnx
+           └── NPUsubgraph1.onnx
+    
+## Verify the subgraphs inference with original model file
+    1. edit the model path, subgraph path and config path in ./scripts/single_vs_multiple_onnx.py
+             single_onnx_model_path = './resnet-test.onnx'
+             model_path = './subgraphs/'
+             subgraphsiostxt_path = './subgraphs_ios.txt'
+    2. edit the input shape and name of onnx model in ./scripts/single_vs_multiple_onnx.py
+             default_input_data = {
+                 "x": np.random.rand(1, 3, 256, 256).astype(np.float32),
+             }
+    3. compare the MSE of original inference result and subgraphs inference result
+       python ./scripts/single_vs_multiple_onnx.py
+       output:
+            Single model inference completed!
+            Multiple subgraph inference completed!
+            Comparing inference results between single ONNX model and multiple subgraphs...
+            Output '316' MSE: 5.125894080395578e-14
diff --git a/tools/onnx-subgraph/config-sample-1.json b/tools/onnx-subgraph/config-sample-1.json
new file mode 100644
index 00000000000..3e083ca5b64
--- /dev/null
+++ b/tools/onnx-subgraph/config-sample-1.json
@@ -0,0 +1,10 @@
+{
+    "NPU_supported_ops": [],
+    "CPU_supported_ops": ["Sub", "Pow", "ReduceMean", "Add", "Sqrt", "Div","Transpose", "Gather", "MatMul", "Mul", "Softmax", "Erf", "Gemm", "Conv", "Reshape",
+    "Sin", "Where", "ConstantOfShape", "Cast", "Sigmoid", "Cos", "Expand", "Slice", "Unsqueeze"],
+    "performance_data": [],
+    "hardware_limits": {
+        "max_subgraph_size": 10240.0,
+        "max_subgraphs": 5
+    }
+}
diff --git a/tools/onnx-subgraph/config-sample-2.json b/tools/onnx-subgraph/config-sample-2.json
new file mode 100644
index 00000000000..02e840a723b
--- /dev/null
+++ b/tools/onnx-subgraph/config-sample-2.json
@@ -0,0 +1,15 @@
+{
+    "NPU_supported_ops": ["Conv", "Reshape", "Transpose", "Add", "ReduceMean", "Sub", "Div", "Mul", "Sigmoid","MatMul"],
+    "CPU_supported_ops": ["Sub", "Pow", "ReduceMean", "Add", "Sqrt", "Div","Transpose", "Gather", "MatMul", "Mul", "Softmax", "Erf", "Gemm", "Conv", "Reshape",
+    "Sin", "Where", "ConstantOfShape", "Cast", "Sigmoid", "Cos", "Expand", "Slice", "Unsqueeze"],
+    "performance_data": [
+        {"name":"Conv","CPU_time": 0.1, "NPU_time": 0.05},
+        {"name":"Mul", "CPU_time": 0.15, "NPU_time": 0.07},
+        {"name":"Add", "CPU_time": 0.15, "NPU_time": 0.07},
+        {"name":"Sub", "CPU_time": 0.15, "NPU_time": 0.07}
+    ],
+    "hardware_limits": {
+        "max_subgraph_size": 60024.0,
+        "max_subgraphs": 5
+    }
+}
diff --git a/tools/onnx-subgraph/config.json b/tools/onnx-subgraph/config.json
new file mode 100644
index 00000000000..6d0b7ce5ace
--- /dev/null
+++ b/tools/onnx-subgraph/config.json
@@ -0,0 +1,13 @@
+{
+    "NPU_supported_ops": ["Conv", "Reshape", "Transpose", "Add", "ReduceMean", "Sub", "Div", "Mul", "Sigmoid","MatMul"],
+    "CPU_supported_ops": ["Sub", "Pow", "ReduceMean", "Add", "Sqrt", "Div","Transpose", "Gather", "MatMul", "Mul", "Softmax", "Erf", "Gemm", "Conv", "Reshape",
+    "Sin", "Where", "ConstantOfShape", "Cast", "Sigmoid", "Cos", "Expand", "Slice", "Unsqueeze"],
+    "performance_data": [
+        {"name":"Conv","CPU_time": 0.1, "NPU_time": 0.05},
+		{"name":"Mul", "CPU_time": 0.15, "NPU_time": 0.07}
+    ],
+    "hardware_limits": {
+        "max_subgraph_size": 60024.0,
+        "max_subgraphs": 5
+    }
+}
diff --git a/tools/onnx-subgraph/extract_onnx.py b/tools/onnx-subgraph/extract_onnx.py
new file mode 100644
index 00000000000..fed080c78d1
--- /dev/null
+++ b/tools/onnx-subgraph/extract_onnx.py
@@ -0,0 +1,21 @@
+# Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import extract_onnx_lib
+import torch
+import onnx
+import re
+
+print("python executed")  # marker printed before the extraction starts
+extract_onnx_lib.split_onnx_ios('./subgraphs_ios.txt', './resnet-test.onnx')  # (instruction file, model)
diff --git a/tools/onnx-subgraph/extract_onnx_lib.py b/tools/onnx-subgraph/extract_onnx_lib.py
new file mode 100644
index 00000000000..17df7ecada1
--- /dev/null
+++ b/tools/onnx-subgraph/extract_onnx_lib.py
@@ -0,0 +1,218 @@
+# Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import onnx
+import re
+import os
+
+
+def splitinstruction(instr):
+    """Parse one cut-instruction line into (input_names, output_names); the first
+    and last fields of the split are dropped, the rest are ';'-separated lists."""
+    pattern = '--input-name \"|\" --output-name \"|\" --input-shape \"'
+    fields = re.split(pattern, instr)[1:-1]
+    inputs, outputs = fields[0].split(';'), fields[1].split(';')
+    return inputs, outputs
+
+
+def splitsubgraph_ios(iofile):
+    """Parse a subgraphs_ios line into (input_names, output_names, device prefix);
+    the prefix is the text before 'subgraph', e.g. 'CPU' for 'CPUsubgraph0: ...'."""
+    parts = re.split('--input-name |;--output-name ', iofile)
+    inputs = parts[1].split(';')
+    outputs = parts[2].split(';')[:-1]  # drop the piece after the final ';'
+    return inputs, outputs, parts[0].split('subgraph')[0]
+
+
+def split_onnx(instrfile, type):
+    """Extract one sub-model per line of instrfile from the hard-coded source model.
+
+    Lines whose parsed input or output list is [''] are skipped, but they still
+    consume an index in the output file names."""
+    print("module found")
+    input_path = 'net/diffusion_model_fp32_with_shape.onnx'
+    prefix = 'diffusion_model_fp32_subgraphs_' + type + '/' + type + 'subgraph'
+    with open(instrfile, "r") as f1:  # closed even when extraction raises
+        for count, line in enumerate(f1.readlines()):
+            input_names, output_names = splitinstruction(line)
+            if input_names != [''] and output_names != ['']:
+                output_path = prefix + str(count) + '.onnx'
+                onnx.utils.extract_model(input_path, output_path, input_names, output_names)
+
+
+def split_onnx_ios(instrfile,
+                   input_path='net/generation_model_simplify.onnx',
+                   out_folder='subgraphs/'):
+    """Split the model at input_path into one .onnx file per line of instrfile.
+
+    Each line is parsed with splitsubgraph_ios(); the sub-model is written to
+    out_folder as '<type>subgraph<N>.onnx', N counting 'CPU' lines and all other
+    lines separately. Lines with an input or output list of [''] write no file.
+
+    Side effect: the file at input_path is re-saved with its graph outputs added
+    to value_info. If input_path does not exist, a message is printed instead.
+    """
+    if not os.path.exists(input_path):
+        print(input_path + " not exist")
+        return
+    model = onnx.load(input_path)
+    onnx.checker.check_model(input_path)
+    # Register each graph output in value_info once; skipping names already
+    # present keeps repeated runs from piling up duplicate entries in the file.
+    known = {info.name for info in model.graph.value_info}
+    model.graph.value_info.extend([o for o in model.graph.output if o.name not in known])
+    onnx.save(model, input_path)
+    cpu_count = npu_count = 0
+    with open(instrfile, "r") as f1:  # closed even when extraction raises
+        os.makedirs(out_folder, exist_ok=True)
+        for line in f1.readlines():
+            input_names, output_names, kind = splitsubgraph_ios(line)
+            if kind == 'CPU':
+                count, cpu_count = cpu_count, cpu_count + 1
+            else:
+                count, npu_count = npu_count, npu_count + 1
+            # join() also works when out_folder has no trailing '/'
+            file_name = kind + 'subgraph' + str(count) + '.onnx'
+            if input_names != [''] and output_names != ['']:
+                onnx.utils.extract_model(input_path, os.path.join(out_folder, file_name), input_names, output_names)
+                print("succeed", count)
+
+
+def rename_node_io(file_path):
+    """Load the model at file_path and strip '/' and '.' from every name in its
+    graph (inputs, outputs, value_info, initializers, node names and node I/O);
+    the renamed model is returned, not saved."""
+    model = onnx.load(file_path)
+    graph = model.graph
+    def clean(name):
+        return re.sub(r'[/.]', '', name)
+    for entries in (graph.input, graph.output, graph.value_info, graph.initializer):
+        for entry in entries:
+            entry.name = clean(entry.name)
+    for node in graph.node:
+        node.name = clean(node.name)
+        for idx, name in enumerate(node.input):
+            node.input[idx] = clean(name)
+        for idx, name in enumerate(node.output):
+            node.output[idx] = clean(name)
+    return model
+
+
+def rename_subgraph_node_ios(in_file_path, out_file_path):
+    """Run rename_node_io() on every entry of directory in_file_path and save
+    each result under the same file name in out_file_path."""
+    for filename in os.listdir(in_file_path):
+        output_file_path = out_file_path + '/' + filename
+        renamed = rename_node_io(in_file_path + '/' + filename)
+        onnx.save(renamed, output_file_path)
+        print(f'Modified model saved to {output_file_path}')
+
+
+def print_model(file_path):
+    """Print how many nodes the graph of the ONNX model at file_path contains.
+
+    Only the node count is printed; nothing is returned.
+    """
+    graph = onnx.load(file_path).graph
+    print(len(graph.node))
+
+
+def sort(ifile_path, ofile_path):  # give each subgraph line of ifile_path an order level; write to ofile_path
+    finished_flag = 0  # set to 1 at the start of a pass, reset to 0 when a subgraph stays unplaced
+    sort_count = 0  # index of the current pass, also used as the order level being assigned
+    f1 = open(ifile_path, "r")
+    lines = f1.readlines()
+    graphs_inputs = {}  # line index -> input names
+    graphs_outputs = {}  # line index -> output names
+    order_Subgraphs = {}  # line index -> order level (provisional until placed)
+    issort_Subgraphs = {}  # line index -> 1 once the order level is final, else 0
+    TYPE = {}  # line index -> prefix returned by splitsubgraph_ios ('CPU' / 'NPU' are handled below)
+    index = 0
+    for line in lines:
+        input_names, output_names, type = splitsubgraph_ios(line)
+        graphs_inputs[index] = input_names
+        graphs_outputs[index] = output_names
+        TYPE[index] = type
+        index = index + 1
+    graph_num = index
+    f1.close()
+    while finished_flag == 0:  # one iteration per pass, until a pass leaves nothing unplaced
+        finished_flag = 1
+        if (sort_count) == 0:  # first pass
+            for i in range(graph_num):
+                find_flag = 0  # becomes 1 when some input of i is an output of a listed subgraph
+                for g_input in graphs_inputs[i]:
+                    for j in range(graph_num):
+                        if g_input in graphs_outputs[j]:
+                            find_flag = 1
+                            break
+                    if find_flag == 1:
+                        break
+                if find_flag == 0:  # no input comes from a listed subgraph: order 0, final
+                    order_Subgraphs[i] = 0
+                    issort_Subgraphs[i] = 1
+                else:  # provisional order 1, to be revisited in later passes
+                    order_Subgraphs[i] = 1
+                    issort_Subgraphs[i] = 0
+                    finished_flag = 0
+        else:  # later passes only look at subgraphs that are not placed yet
+            for i in range(graph_num):
+                find_flag = 0
+                if issort_Subgraphs[i] == 1:
+                    continue
+                for g_input in graphs_inputs[i]:
+                    for j in range(graph_num):
+                        if g_input in graphs_outputs[j]:
+                            if issort_Subgraphs[j] == 0:  # producer j is itself not placed yet
+                                find_flag = 1
+                            break  # only the first subgraph listing g_input as an output is checked
+                    if find_flag == 1:
+                        break
+                if find_flag == 0:  # every checked producer is placed: i gets the current level
+                    order_Subgraphs[i] = sort_count
+                    issort_Subgraphs[i] = 1
+                else:  # pushed to the next level and retried in the next pass
+                    order_Subgraphs[i] = sort_count + 1
+                    issort_Subgraphs[i] = 0
+                    finished_flag = 0
+                if i == graph_num - 1:  # NOTE(review): not reached when the last index hit `continue` above
+                    for j in range(graph_num):
+                        if order_Subgraphs[j] == sort_count:
+                            issort_Subgraphs[j] = 1
+        print(order_Subgraphs)
+        print(issort_Subgraphs)
+        sort_count = sort_count + 1
+        f2 = open(ofile_path, "w")  # rewritten after every pass; the last pass's content remains
+        count_cpu = 0
+        count_npu = 0
+        for i in range(graph_num):
+            content = ""  # stays without a name/order prefix when TYPE[i] is neither 'CPU' nor 'NPU'
+            if TYPE[i] == 'CPU':
+                content = "CPUsubgraph" + str(count_cpu) + ": order" + str(
+                    order_Subgraphs[i]) + "--input-name "
+                count_cpu = count_cpu + 1
+            if TYPE[i] == 'NPU':
+                content = "NPUsubgraph" + str(count_npu) + ": order" + str(
+                    order_Subgraphs[i]) + "--input-name "
+                count_npu = count_npu + 1
+            for graph_input in graphs_inputs[i]:
+                content = content + graph_input + ";"
+            content = content + "--output-name "
+            for graph_output in graphs_outputs[i]:
+                content = content + graph_output + ";"
+            content = content + "\n"
+            print(content)
+            f2.write(content)
+        f2.close()
diff --git a/tools/onnx-subgraph/include/device.h b/tools/onnx-subgraph/include/device.h
new file mode 100644
index 00000000000..72c73a07059
--- /dev/null
+++ b/tools/onnx-subgraph/include/device.h
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef DEVICE_H
+#define DEVICE_H
+
+#include <vector>
+#include <string>
+#include <iostream>
+#include <fstream>
+#include "onnx.pb.h"
+#include "graph.h"
+#include <jsoncpp/json/json.h>
+
+enum class DeviceType
+{
+  Target_NPU // the only device type defined here
+};
+
+// Describes the target device used for partitioning: operator lists and the subgraph size
+// limit are loaded from a JSON config by GetDeviceJson(); the ONNX model path is kept too.
+class Device
+{
+private:
+  std::string onnxFile;
+
+  // Appends value to list unless the list already contains it
+  static void appendUnique(std::vector<std::string> &list, const std::string &value)
+  {
+    if (std::find(list.begin(), list.end(), value) == list.end())
+    {
+      list.push_back(value);
+    }
+  }
+
+public:
+  // The operator lists start empty and the size limit starts at 0
+  Device() : max_subgraph_size(0) {}
+
+  ~Device() {}
+
+  std::vector<std::string> NPUPreferOp;  // ops whose CPU_time exceeds NPU_time in the config
+  std::vector<std::string> CPUSupportOp; // entries of "CPU_supported_ops"
+  std::vector<std::string> NPUSupportOp; // entries of "NPU_supported_ops"
+
+  float max_subgraph_size; // "hardware_limits" / "max_subgraph_size"
+
+  // Always reports Target_NPU, the only DeviceType value
+  DeviceType getType() const { return DeviceType::Target_NPU; }
+
+  // Fixed operator sequences reported as CPU structures
+  std::vector<std::vector<std::string>> getCPUStructure() const
+  {
+    return {{"Concat"},
+            {"Sub", "Pow", "ReduceMean", "Add", "Sqrt", "Div"},
+            {"Transpose", "Gather", "Gather", "Gather", "Transpose", "MatMul", "Mul", "Softmax",
+             "MatMul"}};
+  }
+
+  // Fixed operator sequences reported as NPU structures
+  std::vector<std::vector<std::string>> getNPUStructure() const
+  {
+    return {{"Reshape", "Transpose", "Reshape"},
+            {"Reshape", "Sigmoid", "Mul", "Transpose", "Conv", "Add", "Transpose"},
+            {"Reshape", "Transpose", "Conv", "Transpose", "Reshape"},
+            {"Reshape", "Conv", "Transpose"},
+            {"Reshape", "Add", "Add", "Reshape", "Transpose", "Conv", "Add"},
+            {"Conv"}};
+  }
+
+  // Accessors return copies of the operator lists
+  std::vector<std::string> getNPUSupportOp() const { return NPUSupportOp; }
+  std::vector<std::string> getCPUSupportOp() const { return CPUSupportOp; }
+  std::vector<std::string> getNPUPreferOp() const { return NPUPreferOp; }
+
+  /**
+   * @brief Generate cut instructions for the given subgraphs (defined outside this header).
+   *
+   * @param [in] Subgraphs         subgraphs as ONNX GraphProto objects
+   * @param [in] device            device type string
+   * @param [in] subgraphs_inputs  input tensors, one set per subgraph
+   * @param [in] subgraphs_outputs output tensors, one set per subgraph
+   *
+   * NOTE(review): earlier notes say the three vectors must have equal size and that the result
+   * goes to "<device> CutInstruction.txt" - confirm against the definition.
+   */
+  void GenerateCutInstruction(std::vector<onnx::GraphProto> &Subgraphs, std::string device,
+                              std::vector<std::unordered_set<NodeTensor>> &subgraphs_inputs,
+                              std::vector<std::unordered_set<NodeTensor>> &subgraphs_outputs);
+
+  /**
+   * @brief Load the device description from a JSON file into this object's members.
+   *
+   * Keys read:
+   *   hardware_limits / max_subgraph_size -> max_subgraph_size
+   *   performance_data[] name, CPU_time, NPU_time -> NPUPreferOp when CPU_time > NPU_time
+   *   NPU_supported_ops[] -> NPUSupportOp
+   *   CPU_supported_ops[] -> CPUSupportOp
+   *
+   * Names already present in a list are not added again, so loading twice does not duplicate
+   * entries. If the file cannot be opened or parsed, a message is printed to stdout and the
+   * members are left untouched.
+   *
+   * @param json_path The file path to the JSON file containing device information.
+   */
+  void GetDeviceJson(std::string json_path)
+  {
+    // Open the JSON file in binary mode; the stream closes itself on scope exit
+    std::ifstream in(json_path, std::ios::binary);
+    if (!in.is_open())
+    {
+      std::cout << "Error opening file\n";
+      return;
+    }
+
+    Json::Reader reader;
+    Json::Value root;
+    if (!reader.parse(in, root))
+    {
+      // Report malformed configs instead of silently keeping every list empty
+      std::cout << "Error parsing " << json_path << ": " << reader.getFormattedErrorMessages();
+      return;
+    }
+
+    // Extract the maximum subgraph size from the hardware limits
+    max_subgraph_size = root["hardware_limits"]["max_subgraph_size"].asFloat();
+
+    // An operator is preferred on the NPU when its CPU time exceeds its NPU time
+    for (const Json::Value &entry : root["performance_data"])
+    {
+      if (entry["CPU_time"].asFloat() > entry["NPU_time"].asFloat())
+      {
+        appendUnique(NPUPreferOp, entry["name"].asString());
+      }
+    }
+
+    // Store supported NPU operations
+    for (const Json::Value &op : root["NPU_supported_ops"])
+    {
+      appendUnique(NPUSupportOp, op.asString());
+    }
+
+    // Store supported CPU operations
+    for (const Json::Value &op : root["CPU_supported_ops"])
+    {
+      appendUnique(CPUSupportOp, op.asString());
+    }
+  }
+
+  // Remembers the path of the ONNX model file
+  void updateOnnxFile(std::string &path) { onnxFile = path; }
+
+  // Returns the path stored by updateOnnxFile() (empty until it is called)
+  std::string getOnnxFile() const { return onnxFile; }
+};
+
+#endif
diff --git a/tools/onnx-subgraph/include/graph.h b/tools/onnx-subgraph/include/graph.h
new file mode 100644
index 00000000000..33bd6e02038
--- /dev/null
+++ b/tools/onnx-subgraph/include/graph.h
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef GRAPH_H
+#define GRAPH_H
+
+#include "onnx.pb.h"
+#include <iostream>
+#include <fstream>
+#include <unordered_map>
+#include <functional>
+// Sizes of the inputs and outputs of a single node
+struct NodeIOSize
+{
+  std::vector<std::vector<int64_t>> inputSizes;  // presumably one dimension list per input
+  std::vector<std::vector<int64_t>> outputSizes; // presumably one dimension list per output
+};
+
+struct NodeTensor
+{
+  std::string name;
+  std::vector<int64_t> shape;
+
+  // Creates a tensor with an empty name and an empty shape
+  NodeTensor() = default;
+
+  // Creates a tensor from the given name and shape
+  NodeTensor(const std::string &tensor_name, const std::vector<int64_t> &tensor_shape)
+    : name(tensor_name), shape(tensor_shape)
+  {
+  }
+
+  // Two tensors are equal only when both their names and their shapes match
+  bool operator==(const NodeTensor &rhs) const { return name == rhs.name && shape == rhs.shape; }
+};
+
+namespace std
+{
+// Hash support so NodeTensor can be used as a key in unordered containers
+template <> struct hash<NodeTensor>
+{
+  size_t operator()(const NodeTensor &tensor) const
+  {
+    // Seed with the hash of the name, then fold in every shape entry
+    size_t seed = hash<string>()(tensor.name);
+    for (const int64_t dim : tensor.shape)
+      seed ^= hash<int64_t>()(dim) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+    return seed;
+  }
+};
+} // namespace std
+/**
+ * @brief     Extracts the names and shapes of initializers from the ONNX graph.
+ *
+ * @param     [in] graph The ONNX graph from which to extract initializers.
+ * @pre       The ONNX graph should be valid and contain initializers.
+ * @post      The names and shapes of the initializers are stored in an unordered set of NodeTensor
+ * objects.
+ * @exception None
+ * @return    An unordered set of NodeTensor objects containing the names and shapes of the
+ * initializers.
+ */
+std::unordered_set<NodeTensor> getInitializer(const onnx::GraphProto &graph);
+/**
+ * @brief     Extracts the names and shapes of inputs, outputs, and value_info from the ONNX graph.
+ *
+ * @param     [in] graph The ONNX graph from which to extract inputs, outputs, and value_info.
+ * @pre       The ONNX graph should be valid and contain inputs, outputs, and value_info.
+ * @post      The names and shapes of the inputs, outputs, and value_info are stored in an unordered
+ * set of NodeTensor objects.
+ * @exception None
+ * @return    An unordered set of NodeTensor objects containing the names and shapes of the inputs,
+ * outputs, and value_info.
+ */
+std::unordered_set<NodeTensor> getIOvalue(const onnx::GraphProto &graph);
+/**
+ * @brief     Determines the input tensors of the graph that are not produced by any node in the
+ * graph.
+ *
+ * @param     [in] g The ONNX GraphProto object representing the graph.
+ * @param     [in] initializerNames A set of NodeTensor objects representing the initializers in the
+ * graph.
+ * @param     [out] graphInputs A set of NodeTensor objects representing the input tensors of the
+ * graph.
+ * @pre       The GraphProto object g should be valid and contain nodes with proper input and output
+ * lists.
+ * @post      The graphInputs set will be populated with NodeTensor objects that are inputs to the
+ * graph.
+ * @exception None
+ * @return    None
+ */
+void determineGraphInput(const onnx::GraphProto &g,
+                         const std::unordered_set<NodeTensor> &initializerNames,
+                         std::unordered_set<NodeTensor> &graphInputs);
+/**
+ * @brief     Determines the output tensors of the graph that are either outputs of the original
+ * graph or are used as inputs in other parts of the graph.
+ *
+ * @param     [in] originalGraph The original ONNX GraphProto object representing the graph.
+ * @param     [in] g The ONNX GraphProto object representing the graph to analyze.
+ * @param     [in] allgraphInputs_1 A vector of sets of NodeTensor objects representing the first
+ * set of inputs to the graph.
+ * @param     [in] allgraphInputs_2 A vector of sets of NodeTensor objects representing the second
+ * set of inputs to the graph.
+ * @param     [out] graphOutputs A set of NodeTensor objects representing the output tensors of the
+ * graph.
+ * @pre       The GraphProto objects originalGraph and g should be valid and contain nodes with
+ * proper input and output lists.
+ * @post      The graphOutputs set will be populated with NodeTensor objects that are outputs of the
+ * graph.
+ * @exception None
+ * @return    None
+ */
+void determineGraphOutput(const onnx::GraphProto &originalGraph, const onnx::GraphProto &g,
+                          std::vector<std::unordered_set<NodeTensor>> &allgraphInputs_1,
+                          std::vector<std::unordered_set<NodeTensor>> &allgraphInputs_2,
+                          std::unordered_set<NodeTensor> &graphOutputs);
+/**
+ * @brief     Finds the name of the node that produces a specified output tensor in the given ONNX
+ * graph.
+ *
+ * @param     [in] g The ONNX GraphProto object representing the graph.
+ * @param     [in] outputTensorName The name of the output tensor to find the producing node for.
+ * @pre       The GraphProto object g should be valid and contain nodes with proper input and output
+ * lists.
+ * @post      None
+ * @exception None
+ * @return    The name of the node that produces the specified output tensor, or an empty string if
+ * no such node is found.
+ */
+std::string findInputNode(const onnx::GraphProto &g, const std::string &outputTensorName);
+/**
+ * @brief     Collects the names of all nodes in the given ONNX graph.
+ *
+ * @param     [in] graph The ONNX GraphProto object representing the graph.
+ * @pre       The GraphProto object graph should be valid and contain nodes with proper names.
+ * @post      None
+ * @exception None
+ * @return    An unordered set containing the names of all nodes in the graph.
+ */
+std::unordered_set<std::string> collectNodeNames(const onnx::GraphProto &graph);
+/**
+ * @brief     Merges nodes from the source graph into the target graph.
+ *
+ * @param     [in,out] targetGraph The ONNX GraphProto object to which nodes will be added.
+ * @param     [in] sourceGraph The ONNX GraphProto object from which nodes will be copied.
+ * @pre       Both GraphProto objects should be valid.
+ * @post      Nodes from sourceGraph are added to targetGraph.
+ * @exception Exits the program with an error message if the number of nodes in targetGraph does not
+ * match the expected size after merging.
+ * @return    None
+ */
+void mergeGraphs(onnx::GraphProto &targetGraph, onnx::GraphProto &sourceGraph);
+
+class Graph
+{
+private:
+  /* no data members are declared */
+public:
+  Graph() {}
+  ~Graph() {}
+  /**
+   * @brief     Loads an ONNX model from a file and returns the graph contained within.
+   *
+   * @param     [in] path The file path to the ONNX model.
+   * @pre       The file specified by path should exist and be a valid ONNX model.
+   * @post      The ONNX model is parsed and its graph is returned.
+   * @exception NOTE(review): said to exit the program if the file cannot be opened - confirm.
+   * @return    The ONNX GraphProto object representing the graph from the model.
+   */
+  onnx::GraphProto GetGraphFromOnnx(std::string &path);
+};
+struct graph_adjacency_node // going by the name, one node entry of a graph adjacency list
+{
+  std::vector<int> output_node_index; // presumably indices of the nodes fed by this one - TODO confirm
+  int rank;                           // NOTE(review): meaning not visible in this header
+  std::string name;
+  int index;
+};
+#endif
diff --git a/tools/onnx-subgraph/include/partition.h b/tools/onnx-subgraph/include/partition.h
new file mode 100644
index 00000000000..48ac51e9328
--- /dev/null
+++ b/tools/onnx-subgraph/include/partition.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef PARTITION_H
+#define PARTITION_H
+
+#include "onnx.pb.h"
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <string>
+#include <algorithm>
+#include "device.h"
+#include "graph.h"
+
+// deprecated
+// Selects how a graph is partitioned; kept only because PartitionGraph still takes it.
+// NOTE(review): "SPILTE" looks like a misspelling of "SPLIT". The enumerator names are part of
+// the public interface, so they are intentionally left unchanged here.
+enum PartitionStrategy
+{
+  SPILTE_CPU_STRUCTURE_FIRST,
+  SPILTE_NPU_STRUCTURE_FIRST,
+  AUTOMATIC_SEARCH
+};
+
+/**
+ * @brief Stateless entry point for splitting an ONNX graph into subgraphs.
+ */
+class Partition
+{
+private:
+  // No data members are declared.
+public:
+  Partition() {}
+  ~Partition() {}
+  /**
+   * @brief     Partition the ONNX graph into subgraphs and produce cutting instructions.
+   *
+   * @param     [in] g The ONNX graph to be partitioned.
+   * @param     [in] d The device information for partitioning (passed by non-const reference).
+   * @param     [in] strategy The partition strategy to be used (deprecated).
+   * @param     [in] node_io_size The input/output size information for each node.
+   * @pre       The ONNX graph should be valid and the device information should be properly set.
+   * @post      The graph is partitioned into subgraphs, and the results are stored in Subgraphs and
+   * otherSubgraphs.
+   *            NOTE(review): Subgraphs and otherSubgraphs are not members of this class; confirm
+   *            where the implementation stores them.
+   * @exception None
+   * @return    None
+   */
+  // NOTE(review): std::unordered_map is used below, but this header does not include
+  // <unordered_map> directly; it currently relies on another header to provide it.
+  void PartitionGraph(const onnx::GraphProto &g, Device &d, PartitionStrategy strategy,
+                      const std::unordered_map<std::string, NodeIOSize> &node_io_size);
+};
+#endif
diff --git a/tools/onnx-subgraph/model_inference.py b/tools/onnx-subgraph/model_inference.py
new file mode 100644
index 00000000000..7e41f114ef4
--- /dev/null
+++ b/tools/onnx-subgraph/model_inference.py
@@ -0,0 +1,352 @@
+# Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from skimage.metrics import mean_squared_error, peak_signal_noise_ratio, structural_similarity
+from skimage.io import imread
+import onnxruntime as ort
+import numpy as np
+import pandas as pd
+import torch
+import onnx
+import pdb
+import re
+import os
+
+from quant import quant_conv_forward_save_output
+
+
+class ModelInference:
+    """
+    This class is used to infer multiple onnx models.
+    Parameters:
+        model_path: Path to the model files.
+        subgraphsiostxt_path: Path to the txt file that describes the structure of the model graph.
+    Output:
+        outputs[0]: Inference result from the model.
+    Description:
+        Here, subgraphsiostxt_path is a txt file that describes the structure of the model graph and is used to get input/output node names. 
+        The model_path contains paths to multiple onnx files. The load_sessions function will sort the onnx models in the model_path according to the order specified in subgraphsiostxt_path. 
+        It then infers the sorted onnx models, returns the sessions data to self.sessions, and returns the sorted sequence to self.sorted_file_paths. 
+        Finally, it infers the sessions based on the initial data provided by initial_input_data and returns the inference results.
+    """
+    def __init__(self, model_path, subgraphsiostxt_path):
+
+        self.model_path = model_path
+        self.subgraphsiostxt_path = subgraphsiostxt_path
+        self.sessions, self.sorted_file_paths = self.load_sessions()
+
+    def load_sessions(self):
+        with open(self.subgraphsiostxt_path, 'r') as file:
+            content = file.read()
+        subgraph_order_map = {}
+        matches = re.findall(r'(\w+)subgraph(\d+): order(\d+)', content)
+
+        for match in matches:
+            subgraph_type, subgraph_number, order = match
+            file_path = os.path.join(self.model_path,
+                                     f"{subgraph_type}subgraph{subgraph_number}.onnx")
+            if int(order) in subgraph_order_map:
+                subgraph_order_map[int(order)].append(file_path)
+            else:
+                subgraph_order_map[int(order)] = [file_path]
+
+        sorted_file_paths = []
+        for order in sorted(subgraph_order_map.keys()):
+            sorted_file_paths.extend(subgraph_order_map[order])
+
+        sessions = [ort.InferenceSession(model) for model in sorted_file_paths]
+        return sessions, sorted_file_paths
+
+    def inference(self, initial_input_data):
+        input_data = initial_input_data
+        for i, (session,
+                model_file) in enumerate(zip(self.sessions, self.sorted_file_paths)):
+
+            input_names = [inp.name for inp in session.get_inputs()]
+            model_input_data = {name: input_data[name] for name in input_names}
+            outputs = session.run(None, model_input_data)
+            output_names = [out.name for out in session.get_outputs()]
+
+            if i < len(self.sessions) - 1:
+                for output, output_name in zip(outputs, output_names):
+                    input_data[output_name] = output
+        return outputs[0]
+
+    def infer_single_onnx_model(model_file, input_data):
+        session = ort.InferenceSession(model_file)
+        outputs = session.run(None, input_data)
+        return outputs[0]
+
+
+class PcaInference:
+    """
+    This class uses PCA for compression and inferring multiple ONNX models.
+    Parameters:
+        model_path: Path to the onnx model files.
+        subgraphsiostxt_path: Path to the txt file that describes the structure of the model graph.
+        endwithconv_path: Path to a txt file recording the onnx ending with convolution.
+        initial_input_data: Initial input data.
+        num: Inference times, providing the model name based on the number of times.
+        output_dir: Root directory for saving inference results.
+    Output:
+        outputs: Inference results.
+    Description:
+        A result_pt directory is generated in between to save intermediate results; however, not generating this directory does not affect experimental results.
+        The result folder saves the output of the convolution layer to calculate the compression rate. All results are saved in the output_dir folder.
+    """
+    def __init__(self, model_path, subgraphsiostxt_path, endwithconv_path, output_dir):
+        self.model_path = model_path
+        self.subgraphsiostxt_path = subgraphsiostxt_path
+        self.endwithconv_path = endwithconv_path
+        self.output_dir = output_dir
+        (
+            self.sessions,
+            self.conv_output_layer_map,
+            self.sorted_file_paths,
+        ) = self.load_sessions()
+
+    def load_sessions(self):
+        with open(self.subgraphsiostxt_path, 'r') as file:
+            content = file.read()
+        subgraph_order_map = {}
+        matches = re.findall(r'(\w+)subgraph(\d+): order(\d+)', content)
+
+        for match in matches:
+            subgraph_type, subgraph_number, order = match
+            file_path = os.path.join(self.model_path,
+                                     f"{subgraph_type}subgraph{subgraph_number}.onnx")
+            if int(order) in subgraph_order_map:
+                subgraph_order_map[int(order)].append(file_path)
+            else:
+                subgraph_order_map[int(order)] = [file_path]
+
+        sorted_file_paths = []
+        for order in sorted(subgraph_order_map.keys()):
+            sorted_file_paths.extend(subgraph_order_map[order])
+
+        sessions = []
+        conv_output_layer_map = {}
+        for model_file in sorted_file_paths:
+            session = ort.InferenceSession(model_file)
+            sessions.append(session)
+
+            conv_outputs = {}
+            if self.onnx_end_conv(model_file):
+                model = onnx.load(model_file)
+                for idx, node in enumerate(model.graph.node):
+                    if node.op_type == 'Conv':
+                        for output_name in node.output:
+                            if output_name not in conv_outputs:
+                                conv_outputs[output_name] = idx + 1
+                conv_output_layer_map[model_file] = conv_outputs
+
+        return sessions, conv_output_layer_map, sorted_file_paths
+
+    def load_onnx_dict(self):
+        onnx_dict = []
+        with open(self.endwithconv_path, 'r') as file:
+            content = file.read()
+            numbers = re.findall(r'\b\d+\b', content)
+            for number in numbers:
+                onnx_path = os.path.join(self.model_path, f"NPUsubgraph{number}.onnx")
+                onnx_dict.append(onnx_path)
+        return onnx_dict
+
+    def onnx_end_conv(self, model_file):
+        for onnx in self.load_onnx_dict():
+            if onnx == model_file:
+                return True
+        return False
+
+    def check_and_convert_inputs(self, model_input_data):
+        for key, value in model_input_data.items():
+            if isinstance(value, torch.Tensor):
+                model_input_data[key] = value.numpy()
+            elif not isinstance(value, np.ndarray):
+                raise TypeError(
+                    f"Input data for '{key}' is not a NumPy array. Got type: {type(value)}"
+                )
+        return model_input_data
+
+    def decomp(self, compressed_tensor, ru, rbits, num_bits=8):
+        decompressed_tensor = torch.dequantize(compressed_tensor)
+        decompressed_tensor = decompressed_tensor.numpy()
+        if not isinstance(decompressed_tensor, np.ndarray):
+            raise TypeError("The decompressed tensor is not a NumPy array.")
+        return decompressed_tensor
+
+    def inference(self, initial_input_data, num):
+        input_data = initial_input_data
+        aux_data = {}
+        record_model_name = None
+
+        for i, (session,
+                model_file) in enumerate(zip(self.sessions, self.sorted_file_paths)):
+            input_names = [inp.name for inp in session.get_inputs()]
+
+            if self.onnx_end_conv(record_model_name):
+                for name in input_names:
+                    if name in input_data and name in aux_data:
+                        compressed_tensor = input_data[name]
+                        ru, rbits = aux_data[name]
+                        decompressed_tensor = self.decomp(compressed_tensor, ru, rbits)
+                        input_data[name] = decompressed_tensor
+
+            model_input_data = {name: input_data[name] for name in input_names}
+            self.check_and_convert_inputs(model_input_data)
+            outputs = session.run(None, model_input_data)
+            output_names = [out.name for out in session.get_outputs()]
+            conv_outputs = self.conv_output_layer_map.get(model_file, {})
+
+            for output_name, output in zip(output_names, outputs):
+                if output_name in conv_outputs:
+                    output_tensor = torch.tensor(output)
+                    layer = conv_outputs[output_name]
+                    output_tensor = quant_conv_forward_save_output(
+                        output_tensor,
+                        layer,
+                        count=1,
+                        bit=8,
+                        i=num,
+                        output_dir=self.output_dir)
+                    input_data[output_name] = output_tensor
+                else:
+                    input_data[output_name] = output
+            record_model_name = model_file
+
+        return outputs[0]
+
+
+class ImageMetricsEvaluator:
+    """
+    Used to evaluate image quality, including MSE, PSNR, and SSIM.
+
+    Parameters:
+        original_dir (str): Directory containing the original images.
+        generated_dir (str): Directory containing the generated images.
+        compression_dir (str): Directory containing the compression information text files.
+    Output:
+        output_file (str): Path to the output file (Excel).
+    """
+    def __init__(self, original_dir, generated_dir, compression_dir, output_file):
+
+        self.original_dir = original_dir
+        self.generated_dir = generated_dir
+        self.compression_dir = compression_dir
+        self.output_file = output_file
+
+    def calculate_image_metrics(self, original_image_path, generated_image_path):
+        """Calculate MSE, PSNR, and SSIM between the given original and generated images."""
+        original_image = imread(original_image_path)
+        generated_image = imread(generated_image_path)
+
+        if original_image.shape != generated_image.shape:
+            raise ValueError('两个图像的尺寸必须相同')
+
+        mse = mean_squared_error(original_image, generated_image)
+        psnr = peak_signal_noise_ratio(original_image, generated_image)
+
+        min_dim = min(original_image.shape[:2])
+        win_size = min(7, min_dim)
+        if win_size % 2 == 0:
+            win_size -= 1
+        if win_size < 3:
+            win_size = 3
+
+        ssim = structural_similarity(original_image,
+                                     generated_image,
+                                     multichannel=True,
+                                     win_size=win_size,
+                                     channel_axis=-1)
+
+        return mse, psnr, ssim
+
+    def calculate_compression_rate(self, file_path):
+        """Read from a specified text file and calculate the average compression rate."""
+        with open(file_path) as f:
+            lines = f.readlines()
+            rate_all = sum(
+                float(line.split(',')[0]) * float(line.split(',')[1]) for line in lines)
+            all_ = sum(float(line.split(',')[1]) for line in lines)
+            return rate_all / all_ if all_ != 0 else None
+
+    def find_matching_compression_file(self, image_name):
+        """Find the corresponding compression info file based on the image filename."""
+        base_name, _ = os.path.splitext(image_name)
+        number = re.search(r'_(\d+)', base_name)
+        if number:
+            number = number.group(1)
+            compression_files = [
+                f for f in os.listdir(self.compression_dir)
+                if f.startswith(f'result_{number}') and f.endswith('.txt')
+            ]
+            if compression_files:
+                return os.path.join(self.compression_dir, compression_files[0])
+        return None
+
+    def compare_images_in_directories(self):
+        """Compare all images in two directories and save the results to an Excel file."""
+        def sort_key(filename):
+            parts = filename.split('_')
+            try:
+                return int(parts[1].split('.')[0]) if len(parts) > 1 else 0
+            except (ValueError, IndexError):
+                print(f"Warning: Could not parse number from filename {filename}")
+                return 0
+
+        original_images = sorted(
+            [f for f in os.listdir(self.original_dir) if f.endswith('.png')],
+            key=sort_key)
+        generated_images = sorted(
+            [f for f in os.listdir(self.generated_dir) if f.endswith('.png')],
+            key=sort_key)
+
+        results = []
+
+        for orig_img_name, gen_img_name in zip(original_images, generated_images):
+            orig_img_path = os.path.join(self.original_dir, orig_img_name)
+            gen_img_path = os.path.join(self.generated_dir, gen_img_name)
+
+            try:
+                mse, psnr, ssim = self.calculate_image_metrics(orig_img_path,
+                                                               gen_img_path)
+                compression_file_path = self.find_matching_compression_file(orig_img_name)
+                compression_rate = self.calculate_compression_rate(
+                    compression_file_path) if compression_file_path else None
+                results.append({
+                    'Original Image': orig_img_name,
+                    'Generated Image': gen_img_name,
+                    'MSE': mse,
+                    'PSNR': psnr,
+                    'SSIM': ssim,
+                    'Compression Rate': compression_rate
+                })
+            except Exception as e:
+                print(f"Error processing images {orig_img_name} and {gen_img_name}: {e}")
+
+        df = pd.DataFrame(results)
+
+        output_dir = os.path.dirname(self.output_file)
+        if not os.path.exists(output_dir):
+            os.makedirs(output_dir)
+
+        try:
+            df.to_excel(self.output_file, index=False)
+            print(f'Results have been saved to {self.output_file}')
+        except PermissionError:
+            print(
+                f"Permission denied: Unable to write to {self.output_file}. Please check file permissions or close the file if it is open in another program."
+            )
+        except Exception as e:
+            print(f"An error occurred while saving the results: {e}")
diff --git a/tools/onnx-subgraph/model_inference_multiple_output.py b/tools/onnx-subgraph/model_inference_multiple_output.py
new file mode 100644
index 00000000000..6b6d96aeacf
--- /dev/null
+++ b/tools/onnx-subgraph/model_inference_multiple_output.py
@@ -0,0 +1,357 @@
+# Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from skimage.metrics import mean_squared_error, peak_signal_noise_ratio, structural_similarity
+from skimage.io import imread
+import onnxruntime as ort
+import numpy as np
+import pandas as pd
+import torch
+import onnx
+import pdb
+import re
+import os
+
+from quant import quant_conv_forward_save_output
+
+
+class ModelInference:
+    """
+    This class is used to infer multiple onnx models.
+    Parameters:
+        model_path: Path to the model files.
+        subgraphsiostxt_path: Path to the txt file that describes the structure of the model graph.
+    Output:
+        outputs[0]: Inference result from the model.
+    Description:
+        Here, subgraphsiostxt_path is a txt file that describes the structure of the model graph and is used to get input/output node names. 
+        The model_path contains paths to multiple onnx files. The load_sessions function will sort the onnx models in the model_path according to the order specified in subgraphsiostxt_path. 
+        It then infers the sorted onnx models, returns the sessions data to self.sessions, and returns the sorted sequence to self.sorted_file_paths. 
+        Finally, it infers the sessions based on the initial data provided by initial_input_data and returns the inference results.
+    """
+    def __init__(self, model_path, subgraphsiostxt_path):
+
+        self.model_path = model_path
+        self.subgraphsiostxt_path = subgraphsiostxt_path
+        self.sessions, self.sorted_file_paths = self.load_sessions()
+
+    def load_sessions(self):
+        with open(self.subgraphsiostxt_path, 'r') as file:
+            content = file.read()
+        subgraph_order_map = {}
+        matches = re.findall(r'(\w+)subgraph(\d+): order(\d+)', content)
+
+        for match in matches:
+            subgraph_type, subgraph_number, order = match
+            # lower_subgraph_type = subgraph_type.lower()
+            file_path = os.path.join(self.model_path,
+                                     f"{subgraph_type}subgraph{subgraph_number}.onnx")
+            if int(order) in subgraph_order_map:
+                subgraph_order_map[int(order)].append(file_path)
+            else:
+                subgraph_order_map[int(order)] = [file_path]
+
+        sorted_file_paths = []
+        for order in sorted(subgraph_order_map.keys()):
+            sorted_file_paths.extend(subgraph_order_map[order])
+
+        sessions = [ort.InferenceSession(model) for model in sorted_file_paths]
+        return sessions, sorted_file_paths
+
+    def inference(self, initial_input_data, output_names_to_collect=None):
+        input_data = initial_input_data
+        collected_outputs = {}
+
+        for i, (session,
+                model_file) in enumerate(zip(self.sessions, self.sorted_file_paths)):
+            input_names = [inp.name for inp in session.get_inputs()]
+            output_names = [out.name for out in session.get_outputs()]
+            model_input_data = {name: input_data[name] for name in input_names}
+            outputs = session.run(None, model_input_data)
+            current_model_outputs = dict(zip(output_names, outputs))
+            if output_names_to_collect is not None:
+                for output_name in output_names_to_collect:
+                    if output_name in current_model_outputs:
+                        collected_outputs[output_name] = current_model_outputs[
+                            output_name]
+
+            if i < len(self.sessions) - 1:
+                input_data.update(current_model_outputs)
+        return collected_outputs
+
+    def infer_single_onnx_model(model_file, input_data):
+        session = ort.InferenceSession(model_file)
+        outputs = session.run(None, input_data)
+        output_names = [output.name for output in session.get_outputs()]
+        output_dict = {name: output for name, output in zip(output_names, outputs)}
+        return output_dict
+
+
+class PcaInference:
+    """
+    This class uses PCA for compression and inferring multiple ONNX models.
+    Parameters:
+        model_path: Path to the onnx model files.
+        subgraphsiostxt_path: Path to the txt file that describes the structure of the model graph.
+        endwithconv_path: Path to a txt file recording the onnx ending with convolution.
+        initial_input_data: Initial input data.
+        num: Inference times, providing the model name based on the number of times.
+        output_dir: Root directory for saving inference results.
+    Output:
+        outputs: Inference results.
+    Description:
+        A result_pt directory is generated in between to save intermediate results; however, not generating this directory does not affect experimental results.
+        The result folder saves the output of the convolution layer to calculate the compression rate. All results are saved in the output_dir folder.
+    """
+    def __init__(self, model_path, subgraphsiostxt_path, endwithconv_path, output_dir):
+        self.model_path = model_path
+        self.subgraphsiostxt_path = subgraphsiostxt_path
+        self.endwithconv_path = endwithconv_path
+        self.output_dir = output_dir
+        (
+            self.sessions,
+            self.conv_output_layer_map,
+            self.sorted_file_paths,
+        ) = self.load_sessions()
+
+    def load_sessions(self):
+        with open(self.subgraphsiostxt_path, 'r') as file:
+            content = file.read()
+        subgraph_order_map = {}
+        matches = re.findall(r'(\w+)subgraph(\d+): order(\d+)', content)
+
+        for match in matches:
+            subgraph_type, subgraph_number, order = match
+            file_path = os.path.join(self.model_path,
+                                     f"{subgraph_type}subgraph{subgraph_number}.onnx")
+            if int(order) in subgraph_order_map:
+                subgraph_order_map[int(order)].append(file_path)
+            else:
+                subgraph_order_map[int(order)] = [file_path]
+
+        sorted_file_paths = []
+        for order in sorted(subgraph_order_map.keys()):
+            sorted_file_paths.extend(subgraph_order_map[order])
+
+        sessions = []
+        conv_output_layer_map = {}
+        for model_file in sorted_file_paths:
+            session = ort.InferenceSession(model_file)
+            sessions.append(session)
+
+            conv_outputs = {}
+            if self.onnx_end_conv(model_file):
+                model = onnx.load(model_file)
+                for idx, node in enumerate(model.graph.node):
+                    if node.op_type == 'Conv':
+                        for output_name in node.output:
+                            if output_name not in conv_outputs:
+                                conv_outputs[output_name] = idx + 1
+                conv_output_layer_map[model_file] = conv_outputs
+
+        return sessions, conv_output_layer_map, sorted_file_paths
+
+    def load_onnx_dict(self):
+        onnx_dict = []
+        with open(self.endwithconv_path, 'r') as file:
+            content = file.read()
+            numbers = re.findall(r'\b\d+\b', content)
+            for number in numbers:
+                onnx_path = os.path.join(self.model_path, f"NPUsubgraph{number}.onnx")
+                onnx_dict.append(onnx_path)
+        return onnx_dict
+
+    def onnx_end_conv(self, model_file):
+        for onnx in self.load_onnx_dict():
+            if onnx == model_file:
+                return True
+        return False
+
+    def check_and_convert_inputs(self, model_input_data):
+        for key, value in model_input_data.items():
+            if isinstance(value, torch.Tensor):
+                model_input_data[key] = value.numpy()
+            elif not isinstance(value, np.ndarray):
+                raise TypeError(
+                    f"Input data for '{key}' is not a NumPy array. Got type: {type(value)}"
+                )
+        return model_input_data
+
+    def decomp(self, compressed_tensor, ru, rbits, num_bits=8):
+        decompressed_tensor = torch.dequantize(compressed_tensor)
+        decompressed_tensor = decompressed_tensor.numpy()
+        if not isinstance(decompressed_tensor, np.ndarray):
+            raise TypeError("The decompressed tensor is not a NumPy array.")
+        return decompressed_tensor
+
+    def inference(self, initial_input_data, num):
+        input_data = initial_input_data
+        aux_data = {}
+        record_model_name = None
+
+        for i, (session,
+                model_file) in enumerate(zip(self.sessions, self.sorted_file_paths)):
+            input_names = [inp.name for inp in session.get_inputs()]
+
+            if self.onnx_end_conv(record_model_name):
+                for name in input_names:
+                    if name in input_data and name in aux_data:
+                        compressed_tensor = input_data[name]
+                        ru, rbits = aux_data[name]
+                        decompressed_tensor = self.decomp(compressed_tensor, ru, rbits)
+                        input_data[name] = decompressed_tensor
+
+            model_input_data = {name: input_data[name] for name in input_names}
+            self.check_and_convert_inputs(model_input_data)
+            outputs = session.run(None, model_input_data)
+            output_names = [out.name for out in session.get_outputs()]
+            conv_outputs = self.conv_output_layer_map.get(model_file, {})
+
+            for output_name, output in zip(output_names, outputs):
+                if output_name in conv_outputs:
+                    output_tensor = torch.tensor(output)
+                    layer = conv_outputs[output_name]
+                    output_tensor = quant_conv_forward_save_output(
+                        output_tensor,
+                        layer,
+                        count=1,
+                        bit=8,
+                        i=num,
+                        output_dir=self.output_dir)
+                    input_data[output_name] = output_tensor
+                else:
+                    input_data[output_name] = output
+            record_model_name = model_file
+
+        return outputs[0]
+
+
+class ImageMetricsEvaluator:
+    """
+    Used to evaluate image quality, including MSE, PSNR, and SSIM.
+
+    Parameters:
+        original_dir (str): Directory containing the original images.
+        generated_dir (str): Directory containing the generated images.
+        compression_dir (str): Directory containing the compression information text files.
+    Output:
+        output_file (str): Path to the output file (Excel).
+    """
+    def __init__(self, original_dir, generated_dir, compression_dir, output_file):
+
+        self.original_dir = original_dir
+        self.generated_dir = generated_dir
+        self.compression_dir = compression_dir
+        self.output_file = output_file
+
+    def calculate_image_metrics(self, original_image_path, generated_image_path):
+        original_image = imread(original_image_path)
+        generated_image = imread(generated_image_path)
+
+        if original_image.shape != generated_image.shape:
+            raise ValueError('两个图像的尺寸必须相同')
+
+        mse = mean_squared_error(original_image, generated_image)
+        psnr = peak_signal_noise_ratio(original_image, generated_image)
+
+        min_dim = min(original_image.shape[:2])
+        win_size = min(7, min_dim)
+        if win_size % 2 == 0:
+            win_size -= 1
+        if win_size < 3:
+            win_size = 3
+
+        ssim = structural_similarity(original_image,
+                                     generated_image,
+                                     multichannel=True,
+                                     win_size=win_size,
+                                     channel_axis=-1)
+
+        return mse, psnr, ssim
+
+    def calculate_compression_rate(self, file_path):
+        with open(file_path) as f:
+            lines = f.readlines()
+            rate_all = sum(
+                float(line.split(',')[0]) * float(line.split(',')[1]) for line in lines)
+            all_ = sum(float(line.split(',')[1]) for line in lines)
+            return rate_all / all_ if all_ != 0 else None
+
+    def find_matching_compression_file(self, image_name):
+        base_name, _ = os.path.splitext(image_name)
+        number = re.search(r'_(\d+)', base_name)
+        if number:
+            number = number.group(1)
+            compression_files = [
+                f for f in os.listdir(self.compression_dir)
+                if f.startswith(f'result_{number}') and f.endswith('.txt')
+            ]
+            if compression_files:
+                return os.path.join(self.compression_dir, compression_files[0])
+        return None
+
+    def compare_images_in_directories(self):
+        def sort_key(filename):
+            parts = filename.split('_')
+            try:
+                return int(parts[1].split('.')[0]) if len(parts) > 1 else 0
+            except (ValueError, IndexError):
+                print(f"Warning: Could not parse number from filename {filename}")
+                return 0
+
+        original_images = sorted(
+            [f for f in os.listdir(self.original_dir) if f.endswith('.png')],
+            key=sort_key)
+        generated_images = sorted(
+            [f for f in os.listdir(self.generated_dir) if f.endswith('.png')],
+            key=sort_key)
+
+        results = []
+
+        for orig_img_name, gen_img_name in zip(original_images, generated_images):
+            orig_img_path = os.path.join(self.original_dir, orig_img_name)
+            gen_img_path = os.path.join(self.generated_dir, gen_img_name)
+
+            try:
+                mse, psnr, ssim = self.calculate_image_metrics(orig_img_path,
+                                                               gen_img_path)
+                compression_file_path = self.find_matching_compression_file(orig_img_name)
+                compression_rate = self.calculate_compression_rate(
+                    compression_file_path) if compression_file_path else None
+                results.append({
+                    'Original Image': orig_img_name,
+                    'Generated Image': gen_img_name,
+                    'MSE': mse,
+                    'PSNR': psnr,
+                    'SSIM': ssim,
+                    'Compression Rate': compression_rate
+                })
+            except Exception as e:
+                print(f"Error processing images {orig_img_name} and {gen_img_name}: {e}")
+
+        df = pd.DataFrame(results)
+
+        output_dir = os.path.dirname(self.output_file)
+        if not os.path.exists(output_dir):
+            os.makedirs(output_dir)
+
+        try:
+            df.to_excel(self.output_file, index=False)
+            print(f'Results have been saved to {self.output_file}')
+        except PermissionError:
+            print(
+                f"Permission denied: Unable to write to {self.output_file}. Please check file permissions or close the file if it is open in another program."
+            )
+        except Exception as e:
+            print(f"An error occurred while saving the results: {e}")
diff --git a/tools/onnx-subgraph/onnx.proto b/tools/onnx-subgraph/onnx.proto
new file mode 100644
index 00000000000..6a3abfdd109
--- /dev/null
+++ b/tools/onnx-subgraph/onnx.proto
@@ -0,0 +1,871 @@
+//
+// WARNING: This file is automatically generated!  Please edit onnx.in.proto.
+//
+
+
+// SPDX-License-Identifier: Apache-2.0
+
+
+syntax = "proto2";
+
+package onnx;
+
+// Overview
+//
+// ONNX is an open specification that is comprised of the following components:
+//
+// 1)  A definition of an extensible computation graph model.
+// 2)  Definitions of standard data types.
+// 3)  Definitions of built-in operators.
+//
+// This document describes the syntax of models and their computation graphs,
+// as well as the standard data types. Together, they are referred to as the ONNX
+// Intermediate Representation, or 'IR' for short.
+//
+// The normative semantic specification of the ONNX IR is found in docs/IR.md.
+// Definitions of the built-in neural network operators may be found in docs/Operators.md.
+
+// Notes
+//
+// Protobuf compatibility
+//
+// To simplify framework compatibility, ONNX is defined using the subset of protobuf
+// that is compatible with both protobuf v2 and v3. This means that we do not use any
+// protobuf features that are only available in one of the two versions.
+//
+// Here are the most notable contortions we have to carry out to work around
+// these limitations:
+//
+//   - No 'map' (added protobuf 3.0). We instead represent mappings as lists
+//     of key-value pairs, where order does not matter and duplicates
+//     are not allowed.
+
+
+// Versioning
+//
+// ONNX versioning is specified in docs/IR.md and elaborated on in docs/Versioning.md
+//
+// To be compatible with both proto2 and proto3, we will use a version number
+// that is not defined by the default value but an explicit enum number.
+enum Version {
+  // proto3 requires the first enum value to be zero.
+  // We add this just to appease the compiler.
+  _START_VERSION = 0;
+  // The version field is always serialized and we will use it to store the
+  // version that the graph is generated from. This helps us set up version
+  // control.
+  // For the IR, we are using simple numbers starting with 0x00000001,
+  // which was the version we published on Oct 10, 2017.
+  IR_VERSION_2017_10_10 = 0x0000000000000001;
+
+  // IR VERSION 2 published on Oct 30, 2017
+  // - Added type discriminator to AttributeProto to support proto3 users
+  IR_VERSION_2017_10_30 = 0x0000000000000002;
+
+  // IR VERSION 3 published on Nov 3, 2017
+  // - For operator versioning:
+  //    - Added new message OperatorSetIdProto
+  //    - Added opset_import in ModelProto
+  // - For vendor extensions, added domain in NodeProto
+  IR_VERSION_2017_11_3 = 0x0000000000000003;
+
+  // IR VERSION 4 published on Jan 22, 2019
+  // - Relax constraint that initializers should be a subset of graph inputs
+  // - Add type BFLOAT16
+  IR_VERSION_2019_1_22 = 0x0000000000000004;
+
+  // IR VERSION 5 published on March 18, 2019
+  // - Add message TensorAnnotation.
+  // - Add quantization annotation in GraphProto to map tensor with its scale and zero point quantization parameters.
+  IR_VERSION_2019_3_18 = 0x0000000000000005;
+
+  // IR VERSION 6 published on Sep 19, 2019
+  // - Add support for sparse tensor constants stored in model.
+  //   - Add message SparseTensorProto
+  //   - Add sparse initializers
+  IR_VERSION_2019_9_19 = 0x0000000000000006;
+
+  // IR VERSION 7 published on May 8, 2020
+  // - Add support to allow function body graph to rely on multiple external operator sets.
+  // - Add a list to promote inference graph's initializers to global and
+  //   mutable variables. Global variables are visible in all graphs of the
+  //   stored models.
+  // - Add message TrainingInfoProto to store initialization
+  //   method and training algorithm. The execution of TrainingInfoProto
+  //   can modify the values of mutable variables.
+  // - Implicitly add inference graph into each TrainingInfoProto's algorithm.
+  IR_VERSION_2020_5_8 = 0x0000000000000007;
+
+  // IR VERSION 8 published on July 30, 2021
+  // Introduce TypeProto.SparseTensor
+  // Introduce TypeProto.Optional
+  // Added a list of FunctionProtos local to the model
+  // Deprecated since_version and operator status from FunctionProto
+  IR_VERSION_2021_7_30 = 0x0000000000000008;
+
+  // IR VERSION 9 published on May 5, 2023
+  // Added AttributeProto to FunctionProto so that default attribute values can be set.
+  // Added FLOAT8E4M3FN, FLOAT8E4M3FNUZ, FLOAT8E5M2, FLOAT8E5M2FNUZ.
+  IR_VERSION_2023_5_5 = 0x0000000000000009;
+
+  // IR VERSION 10 published on TBD
+  // Added UINT4, INT4.
+  IR_VERSION = 0x000000000000000A;
+}
+
+// Attributes
+//
+// A named attribute containing either singular float, integer, string, graph,
+// and tensor values, or repeated float, integer, string, graph, and tensor values.
+// An AttributeProto MUST contain the name field, and *only one* of the
+// following content fields, effectively enforcing a C/C++ union equivalent.
+message AttributeProto {
+  reserved 12, 16 to 19;
+  reserved "v";
+
+  // Note: this enum is structurally identical to the OpSchema::AttrType
+  // enum defined in schema.h.  If you rev one, you likely need to rev the other.
+  enum AttributeType {
+    UNDEFINED = 0;
+    FLOAT = 1;
+    INT = 2;
+    STRING = 3;
+    TENSOR = 4;
+    GRAPH = 5;
+    SPARSE_TENSOR = 11;
+    TYPE_PROTO = 13;
+
+    FLOATS = 6;
+    INTS = 7;
+    STRINGS = 8;
+    TENSORS = 9;
+    GRAPHS = 10;
+    SPARSE_TENSORS = 12;
+    TYPE_PROTOS = 14;
+  }
+
+  // The name field MUST be present for this version of the IR.
+  optional string name = 1;           // namespace Attribute
+
+  // If ref_attr_name is not empty, ref_attr_name is the attribute name in the parent function.
+  // In this case, this AttributeProto does not contain data; it is a reference to an attribute
+  // in the parent scope.
+  // NOTE: This should ONLY be used in a function (sub-graph). It is invalid to use it in the main graph.
+  optional string ref_attr_name = 21;
+
+  // Human-readable documentation for this attribute. Markdown is allowed.
+  optional string doc_string = 13;
+
+  // The type field MUST be present for this version of the IR.
+  // For 0.0.1 versions of the IR, this field was not defined, and
+  // implementations needed to use has_field heuristics to determine
+  // which value field was in use.  For IR_VERSION 0.0.2 or later, this
+  // field MUST be set and match the f|i|s|t|... field in use.  This
+  // change was made to accommodate proto3 implementations.
+  optional AttributeType type = 20;   // discriminator that indicates which field below is in use
+
+  // Exactly ONE of the following fields must be present for this version of the IR
+  optional float f = 2;               // float
+  optional int64 i = 3;               // int
+  optional bytes s = 4;               // UTF-8 string
+  optional TensorProto t = 5;         // tensor value
+  optional GraphProto g = 6;          // graph
+  optional SparseTensorProto sparse_tensor = 22;  // sparse tensor value
+  // Do not use the field below; it is deprecated.
+  // optional ValueProto v = 12;         // value - subsumes everything but graph
+  optional TypeProto tp = 14;          // type proto
+
+  repeated float floats = 7;          // list of floats
+  repeated int64 ints = 8;            // list of ints
+  repeated bytes strings = 9;         // list of UTF-8 strings
+  repeated TensorProto tensors = 10;  // list of tensors
+  repeated GraphProto graphs = 11;    // list of graphs
+  repeated SparseTensorProto sparse_tensors = 23; // list of sparse tensors
+  repeated TypeProto type_protos = 15;// list of type protos
+}
+
+// Defines information on value, including the name, the type, and
+// the shape of the value.
+message ValueInfoProto {
+  // This field MUST be present in this version of the IR.
+  optional string name = 1;     // namespace Value
+  // This field MUST be present in this version of the IR for
+  // inputs and outputs of the top-level graph.
+  optional TypeProto type = 2;
+  // Human-readable documentation for this value. Markdown is allowed.
+  optional string doc_string = 3;
+  // Named metadata values; keys should be distinct.
+  repeated StringStringEntryProto metadata_props = 4;
+}
+
+// Nodes
+//
+// Computation graphs are made up of a DAG of nodes, which represent what is
+// commonly called a "layer" or "pipeline stage" in machine learning frameworks.
+//
+// For example, it can be a node of type "Conv" that takes in an image, a filter
+// tensor and a bias tensor, and produces the convolved output.
+message NodeProto {
+  repeated string input = 1;    // namespace Value
+  repeated string output = 2;   // namespace Value
+
+  // An optional identifier for this node in a graph.
+  // This field MAY be absent in this version of the IR.
+  optional string name = 3;     // namespace Node
+
+  // The symbolic identifier of the Operator to execute.
+  optional string op_type = 4;  // namespace Operator
+  // The domain of the OperatorSet that specifies the operator named by op_type.
+  optional string domain = 7;   // namespace Domain
+  // Overload identifier, used only to map this to a model-local function.
+  optional string overload = 8;
+
+  // Additional named attributes.
+  repeated AttributeProto attribute = 5;
+
+  // Human-readable documentation for this node. Markdown is allowed.
+  optional string doc_string = 6;
+
+  // Named metadata values; keys should be distinct.
+  repeated StringStringEntryProto metadata_props = 9;
+}
+
+// Training information
+// TrainingInfoProto stores information for training a model.
+// In particular, this defines two functionalities: an initialization-step
+// and a training-algorithm-step. Initialization resets the model
+// back to its original state as if no training has been performed.
+// Training algorithm improves the model based on input data.
+//
+// The semantics of the initialization-step is that the initializers
+// in ModelProto.graph and in TrainingInfoProto.algorithm are first
+// initialized as specified by the initializers in the graph, and then
+// updated by the "initialization_binding" in every instance in
+// ModelProto.training_info.
+//
+// The field "algorithm" defines a computation graph which represents a
+// training algorithm's step. After the execution of a
+// TrainingInfoProto.algorithm, the initializers specified by "update_binding"
+// may be immediately updated. If the targeted training algorithm contains
+// consecutive update steps (such as block coordinate descent methods),
+// the user needs to create a TrainingInfoProto for each step.
+message TrainingInfoProto {
+  // This field describes a graph to compute the initial tensors
+  // upon starting the training process. Initialization graph has no input
+  // and can have multiple outputs. Usually, trainable tensors in neural
+  // networks are randomly initialized. To achieve that, for each tensor,
+  // the user can put a random number operator such as RandomNormal or
+  // RandomUniform in TrainingInfoProto.initialization.node and assign its
+  // random output to the specific tensor using "initialization_binding".
+  // This graph can also set the initializers in "algorithm" in the same
+  // TrainingInfoProto; a use case is resetting the number of training
+  // iterations to zero.
+  //
+  // By default, this field is an empty graph and its evaluation does not
+  // produce any output. Thus, no initializer would be changed by default.
+  optional GraphProto initialization = 1;
+
+  // This field represents a training algorithm step. Given required inputs,
+  // it computes outputs to update initializers in its own or inference graph's
+  // initializer lists. In general, this field contains a loss node, a gradient
+  // node, an optimizer node, and an increment of the iteration count.
+  //
+  // An execution of the training algorithm step is performed by executing the
+  // graph obtained by combining the inference graph (namely "ModelProto.graph")
+  // and the "algorithm" graph. That is, the actual
+  // input/initializer/output/node/value_info/sparse_initializer list of
+  // the training graph is the concatenation of
+  // "ModelProto.graph.input/initializer/output/node/value_info/sparse_initializer"
+  // and "algorithm.input/initializer/output/node/value_info/sparse_initializer"
+  // in that order. This combined graph must satisfy the normal ONNX conditions.
+  // Now, let's provide a visualization of graph combination for clarity.
+  // Let the inference graph (i.e., "ModelProto.graph") be
+  //    tensor_a, tensor_b -> MatMul -> tensor_c -> Sigmoid -> tensor_d
+  // and the "algorithm" graph be
+  //    tensor_d -> Add -> tensor_e
+  // The combination process results in
+  //    tensor_a, tensor_b -> MatMul -> tensor_c -> Sigmoid -> tensor_d -> Add -> tensor_e
+  //
+  // Notice that an input of a node in the "algorithm" graph may reference the
+  // output of a node in the inference graph (but not the other way round). Also, an inference
+  // node cannot reference inputs of "algorithm". With these restrictions, the inference graph
+  // can always be run independently without training information.
+  //
+  // By default, this field is an empty graph and its evaluation does not
+  // produce any output. Evaluating the default training step never
+  // updates any initializers.
+  optional GraphProto algorithm = 2;
+
+  // This field specifies the bindings from the outputs of "initialization" to
+  // some initializers in "ModelProto.graph.initializer" and
+  // the "algorithm.initializer" in the same TrainingInfoProto.
+  // See "update_binding" below for details.
+  //
+  // By default, this field is empty and no initializer would be changed
+  // by the execution of "initialization".
+  repeated StringStringEntryProto initialization_binding = 3;
+
+  // Gradient-based training is usually an iterative procedure. In one gradient
+  // descent iteration, we apply
+  //
+  // x = x - r * g
+  //
+  // where "x" is the optimized tensor, "r" stands for learning rate, and "g" is
+  // gradient of "x" with respect to a chosen loss. To avoid adding assignments
+  // into the training graph, we split the update equation into
+  //
+  // y = x - r * g
+  // x = y
+  //
+  // The user needs to save "y = x - r * g" into TrainingInfoProto.algorithm. To
+  // tell that "y" should be assigned to "x", the field "update_binding" may
+  // contain a key-value pair of strings, "x" (key of StringStringEntryProto)
+  // and "y" (value of StringStringEntryProto).
+  // For a neural network with multiple trainable (mutable) tensors, there can
+  // be multiple key-value pairs in "update_binding".
+  //
+  // The initializers that appear as keys in "update_binding" are considered
+  // mutable variables. This implies some behaviors
+  // as described below.
+  //
+  //  1. We have only unique keys in all "update_binding"s so that two
+  //     variables may not have the same name. This ensures that one
+  //     variable is assigned at most once.
+  //  2. The keys must appear in names of "ModelProto.graph.initializer" or
+  //     "TrainingInfoProto.algorithm.initializer".
+  //  3. The values must be output names of "algorithm" or "ModelProto.graph.output".
+  //  4. Mutable variables are initialized to the value specified by the
+  //     corresponding initializer, and then potentially updated by
+  //     "initialization_binding"s and "update_binding"s in "TrainingInfoProto"s.
+  //
+  // This field usually contains names of trainable tensors
+  // (in ModelProto.graph), optimizer states such as momentums in advanced
+  // stochastic gradient methods (in TrainingInfoProto.algorithm),
+  // and number of training iterations (in TrainingInfoProto.algorithm).
+  //
+  // By default, this field is empty and no initializer would be changed
+  // by the execution of "algorithm".
+  repeated StringStringEntryProto update_binding = 4;
+}
+
+// Models
+//
+// ModelProto is a top-level file/container format for bundling a ML model and
+// associating its computation graph with metadata.
+//
+// The semantics of the model are described by the associated GraphProto's.
+message ModelProto {
+  // The version of the IR this model targets. See Version enum above.
+  // This field MUST be present.
+  optional int64 ir_version = 1;
+
+  // The OperatorSets this model relies on.
+  // All ModelProtos MUST have at least one entry that
+  // specifies which version of the ONNX OperatorSet is
+  // being imported.
+  //
+  // All nodes in the ModelProto's graph will bind against the operator
+  // with the same-domain/same-op_type operator with the HIGHEST version
+  // in the referenced operator sets.
+  repeated OperatorSetIdProto opset_import = 8;
+
+  // The name of the framework or tool used to generate this model.
+  // This field SHOULD be present to indicate which implementation/tool/framework
+  // emitted the model.
+  optional string producer_name = 2;
+
+  // The version of the framework or tool used to generate this model.
+  // This field SHOULD be present to indicate which implementation/tool/framework
+  // emitted the model.
+  optional string producer_version = 3;
+
+  // Domain name of the model.
+  // We use reverse domain names as name space indicators. For example:
+  // `com.facebook.fair` or `com.microsoft.cognitiveservices`
+  //
+  // Together with `model_version` and GraphProto.name, this forms the unique identity of
+  // the graph.
+  optional string domain = 4;
+
+  // The version of the graph encoded. See Version enum below.
+  optional int64 model_version = 5;
+
+  // Human-readable documentation for this model. Markdown is allowed.
+  optional string doc_string = 6;
+
+  // The parameterized graph that is evaluated to execute the model.
+  optional GraphProto graph = 7;
+
+  // Named metadata values; keys should be distinct.
+  repeated StringStringEntryProto metadata_props = 14;
+
+  // Training-specific information. Sequentially executing all stored
+  // `TrainingInfoProto.algorithm`s and assigning their outputs following
+  // the corresponding `TrainingInfoProto.update_binding`s is one training
+  // iteration. Similarly, to initialize the model
+  // (as if training hasn't happened), the user should sequentially execute
+  // all stored `TrainingInfoProto.initialization`s and assign their outputs
+  // using `TrainingInfoProto.initialization_binding`s.
+  //
+  // If this field is empty, the training behavior of the model is undefined.
+  repeated TrainingInfoProto training_info = 20;
+
+  // A list of function protos local to the model.
+  //
+  // The (domain, name, overload) tuple must be unique across the function protos in this list.
+  // In case of any conflicts, the behavior (whether the model-local functions are given higher priority,
+  // or standard operator sets are given higher priority, or this is treated as an error) is defined by
+  // the runtimes.
+  //
+  // The operator sets imported by FunctionProto should be compatible with the ones
+  // imported by ModelProto and other model local FunctionProtos.
+  // For example, if the same operator set, say 'A', is imported by a FunctionProto and ModelProto
+  // or by 2 FunctionProtos, then the versions for the operator set may be different, but
+  // the operator schema returned for the op_type, domain, version combination
+  // for both versions should be the same for every node in the function body.
+  //
+  // One FunctionProto can reference other FunctionProtos in the model; however, recursive references
+  // are not allowed.
+  repeated FunctionProto functions = 25;
+};
+
+// StringStringEntryProto follows the pattern for cross-proto-version maps.
+// See https://developers.google.com/protocol-buffers/docs/proto3#maps
+message StringStringEntryProto {
+  optional string key = 1;    // key of the map entry
+  optional string value = 2;  // value associated with the key
+};
+
+message TensorAnnotation {
+  optional string tensor_name = 1;  // name of the tensor being annotated
+  // <key, value> pairs to annotate the tensor specified by <tensor_name> above.
+  // The keys used in the mapping below must be pre-defined in the ONNX spec.
+  // For example, for the 8-bit linear quantization case, 'SCALE_TENSOR', 'ZERO_POINT_TENSOR' will be pre-defined as
+  // quantization parameter keys.
+  repeated StringStringEntryProto quant_parameter_tensor_names = 2;
+}
+
+
+
+// Graphs
+//
+// A graph defines the computational logic of a model and is comprised of a parameterized
+// list of nodes that form a directed acyclic graph based on their inputs and outputs.
+// This is the equivalent of the "network" or "graph" in many deep learning
+// frameworks.
+message GraphProto {
+  // The nodes in the graph, sorted topologically.
+  repeated NodeProto node = 1;
+
+  // The name of the graph.
+  optional string name = 2;   // namespace Graph
+
+  // A list of named tensor values, used to specify constant inputs of the graph.
+  // Each initializer (both TensorProto as well as SparseTensorProto) MUST have a name.
+  // The name MUST be unique across both initializer and sparse_initializer,
+  // but the name MAY also appear in the input list.
+  repeated TensorProto initializer = 5;
+
+  // Initializers (see above) stored in sparse format.
+  repeated SparseTensorProto sparse_initializer = 15;
+
+  // Human-readable documentation for this graph. Markdown is allowed.
+  optional string doc_string = 10;
+
+  // The inputs and outputs of the graph.
+  repeated ValueInfoProto input = 11;
+  repeated ValueInfoProto output = 12;
+
+  // Information for the values in the graph. The ValueInfoProto.name's
+  // must be distinct. It is optional for a value to appear in the value_info list.
+  repeated ValueInfoProto value_info = 13;
+
+  // This field carries information to indicate the mapping between a tensor and its
+  // quantization parameter tensors. For example:
+  // Tensor 'a' may have {'SCALE_TENSOR', 'a_scale'} and {'ZERO_POINT_TENSOR', 'a_zero_point'} annotated,
+  // which means tensor 'a_scale' and tensor 'a_zero_point' are the scale and zero point of tensor 'a' in the model.
+  repeated TensorAnnotation quantization_annotation = 14;
+
+  // Named metadata values; keys should be distinct.
+  repeated StringStringEntryProto metadata_props = 16;
+
+  reserved 3, 4, 6 to 9;
+  reserved "ir_version", "producer_version", "producer_tag", "domain";
+}
+
+// Tensors
+//
+// A serialized tensor value.
+message TensorProto {
+  enum DataType {
+    UNDEFINED = 0;
+    // Basic types.
+    FLOAT = 1;   // float
+    UINT8 = 2;   // uint8_t
+    INT8 = 3;    // int8_t
+    UINT16 = 4;  // uint16_t
+    INT16 = 5;   // int16_t
+    INT32 = 6;   // int32_t
+    INT64 = 7;   // int64_t
+    STRING = 8;  // string
+    BOOL = 9;    // bool
+
+    // IEEE754 half-precision floating-point format (16 bits wide).
+    // This format has 1 sign bit, 5 exponent bits, and 10 mantissa bits.
+    FLOAT16 = 10;
+
+    DOUBLE = 11;
+    UINT32 = 12;
+    UINT64 = 13;
+    COMPLEX64 = 14;     // complex with float32 real and imaginary components
+    COMPLEX128 = 15;    // complex with float64 real and imaginary components
+
+    // Non-IEEE floating-point format based on IEEE754 single-precision
+    // floating-point number truncated to 16 bits.
+    // This format has 1 sign bit, 8 exponent bits, and 7 mantissa bits.
+    BFLOAT16 = 16;
+
+    // Non-IEEE floating-point format based on papers
+    // FP8 Formats for Deep Learning, https://arxiv.org/abs/2209.05433,
+    // 8-bit Numerical Formats For Deep Neural Networks, https://arxiv.org/pdf/2206.02915.pdf.
+    // Operators supporting FP8 are Cast, CastLike, QuantizeLinear, DequantizeLinear.
+    // The computation usually happens inside a block quantize / dequantize
+    // fused by the runtime.
+    FLOAT8E4M3FN = 17;    // float 8, mostly used for coefficients, supports nan, not inf
+    FLOAT8E4M3FNUZ = 18;  // float 8, mostly used for coefficients, supports nan, not inf, no negative zero
+    FLOAT8E5M2 = 19;      // follows IEEE 754, supports nan, inf, mostly used for gradients
+    FLOAT8E5M2FNUZ = 20;  // follows IEEE 754, supports nan, not inf, mostly used for gradients, no negative zero
+
+    // 4-bit data-types
+    UINT4 = 21;  // Unsigned integer in range [0, 15]
+    INT4 = 22;   // Signed integer in range [-8, 7], using two's-complement representation
+
+    // Future extensions go here.
+  }
+
+  // The shape of the tensor.
+  repeated int64 dims = 1;
+
+  // The data type of the tensor.
+  // This field MUST have a valid TensorProto.DataType value
+  optional int32 data_type = 2;
+
+  // For very large tensors, we may want to store them in chunks, in which
+  // case the following fields will specify the segment that is stored in
+  // the current TensorProto.
+  message Segment {
+    optional int64 begin = 1;
+    optional int64 end = 2;
+  }
+  optional Segment segment = 3;
+
+  // Tensor content must be organized in row-major order.
+  //
+  // Depending on the data_type field, exactly one of the fields below with
+  // name ending in _data is used to store the elements of the tensor.
+
+  // For float and complex64 values
+  // Complex64 tensors are encoded as a single array of floats,
+  // with the real components appearing in odd numbered (1-based) positions,
+  // and the corresponding imaginary component appearing in the
+  // subsequent even numbered position. (e.g., [1.0 + 2.0i, 3.0 + 4.0i]
+  // is encoded as [1.0, 2.0, 3.0, 4.0])
+  // When this field is present, the data_type field MUST be FLOAT or COMPLEX64.
+  repeated float float_data = 4 [packed = true];
+
+  // For int32, uint8, int8, uint16, int16, uint4, int4, bool, float8 and float16 values
+  // float16 and float8 values must be bit-wise converted to a uint16_t prior
+  // to writing to the buffer.
+  // uint4 and int4 values must be packed to 4bitx2 prior to writing to the buffer; the first element is stored in
+  // the 4 LSB and the second element is stored in the 4 MSB.
+  // When this field is present, the data_type field MUST be
+  // INT32, INT16, INT8, INT4, UINT16, UINT8, UINT4, BOOL, FLOAT16, BFLOAT16, FLOAT8E4M3FN, FLOAT8E4M3FNUZ, FLOAT8E5M2, FLOAT8E5M2FNUZ
+  repeated int32 int32_data = 5 [packed = true];
+
+  // For strings.
+  // Each element of string_data is a UTF-8 encoded Unicode
+  // string. No trailing null, no leading BOM. The protobuf "string"
+  // scalar type is not used to match ML community conventions.
+  // When this field is present, the data_type field MUST be STRING
+  repeated bytes string_data = 6;
+
+  // For int64.
+  // When this field is present, the data_type field MUST be INT64
+  repeated int64 int64_data = 7 [packed = true];
+
+  // Optionally, a name for the tensor.
+  optional string name = 8; // namespace Value
+
+  // Human-readable documentation for this tensor. Markdown is allowed.
+  optional string doc_string = 12;
+
+  // Serializations can either use one of the fields above, or use this
+  // raw bytes field. The only exception is the string case, where one is
+  // required to store the content in the repeated bytes string_data field.
+  //
+  // When this raw_data field is used to store tensor value, elements MUST
+  // be stored in fixed-width, little-endian order.
+  // Floating-point data types MUST be stored in IEEE 754 format.
+  // Complex64 elements must be written as two consecutive FLOAT values, real component first.
+  // Complex128 elements must be written as two consecutive DOUBLE values, real component first.
+  // Boolean type MUST be written one byte per tensor element (00000001 for true, 00000000 for false).
+  // uint4 and int4 values must be packed to 4bitx2; the first element is stored in the 4 LSB and the second element is stored in the 4 MSB.
+  //
+  // Note: the advantage of specific field rather than the raw_data field is
+  // that in some cases (e.g. int data), protobuf does a better packing via
+  // variable length storage, and may lead to smaller binary footprint.
+  // When this field is present, the data_type field MUST NOT be STRING or UNDEFINED
+  optional bytes raw_data = 9;
+
+  // Data can be stored inside the protobuf file using type-specific fields or raw_data.
+  // Alternatively, raw bytes data can be stored in an external file, using the external_data field.
+  // external_data stores key-value pairs describing data location. Recognized keys are:
+  // - "location" (required) - POSIX filesystem path relative to the directory where the ONNX
+  //                           protobuf model was stored
+  // - "offset" (optional) - position of byte at which stored data begins. Integer stored as string.
+  //                         Offset values SHOULD be multiples of 4096 (page size) to enable mmap support.
+  // - "length" (optional) - number of bytes containing data. Integer stored as string.
+  // - "checksum" (optional) - SHA1 digest of the file specified under the 'location' key.
+  repeated StringStringEntryProto external_data = 13;
+
+  // Location of the data for this tensor. MUST be one of:
+  // - DEFAULT - data stored inside the protobuf message. Data is stored in raw_data (if set) otherwise in type-specified field.
+  // - EXTERNAL - data stored in an external location as described by external_data field.
+  enum DataLocation {
+    DEFAULT = 0;
+    EXTERNAL = 1;
+  }
+
+  // If value not set, data is stored in raw_data (if set) otherwise in type-specified field.
+  optional DataLocation data_location = 14;
+
+  // For double
+  // Complex128 tensors are encoded as a single array of doubles,
+  // with the real components appearing in odd numbered (1-based) positions,
+  // and the corresponding imaginary component appearing in the
+  // subsequent even numbered position. (e.g., [1.0 + 2.0i, 3.0 + 4.0i]
+  // is encoded as [1.0, 2.0, 3.0, 4.0])
+  // When this field is present, the data_type field MUST be DOUBLE or COMPLEX128
+  repeated double double_data = 10 [packed = true];
+
+  // For uint64 and uint32 values
+  // When this field is present, the data_type field MUST be
+  // UINT32 or UINT64
+  repeated uint64 uint64_data = 11 [packed = true];
+
+  // Named metadata values; keys should be distinct.
+  repeated StringStringEntryProto metadata_props = 16;
+}
+
+// A serialized sparse-tensor value
+message SparseTensorProto {
+  // The sequence of non-default values is encoded as a tensor of shape [NNZ].
+  // The default-value is zero for numeric tensors, and empty-string for string tensors.
+  // The values tensor must have a non-empty name, which serves as the name of the SparseTensorProto
+  // when used in a sparse_initializer list.
+  optional TensorProto values = 1;
+
+  // The indices of the non-default values, which may be stored in one of two formats.
+  // (a) Indices can be a tensor of shape [NNZ, rank] with the [i,j]-th value
+  // corresponding to the j-th index of the i-th value (in the values tensor).
+  // (b) Indices can be a tensor of shape [NNZ], in which case the i-th value
+  // must be the linearized-index of the i-th value (in the values tensor).
+  // The linearized-index can be converted into an index tuple (k_1,...,k_rank)
+  // using the shape provided below.
+  // The indices must appear in ascending order without duplication.
+  // In the first format, the ordering is lexicographic-ordering:
+  // e.g., index-value [1,4] must appear before [2,1]
+  optional TensorProto indices = 2;
+
+  // The shape of the underlying dense-tensor: [dim_1, dim_2, ... dim_rank]
+  repeated int64 dims = 3;
+}
+
+// Defines a tensor shape. A dimension can be either an integer value
+// or a symbolic variable. A symbolic variable represents an unknown
+// dimension.
+message TensorShapeProto {
+  message Dimension {
+    oneof value {
+      int64 dim_value = 1;    // integer value of the dimension
+      string dim_param = 2;   // namespace Shape
+    };
+    // Standard denotation can optionally be used to denote tensor
+    // dimensions with standard semantic descriptions to ensure
+    // that operations are applied to the correct axis of a tensor.
+    // Refer to https://github.com/onnx/onnx/blob/main/docs/DimensionDenotation.md#denotation-definition
+    // for pre-defined dimension denotations.
+    optional string denotation = 3;
+  };
+  repeated Dimension dim = 1;  // the dimensions that make up the shape
+}
+
+// Types
+//
+// The standard ONNX data types.
+message TypeProto {
+
+  message Tensor {
+    // This field MUST NOT have the value of UNDEFINED
+    // This field MUST have a valid TensorProto.DataType value
+    // This field MUST be present for this version of the IR.
+    optional int32 elem_type = 1;
+    optional TensorShapeProto shape = 2;
+  }
+
+  // repeated T
+  message Sequence {
+    // The type and optional shape of each element of the sequence.
+    // This field MUST be present for this version of the IR.
+    optional TypeProto elem_type = 1;
+  };
+
+  // map<K,V>
+  message Map {
+    // This field MUST have a valid TensorProto.DataType value
+    // This field MUST be present for this version of the IR.
+    // This field MUST refer to an integral type ([U]INT{8|16|32|64}) or STRING
+    optional int32 key_type = 1;
+    // This field MUST be present for this version of the IR.
+    optional TypeProto value_type = 2;
+  };
+
+  // wrapper for Tensor, Sequence, or Map
+  message Optional {
+    // The type and optional shape of the element wrapped.
+    // This field MUST be present for this version of the IR.
+    // Possible values correspond to OptionalProto.DataType enum
+    optional TypeProto elem_type = 1;
+  };
+
+  // sparse tensor type; declares the same two fields as Tensor above
+  message SparseTensor {
+    // This field MUST NOT have the value of UNDEFINED
+    // This field MUST have a valid TensorProto.DataType value
+    // This field MUST be present for this version of the IR.
+    optional int32 elem_type = 1;
+    optional TensorShapeProto shape = 2;
+  }
+
+  // Selects which of the nested type messages describes this value (at most one is set).
+  oneof value {
+    // The type of a tensor.
+    Tensor tensor_type = 1;
+
+    // NOTE:  DNN-only implementations of ONNX MAY elect to not support non-tensor values
+    //        as input and output to graphs and nodes. These types are needed to naturally
+    //        support classical ML operators.  DNN operators SHOULD restrict their input
+    //        and output types to tensors.
+
+    // The type of a sequence.
+    Sequence sequence_type = 4;
+
+    // The type of a map.
+    Map map_type = 5;
+
+    // The type of an optional.
+    Optional optional_type = 9;
+
+
+    // Type of the sparse tensor
+    SparseTensor sparse_tensor_type = 8;
+
+  }
+
+  // An optional denotation can be used to denote the whole
+  // type with a standard semantic description as to what is
+  // stored inside. Refer to https://github.com/onnx/onnx/blob/main/docs/TypeDenotation.md#type-denotation-definition
+  // for pre-defined type denotations.
+  optional string denotation = 6;
+}
+
+// Operator Sets
+//
+// OperatorSets are uniquely identified by a (domain, opset_version) pair; `version` below holds the opset_version.
+message OperatorSetIdProto {
+  // The domain of the operator set being identified.
+  // The empty string ("") or absence of this field implies the operator
+  // set that is defined as part of the ONNX specification.
+  // This field MUST be present in this version of the IR when referring to any other operator set.
+  optional string domain = 1;
+
+  // The version of the operator set being identified.
+  // This field MUST be present in this version of the IR.
+  optional int64 version = 2;
+}
+
+// Operator/function status. FunctionProto's `status` field of this type is reserved (deprecated since IR Version 8).
+enum OperatorStatus {
+    EXPERIMENTAL = 0;
+    STABLE = 1;
+}
+
+message FunctionProto {
+  // The name of the function, similar to op_type in NodeProto.
+  // This is part of the unique-id (domain, name, overload) of FunctionProtos in a model.
+  optional string name = 1;
+
+  // Deprecated since IR Version 8
+  // optional int64 since_version = 2;
+  reserved 2;
+  reserved "since_version";
+
+  // Deprecated since IR Version 8
+  // optional OperatorStatus status = 3;
+  reserved 3;
+  reserved "status";
+
+  // The inputs and outputs of the function.
+  repeated string input = 4;
+  repeated string output = 5;
+
+  // The attribute parameters of the function.
+  // It is for function parameters without default values.
+  repeated string attribute = 6;
+
+  // The attribute protos of the function.
+  // It is for function attributes with default values.
+  // A function attribute shall be represented either as
+  // a string attribute or an AttributeProto, not both.
+  repeated AttributeProto attribute_proto = 11;
+
+  // The nodes in the function.
+  repeated NodeProto node = 7;
+  // A human-readable documentation for this function. Markdown is allowed.
+  optional string doc_string = 8;
+
+  // The OperatorSets this function body (graph) relies on.
+  //
+  // All nodes in the function body (graph) will bind against the operator
+  // with the same-domain/same-op_type operator with the HIGHEST version
+  // in the referenced operator sets. This means at most one version can be relied
+  // upon for one domain.
+  //
+  // The operator sets imported by FunctionProto should be compatible with the ones
+  // imported by ModelProto. For example, if the same operator set, say 'A', is imported by
+  // FunctionProto and ModelProto, then the versions of the operator set may differ, but
+  // the operator schema returned for op_type, domain, version combination
+  // for both versions should be the same.
+
+  repeated OperatorSetIdProto opset_import = 9;
+
+  // The domain which this function belongs to.
+  // This is part of the unique-id (domain, name, overload) of FunctionProtos in a model.
+  optional string domain = 10;
+
+  // The overload identifier of the function.
+  // This is part of the unique-id (domain, name, overload) of FunctionProtos in a model.
+  optional string overload = 13;
+
+  // Information for the values in the function. The ValueInfoProto.name's
+  // must be distinct and refer to names in the function (including inputs,
+  // outputs, and intermediate values). It is optional for a value to appear
+  // in value_info list.
+  repeated ValueInfoProto value_info = 12;
+
+  // Named metadata values; keys should be distinct.
+  repeated StringStringEntryProto metadata_props = 14;
+}
+
+// For using protobuf-lite: this file-level option asks protoc to generate code for the lite runtime.
+option optimize_for = LITE_RUNTIME;
diff --git a/tools/onnx-subgraph/onnx_subgraph_ut.py b/tools/onnx-subgraph/onnx_subgraph_ut.py
new file mode 100644
index 00000000000..26daf8dd245
--- /dev/null
+++ b/tools/onnx-subgraph/onnx_subgraph_ut.py
@@ -0,0 +1,66 @@
+# Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
+
+import unittest
+import os
+import sys
+import extract_onnx_lib
+import shutil
+
+
+def onnx_parser_test(args):
+    # Runs the ./onnx-subgraph binary from the current directory with `args` appended to
+    # the command line and returns os.system's exit status so callers can inspect it.
+    return os.system('./onnx-subgraph ' + args)
+
+
+class ONNX_Parser_Test(unittest.TestCase):
+    # Exercises the onnx-subgraph binary and extract_onnx_lib against files in
+    # the current working directory.
+
+    def test_parse_result_exception(self):
+        # A missing input model must not produce subgraphs_ios.txt.
+        ios_file = './subgraphs_ios.txt'
+        if os.path.exists(ios_file):
+            os.remove(ios_file)
+        onnx_parser_test('--onnx=no_file.onnx')
+        self.assertFalse(os.path.exists(ios_file))
+
+    def test_parse_result_normal(self):
+        # Parsing test.onnx must produce subgraphs_ios.txt.
+        ios_file = './subgraphs_ios.txt'
+        if os.path.exists(ios_file):
+            os.remove(ios_file)
+
+        onnx_parser_test('--onnx=test.onnx')
+        self.assertTrue(os.path.exists(ios_file))
+
+    def test_subgraph_normal(self):
+        # Splitting test.onnx must create the CPU/NPU folders and the
+        # subgraph models checked below.
+        out_dir = './subgraphs'
+        if os.path.exists(out_dir):
+            shutil.rmtree(path=out_dir)
+
+        extract_onnx_lib.split_onnx_ios('./subgraphs_ios.txt', './test.onnx')
+        expected_paths = (
+            './subgraphs',
+            './subgraphs/CPU',
+            './subgraphs/NPU',
+            './subgraphs/CPU/CPUsubgraph15.onnx',
+            './subgraphs/NPU/NPUsubgraph15.onnx',
+        )
+        for path in expected_paths:
+            self.assertTrue(os.path.exists(path))
+
+    def test_subgraph_exception(self):
+        # A missing model must not create the subgraphs folder.
+        out_dir = './subgraphs'
+        if os.path.exists(out_dir):
+            shutil.rmtree(path=out_dir)
+
+        extract_onnx_lib.split_onnx_ios('./subgraphs_ios.txt', './fake.onnx')
+        self.assertFalse(os.path.exists(out_dir))
+
+
+if __name__ == '__main__':  # run the test cases when this file is executed as a script
+    unittest.main()
diff --git a/tools/onnx-subgraph/quant.py b/tools/onnx-subgraph/quant.py
new file mode 100644
index 00000000000..5ca672709d4
--- /dev/null
+++ b/tools/onnx-subgraph/quant.py
@@ -0,0 +1,427 @@
+# Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import os
+import time
+from types import MethodType
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+from torch.nn import functional as F
+from tqdm import tqdm
+from sklearn.cluster import KMeans
+from torch.nn import functional as F
+import numpy as np
+import pdb
+
+
+def quant_transmartix(x, bits=8):
+    # Fake-quantizes x over its own [min, max] range; returns (dequantized, integer codes).
+    lo, hi = x.min(), x.max()
+    if hi == lo:  # constant tensor: returned unchanged, with 0 in place of the codes
+        return x, 0
+    qmax = 2**(bits - 1) - 1
+    step = (hi - lo) / 2 / qmax
+    mid = (lo + hi) / 2
+    codes = ((x - mid) / step).round().clamp(-qmax - 1, qmax)
+    return codes * step + mid, codes
+
+
+def quant_transmartix1(x, bits=8):
+    # Quantizes x over its own [min, max] range; returns (integer codes, scale, zero point).
+    # A constant tensor yields all-zero codes with scale 1, so codes * scale + zero_point == x.
+    if (x.max() == x.min()):
+        return torch.zeros_like(x), 1, x.min()
+    n = 2**(bits - 1) - 1
+    act_scale = (x.max() - x.min()) / 2 / n
+    zero_point = (x.min() + x.max()) / 2
+    aint = ((x - zero_point) / act_scale).round().clamp(-n - 1, n)
+    return aint, act_scale, zero_point
+
+
+def get_projection_matrix(im, eigenVar, num_bits=8):
+    # Returns (u, s): the 16-bit fake-quantized left singular vectors and the singular
+    # values of im @ im.T / im.shape[1]. eigenVar and num_bits are not used here.
+    n_cols = im.shape[1]
+    gram = torch.matmul(im, im.t()) / n_cols
+    left_vecs, sing_vals, _ = torch.svd(gram)
+    return quant_transmartix(left_vecs, 16)[0], sing_vals
+
+
+def comp(x, rate, output_dir, count, transu, inb, num_bits, layer):
+    # Projects x onto a basis, fake-quantizes the projection, projects back and re-quantizes.
+    # count == 1 derives the basis from x; count <= 100 reuses `transu`; larger counts also
+    # clip the integer codes to `inb` (row 0 = max, row 1 = min). Returns (x_return, ru, rbits).
+    # Side effect: saves ru (None unless count == 1) to <output_dir>/result_pt/<layer>.pt.
+    if (len(x.shape) == 2):
+        B, C = x.shape
+        x_reshape = x
+    elif (len(x.shape) == 3):
+        B, C, H = x.shape
+        x_reshape = x.permute(1, 0, 2).reshape(C, -1)
+    elif (len(x.shape) == 4):
+        B, C, H, W = x.shape
+        x_reshape = x.permute(1, 0, 2, 3).reshape(C, -1)
+    else:
+        raise NotImplementedError
+    if (count == 1):
+        u, s = get_projection_matrix(x_reshape, rate, num_bits)
+        x_trans = torch.matmul(u.t(), x_reshape)
+        x_trans, x_trans_int = quant_transmartix(x_trans, num_bits)
+        channel_max = x_trans_int.max(-1)[0].reshape(1, -1)
+        channel_min = x_trans_int.min(-1)[0].reshape(1, -1)
+        max_min = torch.cat([channel_max, channel_min], dim=0)
+        x_return = torch.matmul(u, x_trans)
+        x_return, x_return_int = quant_transmartix(x_return, num_bits)
+        ru = u
+        rbits = max_min
+    elif (count <= 100):
+        x_trans = torch.matmul(transu.t(), x_reshape)
+        x_trans, x_trans_int = quant_transmartix(x_trans, num_bits)
+        channel_max = x_trans_int.max(-1)[0].reshape(1, -1)
+        channel_min = x_trans_int.min(-1)[0].reshape(1, -1)
+        max_min = torch.cat([channel_max, channel_min], dim=0)
+        x_return = torch.matmul(transu, x_trans)
+        x_return, x_return_int = quant_transmartix(x_return, num_bits)
+        ru = None
+        rbits = max_min
+    else:
+        x_trans = torch.matmul(transu.t(), x_reshape)
+        x_trans_int, act_scale, zero_point = quant_transmartix1(x_trans, num_bits)
+        # One bound per row, repeated across every column of the projection (the former
+        # H * W width was undefined for 2-D/3-D inputs and too short for batches > 1).
+        inb_expend = inb[:, :, None].repeat(1, 1, x_trans_int.shape[-1])
+        mask_clip_max = torch.where(x_trans_int > inb_expend[0])
+        mask_clip_min = torch.where(x_trans_int < inb_expend[1])
+        x_trans_int[mask_clip_max] = inb_expend[0][mask_clip_max]
+        x_trans_int[mask_clip_min] = inb_expend[1][mask_clip_min]
+        x_trans = x_trans_int * act_scale + zero_point
+        channel_max = x_trans_int.max(-1)[0].reshape(1, -1)
+        channel_min = x_trans_int.min(-1)[0].reshape(1, -1)
+        max_min = torch.cat([channel_max, channel_min], dim=0)
+        x_return = torch.matmul(transu, x_trans)
+        x_return, x_return_int = quant_transmartix(x_return, num_bits)
+        ru = None
+        rbits = max_min
+
+    result_pt = os.path.join(output_dir, "result_pt")
+    os.makedirs(result_pt, exist_ok=True)
+    # NOTE(review): for count != 1 this overwrites the saved basis with None - confirm intended.
+    torch.save(ru, os.path.join(result_pt, f"{layer}.pt"))
+    if len(x.shape) == 3:
+        x_return = x_return.reshape(C, B, H).permute(1, 0, 2)
+    elif len(x.shape) == 4:
+        x_return = x_return.reshape(C, B, H, W).permute(1, 0, 2, 3)
+    return x_return, ru, rbits
+
+
+def quant_activation(x, bit, act_scale, zero_point=0):
+    # Fake-quantizes x to signed `bit`-bit codes using the given scale and zero point.
+    qmax = 2**(bit - 1) - 1
+    scaled = (x - zero_point) / act_scale
+    codes = scaled.round().clamp(-qmax - 1, qmax)
+    return codes * act_scale + zero_point
+
+
+def quant_linear_weight(w, bit, mode="channel_wise", symmetric=True):
+    # Symmetric fake-quantization of a linear weight: one scale per row (dim 0) when
+    # mode == "channel_wise" and symmetric is true, otherwise one scale for the tensor.
+    qmax = 2**(bit - 1) - 1
+    per_channel = mode == "channel_wise" and symmetric
+    if per_channel:
+        # largest magnitude of each row, kept as a column for broadcasting
+        peak = w.abs().max(dim=1, keepdim=True)[0]
+    else:
+        peak = w.abs().max()
+    step = peak / qmax
+    codes = (w / step).round().clamp(-qmax - 1, qmax)
+    return codes * step
+
+
+def quant_conv_weight(w, bit, mode="channel_wise", symmetric=True):
+    # Symmetric fake-quantization of a 4-D conv weight with one scale per output
+    # channel (dim 0); any other mode/symmetric combination is not implemented.
+    if not (mode == "channel_wise" and symmetric):
+        raise NotImplementedError
+    qmax = 2**(bit - 1) - 1
+    out_ch = w.shape[0]
+    # largest magnitude within each output channel's flattened filter
+    peak = w.view(out_ch, -1).abs().max(dim=-1, keepdim=True)[0]
+    step = (peak / qmax).view(out_ch, 1, 1, 1)
+    codes = (w / step).round().clamp(-qmax - 1, qmax)
+    return codes * step
+
+
+def quant_conv_forward_save_output(x, layer, count, bit, i, output_dir):  # NOTE(review): fast_quant binds this with MethodType, i.e. calls it as (module, x) - signature mismatch, confirm
+    # Fake-quantizes x, runs it through comp(), appends one statistics line to <output_dir>/result/result_<i>.txt and returns comp()'s reconstructed tensor.
+    x, xq_int = quant_transmartix(x, bit)  # xq_int is not used afterwards
+    result_path = output_dir + "/result"
+    os.makedirs(result_path, exist_ok=True)
+    output_tensor, ru, rb = comp(x=x,
+                                 rate=0.999999,
+                                 output_dir=output_dir,
+                                 count=1,  # always comp()'s count == 1 path, whatever this function's `count` is
+                                 transu=None,
+                                 inb=None,
+                                 num_bits=8,  # fixed at 8; the `bit` argument is not forwarded
+                                 layer=layer)
+    if (count == 1):
+        u = ru  # u is not read again in this function
+        rb = rb  # no-op
+
+    B, C, H, W = x.shape  # values unused; the unpacking itself requires x to be 4-D
+    Max = rb[0:200:2]  # even rows of rb; with comp()'s 2-row result this is row 0 (max codes)
+    Min = rb[1:200:2]  # odd rows of rb; with comp()'s 2-row result this is row 1 (min codes)
+    channel_max = Max.max(0)[0].reshape(1, -1)
+    channel_min = Min.min(0)[0].reshape(1, -1)
+    # Make channel_max strictly positive (negatives negated, zeros -> 1) so log2 is defined. (Legacy note here read "0.285".)
+    mask_neg_max = torch.where(channel_max < 0)
+    channel_max[mask_neg_max] = -1 * channel_max[mask_neg_max]
+    mask_zero_max = torch.where(channel_max == 0)
+    channel_max[mask_zero_max] = 1
+    channel_max_log = torch.log2(channel_max)
+
+    condition = channel_max_log - torch.floor(channel_max_log) <= 0.55  # fractional part of log2 decides rounding direction
+    channel_max_return = torch.where(condition, 2**torch.floor(channel_max_log),
+                                     2**torch.ceil(channel_max_log))  # power of two below (condition) or above
+
+    mask_neg_min = torch.where(channel_min < 0)
+    channel_min[mask_neg_min] = -1 * channel_min[mask_neg_min]
+    mask_zero_min = torch.where(channel_min == 0)
+    channel_min[mask_zero_min] = 1
+    channel_min_log = torch.log2(channel_min)
+
+    condition = channel_min_log - torch.floor(channel_min_log) <= 0.6  # note: threshold differs from the 0.55 used for the max
+    channel_min_return = torch.where(condition, 2**torch.floor(channel_min_log),
+                                     2**torch.ceil(channel_min_log))
+
+    channel_min_return[mask_neg_min] = -1 * channel_min_return[mask_neg_min]  # restore the sign of originally negative minima
+    rb = torch.cat([channel_max_return, channel_min_return], dim=0)  # not used after this line
+    filename = result_path + f'/result_{i}.txt'
+    with open(filename, 'a') as f:  # append mode: one line is added per call
+        f.write(f'{(channel_max_return-channel_min_return).mean()/2**bit},{x.numel()}\n')
+    return output_tensor
+
+
+def quant_conv_forward_optimization(self, x):
+    # Conv forward on fake-quantized activations. While self.enable_calib_act_min_max is set,
+    # first tries 200 clip thresholds (xmax down to xmax/200) and keeps the lowest-MSE one.
+    if self.enable_calib_act_min_max:
+        z_target = self._conv_forward(x, self.weight, None)
+        xmax = x.abs().max()
+        best_scale = None
+        # +inf (not a finite sentinel) so the first finite candidate is always accepted.
+        best_mse = float("inf")
+        range_num = 200
+        pbar = tqdm(range(range_num), desc=self.own_name)
+        for ii in pbar:
+            clip_value = xmax * (1 / range_num * (range_num - ii))
+            xq = x.clip(-clip_value, clip_value)
+            zero_point = (xq.max() + xq.min()) / 2
+            act_scale = (xq.max() - xq.min()) / 2 / (2**(self.bit - 1) - 1)
+            xq = quant_activation(xq,
+                                  bit=self.bit,
+                                  act_scale=act_scale,
+                                  zero_point=zero_point)
+            zq = self._conv_forward(xq, self.weight, None)
+            mse = ((z_target - zq)**2).mean().item()
+            if mse < best_mse:
+                best_mse = mse
+                best_scale = act_scale
+                best_zero_point = zero_point
+                best_clip_value = clip_value
+            pbar.set_postfix(
+                dict(
+                    best_mse=f"{best_mse:.1e}",
+                    best_scale=best_scale.data.item(),
+                    xmax=xmax.data.item(),
+                    best_clip_value=best_clip_value.data.item(),
+                ))
+        assert best_scale is not None
+        del z_target
+        del xq
+        del zq
+        del mse
+        gc.collect()
+        torch.cuda.empty_cache()
+        self.act_scale = best_scale
+        self.zero_point = best_zero_point
+        self.clip_value = best_clip_value
+        self.enable_calib_act_min_max = False
+    if self.act_scale is None:
+        self.zero_point = (x.max() + x.min()) / 2
+        self.act_scale = (x.max() - x.min()) / 2 / (2**(self.bit - 1) - 1)
+    x = quant_activation(x,
+                         bit=self.bit,
+                         act_scale=self.act_scale,
+                         zero_point=self.zero_point)
+    return self._conv_forward(x, self.weight, self.bias)
+
+
+def quant_linear_forward_save_output(self, x):
+    # Linear forward on activations clipped to +/- self.clip_value and fake-quantized
+    # over the clipped tensor's own [min, max] range.
+    clipped = x.clip(-self.clip_value, self.clip_value)
+    lo, hi = clipped.min(), clipped.max()
+    step = (hi - lo) / 2 / (2**(self.bit - 1) - 1)
+    mid = (lo + hi) / 2
+    xq = quant_activation(clipped, bit=self.bit, act_scale=step, zero_point=mid)
+    return F.linear(xq, self.weight, self.bias)
+
+
+def quant_linear_forward_optimization(self, x):
+    # Linear forward on fake-quantized activations. Optional one-time calibrations first pick
+    # the activation clip (enable_calib_act_min_max) and weight clip (enable_calib_weight_min_max).
+    if self.enable_calib_act_min_max:
+        z_target = F.linear(x, self.weight)
+        xmax = x.abs().max()
+        best_scale = None
+        # +inf (not a finite sentinel) so the first finite candidate is always accepted.
+        best_mse = float("inf")
+        range_num = 200
+        pbar = tqdm(range(range_num), desc=self.own_name)
+        for ii in pbar:
+            clip_value = xmax * (1 / range_num * (range_num - ii))
+            xq = x.clip(-clip_value, clip_value)
+            act_scale = (xq.max() - xq.min()) / 2 / (2**(self.bit - 1) - 1)
+            zero_point = (xq.min() + xq.max()) / 2
+            xq = quant_activation(xq,
+                                  bit=self.bit,
+                                  act_scale=act_scale,
+                                  zero_point=zero_point)
+            zq = F.linear(xq, self.weight)
+            mse = ((z_target - zq)**2).mean().item()
+            if mse < best_mse:
+                best_mse = mse
+                best_scale = act_scale
+                best_zero = zero_point
+                best_clip_value = clip_value
+            pbar.set_postfix(
+                dict(
+                    best_mse=best_mse,
+                    best_scale=best_scale.data.item(),
+                    xmax=xmax.data.item(),
+                    best_clip_value=best_clip_value.data.item(),
+                ))
+        assert best_scale is not None
+        del z_target
+        del xq
+        del zq
+        del mse
+        gc.collect()
+        torch.cuda.empty_cache()
+        self.act_scale = best_scale
+        self.zero_point = best_zero
+        self.clip_value = best_clip_value
+        self.enable_calib_act_min_max = False
+
+    if self.act_scale is None:
+        self.zero_point = (x.max() + x.min()) / 2
+        self.act_scale = (x.max() - x.min()) / 2 / (2**(self.bit - 1) - 1)
+    x = quant_activation(x,
+                         bit=self.bit,
+                         act_scale=self.act_scale,
+                         zero_point=self.zero_point)
+
+    if self.enable_calib_weight_min_max:
+        z_target = F.linear(x, self.fp_weight)  # NOTE(review): fp_weight is not assigned in this file - confirm the caller sets it
+        best_mse = float("inf")  # same reasoning as above: never reject the first finite candidate
+        range_num = 200
+        wmax = self.fp_weight.abs().max()
+        pbar = tqdm(range(range_num), desc=self.own_name)
+        for ii in pbar:
+            w_clip = wmax * (1 / range_num * (range_num - ii))
+            wq = quant_linear_weight(
+                self.fp_weight.clip(-w_clip, w_clip),
+                self.bit,
+                mode="tensor_wise",
+                symmetric=True,
+            )
+            zq = F.linear(x, wq)
+            mse = ((z_target - zq)**2).mean().item()
+            if mse < best_mse:
+                best_mse = mse
+                best_w_clip = w_clip
+            pbar.set_postfix(
+                dict(
+                    best_mse=best_mse,
+                    best_w_clip=best_w_clip.data.item(),
+                ))
+        self.weight.data = quant_linear_weight(
+            self.fp_weight.clip(-best_w_clip, best_w_clip),
+            self.bit,
+            mode="tensor_wise",
+            symmetric=True,
+        )
+        self.enable_calib_weight_min_max = False
+
+    return F.linear(x, self.weight, self.bias)
+
+
+def fast_quant(
+    model,
+    comp,
+    bit=8,
+    fp=False,
+    enable_calib_act_min_max=False,
+    enable_calib_weight_min_max=False,
+    optimization=False,
+    load_min_max_from_json=False,
+    min_max_dict=None,
+):
+    # Fake-quantizes the weights of every nn.Linear / nn.Conv2d in `model` in place and, when
+    # `optimization` or `load_min_max_from_json` is set, rebinds their forward. Returns `model`
+    # (untouched when `fp` is true). min_max_dict maps module names to clip values and is only
+    # read with load_min_max_from_json; clip values are created on the module's weight device.
+    if fp:
+        return model
+    layer = convlayer = 0
+    for name, module in tqdm(model.named_modules(), desc="Quantize weights"):
+        module.own_name = name
+        if isinstance(module, nn.Linear):
+            module.bit = bit
+            w = module.weight.data.clone()
+            module.weight.data = quant_linear_weight(w, bit, "tensor_wise", True).data
+            module.act_scale = None
+            if optimization:
+                module.forward = MethodType(quant_linear_forward_optimization, module)
+            elif load_min_max_from_json:
+                module.clip_value = torch.tensor(min_max_dict[name], device=module.weight.device)
+                module.forward = MethodType(quant_linear_forward_save_output, module)
+            module.enable_calib_act_min_max = enable_calib_act_min_max
+            module.enable_calib_weight_min_max = enable_calib_weight_min_max
+            module.layer = layer
+            layer += 1
+        if isinstance(module, nn.Conv2d):
+            module.layer = layer
+            module.convlayer = convlayer
+            convlayer += 1
+            layer += 1
+            module.count = 0
+            module.u = 0
+            module.rb = 0
+            module.comp = comp
+            module.bit = bit
+            w = module.weight.data.clone()
+            module.weight.data = quant_conv_weight(w, bit, "channel_wise", True).data
+            module.act_scale = None
+            if optimization:
+                module.forward = MethodType(quant_conv_forward_optimization, module)
+            elif load_min_max_from_json:
+                module.clip_value = torch.tensor(min_max_dict[name], device=module.weight.device)
+                module.forward = MethodType(quant_conv_forward_save_output, module)  # NOTE(review): that function takes (x, layer, count, bit, i, output_dir), not (self, x) - confirm
+            module.enable_calib_act_min_max = enable_calib_act_min_max
+    return model
diff --git a/tools/onnx-subgraph/single_vs_multiple_onnx.py b/tools/onnx-subgraph/single_vs_multiple_onnx.py
new file mode 100644
index 00000000000..d83026c4357
--- /dev/null
+++ b/tools/onnx-subgraph/single_vs_multiple_onnx.py
@@ -0,0 +1,114 @@
+# Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import onnxruntime as ort
+import numpy as np
+from model_inference_multiple_output import *
+import os
+
+
+def compare_results(output_single, output_multiple):
+    """
+    Prints the Mean Squared Error (MSE) for every output name present in both result dictionaries.
+    Names found in only one dictionary are reported as missing; names are visited in sorted order.
+    """
+    names = sorted(set(output_single) | set(output_multiple))
+    for name in names:
+        if name not in output_single or name not in output_multiple:
+            print(f"Output '{name}' is missing in one of the result sets.")
+            continue
+        lhs = np.array(output_single[name])
+        rhs = np.array(output_multiple[name])
+        mse = np.mean((lhs - rhs)**2)
+        print(f"Output '{name}' MSE: {mse}")
+
+
+def prepare_initial_input_data(onnx_model_path, default_input_data):
+    """
+    Interactively builds random input tensors for every input of an ONNX model.
+
+    Args:
+        onnx_model_path (str): Path to the ONNX model file.
+        default_input_data (dict): Maps input names to arrays whose shape/dtype are used
+            when the user presses Enter at the corresponding prompt.
+
+    Returns:
+        dict: Input name -> random array; inputs given an invalid shape or type are omitted.
+    """
+    dtype_map = {'f': np.float32, 'i': np.int64}
+    session = ort.InferenceSession(onnx_model_path)
+    input_names = list(dict.fromkeys(inp.name for inp in session.get_inputs()))
+
+    initial_input_data = {}
+
+    for input_name in input_names:
+        shape_reply = input(
+            f"Enter new shape for input '{input_name}' (comma-separated integers), or press Enter to use default: "
+        )
+        dtype_reply = input(
+            f"Enter data type for input '{input_name}' ('f' for float32, 'i' for int64), or press Enter to use default: "
+        )
+
+        if shape_reply:
+            try:
+                new_shape = [int(dim) for dim in shape_reply.split(',')]
+            except ValueError:
+                print("Invalid input, please ensure you enter comma-separated integers.")
+                continue
+        else:
+            new_shape = default_input_data[input_name].shape
+
+        if dtype_reply:
+            dtype = dtype_map.get(dtype_reply.strip(), None)
+            if dtype is None:
+                print("Invalid data type, please enter 'f' or 'i'.")
+                continue
+        else:
+            dtype = default_input_data[input_name].dtype
+
+        initial_input_data[input_name] = np.random.rand(*new_shape).astype(dtype)
+
+    return initial_input_data
+
+
+# Paths (relative to the current working directory) of the single ONNX model, the split subgraph folder and the subgraph I/O list
+single_onnx_model_path = './resnet-test.onnx'
+model_path = './subgraphs/'
+subgraphsiostxt_path = './subgraphs_ios.txt'
+
+# Initialize ModelInference instance for the split-subgraph inference
+model_inference = ModelInference(model_path, subgraphsiostxt_path)
+
+# Default input data: a single random float32 tensor named "x" with shape (1, 3, 256, 256)
+default_input_data = {
+    "x": np.random.rand(1, 3, 256, 256).astype(np.float32),
+}
+
+# To enter shapes/dtypes interactively instead, call prepare_initial_input_data(single_onnx_model_path, default_input_data) here.
+initial_input_data = default_input_data
+
+# Perform inference using a single ONNX model
+output_single = ModelInference.infer_single_onnx_model(single_onnx_model_path,
+                                                       initial_input_data)
+print("Single model inference completed!")
+
+# Retrieve all output names from the single model; they are passed to the subgraph inference below
+output_names_list = list(output_single.keys())
+
+# Perform inference using multiple split subgraph models
+output_multiple = model_inference.inference(initial_input_data, output_names_list)
+print("Multiple subgraph inference completed!")
+
+print("Comparing inference results between single ONNX model and multiple subgraphs...")
+compare_results(output_single, output_multiple)
diff --git a/tools/onnx-subgraph/src/lib/device.cpp b/tools/onnx-subgraph/src/lib/device.cpp
new file mode 100644
index 00000000000..9dfc13843d8
--- /dev/null
+++ b/tools/onnx-subgraph/src/lib/device.cpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "device.h"
+
+/**
+ * @brief     Opens "<device>CutInstruction.txt" and assembles, for every subgraph, the quoted
+ * input-name, output-name and input-shape strings of a cut instruction.
+ *
+ * @param     [in] Subgraphs The subgraphs to generate instructions for; only its size is used.
+ * @param     [in] device Device name, used as prefix of the output and calibration file names.
+ * @param     [in] subgraphs_inputs Input tensors of each subgraph, indexed like Subgraphs.
+ * @param     [in] subgraphs_outputs Output tensors of each subgraph, indexed like Subgraphs.
+ * @pre       subgraphs_inputs and subgraphs_outputs hold at least Subgraphs.size() entries.
+ * @post      The output file exists (it is created/truncated by opening it).
+ * @exception None. The process exits with a non-zero status if the file cannot be opened.
+ *
+ * NOTE(review): none of the strings assembled in the loop are written to the file yet, so the
+ * file is currently left empty.
+ */
+void Device::GenerateCutInstruction(std::vector<onnx::GraphProto> &Subgraphs, std::string device,
+                                    std::vector<std::unordered_set<NodeTensor>> &subgraphs_inputs,
+                                    std::vector<std::unordered_set<NodeTensor>> &subgraphs_outputs)
+{
+  std::cout << "Generate Cut Instruction for Target_NPU" << std::endl;
+  // open file
+  std::string file_name = device + "CutInstruction.txt";
+  std::ofstream outFile(file_name);
+  if (!outFile.is_open())
+  {
+    std::cerr << "Error opening file." << std::endl;
+    // Report the failure through the exit status; 0 would signal success to the caller.
+    exit(1);
+  }
+  for (size_t i = 0; i < Subgraphs.size(); i++)
+  {
+    // default parameters
+    std::string modelFile = onnxFile;
+    std::string dataScaleDiv = "255";
+    std::string postprocess = "save_and_top5";
+
+    // Bind by reference: the sets are only read, so copying them per subgraph is unnecessary.
+    const std::unordered_set<NodeTensor> &graphInputs = subgraphs_inputs[i];
+    const std::unordered_set<NodeTensor> &graphOutputs = subgraphs_outputs[i];
+
+    // Input names: "name1;name2;...;nameN"
+    std::string inputName = "\"";
+    for (const auto &input : graphInputs)
+    {
+      inputName = inputName + input.name + ";";
+    }
+    // delete last semicolon
+    if (!inputName.empty() && inputName.back() == ';')
+    {
+      inputName.pop_back();
+    }
+    inputName = inputName + "\"";
+
+    // Output names, same format as the input names
+    std::string outputName = "\"";
+    for (const auto &output : graphOutputs)
+    {
+      outputName = outputName + output.name + ";";
+    }
+    // delete last semicolon
+    if (!outputName.empty() && outputName.back() == ';')
+    {
+      outputName.pop_back();
+    }
+    outputName = outputName + "\"";
+
+    // Input shapes: dimensions separated by spaces, tensors separated by semicolons
+    std::string inputShape = "\"";
+    for (const auto &input : graphInputs)
+    {
+      for (const auto &dim : input.shape)
+      {
+        inputShape = inputShape + std::to_string(dim) + " ";
+      }
+      // delete last space
+      if (!inputShape.empty() && inputShape.back() == ' ')
+      {
+        inputShape.pop_back();
+      }
+      inputShape = inputShape + ";";
+    }
+    // delete last semicolon
+    if (!inputShape.empty() && inputShape.back() == ';')
+    {
+      inputShape.pop_back();
+    }
+    inputShape = inputShape + "\"";
+
+    std::string calibrateDataset = device + "_Subgraphs_" + std::to_string(i) + ".npz";
+    std::string quantizationScheme = "int8_asym";
+  }
+
+  outFile.close();
+}
diff --git a/tools/onnx-subgraph/src/lib/graph.cpp b/tools/onnx-subgraph/src/lib/graph.cpp
new file mode 100644
index 00000000000..610c0d71346
--- /dev/null
+++ b/tools/onnx-subgraph/src/lib/graph.cpp
@@ -0,0 +1,258 @@
+/*
+ * Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "graph.h"
+#include "partition.h"
+
+/**
+ * @brief     Collects the name and dimensions of every initializer stored in the graph.
+ *
+ * @param     [in] graph The ONNX graph whose initializers are read.
+ * @return    A set with one NodeTensor (name + shape) per initializer.
+ */
+std::unordered_set<NodeTensor> getInitializer(const onnx::GraphProto &graph)
+{
+  std::unordered_set<NodeTensor> collected;
+  for (int idx = 0; idx < graph.initializer_size(); ++idx)
+  {
+    const auto &tensor = graph.initializer(idx);
+    // Copy the dimensions in declaration order.
+    std::vector<int64_t> dims(tensor.dims().begin(), tensor.dims().end());
+
+    NodeTensor entry;
+    entry.name = tensor.name();
+    entry.shape = dims;
+    collected.insert(entry);
+  }
+  return collected;
+}
+
+/**
+ * @brief     Collects name and shape of every tensor described in the graph's value_info,
+ * input and output lists (in that order).
+ *
+ * @param     [in] graph The ONNX graph to read the tensor descriptions from.
+ * @return    A set of NodeTensor objects; each shape holds the dim_value() of every dimension.
+ */
+std::unordered_set<NodeTensor> getIOvalue(const onnx::GraphProto &graph)
+{
+  std::unordered_set<NodeTensor> IOvalue;
+
+  // Shared conversion for the three lists; entries are taken by const reference so that no
+  // ValueInfoProto message is copied.
+  const auto collect = [&IOvalue](const auto &value_infos) {
+    for (const auto &value_info : value_infos)
+    {
+      NodeTensor nt;
+      nt.name = value_info.name();
+
+      std::vector<int64_t> shape;
+      for (const auto &dim : value_info.type().tensor_type().shape().dim())
+      {
+        shape.push_back(dim.dim_value());
+      }
+      nt.shape = shape;
+      IOvalue.insert(nt);
+    }
+  };
+
+  collect(graph.value_info());
+  collect(graph.input());
+  collect(graph.output());
+  return IOvalue;
+}
+/**
+ * @brief     Looks up the NodeTensor carrying the given name in a set of NodeTensors.
+ *
+ * @param     [in] name The tensor name to search for.
+ * @param     [in] tensors The set of NodeTensors that is scanned.
+ * @pre       None
+ * @post      None
+ * @exception None
+ * @return    An iterator to the first element (in iteration order) whose name equals `name`,
+ * or tensors.end() if there is no such element.
+ */
+std::unordered_set<NodeTensor>::const_iterator
+isInputFromInitializer(const std::string &name, const std::unordered_set<NodeTensor> &tensors)
+{
+  // The comparison is on the name member only, so every element has to be inspected.
+  for (auto it = tensors.begin(); it != tensors.end(); ++it)
+  {
+    if (it->name == name)
+    {
+      return it;
+    }
+  }
+  return tensors.end();
+}
+
+/**
+ * @brief     Determines the external inputs of a graph: node inputs that no node of the graph
+ * produces and that are present in the given tensor set.
+ *
+ * @param     [in] g The graph whose nodes are inspected.
+ * @param     [in] initializerNames Known tensors (name + shape); an external input is only
+ * reported when a tensor of the same name is found in this set.
+ * @param     [in,out] graphInputs Receives the matching NodeTensor of every external input.
+ * @exception None
+ * @return    None
+ */
+void determineGraphInput(const onnx::GraphProto &g,
+                         const std::unordered_set<NodeTensor> &initializerNames,
+                         std::unordered_set<NodeTensor> &graphInputs)
+{
+  // Collect the names of all tensors produced inside the graph.
+  std::unordered_set<std::string> allnodeOutputs;
+  for (const auto &node : g.node())
+  {
+    for (const auto &output : node.output())
+    {
+      allnodeOutputs.insert(output);
+    }
+  }
+
+  // Every node input that is not produced inside the graph is an external input.
+  for (const auto &node : g.node())
+  {
+    for (const auto &input : node.input())
+    {
+      // Hash lookup instead of a linear scan over the whole set.
+      if (allnodeOutputs.find(input) != allnodeOutputs.end())
+      {
+        continue;
+      }
+      auto iter = isInputFromInitializer(input, initializerNames);
+      if (iter != initializerNames.end())
+      {
+        graphInputs.insert(*iter);
+      }
+    }
+  }
+}
+
+/**
+ * @brief     Determines the outputs of subgraph `g`: tensors produced by its nodes that are
+ * either outputs of the original graph or inputs of one of the given subgraphs.
+ *
+ * @param     [in] originalGraph The unsplit graph; its output list is checked first.
+ * @param     [in] g The subgraph whose node outputs are classified.
+ * @param     [in] allgraphInputs_1 First group of per-subgraph input sets.
+ * @param     [in] allgraphInputs_2 Second group of per-subgraph input sets.
+ * @param     [in,out] graphOutputs Receives the detected output tensors.
+ * @post      The name of every detected original-graph output is printed to stdout.
+ * @exception None
+ * @return    None
+ */
+void determineGraphOutput(const onnx::GraphProto &originalGraph, const onnx::GraphProto &g,
+                          std::vector<std::unordered_set<NodeTensor>> &allgraphInputs_1,
+                          std::vector<std::unordered_set<NodeTensor>> &allgraphInputs_2,
+                          std::unordered_set<NodeTensor> &graphOutputs)
+{
+  // Work on a private concatenation of both groups of input sets.
+  auto consumer_inputs = allgraphInputs_1;
+  consumer_inputs.insert(consumer_inputs.end(), allgraphInputs_2.begin(),
+                         allgraphInputs_2.end());
+
+  for (const auto &node : g.node())
+  {
+    for (const auto &output : node.output())
+    {
+      // Case 1: the tensor is an output of the original graph; take the shape from there.
+      bool is_model_output = false;
+      for (const auto &model_output : originalGraph.output())
+      {
+        if (model_output.name() != output)
+        {
+          continue;
+        }
+        NodeTensor nt;
+        nt.name = model_output.name();
+        std::cout << nt.name << std::endl;
+        std::vector<int64_t> shape;
+        for (const auto &dim : model_output.type().tensor_type().shape().dim())
+        {
+          shape.push_back(dim.dim_value());
+        }
+        nt.shape = shape;
+        graphOutputs.insert(nt);
+        is_model_output = true;
+        break;
+      }
+      if (is_model_output)
+      {
+        continue;
+      }
+
+      // Case 2: the tensor is consumed by a subgraph; reuse the first matching input tensor.
+      for (const auto &input_set : consumer_inputs)
+      {
+        const auto match =
+          std::find_if(input_set.begin(), input_set.end(),
+                       [&output](const NodeTensor &candidate) { return candidate.name == output; });
+        if (match != input_set.end())
+        {
+          graphOutputs.insert(*match);
+          break;
+        }
+      }
+    }
+  }
+}
+/**
+ * @brief     Finds the node of `g` that produces the given tensor.
+ *
+ * @param     [in] g The graph to search.
+ * @param     [in] outputTensorName Name of the tensor whose producer is wanted.
+ * @return    The name of the last node (in graph order) listing the tensor among its outputs,
+ * or an empty string if no node produces it.
+ */
+std::string findInputNode(const onnx::GraphProto &g, const std::string &outputTensorName)
+{
+  std::string producer_name = "";
+  for (int idx = 0; idx < g.node_size(); ++idx)
+  {
+    const auto &candidate = g.node(idx);
+    const auto &produced = candidate.output();
+    // No early exit: a later producer overrides an earlier one.
+    if (std::find(produced.begin(), produced.end(), outputTensorName) != produced.end())
+    {
+      producer_name = candidate.name();
+    }
+  }
+  return producer_name;
+}
+
+/**
+ * @brief     Gathers the names of all nodes of a graph.
+ *
+ * @param     [in] graph The graph whose node names are collected.
+ * @return    The set of distinct node names.
+ */
+std::unordered_set<std::string> collectNodeNames(const onnx::GraphProto &graph)
+{
+  std::unordered_set<std::string> names;
+  for (int idx = 0; idx < graph.node_size(); ++idx)
+  {
+    names.insert(graph.node(idx).name());
+  }
+  return names;
+}
+
+/**
+ * @brief     Appends a copy of every node of `sourceGraph` to `targetGraph`.
+ *
+ * @param     [in,out] targetGraph The graph that receives the nodes.
+ * @param     [in] sourceGraph The graph whose nodes are copied; it is not modified.
+ * @post      targetGraph holds its previous nodes followed by the nodes of sourceGraph.
+ * @exception None. The process exits with a non-zero status if the resulting node count is not
+ * the sum of both node counts.
+ * @return    None
+ */
+void mergeGraphs(onnx::GraphProto &targetGraph, onnx::GraphProto &sourceGraph)
+{
+  std::cout << "size before merged: " << targetGraph.node_size() << "+" << sourceGraph.node_size()
+            << std::endl;
+  // Fix the number of nodes to copy up front so that the loop stays well-defined even when
+  // both arguments refer to the same graph.
+  const int source_count = sourceGraph.node_size();
+  const int size_before = targetGraph.node_size() + source_count;
+  for (int idx = 0; idx < source_count; ++idx)
+  {
+    *targetGraph.add_node() = sourceGraph.node(idx);
+  }
+  std::cout << "size after merged: " << targetGraph.node_size() << std::endl;
+  if (size_before != targetGraph.node_size())
+  {
+    std::cout << "error in mergeGraphs" << std::endl;
+    // A failed merge must not be reported as success through the exit status.
+    std::exit(1);
+  }
+}
+
+/**
+ * @brief     Loads an ONNX model file and returns its graph.
+ *
+ * @param     [in] path Path of the ONNX file to read.
+ * @exception None. The process prints an error and exits with a non-zero status if the file
+ * cannot be opened, sized, read completely or parsed as an onnx::ModelProto.
+ * @return    A copy of the graph stored in the model.
+ */
+onnx::GraphProto Graph::GetGraphFromOnnx(std::string &path)
+{
+  // Open positioned at the end so that tellg() yields the file size.
+  std::ifstream input(path, std::ios::ate | std::ios::binary);
+  if (!input.is_open())
+  {
+    std::cout << "Error: Failed to open file: " << path << std::endl;
+    exit(1);
+  }
+  // get current position in file
+  const std::streamsize size = input.tellg();
+  // ParseFromArray takes an int length; reject sizes that are unknown or do not fit.
+  const int parse_size = static_cast<int>(size);
+  if (size < 0 || static_cast<std::streamsize>(parse_size) != size)
+  {
+    std::cout << "Error: Unsupported file size: " << path << std::endl;
+    exit(1);
+  }
+  // move to start of file
+  input.seekg(0, std::ios::beg);
+  // read raw data
+  std::vector<char> buffer(static_cast<size_t>(size));
+  if (size > 0 && !input.read(buffer.data(), size))
+  {
+    std::cout << "Error: Failed to read file: " << path << std::endl;
+    exit(1);
+  }
+  onnx::ModelProto model;
+  if (!model.ParseFromArray(buffer.data(), parse_size)) // parse protobuf
+  {
+    std::cout << "Error: Failed to parse ONNX model: " << path << std::endl;
+    exit(1);
+  }
+  return model.graph();
+}
diff --git a/tools/onnx-subgraph/src/lib/partition.cpp b/tools/onnx-subgraph/src/lib/partition.cpp
new file mode 100644
index 00000000000..e91c86401c5
--- /dev/null
+++ b/tools/onnx-subgraph/src/lib/partition.cpp
@@ -0,0 +1,2977 @@
+/*
+ * Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "partition.h"
+#include <algorithm>
+#include <stdio.h>
+#include <stdlib.h>
+// Upper bound on the recursion depth of DFS() and DFS_other().
+#define MAX_DEPTH 1000
+// File-scope result list: determine_subgraphs() and determine_subgraphs_v2() append the
+// subgraphs they accept to this vector.
+std::vector<onnx::GraphProto> Subgraphs;
+/**
+ * Prints the subgraph information of an ONNX model to specified files.
+ *
+ * Every subgraph is written as one line " subgraph<id>:" followed by the names of its nodes.
+ * The id keeps counting across both files, i.e. the first entry of otherSubgraphs gets the id
+ * Subgraphs.size(). The process exits with a non-zero status if a file cannot be opened.
+ *
+ * @param Subgraphs A vector containing subgraph information.
+ * @param subgraph_file_name The filename for the output of subgraph information.
+ * @param otherSubgraphs A vector containing other subgraph information.
+ * @param other_subgraph_file_name The filename for the output of other subgraph information.
+ */
+void print_subgraphs(std::vector<onnx::GraphProto> Subgraphs, char *subgraph_file_name,
+                     std::vector<onnx::GraphProto> otherSubgraphs, char *other_subgraph_file_name)
+{
+  int id = 0;
+  // Writes one line per subgraph and advances the shared id counter.
+  const auto write_graphs = [&id](std::ofstream &out,
+                                  const std::vector<onnx::GraphProto> &graphs) {
+    for (const auto &vec : graphs)
+    {
+      out << " subgraph" << id << ":";
+      for (const auto &node : vec.node())
+      {
+        out << node.name() << " ";
+      }
+      id++;
+      out << std::endl;
+    }
+  };
+
+  std::ofstream outFile(subgraph_file_name);
+  if (!outFile.is_open())
+  {
+    std::cerr << "Error opening file." << std::endl;
+    // Signal the failure through the exit status instead of reporting success.
+    exit(1);
+  }
+  write_graphs(outFile, Subgraphs);
+
+  std::ofstream outFile_2(other_subgraph_file_name);
+  if (!outFile_2.is_open())
+  {
+    std::cerr << "Error opening file." << std::endl;
+    exit(1);
+  }
+  std::cout << "before:" << std::endl;
+  write_graphs(outFile_2, otherSubgraphs);
+}
+///////
+/**
+ * @brief     Builds the adjacency list of an ONNX graph: for every node, the indices of the
+ * nodes that consume at least one of its outputs.
+ *
+ * @param     [in] g The ONNX graph to analyse.
+ * @param     [out] visited Array of visit marks; the entry of every node is reset to 0.
+ * @pre       `visited` holds at least g.node_size() elements.
+ * @post      visited[i] == 0 for every node index i of the graph.
+ * @exception None
+ * @return    One graph_adjacency_node per graph node, in graph order; each successor index is
+ * listed once, in order of first discovery.
+ */
+std::vector<graph_adjacency_node> get_adjancency_list(const onnx::GraphProto &g, int *visited)
+{
+  std::vector<graph_adjacency_node> adjacency_list;
+  const int total_nodes = g.node_size();
+  for (int producer_index = 0; producer_index < total_nodes; ++producer_index)
+  {
+    const auto &producer = g.node(producer_index);
+    visited[producer_index] = 0;
+
+    graph_adjacency_node entry;
+    entry.index = producer_index;
+    entry.name = producer.name();
+
+    for (const auto &tensor : producer.output())
+    {
+      for (int consumer_index = 0; consumer_index < total_nodes; ++consumer_index)
+      {
+        // A node is a successor if any of its inputs is this output tensor.
+        const auto &consumed = g.node(consumer_index).input();
+        if (std::find(consumed.begin(), consumed.end(), tensor) == consumed.end())
+        {
+          continue;
+        }
+        // Record every successor only once, even if it consumes several outputs.
+        auto &successors = entry.output_node_index;
+        if (std::find(successors.begin(), successors.end(), consumer_index) == successors.end())
+        {
+          successors.push_back(consumer_index);
+        }
+      }
+    }
+    adjacency_list.push_back(entry);
+  }
+  return adjacency_list;
+}
+/**
+ * @brief     Estimates the weight size of one node of the ONNX graph in kilobytes (KB).
+ *
+ * For every input of the node that names an initializer, the element count of that
+ * initializer is multiplied by 4 and added up; the sum is then divided by 1024.
+ * NOTE(review): the factor 4 presumably stands for 4 bytes per element; the initializer's
+ * actual data type is not consulted - confirm this is intended.
+ *
+ * @param     [in] g The ONNX graph that owns the node and the initializers.
+ * @param     [in] node_index The index of the node to measure.
+ * @pre       node_index is a valid node index of `g`.
+ * @post      None
+ * @exception None
+ * @return    The accumulated size in kilobytes (KB).
+ */
+float calculate_node_size(const onnx::GraphProto &g, int node_index) // unit : KB
+{
+  const auto &target = g.node(node_index);
+  int64_t total_bytes = 0;
+  for (const auto &input_name : target.input())
+  {
+    for (const auto &initializer : g.initializer())
+    {
+      if (initializer.name() != input_name)
+      {
+        continue;
+      }
+      int64_t initializer_bytes = 4;
+      for (const auto dim : initializer.dims())
+      {
+        initializer_bytes *= dim;
+      }
+      total_bytes += initializer_bytes;
+      // Only the first initializer with a matching name is counted.
+      break;
+    }
+  }
+  return float(total_bytes * 1.0 / 1024.0);
+}
+/**
+ * @brief     Depth-First Search (DFS) to build a NPU subgraph.
+ *
+ * The start node is added unconditionally; a successor is only followed if it is unvisited,
+ * its op type is listed in `support_op`, the depth limit MAX_DEPTH is not reached and the
+ * accumulated size is still below `max_graph_size`.
+ *
+ * @param     [in] g Input ONNX graph structure.
+ * @param     [out] subgraph Subgraph that receives a copy of every visited node.
+ * @param     [in,out] sugraph_node_index Vector storing indices of nodes in the subgraph.
+ * @param     [in,out] visited Array recording whether nodes have been visited.
+ * @param     [in] start_node Node at which this call starts.
+ * @param     [in] node_index Index of `start_node` in `g`.
+ * @param     [in] adjacency_list Adjacency list representing connections between nodes in the
+ * graph.
+ * @param     [in] support_op List of supported operation types.
+ * @param     [in] prefer_op List of preferred operation types (only forwarded to the recursive
+ * calls).
+ * @param     [in] depth_in Current depth of the search.
+ * @param     [in,out] graph_size Current size of the subgraph.
+ * @param     [in] max_graph_size Maximum allowed size of the subgraph.
+ * @pre       `node_index` should be a valid node index.
+ * @post      If the subgraph size exceeds `max_graph_size`, a warning message is printed.
+ * @exception None
+ */
+void DFS(const onnx::GraphProto &g, onnx::GraphProto &subgraph,
+         std::vector<int> &sugraph_node_index, int *visited, const onnx::NodeProto &start_node,
+         int node_index, std::vector<graph_adjacency_node> &adjacency_list,
+         const std::vector<std::string> &support_op, const std::vector<std::string> &prefer_op,
+         int depth_in, float &graph_size, float max_graph_size)
+{
+  // Claim the node for this subgraph before looking at its successors.
+  *subgraph.add_node() = start_node;
+  visited[node_index] = 1;
+  sugraph_node_index.push_back(node_index);
+  graph_size += calculate_node_size(g, node_index);
+  if (graph_size > max_graph_size)
+  {
+    std::cout << "graph size exceed max size!" << graph_size << " " << max_graph_size << std::endl;
+  }
+
+  const int depth_out = depth_in + 1;
+  const int successor_count = int(adjacency_list[node_index].output_node_index.size());
+  for (int pos = 0; pos < successor_count; pos++)
+  {
+    const int next_node_index = adjacency_list[node_index].output_node_index[pos];
+    if (pos > 1)
+    {
+      std::cout << next_node_index << "->";
+    }
+    // Only follow adjacent nodes that are not visited yet and whose op_type is supported,
+    // as long as the depth and size limits allow it.
+    if (visited[next_node_index])
+    {
+      continue;
+    }
+    const auto &next_node = g.node(next_node_index);
+    if (std::find(support_op.begin(), support_op.end(), next_node.op_type()) == support_op.end())
+    {
+      continue;
+    }
+    if (!(depth_out < MAX_DEPTH) || !(graph_size < max_graph_size))
+    {
+      continue;
+    }
+    DFS(g, subgraph, sugraph_node_index, visited, next_node, next_node_index, adjacency_list,
+        support_op, prefer_op, depth_out, graph_size, max_graph_size);
+  }
+}
+/**
+ * @brief     Perform a depth-first search (DFS) to build a CPU subgraph from a given starting node.
+ *
+ * Unlike DFS(), successors are followed regardless of their op type; only the visit marks, the
+ * depth limit MAX_DEPTH and `max_graph_size` restrict the traversal.
+ *
+ * @param     [in] g The original ONNX graph from which the subgraph will be extracted.
+ * @param     [out] subgraph The subgraph being constructed.
+ * @param     [out] sugraph_node_index A vector to store indices of nodes included in the
+ * subgraph.
+ * @param     [in,out] visited An array to keep track of visited nodes.
+ * @param     [in] start_node The starting node for the DFS.
+ * @param     [in] node_index The index of the starting node in the original graph.
+ * @param     [in] adjacency_list The adjacency list representing the graph's structure.
+ * @param     [in] depth_in The current depth of the DFS.
+ * @param     [in,out] graph_size The cumulative size of the nodes in the subgraph.
+ * @param     [in] max_graph_size The maximum allowed size for the subgraph.
+ *
+ * @pre       The graph `g` and `adjacency_list` should be properly initialized.
+ * @pre       `graph_size` should be initialized to zero before the first call to this function.
+ *
+ * @post      The `subgraph` will contain the nodes visited during the DFS.
+ * @post      The `sugraph_node_index` will contain the indices of the nodes in the subgraph.
+ * @post      The `visited` array will reflect the nodes that have been visited.
+ * @post      The `graph_size` will reflect the cumulative size of the nodes in the subgraph.
+ * @post      If the subgraph size exceeds `max_graph_size`, a warning message is printed.
+ *
+ * @exception None
+ *
+ * @return    None
+ */
+void DFS_other(const onnx::GraphProto &g, onnx::GraphProto &subgraph,
+               std::vector<int> &sugraph_node_index, int *visited,
+               const onnx::NodeProto &start_node, int node_index,
+               std::vector<graph_adjacency_node> &adjacency_list, int depth_in, float &graph_size,
+               float max_graph_size)
+{
+  // Claim the node for this subgraph before looking at its successors.
+  *subgraph.add_node() = start_node;
+  visited[node_index] = 1;
+  sugraph_node_index.push_back(node_index);
+  graph_size += calculate_node_size(g, node_index);
+  if (graph_size > max_graph_size)
+  {
+    std::cout << "graph size exceed max size!" << graph_size << " " << max_graph_size << std::endl;
+  }
+
+  const int depth_out = depth_in + 1;
+  const int successor_count = int(adjacency_list[node_index].output_node_index.size());
+  for (int pos = 0; pos < successor_count; pos++)
+  {
+    const int next_node_index = adjacency_list[node_index].output_node_index[pos];
+    // do deep first search for each successor node that is still free and within the limits
+    if (visited[next_node_index] || !(depth_out < MAX_DEPTH) || !(graph_size < max_graph_size))
+    {
+      continue;
+    }
+    DFS_other(g, subgraph, sugraph_node_index, visited, g.node(next_node_index), next_node_index,
+              adjacency_list, depth_out, graph_size, max_graph_size);
+  }
+}
+
+/**
+ * @brief     Determine and partition subgraphs from the given ONNX graph based on DFS strategy.
+ * Compared with determine_subgraphs_v2, this function is more stable but may produce more subgraphs
+ *
+ * Pass 1 grows a candidate from every unvisited node with a supported op type (see DFS()). A
+ * candidate containing at least one preferred op is appended to the file-scope `Subgraphs`;
+ * otherwise its nodes are marked unvisited again. Pass 2 groups all remaining nodes with
+ * DFS_other() and stores the groups in `otherSubgraphs`.
+ * NOTE(review): for SPILTE_CPU_STRUCTURE_FIRST no preferred ops are loaded, so every candidate
+ * of pass 1 is rolled back - confirm this is intended.
+ *
+ * @param     [in] g The original ONNX graph to be partitioned.
+ * @param     [out] otherSubgraphs A vector to store the subgraphs that do not meet the preferred
+ * operation criteria.
+ * @param     [in] d The device object containing information about supported and preferred
+ * operations.
+ * @param     [in,out] visited An array to keep track of visited nodes.
+ * @param     [in] adjacency_list The adjacency list representing the graph's structure.
+ * @param     [in] strategy The partitioning strategy to be used (e.g., SPILTE_CPU_STRUCTURE_FIRST,
+ * SPILTE_NPU_STRUCTURE_FIRST).
+ *
+ * @pre       The graph `g` and `adjacency_list` should be properly initialized.
+ * @pre       The `visited` array should be initialized to zero.
+ * @pre       The device object `d` should be properly initialized with support and preferred
+ * operations.
+ *
+ * @post      The `otherSubgraphs` vector will contain subgraphs that do not meet the preferred
+ * operation criteria.
+ * @post      The `visited` array will reflect the nodes that have been visited.
+ *
+ * @exception None
+ *
+ * @return    None
+ */
+void determine_subgraphs(const onnx::GraphProto &g, std::vector<onnx::GraphProto> &otherSubgraphs,
+                         Device &d, int *visited, std::vector<graph_adjacency_node> &adjacency_list,
+                         PartitionStrategy strategy)
+{
+  int max_subgraph_size = d.max_subgraph_size;
+  std::vector<std::string> support_op;
+  std::vector<std::string> prefer_op;
+  // Select the op lists that match the strategy; any other strategy leaves both lists empty.
+  if (strategy == SPILTE_CPU_STRUCTURE_FIRST)
+  {
+    support_op = d.getCPUSupportOp();
+  }
+  else if (strategy == SPILTE_NPU_STRUCTURE_FIRST)
+  {
+    support_op = d.getNPUSupportOp();
+    prefer_op = d.getNPUPreferOp();
+  }
+
+  const int total_nodes = g.node_size();
+
+  // Pass 1: candidates made of supported ops.
+  for (int seed = 0; seed < total_nodes; seed++)
+  {
+    if (visited[seed])
+    {
+      continue;
+    }
+    const auto &seed_node = g.node(seed);
+    if (std::find(support_op.begin(), support_op.end(), seed_node.op_type()) == support_op.end())
+    {
+      continue;
+    }
+
+    onnx::GraphProto candidate;
+    std::vector<int> candidate_indices;
+    float candidate_size = 0;
+    DFS(g, candidate, candidate_indices, visited, seed_node, seed, adjacency_list, support_op,
+        prefer_op, 0, candidate_size, max_subgraph_size);
+    std::cout << "graph_size: " << candidate_size << std::endl;
+
+    const bool has_preferred_op =
+      std::any_of(candidate.node().begin(), candidate.node().end(),
+                  [&prefer_op](const onnx::NodeProto &member) {
+                    return std::find(prefer_op.begin(), prefer_op.end(), member.op_type()) !=
+                           prefer_op.end();
+                  });
+    if (has_preferred_op)
+    {
+      Subgraphs.push_back(candidate);
+      continue;
+    }
+    // Roll back: release the nodes so that pass 2 can pick them up.
+    for (const int released : candidate_indices)
+    {
+      visited[released] = 0;
+    }
+  }
+
+  // Pass 2: everything that is still unvisited.
+  for (int seed = 0; seed < total_nodes; seed++)
+  {
+    if (visited[seed])
+    {
+      continue;
+    }
+    onnx::GraphProto remainder;
+    std::vector<int> remainder_indices;
+    float remainder_size = 0;
+    DFS_other(g, remainder, remainder_indices, visited, g.node(seed), seed, adjacency_list, 0,
+              remainder_size, max_subgraph_size);
+    std::cout << "graph_size:" << remainder_size << std::endl;
+    otherSubgraphs.push_back(remainder);
+  }
+}
+
+/**
+ * @brief     Determine and partition subgraphs from the given ONNX graph using the index of nodes.
+ * Compared with determine_subgraphs, this function may produce fewer subgraphs, but some of them
+ * may not be fully connected (the connectivity of subgraphs does not affect the inference
+ * procedure of subgraphs). This function specifically handles the partitioning logic for NPU
+ * support and preferred operations.
+ *
+ * Phase 1 walks the nodes in index order and collects runs of consecutive NPU-supported nodes.
+ * When a run ends, it is appended to the file-scope `Subgraphs` if it contains a preferred op;
+ * otherwise its nodes are marked unvisited again. Phase 2 packs all nodes that are still
+ * unvisited, again in index order, into `otherSubgraphs`.
+ * NOTE(review): the run that is still open after the last node is stored without the
+ * preferred-op check - confirm this is intended.
+ *
+ * @param     [in] g The original ONNX graph to be partitioned.
+ * @param     [out] otherSubgraphs A vector to store the subgraphs that do not meet the preferred
+ * operation criteria.
+ * @param     [in] d The device object containing information about supported and preferred
+ * operations.
+ * @param     [in,out] visited An array to keep track of visited nodes.
+ * @param     [in] adjacency_list The adjacency list representing the graph's structure (not used
+ * by this function).
+ * @param     [in] strategy The partitioning strategy (not used by this function; the NPU op
+ * lists are always taken).
+ *
+ * @pre       The graph `g` should be properly initialized.
+ * @pre       The `visited` array should be initialized to zero.
+ * @pre       The device object `d` should be properly initialized with support and preferred
+ * operations.
+ *
+ * @post      The `otherSubgraphs` vector will contain subgraphs that do not meet the preferred
+ * operation criteria.
+ * @post      The `visited` array will reflect the nodes that were assigned in phase 1.
+ *
+ * @exception None
+ *
+ * @return    None
+ */
+void determine_subgraphs_v2(const onnx::GraphProto &g,
+                            std::vector<onnx::GraphProto> &otherSubgraphs, Device &d, int *visited,
+                            std::vector<graph_adjacency_node> &adjacency_list,
+                            PartitionStrategy strategy)
+{
+  float max_subgraph_size = d.max_subgraph_size;
+  const std::vector<std::string> support_op = d.getNPUSupportOp();
+  const std::vector<std::string> prefer_op = d.getNPUPreferOp();
+  const auto contains = [](const std::vector<std::string> &ops, const std::string &op_type) {
+    return std::find(ops.begin(), ops.end(), op_type) != ops.end();
+  };
+
+  // Phase 1: runs of consecutive supported nodes. Iterating with a plain for loop makes an
+  // empty graph a no-op and guarantees that every node is handled exactly once.
+  onnx::GraphProto temp_graph;
+  float temp_graph_size = 0;
+  for (int node_count = 0; node_count < g.node_size(); node_count++)
+  {
+    float node_size = calculate_node_size(g, node_count);
+    const bool supported = contains(support_op, g.node(node_count).op_type());
+
+    // NOTE(review): the limit is compared with the number of nodes of the run, not with
+    // temp_graph_size - confirm which quantity is meant.
+    const bool run_continues = supported && temp_graph.node_size() <= max_subgraph_size;
+    if (temp_graph.node_size() != 0 && !run_continues)
+    {
+      // The current run ends in front of this node: keep it only if it has a preferred op.
+      int find_preferop = 0;
+      for (int j = 0; j < temp_graph.node_size(); j++)
+      {
+        if (contains(prefer_op, temp_graph.node(j).op_type()))
+        {
+          find_preferop = 1;
+          break;
+        }
+      }
+      if (find_preferop == 1)
+      {
+        Subgraphs.push_back(temp_graph);
+      }
+      else
+      {
+        // The run consists of the nodes directly preceding node_count; release them.
+        for (int k = 1; k <= temp_graph.node_size(); k++)
+        {
+          visited[node_count - k] = 0;
+        }
+      }
+      temp_graph.Clear();
+      temp_graph_size = 0;
+    }
+
+    // A supported node extends the open run or starts a new one; it is added exactly once.
+    if (supported)
+    {
+      *temp_graph.add_node() = g.node(node_count);
+      temp_graph_size += node_size;
+      if (temp_graph_size > max_subgraph_size)
+      {
+        std::cout << "graph size exceed max size!" << temp_graph_size << " " << max_subgraph_size
+                  << std::endl;
+      }
+      visited[node_count] = 1;
+    }
+  }
+  if (temp_graph.node_size() != 0)
+  {
+    Subgraphs.push_back(temp_graph);
+  }
+
+  // Phase 2: pack the remaining nodes. A group is closed when a visited node interrupts it or
+  // when its accumulated size has reached the limit.
+  onnx::GraphProto temp_graph2;
+  float temp_graph_size2 = 0;
+  for (int i = 0; i < g.node_size(); i++)
+  {
+    float node_size = calculate_node_size(g, i);
+    if (visited[i] == 0 && temp_graph_size2 < max_subgraph_size)
+    {
+      *temp_graph2.add_node() = g.node(i);
+      temp_graph_size2 += node_size;
+    }
+    else
+    {
+      std::cout << "i = " << i << " temp_graph_size2: " << temp_graph_size2 << std::endl;
+      if (temp_graph2.node_size() != 0)
+      {
+        otherSubgraphs.push_back(temp_graph2);
+        temp_graph_size2 = 0;
+        temp_graph2.Clear();
+      }
+      if (visited[i] == 0)
+      {
+        *temp_graph2.add_node() = g.node(i);
+        temp_graph_size2 += node_size;
+      }
+    }
+  }
+  // Flush the group that is still open, including one started by the very last node.
+  if (temp_graph2.node_size() != 0)
+  {
+    otherSubgraphs.push_back(temp_graph2);
+  }
+}
+/**
+ * @brief     Recursive worker behind Tarjan(). Visits `index`, assigns it the next unused discovery
+ * index, explores its successors and, when `index` turns out to be the root of a component, pops
+ * that component off the stack.
+ *
+ * @param     [in] index The node being visited.
+ * @param     [in,out] last_rank The most recently assigned discovery index; incremented once for
+ * every node visited so that no two nodes of one traversal share a discovery index.
+ * @param     [out] strongly_connected_subgraphs Receives every component with more than one node.
+ * @param     [in,out] DFN Discovery index per node (0 means "not visited").
+ * @param     [in,out] LOW Lowest discovery index reachable from each node.
+ * @param     [in,out] stack_subgraphs Nodes visited but not yet assigned to a component.
+ * @param     [in] successors_Subgraphs Adjacency list of the graph.
+ */
+static void tarjan_visit(int index, int &last_rank,
+                         std::vector<std::vector<int>> &strongly_connected_subgraphs, int *DFN,
+                         int *LOW, std::vector<int> &stack_subgraphs,
+                         std::vector<std::vector<int>> &successors_Subgraphs)
+{
+  DFN[index] = LOW[index] = ++last_rank; // unique, strictly increasing discovery index
+  stack_subgraphs.push_back(index);
+  for (const auto &successor : successors_Subgraphs[index])
+  {
+    if (DFN[successor] == 0) // the successor is not visited yet
+    {
+      tarjan_visit(successor, last_rank, strongly_connected_subgraphs, DFN, LOW, stack_subgraphs,
+                   successors_Subgraphs);
+      LOW[index] = std::min(LOW[index], LOW[successor]);
+    }
+    else if (std::find(stack_subgraphs.begin(), stack_subgraphs.end(), successor) !=
+             stack_subgraphs.end())
+    {
+      // Visited and still on the stack: the successor belongs to the component being built.
+      LOW[index] = std::min(LOW[index], DFN[successor]);
+    }
+  }
+  if (LOW[index] != DFN[index])
+  {
+    return; // not a component root; an ancestor will pop this node
+  }
+  // `index` is a component root: everything from its stack slot up to the top of the stack forms
+  // one strongly connected component, kept in stack order (root first).
+  auto root = std::find(stack_subgraphs.begin(), stack_subgraphs.end(), index);
+  std::vector<int> strongly_connected(root, stack_subgraphs.end());
+  stack_subgraphs.erase(root, stack_subgraphs.end());
+  if (strongly_connected.size() > 1) // single-node components are not reported
+  {
+    strongly_connected_subgraphs.push_back(strongly_connected);
+  }
+}
+/**
+ * @brief     Perform Tarjan's algorithm to find the strongly connected components reachable from
+ * `index` in a directed graph. Components consisting of a single node are not reported.
+ *
+ * @param     [in] index The node at which the depth-first traversal starts.
+ * @param     [in] depth Offset for the discovery indices: the first node visited by this call
+ * receives discovery index `depth + 1` and every further node the next higher value. It must be
+ * non-negative, because a discovery index of 0 marks a node as not visited.
+ * @param     [out] strongly_connected_subgraphs A vector to store the identified strongly connected
+ * components (each listed in stack order, root first).
+ * @param     [in,out] DFN An array to store the discovery index of each node.
+ * @param     [in,out] LOW An array to store the lowest discovery index reachable from each node.
+ * @param     [in,out] stack_subgraphs A stack of nodes visited but not yet assigned to a component.
+ * @param     [in] successors_Subgraphs A vector of vectors representing the adjacency list of the
+ * graph.
+ *
+ * @pre       The `DFN` and `LOW` arrays should be initialized to zero before the first call and
+ * hold one entry per node.
+ * @pre       The `stack_subgraphs` should be empty before each top-level call.
+ * @pre       The `successors_Subgraphs` should be properly initialized with the graph's adjacency
+ * list.
+ *
+ * @post      The `strongly_connected_subgraphs` vector additionally contains every multi-node
+ * component reachable from `index` that had not been visited before.
+ * @post      The `stack_subgraphs` is empty again if it was empty on entry.
+ *
+ * @note      Discovery indices are unique within one top-level call. Indices handed out by
+ * different top-level calls may repeat; this is harmless because nodes of an earlier traversal are
+ * no longer on the stack and are therefore ignored.
+ *
+ * @exception None
+ *
+ * @return    None
+ */
+void Tarjan(int index, int depth, std::vector<std::vector<int>> &strongly_connected_subgraphs,
+            int *DFN, int *LOW, std::vector<int> &stack_subgraphs,
+            std::vector<std::vector<int>> &successors_Subgraphs)
+{
+  int last_rank = depth; // shared by the whole traversal so that sibling branches never collide
+  tarjan_visit(index, last_rank, strongly_connected_subgraphs, DFN, LOW, stack_subgraphs,
+               successors_Subgraphs);
+}
+/**
+ * @brief     Merge the subgraphs selected by `strongly_connected` into one graph (via
+ * mergeGraphs) and rank every node of the merged graph. A node gets rank 0 when none of its inputs
+ * is produced by a node of the merged graph; otherwise it gets rank k in pass k, as soon as every
+ * producer of its inputs holds a rank below k.
+ *
+ * @param     [in] strongly_connected Indices of the subgraphs to merge. An index below
+ * `Subgraphs.size()` selects `Subgraphs[index]`; any other index selects
+ * `otherSubgraphs[index - Subgraphs.size()]`.
+ * @param     [in] Subgraphs A vector of ONNX GraphProtos representing the main subgraphs.
+ * @param     [in] otherSubgraphs A vector of ONNX GraphProtos representing additional subgraphs.
+ *
+ * @pre       The `strongly_connected` vector should contain valid indices for `Subgraphs` and
+ * `otherSubgraphs`.
+ *
+ * @note      Ranking only completes for an acyclic merged graph. If the nodes depend on each other
+ * in a cycle, a warning is written to std::cerr and the ranks computed so far are returned instead
+ * of looping forever; nodes on or behind the cycle then hold a provisional rank or -1.
+ *
+ * @exception None
+ *
+ * @return    One `graph_adjacency_node` per node of the merged graph (same order), carrying the
+ * node name, its position in the merged graph and its rank.
+ */
+std::vector<graph_adjacency_node> calculate_node_rank(std::vector<int> &strongly_connected,
+                                                      std::vector<onnx::GraphProto> &Subgraphs,
+                                                      std::vector<onnx::GraphProto> &otherSubgraphs)
+{
+  onnx::GraphProto merged_graph;
+  const int main_count = int(Subgraphs.size());
+  for (const auto &index : strongly_connected)
+  {
+    mergeGraphs(merged_graph,
+                index < main_count ? Subgraphs[index] : otherSubgraphs[index - main_count]);
+  }
+  const int node_count = merged_graph.node_size();
+  std::vector<graph_adjacency_node> node_rank_list;
+  for (int position = 0; position < node_count; position++)
+  {
+    graph_adjacency_node node_rank;
+    node_rank.name = merged_graph.node(position).name();
+    node_rank.index = position;
+    node_rank.rank = -1; // -1 marks "not ranked yet"
+    node_rank_list.push_back(node_rank);
+  }
+  // True when node `i` reads a tensor produced by a node that was not finalised before `pass`.
+  const auto waits_on_unranked = [&](int i, int pass) {
+    for (const auto &input : merged_graph.node(i).input())
+    {
+      for (int j = 0; j < node_count; j++)
+      {
+        const auto producer_rank = node_rank_list[j].rank;
+        if (producer_rank >= 0 && producer_rank < pass)
+        {
+          continue; // producer was finalised in an earlier pass
+        }
+        for (const auto &output : merged_graph.node(j).output())
+        {
+          if (output == input)
+          {
+            return true;
+          }
+        }
+      }
+    }
+    return false;
+  };
+  bool pending = true;
+  for (int pass = 0; pending; pass++)
+  {
+    // An acyclic graph with n nodes is fully ranked after at most n passes; more means a cycle.
+    if (pass > node_count + 1)
+    {
+      std::cerr << "calculate_node_rank: dependency cycle detected, node ranks are incomplete"
+                << std::endl;
+      break;
+    }
+    pending = (pass == 0); // the pass after pass 0 always runs
+    for (int i = 0; i < node_count; i++)
+    {
+      auto &rank = node_rank_list[i].rank;
+      if (rank >= 0 && rank < pass)
+      {
+        continue; // already finalised
+      }
+      if (!waits_on_unranked(i, pass))
+      {
+        rank = pass;
+      }
+      else if (pass > 0)
+      {
+        rank = pass + 1; // provisional; re-examined in the next pass
+        pending = true;
+      }
+    }
+  }
+  return node_rank_list;
+}
+/**
+ * @brief     Merge the subgraphs selected by `strongly_connected` into one graph (via
+ * mergeGraphs) and rank every node of the merged graph. A node gets rank 0 when none of its inputs
+ * is produced by a node of the merged graph; otherwise it gets rank k in pass k, as soon as every
+ * producer of its inputs holds a rank below k.
+ * Compared with calculate_node_rank, the boundary between the two subgraph lists is passed in
+ * explicitly instead of being read from `Subgraphs.size()`.
+ *
+ * @param     [in] strongly_connected Indices of the subgraphs to merge. An index below
+ * `subgraph_size` selects `Subgraphs[index]`; any other index selects
+ * `otherSubgraphs[index - subgraph_size]`.
+ * @param     [in] Subgraphs A vector of ONNX GraphProtos representing the main subgraphs.
+ * @param     [in] otherSubgraphs A vector of ONNX GraphProtos representing additional subgraphs.
+ * @param     [in] subgraph_size The index at which `otherSubgraphs` begins.
+ * @param     [in] other_subgraph_size Not read by this function.
+ *
+ * @pre       The `strongly_connected` vector should contain valid indices for `Subgraphs` and
+ * `otherSubgraphs` with respect to `subgraph_size`.
+ *
+ * @note      Ranking only completes for an acyclic merged graph. If the nodes depend on each other
+ * in a cycle, a warning is written to std::cerr and the ranks computed so far are returned instead
+ * of looping forever; nodes on or behind the cycle then hold a provisional rank or -1.
+ *
+ * @exception None
+ *
+ * @return    One `graph_adjacency_node` per node of the merged graph (same order), carrying the
+ * node name, its position in the merged graph and its rank.
+ */
+std::vector<graph_adjacency_node> calculate_node_rank_v2(
+  std::vector<int> &strongly_connected, std::vector<onnx::GraphProto> &Subgraphs,
+  std::vector<onnx::GraphProto> &otherSubgraphs, int subgraph_size, int other_subgraph_size)
+{
+  onnx::GraphProto merged_graph;
+  for (const auto &index : strongly_connected)
+  {
+    mergeGraphs(merged_graph,
+                index < subgraph_size ? Subgraphs[index] : otherSubgraphs[index - subgraph_size]);
+  }
+  const int node_count = merged_graph.node_size();
+  std::vector<graph_adjacency_node> node_rank_list;
+  for (int position = 0; position < node_count; position++)
+  {
+    graph_adjacency_node node_rank;
+    node_rank.name = merged_graph.node(position).name();
+    node_rank.index = position;
+    node_rank.rank = -1; // -1 marks "not ranked yet"
+    node_rank_list.push_back(node_rank);
+  }
+  // True when node `i` reads a tensor produced by a node that was not finalised before `pass`.
+  const auto waits_on_unranked = [&](int i, int pass) {
+    for (const auto &input : merged_graph.node(i).input())
+    {
+      for (int j = 0; j < node_count; j++)
+      {
+        const auto producer_rank = node_rank_list[j].rank;
+        if (producer_rank >= 0 && producer_rank < pass)
+        {
+          continue; // producer was finalised in an earlier pass
+        }
+        for (const auto &output : merged_graph.node(j).output())
+        {
+          if (output == input)
+          {
+            return true;
+          }
+        }
+      }
+    }
+    return false;
+  };
+  bool pending = true;
+  for (int pass = 0; pending; pass++)
+  {
+    // An acyclic graph with n nodes is fully ranked after at most n passes; more means a cycle.
+    if (pass > node_count + 1)
+    {
+      std::cerr << "calculate_node_rank_v2: dependency cycle detected, node ranks are incomplete"
+                << std::endl;
+      break;
+    }
+    pending = (pass == 0); // the pass after pass 0 always runs
+    for (int i = 0; i < node_count; i++)
+    {
+      auto &rank = node_rank_list[i].rank;
+      if (rank >= 0 && rank < pass)
+      {
+        continue; // already finalised
+      }
+      if (!waits_on_unranked(i, pass))
+      {
+        rank = pass;
+      }
+      else if (pass > 0)
+      {
+        rank = pass + 1; // provisional; re-examined in the next pass
+        pending = true;
+      }
+    }
+  }
+  return node_rank_list;
+}
+/**
+ * @brief     Rank every node of a single ONNX graph, typically the original graph.
+ *            A node gets rank 0 when none of its inputs is produced by a node of the graph;
+ *            otherwise it gets rank k in pass k, as soon as every producer of its inputs holds a
+ *            rank below k.
+ *
+ * @param     [in] merged_graph The ONNX GraphProto whose nodes are ranked.
+ * @param     [out] node_rank_list Receives one `graph_adjacency_node` per node, appended in node
+ * order, carrying the node name, its position in the graph and its rank.
+ *
+ * @pre       The `node_rank_list` should be empty: entries are appended, but ranks are written to
+ * positions 0 .. node_size() - 1 of the list.
+ *
+ * @note      Ranking only completes for an acyclic graph. If the nodes depend on each other in a
+ * cycle, a warning is written to std::cerr and the function returns with the ranks computed so far
+ * instead of looping forever; nodes on or behind the cycle then hold a provisional rank or -1.
+ *
+ * @exception None
+ *
+ * @return    None
+ */
+void calculate_node_rank_v3(const onnx::GraphProto &merged_graph,
+                            std::vector<graph_adjacency_node> &node_rank_list)
+{
+  const int node_count = merged_graph.node_size();
+  for (int position = 0; position < node_count; position++)
+  {
+    graph_adjacency_node node_rank;
+    node_rank.name = merged_graph.node(position).name();
+    node_rank.index = position;
+    node_rank.rank = -1; // -1 marks "not ranked yet"
+    node_rank_list.push_back(node_rank);
+  }
+  // True when node `i` reads a tensor produced by a node that was not finalised before `pass`.
+  const auto waits_on_unranked = [&](int i, int pass) {
+    for (const auto &input : merged_graph.node(i).input())
+    {
+      for (int j = 0; j < node_count; j++)
+      {
+        const auto producer_rank = node_rank_list[j].rank;
+        if (producer_rank >= 0 && producer_rank < pass)
+        {
+          continue; // producer was finalised in an earlier pass
+        }
+        for (const auto &output : merged_graph.node(j).output())
+        {
+          if (output == input)
+          {
+            return true;
+          }
+        }
+      }
+    }
+    return false;
+  };
+  bool pending = true;
+  for (int pass = 0; pending; pass++)
+  {
+    // An acyclic graph with n nodes is fully ranked after at most n passes; more means a cycle.
+    if (pass > node_count + 1)
+    {
+      std::cerr << "calculate_node_rank_v3: dependency cycle detected, node ranks are incomplete"
+                << std::endl;
+      break;
+    }
+    pending = (pass == 0); // the pass after pass 0 always runs
+    for (int i = 0; i < node_count; i++)
+    {
+      auto &rank = node_rank_list[i].rank;
+      if (rank >= 0 && rank < pass)
+      {
+        continue; // already finalised
+      }
+      if (!waits_on_unranked(i, pass))
+      {
+        rank = pass;
+      }
+      else if (pass > 0)
+      {
+        rank = pass + 1; // provisional; re-examined in the next pass
+        pending = true;
+      }
+    }
+  }
+}
+/**
+ * @brief     Determine the cut ranks of a list of ranked nodes. The first cut rank is the lowest
+ * rank above the minimum rank that no node holds. Every later cut rank is a rank that is held by
+ * at least one node while the rank directly below it is held by none, i.e. the first rank after a
+ * gap.
+ *
+ * @param     [in] scc_node_rank The nodes and their ranks.
+ *
+ * @post      The function does not modify the `scc_node_rank` vector.
+ *
+ * @exception None
+ *
+ * @return    The cut ranks in ascending order; the list always holds at least one entry.
+ */
+std::vector<int> get_cut_rank_v2(std::vector<graph_adjacency_node> &scc_node_rank)
+{
+  // Does any node hold exactly this rank?
+  const auto rank_present = [&scc_node_rank](int wanted) {
+    return std::any_of(scc_node_rank.begin(), scc_node_rank.end(),
+                       [wanted](const graph_adjacency_node &entry) { return entry.rank == wanted; });
+  };
+  int lowest = -1; // negative means "no minimum recorded yet"
+  int highest = 0;
+  for (const auto &entry : scc_node_rank)
+  {
+    if (lowest < 0 || entry.rank < lowest)
+    {
+      lowest = entry.rank;
+    }
+    if (entry.rank > highest)
+    {
+      highest = entry.rank;
+    }
+  }
+  // Walk upwards from the minimum until the first rank nobody holds.
+  int first_gap = lowest + 1;
+  while (rank_present(first_gap))
+  {
+    first_gap++;
+  }
+  std::vector<int> cut_rank_list;
+  cut_rank_list.push_back(first_gap);
+  // Above the first gap, each occupied rank that directly follows an empty rank starts a new cut.
+  for (int candidate = first_gap + 1; candidate <= highest; candidate++)
+  {
+    if (!rank_present(candidate) && rank_present(candidate + 1))
+    {
+      cut_rank_list.push_back(candidate + 1);
+    }
+  }
+  return cut_rank_list;
+}
+/**
+ * @brief     Break up every subgraph that takes part in a strongly connected component by
+ * splitting it along node ranks.
+ *
+ * Node ranks are computed once on the original graph `g` (calculate_node_rank_v3). For each
+ * subgraph listed in a component, the cut ranks of its nodes are determined (get_cut_rank_v2):
+ *  - nodes ranked below the first cut rank are grouped into chains of consecutive nodes whose rank
+ *    rises by exactly one from node to node; the first chain replaces the subgraph in place, the
+ *    other chains are appended to the list the subgraph came from;
+ *  - nodes between two neighbouring cut ranks, and nodes at or above the last cut rank, are
+ *    appended as one new subgraph per range.
+ * Finally all subgraphs without nodes are removed from both lists.
+ *
+ * @param     [in] strongly_connected_subgraphs Components, each a list of subgraph indices. An
+ * index below the initial `Subgraphs.size()` refers to `Subgraphs`, any other index to
+ * `otherSubgraphs` (offset by that size).
+ * @param     [in,out] Subgraphs List of subgraphs that will be updated.
+ * @param     [in,out] otherSubgraphs List of other subgraphs that will be updated.
+ * @param     [in] g The original graph from which the node ranks are derived.
+ * @pre       The input graph `g` should contain every node of the listed subgraphs (matched by
+ * node name).
+ * @post      The `Subgraphs` and `otherSubgraphs` lists may be modified with new partitions based
+ * on node ranks and contain no empty subgraph.
+ * @note      A listed subgraph that is empty, or whose nodes cannot all be found in `g`, is left
+ * untouched and reported on std::cerr.
+ * @exception None
+ * @return    None
+ */
+void eliminate_scc_v2(std::vector<std::vector<int>> &strongly_connected_subgraphs,
+                      std::vector<onnx::GraphProto> &Subgraphs,
+                      std::vector<onnx::GraphProto> &otherSubgraphs, const onnx::GraphProto &g)
+{
+  // Component indices refer to the list sizes on entry, so the boundary is captured up front.
+  const int subgraph_size = Subgraphs.size();
+  std::vector<graph_adjacency_node> node_rank_list;
+  calculate_node_rank_v3(g, node_rank_list);
+  for (auto &strongly_connected : strongly_connected_subgraphs)
+  {
+    for (const auto scc_index : strongly_connected)
+    {
+      const bool in_main = scc_index < subgraph_size;
+      std::vector<onnx::GraphProto> &owner = in_main ? Subgraphs : otherSubgraphs;
+      const int slot = in_main ? scc_index : scc_index - subgraph_size;
+      // Work on a copy: `owner` is overwritten and grown below.
+      const onnx::GraphProto scc_graph = owner[slot];
+      const int node_count = scc_graph.node_size();
+
+      // Look up the rank of every node of this subgraph by name.
+      std::vector<graph_adjacency_node> scc_node_rank;
+      for (int i = 0; i < node_count; i++)
+      {
+        for (const auto &ranked : node_rank_list)
+        {
+          if (scc_graph.node(i).name() == ranked.name)
+          {
+            scc_node_rank.push_back(ranked);
+            break;
+          }
+        }
+      }
+      // Everything below indexes scc_node_rank by node position, so both must line up.
+      if (node_count == 0 || int(scc_node_rank.size()) != node_count)
+      {
+        std::cerr << "eliminate_scc_v2: skipping subgraph " << scc_index
+                  << " (empty, or nodes missing from the ranked graph)" << std::endl;
+        continue;
+      }
+
+      std::vector<int> cut_rank = get_cut_rank_v2(scc_node_rank);
+      const int first_cut = cut_rank[0];
+      int node_in_upper = 0;
+      for (const auto &ranked : scc_node_rank)
+      {
+        if (ranked.rank < first_cut)
+        {
+          node_in_upper++;
+        }
+      }
+      int node_in_upper_added = 0;
+      std::vector<onnx::GraphProto> temp_graph_upper_adder_list;
+      int record_i = 0; // position at which the next chain starts
+      std::cout << "node size: " << node_count << std::endl;
+      std::cout << "node in upper: " << node_in_upper << std::endl;
+      // Each round collects one chain of upper nodes, starting at record_i.
+      while (node_in_upper_added < node_in_upper)
+      {
+        onnx::GraphProto temp_graph_upper_adder;
+        for (int i = record_i; i < node_count; i++)
+        {
+          const bool in_upper = scc_node_rank[i].rank < first_cut;
+          // The chain continues only while the rank rises by exactly one per node.
+          const bool extends_chain =
+            i == record_i || scc_node_rank[i].rank == scc_node_rank[i - 1].rank + 1;
+          if (in_upper && extends_chain)
+          {
+            *temp_graph_upper_adder.add_node() = scc_graph.node(i);
+            node_in_upper_added++;
+          }
+          else
+          {
+            // A lower node is skipped; an upper node that breaks the chain starts the next one.
+            record_i = in_upper ? i : i + 1;
+            if (temp_graph_upper_adder.node_size() > 0)
+            {
+              temp_graph_upper_adder_list.push_back(temp_graph_upper_adder);
+              temp_graph_upper_adder.clear_node();
+            }
+            break;
+          }
+          if (i == node_count - 1 && temp_graph_upper_adder.node_size() > 0)
+          {
+            temp_graph_upper_adder_list.push_back(temp_graph_upper_adder);
+            temp_graph_upper_adder.clear_node();
+          }
+        }
+        std::cout << "loop ended:temp graph upper adder size: "
+                  << temp_graph_upper_adder.node_size() << " " << record_i << "/" << node_count
+                  << " node_in_upper_added:" << node_in_upper_added << std::endl;
+      }
+      // The first chain takes over the original slot, the remaining chains are appended.
+      if (!temp_graph_upper_adder_list.empty())
+      {
+        owner[slot] = temp_graph_upper_adder_list[0];
+      }
+      for (int i = 1; i < int(temp_graph_upper_adder_list.size()); i++)
+      {
+        owner.push_back(temp_graph_upper_adder_list[i]);
+      }
+      std::cout << "scc index" << scc_index << " scc size: " << node_count << std::endl;
+      std::cout << "scc node rank: ";
+      for (const auto &ranked : scc_node_rank)
+      {
+        std::cout << ranked.name << " " << ranked.rank << " ";
+      }
+      std::cout << std::endl;
+      // One new subgraph per rank range [cut_rank[i], cut_rank[i + 1]); the last range is open.
+      const int cut_count = int(cut_rank.size());
+      for (int i = 0; i < cut_count; i++)
+      {
+        const bool last_range = (i + 1 == cut_count);
+        onnx::GraphProto temp_graph_lower;
+        for (int j = 0; j < node_count; j++)
+        {
+          const auto rank = scc_node_rank[j].rank;
+          if (rank >= cut_rank[i] && (last_range || rank < cut_rank[i + 1]))
+          {
+            *temp_graph_lower.add_node() = scc_graph.node(j);
+          }
+        }
+        if (temp_graph_lower.node_size() > 0)
+        {
+          owner.push_back(temp_graph_lower);
+        }
+      }
+    }
+  }
+  // Drop subgraphs that ended up without nodes.
+  const auto has_no_nodes = [](const onnx::GraphProto &graph) { return graph.node_size() == 0; };
+  Subgraphs.erase(std::remove_if(Subgraphs.begin(), Subgraphs.end(), has_no_nodes),
+                  Subgraphs.end());
+  otherSubgraphs.erase(std::remove_if(otherSubgraphs.begin(), otherSubgraphs.end(), has_no_nodes),
+                       otherSubgraphs.end());
+}
+/**
+ * @brief     Break up every subgraph that takes part in a strongly connected component by
+ * splitting it into single-node subgraphs.
+ *
+ * @param     [in] strongly_connected_subgraphs Components, each a list of subgraph indices. An
+ * index below the initial `Subgraphs.size()` refers to `Subgraphs`, any other index to
+ * `otherSubgraphs` (offset by that size).
+ * @param     [in,out] Subgraphs List of subgraphs that will be updated.
+ * @param     [in,out] otherSubgraphs List of other subgraphs that will be updated.
+ * @param     [in] g The original graph; not read by this function.
+ * @post      For every listed subgraph, its first node replaces the subgraph in place and each
+ * further node is appended as its own subgraph to the list the subgraph came from. Subgraphs
+ * without nodes are removed from both lists.
+ * @exception None
+ * @return    None
+ */
+void eliminate_scc_v3(std::vector<std::vector<int>> &strongly_connected_subgraphs,
+                      std::vector<onnx::GraphProto> &Subgraphs,
+                      std::vector<onnx::GraphProto> &otherSubgraphs, const onnx::GraphProto &g)
+{
+  // Component indices refer to the list sizes on entry, so the boundary is captured up front.
+  const int subgraph_size = Subgraphs.size();
+  for (const auto &component : strongly_connected_subgraphs)
+  {
+    for (const auto scc_index : component)
+    {
+      std::cout << "scc index: " << scc_index << std::endl;
+      const bool in_main = scc_index < subgraph_size;
+      std::vector<onnx::GraphProto> &owner = in_main ? Subgraphs : otherSubgraphs;
+      const int slot = in_main ? scc_index : scc_index - subgraph_size;
+      // Work on a copy: `owner` is overwritten and grown while the nodes are distributed.
+      const onnx::GraphProto scc_graph = owner[slot];
+      for (int j = 0; j < scc_graph.node_size(); j++)
+      {
+        onnx::GraphProto single_node_graph;
+        *single_node_graph.add_node() = scc_graph.node(j);
+        if (j == 0)
+        {
+          owner[slot] = single_node_graph; // the first node keeps the original position
+        }
+        else
+        {
+          owner.push_back(single_node_graph);
+        }
+      }
+    }
+  }
+  // Drop subgraphs without nodes.
+  const auto has_no_nodes = [](const onnx::GraphProto &graph) { return graph.node_size() == 0; };
+  Subgraphs.erase(std::remove_if(Subgraphs.begin(), Subgraphs.end(), has_no_nodes),
+                  Subgraphs.end());
+  otherSubgraphs.erase(std::remove_if(otherSubgraphs.begin(), otherSubgraphs.end(), has_no_nodes),
+                       otherSubgraphs.end());
+}
+/**
+ * @brief     Return a copy of the graph addressed by a combined index over both subgraph lists.
+ *
+ * @param     [in] index Combined index: values below `subgraph_size` address `Subgraphs`, all
+ * other values address `otherSubgraphs` at position `index - subgraph_size`.
+ * @param     [in] Subgraphs List of subgraphs.
+ * @param     [in] otherSubgraphs List of other subgraphs.
+ * @param     [in] subgraph_size The index at which `otherSubgraphs` begins.
+ * @pre       The `index` should be a valid index within the combined range of `Subgraphs` and
+ * `otherSubgraphs`; no bounds check is performed.
+ * @post      None
+ * @exception None
+ * @return    The graph corresponding to the given index.
+ */
+onnx::GraphProto determinegraphtype_v2(int index, std::vector<onnx::GraphProto> &Subgraphs,
+                                       std::vector<onnx::GraphProto> &otherSubgraphs,
+                                       int subgraph_size)
+{
+  const bool in_main_list = index < subgraph_size;
+  return in_main_list ? Subgraphs[index] : otherSubgraphs[index - subgraph_size];
+}
+/**
+ * @brief     Within every strongly connected component, find pairs of subgraphs that feed each
+ * other: a pair (a, b) is recorded when some input of `a` is an output of `b` and some output of
+ * `a` is an input of `b`. Only partners listed after `a` in the component are considered. All
+ * pairs found are printed to std::cout at the end.
+ *
+ * @param     [in] strongly_connected_subgraphs List of strongly connected components, each a list
+ * of graph indices.
+ * @param     [in] Subgraphs List of subgraphs; not read by this function.
+ * @param     [in] otherSubgraphs List of other subgraphs; not read by this function.
+ * @param     [in] graphs_inputs Input tensors of each graph, addressed by graph index.
+ * @param     [in] graphs_outputs Output tensors of each graph, addressed by graph index.
+ * @param     [out] sccs_pairs Receives, per component with at least one pair, the list of pairs
+ * (each pair holds two graph indices).
+ * @pre       Every index in `strongly_connected_subgraphs` should be valid for `graphs_inputs`
+ * and `graphs_outputs`.
+ * @post      The `sccs_pairs` list will contain pairs of indices representing connected subgraphs.
+ * @exception None
+ * @return    None
+ */
+void find_subgraph_pair_v2(std::vector<std::vector<int>> &strongly_connected_subgraphs,
+                           std::vector<onnx::GraphProto> &Subgraphs,
+                           std::vector<onnx::GraphProto> &otherSubgraphs,
+                           std::vector<std::unordered_set<NodeTensor>> &graphs_inputs,
+                           std::vector<std::unordered_set<NodeTensor>> &graphs_outputs,
+                           std::vector<std::vector<std::vector<int>>> &sccs_pairs)
+{
+  for (const auto &strongly_connected : strongly_connected_subgraphs)
+  {
+    const int member_count = int(strongly_connected.size());
+    // Tensor sets of the component members, in component order.
+    std::vector<std::unordered_set<NodeTensor>> scc_graphs_inputs;
+    std::vector<std::unordered_set<NodeTensor>> scc_graphs_outputs;
+    for (const auto &index : strongly_connected)
+    {
+      std::unordered_set<NodeTensor> member_inputs = graphs_inputs[index];
+      std::unordered_set<NodeTensor> member_outputs = graphs_outputs[index];
+      scc_graphs_inputs.push_back(member_inputs);
+      scc_graphs_outputs.push_back(member_outputs);
+    }
+    std::vector<std::vector<int>> scc_pairs;
+    std::vector<int> is_pushed(member_count, 0); // 1 once a member is part of a recorded pair
+    for (int i = 0; i < member_count; i++)
+    {
+      // NOTE(review): a member that was already paired as the second element is still examined
+      // here for its first input and its direct successor - confirm that this is intended.
+      for (const auto &graph_input : scc_graphs_inputs[i])
+      {
+        for (int j = i + 1; j < member_count; j++)
+        {
+          const bool partner_feeds_us =
+            is_pushed[j] == 0 && scc_graphs_outputs[j].count(graph_input) != 0;
+          if (partner_feeds_us)
+          {
+            const auto &partner_inputs = scc_graphs_inputs[j];
+            const bool we_feed_partner = std::any_of(
+              scc_graphs_outputs[i].begin(), scc_graphs_outputs[i].end(),
+              [&partner_inputs](const NodeTensor &tensor) {
+                return partner_inputs.count(tensor) != 0;
+              });
+            if (we_feed_partner)
+            {
+              std::vector<int> scc_pair;
+              scc_pair.push_back(strongly_connected[i]);
+              scc_pair.push_back(strongly_connected[j]);
+              scc_pairs.push_back(scc_pair);
+              is_pushed[j] = 1;
+              is_pushed[i] = 1;
+            }
+          }
+          if (is_pushed[i] == 1)
+          {
+            break;
+          }
+        }
+        if (is_pushed[i] == 1)
+        {
+          break;
+        }
+      }
+    }
+    if (!scc_pairs.empty())
+    {
+      sccs_pairs.push_back(scc_pairs);
+    }
+  }
+  // Report every component's pairs on one line.
+  for (const auto &scc_pairs : sccs_pairs)
+  {
+    std::cout << "scc pair:";
+    for (const auto &scc_pair : scc_pairs)
+    {
+      for (const auto &scc_id : scc_pair)
+      {
+        std::cout << scc_id << " ";
+      }
+      std::cout << ";";
+    }
+    std::cout << std::endl;
+  }
+}
+/**
+ * @brief     Cut the master graph of a subgraph pair into upper and lower parts by node rank.
+ *
+ * The graph of the pair that contains the first rank-0 node reported by calculate_node_rank() is
+ * treated as the master graph and the other one as the slave graph (the master defaults to
+ * scc_pair[0] when no rank-0 node is reported). The cut rank is the smallest rank among the slave
+ * nodes that produce a tensor consumed by the master graph. Master nodes whose rank is below the
+ * cut rank form the upper part; all remaining master nodes form the lower part.
+ *
+ * @param     [in] Subgraphs List of subgraphs.
+ * @param     [in] otherSubgraphs List of other subgraphs.
+ * @param     [in] graphs_inputs List of input tensors for each graph, indexed by graph id.
+ * @param     [in] graphs_outputs List of output tensors for each graph, indexed by graph id.
+ * @param     [in,out] scc_pair Pair of graph ids to be cut. On return the two ids are swapped if
+ * necessary so that scc_pair[0] is the master graph.
+ * @param     [out] scc_pair_cut Receives three graphs appended in this order: upper part of the
+ * master graph, lower part of the master graph, slave graph.
+ * @param     [in] subgraph_size Size of subgraph, forwarded to determinegraphtype_v2().
+ * @pre       The input lists should be properly initialized and contain valid data.
+ * @post      The `scc_pair_cut` list will contain the cut subgraphs.
+ * @exception None
+ * @return    A two-element vector {master position, cut rank}. Because scc_pair is reordered, the
+ * master position is always 0. The cut rank is -1 when no slave output feeds a master input.
+ */
+std::vector<int> cut_pair(std::vector<onnx::GraphProto> &Subgraphs,
+                          std::vector<onnx::GraphProto> &otherSubgraphs,
+                          std::vector<std::unordered_set<NodeTensor>> &graphs_inputs,
+                          std::vector<std::unordered_set<NodeTensor>> &graphs_outputs,
+                          std::vector<int> &scc_pair, std::vector<onnx::GraphProto> &scc_pair_cut,
+                          int subgraph_size)
+{
+  std::vector<graph_adjacency_node> pair_node_list =
+    calculate_node_rank(scc_pair, Subgraphs, otherSubgraphs);
+  // Resolve both graphs of the pair exactly once. They do not change while this function runs, so
+  // there is no need to rebuild them inside the loops below.
+  // NOTE(review): assumes determinegraphtype_v2() is a side-effect-free lookup - confirm.
+  const onnx::GraphProto first_graph =
+    determinegraphtype_v2(scc_pair[0], Subgraphs, otherSubgraphs, subgraph_size);
+  const onnx::GraphProto second_graph =
+    determinegraphtype_v2(scc_pair[1], Subgraphs, otherSubgraphs, subgraph_size);
+  // The indexing below treats pair_node_list as the nodes of scc_pair[0] followed by the nodes of
+  // scc_pair[1], so entries of the second graph start at this offset.
+  const int second_offset = first_graph.node_size();
+
+  // The graph that contains the first rank-0 node becomes the master graph.
+  int master_graph = 0;
+  for (const auto &node : pair_node_list)
+  {
+    if (node.rank != 0)
+    {
+      continue;
+    }
+    bool in_first_graph = false;
+    for (const auto &graph_node : first_graph.node())
+    {
+      if (graph_node.name() == node.name)
+      {
+        in_first_graph = true;
+        break;
+      }
+    }
+    master_graph = in_first_graph ? 0 : 1;
+    break;
+  }
+  const int slave_graph = 1 - master_graph;
+  const onnx::GraphProto &master = (master_graph == 0) ? first_graph : second_graph;
+  const onnx::GraphProto &slave = (slave_graph == 0) ? first_graph : second_graph;
+  const int master_offset = (master_graph == 0) ? 0 : second_offset;
+  const int slave_offset = (slave_graph == 0) ? 0 : second_offset;
+
+  // find the position where master and slave graph connect
+  int cut_rank = -1;
+  for (const auto &output : graphs_outputs[scc_pair[slave_graph]])
+  {
+    for (const auto &input : graphs_inputs[scc_pair[master_graph]])
+    {
+      if (input.name != output.name)
+      {
+        continue;
+      }
+      // Locate the first slave node that produces this tensor and keep the lowest rank seen.
+      int node_index = 0;
+      for (const auto &graph_node : slave.node())
+      {
+        bool produces_tensor = false;
+        for (const auto &output_node : graph_node.output())
+        {
+          if (output_node == output.name)
+          {
+            produces_tensor = true;
+            break;
+          }
+        }
+        if (produces_tensor)
+        {
+          const auto &producer = pair_node_list[node_index + slave_offset];
+          if (cut_rank == -1 || cut_rank > producer.rank)
+          {
+            cut_rank = producer.rank;
+          }
+          break;
+        }
+        node_index++;
+      }
+      // One matching master input is enough for this slave output.
+      break;
+    }
+  }
+
+  // cut master graph according to the rank
+  onnx::GraphProto master_upper;
+  onnx::GraphProto master_lower;
+  int node_index = 0;
+  for (const auto &node : master.node())
+  {
+    const int node_rank = pair_node_list[node_index + master_offset].rank;
+    if (node_rank < cut_rank)
+    {
+      *master_upper.add_node() = node;
+    }
+    else
+    {
+      *master_lower.add_node() = node;
+    }
+    node_index++;
+  }
+  scc_pair_cut.push_back(master_upper);
+  scc_pair_cut.push_back(master_lower);
+  scc_pair_cut.push_back(slave);
+  if (master_graph == 1)
+  {
+    int temp = scc_pair[0];
+    scc_pair[0] = scc_pair[1];
+    scc_pair[1] = temp;
+    master_graph = 0;
+  } // assure the first graph is master
+  std::vector<int> return_value;
+  return_value.push_back(master_graph);
+  return_value.push_back(cut_rank);
+  return return_value;
+}
+/**
+ * @brief     Eliminate pairs of subgraphs by cutting them and updating the subgraph lists.
+ *
+ * Every pair reported by find_subgraph_pair_v2() is passed to cut_pair(). The upper part of the
+ * master graph replaces the master graph in place, the lower part is appended to the list the
+ * master graph lives in, and the slave graph returned by cut_pair() is written back to its slot.
+ * Graph ids below `subgraph_size` address `Subgraphs`; all other ids address `otherSubgraphs` at
+ * position (id - subgraph_size). Finally every graph left without nodes is removed from both
+ * lists.
+ *
+ * @param     [in,out] Subgraphs List of subgraphs to be processed and updated.
+ * @param     [in,out] otherSubgraphs List of other subgraphs to be processed and updated.
+ * @param     [in] graphs_inputs List of input tensors for each graph.
+ * @param     [in] graphs_outputs List of output tensors for each graph.
+ * @param     [in] strongly_connected_subgraphs List of strongly connected subgraphs.
+ * @param     [in] subgraph_size Size of subgraph; ids below this value refer to `Subgraphs`.
+ * @pre       The input lists should be properly initialized and contain valid data.
+ * @post      The `Subgraphs` and `otherSubgraphs` lists will be updated with cut subgraphs and
+ * contain no empty graph.
+ * @exception None
+ * @return    None
+ */
+void eliminate_pair_v2(std::vector<onnx::GraphProto> &Subgraphs,
+                       std::vector<onnx::GraphProto> &otherSubgraphs,
+                       std::vector<std::unordered_set<NodeTensor>> &graphs_inputs,
+                       std::vector<std::unordered_set<NodeTensor>> &graphs_outputs,
+                       std::vector<std::vector<int>> &strongly_connected_subgraphs,
+                       int subgraph_size)
+{
+  std::vector<std::vector<std::vector<int>>> sccs_pairs;
+  find_subgraph_pair_v2(strongly_connected_subgraphs, Subgraphs, otherSubgraphs, graphs_inputs,
+                        graphs_outputs, sccs_pairs);
+  for (auto &scc_pairs : sccs_pairs)
+  {
+    for (auto &scc_pair : scc_pairs)
+    {
+      // cut_pair() reorders scc_pair so that scc_pair[0] is the master graph and fills
+      // scc_pair_cut with {master upper part, master lower part, slave graph}.
+      std::vector<onnx::GraphProto> scc_pair_cut;
+      cut_pair(Subgraphs, otherSubgraphs, graphs_inputs, graphs_outputs, scc_pair, scc_pair_cut,
+               subgraph_size);
+      const int master_id = scc_pair[0];
+      const int slave_id = scc_pair[1];
+
+      // The upper part takes the master's slot; the lower part becomes a new graph in the same
+      // list.
+      if (master_id < subgraph_size)
+      {
+        Subgraphs[master_id] = scc_pair_cut[0];
+        Subgraphs.push_back(scc_pair_cut[1]);
+      }
+      else
+      {
+        otherSubgraphs[master_id - subgraph_size] = scc_pair_cut[0];
+        otherSubgraphs.push_back(scc_pair_cut[1]);
+      }
+
+      if (slave_id < subgraph_size)
+      {
+        Subgraphs[slave_id] = scc_pair_cut[2];
+      }
+      else
+      {
+        otherSubgraphs[slave_id - subgraph_size] = scc_pair_cut[2];
+      }
+    }
+  }
+  // Drop graphs that ended up empty; walk backwards so erasing does not shift unvisited slots.
+  for (int i = Subgraphs.size() - 1; i >= 0; i--)
+  {
+    if (Subgraphs[i].node_size() == 0)
+    {
+      Subgraphs.erase(Subgraphs.begin() + i);
+    }
+  }
+  for (int i = otherSubgraphs.size() - 1; i >= 0; i--)
+  {
+    if (otherSubgraphs[i].node_size() == 0)
+    {
+      otherSubgraphs.erase(otherSubgraphs.begin() + i);
+    }
+  }
+}
+/**
+ * @brief     Find the successor or predecessor subgraph with the least number of nodes.
+ *
+ * Only neighbours stored in the same list as the current subgraph are considered: an id below
+ * Subgraphs.size() refers to `Subgraphs`, any other id refers to `otherSubgraphs` at position
+ * (id - Subgraphs.size()). Neighbours with exactly one node are never selected. Successors are
+ * examined before predecessors and ties keep the earlier candidate. A trace line is written to
+ * std::cout for every neighbour examined.
+ *
+ * @param     [in] index Index of the current subgraph.
+ * @param     [in] successor List of successor indices.
+ * @param     [in] predecessor List of predecessor indices.
+ * @param     [in] Subgraphs List of subgraphs.
+ * @param     [in] otherSubgraphs List of other subgraphs.
+ * @pre       The input lists should be properly initialized and contain valid data.
+ * @post      None
+ * @exception None
+ * @return    Index of the successor or predecessor subgraph with the least number of nodes, or -1
+ * if no such subgraph exists.
+ */
+int find_min_size(int index, std::vector<int> &successor, std::vector<int> &predecessor,
+                  std::vector<onnx::GraphProto> &Subgraphs,
+                  std::vector<onnx::GraphProto> &otherSubgraphs)
+{
+  const int subgraph_count = int(Subgraphs.size());
+  const bool index_in_subgraphs = index < subgraph_count;
+  int min_index = -1;
+  // Only meaningful once min_index >= 0; until then it is just the value shown in the trace.
+  int min_size = 10000;
+
+  // Examine one neighbour list and update min_index / min_size with the smallest eligible graph.
+  auto consider = [&](const std::vector<int> &candidates, const char *label) {
+    for (const int candidate : candidates)
+    {
+      std::cout << label << candidate;
+      const bool candidate_in_subgraphs = candidate < subgraph_count;
+      if (candidate_in_subgraphs != index_in_subgraphs)
+      {
+        // Neighbour lives in the other list: not a merge target.
+        continue;
+      }
+      // Read the node count in place instead of copying the whole graph.
+      const int size = candidate_in_subgraphs
+                         ? int(Subgraphs[candidate].node_size())
+                         : int(otherSubgraphs[candidate - subgraph_count].node_size());
+      std::cout << " size:" << size << " min:" << min_size;
+      // The first eligible candidate is always accepted, so arbitrarily large graphs can still be
+      // chosen when they are the only option.
+      if ((min_index == -1 || size < min_size) && size != 1)
+      {
+        min_size = size;
+        min_index = candidate;
+        std::cout << " update min index:" << min_index;
+      }
+      std::cout << std::endl;
+    }
+  };
+
+  consider(successor, "successor: ");
+  consider(predecessor, "predecessor: ");
+  return min_index;
+}
+void Partition::PartitionGraph(const onnx::GraphProto &g, Device &d, PartitionStrategy strategy,
+                               const std::unordered_map<std::string, NodeIOSize> &node_io_size)
+{
+  std::unordered_set<NodeTensor> IOvalueNames = getIOvalue(g);
+  int *visited = (int *)malloc(g.node_size() * sizeof(int));
+  std::vector<graph_adjacency_node> adjacency_list = get_adjancency_list(g, visited);
+  std::vector<onnx::GraphProto> otherSubgraphs;
+  determine_subgraphs_v2(g, otherSubgraphs, d, visited, adjacency_list, strategy);
+  std::cout << "Partition Done" << std::endl;
+  free(visited);
+  std::vector<graph_adjacency_node>().swap(adjacency_list);
+  int node_sum = 0;
+  // traverse the structures and print each element
+  std::ofstream outFile("./subgraphs_1.txt");
+  if (!outFile.is_open())
+  {
+    std::cerr << "Error opening file." << std::endl;
+    exit(0);
+  }
+  int id = 0;
+  for (const auto &vec : Subgraphs)
+  {
+    outFile << " subgraph" << id << ":";
+    for (const auto &node : vec.node())
+    {
+      outFile << node.name() << " ";
+    }
+    id++;
+    outFile << std::endl;
+    node_sum += vec.node_size();
+  }
+  int id_record = id;
+  std::ofstream outFile_2("./subgraphs_2.txt");
+  if (!outFile_2.is_open())
+  {
+    std::cerr << "Error opening file." << std::endl;
+    exit(0);
+  }
+  std::cout << "before:" << std::endl;
+  for (const auto &vec : otherSubgraphs)
+  {
+    outFile_2 << " subgraph" << id << ":";
+    for (const auto &node : vec.node())
+    {
+      outFile_2 << node.name() << " ";
+    }
+    id++;
+    outFile_2 << std::endl;
+    node_sum += vec.node_size();
+  }
+  std::vector<std::unordered_set<std::string>> subgraphs_2_input_nodes_;
+  std::vector<std::unordered_set<std::string>> subgraphs_2_nodes_;
+  for (const auto &sg : otherSubgraphs)
+  {
+    std::unordered_set<NodeTensor> graphInputs;
+    determineGraphInput(sg, IOvalueNames, graphInputs);
+    std::unordered_set<std::string> graphInputsNodes;
+    for (const auto &input : graphInputs)
+    {
+      auto nodename = findInputNode(g, input.name);
+      if (nodename != "")
+      {
+        graphInputsNodes.insert(nodename);
+      }
+    }
+    subgraphs_2_input_nodes_.push_back(graphInputsNodes);
+    subgraphs_2_nodes_.push_back(collectNodeNames(sg));
+  }
+  int *is_merged = (int *)malloc(otherSubgraphs.size() * sizeof(int));
+  for (int i = 0; i < int(otherSubgraphs.size()); i++)
+  {
+    is_merged[i] = 0;
+  }
+  std::cout << "graph size after merging:" << otherSubgraphs.size() << std::endl;
+  free(is_merged);
+  std::ofstream outFile_3("./subgraphs_3.txt");
+  if (!outFile_3.is_open())
+  {
+    std::cerr << "Error opening file." << std::endl;
+    exit(0);
+  }
+  ////othersubgraphs after merged
+  for (const auto &vec : otherSubgraphs)
+  {
+    outFile_3 << " subgraph" << id_record << ":";
+    for (const auto &node : vec.node())
+    {
+      outFile_3 << node.name() << " ";
+    }
+    id_record++;
+    outFile_3 << std::endl;
+  }
+  std::cout << "sub node size:" << node_sum << std::endl;
+
+  std::vector<std::unordered_set<NodeTensor>> subgraphs_1_inputs;
+  std::vector<std::unordered_set<std::string>> subgraphs_1_input_nodes;
+  std::vector<std::unordered_set<std::string>> subgraphs_1_nodes;
+  for (const auto &sg : Subgraphs)
+  {
+    std::unordered_set<NodeTensor> graphInputs;
+    determineGraphInput(sg, IOvalueNames, graphInputs);
+    subgraphs_1_inputs.push_back(graphInputs);
+    std::unordered_set<std::string> graphInputsNodes;
+    for (const auto &input : graphInputs)
+    {
+      auto nodename = findInputNode(g, input.name);
+      if (nodename != "")
+      {
+        graphInputsNodes.insert(nodename);
+      }
+    }
+    subgraphs_1_input_nodes.push_back(graphInputsNodes);
+    subgraphs_1_nodes.push_back(collectNodeNames(sg));
+  }
+
+  std::vector<std::unordered_set<NodeTensor>> subgraphs_2_inputs;
+  std::vector<std::unordered_set<std::string>> subgraphs_2_input_nodes;
+  std::vector<std::unordered_set<std::string>> subgraphs_2_nodes;
+  for (const auto &sg : otherSubgraphs)
+  {
+    std::unordered_set<NodeTensor> graphInputs;
+    determineGraphInput(sg, IOvalueNames, graphInputs);
+    subgraphs_2_inputs.push_back(graphInputs);
+    std::unordered_set<std::string> graphInputsNodes;
+    for (const auto &input : graphInputs)
+    {
+      auto nodename = findInputNode(g, input.name);
+      if (nodename != "")
+      {
+        graphInputsNodes.insert(nodename);
+      }
+    }
+    subgraphs_2_input_nodes.push_back(graphInputsNodes);
+    subgraphs_2_nodes.push_back(collectNodeNames(sg));
+  }
+  std::vector<std::unordered_set<NodeTensor>> subgraphs_1_outputs;
+
+  int node_number = 0;
+
+  for (const auto &sg : Subgraphs)
+  {
+    std::unordered_set<NodeTensor> graphOutputs;
+    node_number += sg.node_size();
+    determineGraphOutput(g, sg, subgraphs_1_inputs, subgraphs_2_inputs, graphOutputs);
+    subgraphs_1_outputs.push_back(graphOutputs);
+  }
+  std::vector<std::unordered_set<NodeTensor>> subgraphs_2_outputs;
+  for (const auto &sg : otherSubgraphs)
+  {
+    std::unordered_set<NodeTensor> graphOutputs;
+    node_number += sg.node_size();
+    determineGraphOutput(g, sg, subgraphs_1_inputs, subgraphs_2_inputs, graphOutputs);
+    subgraphs_2_outputs.push_back(graphOutputs);
+  }
+  int graph_node_size_minus_constant = g.node_size();
+  for (const auto &node : g.node())
+  {
+    if (node.op_type() == "Constant")
+    {
+      graph_node_size_minus_constant--;
+    }
+  }
+  std::cout << "total number of nodes in subgraphs:" << node_number << std::endl;
+  std::cout << "total number of nodes in origional graph:" << graph_node_size_minus_constant
+            << std::endl;
+  std::vector<std::unordered_set<NodeTensor>> graphs_inputs;
+  graphs_inputs.insert(graphs_inputs.end(), subgraphs_1_inputs.begin(), subgraphs_1_inputs.end());
+  graphs_inputs.insert(graphs_inputs.end(), subgraphs_2_inputs.begin(), subgraphs_2_inputs.end());
+  std::vector<std::unordered_set<NodeTensor>> graphs_outputs;
+  graphs_outputs.insert(graphs_outputs.end(), subgraphs_1_outputs.begin(),
+                        subgraphs_1_outputs.end());
+  graphs_outputs.insert(graphs_outputs.end(), subgraphs_2_outputs.begin(),
+                        subgraphs_2_outputs.end());
+
+  std::vector<std::vector<int>> predecessors_Subgraphs(graphs_inputs.size());
+  std::vector<std::vector<int>> successors_Subgraphs(graphs_inputs.size());
+  for (int i = 0; i < int(graphs_inputs.size()); i++) // traversal all subgraphs
+  {
+    std::vector<int> predecessors;
+    for (const auto &g_input : graphs_inputs[i])
+    {
+      for (int j = 0; j < int(graphs_outputs.size()); j++)
+      {
+        if ((graphs_outputs[j].find(g_input) != graphs_outputs[j].end()))
+        {
+          predecessors.push_back(j);
+        }
+      }
+    }
+    if (predecessors.size() == 0)
+    {
+      std::cout << "subgraph " << i << " has no predecessors" << std::endl;
+    }
+    predecessors_Subgraphs[i].insert(predecessors_Subgraphs[i].end(), predecessors.begin(),
+                                     predecessors.end());
+  }
+  for (int i = 0; i < int(graphs_inputs.size()); i++)
+  {
+    for (int j = 0; j < int(graphs_inputs.size()); j++)
+    {
+      if (find(predecessors_Subgraphs[j].begin(), predecessors_Subgraphs[j].end(), i) !=
+          predecessors_Subgraphs[j].end())
+      {
+        successors_Subgraphs[i].push_back(j);
+      }
+    }
+  }
+  std::vector<std::vector<int>> strongly_connected_subgraphs;
+  int *DFN = (int *)malloc(graphs_inputs.size() * sizeof(int));
+  int *LOW = (int *)malloc(graphs_inputs.size() * sizeof(int));
+  for (int i = 0; i < int(graphs_inputs.size()); i++)
+  {
+    DFN[i] = 0;
+    LOW[i] = 0;
+  }
+  for (int temp_count = 0; temp_count < int(predecessors_Subgraphs.size()); temp_count++)
+  {
+    if (DFN[temp_count] == 0)
+    {
+      std::vector<int> stack_subgraphs;
+      int depth = 0;
+      Tarjan(temp_count, depth, strongly_connected_subgraphs, DFN, LOW, stack_subgraphs,
+             successors_Subgraphs);
+    }
+  }
+
+  std::string file_name_scc = "scc.txt";
+  std::ofstream outfile_scc(file_name_scc);
+  outfile_scc << strongly_connected_subgraphs.size() << std::endl;
+  for (const auto &scc : strongly_connected_subgraphs)
+  {
+    std::cout << "scc:";
+    outfile_scc << "scc: ";
+    for (const auto &scc_id : scc)
+    {
+      outfile_scc << scc_id << " ";
+    }
+    outfile_scc << std::endl;
+    for (const auto &scc_id : scc)
+    {
+      std::cout << scc_id << " ";
+      outfile_scc << "subgraph" << scc_id << " input:";
+      for (const auto &scc_input : graphs_inputs[scc_id])
+      {
+        outfile_scc << scc_input.name << ";";
+      }
+      outfile_scc << " output:";
+      for (const auto &scc_output : graphs_outputs[scc_id])
+      {
+        outfile_scc << scc_output.name << ";";
+      }
+      outfile_scc << std::endl;
+    }
+
+    std::cout << std::endl;
+  }
+  outfile_scc.close();
+  free(DFN);
+  free(LOW);
+  int node_num_all = 0;
+  for (const auto &sg : Subgraphs)
+  {
+    node_num_all += sg.node_size();
+  }
+  for (const auto &sg : otherSubgraphs)
+  {
+    node_num_all += sg.node_size();
+  }
+  std::cout << "node num in original graph: " << g.node_size() << std::endl;
+  std::cout << "node_num after cut " << node_num_all << std::endl;
+  ///////////////////////+++
+  int *DFN_ = (int *)malloc(graphs_inputs.size() * sizeof(int));
+  int *LOW_ = (int *)malloc(graphs_inputs.size() * sizeof(int));
+  for (int i = 0; i < int(graphs_inputs.size()); i++)
+  {
+    DFN_[i] = 0;
+    LOW_[i] = 0;
+  }
+  for (int temp_count = 0; temp_count < int(predecessors_Subgraphs.size()); temp_count++)
+  {
+    if (DFN_[temp_count] == 0)
+    {
+      std::vector<int> stack_subgraphs;
+      int depth = 0;
+      Tarjan(temp_count, depth, strongly_connected_subgraphs, DFN_, LOW_, stack_subgraphs,
+             successors_Subgraphs);
+    }
+  }
+  free(DFN_);
+  free(LOW_);
+  eliminate_scc_v2(strongly_connected_subgraphs, Subgraphs, otherSubgraphs, g);
+  /////////////////////
+  strongly_connected_subgraphs.clear();
+  predecessors_Subgraphs.clear();
+  successors_Subgraphs.clear();
+  std::vector<std::unordered_set<NodeTensor>>().swap(subgraphs_2_inputs);
+  std::vector<std::unordered_set<NodeTensor>>().swap(subgraphs_1_inputs);
+  std::vector<std::unordered_set<NodeTensor>>().swap(subgraphs_2_outputs);
+  std::vector<std::unordered_set<NodeTensor>>().swap(subgraphs_1_outputs);
+  std::vector<std::unordered_set<NodeTensor>>().swap(graphs_inputs);
+  std::vector<std::unordered_set<NodeTensor>>().swap(graphs_outputs);
+  for (const auto &sg : Subgraphs)
+  {
+    std::unordered_set<NodeTensor> graphInputs;
+    determineGraphInput(sg, IOvalueNames, graphInputs);
+    subgraphs_1_inputs.push_back(graphInputs);
+  }
+  for (const auto &sg : otherSubgraphs)
+  {
+    std::unordered_set<NodeTensor> graphInputs;
+    determineGraphInput(sg, IOvalueNames, graphInputs);
+    subgraphs_2_inputs.push_back(graphInputs);
+  }
+  node_number = 0;
+  for (const auto &sg : Subgraphs)
+  {
+    std::unordered_set<NodeTensor> graphOutputs;
+    node_number += sg.node_size();
+    determineGraphOutput(g, sg, subgraphs_1_inputs, subgraphs_2_inputs, graphOutputs);
+    subgraphs_1_outputs.push_back(graphOutputs);
+  }
+  for (const auto &sg : otherSubgraphs)
+  {
+    std::unordered_set<NodeTensor> graphOutputs;
+    node_number += sg.node_size();
+    determineGraphOutput(g, sg, subgraphs_1_inputs, subgraphs_2_inputs, graphOutputs);
+    subgraphs_2_outputs.push_back(graphOutputs);
+  }
+  graphs_inputs.insert(graphs_inputs.end(), subgraphs_1_inputs.begin(), subgraphs_1_inputs.end());
+  graphs_inputs.insert(graphs_inputs.end(), subgraphs_2_inputs.begin(), subgraphs_2_inputs.end());
+  graphs_outputs.insert(graphs_outputs.end(), subgraphs_1_outputs.begin(),
+                        subgraphs_1_outputs.end());
+  graphs_outputs.insert(graphs_outputs.end(), subgraphs_2_outputs.begin(),
+                        subgraphs_2_outputs.end());
+  for (int i = 0; i < int(graphs_inputs.size()); i++)
+  {
+    std::vector<int> predecessors;
+    for (const auto &g_input : graphs_inputs[i])
+    {
+      for (int j = 0; j < int(graphs_outputs.size()); j++)
+      {
+        if ((graphs_outputs[j].find(g_input) != graphs_outputs[j].end()))
+        {
+          predecessors.push_back(j);
+        }
+      }
+    }
+    predecessors_Subgraphs.push_back(predecessors);
+  }
+  for (int i = 0; i < int(graphs_inputs.size()); i++)
+  {
+    std::vector<int> temp;
+    for (int j = 0; j < int(graphs_inputs.size()); j++)
+    {
+      if (find(predecessors_Subgraphs[j].begin(), predecessors_Subgraphs[j].end(), i) !=
+          predecessors_Subgraphs[j].end())
+      {
+        temp.push_back(j);
+      }
+    }
+    successors_Subgraphs.push_back(temp);
+  }
+  std::string file_name_predecessor_2 = "predecessor_final_2.txt";
+  std::string file_name_successor_2 = "successor_final_2.txt";
+  std::ofstream outfile_predecessor_2(file_name_predecessor_2);
+  std::ofstream outfile_successor_2(file_name_successor_2);
+  if (!(outfile_predecessor_2.is_open() && outfile_successor_2.is_open()))
+  {
+    std::cerr << "Error opening file." << std::endl;
+    exit(0);
+  }
+  for (int i = 0; i < int(graphs_inputs.size()); i++)
+  {
+    outfile_predecessor_2 << "predecessor of subgraph " << i << ":";
+    for (const auto &predecessor : predecessors_Subgraphs[i])
+    {
+      outfile_predecessor_2 << predecessor << ";";
+    }
+    outfile_predecessor_2 << std::endl;
+    outfile_successor_2 << "successor of subgraph " << i << ":";
+    for (const auto &successor : successors_Subgraphs[i])
+    {
+      outfile_successor_2 << successor << ";";
+    }
+    outfile_successor_2 << std::endl;
+  }
+  outfile_predecessor_2.close();
+  outfile_successor_2.close();
+  print_subgraphs(Subgraphs, (char *)"./subgraphs_final_2.txt", otherSubgraphs,
+                  (char *)"./other_subgraphs_final_2.txt");
+  int *DFN_2 = (int *)malloc(graphs_inputs.size() * sizeof(int));
+  int *LOW_2 = (int *)malloc(graphs_inputs.size() * sizeof(int));
+  for (int i = 0; i < int(graphs_inputs.size()); i++)
+  {
+    DFN_2[i] = 0;
+    LOW_2[i] = 0;
+  }
+  for (int temp_count = 0; temp_count < int(predecessors_Subgraphs.size()); temp_count++)
+  {
+    if (DFN_[temp_count] == 0)
+    {
+      std::vector<int> stack_subgraphs;
+      int depth = 0;
+      Tarjan(temp_count, depth, strongly_connected_subgraphs, DFN_2, LOW_2, stack_subgraphs,
+             successors_Subgraphs);
+    }
+  }
+  std::string file_name_scc2 = "scc2.txt";
+  std::ofstream outfile_scc2(file_name_scc2);
+  for (const auto &scc : strongly_connected_subgraphs)
+  {
+    std::cout << "scc:";
+    outfile_scc2 << "scc: ";
+    for (const auto &scc_id : scc)
+    {
+      outfile_scc2 << scc_id << " ";
+    }
+    outfile_scc2 << std::endl;
+    for (const auto &scc_id : scc)
+    {
+      std::cout << scc_id << " ";
+      outfile_scc2 << "subgraph" << scc_id << " input:";
+      for (const auto &scc_input : graphs_inputs[scc_id])
+      {
+        outfile_scc2 << scc_input.name << ";";
+      }
+      outfile_scc2 << " output:";
+      for (const auto &scc_output : graphs_outputs[scc_id])
+      {
+        outfile_scc2 << scc_output.name << ";";
+      }
+      outfile_scc2 << std::endl;
+    }
+
+    std::cout << std::endl;
+  }
+  outfile_scc.close();
+  free(DFN_2);
+  free(LOW_2);
+  // eliminate_scc_v2(strongly_connected_subgraphs,  Subgraphs, otherSubgraphs, g);
+  int subgraph_size_2 = Subgraphs.size();
+  int other_subgraph_size_2 = otherSubgraphs.size();
+  std::vector<int> eliminated_small_graph_id;
+  std::vector<int> eliminated_small_graph_size;
+  std::vector<int> eliminated_small_graph_size_2;
+  std::vector<int> unmerged_graph_id;
+  for (int i = 0; i < subgraph_size_2 + other_subgraph_size_2; i++)
+  {
+    std::cout << "i:" << i << std::endl;
+    if (i < subgraph_size_2)
+    {
+      if (Subgraphs[i].node_size() < 2)
+      {
+        int merge_id = find_min_size(i, successors_Subgraphs[i], predecessors_Subgraphs[i],
+                                     Subgraphs, otherSubgraphs);
+        if (merge_id < subgraph_size_2 && merge_id >= 0)
+        {
+          mergeGraphs(Subgraphs[merge_id], Subgraphs[i]);
+          eliminated_small_graph_id.push_back(i);
+          eliminated_small_graph_size.push_back(Subgraphs[i].node_size());
+          std::cout << "eliminating small graph " << i << "and merged to " << merge_id << std::endl;
+        }
+        else if (merge_id >= 0)
+        {
+          mergeGraphs(otherSubgraphs[merge_id - subgraph_size_2], Subgraphs[i]);
+          eliminated_small_graph_id.push_back(i);
+          eliminated_small_graph_size.push_back(Subgraphs[i].node_size());
+          std::cout << "eliminating small graph " << i << "and merged to " << merge_id << std::endl;
+        }
+        else
+        {
+          unmerged_graph_id.push_back(i);
+        }
+      }
+    }
+    else
+    {
+      if (otherSubgraphs[i - subgraph_size_2].node_size() < 2)
+      {
+        int merge_id = find_min_size(i, successors_Subgraphs[i], predecessors_Subgraphs[i],
+                                     Subgraphs, otherSubgraphs);
+        if (merge_id < subgraph_size_2 && merge_id >= 0)
+        {
+          mergeGraphs(Subgraphs[merge_id], otherSubgraphs[i - subgraph_size_2]);
+          eliminated_small_graph_id.push_back(i);
+          eliminated_small_graph_size.push_back(otherSubgraphs[i - subgraph_size_2].node_size());
+          std::cout << "eliminating small graph " << i << "and merged to " << merge_id << std::endl;
+        }
+        else if (merge_id >= 0)
+        {
+          mergeGraphs(otherSubgraphs[merge_id - subgraph_size_2],
+                      otherSubgraphs[i - subgraph_size_2]);
+          eliminated_small_graph_id.push_back(i);
+          eliminated_small_graph_size.push_back(otherSubgraphs[i - subgraph_size_2].node_size());
+          std::cout << "eliminating small graph " << i << "and merged to " << merge_id << std::endl;
+        }
+        else
+        {
+          unmerged_graph_id.push_back(i);
+        }
+      }
+    }
+  }
+  std::cout << "succeed in reaching here" << std::endl;
+  for (int i = eliminated_small_graph_id.size() - 1; i >= 0; i--)
+  {
+    if (std::find(unmerged_graph_id.begin(), unmerged_graph_id.end(),
+                  eliminated_small_graph_id[i]) != unmerged_graph_id.end())
+    {
+      continue;
+    }
+    std::cout << eliminated_small_graph_id[i] << " ";
+    int index = eliminated_small_graph_id[i];
+    if (index < subgraph_size_2)
+    {
+      if (Subgraphs[index].node_size() > 1)
+      {
+        std::cout << "eliminate Subgraphs" << index << " ";
+        for (auto node : Subgraphs[index].node())
+        {
+          std::cout << node.name() << " ";
+        }
+      }
+      eliminated_small_graph_size_2.push_back(Subgraphs[index].node_size());
+      Subgraphs.erase(Subgraphs.begin() + index);
+    }
+    else
+    {
+      if (otherSubgraphs[index - subgraph_size_2].node_size() > 1)
+      {
+        std::cout << "eliminate otherSubgraphs" << index - subgraph_size_2 << " ";
+        for (auto node : otherSubgraphs[index - subgraph_size_2].node())
+        {
+          std::cout << node.name() << " ";
+        }
+      }
+      eliminated_small_graph_size_2.push_back(otherSubgraphs[index - subgraph_size_2].node_size());
+      otherSubgraphs.erase(otherSubgraphs.begin() + index - subgraph_size_2);
+    }
+  }
+  std::cout << std::endl;
+  std::cout << "eliminated_small_graph_size_1: ";
+  for (const auto &size : eliminated_small_graph_size)
+  {
+    std::cout << size << " ";
+  }
+  std::cout << std::endl;
+  std::cout << "eliminated_small_graph_size_2: ";
+  for (const auto &size : eliminated_small_graph_size_2)
+  {
+    std::cout << size << " ";
+  }
+  std::cout << std::endl;
+  /////////clear
+  strongly_connected_subgraphs.clear();
+  predecessors_Subgraphs.clear();
+  successors_Subgraphs.clear();
+  std::vector<std::unordered_set<NodeTensor>>().swap(subgraphs_2_inputs);
+  std::vector<std::unordered_set<NodeTensor>>().swap(subgraphs_1_inputs);
+  std::vector<std::unordered_set<NodeTensor>>().swap(subgraphs_2_outputs);
+  std::vector<std::unordered_set<NodeTensor>>().swap(subgraphs_1_outputs);
+  std::vector<std::unordered_set<NodeTensor>>().swap(graphs_inputs);
+  std::vector<std::unordered_set<NodeTensor>>().swap(graphs_outputs);
+  for (const auto &sg : Subgraphs)
+  {
+    std::unordered_set<NodeTensor> graphInputs;
+    determineGraphInput(sg, IOvalueNames, graphInputs);
+    subgraphs_1_inputs.push_back(graphInputs);
+  }
+  for (const auto &sg : otherSubgraphs)
+  {
+    std::unordered_set<NodeTensor> graphInputs;
+    determineGraphInput(sg, IOvalueNames, graphInputs);
+    subgraphs_2_inputs.push_back(graphInputs);
+  }
+  node_number = 0;
+  for (const auto &sg : Subgraphs)
+  {
+    std::unordered_set<NodeTensor> graphOutputs;
+    node_number += sg.node_size();
+    determineGraphOutput(g, sg, subgraphs_1_inputs, subgraphs_2_inputs, graphOutputs);
+    subgraphs_1_outputs.push_back(graphOutputs);
+  }
+  for (const auto &sg : otherSubgraphs)
+  {
+    std::unordered_set<NodeTensor> graphOutputs;
+    node_number += sg.node_size();
+    determineGraphOutput(g, sg, subgraphs_1_inputs, subgraphs_2_inputs, graphOutputs);
+    subgraphs_2_outputs.push_back(graphOutputs);
+  }
+  graphs_inputs.insert(graphs_inputs.end(), subgraphs_1_inputs.begin(), subgraphs_1_inputs.end());
+  graphs_inputs.insert(graphs_inputs.end(), subgraphs_2_inputs.begin(), subgraphs_2_inputs.end());
+  graphs_outputs.insert(graphs_outputs.end(), subgraphs_1_outputs.begin(),
+                        subgraphs_1_outputs.end());
+  graphs_outputs.insert(graphs_outputs.end(), subgraphs_2_outputs.begin(),
+                        subgraphs_2_outputs.end());
+  for (int i = 0; i < int(graphs_inputs.size()); i++)
+  {
+    std::vector<int> predecessors;
+    for (const auto &g_input : graphs_inputs[i])
+    {
+      for (int j = 0; j < int(graphs_outputs.size()); j++)
+      {
+        if ((graphs_outputs[j].find(g_input) != graphs_outputs[j].end()))
+        {
+          predecessors.push_back(j);
+        }
+      }
+    }
+    predecessors_Subgraphs.push_back(predecessors);
+  }
+  for (int i = 0; i < int(graphs_inputs.size()); i++)
+  {
+    std::vector<int> temp;
+    for (int j = 0; j < int(graphs_inputs.size()); j++)
+    {
+      if (find(predecessors_Subgraphs[j].begin(), predecessors_Subgraphs[j].end(), i) !=
+          predecessors_Subgraphs[j].end())
+      {
+        temp.push_back(j);
+      }
+    }
+    successors_Subgraphs.push_back(temp);
+  }
+  std::string file_name_predecessor_3 = "predecessor_final_3.txt";
+  std::string file_name_successor_3 = "successor_final_3.txt";
+  std::ofstream outfile_predecessor_3(file_name_predecessor_3);
+  std::ofstream outfile_successor_3(file_name_successor_3);
+  if (!(outfile_predecessor_3.is_open() && outfile_successor_3.is_open()))
+  {
+    std::cerr << "Error opening file." << std::endl;
+    exit(0);
+  }
+  for (int i = 0; i < int(graphs_inputs.size()); i++)
+  {
+    outfile_predecessor_3 << "predecessor of subgraph " << i << ":";
+    for (const auto &predecessor : predecessors_Subgraphs[i])
+    {
+      outfile_predecessor_3 << predecessor << ";";
+    }
+    outfile_predecessor_3 << std::endl;
+    outfile_successor_3 << "successor of subgraph " << i << ":";
+    for (const auto &successor : successors_Subgraphs[i])
+    {
+      outfile_successor_3 << successor << ";";
+    }
+    outfile_successor_3 << std::endl;
+  }
+  outfile_predecessor_3.close();
+  outfile_successor_3.close();
+  print_subgraphs(Subgraphs, (char *)"./subgraphs_final_3.txt", otherSubgraphs,
+                  (char *)"./other_subgraphs_final_3.txt");
+  node_num_all = 0;
+  for (const auto &sg : Subgraphs)
+  {
+    node_num_all += sg.node_size();
+  }
+  for (const auto &sg : otherSubgraphs)
+  {
+    node_num_all += sg.node_size();
+  }
+  int *DFN_3 = (int *)malloc(graphs_inputs.size() * sizeof(int));
+  int *LOW_3 = (int *)malloc(graphs_inputs.size() * sizeof(int));
+  for (int i = 0; i < int(graphs_inputs.size()); i++)
+  {
+    DFN_3[i] = 0;
+    LOW_3[i] = 0;
+  }
+  for (int temp_count = 0; temp_count < int(predecessors_Subgraphs.size()); temp_count++)
+  {
+    if (DFN_[temp_count] == 0)
+    {
+      std::vector<int> stack_subgraphs;
+      int depth = 0;
+      Tarjan(temp_count, depth, strongly_connected_subgraphs, DFN_3, LOW_3, stack_subgraphs,
+             successors_Subgraphs);
+    }
+  }
+  std::string file_name_scc3 = "scc3.txt";
+  std::ofstream outfile_scc3(file_name_scc3);
+  for (const auto &scc : strongly_connected_subgraphs)
+  {
+    std::cout << "scc:";
+    outfile_scc3 << "scc: ";
+    for (const auto &scc_id : scc)
+    {
+      outfile_scc3 << scc_id << " ";
+    }
+    outfile_scc3 << std::endl;
+    for (const auto &scc_id : scc)
+    {
+      std::cout << scc_id << " ";
+      outfile_scc3 << "subgraph" << scc_id << " input:";
+      for (const auto &scc_input : graphs_inputs[scc_id])
+      {
+        outfile_scc3 << scc_input.name << ";";
+      }
+      outfile_scc3 << " output:";
+      for (const auto &scc_output : graphs_outputs[scc_id])
+      {
+        outfile_scc3 << scc_output.name << ";";
+      }
+      outfile_scc3 << std::endl;
+    }
+
+    std::cout << std::endl;
+  }
+  outfile_scc.close();
+  free(DFN_3);
+  free(LOW_3);
+  std::cout << "node_num after cut " << node_num_all << std::endl;
+  if (node_num_all != g.node_size())
+  {
+    std::cout << "num error!" << std::endl;
+    exit(0);
+  }
+  int count_cut_pair = 0;
+  while (1)
+  {
+    count_cut_pair++;
+    if (count_cut_pair > 15)
+    {
+      std::cout << "cut pair error! So many times!" << std::endl;
+      exit(0);
+      break;
+    }
+    int subgraph_size = Subgraphs.size();
+    std::vector<std::vector<int>> strongly_connected_subgraphs_all;
+    std::vector<int> scc_all;
+    for (int i = 0; i < int(Subgraphs.size()) + int(otherSubgraphs.size()); i++)
+    {
+      scc_all.push_back(i);
+    }
+    strongly_connected_subgraphs_all.push_back(scc_all);
+    if (((count_cut_pair > 1 && count_cut_pair < 5) ||
+         (count_cut_pair > 10 && count_cut_pair < 13)) &&
+        strongly_connected_subgraphs.size() != 0)
+    {
+      std::cout << count_cut_pair << " eliminate scc v2 executed" << std::endl;
+      eliminate_scc_v2(strongly_connected_subgraphs, Subgraphs, otherSubgraphs, g);
+      // eliminate_pair_v2(Subgraphs, otherSubgraphs, graphs_inputs, graphs_outputs,
+      // strongly_connected_subgraphs_all, subgraph_size);
+    }
+    else if (((count_cut_pair == 15)) && strongly_connected_subgraphs.size() != 0)
+    {
+      std::cout << count_cut_pair << " eliminate scc v3 executed" << std::endl;
+      eliminate_scc_v3(strongly_connected_subgraphs, Subgraphs, otherSubgraphs, g);
+    }
+    else
+    {
+      std::cout << count_cut_pair << " eliminate pair v2 executed" << std::endl;
+      eliminate_pair_v2(Subgraphs, otherSubgraphs, graphs_inputs, graphs_outputs,
+                        strongly_connected_subgraphs_all, subgraph_size);
+    }
+    strongly_connected_subgraphs.clear();
+    predecessors_Subgraphs.clear();
+    successors_Subgraphs.clear();
+    std::vector<std::unordered_set<NodeTensor>>().swap(subgraphs_2_inputs);
+    std::vector<std::unordered_set<NodeTensor>>().swap(subgraphs_1_inputs);
+    std::vector<std::unordered_set<NodeTensor>>().swap(subgraphs_2_outputs);
+    std::vector<std::unordered_set<NodeTensor>>().swap(subgraphs_1_outputs);
+    std::vector<std::unordered_set<NodeTensor>>().swap(graphs_inputs);
+    std::vector<std::unordered_set<NodeTensor>>().swap(graphs_outputs);
+    for (const auto &sg : Subgraphs)
+    {
+      std::unordered_set<NodeTensor> graphInputs;
+      determineGraphInput(sg, IOvalueNames, graphInputs);
+      subgraphs_1_inputs.push_back(graphInputs);
+    }
+    for (const auto &sg : otherSubgraphs)
+    {
+      std::unordered_set<NodeTensor> graphInputs;
+      determineGraphInput(sg, IOvalueNames, graphInputs);
+      subgraphs_2_inputs.push_back(graphInputs);
+    }
+    node_number = 0;
+    for (const auto &sg : Subgraphs)
+    {
+      std::unordered_set<NodeTensor> graphOutputs;
+      node_number += sg.node_size();
+      determineGraphOutput(g, sg, subgraphs_1_inputs, subgraphs_2_inputs, graphOutputs);
+      subgraphs_1_outputs.push_back(graphOutputs);
+    }
+    for (const auto &sg : otherSubgraphs)
+    {
+      std::unordered_set<NodeTensor> graphOutputs;
+      node_number += sg.node_size();
+      determineGraphOutput(g, sg, subgraphs_1_inputs, subgraphs_2_inputs, graphOutputs);
+      subgraphs_2_outputs.push_back(graphOutputs);
+    }
+    graphs_inputs.insert(graphs_inputs.end(), subgraphs_1_inputs.begin(), subgraphs_1_inputs.end());
+    graphs_inputs.insert(graphs_inputs.end(), subgraphs_2_inputs.begin(), subgraphs_2_inputs.end());
+    graphs_outputs.insert(graphs_outputs.end(), subgraphs_1_outputs.begin(),
+                          subgraphs_1_outputs.end());
+    graphs_outputs.insert(graphs_outputs.end(), subgraphs_2_outputs.begin(),
+                          subgraphs_2_outputs.end());
+    for (int i = 0; i < int(graphs_inputs.size()); i++)
+    {
+      std::vector<int> predecessors;
+      for (const auto &g_input : graphs_inputs[i])
+      {
+        for (int j = 0; j < int(graphs_outputs.size()); j++)
+        {
+          if ((graphs_outputs[j].find(g_input) != graphs_outputs[j].end()))
+          {
+            predecessors.push_back(j);
+          }
+        }
+      }
+      predecessors_Subgraphs.push_back(predecessors);
+    }
+    for (int i = 0; i < int(graphs_inputs.size()); i++)
+    {
+      std::vector<int> temp;
+      for (int j = 0; j < int(graphs_inputs.size()); j++)
+      {
+        if (find(predecessors_Subgraphs[j].begin(), predecessors_Subgraphs[j].end(), i) !=
+            predecessors_Subgraphs[j].end())
+        {
+          temp.push_back(j);
+        }
+      }
+      successors_Subgraphs.push_back(temp);
+    }
+    node_num_all = 0;
+    for (const auto &sg : Subgraphs)
+    {
+      node_num_all += sg.node_size();
+    }
+    for (const auto &sg : otherSubgraphs)
+    {
+      node_num_all += sg.node_size();
+    }
+    int *DFN_4 = (int *)malloc(graphs_inputs.size() * sizeof(int));
+    int *LOW_4 = (int *)malloc(graphs_inputs.size() * sizeof(int));
+    for (int i = 0; i < int(graphs_inputs.size()); i++)
+    {
+      DFN_4[i] = 0;
+      LOW_4[i] = 0;
+    }
+    for (int temp_count = 0; temp_count < int(predecessors_Subgraphs.size()); temp_count++)
+    {
+      if (DFN_[temp_count] == 0)
+      {
+        std::vector<int> stack_subgraphs;
+        int depth = 0;
+        Tarjan(temp_count, depth, strongly_connected_subgraphs, DFN_4, LOW_4, stack_subgraphs,
+               successors_Subgraphs);
+      }
+    }
+    std::string file_name_scc4 = "scc4.txt";
+    std::ofstream outfile_scc4(file_name_scc4);
+    for (const auto &scc : strongly_connected_subgraphs)
+    {
+      std::cout << "scc4:";
+      for (const auto &scc_id : scc)
+      {
+        std::cout << scc_id << " ";
+        outfile_scc4 << "subgraph" << scc_id << " input:";
+        for (const auto &scc_input : graphs_inputs[scc_id])
+        {
+          outfile_scc4 << scc_input.name << ";";
+        }
+        outfile_scc4 << " output:";
+        for (const auto &scc_output : graphs_outputs[scc_id])
+        {
+          outfile_scc4 << scc_output.name << ";";
+        }
+        outfile_scc4 << std::endl;
+      }
+
+      std::cout << std::endl;
+    }
+    outfile_scc.close();
+    free(DFN_4);
+    free(LOW_4);
+    std::cout << "node num in original graph: " << g.node_size() << std::endl;
+    std::cout << "node_num after cut " << node_num_all << std::endl;
+    if (node_num_all != g.node_size())
+    {
+      std::cout << "num error!, time" << count_cut_pair << std::endl;
+      exit(0);
+    }
+    if (count_cut_pair == 15)
+    {
+      if (strongly_connected_subgraphs.size() == 0)
+      {
+        break;
+      }
+      else
+      {
+        std::cout << "error!" << std::endl;
+        exit(0);
+      }
+    }
+    std::cout << "graph number after " << count_cut_pair
+              << "loops: " << Subgraphs.size() + otherSubgraphs.size() << std::endl;
+  } // end of while
+  std::string file_name_predecessor_4 = "predecessor_final_4.txt";
+  std::string file_name_successor_4 = "successor_final_4.txt";
+  std::ofstream outfile_predecessor_4(file_name_predecessor_4);
+  std::ofstream outfile_successor_4(file_name_successor_4);
+  if (!(outfile_predecessor_4.is_open() && outfile_successor_4.is_open()))
+  {
+    std::cerr << "Error opening file." << std::endl;
+    exit(0);
+  }
+  for (int i = 0; i < int(graphs_inputs.size()); i++)
+  {
+    outfile_predecessor_4 << "predecessor of subgraph " << i << ":";
+    for (const auto &predecessor : predecessors_Subgraphs[i])
+    {
+      outfile_predecessor_4 << predecessor << ";";
+    }
+    outfile_predecessor_4 << std::endl;
+    outfile_successor_4 << "successor of subgraph " << i << ":";
+    for (const auto &successor : successors_Subgraphs[i])
+    {
+      outfile_successor_4 << successor << ";";
+    }
+    outfile_successor_4 << std::endl;
+  }
+  outfile_predecessor_4.close();
+  outfile_successor_4.close();
+  print_subgraphs(Subgraphs, (char *)"./subgraphs_final_4.txt", otherSubgraphs,
+                  (char *)"./other_subgraphs_final_4.txt");
+  ////*
+  int temp_count_subgraph = 0;
+
+  std::ofstream outfile_conv_flag("end_with_conv.txt");
+  for (const auto &graph_outputs : subgraphs_1_outputs)
+  {
+    int find_flag = 0;
+    for (const auto &graph_output : graph_outputs)
+    {
+      for (const auto &node : Subgraphs[temp_count_subgraph].node())
+      {
+        for (const auto &output : node.output())
+        {
+          if (graph_output.name == output && node.op_type() == "Conv")
+          {
+            outfile_conv_flag << temp_count_subgraph << " ";
+            find_flag = 1;
+            break;
+          }
+        }
+        if (find_flag)
+        {
+          break;
+        }
+      }
+      if (find_flag)
+      {
+        break;
+      }
+    }
+    temp_count_subgraph++;
+  }
+  outfile_conv_flag.close();
+  std::cout << "succeeded in reaching sorting" << std::endl;
+  int finished_flag = 0;
+  int sort_count = 0;
+  std::vector<int> order_Subgraphs(graphs_inputs.size());
+  std::vector<int> issort_Subgraphs(graphs_inputs.size());
+  while (!finished_flag)
+  {
+    finished_flag = 1;
+    int changed_sort_flag = 0;
+    if (sort_count == 0)
+    {
+      changed_sort_flag = 1;
+      for (int i = 0; i < int(graphs_inputs.size()); i++)
+      {
+        int find_flag = 0;
+        for (const auto &g_input : graphs_inputs[i])
+        {
+          for (int j = 0; j < int(graphs_outputs.size()); j++)
+          {
+            if (graphs_outputs[j].find(g_input) != graphs_outputs[j].end())
+            {
+              find_flag = 1;
+              break;
+            }
+          }
+          if (find_flag)
+          {
+            break;
+          }
+        }
+        if (!find_flag)
+        {
+          order_Subgraphs[i] = 0;
+          issort_Subgraphs[i] = 1;
+        }
+        else
+        {
+          order_Subgraphs[i] = 1;
+          issort_Subgraphs[i] = 0;
+          finished_flag = 0;
+        }
+      }
+    }
+    else
+    {
+      std::cout << "sort count:" << sort_count << std::endl;
+      for (int i = 0; i < int(graphs_inputs.size()); i++)
+      {
+        int find_flag = 0;
+        if (issort_Subgraphs[i] == 1 && i != int(graphs_inputs.size()) - 1)
+        {
+          continue;
+        }
+        for (const auto &g_input : graphs_inputs[i])
+        {
+          for (int j = 0; j < int(graphs_outputs.size()); j++)
+          {
+            if ((graphs_outputs[j].find(g_input) != graphs_outputs[j].end()))
+            {
+              if ((issort_Subgraphs[j] == 0))
+              {
+                std::cout << "graph " << i << "is after graph " << j << std::endl;
+                find_flag = 1;
+                break;
+              }
+            }
+          }
+          if (find_flag)
+          {
+            break;
+          }
+        }
+        if (!find_flag)
+        {
+          if (!(issort_Subgraphs[i] == 1))
+          {
+            order_Subgraphs[i] = sort_count;
+          }
+        }
+        else
+        {
+          order_Subgraphs[i] = sort_count + 1;
+          issort_Subgraphs[i] = 0;
+          finished_flag = 0;
+        }
+        if (i == int(graphs_inputs.size()) -
+                   1) // add the subgraph to the queue only when cycle is completed to prevent the
+                      // newly added subgraph in this cycle from being the predecessor of the
+                      // subsequent sub-graph.
+        {
+          for (int j = 0; j < int(graphs_inputs.size()); j++)
+          {
+            if (order_Subgraphs[j] == sort_count)
+            {
+              issort_Subgraphs[j] = 1;
+              changed_sort_flag = 1;
+              std::cout << "graph " << j << " is in the " << sort_count << "th sort" << std::endl;
+            }
+          }
+        }
+      }
+      if (changed_sort_flag == 0)
+      {
+        std::cout << "error: endless loop!" << std::endl;
+        std::cout << "sort count:" << sort_count << std::endl;
+        std::cout << "count_cut_pair: " << count_cut_pair << std::endl;
+        for (int i = 0; i < int(graphs_inputs.size()); i++)
+        {
+          std::cout << "order_Subgraphs[" << i << "]:" << order_Subgraphs[i] << " ";
+        }
+        std::cout << std::endl;
+        std::exit(1);
+        break;
+      }
+    }
+    sort_count++;
+  }
+  char *sub1_type, *sub2_type;
+  if (strategy == SPILTE_CPU_STRUCTURE_FIRST)
+  {
+    sub1_type = (char *)"CPU";
+    sub2_type = (char *)"NPU";
+  }
+  else
+  {
+    sub1_type = (char *)"NPU";
+    sub2_type = (char *)"CPU";
+  }
+  std::cout << " order" << std::endl;
+  for (auto element : order_Subgraphs)
+  {
+    std::cout << element << " ";
+  }
+  std::cout << std::endl;
+
+  std::string file_name = "subgraphs_ios.txt";
+  std::ofstream outfile1(file_name);
+  if (!outfile1.is_open())
+  {
+    std::cerr << "Error opening file." << std::endl;
+    exit(0);
+  }
+  int sub1_size = subgraphs_1_inputs.size();
+  for (int i = 0; i < int(graphs_inputs.size()); i++)
+  {
+    outfile1 << (i >= sub1_size ? sub2_type : sub1_type) << "subgraph"
+             << (i >= sub1_size ? (i - sub1_size) : i) << ": order" << order_Subgraphs[i];
+    outfile1 << "--input-name ";
+    std::cout << (i >= sub1_size ? sub2_type : sub1_type) << "subgraph"
+              << (i >= sub1_size ? (i - sub1_size) : i) << ": order" << order_Subgraphs[i]
+              << std::endl;
+    std::cout << "Inputs:";
+    for (auto element : graphs_inputs[i])
+    {
+      std::cout << element.name << "; size:";
+      for (auto Size : element.shape)
+      {
+        std::cout << Size << " ";
+      }
+      outfile1 << element.name << ";";
+    }
+    std::cout << std::endl;
+    std::cout << "Outputs:";
+    outfile1 << "--output-name ";
+    for (auto element : graphs_outputs[i])
+    {
+      std::cout << element.name << "; size:";
+      for (auto Size : element.shape)
+      {
+        std::cout << Size << " ";
+      }
+      outfile1 << element.name << ";";
+    }
+    outfile1 << std::endl;
+    std::cout << std::endl;
+    std::cout << " The predecessors of " << (i >= sub1_size ? sub2_type : sub1_type) << "subgraph"
+              << (i >= sub1_size ? (i - sub1_size) : i) << ": ";
+    for (auto element : predecessors_Subgraphs[i])
+    {
+      std::cout << (element >= sub1_size ? sub2_type : sub1_type) << "subgraph"
+                << (element >= sub1_size ? (element - sub1_size) : element) << "; ";
+    }
+    std::cout << std::endl;
+    std::cout << " The successors of " << (i >= sub1_size ? sub2_type : sub1_type) << "subgraph"
+              << (i >= sub1_size ? (i - sub1_size) : i) << ": ";
+    for (auto element : successors_Subgraphs[i])
+    {
+      std::cout << (element >= sub1_size ? sub2_type : sub1_type) << "subgraph"
+                << (element >= sub1_size ? (element - sub1_size) : element) << "; ";
+    }
+    std::cout << std::endl;
+  }
+  outfile1.close();
+  for (const auto &tensor : IOvalueNames)
+  {
+    std::cout << "Name: " << tensor.name << ", Shape: [";
+    for (size_t i = 0; i < tensor.shape.size(); ++i)
+    {
+      std::cout << tensor.shape[i];
+      if (i < tensor.shape.size() - 1)
+      {
+        std::cout << ", ";
+      }
+    }
+    std::cout << "]" << std::endl;
+  }
+
+  switch (d.getType())
+  {
+    case DeviceType::Target_NPU:
+    {
+      if (strategy == SPILTE_CPU_STRUCTURE_FIRST)
+      {
+        d.GenerateCutInstruction(Subgraphs, "cpu", subgraphs_1_inputs, subgraphs_1_outputs);
+        d.GenerateCutInstruction(otherSubgraphs, "npu", subgraphs_2_inputs, subgraphs_2_outputs);
+      }
+      else if (strategy == SPILTE_NPU_STRUCTURE_FIRST)
+      {
+        d.GenerateCutInstruction(Subgraphs, "npu", subgraphs_1_inputs, subgraphs_1_outputs);
+        d.GenerateCutInstruction(otherSubgraphs, "cpu", subgraphs_2_inputs, subgraphs_2_outputs);
+      }
+      break;
+    }
+    default:
+      std::cout << "Unknown device type" << std::endl;
+      exit(0);
+  }
+  std::cout << "node num in original graph: " << g.node_size() << std::endl;
+  std::cout << "node_num after cut " << node_num_all << std::endl;
+}
diff --git a/tools/onnx-subgraph/src/lib/structures.cpp b/tools/onnx-subgraph/src/lib/structures.cpp
new file mode 100644
index 00000000000..5ddcf81dc7e
--- /dev/null
+++ b/tools/onnx-subgraph/src/lib/structures.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "partition.h"
+#include <algorithm>
+// Print every distinct run of at least three consecutive supported ops found in graph.
+//
+// Nodes are visited in stored order. Consecutive nodes whose op_type is in the support list
+// selected by strategy form a run; each distinct run of three or more ops is printed to stdout
+// as a brace-enclosed list of quoted op names. A run that extends to the last node of the graph
+// is reported as well.
+//
+// graph    - ONNX graph whose node sequence is scanned
+// d        - device providing the CPU/NPU supported-op lists
+// strategy - selects which list is used; any other value leaves the list empty
+//
+// Returns 0 in all cases.
+int DetermineStructure(const onnx::GraphProto &graph, Device &d, PartitionStrategy strategy)
+{
+  // The support list depends only on strategy, so fetch it once rather than once per node.
+  std::vector<std::string> support_op;
+  switch (strategy)
+  {
+    case SPILTE_CPU_STRUCTURE_FIRST:
+      support_op = d.getCPUSupportOp();
+      break;
+    case SPILTE_NPU_STRUCTURE_FIRST:
+      support_op = d.getNPUSupportOp();
+      break;
+    default:
+      break;
+  }
+
+  std::vector<std::vector<std::string>> enabled_structure;
+  std::vector<std::string> structure_temp;
+  // Close the current run: record it if it is long enough and not seen before, then reset it.
+  auto flush_structure = [&enabled_structure, &structure_temp]() {
+    if (structure_temp.size() >= 3 &&
+        std::find(enabled_structure.begin(), enabled_structure.end(), structure_temp) ==
+          enabled_structure.end())
+    {
+      enabled_structure.push_back(structure_temp);
+    }
+    structure_temp.clear();
+  };
+
+  // Grow the current run while ops are supported; the first unsupported op closes it.
+  for (int node_index = 0; node_index < graph.node_size(); node_index++)
+  {
+    const auto &node = graph.node(node_index);
+    if (std::find(support_op.begin(), support_op.end(), node.op_type()) != support_op.end())
+    {
+      structure_temp.push_back(node.op_type());
+    }
+    else
+    {
+      flush_structure();
+    }
+  }
+  // A run that reaches the last node has no unsupported op after it to close it, so close it
+  // here; otherwise it would be silently dropped.
+  flush_structure();
+
+  // Emit each recorded run on its own line.
+  for (const auto &structure : enabled_structure)
+  {
+    std::cout << "{";
+    for (const auto &op : structure)
+    {
+      std::cout << "\"" << op << "\",";
+    }
+    std::cout << "}," << std::endl;
+  }
+  return 0;
+}
diff --git a/tools/onnx-subgraph/src/main.cpp b/tools/onnx-subgraph/src/main.cpp
new file mode 100644
index 00000000000..9e78641e3db
--- /dev/null
+++ b/tools/onnx-subgraph/src/main.cpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <iostream>
+#include <string>
+#include "graph.h"
+#include "partition.h"
+#include "Python.h"
+
+// Entry point. Usage: ./onnx-subgraph --onnx=xxx.onnx
+// Partitions the given ONNX model using the device settings in ./scripts/config.json.
+// Returns 0 on success and -1 when no model is given or the Python interpreter fails to start.
+int main(int argc, char *argv[])
+{
+  if (argc <= 1)
+  {
+    std::cout << "Please set valid args: ./onnx-subgraph --onnx=xxx.onnx" << std::endl;
+    return -1;
+  }
+  const std::string onnx_option = "--onnx=";
+  std::string onnxFile;
+  for (int i = 1; i < argc; ++i)
+  {
+    const std::string arg = argv[i];
+    if (arg.compare(0, onnx_option.size(), onnx_option) == 0)
+    {
+      onnxFile = arg.substr(onnx_option.size()); // a repeated option overrides earlier ones
+      std::cout << "ONNX file: " << onnxFile << std::endl;
+    }
+  }
+  if (onnxFile.empty())
+  {
+    std::cout << "No ONNX file provided." << std::endl;
+    return -1;
+  }
+
+  Graph graph;
+  auto g = graph.GetGraphFromOnnx(onnxFile);
+  std::unordered_map<std::string, NodeIOSize> node_io_size;
+  Partition p;
+  Device target;
+  target.updateOnnxFile(onnxFile);
+  target.GetDeviceJson("./scripts/config.json");
+  p.PartitionGraph(g, target, PartitionStrategy::SPILTE_NPU_STRUCTURE_FIRST, node_io_size);
+
+  Py_Initialize();
+  if (!Py_IsInitialized())
+  {
+    std::cout << "python init fail" << std::endl;
+    return -1; // a failed interpreter start-up must not be reported as success
+  }
+  PyRun_SimpleString("import sys");
+  PyRun_SimpleString("sys.path.append('.')");
+  Py_Finalize();
+  return 0;
+}
diff --git a/tools/onnx-subgraph/subgraphs_ios.txt b/tools/onnx-subgraph/subgraphs_ios.txt
new file mode 100644
index 00000000000..9dfdc95e32e
--- /dev/null
+++ b/tools/onnx-subgraph/subgraphs_ios.txt
@@ -0,0 +1,4 @@
+NPUsubgraph0: order0--input-name x;--output-name /stem/conv3/bn/act/Mul_output_0;
+NPUsubgraph1: order2--input-name /stem/pool/MaxPool_output_0;--output-name /stages/stages.3/stages.3.1/act/Mul_output_0;
+CPUsubgraph0: order1--input-name /stem/conv3/bn/act/Mul_output_0;--output-name /stem/pool/MaxPool_output_0;
+CPUsubgraph1: order3--input-name /stages/stages.3/stages.3.1/act/Mul_output_0;--output-name 316;
diff --git a/tools/onnx-subgraph/test_model_download.sh b/tools/onnx-subgraph/test_model_download.sh
new file mode 100644
index 00000000000..d6597d2dd79
--- /dev/null
+++ b/tools/onnx-subgraph/test_model_download.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+# Download the ResNeXt test model into ./models and simplify it to ./resnet-test.onnx.
+set -e # stop at the first failing step so onnxsim never runs on a missing download
+
+pip install onnx onnxsim
+if [ -d "./models/" ]; then echo "./models path existing"; fi
+mkdir -p ./models/
+
+cd ./models
+# NOTE(review): --no-check-certificate skips TLS verification of the download; drop it if possible.
+wget https://media.githubusercontent.com/media/onnx/models/refs/heads/main/Computer_Vision/resnext26ts_Opset16_timm/resnext26ts_Opset16.onnx --no-check-certificate
+#wget https://media.githubusercontent.com/media/onnx/models/refs/heads/main/Natural_Language_Processing/xmod_Opset16_transformers/xmod_Opset16.onnx --no-check-certificate
+
+onnxsim resnext26ts_Opset16.onnx ../resnet-test.onnx
+#onnxsim xmod_Opset16.onnx ../xmod-transformer-test.onnx
+cd ..