From 13438620f075b83da079c91269fa779561a97731 Mon Sep 17 00:00:00 2001 From: Sanggyu Lee Date: Fri, 14 Nov 2025 16:05:05 +0900 Subject: [PATCH 1/3] [ggma] Add documentation for TinyLlama example MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Created `runtime/ggma/examples/generate_text/tinyllama.md` with step‑by‑step guide. - Includes prerequisites, model generation commands, full processing pipeline, and a summary. ONE-DCO-1.0-Signed-off-by: Sanggyu Lee --- runtime/ggma/examples/generate_text/decode.py | 68 +++++++++++++++ .../ggma/examples/generate_text/prefill.py | 75 +++++++++++++++++ .../examples/generate_text/requirements.txt | 2 + .../ggma/examples/generate_text/tinyllama.md | 84 +++++++++++++++++++ 4 files changed, 229 insertions(+) create mode 100644 runtime/ggma/examples/generate_text/decode.py create mode 100644 runtime/ggma/examples/generate_text/prefill.py create mode 100644 runtime/ggma/examples/generate_text/requirements.txt create mode 100644 runtime/ggma/examples/generate_text/tinyllama.md diff --git a/runtime/ggma/examples/generate_text/decode.py b/runtime/ggma/examples/generate_text/decode.py new file mode 100644 index 00000000000..43fbcb730b2 --- /dev/null +++ b/runtime/ggma/examples/generate_text/decode.py @@ -0,0 +1,68 @@ +# User input +prompt = "Lily picked up a flower." 
+model_name = "Maykeye/TinyLLama-v0" + +# Tokenizer +from transformers import AutoTokenizer + +tokenizer = AutoTokenizer.from_pretrained(model_name) +tokenizer.pad_token = tokenizer.eos_token +tokenizer.padding_side = "right" +inputs = tokenizer( + prompt, + return_tensors="pt", + padding="max_length", + max_length=30, + truncation=True, +) + +# Generator +import torch + +from transformers import AutoModelForCausalLM + +model = AutoModelForCausalLM.from_pretrained(model_name) +model.eval() + +from tico.utils.record_input import RecordingInput + +# past_key_values +# --------------- +# During prefill, "past_key_values" not None, but an empty Cache instance. +# Passing None makes torch.export happy. + +input_to_remove = [ + "attention_mask", + # For left pad, [0, ⋯, 0, 1, ⋯, 1] + # For right right pad, [1, ⋯, 1, 0, ⋯, 0] + # ( 0 is pad-token ) + # This script uses right pad and pass all-1 attention mask (including pad). + # Npu computes all positions whether it is pad or not. +] +condition_fn = lambda args_dict: args_dict["past_key_values"].get_seq_length() != 0 + +with torch.no_grad(), RecordingInput(model, condition_fn, + input_to_remove=input_to_remove) as rec: + outputs = model.generate( + **inputs, + max_new_tokens=32, + do_sample=False, + pad_token_id=tokenizer.eos_token_id, + ) + captured_input = rec.captured_input + +generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True) +print(generated_text) + +# Tico +import tico +from tico.serialize.operators.adapters.onert.llama_attention import ( + llama_attention_forward_adapter, ) +from transformers.models.llama.modeling_llama import LlamaAttention + +#LlamaAttention.forward = llama_attention_forward_adapter + +model = AutoModelForCausalLM.from_pretrained(model_name) +model.eval() +circle_model = tico.convert(model, captured_input) +circle_model.save(f"tinyllama.decode.circle") diff --git a/runtime/ggma/examples/generate_text/prefill.py b/runtime/ggma/examples/generate_text/prefill.py new file mode 
100644 index 00000000000..a4f37e96e5c --- /dev/null +++ b/runtime/ggma/examples/generate_text/prefill.py @@ -0,0 +1,75 @@ +# User input +prompt = "Lily picked up a flower." +model_name = "Maykeye/TinyLLama-v0" + +# Tokenizer +from transformers import AutoTokenizer + +tokenizer = AutoTokenizer.from_pretrained(model_name) +tokenizer.pad_token = tokenizer.eos_token +tokenizer.padding_side = "right" +inputs = tokenizer( + prompt, + return_tensors="pt", + padding="max_length", + max_length=32, + truncation=True, +) + +# Generator +import torch + +from transformers import AutoModelForCausalLM + +model = AutoModelForCausalLM.from_pretrained(model_name) +model.eval() + +from tico.utils.record_input import RecordingInput + +# past_key_values +# --------------- +# During prefill, "past_key_values" not None, but an empty Cache instance. +# Passing None makes torch.export happy. + +input_to_remove = [ + "past_key_values", + # DynamicCache is flatten-able operator since 4.50. + # See _pytree.py > tree_flatten + # SUPPORTED_NODES has *transformers.DynamicCache* + # After flattening, DynamicCache becomes { "key_cache": [] , "value_cache": [ ] } + # dict.value is returne. dict.key is stored in treespec. + # + # On prefill, DynamicCache is empty, and dict is empty after flattening. + # PyTorch removes empty dict! + # If number of args is 4 (including cache), it becomes 3! + # To avoid this error, don't pass empty cache, just pass None. + "attention_mask", + # For left pad, [0, ⋯, 0, 1, ⋯, 1] + # For right right pad, [1, ⋯, 1, 0, ⋯, 0] + # ( 0 is pad-token ) + # This script uses right pad and pass all-1 attention mask (including pad). + # Npu computes all positions whether it is pad or not. + "cache_position" + # It is the list of cache position like [0, 1, ..., 11]. + # For npu, we always store all values (including pad). 
+] + +with torch.no_grad(), RecordingInput(model, input_to_remove=input_to_remove) as rec: + outputs = model.generate( + **inputs, + max_new_tokens=32, + do_sample=False, + pad_token_id=tokenizer.eos_token_id, + ) + captured_input = rec.captured_input + +generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True) +print(generated_text) + +# Tico +import tico + +model = AutoModelForCausalLM.from_pretrained(model_name) +model.eval() +circle_model = tico.convert(model, captured_input) +circle_model.save(f"tinyllama.prefill.circle") diff --git a/runtime/ggma/examples/generate_text/requirements.txt b/runtime/ggma/examples/generate_text/requirements.txt new file mode 100644 index 00000000000..34dd7beb64d --- /dev/null +++ b/runtime/ggma/examples/generate_text/requirements.txt @@ -0,0 +1,2 @@ +transformers==4.50.3 +torch diff --git a/runtime/ggma/examples/generate_text/tinyllama.md b/runtime/ggma/examples/generate_text/tinyllama.md new file mode 100644 index 00000000000..a7a4dfd50cc --- /dev/null +++ b/runtime/ggma/examples/generate_text/tinyllama.md @@ -0,0 +1,84 @@ +# TinyLlama Example Documentation + +This document provides a step‑by‑step guide for generating and processing a text generation model. + +## Summary + +1. Set up the environment and install dependencies. +2. Generate the initial `prefill` and `decode` Circle model files. +3. Run the pipeline to optimize, reshape, and prune the model, producing a final `decode.circle` ready for inference. + +## Prerequisites + +1. **Python virtual environment** + ```bash + cd runtime/ggma/examples/generate_text/ + python3 -m venv _ + source _/bin/activate + ``` + +2. **Install required Python packages** + ```bash + pip install -r requirements.txt + ``` + +3. 
**Install TICO (Torch IR to Circle ONE)** + ```bash + # Clone the repository + git clone https://github.com/Samsung/TICO.git + # Install it in editable mode + pip install -e TICO + ``` + +## Generating Model Files + +Run the provided scripts to create the prefill and decode Circle model files: + +```bash +python prefill.py # Generates tinyllama.prefill.circle +python decode.py # Generates tinyllama.decode.circle +``` + +You can verify the generated files: + +```bash +ls -lh *.circle +# Expected output: +# -rw-rw-r-- 1 gyu gyu 18M Nov 14 14:09 tinyllama.decode.circle +# -rw-rw-r-- 1 gyu gyu 18M Nov 14 14:09 tinyllama.prefill.circle +``` + +## Full Processing Pipeline + +The following pipeline shows how to chain several tools to transform the model: + +```bash +with.py tinyllama.decode.circle | +fuse.attention.py \ +fuse.bmm_lhs_const.py | reshape.fc_weight.py | \ +reshape.io.py input --by_shape [1,16,30,4] [1,16,32,4] | \ +transpose.io.kvcache.py | \ +remove.io.py output --keep_by_id 0 | \ +select.op.py --by_id 0-181 | \ +gc.py | \ +retype.input_ids.py > decode.circle +``` + +### Explanation of each step + +| Tool | Purpose | +|------|---------| +| `with.py` | Reads the Circle model from stdin and writes it to stdout. | +| `fuse.attention.py` | Fuses attention‑related operators for optimization. | +| `fuse.bmm_lhs_const.py` | Fuses constant left‑hand side matrices in batch matrix multiplication. | +| `reshape.fc_weight.py` | Reshapes fully‑connected layer weights. | +| `reshape.io.py input --by_shape [...]` | Reshapes input tensors to the specified shapes. | +| `transpose.io.kvcache.py` | Transposes the KV‑cache tensors. | +| `remove.io.py output --keep_by_id 0` | Keeps only the output tensor with ID 0, removing the rest. | +| `select.op.py --by_id 0-181` | Selects operators with IDs from 0 to 181. | +| `gc.py` | Performs garbage collection, removing unused tensors and operators. | +| `retype.input_ids.py` | Changes the data type of input IDs as needed. 
| +| `> decode.circle` | Saves the final processed model to `decode.circle`. | + + +Feel free to adjust the pipeline arguments (e.g., shapes, IDs) to suit your specific model configuration. From fd78f13787f57fe5c4499c77013eb35fd9d84941 Mon Sep 17 00:00:00 2001 From: Sanggyu Lee Date: Fri, 21 Nov 2025 17:36:25 +0900 Subject: [PATCH 2/3] Update document --- runtime/ggma/examples/generate_text/README.md | 125 ++++++++++++++++++ runtime/ggma/examples/generate_text/decode.py | 2 +- .../ggma/examples/generate_text/prefill.py | 2 +- .../ggma/examples/generate_text/tinyllama.md | 84 ------------ 4 files changed, 127 insertions(+), 86 deletions(-) create mode 100644 runtime/ggma/examples/generate_text/README.md delete mode 100644 runtime/ggma/examples/generate_text/tinyllama.md diff --git a/runtime/ggma/examples/generate_text/README.md b/runtime/ggma/examples/generate_text/README.md new file mode 100644 index 00000000000..f56d77929e9 --- /dev/null +++ b/runtime/ggma/examples/generate_text/README.md @@ -0,0 +1,125 @@ +# TinyLlama Text Generation Example + +This document provides a step‑by‑step guide for generating and processing a TinyLlama text‑generation model. + +## Summary + +1. Set up the environment and install dependencies. +2. Generate the initial `prefill` and `decode` Circle model files. +3. Run the pipeline to optimize, reshape, and prune the model, producing a final `decode.circle` ready for inference. + +## Prerequisites + +### 1. Python virtual environment +```bash +cd runtime/ggma/examples/generate_text/ +python3 -m venv _ +source _/bin/activate +``` + +### 2. Install required Python packages +```bash +pip install -r requirements.txt +``` + +### 3. Install TICO (Torch IR to Circle ONE) +```bash +# Clone the repository +git clone https://github.com/Samsung/TICO.git +# Install it in editable mode +pip install -e TICO +``` + +### 4. 
Get [o2o](https://github.com/Samsung/ONE/pull/16233) in PATH +*Requires the GitHub CLI (`gh`).* +```bash +gh pr checkout 16233 +export PATH=../../../../tools/o2o:$PATH +``` + +## Generating Model Files + +### 1. Create the prefill and decode Circle model files +```bash +python prefill.py # Generates prefill.circle +python decode.py # Generates decode_.circle +``` + +Verify the generated files: +```bash +ls -lh *.circle +# -rw-rw-r-- 1 gyu gyu 18M Nov 14 14:09 decode_.circle +# -rw-rw-r-- 1 gyu gyu 18M Nov 14 14:09 prefill.circle +``` + +### 2. Update `tinyllama.decode.circle` +Fuse attention and normalize KV-cache inputs for the decode model. + +```bash +# Fuse attention and reshape KV-cache for the decode model +fuse.attention.py < decode_.circle \ + | fuse.bmm_lhs_const.py \ + | reshape.io.py input --by_shape [1,16,30,4] [1,16,32,4] \ + | transpose.io.kvcache.py > decode.circle +``` + +### 3. Merge prefill and decode circles +Merge the models, retype input IDs, and clean up. + +```bash +merge.circles.py prefill.circle decode.circle \ + | downcast.input_ids.py \ + | gc.py > model.circle +``` + +Verify final model files: +```bash +ls -l {decode,prefill,model}.circle +# -rw-rw-r-- 1 gyu gyu 18594868 Nov 22 17:26 decode.circle +# -rw-rw-r-- 1 gyu gyu 18642052 Nov 22 07:53 prefill.circle +# -rw-rw-r-- 1 gyu gyu 18629520 Nov 22 17:28 model.circle +``` + +## Create a GGMA package + +1. Create the package root directory and move `model.circle` there: +```bash +cd runtime/ggma/examples/generate_text +mkdir tinyllama +mv model.circle tinyllama/ +``` + +2. 
Copy the tokenizer files (replace `{your_snapshot}` with the actual snapshot hash): +```bash +cp -L ~/.cache/huggingface/hub/models--Maykeye--TinyLLama-v0/snapshots/{your_snapshot}/tokenizer.* tinyllama/ +cp -L ~/.cache/huggingface/hub/models--Maykeye--TinyLLama-v0/snapshots/{your_snapshot}/config.json tinyllama/ +``` + +```bash +tree tinyllama/ +tinyllama/ +├── model.circle +├── tokenizer.json +└── tokenizer.model +``` + +## Build and run `ggma_run` + +```bash +make -j$(nproc) +make install +``` + +Check version: +```bash +Product/out/bin/ggma_run --version +# ggma_run v0.1.0 (nnfw runtime: v1.31.0) +``` + +Run the model: +```bash +Product/out/bin/ggma_run tinyllama +# prompt: Lily picked up a flower. +# generated: { 1100, 7899, 289, 826, 351, 600, 2439, 288, 266, 3653, 31843, 1100, 7899, 289, 1261, 291, 5869, 291, 1261, 31843, 1100, 7899 } +# detokenized: She liked to play with her friends in the park. She liked to run and jump and run. She liked +``` diff --git a/runtime/ggma/examples/generate_text/decode.py b/runtime/ggma/examples/generate_text/decode.py index 43fbcb730b2..c397ae2b163 100644 --- a/runtime/ggma/examples/generate_text/decode.py +++ b/runtime/ggma/examples/generate_text/decode.py @@ -65,4 +65,4 @@ model = AutoModelForCausalLM.from_pretrained(model_name) model.eval() circle_model = tico.convert(model, captured_input) -circle_model.save(f"tinyllama.decode.circle") +circle_model.save(f"decode_.circle") diff --git a/runtime/ggma/examples/generate_text/prefill.py b/runtime/ggma/examples/generate_text/prefill.py index a4f37e96e5c..aa0b8ae46f3 100644 --- a/runtime/ggma/examples/generate_text/prefill.py +++ b/runtime/ggma/examples/generate_text/prefill.py @@ -72,4 +72,4 @@ model = AutoModelForCausalLM.from_pretrained(model_name) model.eval() circle_model = tico.convert(model, captured_input) -circle_model.save(f"tinyllama.prefill.circle") +circle_model.save(f"prefill.circle") diff --git a/runtime/ggma/examples/generate_text/tinyllama.md 
b/runtime/ggma/examples/generate_text/tinyllama.md deleted file mode 100644 index a7a4dfd50cc..00000000000 --- a/runtime/ggma/examples/generate_text/tinyllama.md +++ /dev/null @@ -1,84 +0,0 @@ -# TinyLlama Example Documentation - -This document provides a step‑by‑step guide for generating and processing a text generation model. - -## Summary - -1. Set up the environment and install dependencies. -2. Generate the initial `prefill` and `decode` Circle model files. -3. Run the pipeline to optimize, reshape, and prune the model, producing a final `decode.circle` ready for inference. - -## Prerequisites - -1. **Python virtual environment** - ```bash - cd runtime/ggma/examples/generate_text/ - python3 -m venv _ - source _/bin/activate - ``` - -2. **Install required Python packages** - ```bash - pip install -r requirements.txt - ``` - -3. **Install TICO (Torch IR to Circle ONE)** - ```bash - # Clone the repository - git clone https://github.com/Samsung/TICO.git - # Install it in editable mode - pip install -e TICO - ``` - -## Generating Model Files - -Run the provided scripts to create the prefill and decode Circle model files: - -```bash -python prefill.py # Generates tinyllama.prefill.circle -python decode.py # Generates tinyllama.decode.circle -``` - -You can verify the generated files: - -```bash -ls -lh *.circle -# Expected output: -# -rw-rw-r-- 1 gyu gyu 18M Nov 14 14:09 tinyllama.decode.circle -# -rw-rw-r-- 1 gyu gyu 18M Nov 14 14:09 tinyllama.prefill.circle -``` - -## Full Processing Pipeline - -The following pipeline shows how to chain several tools to transform the model: - -```bash -with.py tinyllama.decode.circle | -fuse.attention.py \ -fuse.bmm_lhs_const.py | reshape.fc_weight.py | \ -reshape.io.py input --by_shape [1,16,30,4] [1,16,32,4] | \ -transpose.io.kvcache.py | \ -remove.io.py output --keep_by_id 0 | \ -select.op.py --by_id 0-181 | \ -gc.py | \ -retype.input_ids.py > decode.circle -``` - -### Explanation of each step - -| Tool | Purpose | 
-|------|---------| -| `with.py` | Reads the Circle model from stdin and writes it to stdout. | -| `fuse.attention.py` | Fuses attention‑related operators for optimization. | -| `fuse.bmm_lhs_const.py` | Fuses constant left‑hand side matrices in batch matrix multiplication. | -| `reshape.fc_weight.py` | Reshapes fully‑connected layer weights. | -| `reshape.io.py input --by_shape [...]` | Reshapes input tensors to the specified shapes. | -| `transpose.io.kvcache.py` | Transposes the KV‑cache tensors. | -| `remove.io.py output --keep_by_id 0` | Keeps only the output tensor with ID 0, removing the rest. | -| `select.op.py --by_id 0-181` | Selects operators with IDs from 0 to 181. | -| `gc.py` | Performs garbage collection, removing unused tensors and operators. | -| `retype.input_ids.py` | Changes the data type of input IDs as needed. | -| `> decode.circle` | Saves the final processed model to `decode.circle`. | - - -Feel free to adjust the pipeline arguments (e.g., shapes, IDs) to suit your specific model configuration. 
From f1d3ef6c9bb5d2921cd717a4bd0bc5fde142e709 Mon Sep 17 00:00:00 2001 From: Sanggyu Lee Date: Sun, 23 Nov 2025 16:11:21 +0900 Subject: [PATCH 3/3] Add USER.md and merge prefill.py and decode.py --- .../ggma/examples/generate_text/DEVELOPER.md | 130 ++++++++++++++++++ runtime/ggma/examples/generate_text/README.md | 125 ----------------- runtime/ggma/examples/generate_text/USER.md | 108 +++++++++++++++ runtime/ggma/examples/generate_text/decode.py | 68 --------- .../ggma/examples/generate_text/prefill.py | 75 ---------- .../generate_text/tinyllama/pipeline.yaml | 9 ++ .../{ => tinyllama}/requirements.txt | 1 - .../generate_text/tinyllama/tinyllama.py | 98 +++++++++++++ 8 files changed, 345 insertions(+), 269 deletions(-) create mode 100644 runtime/ggma/examples/generate_text/DEVELOPER.md delete mode 100644 runtime/ggma/examples/generate_text/README.md create mode 100644 runtime/ggma/examples/generate_text/USER.md delete mode 100644 runtime/ggma/examples/generate_text/decode.py delete mode 100644 runtime/ggma/examples/generate_text/prefill.py create mode 100644 runtime/ggma/examples/generate_text/tinyllama/pipeline.yaml rename runtime/ggma/examples/generate_text/{ => tinyllama}/requirements.txt (77%) create mode 100644 runtime/ggma/examples/generate_text/tinyllama/tinyllama.py diff --git a/runtime/ggma/examples/generate_text/DEVELOPER.md b/runtime/ggma/examples/generate_text/DEVELOPER.md new file mode 100644 index 00000000000..2d9f5c93c33 --- /dev/null +++ b/runtime/ggma/examples/generate_text/DEVELOPER.md @@ -0,0 +1,130 @@ +# TinyLlama Text Generation Developer Guide + +This document provides a detailed technical guide for generating, processing, and optimizing the TinyLlama text-generation model. For basic usage, see [USER.md](USER.md). + +## Summary + +1. Set up the environment and install dependencies. +2. Generate the initial `prefill` and `decode` Circle model files. +3. 
Run the pipeline to optimize, reshape, and prune the model, producing a final `model.circle` ready for inference. + +## Prerequisites + +### 1. Python virtual environment +```bash +$ cd runtime/ggma/examples/generate_text/ +$ python3 -m venv _ +$ source _/bin/activate +``` + +### 2. Prepare [gyu](tools/gyu/README.md) and o2o tools +Install dependencies and set up `o2o` tools (similar to what `tools/gyu/init.py` does). + +> **Note**: We install the CPU version of `torch` first because `gyu` depends on `TICO`, which by default pulls in the large NVIDIA version of `torch`. Installing the CPU version beforehand prevents this. + +```bash +# 1. Install torch (CPU) and gyu requirements +$ pip install torch --index-url https://download.pytorch.org/whl/cpu +$ pip install -r tools/gyu/requirements.txt + +# 2. Fetch o2o tools from PR #16233 +$ git fetch origin pull/16233/head:pr-16233 +$ git checkout pr-16233 -- tools/o2o +$ chmod +x tools/o2o/*.py + +# 3. Add tools to PATH +$ export PATH=$PWD/tools/o2o:$PWD/tools/gyu:$PATH +``` + + + +## Generating Model Files + +### 1. Install model dependencies +```bash +$ pip install -r tinyllama/requirements.txt +``` + +### 2. Create the prefill and decode Circle model files +```bash +$ python tinyllama/tinyllama.py --mode prefill # Generates prefill.circle +$ python tinyllama/tinyllama.py --mode decode # Generates decode_.circle +``` + +Verify the generated files: +```bash +$ ls -lh *.circle +-rw-rw-r-- 1 gyu gyu 18M Nov 14 14:09 decode_.circle +-rw-rw-r-- 1 gyu gyu 18M Nov 14 14:09 prefill.circle +``` + +### 3. Update `decode_.circle` +Fuse attention and normalize KV-cache inputs for the decode model. + +```bash +$ fuse.attention.py < decode_.circle \ + | reshape.io.py input --by_shape [1,16,30,4] [1,16,32,4] \ + | transpose.io.kvcache.py > decode.circle +``` + +### 4. Merge prefill and decode circles +Merge the models, downcast input IDs, and clean up. 
+ +```bash +$ merge.circles.py prefill.circle decode.circle \ + | fuse.bmm_lhs_const.py \ + | downcast.input_ids.py \ + | gc.py > model.circle +``` + +Verify final model files: +```bash +$ ls -l {decode,prefill,model}.circle +-rw-rw-r-- 1 gyu gyu 18594868 Nov 22 17:26 decode.circle +-rw-rw-r-- 1 gyu gyu 18642052 Nov 22 07:53 prefill.circle +-rw-rw-r-- 1 gyu gyu 18629520 Nov 22 17:28 model.circle +``` + +## Create a GGMA package + +1. Create the package root directory and move `model.circle` there: +```bash +$ cd runtime/ggma/examples/generate_text +$ mkdir tinyllama +$ mv model.circle tinyllama/ +``` + +2. Copy the tokenizer files (replace `{your_snapshot}` with the actual snapshot hash): +```bash +$ cp -L ~/.cache/huggingface/hub/models--Maykeye--TinyLLama-v0/snapshots/{your_snapshot}/tokenizer.* tinyllama/ +$ cp -L ~/.cache/huggingface/hub/models--Maykeye--TinyLLama-v0/snapshots/{your_snapshot}/config.json tinyllama/ +``` + +```bash +$ tree tinyllama/ +tinyllama/ +├── model.circle +├── tokenizer.json +└── tokenizer.model +``` + +## Build and run `ggma_run` + +```bash +$ make -j$(nproc) +$ make install +``` + +Check version: +```bash +$ Product/out/bin/ggma_run --version +ggma_run v0.1.0 (nnfw runtime: v1.31.0) +``` + +Run the model: +```bash +$ Product/out/bin/ggma_run tinyllama +prompt: Lily picked up a flower. +generated: { 1100, 7899, 289, 826, 351, 600, 2439, 288, 266, 3653, 31843, 1100, 7899, 289, 1261, 291, 5869, 291, 1261, 31843, 1100, 7899 } +detokenized: She liked to play with her friends in the park. She liked to run and jump and run. She liked +``` diff --git a/runtime/ggma/examples/generate_text/README.md b/runtime/ggma/examples/generate_text/README.md deleted file mode 100644 index f56d77929e9..00000000000 --- a/runtime/ggma/examples/generate_text/README.md +++ /dev/null @@ -1,125 +0,0 @@ -# TinyLlama Text Generation Example - -This document provides a step‑by‑step guide for generating and processing a TinyLlama text‑generation model. 
- -## Summary - -1. Set up the environment and install dependencies. -2. Generate the initial `prefill` and `decode` Circle model files. -3. Run the pipeline to optimize, reshape, and prune the model, producing a final `decode.circle` ready for inference. - -## Prerequisites - -### 1. Python virtual environment -```bash -cd runtime/ggma/examples/generate_text/ -python3 -m venv _ -source _/bin/activate -``` - -### 2. Install required Python packages -```bash -pip install -r requirements.txt -``` - -### 3. Install TICO (Torch IR to Circle ONE) -```bash -# Clone the repository -git clone https://github.com/Samsung/TICO.git -# Install it in editable mode -pip install -e TICO -``` - -### 4. Get [o2o](https://github.com/Samsung/ONE/pull/16233) in PATH -*Requires the GitHub CLI (`gh`).* -```bash -gh pr checkout 16233 -export PATH=../../../../tools/o2o:$PATH -``` - -## Generating Model Files - -### 1. Create the prefill and decode Circle model files -```bash -python prefill.py # Generates prefill.circle -python decode.py # Generates decode_.circle -``` - -Verify the generated files: -```bash -ls -lh *.circle -# -rw-rw-r-- 1 gyu gyu 18M Nov 14 14:09 decode_.circle -# -rw-rw-r-- 1 gyu gyu 18M Nov 14 14:09 prefill.circle -``` - -### 2. Update `tinyllama.decode.circle` -Fuse attention and normalize KV-cache inputs for the decode model. - -```bash -# Fuse attention and reshape KV-cache for the decode model -fuse.attention.py < decode_.circle \ - | fuse.bmm_lhs_const.py \ - | reshape.io.py input --by_shape [1,16,30,4] [1,16,32,4] \ - | transpose.io.kvcache.py > decode.circle -``` - -### 3. Merge prefill and decode circles -Merge the models, retype input IDs, and clean up. 
- -```bash -merge.circles.py prefill.circle decode.circle \ - | downcast.input_ids.py \ - | gc.py > model.circle -``` - -Verify final model files: -```bash -ls -l {decode,prefill,model}.circle -# -rw-rw-r-- 1 gyu gyu 18594868 Nov 22 17:26 decode.circle -# -rw-rw-r-- 1 gyu gyu 18642052 Nov 22 07:53 prefill.circle -# -rw-rw-r-- 1 gyu gyu 18629520 Nov 22 17:28 model.circle -``` - -## Create a GGMA package - -1. Create the package root directory and move `model.circle` there: -```bash -cd runtime/ggma/examples/generate_text -mkdir tinyllama -mv model.circle tinyllama/ -``` - -2. Copy the tokenizer files (replace `{your_snapshot}` with the actual snapshot hash): -```bash -cp -L ~/.cache/huggingface/hub/models--Maykeye--TinyLLama-v0/snapshots/{your_snapshot}/tokenizer.* tinyllama/ -cp -L ~/.cache/huggingface/hub/models--Maykeye--TinyLLama-v0/snapshots/{your_snapshot}/config.json tinyllama/ -``` - -```bash -tree tinyllama/ -tinyllama/ -├── model.circle -├── tokenizer.json -└── tokenizer.model -``` - -## Build and run `ggma_run` - -```bash -make -j$(nproc) -make install -``` - -Check version: -```bash -Product/out/bin/ggma_run --version -# ggma_run v0.1.0 (nnfw runtime: v1.31.0) -``` - -Run the model: -```bash -Product/out/bin/ggma_run tinyllama -# prompt: Lily picked up a flower. -# generated: { 1100, 7899, 289, 826, 351, 600, 2439, 288, 266, 3653, 31843, 1100, 7899, 289, 1261, 291, 5869, 291, 1261, 31843, 1100, 7899 } -# detokenized: She liked to play with her friends in the park. She liked to run and jump and run. She liked -``` diff --git a/runtime/ggma/examples/generate_text/USER.md b/runtime/ggma/examples/generate_text/USER.md new file mode 100644 index 00000000000..cfbb8ae5ab7 --- /dev/null +++ b/runtime/ggma/examples/generate_text/USER.md @@ -0,0 +1,108 @@ +# Text Generation User Guide + +This guide shows how to create a GGMA package for text generation models using the `opm` (one packaging manager) tool. + +We use TinyLlama as an example throughout this guide. 
+ +## Creating a GGMA package + +NOTE: Start from the ONE repository root directory. + +### 1. Initialize environment (one-time setup) + +Add [opm](../../../../tools/opm/README.md) to PATH: +```bash +$ export PATH=$PWD/tools/opm:$PATH +``` + +Then, change directory to tinyllama example directory and run opm init: +```bash +$ cd runtime/ggma/examples/generate_text/tinyllama +$ opm init +``` + +Python environment and o2o tools are prepared: +```bash +$ ls -ld o2o venv +drwxrwxr-x 2 opm opm 4096 Nov 24 09:44 o2o +drwxrwxr-x 6 opm opm 4096 Nov 24 09:42 venv +``` + +> **Note**: The `o2o` directory will be removed once [#13689](https://github.com/Samsung/ONE/pull/13689) is merged. + +### 2. Import model from HuggingFace + +```bash +$ opm import Maykeye/TinyLLama-v0 +``` + +The HuggingFace model is downloaded to `build/tinyllama-v0/`: +``` +$ tree build +build +└── tinyllama-v0 + ├── backup + ├── config.json + ├── demo.py + ├── generation_config.json + ├── model.onnx + ├── model.safetensors + ├── pytorch_model.bin + ├── README.md + ├── special_tokens_map.json + ├── tokenizer_config.json + ├── tokenizer.json + ├── tokenizer.model + ├── train.ipynb + └── valid.py +``` + +### 3. Export to GGMA package + +```bash +$ opm export -s tinyllama.py +``` + +The GGMA package is generated in `build/out/`: +``` +$ tree build/out +build/out/ +├── config.json +├── model.circle +├── tokenizer.json +└── tokenizer.model +``` + +## Building GGMA and Running a GGMA package + +NOTE: Start from the ONE repository root directory. + +### Build + +```bash +$ make -j$(nproc) +$ make install +``` + +For detailed build instructions, see the [ONE Runtime Build Guide](https://github.com/Samsung/ONE/blob/master/docs/runtime/README.md). 
+ +Confirm that `ggma_run` is built and show its version: +```bash +$ Product/out/bin/ggma_run --version +ggma_run v0.1.0 (nnfw runtime: v1.31.0) +``` + +### Run + +Execute the GGMA package (default prompt) to see a sample output: +```bash +$ Product/out/bin/ggma_run build/out +prompt: Lily picked up a flower. +generated: { 1100, 7899, 289, 826, 351, 600, 2439, 288, 266, 3653, 31843, 1100, 7899, 289, 1261, 291, 5869, 291, 1261, 31843, 1100, 7899 } +detokenized: She liked to play with her friends in the park. She liked to run and jump and run. She liked +``` + +For detailed run instructions, see the [ggma_run guide](https://github.com/Samsung/ONE/blob/master/runtime/tests/tools/ggma_run/README.md). + + +For developers who want to understand what happens under the hood, see [DEVELOPER.md](DEVELOPER.md). diff --git a/runtime/ggma/examples/generate_text/decode.py b/runtime/ggma/examples/generate_text/decode.py deleted file mode 100644 index c397ae2b163..00000000000 --- a/runtime/ggma/examples/generate_text/decode.py +++ /dev/null @@ -1,68 +0,0 @@ -# User input -prompt = "Lily picked up a flower." -model_name = "Maykeye/TinyLLama-v0" - -# Tokenizer -from transformers import AutoTokenizer - -tokenizer = AutoTokenizer.from_pretrained(model_name) -tokenizer.pad_token = tokenizer.eos_token -tokenizer.padding_side = "right" -inputs = tokenizer( - prompt, - return_tensors="pt", - padding="max_length", - max_length=30, - truncation=True, -) - -# Generator -import torch - -from transformers import AutoModelForCausalLM - -model = AutoModelForCausalLM.from_pretrained(model_name) -model.eval() - -from tico.utils.record_input import RecordingInput - -# past_key_values -# --------------- -# During prefill, "past_key_values" not None, but an empty Cache instance. -# Passing None makes torch.export happy. 
- -input_to_remove = [ - "attention_mask", - # For left pad, [0, ⋯, 0, 1, ⋯, 1] - # For right right pad, [1, ⋯, 1, 0, ⋯, 0] - # ( 0 is pad-token ) - # This script uses right pad and pass all-1 attention mask (including pad). - # Npu computes all positions whether it is pad or not. -] -condition_fn = lambda args_dict: args_dict["past_key_values"].get_seq_length() != 0 - -with torch.no_grad(), RecordingInput(model, condition_fn, - input_to_remove=input_to_remove) as rec: - outputs = model.generate( - **inputs, - max_new_tokens=32, - do_sample=False, - pad_token_id=tokenizer.eos_token_id, - ) - captured_input = rec.captured_input - -generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True) -print(generated_text) - -# Tico -import tico -from tico.serialize.operators.adapters.onert.llama_attention import ( - llama_attention_forward_adapter, ) -from transformers.models.llama.modeling_llama import LlamaAttention - -#LlamaAttention.forward = llama_attention_forward_adapter - -model = AutoModelForCausalLM.from_pretrained(model_name) -model.eval() -circle_model = tico.convert(model, captured_input) -circle_model.save(f"decode_.circle") diff --git a/runtime/ggma/examples/generate_text/prefill.py b/runtime/ggma/examples/generate_text/prefill.py deleted file mode 100644 index aa0b8ae46f3..00000000000 --- a/runtime/ggma/examples/generate_text/prefill.py +++ /dev/null @@ -1,75 +0,0 @@ -# User input -prompt = "Lily picked up a flower." 
-model_name = "Maykeye/TinyLLama-v0" - -# Tokenizer -from transformers import AutoTokenizer - -tokenizer = AutoTokenizer.from_pretrained(model_name) -tokenizer.pad_token = tokenizer.eos_token -tokenizer.padding_side = "right" -inputs = tokenizer( - prompt, - return_tensors="pt", - padding="max_length", - max_length=32, - truncation=True, -) - -# Generator -import torch - -from transformers import AutoModelForCausalLM - -model = AutoModelForCausalLM.from_pretrained(model_name) -model.eval() - -from tico.utils.record_input import RecordingInput - -# past_key_values -# --------------- -# During prefill, "past_key_values" not None, but an empty Cache instance. -# Passing None makes torch.export happy. - -input_to_remove = [ - "past_key_values", - # DynamicCache is flatten-able operator since 4.50. - # See _pytree.py > tree_flatten - # SUPPORTED_NODES has *transformers.DynamicCache* - # After flattening, DynamicCache becomes { "key_cache": [] , "value_cache": [ ] } - # dict.value is returne. dict.key is stored in treespec. - # - # On prefill, DynamicCache is empty, and dict is empty after flattening. - # PyTorch removes empty dict! - # If number of args is 4 (including cache), it becomes 3! - # To avoid this error, don't pass empty cache, just pass None. - "attention_mask", - # For left pad, [0, ⋯, 0, 1, ⋯, 1] - # For right right pad, [1, ⋯, 1, 0, ⋯, 0] - # ( 0 is pad-token ) - # This script uses right pad and pass all-1 attention mask (including pad). - # Npu computes all positions whether it is pad or not. - "cache_position" - # It is the list of cache position like [0, 1, ..., 11]. - # For npu, we always store all values (including pad). 
-] - -with torch.no_grad(), RecordingInput(model, input_to_remove=input_to_remove) as rec: - outputs = model.generate( - **inputs, - max_new_tokens=32, - do_sample=False, - pad_token_id=tokenizer.eos_token_id, - ) - captured_input = rec.captured_input - -generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True) -print(generated_text) - -# Tico -import tico - -model = AutoModelForCausalLM.from_pretrained(model_name) -model.eval() -circle_model = tico.convert(model, captured_input) -circle_model.save(f"prefill.circle") diff --git a/runtime/ggma/examples/generate_text/tinyllama/pipeline.yaml b/runtime/ggma/examples/generate_text/tinyllama/pipeline.yaml new file mode 100644 index 00000000000..19052d11c3e --- /dev/null +++ b/runtime/ggma/examples/generate_text/tinyllama/pipeline.yaml @@ -0,0 +1,9 @@ +decode: | + reshape.io.py input --by_shape [1,16,30,4] [1,16,32,4] < decode.circle + | transpose.io.kvcache.py > _.circle && mv _.circle decode.circle + +merge: | + merge.circles.py prefill.circle decode.circle + | fuse.bmm_lhs_const.py + | downcast.input_ids.py + | gc.py > model.circle diff --git a/runtime/ggma/examples/generate_text/requirements.txt b/runtime/ggma/examples/generate_text/tinyllama/requirements.txt similarity index 77% rename from runtime/ggma/examples/generate_text/requirements.txt rename to runtime/ggma/examples/generate_text/tinyllama/requirements.txt index 34dd7beb64d..bc13c47a23b 100644 --- a/runtime/ggma/examples/generate_text/requirements.txt +++ b/runtime/ggma/examples/generate_text/tinyllama/requirements.txt @@ -1,2 +1 @@ transformers==4.50.3 -torch diff --git a/runtime/ggma/examples/generate_text/tinyllama/tinyllama.py b/runtime/ggma/examples/generate_text/tinyllama/tinyllama.py new file mode 100644 index 00000000000..7ccb68e041f --- /dev/null +++ b/runtime/ggma/examples/generate_text/tinyllama/tinyllama.py @@ -0,0 +1,98 @@ +import argparse +import torch +from dataclasses import dataclass +from typing import Callable, List, Optional 
# NOTE(review): heavy third-party imports (transformers, tico) are deferred
# into main() so this module can be imported -- and its mode table inspected
# or tested -- without those packages installed.

# Hugging Face model to export and the prompt used to drive generation.
MODEL_ID = "Maykeye/TinyLLama-v0"
PROMPT = "Lily picked up a flower."


@dataclass(frozen=True)
class ModeArg:
    """Per-mode export configuration.

    Attributes:
        max_length: fixed (padded) token length fed to the tokenizer so the
            exported graph has a static input shape.
        input_to_remove: forward-call kwargs dropped before recording.
        condition: optional predicate over the recorded kwargs dict; when
            set, only forward calls for which it returns True are captured.
    """

    max_length: int
    input_to_remove: List[str]
    condition: Optional[Callable]


# prefill: capture the very first forward call (KV cache still empty).
#   past_key_values / attention_mask / cache_position are dropped: the NPU
#   computes every position, pads included, so they are not model inputs.
# decode: capture a forward call made once the KV cache is non-empty
#   (get_seq_length() != 0), i.e. a single-token decode step.
MODE_ARGS = {
    "prefill": ModeArg(
        max_length=32,
        input_to_remove=["past_key_values", "attention_mask", "cache_position"],
        condition=None,
    ),
    "decode": ModeArg(
        max_length=30,
        input_to_remove=["attention_mask"],
        condition=lambda args_dict: args_dict["past_key_values"].get_seq_length() != 0,
    ),
}


def main():
    """Record one forward call of TinyLlama and convert it to a .circle file.

    Runs greedy generation once with ``RecordingInput`` attached, captures
    the kwargs of the forward call selected by the mode's ``condition``,
    then converts a freshly loaded model with those captured inputs and
    saves ``<mode>.circle``.
    """
    from transformers import AutoTokenizer, AutoModelForCausalLM
    from tico.utils.record_input import RecordingInput
    import tico

    parser = argparse.ArgumentParser(
        description="Export TinyLlama model to Circle format.")
    parser.add_argument(
        "--mode",
        # Derive the valid choices from the mode table so the two can
        # never drift apart.
        choices=list(MODE_ARGS),
        required=True,
        help="Export mode: prefill or decode",
    )
    args = parser.parse_args()
    config = MODE_ARGS[args.mode]

    # Tokenizer: right-pad the prompt to a fixed length (static shape).
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    inputs = tokenizer(
        PROMPT,
        return_tensors="pt",
        padding="max_length",
        max_length=config.max_length,
        truncation=True,
    )

    # Generate once while recording the selected forward call's inputs.
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID)
    model.eval()
    with torch.no_grad(), RecordingInput(
            model, config.condition,
            input_to_remove=config.input_to_remove) as rec:
        outputs = model.generate(
            **inputs,
            max_new_tokens=32,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
        )
        captured_input = rec.captured_input

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Generated text: {generated_text}")

    # Re-instantiate the model so conversion starts from a clean state,
    # mirroring the original standalone prefill/decode scripts.
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID)
    model.eval()

    if args.mode == "decode":
        # Decode needs the onert-specific attention forward; patch it onto
        # LlamaAttention before conversion.
        from tico.serialize.operators.adapters.onert.llama_attention import (
            llama_attention_forward_adapter, )
        from transformers.models.llama.modeling_llama import LlamaAttention
        LlamaAttention.forward = llama_attention_forward_adapter

    circle_model = tico.convert(model, captured_input)
    output_file = f"{args.mode}.circle"
    circle_model.save(output_file)
    print(f"Model saved to {output_file}")


if __name__ == "__main__":
    main()