From 13438620f075b83da079c91269fa779561a97731 Mon Sep 17 00:00:00 2001 From: Sanggyu Lee Date: Fri, 14 Nov 2025 16:05:05 +0900 Subject: [PATCH 1/3] [ggma] Add documentation for TinyLlama example MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Created `runtime/ggma/examples/generate_text/tinyllama.md` with step‑by‑step guide. - Includes prerequisites, model generation commands, full processing pipeline, and a summary. ONE-DCO-1.0-Signed-off-by: Sanggyu Lee --- runtime/ggma/examples/generate_text/decode.py | 68 +++++++++++++++ .../ggma/examples/generate_text/prefill.py | 75 +++++++++++++++++ .../examples/generate_text/requirements.txt | 2 + .../ggma/examples/generate_text/tinyllama.md | 84 +++++++++++++++++++ 4 files changed, 229 insertions(+) create mode 100644 runtime/ggma/examples/generate_text/decode.py create mode 100644 runtime/ggma/examples/generate_text/prefill.py create mode 100644 runtime/ggma/examples/generate_text/requirements.txt create mode 100644 runtime/ggma/examples/generate_text/tinyllama.md diff --git a/runtime/ggma/examples/generate_text/decode.py b/runtime/ggma/examples/generate_text/decode.py new file mode 100644 index 00000000000..43fbcb730b2 --- /dev/null +++ b/runtime/ggma/examples/generate_text/decode.py @@ -0,0 +1,68 @@ +# User input +prompt = "Lily picked up a flower." 
+model_name = "Maykeye/TinyLLama-v0" + +# Tokenizer +from transformers import AutoTokenizer + +tokenizer = AutoTokenizer.from_pretrained(model_name) +tokenizer.pad_token = tokenizer.eos_token +tokenizer.padding_side = "right" +inputs = tokenizer( + prompt, + return_tensors="pt", + padding="max_length", + max_length=30, + truncation=True, +) + +# Generator +import torch + +from transformers import AutoModelForCausalLM + +model = AutoModelForCausalLM.from_pretrained(model_name) +model.eval() + +from tico.utils.record_input import RecordingInput + +# past_key_values +# --------------- +# During prefill, "past_key_values" not None, but an empty Cache instance. +# Passing None makes torch.export happy. + +input_to_remove = [ + "attention_mask", + # For left pad, [0, ⋯, 0, 1, ⋯, 1] + # For right right pad, [1, ⋯, 1, 0, ⋯, 0] + # ( 0 is pad-token ) + # This script uses right pad and pass all-1 attention mask (including pad). + # Npu computes all positions whether it is pad or not. +] +condition_fn = lambda args_dict: args_dict["past_key_values"].get_seq_length() != 0 + +with torch.no_grad(), RecordingInput(model, condition_fn, + input_to_remove=input_to_remove) as rec: + outputs = model.generate( + **inputs, + max_new_tokens=32, + do_sample=False, + pad_token_id=tokenizer.eos_token_id, + ) + captured_input = rec.captured_input + +generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True) +print(generated_text) + +# Tico +import tico +from tico.serialize.operators.adapters.onert.llama_attention import ( + llama_attention_forward_adapter, ) +from transformers.models.llama.modeling_llama import LlamaAttention + +#LlamaAttention.forward = llama_attention_forward_adapter + +model = AutoModelForCausalLM.from_pretrained(model_name) +model.eval() +circle_model = tico.convert(model, captured_input) +circle_model.save(f"tinyllama.decode.circle") diff --git a/runtime/ggma/examples/generate_text/prefill.py b/runtime/ggma/examples/generate_text/prefill.py new file mode 
100644 index 00000000000..a4f37e96e5c --- /dev/null +++ b/runtime/ggma/examples/generate_text/prefill.py @@ -0,0 +1,75 @@ +# User input +prompt = "Lily picked up a flower." +model_name = "Maykeye/TinyLLama-v0" + +# Tokenizer +from transformers import AutoTokenizer + +tokenizer = AutoTokenizer.from_pretrained(model_name) +tokenizer.pad_token = tokenizer.eos_token +tokenizer.padding_side = "right" +inputs = tokenizer( + prompt, + return_tensors="pt", + padding="max_length", + max_length=32, + truncation=True, +) + +# Generator +import torch + +from transformers import AutoModelForCausalLM + +model = AutoModelForCausalLM.from_pretrained(model_name) +model.eval() + +from tico.utils.record_input import RecordingInput + +# past_key_values +# --------------- +# During prefill, "past_key_values" not None, but an empty Cache instance. +# Passing None makes torch.export happy. + +input_to_remove = [ + "past_key_values", + # DynamicCache is flatten-able operator since 4.50. + # See _pytree.py > tree_flatten + # SUPPORTED_NODES has *transformers.DynamicCache* + # After flattening, DynamicCache becomes { "key_cache": [] , "value_cache": [ ] } + # dict.value is returne. dict.key is stored in treespec. + # + # On prefill, DynamicCache is empty, and dict is empty after flattening. + # PyTorch removes empty dict! + # If number of args is 4 (including cache), it becomes 3! + # To avoid this error, don't pass empty cache, just pass None. + "attention_mask", + # For left pad, [0, ⋯, 0, 1, ⋯, 1] + # For right right pad, [1, ⋯, 1, 0, ⋯, 0] + # ( 0 is pad-token ) + # This script uses right pad and pass all-1 attention mask (including pad). + # Npu computes all positions whether it is pad or not. + "cache_position" + # It is the list of cache position like [0, 1, ..., 11]. + # For npu, we always store all values (including pad). 
+] + +with torch.no_grad(), RecordingInput(model, input_to_remove=input_to_remove) as rec: + outputs = model.generate( + **inputs, + max_new_tokens=32, + do_sample=False, + pad_token_id=tokenizer.eos_token_id, + ) + captured_input = rec.captured_input + +generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True) +print(generated_text) + +# Tico +import tico + +model = AutoModelForCausalLM.from_pretrained(model_name) +model.eval() +circle_model = tico.convert(model, captured_input) +circle_model.save(f"tinyllama.prefill.circle") diff --git a/runtime/ggma/examples/generate_text/requirements.txt b/runtime/ggma/examples/generate_text/requirements.txt new file mode 100644 index 00000000000..34dd7beb64d --- /dev/null +++ b/runtime/ggma/examples/generate_text/requirements.txt @@ -0,0 +1,2 @@ +transformers==4.50.3 +torch diff --git a/runtime/ggma/examples/generate_text/tinyllama.md b/runtime/ggma/examples/generate_text/tinyllama.md new file mode 100644 index 00000000000..a7a4dfd50cc --- /dev/null +++ b/runtime/ggma/examples/generate_text/tinyllama.md @@ -0,0 +1,84 @@ +# TinyLlama Example Documentation + +This document provides a step‑by‑step guide for generating and processing a text generation model. + +## Summary + +1. Set up the environment and install dependencies. +2. Generate the initial `prefill` and `decode` Circle model files. +3. Run the pipeline to optimize, reshape, and prune the model, producing a final `decode.circle` ready for inference. + +## Prerequisites + +1. **Python virtual environment** + ```bash + cd runtime/ggma/examples/generate_text/ + python3 -m venv _ + source _/bin/activate + ``` + +2. **Install required Python packages** + ```bash + pip install -r requirements.txt + ``` + +3. 
**Install TICO (Torch IR to Circle ONE)** + ```bash + # Clone the repository + git clone https://github.com/Samsung/TICO.git + # Install it in editable mode + pip install -e TICO + ``` + +## Generating Model Files + +Run the provided scripts to create the prefill and decode Circle model files: + +```bash +python prefill.py # Generates tinyllama.prefill.circle +python decode.py # Generates tinyllama.decode.circle +``` + +You can verify the generated files: + +```bash +ls -lh *.circle +# Expected output: +# -rw-rw-r-- 1 gyu gyu 18M Nov 14 14:09 tinyllama.decode.circle +# -rw-rw-r-- 1 gyu gyu 18M Nov 14 14:09 tinyllama.prefill.circle +``` + +## Full Processing Pipeline + +The following pipeline shows how to chain several tools to transform the model: + +```bash +with.py tinyllama.decode.circle | +fuse.attention.py \ +fuse.bmm_lhs_const.py | reshape.fc_weight.py | \ +reshape.io.py input --by_shape [1,16,30,4] [1,16,32,4] | \ +transpose.io.kvcache.py | \ +remove.io.py output --keep_by_id 0 | \ +select.op.py --by_id 0-181 | \ +gc.py | \ +retype.input_ids.py > decode.circle +``` + +### Explanation of each step + +| Tool | Purpose | +|------|---------| +| `with.py` | Reads the Circle model from stdin and writes it to stdout. | +| `fuse.attention.py` | Fuses attention‑related operators for optimization. | +| `fuse.bmm_lhs_const.py` | Fuses constant left‑hand side matrices in batch matrix multiplication. | +| `reshape.fc_weight.py` | Reshapes fully‑connected layer weights. | +| `reshape.io.py input --by_shape [...]` | Reshapes input tensors to the specified shapes. | +| `transpose.io.kvcache.py` | Transposes the KV‑cache tensors. | +| `remove.io.py output --keep_by_id 0` | Keeps only the output tensor with ID 0, removing the rest. | +| `select.op.py --by_id 0-181` | Selects operators with IDs from 0 to 181. | +| `gc.py` | Performs garbage collection, removing unused tensors and operators. | +| `retype.input_ids.py` | Changes the data type of input IDs as needed. 
| +| `> decode.circle` | Saves the final processed model to `decode.circle`. | + + +Feel free to adjust the pipeline arguments (e.g., shapes, IDs) to suit your specific model configuration. From fd78f13787f57fe5c4499c77013eb35fd9d84941 Mon Sep 17 00:00:00 2001 From: Sanggyu Lee Date: Fri, 21 Nov 2025 17:36:25 +0900 Subject: [PATCH 2/3] Update document --- runtime/ggma/examples/generate_text/README.md | 125 ++++++++++++++++++ runtime/ggma/examples/generate_text/decode.py | 2 +- .../ggma/examples/generate_text/prefill.py | 2 +- .../ggma/examples/generate_text/tinyllama.md | 84 ------------ 4 files changed, 127 insertions(+), 86 deletions(-) create mode 100644 runtime/ggma/examples/generate_text/README.md delete mode 100644 runtime/ggma/examples/generate_text/tinyllama.md diff --git a/runtime/ggma/examples/generate_text/README.md b/runtime/ggma/examples/generate_text/README.md new file mode 100644 index 00000000000..f56d77929e9 --- /dev/null +++ b/runtime/ggma/examples/generate_text/README.md @@ -0,0 +1,125 @@ +# TinyLlama Text Generation Example + +This document provides a step‑by‑step guide for generating and processing a TinyLlama text‑generation model. + +## Summary + +1. Set up the environment and install dependencies. +2. Generate the initial `prefill` and `decode` Circle model files. +3. Run the pipeline to optimize, reshape, and prune the model, producing a final `decode.circle` ready for inference. + +## Prerequisites + +### 1. Python virtual environment +```bash +cd runtime/ggma/examples/generate_text/ +python3 -m venv _ +source _/bin/activate +``` + +### 2. Install required Python packages +```bash +pip install -r requirements.txt +``` + +### 3. Install TICO (Torch IR to Circle ONE) +```bash +# Clone the repository +git clone https://github.com/Samsung/TICO.git +# Install it in editable mode +pip install -e TICO +``` + +### 4. 
Get [o2o](https://github.com/Samsung/ONE/pull/16233) in PATH +*Requires the GitHub CLI (`gh`).* +```bash +gh pr checkout 16233 +export PATH=../../../../tools/o2o:$PATH +``` + +## Generating Model Files + +### 1. Create the prefill and decode Circle model files +```bash +python prefill.py # Generates prefill.circle +python decode.py # Generates decode_.circle +``` + +Verify the generated files: +```bash +ls -lh *.circle +# -rw-rw-r-- 1 gyu gyu 18M Nov 14 14:09 decode_.circle +# -rw-rw-r-- 1 gyu gyu 18M Nov 14 14:09 prefill.circle +``` + +### 2. Update `tinyllama.decode.circle` +Fuse attention and normalize KV-cache inputs for the decode model. + +```bash +# Fuse attention and reshape KV-cache for the decode model +fuse.attention.py < decode_.circle \ + | fuse.bmm_lhs_const.py \ + | reshape.io.py input --by_shape [1,16,30,4] [1,16,32,4] \ + | transpose.io.kvcache.py > decode.circle +``` + +### 3. Merge prefill and decode circles +Merge the models, retype input IDs, and clean up. + +```bash +merge.circles.py prefill.circle decode.circle \ + | downcast.input_ids.py \ + | gc.py > model.circle +``` + +Verify final model files: +```bash +ls -l {decode,prefill,model}.circle +# -rw-rw-r-- 1 gyu gyu 18594868 Nov 22 17:26 decode.circle +# -rw-rw-r-- 1 gyu gyu 18642052 Nov 22 07:53 prefill.circle +# -rw-rw-r-- 1 gyu gyu 18629520 Nov 22 17:28 model.circle +``` + +## Create a GGMA package + +1. Create the package root directory and move `model.circle` there: +```bash +cd runtime/ggma/examples/generate_text +mkdir tinyllama +mv model.circle tinyllama/ +``` + +2. 
Copy the tokenizer files (replace `{your_snapshot}` with the actual snapshot hash): +```bash +cp -L ~/.cache/huggingface/hub/models--Maykeye--TinyLLama-v0/snapshots/{your_snapshot}/tokenizer.* tinyllama/ +cp -L ~/.cache/huggingface/hub/models--Maykeye--TinyLLama-v0/snapshots/{your_snapshot}/config.json tinyllama/ +``` + +```bash +tree tinyllama/ +tinyllama/ +├── model.circle +├── tokenizer.json +└── tokenizer.model +``` + +## Build and run `ggma_run` + +```bash +make -j$(nproc) +make install +``` + +Check version: +```bash +Product/out/bin/ggma_run --version +# ggma_run v0.1.0 (nnfw runtime: v1.31.0) +``` + +Run the model: +```bash +Product/out/bin/ggma_run tinyllama +# prompt: Lily picked up a flower. +# generated: { 1100, 7899, 289, 826, 351, 600, 2439, 288, 266, 3653, 31843, 1100, 7899, 289, 1261, 291, 5869, 291, 1261, 31843, 1100, 7899 } +# detokenized: She liked to play with her friends in the park. She liked to run and jump and run. She liked +``` diff --git a/runtime/ggma/examples/generate_text/decode.py b/runtime/ggma/examples/generate_text/decode.py index 43fbcb730b2..c397ae2b163 100644 --- a/runtime/ggma/examples/generate_text/decode.py +++ b/runtime/ggma/examples/generate_text/decode.py @@ -65,4 +65,4 @@ model = AutoModelForCausalLM.from_pretrained(model_name) model.eval() circle_model = tico.convert(model, captured_input) -circle_model.save(f"tinyllama.decode.circle") +circle_model.save(f"decode_.circle") diff --git a/runtime/ggma/examples/generate_text/prefill.py b/runtime/ggma/examples/generate_text/prefill.py index a4f37e96e5c..aa0b8ae46f3 100644 --- a/runtime/ggma/examples/generate_text/prefill.py +++ b/runtime/ggma/examples/generate_text/prefill.py @@ -72,4 +72,4 @@ model = AutoModelForCausalLM.from_pretrained(model_name) model.eval() circle_model = tico.convert(model, captured_input) -circle_model.save(f"tinyllama.prefill.circle") +circle_model.save(f"prefill.circle") diff --git a/runtime/ggma/examples/generate_text/tinyllama.md 
b/runtime/ggma/examples/generate_text/tinyllama.md deleted file mode 100644 index a7a4dfd50cc..00000000000 --- a/runtime/ggma/examples/generate_text/tinyllama.md +++ /dev/null @@ -1,84 +0,0 @@ -# TinyLlama Example Documentation - -This document provides a step‑by‑step guide for generating and processing a text generation model. - -## Summary - -1. Set up the environment and install dependencies. -2. Generate the initial `prefill` and `decode` Circle model files. -3. Run the pipeline to optimize, reshape, and prune the model, producing a final `decode.circle` ready for inference. - -## Prerequisites - -1. **Python virtual environment** - ```bash - cd runtime/ggma/examples/generate_text/ - python3 -m venv _ - source _/bin/activate - ``` - -2. **Install required Python packages** - ```bash - pip install -r requirements.txt - ``` - -3. **Install TICO (Torch IR to Circle ONE)** - ```bash - # Clone the repository - git clone https://github.com/Samsung/TICO.git - # Install it in editable mode - pip install -e TICO - ``` - -## Generating Model Files - -Run the provided scripts to create the prefill and decode Circle model files: - -```bash -python prefill.py # Generates tinyllama.prefill.circle -python decode.py # Generates tinyllama.decode.circle -``` - -You can verify the generated files: - -```bash -ls -lh *.circle -# Expected output: -# -rw-rw-r-- 1 gyu gyu 18M Nov 14 14:09 tinyllama.decode.circle -# -rw-rw-r-- 1 gyu gyu 18M Nov 14 14:09 tinyllama.prefill.circle -``` - -## Full Processing Pipeline - -The following pipeline shows how to chain several tools to transform the model: - -```bash -with.py tinyllama.decode.circle | -fuse.attention.py \ -fuse.bmm_lhs_const.py | reshape.fc_weight.py | \ -reshape.io.py input --by_shape [1,16,30,4] [1,16,32,4] | \ -transpose.io.kvcache.py | \ -remove.io.py output --keep_by_id 0 | \ -select.op.py --by_id 0-181 | \ -gc.py | \ -retype.input_ids.py > decode.circle -``` - -### Explanation of each step - -| Tool | Purpose | 
-|------|---------| -| `with.py` | Reads the Circle model from stdin and writes it to stdout. | -| `fuse.attention.py` | Fuses attention‑related operators for optimization. | -| `fuse.bmm_lhs_const.py` | Fuses constant left‑hand side matrices in batch matrix multiplication. | -| `reshape.fc_weight.py` | Reshapes fully‑connected layer weights. | -| `reshape.io.py input --by_shape [...]` | Reshapes input tensors to the specified shapes. | -| `transpose.io.kvcache.py` | Transposes the KV‑cache tensors. | -| `remove.io.py output --keep_by_id 0` | Keeps only the output tensor with ID 0, removing the rest. | -| `select.op.py --by_id 0-181` | Selects operators with IDs from 0 to 181. | -| `gc.py` | Performs garbage collection, removing unused tensors and operators. | -| `retype.input_ids.py` | Changes the data type of input IDs as needed. | -| `> decode.circle` | Saves the final processed model to `decode.circle`. | - - -Feel free to adjust the pipeline arguments (e.g., shapes, IDs) to suit your specific model configuration. 
From f1d3ef6c9bb5d2921cd717a4bd0bc5fde142e709 Mon Sep 17 00:00:00 2001 From: Sanggyu Lee Date: Sun, 23 Nov 2025 16:11:21 +0900 Subject: [PATCH 3/3] Add USER.md and merge prefill.py and decode.py --- .../ggma/examples/generate_text/DEVELOPER.md | 130 ++++++++++++++++++ runtime/ggma/examples/generate_text/README.md | 125 ----------------- runtime/ggma/examples/generate_text/USER.md | 108 +++++++++++++++ runtime/ggma/examples/generate_text/decode.py | 68 --------- .../ggma/examples/generate_text/prefill.py | 75 ---------- .../generate_text/tinyllama/pipeline.yaml | 9 ++ .../{ => tinyllama}/requirements.txt | 1 - .../generate_text/tinyllama/tinyllama.py | 98 +++++++++++++ 8 files changed, 345 insertions(+), 269 deletions(-) create mode 100644 runtime/ggma/examples/generate_text/DEVELOPER.md delete mode 100644 runtime/ggma/examples/generate_text/README.md create mode 100644 runtime/ggma/examples/generate_text/USER.md delete mode 100644 runtime/ggma/examples/generate_text/decode.py delete mode 100644 runtime/ggma/examples/generate_text/prefill.py create mode 100644 runtime/ggma/examples/generate_text/tinyllama/pipeline.yaml rename runtime/ggma/examples/generate_text/{ => tinyllama}/requirements.txt (77%) create mode 100644 runtime/ggma/examples/generate_text/tinyllama/tinyllama.py diff --git a/runtime/ggma/examples/generate_text/DEVELOPER.md b/runtime/ggma/examples/generate_text/DEVELOPER.md new file mode 100644 index 00000000000..2d9f5c93c33 --- /dev/null +++ b/runtime/ggma/examples/generate_text/DEVELOPER.md @@ -0,0 +1,130 @@ +# TinyLlama Text Generation Developer Guide + +This document provides a detailed technical guide for generating, processing, and optimizing the TinyLlama text-generation model. For basic usage, see [USER.md](USER.md). + +## Summary + +1. Set up the environment and install dependencies. +2. Generate the initial `prefill` and `decode` Circle model files. +3. 
Run the pipeline to optimize, reshape, and prune the model, producing a final `model.circle` ready for inference. + +## Prerequisites + +### 1. Python virtual environment +```bash +$ cd runtime/ggma/examples/generate_text/ +$ python3 -m venv _ +$ source _/bin/activate +``` + +### 2. Prepare [gyu](tools/gyu/README.md) and o2o tools +Install dependencies and set up `o2o` tools (similar to what `tools/gyu/init.py` does). + +> **Note**: We install the CPU version of `torch` first because `gyu` depends on `TICO`, which by default pulls in the large NVIDIA version of `torch`. Installing the CPU version beforehand prevents this. + +```bash +# 1. Install torch (CPU) and gyu requirements +$ pip install torch --index-url https://download.pytorch.org/whl/cpu +$ pip install -r tools/gyu/requirements.txt + +# 2. Fetch o2o tools from PR #16233 +$ git fetch origin pull/16233/head:pr-16233 +$ git checkout pr-16233 -- tools/o2o +$ chmod +x tools/o2o/*.py + +# 3. Add tools to PATH +$ export PATH=$PWD/tools/o2o:$PWD/tools/gyu:$PATH +``` + + + +## Generating Model Files + +### 1. Install model dependencies +```bash +$ pip install -r tinyllama/requirements.txt +``` + +### 2. Create the prefill and decode Circle model files +```bash +$ python tinyllama/tinyllama.py --mode prefill # Generates prefill.circle +$ python tinyllama/tinyllama.py --mode decode # Generates decode_.circle +``` + +Verify the generated files: +```bash +$ ls -lh *.circle +-rw-rw-r-- 1 gyu gyu 18M Nov 14 14:09 decode_.circle +-rw-rw-r-- 1 gyu gyu 18M Nov 14 14:09 prefill.circle +``` + +### 3. Update `decode_.circle` +Fuse attention and normalize KV-cache inputs for the decode model. + +```bash +$ fuse.attention.py < decode_.circle \ + | reshape.io.py input --by_shape [1,16,30,4] [1,16,32,4] \ + | transpose.io.kvcache.py > decode.circle +``` + +### 4. Merge prefill and decode circles +Merge the models, downcast input IDs, and clean up. 
+ +```bash +$ merge.circles.py prefill.circle decode.circle \ + | fuse.bmm_lhs_const.py \ + | downcast.input_ids.py \ + | gc.py > model.circle +``` + +Verify final model files: +```bash +$ ls -l {decode,prefill,model}.circle +-rw-rw-r-- 1 gyu gyu 18594868 Nov 22 17:26 decode.circle +-rw-rw-r-- 1 gyu gyu 18642052 Nov 22 07:53 prefill.circle +-rw-rw-r-- 1 gyu gyu 18629520 Nov 22 17:28 model.circle +``` + +## Create a GGMA package + +1. Create the package root directory and move `model.circle` there: +```bash +$ cd runtime/ggma/examples/generate_text +$ mkdir tinyllama +$ mv model.circle tinyllama/ +``` + +2. Copy the tokenizer files (replace `{your_snapshot}` with the actual snapshot hash): +```bash +$ cp -L ~/.cache/huggingface/hub/models--Maykeye--TinyLLama-v0/snapshots/{your_snapshot}/tokenizer.* tinyllama/ +$ cp -L ~/.cache/huggingface/hub/models--Maykeye--TinyLLama-v0/snapshots/{your_snapshot}/config.json tinyllama/ +``` + +```bash +$ tree tinyllama/ +tinyllama/ +├── model.circle +├── tokenizer.json +└── tokenizer.model +``` + +## Build and run `ggma_run` + +```bash +$ make -j$(nproc) +$ make install +``` + +Check version: +```bash +$ Product/out/bin/ggma_run --version +ggma_run v0.1.0 (nnfw runtime: v1.31.0) +``` + +Run the model: +```bash +$ Product/out/bin/ggma_run tinyllama +prompt: Lily picked up a flower. +generated: { 1100, 7899, 289, 826, 351, 600, 2439, 288, 266, 3653, 31843, 1100, 7899, 289, 1261, 291, 5869, 291, 1261, 31843, 1100, 7899 } +detokenized: She liked to play with her friends in the park. She liked to run and jump and run. She liked +``` diff --git a/runtime/ggma/examples/generate_text/README.md b/runtime/ggma/examples/generate_text/README.md deleted file mode 100644 index f56d77929e9..00000000000 --- a/runtime/ggma/examples/generate_text/README.md +++ /dev/null @@ -1,125 +0,0 @@ -# TinyLlama Text Generation Example - -This document provides a step‑by‑step guide for generating and processing a TinyLlama text‑generation model. 
- -## Summary - -1. Set up the environment and install dependencies. -2. Generate the initial `prefill` and `decode` Circle model files. -3. Run the pipeline to optimize, reshape, and prune the model, producing a final `decode.circle` ready for inference. - -## Prerequisites - -### 1. Python virtual environment -```bash -cd runtime/ggma/examples/generate_text/ -python3 -m venv _ -source _/bin/activate -``` - -### 2. Install required Python packages -```bash -pip install -r requirements.txt -``` - -### 3. Install TICO (Torch IR to Circle ONE) -```bash -# Clone the repository -git clone https://github.com/Samsung/TICO.git -# Install it in editable mode -pip install -e TICO -``` - -### 4. Get [o2o](https://github.com/Samsung/ONE/pull/16233) in PATH -*Requires the GitHub CLI (`gh`).* -```bash -gh pr checkout 16233 -export PATH=../../../../tools/o2o:$PATH -``` - -## Generating Model Files - -### 1. Create the prefill and decode Circle model files -```bash -python prefill.py # Generates prefill.circle -python decode.py # Generates decode_.circle -``` - -Verify the generated files: -```bash -ls -lh *.circle -# -rw-rw-r-- 1 gyu gyu 18M Nov 14 14:09 decode_.circle -# -rw-rw-r-- 1 gyu gyu 18M Nov 14 14:09 prefill.circle -``` - -### 2. Update `tinyllama.decode.circle` -Fuse attention and normalize KV-cache inputs for the decode model. - -```bash -# Fuse attention and reshape KV-cache for the decode model -fuse.attention.py < decode_.circle \ - | fuse.bmm_lhs_const.py \ - | reshape.io.py input --by_shape [1,16,30,4] [1,16,32,4] \ - | transpose.io.kvcache.py > decode.circle -``` - -### 3. Merge prefill and decode circles -Merge the models, retype input IDs, and clean up. 
- -```bash -merge.circles.py prefill.circle decode.circle \ - | downcast.input_ids.py \ - | gc.py > model.circle -``` - -Verify final model files: -```bash -ls -l {decode,prefill,model}.circle -# -rw-rw-r-- 1 gyu gyu 18594868 Nov 22 17:26 decode.circle -# -rw-rw-r-- 1 gyu gyu 18642052 Nov 22 07:53 prefill.circle -# -rw-rw-r-- 1 gyu gyu 18629520 Nov 22 17:28 model.circle -``` - -## Create a GGMA package - -1. Create the package root directory and move `model.circle` there: -```bash -cd runtime/ggma/examples/generate_text -mkdir tinyllama -mv model.circle tinyllama/ -``` - -2. Copy the tokenizer files (replace `{your_snapshot}` with the actual snapshot hash): -```bash -cp -L ~/.cache/huggingface/hub/models--Maykeye--TinyLLama-v0/snapshots/{your_snapshot}/tokenizer.* tinyllama/ -cp -L ~/.cache/huggingface/hub/models--Maykeye--TinyLLama-v0/snapshots/{your_snapshot}/config.json tinyllama/ -``` - -```bash -tree tinyllama/ -tinyllama/ -├── model.circle -├── tokenizer.json -└── tokenizer.model -``` - -## Build and run `ggma_run` - -```bash -make -j$(nproc) -make install -``` - -Check version: -```bash -Product/out/bin/ggma_run --version -# ggma_run v0.1.0 (nnfw runtime: v1.31.0) -``` - -Run the model: -```bash -Product/out/bin/ggma_run tinyllama -# prompt: Lily picked up a flower. -# generated: { 1100, 7899, 289, 826, 351, 600, 2439, 288, 266, 3653, 31843, 1100, 7899, 289, 1261, 291, 5869, 291, 1261, 31843, 1100, 7899 } -# detokenized: She liked to play with her friends in the park. She liked to run and jump and run. She liked -``` diff --git a/runtime/ggma/examples/generate_text/USER.md b/runtime/ggma/examples/generate_text/USER.md new file mode 100644 index 00000000000..cfbb8ae5ab7 --- /dev/null +++ b/runtime/ggma/examples/generate_text/USER.md @@ -0,0 +1,108 @@ +# Text Generation User Guide + +This guide shows how to create a GGMA package for text generation models using the `opm` (one packaging manager) tool. + +We use TinyLlama as an example throughout this guide. 
+ +## Creating a GGMA package + +NOTE: Start from the ONE repository root directory. + +### 1. Initialize environment (one-time setup) + +Add [opm](../../../../tools/opm/README.md) to PATH: +```bash +$ export PATH=$PWD/tools/opm:$PATH +``` + +Then, change directory to tinyllama example directory and run opm init: +```bash +$ cd runtime/ggma/examples/generate_text/tinyllama +$ opm init +``` + +Python environment and o2o tools are prepared: +```bash +$ ls -ld o2o venv +drwxrwxr-x 2 opm opm 4096 Nov 24 09:44 o2o +drwxrwxr-x 6 opm opm 4096 Nov 24 09:42 venv +``` + +> **Note**: The `o2o` directory will be removed once [#13689](https://github.com/Samsung/ONE/pull/13689) is merged. + +### 2. Import model from HuggingFace + +```bash +$ opm import Maykeye/TinyLLama-v0 +``` + +The HuggingFace model is downloaded to `build/tinyllama-v0/`: +``` +$ tree build +build +└── tinyllama-v0 + ├── backup + ├── config.json + ├── demo.py + ├── generation_config.json + ├── model.onnx + ├── model.safetensors + ├── pytorch_model.bin + ├── README.md + ├── special_tokens_map.json + ├── tokenizer_config.json + ├── tokenizer.json + ├── tokenizer.model + ├── train.ipynb + └── valid.py +``` + +### 3. Export to GGMA package + +```bash +$ opm export -s tinyllama.py +``` + +The GGMA package is generated in `build/out/`: +``` +$ tree build/out +build/out/ +├── config.json +├── model.circle +├── tokenizer.json +└── tokenizer.model +``` + +## Building GGMA and Running a GGMA package + +NOTE: Start from the ONE repository root directory. + +### Build + +```bash +$ make -j$(nproc) +$ make install +``` + +For detailed build instructions, see the [ONE Runtime Build Guide](https://github.com/Samsung/ONE/blob/master/docs/runtime/README.md). 
+ +Confirm that `ggma_run` is built and show its version: +```bash +$ Product/out/bin/ggma_run --version +ggma_run v0.1.0 (nnfw runtime: v1.31.0) +``` + +### Run + +Execute the GGMA package (default prompt) to see a sample output: +```bash +$ Product/out/bin/ggma_run build/out +prompt: Lily picked up a flower. +generated: { 1100, 7899, 289, 826, 351, 600, 2439, 288, 266, 3653, 31843, 1100, 7899, 289, 1261, 291, 5869, 291, 1261, 31843, 1100, 7899 } +detokenized: She liked to play with her friends in the park. She liked to run and jump and run. She liked +``` + +For detailed run instructions, see the [ggma_run guide](https://github.com/Samsung/ONE/blob/master/runtime/tests/tools/ggma_run/README.md). + + +For developers who want to understand what happens under the hood, see [DEVELOPER.md](DEVELOPER.md). diff --git a/runtime/ggma/examples/generate_text/decode.py b/runtime/ggma/examples/generate_text/decode.py deleted file mode 100644 index c397ae2b163..00000000000 --- a/runtime/ggma/examples/generate_text/decode.py +++ /dev/null @@ -1,68 +0,0 @@ -# User input -prompt = "Lily picked up a flower." -model_name = "Maykeye/TinyLLama-v0" - -# Tokenizer -from transformers import AutoTokenizer - -tokenizer = AutoTokenizer.from_pretrained(model_name) -tokenizer.pad_token = tokenizer.eos_token -tokenizer.padding_side = "right" -inputs = tokenizer( - prompt, - return_tensors="pt", - padding="max_length", - max_length=30, - truncation=True, -) - -# Generator -import torch - -from transformers import AutoModelForCausalLM - -model = AutoModelForCausalLM.from_pretrained(model_name) -model.eval() - -from tico.utils.record_input import RecordingInput - -# past_key_values -# --------------- -# During prefill, "past_key_values" not None, but an empty Cache instance. -# Passing None makes torch.export happy. 
- -input_to_remove = [ - "attention_mask", - # For left pad, [0, ⋯, 0, 1, ⋯, 1] - # For right right pad, [1, ⋯, 1, 0, ⋯, 0] - # ( 0 is pad-token ) - # This script uses right pad and pass all-1 attention mask (including pad). - # Npu computes all positions whether it is pad or not. -] -condition_fn = lambda args_dict: args_dict["past_key_values"].get_seq_length() != 0 - -with torch.no_grad(), RecordingInput(model, condition_fn, - input_to_remove=input_to_remove) as rec: - outputs = model.generate( - **inputs, - max_new_tokens=32, - do_sample=False, - pad_token_id=tokenizer.eos_token_id, - ) - captured_input = rec.captured_input - -generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True) -print(generated_text) - -# Tico -import tico -from tico.serialize.operators.adapters.onert.llama_attention import ( - llama_attention_forward_adapter, ) -from transformers.models.llama.modeling_llama import LlamaAttention - -#LlamaAttention.forward = llama_attention_forward_adapter - -model = AutoModelForCausalLM.from_pretrained(model_name) -model.eval() -circle_model = tico.convert(model, captured_input) -circle_model.save(f"decode_.circle") diff --git a/runtime/ggma/examples/generate_text/prefill.py b/runtime/ggma/examples/generate_text/prefill.py deleted file mode 100644 index aa0b8ae46f3..00000000000 --- a/runtime/ggma/examples/generate_text/prefill.py +++ /dev/null @@ -1,75 +0,0 @@ -# User input -prompt = "Lily picked up a flower." 
-model_name = "Maykeye/TinyLLama-v0" - -# Tokenizer -from transformers import AutoTokenizer - -tokenizer = AutoTokenizer.from_pretrained(model_name) -tokenizer.pad_token = tokenizer.eos_token -tokenizer.padding_side = "right" -inputs = tokenizer( - prompt, - return_tensors="pt", - padding="max_length", - max_length=32, - truncation=True, -) - -# Generator -import torch - -from transformers import AutoModelForCausalLM - -model = AutoModelForCausalLM.from_pretrained(model_name) -model.eval() - -from tico.utils.record_input import RecordingInput - -# past_key_values -# --------------- -# During prefill, "past_key_values" not None, but an empty Cache instance. -# Passing None makes torch.export happy. - -input_to_remove = [ - "past_key_values", - # DynamicCache is flatten-able operator since 4.50. - # See _pytree.py > tree_flatten - # SUPPORTED_NODES has *transformers.DynamicCache* - # After flattening, DynamicCache becomes { "key_cache": [] , "value_cache": [ ] } - # dict.value is returne. dict.key is stored in treespec. - # - # On prefill, DynamicCache is empty, and dict is empty after flattening. - # PyTorch removes empty dict! - # If number of args is 4 (including cache), it becomes 3! - # To avoid this error, don't pass empty cache, just pass None. - "attention_mask", - # For left pad, [0, ⋯, 0, 1, ⋯, 1] - # For right right pad, [1, ⋯, 1, 0, ⋯, 0] - # ( 0 is pad-token ) - # This script uses right pad and pass all-1 attention mask (including pad). - # Npu computes all positions whether it is pad or not. - "cache_position" - # It is the list of cache position like [0, 1, ..., 11]. - # For npu, we always store all values (including pad). 
-] - -with torch.no_grad(), RecordingInput(model, input_to_remove=input_to_remove) as rec: - outputs = model.generate( - **inputs, - max_new_tokens=32, - do_sample=False, - pad_token_id=tokenizer.eos_token_id, - ) - captured_input = rec.captured_input - -generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True) -print(generated_text) - -# Tico -import tico - -model = AutoModelForCausalLM.from_pretrained(model_name) -model.eval() -circle_model = tico.convert(model, captured_input) -circle_model.save(f"prefill.circle") diff --git a/runtime/ggma/examples/generate_text/tinyllama/pipeline.yaml b/runtime/ggma/examples/generate_text/tinyllama/pipeline.yaml new file mode 100644 index 00000000000..19052d11c3e --- /dev/null +++ b/runtime/ggma/examples/generate_text/tinyllama/pipeline.yaml @@ -0,0 +1,9 @@ +decode: | + reshape.io.py input --by_shape [1,16,30,4] [1,16,32,4] < decode.circle + | transpose.io.kvcache.py > _.circle && mv _.circle decode.circle + +merge: | + merge.circles.py prefill.circle decode.circle + | fuse.bmm_lhs_const.py + | downcast.input_ids.py + | gc.py > model.circle diff --git a/runtime/ggma/examples/generate_text/requirements.txt b/runtime/ggma/examples/generate_text/tinyllama/requirements.txt similarity index 77% rename from runtime/ggma/examples/generate_text/requirements.txt rename to runtime/ggma/examples/generate_text/tinyllama/requirements.txt index 34dd7beb64d..bc13c47a23b 100644 --- a/runtime/ggma/examples/generate_text/requirements.txt +++ b/runtime/ggma/examples/generate_text/tinyllama/requirements.txt @@ -1,2 +1 @@ transformers==4.50.3 -torch diff --git a/runtime/ggma/examples/generate_text/tinyllama/tinyllama.py b/runtime/ggma/examples/generate_text/tinyllama/tinyllama.py new file mode 100644 index 00000000000..7ccb68e041f --- /dev/null +++ b/runtime/ggma/examples/generate_text/tinyllama/tinyllama.py @@ -0,0 +1,98 @@ +import argparse +import torch +from dataclasses import dataclass +from typing import Callable, List, Optional 
# NOTE(review): heavy third-party imports (transformers, tico) are deferred
# into main() so this module can be imported -- and its mode table inspected
# or tested -- without those packages installed.

# Hugging Face model to export and the prompt used to drive generation.
MODEL_ID = "Maykeye/TinyLLama-v0"
PROMPT = "Lily picked up a flower."


@dataclass(frozen=True)
class ModeArg:
    """Per-mode export configuration.

    Attributes:
        max_length: fixed (padded) token length fed to the tokenizer so the
            exported graph has a static input shape.
        input_to_remove: forward-call kwargs dropped before recording.
        condition: optional predicate over the recorded kwargs dict; when
            set, only forward calls for which it returns True are captured.
    """

    max_length: int
    input_to_remove: List[str]
    condition: Optional[Callable]


# prefill: capture the very first forward call (KV cache still empty).
#   past_key_values / attention_mask / cache_position are dropped: the NPU
#   computes every position, pads included, so they are not model inputs.
# decode: capture a forward call made once the KV cache is non-empty
#   (get_seq_length() != 0), i.e. a single-token decode step.
MODE_ARGS = {
    "prefill": ModeArg(
        max_length=32,
        input_to_remove=["past_key_values", "attention_mask", "cache_position"],
        condition=None,
    ),
    "decode": ModeArg(
        max_length=30,
        input_to_remove=["attention_mask"],
        condition=lambda args_dict: args_dict["past_key_values"].get_seq_length() != 0,
    ),
}


def main():
    """Record one forward call of TinyLlama and convert it to a .circle file.

    Runs greedy generation once with ``RecordingInput`` attached, captures
    the kwargs of the forward call selected by the mode's ``condition``,
    then converts a freshly loaded model with those captured inputs and
    saves ``<mode>.circle``.
    """
    from transformers import AutoTokenizer, AutoModelForCausalLM
    from tico.utils.record_input import RecordingInput
    import tico

    parser = argparse.ArgumentParser(
        description="Export TinyLlama model to Circle format.")
    parser.add_argument(
        "--mode",
        # Derive the valid choices from the mode table so the two can
        # never drift apart.
        choices=list(MODE_ARGS),
        required=True,
        help="Export mode: prefill or decode",
    )
    args = parser.parse_args()
    config = MODE_ARGS[args.mode]

    # Tokenizer: right-pad the prompt to a fixed length (static shape).
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    inputs = tokenizer(
        PROMPT,
        return_tensors="pt",
        padding="max_length",
        max_length=config.max_length,
        truncation=True,
    )

    # Generate once while recording the selected forward call's inputs.
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID)
    model.eval()
    with torch.no_grad(), RecordingInput(
            model, config.condition,
            input_to_remove=config.input_to_remove) as rec:
        outputs = model.generate(
            **inputs,
            max_new_tokens=32,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
        )
        captured_input = rec.captured_input

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Generated text: {generated_text}")

    # Re-instantiate the model so conversion starts from a clean state,
    # mirroring the original standalone prefill/decode scripts.
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID)
    model.eval()

    if args.mode == "decode":
        # Decode needs the onert-specific attention forward; patch it onto
        # LlamaAttention before conversion.
        from tico.serialize.operators.adapters.onert.llama_attention import (
            llama_attention_forward_adapter, )
        from transformers.models.llama.modeling_llama import LlamaAttention
        LlamaAttention.forward = llama_attention_forward_adapter

    circle_model = tico.convert(model, captured_input)
    output_file = f"{args.mode}.circle"
    circle_model.save(output_file)
    print(f"Model saved to {output_file}")


if __name__ == "__main__":
    main()