34 changes: 30 additions & 4 deletions kedro-agentic-workflows/README.md
@@ -199,17 +199,17 @@ For more details see `conf/base/catalog_genai_config.yml` and [docs for `Langfus

## 🧪 Evaluation

-The project includes an **intent detection evaluation pipeline** that runs the intent classification agent against a labeled dataset and scores results using two evaluators. It integrates with [Langfuse](https://langfuse.com/) so results, traces, and scores are visible in the Langfuse UI.
+The project includes an **intent detection evaluation pipeline** that runs the intent classification agent against a labeled dataset and scores results using two evaluators. It supports two observability backends — [Langfuse](https://langfuse.com/) and [Opik](https://www.comet.com/opik) — so results, traces, and scores are visible in either platform's UI.

### How it works

The pipeline:
-1. Loads the **evaluation dataset** (labeled question/intent pairs) from a local JSON file and syncs it to Langfuse.
-2. Runs the **Intent Detection Agent** on each item, recording traces as Langfuse observations linked to the prompt and model.
+1. Loads the **evaluation dataset** (labeled question/intent pairs) from a local JSON file and syncs it to the remote platform.
+2. Runs the **Intent Detection Agent** on each item, recording traces linked to the prompt version and model.
3. Scores each result with two evaluators:
- **Intent accuracy** — binary match between predicted and expected intent.
- **Reason quality** — LLM-as-a-judge score (1–5) evaluating the reasoning behind the prediction.
-4. Publishes the experiment to Langfuse with all scores, traces, and metadata.
+4. Publishes the experiment with all scores, traces, and metadata.
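The two scoring steps above can be sketched as plain functions. This is a hedged illustration only: the function names and signatures are hypothetical, not the pipeline's actual node definitions.

```python
# Hypothetical sketch of the two evaluators described above; names and
# signatures are illustrative, not the pipeline's real node code.

def intent_accuracy(predicted_intent: str, expected_intent: str) -> int:
    """Binary match between predicted and expected intent (1 = match, 0 = miss)."""
    return int(predicted_intent.strip().lower() == expected_intent.strip().lower())

def reason_quality(judge_reply: str) -> int:
    """Parse the 1-5 score returned by the LLM-as-a-judge evaluator,
    clamping out-of-range values into the scale."""
    return max(1, min(5, int(judge_reply.strip())))
```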

### `LangfuseEvaluationDataset`

@@ -237,14 +237,40 @@ intent_evaluation_data:
  created_by: kedro
```

### `OpikEvaluationDataset`

Alternatively, the evaluation dataset can be managed by `OpikEvaluationDataset` from `kedro-datasets-experimental`, which bridges a local JSON/YAML file with a remote Opik dataset. It supports the same two sync policies as the Langfuse variant:

- **`local`** — the local file is the source of truth; `load()` re-inserts all local items to remote on every sync via Opik's upsert-by-ID API. Items with a UUID v7 `id` update the existing remote row in-place; items without a UUID v7 `id` create a new remote row on every sync.
- **`remote`** — the remote Opik dataset is the source of truth. `load()` fetches remote as-is with no local file interaction; `save()` inserts items to remote only.
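The two policies can be illustrated with a simplified sketch. This is not the dataset's actual implementation; the function and its signature are hypothetical.

```python
# Simplified sketch of the sync policies described above; `resolve_load` is
# illustrative, not part of OpikEvaluationDataset's real API.

def resolve_load(policy: str, local_items: list[dict], remote_items: list[dict]) -> list[dict]:
    if policy == "local":
        # Local file is the source of truth: every load would push local items
        # to remote (upsert by UUID v7 "id" where present) and return them.
        return local_items
    if policy == "remote":
        # Remote dataset is the source of truth: return remote rows untouched.
        return remote_items
    raise ValueError(f"unknown sync_policy: {policy!r}")
```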

Catalog entry (`conf/base/catalog_evaluation_opik.yml`):

```yaml
opik_intent_evaluation_data:
  type: kedro_datasets_experimental.opik.OpikEvaluationDataset
  dataset_name: evaluations/intent_agent_evaluation
  filepath: data/intent_detection/evaluation/intent_evaluation.json
  sync_policy: local
  credentials: opik_credentials
```

### Running the evaluation pipeline

**With Langfuse:**
```bash
kedro run -p intent_detection_evaluation --params intent_prompt_version=1,model_name=gpt-4o
```

The `intent_prompt_version` and `model_name` parameters are used to name the experiment in Langfuse (e.g., `intent_prompt_v1_model_gpt-4o`), making it easy to compare runs across prompt iterations and models.
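Sketched as an f-string, the naming scheme looks roughly like this (an illustrative reconstruction from the example above, not necessarily the exact code the pipeline uses):

```python
# Illustrative reconstruction of the Langfuse experiment name from the run
# parameters; the real pipeline may assemble it differently.
def langfuse_experiment_name(intent_prompt_version: int, model_name: str) -> str:
    return f"intent_prompt_v{intent_prompt_version}_model_{model_name}"
```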

**With Opik:**
```bash
kedro run -p intent_detection_evaluation_opik --params model_name=gpt-4o
```

The experiment name is derived automatically from the active prompt name, its commit hash, and the model (e.g., `intent_eval_prompt_intent-classifier_abc12345_model_gpt-4o`), making it easy to compare runs across prompt versions and models in the Opik UI.
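The derivation can be sketched as follows (an illustrative reconstruction from the example above; the real pipeline pulls these values from the active prompt's metadata):

```python
# Illustrative reconstruction of the Opik experiment name; the prompt name and
# commit hash would come from the active prompt's metadata in the real pipeline.
def opik_experiment_name(prompt_name: str, prompt_commit: str, model_name: str) -> str:
    # Only the first 8 characters of the commit hash appear in the name.
    return f"intent_eval_prompt_{prompt_name}_{prompt_commit[:8]}_model_{model_name}"
```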

### Evaluation data

Stored at `data/intent_detection/evaluation/intent_evaluation.json` — a JSON array of labeled items:
27 changes: 27 additions & 0 deletions kedro-agentic-workflows/conf/base/catalog_evaluation_opik.yml
@@ -0,0 +1,27 @@
opik_intent_evaluation_data:
  type: kedro_datasets_experimental.opik.OpikEvaluationDataset
  dataset_name: evaluations/intent_agent_evaluation
  filepath: data/intent_detection/evaluation/intent_evaluation.json
  sync_policy: local
  credentials: opik_credentials

opik_intent_judge_llm:
  type: langchain.ChatOpenAIDataset
  kwargs:
    model: "gpt-4o"
    temperature: 0.0
  credentials: openai

opik_judge_llm_prompt:
  type: kedro_datasets_experimental.opik.OpikPromptDataset
  filepath: data/intent_detection/evaluation/prompts/intent_judge_llm_prompt_opik.json
  prompt_name: "opik_judge_llm_prompt"
  prompt_type: "chat"
  credentials: opik_credentials
  sync_policy: local
  mode: langchain

opik_client:
  type: kedro_datasets_experimental.opik.OpikTraceDataset
  credentials: opik_credentials
  mode: langchain
30 changes: 16 additions & 14 deletions kedro-agentic-workflows/conf/base/catalog_genai_config.yml
@@ -35,23 +35,25 @@ response_prompt:
  dataset:
    type: yaml.YAMLDataset

-intent_prompt:
-  type: kedro_datasets_experimental.langfuse.LangfusePromptDataset
-  filepath: data/intent_detection/prompts/intent_prompt_langfuse.json
-  prompt_name: "intent-classifier"
-  prompt_type: "chat"
-  credentials: langfuse_credentials
-  sync_policy: local # local|remote|strict
-  mode: sdk # langchain|sdk
-  load_args:
-    version: ${runtime_params:intent_prompt_version, 1}

# intent_prompt:
-#   type: kedro_datasets_experimental.opik.OpikPromptDataset
-#   filepath: data/intent_detection/prompts/intent_prompt_opik.json
+#   type: kedro_datasets_experimental.langfuse.LangfusePromptDataset
+#   filepath: data/intent_detection/prompts/intent_prompt_langfuse.json
#   prompt_name: "intent-classifier"
#   prompt_type: "chat"
-#   credentials: opik_credentials
+#   credentials: langfuse_credentials
#   sync_policy: local # local|remote|strict
#   mode: sdk # langchain|sdk
#   load_args:
#     version: ${runtime_params:intent_prompt_version, 1}

+intent_prompt:
+  type: kedro_datasets_experimental.opik.OpikPromptDataset
+  filepath: data/intent_detection/prompts/intent_prompt_opik.json
+  prompt_name: "intent-classifier"
+  prompt_type: "chat"
+  credentials: opik_credentials
+  sync_policy: local
+  mode: sdk

# --- Tracing ---
intent_tracer_langfuse:
2 changes: 1 addition & 1 deletion kedro-agentic-workflows/conf/base/parameters.yml
@@ -1,4 +1,4 @@
user_id: 1
docs_matches: 3
intent_prompt_version: 1
-model_name: "gpt-4o"
\ No newline at end of file
+model_name: "gpt-4o"
@@ -109,4 +109,4 @@
"reason": "While the first part is a general question, the second part reveals the user has a pending claim they are asking about."
}
}
]
]
@@ -0,0 +1,10 @@
[
  {
    "role": "system",
    "content": "You are an expert evaluator for an insurance customer support intent classification system.\n\nYour task is to evaluate the QUALITY OF THE MODEL'S REASONING for the predicted intent.\n\nYou will receive:\n- The user's question\n- The model's predicted intent\n- The model's explanation (reason)\n- The expected intent\n- The reference explanation\n\nYour job is NOT to rewrite the explanation. Instead, you must score how well the model's explanation justifies the predicted intent and whether it aligns with the user question.\n\nEvaluation Criteria:\n\n5 \u2014 Excellent\nThe predicted intent is correct and the reasoning clearly and accurately explains why the intent was chosen.\n\n4 \u2014 Good\nThe predicted intent is correct and the reasoning is mostly correct, but slightly incomplete or generic.\n\n3 \u2014 Acceptable\nThe reasoning partially explains the intent but is vague or lacks clarity.\n\n2 \u2014 Poor\nThe reasoning is weak, incorrect, or does not clearly justify the predicted intent.\n\n1 \u2014 Incorrect\nThe predicted intent is wrong OR the reasoning does not match the user question.\n\nImportant Rules:\n- Focus on whether the reasoning logically explains the predicted intent.\n- If the predicted intent is incorrect, the score should be 1.\n- Minor wording differences from the reference explanation are acceptable.\n- Return only the score according to the schema."
  },
  {
    "role": "human",
    "content": "User Question:\n{question}\n\nPredicted Intent:\n{predicted_intent}\n\nModel Explanation:\n{predicted_reason}\n\nExpected Intent:\n{expected_intent}\n\nReference Explanation:\n{expected_reason}"
  }
]