diff --git a/docs/tutorials/tlm_structured_outputs/index.ipynb b/docs/tutorials/tlm_structured_outputs/index.ipynb
index f1fd633..fda38cd 100644
--- a/docs/tutorials/tlm_structured_outputs/index.ipynb
+++ b/docs/tutorials/tlm_structured_outputs/index.ipynb
@@ -67,7 +67,11 @@
"from pydantic import create_model\n",
"from typing import Optional\n",
"\n",
- "from tlm import TLM"
+ "from tlm import TLM\n",
+ "\n",
+ "import warnings\n",
+ "\n",
+ "warnings.filterwarnings(\"ignore\")"
]
},
{
@@ -100,13 +104,13 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Melvin, the password of your study support account has been changed to Xnjv7nCydECf for security purposes. Please update it promptly.\n"
+ "Dear Orland, we are pleased to inform you that your scholarship grant of Danish Krone39k will be transferred to your account KZ551736040717OKI15Z shortly.\n"
]
}
],
"source": [
"input_texts = [\n",
- " \"Melvin, the password of your study support account has been changed to Xnjv7nCydECf for security purposes. Please update it promptly.\",\n",
+ " \"Dear Orland, we are pleased to inform you that your scholarship grant of Danish Krone39k will be transferred to your account KZ551736040717OKI15Z shortly.\",\n",
" \"In relation to the filed litigation, we hereby request full disclosure of all data logs associated with 242.218.157.166 and cd9f:d9e5:1ceb:fd39:b2d7:f3fd:c9cd:c27b tied to the account of Fleta London Emard. This involves her employment account Investment Account with Gutkowski Inc.\",\n",
" \"We would like to do a follow-up meeting with Sierra Green regarding her recent surgery. The proposed date is August 13, 2013 at our clinic in West Nash.\",\n",
" \"Melvin, the password of your study support account has been changed to Xnjv7nCydECf for security purposes. Please update it promptly.\",\n",
@@ -167,7 +171,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 6,
"id": "63e35b01",
"metadata": {},
"outputs": [],
@@ -206,14 +210,471 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "LLM response: FIRSTNAME='Melvin' LASTNAME=None DATE=None ACCOUNTNUMBER=None\n",
- "Trustworthiness score: 0.98902\n"
+ "LLM response: {'FIRSTNAME': 'Orland', 'LASTNAME': None, 'DATE': None, 'ACCOUNTNUMBER': 'KZ551736040717OKI15Z'}\n",
+ "Trustworthiness score: 0.9800000000000001\n",
+ "Per-field Trustworthiness Scores: {'FIRSTNAME': {'score': 0.999, 'explanation': \"The text addresses 'Orland', which is a valid extracted first name.\"}, 'LASTNAME': {'score': 0.999, 'explanation': \"No last name is mentioned in the text, so 'null' is appropriate.\"}, 'DATE': {'score': 0.999, 'explanation': \"No date is provided in the text, so returning 'null' is correct.\"}, 'ACCOUNTNUMBER': {'score': 0.999, 'explanation': \"The account number 'KZ551736040717OKI15Z' is explicitly mentioned in the text and correctly extracted.\"}}\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"LLM response: \", tlm_result[\"response\"].choices[0].message.content)\n",
+ "print(\"Trustworthiness score: \", tlm_result[\"trustworthiness_score\"])\n",
+ "print(f\"Per-field Trustworthiness Scores: {tlm_result['metadata']['per_field_score']}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fbd9ccf5",
+ "metadata": {},
+ "source": [
+ "## Run a dataset of many examples\n",
+ "\n",
+ "Here, we define a quick helper function that allows us to process multiple text samples in parallel, which will speed up prompting the LLM over a dataset. The helper function also collects the LLM outputs and trustworthiness score in a formatted DataFrame for easy downstream analysis."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "6a0aecb0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from concurrent.futures import ThreadPoolExecutor\n",
+ "import pandas as pd\n",
+ "import time\n",
+ "from tqdm import tqdm\n",
+ "\n",
+ "\n",
+ "def extract_pii(text):\n",
+ " tlm = TLM()\n",
+ "\n",
+ " openai_kwargs = {\n",
+ " \"model\": \"gpt-4.1-mini\",\n",
+ " \"messages\": [\n",
+ " {\n",
+ " \"role\": \"user\",\n",
+ " \"content\": f\"Extract PII information from the following text, return null if the entity is not found: {text}\",\n",
+ " }\n",
+ " ],\n",
+ " \"response_format\": PII,\n",
+ " }\n",
+ " tlm_result = tlm.create(**openai_kwargs)\n",
+ "\n",
+ " return {\n",
+ " \"raw_completion\": tlm_result,\n",
+ " # the columns below extract the PII information and scores from the raw OpenAI response\n",
+ " \"extracted_pii\": tlm_result[\"response\"].choices[0].message.content,\n",
+ " \"trustworthiness_score\": tlm_result[\"trustworthiness_score\"],\n",
+ " \"per_field_score\": tlm_result[\"metadata\"][\"per_field_score\"],\n",
+ " }\n",
+ "\n",
+ "\n",
+ "def extract_pii_batch(texts, batch_size=15, max_threads=8, sleep_time=2):\n",
+ " results = []\n",
+ " for i in tqdm(range(0, len(texts), batch_size)):\n",
+ " batch = texts[i : i + batch_size]\n",
+ "\n",
+ " with ThreadPoolExecutor(max_threads) as executor:\n",
+ " futures = [executor.submit(extract_pii, text) for text in batch]\n",
+ " batch_results = [f.result() for f in futures]\n",
+ "\n",
+ " results.extend(batch_results)\n",
+ "\n",
+ " # sleep to prevent hitting rate limits\n",
+ " if i + batch_size < len(texts):\n",
+ " time.sleep(sleep_time)\n",
+ "\n",
+ " return pd.DataFrame(results)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "989fd969",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 1/1 [00:05<00:00, 5.84s/it]\n"
]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " raw_completion | \n",
+ " extracted_pii | \n",
+ " trustworthiness_score | \n",
+ " per_field_score | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " {'response': ModelResponse(id='chatcmpl-DBwRlW... | \n",
+ " {'FIRSTNAME': 'Orland', 'LASTNAME': None, 'DAT... | \n",
+ " 0.970000 | \n",
+ " {'FIRSTNAME': {'score': 0.999, 'explanation': ... | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " {'response': ModelResponse(id='chatcmpl-DBwRla... | \n",
+ " {'FIRSTNAME': 'Fleta London', 'LASTNAME': 'Ema... | \n",
+ " 0.181196 | \n",
+ " {'FIRSTNAME': {'score': 0.13961111111111107, '... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " raw_completion \\\n",
+ "0 {'response': ModelResponse(id='chatcmpl-DBwRlW... \n",
+ "1 {'response': ModelResponse(id='chatcmpl-DBwRla... \n",
+ "\n",
+ " extracted_pii trustworthiness_score \\\n",
+ "0 {'FIRSTNAME': 'Orland', 'LASTNAME': None, 'DAT... 0.970000 \n",
+ "1 {'FIRSTNAME': 'Fleta London', 'LASTNAME': 'Ema... 0.181196 \n",
+ "\n",
+ " per_field_score \n",
+ "0 {'FIRSTNAME': {'score': 0.999, 'explanation': ... \n",
+ "1 {'FIRSTNAME': {'score': 0.13961111111111107, '... "
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "results = extract_pii_batch(input_texts)\n",
+ "results.head(2)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "82d24e28",
+ "metadata": {},
+ "source": [
+ "## Examine Results\n",
+ "\n",
+ "We’ve now generated structured ouputs (i.e. extracted data) for each text sample in the dataset and scored the trustworthiness of each output."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "d0bcab92",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pd.set_option(\"display.max_colwidth\", None)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "28852df4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "results[\"input_text\"] = input_texts"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "920581e2",
+ "metadata": {},
+ "source": [
+ "### High Trustworthiness Scores\n",
+ "\n",
+ "The responses with the highest trustworthiness scores represent texts where TLM is most confident in the accuracy of your LLM’s structured outputs.\n",
+ "\n",
+ "Looking at the examples below with high trustworthiness scores, we can see that the model successfully extracted the correct PII elements in these text samples:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "1b3c22f6",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " input_text | \n",
+ " extracted_pii | \n",
+ " trustworthiness_score | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " Dear Orland, we are pleased to inform you that your scholarship grant of Danish Krone39k will be transferred to your account KZ551736040717OKI15Z shortly. | \n",
+ " {'FIRSTNAME': 'Orland', 'LASTNAME': None, 'DATE': None, 'ACCOUNTNUMBER': 'KZ551736040717OKI15Z'} | \n",
+ " 0.97 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " Melvin, the password of your study support account has been changed to Xnjv7nCydECf for security purposes. Please update it promptly. | \n",
+ " {'FIRSTNAME': 'Melvin', 'LASTNAME': None, 'DATE': None, 'ACCOUNTNUMBER': None} | \n",
+ " 0.97 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " input_text \\\n",
+ "0 Dear Orland, we are pleased to inform you that your scholarship grant of Danish Krone39k will be transferred to your account KZ551736040717OKI15Z shortly. \n",
+ "3 Melvin, the password of your study support account has been changed to Xnjv7nCydECf for security purposes. Please update it promptly. \n",
+ "\n",
+ " extracted_pii \\\n",
+ "0 {'FIRSTNAME': 'Orland', 'LASTNAME': None, 'DATE': None, 'ACCOUNTNUMBER': 'KZ551736040717OKI15Z'} \n",
+ "3 {'FIRSTNAME': 'Melvin', 'LASTNAME': None, 'DATE': None, 'ACCOUNTNUMBER': None} \n",
+ "\n",
+ " trustworthiness_score \n",
+ "0 0.97 \n",
+ "3 0.97 "
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "results.sort_values(\"trustworthiness_score\", ascending=False).head(2)[\n",
+ " [\"input_text\", \"extracted_pii\", \"trustworthiness_score\"]\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a3c41f05",
+ "metadata": {},
+ "source": [
+ "### Low Trustworthiness Scores\n",
+ "\n",
+ "The lowest trustworthiness scores reveal the LLM outputs that TLM is least confident are accurate. Documents/results with low trustworthiness scores would benefit most from manual review, especially if we need almost all outputs across the dataset to be correct and want to save human review costs.\n",
+ "\n",
+ "The LLM outputs with the lowest trustworthiness scores in this dataset are shown below, and these extractions are often incorrect or ambiguous warranting further review."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "d69bcafe",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " input_text | \n",
+ " extracted_pii | \n",
+ " trustworthiness_score | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 1 | \n",
+ " In relation to the filed litigation, we hereby request full disclosure of all data logs associated with 242.218.157.166 and cd9f:d9e5:1ceb:fd39:b2d7:f3fd:c9cd:c27b tied to the account of Fleta London Emard. This involves her employment account Investment Account with Gutkowski Inc. | \n",
+ " {'FIRSTNAME': 'Fleta London', 'LASTNAME': 'Emard', 'DATE': None, 'ACCOUNTNUMBER': 'Investment Account'} | \n",
+ " 0.181196 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " To: Maximillian Noah Moore, we forgot to update your record with phone IMEI: 30-265288-033265-8. Could you please provide it in your earliest convenience to keep your records updated. | \n",
+ " {'FIRSTNAME': 'Maximillian', 'LASTNAME': 'Noah Moore', 'DATE': None, 'ACCOUNTNUMBER': None} | \n",
+ " 0.372823 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " input_text \\\n",
+ "1 In relation to the filed litigation, we hereby request full disclosure of all data logs associated with 242.218.157.166 and cd9f:d9e5:1ceb:fd39:b2d7:f3fd:c9cd:c27b tied to the account of Fleta London Emard. This involves her employment account Investment Account with Gutkowski Inc. \n",
+ "5 To: Maximillian Noah Moore, we forgot to update your record with phone IMEI: 30-265288-033265-8. Could you please provide it in your earliest convenience to keep your records updated. \n",
+ "\n",
+ " extracted_pii \\\n",
+ "1 {'FIRSTNAME': 'Fleta London', 'LASTNAME': 'Emard', 'DATE': None, 'ACCOUNTNUMBER': 'Investment Account'} \n",
+ "5 {'FIRSTNAME': 'Maximillian', 'LASTNAME': 'Noah Moore', 'DATE': None, 'ACCOUNTNUMBER': None} \n",
+ "\n",
+ " trustworthiness_score \n",
+ "1 0.181196 \n",
+ "5 0.372823 "
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "results.sort_values(\"trustworthiness_score\").head(2)[[\"input_text\", \"extracted_pii\", \"trustworthiness_score\"]]"
+ ]
+ },
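+ {
+ "cell_type": "markdown",
+ "id": "b7f2a9c4",
+ "metadata": {},
+ "source": [
+ "As a minimal sketch, you could also collect every output scoring below a chosen trust threshold into a review queue; the 0.5 cutoff below is an arbitrary example, so tune it to your accuracy and review-cost requirements."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4e8d1c7a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "REVIEW_THRESHOLD = 0.5  # arbitrary cutoff, adjust for your use case\n",
+ "\n",
+ "# flag low-trust outputs for manual review\n",
+ "review_queue = results[results[\"trustworthiness_score\"] < REVIEW_THRESHOLD]\n",
+ "review_queue[[\"input_text\", \"extracted_pii\", \"trustworthiness_score\"]]"
+ ]
+ },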
+ {
+ "cell_type": "markdown",
+ "id": "b5462b1a",
+ "metadata": {},
+ "source": [
+ "## Obtaining Trust Scores for Individual Fields\n",
+ "\n",
+ "Beyond TLM’s overall trustworthiness score, you can obtain granular confidence scores for each individual field in the structured output from your LLM. These field-level scores help you pinpoint which specific values may be incorrect or warrant focused review.\n",
+ "\n",
+ "Let’s look at the text sample receiving the lowest trustworthiness score in this dataset:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "e3fa99ac",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Text: In relation to the filed litigation, we hereby request full disclosure of all data logs associated with 242.218.157.166 and cd9f:d9e5:1ceb:fd39:b2d7:f3fd:c9cd:c27b tied to the account of Fleta London Emard. This involves her employment account Investment Account with Gutkowski Inc.\n",
+ "Extracted PII Information: {'FIRSTNAME': 'Fleta London', 'LASTNAME': 'Emard', 'DATE': None, 'ACCOUNTNUMBER': 'Investment Account'}\n",
+ "Trustworthiness Score: 0.18119641830134262\n",
+ "Per-field Trustworthiness Scores: {'FIRSTNAME': {'score': 0.13961111111111107, 'explanation': \"The text specifies the full name as 'Fleta London Emard', suggesting 'Fleta' is the first name and 'London' is likely a middle name. The response incorrectly includes 'Fleta London' as the first name, which is not accurate.\"}, 'LASTNAME': {'score': 0.999, 'explanation': \"The last name 'Emard' is clearly stated and correctly extracted from the text.\"}, 'DATE': {'score': 0.999, 'explanation': 'There is no date mentioned anywhere in the text, so returning null is appropriate and correct.'}, 'ACCOUNTNUMBER': {'score': 0.001, 'explanation': \"The text mentions 'Investment Account' as the account type associated with the person; however, 'Investment Account' is not an account number, it's just a descriptor. Since no account number is given, returning 'Investment Account' as account number is inaccurate.\"}}\n"
+ ]
+ }
+ ],
+ "source": [
+ "lowest_scoring_text = results.loc[results[\"trustworthiness_score\"].idxmin()]\n",
+ "\n",
+ "print(f\"Text: {lowest_scoring_text['input_text']}\")\n",
+ "print(f\"Extracted PII Information: {lowest_scoring_text['extracted_pii']}\")\n",
+ "print(f\"Trustworthiness Score: {lowest_scoring_text['trustworthiness_score']}\")\n",
+ "print(f\"Per-field Trustworthiness Scores: {lowest_scoring_text['per_field_score']}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c881ae54",
+ "metadata": {},
+ "source": [
+ "The `per_field_score` dictionary contains a granular confidence score and explanation for each extracted field. Since this dictionary can be overwhelming, we provide a `get_untrustworthy_fields()` method that:\n",
+ "\n",
+ "- Prints detailed information about low-confidence fields\n",
+ "- Returns a list of fields that may need manual review due to low trust scores"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "271bd9b6",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Untrustworthy fields: ['ACCOUNTNUMBER', 'FIRSTNAME']\n",
+ "\n",
+ "Field: ACCOUNTNUMBER\n",
+ "Response: Investment Account\n",
+ "Score: 0.001\n",
+ "Explanation: The text mentions 'Investment Account' as the account type associated with the person; however, 'Investment Account' is not an account number, it's just a descriptor. Since no account number is given, returning 'Investment Account' as account number is inaccurate.\n",
+ "\n",
+ "Field: FIRSTNAME\n",
+ "Response: Fleta London\n",
+ "Score: 0.13961111111111107\n",
+ "Explanation: The text specifies the full name as 'Fleta London Emard', suggesting 'Fleta' is the first name and 'London' is likely a middle name. The response incorrectly includes 'Fleta London' as the first name, which is not accurate.\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "untrustworthy_fields = tlm.get_untrustworthy_fields(tlm_result=lowest_scoring_text[\"raw_completion\"])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "496137b6",
+ "metadata": {},
+ "source": [
+ "This method returns a list of fields whose confidence score is low, allowing you to focus manual review on the specific fields whose extracted value is untrustworthy."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "4bcd46c3",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['ACCOUNTNUMBER', 'FIRSTNAME']"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
}
],
"source": [
- "print(\"LLM response: \", tlm_result[\"response\"].choices[0].message.parsed)\n",
- "print(\"Trustworthiness score: \", tlm_result[\"trustworthiness_score\"])"
+ "untrustworthy_fields"
]
}
],
diff --git a/tlm/api.py b/tlm/api.py
index c391717..c7ad65c 100644
--- a/tlm/api.py
+++ b/tlm/api.py
@@ -10,6 +10,7 @@
from tlm.config.presets import WorkflowType
from tlm.inference import InferenceResult, tlm_inference
from tlm.types import Eval
+from tlm.utils.structured_output_utils import _get_untrustworthy_fields
def is_notebook() -> bool:
@@ -174,3 +175,29 @@ async def _async_inference(
context=context,
config=config,
)
+
+ def get_untrustworthy_fields(
+ self,
+ *,
+ tlm_result: InferenceResult,
+ threshold: float = 0.8,
+ display_details: bool = True,
+ ) -> list[str]:
+ """Gets the fields of a structured output response that are considered untrustworthy by TLM.
+ Only works for responses that are valid JSON objects (uses `response_format` to specify the output format).
+ Prints detailed information about the untrustworthy fields if `display_details` is True.
+
+ Args:
+ tlm_result: The result object from a previous TLM call
+ threshold: The score threshold below which a field is considered untrustworthy
+ display_details: Whether to display detailed information about the untrustworthy fields
+
+ Returns:
+ list[str]: The fields of the response that are considered untrustworthy by TLM
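+
+ Example (illustrative sketch; assumes `PII` is a user-defined Pydantic model passed as `response_format` in a prior call):
+     >>> tlm = TLM()
+     >>> result = tlm.create(model="gpt-4.1-mini", messages=[{"role": "user", "content": "Extract PII from: ..."}], response_format=PII)
+     >>> tlm.get_untrustworthy_fields(tlm_result=result)  # e.g. ['ACCOUNTNUMBER', 'FIRSTNAME']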
+ """
+ return _get_untrustworthy_fields(
+ tlm_result=tlm_result,
+ threshold=threshold,
+ display_details=display_details,
+ )
diff --git a/tlm/config/presets.py b/tlm/config/presets.py
index 9102741..becd0ee 100644
--- a/tlm/config/presets.py
+++ b/tlm/config/presets.py
@@ -63,7 +63,7 @@ def from_inference_params(
score: bool,
constrain_outputs: list[str] | None = None,
) -> "WorkflowType":
- if openai_args.get("response_format") is not None and score:
+ if openai_args.get("response_format") is not None:
return cls.STRUCTURED_OUTPUT_SCORING
if rag:
diff --git a/tlm/templates/reflection_completion_templates.py b/tlm/templates/reflection_completion_templates.py
index cf43b96..3b1cee5 100644
--- a/tlm/templates/reflection_completion_templates.py
+++ b/tlm/templates/reflection_completion_templates.py
@@ -1,6 +1,7 @@
from abc import ABC, abstractmethod
from typing import Callable, ClassVar, Literal
import json
+import ast
from pydantic import BaseModel, Field
from tlm.types.base import SOReflectionScoreConfigType
@@ -808,7 +809,12 @@ def create(cls, reasoning_effort: ReasoningEffort, **kwargs) -> ReflectionComple
@classmethod
def construct_response_format(cls, response_json: str) -> type[BaseModel] | None:
- response_fields = json.loads(response_json).keys()
+ try:
+ response_dict = json.loads(response_json)
+ except Exception:
+ response_dict = ast.literal_eval(response_json)
+
+ response_fields = response_dict.keys()
ResponseFields = Literal[tuple(response_fields)] # type: ignore
class IncorrectField(BaseModel):
@@ -876,7 +882,12 @@ def create(cls, reasoning_effort: ReasoningEffort, **kwargs) -> ReflectionComple
@classmethod
def construct_response_format(cls, response_json: str) -> type[BaseModel] | None:
- response_fields = json.loads(response_json).keys()
+ try:
+ response_dict = json.loads(response_json)
+ except Exception:
+ response_dict = ast.literal_eval(response_json)
+
+ response_fields = response_dict.keys()
ResponseFields = Literal[tuple(response_fields)] # type: ignore
class IncorrectField(BaseModel):
@@ -910,8 +921,6 @@ class RatingModel(IncorrectFieldEvaluationBase):
ReflectionRAGIssuesTemplate,
],
WorkflowType.STRUCTURED_OUTPUT_SCORING: [
- # ReflectionCertaintyTemplate,
- # ReflectionKnowledgeGapTemplate,
SelfReflectionSOFieldAccuracyConfig,
SelfReflectionSOFieldKnowledgeGapConfig,
ReflectionArgumentTemplate,
diff --git a/tlm/utils/completion_utils.py b/tlm/utils/completion_utils.py
index 3dd49bd..e2f1d33 100644
--- a/tlm/utils/completion_utils.py
+++ b/tlm/utils/completion_utils.py
@@ -280,7 +280,7 @@ def _parse_completion(completion: Completion, reference_answer: str | None = Non
completion.add_response_field(
ExtractedResponseField.MAPPED_SCORE,
- score_mapper(unmapped_overall_score),
+ score_mapper(str(unmapped_overall_score)),
)
diff --git a/tlm/utils/openai_utils.py b/tlm/utils/openai_utils.py
index a68cb74..68f689c 100644
--- a/tlm/utils/openai_utils.py
+++ b/tlm/utils/openai_utils.py
@@ -48,12 +48,12 @@ def extract_message_content(completion: Dict[str, Any]) -> str:
def extract_structured_output_field(message_content: str, field: str) -> str | None:
try:
return str(ast.literal_eval(message_content)[field])
- except Exception as e:
- logger.warning(f"ast.literal_eval failed for message_content: {message_content}\nError: {e}")
+ except Exception:
+ pass
try:
return str(json.loads(message_content)[field])
- except Exception as e:
- logger.warning(f"json.loads failed for message_content: {message_content}\nError: {e}")
+ except Exception:
+ pass
return None
diff --git a/tlm/utils/response_format_utils.py b/tlm/utils/response_format_utils.py
index f78384c..635c106 100644
--- a/tlm/utils/response_format_utils.py
+++ b/tlm/utils/response_format_utils.py
@@ -2,7 +2,7 @@
from pydantic import BaseModel, Field, create_model
import json
import copy
-
+import ast
from tlm.types import CompletionParams
from tlm.config.defaults import get_settings
@@ -55,6 +55,10 @@ def add_explanation_field(schema: Dict[str, Any]) -> Dict[str, Any]:
def construct_per_field_response_format_model(
reference_answer: str, per_field_score_response_format: type[BaseModel]
) -> type[BaseModel]:
- answer_keys = json.loads(reference_answer).keys()
+ try:
+ answer_keys = json.loads(reference_answer).keys()
+ except Exception:
+ answer_keys = ast.literal_eval(reference_answer).keys()
+
fields = {key: (per_field_score_response_format, Field(...)) for key in answer_keys}
return create_model(per_field_score_response_format.__name__, **fields) # type:ignore
diff --git a/tlm/utils/scoring/per_field_scoring_utils.py b/tlm/utils/scoring/per_field_scoring_utils.py
index f5318bd..6254b88 100644
--- a/tlm/utils/scoring/per_field_scoring_utils.py
+++ b/tlm/utils/scoring/per_field_scoring_utils.py
@@ -1,7 +1,7 @@
import json
import numpy as np
from typing import Callable
-
+import ast
from tlm.types import FieldMetadata, Completion, SOReflectionScoreConfigType
from tlm.utils.math_utils import make_score_asymptotic
from tlm.config.presets import (
@@ -41,7 +41,11 @@ def extract_incorrect_fields_reflection_metadata(
incorrect_fields_list = answer_json["incorrect_fields"]
incorrect_field_names_and_explanations = {item["field_name"]: item["explanation"] for item in incorrect_fields_list}
- field_names = json.loads(reference_answer).keys()
+ try:
+ field_names = json.loads(reference_answer).keys()
+ except Exception:
+ field_names = ast.literal_eval(reference_answer).keys()
+
per_field_metadata = {}
# construct scores and mapped scores for each field for downstream use of per-field score details
diff --git a/tlm/utils/structured_output_utils.py b/tlm/utils/structured_output_utils.py
new file mode 100644
index 0000000..6fd150b
--- /dev/null
+++ b/tlm/utils/structured_output_utils.py
@@ -0,0 +1,68 @@
+import ast
+import json
+
+from tlm.inference import InferenceResult
+
+
+def _get_untrustworthy_fields(
+ tlm_result: InferenceResult,
+ threshold: float = 0.8,
+ display_details: bool = True,
+) -> list[str]:
+ tlm_metadata = tlm_result["metadata"]
+ response_text = tlm_result["response"].choices[0].message.content # type: ignore
+
+ if tlm_metadata is None or "per_field_score" not in tlm_metadata:
+ raise ValueError(
+ "`per_field_score` is not present in the metadata.\n"
+ "`get_untrustworthy_fields()` can only be called scoring structured outputs responses."
+ )
+
+ try:
+     so_response = json.loads(response_text)
+ except Exception:
+     # fall back to parsing a Python dict literal only when the response is not valid JSON
+     try:
+         so_response = ast.literal_eval(response_text)
+     except Exception:
+         raise ValueError(
+             "The LLM response must be a valid JSON object (use `response_format` to specify the output format)"
+         )
+
+ per_field_score = tlm_metadata["per_field_score"]
+ per_score_details = []
+
+ # handle cases where an error message is returned instead of per-field scores
+ if len(per_field_score) == 1 and isinstance(per_field_score.get("error"), str):
+ print("Per-field score returned an error:")
+ print(per_field_score.get("error"))
+ return []
+
+ for key, value in per_field_score.items():
+ score = value["score"]
+ if float(score) < threshold:
+ key_details = {
+ "response": so_response[key],
+ "score": score,
+ "explanation": value["explanation"],
+ }
+ per_score_details.append({key: key_details})
+
+ per_score_details.sort(key=lambda x: next(iter(x.values()))["score"])
+ untrustworthy_fields = [next(iter(item.keys())) for item in per_score_details]
+
+ if display_details:
+ if len(untrustworthy_fields) == 0:
+ print("No untrustworthy fields found")
+
+ else:
+ print(f"Untrustworthy fields: {untrustworthy_fields}\n")
+ for item in per_score_details:
+ print(f"Field: {next(iter(item.keys()))}")
+ details = next(iter(item.values()))
+ print(f"Response: {details['response']}")
+ print(f"Score: {details['score']}")
+ print(f"Explanation: {details['explanation']}")
+ print()
+
+ return untrustworthy_fields