Skip to content
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 101 additions & 0 deletions codemeticulous/ai_convert.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
import litellm, logging, re, json, os
from pydantic import BaseModel, ValidationError
from codemeticulous.standards import STANDARDS
from codemeticulous.prompt_strategies import DefaultPrompt
from codemeticulous.summarize_schema import get_schema_summary

logging.basicConfig(level=logging.INFO)

# Toggle for additional llm debugging
# litellm._turn_on_debug()


def extract_json(llm_output: str) -> dict:
    """Parse a JSON object out of raw LLM output.

    Handles output wrapped in a markdown code fence (```json ... ``` or a
    plain ``` ... ``` fence) as well as bare JSON text.

    Args:
        llm_output: raw text returned by the LLM.

    Returns:
        The parsed JSON payload (typically a dict).

    Raises:
        json.JSONDecodeError: if the extracted text is not valid JSON.
    """
    # Strip a markdown code fence if present. The language tag is optional:
    # models frequently emit a bare ``` fence even when asked for ```json.
    fence_match = re.search(r"```(?:json)?\s*(.*?)\s*```", llm_output, re.DOTALL)
    json_str = fence_match.group(1) if fence_match else llm_output

    # Lazy %-formatting so the (potentially large) payload is only
    # stringified when INFO logging is actually enabled.
    logging.info("Extracted JSON string: %s", json_str)
    return json.loads(json_str)


def structured_completion(llm_model: str, messages: list, target_model: type[BaseModel], key: str) -> BaseModel:
    """Call the LLM and validate its JSON response against a Pydantic model.

    Args:
        llm_model: litellm model string (e.g. "openrouter/openai/gpt-4o").
        messages: chat messages to send to the model.
        target_model: Pydantic model class the response must validate against.
        key: API key, passed directly to litellm.

    Returns:
        A validated instance of ``target_model``.

    Raises:
        Exception: if the completion call or JSON extraction fails.
        ValidationError: if the LLM output does not match ``target_model``.
    """
    try:
        # The key is handed straight to litellm; mutating os.environ
        # (previous FIXME) leaked the key into the process environment and
        # tied this function to the OpenRouter provider for no benefit.
        response = litellm.completion(
            model=llm_model,
            messages=messages,
            api_key=key,
        )
        output = extract_json(response.choices[0].message.content)
        logging.info(output)
    except Exception as e:
        logging.error("ERROR: structured output failed: %s", e)
        raise

    # Attempt to validate into a pydantic instance of the target model.
    try:
        return target_model(**output)
    except ValidationError as e:
        logging.error("Pydantic validation error: %s", e)
        raise


def convert_ai(key: str, llm_model: str, source_format: str, target_format: str, source_data):
    """
    Automate metadata standard conversion using LLM and canonical representation.

    Args:
    - key: API key for LLM authorization.
    - llm_model: LLM model string (e.g., "openrouter/openai/gpt-4o").
    - source_format: string representation of the source metadata standard.
    - target_format: string representation of the target metadata standard.
    - source_data: dict or pydantic.BaseModel instance representing the source metadata

    Returns:
        A validated instance of the target format's Pydantic model.

    Raises:
        TypeError: if source_data is neither a dict nor an instance of the
            source format's model.
    """

    # Build prompt messages using pydantic schemas and the source data
    source_model = STANDARDS[source_format]["model"]
    target_model = STANDARDS[target_format]["model"]

    # Creates pydantic model instance of source data
    if isinstance(source_data, dict):
        source_instance = source_model(**source_data)
    elif isinstance(source_data, source_model):
        source_instance = source_data
    else:
        # BUG FIX: previously fell through with source_instance unbound,
        # crashing later with an opaque NameError.
        raise TypeError(
            f"source_data must be a dict or {source_model.__name__} instance, "
            f"got {type(source_data).__name__}"
        )

    # Create summarized schema of source pydantic model according to data instance.
    # NOTE(review): this summary is LLM-generated on every call, so it is
    # non-deterministic and costs a request; consider caching it as a static
    # data file per model.
    source_schema_dict = get_schema_summary(source_model, llm_model, key, source_instance)
    source_schema = json.dumps(source_schema_dict, indent=2)

    # Serialize the target schema. schema_json() already returns a JSON
    # *string*, so it must not be passed through json.dumps() again — that
    # double-encodes it into one escaped string literal. Only the manual
    # fallback returns a dict that still needs dumping.
    try:
        target_schema = target_model.schema_json()
    except Exception:
        logging.warning("Failed to serialize via Pydantic's built-in schema, falling back to manually creating schema")
        target_schema = json.dumps(get_schema_summary(target_model, llm_model, key), indent=2)

    strategy = DefaultPrompt()
    messages = strategy.generate_system_prompt(source_instance, source_schema, target_schema)

    target_data = structured_completion(llm_model, messages, target_model, key)

    return target_data
85 changes: 83 additions & 2 deletions codemeticulous/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import yaml

from codemeticulous.convert import STANDARDS, convert as _convert

from codemeticulous.ai_convert import convert_ai as _convert_ai

@click.group()
def cli():
Expand Down Expand Up @@ -104,6 +104,87 @@ def validate(format_name, input_file, verbose):
traceback.print_exc()


@cli.command()
@click.option(
    "-k",
    "--key",
    "api_key",
    type=str,
    required=True,
    help="API key for LLM authorization",
)
@click.option(
    "-m",
    "--model",
    "llm_model",
    type=str,
    required=True,
    help="LLM model to use for conversion (e.g., 'openrouter/openai/gpt-4o')",
)
@click.option(
    "-f",
    "--from",
    "source_format",
    type=click.Choice(STANDARDS.keys()),
    required=True,
    help="Source format",
)
@click.option(
    "-t",
    "--to",
    "target_format",
    type=click.Choice(STANDARDS.keys()),
    required=True,
    help="Target format",
)
@click.option(
    "-o",
    "--output",
    "output_file",
    type=click.File("w"),
    default=None,
    help="Output file name (by default prints to stdout)",
)
@click.option(
    "-v",
    "--verbose",
    is_flag=True,
    default=False,
    help="Print verbose output",
)
@click.argument("input_file", type=click.Path(exists=True))
def ai_convert(api_key: str, llm_model: str, source_format: str, target_format: str, input_file, output_file, verbose):
    """Convert metadata between standards using an LLM."""
    try:
        input_data = load_file_autodetect(input_file)
    except Exception as e:
        click.echo(f"Failed to load file: {input_file}. {str(e)}", err=True)
        if verbose:
            traceback.print_exc()
        # BUG FIX: must stop here — previously execution fell through with
        # input_data unbound, producing a confusing secondary error from the
        # conversion step.
        return

    try:
        converted_data = _convert_ai(api_key, llm_model, source_format, target_format, input_data)
    except Exception as e:
        click.echo(f"Error during AI-assisted conversion: {str(e)}", err=True)
        if verbose:
            traceback.print_exc()
        return

    output_format = STANDARDS[target_format]["format"]

    try:
        output_data = dump_data(converted_data, output_format)
    except Exception as e:
        click.echo(f"Error during serialization: {str(e)}", err=True)
        if verbose:
            traceback.print_exc()
        return

    if output_file:
        output_file.write(output_data)
        click.echo(f"Data written to {output_file.name}")
    else:
        click.echo(output_data)


def dump_data(data, format):
if format == "json":
return data.json()
Expand Down Expand Up @@ -136,4 +217,4 @@ def load_file_autodetect(file_path):
else:
raise ValueError(f"Unsupported file extension: {ext}.")
except Exception as e:
raise ValueError(f"Failed to load file: {file_path}. {str(e)}")
raise ValueError(f"Failed to load file: {file_path}. {str(e)}")
23 changes: 1 addition & 22 deletions codemeticulous/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,28 +4,7 @@
from codemeticulous.codemeta.convert import canonical_to_codemeta, codemeta_to_canonical
from codemeticulous.datacite.convert import canonical_to_datacite, datacite_to_canonical
from codemeticulous.cff.convert import canonical_to_cff, cff_to_canonical


STANDARDS = {
"codemeta": {
"model": CodeMeta,
"format": "json",
"to_canonical": codemeta_to_canonical,
"from_canonical": canonical_to_codemeta,
},
"datacite": {
"model": DataCite,
"format": "json",
"to_canonical": datacite_to_canonical,
"from_canonical": canonical_to_datacite,
},
"cff": {
"model": CitationFileFormat,
"format": "yaml",
"to_canonical": cff_to_canonical,
"from_canonical": canonical_to_cff,
},
}
from codemeticulous.standards import STANDARDS


def to_canonical(source_format: str, source_data):
Expand Down
39 changes: 39 additions & 0 deletions codemeticulous/prompt_strategies.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from abc import ABC, abstractmethod

class PromptStrategy(ABC):
    """Interface for strategies that build the LLM chat messages used for
    metadata conversion."""

    @abstractmethod
    def generate_system_prompt(self, source_instance, source_schema, target_schema) -> list:
        """Return a list of chat-message dicts ({"role": ..., "content": ...}).

        Signature now matches the concrete DefaultPrompt implementation —
        previously the abstract method declared only ``source_instance``,
        which every real strategy had to violate.
        """


class DefaultPrompt(PromptStrategy):
    """Default prompt strategy: one system message describing the conversion
    task plus user messages carrying the source data and both schemas."""

    # Fixed typo in the prompt ("Pyndantic" -> "Pydantic") so the model is
    # not shown a misspelled library name.
    PROMPT = """
    Your task is to convert source metadata from one format to another using the provided schemas.

    INPUTS PROVIDED:
    - Source data: A Pydantic model instance containing the original metadata
    - Source schema: A JSON object containing the source Pydantic model's fields and descriptions
    - Target schema: The Pydantic model definition for the output format

    INSTRUCTIONS:
    1. Analyze the source data and understand its structure.
    2. Extract and map the relevant fields from the source data to the corresponding fields in the target format.
    3. Transform data types and structures as needed to match the target schema requirements which could either be one-to-one or complex transformations.
    4. Instantiate the target model using the mapped and transformed data, so that a new instance of the target Pydantic model can be created.

    OUTPUT REQUIREMENTS:
    - Return ONLY a raw JSON without any further encoding such as escaping quotes.
    - Ensure all required fields in the target schema are populated
    - Use appropriate data types as defined in the target schema

    The final output must be an instance of the target model schema that can be successfully validated by Pydantic.
    """

    def generate_system_prompt(self, source_instance, source_schema, target_schema) -> list:
        """Build the chat messages for a conversion request.

        Args:
            source_instance: Pydantic model instance of the source metadata.
            source_schema: JSON string summarizing the source model's fields.
            target_schema: JSON string of the target model's schema.

        Returns:
            A list of {"role", "content"} dicts ready for litellm.
        """
        return [
            {"role": "system", "content": self.PROMPT},
            {"role": "user", "content": "SOURCE_DATA:\n" + source_instance.json()},
            {"role": "user", "content": "SOURCE SCHEMA\n" + source_schema},
            {"role": "user", "content": "TARGET_MODEL:\n" + target_schema}
        ]
27 changes: 27 additions & 0 deletions codemeticulous/standards.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from codemeticulous.codemeta.models import CodeMeta
from codemeticulous.datacite.models import DataCite
from codemeticulous.cff.models import CitationFileFormat
from codemeticulous.codemeta.convert import canonical_to_codemeta, codemeta_to_canonical
from codemeticulous.datacite.convert import canonical_to_datacite, datacite_to_canonical
from codemeticulous.cff.convert import canonical_to_cff, cff_to_canonical

# Registry of supported metadata standards, keyed by the name used on the CLI.
# Each entry provides:
#   - model:          Pydantic model class that parses/validates the standard
#   - format:         on-disk serialization format ("json" or "yaml")
#   - to_canonical:   converter from this standard to the shared canonical form
#   - from_canonical: converter from the canonical form back to this standard
STANDARDS = {
    "codemeta": {
        "model": CodeMeta,
        "format": "json",
        "to_canonical": codemeta_to_canonical,
        "from_canonical": canonical_to_codemeta,
    },
    "datacite": {
        "model": DataCite,
        "format": "json",
        "to_canonical": datacite_to_canonical,
        "from_canonical": canonical_to_datacite,
    },
    "cff": {
        "model": CitationFileFormat,
        "format": "yaml",
        "to_canonical": cff_to_canonical,
        "from_canonical": canonical_to_cff,
    },
}
73 changes: 73 additions & 0 deletions codemeticulous/summarize_schema.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
from pydantic import BaseModel
import re
import json
import logging
import litellm

def generate_desc(model_name: str, data, llm_model: str, api_key: str):
    """Ask the LLM to append a short description to each [field, type] pair.

    Args:
        model_name: name of the Pydantic model the fields belong to.
        data: list of [field_name, field_type] pairs to describe.
        llm_model: litellm model string.
        api_key: API key passed to litellm.

    Returns:
        The raw model response text (expected to be a JSON array of
        [field name, field type, description] triples), stripped.

    Raises:
        Exception: if the completion request fails.
    """
    prompt = f"""
    For a Pydantic model '{model_name}', we have a list of lists, each containing a field and their field type.

    In one or two sentences, please provide brief descriptions of each field in relation to the model at the end of each sub-list.
    Your response should be in a valid array consisting of the field name, field type, and new descriptions.
    Please do not include outside explanatory text or unnecessary formatting syntax so your response can be piped into 'json.loads()'.

    Here is the data:
    {data}
    """

    try:
        response = litellm.completion(
            messages=[{"role": "user", "content": prompt}],
            api_key=api_key,
            model=llm_model
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        # Use the module's logger instead of print so the error reaches the
        # same destination as every other diagnostic in this file.
        logging.error("ERROR: structured output failed: %s", e)
        raise


def get_schema_summary(source_model: type[BaseModel], llm_model: str, api_key: str, instance_data: BaseModel = None):
    """Build an LLM-annotated summary of a Pydantic model's fields.

    Args:
        source_model: the Pydantic model class to summarize.
        llm_model: litellm model string used to generate descriptions.
        api_key: API key passed to litellm.
        instance_data: optional instance of ``source_model``; when given,
            fields that are None on the instance are omitted from the summary.

    Returns:
        A dict with ``model_name`` and ``fields`` keys, or None if the LLM
        response could not be parsed as JSON (best-effort: the failure is
        logged, not raised).
    """
    # Collect [field_name, field_type] pairs, skipping fields that are unset
    # on the provided instance so the LLM only describes relevant fields.
    fields = []
    for field_name, model_field in source_model.__fields__.items():
        if instance_data is not None and getattr(instance_data, field_name) is None:
            continue

        field_type = model_field.annotation
        fields.append([field_name, field_type])

    # Call the LLM to generate each field's description.
    llm_response = generate_desc(source_model.__name__, fields, llm_model, api_key)

    # Clean up the response in case the JSON is wrapped in extra text/syntax.
    match = re.search(r'\{.*\}|\[.*\]', llm_response, re.DOTALL)
    if match:
        llm_response = match.group(0)

    try:
        field_descriptions = json.loads(llm_response)
        return {
            "model_name": source_model.__name__,
            "fields": field_descriptions
        }
    except Exception as e:
        # BUG FIX: the old call passed a positional arg after a placeholder-free
        # f-string, so logging raised internally and the exception detail was
        # lost. Lazy %-formatting records it correctly.
        logging.error("ERROR: failed to create list from llm response: %s", e)
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ readme = "README.md"
requires-python = ">=3.10"
dependencies = [
"click>=8.1.7",
"instructor>=1.13.0",
"litellm>=1.77.1",
"pydantic2-schemaorg==0.2.0",
"pydantic>=2.9.2",
"pyyaml>=6.0.2",
Expand All @@ -30,4 +32,4 @@ include-package-data = false

[tool.setuptools.packages.find]
include = ["codemeticulous", "codemeticulous.*"]
exclude = ["tests*", "schema"]
exclude = ["tests*", "schema"]
Loading