Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ wheels/
.venv

.vscode/
.env
15 changes: 15 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -145,3 +145,18 @@ Run tests
$ uv run pytest tests
```

## Experimental AI Mode
An optional mode to sidestep manually encoding logic conversions between metadata formats by leveraging LLMs to automate the process:

```
$ codemeticulous ai-convert --model <LLM-model> --from codemeta --to cff codemeta.json > CITATION.cff
```

Currently, it can convert between known formats (listed in `standards.py`), outputting structured data which is then deserialized back into codemeticulous/pydantic objects.

*Note: ensure that `.env` contains the necessary API keys to call the LLM.*

Run experimental AI tests
```
$ uv run pytest tests_llm/
```
138 changes: 138 additions & 0 deletions codemeticulous/ai_convert.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
import litellm
import logging
import re
import json
import ast
from pathlib import Path
from pydantic import BaseModel, ValidationError
from codemeticulous.standards import STANDARDS
from codemeticulous.prompt_strategies import DefaultPrompt
from codemeticulous.generate_schemas import check_schema

logging.basicConfig(level=logging.INFO)

# Toggle for additional llm debugging
# litellm._turn_on_debug()

def extract_json(llm_output: str) -> dict:
    """Extract a JSON object from raw LLM output.

    Tries, in order: a fenced ```json code block, the first ``{...}`` span,
    then the whole string. Falls back to ``ast.literal_eval`` for outputs
    that use Python-style single quotes.

    Args:
        llm_output: the raw text content returned by the LLM.

    Returns:
        The parsed JSON object as a dict.

    Raises:
        ValueError: if no JSON object can be recovered from the output.
    """
    # Prefer an explicit fenced ```json block if the model emitted one
    json_match = re.search(r'```json\s*(.*?)\s*```', llm_output, re.DOTALL)
    if json_match:
        json_str = json_match.group(1)
    else:
        # Fall back to the first {...} span; DOTALL lets it cross newlines
        json_match = re.search(r'\{.*\}', llm_output, re.DOTALL)
        json_str = json_match.group(0) if json_match else llm_output

    try:
        return json.loads(json_str)
    except json.JSONDecodeError:
        logging.error("ERROR: failed to decode JSON from LLM output: %s", json_str)
        # LLMs sometimes emit Python-literal dicts (single-quoted keys/values);
        # literal_eval parses those safely without executing arbitrary code
        try:
            result = ast.literal_eval(json_str)
        except (ValueError, SyntaxError) as exc:
            # Previously a raw SyntaxError could escape here; raise a clear error instead
            raise ValueError(f"could not extract JSON from LLM output: {json_str!r}") from exc
        if not isinstance(result, dict):
            # Honor the declared -> dict contract instead of returning e.g. a list
            raise ValueError(f"extracted LLM output is not a JSON object: {result!r}")
        return result


def structured_completion(
    llm_model: str,
    messages: list,
    target_model: type[BaseModel],
) -> tuple[BaseModel | None, dict]:
    """Call the LLM and validate its JSON output against a pydantic model.

    Retries up to ``max_retries`` times, feeding the validation errors back
    to the model so it can correct only the failing fields.

    Args:
        llm_model: litellm model string (e.g., "openrouter/openai/gpt-4o").
        messages: chat messages to send; the caller's list is NOT mutated.
        target_model: pydantic model class the output must validate against.

    Returns:
        Tuple of (validated model instance, usage dict with token counts).

    Raises:
        ValidationError: if the output still fails validation after all retries.
        Exception: propagated if the completion call itself fails.
    """
    max_retries = 3  # limit on retries if the LLM's output has validation errors
    prompt_tokens = 0
    completion_tokens = 0
    # Work on a copy so retry/correction turns do not leak into the caller's list
    messages = list(messages)

    for attempt in range(max_retries):
        try:
            response = litellm.completion(
                model=llm_model,
                messages=messages,
                temperature=0.3,
            )
            # usage may be missing/None on some providers, hence the guarded access
            prompt_tokens += getattr(response.usage, "prompt_tokens", 0) or 0
            completion_tokens += getattr(response.usage, "completion_tokens", 0) or 0

            output = extract_json(response.choices[0].message.content)
            validated_model = target_model(**output)
            usage = {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
            }
            return validated_model, usage
        except ValidationError as e:
            logging.warning("Pydantic validation error on attempt %d: %s", attempt + 1, e)

            if attempt < max_retries - 1:
                # Surface just the top-level failing field names to focus the retry
                failing_fields = [err["loc"][0] for err in e.errors() if err["loc"]]
                error_msg = {
                    "role": "user",
                    "content": f"""
The previous JSON output had validation errors. Return the same JSON with ONLY these fields corrected — do not change anything else:

Failing fields: {failing_fields}

Errors:
{str(e)}
""",
                }
                messages.append({"role": "assistant", "content": response.choices[0].message.content})
                messages.append(error_msg)
            else:
                logging.error("ERROR: LLM had validation failures after %d attempts", max_retries)
                raise
        except Exception as e:
            logging.error("ERROR: LLM completion call failed: %s", e)
            raise


def get_examples(crosswalk: str, num: int = 1) -> list:
    """Load up to ``num`` previously-passed conversion cases for few-shot prompting.

    Args:
        crosswalk: "source:target" format pair, e.g. "codemeta:cff".
        num: maximum number of examples to return.

    Returns:
        A list of matching case dicts; empty when the log file is missing,
        empty, or unparseable (examples are best-effort and must not crash
        the conversion).
    """
    # TODO: randomize example selection
    log_path = Path(__file__).parent.parent / "tests_llm" / "logs" / "passed_cases.json"
    raw = log_path.read_text() if log_path.exists() else ""
    if not raw.strip():
        return []
    try:
        cases = json.loads(raw)
    except json.JSONDecodeError:
        logging.warning("Could not parse passed-cases log at %s; skipping examples", log_path)
        return []
    # .get() skips malformed entries instead of raising KeyError
    return [c for c in cases if c.get("source:target") == crosswalk][:num]


def convert_ai(llm_model: str, source_format: str, target_format: str, source_data):
    """
    Automate metadata standard conversion using LLM and canonical representation.

    Args:
        llm_model: LLM model string (e.g., "openrouter/openai/gpt-4o").
        source_format: string representation of the source metadata standard.
        target_format: string representation of the target metadata standard.
        source_data: dict or pydantic.BaseModel instance representing the source metadata.

    Returns:
        Tuple of (validated target-model instance, usage dict with token counts).

    Raises:
        TypeError: if source_data is neither a dict nor a source-model instance.
        KeyError: if source_format/target_format are not in STANDARDS.
    """
    # Build prompt messages using pydantic schemas and the source data
    source_model = STANDARDS[source_format]["model"]
    target_model = STANDARDS[target_format]["model"]

    # Create a pydantic model instance of the source data
    if isinstance(source_data, dict):
        source_instance = source_model(**source_data)
    elif isinstance(source_data, source_model):
        source_instance = source_data
    else:
        # BUG FIX: previously fell through here, leaving source_instance unbound
        # and raising a confusing NameError below; fail loudly and clearly instead
        raise TypeError(
            f"source_data must be a dict or {source_model.__name__} instance, "
            f"got {type(source_data).__name__}"
        )

    # Summarized schema of the source pydantic model according to the data instance
    source_schema = json.dumps(check_schema(source_format, source_instance), indent=2)
    # Full schema of the target model (no instance to summarize against)
    target_schema = json.dumps(check_schema(target_format), indent=2)

    strategy = DefaultPrompt()
    messages = strategy.generate_system_prompt(source_schema, target_schema)

    # Inject few-shot examples of past conversions, then end with the actual source data
    examples = get_examples(f"{source_format}:{target_format}")
    if examples:
        messages.append({
            "role": "system",
            "content": f"Here are some examples of correct conversions between {source_format} and {target_format}:",
        })
        for ex in examples:
            messages.append({"role": "user", "content": "SOURCE_DATA:\n" + json.dumps(ex["source_metadata"])})
            messages.append({"role": "assistant", "content": json.dumps(ex["llm_output"])})

    messages.append({"role": "user", "content": "SOURCE_DATA:\n" + source_instance.json()})
    target_data, usage = structured_completion(llm_model, messages, target_model)

    return target_data, usage
96 changes: 94 additions & 2 deletions codemeticulous/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,12 @@
import click
import json
import yaml
from dotenv import load_dotenv

from codemeticulous.convert import STANDARDS, convert as _convert

from codemeticulous.ai_convert import convert_ai as _convert_ai
from codemeticulous.generate_schemas import generate_schemas as _generate_schemas
load_dotenv()

@click.group()
def cli():
Expand Down Expand Up @@ -104,6 +107,95 @@ def validate(format_name, input_file, verbose):
traceback.print_exc()


@cli.command()
@click.option(
    "-m",
    "--model",
    "llm_model",
    type=str,
    required=True,
    help="LLM model to use for conversion (e.g., 'openrouter/openai/gpt-4o')",
)
def generate_schemas(llm_model: str):
    """Generate and cache schemas via the helper, reporting any failure on stderr."""
    try:
        _generate_schemas(llm_model)
    except Exception as exc:
        # Surface the failure to the user without a traceback dump
        click.echo(f"Error from attempting to generate and cache schemas: {str(exc)}", err=True)


@cli.command()
@click.option(
    "-m",
    "--model",
    "llm_model",
    type=str,
    required=True,
    help="LLM model to use for conversion (e.g., 'openrouter/openai/gpt-4o')",
)
@click.option(
    "-f",
    "--from",
    "source_format",
    type=click.Choice(STANDARDS.keys()),
    required=True,
    help="Source format",
)
@click.option(
    "-t",
    "--to",
    "target_format",
    type=click.Choice(STANDARDS.keys()),
    required=True,
    help="Target format",
)
@click.option(
    "-o",
    "--output",
    "output_file",
    type=click.File("w"),
    default=None,
    help="Output file name (by default prints to stdout)",
)
@click.option(
    "-v",
    "--verbose",
    is_flag=True,
    default=False,
    help="Print verbose output",
)
@click.argument("input_file", type=click.Path(exists=True))
def ai_convert(llm_model: str, source_format: str, target_format: str, input_file, output_file, verbose):
    """Convert metadata between formats using an LLM (experimental)."""
    try:
        input_data = load_file_autodetect(input_file)
    except Exception as e:
        click.echo(f"Failed to load file: {input_file}. {str(e)}", err=True)
        if verbose:
            traceback.print_exc()
        # BUG FIX: must stop here — previously fell through and hit an
        # unbound input_data (NameError) in the conversion step below
        return
    try:
        converted_data, _ = _convert_ai(llm_model, source_format, target_format, input_data)
    except Exception as e:
        click.echo(f"Error during AI-assisted conversion: {str(e)}", err=True)
        if verbose:
            traceback.print_exc()
        return

    # Serialize into the target standard's native format (json/yaml)
    output_format = STANDARDS[target_format]["format"]

    try:
        output_data = dump_data(converted_data, output_format)
    except Exception as e:
        click.echo(f"Error during serialization: {str(e)}", err=True)
        if verbose:
            traceback.print_exc()
        return

    if output_file:
        output_file.write(output_data)
        click.echo(f"Data written to {output_file.name}")
    else:
        click.echo(output_data)


def dump_data(data, format):
if format == "json":
return data.json()
Expand Down Expand Up @@ -136,4 +228,4 @@ def load_file_autodetect(file_path):
else:
raise ValueError(f"Unsupported file extension: {ext}.")
except Exception as e:
raise ValueError(f"Failed to load file: {file_path}. {str(e)}")
raise ValueError(f"Failed to load file: {file_path}. {str(e)}")
23 changes: 1 addition & 22 deletions codemeticulous/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,28 +4,7 @@
from codemeticulous.codemeta.convert import canonical_to_codemeta, codemeta_to_canonical
from codemeticulous.datacite.convert import canonical_to_datacite, datacite_to_canonical
from codemeticulous.cff.convert import canonical_to_cff, cff_to_canonical


STANDARDS = {
"codemeta": {
"model": CodeMeta,
"format": "json",
"to_canonical": codemeta_to_canonical,
"from_canonical": canonical_to_codemeta,
},
"datacite": {
"model": DataCite,
"format": "json",
"to_canonical": datacite_to_canonical,
"from_canonical": canonical_to_datacite,
},
"cff": {
"model": CitationFileFormat,
"format": "yaml",
"to_canonical": cff_to_canonical,
"from_canonical": canonical_to_cff,
},
}
from codemeticulous.standards import STANDARDS


def to_canonical(source_format: str, source_data):
Expand Down
8 changes: 4 additions & 4 deletions codemeticulous/datacite/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,10 +384,10 @@ def canonical_to_datacite(
# TODO: though, it may be possible to use the following codemeta fields:
# hasPart, isPartOf, readme, sameAs, review, releaseNotes
# relatedIdentifiers=data.relatedLink,
sizes=[data.fileSize] if data.fileSize else None,
formats=codemeta_language_fileformat_to_datacite_format(
data.programmingLanguage, data.fileFormat
),
# sizes=[data.fileSize] if data.fileSize else None,
# formats=codemeta_language_fileformat_to_datacite_format(
# data.programmingLanguage, data.fileFormat
# ),
Comment on lines +387 to +390
version=str(data.version) if data.version else None,
rightsList=codemeta_license_to_datacite_rights(data.license),
descriptions=descriptions,
Expand Down
Loading
Loading