Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ wheels/
.venv

.vscode/
.env
15 changes: 15 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -145,3 +145,18 @@ Run tests
$ uv run pytest tests
```

## Experimental AI Mode
An optional mode to sidestep manually encoding logic conversions between metadata formats by leveraging LLMs to automate the process:

```
$ codemeticulous ai-convert --model <LLM-model> --from codemeta --to cff codemeta.json > CITATION.cff
```

Currently, it can convert between known formats (listed in `standards.py`), outputting structured data which is then deserialized back into codemeticulous/pydantic objects.

*Note: ensure that `.env` contains the necessary API keys to call the LLM.*

Run experimental AI tests
```
$ uv run pytest tests_llm/
```
138 changes: 138 additions & 0 deletions codemeticulous/ai_convert.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
import litellm
import logging
import re
import json
import ast
from pathlib import Path
from pydantic import BaseModel, ValidationError
from codemeticulous.standards import STANDARDS
from codemeticulous.prompt_strategies import DefaultPrompt
from codemeticulous.generate_schemas import check_schema

logging.basicConfig(level=logging.INFO)

# Toggle for additional llm debugging
# litellm._turn_on_debug()

def extract_json(llm_output: str) -> dict:
    """Extract a JSON object from raw LLM output.

    Tries, in order: a fenced ```json code block, the first ``{...}`` span,
    then the whole string. Falls back to ``ast.literal_eval`` for outputs
    that use Python-style single quotes.

    Args:
        llm_output: the raw text content returned by the LLM.

    Returns:
        The parsed JSON object as a dict.

    Raises:
        ValueError: if no JSON object can be recovered from the output.
    """
    # Prefer an explicit fenced ```json block if the model emitted one
    json_match = re.search(r'```json\s*(.*?)\s*```', llm_output, re.DOTALL)
    if json_match:
        json_str = json_match.group(1)
    else:
        # Fall back to the first {...} span; DOTALL lets it cross newlines
        json_match = re.search(r'\{.*\}', llm_output, re.DOTALL)
        json_str = json_match.group(0) if json_match else llm_output

    try:
        return json.loads(json_str)
    except json.JSONDecodeError:
        logging.error("ERROR: failed to decode JSON from LLM output: %s", json_str)
        # LLMs sometimes emit Python-literal dicts (single-quoted keys/values);
        # literal_eval parses those safely without executing arbitrary code
        try:
            result = ast.literal_eval(json_str)
        except (ValueError, SyntaxError) as exc:
            # Previously a raw SyntaxError could escape here; raise a clear error instead
            raise ValueError(f"could not extract JSON from LLM output: {json_str!r}") from exc
        if not isinstance(result, dict):
            # Honor the declared -> dict contract instead of returning e.g. a list
            raise ValueError(f"extracted LLM output is not a JSON object: {result!r}")
        return result


def structured_completion(
    llm_model: str,
    messages: list,
    target_model: type[BaseModel],
) -> tuple[BaseModel | None, dict]:
    """Call the LLM and validate its JSON output against a pydantic model.

    Retries up to ``max_retries`` times, feeding the validation errors back
    to the model so it can correct only the failing fields.

    Args:
        llm_model: litellm model string (e.g., "openrouter/openai/gpt-4o").
        messages: chat messages to send; the caller's list is NOT mutated.
        target_model: pydantic model class the output must validate against.

    Returns:
        Tuple of (validated model instance, usage dict with token counts).

    Raises:
        ValidationError: if the output still fails validation after all retries.
        Exception: propagated if the completion call itself fails.
    """
    max_retries = 3  # limit on retries if the LLM's output has validation errors
    prompt_tokens = 0
    completion_tokens = 0
    # Work on a copy so retry/correction turns do not leak into the caller's list
    messages = list(messages)

    for attempt in range(max_retries):
        try:
            response = litellm.completion(
                model=llm_model,
                messages=messages,
                temperature=0.3,
            )
            # usage may be missing/None on some providers, hence the guarded access
            prompt_tokens += getattr(response.usage, "prompt_tokens", 0) or 0
            completion_tokens += getattr(response.usage, "completion_tokens", 0) or 0

            output = extract_json(response.choices[0].message.content)
            validated_model = target_model(**output)
            usage = {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
            }
            return validated_model, usage
        except ValidationError as e:
            logging.warning("Pydantic validation error on attempt %d: %s", attempt + 1, e)

            if attempt < max_retries - 1:
                # Surface just the top-level failing field names to focus the retry
                failing_fields = [err["loc"][0] for err in e.errors() if err["loc"]]
                error_msg = {
                    "role": "user",
                    "content": f"""
The previous JSON output had validation errors. Return the same JSON with ONLY these fields corrected — do not change anything else:

Failing fields: {failing_fields}

Errors:
{str(e)}
""",
                }
                messages.append({"role": "assistant", "content": response.choices[0].message.content})
                messages.append(error_msg)
            else:
                logging.error("ERROR: LLM had validation failures after %d attempts", max_retries)
                raise
        except Exception as e:
            logging.error("ERROR: LLM completion call failed: %s", e)
            raise


def get_examples(crosswalk: str, num: int = 1) -> list:
    """Load up to ``num`` previously-passed conversion cases for few-shot prompting.

    Args:
        crosswalk: "source:target" format pair, e.g. "codemeta:cff".
        num: maximum number of examples to return.

    Returns:
        A list of matching case dicts; empty when the log file is missing,
        empty, or unparseable (examples are best-effort and must not crash
        the conversion).
    """
    # TODO: randomize example selection
    log_path = Path(__file__).parent.parent / "tests_llm" / "logs" / "passed_cases.json"
    raw = log_path.read_text() if log_path.exists() else ""
    if not raw.strip():
        return []
    try:
        cases = json.loads(raw)
    except json.JSONDecodeError:
        logging.warning("Could not parse passed-cases log at %s; skipping examples", log_path)
        return []
    # .get() skips malformed entries instead of raising KeyError
    return [c for c in cases if c.get("source:target") == crosswalk][:num]


def convert_ai(llm_model: str, source_format: str, target_format: str, source_data):
    """
    Automate metadata standard conversion using LLM and canonical representation.

    Args:
        llm_model: LLM model string (e.g., "openrouter/openai/gpt-4o").
        source_format: string representation of the source metadata standard.
        target_format: string representation of the target metadata standard.
        source_data: dict or pydantic.BaseModel instance representing the source metadata.

    Returns:
        Tuple of (validated target-model instance, usage dict with token counts).

    Raises:
        TypeError: if source_data is neither a dict nor a source-model instance.
        KeyError: if source_format/target_format are not in STANDARDS.
    """
    # Build prompt messages using pydantic schemas and the source data
    source_model = STANDARDS[source_format]["model"]
    target_model = STANDARDS[target_format]["model"]

    # Create a pydantic model instance of the source data
    if isinstance(source_data, dict):
        source_instance = source_model(**source_data)
    elif isinstance(source_data, source_model):
        source_instance = source_data
    else:
        # BUG FIX: previously fell through here, leaving source_instance unbound
        # and raising a confusing NameError below; fail loudly and clearly instead
        raise TypeError(
            f"source_data must be a dict or {source_model.__name__} instance, "
            f"got {type(source_data).__name__}"
        )

    # Summarized schema of the source pydantic model according to the data instance
    source_schema = json.dumps(check_schema(source_format, source_instance), indent=2)
    # Full schema of the target model (no instance to summarize against)
    target_schema = json.dumps(check_schema(target_format), indent=2)

    strategy = DefaultPrompt()
    messages = strategy.generate_system_prompt(source_schema, target_schema)

    # Inject few-shot examples of past conversions, then end with the actual source data
    examples = get_examples(f"{source_format}:{target_format}")
    if examples:
        messages.append({
            "role": "system",
            "content": f"Here are some examples of correct conversions between {source_format} and {target_format}:",
        })
        for ex in examples:
            messages.append({"role": "user", "content": "SOURCE_DATA:\n" + json.dumps(ex["source_metadata"])})
            messages.append({"role": "assistant", "content": json.dumps(ex["llm_output"])})

    messages.append({"role": "user", "content": "SOURCE_DATA:\n" + source_instance.json()})
    target_data, usage = structured_completion(llm_model, messages, target_model)

    return target_data, usage
96 changes: 94 additions & 2 deletions codemeticulous/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,12 @@
import click
import json
import yaml
from dotenv import load_dotenv

from codemeticulous.convert import STANDARDS, convert as _convert

from codemeticulous.ai_convert import convert_ai as _convert_ai
from codemeticulous.generate_schemas import generate_schemas as _generate_schemas
load_dotenv()

@click.group()
def cli():
Expand Down Expand Up @@ -104,6 +107,95 @@ def validate(format_name, input_file, verbose):
traceback.print_exc()


@cli.command()
@click.option(
    "-m",
    "--model",
    "llm_model",
    type=str,
    required=True,
    help="LLM model to use for conversion (e.g., 'openrouter/openai/gpt-4o')",
)
def generate_schemas(llm_model: str):
    """Generate and cache schemas via the helper, reporting any failure on stderr."""
    try:
        _generate_schemas(llm_model)
    except Exception as exc:
        # Surface the failure to the user without a traceback dump
        click.echo(f"Error from attempting to generate and cache schemas: {str(exc)}", err=True)


@cli.command()
@click.option(
    "-m",
    "--model",
    "llm_model",
    type=str,
    required=True,
    help="LLM model to use for conversion (e.g., 'openrouter/openai/gpt-4o')",
)
@click.option(
    "-f",
    "--from",
    "source_format",
    type=click.Choice(STANDARDS.keys()),
    required=True,
    help="Source format",
)
@click.option(
    "-t",
    "--to",
    "target_format",
    type=click.Choice(STANDARDS.keys()),
    required=True,
    help="Target format",
)
@click.option(
    "-o",
    "--output",
    "output_file",
    type=click.File("w"),
    default=None,
    help="Output file name (by default prints to stdout)",
)
@click.option(
    "-v",
    "--verbose",
    is_flag=True,
    default=False,
    help="Print verbose output",
)
@click.argument("input_file", type=click.Path(exists=True))
def ai_convert(llm_model: str, source_format: str, target_format: str, input_file, output_file, verbose):
    """Convert metadata between formats using an LLM (experimental)."""
    try:
        input_data = load_file_autodetect(input_file)
    except Exception as e:
        click.echo(f"Failed to load file: {input_file}. {str(e)}", err=True)
        if verbose:
            traceback.print_exc()
        # BUG FIX: must stop here — previously fell through and hit an
        # unbound input_data (NameError) in the conversion step below
        return
    try:
        converted_data, _ = _convert_ai(llm_model, source_format, target_format, input_data)
    except Exception as e:
        click.echo(f"Error during AI-assisted conversion: {str(e)}", err=True)
        if verbose:
            traceback.print_exc()
        return

    # Serialize into the target standard's native format (json/yaml)
    output_format = STANDARDS[target_format]["format"]

    try:
        output_data = dump_data(converted_data, output_format)
    except Exception as e:
        click.echo(f"Error during serialization: {str(e)}", err=True)
        if verbose:
            traceback.print_exc()
        return

    if output_file:
        output_file.write(output_data)
        click.echo(f"Data written to {output_file.name}")
    else:
        click.echo(output_data)


def dump_data(data, format):
if format == "json":
return data.json()
Expand Down Expand Up @@ -136,4 +228,4 @@ def load_file_autodetect(file_path):
else:
raise ValueError(f"Unsupported file extension: {ext}.")
except Exception as e:
raise ValueError(f"Failed to load file: {file_path}. {str(e)}")
raise ValueError(f"Failed to load file: {file_path}. {str(e)}")
23 changes: 1 addition & 22 deletions codemeticulous/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,28 +4,7 @@
from codemeticulous.codemeta.convert import canonical_to_codemeta, codemeta_to_canonical
from codemeticulous.datacite.convert import canonical_to_datacite, datacite_to_canonical
from codemeticulous.cff.convert import canonical_to_cff, cff_to_canonical


STANDARDS = {
"codemeta": {
"model": CodeMeta,
"format": "json",
"to_canonical": codemeta_to_canonical,
"from_canonical": canonical_to_codemeta,
},
"datacite": {
"model": DataCite,
"format": "json",
"to_canonical": datacite_to_canonical,
"from_canonical": canonical_to_datacite,
},
"cff": {
"model": CitationFileFormat,
"format": "yaml",
"to_canonical": cff_to_canonical,
"from_canonical": canonical_to_cff,
},
}
from codemeticulous.standards import STANDARDS


def to_canonical(source_format: str, source_data):
Expand Down
8 changes: 4 additions & 4 deletions codemeticulous/datacite/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,10 +384,10 @@ def canonical_to_datacite(
# TODO: though, it may be possible to use the following codemeta fields:
# hasPart, isPartOf, readme, sameAs, review, releaseNotes
# relatedIdentifiers=data.relatedLink,
sizes=[data.fileSize] if data.fileSize else None,
formats=codemeta_language_fileformat_to_datacite_format(
data.programmingLanguage, data.fileFormat
),
# sizes=[data.fileSize] if data.fileSize else None,
# formats=codemeta_language_fileformat_to_datacite_format(
# data.programmingLanguage, data.fileFormat
# ),
Comment on lines +387 to +390
version=str(data.version) if data.version else None,
rightsList=codemeta_license_to_datacite_rights(data.license),
descriptions=descriptions,
Expand Down
Loading
Loading