-
Notifications
You must be signed in to change notification settings - Fork 3
feat(WIP): implement llm for metadata conversion #3
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
kellytea
wants to merge
14
commits into
SciCodes:main
Choose a base branch
from
kellytea:automate-conversion
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from 6 commits
Commits
Show all changes
14 commits
Select commit
Hold shift + click to select a range
a0e2726
feat(WIP): implement llm for metadata conversion
kellytea 1585ec8
feat: added llm converion
kellytea fd2a876
fix: simplified llm calls via instructor
kellytea 27c6edf
fix(WIP): improve "poor" llm model performance
kellytea 3a42cc5
feat: summmarize pydantic schemas with generated descriptions via llm
kellytea 74bfca7
feat: pruned schemas into llm conversion calls
kellytea e7fa909
fix(WIP): consolidate PR fixes
kellytea e67cdf9
fix: refactored schema generation + ai conversion logic
kellytea 588bba7
update README.md
kellytea 16c2e1a
update README.md
kellytea 728dbd8
test(WIP): first layer evaluation for llm's conversion performance
kellytea aeb471b
test(WIP): adding logging by llm model
kellytea fb4b493
test: refactored observability logs
kellytea 483e1c6
feat: added few shot examples
kellytea File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Some comments aren't visible on the classic Files Changed page.
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,101 @@ | ||
| import litellm, logging, re, json, os | ||
| from pydantic import BaseModel, ValidationError | ||
| from codemeticulous.standards import STANDARDS | ||
| from codemeticulous.prompt_strategies import DefaultPrompt | ||
| from codemeticulous.summarize_schema import get_schema_summary | ||
|
|
||
| logging.basicConfig(level=logging.INFO) | ||
|
|
||
| # Toggle for additional llm debugging | ||
| # litellm._turn_on_debug() | ||
|
|
||
|
|
||
def extract_json(llm_output: str) -> dict:
    """Parse a JSON object out of raw LLM output.

    LLMs frequently wrap JSON in a fenced markdown block despite being asked
    not to, so first try to pull the payload out of a ```json ... ``` fence
    (a bare ``` ... ``` fence is accepted too); otherwise treat the whole
    string as JSON.

    Args:
        llm_output: raw text returned by the model.

    Returns:
        The decoded JSON object.

    Raises:
        json.JSONDecodeError: if the extracted text is not valid JSON.
    """
    # Optional "json" language tag; DOTALL so the payload may span lines.
    json_match = re.search(r'```(?:json)?\s*(.*?)\s*```', llm_output, re.DOTALL)

    if json_match:
        json_str = json_match.group(1)
    else:
        # No code fence found: assume the whole string is the JSON object.
        json_str = llm_output.strip()

    # Lazy %-formatting: the (potentially large) payload is only rendered
    # when INFO logging is actually enabled.
    logging.info("Extracted JSON string: %s", json_str)
    return json.loads(json_str)
|
|
||
|
|
||
def structured_completion(llm_model: str, messages: list, target_model: BaseModel, key: str) -> BaseModel | None:
    """Run a chat completion and validate the reply against a pydantic model.

    Args:
        llm_model: litellm model string (e.g. "openrouter/openai/gpt-4o").
        messages: chat messages to send to the model.
        target_model: pydantic model class the LLM output must validate into.
        key: API key forwarded to litellm.

    Returns:
        An instance of ``target_model`` built from the LLM's JSON output.

    Raises:
        Exception: if the completion call or JSON extraction fails.
        ValidationError: if the JSON does not satisfy ``target_model``.
    """
    try:
        # FIXME: avoid mutating process-wide state. The key is also passed
        # via api_key=, but some providers may still read it from the
        # environment — confirm before removing this line.
        os.environ["OPENROUTER_API_KEY"] = key

        response = litellm.completion(
            model=llm_model,
            messages=messages,
            api_key=key,
        )
        output = extract_json(response.choices[0].message.content)
        logging.info(output)
    except Exception as e:
        logging.error("ERROR: structured output failed: %s", e)
        raise

    # Attempt to validate into a pydantic instance of the target model.
    try:
        return target_model(**output)
    except ValidationError as e:
        logging.error("Pydantic validation error: %s", e)
        raise
|
|
||
|
|
||
def convert_ai(key: str, llm_model: str, source_format: str, target_format: str, source_data):
    """Automate metadata standard conversion using an LLM.

    Args:
        key: API key for LLM authorization.
        llm_model: LLM model string (e.g., "openrouter/openai/gpt-4o").
        source_format: name of the source metadata standard (key in STANDARDS).
        target_format: name of the target metadata standard (key in STANDARDS).
        source_data: dict or pydantic.BaseModel instance representing the
            source metadata.

    Returns:
        An instance of the target standard's pydantic model.

    Raises:
        TypeError: if ``source_data`` is neither a dict nor an instance of
            the source standard's model.
        KeyError: if either format name is not registered in STANDARDS.
    """
    # Look up the pydantic models registered for each standard.
    source_model = STANDARDS[source_format]["model"]
    target_model = STANDARDS[target_format]["model"]

    # Normalize the source data into a pydantic model instance.
    if isinstance(source_data, dict):
        source_instance = source_model(**source_data)
    elif isinstance(source_data, source_model):
        source_instance = source_data
    else:
        # Previously neither branch matching left source_instance unbound,
        # producing a confusing UnboundLocalError further down.
        raise TypeError(
            f"source_data must be a dict or {source_model.__name__} instance, "
            f"got {type(source_data).__name__}"
        )

    # Create summarized schema of the source model, limited to the fields
    # the instance actually populates.
    source_schema_dict = get_schema_summary(source_model, llm_model, key, source_instance)
    source_schema = json.dumps(source_schema_dict, indent=2)

    # Serialize the target schema. schema_json() already returns a JSON
    # string, so only the dict produced by the fallback needs json.dumps
    # (the original passed both through json.dumps, double-encoding the
    # built-in schema into an escaped quoted string).
    try:
        target_schema = target_model.schema_json()
    except Exception:
        logging.warning(
            "Failed to serialize via Pydantic's built-in schema, "
            "falling back to manually creating schema"
        )
        target_schema = json.dumps(
            get_schema_summary(target_model, llm_model, key), indent=2
        )

    strategy = DefaultPrompt()
    messages = strategy.generate_system_prompt(source_instance, source_schema, target_schema)

    target_data = structured_completion(llm_model, messages, target_model, key)

    return target_data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,39 @@ | ||
| from abc import ABC, abstractmethod | ||
|
|
||
class PromptStrategy(ABC):
    """Interface for building the chat messages sent to the conversion LLM."""

    @abstractmethod
    def generate_system_prompt(self, source_instance, source_schema, target_schema) -> list:
        """Return the list of chat messages for a conversion request.

        Signature now matches the concrete implementations (the original
        abstract method declared only ``source_instance``, disagreeing with
        the three-argument override below).
        """
        pass


class DefaultPrompt(PromptStrategy):
    """Default prompt strategy: one system instruction plus the three inputs."""

    PROMPT = """
    Your task is to convert source metadata from one format to another using the provided schemas.

    INPUTS PROVIDED:
    - Source data: A Pydantic model instance containing the original metadata
    - Source schema: A JSON object containing the source Pydantic model's fields and descriptions
    - Target schema: The Pydantic model definition for the output format

    INSTRUCTIONS:
    1. Analyze the source data and understand its structure.
    2. Extract and map the relevant fields from the source data to the corresponding fields in the target format.
    3. Transform data types and structures as needed to match the target schema requirements which could either be one-to-one or complex transformations.
    4. Instantiate the target model using the mapped and transformed data, so that a new instance of the target Pydantic model can be created.

    OUTPUT REQUIREMENTS:
    - Return ONLY a raw JSON without any further encoding such as escaping quotes.
    - Ensure all required fields in the target schema are populated
    - Use appropriate data types as defined in the target schema

    The final output must be an instance of the target model schema that can be successfully validated by Pydantic.
    """

    def generate_system_prompt(self, source_instance, source_schema, target_schema) -> list:
        """Build the message list: system prompt, then data, source schema, target schema.

        Args:
            source_instance: pydantic instance of the source metadata
                (serialized via its .json() method).
            source_schema: JSON string summarizing the source model's fields.
            target_schema: JSON string of the target model's schema.
        """
        return [
            {"role": "system", "content": self.PROMPT},
            {"role": "user", "content": "SOURCE_DATA:\n" + source_instance.json()},
            # Label made consistent with the other NAME:\n headers
            # (was "SOURCE SCHEMA" with no colon).
            {"role": "user", "content": "SOURCE_SCHEMA:\n" + source_schema},
            {"role": "user", "content": "TARGET_MODEL:\n" + target_schema}
        ]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,27 @@ | ||
| from codemeticulous.codemeta.models import CodeMeta | ||
| from codemeticulous.datacite.models import DataCite | ||
| from codemeticulous.cff.models import CitationFileFormat | ||
| from codemeticulous.codemeta.convert import canonical_to_codemeta, codemeta_to_canonical | ||
| from codemeticulous.datacite.convert import canonical_to_datacite, datacite_to_canonical | ||
| from codemeticulous.cff.convert import canonical_to_cff, cff_to_canonical | ||
|
|
||
# Registry of supported metadata standards, keyed by short name.
# Each entry provides:
#   "model"          - the pydantic model class for the standard
#   "format"         - the standard's on-disk serialization format
#   "to_canonical"   - converter from the standard's model to the
#                      project's canonical representation
#   "from_canonical" - converter from the canonical representation
#                      back to the standard's model
STANDARDS = {
    "codemeta": {
        "model": CodeMeta,
        "format": "json",
        "to_canonical": codemeta_to_canonical,
        "from_canonical": canonical_to_codemeta,
    },
    "datacite": {
        "model": DataCite,
        "format": "json",
        "to_canonical": datacite_to_canonical,
        "from_canonical": canonical_to_datacite,
    },
    "cff": {
        "model": CitationFileFormat,
        "format": "yaml",
        "to_canonical": cff_to_canonical,
        "from_canonical": canonical_to_cff,
    },
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,73 @@ | ||
| from pydantic import BaseModel | ||
| import re | ||
| import json | ||
| import logging | ||
| import litellm | ||
|
|
||
def generate_desc(model_name: str, data, llm_model: str, api_key: str):
    """Ask the LLM to append a short description to each [field, type] pair.

    Args:
        model_name: name of the pydantic model the fields belong to.
        data: list of [field_name, field_type] pairs.
        llm_model: litellm model string.
        api_key: API key forwarded to litellm.

    Returns:
        The raw (stripped) LLM response, expected to be a JSON array of
        [field_name, field_type, description] triples.

    Raises:
        Exception: propagated if the completion call fails.
    """
    prompt = f"""
    For a Pydantic model '{model_name}', we have a list of lists, each containing a field and their field type.

    In one or two sentences, please provide brief descriptions of each field in relation to the model at the end of each sub-list.
    Your response should be in a valid array consisting of the field name, field type, and new descriptions.
    Please do not include outside explanatory text or unnecessary formatting syntax so your response can be piped into 'json.loads()'.

    Here is the data:
    {data}
    """

    try:
        response = litellm.completion(
            messages=[{"role": "user", "content": prompt}],
            api_key=api_key,
            model=llm_model
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        # Use the logging module (configured elsewhere in this package)
        # instead of print(), so failures appear in the application logs.
        logging.error("ERROR: structured output failed: %s", e)
        raise
|
|
||
|
|
||
def get_schema_summary(source_model: type[BaseModel], llm_model: str, api_key: str, instance_data: BaseModel = None):
    """Build a summarized schema (fields plus LLM-generated descriptions).

    Args:
        source_model: pydantic model class to summarize.
        llm_model: litellm model string used to generate the descriptions.
        api_key: API key forwarded to the LLM call.
        instance_data: optional model instance; when given, fields that are
            None on the instance are omitted (keeps the prompt small and
            limited to fields the data actually uses).

    Returns:
        A dict with "model_name" and "fields" keys, or None if the LLM
        response could not be parsed as JSON (the failure is logged).
    """
    # Collect [field_name, field_type] pairs for the prompt.
    fields = []
    for field_name, model_field in source_model.__fields__.items():
        # If there's an instance and the field isn't referenced in it, skip.
        if instance_data is not None and getattr(instance_data, field_name) is None:
            continue
        fields.append([field_name, model_field.annotation])

    # Call the LLM to generate each field's description.
    llm_response = generate_desc(source_model.__name__, fields, llm_model, api_key)

    # Clean up the response in case the payload is wrapped in extra text:
    # keep only the outermost {...} or [...] span.
    match = re.search(r'\{.*\}|\[.*\]', llm_response, re.DOTALL)
    if match:
        llm_response = match.group(0)

    try:
        field_descriptions = json.loads(llm_response)
    except Exception as e:
        # The original call passed the exception as an unused logging arg
        # after a placeholder-less f-string, silently dropping the cause;
        # lazy %-formatting records it properly.
        logging.error("ERROR: failed to create list from llm response: %s", e)
        return None

    return {
        "model_name": source_model.__name__,
        "fields": field_descriptions,
    }
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
lets remove the key getting passed around and just assume the environment variable is set. A bit more secure and lets us support any api that litellm does with no extra work