-
Notifications
You must be signed in to change notification settings - Fork 3
feat(WIP): implement llm for metadata conversion #3
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
kellytea
wants to merge
14
commits into
SciCodes:main
Choose a base branch
from
kellytea:automate-conversion
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from 6 commits
Commits
Show all changes
14 commits
Select commit
Hold shift + click to select a range
a0e2726
feat(WIP): implement llm for metadata conversion
kellytea 1585ec8
feat: added llm converion
kellytea fd2a876
fix: simplified llm calls via instructor
kellytea 27c6edf
fix(WIP): improve "poor" llm model performance
kellytea 3a42cc5
feat: summmarize pydantic schemas with generated descriptions via llm
kellytea 74bfca7
feat: pruned schemas into llm conversion calls
kellytea e7fa909
fix(WIP): consolidate PR fixes
kellytea e67cdf9
fix: refactored schema generation + ai conversion logic
kellytea 588bba7
update README.md
kellytea 16c2e1a
update README.md
kellytea 728dbd8
test(WIP): first layer evaluation for llm's conversion performance
kellytea aeb471b
test(WIP): adding logging by llm model
kellytea fb4b493
test: refactored observability logs
kellytea 483e1c6
feat: added few shot examples
kellytea File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Some comments aren't visible on the classic Files Changed page.
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,101 @@ | ||
| import litellm, logging, re, json, os | ||
| from pydantic import BaseModel, ValidationError | ||
| from codemeticulous.standards import STANDARDS | ||
| from codemeticulous.prompt_strategies import DefaultPrompt | ||
| from codemeticulous.summarize_schema import get_schema_summary | ||
|
|
||
| logging.basicConfig(level=logging.INFO) | ||
|
|
||
| # Toggle for additional llm debugging | ||
| # litellm._turn_on_debug() | ||
|
|
||
|
|
||
def extract_json(llm_output: str) -> dict:
    """Parse a JSON object out of raw LLM output.

    LLMs frequently wrap JSON in a fenced markdown block despite being asked
    not to, so first try to pull the payload out of a ```json ... ``` fence
    (a bare ``` ... ``` fence is accepted too); otherwise treat the whole
    string as JSON.

    Args:
        llm_output: raw text returned by the model.

    Returns:
        The decoded JSON object.

    Raises:
        json.JSONDecodeError: if the extracted text is not valid JSON.
    """
    # Optional "json" language tag; DOTALL so the payload may span lines.
    json_match = re.search(r'```(?:json)?\s*(.*?)\s*```', llm_output, re.DOTALL)

    if json_match:
        json_str = json_match.group(1)
    else:
        # No code fence found: assume the whole string is the JSON object.
        json_str = llm_output.strip()

    # Lazy %-formatting: the (potentially large) payload is only rendered
    # when INFO logging is actually enabled.
    logging.info("Extracted JSON string: %s", json_str)
    return json.loads(json_str)
|
|
||
|
|
||
def structured_completion(llm_model: str, messages: list, target_model: BaseModel, key: str) -> BaseModel | None:
    """Run a chat completion and validate the reply against a pydantic model.

    Args:
        llm_model: litellm model string (e.g. "openrouter/openai/gpt-4o").
        messages: chat messages to send to the model.
        target_model: pydantic model class the LLM output must validate into.
        key: API key forwarded to litellm.

    Returns:
        An instance of ``target_model`` built from the LLM's JSON output.

    Raises:
        Exception: if the completion call or JSON extraction fails.
        ValidationError: if the JSON does not satisfy ``target_model``.
    """
    try:
        # FIXME: avoid mutating process-wide state. The key is also passed
        # via api_key=, but some providers may still read it from the
        # environment — confirm before removing this line.
        os.environ["OPENROUTER_API_KEY"] = key

        response = litellm.completion(
            model=llm_model,
            messages=messages,
            api_key=key,
        )
        output = extract_json(response.choices[0].message.content)
        logging.info(output)
    except Exception as e:
        logging.error("ERROR: structured output failed: %s", e)
        raise

    # Attempt to validate into a pydantic instance of the target model.
    try:
        return target_model(**output)
    except ValidationError as e:
        logging.error("Pydantic validation error: %s", e)
        raise
|
|
||
|
|
||
def convert_ai(key: str, llm_model: str, source_format: str, target_format: str, source_data):
    """Automate metadata standard conversion using an LLM.

    Args:
        key: API key for LLM authorization.
        llm_model: LLM model string (e.g., "openrouter/openai/gpt-4o").
        source_format: name of the source metadata standard (key in STANDARDS).
        target_format: name of the target metadata standard (key in STANDARDS).
        source_data: dict or pydantic.BaseModel instance representing the
            source metadata.

    Returns:
        An instance of the target standard's pydantic model.

    Raises:
        TypeError: if ``source_data`` is neither a dict nor an instance of
            the source standard's model.
        KeyError: if either format name is not registered in STANDARDS.
    """
    # Look up the pydantic models registered for each standard.
    source_model = STANDARDS[source_format]["model"]
    target_model = STANDARDS[target_format]["model"]

    # Normalize the source data into a pydantic model instance.
    if isinstance(source_data, dict):
        source_instance = source_model(**source_data)
    elif isinstance(source_data, source_model):
        source_instance = source_data
    else:
        # Previously neither branch matching left source_instance unbound,
        # producing a confusing UnboundLocalError further down.
        raise TypeError(
            f"source_data must be a dict or {source_model.__name__} instance, "
            f"got {type(source_data).__name__}"
        )

    # Create summarized schema of the source model, limited to the fields
    # the instance actually populates.
    source_schema_dict = get_schema_summary(source_model, llm_model, key, source_instance)
    source_schema = json.dumps(source_schema_dict, indent=2)

    # Serialize the target schema. schema_json() already returns a JSON
    # string, so only the dict produced by the fallback needs json.dumps
    # (the original passed both through json.dumps, double-encoding the
    # built-in schema into an escaped quoted string).
    try:
        target_schema = target_model.schema_json()
    except Exception:
        logging.warning(
            "Failed to serialize via Pydantic's built-in schema, "
            "falling back to manually creating schema"
        )
        target_schema = json.dumps(
            get_schema_summary(target_model, llm_model, key), indent=2
        )

    strategy = DefaultPrompt()
    messages = strategy.generate_system_prompt(source_instance, source_schema, target_schema)

    target_data = structured_completion(llm_model, messages, target_model, key)

    return target_data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,39 @@ | ||
| from abc import ABC, abstractmethod | ||
|
|
||
class PromptStrategy(ABC):
    """Interface for building the chat messages sent to the conversion LLM."""

    @abstractmethod
    def generate_system_prompt(self, source_instance, source_schema, target_schema) -> list:
        """Return the list of chat messages for a conversion request.

        Signature now matches the concrete implementations (the original
        abstract method declared only ``source_instance``, disagreeing with
        the three-argument override below).
        """
        pass


class DefaultPrompt(PromptStrategy):
    """Default prompt strategy: one system instruction plus the three inputs."""

    PROMPT = """
    Your task is to convert source metadata from one format to another using the provided schemas.

    INPUTS PROVIDED:
    - Source data: A Pydantic model instance containing the original metadata
    - Source schema: A JSON object containing the source Pydantic model's fields and descriptions
    - Target schema: The Pydantic model definition for the output format

    INSTRUCTIONS:
    1. Analyze the source data and understand its structure.
    2. Extract and map the relevant fields from the source data to the corresponding fields in the target format.
    3. Transform data types and structures as needed to match the target schema requirements which could either be one-to-one or complex transformations.
    4. Instantiate the target model using the mapped and transformed data, so that a new instance of the target Pydantic model can be created.

    OUTPUT REQUIREMENTS:
    - Return ONLY a raw JSON without any further encoding such as escaping quotes.
    - Ensure all required fields in the target schema are populated
    - Use appropriate data types as defined in the target schema

    The final output must be an instance of the target model schema that can be successfully validated by Pydantic.
    """

    def generate_system_prompt(self, source_instance, source_schema, target_schema) -> list:
        """Build the message list: system prompt, then data, source schema, target schema.

        Args:
            source_instance: pydantic instance of the source metadata
                (serialized via its .json() method).
            source_schema: JSON string summarizing the source model's fields.
            target_schema: JSON string of the target model's schema.
        """
        return [
            {"role": "system", "content": self.PROMPT},
            {"role": "user", "content": "SOURCE_DATA:\n" + source_instance.json()},
            # Label made consistent with the other NAME:\n headers
            # (was "SOURCE SCHEMA" with no colon).
            {"role": "user", "content": "SOURCE_SCHEMA:\n" + source_schema},
            {"role": "user", "content": "TARGET_MODEL:\n" + target_schema}
        ]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,27 @@ | ||
| from codemeticulous.codemeta.models import CodeMeta | ||
| from codemeticulous.datacite.models import DataCite | ||
| from codemeticulous.cff.models import CitationFileFormat | ||
| from codemeticulous.codemeta.convert import canonical_to_codemeta, codemeta_to_canonical | ||
| from codemeticulous.datacite.convert import canonical_to_datacite, datacite_to_canonical | ||
| from codemeticulous.cff.convert import canonical_to_cff, cff_to_canonical | ||
|
|
||
# Registry of supported metadata standards, keyed by short name.
# Each entry provides:
#   "model"          - the pydantic model class for the standard
#   "format"         - the standard's on-disk serialization format
#   "to_canonical"   - converter from the standard's model to the
#                      project's canonical representation
#   "from_canonical" - converter from the canonical representation
#                      back to the standard's model
STANDARDS = {
    "codemeta": {
        "model": CodeMeta,
        "format": "json",
        "to_canonical": codemeta_to_canonical,
        "from_canonical": canonical_to_codemeta,
    },
    "datacite": {
        "model": DataCite,
        "format": "json",
        "to_canonical": datacite_to_canonical,
        "from_canonical": canonical_to_datacite,
    },
    "cff": {
        "model": CitationFileFormat,
        "format": "yaml",
        "to_canonical": cff_to_canonical,
        "from_canonical": canonical_to_cff,
    },
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,73 @@ | ||
| from pydantic import BaseModel | ||
| import re | ||
| import json | ||
| import logging | ||
| import litellm | ||
|
|
||
def generate_desc(model_name: str, data, llm_model: str, api_key: str):
    """Ask the LLM to append a short description to each [field, type] pair.

    Args:
        model_name: name of the pydantic model the fields belong to.
        data: list of [field_name, field_type] pairs.
        llm_model: litellm model string.
        api_key: API key forwarded to litellm.

    Returns:
        The raw (stripped) LLM response, expected to be a JSON array of
        [field_name, field_type, description] triples.

    Raises:
        Exception: propagated if the completion call fails.
    """
    prompt = f"""
    For a Pydantic model '{model_name}', we have a list of lists, each containing a field and their field type.

    In one or two sentences, please provide brief descriptions of each field in relation to the model at the end of each sub-list.
    Your response should be in a valid array consisting of the field name, field type, and new descriptions.
    Please do not include outside explanatory text or unnecessary formatting syntax so your response can be piped into 'json.loads()'.

    Here is the data:
    {data}
    """

    try:
        response = litellm.completion(
            messages=[{"role": "user", "content": prompt}],
            api_key=api_key,
            model=llm_model
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        # Use the logging module (configured elsewhere in this package)
        # instead of print(), so failures appear in the application logs.
        logging.error("ERROR: structured output failed: %s", e)
        raise
|
|
||
|
|
||
def get_schema_summary(source_model: type[BaseModel], llm_model: str, api_key: str, instance_data: BaseModel = None):
    """Build a summarized schema (fields plus LLM-generated descriptions).

    Args:
        source_model: pydantic model class to summarize.
        llm_model: litellm model string used to generate the descriptions.
        api_key: API key forwarded to the LLM call.
        instance_data: optional model instance; when given, fields that are
            None on the instance are omitted (keeps the prompt small and
            limited to fields the data actually uses).

    Returns:
        A dict with "model_name" and "fields" keys, or None if the LLM
        response could not be parsed as JSON (the failure is logged).
    """
    # Collect [field_name, field_type] pairs for the prompt.
    fields = []
    for field_name, model_field in source_model.__fields__.items():
        # If there's an instance and the field isn't referenced in it, skip.
        if instance_data is not None and getattr(instance_data, field_name) is None:
            continue
        fields.append([field_name, model_field.annotation])

    # Call the LLM to generate each field's description.
    llm_response = generate_desc(source_model.__name__, fields, llm_model, api_key)

    # Clean up the response in case the payload is wrapped in extra text:
    # keep only the outermost {...} or [...] span.
    match = re.search(r'\{.*\}|\[.*\]', llm_response, re.DOTALL)
    if match:
        llm_response = match.group(0)

    try:
        field_descriptions = json.loads(llm_response)
    except Exception as e:
        # The original call passed the exception as an unused logging arg
        # after a placeholder-less f-string, silently dropping the cause;
        # lazy %-formatting records it properly.
        logging.error("ERROR: failed to create list from llm response: %s", e)
        return None

    return {
        "model_name": source_model.__name__,
        "fields": field_descriptions,
    }
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
lets remove the key getting passed around and just assume the environment variable is set. A bit more secure and lets us support any api that litellm does with no extra work