-
Notifications
You must be signed in to change notification settings - Fork 33
ENH+BF: Format species names consistently in species_map, make matching more stringent for common names, improved testing #1866
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
1a2d7a9
5a9d473
e98375c
1177640
2eadb00
fbe2f6c
db81fa7
a8834c2
e85ef07
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -332,7 +332,7 @@ def extract_cellLine(metadata: dict) -> str | None: | |
|
|
||
| NCBITAXON_URI_TEMPLATE = "http://purl.obolibrary.org/obo/NCBITaxon_{}" | ||
|
|
||
| # common_names, prefix, uri, name | ||
| # common_names, prefix, uri, name ({current name} - {GenBank common name}) | ||
| species_map = [ | ||
| ( | ||
| ["mouse"], | ||
|
|
@@ -386,25 +386,31 @@ def extract_cellLine(metadata: dict) -> str | None: | |
| ["c. elegans", "caenorhabditis elegans"], | ||
| "caenorhabditis", | ||
| NCBITAXON_URI_TEMPLATE.format("6239"), | ||
| "Caenorhabditis elegans", | ||
| "Caenorhabditis elegans - Roundworm", | ||
| ), | ||
| ( | ||
| ["pig-tailed macaque", "pigtail monkey", "pigtail macaque"], | ||
| None, | ||
| NCBITAXON_URI_TEMPLATE.format("9545"), | ||
| "Macaca nemestrina", | ||
| "Macaca nemestrina - Pig-tailed macaque", | ||
| ), | ||
| ( | ||
| ["bonnet macaque", "bonnet monkey", "radiata"], | ||
| None, | ||
| NCBITAXON_URI_TEMPLATE.format("9548"), | ||
| "Macaca radiata - Bonnet macaque", | ||
| ), | ||
| ( | ||
| ["mongolian gerbil", "mongolian jird"], | ||
| None, | ||
| NCBITAXON_URI_TEMPLATE.format("10047"), | ||
| "Meriones unguiculatus", | ||
| "Meriones unguiculatus - Mongolian gerbil", | ||
| ), | ||
| ( | ||
| ["common paper wasp"], | ||
| None, | ||
| NCBITAXON_URI_TEMPLATE.format("30207"), | ||
| "Polistes fuscatus", | ||
| "Polistes fuscatus - Common paper wasp", | ||
| ), | ||
| ] | ||
|
|
||
|
|
@@ -484,14 +490,24 @@ def extract_species(metadata: dict) -> models.SpeciesType | None: | |
| else: | ||
| lower_value = value_orig.lower() | ||
| for common_names, prefix, uri, name in species_map: | ||
| scientific_name, _, common_name = name.partition(" - ") | ||
| if ( | ||
| lower_value == name.lower() | ||
| or any(key in lower_value for key in common_names) | ||
| or (prefix is not None and lower_value.startswith(prefix)) | ||
| or lower_value == scientific_name.lower() | ||
| or (common_name and lower_value == common_name.lower()) | ||
| ): | ||
| value_id = uri | ||
| value = name | ||
| break | ||
| else: | ||
| for common_names, prefix, uri, name in species_map: | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Towards consistency (+/- a future class-based representation): the line above marked such fields as private in the scope (which does not make much sense to me), so you might want to adjust this one to match; https://github.com/dandi/dandi-cli/pull/1866/changes#diff-f1fffc52a57ec0f6b12d707f48d19c51f4dd81a4606d3535935d38e10e710bbdL470 Interesting though, https://github.com/dandi/dandi-cli/pull/1866/changes#diff-f1fffc52a57ec0f6b12d707f48d19c51f4dd81a4606d3535935d38e10e710bbdR498 didn't |
||
| if ( | ||
| any(key in lower_value for key in common_names) | ||
| or (prefix is not None and lower_value.startswith(prefix)) | ||
| ): | ||
| value_id = uri | ||
| value = name | ||
| break | ||
|
yarikoptic marked this conversation as resolved.
Outdated
|
||
| if value_id is None: | ||
| raise ValueError( | ||
| f"Cannot interpret species field: {value_orig}. Please " | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.