Skip to content
30 changes: 23 additions & 7 deletions dandi/metadata/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,7 +332,7 @@ def extract_cellLine(metadata: dict) -> str | None:

NCBITAXON_URI_TEMPLATE = "http://purl.obolibrary.org/obo/NCBITaxon_{}"

# common_names, prefix, uri, name
# common_names, prefix, uri, name ({current name} - {GenBank common name})
species_map = [
Comment thread
yarikoptic marked this conversation as resolved.
(
["mouse"],
Expand Down Expand Up @@ -386,25 +386,31 @@ def extract_cellLine(metadata: dict) -> str | None:
["c. elegans", "caenorhabditis elegans"],
"caenorhabditis",
NCBITAXON_URI_TEMPLATE.format("6239"),
"Caenorhabditis elegans",
"Caenorhabditis elegans - Roundworm",
),
(
["pig-tailed macaque", "pigtail monkey", "pigtail macaque"],
None,
NCBITAXON_URI_TEMPLATE.format("9545"),
"Macaca nemestrina",
"Macaca nemestrina - Pig-tailed macaque",
),
(
["bonnet macaque", "bonnet monkey", "radiata"],
None,
NCBITAXON_URI_TEMPLATE.format("9548"),
"Macaca radiata - Bonnet macaque",
),
(
["mongolian gerbil", "mongolian jird"],
None,
NCBITAXON_URI_TEMPLATE.format("10047"),
"Meriones unguiculatus",
"Meriones unguiculatus - Mongolian gerbil",
),
(
["common paper wasp"],
None,
NCBITAXON_URI_TEMPLATE.format("30207"),
"Polistes fuscatus",
"Polistes fuscatus - Common paper wasp",
),
]

Expand Down Expand Up @@ -484,14 +490,24 @@ def extract_species(metadata: dict) -> models.SpeciesType | None:
else:
lower_value = value_orig.lower()
for common_names, prefix, uri, name in species_map:
scientific_name, _, common_name = name.partition(" - ")
if (
lower_value == name.lower()
or any(key in lower_value for key in common_names)
or (prefix is not None and lower_value.startswith(prefix))
or lower_value == scientific_name.lower()
or (common_name and lower_value == common_name.lower())
):
value_id = uri
value = name
break
else:
for common_names, prefix, uri, name in species_map:
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Towards consistency (+/- a future class-based representation): the line above marked such fields as private in the scope (which does not make much sense to me), so you might want to adjust this one to match; https://github.com/dandi/dandi-cli/pull/1866/changes#diff-f1fffc52a57ec0f6b12d707f48d19c51f4dd81a4606d3535935d38e10e710bbdL470

Interesting though, https://github.com/dandi/dandi-cli/pull/1866/changes#diff-f1fffc52a57ec0f6b12d707f48d19c51f4dd81a4606d3535935d38e10e710bbdR498 didn't

if (
any(key in lower_value for key in common_names)
or (prefix is not None and lower_value.startswith(prefix))
):
value_id = uri
value = name
break
Comment thread
yarikoptic marked this conversation as resolved.
Outdated
if value_id is None:
raise ValueError(
f"Cannot interpret species field: {value_orig}. Please "
Expand Down
10 changes: 8 additions & 2 deletions dandi/tests/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -770,7 +770,7 @@ def test_species_all_possible(species: str) -> None:
assert species_rec.model_dump(mode="json", exclude_none=True) == {
"identifier": "http://purl.obolibrary.org/obo/NCBITaxon_10047",
"schemaKey": "SpeciesType",
"name": "Meriones unguiculatus",
"name": "Meriones unguiculatus - Mongolian gerbil",
}


Expand All @@ -782,9 +782,15 @@ def test_extract_unknown_species():

def test_species_map() -> None:
    """Check that every ``species_map`` entry is well-formed and resolvable.

    For each entry: all alternative (common) names must be lower case, the
    name must follow the "{current name} - {GenBank common name}" form, and
    each part of that name, when passed to ``extract_species``, must resolve
    back to the entry's own URI and full name.
    """
    for common_names, _, uri, name in species_map:
        # all alternative names should be lower case
        for key in common_names:
            assert key.lower() == key
        assert " - " in name
        # each half of the name must map back to this very record
        for species in name.split(" - "):
            species_rec = extract_species({"species": species})
            assert species_rec
            assert str(species_rec.identifier) == uri
            assert species_rec.name == name


@pytest.mark.parametrize(
Expand Down