Skip to content
49 changes: 34 additions & 15 deletions dandi/metadata/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,7 +332,7 @@ def extract_cellLine(metadata: dict) -> str | None:

NCBITAXON_URI_TEMPLATE = "http://purl.obolibrary.org/obo/NCBITaxon_{}"

# common_names, prefix, uri, name
# common_names, prefix, uri, name ({current name} - {GenBank common name})
species_map = [
Comment thread
yarikoptic marked this conversation as resolved.
(
["mouse"],
Expand All @@ -347,7 +347,7 @@ def extract_cellLine(metadata: dict) -> str | None:
"Homo sapiens - Human",
),
(
["rat", "norvegicus"],
["brown rat", "rat", "norvegicus"],
None,
NCBITAXON_URI_TEMPLATE.format("10116"),
"Rattus norvegicus - Norway rat",
Expand Down Expand Up @@ -386,13 +386,19 @@ def extract_cellLine(metadata: dict) -> str | None:
["c. elegans", "caenorhabditis elegans"],
"caenorhabditis",
NCBITAXON_URI_TEMPLATE.format("6239"),
"Caenorhabditis elegans",
"Caenorhabditis elegans - Roundworm",
),
(
["pig-tailed macaque", "pigtail monkey", "pigtail macaque"],
None,
NCBITAXON_URI_TEMPLATE.format("9545"),
"Macaca nemestrina",
"Macaca nemestrina - Pig-tailed macaque",
),
(
["bonnet macaque", "bonnet monkey", "radiata"],
None,
NCBITAXON_URI_TEMPLATE.format("9548"),
"Macaca radiata - Bonnet macaque",
),
(
["bonnet macaque", "bonnet monkey", "radiata"],
Expand All @@ -404,13 +410,13 @@ def extract_cellLine(metadata: dict) -> str | None:
["mongolian gerbil", "mongolian jird"],
None,
NCBITAXON_URI_TEMPLATE.format("10047"),
"Meriones unguiculatus",
"Meriones unguiculatus - Mongolian gerbil",
),
(
["common paper wasp"],
None,
NCBITAXON_URI_TEMPLATE.format("30207"),
"Polistes fuscatus",
"Polistes fuscatus - Common paper wasp",
),
]

Expand Down Expand Up @@ -459,7 +465,7 @@ def parse_purlobourl(

def extract_species(metadata: dict) -> models.SpeciesType | None:
value_orig = metadata.get("species", None)
value_id = None
value_matches: list[tuple[str, str | None]] = [] # of (value_id, value)
if value_orig is not None and value_orig != "":
if m := re.fullmatch(
r"https?://purl\.obolibrary\.org/obo/NCBITaxon_([0-9]+)/?",
Expand All @@ -469,8 +475,7 @@ def extract_species(metadata: dict) -> models.SpeciesType | None:
normed_value = NCBITAXON_URI_TEMPLATE.format(m[1])
for _common_names, _prefix, uri, name in species_map:
if uri == normed_value:
value_id = uri
value = name
value_matches.append((uri, name))
break
else:
value_id = value_orig
Expand All @@ -487,18 +492,24 @@ def extract_species(metadata: dict) -> models.SpeciesType | None:
value = " - ".join(
[result[key] for key in lookup if key in result]
)
value_matches.append((value_id, value))
else:
lower_value = value_orig.lower()
lower_value = value_orig.lower().strip()
for common_names, prefix, uri, name in species_map:
if (
lower_value == name.lower()
or any(key in lower_value for key in common_names)
or (prefix is not None and lower_value.startswith(prefix))
or any(lower_value == v.lower() for v in name.partition(" - "))
):
value_id = uri
value = name
break
if value_id is None:
value_matches.append((uri, name))
# only if no matches -- try to match by common names within common names
if not value_matches:
for common_names, prefix, uri, name in species_map:
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Towards consistency (+/- future class-based representation) the line above marked such fields as private in the scope, (not must sense to me) so might want to adjust that to match; https://github.com/dandi/dandi-cli/pull/1866/changes#diff-f1fffc52a57ec0f6b12d707f48d19c51f4dd81a4606d3535935d38e10e710bbdL470

Interesting though, https://github.com/dandi/dandi-cli/pull/1866/changes#diff-f1fffc52a57ec0f6b12d707f48d19c51f4dd81a4606d3535935d38e10e710bbdR498 didn't

if any(key == lower_value for key in common_names):
value_matches.append((uri, name))
Comment on lines +508 to +509
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Plus if this were a class unto itself you could define such equivalency logic as a method within it to make a little more sense from the outside looking in, such as .match(...), possible arguments exact or loose


value_matches = list(set(value_matches)) # unique values
if not value_matches:
raise ValueError(
f"Cannot interpret species field: {value_orig}. Please "
"contact help@dandiarchive.org to add your species. "
Expand All @@ -509,6 +520,14 @@ def extract_species(metadata: dict) -> models.SpeciesType | None:
"url for the species Homo sapiens. Please note that "
"this url is case sensitive."
)
elif len(value_matches) > 1:
raise ValueError(
f"Got multiple ({len(value_matches)}) species matched for "
f"{value_orig}: {value_matches}. Should not happen. "
"Either you need to be more specific or our code needs "
"fixing - contact help@dandiarchive.org."
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do like including such info in error since it gives the user something Actionable~(A) at least

)
value_id, value = value_matches[0]
return models.SpeciesType(identifier=value_id, name=value)
else:
return None
Expand Down
58 changes: 48 additions & 10 deletions dandi/tests/test_metadata.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

from datetime import datetime, timedelta
from itertools import chain
import json
from pathlib import Path
import shutil
Expand Down Expand Up @@ -762,6 +763,7 @@ def test_species():
"Meriones unguiculatus",
"Meriones Unguiculatus",
"meriones Unguiculatus",
"Meriones unguiculatus - Mongolian gerbil",
],
)
def test_species_all_possible(species: str) -> None:
Expand All @@ -770,21 +772,57 @@ def test_species_all_possible(species: str) -> None:
assert species_rec.model_dump(mode="json", exclude_none=True) == {
"identifier": "http://purl.obolibrary.org/obo/NCBITaxon_10047",
"schemaKey": "SpeciesType",
"name": "Meriones unguiculatus",
"name": "Meriones unguiculatus - Mongolian gerbil",
}


def test_extract_unknown_species():
with pytest.raises(ValueError) as excinfo:
extract_species({"species": "mumba-jumba"})
assert str(excinfo.value).startswith("Cannot interpret species field: mumba-jumba")
# tricky one since we have multiple rats but only one we want to map
# as the default "rat"
@pytest.mark.parametrize(
"species",
[
"rat",
"http://purl.obolibrary.org/obo/NCBITaxon_10116",
],
)
def test_species_rat(species: str) -> None:
species_rec = extract_species({"species": species})
assert species_rec
assert species_rec.model_dump(mode="json", exclude_none=True) == {
"identifier": "http://purl.obolibrary.org/obo/NCBITaxon_10116",
"schemaKey": "SpeciesType",
"name": "Rattus norvegicus - Norway rat",
}


def test_species_map():
# all alternative names should be lower case
for common_names, *_ in species_map:
for key in common_names:
assert key.lower() == key
@pytest.mark.parametrize(
"species",
[
"mumba-jumba",
"rat unknown",
"borat",
"my wonderful rat in pokadots",
"http://example.com/myrat",
],
)
def test_species_extract_unknown(species):
with pytest.raises(ValueError) as excinfo:
extract_species({"species": species})
assert str(excinfo.value).startswith(f"Cannot interpret species field: {species}")


@pytest.mark.parametrize("common_names,prefix,uri,name", species_map)
def test_species_map(common_names, prefix, uri, name):
# all alternative names should be in lower case
for key in common_names:
assert key.lower() == key
assert " - " in name
# verify that feeding a full "standard" name matches the correct one
for species in chain(name.split(" - "), common_names):
species_rec = extract_species({"species": species})
assert species_rec
assert str(species_rec.identifier) == uri
assert species_rec.name == name


@pytest.mark.parametrize(
Expand Down
Loading