dandi · yarikoptic · Jun 1, 2026 · May 28, 2026 · May 28, 2026 · May 28, 2026
diff --git a/dandi/metadata/util.py b/dandi/metadata/util.py
@@ -332,7 +332,7 @@ def extract_cellLine(metadata: dict) -> str | None:
 
 NCBITAXON_URI_TEMPLATE = "http://purl.obolibrary.org/obo/NCBITaxon_{}"
 
-# common_names, prefix, uri, name
+# common_names, prefix, uri, name ({current name} - {GenBank common name})
 species_map = [
     (
         ["mouse"],
@@ -347,7 +347,7 @@ def extract_cellLine(metadata: dict) -> str | None:
         "Homo sapiens - Human",
     ),
     (
-        ["rat", "norvegicus"],
+        ["brown rat", "rat", "norvegicus"],
         None,
         NCBITAXON_URI_TEMPLATE.format("10116"),
         "Rattus norvegicus - Norway rat",
@@ -386,13 +386,19 @@ def extract_cellLine(metadata: dict) -> str | None:
         ["c. elegans", "caenorhabditis elegans"],
         "caenorhabditis",
         NCBITAXON_URI_TEMPLATE.format("6239"),
-        "Caenorhabditis elegans",
+        "Caenorhabditis elegans - Roundworm",
     ),
     (
         ["pig-tailed macaque", "pigtail monkey", "pigtail macaque"],
         None,
         NCBITAXON_URI_TEMPLATE.format("9545"),
-        "Macaca nemestrina",
+        "Macaca nemestrina - Pig-tailed macaque",
+    ),
+    (
+        ["bonnet macaque", "bonnet monkey", "radiata"],
+        None,
+        NCBITAXON_URI_TEMPLATE.format("9548"),
+        "Macaca radiata - Bonnet macaque",
     ),
     (
         ["bonnet macaque", "bonnet monkey", "radiata"],
@@ -404,13 +410,13 @@ def extract_cellLine(metadata: dict) -> str | None:
         ["mongolian gerbil", "mongolian jird"],
         None,
         NCBITAXON_URI_TEMPLATE.format("10047"),
-        "Meriones unguiculatus",
+        "Meriones unguiculatus - Mongolian gerbil",
     ),
     (
         ["common paper wasp"],
         None,
         NCBITAXON_URI_TEMPLATE.format("30207"),
-        "Polistes fuscatus",
+        "Polistes fuscatus - Common paper wasp",
     ),
 ]
 
@@ -459,7 +465,7 @@ def parse_purlobourl(
 
 def extract_species(metadata: dict) -> models.SpeciesType | None:
     value_orig = metadata.get("species", None)
-    value_id = None
+    value_matches: list[tuple[str, str | None]] = []  # of (value_id, value)
     if value_orig is not None and value_orig != "":
         if m := re.fullmatch(
             r"https?://purl\.obolibrary\.org/obo/NCBITaxon_([0-9]+)/?",
@@ -469,8 +475,7 @@ def extract_species(metadata: dict) -> models.SpeciesType | None:
             normed_value = NCBITAXON_URI_TEMPLATE.format(m[1])
             for _common_names, _prefix, uri, name in species_map:
                 if uri == normed_value:
-                    value_id = uri
-                    value = name
+                    value_matches.append((uri, name))
                     break
             else:
                 value_id = value_orig
@@ -487,18 +492,24 @@ def extract_species(metadata: dict) -> models.SpeciesType | None:
                         value = " - ".join(
                             [result[key] for key in lookup if key in result]
                         )
+                value_matches.append((value_id, value))
         else:
-            lower_value = value_orig.lower()
+            lower_value = value_orig.lower().strip()
             for common_names, prefix, uri, name in species_map:
                 if (
                     lower_value == name.lower()
-                    or any(key in lower_value for key in common_names)
                     or (prefix is not None and lower_value.startswith(prefix))
+                    or any(lower_value == v.lower() for v in name.partition(" - "))
                 ):
-                    value_id = uri
-                    value = name
-                    break
-        if value_id is None:
+                    value_matches.append((uri, name))
+            # only if no matches -- try to match by common names within common names
+            if not value_matches:
+                for common_names, prefix, uri, name in species_map:
+                    if any(key == lower_value for key in common_names):
+                        value_matches.append((uri, name))
+
+        value_matches = list(set(value_matches))  # unique values
+        if not value_matches:
             raise ValueError(
                 f"Cannot interpret species field: {value_orig}. Please "
                 "contact help@dandiarchive.org to add your species. "
@@ -509,6 +520,14 @@ def extract_species(metadata: dict) -> models.SpeciesType | None:
                 "url for the species Homo sapiens. Please note that "
                 "this url is case sensitive."
             )
+        elif len(value_matches) > 1:
+            raise ValueError(
+                f"Got multiple ({len(value_matches)}) species matched for "
+                f"{value_orig}: {value_matches}.  Should not happen. "
+                "Either you need to be more specific or our code needs "
+                "fixing - contact help@dandiarchive.org."
+            )
+        value_id, value = value_matches[0]
         return models.SpeciesType(identifier=value_id, name=value)
     else:
         return None

diff --git a/dandi/tests/test_metadata.py b/dandi/tests/test_metadata.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 from datetime import datetime, timedelta
+from itertools import chain
 import json
 from pathlib import Path
 import shutil
@@ -762,6 +763,7 @@ def test_species():
         "Meriones unguiculatus",
         "Meriones Unguiculatus",
         "meriones Unguiculatus",
+        "Meriones unguiculatus - Mongolian gerbil",
     ],
 )
 def test_species_all_possible(species: str) -> None:
@@ -770,21 +772,57 @@ def test_species_all_possible(species: str) -> None:
     assert species_rec.model_dump(mode="json", exclude_none=True) == {
         "identifier": "http://purl.obolibrary.org/obo/NCBITaxon_10047",
         "schemaKey": "SpeciesType",
-        "name": "Meriones unguiculatus",
+        "name": "Meriones unguiculatus - Mongolian gerbil",
     }
 
 
-def test_extract_unknown_species():
-    with pytest.raises(ValueError) as excinfo:
-        extract_species({"species": "mumba-jumba"})
-    assert str(excinfo.value).startswith("Cannot interpret species field: mumba-jumba")
+# tricky one since we have multiple rats but only one we want to map
+# as the default "rat"
+@pytest.mark.parametrize(
+    "species",
+    [
+        "rat",
+        "http://purl.obolibrary.org/obo/NCBITaxon_10116",
+    ],
+)
+def test_species_rat(species: str) -> None:
+    species_rec = extract_species({"species": species})
+    assert species_rec
+    assert species_rec.model_dump(mode="json", exclude_none=True) == {
+        "identifier": "http://purl.obolibrary.org/obo/NCBITaxon_10116",
+        "schemaKey": "SpeciesType",
+        "name": "Rattus norvegicus - Norway rat",
+    }
 
 
-def test_species_map():
-    # all alternative names should be lower case
-    for common_names, *_ in species_map:
-        for key in common_names:
-            assert key.lower() == key
+@pytest.mark.parametrize(
+    "species",
+    [
+        "mumba-jumba",
+        "rat unknown",
+        "borat",
+        "my wonderful rat in pokadots",
+        "http://example.com/myrat",
+    ],
+)
+def test_species_extract_unknown(species):
+    with pytest.raises(ValueError) as excinfo:
+        extract_species({"species": species})
+    assert str(excinfo.value).startswith(f"Cannot interpret species field: {species}")
+
+
+@pytest.mark.parametrize("common_names,prefix,uri,name", species_map)
+def test_species_map(common_names, prefix, uri, name):
+    # all alternative names should be in lower case
+    for key in common_names:
+        assert key.lower() == key
+    assert " - " in name
+    # verify that feeding a full "standard" name matches the correct one
+    for species in chain(name.split(" - "), common_names):
+        species_rec = extract_species({"species": species})
+        assert species_rec
+        assert str(species_rec.identifier) == uri
+        assert species_rec.name == name
 
 
 @pytest.mark.parametrize(