diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml index 794da84d7..8ff58a555 100644 --- a/.github/workflows/ci-tests.yml +++ b/.github/workflows/ci-tests.yml @@ -105,11 +105,12 @@ jobs: LUA_VERSION: ${{ matrix.lua }} - name: Install test prerequisites (apt) - run: sudo apt-get install -y -qq python3-pytest python3-pytest-asyncio uvicorn python3-falcon python3-aiosqlite python3-pyosmium + run: sudo apt-get install -y -qq python3-pytest python3-pytest-asyncio uvicorn python3-falcon python3-aiosqlite python3-pyosmium python3-unidecode if: matrix.dependencies == 'apt' - name: Install test prerequisites (pip) - run: ./venv/bin/pip install pytest-asyncio falcon starlette asgi_lifespan aiosqlite osmium uvicorn + run: | + ./venv/bin/pip install pytest-asyncio falcon starlette asgi_lifespan aiosqlite osmium uvicorn cantonese-romanisation unidecode opencc if: matrix.dependencies == 'pip' - name: Install test prerequisites diff --git a/.gitignore b/.gitignore index 23bcdb1c6..e90917d21 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,4 @@ dist .coverage .vagrant -data/country_osm_grid.sql.gz +data/country_osm_grid.sql.gz \ No newline at end of file diff --git a/packaging/nominatim-api/pyproject.toml b/packaging/nominatim-api/pyproject.toml index c2d42bd5d..765cf89f3 100644 --- a/packaging/nominatim-api/pyproject.toml +++ b/packaging/nominatim-api/pyproject.toml @@ -23,6 +23,13 @@ dependencies = [ ] dynamic = ["version"] +[project.optional-dependencies] +transliteration = [ + "cantonese-romanisation", + "unidecode", + "opencc" +] + [project.urls] Homepage = "https://nominatim.org" Documentation = "https://nominatim.org/release-docs/latest/" diff --git a/settings/languages.yaml b/settings/languages.yaml new file mode 100644 index 000000000..d681048d8 --- /dev/null +++ b/settings/languages.yaml @@ -0,0 +1,1118 @@ +# comprehensive list of iso languages taken from https://en.wikipedia.org/wiki/List_of_ISO_639_language_codes +# for now, when still 
workshopping, have both a latin field and a written field, subjet to renaming +# worried about string identification taking longer than boolean check + +# Abkhazian (Cyrillic) -> Салам аҩыза +ab: + latin: false + written: cyr + + +# Afar -> Nagaale kataysaw +aa: + latin: true + written: lat + + +# Afrikaans -> hallo vriend +af: + latin: true + written: lat + + +# Akan -> hello adamfo +ak: + latin: true + written: lat + + +# Albanian -> pershendetje mik +sq: + latin: true + written: lat + + +# Amharic -> ሰላም ጓደኛ +am: + latin: false + written: None + + +# Arabic -> مرحبا صديق +ar: + latin: false + written: None + + +# Aragonese -> not supported by google translate, spanish-esque +an: + latin: true + written: lat + + +# Armenian -> բարև ընկեր +hy: + latin: false + written: None + + +# Assamese -> নমস্কাৰ বন্ধু +as: + latin: false + written: None + + +# Avaric -> салам гьудул (Cyrillic?) +av: + latin: false + written: cyr + + +# Avestan -> not supported by google translate, iranian base +ae: + latin: false + written: None + + +# Aymara -> aruntt'asmawa amigo +ay: + latin: true + written: lat + + +# Azerbaijani -> salam dostum +az: + latin: true + written: lat + + +# Bambara -> bonjour terikɛ +bm: + latin: true + written: lat + + +# Bashkir (Cyrillic) -> һаумы дуҫ +ba: + latin: false + written: cyr + + +# Basque -> kaixo lagun +eu: + latin: true + written: lat + + +# Belarusian (Cyrillic) -> прывітанне сябар +be: + latin: false + written: cyr + + + # Bengali -> হ্যালো বন্ধু +bn: + latin: false + written: None + + +# Bislama -> not supported by google translate, english creole +bi: + latin: true + written: lat + + +# Bosnian -> zdravo prijatelju +bs: + latin: true + written: lat + + +# Breton -> Demat mignon +br: + latin: true + written: lat + + +# Bulgarian (Cyrillic) -> здравей приятел +bg: + latin: false + written: cyr + + +# Burmese -> မင်္ဂလာပါ သူငယ်ချင်း +my: + latin: false + written: None + + +# Catalan -> hola amic +ca: + latin: true + written: lat + + +# 
Chamorro -> håfa amigu +ch: + latin: true + written: lat + + +# Chechen (Cyrillic) -> салам маршал доттагӀ +ce: + latin: false + written: cyr + + +# Chichewa, Nyanja -> moni bwenzi +ny: + latin: true + written: lat + + +# Chinese -> 朋友你好 +zh: + latin: false + written: None + + +# Church Slavonic, Church Slavic (Cyrillic) +cu: + latin: false + written: cyr + + +# Chuvash (Cyrillic) -> салам тусӑм +cv: + latin: false + written: cyr + + +# Cornish -> not supported by google translate, english uk +kw: + latin: true + written: lat + + +# Corsican -> salutu amicu +co: + latin: true + written: lat + + +# Cree -> not supported by google translate, latin & aboriginal syllabics +cr: + latin: false + written: None + + +# Croatian -> zdravo prijatelju +hr: + latin: true + written: lat + + +# Czech -> ahoj příteli +cs: + latin: true + written: lat + + +# Danish -> hej ven +da: + latin: true + written: lat + + +# Divehi -> ހެލޯ ފްރެންޑް +dv: + latin: false + written: None + + +# Dutch -> hallo vriend +nl: + latin: true + written: lat + + +# Dzongkha -> ཧེ་ལོ་གྲོགས་པོ། +dz: + latin: false + written: None + + +# English -> hello friend +en: + latin: true + written: lat + + +# Esperanto -> saluton amiko +eo: + latin: true + written: lat + + +# Estonian -> tere sõber +et: + latin: true + written: lat + + +# Ewe -> hello xɔ̃nye +ee: + latin: true + written: lat + + +# Faroese -> hey vinur +fo: + latin: true + written: lat + + +# Fijian -> bula vinaka itokani +fj: + latin: true + written: lat + + +# Finnish -> hei ystävä +fi: + latin: true + written: lat + + +# French -> bonjour mon ami +fr: + latin: true + written: lat + + +# Western Frisian -> hallo freon +fy: + latin: true + written: lat + + +# Fulah, Fulani -> on njaaraama musiɗɗo +ff: + latin: true + written: lat + + +# Scottish Gaelic -> halò a charaid +gd: + latin: true + written: lat + + +# Galician -> ola amigo +gl: + latin: true + written: lat + + +# Ganda -> mulamusizza mukwano gwange +lg: + latin: true + written: lat + + 
+# Georgian -> გამარჯობა მეგობარო +ka: + latin: false + written: None + + +# German -> Hallo Freund +de: + latin: true + written: lat + + +# Greek -> γεια σου φιλε +el: + latin: false + written: None + + +# Greenlandic, Kalaallisut -> ilassi kammaga +kl: + latin: true + written: lat + + +# Guarani -> maitei angirũ +gn: + latin: true + written: lat + + +# Gujarati -> હેલો મિત્ર +gu: + latin: false + written: None + + +# Haitian Creole, Haitian -> +ht: + latin: true + written: lat + + +# Hausa -> sannu abokina +ha: + latin: true + written: lat + + +# Hebrew -> שלום חבר +he: + latin: false + written: None + + +# Herero -> no google translate, but wikipedia says latin writing system +hz: + latin: true + written: lat + + +# Hindi -> हैलो दोस्त +hi: + latin: false + written: None + + +# Hiri Motu -> no google translate, but wikipedia says latin writing system +ho: + latin: true + written: lat + + +# Hungarian -> helló barátom +hu: + latin: true + written: lat + + +# Icelandic -> sæll vinur +is: + latin: true + written: lat + + +# Ido -> no google translate, but derivation of esperanto +io: + latin: true + written: lat + + +# Igbo -> ndewo enyi +ig: + latin: true + written: lat + + +# Indonesian -> halo teman +id: + latin: true + written: lat + + +# Interlingua -> no google translate, but wikipedia says latin writing system (auxilary language) +ia: + latin: true + written: lat + + +# Interlingue -> no google translate, but wikipedia says latin writing system (auxilary language) +ie: + latin: true + written: lat + + +# Inuktitut -> no google translate, but wikipedia says nonlatin +iu: + latin: false + written: None + + +# Inupiaq -> no google translate, but wikipedia says latin writing +ik: + latin: true + written: lat + + +# Irish -> Dia duit a chara +ga: + latin: true + written: lat + + +# Italian -> ciao amico +it: + latin: true + written: lat + + +# Japanese -> こんにちは、友人 +ja: + latin: false + written: None + + +# Javanese -> halo kanca +jv: + latin: true + written: lat 
+ + +# Kannada -> ನಮಸ್ಕಾರ ಗೆಳೆಯ +kn: + latin: false + written: None + + +# Kanuri -> salam sawa +kr: + latin: true + written: lat + + +# Kashmiri -> no google translate, but wikipedia says nonlatin +ks: + latin: false + written: None + + +# Kazakh (Cyrillic) -> сәлем досым +kk: + latin: false + written: cyr + + +# Khmer -> សួស្តីមិត្ត +km: + latin: false + written: None + + +# Kikuyu -> no google translate, but wikipedia says latin +ki: + latin: true + written: lat + + +# Kinyarwanda -> muraho nshuti +rw: + latin: true + written: lat + + +# Kyrgyz -> салам дос +ky: + latin: false + written: cyr + + +# Komi -> чолӧм ёрт, cyrillic? +kv: + latin: false + written: None + + +# Kongo -> mbote nduku +kg: + latin: true + written: lat + + +# Korean -> 안녕, 친구 +ko: + latin: false + written: None + + +# Kuanyama -> dialect of Ovambo, wikipedia shows latin based excerpt +kj: + latin: true + written: lat + + +# Kurdish -> merheba heval +ku: + latin: true + written: lat + + +# Lao -> ສະບາຍດີເພື່ອນ +lo: + latin: false + written: None + + +# Latin -> salve amicus +la: + latin: true + written: lat + + +# Latvian -> sveiks draugs +lv: + latin: true + written: lat + + +# Ligurian -> ciao amigo +lj: + latin: true + written: lat + + +# Limburgish -> hallo vrund +li: + latin: true + written: lat + + +# Lingala -> mbote moninga +ln: + latin: true + written: lat + + +# Lithuanian -> labas drauge +lt: + latin: true + written: lat + + +# Luba-Katanga -> latin based, but only 22 letters +lu: + latin: true + written: lat + + +# Luxembourgish -> Moien Frënd +lb: + latin: true + written: lat + + +# Macedonian (Cyrillic) -> здраво пријателе +mk: + latin: false + written: cyr + + +# Malagasy -> salama namana +mg: + latin: true + written: lat + + +# Malay -> salam kawan +ms: + latin: true + written: lat + + +# Malayalam -> ഹലോ സുഹൃത്തേ +ml: + latin: false + written: None + + +# Maltese -> hello habib +mt: + latin: true + written: lat + + +# Manx -> Halloo chaarjyn. 
+gv: + latin: true + written: lat + + +# Maori -> wikipedia says latin writing system +mi: + latin: true + written: lat + + +# Marathi -> नमस्कार मित्रा +mr: + latin: false + written: None + + +# Marshallese -> yokyokwe mōtta +mh: + latin: true + written: lat + + +# Mongolian -> сайн уу найзаа (Cyrillic?) +mn: + latin: false + written: None + + +# Nauru -> wikipedia says latin based +na: + latin: true + written: lat + + +# Navajo -> wikipedia says latin based, but some letters are accented +nv: + latin: true + written: lat + + +# North Ndebele -> wikipedia says latin based +nd: + latin: true + written: lat + + +# South Ndebele -> wikipedia says latin based +nr: + latin: true + written: lat + + +# Ndonga -> dialect of Ovambo, wikipedia shows latin based excerpt +ng: + latin: true + written: lat + + +# Nepali -> नमस्ते साथी +ne: + latin: false + written: None + + +# Norwegian -> hei venn +no: + latin: true + written: lat + + +# Norwegian Bokmål -> norwegian written form +nb: + latin: true + written: lat + + +# Norwegian Nynorsk -> norwegian written form +nn: + latin: true + written: lat + + +# Occitan -> bonjorn amic +oc: + latin: true + written: lat + + +# Ojibwa -> indigenous canadian language, toss up +oj: + latin: false + written: None + + +# Oriya, Odia -> ନମସ୍କାର ବନ୍ଧୁ +or: + latin: false + written: None + + +# Oromo -> akkam jirtu hiriyaa koo +om: + latin: true + written: lat + + +# Ossetian, Ossetic -> салам хæлар (Cyrillic?) 
+os: + latin: false + written: None + + +# Pali -> nonlatin, from wikipedia +pi: + latin: false + written: None + + +# Pashto -> سلام ملګری +ps: + latin: false + written: None + + +# Persian -> سلام دوست +fa: + latin: false + written: None + + +# Polish -> cześć przyjacielu +pl: + latin: true + written: lat + + +# Portuguese -> olá amigo +pt: + latin: true + written: lat + + +# Punjabi -> ਹੈਲੋ ਦੋਸਤ +pa: + latin: false + written: None + + +# Quechua -> hola amigo +qu: + latin: true + written: lat + + +# Romanian -> salut prietene +ro: + latin: true + written: lat + + +# Romansh -> yes, by wikipedia +rm: + latin: true + written: lat + + +# Rundi -> ndagusavye mugenzi +rn: + latin: true + written: lat + + +# Russian -> привет, друг +ru: + latin: false + written: cyr + + +# Northern Sami -> dearvvuođa ustit +se: + latin: true + written: lat + + +# Samoan -> talofa uo +sm: + latin: true + written: lat + + +# Sango -> bonjour kamarade +sg: + latin: true + written: lat + + +# Sanskrit -> नमस्कार मित्र +sa: + latin: false + written: None + + +# Sardinian -> romance language, as per wikipedia +sc: + latin: true + written: lat + + +# Serbian -> здраво пријатељу (Cyrillic?) 
+sr: + latin: false + written: cyr + + +# Shona -> mhoro shamwari +sn: + latin: true + written: lat + + +# Sindhi -> هيلو دوست +sd: + latin: false + written: None + + +# Sinhala -> ආයුබෝවන් මිත්‍රයා +si: + latin: false + written: None + + +# Slovak -> ahoj kamarát +sk: + latin: true + written: lat + + +# Slovenian -> zdravo prijatelj +sl: + latin: true + written: lat + + +# Somali -> hello saaxiib +so: + latin: true + written: lat + + +# Southern Sotho -> dumela motswalle +st: + latin: true + written: lat + + +# Spanish -> Hola amiga +es: + latin: true + written: lat + + +# Sundanese -> halo sobat +su: + latin: true + written: lat + + +# Swahili -> habari rafiki +sw: + latin: true + written: lat + + +# Swati -> sawubona mngani +ss: + latin: true + written: lat + + +# Swedish -> hej vän +sv: + latin: true + written: lat + + +# Tagalog, Filipino -> Tagalog +tl: + latin: true + written: lat + + +# Tahitian -> aroha e hoa +ty: + latin: true + written: lat + + +# Tajik (Cyrillic) -> салом дустам +tg: + latin: false + written: cyr + + +# Tamil -> வணக்கம் நண்பரே +ta: + latin: false + written: None + + +# Tatar (Cyrillic) -> сәлам дус +tt: + latin: false + written: cyr + + +# Telugu -> హలో మిత్రమా +te: + latin: false + written: None + + +# Thai -> สวัสดีเพื่อน +th: + latin: false + written: None + + +# Tibetan -> ཀྱེ་གྲོགས་པོ། +bo: + latin: false + written: None + + +# Tigrinya -> ሰላም ዓርከይ +ti: + latin: false + written: None + + +# Tonga -> Malo kaume'a +to: + latin: true + written: lat + + +# Tsonga -> hello munghana +ts: + latin: true + written: lat + + +# Tswana -> Dumela tsala +tn: + latin: true + written: lat + + +# Turkish -> merhaba arkadaşım +tr: + latin: true + written: lat + + +# Turkmen -> salam dost +tk: + latin: true + written: lat + + +# Twi (same as Akan?) 
-> hello adamfo +tw: + latin: true + written: lat + + +# Uyghur -> ياخشىمۇسىز دوست +ug: + latin: false + written: None + + +# Ukrainian -> привіт друже (Cyrillic) +uk: + latin: false + written: cyr + + +# Urdu -> ہیلو دوست +ur: + latin: false + written: None + + +# Uzbek -> salom do'stim +uz: + latin: true + written: lat + + +# Venda -> lumelisa khonani +ve: + latin: true + written: lat + + +# Vietnamese -> Chào bạn +vi: + latin: true + written: lat + + +# Venetian -> not supported by google translate, italian base +vc: + latin: true + written: lat + + +# Volapük -> not supported by google translate, latin base, "world speak" +vo: + latin: true + written: lat + + +# Walloon -> not supported by google translate, french base +wa: + latin: true + written: lat + + +# Welsh -> helo ffrind +cy: + latin: true + written: lat + + +# Wolof -> Nanga def xarit +wo: + latin: true + written: lat + + +# Xhosa -> Molo mhlobo +xh: + latin: true + written: lat + + +# Sichuan Yi -> yi base, according to wikipedia +ii: + latin: false + written: None + + +# Yiddish -> העלא פרייַנד +yi: + latin: false + written: None + + +# Yoruba -> hello ore +yo: + latin: true + written: lat + + +# Zhuang -> tai language +za: + latin: false + written: None + + +# Zulu -> sawubona mngani +zu: + latin: true + written: lat + + +# Yue Chinese, used as Cantonese 你好朋友 +yue: + latin: false + written: None \ No newline at end of file diff --git a/src/nominatim_api/__init__.py b/src/nominatim_api/__init__.py index 89c57b651..8c1661e28 100644 --- a/src/nominatim_api/__init__.py +++ b/src/nominatim_api/__init__.py @@ -35,8 +35,10 @@ ReverseResults as ReverseResults, SearchResult as SearchResult, SearchResults as SearchResults) -from .localization import (Locales as Locales) from .result_formatting import (FormatDispatcher as FormatDispatcher, load_format_dispatcher as load_format_dispatcher) - +from .localization import (Locales as Locales, + TransliterateLocales as TransliterateLocales, + AbstractLocales as 
class _CountryInfo:
    """ Lazily filled cache of the per-country settings.
    """

    def __init__(self) -> None:
        self._info: Dict[str, Dict[str, Any]] = {}

    def load(self, config: Configuration) -> None:
        """ Read 'country_settings.yaml' through the given configuration,
            unless the cache has already been filled.

            After loading, every country entry has a 'languages' property
            that is a list of strings.
        """
        if self._info:
            return

        self._info = config.load_sub_configuration('country_settings.yaml')
        for settings in self._info.values():
            if 'languages' not in settings:
                settings['languages'] = []
                continue

            raw = settings['languages']
            if not isinstance(raw, list):
                # A comma-separated string: split it into its single codes.
                settings['languages'] = [part.strip() for part in raw.split(',')]

    def items(self) -> Iterable[Tuple[str, Dict[str, Any]]]:
        """ Iterate over (country code, property dict) pairs.
        """
        return self._info.items()

    def get(self, country_code: str) -> Dict[str, Any]:
        """ Return the properties of the given country or an empty
            dictionary when the country code is not known.
        """
        return self._info.get(country_code, {})
class _LangInfo:
    """ Lazily filled cache of the per-language settings.
    """

    def __init__(self) -> None:
        self._info: Dict[str, Dict[str, Any]] = {}

    def load(self, config: Configuration) -> None:
        """ Read 'languages.yaml' through the given configuration,
            unless the cache has already been filled.
        """
        if self._info:
            return

        self._info = config.load_sub_configuration('languages.yaml')

    def items(self) -> Iterable[Tuple[str, Dict[str, Any]]]:
        """ Iterate over (language code, property dict) pairs.
        """
        return self._info.items()

    def get(self, lang: str) -> Dict[str, Any]:
        """ Return the properties of the given language or an empty
            dictionary when the language code is not known.
        """
        return self._info.get(lang, {})
+ """ + if prop is None: + return _LANG_INFO.items() + + return ((c, p[prop]) for c, p in _LANG_INFO.items() if prop in p) diff --git a/src/nominatim_api/localization/__init__.py b/src/nominatim_api/localization/__init__.py new file mode 100644 index 000000000..8f1821659 --- /dev/null +++ b/src/nominatim_api/localization/__init__.py @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2025 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +Module for localization +""" +from .localizer import (Locales as Locales) +from .transliterator import (TransliterateLocales as TransliterateLocales) +from .base import (AbstractLocales as AbstractLocales) diff --git a/src/nominatim_api/localization.py b/src/nominatim_api/localization/base.py similarity index 56% rename from src/nominatim_api/localization.py rename to src/nominatim_api/localization/base.py index 3c786a209..64ef809d2 100644 --- a/src/nominatim_api/localization.py +++ b/src/nominatim_api/localization/base.py @@ -2,25 +2,17 @@ # # This file is part of Nominatim. (https://nominatim.org) # -# Copyright (C) 2024 by the Nominatim developer community. +# Copyright (C) 2025 by the Nominatim developer community. # For a full list of authors see the git log. -""" -Helper functions for localizing names of results. -""" -from typing import Mapping, List, Optional -from .config import Configuration -from .results import AddressLines, BaseResultT - import re +from abc import ABC, abstractmethod +from typing import Optional, List, Mapping, Tuple, Any +from ..results import BaseResultT +from ..config import Configuration -class Locales: - """ Helper class for localization of names. - - It takes a list of language prefixes in their order of preferred - usage. 
- """ - +class AbstractLocales(ABC): + """Interface for localization logic.""" def __init__(self, langs: Optional[List[str]] = None): self.config = Configuration(None) self.languages = langs or [] @@ -56,27 +48,53 @@ def display_name(self, names: Optional[Mapping[str, str]]) -> str: If 'names' is null or empty, an empty string is returned. If no appropriate localization is found, the first name is returned. """ + return self.display_name_with_locale(names)[0] + + def display_name_with_locale(self, names: Optional[Mapping[str, str]]) -> Tuple[str, str]: + """ Return the best matching name from a dictionary of names + containing different name variants, as well as an identifier + with regards to what language used + + If 'names' is null or empty, an empty tuple is returned. If no + appropriate localization is found, the first name is returned with + the 'default' marker, where afterwards iso is used, using country of origin. + """ if not names: - return '' + return ('', '') if len(names) > 1: for tag in self.name_tags: if tag in names: - return names[tag] + _, _, lang = tag.partition(':') + return (names[tag], lang or 'default') # Nothing? Return any of the other names as a default. - return next(iter(names.values())) + return (next(iter(names.values())), 'default') + + def localize_results(self, results: List[BaseResultT]) -> None: + """ Localize results according to the chosen locale. """ + for result in results: + result.locale_name = self.display_name(result.names) + self.localize(result) + + @abstractmethod + def localize(self, result: BaseResultT) -> None: + """ Localize address parts according to the chosen locale. """ + pass @staticmethod - def from_accept_languages(langstr: str) -> 'Locales': - """ Create a localization object from a language list in the - format of HTTP accept-languages header. + @abstractmethod + def from_accept_languages(langstr: str) -> 'AbstractLocales': + """ Parse a language list in the format of HTTP accept-languages header. 
- The functions tries to be forgiving of format errors by first splitting + The function tries to be forgiving of format errors by first splitting the string into comma-separated parts and then parsing each description separately. Badly formatted parts are then ignored. """ - # split string into languages + pass + + @staticmethod + def sort_languages(langstr: str) -> List[Tuple[str, Any]]: candidates = [] for desc in langstr.split(','): m = re.fullmatch(r'\s*([a-z_-]+)(?:;\s*q\s*=\s*([01](?:\.\d+)?))?\s*', @@ -86,35 +104,4 @@ def from_accept_languages(langstr: str) -> 'Locales': # sort the results by the weight of each language (preserving order). candidates.sort(reverse=True, key=lambda e: e[1]) - - # If a language has a region variant, also add the language without - # variant but only if it isn't already in the list to not mess up the weight. - languages = [] - for lid, _ in candidates: - languages.append(lid) - parts = lid.split('-', 1) - if len(parts) > 1 and all(c[0] != parts[0] for c in candidates): - languages.append(parts[0]) - - return Locales(languages) - - def localize(self, lines: AddressLines) -> None: - """ Sets the local name of address parts according to the chosen - locale. - - Only address parts that are marked as isaddress are localized. - - AddressLines should be modified in place. - """ - for line in lines: - if line.isaddress and line.names: - line.local_name = self.display_name(line.names) - - def localize_results(self, results: List[BaseResultT]) -> None: - """ Set the local name of results according to the chosen - locale. 
- """ - for result in results: - result.locale_name = self.display_name(result.names) - if result.address_rows: - self.localize(result.address_rows) + return candidates diff --git a/src/nominatim_api/localization/localizer.py b/src/nominatim_api/localization/localizer.py new file mode 100644 index 000000000..6d9d2e5cf --- /dev/null +++ b/src/nominatim_api/localization/localizer.py @@ -0,0 +1,50 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2025 by the Nominatim developer community. +# For a full list of authors see the git log. +from .base import AbstractLocales +from ..results import BaseResultT + + +class Locales(AbstractLocales): + """ Simple Helper class for localization of names. + + It takes a list of language prefixes in their order of preferred + usage. + """ + + def localize(self, result: BaseResultT) -> None: + """ Sets the local name of address parts according to the chosen + locale. + + Only address parts that are marked as isaddress are localized. + """ + if not result.address_rows: + return + + for line in result.address_rows: + if line.isaddress and line.names: + line.local_name = self.display_name(line.names) + + @staticmethod + def from_accept_languages(langstr: str) -> 'Locales': + """ Parse a language list in the format of HTTP accept-languages header. + + The function tries to be forgiving of format errors by first splitting + the string into comma-separated parts and then parsing each + description separately. Badly formatted parts are then ignored. + """ + candidates = AbstractLocales.sort_languages(langstr) + + # If a language has a region variant, also add the language without + # variant but only if it isn't already in the list to not mess up the weight. 
def chinesedecode(local_name: Optional[str], conversion: str) -> str:
    """ Convert a Chinese name from one script to another with OpenCC.

        `conversion` is handed unchanged to `opencc.OpenCC`; the callers
        in this module use 't2s.json' and 's2t.json'.

        Returns an empty string when `local_name` is None or empty.
        Raises ImportError when the optional opencc package is not installed.
    """
    if opencc is None:
        raise ImportError('The opencc library is required for Chinese script conversion.')

    if not local_name:
        # Nothing to convert, so do not pay for setting up a converter.
        return ''

    return str(opencc.OpenCC(conversion).convert(local_name))
@staticmethod
def normalize_dict(lang: str) -> List[str]:
    """ Map a language code onto the list of normalised codes it stands for.

        Chinese variants are standardised: 'zh-cn' always maps to 'zh-Hans',
        'zh-tw' always maps to 'zh-Hant' and a plain 'zh' expands to
        'zh-Hans', 'zh-Hant' and 'yue'. In the case of ambiguity, the
        largest number of languages is returned.

        All other codes are reduced to the part in front of the first '-'.

        The table lookup ignores case: language tags are case-insensitive
        and HTTP clients usually send forms like 'zh-CN' or 'zh-TW', which
        must not fall through to the generic prefix rule.

        The code assumes two-letter ISO 639 language codes
        (https://en.wikipedia.org/wiki/List_of_ISO_639_language_codes)
        with the exception of 'yue'.
    """
    # zh-Latn and zh-Latn-pinyin are deliberately not listed: they are not
    # spoken languages and are written in Latin script anyway.
    # Keys must be lower-case, they are matched against lang.lower().
    lang_dict = {
        "zh": ["zh-Hans", "zh-Hant", "yue"],  # zh covers zh-Hans, zh-Hant, yue
        "zh-cn": ["zh-Hans"],  # only Simplified
        "zh-tw": ["zh-Hant"],  # only Traditional Mandarin
        "zh-hans": ["zh-Hans"],
        "zh-hant": ["zh-Hant", "yue"],  # Traditional implies Cantonese and Mandarin
        "zh-hans-cn": ["zh-Hans"],  # only Simplified
        "zh-cmn": ["zh-Hans"],  # only Simplified, cmn means Mandarin
        "zh-cmn-hans": ["zh-Hans"],  # only Simplified, cmn means Mandarin
        "zh-cmn-hant": ["zh-Hant"],  # only Traditional, cmn means Mandarin
    }

    # The table has to be consulted first because of the zh edge case (no '-').
    known = lang_dict.get(lang.lower())
    if known is not None:
        return known

    # Without a '-' the partition yields the unchanged code.
    return [lang.partition('-')[0]]
@staticmethod + def zh_Hant_transliterate(line: AddressLine) -> str: + """ If in Simplified Chinese, convert to Traditional + Otherwise switch to standard Latin default transliteration + """ + if line.local_name_lang == 'zh-hans' or line.local_name_lang == 'zh-CN': + # t2s.json Traditional Chinese to Simplified Chinese 繁體到簡體 + return chinesedecode(line.local_name, 's2t.json') + return latindecode(line.local_name) + + @staticmethod + def yue_transliterate(line: AddressLine) -> str: + """ If in Simplified Chinese, convert to Traditional + Otherwise switch to standard Latin default transliteration + """ + if line.local_name_lang == 'zh-hans' or line.local_name_lang == 'zh': + # t2s.json Traditional Chinese to Simplified Chinese 繁體到簡體 + return chinesedecode(line.local_name, 's2t.json') + return latindecode(line.local_name) + + def latin_transliterate(self, line: AddressLine) -> str: + "Transliterates to latin, needs to take into account Han Re-Unification" + if line.local_name_lang == 'yue': + return cantodecode(line.local_name) if line.local_name else '' + else: + return latindecode(line.local_name) + + def transliterate(self, line: AddressLine) -> str: + """ Most granular transliteration component that performs raw transliteration + + Defaults to Latin + """ + for lang in self.languages: + _function = f"{lang.replace('-', '_')}_transliterate" + transliterate_function = getattr(self, _function, None) + + if transliterate_function: + print(f"{lang} transliteration successful") + return str(transliterate_function(line)) + elif self.is_latin(lang): + print("latin based language detected, latin transliteration occuring") + return self.latin_transliterate(line) + + print("defaulting to latin based transliteration") + return self.latin_transliterate(line) + + def localize(self, result: BaseResultT) -> None: + """ Sets the local name of address parts according to the chosen + local, transliterating if not avaliable. 
+ + Only address parts that are marked as isaddress are localized. + """ + if not result.address_rows: + return + + region_lang = self.set_lang(result) + + for line in result.address_rows: + if line.isaddress and line.names: + if region_lang: + line.local_name_lang = region_lang + + if line.local_name_lang not in self.languages: + line.local_name, line.local_name_lang = ( + self.display_name_with_locale(line.names) + ) + + if line.local_name_lang in self.languages: + print(f"no transliteration needed for {line.local_name}") + else: + line.local_name = self.transliterate(line).strip() + + @staticmethod + def from_accept_languages(langstr: str) -> 'TransliterateLocales': + """ Create a localization object from a language list in the + format of HTTP accept-languages header. + + The functions tries to be forgiving of format errors by first splitting + the string into comma-separated parts and then parsing each + description separately. Badly formatted parts are then ignored. + + Using the additional normalization transliteration constraints, + then returns the larguage in its normalized form, as well as the regional + dialect, if applicable. + """ + candidates = AbstractLocales.sort_languages(langstr) + + languages = [] + for lid, _ in candidates: + if lid not in languages: + languages.append(lid) + + normalized = TransliterateLocales.normalize_dict(lid) + for norm_lang in normalized: + if norm_lang not in languages: + languages.append(norm_lang) + + return TransliterateLocales(languages) diff --git a/src/nominatim_api/results.py b/src/nominatim_api/results.py index 1b74b5aae..b76573cd4 100644 --- a/src/nominatim_api/results.py +++ b/src/nominatim_api/results.py @@ -131,6 +131,10 @@ class AddressLine: """ Place holder for localization of this address part. See [Localization](Result-Handling.md#localization) below. 
""" + local_name_lang: Optional[str] = None + """ Place holder for language of this address part, computed + during localization + """ @property def display_name(self) -> Optional[str]: diff --git a/src/nominatim_api/v1/server_glue.py b/src/nominatim_api/v1/server_glue.py index 99f7dc480..948669c3c 100644 --- a/src/nominatim_api/v1/server_glue.py +++ b/src/nominatim_api/v1/server_glue.py @@ -8,7 +8,7 @@ Generic part of the server implementation of the v1 API. Combine with the scaffolding provided for the various Python ASGI frameworks. """ -from typing import Optional, Any, Type, Dict, cast, Sequence, Tuple +from typing import Optional, Any, Type, Dict, cast, Sequence, Tuple, List from functools import reduce import dataclasses from urllib.parse import urlencode @@ -21,8 +21,8 @@ from .format import RawDataList from ..types import DataLayer, GeometryFormat, PlaceRef, PlaceID, OsmID, Point from ..status import StatusResult -from ..results import DetailedResult, ReverseResults, SearchResult, SearchResults -from ..localization import Locales +from ..results import DetailedResult, ReverseResults, SearchResult, SearchResults, BaseResultT +from ..localization import Locales, TransliterateLocales, AbstractLocales from . import helpers from ..server import content_types as ct from ..server.asgi_adaptor import ASGIAdaptor, EndpointFunc @@ -53,6 +53,20 @@ def get_accepted_languages(adaptor: ASGIAdaptor) -> str: or adaptor.config().DEFAULT_LANGUAGE +def localize_results(params: ASGIAdaptor, results: List[BaseResultT]) -> AbstractLocales: + locales: AbstractLocales + transliterate = params.get_bool('transliterate', False) + + if transliterate: + locales = TransliterateLocales.from_accept_languages( + get_accepted_languages(params)) + else: + locales = Locales.from_accept_languages(get_accepted_languages(params)) + + locales.localize_results(results) + return locales + + def setup_debugging(adaptor: ASGIAdaptor) -> bool: """ Set up collection of debug information if requested. 
@@ -172,8 +186,7 @@ async def details_endpoint(api: NominatimAPIAsync, params: ASGIAdaptor) -> Any: if result is None: params.raise_error('No place with that OSM ID found.', status=404) - locales = Locales.from_accept_languages(get_accepted_languages(params)) - locales.localize_results([result]) + locales = localize_results(params, [result]) output = params.formatting().format_result( result, fmt, @@ -210,8 +223,7 @@ async def reverse_endpoint(api: NominatimAPIAsync, params: ASGIAdaptor) -> Any: query = '' if result: - Locales.from_accept_languages(get_accepted_languages(params)).localize_results( - [result]) + localize_results(params, [result]) fmt_options = {'query': query, 'extratags': params.get_bool('extratags', False), @@ -248,7 +260,7 @@ async def lookup_endpoint(api: NominatimAPIAsync, params: ASGIAdaptor) -> Any: if debug: return build_response(params, loglib.get_and_disable(), num_results=len(results)) - Locales.from_accept_languages(get_accepted_languages(params)).localize_results(results) + localize_results(params, results) fmt_options = {'extratags': params.get_bool('extratags', False), 'namedetails': params.get_bool('namedetails', False), @@ -302,6 +314,7 @@ async def search_endpoint(api: NominatimAPIAsync, params: ASGIAdaptor) -> Any: details['viewbox'] = params.get('viewbox', None) or params.get('viewboxlbrt', None) details['bounded_viewbox'] = params.get_bool('bounded', False) details['dedupe'] = params.get_bool('dedupe', True) + details['transliterate'] = params.get_bool('transliterate', False) max_results = max(1, min(50, params.get_int('limit', 10))) details['max_results'] = (max_results + min(10, max_results) @@ -338,7 +351,7 @@ async def search_endpoint(api: NominatimAPIAsync, params: ASGIAdaptor) -> Any: except UsageError as err: params.raise_error(str(err)) - Locales.from_accept_languages(get_accepted_languages(params)).localize_results(results) + localize_results(params, results) if details['dedupe'] and len(results) > 1: results = 
helpers.deduplicate_results(results, max_results) diff --git a/src/nominatim_db/data/country_info.py b/src/nominatim_db/data/country_info.py index bc3f20f59..257c3eee7 100644 --- a/src/nominatim_db/data/country_info.py +++ b/src/nominatim_db/data/country_info.py @@ -84,6 +84,11 @@ def setup_country_config(config: Configuration) -> None: _COUNTRY_INFO.load(config) +def get(country: str) -> Dict[str, Any]: + """ Get country information for the country with the given country code. """ + return _COUNTRY_INFO._info.get(country) or {} + + @overload def iterate() -> Iterable[Tuple[str, Dict[str, Any]]]: ... diff --git a/test/python/api/test_locales_languages.py b/test/python/api/test_locales_languages.py new file mode 100644 index 000000000..e13b3ffe4 --- /dev/null +++ b/test/python/api/test_locales_languages.py @@ -0,0 +1,44 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2025 by the Nominatim developer community. +# For a full list of authors see the git log. 
+""" +Tests for language parsing with the complex locales function +""" +from nominatim_api.localization import TransliterateLocales + + +def test_parsing_en(): + """ Base HTML Header Parsing test to see if it can properly concatanate and + extract the proper naming conventions + + Checks if the prototype can differentiate between English Variants + """ + test_header = "en-CA,en-GB;q=0.9,en-US;q=0.8,en;q=0.7" + output = TransliterateLocales().from_accept_languages(test_header).languages + assert output == ['en-CA', 'en', 'en-GB', 'en-US'] + + +def test_parsing_zh(): + """ Base HTML Header Parsing test to see if it can properly concatanate and + extract the proper naming conventions + + Checks if the prototype can differentiate between Chinese Variants + """ + test_header = "zh;q=0.9,zh-cn;q=0.8,zh-Hans-CN;q=0.7" + output = TransliterateLocales().from_accept_languages(test_header).languages + assert output == ['zh', 'zh-Hans', 'zh-Hant', 'yue', 'zh-cn', 'zh-Hans-CN'] + + +def test_parsing_zh_en(): + """ Base HTML Header Parsing test to see if it can properly concatanate and + extract the proper naming conventions + + Checks if the prototype can differentiate between Chinese Variants and English Variants + """ + test_header = "zh;q=0.4, en-US, zh-cn;q=0.8,zh-Hans-CN;q=0.7, en-UK;q=0.1" + output = TransliterateLocales().from_accept_languages(test_header).languages + assert output == ['en-US', 'en', 'zh-cn', 'zh-Hans', 'zh-Hans-CN', + 'zh', 'zh-Hant', 'yue', 'en-UK'] diff --git a/test/python/api/test_localization.py b/test/python/api/test_localization.py index c3e02596b..322d38267 100644 --- a/test/python/api/test_localization.py +++ b/test/python/api/test_localization.py @@ -9,7 +9,7 @@ """ import pytest -from nominatim_api import Locales +from nominatim_api.localization import Locales as Locales def test_display_name_empty_names(): @@ -27,6 +27,16 @@ def test_display_name_none_localized(): assert loc.display_name({'ref': '34', 'name:de': 'DE'}) == '34' +def 
test_display_name_with_locale(): + loc = Locales(["de"]) + + assert loc.display_name({}) == '' + assert loc.display_name({'name:zh': 'ZH', 'name:de': 'DE'}) == 'DE' + assert loc.display_name({'name:de': 'DE', 'name': 'ALL'}) == 'DE' + assert loc.display_name_with_locale({'name:zh': 'ZH', 'name:de': 'DE'}) == ('DE', 'de') + assert loc.display_name_with_locale({'name:de': 'DE', 'ref': '34', 'name': 'A'}) == ('DE', "de") + + def test_output_names_none_localized(): loc = Locales() diff --git a/test/python/api/test_normalization.py b/test/python/api/test_normalization.py new file mode 100644 index 000000000..e13b3ffe4 --- /dev/null +++ b/test/python/api/test_normalization.py @@ -0,0 +1,44 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2025 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +Tests for language parsing with the complex locales function +""" +from nominatim_api.localization import TransliterateLocales + + +def test_parsing_en(): + """ Base HTML Header Parsing test to see if it can properly concatanate and + extract the proper naming conventions + + Checks if the prototype can differentiate between English Variants + """ + test_header = "en-CA,en-GB;q=0.9,en-US;q=0.8,en;q=0.7" + output = TransliterateLocales().from_accept_languages(test_header).languages + assert output == ['en-CA', 'en', 'en-GB', 'en-US'] + + +def test_parsing_zh(): + """ Base HTML Header Parsing test to see if it can properly concatanate and + extract the proper naming conventions + + Checks if the prototype can differentiate between Chinese Variants + """ + test_header = "zh;q=0.9,zh-cn;q=0.8,zh-Hans-CN;q=0.7" + output = TransliterateLocales().from_accept_languages(test_header).languages + assert output == ['zh', 'zh-Hans', 'zh-Hant', 'yue', 'zh-cn', 'zh-Hans-CN'] + + +def test_parsing_zh_en(): + """ Base HTML Header Parsing test to see if it can properly concatanate 
and + extract the proper naming conventions + + Checks if the prototype can differentiate between Chinese Variants and English Variants + """ + test_header = "zh;q=0.4, en-US, zh-cn;q=0.8,zh-Hans-CN;q=0.7, en-UK;q=0.1" + output = TransliterateLocales().from_accept_languages(test_header).languages + assert output == ['en-US', 'en', 'zh-cn', 'zh-Hans', 'zh-Hans-CN', + 'zh', 'zh-Hant', 'yue', 'en-UK'] diff --git a/test/python/api/test_transliterate_locales.py b/test/python/api/test_transliterate_locales.py new file mode 100644 index 000000000..b186742f8 --- /dev/null +++ b/test/python/api/test_transliterate_locales.py @@ -0,0 +1,279 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2025 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +Tests for transliteration with the complex locales function +""" +import pytest + +from nominatim_api.localization import TransliterateLocales +from nominatim_api.config import Configuration +from nominatim_api.data import lang_info +from nominatim_api.results import AddressLine, SearchResult + + +def hospital_result(): + result = SearchResult( + source_table=None, + category=('place', 'city'), + centroid=(0.0, 0.0) + ) + + result.country_code = 'cn' + result.address_rows = [ + AddressLine( + category=('amenity', 'hospital'), + names={'name': '丹东市中医院'}, + fromarea=True, + isaddress=True, + rank_address=30, + distance=0.0, + place_id=100109, + osm_object=('N', 12112291499), + extratags={}, + admin_level=15, + local_name='丹东市中医院' + ), + AddressLine( + category=('highway', 'trunk'), + names={'ref': 'G331', 'name': '锦山大街', 'name:en': 'Jinshan Main Street', + 'name:zh': '锦山大街', 'name:zh-Hant': '錦山大街'}, + fromarea=True, isaddress=True, rank_address=26, + distance=0.0, place_id=100287, osm_object=('W', 1209291912), + extratags={'oneway': 'yes', 'surface': 'asphalt'}, + admin_level=15, local_name=None), + AddressLine( + 
category=('boundary', 'administrative'), + names={'name': '广济街道', 'name:en': 'Guangji Subdistrict', 'name:ko': '위안바오 구', + 'name:zh': '广济街道', 'alt_name': 'Guangji;广济', 'official_name': '广济街道', + '_place_name:en': 'Guangji', '_place_alt_name': 'Guangji Subdistrict;广济', + 'name:zh-Hans': '广济街道', 'name:zh-Hant': '廣濟街道', 'name:zh-Latn-pinyin': + 'Yuánbăo Qū'}, + fromarea=True, isaddress=False, rank_address=20, distance=2.085130332117237e-08, + place_id=100168, osm_object=('R', 9660093), admin_level=8, local_name=None), + AddressLine( + category=('boundary', 'administrative'), + names={'name': '兴东街道', 'name:en': 'Xingdong Subdistrict', 'name:zh': '兴东街道', + 'alt_name': 'Xingdong;兴东', 'official_name': '兴东街道', '_place_name:en': + 'Xingdong', 'name:ja': '興東街道', '_place_alt_name': 'Xingdong Subdistrict;兴东', + 'name:zh-Hans': '兴东街道', 'name:zh-Hant': '興東街道'}, + fromarea=True, isaddress=False, rank_address=20, distance=0.003425476071486737, + place_id=100222, osm_object=('R', 9660099), admin_level=8, local_name=None), + AddressLine( + category=('boundary', 'administrative'), + names={'name': '六道口街道', 'name:en': 'Liudaokou Subdistrict', 'name:zh': '六道口街道', + 'alt_name': 'Liudaokou;六道口', 'official_name': '六道口街道', + 'name:ja': '六道口街道', '_place_alt_name': 'Liudaokou Subdistrict;六道口', + 'name:zh-Hans': '六道口街道', 'name:zh-Hant': '六道口街道'}, + fromarea=True, isaddress=False, rank_address=20, distance=0.0008695703526356822, + place_id=100332, osm_object=('R', 9660097), admin_level=8, local_name=None), + AddressLine( + category=('boundary', 'administrative'), + names={'name': '站前街道', 'name:en': 'Zhanqian Subdistrict', 'name:zh': '站前街道', + 'alt_name': 'Zhanqian;站前', 'official_name': '站前街道', '_place_name:en': + 'Zhanqian', 'name:ja': '駅前街道', '_place_alt_name': 'Zhanqian Subdistrict;站前', + 'name:zh-Hans': '站前街道', 'name:zh-Hant': '站前街道'}, + fromarea=True, isaddress=True, rank_address=20, distance=0.0035797314418621385, + place_id=100044, admin_level=8, local_name=None), + AddressLine( + 
category=('place', 'city'), + names={'name': '振兴区', 'name:en': 'Zhenxing', 'name:fr': 'Zhenxing', 'name:ja': '振興区', + 'name:ko': '전싱구', 'name:zh': '振兴区', 'alt_name': '振兴', 'alt_name:zh': '振兴', + 'name:zh-Hans': '振兴区', 'name:zh-Hant': '振興區'}, + fromarea=False, isaddress=False, rank_address=16, distance=0.034584735617470413, + place_id=101842, osm_object=('N', 6416739765), admin_level=15, local_name=None), + AddressLine( + category=('place', 'city'), + names={'name': '丹东市', 'name:ar': 'داندونغ', 'name:az': 'Dandun', 'name:bg': 'Дандун', + 'name:cs': 'Tan-tung', 'name:de': 'Dandong', 'name:en': 'Dandong', 'name:et': + 'Dandong', 'name:eu': 'Dandong', 'name:fi': 'Dandong', 'name:fr': 'Dandong', + 'name:hi': 'डेन्डोंग', 'name:hr': 'Dandong', 'name:ja': '丹東市', 'name:ko': '단둥시', + 'name:ru': 'Даньдун', 'name:sv': 'Dandong', 'name:vi': 'Đan Đông', 'name:zh': + '丹东市', 'int_name': 'Dandong', 'old_name': '安东市', 'short_name': '丹东', + 'alt_name:en': 'Dandong City', 'alt_name:ko': '단동시', 'old_name:en': + 'Andong;Antung', 'old_name:ja': '安東市', 'old_name:ko': '안둥시', 'old_name:zh': + '安东市', 'name:zh-Hans': '丹东市', 'name:zh-Hant': '丹東市', 'short_name:ja': + '丹東', 'short_name:ko': '단둥;단동', 'short_name:zh': '丹东', + 'name:zh-Latn-pinyin': 'Dāndōng Shì'}, + fromarea=False, isaddress=False, rank_address=16, distance=0.002896152207101176, + place_id=100418, osm_object=('N', 244078242), admin_level=15, local_name=None), + AddressLine( + category=('place', 'city'), + names={'name': '元宝区', 'name:en': 'Yuanbao', 'name:fr': 'Yuanbao', 'name:ja': + '元宝区', 'name:ko': '위안바오구', 'name:zh': '元宝区', 'alt_name': '元宝', + 'alt_name:en': 'Yuanbao District', 'alt_name:zh': '元宝', 'name:zh-Hans': + '元宝区', 'name:zh-Hant': '元寶區'}, + fromarea=False, isaddress=True, rank_address=16, distance=0.0014780993108928506, + place_id=100117, osm_object=('N', 6416739764), admin_level=15, local_name=None), + AddressLine( + category=('place', 'district'), + names={'name': '振兴区', 'name:en': 'Zhenxing', 'name:zh': '振兴区', 
'alt_name': '振兴', + 'alt_name:en': 'Zhenxing District', 'alt_name:zh': '振兴', 'name:zh-Hans': '振兴区', + 'name:zh-Hant': '振興區', 'name:zh-Latn-pinyin': 'Zhènxīng Qū'}, + fromarea=False, isaddress=True, rank_address=12, distance=0.100276398631487, + place_id=101581, osm_object=('N', 244084848), admin_level=15, local_name=None), + AddressLine( + category=('place', 'postcode'), names={'ref': '118000'}, fromarea=False, isaddress=True, + rank_address=5, distance=0.0, place_id=None, osm_object=None, extratags=None, + admin_level=None, local_name='118000'), + AddressLine( + category=('place', 'country_code'), + names={'ref': 'cn'}, fromarea=True, isaddress=False, rank_address=4, distance=0.0, + place_id=None, osm_object=None, extratags={}, admin_level=None, local_name=None), + AddressLine( + category=('place', 'country'), + names={'name': '中国', 'name:ab': 'Чынҭ', 'name:af': 'China', 'name:ak': 'China', + 'name:am': 'የቻይና', 'name:an': 'China', 'name:ar': 'الصين', 'name:as': 'চীন', + 'name:av': 'Чин', 'name:ay': 'China', 'name:az': 'Çin', 'name:ba': 'Ҡытай', + 'name:be': 'Кітай', 'name:bg': 'Китай', 'name:bh': 'चीन', 'name:bi': 'Jaena', + 'name:bm': 'China', 'name:bn': 'গণচীন', 'name:bo': 'ཀྲུང་གོ།', 'name:br': 'Sina', + 'name:bs': 'Kina', 'name:ca': 'Xina', 'name:ce': 'Цийн-мохк', 'name:ch': 'China', + 'name:co': 'China', 'name:cs': 'Čína', 'name:cu': 'Срѣдинꙗнє', 'name:cv': + 'Китай', 'name:cy': 'Tsieina', 'name:da': 'Kina', 'name:de': 'China', 'name:dv': + 'ސީނުކަރަ', 'name:dz': 'རྒྱ་ནག', 'name:ee': 'China', 'name:el': 'Κίνα', + 'name:en': 'China', 'name:eo': 'Ĉinio', 'name:es': 'China', 'name:et': 'Hiina', + 'name:eu': 'Txina', 'name:fa': 'چین', 'name:ff': 'Ciina', 'name:fi': 'Kiina', + 'name:fj': 'Jaina', 'name:fo': 'Kina', 'name:fr': 'Chine', 'name:fy': 'Sina', + 'name:ga': 'Síne', 'name:gd': 'Sìona', 'name:gl': 'China', 'name:gn': 'Chína', + 'name:gu': 'ચીન', 'name:gv': 'Sheen', 'name:ha': 'Sin', 'name:he': 'סין', + 'name:hi': 'चीनी', 'name:hr': 'Kina', 'name:ht': 
'Chin', 'name:hu': 'Kína', + 'name:hy': 'Չինաստան', 'name:ia': 'China', 'name:id': 'Tiongkok', 'name:ie': + 'China', 'name:ig': 'Chaina', 'name:ik': 'China', 'name:io': 'Chinia', 'name:is': + 'Kína', 'name:it': 'Cina', 'name:iu': 'ᓴᐃᓇ', 'name:ja': '中国', 'name:jv': 'Cina', + 'name:ka': 'ჩინეთი', 'name:kg': 'Sina', 'name:ki': 'China', 'name:kk': 'Қытай', + 'name:kl': 'Kina', 'name:km': 'ចិន', 'name:kn': 'ಚೀನಿ', 'name:ko': '중국', + 'name:ks': 'چیٖن', 'name:ku': 'Çîn', 'name:kv': 'Китай', 'name:kw': 'China', + 'name:ky': 'Кытай', 'name:la': 'Sinae', 'name:lb': 'China', 'name:lg': 'Cayina', + 'name:li': 'China', 'name:ln': 'Sína', 'name:lo': 'ປະເທດຈີນ', + 'name:lv': 'Ķīna', 'name:mg': 'Sina', 'name:mi': 'Haina', 'name:mk': 'Кина', + 'name:ml': 'ചീന', 'name:mn': 'Хятад', 'name:mr': 'चीन', 'name:ms': 'China', + 'name:mt': 'Ċina', 'name:my': 'တရုတ်', 'name:na': 'Tsiene', 'name:nb': 'Kina', + 'name:ne': 'चीन', 'name:nl': 'China', 'name:nn': 'Kina', 'name:no': 'Kina', + 'name:nv': 'Tsiiʼyishbizhí Dineʼé Bikéyah', 'name:ny': 'China', + 'name:oc': 'China', 'name:om': 'Chaayinaan', 'name:or': 'ଚୀନ', 'name:os': + 'Китай', 'name:pa': 'ਚੀਨ', 'name:pl': 'Chiny', 'name:ps': 'چين', 'name:pt': + 'China', 'name:qu': 'Chunkuk', 'name:rm': 'China', 'name:rn': + 'Ubushinwa', 'name:ro': 'China', 'name:ru': 'Китай', 'name:rw': + 'Ubushinwa', 'name:sc': 'Cina', 'name:sd': 'چين', 'name:se': 'Kiinná', + 'name:sg': 'Sînä', 'name:sh': 'Kina', 'name:si': 'චීනය', 'name:sk': 'Čína', + 'name:sl': 'Kitájska', 'name:sm': 'Saina', 'name:sn': 'China', 'name:so': + 'Shiinaha', 'name:sq': 'Kina', 'name:sr': 'Кина', 'name:ss': 'iShayina', + 'name:st': 'Tjhaena', 'name:su': 'Tiongkok', 'name:sv': 'Kina', + 'name:sw': 'China', 'name:ta': 'சீனா', 'name:te': 'చైనా', 'name:tg': + 'Хито́й', 'name:th': 'ประเทศจีน', 'name:ti': 'የቻይና', 'name:tk': 'Hytaý', + 'name:tl': 'Tsina', 'name:tn': 'China', 'name:to': 'Siaina', 'name:tr': + 'Çin', 'name:ts': 'Chayina', 'name:tt': 'Кытай', 'name:tw': 'China', + 'name:ty': 
'Tinitō', 'name:ug': 'جۇڭخۇا خەلق جۇمھۇرىيىتى', 'name:uk': + 'Кита́йська', 'name:ur': 'چین', 'name:uz': 'Xitoy', 'name:ve': 'China', + 'name:vi': 'Trung Quốc', 'name:vo': 'Tsyinän', 'name:wo': 'Siin', + 'name:xh': 'IShayina', 'name:yi': 'כינע', 'name:yo': 'Ṣáínà', + 'name:za': 'Cunghgoz', 'name:zh': '中国', 'name:zu': 'IShayina'}, + fromarea=False, isaddress=True, rank_address=4, distance=0.0, place_id=None, + osm_object=None, extratags=None, admin_level=None, local_name=None) + ] + return result + + +@pytest.mark.parametrize("header,expected_output", [ + ("zh-Hans", "丹东市中医院, 锦山大街, 站前街道, 元宝区, 振兴区, 118000, 中国"), + (None, "Dan Dong Shi Zhong Yi Yuan, Jin Shan Da Jie, Zhan Qian Jie Dao," + " Yuan Bao Qu, Zhen Xing Qu, 118000, Zhong Guo"), + ("en", "Dan Dong Shi Zhong Yi Yuan, Jinshan Main Street, Zhanqian" + " Subdistrict, Yuanbao, Zhenxing, 118000, China"), + ("he", "Dan Dong Shi Zhong Yi Yuan, Jin Shan Da Jie, Zhan Qian Jie Dao," + " Yuan Bao Qu, Zhen Xing Qu, 118000, סין"), + ("ps", "Dan Dong Shi Zhong Yi Yuan, Jin Shan Da Jie, Zhan Qian" + " Jie Dao, Yuan Bao Qu, Zhen Xing Qu, 118000, چين"), + ("fr;q=0.8,en;q=0.2", "Dan Dong Shi Zhong Yi Yuan, Jinshan Main Street, Zhanqian Subdistrict," + " Yuanbao, Zhenxing, 118000, Chine"), + ("zh", "丹东市中医院, 锦山大街, 站前街道, 元宝区, 振兴区, 118000, 中国"), +]) +def test_transliterate_hospital(header, expected_output): + """Parameterized test for transliteration of hospitals in Dandong.""" + pytest.importorskip("unidecode", reason="The 'unidecode' library is" + "required for Latin transliteration.") + results = [hospital_result()] + if header: + langs = TransliterateLocales().from_accept_languages(header).languages + print(langs) + print(results[0].display_name) + TransliterateLocales(langs).localize_results(results) + else: + TransliterateLocales().localize_results(results) + + assert results[0].display_name == expected_output + + +# def test_transliterate(): +# """ Base Transliteration Test """ +# variable = 'school in dandong' +# results = 
asyncio.run(search(f"{variable}")) +# print(results) # not resulting:( pytest mocking for now, could maybe try a small import? +# # set locale name here first, will probably need to further integrate with display name +# SimpleLocales().localize_results(results) + +# output = TransliterateLocales().result_transliterate(results)[0] +# assert output == ( +# "Dan Dong Shi Di Liu Zhong Xue, Qi Wei Lu, Zhan Qian Jie Dao, " +# "Dan Dong Shi, Zhen Xing Qu, 118000, Zhong Guo" +# ) + + +# def test_transliterate_english(): +# """ Base Transliteration Test to English + +# Results should show that the result is transliterated to latin +# Except for components that have English locales already set +# """ +# variable = 'school in dandong' +# results = asyncio.run(search(f"{variable}")) +# output = TransliterateLocales(['en']).result_transliterate(results)[0] +# assert output == ( +# "Dan Dong Shi Di Liu Zhong Xue, Qi Wei Lu, Zhanqian Subdistrict, " +# "Dandong, Zhenxing, 118000, China" +# ) + + +# def test_parsing_transliterate(): +# """ Base HTML Header Parsing test + Transliteration +# to see if it can properly concatanate and +# extract the proper naming conventions + +# Checks if the prototype can differentiate between English Variants +# """ +# test_header = "en-CA,en-GB;q=0.9,en-US;q=0.8,en;q=0.7" +# variable = 'school in dandong' +# results = asyncio.run(search(f"{variable}")) +# output = TransliterateLocales(test_header).result_transliterate(results)[0] +# assert output == ( +# "Dan Dong Shi Di Liu Zhong Xue, Qi Wei Lu, Zhanqian Subdistrict," +# " Dandong, Zhenxing, 118000, China" +# ) + + +def test_canto_transliterate(): + """ Cantonese transliteration to Latin test + + Tests to see if transliteration can accurately convert to + Cantonese + """ + pytest.importorskip("cantoroman", reason="The 'cantonese-romanisation' library is " + "required for Cantonese transliteration.") + line = AddressLine(category=('place', 'city'), names={}, fromarea=True, + isaddress=True, 
rank_address=30, distance=0.0, local_name_lang='yue', + local_name='梁國雄') + assert TransliterateLocales().latin_transliterate(line) == "Leung Gwok Hung" + + +def test_load_languages(): + config = Configuration(None) + lang_info.setup_lang_config(config) + + # Access language data + for language_code, _ in lang_info.iterate(): + language = lang_info.get(language_code) + latin = (language['written'] == 'lat') + assert latin == language['latin']