Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ We define the affinity between two locales using a `LocaleAffinity` enum value:
- `LOW`: Locales are somewhat related, meaning they either have low similarities from a linguistic
perspective or co-exist in given geopolitical or cultural contexts.
- `HIGH`: Locales are quite related, meaning they have similarities from a linguistic perspective.
- `SAME_OR_INTERCHANGEABLE`: Locales either identify the same language, or languages that are
- `SAME_OR_MUTUALLY_INTELLIGIBLE`: Locales either identify the same language, or languages that are
similar to a point where a person should understand both if they understand one of them.

We offer two separate logics, each dedicated to separate use-cases:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

import static com.spotify.i18n.locales.common.model.LocaleAffinity.LOW;
import static com.spotify.i18n.locales.common.model.LocaleAffinity.NONE;
import static com.spotify.i18n.locales.common.model.LocaleAffinity.SAME_OR_INTERCHANGEABLE;
import static com.spotify.i18n.locales.common.model.LocaleAffinity.SAME_OR_MUTUALLY_INTELLIGIBLE;

import com.ibm.icu.util.ULocale;
import com.spotify.i18n.locales.common.LocaleAffinityCalculator;
Expand Down Expand Up @@ -120,18 +120,18 @@ private static Map<String, LocaleAffinity> getLanguageTagToExpectedAffinityMap()
map.put("en-US", NONE);

// Spanish in Europe should be ok
map.put("es-419", SAME_OR_INTERCHANGEABLE);
map.put("es-GB", SAME_OR_INTERCHANGEABLE);
map.put("es-US", SAME_OR_INTERCHANGEABLE);
map.put("es-419", SAME_OR_MUTUALLY_INTELLIGIBLE);
map.put("es-GB", SAME_OR_MUTUALLY_INTELLIGIBLE);
map.put("es-US", SAME_OR_MUTUALLY_INTELLIGIBLE);

// Basque should be matched, since we support Spanish
map.put("eu", LOW);

// French
map.put("fr", SAME_OR_INTERCHANGEABLE);
map.put("fr-BE", SAME_OR_INTERCHANGEABLE);
map.put("fr-CA", SAME_OR_INTERCHANGEABLE);
map.put("fr-FR", SAME_OR_INTERCHANGEABLE);
map.put("fr", SAME_OR_MUTUALLY_INTELLIGIBLE);
map.put("fr-BE", SAME_OR_MUTUALLY_INTELLIGIBLE);
map.put("fr-CA", SAME_OR_MUTUALLY_INTELLIGIBLE);
map.put("fr-FR", SAME_OR_MUTUALLY_INTELLIGIBLE);

// Galician should be matched, since we support Spanish
map.put("gl", LOW);
Expand All @@ -140,41 +140,41 @@ private static Map<String, LocaleAffinity> getLanguageTagToExpectedAffinityMap()
map.put("hi", NONE);

// Croatian should be nicely matched with Bosnian
map.put("hr-HR", SAME_OR_INTERCHANGEABLE);
map.put("hr-HR", SAME_OR_MUTUALLY_INTELLIGIBLE);

// Serbian Cyrillic should be matched, although only Latin script is supported
map.put("sr", SAME_OR_INTERCHANGEABLE);
map.put("sr-Latn", SAME_OR_INTERCHANGEABLE);
map.put("sr-Cyrl-ME", SAME_OR_INTERCHANGEABLE);
map.put("sr", SAME_OR_MUTUALLY_INTELLIGIBLE);
map.put("sr-Latn", SAME_OR_MUTUALLY_INTELLIGIBLE);
map.put("sr-Cyrl-ME", SAME_OR_MUTUALLY_INTELLIGIBLE);

// Portuguese
map.put("pt", SAME_OR_INTERCHANGEABLE);
map.put("pt-BR", SAME_OR_INTERCHANGEABLE);
map.put("pt-SE", SAME_OR_INTERCHANGEABLE);
map.put("pt-US", SAME_OR_INTERCHANGEABLE);
map.put("pt", SAME_OR_MUTUALLY_INTELLIGIBLE);
map.put("pt-BR", SAME_OR_MUTUALLY_INTELLIGIBLE);
map.put("pt-SE", SAME_OR_MUTUALLY_INTELLIGIBLE);
map.put("pt-US", SAME_OR_MUTUALLY_INTELLIGIBLE);

// Only Traditional Chinese should be matched, not Simplified
map.put("zh-CN", NONE);
map.put("zh-TW", SAME_OR_INTERCHANGEABLE);
map.put("zh-HK", SAME_OR_INTERCHANGEABLE);
map.put("zh-TW", SAME_OR_MUTUALLY_INTELLIGIBLE);
map.put("zh-HK", SAME_OR_MUTUALLY_INTELLIGIBLE);
return map;
}

public static void main(String[] args) {
final LocaleAffinityCalculator affinityCalculator = getLocaleAffinityCalculator();

// Example 1: Filter the list of test language tags, and only retain the ones that result in a
// locale affinity at the level of SAME_OR_INTERCHANGEABLE, therefore guaranteeing that the
// associated
// locale affinity at the level of SAME_OR_MUTUALLY_INTELLIGIBLE
System.out.println("========================================");
System.out.println(
String.format(
"Example 1: List of language tags with calculated affinity = %s",
SAME_OR_INTERCHANGEABLE.name()));
SAME_OR_MUTUALLY_INTELLIGIBLE.name()));
getLanguageTagToExpectedAffinityMap().keySet().stream()
.filter(
languageTag ->
affinityCalculator.calculate(languageTag).affinity() == SAME_OR_INTERCHANGEABLE)
affinityCalculator.calculate(languageTag).affinity()
== SAME_OR_MUTUALLY_INTELLIGIBLE)
.forEach(System.out::println);

// Example 2: Check that calculated affinity for each language tag matches the expected value.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,17 +45,17 @@ public class ReferenceLocalesBasedJoinExampleMain {
* <p>Possible joins in the execution output are:
*
* <ul>
* <li>(de, de-AT), join on reference locale [de-AT] with SAME_OR_INTERCHANGEABLE affinity
* <li>(en-GB, en-JP), join on reference locale [en-GB] with SAME_OR_INTERCHANGEABLE affinity
* <li>(en-GB, en-SE), join on reference locale [en-SE] with SAME_OR_INTERCHANGEABLE affinity
* <li>(es-BE, ca), join on reference locale [ca] with LOW affinity
* <li>(fr-SE, fr-BE-u-ca-gregorian), join on reference locale [fr-BE] with
* SAME_OR_INTERCHANGEABLE affinity
* <li>(fr-SE, fr-CA), join on reference locale [fr-CA] with SAME_OR_INTERCHANGEABLE affinity
* <li>(ja-IT, ja@calendar=buddhist), join on reference locale [ja] with SAME_OR_INTERCHANGEABLE
* <li>(de, de-AT) on reference locale [de-AT] with SAME_OR_MUTUALLY_INTELLIGIBLE affinity
* <li>(en-GB, en-JP) on reference locale [en-GB] with SAME_OR_MUTUALLY_INTELLIGIBLE affinity
* <li>(en-GB, en-SE) on reference locale [en-SE] with SAME_OR_MUTUALLY_INTELLIGIBLE affinity
* <li>(es-BE, ca) on reference locale [ca] with LOW affinity
* <li>(fr-SE, fr-BE-u-ca-gregorian) on reference locale [fr-BE] with
* SAME_OR_MUTUALLY_INTELLIGIBLE affinity
* <li>(fr-SE, fr-CA) on reference locale [fr-CA] with SAME_OR_MUTUALLY_INTELLIGIBLE affinity
* <li>(ja-IT, ja@calendar=buddhist) on reference locale [ja] with SAME_OR_MUTUALLY_INTELLIGIBLE
* affinity
* <li>(nl-BE, nl-ZA), join on reference locale [nl] with SAME_OR_INTERCHANGEABLE affinity
* <li>(zh-Hans-US, zh-CN), join on reference locale [zh] with SAME_OR_INTERCHANGEABLE affinity
* <li>(nl-BE, nl-ZA) on reference locale [nl] with SAME_OR_MUTUALLY_INTELLIGIBLE affinity
* <li>(zh-Hans-US, zh-CN) on reference locale [zh] with SAME_OR_MUTUALLY_INTELLIGIBLE affinity
* </ul>
*
* @param args
Expand Down Expand Up @@ -97,7 +97,7 @@ public static void main(String[] args) {
(rrl) ->
System.out.println(
String.format(
"(%s, %s), join on reference locale [%s] with %s affinity",
"(%s, %s) on reference locale [%s] with %s affinity",
languageTagInOriginDataset,
languageTagInTargetDataset,
rrl.referenceLocale().toLanguageTag(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ public abstract class LocaleAffinityCalculatorBaseImpl implements LocaleAffinity
private static final double DISTANCE_THRESHOLD = 224.0;

// Score to affinity thresholds
private static final int SCORE_THRESHOLD_SAME_OR_INTERCHANGEABLE = 65;
private static final int SCORE_THRESHOLD_SAME_OR_MUTUALLY_INTELLIGIBLE = 65;
private static final int SCORE_THRESHOLD_HIGH = 30;
private static final int SCORE_THRESHOLD_LOW = 0;

Expand Down Expand Up @@ -103,7 +103,7 @@ private LocaleAffinity getAffinity(@Nullable final String languageTag) {
// We attempt to match based on corresponding spoken language first, and make use of the
// score-based affinity calculation as fallback.
if (hasSameSpokenLanguageAffinity(languageTag)) {
return LocaleAffinity.SAME_OR_INTERCHANGEABLE;
return LocaleAffinity.SAME_OR_MUTUALLY_INTELLIGIBLE;
} else {
return calculateScoreBasedAffinity(languageTag);
}
Expand Down Expand Up @@ -155,8 +155,8 @@ private int convertDistanceToAffinityScore(final int distance) {
}

private LocaleAffinity convertScoreToLocaleAffinity(final int score) {
if (score > SCORE_THRESHOLD_SAME_OR_INTERCHANGEABLE) {
return LocaleAffinity.SAME_OR_INTERCHANGEABLE;
if (score > SCORE_THRESHOLD_SAME_OR_MUTUALLY_INTELLIGIBLE) {
return LocaleAffinity.SAME_OR_MUTUALLY_INTELLIGIBLE;
} else if (score > SCORE_THRESHOLD_HIGH) {
return LocaleAffinity.HIGH;
} else if (score > SCORE_THRESHOLD_LOW) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,5 +43,5 @@ public enum LocaleAffinity {
* Locales either identify the same language, or languages that are similar to a point where a
* person should understand both if they understand one of them.
*/
SAME_OR_INTERCHANGEABLE
SAME_OR_MUTUALLY_INTELLIGIBLE
}
Original file line number Diff line number Diff line change
Expand Up @@ -188,15 +188,15 @@ void whenBuildingRelatedReferenceLocalesCalculator_returnsExpectedCalculator() {
whenJoiningDatasetsUsingReferenceLocalesCalculator_joinsBasedOnExpectedRelatedReferenceLocale() {
return Stream.of(
// Chinese (Hong-Kong), Chinese (Traditional) -> Chinese (Taiwan)
Arguments.of("zh-HK", "zh-Hant", "zh-TW", LocaleAffinity.SAME_OR_INTERCHANGEABLE),
Arguments.of("zh-HK", "zh-Hant", "zh-TW", LocaleAffinity.SAME_OR_MUTUALLY_INTELLIGIBLE),

// Chinese (Hong-Kong), Cantonese (Hong-Kong) -> Cantonese
Arguments.of("zh-HK", "yue-HK", "yue", LocaleAffinity.HIGH),

// Dutch (Belgium), Dutch (Netherlands) -> Dutch
Arguments.of("nl-BE", "nl-NL", "nl", LocaleAffinity.SAME_OR_INTERCHANGEABLE),
Arguments.of("nl-BE", "nl-NL", "nl", LocaleAffinity.SAME_OR_MUTUALLY_INTELLIGIBLE),

// French (Switzerland), French (Canada) -> French
Arguments.of("fr-CH", "fr-CA", "fr-CA", LocaleAffinity.SAME_OR_INTERCHANGEABLE));
Arguments.of("fr-CH", "fr-CA", "fr-CA", LocaleAffinity.SAME_OR_MUTUALLY_INTELLIGIBLE));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

import static com.spotify.i18n.locales.common.model.LocaleAffinity.LOW;
import static com.spotify.i18n.locales.common.model.LocaleAffinity.NONE;
import static com.spotify.i18n.locales.common.model.LocaleAffinity.SAME_OR_INTERCHANGEABLE;
import static com.spotify.i18n.locales.common.model.LocaleAffinity.SAME_OR_MUTUALLY_INTELLIGIBLE;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.is;
import static org.junit.jupiter.api.Assertions.*;
Expand Down Expand Up @@ -112,18 +112,18 @@ public static Stream<Arguments> whenCalculating_returnsExpectedAffinity() {
Arguments.of("en-US", NONE),

// Spanish in Europe should be ranked higher
Arguments.of("es-419", SAME_OR_INTERCHANGEABLE),
Arguments.of("es-GB", SAME_OR_INTERCHANGEABLE),
Arguments.of("es-US", SAME_OR_INTERCHANGEABLE),
Arguments.of("es-419", SAME_OR_MUTUALLY_INTELLIGIBLE),
Arguments.of("es-GB", SAME_OR_MUTUALLY_INTELLIGIBLE),
Arguments.of("es-US", SAME_OR_MUTUALLY_INTELLIGIBLE),

// Basque should be matched, since we support Spanish
Arguments.of("eu", LOW),

// French
Arguments.of("fr", SAME_OR_INTERCHANGEABLE),
Arguments.of("fr-BE", SAME_OR_INTERCHANGEABLE),
Arguments.of("fr-CA", SAME_OR_INTERCHANGEABLE),
Arguments.of("fr-FR", SAME_OR_INTERCHANGEABLE),
Arguments.of("fr", SAME_OR_MUTUALLY_INTELLIGIBLE),
Arguments.of("fr-BE", SAME_OR_MUTUALLY_INTELLIGIBLE),
Arguments.of("fr-CA", SAME_OR_MUTUALLY_INTELLIGIBLE),
Arguments.of("fr-FR", SAME_OR_MUTUALLY_INTELLIGIBLE),

// Galician should be matched, since we support Spanish
Arguments.of("gl", LOW),
Expand All @@ -132,23 +132,23 @@ public static Stream<Arguments> whenCalculating_returnsExpectedAffinity() {
Arguments.of("hi", NONE),

// Croatian should be nicely matched with Bosnian
Arguments.of("hr-HR", SAME_OR_INTERCHANGEABLE),
Arguments.of("hr-HR", SAME_OR_MUTUALLY_INTELLIGIBLE),

// Serbian Cyrillic should be matched, although only Latin script is supported
Arguments.of("sr", SAME_OR_INTERCHANGEABLE),
Arguments.of("sr-Latn", SAME_OR_INTERCHANGEABLE),
Arguments.of("sr-Cyrl-ME", SAME_OR_INTERCHANGEABLE),
Arguments.of("sr", SAME_OR_MUTUALLY_INTELLIGIBLE),
Arguments.of("sr-Latn", SAME_OR_MUTUALLY_INTELLIGIBLE),
Arguments.of("sr-Cyrl-ME", SAME_OR_MUTUALLY_INTELLIGIBLE),

// Portuguese
Arguments.of("pt", SAME_OR_INTERCHANGEABLE),
Arguments.of("pt-BR", SAME_OR_INTERCHANGEABLE),
Arguments.of("pt-SE", SAME_OR_INTERCHANGEABLE),
Arguments.of("pt-US", SAME_OR_INTERCHANGEABLE),
Arguments.of("pt", SAME_OR_MUTUALLY_INTELLIGIBLE),
Arguments.of("pt-BR", SAME_OR_MUTUALLY_INTELLIGIBLE),
Arguments.of("pt-SE", SAME_OR_MUTUALLY_INTELLIGIBLE),
Arguments.of("pt-US", SAME_OR_MUTUALLY_INTELLIGIBLE),

// Only Traditional Chinese should be matched, not Simplified
Arguments.of("zh-CN", NONE),
Arguments.of("zh-TW", SAME_OR_INTERCHANGEABLE),
Arguments.of("zh-HK", SAME_OR_INTERCHANGEABLE));
Arguments.of("zh-TW", SAME_OR_MUTUALLY_INTELLIGIBLE),
Arguments.of("zh-HK", SAME_OR_MUTUALLY_INTELLIGIBLE));
}

@Test
Expand Down
Loading