Skip to content

Commit 448005f

Browse files
authored
feat: Split affinity value between SAME and MUTUALLY_INTELLIGIBLE (#22)
1 parent 746fdbe commit 448005f

10 files changed

Lines changed: 354 additions & 277 deletions

File tree

README.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,9 @@ We define the affinity between two locales using a `LocaleAffinity` enum value:
7373
- `LOW`: Locales are somewhat related, meaning they either have low similarities from a linguistic
7474
perspective or co-exist in given geopolitical or cultural contexts.
7575
- `HIGH`: Locales are quite related, meaning they have similarities from a linguistic perspective.
76-
- `SAME_OR_MUTUALLY_INTELLIGIBLE`: Locales either identify the same language, or languages that are
77-
similar to a point where a person should understand both if they understand one of them.
76+
- `MUTUALLY_INTELLIGIBLE`: Locales identify languages that are similar to a point where a person
77+
should understand both if they understand one of them.
78+
- `SAME`: Locales identify the same language.
7879

7980
We offer two separate logics, each dedicated to separate use-cases:
8081

examples/locales-affinity-examples/src/main/java/com/spotify/i18n/locales/affinity/examples/AffinityCalculationExampleMain.java

Lines changed: 20 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222

2323
import static com.spotify.i18n.locales.common.model.LocaleAffinity.LOW;
2424
import static com.spotify.i18n.locales.common.model.LocaleAffinity.NONE;
25-
import static com.spotify.i18n.locales.common.model.LocaleAffinity.SAME_OR_MUTUALLY_INTELLIGIBLE;
25+
import static com.spotify.i18n.locales.common.model.LocaleAffinity.SAME;
2626

2727
import com.ibm.icu.util.ULocale;
2828
import com.spotify.i18n.locales.common.LocaleAffinityCalculator;
@@ -120,18 +120,18 @@ private static Map<String, LocaleAffinity> getLanguageTagToExpectedAffinityMap()
120120
map.put("en-US", NONE);
121121

122122
// Spanish in Europe should be ok
123-
map.put("es-419", SAME_OR_MUTUALLY_INTELLIGIBLE);
124-
map.put("es-GB", SAME_OR_MUTUALLY_INTELLIGIBLE);
125-
map.put("es-US", SAME_OR_MUTUALLY_INTELLIGIBLE);
123+
map.put("es-419", SAME);
124+
map.put("es-GB", SAME);
125+
map.put("es-US", SAME);
126126

127127
// Basque should be matched, since we support Spanish
128128
map.put("eu", LOW);
129129

130130
// French
131-
map.put("fr", SAME_OR_MUTUALLY_INTELLIGIBLE);
132-
map.put("fr-BE", SAME_OR_MUTUALLY_INTELLIGIBLE);
133-
map.put("fr-CA", SAME_OR_MUTUALLY_INTELLIGIBLE);
134-
map.put("fr-FR", SAME_OR_MUTUALLY_INTELLIGIBLE);
131+
map.put("fr", SAME);
132+
map.put("fr-BE", SAME);
133+
map.put("fr-CA", SAME);
134+
map.put("fr-FR", SAME);
135135

136136
// Galician should be matched, since we support Spanish
137137
map.put("gl", LOW);
@@ -140,23 +140,23 @@ private static Map<String, LocaleAffinity> getLanguageTagToExpectedAffinityMap()
140140
map.put("hi", NONE);
141141

142142
// Croatian should be nicely matched with Bosnian
143-
map.put("hr-HR", SAME_OR_MUTUALLY_INTELLIGIBLE);
143+
map.put("hr-HR", SAME);
144144

145145
// Serbian Cyrillic should be matched, although only Latin script is supported
146-
map.put("sr", SAME_OR_MUTUALLY_INTELLIGIBLE);
147-
map.put("sr-Latn", SAME_OR_MUTUALLY_INTELLIGIBLE);
148-
map.put("sr-Cyrl-ME", SAME_OR_MUTUALLY_INTELLIGIBLE);
146+
map.put("sr", SAME);
147+
map.put("sr-Latn", SAME);
148+
map.put("sr-Cyrl-ME", SAME);
149149

150150
// Portuguese
151-
map.put("pt", SAME_OR_MUTUALLY_INTELLIGIBLE);
152-
map.put("pt-BR", SAME_OR_MUTUALLY_INTELLIGIBLE);
153-
map.put("pt-SE", SAME_OR_MUTUALLY_INTELLIGIBLE);
154-
map.put("pt-US", SAME_OR_MUTUALLY_INTELLIGIBLE);
151+
map.put("pt", SAME);
152+
map.put("pt-BR", SAME);
153+
map.put("pt-SE", SAME);
154+
map.put("pt-US", SAME);
155155

156156
// Only Traditional Chinese should be matched, not Simplified
157157
map.put("zh-CN", NONE);
158-
map.put("zh-TW", SAME_OR_MUTUALLY_INTELLIGIBLE);
159-
map.put("zh-HK", SAME_OR_MUTUALLY_INTELLIGIBLE);
158+
map.put("zh-TW", SAME);
159+
map.put("zh-HK", SAME);
160160
return map;
161161
}
162162

@@ -168,13 +168,9 @@ public static void main(String[] args) {
168168
System.out.println("========================================");
169169
System.out.println(
170170
String.format(
171-
"Example 1: List of language tags with calculated affinity = %s",
172-
SAME_OR_MUTUALLY_INTELLIGIBLE.name()));
171+
"Example 1: List of language tags with calculated affinity = %s", SAME.name()));
173172
getLanguageTagToExpectedAffinityMap().keySet().stream()
174-
.filter(
175-
languageTag ->
176-
affinityCalculator.calculate(languageTag).affinity()
177-
== SAME_OR_MUTUALLY_INTELLIGIBLE)
173+
.filter(languageTag -> affinityCalculator.calculate(languageTag).affinity() == SAME)
178174
.forEach(System.out::println);
179175

180176
// Example 2: Check that calculated affinity for each language tag matches the expected value.

examples/locales-affinity-examples/src/main/java/com/spotify/i18n/locales/affinity/examples/ReferenceLocalesBasedJoinExampleMain.java

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -45,17 +45,15 @@ public class ReferenceLocalesBasedJoinExampleMain {
4545
* <p>Possible joins in the execution output are:
4646
*
4747
* <ul>
48-
* <li>(de, de-AT) on reference locale [de-AT] with SAME_OR_MUTUALLY_INTELLIGIBLE affinity
49-
* <li>(en-GB, en-JP) on reference locale [en-GB] with SAME_OR_MUTUALLY_INTELLIGIBLE affinity
50-
* <li>(en-GB, en-SE) on reference locale [en-SE] with SAME_OR_MUTUALLY_INTELLIGIBLE affinity
48+
* <li>(de, de-AT) on reference locale [de-AT] with SAME affinity
49+
* <li>(en-GB, en-JP) on reference locale [en-GB] with SAME affinity
50+
* <li>(en-GB, en-SE) on reference locale [en-SE] with SAME affinity
5151
* <li>(es-BE, ca) on reference locale [ca] with LOW affinity
52-
* <li>(fr-SE, fr-BE-u-ca-gregorian) on reference locale [fr-BE] with
53-
* SAME_OR_MUTUALLY_INTELLIGIBLE affinity
54-
* <li>(fr-SE, fr-CA) on reference locale [fr-CA] with SAME_OR_MUTUALLY_INTELLIGIBLE affinity
55-
* <li>(ja-IT, ja@calendar=buddhist) on reference locale [ja] with SAME_OR_MUTUALLY_INTELLIGIBLE
56-
* affinity
57-
* <li>(nl-BE, nl-ZA) on reference locale [nl] with SAME_OR_MUTUALLY_INTELLIGIBLE affinity
58-
* <li>(zh-Hans-US, zh-CN) on reference locale [zh] with SAME_OR_MUTUALLY_INTELLIGIBLE affinity
52+
* <li>(fr-SE, fr-BE-u-ca-gregorian) on reference locale [fr-BE] with SAME affinity
53+
* <li>(fr-SE, fr-CA) on reference locale [fr-CA] with SAME affinity
54+
* <li>(ja-IT, ja@calendar=buddhist) on reference locale [ja] with SAME affinity
55+
* <li>(nl-BE, nl-ZA) on reference locale [nl] with SAME affinity
56+
* <li>(zh-Hans-US, zh-CN) on reference locale [zh] with SAME affinity
5957
* </ul>
6058
*
6159
* @param args

locales-common/src/main/java/com/spotify/i18n/locales/common/impl/LocaleAffinityCalculatorBaseImpl.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ public abstract class LocaleAffinityCalculatorBaseImpl implements LocaleAffinity
7575
private static final double DISTANCE_THRESHOLD = 224.0;
7676

7777
// Score to affinity thresholds
78-
private static final int SCORE_THRESHOLD_SAME_OR_MUTUALLY_INTELLIGIBLE = 65;
78+
private static final int SCORE_THRESHOLD_MUTUALLY_INTELLIGIBLE = 65;
7979
private static final int SCORE_THRESHOLD_HIGH = 30;
8080
private static final int SCORE_THRESHOLD_LOW = 0;
8181

@@ -103,14 +103,14 @@ private LocaleAffinity getAffinity(@Nullable final String languageTag) {
103103
// We attempt to match based on corresponding spoken language first, and make use of the
104104
// score-based affinity calculation as fallback.
105105
if (hasSameSpokenLanguageAffinity(languageTag)) {
106-
return LocaleAffinity.SAME_OR_MUTUALLY_INTELLIGIBLE;
106+
return LocaleAffinity.SAME;
107107
} else {
108108
return calculateScoreBasedAffinity(languageTag);
109109
}
110110
}
111111
}
112112

113-
private boolean hasSameSpokenLanguageAffinity(final String languageTag) {
113+
private boolean hasSameSpokenLanguageAffinity(@Nullable final String languageTag) {
114114
return LanguageUtils.getSpokenLanguageLocale(languageTag)
115115
.map(
116116
spokenLanguageLocale ->
@@ -155,8 +155,8 @@ private int convertDistanceToAffinityScore(final int distance) {
155155
}
156156

157157
private LocaleAffinity convertScoreToLocaleAffinity(final int score) {
158-
if (score > SCORE_THRESHOLD_SAME_OR_MUTUALLY_INTELLIGIBLE) {
159-
return LocaleAffinity.SAME_OR_MUTUALLY_INTELLIGIBLE;
158+
if (score > SCORE_THRESHOLD_MUTUALLY_INTELLIGIBLE) {
159+
return LocaleAffinity.MUTUALLY_INTELLIGIBLE;
160160
} else if (score > SCORE_THRESHOLD_HIGH) {
161161
return LocaleAffinity.HIGH;
162162
} else if (score > SCORE_THRESHOLD_LOW) {

locales-common/src/main/java/com/spotify/i18n/locales/common/model/LocaleAffinity.java

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,11 @@ public enum LocaleAffinity {
4040
HIGH,
4141

4242
/**
43-
* Locales either identify the same language, or languages that are similar to a point where a
44-
* person should understand both if they understand one of them.
43+
* Locales identify languages that are similar to a point where a person should understand both if
44+
* they understand one of them.
4545
*/
46-
SAME_OR_MUTUALLY_INTELLIGIBLE
46+
MUTUALLY_INTELLIGIBLE,
47+
48+
/** Locales identify the same language. */
49+
SAME
4750
}

locales-common/src/test/java/com/spotify/i18n/locales/common/LocaleAffinityHelpersFactoryTest.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -188,15 +188,15 @@ void whenBuildingRelatedReferenceLocalesCalculator_returnsExpectedCalculator() {
188188
whenJoiningDatasetsUsingReferenceLocalesCalculator_joinsBasedOnExpectedRelatedReferenceLocale() {
189189
return Stream.of(
190190
// Chinese (Hong-Kong), Chinese (Traditional) -> Chinese (Taiwan)
191-
Arguments.of("zh-HK", "zh-Hant", "zh-TW", LocaleAffinity.SAME_OR_MUTUALLY_INTELLIGIBLE),
191+
Arguments.of("zh-HK", "zh-Hant", "zh-TW", LocaleAffinity.SAME),
192192

193193
// Chinese (Hong-Kong), Cantonese (Hong-Kong) -> Cantonese
194194
Arguments.of("zh-HK", "yue-HK", "yue", LocaleAffinity.HIGH),
195195

196196
// Dutch (Belgium), Dutch (Netherlands) -> Dutch
197-
Arguments.of("nl-BE", "nl-NL", "nl", LocaleAffinity.SAME_OR_MUTUALLY_INTELLIGIBLE),
197+
Arguments.of("nl-BE", "nl-NL", "nl", LocaleAffinity.SAME),
198198

199199
// French (Switzerland), French (Canada) -> French
200-
Arguments.of("fr-CH", "fr-CA", "fr-CA", LocaleAffinity.SAME_OR_MUTUALLY_INTELLIGIBLE));
200+
Arguments.of("fr-CH", "fr-CA", "fr-CA", LocaleAffinity.SAME));
201201
}
202202
}

locales-common/src/test/java/com/spotify/i18n/locales/common/impl/LocaleAffinityCalculatorBaseImplTest.java

Lines changed: 19 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,9 @@
2121
package com.spotify.i18n.locales.common.impl;
2222

2323
import static com.spotify.i18n.locales.common.model.LocaleAffinity.LOW;
24+
import static com.spotify.i18n.locales.common.model.LocaleAffinity.MUTUALLY_INTELLIGIBLE;
2425
import static com.spotify.i18n.locales.common.model.LocaleAffinity.NONE;
25-
import static com.spotify.i18n.locales.common.model.LocaleAffinity.SAME_OR_MUTUALLY_INTELLIGIBLE;
26+
import static com.spotify.i18n.locales.common.model.LocaleAffinity.SAME;
2627
import static org.hamcrest.MatcherAssert.assertThat;
2728
import static org.hamcrest.Matchers.is;
2829
import static org.junit.jupiter.api.Assertions.*;
@@ -112,18 +113,18 @@ public static Stream<Arguments> whenCalculating_returnsExpectedAffinity() {
112113
Arguments.of("en-US", NONE),
113114

114115
// Spanish in Europe should be ranked higher
115-
Arguments.of("es-419", SAME_OR_MUTUALLY_INTELLIGIBLE),
116-
Arguments.of("es-GB", SAME_OR_MUTUALLY_INTELLIGIBLE),
117-
Arguments.of("es-US", SAME_OR_MUTUALLY_INTELLIGIBLE),
116+
Arguments.of("es-419", SAME),
117+
Arguments.of("es-GB", SAME),
118+
Arguments.of("es-US", SAME),
118119

119120
// Basque should be matched, since we support Spanish
120121
Arguments.of("eu", LOW),
121122

122123
// French
123-
Arguments.of("fr", SAME_OR_MUTUALLY_INTELLIGIBLE),
124-
Arguments.of("fr-BE", SAME_OR_MUTUALLY_INTELLIGIBLE),
125-
Arguments.of("fr-CA", SAME_OR_MUTUALLY_INTELLIGIBLE),
126-
Arguments.of("fr-FR", SAME_OR_MUTUALLY_INTELLIGIBLE),
124+
Arguments.of("fr", SAME),
125+
Arguments.of("fr-BE", SAME),
126+
Arguments.of("fr-CA", SAME),
127+
Arguments.of("fr-FR", SAME),
127128

128129
// Galician should be matched, since we support Spanish
129130
Arguments.of("gl", LOW),
@@ -132,23 +133,23 @@ public static Stream<Arguments> whenCalculating_returnsExpectedAffinity() {
132133
Arguments.of("hi", NONE),
133134

134135
// Croatian should be nicely matched with Bosnian
135-
Arguments.of("hr-HR", SAME_OR_MUTUALLY_INTELLIGIBLE),
136+
Arguments.of("hr-HR", MUTUALLY_INTELLIGIBLE),
136137

137138
// Serbian Cyrillic should be matched, although only Latin script is supported
138-
Arguments.of("sr", SAME_OR_MUTUALLY_INTELLIGIBLE),
139-
Arguments.of("sr-Latn", SAME_OR_MUTUALLY_INTELLIGIBLE),
140-
Arguments.of("sr-Cyrl-ME", SAME_OR_MUTUALLY_INTELLIGIBLE),
139+
Arguments.of("sr", SAME),
140+
Arguments.of("sr-Latn", SAME),
141+
Arguments.of("sr-Cyrl-ME", SAME),
141142

142143
// Portuguese
143-
Arguments.of("pt", SAME_OR_MUTUALLY_INTELLIGIBLE),
144-
Arguments.of("pt-BR", SAME_OR_MUTUALLY_INTELLIGIBLE),
145-
Arguments.of("pt-SE", SAME_OR_MUTUALLY_INTELLIGIBLE),
146-
Arguments.of("pt-US", SAME_OR_MUTUALLY_INTELLIGIBLE),
144+
Arguments.of("pt", SAME),
145+
Arguments.of("pt-BR", SAME),
146+
Arguments.of("pt-SE", SAME),
147+
Arguments.of("pt-US", SAME),
147148

148149
// Only Traditional Chinese should be matched, not Simplified
149150
Arguments.of("zh-CN", NONE),
150-
Arguments.of("zh-TW", SAME_OR_MUTUALLY_INTELLIGIBLE),
151-
Arguments.of("zh-HK", SAME_OR_MUTUALLY_INTELLIGIBLE));
151+
Arguments.of("zh-TW", SAME),
152+
Arguments.of("zh-HK", SAME));
152153
}
153154

154155
@Test

0 commit comments

Comments
 (0)