| /* |
| * Copyright (C) 2011 Google Inc. All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are |
| * met: |
| * |
| * * Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * * Redistributions in binary form must reproduce the above |
| * copyright notice, this list of conditions and the following disclaimer |
| * in the documentation and/or other materials provided with the |
| * distribution. |
| * * Neither the name of Google Inc. nor the names of its |
| * contributors may be used to endorse or promote products derived from |
| * this software without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #include "config.h" |
| #include "LocaleToScriptMapping.h" |
| |
| #include <wtf/HashMap.h> |
| #include <wtf/NeverDestroyed.h> |
| #include <wtf/text/StringHash.h> |
| |
| namespace WebCore { |
| |
| struct ScriptNameCode { |
| ASCIILiteral name; |
| UScriptCode code; |
| }; |
| |
| // This generally maps an ISO 15924 script code to its UScriptCode, but certain families of script codes are |
| // treated as a single script for assigning a per-script font in Settings. For example, "hira" is mapped to |
| // USCRIPT_KATAKANA_OR_HIRAGANA instead of USCRIPT_HIRAGANA, since we want all Japanese scripts to be rendered |
| // using the same font setting. |
| static const ScriptNameCode scriptNameCodeList[] = { |
| { "zyyy"_s, USCRIPT_COMMON }, |
| { "qaai"_s, USCRIPT_INHERITED }, |
| { "arab"_s, USCRIPT_ARABIC }, |
| { "armn"_s, USCRIPT_ARMENIAN }, |
| { "beng"_s, USCRIPT_BENGALI }, |
| { "bopo"_s, USCRIPT_BOPOMOFO }, |
| { "cher"_s, USCRIPT_CHEROKEE }, |
| { "copt"_s, USCRIPT_COPTIC }, |
| { "cyrl"_s, USCRIPT_CYRILLIC }, |
| { "dsrt"_s, USCRIPT_DESERET }, |
| { "deva"_s, USCRIPT_DEVANAGARI }, |
| { "ethi"_s, USCRIPT_ETHIOPIC }, |
| { "geor"_s, USCRIPT_GEORGIAN }, |
| { "goth"_s, USCRIPT_GOTHIC }, |
| { "grek"_s, USCRIPT_GREEK }, |
| { "gujr"_s, USCRIPT_GUJARATI }, |
| { "guru"_s, USCRIPT_GURMUKHI }, |
| { "hani"_s, USCRIPT_HAN }, |
| { "hang"_s, USCRIPT_HANGUL }, |
| { "hebr"_s, USCRIPT_HEBREW }, |
| { "hira"_s, USCRIPT_KATAKANA_OR_HIRAGANA }, |
| { "knda"_s, USCRIPT_KANNADA }, |
| { "kana"_s, USCRIPT_KATAKANA_OR_HIRAGANA }, |
| { "khmr"_s, USCRIPT_KHMER }, |
| { "laoo"_s, USCRIPT_LAO }, |
| { "latn"_s, USCRIPT_LATIN }, |
| { "mlym"_s, USCRIPT_MALAYALAM }, |
| { "mong"_s, USCRIPT_MONGOLIAN }, |
| { "mymr"_s, USCRIPT_MYANMAR }, |
| { "ogam"_s, USCRIPT_OGHAM }, |
| { "ital"_s, USCRIPT_OLD_ITALIC }, |
| { "orya"_s, USCRIPT_ORIYA }, |
| { "runr"_s, USCRIPT_RUNIC }, |
| { "sinh"_s, USCRIPT_SINHALA }, |
| { "syrc"_s, USCRIPT_SYRIAC }, |
| { "taml"_s, USCRIPT_TAMIL }, |
| { "telu"_s, USCRIPT_TELUGU }, |
| { "thaa"_s, USCRIPT_THAANA }, |
| { "thai"_s, USCRIPT_THAI }, |
| { "tibt"_s, USCRIPT_TIBETAN }, |
| { "cans"_s, USCRIPT_CANADIAN_ABORIGINAL }, |
| { "yiii"_s, USCRIPT_YI }, |
| { "tglg"_s, USCRIPT_TAGALOG }, |
| { "hano"_s, USCRIPT_HANUNOO }, |
| { "buhd"_s, USCRIPT_BUHID }, |
| { "tagb"_s, USCRIPT_TAGBANWA }, |
| { "brai"_s, USCRIPT_BRAILLE }, |
| { "cprt"_s, USCRIPT_CYPRIOT }, |
| { "limb"_s, USCRIPT_LIMBU }, |
| { "linb"_s, USCRIPT_LINEAR_B }, |
| { "osma"_s, USCRIPT_OSMANYA }, |
| { "shaw"_s, USCRIPT_SHAVIAN }, |
| { "tale"_s, USCRIPT_TAI_LE }, |
| { "ugar"_s, USCRIPT_UGARITIC }, |
| { "hrkt"_s, USCRIPT_KATAKANA_OR_HIRAGANA }, |
| { "bugi"_s, USCRIPT_BUGINESE }, |
| { "glag"_s, USCRIPT_GLAGOLITIC }, |
| { "khar"_s, USCRIPT_KHAROSHTHI }, |
| { "sylo"_s, USCRIPT_SYLOTI_NAGRI }, |
| { "talu"_s, USCRIPT_NEW_TAI_LUE }, |
| { "tfng"_s, USCRIPT_TIFINAGH }, |
| { "xpeo"_s, USCRIPT_OLD_PERSIAN }, |
| { "bali"_s, USCRIPT_BALINESE }, |
| { "batk"_s, USCRIPT_BATAK }, |
| { "blis"_s, USCRIPT_BLISSYMBOLS }, |
| { "brah"_s, USCRIPT_BRAHMI }, |
| { "cham"_s, USCRIPT_CHAM }, |
| { "cirt"_s, USCRIPT_CIRTH }, |
| { "cyrs"_s, USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC }, |
| { "egyd"_s, USCRIPT_DEMOTIC_EGYPTIAN }, |
| { "egyh"_s, USCRIPT_HIERATIC_EGYPTIAN }, |
| { "egyp"_s, USCRIPT_EGYPTIAN_HIEROGLYPHS }, |
| { "geok"_s, USCRIPT_KHUTSURI }, |
| { "hans"_s, USCRIPT_SIMPLIFIED_HAN }, |
| { "hant"_s, USCRIPT_TRADITIONAL_HAN }, |
| { "hmng"_s, USCRIPT_PAHAWH_HMONG }, |
| { "hung"_s, USCRIPT_OLD_HUNGARIAN }, |
| { "inds"_s, USCRIPT_HARAPPAN_INDUS }, |
| { "java"_s, USCRIPT_JAVANESE }, |
| { "kali"_s, USCRIPT_KAYAH_LI }, |
| { "latf"_s, USCRIPT_LATIN_FRAKTUR }, |
| { "latg"_s, USCRIPT_LATIN_GAELIC }, |
| { "lepc"_s, USCRIPT_LEPCHA }, |
| { "lina"_s, USCRIPT_LINEAR_A }, |
| { "mand"_s, USCRIPT_MANDAEAN }, |
| { "maya"_s, USCRIPT_MAYAN_HIEROGLYPHS }, |
| { "mero"_s, USCRIPT_MEROITIC }, |
| { "nkoo"_s, USCRIPT_NKO }, |
| { "orkh"_s, USCRIPT_ORKHON }, |
| { "perm"_s, USCRIPT_OLD_PERMIC }, |
| { "phag"_s, USCRIPT_PHAGS_PA }, |
| { "phnx"_s, USCRIPT_PHOENICIAN }, |
| { "plrd"_s, USCRIPT_PHONETIC_POLLARD }, |
| { "roro"_s, USCRIPT_RONGORONGO }, |
| { "sara"_s, USCRIPT_SARATI }, |
| { "syre"_s, USCRIPT_ESTRANGELO_SYRIAC }, |
| { "syrj"_s, USCRIPT_WESTERN_SYRIAC }, |
| { "syrn"_s, USCRIPT_EASTERN_SYRIAC }, |
| { "teng"_s, USCRIPT_TENGWAR }, |
| { "vaii"_s, USCRIPT_VAI }, |
| { "visp"_s, USCRIPT_VISIBLE_SPEECH }, |
| { "xsux"_s, USCRIPT_CUNEIFORM }, |
| { "jpan"_s, USCRIPT_KATAKANA_OR_HIRAGANA }, |
| { "kore"_s, USCRIPT_HANGUL }, |
| { "zxxx"_s, USCRIPT_UNWRITTEN_LANGUAGES }, |
| { "zzzz"_s, USCRIPT_UNKNOWN } |
| }; |
| |
| struct ScriptNameCodeMapHashTraits : public HashTraits<String> { |
| static const int minimumTableSize = WTF::HashTableCapacityForSize<WTF_ARRAY_LENGTH(scriptNameCodeList)>::value; |
| }; |
| |
| UScriptCode scriptNameToCode(const String& scriptName) |
| { |
| static const auto scriptNameCodeMap = makeNeverDestroyed([] { |
| HashMap<String, UScriptCode, ASCIICaseInsensitiveHash, ScriptNameCodeMapHashTraits> map; |
| for (auto& nameAndCode : scriptNameCodeList) |
| map.add(nameAndCode.name, nameAndCode.code); |
| return map; |
| }()); |
| |
| auto it = scriptNameCodeMap.get().find(scriptName); |
| if (it != scriptNameCodeMap.get().end()) |
| return it->value; |
| return USCRIPT_INVALID_CODE; |
| } |
| |
| struct LocaleScript { |
| ASCIILiteral locale; |
| UScriptCode script; |
| }; |
| |
| static const LocaleScript localeScriptList[] = { |
| { "aa"_s, USCRIPT_LATIN }, |
| { "ab"_s, USCRIPT_CYRILLIC }, |
| { "ady"_s, USCRIPT_CYRILLIC }, |
| { "af"_s, USCRIPT_LATIN }, |
| { "ak"_s, USCRIPT_LATIN }, |
| { "am"_s, USCRIPT_ETHIOPIC }, |
| { "ar"_s, USCRIPT_ARABIC }, |
| { "as"_s, USCRIPT_BENGALI }, |
| { "ast"_s, USCRIPT_LATIN }, |
| { "av"_s, USCRIPT_CYRILLIC }, |
| { "ay"_s, USCRIPT_LATIN }, |
| { "az"_s, USCRIPT_LATIN }, |
| { "ba"_s, USCRIPT_CYRILLIC }, |
| { "be"_s, USCRIPT_CYRILLIC }, |
| { "bg"_s, USCRIPT_CYRILLIC }, |
| { "bi"_s, USCRIPT_LATIN }, |
| { "bn"_s, USCRIPT_BENGALI }, |
| { "bo"_s, USCRIPT_TIBETAN }, |
| { "bs"_s, USCRIPT_LATIN }, |
| { "ca"_s, USCRIPT_LATIN }, |
| { "ce"_s, USCRIPT_CYRILLIC }, |
| { "ceb"_s, USCRIPT_LATIN }, |
| { "ch"_s, USCRIPT_LATIN }, |
| { "chk"_s, USCRIPT_LATIN }, |
| { "cs"_s, USCRIPT_LATIN }, |
| { "cy"_s, USCRIPT_LATIN }, |
| { "da"_s, USCRIPT_LATIN }, |
| { "de"_s, USCRIPT_LATIN }, |
| { "dv"_s, USCRIPT_THAANA }, |
| { "dz"_s, USCRIPT_TIBETAN }, |
| { "ee"_s, USCRIPT_LATIN }, |
| { "efi"_s, USCRIPT_LATIN }, |
| { "el"_s, USCRIPT_GREEK }, |
| { "en"_s, USCRIPT_LATIN }, |
| { "es"_s, USCRIPT_LATIN }, |
| { "et"_s, USCRIPT_LATIN }, |
| { "eu"_s, USCRIPT_LATIN }, |
| { "fa"_s, USCRIPT_ARABIC }, |
| { "fi"_s, USCRIPT_LATIN }, |
| { "fil"_s, USCRIPT_LATIN }, |
| { "fj"_s, USCRIPT_LATIN }, |
| { "fo"_s, USCRIPT_LATIN }, |
| { "fr"_s, USCRIPT_LATIN }, |
| { "fur"_s, USCRIPT_LATIN }, |
| { "fy"_s, USCRIPT_LATIN }, |
| { "ga"_s, USCRIPT_LATIN }, |
| { "gaa"_s, USCRIPT_LATIN }, |
| { "gd"_s, USCRIPT_LATIN }, |
| { "gil"_s, USCRIPT_LATIN }, |
| { "gl"_s, USCRIPT_LATIN }, |
| { "gn"_s, USCRIPT_LATIN }, |
| { "gsw"_s, USCRIPT_LATIN }, |
| { "gu"_s, USCRIPT_GUJARATI }, |
| { "ha"_s, USCRIPT_LATIN }, |
| { "haw"_s, USCRIPT_LATIN }, |
| { "he"_s, USCRIPT_HEBREW }, |
| { "hi"_s, USCRIPT_DEVANAGARI }, |
| { "hil"_s, USCRIPT_LATIN }, |
| { "ho"_s, USCRIPT_LATIN }, |
| { "hr"_s, USCRIPT_LATIN }, |
| { "ht"_s, USCRIPT_LATIN }, |
| { "hu"_s, USCRIPT_LATIN }, |
| { "hy"_s, USCRIPT_ARMENIAN }, |
| { "id"_s, USCRIPT_LATIN }, |
| { "ig"_s, USCRIPT_LATIN }, |
| { "ii"_s, USCRIPT_YI }, |
| { "ilo"_s, USCRIPT_LATIN }, |
| { "inh"_s, USCRIPT_CYRILLIC }, |
| { "is"_s, USCRIPT_LATIN }, |
| { "it"_s, USCRIPT_LATIN }, |
| { "iu"_s, USCRIPT_CANADIAN_ABORIGINAL }, |
| { "ja"_s, USCRIPT_KATAKANA_OR_HIRAGANA }, |
| { "jv"_s, USCRIPT_LATIN }, |
| { "ka"_s, USCRIPT_GEORGIAN }, |
| { "kaj"_s, USCRIPT_LATIN }, |
| { "kam"_s, USCRIPT_LATIN }, |
| { "kbd"_s, USCRIPT_CYRILLIC }, |
| { "kha"_s, USCRIPT_LATIN }, |
| { "kk"_s, USCRIPT_CYRILLIC }, |
| { "kl"_s, USCRIPT_LATIN }, |
| { "km"_s, USCRIPT_KHMER }, |
| { "kn"_s, USCRIPT_KANNADA }, |
| { "ko"_s, USCRIPT_HANGUL }, |
| { "kok"_s, USCRIPT_DEVANAGARI }, |
| { "kos"_s, USCRIPT_LATIN }, |
| { "kpe"_s, USCRIPT_LATIN }, |
| { "krc"_s, USCRIPT_CYRILLIC }, |
| { "ks"_s, USCRIPT_ARABIC }, |
| { "ku"_s, USCRIPT_ARABIC }, |
| { "kum"_s, USCRIPT_CYRILLIC }, |
| { "ky"_s, USCRIPT_CYRILLIC }, |
| { "la"_s, USCRIPT_LATIN }, |
| { "lah"_s, USCRIPT_ARABIC }, |
| { "lb"_s, USCRIPT_LATIN }, |
| { "lez"_s, USCRIPT_CYRILLIC }, |
| { "ln"_s, USCRIPT_LATIN }, |
| { "lo"_s, USCRIPT_LAO }, |
| { "lt"_s, USCRIPT_LATIN }, |
| { "lv"_s, USCRIPT_LATIN }, |
| { "mai"_s, USCRIPT_DEVANAGARI }, |
| { "mdf"_s, USCRIPT_CYRILLIC }, |
| { "mg"_s, USCRIPT_LATIN }, |
| { "mh"_s, USCRIPT_LATIN }, |
| { "mi"_s, USCRIPT_LATIN }, |
| { "mk"_s, USCRIPT_CYRILLIC }, |
| { "ml"_s, USCRIPT_MALAYALAM }, |
| { "mn"_s, USCRIPT_CYRILLIC }, |
| { "mr"_s, USCRIPT_DEVANAGARI }, |
| { "ms"_s, USCRIPT_LATIN }, |
| { "mt"_s, USCRIPT_LATIN }, |
| { "my"_s, USCRIPT_MYANMAR }, |
| { "myv"_s, USCRIPT_CYRILLIC }, |
| { "na"_s, USCRIPT_LATIN }, |
| { "nb"_s, USCRIPT_LATIN }, |
| { "ne"_s, USCRIPT_DEVANAGARI }, |
| { "niu"_s, USCRIPT_LATIN }, |
| { "nl"_s, USCRIPT_LATIN }, |
| { "nn"_s, USCRIPT_LATIN }, |
| { "nr"_s, USCRIPT_LATIN }, |
| { "nso"_s, USCRIPT_LATIN }, |
| { "ny"_s, USCRIPT_LATIN }, |
| { "oc"_s, USCRIPT_LATIN }, |
| { "om"_s, USCRIPT_LATIN }, |
| { "or"_s, USCRIPT_ORIYA }, |
| { "os"_s, USCRIPT_CYRILLIC }, |
| { "pa"_s, USCRIPT_GURMUKHI }, |
| { "pag"_s, USCRIPT_LATIN }, |
| { "pap"_s, USCRIPT_LATIN }, |
| { "pau"_s, USCRIPT_LATIN }, |
| { "pl"_s, USCRIPT_LATIN }, |
| { "pon"_s, USCRIPT_LATIN }, |
| { "ps"_s, USCRIPT_ARABIC }, |
| { "pt"_s, USCRIPT_LATIN }, |
| { "qu"_s, USCRIPT_LATIN }, |
| { "rm"_s, USCRIPT_LATIN }, |
| { "rn"_s, USCRIPT_LATIN }, |
| { "ro"_s, USCRIPT_LATIN }, |
| { "ru"_s, USCRIPT_CYRILLIC }, |
| { "rw"_s, USCRIPT_LATIN }, |
| { "sa"_s, USCRIPT_DEVANAGARI }, |
| { "sah"_s, USCRIPT_CYRILLIC }, |
| { "sat"_s, USCRIPT_LATIN }, |
| { "sd"_s, USCRIPT_ARABIC }, |
| { "se"_s, USCRIPT_LATIN }, |
| { "sg"_s, USCRIPT_LATIN }, |
| { "si"_s, USCRIPT_SINHALA }, |
| { "sid"_s, USCRIPT_LATIN }, |
| { "sk"_s, USCRIPT_LATIN }, |
| { "sl"_s, USCRIPT_LATIN }, |
| { "sm"_s, USCRIPT_LATIN }, |
| { "so"_s, USCRIPT_LATIN }, |
| { "sq"_s, USCRIPT_LATIN }, |
| { "sr"_s, USCRIPT_CYRILLIC }, |
| { "ss"_s, USCRIPT_LATIN }, |
| { "st"_s, USCRIPT_LATIN }, |
| { "su"_s, USCRIPT_LATIN }, |
| { "sv"_s, USCRIPT_LATIN }, |
| { "sw"_s, USCRIPT_LATIN }, |
| { "ta"_s, USCRIPT_TAMIL }, |
| { "te"_s, USCRIPT_TELUGU }, |
| { "tet"_s, USCRIPT_LATIN }, |
| { "tg"_s, USCRIPT_CYRILLIC }, |
| { "th"_s, USCRIPT_THAI }, |
| { "ti"_s, USCRIPT_ETHIOPIC }, |
| { "tig"_s, USCRIPT_ETHIOPIC }, |
| { "tk"_s, USCRIPT_LATIN }, |
| { "tkl"_s, USCRIPT_LATIN }, |
| { "tl"_s, USCRIPT_LATIN }, |
| { "tn"_s, USCRIPT_LATIN }, |
| { "to"_s, USCRIPT_LATIN }, |
| { "tpi"_s, USCRIPT_LATIN }, |
| { "tr"_s, USCRIPT_LATIN }, |
| { "trv"_s, USCRIPT_LATIN }, |
| { "ts"_s, USCRIPT_LATIN }, |
| { "tt"_s, USCRIPT_CYRILLIC }, |
| { "tvl"_s, USCRIPT_LATIN }, |
| { "tw"_s, USCRIPT_LATIN }, |
| { "ty"_s, USCRIPT_LATIN }, |
| { "tyv"_s, USCRIPT_CYRILLIC }, |
| { "udm"_s, USCRIPT_CYRILLIC }, |
| { "ug"_s, USCRIPT_ARABIC }, |
| { "uk"_s, USCRIPT_CYRILLIC }, |
| { "und"_s, USCRIPT_LATIN }, |
| { "ur"_s, USCRIPT_ARABIC }, |
| { "uz"_s, USCRIPT_CYRILLIC }, |
| { "ve"_s, USCRIPT_LATIN }, |
| { "vi"_s, USCRIPT_LATIN }, |
| { "wal"_s, USCRIPT_ETHIOPIC }, |
| { "war"_s, USCRIPT_LATIN }, |
| { "wo"_s, USCRIPT_LATIN }, |
| { "xh"_s, USCRIPT_LATIN }, |
| { "yap"_s, USCRIPT_LATIN }, |
| { "yo"_s, USCRIPT_LATIN }, |
| { "za"_s, USCRIPT_LATIN }, |
| { "zh"_s, USCRIPT_HAN }, |
| { "zh_hk"_s, USCRIPT_TRADITIONAL_HAN }, |
| { "zh_tw"_s, USCRIPT_TRADITIONAL_HAN }, |
| { "zu"_s, USCRIPT_LATIN } |
| }; |
| |
| struct LocaleScriptMapHashTraits : public HashTraits<String> { |
| static const int minimumTableSize = WTF::HashTableCapacityForSize<WTF_ARRAY_LENGTH(localeScriptList)>::value; |
| }; |
| |
| UScriptCode localeToScriptCodeForFontSelection(const String& locale) |
| { |
| static const auto localeScriptMap = makeNeverDestroyed([] { |
| HashMap<String, UScriptCode, ASCIICaseInsensitiveHash, LocaleScriptMapHashTraits> map; |
| for (auto& localeAndScript : localeScriptList) |
| map.add(localeAndScript.locale, localeAndScript.script); |
| return map; |
| }()); |
| |
| String canonicalLocale = locale; |
| canonicalLocale.replace('-', '_'); |
| while (!canonicalLocale.isEmpty()) { |
| auto it = localeScriptMap.get().find(canonicalLocale); |
| if (it != localeScriptMap.get().end()) |
| return it->value; |
| auto underscorePosition = canonicalLocale.reverseFind('_'); |
| if (underscorePosition == notFound) |
| break; |
| UScriptCode code = scriptNameToCode(canonicalLocale.substring(underscorePosition + 1)); |
| if (code != USCRIPT_INVALID_CODE && code != USCRIPT_UNKNOWN) |
| return code; |
| canonicalLocale = canonicalLocale.substring(0, underscorePosition); |
| } |
| return USCRIPT_COMMON; |
| } |
| |
| } // namespace WebCore |