blob: 71df184bda455fc82396cf0fbea391b362464079 [file] [log] [blame]
#!/usr/bin/env python
# Copyright (C) 2018 Andy VanWagoner (andy@vanwagoner.family)
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# This tool processes the IANA file language-subtag-registry.txt to create
# information required to canonicalize language tags according to ECMA 402 and
# RFC 5646 Section 4.5.
# https://www.iana.org/assignments/language-subtag-registry
# https://tc39.github.io/ecma402/#sec-canonicalizelanguagetag
# https://tools.ietf.org/html/rfc5646#section-4.5
import sys
import optparse
import os
# Banner written as the first line of the generated header; it names this
# script as the generator so readers know not to edit the output by hand.
header = """// DO NOT EDIT! - This file was generated by """ + __file__ + """
"""
# Trailer written after the generated code (a single newline).
footer = """
"""
def openOrExit(path, mode):
    """Open `path` with `mode`, terminating the script if that fails.

    On Python 3 the file is opened as UTF-8 text; on Python 2 the plain
    built-in open() is used.

    Returns the open file object. On an I/O error a diagnostic is written to
    stderr and the process exits with status 1 (SystemExit is raised).
    """
    try:
        if sys.version_info.major >= 3:
            return open(path, mode, encoding="UTF-8")
        return open(path, mode)
    except IOError as e:
        # Diagnostics belong on stderr so they never mix with regular output.
        sys.stderr.write("I/O error opening {0}, ({1}): {2}\n".format(path, e.errno, e.strerror))
        # sys.exit rather than the bare exit() helper: the latter is injected
        # by the `site` module and is not guaranteed to exist.
        sys.exit(1)
class SubtagRegistry:
    """Replacement tables extracted from the IANA language subtag registry.

    parse() reads the registry text and fills the maps below; dump() writes
    them out as C++ lookup functions.
    """

    def __init__(self):
        # Value of the registry's File-Date field; stays None when the input
        # has no such field, so dump() can safely test it.
        self.fileDate = None
        # language subtag -> Preferred-Value
        self.languageMap = {}
        # extlang subtag -> Prefix
        self.extlangMap = {}
        # region subtag -> Preferred-Value
        self.regionMap = {}
        # redundant tag (or "<prefix>-<variant>") -> Preferred-Value
        self.redundantMap = {}
        # lowercased grandfathered tag -> Preferred-Value (or the tag itself)
        self.grandfatheredMap = {}

    def parse(self, file):
        """Read registry records from the iterable of lines `file`.

        Records are separated by "%%" lines. Each "Name: value" line sets a
        field. A line that starts with whitespace (or has no ":" at all) is
        treated as a continuation of the previous field and appended to it,
        separated by a single space. Blank lines and lines starting with "#"
        are skipped. Every completed record is handed to process().
        """
        record = {}
        key = None  # name of the most recently seen field in this record
        for rawLine in file:
            line = rawLine.strip()
            if not line or line.startswith("#"):
                continue
            if line == "%%":
                self.process(record)
                record = {}
                key = None
                continue
            # Folded (continuation) lines begin with whitespace; they may
            # legitimately contain ":" and must not be mistaken for fields.
            isContinuation = rawLine[0] in " \t" or ":" not in line
            if isContinuation and key is not None:
                record[key] += " " + line
            elif ":" in line:
                name, value = line.split(":", 1)
                key = name.strip()
                record[key] = value.strip()
            # A colon-less line before any field has nothing to attach to
            # and is ignored.
        self.process(record)

    def process(self, record):
        """Fold one parsed record (a dict of field name -> value) into the maps.

        Records without a "Type" field only contribute their "File-Date", if
        any. Records of other types are stored according to their type; most
        types are only stored when they carry a "Preferred-Value".
        """
        if "File-Date" in record:
            self.fileDate = record["File-Date"]
        if "Type" not in record:
            return
        recordType = record["Type"]
        preferred = record.get("Preferred-Value")
        if recordType == "language" and preferred:
            self.languageMap[record["Subtag"]] = preferred
        elif recordType == "extlang":
            self.extlangMap[record["Subtag"]] = record["Prefix"]
        elif recordType == "region" and preferred:
            self.regionMap[record["Subtag"]] = preferred
        elif recordType == "redundant" and preferred:
            # Skip tags of the form "<prefix>-<extlang>" whose preferred value
            # is that extlang: the extlang table already covers them.
            lang = self.extlangMap.get(preferred)
            if "{}-{}".format(lang, preferred) != record["Tag"]:
                self.redundantMap[record["Tag"]] = preferred
        elif recordType == "variant" and preferred:
            key = "{}-{}".format(record['Prefix'], record['Subtag'])
            # Special case: a preferred variant of "alalc97" is stored as the
            # full tag "ja-Latn-alalc97".
            if preferred == "alalc97":
                preferred = "ja-Latn-alalc97"
            self.redundantMap[key] = preferred
        elif recordType == "grandfathered":
            key = record["Tag"].lower()
            # Grandfathered tags without a replacement map to themselves.
            value = record.get("Preferred-Value", key)
            self.grandfatheredMap[key] = value

    def dump(self, file):
        """Write all lookup functions, wrapped in the JSC namespace, to `file`."""
        if self.fileDate:
            file.write("// language-subtag-registry file date: {}\n".format(self.fileDate))
        file.write("\n#pragma once\n")
        file.write("\n#if ENABLE(INTL)\n")
        file.write("\nnamespace JSC {\n")
        self.dumpLookup(file, "intlPreferredLanguageTag", self.languageMap)
        self.dumpLookup(file, "intlPreferredExtlangTag", self.extlangMap)
        self.dumpLookup(file, "intlPreferredRegionTag", self.regionMap)
        self.dumpLookup(file, "intlRedundantLanguageTag", self.redundantMap)
        self.dumpLookup(file, "intlGrandfatheredLanguageTag", self.grandfatheredMap)
        file.write("\n} // namespace JSC\n")
        file.write("\n#endif // ENABLE(INTL)\n")

    def dumpLookup(self, file, name, map):
        """Write one C++ function `name` that looks a tag up in `map`."""
        file.write("\nstatic String {}(const String& tag)\n{{\n".format(name))
        file.write(" // {} possible replacements\n".format(len(map)))
        # We could pick the lookup implementation per map if desired
        # Anecdotal perf: if > switch > hash (slowest)
        # Code complexity: switch > if > hash (least complex)
        # Algo complexity: if = O(N) > switch = O(log N) > hash = O(1) (least complex)
        self.dumpIfLookup(file, name, map)
        file.write("}\n")

    def dumpHashLookup(self, file, name, map):
        """Write a function body that looks the tag up in a lazily filled HashMap.

        `name` is accepted for signature parity with the other dump*Lookup
        methods and is not used.
        """
        file.write(" static NeverDestroyed<HashMap<String, String>> cache;\n")
        file.write(" HashMap<String, String>& map = cache.get();\n")
        file.write(" if (UNLIKELY(map.isEmpty())) {\n")
        # Sorted so the generated file is stable across runs.
        entries = sorted(" map.add(\"{}\"_s, \"{}\"_s);\n".format(k, v) for k, v in map.items())
        file.write("".join(entries))
        file.write(" }\n")
        file.write(" return map.get(tag);\n")

    def dumpIfLookup(self, file, name, map):
        """Write a function body made of one `if (tag == ...)` test per entry.

        `name` is accepted for signature parity with the other dump*Lookup
        methods and is not used.
        """
        # Sorted so the generated file is stable across runs.
        entries = sorted(" if (tag == \"{}\")\n return \"{}\"_s;".format(k, v) for k, v in map.items())
        file.write("\n".join(entries))
        file.write("\n return String();\n")

    def dumpSwitchLookup(self, file, name, map):
        """Write a function body made of nested per-character switch statements.

        `name` is accepted for signature parity with the other dump*Lookup
        methods and is not used.
        """
        # Build a character trie; the multi-character key "value" marks the
        # end of a tag and cannot collide with the single-character edges.
        tree = {}
        for k, v in map.items():
            node = tree
            for char in k:
                if char not in node:
                    node[char] = {}
                node = node[char]
            node["value"] = v
        self.dumpSwitchLookupTree(file, tree, 0)
        file.write("\n return String();\n")

    def dumpSwitchLookupTree(self, file, tree, level):
        """Recursively write the switch for trie node `tree` at depth `level`.

        `level` is both the index of the character being switched on and the
        tag length at which this node's stored value (if any) is returned.
        """
        indent = "".ljust((level + 1) * 4)
        if "value" in tree:
            file.write(indent + "if (tag.length() == {})\n".format(level))
            file.write(indent + " return \"{}\"_s;\n".format(tree["value"]))
            del tree["value"]
        # sorted() works on Python 3 dict views as well as Python 2 lists.
        keys = sorted(tree)
        if not keys:
            return
        file.write(indent + "switch (tag[{}]) {{\n".format(level))
        for key in keys:
            file.write(indent + "case {}:\n".format(ord(key)))
            self.dumpSwitchLookupTree(file, tree[key], level + 1)
            file.write(indent + " break;\n")
        file.write(indent + "default: break;\n")
        file.write(indent + "}\n")
if __name__ == "__main__":
    # Usage: <script> <language-subtag-registry.txt> <IntlCanonicalizeLanguage.h>
    parser = optparse.OptionParser(usage="usage: %prog <language-subtag-registry.txt> <IntlCanonicalizeLanguage.h>")
    (options, args) = parser.parse_args()
    if len(args) != 2:
        parser.error("<language-subtag-registry.txt> <IntlCanonicalizeLanguage.h>")
    registryPath = args[0]
    intlCanonHPath = args[1]
    registry = SubtagRegistry()
    # Parse the registry completely before touching the output path, so a
    # failure while reading does not leave a truncated header behind.
    with openOrExit(registryPath, "r") as registryFile:
        registry.parse(registryFile)
    # The with-blocks guarantee both files are flushed and closed.
    with openOrExit(intlCanonHPath, "w") as intlCanonHFile:
        intlCanonHFile.write(header)
        registry.dump(intlCanonHFile)
        intlCanonHFile.write(footer)
    # sys.exit rather than the bare exit() helper, which is only injected by
    # the `site` module.
    sys.exit(0)