| #!/usr/bin/env python |
| |
| # Copyright (C) 2018 Andy VanWagoner (andy@vanwagoner.family) |
| # |
| # Redistribution and use in source and binary forms, with or without |
| # modification, are permitted provided that the following conditions |
| # are met: |
| # |
| # 1. Redistributions of source code must retain the above copyright |
| # notice, this list of conditions and the following disclaimer. |
| # 2. Redistributions in binary form must reproduce the above copyright |
| # notice, this list of conditions and the following disclaimer in the |
| # documentation and/or other materials provided with the distribution. |
| # |
| # THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY |
| # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| # DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY |
| # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
| # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF |
| # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| # This tool processes the IANA file language-subtag-registry.txt to create |
| # information required to canonicalize language tags according to ECMA 402 and |
| # RFC 5646 Section 4.5. |
| # https://www.iana.org/assignments/language-subtag-registry |
| # https://tc39.github.io/ecma402/#sec-canonicalizelanguagetag |
| # https://tools.ietf.org/html/rfc5646#section-4.5 |
| |
| import sys |
| import optparse |
| import os |
| |
# Banner written at the very top of the generated header. It names this
# script (via __file__) so readers know the output must not be hand-edited.
header = "// DO NOT EDIT! - This file was generated by " + __file__ + "\n"


# Trailer written after the generated body: a single newline.
footer = "\n"
| |
| |
def openOrExit(path, mode):
    """Open *path* with *mode*, or report the failure and exit with status 1.

    On Python 3, text-mode files are opened with UTF-8 encoding. Binary modes
    (and every mode on Python 2) go through the plain built-in open(), because
    open() rejects an ``encoding`` argument in binary mode.

    Returns the open file object. If opening raises IOError, a one-line
    message is written to stderr and the process exits through sys.exit(1).
    """
    try:
        if sys.version_info.major >= 3 and "b" not in mode:
            return open(path, mode, encoding="UTF-8")
        return open(path, mode)
    except IOError as e:
        # sys.exit() rather than the site-provided exit(): the latter is meant
        # for the interactive shell and is absent when site is not loaded.
        sys.stderr.write("I/O error opening {0}, ({1}): {2}\n".format(path, e.errno, e.strerror))
        sys.exit(1)
| |
| |
class SubtagRegistry:
    """Collect canonicalization mappings from a language subtag registry.

    parse() reads the registry text record by record, process() files each
    record into one of the maps below, and dump() writes those maps out as
    C++ lookup functions.
    """

    def __init__(self):
        # File-Date of the parsed registry; stays None until a record has one.
        self.fileDate = None
        # language subtag -> Preferred-Value
        self.languageMap = {}
        # extlang subtag -> Prefix
        self.extlangMap = {}
        # region subtag -> Preferred-Value
        self.regionMap = {}
        # redundant tag, or "<prefix>-<variant subtag>" -> Preferred-Value
        self.redundantMap = {}
        # lower-cased grandfathered tag -> Preferred-Value (or the tag itself)
        self.grandfatheredMap = {}

    def parse(self, file):
        """Read registry lines from *file* and process every record.

        *file* is any iterable of text lines. Records are separated by lines
        consisting of "%%". Blank lines and lines whose first non-blank
        character is "#" are skipped. Within a record each "Name: body" line
        sets a field (a repeated field name keeps the last body); a folded
        continuation line is appended to the preceding field, separated by a
        single space. The final record is processed when the input ends.
        """
        record = {}
        key = None
        for rawLine in file:
            line = rawLine.strip()
            if not line or line.startswith("#"):
                continue

            if line == "%%":
                self.process(record)
                record = {}
                key = None
                continue

            # A folded field body starts with whitespace, so such a line
            # belongs to the previous field even when it contains a colon.
            # Colon-less lines are treated as continuations as well.
            folded = rawLine[:1] in (" ", "\t") or ":" not in line
            if key is not None and folded:
                # Description often continues on the next line
                record[key] += " " + line
            elif ":" in line:
                key, value = line.split(":", 1)
                key = key.strip()
                record[key] = value.strip()
            # Otherwise: a colon-less line before any field; nothing to attach it to.
        self.process(record)

    def process(self, record):
        """File one parsed *record* (a dict of field name -> body) into the maps.

        A "File-Date" field is remembered in self.fileDate. Records without a
        "Type" field are otherwise ignored. Language, region, redundant and
        variant records only contribute when they carry a Preferred-Value.
        """
        if "File-Date" in record:
            self.fileDate = record["File-Date"]

        recordType = record.get("Type")
        if recordType is None:
            return

        preferred = record.get("Preferred-Value")
        if recordType == "language" and preferred:
            self.languageMap[record["Subtag"]] = preferred
        elif recordType == "extlang":
            self.extlangMap[record["Subtag"]] = record["Prefix"]
        elif recordType == "region" and preferred:
            self.regionMap[record["Subtag"]] = preferred
        elif recordType == "redundant" and preferred:
            # Skip tags that are exactly "<extlang prefix>-<preferred>".
            # This only sees extlang records processed before this one.
            lang = self.extlangMap.get(preferred)
            if "{}-{}".format(lang, preferred) != record["Tag"]:
                self.redundantMap[record["Tag"]] = preferred
        elif recordType == "variant" and preferred:
            key = "{}-{}".format(record['Prefix'], record['Subtag'])
            # A bare "alalc97" replacement is expanded to the full tag.
            if preferred == "alalc97":
                preferred = "ja-Latn-alalc97"
            self.redundantMap[key] = preferred
        elif recordType == "grandfathered":
            # Grandfathered tags without a replacement map to themselves.
            key = record["Tag"].lower()
            value = record.get("Preferred-Value", key)
            self.grandfatheredMap[key] = value

    def dump(self, file):
        """Write the generated C++ for all five maps to *file*.

        The registry file date comment is emitted only when a File-Date was
        seen while parsing.
        """
        if self.fileDate:
            file.write("// language-subtag-registry file date: {}\n".format(self.fileDate))
        file.write("\n#pragma once\n")
        file.write("\n#if ENABLE(INTL)\n")
        file.write("\nnamespace JSC {\n")
        self.dumpLookup(file, "intlPreferredLanguageTag", self.languageMap)
        self.dumpLookup(file, "intlPreferredExtlangTag", self.extlangMap)
        self.dumpLookup(file, "intlPreferredRegionTag", self.regionMap)
        self.dumpLookup(file, "intlRedundantLanguageTag", self.redundantMap)
        self.dumpLookup(file, "intlGrandfatheredLanguageTag", self.grandfatheredMap)
        file.write("\n} // namespace JSC\n")
        file.write("\n#endif // ENABLE(INTL)\n")

    def dumpLookup(self, file, name, map):
        """Write one C++ function called *name* that looks a tag up in *map*."""
        file.write("\nstatic String {}(const String& tag)\n{{\n".format(name))
        file.write(" // {} possible replacements\n".format(len(map)))
        # We could pick the lookup implementation per map if desired
        # Anecdotal perf: if > switch > hash (slowest)
        # Code complexity: switch > if > hash (least complex)
        # Algo complexity: if = O(N) > switch > O(log N) > hash = O(1) (least complex)
        self.dumpIfLookup(file, name, map)
        file.write("}\n")

    def dumpHashLookup(self, file, name, map):
        """Write a function body that fills a static HashMap on first use."""
        file.write(" static NeverDestroyed<HashMap<String, String>> cache;\n")
        file.write(" HashMap<String, String>& map = cache.get();\n")
        file.write(" if (UNLIKELY(map.isEmpty())) {\n")
        # Sorted so the generated output does not depend on dict ordering.
        entries = sorted(" map.add(\"{}\"_s, \"{}\"_s);\n".format(k, v) for k, v in map.items())
        file.write("".join(entries))
        file.write(" }\n")
        file.write(" return map.get(tag);\n")

    def dumpIfLookup(self, file, name, map):
        """Write a function body that tests the tag against each key in turn."""
        # Sorted so the generated output does not depend on dict ordering.
        entries = sorted(" if (tag == \"{}\")\n return \"{}\"_s;".format(k, v) for k, v in map.items())
        file.write("\n".join(entries))
        file.write("\n return String();\n")

    def dumpSwitchLookup(self, file, name, map):
        """Write a function body that switches on the tag one character at a time."""
        # Build a character trie; a node's "value" entry holds the replacement
        # for the key that ends at that node. Single characters can never
        # collide with the five-character "value" marker.
        tree = {}
        for k, v in map.items():
            node = tree
            for char in k:
                node = node.setdefault(char, {})
            node["value"] = v
        self.dumpSwitchLookupTree(file, tree, 0)
        file.write("\n return String();\n")

    def dumpSwitchLookupTree(self, file, tree, level):
        """Write the nested switch for trie node *tree* at character index *level*."""
        indent = "".ljust((level + 1) * 4)
        if "value" in tree:
            file.write(indent + "if (tag.length() == {})\n".format(level))
            file.write(indent + " return \"{}\"_s;\n".format(tree["value"]))
        # sorted() works on Python 2 and 3 alike (dict views have no .sort()),
        # and filtering out the marker leaves the caller's trie unmodified.
        keys = sorted(k for k in tree if k != "value")
        if not keys:
            return
        file.write(indent + "switch (tag[{}]) {{\n".format(level))
        for key in keys:
            file.write(indent + "case {}:\n".format(ord(key)))
            self.dumpSwitchLookupTree(file, tree[key], level + 1)
            file.write(indent + " break;\n")
        file.write(indent + "default: break;\n")
        file.write(indent + "}\n")
| |
| |
if __name__ == "__main__":
    # Command line: <language-subtag-registry.txt> <IntlCanonicalizeLanguage.h>
    parser = optparse.OptionParser(usage="usage: %prog <language-subtag-registry.txt> <IntlCanonicalizeLanguage.h>")
    (_options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error("<language-subtag-registry.txt> <IntlCanonicalizeLanguage.h>")

    registryPath, intlCanonHPath = args

    # Parse the registry completely before touching the output path, so a
    # failure while reading does not leave a truncated header behind.
    registry = SubtagRegistry()
    with openOrExit(registryPath, "r") as registryFile:
        registry.parse(registryFile)

    # The with-blocks close (and flush) both files deterministically.
    with openOrExit(intlCanonHPath, "w") as intlCanonHFile:
        intlCanonHFile.write(header)
        registry.dump(intlCanonHFile)
        intlCanonHFile.write(footer)

    sys.exit(0)