Source/JavaScriptCore/Scripts/generateIntlCanonicalizeLanguage.py - WebKit - Git at Google

 #!/usr/bin/env python

 # Copyright (C) 2018 Andy VanWagoner (andy@vanwagoner.family)
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
 # are met:
 #
 # 1.  Redistributions of source code must retain the above copyright
 #     notice, this list of conditions and the following disclaimer.
 # 2.  Redistributions in binary form must reproduce the above copyright
 #     notice, this list of conditions and the following disclaimer in the
 #     documentation and/or other materials provided with the distribution.
 #
 # THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 # DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
 # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 # This tool processes the IANA file language-subtag-registry.txt to create
 # information required to canonicalize language tags according to ECMA 402 and
 # RFC 5646 Section 4.5.
 # https://www.iana.org/assignments/language-subtag-registry
 # https://tc39.github.io/ecma402/#sec-canonicalizelanguagetag
 # https://tools.ietf.org/html/rfc5646#section-4.5

 import sys
 import optparse
 import os

 # Banner written at the very start of the generated file; it names this
 # script (through __file__) as the generator.
 # NOTE(review): "DO NO EDIT!" looks like a typo for "DO NOT EDIT!". It is left
 # untouched here because the text is part of the emitted output.
 header = """// DO NO EDIT! - This file was generated by """ + __file__ + """
 """


 # Trailer written after everything else in the generated file.
 footer = """
 """


 def openOrExit(path, mode):
     """Open *path* with *mode* and return the file object, exiting on failure.

     On Python 3 the file is opened with UTF-8 encoding; on older versions the
     built-in open() is called without an encoding argument.

     If the open raises IOError, a diagnostic naming the path, errno and error
     text is written to stderr and the process exits with status 1.
     """
     try:
         if sys.version_info.major >= 3:
             return open(path, mode, encoding="UTF-8")
         return open(path, mode)
     except IOError as e:
         # Diagnostics go to stderr so they never mix with regular output.
         sys.stderr.write("I/O error opening {0}, ({1}): {2}\n".format(path, e.errno, e.strerror))
         # sys.exit() instead of the bare exit(): the latter is injected by the
         # site module for interactive use and is not guaranteed to exist.
         sys.exit(1)


 class SubtagRegistry:
     """Replacement tables extracted from the IANA language subtag registry.

     parse() reads the registry text record by record, process() files each
     record into one of the maps, and dump() writes the maps out as C++ lookup
     functions.
     """

     def __init__(self):
         # Every map goes from a tag or subtag found in the registry to the
         # string the generated lookup function returns for it.
         self.languageMap = {}
         self.extlangMap = {}  # extlang subtag -> its Prefix
         self.regionMap = {}
         self.redundantMap = {}
         self.grandfatheredMap = {}
         # Filled in by process() when a File-Date field is seen. Initialised
         # here so dump() also works for input that carries no such field.
         self.fileDate = None

     def parse(self, file):
         """Read registry text from *file* (an iterable of lines) and process it.

         Records are separated by "%%" lines. Blank lines and lines starting
         with "#" are skipped. A line containing ":" is a "Key: value" field;
         any other line is appended to the previous field of the same record.
         """
         record = {}
         lastKey = None
         for line in file:
             line = line.strip()
             if not line or line.startswith("#"):
                 continue

             if line == "%%":
                 self.process(record)
                 record = {}
                 # A continuation must never attach to a field of the
                 # previous record.
                 lastKey = None
                 continue

             if ":" in line:
                 key, value = line.split(":", 1)
                 lastKey = key.strip()
                 record[lastKey] = value.strip()
             elif lastKey is not None:
                 # Description often continues on the next line: append the
                 # folded text instead of replacing what was already read.
                 record[lastKey] += " " + line
             # Otherwise there is no field to continue; ignore the line.
         self.process(record)

     def process(self, record):
         """File one parsed *record* (dict of field name -> value) into the maps.

         A "File-Date" field is remembered in self.fileDate. Records without a
         "Type" field are otherwise ignored. language, region, redundant and
         variant records are only used when they carry a "Preferred-Value".
         """
         if "File-Date" in record:
             self.fileDate = record["File-Date"]

         recordType = record.get("Type")
         if recordType is None:
             return

         preferred = record.get("Preferred-Value")
         if recordType == "language" and preferred:
             self.languageMap[record["Subtag"]] = preferred
         elif recordType == "extlang":
             self.extlangMap[record["Subtag"]] = record["Prefix"]
         elif recordType == "region" and preferred:
             self.regionMap[record["Subtag"]] = preferred
         elif recordType == "redundant" and preferred:
             # Skip tags that are just "<extlang prefix>-<extlang>"; only
             # the remaining ones are recorded as redundant tags.
             lang = self.extlangMap.get(preferred)
             if "{}-{}".format(lang, preferred) != record["Tag"]:
                 self.redundantMap[record["Tag"]] = preferred
         elif recordType == "variant" and preferred:
             key = "{}-{}".format(record['Prefix'], record['Subtag'])
             if preferred == "alalc97":
                 preferred = "ja-Latn-alalc97"
             self.redundantMap[key] = preferred
         elif recordType == "grandfathered":
             # Grandfathered tags are stored lower-cased and map to
             # themselves when the record has no Preferred-Value.
             key = record["Tag"].lower()
             value = record.get("Preferred-Value", key)
             self.grandfatheredMap[key] = value

     def dump(self, file):
         """Write the generated C++ to *file*.

         Emits an optional file-date comment, "#pragma once", the
         ENABLE(INTL) guard and one lookup function per map inside
         namespace JSC.
         """
         if self.fileDate:
             file.write("// language-subtag-registry file date: {}\n".format(self.fileDate))
         file.write("\n#pragma once\n")
         file.write("\n#if ENABLE(INTL)\n")
         file.write("\nnamespace JSC {\n")
         self.dumpLookup(file, "intlPreferredLanguageTag", self.languageMap)
         self.dumpLookup(file, "intlPreferredExtlangTag", self.extlangMap)
         self.dumpLookup(file, "intlPreferredRegionTag", self.regionMap)
         self.dumpLookup(file, "intlRedundantLanguageTag", self.redundantMap)
         self.dumpLookup(file, "intlGrandfatheredLanguageTag", self.grandfatheredMap)
         file.write("\n} // namespace JSC\n")
         file.write("\n#endif // ENABLE(INTL)\n")

     def dumpLookup(self, file, name, map):
         """Write a C++ function *name* that looks a tag up in *map*."""
         file.write("\nstatic String {}(const String& tag)\n{{\n".format(name))
         file.write("    // {} possible replacements\n".format(len(map)))
         # We could pick the lookup implementation per map if desired.
         # Anecdotal perf: if > switch > hash (slowest)
         # Code complexity: switch > if > hash (least complex)
         # Work per lookup: the if chain compares against every entry, the
         # switch tree inspects one character per level, the hash map does a
         # single table lookup.
         self.dumpIfLookup(file, name, map)
         file.write("}\n")

     def dumpHashLookup(self, file, name, map):
         """Write a function body that fills a static HashMap on first use."""
         file.write("    static NeverDestroyed<HashMap<String, String>> cache;\n")
         file.write("    HashMap<String, String>& map = cache.get();\n")
         file.write("    if (UNLIKELY(map.isEmpty())) {\n")
         entries = ["        map.add(\"{}\"_s, \"{}\"_s);\n".format(k, v) for k, v in map.items()]
         entries.sort()
         file.write("".join(entries))
         file.write("    }\n")
         file.write("    return map.get(tag);\n")

     def dumpIfLookup(self, file, name, map):
         """Write a function body made of one `if (tag == ...)` per entry."""
         entries = ["    if (tag == \"{}\")\n        return \"{}\"_s;".format(k, v) for k, v in map.items()]
         # Sorted so the generated file does not depend on dict ordering.
         entries.sort()
         file.write("\n".join(entries))
         file.write("\n    return String();\n")

     def dumpSwitchLookup(self, file, name, map):
         """Write a function body that switches on one tag character per level."""
         # Build a character trie. Child nodes are keyed by single characters,
         # so the multi-character key "value" can safely mark a complete tag.
         tree = {}
         for k, v in map.items():
             node = tree
             for char in k:
                 if not (char in node):
                     node[char] = {}
                 node = node[char]
             node["value"] = v
         self.dumpSwitchLookupTree(file, tree, 0)
         file.write("\n    return String();\n")

     def dumpSwitchLookupTree(self, file, tree, level):
         """Write the switch statement for trie node *tree* at depth *level*."""
         indent = "".ljust((level + 1) * 4)
         if "value" in tree:
             # A tag ends at this node: it matches when nothing follows.
             file.write(indent + "if (tag.length() == {})\n".format(level))
             file.write(indent + "    return \"{}\"_s;\n".format(tree["value"]))
         # sorted() works on both Python 2 lists and Python 3 dict views,
         # unlike calling .sort() on the result of dict.keys().
         keys = sorted(k for k in tree if k != "value")
         if not keys:
             return
         file.write(indent + "switch (tag[{}]) {{\n".format(level))
         for key in keys:
             file.write(indent + "case {}:\n".format(ord(key)))
             self.dumpSwitchLookupTree(file, tree[key], level + 1)
             file.write(indent + "    break;\n")
         file.write(indent + "default: break;\n")
         file.write(indent + "}\n")


 if __name__ == "__main__":
     # Command-line entry point: read the registry file named by the first
     # argument and write the generated C++ header to the second.
     parser = optparse.OptionParser(usage="usage: %prog <language-subtag-registry.txt> <IntlCanonicalizeLanguage.h>")
     (_options, args) = parser.parse_args()

     if len(args) != 2:
         parser.error("<language-subtag-registry.txt> <IntlCanonicalizeLanguage.h>")

     registryPath, intlCanonHPath = args

     # Parse first, so a failure while reading the registry does not leave a
     # truncated output file behind. The with-blocks close both files.
     registry = SubtagRegistry()
     with openOrExit(registryPath, "r") as registryFile:
         registry.parse(registryFile)

     with openOrExit(intlCanonHPath, "w") as intlCanonHFile:
         intlCanonHFile.write(header)
         registry.dump(intlCanonHFile)
         intlCanonHFile.write(footer)

     # sys.exit() rather than the site-provided exit() helper.
     sys.exit(0)
	#!/usr/bin/env python

	# Copyright (C) 2018 Andy VanWagoner (andy@vanwagoner.family)
	#
	# Redistribution and use in source and binary forms, with or without
	# modification, are permitted provided that the following conditions
	# are met:
	#
	# 1. Redistributions of source code must retain the above copyright
	# notice, this list of conditions and the following disclaimer.
	# 2. Redistributions in binary form must reproduce the above copyright
	# notice, this list of conditions and the following disclaimer in the
	# documentation and/or other materials provided with the distribution.
	#
	# THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
	# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	# DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
	# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
	# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
	# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
	# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

	# This tool processes the IANA file language-subtag-registry.txt to create
	# information required to canonicalize language tags according to ECMA 402 and
	# RFC 5646 Section 4.5.
	# https://www.iana.org/assignments/language-subtag-registry
	# https://tc39.github.io/ecma402/#sec-canonicalizelanguagetag
	# https://tools.ietf.org/html/rfc5646#section-4.5

	import sys
	import optparse
	import os

	# Banner written at the very start of the generated file; it names this
	# script (through __file__) as the generator.
	# NOTE(review): "DO NO EDIT!" looks like a typo for "DO NOT EDIT!". It is left
	# untouched here because the text is part of the emitted output.
	header = """// DO NO EDIT! - This file was generated by """ + __file__ + """
	"""


	# Trailer written after everything else in the generated file.
	footer = """
	"""


	def openOrExit(path, mode):
	    """Open *path* with *mode* and return the file object, exiting on failure.

	    On Python 3 the file is opened with UTF-8 encoding; on older versions the
	    built-in open() is called without an encoding argument.

	    If the open raises IOError, a diagnostic naming the path, errno and error
	    text is written to stderr and the process exits with status 1.
	    """
	    try:
	        if sys.version_info.major >= 3:
	            return open(path, mode, encoding="UTF-8")
	        return open(path, mode)
	    except IOError as e:
	        # Diagnostics go to stderr so they never mix with regular output.
	        sys.stderr.write("I/O error opening {0}, ({1}): {2}\n".format(path, e.errno, e.strerror))
	        # sys.exit() instead of the bare exit(): the latter is injected by the
	        # site module for interactive use and is not guaranteed to exist.
	        sys.exit(1)


	class SubtagRegistry:
	    """Replacement tables extracted from the IANA language subtag registry.

	    parse() reads the registry text record by record, process() files each
	    record into one of the maps, and dump() writes the maps out as C++ lookup
	    functions.
	    """

	    def __init__(self):
	        # Every map goes from a tag or subtag found in the registry to the
	        # string the generated lookup function returns for it.
	        self.languageMap = {}
	        self.extlangMap = {}  # extlang subtag -> its Prefix
	        self.regionMap = {}
	        self.redundantMap = {}
	        self.grandfatheredMap = {}
	        # Filled in by process() when a File-Date field is seen. Initialised
	        # here so dump() also works for input that carries no such field.
	        self.fileDate = None

	    def parse(self, file):
	        """Read registry text from *file* (an iterable of lines) and process it.

	        Records are separated by "%%" lines. Blank lines and lines starting
	        with "#" are skipped. A line containing ":" is a "Key: value" field;
	        any other line is appended to the previous field of the same record.
	        """
	        record = {}
	        lastKey = None
	        for line in file:
	            line = line.strip()
	            if not line or line.startswith("#"):
	                continue

	            if line == "%%":
	                self.process(record)
	                record = {}
	                # A continuation must never attach to a field of the
	                # previous record.
	                lastKey = None
	                continue

	            if ":" in line:
	                key, value = line.split(":", 1)
	                lastKey = key.strip()
	                record[lastKey] = value.strip()
	            elif lastKey is not None:
	                # Description often continues on the next line: append the
	                # folded text instead of replacing what was already read.
	                record[lastKey] += " " + line
	            # Otherwise there is no field to continue; ignore the line.
	        self.process(record)

	    def process(self, record):
	        """File one parsed *record* (dict of field name -> value) into the maps.

	        A "File-Date" field is remembered in self.fileDate. Records without a
	        "Type" field are otherwise ignored. language, region, redundant and
	        variant records are only used when they carry a "Preferred-Value".
	        """
	        if "File-Date" in record:
	            self.fileDate = record["File-Date"]

	        recordType = record.get("Type")
	        if recordType is None:
	            return

	        preferred = record.get("Preferred-Value")
	        if recordType == "language" and preferred:
	            self.languageMap[record["Subtag"]] = preferred
	        elif recordType == "extlang":
	            self.extlangMap[record["Subtag"]] = record["Prefix"]
	        elif recordType == "region" and preferred:
	            self.regionMap[record["Subtag"]] = preferred
	        elif recordType == "redundant" and preferred:
	            # Skip tags that are just "<extlang prefix>-<extlang>"; only
	            # the remaining ones are recorded as redundant tags.
	            lang = self.extlangMap.get(preferred)
	            if "{}-{}".format(lang, preferred) != record["Tag"]:
	                self.redundantMap[record["Tag"]] = preferred
	        elif recordType == "variant" and preferred:
	            key = "{}-{}".format(record['Prefix'], record['Subtag'])
	            if preferred == "alalc97":
	                preferred = "ja-Latn-alalc97"
	            self.redundantMap[key] = preferred
	        elif recordType == "grandfathered":
	            # Grandfathered tags are stored lower-cased and map to
	            # themselves when the record has no Preferred-Value.
	            key = record["Tag"].lower()
	            value = record.get("Preferred-Value", key)
	            self.grandfatheredMap[key] = value

	    def dump(self, file):
	        """Write the generated C++ to *file*.

	        Emits an optional file-date comment, "#pragma once", the
	        ENABLE(INTL) guard and one lookup function per map inside
	        namespace JSC.
	        """
	        if self.fileDate:
	            file.write("// language-subtag-registry file date: {}\n".format(self.fileDate))
	        file.write("\n#pragma once\n")
	        file.write("\n#if ENABLE(INTL)\n")
	        file.write("\nnamespace JSC {\n")
	        self.dumpLookup(file, "intlPreferredLanguageTag", self.languageMap)
	        self.dumpLookup(file, "intlPreferredExtlangTag", self.extlangMap)
	        self.dumpLookup(file, "intlPreferredRegionTag", self.regionMap)
	        self.dumpLookup(file, "intlRedundantLanguageTag", self.redundantMap)
	        self.dumpLookup(file, "intlGrandfatheredLanguageTag", self.grandfatheredMap)
	        file.write("\n} // namespace JSC\n")
	        file.write("\n#endif // ENABLE(INTL)\n")

	    def dumpLookup(self, file, name, map):
	        """Write a C++ function *name* that looks a tag up in *map*."""
	        file.write("\nstatic String {}(const String& tag)\n{{\n".format(name))
	        file.write(" // {} possible replacements\n".format(len(map)))
	        # We could pick the lookup implementation per map if desired.
	        # Anecdotal perf: if > switch > hash (slowest)
	        # Code complexity: switch > if > hash (least complex)
	        # Work per lookup: the if chain compares against every entry, the
	        # switch tree inspects one character per level, the hash map does a
	        # single table lookup.
	        self.dumpIfLookup(file, name, map)
	        file.write("}\n")

	    def dumpHashLookup(self, file, name, map):
	        """Write a function body that fills a static HashMap on first use."""
	        file.write(" static NeverDestroyed<HashMap<String, String>> cache;\n")
	        file.write(" HashMap<String, String>& map = cache.get();\n")
	        file.write(" if (UNLIKELY(map.isEmpty())) {\n")
	        entries = [" map.add(\"{}\"_s, \"{}\"_s);\n".format(k, v) for k, v in map.items()]
	        entries.sort()
	        file.write("".join(entries))
	        file.write(" }\n")
	        file.write(" return map.get(tag);\n")

	    def dumpIfLookup(self, file, name, map):
	        """Write a function body made of one `if (tag == ...)` per entry."""
	        entries = [" if (tag == \"{}\")\n return \"{}\"_s;".format(k, v) for k, v in map.items()]
	        # Sorted so the generated file does not depend on dict ordering.
	        entries.sort()
	        file.write("\n".join(entries))
	        file.write("\n return String();\n")

	    def dumpSwitchLookup(self, file, name, map):
	        """Write a function body that switches on one tag character per level."""
	        # Build a character trie. Child nodes are keyed by single characters,
	        # so the multi-character key "value" can safely mark a complete tag.
	        tree = {}
	        for k, v in map.items():
	            node = tree
	            for char in k:
	                if not (char in node):
	                    node[char] = {}
	                node = node[char]
	            node["value"] = v
	        self.dumpSwitchLookupTree(file, tree, 0)
	        file.write("\n return String();\n")

	    def dumpSwitchLookupTree(self, file, tree, level):
	        """Write the switch statement for trie node *tree* at depth *level*."""
	        indent = "".ljust((level + 1) * 4)
	        if "value" in tree:
	            # A tag ends at this node: it matches when nothing follows.
	            file.write(indent + "if (tag.length() == {})\n".format(level))
	            file.write(indent + " return \"{}\"_s;\n".format(tree["value"]))
	        # sorted() works on both Python 2 lists and Python 3 dict views,
	        # unlike calling .sort() on the result of dict.keys().
	        keys = sorted(k for k in tree if k != "value")
	        if not keys:
	            return
	        file.write(indent + "switch (tag[{}]) {{\n".format(level))
	        for key in keys:
	            file.write(indent + "case {}:\n".format(ord(key)))
	            self.dumpSwitchLookupTree(file, tree[key], level + 1)
	            file.write(indent + " break;\n")
	        file.write(indent + "default: break;\n")
	        file.write(indent + "}\n")


	if __name__ == "__main__":
	    # Command-line entry point: read the registry file named by the first
	    # argument and write the generated C++ header to the second.
	    parser = optparse.OptionParser(usage="usage: %prog <language-subtag-registry.txt> <IntlCanonicalizeLanguage.h>")
	    (_options, args) = parser.parse_args()

	    if len(args) != 2:
	        parser.error("<language-subtag-registry.txt> <IntlCanonicalizeLanguage.h>")

	    registryPath, intlCanonHPath = args

	    # Parse first, so a failure while reading the registry does not leave a
	    # truncated output file behind. The with-blocks close both files.
	    registry = SubtagRegistry()
	    with openOrExit(registryPath, "r") as registryFile:
	        registry.parse(registryFile)

	    with openOrExit(intlCanonHPath, "w") as intlCanonHFile:
	        intlCanonHFile.write(header)
	        registry.dump(intlCanonHFile)
	        intlCanonHFile.write(footer)

	    # sys.exit() rather than the site-provided exit() helper.
	    sys.exit(0)