Source/JavaScriptCore/yarr/generateYarrCanonicalizeUnicode - WebKit - Git at Google

 #!/usr/bin/env python3

 # Copyright (C) 2016-2019 Apple Inc. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
 # are met:
 #
 # 1.  Redistributions of source code must retain the above copyright
 #     notice, this list of conditions and the following disclaimer.
 # 2.  Redistributions in binary form must reproduce the above copyright
 #     notice, this list of conditions and the following disclaimer in the
 #     documentation and/or other materials provided with the distribution.
 #
 # THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 # DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
 # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 # This tool processes the Unicode Character Database file CaseFolding.txt to create
 # canonicalization table as described in ECMAScript 6 standard in section
 # "21.2.2.8.2 Runtime Semantics: Canonicalize()", step 2.

 import optparse
 import os
 import re
 import sys

 # Boilerplate written verbatim at the top of the generated C++ file: the
 # license block, the "generated file" banner, the includes, and the opening
 # of the JSC::Yarr namespaces.
 header = """/*
 * Copyright (C) 2016 Apple Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1.  Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 * 2.  Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

 // DO NOT EDIT! - This file was generated by generateYarrCanonicalizeUnicode

 #include "config.h"
 #include "YarrCanonicalize.h"

 namespace JSC { namespace Yarr {

 """

 # Written after the tables; closes the namespaces opened by ``header``.
 footer = """} } // JSC::Yarr
 """

 # Highest code point handled; the generated tables cover 0..MaxUnicode inclusive.
 MaxUnicode = 0x10ffff
 # Matches a CaseFolding.txt entry whose status field is C or S (common or
 # simple folding), capturing the source code point and its single-code-point
 # mapping as hexadecimal strings. Entries with any other status do not match.
 commonAndSimpleLinesRE = re.compile(r"(?P<code>[0-9A-F]+)\s*;\s*[CS]\s*;\s*(?P<mapping>[0-9A-F]+)", re.IGNORECASE)

 def openOrExit(path, mode):
     """Open ``path`` as UTF-8 text in ``mode``, exiting the process on I/O failure.

     When the mode creates the file ("w", "a" or "x"), missing parent
     directories are created first. Nothing is created when opening for read.

     Args:
         path: File system path of the file to open.
         mode: Mode string handed to ``open`` (for example ``"r"`` or ``"w"``).

     Returns:
         The opened file object.

     Raises:
         SystemExit: with status 1, after printing a diagnostic, when the
             directory cannot be created or the file cannot be opened.
     """
     try:
         dirname = os.path.dirname(path)
         # A bare file name has an empty dirname and os.makedirs("") raises,
         # so only create directories that are actually named in the path.
         createsFile = any(flag in mode for flag in "wax")
         if createsFile and dirname and not os.path.isdir(dirname):
             os.makedirs(dirname)
         if sys.version_info.major >= 3:
             return open(path, mode, encoding="UTF-8")
         return open(path, mode)
     except (IOError, OSError) as e:
         print("I/O error opening {0}, ({1}): {2}".format(path, e.errno, e.strerror))
         # sys.exit is always available; the bare exit() builtin is only
         # installed by the site module.
         sys.exit(1)

 class Canonicalize:
     """Groups code points by their case folding and emits C++ lookup tables.

     Typical use is ``readCaseFolding`` on an open CaseFolding.txt followed by
     ``createTables`` on the output file.
     """

     def __init__(self):
         # Maps a folded code point to the list of code points that fold to it.
         self.canonicalGroups = {}

     def addMapping(self, code, mapping):
         """Record that code point ``code`` folds to code point ``mapping``."""
         self.canonicalGroups.setdefault(mapping, []).append(code)

     def readCaseFolding(self, file):
         """Populate the groups from an iterable of CaseFolding.txt lines.

         Only lines accepted by ``commonAndSimpleLinesRE`` are used; text after
         a ``#`` is ignored. Every code point in 0..MaxUnicode that has no
         accepted entry is recorded as folding to itself.

         Args:
             file: Iterable yielding the lines of CaseFolding.txt.
         """
         codesSeen = set()
         for line in file:
             # Drop the trailing comment, then any trailing whitespace.
             line = line.split('#', 1)[0].rstrip()
             if not line:
                 continue

             fields = commonAndSimpleLinesRE.match(line)
             if not fields:
                 continue

             code = int(fields.group('code'), 16)
             codesSeen.add(code)
             self.addMapping(code, int(fields.group('mapping'), 16))

         for code in range(MaxUnicode + 1):
             if code not in codesSeen:
                 self.addMapping(code, code)

     def createTables(self, file):
         """Write the character-set and range tables as C++ source to ``file``.

         Each group is classified by size: a single code point is unique, more
         than two code points become a numbered character set, and a pair is
         either "alternating" (adjacent code points) or a Lo/Hi range entry
         carrying the distance between the two. Consecutive code points with
         the same classification are merged into one range row.

         Args:
             file: Object with a ``write`` method receiving the generated text.
         """
         # One (typeName, value) pair per code point. A code point that is in
         # no group has nothing else folding to it, so it defaults to unique
         # instead of leaving a blank entry that cannot be emitted.
         unique = ("CanonicalizeUnique", 0)
         typeInfo = [unique] * (MaxUnicode + 1)
         characterSets = []

         for mapping in sorted(self.canonicalGroups):
             # sorted() leaves the stored group untouched.
             characters = sorted(self.canonicalGroups[mapping])
             if len(characters) == 1:
                 typeInfo[characters[0]] = unique
             elif len(characters) > 2:
                 # The value is the index of this set in characterSets.
                 setInfo = ("CanonicalizeSet", len(characterSets))
                 for ch in characters:
                     typeInfo[ch] = setInfo
                 characterSets.append(characters)
             else:
                 low, high = characters
                 delta = high - low
                 if delta == 1:
                     # Adjacent pair: the name records whether the lower code
                     # point is odd (unaligned) or even (aligned).
                     name = "CanonicalizeAlternatingUnaligned" if low & 1 else "CanonicalizeAlternatingAligned"
                     typeInfo[low] = typeInfo[high] = (name, 0)
                 else:
                     typeInfo[low] = ("CanonicalizeRangeLo", delta)
                     typeInfo[high] = ("CanonicalizeRangeHi", delta)

         # Collapse runs of identical classifications into (begin, end, info).
         rangeInfo = []
         begin = 0
         while begin <= MaxUnicode:
             info = typeInfo[begin]
             end = begin
             while end < MaxUnicode and typeInfo[end + 1] == info:
                 end += 1
             rangeInfo.append((begin, end, info))
             begin = end + 1

         for index, characters in enumerate(characterSets):
             members = "".join("0x{character:04x}, ".format(character=ch) for ch in characters)
             file.write("const UChar32 unicodeCharacterSet{index:d}[] = {{ {characters}0 }};\n".format(index=index, characters=members))

         file.write("\n")
         file.write("static constexpr size_t UNICODE_CANONICALIZATION_SETS = {setCount:d};\n".format(setCount=len(characterSets)))
         file.write("const UChar32* const unicodeCharacterSetInfo[UNICODE_CANONICALIZATION_SETS] = {\n")

         for index in range(len(characterSets)):
             file.write("    unicodeCharacterSet{setNumber:d},\n".format(setNumber=index))

         file.write("};\n")
         file.write("\n")
         file.write("const size_t UNICODE_CANONICALIZATION_RANGES = {rangeCount:d};\n".format(rangeCount=len(rangeInfo)))
         file.write("const CanonicalizationRange unicodeRangeInfo[UNICODE_CANONICALIZATION_RANGES] = {\n")

         for begin, end, (name, value) in rangeInfo:
             file.write("    {{ 0x{begin:04x}, 0x{end:04x}, 0x{value:04x}, {type} }},\n".format(begin=begin, end=end, value=value, type=name))

         file.write("};\n")
         file.write("\n")


 if __name__ == "__main__":
     # Command-line entry point: read CaseFolding.txt (first argument) and write
     # the generated C++ tables to the path given as the second argument.
     parser = optparse.OptionParser(usage="usage: %prog  <CaseFolding.txt> <YarrCanonicalizeUnicode.h>")
     (options, args) = parser.parse_args()

     if len(args) != 2:
         parser.error("<CaseFolding.txt> <YarrCanonicalizeUnicode.h>")

     caseFoldingTxtPath, canonicalizeHPath = args

     canonicalize = Canonicalize()

     # The input is parsed before the output is opened, so a failure while
     # reading does not leave a truncated output file behind. The ``with``
     # blocks close each file even when an exception is raised.
     with openOrExit(caseFoldingTxtPath, "r") as caseFoldingTxtFile:
         canonicalize.readCaseFolding(caseFoldingTxtFile)

     with openOrExit(canonicalizeHPath, "w") as canonicalizeHFile:
         canonicalizeHFile.write(header)
         canonicalize.createTables(canonicalizeHFile)
         canonicalizeHFile.write(footer)

     # sys.exit is always available; the bare exit() builtin is only installed
     # by the site module.
     sys.exit(0)
	#!/usr/bin/env python3

	# Copyright (C) 2016-2019 Apple Inc. All rights reserved.
	#
	# Redistribution and use in source and binary forms, with or without
	# modification, are permitted provided that the following conditions
	# are met:
	#
	# 1. Redistributions of source code must retain the above copyright
	# notice, this list of conditions and the following disclaimer.
	# 2. Redistributions in binary form must reproduce the above copyright
	# notice, this list of conditions and the following disclaimer in the
	# documentation and/or other materials provided with the distribution.
	#
	# THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
	# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	# DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
	# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
	# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
	# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
	# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

	# This tool processes the Unicode Character Database file CaseFolding.txt to create
	# canonicalization table as described in ECMAScript 6 standard in section
	# "21.2.2.8.2 Runtime Semantics: Canonicalize()", step 2.

	import optparse
	import os
	import re
	import sys

	# Boilerplate written verbatim at the top of the generated C++ file: the
	# license block, the "generated file" banner, the includes, and the opening
	# of the JSC::Yarr namespaces.
	header = """/*
	 * Copyright (C) 2016 Apple Inc. All rights reserved.
	 *
	 * Redistribution and use in source and binary forms, with or without
	 * modification, are permitted provided that the following conditions
	 * are met:
	 *
	 * 1.  Redistributions of source code must retain the above copyright
	 *     notice, this list of conditions and the following disclaimer.
	 * 2.  Redistributions in binary form must reproduce the above copyright
	 *     notice, this list of conditions and the following disclaimer in the
	 *     documentation and/or other materials provided with the distribution.
	 *
	 * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
	 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	 * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
	 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
	 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
	 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
	 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	 */

	// DO NOT EDIT! - This file was generated by generateYarrCanonicalizeUnicode

	#include "config.h"
	#include "YarrCanonicalize.h"

	namespace JSC { namespace Yarr {

	"""

	# Written after the tables; closes the namespaces opened by ``header``.
	footer = """} } // JSC::Yarr
	"""

	# Highest code point handled; the generated tables cover 0..MaxUnicode inclusive.
	MaxUnicode = 0x10ffff
	# Matches a CaseFolding.txt entry whose status field is C or S (common or
	# simple folding), capturing the source code point and its single-code-point
	# mapping as hexadecimal strings. Whitespace around each ';' is optional:
	# real entries look like "0041; C; 0061; # ...", with none before the ';'.
	commonAndSimpleLinesRE = re.compile(r"(?P<code>[0-9A-F]+)\s*;\s*[CS]\s*;\s*(?P<mapping>[0-9A-F]+)", re.IGNORECASE)

	def openOrExit(path, mode):
	    """Open ``path`` as UTF-8 text in ``mode``, exiting the process on I/O failure.

	    When the mode creates the file ("w", "a" or "x"), missing parent
	    directories are created first. Nothing is created when opening for read.

	    Args:
	        path: File system path of the file to open.
	        mode: Mode string handed to ``open`` (for example ``"r"`` or ``"w"``).

	    Returns:
	        The opened file object.

	    Raises:
	        SystemExit: with status 1, after printing a diagnostic, when the
	            directory cannot be created or the file cannot be opened.
	    """
	    try:
	        dirname = os.path.dirname(path)
	        # A bare file name has an empty dirname and os.makedirs("") raises,
	        # so only create directories that are actually named in the path.
	        createsFile = any(flag in mode for flag in "wax")
	        if createsFile and dirname and not os.path.isdir(dirname):
	            os.makedirs(dirname)
	        if sys.version_info.major >= 3:
	            return open(path, mode, encoding="UTF-8")
	        return open(path, mode)
	    except (IOError, OSError) as e:
	        print("I/O error opening {0}, ({1}): {2}".format(path, e.errno, e.strerror))
	        # sys.exit is always available; the bare exit() builtin is only
	        # installed by the site module.
	        sys.exit(1)

	class Canonicalize:
	    """Groups code points by their case folding and emits C++ lookup tables.

	    Typical use is ``readCaseFolding`` on an open CaseFolding.txt followed by
	    ``createTables`` on the output file.
	    """

	    def __init__(self):
	        # Maps a folded code point to the list of code points that fold to it.
	        self.canonicalGroups = {}

	    def addMapping(self, code, mapping):
	        """Record that code point ``code`` folds to code point ``mapping``."""
	        self.canonicalGroups.setdefault(mapping, []).append(code)

	    def readCaseFolding(self, file):
	        """Populate the groups from an iterable of CaseFolding.txt lines.

	        Only lines accepted by ``commonAndSimpleLinesRE`` are used; text after
	        a ``#`` is ignored. Every code point in 0..MaxUnicode that has no
	        accepted entry is recorded as folding to itself.

	        Args:
	            file: Iterable yielding the lines of CaseFolding.txt.
	        """
	        codesSeen = set()
	        for line in file:
	            # Drop the trailing comment, then any trailing whitespace.
	            line = line.split('#', 1)[0].rstrip()
	            if not line:
	                continue

	            fields = commonAndSimpleLinesRE.match(line)
	            if not fields:
	                continue

	            code = int(fields.group('code'), 16)
	            codesSeen.add(code)
	            self.addMapping(code, int(fields.group('mapping'), 16))

	        for code in range(MaxUnicode + 1):
	            if code not in codesSeen:
	                self.addMapping(code, code)

	    def createTables(self, file):
	        """Write the character-set and range tables as C++ source to ``file``.

	        Each group is classified by size: a single code point is unique, more
	        than two code points become a numbered character set, and a pair is
	        either "alternating" (adjacent code points) or a Lo/Hi range entry
	        carrying the distance between the two. Consecutive code points with
	        the same classification are merged into one range row.

	        Args:
	            file: Object with a ``write`` method receiving the generated text.
	        """
	        # One (typeName, value) pair per code point. A code point that is in
	        # no group has nothing else folding to it, so it defaults to unique
	        # instead of leaving a blank entry that cannot be emitted.
	        unique = ("CanonicalizeUnique", 0)
	        typeInfo = [unique] * (MaxUnicode + 1)
	        characterSets = []

	        for mapping in sorted(self.canonicalGroups):
	            # sorted() leaves the stored group untouched.
	            characters = sorted(self.canonicalGroups[mapping])
	            if len(characters) == 1:
	                typeInfo[characters[0]] = unique
	            elif len(characters) > 2:
	                # The value is the index of this set in characterSets.
	                setInfo = ("CanonicalizeSet", len(characterSets))
	                for ch in characters:
	                    typeInfo[ch] = setInfo
	                characterSets.append(characters)
	            else:
	                low, high = characters
	                delta = high - low
	                if delta == 1:
	                    # Adjacent pair: the name records whether the lower code
	                    # point is odd (unaligned) or even (aligned).
	                    name = "CanonicalizeAlternatingUnaligned" if low & 1 else "CanonicalizeAlternatingAligned"
	                    typeInfo[low] = typeInfo[high] = (name, 0)
	                else:
	                    typeInfo[low] = ("CanonicalizeRangeLo", delta)
	                    typeInfo[high] = ("CanonicalizeRangeHi", delta)

	        # Collapse runs of identical classifications into (begin, end, info).
	        rangeInfo = []
	        begin = 0
	        while begin <= MaxUnicode:
	            info = typeInfo[begin]
	            end = begin
	            while end < MaxUnicode and typeInfo[end + 1] == info:
	                end += 1
	            rangeInfo.append((begin, end, info))
	            begin = end + 1

	        for index, characters in enumerate(characterSets):
	            members = "".join("0x{character:04x}, ".format(character=ch) for ch in characters)
	            file.write("const UChar32 unicodeCharacterSet{index:d}[] = {{ {characters}0 }};\n".format(index=index, characters=members))

	        file.write("\n")
	        file.write("static constexpr size_t UNICODE_CANONICALIZATION_SETS = {setCount:d};\n".format(setCount=len(characterSets)))
	        file.write("const UChar32* const unicodeCharacterSetInfo[UNICODE_CANONICALIZATION_SETS] = {\n")

	        for index in range(len(characterSets)):
	            file.write("    unicodeCharacterSet{setNumber:d},\n".format(setNumber=index))

	        file.write("};\n")
	        file.write("\n")
	        file.write("const size_t UNICODE_CANONICALIZATION_RANGES = {rangeCount:d};\n".format(rangeCount=len(rangeInfo)))
	        file.write("const CanonicalizationRange unicodeRangeInfo[UNICODE_CANONICALIZATION_RANGES] = {\n")

	        for begin, end, (name, value) in rangeInfo:
	            file.write("    {{ 0x{begin:04x}, 0x{end:04x}, 0x{value:04x}, {type} }},\n".format(begin=begin, end=end, value=value, type=name))

	        file.write("};\n")
	        file.write("\n")


	if __name__ == "__main__":
	    # Command-line entry point: read CaseFolding.txt (first argument) and write
	    # the generated C++ tables to the path given as the second argument.
	    parser = optparse.OptionParser(usage="usage: %prog <CaseFolding.txt> <YarrCanonicalizeUnicode.h>")
	    (options, args) = parser.parse_args()

	    if len(args) != 2:
	        parser.error("<CaseFolding.txt> <YarrCanonicalizeUnicode.h>")

	    caseFoldingTxtPath, canonicalizeHPath = args

	    canonicalize = Canonicalize()

	    # The input is parsed before the output is opened, so a failure while
	    # reading does not leave a truncated output file behind. The ``with``
	    # blocks close each file even when an exception is raised.
	    with openOrExit(caseFoldingTxtPath, "r") as caseFoldingTxtFile:
	        canonicalize.readCaseFolding(caseFoldingTxtFile)

	    with openOrExit(canonicalizeHPath, "w") as canonicalizeHFile:
	        canonicalizeHFile.write(header)
	        canonicalize.createTables(canonicalizeHFile)
	        canonicalizeHFile.write(footer)

	    # sys.exit is always available; the bare exit() builtin is only installed
	    # by the site module.
	    sys.exit(0)