blob: e03a16fc87e95e32cb22f56fcb6211a0aa7e1fcf [file] [log] [blame]
#!/usr/bin/env python3
# Copyright (C) 2016-2019 Apple Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# This tool processes the Unicode Character Database file CaseFolding.txt to create
# the canonicalization table as described in the ECMAScript 6 standard in section
# "21.2.2.8.2 Runtime Semantics: Canonicalize()", step 2.
import optparse
import os
import re
import sys
# Boilerplate written at the start of the generated C++ output: the license
# text, the required #includes, and the opening of the JSC::Yarr namespaces.
# NOTE(review): the banner inside reads "DO NO EDIT!" (sic); it is emitted
# verbatim into the generated file.
header = """/*
* Copyright (C) 2016 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// DO NO EDIT! - This file was generated by generateYarrCanonicalizeUnicode
#include "config.h"
#include "YarrCanonicalize.h"
namespace JSC { namespace Yarr {
"""
# Written after the generated tables; closes the two namespaces opened by
# `header`.
footer = """} } // JSC::Yarr
"""
# Largest code point handled; the tables cover every value from 0 through
# MaxUnicode inclusive.
MaxUnicode = 0x10ffff
# Matches a CaseFolding.txt entry of the form "<code>; <status>; <mapping>"
# whose status field is C or S (matched case-insensitively), capturing the
# code point and its mapping as hexadecimal digit strings. Entries with any
# other status do not match.
commonAndSimpleLinesRE = re.compile(r"(?P<code>[0-9A-F]+)\s*;\s*[CS]\s*;\s*(?P<mapping>[0-9A-F]+)", re.IGNORECASE)
def openOrExit(path, mode):
    """Open *path* with *mode* as UTF-8 text, exiting the process on failure.

    When the file is opened in a mode that can create it (anything other
    than an "r..." mode), missing parent directories are created first.

    Args:
        path: Path of the file to open. May be a bare filename, in which
            case the current working directory is used.
        mode: Mode string passed through to ``open``.

    Returns:
        The opened file object.

    On any I/O or OS error a diagnostic is printed and the process exits
    with status 1.
    """
    try:
        dirname = os.path.dirname(path)
        # dirname is "" for a bare filename; os.makedirs("") would raise, so
        # only create directories when there is a directory component. There
        # is also no point creating directories for a read-only open.
        if dirname and not mode.startswith("r") and not os.path.isdir(dirname):
            os.makedirs(dirname)
        if sys.version_info.major >= 3:
            return open(path, mode, encoding="UTF-8")
        else:
            return open(path, mode)
    except (IOError, OSError) as e:
        # os.makedirs raises OSError, which is distinct from IOError on
        # Python 2, so both are caught here.
        print("I/O error opening {0}, ({1}): {2}".format(path, e.errno, e.strerror))
        # sys.exit rather than the site-provided exit(), which is intended
        # for interactive use and is not guaranteed to exist.
        sys.exit(1)
class Canonicalize:
    """Collects case-folding groups and emits the C++ canonicalization tables."""

    def __init__(self):
        # Maps a fold target code point to the list of code points that fold
        # to it (the target itself is added by readCaseFolding).
        self.canonicalGroups = {}

    def addMapping(self, code, mapping):
        """Record that *code* belongs to the group keyed by *mapping*."""
        self.canonicalGroups.setdefault(mapping, []).append(code)

    def readCaseFolding(self, file):
        """Populate the groups from the lines of a CaseFolding.txt file.

        Everything after a '#' on a line is ignored. Lines that do not match
        commonAndSimpleLinesRE are skipped. Every code point in
        0..MaxUnicode that had no matching entry is afterwards mapped to
        itself.
        """
        foldedCodes = set()
        for rawLine in file:
            content = rawLine.split('#', 1)[0].rstrip()
            if not content:
                continue
            entry = commonAndSimpleLinesRE.match(content)
            if not entry:
                continue
            code = int(entry.group('code'), 16)
            mapping = int(entry.group('mapping'), 16)
            foldedCodes.add(code)
            self.addMapping(code, mapping)

        # Code points without an explicit folding canonicalize to themselves.
        for codePoint in range(MaxUnicode + 1):
            if codePoint not in foldedCodes:
                self.addMapping(codePoint, codePoint)

    def createTables(self, file):
        """Classify every code point and write the C++ tables to *file*.

        Each code point gets a "Name:value" tag according to its group:
        a group of one is CanonicalizeUnique; a group of more than two is
        CanonicalizeSet with the index of its character set; an adjacent
        pair is CanonicalizeAlternatingAligned (even low member) or
        CanonicalizeAlternatingUnaligned (odd low member); any other pair is
        CanonicalizeRangeLo / CanonicalizeRangeHi with the distance between
        the two members. Runs of consecutive code points with the same tag
        are then merged into ranges, and the character sets, the set
        pointer table and the range table are written out.
        """
        tags = [""] * (MaxUnicode + 1)
        characterSets = []

        for mapping in sorted(self.canonicalGroups.keys()):
            group = self.canonicalGroups[mapping]
            if len(group) == 1:
                tags[group[0]] = "CanonicalizeUnique:0"
                continue

            # Sorted in place, so the stored group ends up ordered as well.
            group.sort()
            if len(group) > 2:
                setTag = "CanonicalizeSet:%d" % len(characterSets)
                for member in group:
                    tags[member] = setTag
                characterSets.append(group)
                continue

            low = group[0]
            high = group[1]
            delta = high - low
            if delta != 1:
                tags[low] = "CanonicalizeRangeLo:%d" % delta
                tags[high] = "CanonicalizeRangeHi:%d" % delta
                continue

            if low & 1:
                pairTag = "CanonicalizeAlternatingUnaligned:0"
            else:
                pairTag = "CanonicalizeAlternatingAligned:0"
            tags[low] = pairTag
            tags[high] = pairTag

        # Merge consecutive code points carrying the same tag into
        # (begin, end, tag) ranges covering 0..MaxUnicode.
        ranges = []
        begin = 0
        while begin <= MaxUnicode:
            tag = tags[begin]
            last = begin
            while last < MaxUnicode and tags[last + 1] == tag:
                last += 1
            ranges.append((begin, last, tag))
            begin = last + 1

        # One zero-terminated array per multi-member character set.
        for index, members in enumerate(characterSets):
            characters = "".join("0x{character:04x}, ".format(character=ch) for ch in members)
            file.write("const UChar32 unicodeCharacterSet{index:d}[] = {{ {characters}0 }};\n".format(index=index, characters=characters))

        file.write("\n")
        file.write("static constexpr size_t UNICODE_CANONICALIZATION_SETS = {setCount:d};\n".format(setCount=len(characterSets)))
        file.write("const UChar32* const unicodeCharacterSetInfo[UNICODE_CANONICALIZATION_SETS] = {\n")
        for setNumber, _ in enumerate(characterSets):
            file.write(" unicodeCharacterSet{setNumber:d},\n".format(setNumber=setNumber))
        file.write("};\n")
        file.write("\n")

        file.write("const size_t UNICODE_CANONICALIZATION_RANGES = {rangeCount:d};\n".format(rangeCount=len(ranges)))
        file.write("const CanonicalizationRange unicodeRangeInfo[UNICODE_CANONICALIZATION_RANGES] = {\n")
        for rangeBegin, rangeEnd, rangeTag in ranges:
            # The tag is "Name:value"; split it back into its two parts.
            nameAndValue = rangeTag.split(":")
            file.write(" {{ 0x{begin:04x}, 0x{end:04x}, 0x{value:04x}, {type} }},\n".format(begin=rangeBegin, end=rangeEnd, value=int(nameAndValue[1]), type=nameAndValue[0]))
        file.write("};\n")
        file.write("\n")
if __name__ == "__main__":
    # Command-line entry point: the first argument is the CaseFolding.txt
    # input, the second is the path of the generated output file.
    parser = optparse.OptionParser(usage = "usage: %prog <CaseFolding.txt> <YarrCanonicalizeUnicode.h>")
    (options, args) = parser.parse_args()
    if len(args) != 2:
        parser.error("<CaseFolding.txt> <YarrCanonicalizeUnicode.h>")
    caseFoldingTxtPath = args[0]
    canonicalizeHPath = args[1]

    canonicalize = Canonicalize()
    # Both files are opened before any work is done, so a bad path fails
    # early. The with-statement guarantees both handles are closed (and the
    # output flushed) even if parsing or table generation raises.
    with openOrExit(caseFoldingTxtPath, "r") as caseFoldingTxtFile, \
            openOrExit(canonicalizeHPath, "w") as canonicalizeHFile:
        canonicalize.readCaseFolding(caseFoldingTxtFile)
        canonicalizeHFile.write(header)
        canonicalize.createTables(canonicalizeHFile)
        canonicalizeHFile.write(footer)
    # sys.exit rather than the site-provided exit(), which is meant for
    # interactive sessions.
    sys.exit(0)