Source/JavaScriptCore/yarr/YarrCanonicalize.h - WebKit - Git at Google

 /*
  * Copyright (C) 2012-2016 Apple Inc. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */

 #pragma once

 #include <stdint.h>
 #include <unicode/utypes.h>

 namespace JSC { namespace Yarr {

 // This set of data provides information for each UCS2 code point as to the set of code points
 // that it should match under the ES6 case insensitive RegExp matching rules, specified in 21.2.2.8.2.
 // The non-Unicode tables are autogenerated using YarrCanonicalize.js into YarrCanonicalize.cpp.
 // The Unicode tables are autogenerated using the python script generateYarrCanonicalizeUnicode
 // which creates YarrCanonicalizeUnicode.cpp.
 // Tags a CanonicalizationRange with the rule used to find the code points that match the
 // code points in that range; the rule decides how the range's value field is interpreted.
 enum UCS2CanonicalizationType {
     CanonicalizeUnique,               // No canonically equal values, e.g. 0x0.
     CanonicalizeSet,                  // Value indicates a set in characterSetInfo.
     CanonicalizeRangeLo,              // Value is the positive delta added to reach the pair, e.g. 0x41 has value 0x20, -> 0x61.
     CanonicalizeRangeHi,              // Value is the positive delta subtracted to reach the pair, e.g. 0x61 has value 0x20, -> 0x41.
     CanonicalizeAlternatingAligned,   // Aligned consecutive pair (differs only in the low bit), e.g. 0x1f4,0x1f5.
     CanonicalizeAlternatingUnaligned, // Unaligned consecutive pair (odd code point followed by even), e.g. 0x241,0x242.
 };
 // One entry of a canonicalization table: an inclusive span of code points [begin, end]
 // plus the rule (type) and the rule's operand (value) that apply to that span.
 struct CanonicalizationRange {
     UChar32 begin; // First code point covered by this entry (inclusive).
     UChar32 end; // Last code point covered by this entry (inclusive).
     UChar32 value; // Operand interpreted per type: a character set index for CanonicalizeSet, a delta for CanonicalizeRangeLo/Hi.
     UCS2CanonicalizationType type; // Rule used to find the matching code point(s).
 };

 // Tables selected by CanonicalMode::UCS2. UCS2_CANONICALIZATION_RANGES is the entry count
 // canonicalRangeInfoFor() uses when searching ucs2RangeInfo. ucs2CharacterSetInfo is indexed
 // by the value of a CanonicalizeSet range and yields a zero-terminated list of code points.
 extern const size_t UCS2_CANONICALIZATION_RANGES;
 extern const UChar32* const ucs2CharacterSetInfo[];
 extern const CanonicalizationRange ucs2RangeInfo[];

 // Tables selected by CanonicalMode::Unicode; they are used the same way as the UCS2 ones.
 extern const size_t UNICODE_CANONICALIZATION_RANGES;
 extern const UChar32* const unicodeCharacterSetInfo[];
 extern const CanonicalizationRange unicodeRangeInfo[];

 // Chooses which set of tables (ucs2* or unicode*) a lookup consults.
 enum class CanonicalMode { UCS2, Unicode };

 // Returns the character set stored at position index in the character set table that
 // belongs to canonicalMode. The index is not range-checked.
 inline const UChar32* canonicalCharacterSetInfo(unsigned index, CanonicalMode canonicalMode)
 {
     if (canonicalMode == CanonicalMode::UCS2)
         return ucs2CharacterSetInfo[index];
     return unicodeCharacterSetInfo[index];
 }

 // Binary-searches the range table selected by canonicalMode for the entry whose inclusive
 // [begin, end] span contains ch. The tables hold roughly 400-600 entries, so a lookup
 // should typically take about 9 compares.
 // NOTE: the loop only exits on a match, so ch is expected to be covered by the table.
 inline const CanonicalizationRange* canonicalRangeInfoFor(UChar32 ch, CanonicalMode canonicalMode = CanonicalMode::UCS2)
 {
     const bool useUCS2 = canonicalMode == CanonicalMode::UCS2;
     const CanonicalizationRange* base = useUCS2 ? ucs2RangeInfo : unicodeRangeInfo;
     size_t remaining = useUCS2 ? UCS2_CANONICALIZATION_RANGES : UNICODE_CANONICALIZATION_RANGES;

     for (;;) {
         const size_t middle = remaining / 2;
         const CanonicalizationRange* probe = &base[middle];

         // ch lies before the probed entry: keep only the entries below it.
         if (ch < probe->begin) {
             remaining = middle;
             continue;
         }

         if (ch <= probe->end)
             return probe;

         // ch lies after the probed entry: continue with the entries above it.
         base = probe + 1;
         remaining -= middle + 1;
     }
 }

 // Computes the single code point that pairs with ch, given the table entry (info) that
 // covers ch. Only valid for entries describing exactly one partner (the RangeLo, RangeHi
 // and Alternating types); any other type trips a release assertion.
 inline UChar32 getCanonicalPair(const CanonicalizationRange* info, UChar32 ch)
 {
     ASSERT(ch >= info->begin && ch <= info->end);

     const auto rule = info->type;
     if (rule == CanonicalizeRangeLo)
         return ch + info->value;
     if (rule == CanonicalizeRangeHi)
         return ch - info->value;
     if (rule == CanonicalizeAlternatingAligned)
         return ch ^ 1; // Flip the low bit to reach the other half of the aligned pair.
     if (rule == CanonicalizeAlternatingUnaligned)
         return ((ch - 1) ^ 1) + 1; // Same flip, performed after shifting down by one.

     RELEASE_ASSERT_NOT_REACHED();
     return 0;
 }

 // Returns true when the table entry covering ch (in the given mode) is of type
 // CanonicalizeUnique, i.e. no other code point can match this value.
 inline bool isCanonicallyUnique(UChar32 ch, CanonicalMode canonicalMode = CanonicalMode::UCS2)
 {
     const CanonicalizationRange* range = canonicalRangeInfoFor(ch, canonicalMode);
     return range->type == CanonicalizeUnique;
 }

 // Reports whether a and b are equal under the canonicalization rules of canonicalMode.
 // The decision is driven by the table entry that covers a.
 inline bool areCanonicallyEquivalent(UChar32 a, UChar32 b, CanonicalMode canonicalMode = CanonicalMode::UCS2)
 {
     const CanonicalizationRange* range = canonicalRangeInfoFor(a, canonicalMode);

     switch (range->type) {
     case CanonicalizeUnique:
         return b == a;
     case CanonicalizeSet: {
         // range->value selects a zero-terminated list of code points; b matches if listed.
         const UChar32* member = canonicalCharacterSetInfo(range->value, canonicalMode);
         while (*member) {
             if (*member == b)
                 return true;
             ++member;
         }
         return false;
     }
     case CanonicalizeRangeLo:
         return b == a || b == a + range->value;
     case CanonicalizeRangeHi:
         return b == a || b == a - range->value;
     case CanonicalizeAlternatingAligned:
         // Equal once the low bit is forced on, i.e. both belong to the same aligned pair.
         return (b | 1) == (a | 1);
     case CanonicalizeAlternatingUnaligned:
         // Same comparison after shifting both values down by one.
         return ((b - 1) | 1) == ((a - 1) | 1);
     }

     RELEASE_ASSERT_NOT_REACHED();
     return false;
 }

 } } // JSC::Yarr
	/*
	* Copyright (C) 2012-2016 Apple Inc. All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	* 1. Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* 2. Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	*
	* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
	* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
	* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
	* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
	* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
	* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*/

	#pragma once

	#include <stdint.h>
	#include <unicode/utypes.h>

	namespace JSC { namespace Yarr {

	// This set of data provides information for each UCS2 code point as to the set of code points
	// that it should match under the ES6 case insensitive RegExp matching rules, specified in 21.2.2.8.2.
	// The non-Unicode tables are autogenerated using YarrCanonicalize.js into YarrCanonicalize.cpp.
	// The Unicode tables are autogenerated using the python script generateYarrCanonicalizeUnicode
	// which creates YarrCanonicalizeUnicode.cpp.
	// Tags a CanonicalizationRange with the rule used to find the code points that match the
	// code points in that range; the rule decides how the range's value field is interpreted.
	enum UCS2CanonicalizationType {
	CanonicalizeUnique, // No canonically equal values, e.g. 0x0.
	CanonicalizeSet, // Value indicates a set in characterSetInfo.
	CanonicalizeRangeLo, // Value is the positive delta added to reach the pair, e.g. 0x41 has value 0x20, -> 0x61.
	CanonicalizeRangeHi, // Value is the positive delta subtracted to reach the pair, e.g. 0x61 has value 0x20, -> 0x41.
	CanonicalizeAlternatingAligned, // Aligned consecutive pair (differs only in the low bit), e.g. 0x1f4,0x1f5.
	CanonicalizeAlternatingUnaligned, // Unaligned consecutive pair (odd code point followed by even), e.g. 0x241,0x242.
	};
	// One entry of a canonicalization table: an inclusive span of code points [begin, end]
	// plus the rule (type) and the rule's operand (value) that apply to that span.
	struct CanonicalizationRange {
	UChar32 begin; // First code point covered by this entry (inclusive).
	UChar32 end; // Last code point covered by this entry (inclusive).
	UChar32 value; // Operand interpreted per type: a character set index for CanonicalizeSet, a delta for CanonicalizeRangeLo/Hi.
	UCS2CanonicalizationType type; // Rule used to find the matching code point(s).
	};

	// Tables selected by CanonicalMode::UCS2. UCS2_CANONICALIZATION_RANGES is the entry count
	// canonicalRangeInfoFor() uses when searching ucs2RangeInfo. ucs2CharacterSetInfo is indexed
	// by the value of a CanonicalizeSet range and yields a zero-terminated list of code points.
	extern const size_t UCS2_CANONICALIZATION_RANGES;
	extern const UChar32* const ucs2CharacterSetInfo[];
	extern const CanonicalizationRange ucs2RangeInfo[];

	// Tables selected by CanonicalMode::Unicode; they are used the same way as the UCS2 ones.
	extern const size_t UNICODE_CANONICALIZATION_RANGES;
	extern const UChar32* const unicodeCharacterSetInfo[];
	extern const CanonicalizationRange unicodeRangeInfo[];

	// Chooses which set of tables (ucs2* or unicode*) a lookup consults.
	enum class CanonicalMode { UCS2, Unicode };

	// Returns the character set stored at position index in the character set table that
	// belongs to canonicalMode. The index is not range-checked.
	inline const UChar32* canonicalCharacterSetInfo(unsigned index, CanonicalMode canonicalMode)
	{
	    if (canonicalMode == CanonicalMode::UCS2)
	        return ucs2CharacterSetInfo[index];
	    return unicodeCharacterSetInfo[index];
	}

	// Binary-searches the range table selected by canonicalMode for the entry whose inclusive
	// [begin, end] span contains ch. The tables hold roughly 400-600 entries, so a lookup
	// should typically take about 9 compares.
	// NOTE: the loop only exits on a match, so ch is expected to be covered by the table.
	inline const CanonicalizationRange* canonicalRangeInfoFor(UChar32 ch, CanonicalMode canonicalMode = CanonicalMode::UCS2)
	{
	    const bool useUCS2 = canonicalMode == CanonicalMode::UCS2;
	    const CanonicalizationRange* base = useUCS2 ? ucs2RangeInfo : unicodeRangeInfo;
	    size_t remaining = useUCS2 ? UCS2_CANONICALIZATION_RANGES : UNICODE_CANONICALIZATION_RANGES;

	    for (;;) {
	        const size_t middle = remaining / 2;
	        const CanonicalizationRange* probe = &base[middle];

	        // ch lies before the probed entry: keep only the entries below it.
	        if (ch < probe->begin) {
	            remaining = middle;
	            continue;
	        }

	        if (ch <= probe->end)
	            return probe;

	        // ch lies after the probed entry: continue with the entries above it.
	        base = probe + 1;
	        remaining -= middle + 1;
	    }
	}

	// Computes the single code point that pairs with ch, given the table entry (info) that
	// covers ch. Only valid for entries describing exactly one partner (the RangeLo, RangeHi
	// and Alternating types); any other type trips a release assertion.
	inline UChar32 getCanonicalPair(const CanonicalizationRange* info, UChar32 ch)
	{
	    ASSERT(ch >= info->begin && ch <= info->end);

	    const auto rule = info->type;
	    if (rule == CanonicalizeRangeLo)
	        return ch + info->value;
	    if (rule == CanonicalizeRangeHi)
	        return ch - info->value;
	    if (rule == CanonicalizeAlternatingAligned)
	        return ch ^ 1; // Flip the low bit to reach the other half of the aligned pair.
	    if (rule == CanonicalizeAlternatingUnaligned)
	        return ((ch - 1) ^ 1) + 1; // Same flip, performed after shifting down by one.

	    RELEASE_ASSERT_NOT_REACHED();
	    return 0;
	}

	// Returns true when the table entry covering ch (in the given mode) is of type
	// CanonicalizeUnique, i.e. no other code point can match this value.
	inline bool isCanonicallyUnique(UChar32 ch, CanonicalMode canonicalMode = CanonicalMode::UCS2)
	{
	    const CanonicalizationRange* range = canonicalRangeInfoFor(ch, canonicalMode);
	    return range->type == CanonicalizeUnique;
	}

	// Returns true if values are equal, under the canonicalization rules.
	//
	// a, b: the code points to compare; the table entry covering a decides which rule applies.
	// canonicalMode: selects the UCS2 or Unicode tables (defaults to UCS2).
	inline bool areCanonicallyEquivalent(UChar32 a, UChar32 b, CanonicalMode canonicalMode = CanonicalMode::UCS2)
	{
	    const CanonicalizationRange* info = canonicalRangeInfoFor(a, canonicalMode);
	    switch (info->type) {
	    case CanonicalizeUnique:
	        return a == b;
	    case CanonicalizeSet: {
	        // info->value selects a zero-terminated list of code points; b matches if listed.
	        for (const UChar32* set = canonicalCharacterSetInfo(info->value, canonicalMode); *set; ++set) {
	            if (*set == b)
	                return true;
	        }
	        return false;
	    }
	    case CanonicalizeRangeLo:
	        return (a == b) || (a + info->value == b);
	    case CanonicalizeRangeHi:
	        return (a == b) || (a - info->value == b);
	    case CanonicalizeAlternatingAligned:
	        // Equal once the low bit is forced on, i.e. both belong to the same aligned pair.
	        return (a | 1) == (b | 1);
	    case CanonicalizeAlternatingUnaligned:
	        // Same comparison after shifting both values down by one.
	        return ((a - 1) | 1) == ((b - 1) | 1);
	    }

	    RELEASE_ASSERT_NOT_REACHED();
	    return false;
	}

	} } // JSC::Yarr