// Source/WebCore/platform/text/DecodeEscapeSequences.h - WebKit - Git at Google

 /*
  * Copyright (C) 2011 Daniel Bates (dbates@intudata.com). All Rights Reserved.
  * Copyright (c) 2012 Google, inc.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of Google Inc. nor the names of its
  *    contributors may be used to endorse or promote products derived from
  *    this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */

 #pragma once

 #include "TextEncoding.h"
 #include <wtf/ASCIICType.h>
 #include <wtf/Assertions.h>
 #include <wtf/text/StringBuilder.h>

 namespace WebCore {

 // See <http://en.wikipedia.org/wiki/Percent-encoding#Non-standard_implementations>.
 struct Unicode16BitEscapeSequence {
     enum { sequenceSize = 6 }; // Characters per escape, e.g. %u26C4

     // Searches |string| for the next "%u" marker at or after |startPosition| and
     // returns whatever StringView::find() reports for that search.
     static size_t findInString(StringView string, size_t startPosition)
     {
         return string.find(StringView("%u"), startPosition);
     }

     // Starting at |startPosition|, walks over back-to-back "%uXXXX" sequences (X being an
     // ASCII hex digit) that fit entirely before |endPosition|. Returns the index just past
     // the last complete sequence; this equals |startPosition| when no sequence starts there.
     static size_t findEndOfRun(StringView string, size_t startPosition, size_t endPosition)
     {
         size_t position = startPosition;
         for (;;) {
             // Not enough characters left for another full sequence.
             if (endPosition - position < sequenceSize)
                 break;
             if (string[position] != '%' || string[position + 1] != 'u')
                 break;
             bool hasFourHexDigits = isASCIIHexDigit(string[position + 2]) && isASCIIHexDigit(string[position + 3])
                 && isASCIIHexDigit(string[position + 4]) && isASCIIHexDigit(string[position + 5]);
             if (!hasFourHexDigits)
                 break;
             position += sequenceSize;
         }
         return position;
     }

     // Converts a run produced by findEndOfRun() into a string. The TextEncoding argument is
     // unused: each %u-escape sequence represents a UTF-16 code unit.
     // See <http://www.w3.org/International/iri-edit/draft-duerst-iri.html#anchor29>.
     static String decodeRun(StringView run, const TextEncoding&)
     {
         // findEndOfRun() hands us a contiguous run of sequences with nothing in between,
         // so every sequence is decoded without re-validating it.
         StringBuilder decoded;
         auto sequencesLeft = run.length() / sequenceSize;
         decoded.reserveCapacity(sequencesLeft);
         for (; sequencesLeft; --sequencesLeft) {
             // Characters 0 and 1 are "%u"; characters 2-5 are the four hex digits.
             auto highByte = (toASCIIHexValue(run[2]) << 4) | toASCIIHexValue(run[3]);
             auto lowByte = (toASCIIHexValue(run[4]) << 4) | toASCIIHexValue(run[5]);
             UChar codeUnit = (highByte << 8) | lowByte;
             decoded.append(codeUnit);
             run = run.substring(sequenceSize);
         }
         return decoded.toString();
     }
 };

 struct URLEscapeSequence {
     enum { sequenceSize = 3 }; // Characters per escape, e.g. %41

     // Searches |string| for the next '%' at or after |startPosition| and returns
     // whatever StringView::find() reports for that search.
     static size_t findInString(StringView string, size_t startPosition)
     {
         return string.find('%', startPosition);
     }

     // Returns the index one past the end of the run that begins at |startPosition|.
     //
     // Simplifying assumption: a supported encoding may use up to two unescaped characters in
     // the range 0x40 - 0x7F as the trailing bytes of a sequence, and those must be handed to
     // the decoder together with the escapes. The run therefore stops at the first character
     // outside 0x40 - 0x7F, after two consecutive characters inside that range, or at a '%'
     // that does not introduce a valid escape sequence.
     static size_t findEndOfRun(StringView string, size_t startPosition, size_t endPosition)
     {
         size_t position = startPosition;
         int trailingCount = 0;
         while (position < endPosition) {
             auto character = string[position];
             if (character == '%') {
                 bool isValidEscape = endPosition - position >= sequenceSize
                     && isASCIIHexDigit(string[position + 1]) && isASCIIHexDigit(string[position + 2]);
                 if (!isValidEscape)
                     return position;
                 position += sequenceSize;
                 trailingCount = 0; // A fresh escape restarts the trailing-character budget.
                 continue;
             }

             bool isTrailingCandidate = character >= 0x40 && character <= 0x7F;
             if (!isTrailingCandidate || trailingCount >= 2)
                 return position;
             ++position;
             ++trailingCount;
         }
         return position;
     }

     // Unescapes |run| into raw bytes. findEndOfRun() guarantees that every '%' in the run
     // introduces a valid escape sequence, although other characters may sit between them;
     // those are copied through unchanged.
     static Vector<char, 512> decodeRun(StringView run)
     {
         Vector<char, 512> bytes;
         bytes.grow(run.length()); // Unescaping hex sequences only makes the length smaller.
         char* cursor = bytes.data();
         const size_t runLength = run.length();
         size_t readIndex = 0;
         while (readIndex < runLength) {
             if (run[readIndex] == '%') {
                 *cursor++ = (toASCIIHexValue(run[readIndex + 1]) << 4) | toASCIIHexValue(run[readIndex + 2]);
                 readIndex += sequenceSize;
             } else {
                 *cursor++ = run[readIndex];
                 ++readIndex;
             }
         }
         const size_t decodedLength = cursor - bytes.data();
         ASSERT(bytes.size() >= decodedLength); // Prove buffer not overrun.
         bytes.shrink(decodedLength);
         return bytes;
     }

     // Unescapes |run| to bytes and decodes them with |encoding|, or with UTF8Encoding()
     // when |encoding| is not valid.
     static String decodeRun(StringView run, const TextEncoding& encoding)
     {
         auto bytes = decodeRun(run);
         if (encoding.isValid())
             return encoding.decode(bytes.data(), bytes.size());
         return UTF8Encoding().decode(bytes.data(), bytes.size());
     }
 };

 // Replaces every run of escape sequences recognised by EscapeSequence with its decoded
 // text and returns the resulting string. Text outside the runs is copied through as-is,
 // and a run whose decoded form is empty is left in its original escaped form.
 template<typename EscapeSequence>
 String decodeEscapeSequences(StringView string, const TextEncoding& encoding)
 {
     const size_t inputLength = string.length();
     StringBuilder output;
     size_t copiedUpTo = 0; // Everything before this index is already reflected in |output|.
     size_t searchFrom = 0;
     for (;;) {
         size_t runStart = EscapeSequence::findInString(string, searchFrom);
         if (runStart == notFound)
             break;

         size_t runEnd = EscapeSequence::findEndOfRun(string, runStart, inputLength);
         if (runEnd == runStart) {
             // The marker did not start a valid sequence; resume the search one character later.
             searchFrom = runEnd + 1;
             continue;
         }
         searchFrom = runEnd;

         String decoded = EscapeSequence::decodeRun(string.substring(runStart, runEnd - runStart), encoding);
         if (!decoded.isEmpty()) {
             // Flush the literal text that precedes the run, then the decoded run itself.
             output.append(string.substring(copiedUpTo, runStart - copiedUpTo));
             output.append(decoded);
             copiedUpTo = runEnd;
         }
     }
     output.append(string.substring(copiedUpTo, inputLength - copiedUpTo));
     return output.toString();
 }

 // Converts |string| into bytes: text between escape runs is encoded with |encoding|
 // (using UnencodableHandling::URLEncodedEntities), while the bytes obtained from
 // %XX escape runs are appended verbatim. |encoding| is asserted to be valid.
 inline Vector<uint8_t> decodeURLEscapeSequencesAsData(StringView string, const TextEncoding& encoding)
 {
     ASSERT(encoding.isValid());

     Vector<uint8_t> bytes;
     size_t literalStart = 0; // Start of the text not yet emitted into |bytes|.
     size_t searchFrom = 0;
     for (;;) {
         const size_t runStart = URLEscapeSequence::findInString(string, searchFrom);
         const bool foundRun = runStart != notFound;
         size_t runEnd = 0;
         if (foundRun) {
             runEnd = URLEscapeSequence::findEndOfRun(string, runStart, string.length());
             if (runEnd == runStart) {
                 // A '%' that is not a valid escape: keep it as literal text and search on.
                 searchFrom = runEnd + 1;
                 continue;
             }
             searchFrom = runEnd;
         }

         // Strings are encoded as requested.
         // NOTE(review): when no run was found, runStart is notFound and this length is
         // oversized; presumably substring() clamps it to the end of the string - confirm.
         auto literal = string.substring(literalStart, runStart - literalStart);
         bytes.appendVector(encoding.encode(literal, UnencodableHandling::URLEncodedEntities));

         if (!foundRun)
             break;

         // Bytes go through as-is.
         auto rawBytes = URLEscapeSequence::decodeRun(string.substring(runStart, runEnd - runStart));
         ASSERT(!rawBytes.isEmpty());
         bytes.appendVector(rawBytes);

         literalStart = runEnd;
     }
     return bytes;
 }

 } // namespace WebCore
	/*
	* Copyright (C) 2011 Daniel Bates (dbates@intudata.com). All Rights Reserved.
	* Copyright (c) 2012 Google, inc. All Rights Reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	* 1. Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* 2. Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	* 3. Neither the name of Google Inc. nor the names of its
	* contributors may be used to endorse or promote products derived from
	* this software without specific prior written permission.
	*
	* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
	* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
	* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
	* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
	* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
	* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*/

	#pragma once

	#include "TextEncoding.h"
	#include <wtf/ASCIICType.h>
	#include <wtf/Assertions.h>
	#include <wtf/text/StringBuilder.h>

	namespace WebCore {

	// See <http://en.wikipedia.org/wiki/Percent-encoding#Non-standard_implementations>.
	struct Unicode16BitEscapeSequence {
	    enum { sequenceSize = 6 }; // e.g. %u26C4

	    // Returns the result of searching |string| for "%u" at or after |startPosition|.
	    static size_t findInString(StringView string, size_t startPosition) { return string.find(StringView("%u"), startPosition); }

	    // Returns the index one past the last complete "%uXXXX" sequence in the contiguous
	    // run that begins at |startPosition| and ends no later than |endPosition|. The result
	    // equals |startPosition| when no complete sequence starts there.
	    static size_t findEndOfRun(StringView string, size_t startPosition, size_t endPosition)
	    {
	        size_t runEnd = startPosition;
	        while (endPosition - runEnd >= sequenceSize && string[runEnd] == '%' && string[runEnd + 1] == 'u'
	               && isASCIIHexDigit(string[runEnd + 2]) && isASCIIHexDigit(string[runEnd + 3])
	               && isASCIIHexDigit(string[runEnd + 4]) && isASCIIHexDigit(string[runEnd + 5])) {
	            runEnd += sequenceSize;
	        }
	        return runEnd;
	    }

	    // Decodes a run of "%uXXXX" sequences into a string. The TextEncoding argument is unused.
	    static String decodeRun(StringView run, const TextEncoding&)
	    {
	        // Each %u-escape sequence represents a UTF-16 code unit.
	        // See <http://www.w3.org/International/iri-edit/draft-duerst-iri.html#anchor29>.
	        // For 16-bit escape sequences, we know that findEndOfRun() has given us a contiguous run of sequences
	        // without any intervening characters, so decode the run without additional checks.
	        auto numberOfSequences = run.length() / sequenceSize;
	        StringBuilder builder;
	        builder.reserveCapacity(numberOfSequences);
	        while (numberOfSequences--) {
	            // Combine the four hex digits (most significant first) with bitwise OR.
	            UChar codeUnit = (toASCIIHexValue(run[2]) << 12) | (toASCIIHexValue(run[3]) << 8) | (toASCIIHexValue(run[4]) << 4) | toASCIIHexValue(run[5]);
	            builder.append(codeUnit);
	            run = run.substring(sequenceSize);
	        }
	        return builder.toString();
	    }
	};

	struct URLEscapeSequence {
	    enum { sequenceSize = 3 }; // e.g. %41

	    // Returns the result of searching |string| for '%' at or after |startPosition|.
	    static size_t findInString(StringView string, size_t startPosition) { return string.find('%', startPosition); }

	    // Returns the index one past the end of the escape run that begins at |startPosition|,
	    // never going beyond |endPosition|.
	    static size_t findEndOfRun(StringView string, size_t startPosition, size_t endPosition)
	    {
	        // Make the simplifying assumption that supported encodings may have up to two unescaped characters
	        // in the range 0x40 - 0x7F as the trailing bytes of their sequences which need to be passed into the
	        // decoder as part of the run. In other words, we end the run at the first value outside of the
	        // 0x40 - 0x7F range, after two values in this range, or at a %-sign that does not introduce a valid
	        // escape sequence.
	        size_t runEnd = startPosition;
	        int numberOfTrailingCharacters = 0;
	        while (runEnd < endPosition) {
	            if (string[runEnd] == '%') {
	                if (endPosition - runEnd >= sequenceSize && isASCIIHexDigit(string[runEnd + 1]) && isASCIIHexDigit(string[runEnd + 2])) {
	                    runEnd += sequenceSize;
	                    numberOfTrailingCharacters = 0;
	                } else
	                    break;
	            } else if (string[runEnd] >= 0x40 && string[runEnd] <= 0x7F && numberOfTrailingCharacters < 2) {
	                runEnd += 1;
	                numberOfTrailingCharacters += 1;
	            } else
	                break;
	        }
	        return runEnd;
	    }

	    // Unescapes |run| into a byte buffer; characters that are not part of an escape are copied unchanged.
	    static Vector<char, 512> decodeRun(StringView run)
	    {
	        // For URL escape sequences, we know that findEndOfRun() has given us a run where every %-sign introduces
	        // a valid escape sequence, but there may be characters between the sequences.
	        Vector<char, 512> buffer;
	        buffer.grow(run.length()); // Unescaping hex sequences only makes the length smaller.
	        char* p = buffer.data();
	        while (!run.isEmpty()) {
	            if (run[0] == '%') {
	                // High nibble from the first hex digit, low nibble from the second, joined with bitwise OR.
	                *p++ = (toASCIIHexValue(run[1]) << 4) | toASCIIHexValue(run[2]);
	                run = run.substring(sequenceSize);
	            } else {
	                *p++ = run[0];
	                run = run.substring(1);
	            }
	        }
	        ASSERT(buffer.size() >= static_cast<size_t>(p - buffer.data())); // Prove buffer not overrun.
	        buffer.shrink(p - buffer.data());
	        return buffer;
	    }

	    // Unescapes |run| and decodes the bytes with |encoding|, or with UTF8Encoding() when |encoding| is not valid.
	    static String decodeRun(StringView run, const TextEncoding& encoding)
	    {
	        auto buffer = decodeRun(run);
	        if (!encoding.isValid())
	            return UTF8Encoding().decode(buffer.data(), buffer.size());
	        return encoding.decode(buffer.data(), buffer.size());
	    }
	};

	// Replaces every run of escape sequences recognised by EscapeSequence with its decoded
	// text and returns the resulting string. Text outside the runs is copied through as-is,
	// and a run whose decoded form is empty is left in its original escaped form.
	template<typename EscapeSequence>
	String decodeEscapeSequences(StringView string, const TextEncoding& encoding)
	{
	    const size_t inputLength = string.length();
	    StringBuilder output;
	    size_t copiedUpTo = 0; // Everything before this index is already reflected in |output|.
	    size_t searchFrom = 0;
	    for (;;) {
	        size_t runStart = EscapeSequence::findInString(string, searchFrom);
	        if (runStart == notFound)
	            break;

	        size_t runEnd = EscapeSequence::findEndOfRun(string, runStart, inputLength);
	        if (runEnd == runStart) {
	            // The marker did not start a valid sequence; resume the search one character later.
	            searchFrom = runEnd + 1;
	            continue;
	        }
	        searchFrom = runEnd;

	        String decoded = EscapeSequence::decodeRun(string.substring(runStart, runEnd - runStart), encoding);
	        if (!decoded.isEmpty()) {
	            // Flush the literal text that precedes the run, then the decoded run itself.
	            output.append(string.substring(copiedUpTo, runStart - copiedUpTo));
	            output.append(decoded);
	            copiedUpTo = runEnd;
	        }
	    }
	    output.append(string.substring(copiedUpTo, inputLength - copiedUpTo));
	    return output.toString();
	}

	// Converts |string| into bytes: text between escape runs is encoded with |encoding|
	// (using UnencodableHandling::URLEncodedEntities), while the bytes obtained from
	// %XX escape runs are appended verbatim. |encoding| is asserted to be valid.
	inline Vector<uint8_t> decodeURLEscapeSequencesAsData(StringView string, const TextEncoding& encoding)
	{
	    ASSERT(encoding.isValid());

	    Vector<uint8_t> bytes;
	    size_t literalStart = 0; // Start of the text not yet emitted into |bytes|.
	    size_t searchFrom = 0;
	    for (;;) {
	        const size_t runStart = URLEscapeSequence::findInString(string, searchFrom);
	        const bool foundRun = runStart != notFound;
	        size_t runEnd = 0;
	        if (foundRun) {
	            runEnd = URLEscapeSequence::findEndOfRun(string, runStart, string.length());
	            if (runEnd == runStart) {
	                // A '%' that is not a valid escape: keep it as literal text and search on.
	                searchFrom = runEnd + 1;
	                continue;
	            }
	            searchFrom = runEnd;
	        }

	        // Strings are encoded as requested.
	        // NOTE(review): when no run was found, runStart is notFound and this length is
	        // oversized; presumably substring() clamps it to the end of the string - confirm.
	        auto literal = string.substring(literalStart, runStart - literalStart);
	        bytes.appendVector(encoding.encode(literal, UnencodableHandling::URLEncodedEntities));

	        if (!foundRun)
	            break;

	        // Bytes go through as-is.
	        auto rawBytes = URLEscapeSequence::decodeRun(string.substring(runStart, runEnd - runStart));
	        ASSERT(!rawBytes.isEmpty());
	        bytes.appendVector(rawBytes);

	        literalStart = runEnd;
	    }
	    return bytes;
	}

	} // namespace WebCore