| /* |
| * Copyright (C) 2016-2020 Apple Inc. All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * 1. Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * 2. Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * |
| * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' |
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, |
| * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
| * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS |
| * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF |
| * THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #include "config.h" |
| #include <wtf/URLParser.h> |
| |
| #include <array> |
| #include <functional> |
| #include <mutex> |
| #include <wtf/text/CodePointIterator.h> |
| |
| namespace WTF { |
| |
// Compile-time switch for parser tracing: when non-zero, URL_PARSER_LOG forwards to WTFLogAlways.
#define URL_PARSER_DEBUGGING 0

#if URL_PARSER_DEBUGGING
#define URL_PARSER_LOG(...) WTFLogAlways(__VA_ARGS__)
#else
// Tracing disabled: URL_PARSER_LOG expands to nothing, so its arguments are never evaluated.
#define URL_PARSER_LOG(...)
#endif
| |
| ALWAYS_INLINE static void appendCodePoint(Vector<UChar>& destination, UChar32 codePoint) |
| { |
| if (U_IS_BMP(codePoint)) { |
| destination.append(static_cast<UChar>(codePoint)); |
| return; |
| } |
| destination.reserveCapacity(destination.size() + 2); |
| destination.uncheckedAppend(U16_LEAD(codePoint)); |
| destination.uncheckedAppend(U16_TRAIL(codePoint)); |
| } |
| |
// Bit flags stored per byte in characterClassTable; one byte may carry several flags.
enum URLCharacterClass {
    UserInfo = 1 << 0,
    Default = 1 << 1,
    ForbiddenHost = 1 << 2,
    QueryPercent = 1 << 3,
    SlashQuestionOrHash = 1 << 4,
    ValidScheme = 1 << 5,
};
| |
// Per-byte classification table, indexed by byte value (0x00-0xFF). Each entry
// is a bitwise OR of URLCharacterClass flags (0 means the byte is in no class).
// The trailing comment on every row names the byte that row describes.
// Rows 0x80-0xFF carry only QueryPercent.
static const uint8_t characterClassTable[256] = {
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0x0
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0x1
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0x2
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0x3
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0x4
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0x5
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0x6
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0x7
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0x8
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0x9
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0xA
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0xB
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0xC
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0xD
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0xE
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0xF
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0x10
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0x11
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0x12
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0x13
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0x14
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0x15
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0x16
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0x17
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0x18
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0x19
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0x1A
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0x1B
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0x1C
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0x1D
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0x1E
    UserInfo | Default | QueryPercent | ForbiddenHost, // 0x1F
    UserInfo | Default | QueryPercent | ForbiddenHost, // ' '
    0, // '!'
    UserInfo | Default | QueryPercent, // '"'
    UserInfo | Default | QueryPercent | SlashQuestionOrHash | ForbiddenHost, // '#'
    0, // '$'
    ForbiddenHost, // '%'
    0, // '&'
    0, // '\''
    0, // '('
    0, // ')'
    0, // '*'
    ValidScheme, // '+'
    0, // ','
    ValidScheme, // '-'
    ValidScheme, // '.'
    UserInfo | SlashQuestionOrHash | ForbiddenHost, // '/'
    ValidScheme, // '0'
    ValidScheme, // '1'
    ValidScheme, // '2'
    ValidScheme, // '3'
    ValidScheme, // '4'
    ValidScheme, // '5'
    ValidScheme, // '6'
    ValidScheme, // '7'
    ValidScheme, // '8'
    ValidScheme, // '9'
    UserInfo | ForbiddenHost, // ':'
    UserInfo, // ';'
    UserInfo | Default | QueryPercent | ForbiddenHost, // '<'
    UserInfo, // '='
    UserInfo | Default | QueryPercent | ForbiddenHost, // '>'
    UserInfo | Default | SlashQuestionOrHash | ForbiddenHost, // '?'
    UserInfo | ForbiddenHost, // '@'
    ValidScheme, // 'A'
    ValidScheme, // 'B'
    ValidScheme, // 'C'
    ValidScheme, // 'D'
    ValidScheme, // 'E'
    ValidScheme, // 'F'
    ValidScheme, // 'G'
    ValidScheme, // 'H'
    ValidScheme, // 'I'
    ValidScheme, // 'J'
    ValidScheme, // 'K'
    ValidScheme, // 'L'
    ValidScheme, // 'M'
    ValidScheme, // 'N'
    ValidScheme, // 'O'
    ValidScheme, // 'P'
    ValidScheme, // 'Q'
    ValidScheme, // 'R'
    ValidScheme, // 'S'
    ValidScheme, // 'T'
    ValidScheme, // 'U'
    ValidScheme, // 'V'
    ValidScheme, // 'W'
    ValidScheme, // 'X'
    ValidScheme, // 'Y'
    ValidScheme, // 'Z'
    UserInfo | ForbiddenHost, // '['
    UserInfo | SlashQuestionOrHash | ForbiddenHost, // '\\'
    UserInfo | ForbiddenHost, // ']'
    UserInfo | ForbiddenHost, // '^'
    0, // '_'
    UserInfo | Default, // '`'
    ValidScheme, // 'a'
    ValidScheme, // 'b'
    ValidScheme, // 'c'
    ValidScheme, // 'd'
    ValidScheme, // 'e'
    ValidScheme, // 'f'
    ValidScheme, // 'g'
    ValidScheme, // 'h'
    ValidScheme, // 'i'
    ValidScheme, // 'j'
    ValidScheme, // 'k'
    ValidScheme, // 'l'
    ValidScheme, // 'm'
    ValidScheme, // 'n'
    ValidScheme, // 'o'
    ValidScheme, // 'p'
    ValidScheme, // 'q'
    ValidScheme, // 'r'
    ValidScheme, // 's'
    ValidScheme, // 't'
    ValidScheme, // 'u'
    ValidScheme, // 'v'
    ValidScheme, // 'w'
    ValidScheme, // 'x'
    ValidScheme, // 'y'
    ValidScheme, // 'z'
    UserInfo | Default, // '{'
    UserInfo | ForbiddenHost, // '|'
    UserInfo | Default, // '}'
    0, // '~'
    QueryPercent | ForbiddenHost, // 0x7F
    QueryPercent, // 0x80
    QueryPercent, // 0x81
    QueryPercent, // 0x82
    QueryPercent, // 0x83
    QueryPercent, // 0x84
    QueryPercent, // 0x85
    QueryPercent, // 0x86
    QueryPercent, // 0x87
    QueryPercent, // 0x88
    QueryPercent, // 0x89
    QueryPercent, // 0x8A
    QueryPercent, // 0x8B
    QueryPercent, // 0x8C
    QueryPercent, // 0x8D
    QueryPercent, // 0x8E
    QueryPercent, // 0x8F
    QueryPercent, // 0x90
    QueryPercent, // 0x91
    QueryPercent, // 0x92
    QueryPercent, // 0x93
    QueryPercent, // 0x94
    QueryPercent, // 0x95
    QueryPercent, // 0x96
    QueryPercent, // 0x97
    QueryPercent, // 0x98
    QueryPercent, // 0x99
    QueryPercent, // 0x9A
    QueryPercent, // 0x9B
    QueryPercent, // 0x9C
    QueryPercent, // 0x9D
    QueryPercent, // 0x9E
    QueryPercent, // 0x9F
    QueryPercent, // 0xA0
    QueryPercent, // 0xA1
    QueryPercent, // 0xA2
    QueryPercent, // 0xA3
    QueryPercent, // 0xA4
    QueryPercent, // 0xA5
    QueryPercent, // 0xA6
    QueryPercent, // 0xA7
    QueryPercent, // 0xA8
    QueryPercent, // 0xA9
    QueryPercent, // 0xAA
    QueryPercent, // 0xAB
    QueryPercent, // 0xAC
    QueryPercent, // 0xAD
    QueryPercent, // 0xAE
    QueryPercent, // 0xAF
    QueryPercent, // 0xB0
    QueryPercent, // 0xB1
    QueryPercent, // 0xB2
    QueryPercent, // 0xB3
    QueryPercent, // 0xB4
    QueryPercent, // 0xB5
    QueryPercent, // 0xB6
    QueryPercent, // 0xB7
    QueryPercent, // 0xB8
    QueryPercent, // 0xB9
    QueryPercent, // 0xBA
    QueryPercent, // 0xBB
    QueryPercent, // 0xBC
    QueryPercent, // 0xBD
    QueryPercent, // 0xBE
    QueryPercent, // 0xBF
    QueryPercent, // 0xC0
    QueryPercent, // 0xC1
    QueryPercent, // 0xC2
    QueryPercent, // 0xC3
    QueryPercent, // 0xC4
    QueryPercent, // 0xC5
    QueryPercent, // 0xC6
    QueryPercent, // 0xC7
    QueryPercent, // 0xC8
    QueryPercent, // 0xC9
    QueryPercent, // 0xCA
    QueryPercent, // 0xCB
    QueryPercent, // 0xCC
    QueryPercent, // 0xCD
    QueryPercent, // 0xCE
    QueryPercent, // 0xCF
    QueryPercent, // 0xD0
    QueryPercent, // 0xD1
    QueryPercent, // 0xD2
    QueryPercent, // 0xD3
    QueryPercent, // 0xD4
    QueryPercent, // 0xD5
    QueryPercent, // 0xD6
    QueryPercent, // 0xD7
    QueryPercent, // 0xD8
    QueryPercent, // 0xD9
    QueryPercent, // 0xDA
    QueryPercent, // 0xDB
    QueryPercent, // 0xDC
    QueryPercent, // 0xDD
    QueryPercent, // 0xDE
    QueryPercent, // 0xDF
    QueryPercent, // 0xE0
    QueryPercent, // 0xE1
    QueryPercent, // 0xE2
    QueryPercent, // 0xE3
    QueryPercent, // 0xE4
    QueryPercent, // 0xE5
    QueryPercent, // 0xE6
    QueryPercent, // 0xE7
    QueryPercent, // 0xE8
    QueryPercent, // 0xE9
    QueryPercent, // 0xEA
    QueryPercent, // 0xEB
    QueryPercent, // 0xEC
    QueryPercent, // 0xED
    QueryPercent, // 0xEE
    QueryPercent, // 0xEF
    QueryPercent, // 0xF0
    QueryPercent, // 0xF1
    QueryPercent, // 0xF2
    QueryPercent, // 0xF3
    QueryPercent, // 0xF4
    QueryPercent, // 0xF5
    QueryPercent, // 0xF6
    QueryPercent, // 0xF7
    QueryPercent, // 0xF8
    QueryPercent, // 0xF9
    QueryPercent, // 0xFA
    QueryPercent, // 0xFB
    QueryPercent, // 0xFC
    QueryPercent, // 0xFD
    QueryPercent, // 0xFE
    QueryPercent, // 0xFF
};
| |
| template<typename CharacterType> ALWAYS_INLINE static bool isC0Control(CharacterType character) { return character <= 0x1F; } |
| template<typename CharacterType> ALWAYS_INLINE static bool isC0ControlOrSpace(CharacterType character) { return character <= 0x20; } |
| template<typename CharacterType> ALWAYS_INLINE static bool isTabOrNewline(CharacterType character) { return character <= 0xD && character >= 0x9 && character != 0xB && character != 0xC; } |
| template<typename CharacterType> ALWAYS_INLINE static bool isInSimpleEncodeSet(CharacterType character) { return character > 0x7E || isC0Control(character); } |
| template<typename CharacterType> ALWAYS_INLINE static bool isInFragmentEncodeSet(CharacterType character) { return character > 0x7E || character == '`' || ((characterClassTable[character] & QueryPercent) && character != '#'); } |
| template<typename CharacterType> ALWAYS_INLINE static bool isInDefaultEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & Default; } |
| template<typename CharacterType> ALWAYS_INLINE static bool isInUserInfoEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & UserInfo; } |
| template<typename CharacterType> ALWAYS_INLINE static bool isPercentOrNonASCII(CharacterType character) { return !isASCII(character) || character == '%'; } |
| template<typename CharacterType> ALWAYS_INLINE static bool isSlashQuestionOrHash(CharacterType character) { return character <= '\\' && characterClassTable[character] & SlashQuestionOrHash; } |
| template<typename CharacterType> ALWAYS_INLINE static bool isValidSchemeCharacter(CharacterType character) { return character <= 'z' && characterClassTable[character] & ValidScheme; } |
| template<typename CharacterType> ALWAYS_INLINE static bool isForbiddenHostCodePoint(CharacterType character) { return character <= 0x7F && characterClassTable[character] & ForbiddenHost; } |
| ALWAYS_INLINE static bool shouldPercentEncodeQueryByte(uint8_t byte, const bool& urlIsSpecial) |
| { |
| if (characterClassTable[byte] & QueryPercent) |
| return true; |
| if (byte == '\'' && urlIsSpecial) |
| return true; |
| return false; |
| } |
| |
| bool URLParser::isInUserInfoEncodeSet(UChar c) |
| { |
| return WTF::isInUserInfoEncodeSet(c); |
| } |
| |
| template<typename CharacterType, URLParser::ReportSyntaxViolation reportSyntaxViolation> |
| ALWAYS_INLINE void URLParser::advance(CodePointIterator<CharacterType>& iterator, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition) |
| { |
| ++iterator; |
| while (UNLIKELY(!iterator.atEnd() && isTabOrNewline(*iterator))) { |
| if (reportSyntaxViolation == ReportSyntaxViolation::Yes) |
| syntaxViolation(iteratorForSyntaxViolationPosition); |
| ++iterator; |
| } |
| } |
| |
| template<typename CharacterType> |
| bool URLParser::takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType> iterator) |
| { |
| if (iterator.atEnd()) |
| return false; |
| advance<CharacterType, ReportSyntaxViolation::No>(iterator); |
| if (iterator.atEnd()) |
| return false; |
| advance<CharacterType, ReportSyntaxViolation::No>(iterator); |
| return iterator.atEnd(); |
| } |
| |
| template<typename CharacterType> |
| ALWAYS_INLINE bool URLParser::isWindowsDriveLetter(CodePointIterator<CharacterType> iterator) |
| { |
| // https://url.spec.whatwg.org/#start-with-a-windows-drive-letter |
| if (iterator.atEnd() || !isASCIIAlpha(*iterator)) |
| return false; |
| advance<CharacterType, ReportSyntaxViolation::No>(iterator); |
| if (iterator.atEnd()) |
| return false; |
| if (*iterator != ':' && *iterator != '|') |
| return false; |
| advance<CharacterType, ReportSyntaxViolation::No>(iterator); |
| return iterator.atEnd() || *iterator == '/' || *iterator == '\\' || *iterator == '?' || *iterator == '#'; |
| } |
| |
| ALWAYS_INLINE void URLParser::appendToASCIIBuffer(UChar32 codePoint) |
| { |
| ASSERT(isASCII(codePoint)); |
| if (UNLIKELY(m_didSeeSyntaxViolation)) |
| m_asciiBuffer.append(codePoint); |
| } |
| |
| ALWAYS_INLINE void URLParser::appendToASCIIBuffer(const char* characters, size_t length) |
| { |
| if (UNLIKELY(m_didSeeSyntaxViolation)) |
| m_asciiBuffer.append(characters, length); |
| } |
| |
| template<typename CharacterType> |
| void URLParser::appendWindowsDriveLetter(CodePointIterator<CharacterType>& iterator) |
| { |
| auto lengthWithOnlyOneSlashInPath = m_url.m_hostEnd + m_url.m_portLength + 1; |
| if (m_url.m_pathAfterLastSlash > lengthWithOnlyOneSlashInPath) { |
| syntaxViolation(iterator); |
| m_url.m_pathAfterLastSlash = lengthWithOnlyOneSlashInPath; |
| m_asciiBuffer.resize(lengthWithOnlyOneSlashInPath); |
| } |
| ASSERT(isWindowsDriveLetter(iterator)); |
| appendToASCIIBuffer(*iterator); |
| advance(iterator); |
| ASSERT(!iterator.atEnd()); |
| ASSERT(*iterator == ':' || *iterator == '|'); |
| if (*iterator == '|') |
| syntaxViolation(iterator); |
| appendToASCIIBuffer(':'); |
| advance(iterator); |
| } |
| |
| bool URLParser::copyBaseWindowsDriveLetter(const URL& base) |
| { |
| if (base.protocolIs("file")) { |
| RELEASE_ASSERT(base.m_hostEnd + base.m_portLength < base.m_string.length()); |
| if (base.m_string.is8Bit()) { |
| const LChar* begin = base.m_string.characters8(); |
| CodePointIterator<LChar> c(begin + base.m_hostEnd + base.m_portLength + 1, begin + base.m_string.length()); |
| if (isWindowsDriveLetter(c)) { |
| appendWindowsDriveLetter(c); |
| return true; |
| } |
| } else { |
| const UChar* begin = base.m_string.characters16(); |
| CodePointIterator<UChar> c(begin + base.m_hostEnd + base.m_portLength + 1, begin + base.m_string.length()); |
| if (isWindowsDriveLetter(c)) { |
| appendWindowsDriveLetter(c); |
| return true; |
| } |
| } |
| } |
| return false; |
| } |
| |
| template<typename CharacterType> |
| bool URLParser::shouldCopyFileURL(CodePointIterator<CharacterType> iterator) |
| { |
| if (!isWindowsDriveLetter(iterator)) |
| return true; |
| if (iterator.atEnd()) |
| return false; |
| advance(iterator); |
| if (iterator.atEnd()) |
| return true; |
| advance(iterator); |
| if (iterator.atEnd()) |
| return true; |
| return !isSlashQuestionOrHash(*iterator); |
| } |
| |
| static void percentEncodeByte(uint8_t byte, Vector<LChar>& buffer) |
| { |
| buffer.append('%'); |
| buffer.append(upperNibbleToASCIIHexDigit(byte)); |
| buffer.append(lowerNibbleToASCIIHexDigit(byte)); |
| } |
| |
| void URLParser::percentEncodeByte(uint8_t byte) |
| { |
| ASSERT(m_didSeeSyntaxViolation); |
| appendToASCIIBuffer('%'); |
| appendToASCIIBuffer(upperNibbleToASCIIHexDigit(byte)); |
| appendToASCIIBuffer(lowerNibbleToASCIIHexDigit(byte)); |
| } |
| |
// Percent-encoded UTF-8 bytes (EF BF BD) emitted when a code point cannot be
// encoded as UTF-8. The length excludes the terminating NUL.
const char replacementCharacterUTF8PercentEncoded[10] = "%EF%BF%BD";
const size_t replacementCharacterUTF8PercentEncodedLength = (sizeof(replacementCharacterUTF8PercentEncoded) / sizeof(replacementCharacterUTF8PercentEncoded[0])) - 1;
| |
| template<bool(*isInCodeSet)(UChar32), typename CharacterType> |
| ALWAYS_INLINE void URLParser::utf8PercentEncode(const CodePointIterator<CharacterType>& iterator) |
| { |
| ASSERT(!iterator.atEnd()); |
| UChar32 codePoint = *iterator; |
| if (LIKELY(isASCII(codePoint))) { |
| if (UNLIKELY(isInCodeSet(codePoint))) { |
| syntaxViolation(iterator); |
| percentEncodeByte(codePoint); |
| } else |
| appendToASCIIBuffer(codePoint); |
| return; |
| } |
| ASSERT_WITH_MESSAGE(isInCodeSet(codePoint), "isInCodeSet should always return true for non-ASCII characters"); |
| syntaxViolation(iterator); |
| |
| uint8_t buffer[U8_MAX_LENGTH]; |
| int32_t offset = 0; |
| UBool isError = false; |
| U8_APPEND(buffer, offset, U8_MAX_LENGTH, codePoint, isError); |
| if (isError) { |
| appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength); |
| return; |
| } |
| for (int32_t i = 0; i < offset; ++i) |
| percentEncodeByte(buffer[i]); |
| } |
| |
| template<typename CharacterType> |
| ALWAYS_INLINE void URLParser::utf8QueryEncode(const CodePointIterator<CharacterType>& iterator) |
| { |
| ASSERT(!iterator.atEnd()); |
| UChar32 codePoint = *iterator; |
| if (LIKELY(isASCII(codePoint))) { |
| if (UNLIKELY(shouldPercentEncodeQueryByte(codePoint, m_urlIsSpecial))) { |
| syntaxViolation(iterator); |
| percentEncodeByte(codePoint); |
| } else |
| appendToASCIIBuffer(codePoint); |
| return; |
| } |
| |
| syntaxViolation(iterator); |
| |
| uint8_t buffer[U8_MAX_LENGTH]; |
| int32_t offset = 0; |
| UBool isError = false; |
| U8_APPEND(buffer, offset, U8_MAX_LENGTH, codePoint, isError); |
| if (isError) { |
| appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength); |
| return; |
| } |
| for (int32_t i = 0; i < offset; ++i) { |
| auto byte = buffer[i]; |
| if (shouldPercentEncodeQueryByte(byte, m_urlIsSpecial)) |
| percentEncodeByte(byte); |
| else |
| appendToASCIIBuffer(byte); |
| } |
| } |
| |
| template<typename CharacterType> |
| void URLParser::encodeNonUTF8Query(const Vector<UChar>& source, const URLTextEncoding& encoding, CodePointIterator<CharacterType> iterator) |
| { |
| auto encoded = encoding.encodeForURLParsing(StringView(source.data(), source.size())); |
| auto* data = encoded.data(); |
| size_t length = encoded.size(); |
| |
| if (!length == !iterator.atEnd()) { |
| syntaxViolation(iterator); |
| return; |
| } |
| |
| size_t i = 0; |
| for (; i < length; ++i) { |
| ASSERT(!iterator.atEnd()); |
| uint8_t byte = data[i]; |
| if (UNLIKELY(byte != *iterator)) { |
| syntaxViolation(iterator); |
| break; |
| } |
| if (UNLIKELY(shouldPercentEncodeQueryByte(byte, m_urlIsSpecial))) { |
| syntaxViolation(iterator); |
| break; |
| } |
| appendToASCIIBuffer(byte); |
| ++iterator; |
| } |
| while (!iterator.atEnd() && isTabOrNewline(*iterator)) |
| ++iterator; |
| ASSERT((i == length) == iterator.atEnd()); |
| for (; i < length; ++i) { |
| ASSERT(m_didSeeSyntaxViolation); |
| uint8_t byte = data[i]; |
| if (shouldPercentEncodeQueryByte(byte, m_urlIsSpecial)) |
| percentEncodeByte(byte); |
| else |
| appendToASCIIBuffer(byte); |
| } |
| } |
| |
| std::optional<uint16_t> URLParser::defaultPortForProtocol(StringView scheme) |
| { |
| static constexpr uint16_t ftpPort = 21; |
| static constexpr uint16_t httpPort = 80; |
| static constexpr uint16_t httpsPort = 443; |
| static constexpr uint16_t wsPort = 80; |
| static constexpr uint16_t wssPort = 443; |
| |
| auto length = scheme.length(); |
| if (!length) |
| return std::nullopt; |
| switch (scheme[0]) { |
| case 'w': |
| switch (length) { |
| case 2: |
| if (scheme[1] == 's') |
| return wsPort; |
| return std::nullopt; |
| case 3: |
| if (scheme[1] == 's' |
| && scheme[2] == 's') |
| return wssPort; |
| return std::nullopt; |
| default: |
| return false; |
| } |
| case 'h': |
| switch (length) { |
| case 4: |
| if (scheme[1] == 't' |
| && scheme[2] == 't' |
| && scheme[3] == 'p') |
| return httpPort; |
| return std::nullopt; |
| case 5: |
| if (scheme[1] == 't' |
| && scheme[2] == 't' |
| && scheme[3] == 'p' |
| && scheme[4] == 's') |
| return httpsPort; |
| return std::nullopt; |
| default: |
| return std::nullopt; |
| } |
| case 'f': |
| if (length == 3 |
| && scheme[1] == 't' |
| && scheme[2] == 'p') |
| return ftpPort; |
| return std::nullopt; |
| default: |
| return std::nullopt; |
| } |
| } |
| |
// Result of classifying a scheme string with scheme(): one enumerator per
// recognised scheme, and NonSpecial for everything else.
enum class Scheme {
    WS, // "ws"
    WSS, // "wss"
    File, // "file"
    FTP, // "ftp"
    HTTP, // "http"
    HTTPS, // "https"
    NonSpecial, // any other scheme, including the empty string
};
| |
| ALWAYS_INLINE static Scheme scheme(StringView scheme) |
| { |
| auto length = scheme.length(); |
| if (!length) |
| return Scheme::NonSpecial; |
| switch (scheme[0]) { |
| case 'f': |
| switch (length) { |
| case 3: |
| if (scheme[1] == 't' |
| && scheme[2] == 'p') |
| return Scheme::FTP; |
| return Scheme::NonSpecial; |
| case 4: |
| if (scheme[1] == 'i' |
| && scheme[2] == 'l' |
| && scheme[3] == 'e') |
| return Scheme::File; |
| return Scheme::NonSpecial; |
| default: |
| return Scheme::NonSpecial; |
| } |
| case 'h': |
| switch (length) { |
| case 4: |
| if (scheme[1] == 't' |
| && scheme[2] == 't' |
| && scheme[3] == 'p') |
| return Scheme::HTTP; |
| return Scheme::NonSpecial; |
| case 5: |
| if (scheme[1] == 't' |
| && scheme[2] == 't' |
| && scheme[3] == 'p' |
| && scheme[4] == 's') |
| return Scheme::HTTPS; |
| return Scheme::NonSpecial; |
| default: |
| return Scheme::NonSpecial; |
| } |
| case 'w': |
| switch (length) { |
| case 2: |
| if (scheme[1] == 's') |
| return Scheme::WS; |
| return Scheme::NonSpecial; |
| case 3: |
| if (scheme[1] == 's' |
| && scheme[2] == 's') |
| return Scheme::WSS; |
| return Scheme::NonSpecial; |
| default: |
| return Scheme::NonSpecial; |
| } |
| default: |
| return Scheme::NonSpecial; |
| } |
| } |
| |
| std::optional<String> URLParser::maybeCanonicalizeScheme(StringView scheme) |
| { |
| if (scheme.isEmpty()) |
| return std::nullopt; |
| |
| if (!isASCIIAlpha(scheme[0])) |
| return std::nullopt; |
| |
| for (size_t i = 1; i < scheme.length(); ++i) { |
| if (isASCIIAlphanumeric(scheme[i]) || scheme[i] == '+' || scheme[i] == '-' || scheme[i] == '.') |
| continue; |
| return std::nullopt; |
| } |
| |
| return scheme.convertToASCIILowercase(); |
| } |
| |
| bool URLParser::isSpecialScheme(StringView schemeArg) |
| { |
| return scheme(schemeArg) != Scheme::NonSpecial; |
| } |
| |
| enum class URLParser::URLPart { |
| SchemeEnd, |
| UserStart, |
| UserEnd, |
| PasswordEnd, |
| HostEnd, |
| PortEnd, |
| PathAfterLastSlash, |
| PathEnd, |
| QueryEnd, |
| }; |
| |
| size_t URLParser::urlLengthUntilPart(const URL& url, URLPart part) |
| { |
| switch (part) { |
| case URLPart::QueryEnd: |
| return url.m_queryEnd; |
| case URLPart::PathEnd: |
| return url.m_pathEnd; |
| case URLPart::PathAfterLastSlash: |
| return url.m_pathAfterLastSlash; |
| case URLPart::PortEnd: |
| return url.m_hostEnd + url.m_portLength; |
| case URLPart::HostEnd: |
| return url.m_hostEnd; |
| case URLPart::PasswordEnd: |
| return url.m_passwordEnd; |
| case URLPart::UserEnd: |
| return url.m_userEnd; |
| case URLPart::UserStart: |
| return url.m_userStart; |
| case URLPart::SchemeEnd: |
| return url.m_schemeEnd; |
| } |
| ASSERT_NOT_REACHED(); |
| return 0; |
| } |
| |
| void URLParser::copyASCIIStringUntil(const String& string, size_t length) |
| { |
| RELEASE_ASSERT(length <= string.length()); |
| if (string.isNull()) |
| return; |
| ASSERT(m_asciiBuffer.isEmpty()); |
| if (string.is8Bit()) |
| appendToASCIIBuffer(string.characters8(), length); |
| else { |
| const UChar* characters = string.characters16(); |
| for (size_t i = 0; i < length; ++i) { |
| UChar c = characters[i]; |
| ASSERT_WITH_SECURITY_IMPLICATION(isASCII(c)); |
| appendToASCIIBuffer(c); |
| } |
| } |
| } |
| |
| template<typename CharacterType> |
| void URLParser::copyURLPartsUntil(const URL& base, URLPart part, const CodePointIterator<CharacterType>& iterator, const URLTextEncoding*& nonUTF8QueryEncoding) |
| { |
| syntaxViolation(iterator); |
| |
| m_asciiBuffer.clear(); |
| copyASCIIStringUntil(base.m_string, urlLengthUntilPart(base, part)); |
| switch (part) { |
| case URLPart::QueryEnd: |
| m_url.m_queryEnd = base.m_queryEnd; |
| FALLTHROUGH; |
| case URLPart::PathEnd: |
| m_url.m_pathEnd = base.m_pathEnd; |
| FALLTHROUGH; |
| case URLPart::PathAfterLastSlash: |
| m_url.m_pathAfterLastSlash = base.m_pathAfterLastSlash; |
| FALLTHROUGH; |
| case URLPart::PortEnd: |
| m_url.m_portLength = base.m_portLength; |
| FALLTHROUGH; |
| case URLPart::HostEnd: |
| m_url.m_hostEnd = base.m_hostEnd; |
| FALLTHROUGH; |
| case URLPart::PasswordEnd: |
| m_url.m_passwordEnd = base.m_passwordEnd; |
| FALLTHROUGH; |
| case URLPart::UserEnd: |
| m_url.m_userEnd = base.m_userEnd; |
| FALLTHROUGH; |
| case URLPart::UserStart: |
| m_url.m_userStart = base.m_userStart; |
| FALLTHROUGH; |
| case URLPart::SchemeEnd: |
| m_url.m_isValid = base.m_isValid; |
| m_url.m_protocolIsInHTTPFamily = base.m_protocolIsInHTTPFamily; |
| m_url.m_schemeEnd = base.m_schemeEnd; |
| } |
| |
| switch (scheme(StringView(m_asciiBuffer.data(), m_url.m_schemeEnd))) { |
| case Scheme::WS: |
| case Scheme::WSS: |
| nonUTF8QueryEncoding = nullptr; |
| m_urlIsSpecial = true; |
| return; |
| case Scheme::File: |
| m_urlIsFile = true; |
| FALLTHROUGH; |
| case Scheme::FTP: |
| case Scheme::HTTP: |
| case Scheme::HTTPS: |
| m_urlIsSpecial = true; |
| return; |
| case Scheme::NonSpecial: |
| m_urlIsSpecial = false; |
| nonUTF8QueryEncoding = nullptr; |
| auto pathStart = m_url.m_hostEnd + m_url.m_portLength; |
| if (pathStart + 2 < m_asciiBuffer.size() |
| && m_asciiBuffer[pathStart] == '/' |
| && m_asciiBuffer[pathStart + 1] == '.' |
| && m_asciiBuffer[pathStart + 2] == '/') { |
| m_asciiBuffer.remove(pathStart + 1, 2); |
| m_url.m_pathAfterLastSlash = std::max(2u, m_url.m_pathAfterLastSlash) - 2; |
| m_url.m_pathEnd = std::max(2u, m_url.m_pathEnd) - 2; |
| m_url.m_queryEnd = std::max(2u, m_url.m_queryEnd) - 2; |
| } |
| return; |
| } |
| ASSERT_NOT_REACHED(); |
| } |
| |
// The two hex digits that follow '%' in a percent-encoded '.' ("%2e"). The
// dot-segment helpers compare the second digit after toASCIILower, so "%2E"
// matches as well.
static const char dotASCIICode[2] = {'2', 'e'};
| |
| template<typename CharacterType> |
| ALWAYS_INLINE bool URLParser::isSingleDotPathSegment(CodePointIterator<CharacterType> c) |
| { |
| if (c.atEnd()) |
| return false; |
| if (*c == '.') { |
| advance<CharacterType, ReportSyntaxViolation::No>(c); |
| return c.atEnd() || isSlashQuestionOrHash(*c); |
| } |
| if (*c != '%') |
| return false; |
| advance<CharacterType, ReportSyntaxViolation::No>(c); |
| if (c.atEnd() || *c != dotASCIICode[0]) |
| return false; |
| advance<CharacterType, ReportSyntaxViolation::No>(c); |
| if (c.atEnd()) |
| return false; |
| if (toASCIILower(*c) == dotASCIICode[1]) { |
| advance<CharacterType, ReportSyntaxViolation::No>(c); |
| return c.atEnd() || isSlashQuestionOrHash(*c); |
| } |
| return false; |
| } |
| |
| template<typename CharacterType> |
| ALWAYS_INLINE bool URLParser::isDoubleDotPathSegment(CodePointIterator<CharacterType> c) |
| { |
| if (c.atEnd()) |
| return false; |
| if (*c == '.') { |
| advance<CharacterType, ReportSyntaxViolation::No>(c); |
| return isSingleDotPathSegment(c); |
| } |
| if (*c != '%') |
| return false; |
| advance<CharacterType, ReportSyntaxViolation::No>(c); |
| if (c.atEnd() || *c != dotASCIICode[0]) |
| return false; |
| advance<CharacterType, ReportSyntaxViolation::No>(c); |
| if (c.atEnd()) |
| return false; |
| if (toASCIILower(*c) == dotASCIICode[1]) { |
| advance<CharacterType, ReportSyntaxViolation::No>(c); |
| return isSingleDotPathSegment(c); |
| } |
| return false; |
| } |
| |
| template<typename CharacterType> |
| void URLParser::consumeSingleDotPathSegment(CodePointIterator<CharacterType>& c) |
| { |
| ASSERT(isSingleDotPathSegment(c)); |
| if (*c == '.') { |
| advance(c); |
| if (!c.atEnd()) { |
| if (*c == '/' || *c == '\\') |
| advance(c); |
| else |
| ASSERT(*c == '?' || *c == '#'); |
| } |
| } else { |
| ASSERT(*c == '%'); |
| advance(c); |
| ASSERT(*c == dotASCIICode[0]); |
| advance(c); |
| ASSERT(toASCIILower(*c) == dotASCIICode[1]); |
| advance(c); |
| if (!c.atEnd()) { |
| if (*c == '/' || *c == '\\') |
| advance(c); |
| else |
| ASSERT(*c == '?' || *c == '#'); |
| } |
| } |
| } |
| |
| template<typename CharacterType> |
| void URLParser::consumeDoubleDotPathSegment(CodePointIterator<CharacterType>& c) |
| { |
| ASSERT(isDoubleDotPathSegment(c)); |
| if (*c == '.') |
| advance(c); |
| else { |
| ASSERT(*c == '%'); |
| advance(c); |
| ASSERT(*c == dotASCIICode[0]); |
| advance(c); |
| ASSERT(toASCIILower(*c) == dotASCIICode[1]); |
| advance(c); |
| } |
| consumeSingleDotPathSegment(c); |
| } |
| |
| bool URLParser::shouldPopPath(unsigned newPathAfterLastSlash) |
| { |
| ASSERT(m_didSeeSyntaxViolation); |
| if (!m_urlIsFile) |
| return true; |
| |
| ASSERT(m_url.m_pathAfterLastSlash <= m_asciiBuffer.size()); |
| CodePointIterator<LChar> componentToPop(&m_asciiBuffer[newPathAfterLastSlash], &m_asciiBuffer[0] + m_url.m_pathAfterLastSlash); |
| if (newPathAfterLastSlash == m_url.m_hostEnd + m_url.m_portLength + 1 && isWindowsDriveLetter(componentToPop)) |
| return false; |
| return true; |
| } |
| |
| void URLParser::popPath() |
| { |
| ASSERT(m_didSeeSyntaxViolation); |
| if (m_url.m_pathAfterLastSlash > m_url.m_hostEnd + m_url.m_portLength + 1) { |
| auto newPathAfterLastSlash = m_url.m_pathAfterLastSlash - 1; |
| if (m_asciiBuffer[newPathAfterLastSlash] == '/') |
| newPathAfterLastSlash--; |
| while (newPathAfterLastSlash > m_url.m_hostEnd + m_url.m_portLength && m_asciiBuffer[newPathAfterLastSlash] != '/') |
| newPathAfterLastSlash--; |
| newPathAfterLastSlash++; |
| if (shouldPopPath(newPathAfterLastSlash)) |
| m_url.m_pathAfterLastSlash = newPathAfterLastSlash; |
| } |
| m_asciiBuffer.resize(m_url.m_pathAfterLastSlash); |
| } |
| |
| template<typename CharacterType> |
| void URLParser::syntaxViolation(const CodePointIterator<CharacterType>& iterator) |
| { |
| if (m_didSeeSyntaxViolation) |
| return; |
| m_didSeeSyntaxViolation = true; |
| |
| ASSERT(m_asciiBuffer.isEmpty()); |
| size_t codeUnitsToCopy = iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin)); |
| RELEASE_ASSERT(codeUnitsToCopy <= m_inputString.length()); |
| m_asciiBuffer.reserveCapacity(m_inputString.length()); |
| for (size_t i = 0; i < codeUnitsToCopy; ++i) { |
| ASSERT(isASCII(m_inputString[i])); |
| m_asciiBuffer.uncheckedAppend(m_inputString[i]); |
| } |
| } |
| |
// Marks the parse as failed: invalidates m_url and then stores the original
// input as its string.
void URLParser::failure()
{
    m_url.invalidate();
    // Assigned after invalidate(); NOTE(review): presumably invalidate() resets m_string — confirm against URL::invalidate.
    m_url.m_string = m_inputString;
}
| |
| template<typename CharacterType> |
| bool URLParser::checkLocalhostCodePoint(CodePointIterator<CharacterType>& iterator, UChar32 codePoint) |
| { |
| if (iterator.atEnd() || toASCIILower(*iterator) != codePoint) |
| return false; |
| advance<CharacterType, ReportSyntaxViolation::No>(iterator); |
| return true; |
| } |
| |
| template<typename CharacterType> |
| bool URLParser::isAtLocalhost(CodePointIterator<CharacterType> iterator) |
| { |
| if (!checkLocalhostCodePoint(iterator, 'l')) |
| return false; |
| if (!checkLocalhostCodePoint(iterator, 'o')) |
| return false; |
| if (!checkLocalhostCodePoint(iterator, 'c')) |
| return false; |
| if (!checkLocalhostCodePoint(iterator, 'a')) |
| return false; |
| if (!checkLocalhostCodePoint(iterator, 'l')) |
| return false; |
| if (!checkLocalhostCodePoint(iterator, 'h')) |
| return false; |
| if (!checkLocalhostCodePoint(iterator, 'o')) |
| return false; |
| if (!checkLocalhostCodePoint(iterator, 's')) |
| return false; |
| if (!checkLocalhostCodePoint(iterator, 't')) |
| return false; |
| return iterator.atEnd(); |
| } |
| |
| bool URLParser::isLocalhost(StringView view) |
| { |
| if (view.is8Bit()) |
| return isAtLocalhost(CodePointIterator<LChar>(view.characters8(), view.characters8() + view.length())); |
| return isAtLocalhost(CodePointIterator<UChar>(view.characters16(), view.characters16() + view.length())); |
| } |
| |
| ALWAYS_INLINE StringView URLParser::parsedDataView(size_t start, size_t length) |
| { |
| if (UNLIKELY(m_didSeeSyntaxViolation)) { |
| ASSERT(start + length <= m_asciiBuffer.size()); |
| return StringView(m_asciiBuffer.data() + start, length); |
| } |
| ASSERT(start + length <= m_inputString.length()); |
| return StringView(m_inputString).substring(start, length); |
| } |
| |
| ALWAYS_INLINE UChar URLParser::parsedDataView(size_t position) |
| { |
| if (UNLIKELY(m_didSeeSyntaxViolation)) |
| return m_asciiBuffer[position]; |
| return m_inputString[position]; |
| } |
| |
| template<typename CharacterType> |
| ALWAYS_INLINE size_t URLParser::currentPosition(const CodePointIterator<CharacterType>& iterator) |
| { |
| if (UNLIKELY(m_didSeeSyntaxViolation)) |
| return m_asciiBuffer.size(); |
| |
| return iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin)); |
| } |
| |
// Constructs the parser and immediately parses `input`, resolving it against `base` where
// needed. `nonUTF8QueryEncoding` is forwarded unchanged to parse().
//
// A null `input` is not parsed: when `base` is valid and can be a base URL the result is
// `base` without its fragment identifier; otherwise m_url is not assigned here.
| URLParser::URLParser(const String& input, const URL& base, const URLTextEncoding* nonUTF8QueryEncoding) |
| : m_inputString(input) |
| { |
| if (input.isNull()) { |
| if (base.isValid() && !base.m_cannotBeABaseURL) { |
| m_url = base; |
| m_url.removeFragmentIdentifier(); |
| } |
| return; |
| } |
| |
// Remember where the input begins (currentPosition() measures offsets from m_inputBegin),
// then dispatch to the parse() instantiation matching the string's character width.
| if (input.is8Bit()) { |
| m_inputBegin = input.characters8(); |
| parse(input.characters8(), input.length(), base, nonUTF8QueryEncoding); |
| } else { |
| m_inputBegin = input.characters16(); |
| parse(input.characters16(), input.length(), base, nonUTF8QueryEncoding); |
| } |
| |
// Sanity check for valid results: a syntax violation should have been recorded exactly when
// the output differs from the input. The remaining alternatives exempt input made only of
// C0-control/space characters that resolved to the base (up to its query end), and any
// parse against a valid file: base.
| ASSERT(!m_url.m_isValid |
| || m_didSeeSyntaxViolation == (m_url.string() != input) |
| || (input.isAllSpecialCharacters<isC0ControlOrSpace>() && m_url.m_string == base.m_string.left(base.m_queryEnd)) |
| || (base.isValid() && base.protocolIs("file"))); |
| ASSERT(internalValuesConsistent(m_url)); |
| #if ASSERT_ENABLED |
// Assertion-enabled builds only: cross-check the no-violation path against the
// buffer-building path by re-parsing the same input with a leading space.
| if (!m_didSeeSyntaxViolation) { |
| // Force a syntax violation at the beginning to make sure we get the same result. |
| URLParser parser(makeString(" ", input), base, nonUTF8QueryEncoding); |
| URL parsed = parser.result(); |
| if (parsed.isValid()) |
| ASSERT(allValuesEqual(parser.result(), m_url)); |
| } |
| #endif // ASSERT_ENABLED |
| |
// Post-processing step; needsNonSpecialDotSlash() and addNonSpecialDotSlash() are defined
// outside this part of the file.
| if (UNLIKELY(needsNonSpecialDotSlash())) |
| addNonSpecialDotSlash(); |
| } |
| |
// Core URL state machine. Parses `length` code units starting at `input`, resolving against
// `base` when the input carries no scheme of its own, and fills in m_url's component offsets
// (m_schemeEnd, m_userStart, ... m_queryEnd) and finally m_url.m_string.
//
// Offsets are expressed via currentPosition(): while no syntax violation has been recorded
// they index the input string itself; after the first syntaxViolation() they index
// m_asciiBuffer, which then holds the canonicalized output.
//
// When parsing fails, failure() is called and the function returns early. A non-null
// `nonUTF8QueryEncoding` (it is reset to null for ws/wss and non-special schemes and for the
// sentinel value) makes the query be collected and encoded by encodeNonUTF8Query() instead of
// utf8QueryEncode().
// NOTE(review): state names appear to follow the WHATWG URL Standard's parser states - confirm.
| template<typename CharacterType> |
| void URLParser::parse(const CharacterType* input, const unsigned length, const URL& base, const URLTextEncoding* nonUTF8QueryEncoding) |
| { |
| URL_PARSER_LOG("Parsing URL <%s> base <%s>", String(input, length).utf8().data(), base.string().utf8().data()); |
| m_url = { }; |
| ASSERT(m_asciiBuffer.isEmpty()); |
| |
// Accumulates raw query code points for the NonUTF8Query state.
| Vector<UChar> queryBuffer; |
| |
// Trim trailing C0-control/space characters, each counted as a syntax violation - unless the
// caller passed the sentinel encoding, in which case nothing is trimmed and the encoding is
// treated as null from here on.
| unsigned endIndex = length; |
| if (UNLIKELY(nonUTF8QueryEncoding == URLTextEncodingSentinelAllowingC0AtEndOfHash)) |
| nonUTF8QueryEncoding = nullptr; |
| else { |
| while (UNLIKELY(endIndex && isC0ControlOrSpace(input[endIndex - 1]))) { |
// The violation is reported with an iterator positioned at the start of the input.
| syntaxViolation(CodePointIterator<CharacterType>(input, input)); |
| endIndex--; |
| } |
| } |
| CodePointIterator<CharacterType> c(input, input + endIndex); |
| CodePointIterator<CharacterType> authorityOrHostBegin; |
| CodePointIterator<CharacterType> queryBegin; |
// Skip leading C0-control/space characters.
| while (UNLIKELY(!c.atEnd() && isC0ControlOrSpace(*c))) { |
| syntaxViolation(c); |
| ++c; |
| } |
// Remembered so scheme parsing can rewind here when the input turns out to have no scheme.
| auto beginAfterControlAndSpace = c; |
| |
| enum class State : uint8_t { |
| SchemeStart, |
| Scheme, |
| NoScheme, |
| SpecialRelativeOrAuthority, |
| PathOrAuthority, |
| Relative, |
| RelativeSlash, |
| SpecialAuthoritySlashes, |
| SpecialAuthorityIgnoreSlashes, |
| AuthorityOrHost, |
| Host, |
| File, |
| FileSlash, |
| FileHost, |
| PathStart, |
| Path, |
| CannotBeABaseURLPath, |
| UTF8Query, |
| NonUTF8Query, |
| Fragment, |
| }; |
| |
// Debug logging helpers built on URL_PARSER_LOG, which expands to nothing unless
// URL_PARSER_DEBUGGING is enabled.
| #define LOG_STATE(x) URL_PARSER_LOG("State %s, code point %c, parsed data <%s> size %zu", x, *c, parsedDataView(0, currentPosition(c)).utf8().data(), currentPosition(c)) |
| #define LOG_FINAL_STATE(x) URL_PARSER_LOG("Final State: %s", x) |
| |
// Main loop: dispatch on the current state for the code point at `c`. A state either
// consumes the code point (advancing `c`) or just switches state and leaves `c` in place so
// the next state sees the same code point.
| State state = State::SchemeStart; |
| while (!c.atEnd()) { |
// Tabs and newlines are dropped wherever they appear, as a syntax violation.
| if (UNLIKELY(isTabOrNewline(*c))) { |
| syntaxViolation(c); |
| ++c; |
| continue; |
| } |
| |
| switch (state) { |
// A scheme must begin with an ASCII letter; anything else means there is no scheme.
| case State::SchemeStart: |
| LOG_STATE("SchemeStart"); |
| if (isASCIIAlpha(*c)) { |
| if (UNLIKELY(isASCIIUpper(*c))) |
| syntaxViolation(c); |
| appendToASCIIBuffer(toASCIILower(*c)); |
| advance(c); |
// Input ended after one letter: not a scheme, so rewind and parse as scheme-less.
| if (c.atEnd()) { |
| m_asciiBuffer.clear(); |
| state = State::NoScheme; |
| c = beginAfterControlAndSpace; |
| break; |
| } |
| state = State::Scheme; |
| } else |
| state = State::NoScheme; |
| break; |
// Remaining scheme characters up to ':' (lower-cased on output).
| case State::Scheme: |
| LOG_STATE("Scheme"); |
| if (isValidSchemeCharacter(*c)) { |
| if (UNLIKELY(isASCIIUpper(*c))) |
| syntaxViolation(c); |
| appendToASCIIBuffer(toASCIILower(*c)); |
| } else if (*c == ':') { |
// End of scheme: reject over-long schemes, then choose the next state by scheme kind.
| unsigned schemeEnd = currentPosition(c); |
| if (schemeEnd > URL::maxSchemeLength) { |
| failure(); |
| return; |
| } |
| m_url.m_schemeEnd = schemeEnd; |
| StringView urlScheme = parsedDataView(0, m_url.m_schemeEnd); |
| appendToASCIIBuffer(':'); |
| switch (scheme(urlScheme)) { |
| case Scheme::File: |
| m_urlIsSpecial = true; |
| m_urlIsFile = true; |
| state = State::File; |
| ++c; |
| break; |
| case Scheme::WS: |
| case Scheme::WSS: |
// ws/wss always use the UTF-8 query path.
| nonUTF8QueryEncoding = nullptr; |
| m_urlIsSpecial = true; |
| if (base.protocolIs(urlScheme)) |
| state = State::SpecialRelativeOrAuthority; |
| else |
| state = State::SpecialAuthoritySlashes; |
| ++c; |
| break; |
| case Scheme::HTTP: |
| case Scheme::HTTPS: |
| m_url.m_protocolIsInHTTPFamily = true; |
| FALLTHROUGH; |
| case Scheme::FTP: |
| m_urlIsSpecial = true; |
| if (base.protocolIs(urlScheme)) |
| state = State::SpecialRelativeOrAuthority; |
| else |
| state = State::SpecialAuthoritySlashes; |
| ++c; |
| break; |
| case Scheme::NonSpecial: |
// Non-special schemes also always use the UTF-8 query path. "scheme:/" continues in
// PathOrAuthority; anything else is a cannot-be-a-base-URL path with an empty authority.
| nonUTF8QueryEncoding = nullptr; |
| auto maybeSlash = c; |
| advance(maybeSlash); |
| if (!maybeSlash.atEnd() && *maybeSlash == '/') { |
| appendToASCIIBuffer('/'); |
| c = maybeSlash; |
| state = State::PathOrAuthority; |
| ASSERT(*c == '/'); |
| ++c; |
| m_url.m_userStart = currentPosition(c); |
| } else { |
| ++c; |
| m_url.m_userStart = currentPosition(c); |
| m_url.m_userEnd = m_url.m_userStart; |
| m_url.m_passwordEnd = m_url.m_userStart; |
| m_url.m_hostEnd = m_url.m_userStart; |
| m_url.m_portLength = 0; |
| m_url.m_pathAfterLastSlash = m_url.m_userStart; |
| m_url.m_cannotBeABaseURL = true; |
| state = State::CannotBeABaseURLPath; |
| } |
| break; |
| } |
| break; |
| } else { |
// Not a scheme character and not ':': there is no scheme. Discard what was buffered and
// restart from the first non-space code point as a scheme-less reference.
| m_asciiBuffer.clear(); |
| state = State::NoScheme; |
| c = beginAfterControlAndSpace; |
| break; |
| } |
| advance(c); |
// Input ended before a ':' was found: no scheme after all, so rewind.
| if (c.atEnd()) { |
| m_asciiBuffer.clear(); |
| state = State::NoScheme; |
| c = beginAfterControlAndSpace; |
| } |
| break; |
// Scheme-less input needs a valid base. A cannot-be-a-base-URL base only accepts a
// fragment-only reference ('#...').
| case State::NoScheme: |
| LOG_STATE("NoScheme"); |
| if (!base.isValid() || (base.m_cannotBeABaseURL && *c != '#')) { |
| failure(); |
| return; |
| } |
| if (base.m_cannotBeABaseURL && *c == '#') { |
| copyURLPartsUntil(base, URLPart::QueryEnd, c, nonUTF8QueryEncoding); |
| state = State::Fragment; |
| appendToASCIIBuffer('#'); |
| ++c; |
| break; |
| } |
| if (!base.protocolIs("file")) { |
| state = State::Relative; |
| break; |
| } |
| state = State::File; |
| break; |
// Entered when the input's special scheme equals the base's scheme: "//" starts an
// authority, otherwise the rest is resolved relative to the base.
| case State::SpecialRelativeOrAuthority: |
| LOG_STATE("SpecialRelativeOrAuthority"); |
| if (*c == '/') { |
| appendToASCIIBuffer('/'); |
| advance(c); |
| if (c.atEnd()) { |
| failure(); |
| return; |
| } |
| if (*c == '/') { |
| appendToASCIIBuffer('/'); |
| state = State::SpecialAuthorityIgnoreSlashes; |
| ++c; |
| } else |
| state = State::RelativeSlash; |
| } else |
| state = State::Relative; |
| break; |
// After "scheme:/" of a non-special scheme: a second '/' starts an authority; otherwise the
// slash already seen begins the path and the authority is empty.
| case State::PathOrAuthority: |
| LOG_STATE("PathOrAuthority"); |
| if (*c == '/') { |
| appendToASCIIBuffer('/'); |
| state = State::AuthorityOrHost; |
| advance(c); |
| m_url.m_userStart = currentPosition(c); |
| authorityOrHostBegin = c; |
| } else { |
| ASSERT(parsedDataView(currentPosition(c) - 1) == '/'); |
| m_url.m_userStart = currentPosition(c) - 1; |
| m_url.m_userEnd = m_url.m_userStart; |
| m_url.m_passwordEnd = m_url.m_userStart; |
| m_url.m_hostEnd = m_url.m_userStart; |
| m_url.m_portLength = 0; |
| m_url.m_pathAfterLastSlash = m_url.m_userStart + 1; |
| state = State::Path; |
| } |
| break; |
// Scheme-less reference against a non-file base: how much of the base is copied depends on
// the first code point.
| case State::Relative: |
| LOG_STATE("Relative"); |
| switch (*c) { |
| case '/': |
| case '\\': |
| state = State::RelativeSlash; |
| ++c; |
| break; |
| case '?': |
| copyURLPartsUntil(base, URLPart::PathEnd, c, nonUTF8QueryEncoding); |
| appendToASCIIBuffer('?'); |
| ++c; |
| if (nonUTF8QueryEncoding) { |
| queryBegin = c; |
| state = State::NonUTF8Query; |
| } else |
| state = State::UTF8Query; |
| break; |
| case '#': |
| copyURLPartsUntil(base, URLPart::QueryEnd, c, nonUTF8QueryEncoding); |
| appendToASCIIBuffer('#'); |
| state = State::Fragment; |
| ++c; |
| break; |
| default: |
// Keep the base up to its last slash, adding a '/' when the copied part does not end in one
// (or when the base has neither host nor path).
| copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c, nonUTF8QueryEncoding); |
| if ((currentPosition(c) && parsedDataView(currentPosition(c) - 1) != '/') |
| || (base.host().isEmpty() && base.path().isEmpty())) { |
| appendToASCIIBuffer('/'); |
| m_url.m_pathAfterLastSlash = currentPosition(c); |
| } |
| state = State::Path; |
| break; |
| } |
| break; |
// One leading slash was consumed. A second slash starts a new authority under the base's
// scheme; otherwise the input is an absolute path on the base's host and port.
| case State::RelativeSlash: |
| LOG_STATE("RelativeSlash"); |
| if (*c == '/' || *c == '\\') { |
| ++c; |
| copyURLPartsUntil(base, URLPart::SchemeEnd, c, nonUTF8QueryEncoding); |
| appendToASCIIBuffer("://", 3); |
| if (m_urlIsSpecial) |
| state = State::SpecialAuthorityIgnoreSlashes; |
| else { |
| m_url.m_userStart = currentPosition(c); |
| state = State::AuthorityOrHost; |
| authorityOrHostBegin = c; |
| } |
| } else { |
| copyURLPartsUntil(base, URLPart::PortEnd, c, nonUTF8QueryEncoding); |
| appendToASCIIBuffer('/'); |
| m_url.m_pathAfterLastSlash = base.m_hostEnd + base.m_portLength + 1; |
| state = State::Path; |
| } |
| break; |
// Expects "//" after a special scheme. Backslashes and missing slashes are tolerated as
// syntax violations; the output always receives exactly two '/'.
| case State::SpecialAuthoritySlashes: |
| LOG_STATE("SpecialAuthoritySlashes"); |
| if (LIKELY(*c == '/' || *c == '\\')) { |
| if (UNLIKELY(*c == '\\')) |
| syntaxViolation(c); |
| appendToASCIIBuffer('/'); |
| advance(c); |
| if (LIKELY(!c.atEnd() && (*c == '/' || *c == '\\'))) { |
| if (UNLIKELY(*c == '\\')) |
| syntaxViolation(c); |
| ++c; |
| appendToASCIIBuffer('/'); |
| } else { |
| syntaxViolation(c); |
| appendToASCIIBuffer('/'); |
| } |
| } else { |
| syntaxViolation(c); |
| appendToASCIIBuffer("//", 2); |
| } |
| state = State::SpecialAuthorityIgnoreSlashes; |
| break; |
// Skips any further slashes or backslashes before the authority begins.
| case State::SpecialAuthorityIgnoreSlashes: |
| LOG_STATE("SpecialAuthorityIgnoreSlashes"); |
| if (*c == '/' || *c == '\\') { |
| syntaxViolation(c); |
| ++c; |
| } else { |
| m_url.m_userStart = currentPosition(c); |
| state = State::AuthorityOrHost; |
| authorityOrHostBegin = c; |
| } |
| break; |
// Scans the authority without emitting output until it is known whether it contains
// credentials (an '@') or is just host[:port].
| case State::AuthorityOrHost: |
| do { |
| LOG_STATE("AuthorityOrHost"); |
| if (*c == '@') { |
// Credentials extend to the LAST '@' found before the authority ends at '/', '?', '#'
// (or '\\' for special URLs) or at the end of input.
| auto lastAt = c; |
| auto findLastAt = c; |
| while (!findLastAt.atEnd()) { |
| URL_PARSER_LOG("Finding last @: %c", *findLastAt); |
| if (*findLastAt == '@') |
| lastAt = findLastAt; |
| bool isSlash = *findLastAt == '/' || (m_urlIsSpecial && *findLastAt == '\\'); |
| if (isSlash || *findLastAt == '?' || *findLastAt == '#') |
| break; |
| ++findLastAt; |
| } |
| parseAuthority(CodePointIterator<CharacterType>(authorityOrHostBegin, lastAt)); |
| c = lastAt; |
| advance(c); |
| authorityOrHostBegin = c; |
| state = State::Host; |
| m_hostHasPercentOrNonASCII = false; |
| break; |
| } |
| bool isSlash = *c == '/' || (m_urlIsSpecial && *c == '\\'); |
| if (isSlash || *c == '?' || *c == '#') { |
// End of authority with no '@': everything since authorityOrHostBegin is host[:port].
| auto iterator = CodePointIterator<CharacterType>(authorityOrHostBegin, c); |
| if (iterator.atEnd()) { |
// Empty host: a failure for special URLs, an empty authority otherwise.
| if (m_urlIsSpecial) |
| return failure(); |
| m_url.m_userEnd = currentPosition(c); |
| m_url.m_passwordEnd = m_url.m_userEnd; |
| m_url.m_hostEnd = m_url.m_userEnd; |
| m_url.m_portLength = 0; |
| m_url.m_pathAfterLastSlash = m_url.m_userEnd; |
| } else { |
| m_url.m_userEnd = currentPosition(authorityOrHostBegin); |
| m_url.m_passwordEnd = m_url.m_userEnd; |
| if (parseHostAndPort(iterator) == HostParsingResult::InvalidHost) { |
| failure(); |
| return; |
| } |
// Authority ended at '?' or '#': special URLs get an explicit '/' path inserted.
| if (UNLIKELY(!isSlash)) { |
| if (m_urlIsSpecial) { |
| syntaxViolation(c); |
| appendToASCIIBuffer('/'); |
| } |
| m_url.m_pathAfterLastSlash = currentPosition(c); |
| } |
| } |
| state = State::Path; |
| break; |
| } |
| if (isPercentOrNonASCII(*c)) |
| m_hostHasPercentOrNonASCII = true; |
| ++c; |
| } while (!c.atEnd()); |
| break; |
// Host that follows credentials: scan to '/', '?' or '#', then parse host and port.
| case State::Host: |
| do { |
| LOG_STATE("Host"); |
| if (*c == '/' || *c == '?' || *c == '#') { |
| if (parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c)) == HostParsingResult::InvalidHost) { |
| failure(); |
| return; |
| } |
| if (*c == '?' || *c == '#') { |
| syntaxViolation(c); |
| appendToASCIIBuffer('/'); |
| m_url.m_pathAfterLastSlash = currentPosition(c); |
| } |
| state = State::Path; |
| break; |
| } |
| if (isPercentOrNonASCII(*c)) |
| m_hostHasPercentOrNonASCII = true; |
| ++c; |
| } while (!c.atEnd()); |
| break; |
// Reached after a "file:" scheme, or for scheme-less input against a file: base.
| case State::File: |
| LOG_STATE("File"); |
| switch (*c) { |
| case '\\': |
| syntaxViolation(c); |
| FALLTHROUGH; |
| case '/': |
| appendToASCIIBuffer('/'); |
| state = State::FileSlash; |
| ++c; |
| break; |
| case '?': |
| syntaxViolation(c); |
| if (base.isValid() && base.protocolIs("file")) { |
| copyURLPartsUntil(base, URLPart::PathEnd, c, nonUTF8QueryEncoding); |
| appendToASCIIBuffer('?'); |
| ++c; |
| } else { |
// No file: base to copy from: synthesize an empty host and a "/" path.
| appendToASCIIBuffer("///?", 4); |
| ++c; |
| m_url.m_userStart = currentPosition(c) - 2; |
| m_url.m_userEnd = m_url.m_userStart; |
| m_url.m_passwordEnd = m_url.m_userStart; |
| m_url.m_hostEnd = m_url.m_userStart; |
| m_url.m_portLength = 0; |
| m_url.m_pathAfterLastSlash = m_url.m_userStart + 1; |
| m_url.m_pathEnd = m_url.m_pathAfterLastSlash; |
| } |
| if (nonUTF8QueryEncoding) { |
| queryBegin = c; |
| state = State::NonUTF8Query; |
| } else |
| state = State::UTF8Query; |
| break; |
| case '#': |
| syntaxViolation(c); |
| if (base.isValid() && base.protocolIs("file")) { |
| copyURLPartsUntil(base, URLPart::QueryEnd, c, nonUTF8QueryEncoding); |
| appendToASCIIBuffer('#'); |
| } else { |
| appendToASCIIBuffer("///#", 4); |
| m_url.m_userStart = currentPosition(c) - 2; |
| m_url.m_userEnd = m_url.m_userStart; |
| m_url.m_passwordEnd = m_url.m_userStart; |
| m_url.m_hostEnd = m_url.m_userStart; |
| m_url.m_portLength = 0; |
| m_url.m_pathAfterLastSlash = m_url.m_userStart + 1; |
| m_url.m_pathEnd = m_url.m_pathAfterLastSlash; |
| m_url.m_queryEnd = m_url.m_pathAfterLastSlash; |
| } |
| state = State::Fragment; |
| ++c; |
| break; |
| default: |
| syntaxViolation(c); |
// Path-relative file reference: reuse the base's directory when shouldCopyFileURL() allows
// it; otherwise start a fresh path, keeping only the base's host if it has one.
| if (base.isValid() && base.protocolIs("file") && shouldCopyFileURL(c)) |
| copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c, nonUTF8QueryEncoding); |
| else { |
| bool copiedHost = false; |
| if (base.isValid() && base.protocolIs("file")) { |
| if (base.host().isEmpty()) { |
| copyURLPartsUntil(base, URLPart::SchemeEnd, c, nonUTF8QueryEncoding); |
| appendToASCIIBuffer(":///", 4); |
| } else { |
| copyURLPartsUntil(base, URLPart::PortEnd, c, nonUTF8QueryEncoding); |
| appendToASCIIBuffer('/'); |
| copiedHost = true; |
| } |
| } else |
| appendToASCIIBuffer("///", 3); |
| if (!copiedHost) { |
| m_url.m_userStart = currentPosition(c) - 1; |
| m_url.m_userEnd = m_url.m_userStart; |
| m_url.m_passwordEnd = m_url.m_userStart; |
| m_url.m_hostEnd = m_url.m_userStart; |
| m_url.m_portLength = 0; |
| } |
| m_url.m_pathAfterLastSlash = m_url.m_hostEnd + 1; |
| } |
| if (isWindowsDriveLetter(c)) |
| appendWindowsDriveLetter(c); |
| state = State::Path; |
| break; |
| } |
| break; |
// One slash seen in a file URL. A second slash introduces a (possibly empty) host;
// otherwise this is an absolute path, optionally inheriting host or drive letter from a
// file: base.
| case State::FileSlash: |
| LOG_STATE("FileSlash"); |
| if (LIKELY(*c == '/' || *c == '\\')) { |
| if (UNLIKELY(*c == '\\')) |
| syntaxViolation(c); |
| if (base.isValid() && base.protocolIs("file")) { |
| copyURLPartsUntil(base, URLPart::SchemeEnd, c, nonUTF8QueryEncoding); |
| appendToASCIIBuffer(":/", 2); |
| } |
| appendToASCIIBuffer('/'); |
| advance(c); |
| m_url.m_userStart = currentPosition(c); |
| m_url.m_userEnd = m_url.m_userStart; |
| m_url.m_passwordEnd = m_url.m_userStart; |
| m_url.m_hostEnd = m_url.m_userStart; |
| m_url.m_portLength = 0; |
| authorityOrHostBegin = c; |
| state = State::FileHost; |
| break; |
| } |
| { |
| bool copiedHost = false; |
| if (base.isValid() && base.protocolIs("file")) { |
| if (base.host().isEmpty()) { |
| copyURLPartsUntil(base, URLPart::SchemeEnd, c, nonUTF8QueryEncoding); |
| appendToASCIIBuffer(":///", 4); |
| } else { |
| copyURLPartsUntil(base, URLPart::PortEnd, c, nonUTF8QueryEncoding); |
| appendToASCIIBuffer('/'); |
| copiedHost = true; |
| } |
| } else { |
| syntaxViolation(c); |
| appendToASCIIBuffer("//", 2); |
| } |
| if (!copiedHost) { |
| m_url.m_userStart = currentPosition(c) - 1; |
| m_url.m_userEnd = m_url.m_userStart; |
| m_url.m_passwordEnd = m_url.m_userStart; |
| m_url.m_hostEnd = m_url.m_userStart; |
| m_url.m_portLength = 0; |
| } |
| } |
// Drive letter handling: prefer one in the input, else try to inherit the base's via
// copyBaseWindowsDriveLetter(), in which case the last slash lies 4 past m_hostEnd.
| if (isWindowsDriveLetter(c)) { |
| appendWindowsDriveLetter(c); |
| m_url.m_pathAfterLastSlash = m_url.m_hostEnd + 1; |
| } else if (copyBaseWindowsDriveLetter(base)) { |
| appendToASCIIBuffer('/'); |
| m_url.m_pathAfterLastSlash = m_url.m_hostEnd + 4; |
| } else |
| m_url.m_pathAfterLastSlash = m_url.m_hostEnd + 1; |
| state = State::Path; |
| break; |
// Host of a file URL: scan to '/', '?' or '#'.
| case State::FileHost: |
| do { |
| LOG_STATE("FileHost"); |
| if (isSlashQuestionOrHash(*c)) { |
// A "host" for which both takesTwoAdvancesUntilEnd() and isWindowsDriveLetter() hold is
// not treated as a host: it is emitted as the start of the path instead.
| bool windowsQuirk = takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType>(authorityOrHostBegin, c)) |
| && isWindowsDriveLetter(authorityOrHostBegin); |
| if (windowsQuirk) { |
| syntaxViolation(authorityOrHostBegin); |
| appendToASCIIBuffer('/'); |
| appendWindowsDriveLetter(authorityOrHostBegin); |
| } |
// Drive-letter quirk or empty host: continue straight into path / query / fragment.
| if (windowsQuirk || authorityOrHostBegin == c) { |
| ASSERT(windowsQuirk || parsedDataView(currentPosition(c) - 1) == '/'); |
| if (UNLIKELY(*c == '?')) { |
| syntaxViolation(c); |
| appendToASCIIBuffer("/?", 2); |
| ++c; |
| if (nonUTF8QueryEncoding) { |
| queryBegin = c; |
| state = State::NonUTF8Query; |
| } else |
| state = State::UTF8Query; |
| m_url.m_pathAfterLastSlash = currentPosition(c) - 1; |
| m_url.m_pathEnd = m_url.m_pathAfterLastSlash; |
| break; |
| } |
| if (UNLIKELY(*c == '#')) { |
| syntaxViolation(c); |
| appendToASCIIBuffer("/#", 2); |
| ++c; |
| m_url.m_pathAfterLastSlash = currentPosition(c) - 1; |
| m_url.m_pathEnd = m_url.m_pathAfterLastSlash; |
| m_url.m_queryEnd = m_url.m_pathAfterLastSlash; |
| state = State::Fragment; |
| break; |
| } |
| state = State::Path; |
| break; |
| } |
| if (parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c)) == HostParsingResult::InvalidHost) { |
| failure(); |
| return; |
| } |
// A host recognised by isLocalhost() is removed: the buffer is cut back to where the host
// began, leaving an empty host.
| if (UNLIKELY(isLocalhost(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd)))) { |
| syntaxViolation(c); |
| m_asciiBuffer.shrink(m_url.m_passwordEnd); |
| m_url.m_hostEnd = currentPosition(c); |
| m_url.m_portLength = 0; |
| } |
| |
| state = State::PathStart; |
| break; |
| } |
| if (isPercentOrNonASCII(*c)) |
| m_hostHasPercentOrNonASCII = true; |
| ++c; |
| } while (!c.atEnd()); |
| break; |
// Ensures the path begins with '/', inserting one when the next code point is not a slash.
| case State::PathStart: |
| LOG_STATE("PathStart"); |
| if (*c != '/' && *c != '\\') { |
| syntaxViolation(c); |
| appendToASCIIBuffer('/'); |
| } |
| m_url.m_pathAfterLastSlash = currentPosition(c); |
| state = State::Path; |
| break; |
// Hierarchical path. Backslashes count as separators only for special URLs.
| case State::Path: |
| LOG_STATE("Path"); |
| if (*c == '/' || (m_urlIsSpecial && *c == '\\')) { |
| if (UNLIKELY(m_urlIsSpecial && *c == '\\')) |
| syntaxViolation(c); |
| appendToASCIIBuffer('/'); |
| ++c; |
| m_url.m_pathAfterLastSlash = currentPosition(c); |
| break; |
| } |
// Dot segments are only recognised at the start of a segment (right after '/'): a
// double-dot segment pops the previous segment, a single-dot segment is dropped.
| if (UNLIKELY(currentPosition(c) && parsedDataView(currentPosition(c) - 1) == '/')) { |
| if (UNLIKELY(isDoubleDotPathSegment(c))) { |
| syntaxViolation(c); |
| consumeDoubleDotPathSegment(c); |
| popPath(); |
| break; |
| } |
| if (UNLIKELY(isSingleDotPathSegment(c))) { |
| syntaxViolation(c); |
| consumeSingleDotPathSegment(c); |
| break; |
| } |
| } |
| if (*c == '?') { |
| m_url.m_pathEnd = currentPosition(c); |
| appendToASCIIBuffer('?'); |
| ++c; |
| if (nonUTF8QueryEncoding) { |
| queryBegin = c; |
| state = State::NonUTF8Query; |
| } else |
| state = State::UTF8Query; |
| break; |
| } |
// '#' is not consumed here; the Fragment state encodes it on the next iteration.
| if (*c == '#') { |
| m_url.m_pathEnd = currentPosition(c); |
| m_url.m_queryEnd = m_url.m_pathEnd; |
| state = State::Fragment; |
| break; |
| } |
| utf8PercentEncode<isInDefaultEncodeSet>(c); |
| ++c; |
| break; |
// Path of a cannot-be-a-base URL; other code points are encoded with the simple encode set.
| case State::CannotBeABaseURLPath: |
| LOG_STATE("CannotBeABaseURLPath"); |
| if (*c == '?') { |
| m_url.m_pathEnd = currentPosition(c); |
| appendToASCIIBuffer('?'); |
| ++c; |
| if (nonUTF8QueryEncoding) { |
| queryBegin = c; |
| state = State::NonUTF8Query; |
| } else |
| state = State::UTF8Query; |
| } else if (*c == '#') { |
| m_url.m_pathEnd = currentPosition(c); |
| m_url.m_queryEnd = m_url.m_pathEnd; |
| state = State::Fragment; |
| } else if (*c == '/') { |
| appendToASCIIBuffer('/'); |
| ++c; |
| m_url.m_pathAfterLastSlash = currentPosition(c); |
| } else { |
| utf8PercentEncode<isInSimpleEncodeSet>(c); |
| ++c; |
| } |
| break; |
// Query encoded code point by code point via utf8QueryEncode(); ends at '#'.
| case State::UTF8Query: |
| LOG_STATE("UTF8Query"); |
| ASSERT(queryBegin == CodePointIterator<CharacterType>()); |
| if (*c == '#') { |
| m_url.m_queryEnd = currentPosition(c); |
| state = State::Fragment; |
| break; |
| } |
| ASSERT(!nonUTF8QueryEncoding); |
| utf8QueryEncode(c); |
| ++c; |
| break; |
// Query in a non-UTF-8 encoding: raw code points are collected in queryBuffer and encoded
// in one go when the query ends (at '#' here, or at end of input in the final switch).
| case State::NonUTF8Query: |
| do { |
| LOG_STATE("NonUTF8Query"); |
| ASSERT(queryBegin != CodePointIterator<CharacterType>()); |
| if (*c == '#') { |
| encodeNonUTF8Query(queryBuffer, *nonUTF8QueryEncoding, CodePointIterator<CharacterType>(queryBegin, c)); |
| m_url.m_queryEnd = currentPosition(c); |
| state = State::Fragment; |
| break; |
| } |
| appendCodePoint(queryBuffer, *c); |
| advance(c, queryBegin); |
| } while (!c.atEnd()); |
| break; |
// Everything from '#' to the end of input, encoded with the fragment encode set.
| case State::Fragment: |
| URL_PARSER_LOG("State Fragment"); |
| utf8PercentEncode<isInFragmentEncodeSet>(c); |
| ++c; |
| break; |
| } |
| } |
| |
// The input is exhausted. Finish whichever component was in progress, based on the state
// the loop ended in, and fill in the offsets that were not yet assigned.
| switch (state) { |
| case State::SchemeStart: |
| LOG_FINAL_STATE("SchemeStart"); |
// Nothing was parsed at all: the result is the base without its fragment, if usable.
| if (!currentPosition(c) && base.isValid() && !base.m_cannotBeABaseURL) { |
| m_url = base; |
| m_url.removeFragmentIdentifier(); |
| return; |
| } |
| failure(); |
| return; |
| case State::Scheme: |
| LOG_FINAL_STATE("Scheme"); |
| failure(); |
| return; |
// Not expected: the NoScheme case in the loop always moves to another state or fails.
| case State::NoScheme: |
| LOG_FINAL_STATE("NoScheme"); |
| RELEASE_ASSERT_NOT_REACHED(); |
| case State::SpecialRelativeOrAuthority: |
| LOG_FINAL_STATE("SpecialRelativeOrAuthority"); |
| copyURLPartsUntil(base, URLPart::QueryEnd, c, nonUTF8QueryEncoding); |
| break; |
// Input was exactly "scheme:/": empty authority and a path consisting of that one slash.
| case State::PathOrAuthority: |
| LOG_FINAL_STATE("PathOrAuthority"); |
| ASSERT(m_url.m_userStart); |
| ASSERT(m_url.m_userStart == currentPosition(c)); |
| ASSERT(parsedDataView(currentPosition(c) - 1) == '/'); |
| m_url.m_userStart--; |
| m_url.m_userEnd = m_url.m_userStart; |
| m_url.m_passwordEnd = m_url.m_userStart; |
| m_url.m_hostEnd = m_url.m_userStart; |
| m_url.m_portLength = 0; |
| m_url.m_pathAfterLastSlash = m_url.m_userStart + 1; |
| m_url.m_pathEnd = m_url.m_pathAfterLastSlash; |
| m_url.m_queryEnd = m_url.m_pathAfterLastSlash; |
| break; |
// Not expected: the Relative case in the loop always moves to another state.
| case State::Relative: |
| LOG_FINAL_STATE("Relative"); |
| RELEASE_ASSERT_NOT_REACHED(); |
// Input was a single slash: the base's scheme and authority with the path "/".
| case State::RelativeSlash: |
| LOG_FINAL_STATE("RelativeSlash"); |
| copyURLPartsUntil(base, URLPart::PortEnd, c, nonUTF8QueryEncoding); |
| appendToASCIIBuffer('/'); |
| m_url.m_pathAfterLastSlash = m_url.m_hostEnd + m_url.m_portLength + 1; |
| m_url.m_pathEnd = m_url.m_pathAfterLastSlash; |
| m_url.m_queryEnd = m_url.m_pathAfterLastSlash; |
| break; |
// A special scheme with nothing usable after it is a failure.
| case State::SpecialAuthoritySlashes: |
| LOG_FINAL_STATE("SpecialAuthoritySlashes"); |
| failure(); |
| return; |
| case State::SpecialAuthorityIgnoreSlashes: |
| LOG_FINAL_STATE("SpecialAuthorityIgnoreSlashes"); |
| failure(); |
| return; |
// Input ended inside an authority without '@': what was scanned is host[:port].
| case State::AuthorityOrHost: |
| LOG_FINAL_STATE("AuthorityOrHost"); |
| m_url.m_userEnd = currentPosition(authorityOrHostBegin); |
| m_url.m_passwordEnd = m_url.m_userEnd; |
| if (authorityOrHostBegin.atEnd()) { |
| m_url.m_userEnd = m_url.m_userStart; |
| m_url.m_passwordEnd = m_url.m_userStart; |
| m_url.m_hostEnd = m_url.m_userStart; |
| m_url.m_portLength = 0; |
| m_url.m_pathEnd = m_url.m_userStart; |
| } else if (parseHostAndPort(authorityOrHostBegin) == HostParsingResult::InvalidHost) { |
| failure(); |
| return; |
| } else { |
// Special URLs always get a "/" path; non-special URLs keep an empty path.
| if (m_urlIsSpecial) { |
| syntaxViolation(c); |
| appendToASCIIBuffer('/'); |
| m_url.m_pathEnd = m_url.m_hostEnd + m_url.m_portLength + 1; |
| } else |
| m_url.m_pathEnd = m_url.m_hostEnd + m_url.m_portLength; |
| } |
| m_url.m_pathAfterLastSlash = m_url.m_pathEnd; |
| m_url.m_queryEnd = m_url.m_pathEnd; |
| break; |
| case State::Host: |
| LOG_FINAL_STATE("Host"); |
| if (parseHostAndPort(authorityOrHostBegin) == HostParsingResult::InvalidHost) { |
| failure(); |
| return; |
| } |
| if (m_urlIsSpecial) { |
| syntaxViolation(c); |
| appendToASCIIBuffer('/'); |
| m_url.m_pathEnd = m_url.m_hostEnd + m_url.m_portLength + 1; |
| } else |
| m_url.m_pathEnd = m_url.m_hostEnd + m_url.m_portLength; |
| m_url.m_pathAfterLastSlash = m_url.m_pathEnd; |
| m_url.m_queryEnd = m_url.m_pathEnd; |
| break; |
// Input ended right after "file:" (or was empty against a file: base): copy the base up to
// its query end, or synthesize "///" when there is no file: base.
| case State::File: |
| LOG_FINAL_STATE("File"); |
| if (base.isValid() && base.protocolIs("file")) { |
| copyURLPartsUntil(base, URLPart::QueryEnd, c, nonUTF8QueryEncoding); |
| break; |
| } |
| syntaxViolation(c); |
| appendToASCIIBuffer("///", 3); |
| m_url.m_userStart = currentPosition(c) - 1; |
| m_url.m_userEnd = m_url.m_userStart; |
| m_url.m_passwordEnd = m_url.m_userStart; |
| m_url.m_hostEnd = m_url.m_userStart; |
| m_url.m_portLength = 0; |
| m_url.m_pathAfterLastSlash = m_url.m_userStart + 1; |
| m_url.m_pathEnd = m_url.m_pathAfterLastSlash; |
| m_url.m_queryEnd = m_url.m_pathAfterLastSlash; |
| break; |
| case State::FileSlash: |
| LOG_FINAL_STATE("FileSlash"); |
| syntaxViolation(c); |
| { |
| bool copiedHost = false; |
| if (base.isValid() && base.protocolIs("file")) { |
| if (base.host().isEmpty()) { |
| copyURLPartsUntil(base, URLPart::SchemeEnd, c, nonUTF8QueryEncoding); |
| appendToASCIIBuffer(":/", 2); |
| } else { |
| copyURLPartsUntil(base, URLPart::PortEnd, c, nonUTF8QueryEncoding); |
| appendToASCIIBuffer('/'); |
| copiedHost = true; |
| } |
| } |
| if (!copiedHost) { |
// m_userStart is computed before the "//" is appended, hence the + 1.
| m_url.m_userStart = currentPosition(c) + 1; |
| appendToASCIIBuffer("//", 2); |
| m_url.m_userEnd = m_url.m_userStart; |
| m_url.m_passwordEnd = m_url.m_userStart; |
| m_url.m_hostEnd = m_url.m_userStart; |
| m_url.m_portLength = 0; |
| } |
| } |
| if (copyBaseWindowsDriveLetter(base)) { |
| appendToASCIIBuffer('/'); |
| m_url.m_pathAfterLastSlash = m_url.m_hostEnd + 4; |
| } else |
| m_url.m_pathAfterLastSlash = m_url.m_hostEnd + 1; |
| m_url.m_pathEnd = m_url.m_pathAfterLastSlash; |
| m_url.m_queryEnd = m_url.m_pathAfterLastSlash; |
| break; |
| case State::FileHost: |
| LOG_FINAL_STATE("FileHost"); |
// Same drive-letter quirk as in the loop: the "host" becomes the path.
| if (takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType>(authorityOrHostBegin, c)) |
| && isWindowsDriveLetter(authorityOrHostBegin)) { |
| syntaxViolation(authorityOrHostBegin); |
| appendToASCIIBuffer('/'); |
| appendWindowsDriveLetter(authorityOrHostBegin); |
| m_url.m_pathAfterLastSlash = currentPosition(c); |
| m_url.m_pathEnd = m_url.m_pathAfterLastSlash; |
| m_url.m_queryEnd = m_url.m_pathAfterLastSlash; |
| break; |
| } |
| |
// Empty host: the path is just "/".
| if (authorityOrHostBegin == c) { |
| syntaxViolation(c); |
| appendToASCIIBuffer('/'); |
| m_url.m_userStart = currentPosition(c) - 1; |
| m_url.m_userEnd = m_url.m_userStart; |
| m_url.m_passwordEnd = m_url.m_userStart; |
| m_url.m_hostEnd = m_url.m_userStart; |
| m_url.m_portLength = 0; |
| m_url.m_pathAfterLastSlash = m_url.m_userStart + 1; |
| m_url.m_pathEnd = m_url.m_pathAfterLastSlash; |
| m_url.m_queryEnd = m_url.m_pathAfterLastSlash; |
| break; |
| } |
| |
| if (parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c)) == HostParsingResult::InvalidHost) { |
| failure(); |
| return; |
| } |
| |
| syntaxViolation(c); |
// As in the loop, a host recognised by isLocalhost() is removed from the output.
| if (isLocalhost(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd))) { |
| m_asciiBuffer.shrink(m_url.m_passwordEnd); |
| m_url.m_hostEnd = currentPosition(c); |
| m_url.m_portLength = 0; |
| } |
| appendToASCIIBuffer('/'); |
| m_url.m_pathAfterLastSlash = m_url.m_hostEnd + m_url.m_portLength + 1; |
| m_url.m_pathEnd = m_url.m_pathAfterLastSlash; |
| m_url.m_queryEnd = m_url.m_pathAfterLastSlash; |
| break; |
// Not expected: the PathStart case in the loop always moves on to Path.
| case State::PathStart: |
| LOG_FINAL_STATE("PathStart"); |
| RELEASE_ASSERT_NOT_REACHED(); |
| case State::Path: |
| LOG_FINAL_STATE("Path"); |
| m_url.m_pathEnd = currentPosition(c); |
| m_url.m_queryEnd = m_url.m_pathEnd; |
| break; |
| case State::CannotBeABaseURLPath: |
| LOG_FINAL_STATE("CannotBeABaseURLPath"); |
| m_url.m_pathEnd = currentPosition(c); |
| m_url.m_queryEnd = m_url.m_pathEnd; |
| break; |
| case State::UTF8Query: |
| LOG_FINAL_STATE("UTF8Query"); |
| ASSERT(queryBegin == CodePointIterator<CharacterType>()); |
| m_url.m_queryEnd = currentPosition(c); |
| break; |
// Encode the collected query now that its end is known.
| case State::NonUTF8Query: |
| LOG_FINAL_STATE("NonUTF8Query"); |
| ASSERT(queryBegin != CodePointIterator<CharacterType>()); |
| encodeNonUTF8Query(queryBuffer, *nonUTF8QueryEncoding, CodePointIterator<CharacterType>(queryBegin, c)); |
| m_url.m_queryEnd = currentPosition(c); |
| break; |
| case State::Fragment: |
| LOG_FINAL_STATE("Fragment"); |
| break; |
| } |
| |
// With no syntax violation the input string is reused as the URL string; otherwise the
// canonicalized m_asciiBuffer becomes the URL string.
| if (LIKELY(!m_didSeeSyntaxViolation)) { |
| m_url.m_string = m_inputString; |
| ASSERT(m_asciiBuffer.isEmpty()); |
| } else |
| m_url.m_string = String::adopt(WTFMove(m_asciiBuffer)); |
| m_url.m_isValid = true; |
| URL_PARSER_LOG("Parsed URL <%s>\n\n", m_url.m_string.utf8().data()); |
| } |
| |
// Parses the credentials part of an authority ("user" or "user:password"). `iterator` spans
// the code points that precede the '@' chosen by the caller. Code points are emitted through
// utf8PercentEncode with the userinfo encode set, m_url.m_userEnd and m_url.m_passwordEnd are
// recorded, and a trailing '@' is appended unless the credentials turn out to be empty.
| template<typename CharacterType> |
| void URLParser::parseAuthority(CodePointIterator<CharacterType> iterator) |
| { |
// Nothing before the '@': record a syntax violation and emit no credentials and no '@'.
| if (UNLIKELY(iterator.atEnd())) { |
| syntaxViolation(iterator); |
| m_url.m_userEnd = currentPosition(iterator); |
| m_url.m_passwordEnd = m_url.m_userEnd; |
| return; |
| } |
// First loop: the user name, up to the first ':'.
| for (; !iterator.atEnd(); advance(iterator)) { |
| if (*iterator == ':') { |
| m_url.m_userEnd = currentPosition(iterator); |
| auto iteratorAtColon = iterator; |
| ++iterator; |
// Look past tabs/newlines that directly follow the ':' to see whether a password follows.
| bool tabOrNewlineAfterColon = false; |
| while (UNLIKELY(!iterator.atEnd() && isTabOrNewline(*iterator))) { |
| tabOrNewlineAfterColon = true; |
| ++iterator; |
| } |
// Empty password: the ':' is not emitted, and '@' is emitted only when the user name is
// non-empty.
| if (UNLIKELY(iterator.atEnd())) { |
| syntaxViolation(iteratorAtColon); |
| m_url.m_passwordEnd = m_url.m_userEnd; |
| if (m_url.m_userEnd > m_url.m_userStart) |
| appendToASCIIBuffer('@'); |
| return; |
| } |
| if (tabOrNewlineAfterColon) |
| syntaxViolation(iteratorAtColon); |
| appendToASCIIBuffer(':'); |
| break; |
| } |
| utf8PercentEncode<WTF::isInUserInfoEncodeSet>(iterator); |
| } |
// Second loop: the password (everything after the first ':'); it does not run when the
// first loop already reached the end.
| for (; !iterator.atEnd(); advance(iterator)) |
| utf8PercentEncode<WTF::isInUserInfoEncodeSet>(iterator); |
| m_url.m_passwordEnd = currentPosition(iterator); |
// m_userEnd still being zero is taken to mean no ':' was seen, so the user name extends to
// the end of the credentials. NOTE(review): relies on m_userEnd being zero on entry - confirm.
| if (!m_url.m_userEnd) |
| m_url.m_userEnd = m_url.m_passwordEnd; |
| appendToASCIIBuffer('@'); |
| } |
| |
| template<typename UnsignedIntegerType> |
| void URLParser::appendNumberToASCIIBuffer(UnsignedIntegerType number) |
| { |
| LChar buf[sizeof(UnsignedIntegerType) * 3 + 1]; |
| LChar* end = std::end(buf); |
| LChar* p = end; |
| do { |
| *--p = (number % 10) + '0'; |
| number /= 10; |
| } while (number); |
| appendToASCIIBuffer(p, end - p); |
| } |
| |
| void URLParser::serializeIPv4(IPv4Address address) |
| { |
| appendNumberToASCIIBuffer<uint8_t>(address >> 24); |
| appendToASCIIBuffer('.'); |
| appendNumberToASCIIBuffer<uint8_t>(address >> 16); |
| appendToASCIIBuffer('.'); |
| appendNumberToASCIIBuffer<uint8_t>(address >> 8); |
| appendToASCIIBuffer('.'); |
| appendNumberToASCIIBuffer<uint8_t>(address); |
| } |
| |
// Returns how many consecutive zero pieces `address` contains starting at
// index `begin` (0 when the piece at `begin` is non-zero or `begin` >= 8).
static size_t zeroSequenceLength(const std::array<uint16_t, 8>& address, size_t begin)
{
    size_t cursor = begin;
    while (cursor < 8 && !address[cursor])
        ++cursor;
    return cursor - begin;
}
| |
| static std::optional<size_t> findLongestZeroSequence(const std::array<uint16_t, 8>& address) |
| { |
| std::optional<size_t> longest; |
| size_t longestLength = 0; |
| for (size_t i = 0; i < 8; i++) { |
| size_t length = zeroSequenceLength(address, i); |
| if (length) { |
| if (length > 1 && (!longest || longestLength < length)) { |
| longest = i; |
| longestLength = length; |
| } |
| i += length; |
| } |
| } |
| return longest; |
| } |
| |
| void URLParser::serializeIPv6Piece(uint16_t piece) |
| { |
| bool printed = false; |
| if (auto nibble0 = piece >> 12) { |
| appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble0)); |
| printed = true; |
| } |
| auto nibble1 = piece >> 8 & 0xF; |
| if (printed || nibble1) { |
| appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble1)); |
| printed = true; |
| } |
| auto nibble2 = piece >> 4 & 0xF; |
| if (printed || nibble2) |
| appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble2)); |
| appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(piece & 0xF)); |
| } |
| |
| void URLParser::serializeIPv6(URLParser::IPv6Address address) |
| { |
| appendToASCIIBuffer('['); |
| auto compressPointer = findLongestZeroSequence(address); |
| for (size_t piece = 0; piece < 8; piece++) { |
| if (compressPointer && compressPointer.value() == piece) { |
| ASSERT(!address[piece]); |
| if (piece) |
| appendToASCIIBuffer(':'); |
| else |
| appendToASCIIBuffer("::", 2); |
| while (piece < 8 && !address[piece]) |
| piece++; |
| if (piece == 8) |
| break; |
| } |
| serializeIPv6Piece(address[piece]); |
| if (piece < 7) |
| appendToASCIIBuffer(':'); |
| } |
| appendToASCIIBuffer(']'); |
| } |
| |
| enum class URLParser::IPv4PieceParsingError { |
| Failure, |
| Overflow, |
| }; |
| |
| template<typename CharacterType> |
| Expected<uint32_t, URLParser::IPv4PieceParsingError> URLParser::parseIPv4Piece(CodePointIterator<CharacterType>& iterator, bool& didSeeSyntaxViolation) |
| { |
| enum class State : uint8_t { |
| UnknownBase, |
| Decimal, |
| OctalOrHex, |
| Octal, |
| Hex, |
| }; |
| State state = State::UnknownBase; |
| CheckedUint32 value = 0; |
| if (!iterator.atEnd() && *iterator == '.') |
| return makeUnexpected(IPv4PieceParsingError::Failure); |
| while (!iterator.atEnd()) { |
| if (isTabOrNewline(*iterator)) { |
| didSeeSyntaxViolation = true; |
| ++iterator; |
| continue; |
| } |
| if (*iterator == '.') { |
| ASSERT(!value.hasOverflowed()); |
| return value.value(); |
| } |
| switch (state) { |
| case State::UnknownBase: |
| if (UNLIKELY(*iterator == '0')) { |
| ++iterator; |
| state = State::OctalOrHex; |
| break; |
| } |
| state = State::Decimal; |
| break; |
| case State::OctalOrHex: |
| didSeeSyntaxViolation = true; |
| if (*iterator == 'x' || *iterator == 'X') { |
| ++iterator; |
| state = State::Hex; |
| break; |
| } |
| state = State::Octal; |
| break; |
| case State::Decimal: |
| if (!isASCIIDigit(*iterator)) |
| return makeUnexpected(IPv4PieceParsingError::Failure); |
| value *= 10; |
| value += *iterator - '0'; |
| if (UNLIKELY(value.hasOverflowed())) |
| return makeUnexpected(IPv4PieceParsingError::Overflow); |
| ++iterator; |
| break; |
| case State::Octal: |
| ASSERT(didSeeSyntaxViolation); |
| if (*iterator < '0' || *iterator > '7') |
| return makeUnexpected(IPv4PieceParsingError::Failure); |
| value *= 8; |
| value += *iterator - '0'; |
| if (UNLIKELY(value.hasOverflowed())) |
| return makeUnexpected(IPv4PieceParsingError::Overflow); |
| ++iterator; |
| break; |
| case State::Hex: |
| ASSERT(didSeeSyntaxViolation); |
| if (!isASCIIHexDigit(*iterator)) |
| return makeUnexpected(IPv4PieceParsingError::Failure); |
| value *= 16; |
| value += toASCIIHexValue(*iterator); |
| if (UNLIKELY(value.hasOverflowed())) |
| return makeUnexpected(IPv4PieceParsingError::Overflow); |
| ++iterator; |
| break; |
| } |
| } |
| ASSERT(!value.hasOverflowed()); |
| return value.value(); |
| } |
| |
| ALWAYS_INLINE static uint64_t pow256(size_t exponent) |
| { |
| RELEASE_ASSERT(exponent <= 4); |
| uint64_t values[5] = {1, 256, 256 * 256, 256 * 256 * 256, 256ull * 256 * 256 * 256 }; |
| return values[exponent]; |
| } |
| |
| enum class URLParser::IPv4ParsingError { |
| Failure, |
| NotIPv4, |
| }; |
| |
| template<typename CharacterTypeForSyntaxViolation, typename CharacterType> |
| Expected<URLParser::IPv4Address, URLParser::IPv4ParsingError> URLParser::parseIPv4Host(const CodePointIterator<CharacterTypeForSyntaxViolation>& iteratorForSyntaxViolationPosition, CodePointIterator<CharacterType> iterator) |
| { |
| Vector<Expected<uint32_t, URLParser::IPv4PieceParsingError>, 4> items; |
| bool didSeeSyntaxViolation = false; |
| if (!iterator.atEnd() && *iterator == '.') |
| return makeUnexpected(IPv4ParsingError::NotIPv4); |
| while (!iterator.atEnd()) { |
| if (isTabOrNewline(*iterator)) { |
| didSeeSyntaxViolation = true; |
| ++iterator; |
| continue; |
| } |
| if (items.size() >= 4) |
| return makeUnexpected(IPv4ParsingError::NotIPv4); |
| items.append(parseIPv4Piece(iterator, didSeeSyntaxViolation)); |
| if (!iterator.atEnd() && *iterator == '.') { |
| ++iterator; |
| if (iterator.atEnd()) |
| didSeeSyntaxViolation = true; |
| else if (*iterator == '.') |
| return makeUnexpected(IPv4ParsingError::NotIPv4); |
| } |
| } |
| if (!iterator.atEnd() || !items.size() || items.size() > 4) |
| return makeUnexpected(IPv4ParsingError::NotIPv4); |
| for (const auto& item : items) { |
| if (!item.has_value() && item.error() == IPv4PieceParsingError::Failure) |
| return makeUnexpected(IPv4ParsingError::NotIPv4); |
| } |
| for (const auto& item : items) { |
| if (!item.has_value() && item.error() == IPv4PieceParsingError::Overflow) |
| return makeUnexpected(IPv4ParsingError::Failure); |
| } |
| if (items.size() > 1) { |
| for (size_t i = 0; i < items.size() - 1; i++) { |
| if (items[i].value() > 255) |
| return makeUnexpected(IPv4ParsingError::Failure); |
| } |
| } |
| if (items[items.size() - 1].value() >= pow256(5 - items.size())) |
| return makeUnexpected(IPv4ParsingError::Failure); |
| |
| if (didSeeSyntaxViolation) |
| syntaxViolation(iteratorForSyntaxViolationPosition); |
| for (const auto& item : items) { |
| if (item.value() > 255) |
| syntaxViolation(iteratorForSyntaxViolationPosition); |
| } |
| |
| if (UNLIKELY(items.size() != 4)) |
| syntaxViolation(iteratorForSyntaxViolationPosition); |
| |
| IPv4Address ipv4 = items.takeLast().value(); |
| for (size_t counter = 0; counter < items.size(); ++counter) |
| ipv4 += items[counter].value() * pow256(3 - counter); |
| return ipv4; |
| } |
| |
| template<typename CharacterType> |
| std::optional<uint32_t> URLParser::parseIPv4PieceInsideIPv6(CodePointIterator<CharacterType>& iterator) |
| { |
| if (iterator.atEnd()) |
| return std::nullopt; |
| uint32_t piece = 0; |
| bool leadingZeros = false; |
| size_t digitCount = 0; |
| while (!iterator.atEnd()) { |
| if (!isASCIIDigit(*iterator)) |
| return std::nullopt; |
| ++digitCount; |
| if (!piece && *iterator == '0') { |
| if (leadingZeros) |
| return std::nullopt; |
| leadingZeros = true; |
| } |
| if (!piece && *iterator == '0') |
| leadingZeros = true; |
| piece = piece * 10 + *iterator - '0'; |
| if (piece > 255) |
| return std::nullopt; |
| advance<CharacterType, ReportSyntaxViolation::No>(iterator); |
| if (iterator.atEnd()) |
| break; |
| if (*iterator == '.') |
| break; |
| } |
| if (piece && leadingZeros) |
| return std::nullopt; |
| return piece; |
| } |
| |
| template<typename CharacterType> |
| std::optional<URLParser::IPv4Address> URLParser::parseIPv4AddressInsideIPv6(CodePointIterator<CharacterType> iterator) |
| { |
| IPv4Address address = 0; |
| for (size_t i = 0; i < 4; ++i) { |
| if (std::optional<uint32_t> piece = parseIPv4PieceInsideIPv6(iterator)) |
| address = (address << 8) + piece.value(); |
| else |
| return std::nullopt; |
| if (i < 3) { |
| if (iterator.atEnd()) |
| return std::nullopt; |
| if (*iterator != '.') |
| return std::nullopt; |
| advance<CharacterType, ReportSyntaxViolation::No>(iterator); |
| } else if (!iterator.atEnd()) |
| return std::nullopt; |
| } |
| ASSERT(iterator.atEnd()); |
| return address; |
| } |
| |
| template<typename CharacterType> |
| std::optional<URLParser::IPv6Address> URLParser::parseIPv6Host(CodePointIterator<CharacterType> c) |
| { |
| ASSERT(*c == '['); |
| const auto hostBegin = c; |
| advance(c, hostBegin); |
| if (c.atEnd()) |
| return std::nullopt; |
| |
| IPv6Address address = {{0, 0, 0, 0, 0, 0, 0, 0}}; |
| size_t piecePointer = 0; |
| std::optional<size_t> compressPointer; |
| bool previousValueWasZero = false; |
| bool immediatelyAfterCompress = false; |
| |
| if (*c == ':') { |
| advance(c, hostBegin); |
| if (c.atEnd()) |
| return std::nullopt; |
| if (*c != ':') |
| return std::nullopt; |
| advance(c, hostBegin); |
| ++piecePointer; |
| compressPointer = piecePointer; |
| immediatelyAfterCompress = true; |
| } |
| |
| while (!c.atEnd()) { |
| if (piecePointer == 8) |
| return std::nullopt; |
| if (*c == ':') { |
| if (compressPointer) |
| return std::nullopt; |
| advance(c, hostBegin); |
| ++piecePointer; |
| compressPointer = piecePointer; |
| immediatelyAfterCompress = true; |
| if (previousValueWasZero) |
| syntaxViolation(hostBegin); |
| continue; |
| } |
| if (piecePointer == 6 || (compressPointer && piecePointer < 6)) { |
| if (std::optional<IPv4Address> ipv4Address = parseIPv4AddressInsideIPv6(c)) { |
| if (compressPointer && piecePointer == 5) |
| return std::nullopt; |
| syntaxViolation(hostBegin); |
| address[piecePointer++] = ipv4Address.value() >> 16; |
| address[piecePointer++] = ipv4Address.value() & 0xFFFF; |
| c = { }; |
| break; |
| } |
| } |
| uint16_t value = 0; |
| size_t length = 0; |
| bool leadingZeros = false; |
| for (; length < 4; length++) { |
| if (c.atEnd()) |
| break; |
| if (!isASCIIHexDigit(*c)) |
| break; |
| if (isASCIIUpper(*c)) |
| syntaxViolation(hostBegin); |
| if (*c == '0' && !length) |
| leadingZeros = true; |
| value = value * 0x10 + toASCIIHexValue(*c); |
| advance(c, hostBegin); |
| } |
| |
| previousValueWasZero = !value; |
| if (UNLIKELY((value && leadingZeros) || (previousValueWasZero && (length > 1 || immediatelyAfterCompress)))) |
| syntaxViolation(hostBegin); |
| |
| address[piecePointer++] = value; |
| if (c.atEnd()) |
| break; |
| if (piecePointer == 8 || *c != ':') |
| return std::nullopt; |
| advance(c, hostBegin); |
| if (c.atEnd()) |
| syntaxViolation(hostBegin); |
| |
| immediatelyAfterCompress = false; |
| } |
| |
| if (!c.atEnd()) |
| return std::nullopt; |
| |
| if (compressPointer) { |
| size_t swaps = piecePointer - compressPointer.value(); |
| piecePointer = 7; |
| while (swaps) |
| std::swap(address[piecePointer--], address[compressPointer.value() + swaps-- - 1]); |
| } else if (piecePointer != 8) |
| return std::nullopt; |
| |
| std::optional<size_t> possibleCompressPointer = findLongestZeroSequence(address); |
| if (possibleCompressPointer) |
| possibleCompressPointer.value()++; |
| if (UNLIKELY(compressPointer != possibleCompressPointer)) |
| syntaxViolation(hostBegin); |
| |
| return address; |
| } |
| |
| template<typename CharacterType> |
| URLParser::LCharBuffer URLParser::percentDecode(const LChar* input, size_t length, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition) |
| { |
| LCharBuffer output; |
| output.reserveInitialCapacity(length); |
| |
| for (size_t i = 0; i < length; ++i) { |
| uint8_t byte = input[i]; |
| if (byte != '%') |
| output.uncheckedAppend(byte); |
| else if (length > 2 && i < length - 2) { |
| if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) { |
| syntaxViolation(iteratorForSyntaxViolationPosition); |
| output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2])); |
| i += 2; |
| } else |
| output.uncheckedAppend(byte); |
| } else |
| output.uncheckedAppend(byte); |
| } |
| return output; |
| } |
| |
| URLParser::LCharBuffer URLParser::percentDecode(const LChar* input, size_t length) |
| { |
| LCharBuffer output; |
| output.reserveInitialCapacity(length); |
| |
| for (size_t i = 0; i < length; ++i) { |
| uint8_t byte = input[i]; |
| if (byte != '%') |
| output.uncheckedAppend(byte); |
| else if (length > 2 && i < length - 2) { |
| if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) { |
| output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2])); |
| i += 2; |
| } else |
| output.uncheckedAppend(byte); |
| } else |
| output.uncheckedAppend(byte); |
| } |
| return output; |
| } |
| |
| bool URLParser::needsNonSpecialDotSlash() const |
| { |
| auto pathStart = m_url.m_hostEnd + m_url.m_portLength; |
| return !m_urlIsSpecial |
| && pathStart == m_url.m_schemeEnd + 1U |
| && pathStart + 1 < m_url.m_string.length() |
| && m_url.m_string[pathStart] == '/' |
| && m_url.m_string[pathStart + 1] == '/'; |
| } |
| |
| void URLParser::addNonSpecialDotSlash() |
| { |
| auto oldPathStart = m_url.m_hostEnd + m_url.m_portLength; |
| auto& oldString = m_url.m_string; |
| m_url.m_string = makeString(oldString.substring(0, oldPathStart + 1), "./", oldString.substring(oldPathStart + 1)); |
| m_url.m_pathAfterLastSlash += 2; |
| m_url.m_pathEnd += 2; |
| m_url.m_queryEnd += 2; |
| } |
| |
| template<typename CharacterType> std::optional<URLParser::LCharBuffer> URLParser::domainToASCII(StringImpl& domain, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition) |
| { |
| LCharBuffer ascii; |
| if (domain.isAllASCII() && !subdomainStartsWithXNDashDash(domain)) { |
| size_t length = domain.length(); |
| if (domain.is8Bit()) { |
| const LChar* characters = domain.characters8(); |
| ascii.reserveInitialCapacity(length); |
| for (size_t i = 0; i < length; ++i) { |
| if (UNLIKELY(isASCIIUpper(characters[i]))) |
| syntaxViolation(iteratorForSyntaxViolationPosition); |
| ascii.uncheckedAppend(toASCIILower(characters[i])); |
| } |
| } else { |
| const UChar* characters = domain.characters16(); |
| ascii.reserveInitialCapacity(length); |
| for (size_t i = 0; i < length; ++i) { |
| if (UNLIKELY(isASCIIUpper(characters[i]))) |
| syntaxViolation(iteratorForSyntaxViolationPosition); |
| ascii.uncheckedAppend(toASCIILower(characters[i])); |
| } |
| } |
| return ascii; |
| } |
| |
| UChar hostnameBuffer[hostnameBufferLength]; |
| UErrorCode error = U_ZERO_ERROR; |
| UIDNAInfo processingDetails = UIDNA_INFO_INITIALIZER; |
| int32_t numCharactersConverted = uidna_nameToASCII(&internationalDomainNameTranscoder(), StringView(domain).upconvertedCharacters(), domain.length(), hostnameBuffer, hostnameBufferLength, &processingDetails, &error); |
| |
| if (U_SUCCESS(error) && !(processingDetails.errors & ~allowedNameToASCIIErrors) && numCharactersConverted) { |
| #if ASSERT_ENABLED |
| for (int32_t i = 0; i < numCharactersConverted; ++i) { |
| ASSERT(isASCII(hostnameBuffer[i])); |
| ASSERT(!isASCIIUpper(hostnameBuffer[i])); |
| } |
| #else |
| UNUSED_PARAM(numCharactersConverted); |
| #endif // ASSERT_ENABLED |
| ascii.append(hostnameBuffer, numCharactersConverted); |
| if (domain != StringView(ascii.data(), ascii.size())) |
| syntaxViolation(iteratorForSyntaxViolationPosition); |
| return ascii; |
| } |
| return std::nullopt; |
| } |
| |
| bool URLParser::hasForbiddenHostCodePoint(const URLParser::LCharBuffer& asciiDomain) |
| { |
| for (size_t i = 0; i < asciiDomain.size(); ++i) { |
| if (isForbiddenHostCodePoint(asciiDomain[i])) |
| return true; |
| } |
| return false; |
| } |
| |
| template<typename CharacterType> |
| bool URLParser::parsePort(CodePointIterator<CharacterType>& iterator) |
| { |
| if (UNLIKELY(m_urlIsFile)) |
| return false; |
| |
| ASSERT(*iterator == ':'); |
| auto colonIterator = iterator; |
| advance(iterator, colonIterator); |
| uint32_t port = 0; |
| if (UNLIKELY(iterator.atEnd())) { |
| unsigned portLength = currentPosition(colonIterator) - m_url.m_hostEnd; |
| RELEASE_ASSERT(portLength <= URL::maxPortLength); |
| m_url.m_portLength = portLength; |
| syntaxViolation(colonIterator); |
| return true; |
| } |
| size_t digitCount = 0; |
| bool leadingZeros = false; |
| for (; !iterator.atEnd(); ++iterator) { |
| if (UNLIKELY(isTabOrNewline(*iterator))) { |
| syntaxViolation(colonIterator); |
| continue; |
| } |
| if (isASCIIDigit(*iterator)) { |
| if (*iterator == '0' && !digitCount) |
| leadingZeros = true; |
| ++digitCount; |
| port = port * 10 + *iterator - '0'; |
| if (port > std::numeric_limits<uint16_t>::max()) |
| return false; |
| } else |
| return false; |
| } |
| |
| if (port && leadingZeros) |
| syntaxViolation(colonIterator); |
| |
| if (!port && digitCount > 1) |
| syntaxViolation(colonIterator); |
| |
| ASSERT(port == static_cast<uint16_t>(port)); |
| if (UNLIKELY(defaultPortForProtocol(parsedDataView(0, m_url.m_schemeEnd)) == static_cast<uint16_t>(port))) |
| syntaxViolation(colonIterator); |
| else { |
| appendToASCIIBuffer(':'); |
| ASSERT(port <= std::numeric_limits<uint16_t>::max()); |
| appendNumberToASCIIBuffer<uint16_t>(static_cast<uint16_t>(port)); |
| } |
| |
| unsigned portLength = currentPosition(iterator) - m_url.m_hostEnd; |
| RELEASE_ASSERT(portLength <= URL::maxPortLength); |
| m_url.m_portLength = portLength; |
| return true; |
| } |
| |
| template<typename CharacterType> |
| bool URLParser::subdomainStartsWithXNDashDash(CodePointIterator<CharacterType> iterator) |
| { |
| enum class State : uint8_t { |
| NotAtSubdomainBeginOrInXNDashDash, |
| AtSubdomainBegin, |
| AfterX, |
| AfterN, |
| AfterFirstDash, |
| } state { State::AtSubdomainBegin }; |
| |
| for (; !iterator.atEnd(); advance<CharacterType, ReportSyntaxViolation::No>(iterator)) { |
| CharacterType c = *iterator; |
| |
| // These characters indicate the end of the host. |
| if (c == ':' || c == '/' || c == '?' || c == '#') |
| return false; |
| |
| switch (state) { |
| case State::NotAtSubdomainBeginOrInXNDashDash: |
| break; |
| case State::AtSubdomainBegin: |
| if (c == 'x' || c == 'X') { |
| state = State::AfterX; |
| continue; |
| } |
| break; |
| case State::AfterX: |
| if (c == 'n' || c == 'N') { |
| state = State::AfterN; |
| continue; |
| } |
| break; |
| case State::AfterN: |
| if (c == '-') { |
| state = State::AfterFirstDash; |
| continue; |
| } |
| break; |
| case State::AfterFirstDash: |
| if (c == '-') |
| return true; |
| break; |
| } |
| |
| if (c == '.') |
| state = State::AtSubdomainBegin; |
| else |
| state = State::NotAtSubdomainBeginOrInXNDashDash; |
| } |
| return false; |
| } |
| |
| bool URLParser::subdomainStartsWithXNDashDash(StringImpl& host) |
| { |
| if (host.is8Bit()) { |
| const LChar* begin = host.characters8(); |
| return subdomainStartsWithXNDashDash(CodePointIterator<LChar>(begin, begin + host.length())); |
| } |
| const UChar* begin = host.characters16(); |
| return subdomainStartsWithXNDashDash(CodePointIterator<UChar>(begin, begin + host.length())); |
| } |
| |
| static bool dnsNameEndsInNumber(StringView name) |
| { |
| // https://url.spec.whatwg.org/#ends-in-a-number-checker |
| auto containsOctalDecimalOrHexNumber = [] (StringView segment) { |
| const auto segmentLength = segment.length(); |
| if (!UNLIKELY(segmentLength)) |
| return false; |
| auto firstCodeUnit = segment[0]; |
| if (LIKELY(!isASCIIDigit(firstCodeUnit))) |
| return false; |
| if (segmentLength == 1) |
| return true; |
| auto secondCodeUnit = segment[1]; |
| if ((secondCodeUnit == 'x' || secondCodeUnit == 'X') && firstCodeUnit == '0') |
| return segment.find(std::not_fn(isASCIIHexDigit<UChar>), 2) == notFound; |
| return !segment.contains(std::not_fn(isASCIIDigit<UChar>)); |
| }; |
| |
| size_t lastDotLocation = name.reverseFind('.'); |
| if (lastDotLocation == notFound) |
| return containsOctalDecimalOrHexNumber(name); |
| size_t lastSegmentEnd = name.length(); |
| if (lastDotLocation == lastSegmentEnd - 1) { |
| lastSegmentEnd = lastDotLocation; |
| lastDotLocation = name.reverseFind('.', lastDotLocation - 1); |
| } |
| StringView lastPart = name.substring(lastDotLocation == notFound ? 0 : lastDotLocation + 1, lastSegmentEnd - lastDotLocation - 1); |
| return containsOctalDecimalOrHexNumber(lastPart); |
| } |
| |
| template<typename CharacterType> |
| auto URLParser::parseHostAndPort(CodePointIterator<CharacterType> iterator) -> HostParsingResult |
| { |
| if (iterator.atEnd()) |
| return HostParsingResult::InvalidHost; |
| if (*iterator == ':') |
| return HostParsingResult::InvalidHost; |
| if (*iterator == '[') { |
| auto ipv6End = iterator; |
| while (!ipv6End.atEnd() && *ipv6End != ']') |
| ++ipv6End; |
| if (ipv6End.atEnd()) |
| return HostParsingResult::InvalidHost; |
| if (auto address = parseIPv6Host(CodePointIterator<CharacterType>(iterator, ipv6End))) { |
| serializeIPv6(address.value()); |
| if (!ipv6End.atEnd()) { |
| advance(ipv6End); |
| m_url.m_hostEnd = currentPosition(ipv6End); |
| if (!ipv6End.atEnd() && *ipv6End == ':') |
| return parsePort(ipv6End) ? HostParsingResult::IPv6WithPort : HostParsingResult::InvalidHost; |
| m_url.m_portLength = 0; |
| return ipv6End.atEnd() ? HostParsingResult::IPv6WithoutPort : HostParsingResult::InvalidHost; |
| } |
| m_url.m_hostEnd = currentPosition(ipv6End); |
| return HostParsingResult::IPv6WithoutPort; |
| } |
| return HostParsingResult::InvalidHost; |
| } |
| |
| if (!m_urlIsSpecial) { |
| for (; !iterator.atEnd(); ++iterator) { |
| if (UNLIKELY(isTabOrNewline(*iterator))) { |
| syntaxViolation(iterator); |
| continue; |
| } |
| if (*iterator == ':') |
| break; |
| if (UNLIKELY(isForbiddenHostCodePoint(*iterator) && *iterator != '%')) |
| return HostParsingResult::InvalidHost; |
| utf8PercentEncode<isInSimpleEncodeSet>(iterator); |
| } |
| m_url.m_hostEnd = currentPosition(iterator); |
| if (iterator.atEnd()) { |
| m_url.m_portLength = 0; |
| return HostParsingResult::NonSpecialHostWithoutPort; |
| } |
| return parsePort(iterator) ? HostParsingResult::NonSpecialHostWithPort : HostParsingResult::InvalidHost; |
| } |
| |
| if (LIKELY(!m_hostHasPercentOrNonASCII && !subdomainStartsWithXNDashDash(iterator))) { |
| auto hostIterator = iterator; |
| for (; !iterator.atEnd(); ++iterator) { |
| if (isTabOrNewline(*iterator)) |
| continue; |
| if (*iterator == ':') |
| break; |
| if (isForbiddenHostCodePoint(*iterator)) |
| return HostParsingResult::InvalidHost; |
| } |
| auto address = parseIPv4Host(hostIterator, CodePointIterator<CharacterType>(hostIterator, iterator)); |
| if (address) { |
| serializeIPv4(address.value()); |
| m_url.m_hostEnd = currentPosition(iterator); |
| if (iterator.atEnd()) { |
| m_url.m_portLength = 0; |
| return HostParsingResult::IPv4WithoutPort; |
| } |
| return parsePort(iterator) ? HostParsingResult::IPv4WithPort : HostParsingResult::InvalidHost; |
| } |
| if (address.error() == IPv4ParsingError::Failure) |
| return HostParsingResult::InvalidHost; |
| for (; hostIterator != iterator; ++hostIterator) { |
| if (UNLIKELY(isTabOrNewline(*hostIterator))) { |
| syntaxViolation(hostIterator); |
| continue; |
| } |
| if (UNLIKELY(isASCIIUpper(*hostIterator))) |
| syntaxViolation(hostIterator); |
| appendToASCIIBuffer(toASCIILower(*hostIterator)); |
| } |
| m_url.m_hostEnd = currentPosition(iterator); |
| auto hostStart = m_url.hostStart(); |
| if (UNLIKELY(dnsNameEndsInNumber(parsedDataView(hostStart, m_url.m_hostEnd - hostStart)))) |
| return HostParsingResult::InvalidHost; |
| if (!hostIterator.atEnd()) |
| return parsePort(hostIterator) ? HostParsingResult::DNSNameWithPort : HostParsingResult::InvalidHost; |
| m_url.m_portLength = 0; |
| return HostParsingResult::DNSNameWithoutPort; |
| } |
| |
| const auto hostBegin = iterator; |
| |
| LCharBuffer utf8Encoded; |
| for (; !iterator.atEnd(); ++iterator) { |
| if (UNLIKELY(isTabOrNewline(*iterator))) { |
| syntaxViolation(hostBegin); |
| continue; |
| } |
| if (*iterator == ':') |
| break; |
| if (UNLIKELY(!isASCII(*iterator))) |
| syntaxViolation(hostBegin); |
| |
| uint8_t buffer[U8_MAX_LENGTH]; |
| int32_t offset = 0; |
| UBool isError = false; |
| U8_APPEND(buffer, offset, U8_MAX_LENGTH, *iterator, isError); |
| if (isError) |
| return HostParsingResult::InvalidHost; |
| utf8Encoded.append(buffer, offset); |
| } |
| LCharBuffer percentDecoded = percentDecode(utf8Encoded.data(), utf8Encoded.size(), hostBegin); |
| String domain = String::fromUTF8(percentDecoded.data(), percentDecoded.size()); |
| if (domain.isNull()) |
| return HostParsingResult::InvalidHost; |
| if (domain != StringView(percentDecoded.data(), percentDecoded.size())) |
| syntaxViolation(hostBegin); |
| auto asciiDomain = domainToASCII(*domain.impl(), hostBegin); |
| if (!asciiDomain || hasForbiddenHostCodePoint(asciiDomain.value())) |
| return HostParsingResult::InvalidHost; |
| LCharBuffer& asciiDomainValue = asciiDomain.value(); |
| const LChar* asciiDomainCharacters = asciiDomainValue.data(); |
| |
| auto address = parseIPv4Host(hostBegin, CodePointIterator<LChar>(asciiDomainValue.begin(), asciiDomainValue.end())); |
| if (address) { |
| serializeIPv4(address.value()); |
| m_url.m_hostEnd = currentPosition(iterator); |
| if (iterator.atEnd()) { |
| m_url.m_portLength = 0; |
| return HostParsingResult::IPv4WithoutPort; |
| } |
| return parsePort(iterator) ? HostParsingResult::IPv4WithPort : HostParsingResult::InvalidHost; |
| } |
| if (address.error() == IPv4ParsingError::Failure) |
| return HostParsingResult::InvalidHost; |
| |
| appendToASCIIBuffer(asciiDomainCharacters, asciiDomainValue.size()); |
| m_url.m_hostEnd = currentPosition(iterator); |
| auto hostStart = m_url.hostStart(); |
| if (UNLIKELY(dnsNameEndsInNumber(parsedDataView(hostStart, m_url.m_hostEnd - hostStart)))) |
| return HostParsingResult::InvalidHost; |
| if (!iterator.atEnd()) |
| return parsePort(iterator) ? HostParsingResult::DNSNameWithPort : HostParsingResult::InvalidHost; |
| |
| m_url.m_portLength = 0; |
| return HostParsingResult::DNSNameWithoutPort; |
| } |
| |
| std::optional<String> URLParser::formURLDecode(StringView input) |
| { |
| auto utf8 = input.utf8(StrictConversion); |
| if (utf8.isNull()) |
| return std::nullopt; |
| auto percentDecoded = percentDecode(utf8.dataAsUInt8Ptr(), utf8.length()); |
| return String::fromUTF8ReplacingInvalidSequences(percentDecoded.data(), percentDecoded.size()); |
| } |
| |
| // https://url.spec.whatwg.org/#concept-urlencoded-parser |
| auto URLParser::parseURLEncodedForm(StringView input) -> URLEncodedForm |
| { |
| URLEncodedForm output; |
| for (StringView bytes : input.split('&')) { |
| auto equalIndex = bytes.find('='); |
| if (equalIndex == notFound) { |
| auto name = formURLDecode(bytes.toString().replace('+', 0x20)); |
| if (name) |
| output.append({ name.value(), emptyString() }); |
| } else { |
| auto name = formURLDecode(bytes.substring(0, equalIndex).toString().replace('+', 0x20)); |
| auto value = formURLDecode(bytes.substring(equalIndex + 1).toString().replace('+', 0x20)); |
| if (name && value) |
| output.append({ name.value(), value.value() }); |
| } |
| } |
| return output; |
| } |
| |
| static void serializeURLEncodedForm(const String& input, Vector<LChar>& output) |
| { |
| auto utf8 = input.utf8(StrictConversion); |
| const char* data = utf8.data(); |
| for (size_t i = 0; i < utf8.length(); ++i) { |
| const char byte = data[i]; |
| if (byte == 0x20) |
| output.append(0x2B); |
| else if (byte == 0x2A |
| || byte == 0x2D |
| || byte == 0x2E |
| || (byte >= 0x30 && byte <= 0x39) |
| || (byte >= 0x41 && byte <= 0x5A) |
| || byte == 0x5F |
| || (byte >= 0x61 && byte <= 0x7A)) // FIXME: Put these in the characterClassTable to avoid branches. |
| output.append(byte); |
| else |
| percentEncodeByte(byte, output); |
| } |
| } |
| |
| String URLParser::serialize(const URLEncodedForm& tuples) |
| { |
| if (tuples.isEmpty()) |
| return { }; |
| |
| Vector<LChar> output; |
| for (auto& tuple : tuples) { |
| if (!output.isEmpty()) |
| output.append('&'); |
| serializeURLEncodedForm(tuple.key, output); |
| output.append('='); |
| serializeURLEncodedForm(tuple.value, output); |
| } |
| return String::adopt(WTFMove(output)); |
| } |
| |
| const UIDNA& URLParser::internationalDomainNameTranscoder() |
| { |
| static UIDNA* encoder; |
| static std::once_flag onceFlag; |
| std::call_once(onceFlag, [] { |
| UErrorCode error = U_ZERO_ERROR; |
| encoder = uidna_openUTS46(UIDNA_CHECK_BIDI | UIDNA_CHECK_CONTEXTJ | UIDNA_NONTRANSITIONAL_TO_UNICODE | UIDNA_NONTRANSITIONAL_TO_ASCII, &error); |
| if (UNLIKELY(U_FAILURE(error))) |
| CRASH_WITH_INFO(error); |
| RELEASE_ASSERT(encoder); |
| }); |
| return *encoder; |
| } |
| |
| bool URLParser::allValuesEqual(const URL& a, const URL& b) |
| { |
| URL_PARSER_LOG("%d %d %d %d %d %d %d %d %d %d %d %d %s\n%d %d %d %d %d %d %d %d %d %d %d %d %s", |
| a.m_isValid, |
| a.m_cannotBeABaseURL, |
| a.m_protocolIsInHTTPFamily, |
| a.m_schemeEnd, |
| a.m_userStart, |
| a.m_userEnd, |
| a.m_passwordEnd, |
| a.m_hostEnd, |
| a.m_hostEnd + a.m_portLength, |
| a.m_pathAfterLastSlash, |
| a.m_pathEnd, |
| a.m_queryEnd, |
| a.m_string.utf8().data(), |
| b.m_isValid, |
| b.m_cannotBeABaseURL, |
| b.m_protocolIsInHTTPFamily, |
| b.m_schemeEnd, |
| b.m_userStart, |
| b.m_userEnd, |
| b.m_passwordEnd, |
| b.m_hostEnd, |
| b.m_hostEnd + b.m_portLength, |
| b.m_pathAfterLastSlash, |
| b.m_pathEnd, |
| b.m_queryEnd, |
| b.m_string.utf8().data()); |
| |
| return a.m_string == b.m_string |
| && a.m_isValid == b.m_isValid |
| && a.m_cannotBeABaseURL == b.m_cannotBeABaseURL |
| && a.m_protocolIsInHTTPFamily == b.m_protocolIsInHTTPFamily |
| && a.m_schemeEnd == b.m_schemeEnd |
| && a.m_userStart == b.m_userStart |
| && a.m_userEnd == b.m_userEnd |
| && a.m_passwordEnd == b.m_passwordEnd |
| && a.m_hostEnd == b.m_hostEnd |
| && a.m_portLength == b.m_portLength |
| && a.m_pathAfterLastSlash == b.m_pathAfterLastSlash |
| && a.m_pathEnd == b.m_pathEnd |
| && a.m_queryEnd == b.m_queryEnd; |
| } |
| |
| bool URLParser::internalValuesConsistent(const URL& url) |
| { |
| return url.m_schemeEnd <= url.m_userStart |
| && url.m_userStart <= url.m_userEnd |
| && url.m_userEnd <= url.m_passwordEnd |
| && url.m_passwordEnd <= url.m_hostEnd |
| && url.m_hostEnd + url.m_portLength <= url.m_pathAfterLastSlash |
| && url.m_pathAfterLastSlash <= url.m_pathEnd |
| && url.m_pathEnd <= url.m_queryEnd |
| && url.m_queryEnd <= url.m_string.length(); |
| } |
| |
| } // namespace WTF |