// Source/WTF/wtf/URL.cpp - WebKit - Git at Google

 /*
  * Copyright (C) 2004-2020 Apple Inc. All rights reserved.
  * Copyright (C) 2012 Research In Motion Limited. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */

 #include "config.h"
 #include <wtf/URL.h>

 #include "URLParser.h"
 #include <stdio.h>
 #include <unicode/uidna.h>
 #include <wtf/HashMap.h>
 #include <wtf/HashSet.h>
 #include <wtf/Lock.h>
 #include <wtf/NeverDestroyed.h>
 #include <wtf/PrintStream.h>
 #include <wtf/StdLibExtras.h>
 #include <wtf/UUID.h>
 #include <wtf/text/CString.h>
 #include <wtf/text/StringBuilder.h>
 #include <wtf/text/StringHash.h>
 #include <wtf/text/StringToIntegerConversion.h>
 #include <wtf/text/TextStream.h>

 namespace WTF {

 void URL::invalidate()
 {
     // Clear the validity flag and the cached scheme classification flags.
     m_cannotBeABaseURL = false;
     m_protocolIsInHTTPFamily = false;
     m_isValid = false;

     // Reset every component boundary and length to zero.
     m_queryEnd = 0;
     m_pathAfterLastSlash = 0;
     m_pathEnd = 0;
     m_portLength = 0;
     m_hostEnd = 0;
     m_passwordEnd = 0;
     m_userEnd = 0;
     m_userStart = 0;
     m_schemeEnd = 0;
 }

 URL::URL(const URL& base, const String& relative, const URLTextEncoding* encoding)
 {
     // Let URLParser resolve `relative` against `base`, then adopt its result.
     auto parsedURL = URLParser(relative, base, encoding).result();
     *this = WTFMove(parsedURL);
 }

 static bool shouldTrimFromURL(UChar character)
 {
     // Space and every code unit below it (the control characters) are trimmed.
     constexpr UChar lastTrimmedCharacter = ' ';
     return character <= lastTrimmedCharacter;
 }

 URL URL::isolatedCopy() const &
 {
     // Copy all fields, then swap the string for its own isolated copy.
     URL copy(*this);
     copy.m_string = copy.m_string.isolatedCopy();
     return copy;
 }

 // Rvalue overload: consumes this URL instead of copying it.
 URL URL::isolatedCopy() &&
 {
     // Move rather than copy: a copy would leave m_string shared between *this and
     // result, so the rvalue isolatedCopy() below could never reuse the existing string.
     URL result { WTFMove(*this) };
     result.m_string = WTFMove(result.m_string).isolatedCopy();
     return result;
 }

 StringView URL::lastPathComponent() const
 {
     if (!hasPath())
         return { };

     // Index of the component's final character; a single trailing slash is skipped.
     unsigned lastIndex = m_pathEnd - 1;
     if (m_string[lastIndex] == '/')
         --lastIndex;

     // The component begins right after the closest preceding slash, which must lie within the path.
     size_t slashIndex = m_string.reverseFind('/', lastIndex);
     if (slashIndex < pathStart())
         return { };

     size_t firstIndex = slashIndex + 1;
     return StringView(m_string).substring(firstIndex, lastIndex - firstIndex + 1);
 }

 bool URL::hasSpecialScheme() const
 {
     // https://url.spec.whatwg.org/#special-scheme
     static constexpr const char* specialSchemes[] = { "ftp", "file", "http", "https", "ws", "wss" };
     for (const char* scheme : specialSchemes) {
         if (protocolIs(scheme))
             return true;
     }
     return false;
 }

 unsigned URL::pathStart() const
 {
     unsigned start = m_hostEnd + m_portLength;

     // Only when the host/port section ends right after "scheme:" can a "/." prefix need skipping.
     if (start != m_schemeEnd + 1U)
         return start;
     if (!(start + 1 < m_string.length()))
         return start;
     if (m_string[start] != '/' || m_string[start + 1] != '.')
         return start;

     return start + 2;
 }

 StringView URL::protocol() const
 {
     // Invalid URLs report an empty protocol; otherwise it is the first m_schemeEnd code units.
     return m_isValid ? StringView(m_string).substring(0, m_schemeEnd) : StringView();
 }

 StringView URL::host() const
 {
     if (m_isValid) {
         // The host spans [hostStart(), m_hostEnd).
         unsigned hostBegin = hostStart();
         return StringView(m_string).substring(hostBegin, m_hostEnd - hostBegin);
     }
     return { };
 }

 std::optional<uint16_t> URL::port() const
 {
     if (!m_portLength)
         return std::nullopt;

     // m_portLength counts one extra character after m_hostEnd; skip it and parse the rest.
     auto portDigits = StringView(m_string).substring(m_hostEnd + 1, m_portLength - 1);
     return parseInteger<uint16_t>(portDigits);
 }

 String URL::hostAndPort() const
 {
     auto port = this->port();
     if (!port)
         return host().toString();

     // "host:port" when a port is present.
     return makeString(host(), ':', port.value());
 }

 String URL::protocolHostAndPort() const
 {
     if (hasCredentials()) {
         // Stitch the text before the user name to the host-and-port text, leaving the credentials out.
         StringView whole(m_string);
         unsigned hostBegin = hostStart();
         return makeString(
             whole.substring(0, m_userStart),
             whole.substring(hostBegin, pathStart() - hostBegin)
         );
     }

     // Without credentials everything before the path is already the answer.
     return m_string.substring(0, pathStart());
 }

 // Decodes a "%XY" escape starting at `index`; returns nullopt when there is no complete, well-formed escape there.
 static std::optional<LChar> decodeEscapeSequence(StringView input, unsigned index, unsigned length)
 {
     // Need three code units: '%' followed by two hex digits.
     if (index + 3 > length)
         return std::nullopt;
     if (input[index] != '%')
         return std::nullopt;

     auto highDigit = input[index + 1];
     auto lowDigit = input[index + 2];
     if (isASCIIHexDigit(highDigit) && isASCIIHexDigit(lowDigit))
         return toASCIIHexValue(highDigit, lowDigit);
     return std::nullopt;
 }

 // Replaces every well-formed "%XY" escape in `input` with the byte it encodes and interprets the result as UTF-8.
 static String decodeEscapeSequencesFromParsedURL(StringView input)
 {
     ASSERT(input.isAllASCII());

     // Nothing to decode if the input is too short for an escape or has no '%' at all.
     auto length = input.length();
     if (length < 3 || !input.contains('%'))
         return input.toString();

     // FIXME: This 100 is arbitrary. Should make a histogram of how this function is actually used to choose a better value.
     Vector<LChar, 100> percentDecoded;
     percentDecoded.reserveInitialCapacity(length);

     unsigned position = 0;
     while (position < length) {
         auto decodedCharacter = decodeEscapeSequence(input, position, length);
         if (!decodedCharacter) {
             // Not an escape: copy the code unit through unchanged.
             percentDecoded.uncheckedAppend(input[position]);
             ++position;
             continue;
         }
         percentDecoded.uncheckedAppend(*decodedCharacter);
         position += 3;
     }

     // FIXME: Is UTF-8 always the correct encoding?
     // FIXME: This returns a null string when we encounter an invalid UTF-8 sequence. Is that OK?
     return String::fromUTF8(percentDecoded.data(), percentDecoded.size());
 }

 String URL::user() const
 {
     // Percent-decode the user name as stored in the URL string.
     auto encoded = encodedUser();
     return decodeEscapeSequencesFromParsedURL(encoded);
 }

 String URL::password() const
 {
     // Percent-decode the password as stored in the URL string.
     auto encoded = encodedPassword();
     return decodeEscapeSequencesFromParsedURL(encoded);
 }

 StringView URL::encodedUser() const
 {
     // The user name occupies [m_userStart, m_userEnd) of the URL string.
     unsigned userLength = m_userEnd - m_userStart;
     return StringView(m_string).substring(m_userStart, userLength);
 }

 StringView URL::encodedPassword() const
 {
     // Equal offsets mean there is no password at all.
     if (m_passwordEnd == m_userEnd)
         return { };

     // Skip the one character sitting between the user name and the password.
     unsigned passwordStart = m_userEnd + 1;
     return StringView(m_string).substring(passwordStart, m_passwordEnd - passwordStart);
 }

 StringView URL::fragmentIdentifier() const
 {
     // The fragment is everything after the character at m_queryEnd.
     return hasFragmentIdentifier() ? StringView(m_string).substring(m_queryEnd + 1) : StringView();
 }

 URL URL::truncatedForUseAsBase() const
 {
     // Keep the string up to m_pathAfterLastSlash and parse that prefix as a new URL.
     auto baseString = m_string.left(m_pathAfterLastSlash);
     return URL(URL(), baseString);
 }

 #if !USE(CF)

 String URL::fileSystemPath() const
 {
     // Only file: URLs map to a file system path.
     if (!isLocalFile())
         return String();

     String decodedPath = decodeEscapeSequencesFromParsedURL(path());
 #if PLATFORM(WIN)
     decodedPath = fileSystemRepresentation(decodedPath);
 #endif
     return decodedPath;
 }

 #endif

 #if !ASSERT_ENABLED

 // Assertions are compiled out in this configuration, so the check is a no-op.
 static inline void assertProtocolIsGood(StringView) { }

 #else

 // Asserts that every code unit of `protocol` is printable, lowercase ASCII.
 static void assertProtocolIsGood(StringView protocol)
 {
     // FIXME: We probably don't need this function any more.
     // The isASCIIAlphaCaselessEqual function asserts that passed-in characters
     // are ones it can handle; the older code did not and relied on these checks.
     for (unsigned index = 0; index < protocol.length(); ++index) {
         auto character = protocol[index];
         ASSERT(isASCII(character));
         ASSERT(character > ' ');
         ASSERT(!isASCIIUpper(character));
         ASSERT(toASCIILowerUnchecked(character) == character);
     }
 }

 #endif

 // Lock that the testing-only default-port override map accessors require
 // (see their WTF_REQUIRES_LOCK annotations).
 static Lock defaultPortForProtocolMapForTestingLock;

 // Maps a protocol string to the default port registered for it for testing.
 using DefaultPortForProtocolMapForTesting = HashMap<String, uint16_t>;
 // Returns a reference to the pointer slot holding the override map; the slot stays null until a map is created.
 static DefaultPortForProtocolMapForTesting*& defaultPortForProtocolMapForTesting() WTF_REQUIRES_LOCK(defaultPortForProtocolMapForTestingLock)
 {
     static DefaultPortForProtocolMapForTesting* overrideMapSlot;
     return overrideMapSlot;
 }

 // Returns the override map, allocating it on first use.
 static DefaultPortForProtocolMapForTesting& ensureDefaultPortForProtocolMapForTesting() WTF_REQUIRES_LOCK(defaultPortForProtocolMapForTestingLock)
 {
     auto& mapSlot = defaultPortForProtocolMapForTesting();
     if (mapSlot)
         return *mapSlot;

     mapSlot = new DefaultPortForProtocolMapForTesting;
     return *mapSlot;
 }

 void registerDefaultPortForProtocolForTesting(uint16_t port, const String& protocol)
 {
     // Record `port` as the default for `protocol` in the testing override map, under its lock.
     Locker locker { defaultPortForProtocolMapForTestingLock };
     auto& overrideMap = ensureDefaultPortForProtocolMapForTesting();
     overrideMap.add(protocol, port);
 }

 void clearDefaultPortForProtocolMapForTesting()
 {
     Locker locker { defaultPortForProtocolMapForTestingLock };

     // Nothing to clear if no override was ever registered.
     auto* overrideMap = defaultPortForProtocolMapForTesting();
     if (!overrideMap)
         return;
     overrideMap->clear();
 }

 std::optional<uint16_t> defaultPortForProtocol(StringView protocol)
 {
     {
         // Ports registered for testing take precedence; the lock is held only for this lookup.
         Locker locker { defaultPortForProtocolMapForTestingLock };
         auto* overrideMap = defaultPortForProtocolMapForTesting();
         if (overrideMap) {
             auto entry = overrideMap->find(protocol.toStringWithoutCopying());
             if (entry != overrideMap->end())
                 return entry->value;
         }
     }

     // Otherwise defer to URLParser.
     return URLParser::defaultPortForProtocol(protocol);
 }

 bool isDefaultPortForProtocol(uint16_t port, StringView protocol)
 {
     // True only when the protocol has a default port and it equals `port`.
     auto defaultPort = defaultPortForProtocol(protocol);
     return defaultPort && *defaultPort == port;
 }

 bool URL::protocolIsJavaScript() const
 {
     // Delegate to the free function, which inspects the raw URL string.
     const auto& urlString = string();
     return WTF::protocolIsJavaScript(urlString);
 }

 bool URL::protocolIsInFTPFamily() const
 {
     // Delegate to the free function, which inspects the raw URL string.
     const auto& urlString = string();
     return WTF::protocolIsInFTPFamily(urlString);
 }

 bool URL::protocolIs(const char* protocol) const
 {
     assertProtocolIsGood(protocol);

     // JavaScript URLs are "valid" and should be executed even if URL decides they are invalid.
     // The free function protocolIsJavaScript() should be used instead.
     ASSERT(!equalLettersIgnoringASCIICase(StringView(protocol), "javascript"));

     if (!m_isValid)
         return false;

     // Walk the scheme and the argument in lockstep, without making a new string object.
     unsigned index = 0;
     for (; index < m_schemeEnd; ++index) {
         char expected = protocol[index];
         if (!expected)
             return false; // The argument is shorter than the scheme.
         if (!isASCIIAlphaCaselessEqual(m_string[index], expected))
             return false;
     }

     // The argument must end exactly where the scheme does.
     return !protocol[index];
 }

 bool URL::protocolIs(StringView protocol) const
 {
     assertProtocolIsGood(protocol);

     // An invalid URL, or a scheme of a different length, cannot match.
     if (!m_isValid || m_schemeEnd != protocol.length())
         return false;

     // Compare code unit by code unit, without making a new string object.
     unsigned index = 0;
     while (index < m_schemeEnd) {
         if (!isASCIIAlphaCaselessEqual(m_string[index], protocol[index]))
             return false;
         ++index;
     }
     return true;
 }

 StringView URL::query() const
 {
     // Equal offsets mean there is no query.
     if (m_queryEnd == m_pathEnd)
         return { };

     // Skip the character at m_pathEnd; the query text runs up to m_queryEnd.
     unsigned queryStart = m_pathEnd + 1;
     return StringView(m_string).substring(queryStart, m_queryEnd - queryStart);
 }

 StringView URL::path() const
 {
     if (m_isValid) {
         // The path spans [pathStart(), m_pathEnd).
         unsigned start = pathStart();
         return StringView(m_string).substring(start, m_pathEnd - start);
     }
     return { };
 }

 bool URL::setProtocol(StringView newProtocol)
 {
     // Firefox and IE remove everything after the first ':'.
     auto colonIndex = newProtocol.find(':');
     auto canonicalScheme = URLParser::maybeCanonicalizeScheme(newProtocol.substring(0, colonIndex));
     if (!canonicalScheme)
         return false;

     // An invalid URL has no scheme to replace: prepend the new one and reparse the whole string.
     if (!m_isValid) {
         parse(makeString(*canonicalScheme, ':', m_string));
         return true;
     }

     // Leave the URL untouched when switching to "file" while it has user info or a port.
     bool hasUserInfoOrPort = m_passwordEnd != m_userStart || port();
     if (hasUserInfoOrPort && *canonicalScheme == "file")
         return true;

     // Likewise leave a file URL with an empty host untouched.
     if (isLocalFile() && host().isEmpty())
         return true;

     // Swap the scheme and reparse; the remainder starts at the ':' after the old scheme.
     auto afterOldScheme = StringView(m_string).substring(m_schemeEnd);
     parse(makeString(*canonicalScheme, afterOldScheme));
     return true;
 }

 // Appends the IDN-encoded form of the host name in `string` to `buffer`.
 // The result will not be null terminated.
 // Return value of false means error in encoding.
 static bool appendEncodedHostname(Vector<UChar, 512>& buffer, StringView string)
 {
     // hostnameBuffer needs to be big enough to hold an IDN-encoded name.
     // For host names bigger than this, we won't do IDN encoding, which is almost certainly OK.
     // All-ASCII names are appended unchanged as well.
     bool needsIDNEncoding = string.length() <= URLParser::hostnameBufferLength && !string.isAllASCII();
     if (!needsIDNEncoding) {
         append(buffer, string);
         return true;
     }

     UChar hostnameBuffer[URLParser::hostnameBufferLength];
     UErrorCode error = U_ZERO_ERROR;
     UIDNAInfo processingDetails = UIDNA_INFO_INITIALIZER;
     int32_t numCharactersConverted = uidna_nameToASCII(&URLParser::internationalDomainNameTranscoder(),
         string.upconvertedCharacters(), string.length(), hostnameBuffer, URLParser::hostnameBufferLength, &processingDetails, &error);

     // Reject on an ICU failure, on any error outside the allowed set, or on empty output.
     if (!U_SUCCESS(error))
         return false;
     if (processingDetails.errors & ~URLParser::allowedNameToASCIIErrors)
         return false;
     if (!numCharactersConverted)
         return false;

     buffer.append(hostnameBuffer, numCharactersConverted);
     return true;
 }

 unsigned URL::hostStart() const
 {
     // With no user info the host starts at m_passwordEnd; otherwise one character later.
     if (m_passwordEnd == m_userStart)
         return m_passwordEnd;
     return m_passwordEnd + 1;
 }

 unsigned URL::credentialsEnd() const
 {
     // Include '@' too if we have it.
     unsigned end = m_passwordEnd;
     bool followedByAtSign = end != m_hostEnd && m_string[end] == '@';
     return followedByAtSign ? end + 1 : end;
 }

 // True for '/', '#' and '?'.
 static bool forwardSlashHashOrQuestionMark(UChar c)
 {
     switch (c) {
     case '/':
     case '#':
     case '?':
         return true;
     default:
         return false;
     }
 }

 // True for '\\' in addition to '/', '#' and '?'.
 static bool slashHashOrQuestionMark(UChar c)
 {
     if (c == '\\')
         return true;
     return forwardSlashHashOrQuestionMark(c);
 }

 void URL::setHost(StringView newHost)
 {
     if (!m_isValid)
         return;

     // A colon is only accepted when the host starts with '['.
     if (newHost.contains(':') && !newHost.startsWith('['))
         return;

     const bool isSpecial = hasSpecialScheme();

     // Cut the host at the first terminator; special schemes also treat '\\' as one.
     auto isTerminator = isSpecial ? slashHashOrQuestionMark : forwardSlashHashOrQuestionMark;
     auto terminatorIndex = newHost.find(isTerminator);
     if (terminatorIndex != notFound)
         newHost = newHost.substring(0, terminatorIndex);

     // Only special schemes get their host IDN-encoded; give up if that fails.
     Vector<UChar, 512> encodedHostName;
     if (isSpecial && !appendEncodedHostname(encodedHostName, newHost))
         return;

     StringView hostToInsert = isSpecial ? StringView(encodedHostName.data(), encodedHostName.size()) : newHost;
     const char* slashes = (m_userStart == m_schemeEnd + 1U) ? "//" : "";

     // Splice the new host between the text before the old host and the text after it, then reparse.
     parse(makeString(
         StringView(m_string).left(hostStart()),
         slashes,
         hostToInsert,
         StringView(m_string).substring(m_hostEnd)
     ));
 }

 void URL::setPort(std::optional<uint16_t> port)
 {
     if (!m_isValid)
         return;

     if (port) {
         // Rebuild as: text up to the host end, ":<port>", then the text from the path onwards.
         auto beforePort = StringView(m_string).left(m_hostEnd);
         auto fromPath = StringView(m_string).substring(pathStart());
         parse(makeString(beforePort, ':', static_cast<unsigned>(*port), fromPath));
         return;
     }

     // No port requested: strip the existing port text, if any.
     remove(m_hostEnd, m_portLength);
 }

 void URL::setHostAndPort(StringView hostAndPort)
 {
     if (!m_isValid)
         return;

     // Split at the last colon: the host is before it, the port after it.
     StringView hostPart = hostAndPort;
     StringView portPart;
     if (auto lastColon = hostPart.reverseFind(':'); lastColon != notFound) {
         portPart = hostPart.substring(lastColon + 1);
         hostPart = hostPart.substring(0, lastColon);
         // Multiple colons are acceptable only in case of IPv6.
         if (hostPart.contains(':') && !hostPart.startsWith('['))
             return;
         // A port that does not parse as a 16-bit integer is dropped; the host is still applied.
         if (!parseInteger<uint16_t>(portPart))
             portPart = { };
     }

     // An empty host removes the existing host and port text.
     if (hostPart.isEmpty()) {
         unsigned hostBegin = hostStart();
         remove(hostBegin, pathStart() - hostBegin);
         return;
     }

     // Only special schemes get their host IDN-encoded; give up if that fails.
     const bool isSpecial = hasSpecialScheme();
     Vector<UChar, 512> encodedHostName;
     if (isSpecial && !appendEncodedHostname(encodedHostName, hostPart))
         return;

     StringView hostToInsert = isSpecial ? StringView(encodedHostName.data(), encodedHostName.size()) : hostPart;
     const char* slashes = (m_userStart == m_schemeEnd + 1U) ? "//" : "";
     const char* portSeparator = portPart.isEmpty() ? "" : ":";

     parse(makeString(
         StringView(m_string).left(hostStart()),
         slashes,
         hostToInsert,
         portSeparator,
         portPart,
         StringView(m_string).substring(pathStart())
     ));
 }

 // Percent-encodes every UTF-8 byte of `input` for which `shouldEncode` returns true.
 // Input that needs no encoding is returned as-is (converted to String for StringView).
 template<typename StringType>
 static String percentEncodeCharacters(const StringType& input, bool(*shouldEncode)(UChar))
 {
     // Fast path: look for the first code unit that needs escaping.
     bool needsEncoding = false;
     for (size_t i = 0; i < input.length(); ++i) {
         if (UNLIKELY(shouldEncode(input[i]))) {
             needsEncoding = true;
             break;
         }
     }

     if (!needsEncoding) {
         if constexpr (std::is_same_v<StringType, StringView>)
             return input.toString();
         else
             return input;
     }

     // Slow path: convert to UTF-8 and escape byte by byte.
     CString utf8 = input.utf8();
     auto* bytes = utf8.data();
     auto byteCount = utf8.length();
     StringBuilder builder;
     for (unsigned j = 0; j < byteCount; j++) {
         auto byte = bytes[j];
         if (!shouldEncode(byte)) {
             builder.append(byte);
             continue;
         }
         builder.append('%');
         builder.append(upperNibbleToASCIIHexDigit(byte));
         builder.append(lowerNibbleToASCIIHexDigit(byte));
     }
     return builder.toString();
 }

 void URL::parse(const String& string)
 {
     *this = URLParser(string).result();
 }

 void URL::remove(unsigned start, unsigned length)
 {
     if (!length)
         return;
     ASSERT(start < m_string.length());
     ASSERT(length <= m_string.length() - start);

     auto stringAfterRemoval = WTFMove(m_string);
     stringAfterRemoval.remove(start, length);
     parse(stringAfterRemoval);
 }

 void URL::setUser(StringView newUser)
 {
     if (!m_isValid)
         return;

     unsigned end = m_userEnd;

     if (newUser.isEmpty()) {
         // Remove '@' if we now have neither user nor password.
         if (m_userEnd == m_passwordEnd && end != m_hostEnd && m_string[end] == '@')
             end += 1;
         remove(m_userStart, end - m_userStart);
         return;
     }

     const char* slashes = (m_userStart == m_schemeEnd + 1U) ? "//" : "";
     // An '@' must be added when the old user text is not already followed by one.
     bool needSeparator = end == m_hostEnd || (end == m_passwordEnd && m_string[end] != '@');

     parse(makeString(
         StringView(m_string).left(m_userStart),
         slashes,
         percentEncodeCharacters(newUser, URLParser::isInUserInfoEncodeSet),
         needSeparator ? "@" : "",
         StringView(m_string).substring(end)
     ));
 }

 void URL::setPassword(StringView newPassword)
 {
     if (!m_isValid)
         return;

     if (newPassword.isEmpty()) {
         // With no user name either, the removal extends to credentialsEnd(); otherwise only the password goes.
         unsigned end = m_userStart == m_userEnd ? credentialsEnd() : m_passwordEnd;
         remove(m_userEnd, end - m_userEnd);
         return;
     }

     // Insert ":<password>@" after the user name, adding "//" when the user info starts right after "scheme:".
     const char* prefix = (m_userEnd == m_schemeEnd + 1U) ? "//:" : ":";
     parse(makeString(
         StringView(m_string).left(m_userEnd),
         prefix,
         percentEncodeCharacters(newPassword, URLParser::isInUserInfoEncodeSet),
         '@',
         StringView(m_string).substring(credentialsEnd())
     ));
 }

 void URL::removeCredentials()
 {
     // Strip everything from the start of the user name through credentialsEnd().
     if (m_isValid)
         remove(m_userStart, credentialsEnd() - m_userStart);
 }

 void URL::setFragmentIdentifier(StringView identifier)
 {
     if (!m_isValid)
         return;

     // Drop any existing fragment, append "#<identifier>", and reparse with the encoding sentinel named below.
     auto withNewFragment = makeString(StringView(m_string).left(m_queryEnd), '#', identifier);
     *this = URLParser(withNewFragment, { }, URLTextEncodingSentinelAllowingC0AtEndOfHash).result();
 }

 void URL::removeFragmentIdentifier()
 {
     // Truncate the string at the end of the query.
     if (m_isValid)
         m_string = m_string.left(m_queryEnd);
 }

 void URL::removeQueryAndFragmentIdentifier()
 {
     if (m_isValid) {
         // Truncate the string at the end of the path and mark the query as empty.
         m_string = m_string.left(m_pathEnd);
         m_queryEnd = m_pathEnd;
     }
 }

 void URL::setQuery(StringView newQuery)
 {
     // FIXME: Consider renaming this function to setEncodedQuery and/or calling percentEncodeCharacters the way setPath does.
     // https://webkit.org/b/161176

     if (!m_isValid)
         return;

     // A '?' is supplied unless the query already starts with one or is null.
     bool needsQuestionMark = !newQuery.startsWith('?') && !newQuery.isNull();
     auto throughPath = StringView(m_string).left(m_pathEnd);
     auto afterOldQuery = StringView(m_string).substring(m_queryEnd);

     parse(makeString(throughPath, needsQuestionMark ? "?" : "", newQuery, afterOldQuery));
 }

 // Percent-encodes '?', '#' and non-ASCII characters in `path`.
 static String escapePathWithoutCopying(StringView path)
 {
     auto needsEscaping = [] (UChar character) {
         if (!isASCII(character))
             return true;
         return character == '?' || character == '#';
     };
     return percentEncodeCharacters(path, needsEscaping);
 }

 void URL::setPath(StringView path)
 {
     if (!m_isValid)
         return;

     const bool isSpecial = hasSpecialScheme();

     // No '/' is prepended when the path already begins with a separator ('\\' counts only for special or file schemes)...
     bool startsWithSeparator = path.startsWith('/') || (path.startsWith('\\') && (isSpecial || protocolIs("file")));
     // ...or when a non-special URL gets an empty path and the path does not start right after "scheme:".
     bool mayStayEmpty = !isSpecial && path.isEmpty() && m_schemeEnd + 1U < pathStart();
     const char* leadingSlash = (startsWithSeparator || mayStayEmpty) ? "" : "/";

     // Non-special URLs with an empty host get "/." before a path that starts with "//" and has more after it.
     bool needsSlashDot = !isSpecial && host().isEmpty() && path.startsWith("//") && path.length() > 2;
     const char* slashDot = needsSlashDot ? "/." : "";

     parse(makeString(
         StringView(m_string).left(pathStart()),
         leadingSlash,
         slashDot,
         escapePathWithoutCopying(path),
         StringView(m_string).substring(m_pathEnd)
     ));
 }

 StringView URL::stringWithoutQueryOrFragmentIdentifier() const
 {
     // Invalid URLs are returned whole; valid ones are cut at the end of the path.
     StringView whole(m_string);
     return m_isValid ? whole.left(pathEnd()) : whole;
 }

 StringView URL::stringWithoutFragmentIdentifier() const
 {
     // Invalid URLs are returned whole; valid ones are cut at the end of the query.
     StringView whole(m_string);
     return m_isValid ? whole.left(m_queryEnd) : whole;
 }

 bool equalIgnoringFragmentIdentifier(const URL& a, const URL& b)
 {
     // Compare both URL strings with their fragments cut off.
     auto aWithoutFragment = a.stringWithoutFragmentIdentifier();
     auto bWithoutFragment = b.stringWithoutFragmentIdentifier();
     return aWithoutFragment == bWithoutFragment;
 }

 bool protocolHostAndPortAreEqual(const URL& a, const URL& b)
 {
     // Schemes of different lengths cannot match.
     if (a.m_schemeEnd != b.m_schemeEnd)
         return false;

     // Neither can hosts of different lengths.
     const unsigned hostStartA = a.hostStart();
     const unsigned hostStartB = b.hostStart();
     const unsigned hostLength = a.m_hostEnd - hostStartA;
     if (hostLength != b.m_hostEnd - hostStartB)
         return false;

     // Compares `count` code units of the two URL strings, ignoring ASCII case.
     auto rangesMatch = [&] (unsigned startA, unsigned startB, unsigned count) {
         for (unsigned i = 0; i < count; ++i) {
             if (toASCIILower(a.string()[startA + i]) != toASCIILower(b.string()[startB + i]))
                 return false;
         }
         return true;
     };

     // Check the scheme
     if (!rangesMatch(0, 0, a.m_schemeEnd))
         return false;

     // And the host
     if (!rangesMatch(hostStartA, hostStartB, hostLength))
         return false;

     // And finally the port
     return a.port() == b.port();
 }

 bool URL::isMatchingDomain(StringView domain) const
 {
     // FIXME: Consider moving this to an appropriate place in WebCore's plug-in code; don't want people tempted to use this instead of SecurityOrigin.

     if (isNull())
         return false;

     // An empty domain matches every non-null URL.
     if (domain.isEmpty())
         return true;

     if (!protocolIsInHTTPFamily())
         return false;

     auto host = this->host();
     if (!host.endsWith(domain))
         return false;

     // Either the host is exactly the domain, or the domain is preceded by a '.' in the host.
     if (host.length() == domain.length())
         return true;
     return host[host.length() - domain.length() - 1] == '.';
 }

 // FIXME: Rename this so it's clear that it does the appropriate escaping for URL query field values.
 String encodeWithURLEscapeSequences(const String& input)
 {
     // Escape every byte that URLParser places in its user-info encode set.
     bool (*shouldEncode)(UChar) = URLParser::isInUserInfoEncodeSet;
     return percentEncodeCharacters(input, shouldEncode);
 }

 bool URL::isHierarchical() const
 {
     if (!m_isValid)
         return false;
     ASSERT(m_string[m_schemeEnd] == ':');

     // Hierarchical here means the character right after "scheme:" is a slash.
     unsigned afterColon = m_schemeEnd + 1;
     return m_string[afterColon] == '/';
 }

 // Returns true when `string`, after skipping ignorable characters, starts with `protocol` followed by ':'.
 static bool protocolIsInternal(StringView string, const char* protocol)
 {
     assertProtocolIsGood(protocol);

     bool sawFirstSignificantCharacter = false;
     for (auto codeUnit : string.codeUnits()) {
         if (!sawFirstSignificantCharacter) {
             // Skip leading whitespace and control characters.
             if (shouldTrimFromURL(codeUnit))
                 continue;
             sawFirstSignificantCharacter = true;
         } else if (codeUnit == '\t' || codeUnit == '\r' || codeUnit == '\n') {
             // Skip tabs and newlines even later in the protocol.
             continue;
         }

         // Once the expected protocol is used up, the next significant character must be the colon.
         char expectedCharacter = *protocol++;
         if (!expectedCharacter)
             return codeUnit == ':';
         if (!isASCIIAlphaCaselessEqual(codeUnit, expectedCharacter))
             return false;
     }

     // The string ended before a ':' was reached.
     return false;
 }

 bool protocolIs(StringView string, const char* protocol)
 {
     // Public entry point for the file-local matcher.
     const bool matches = protocolIsInternal(string, protocol);
     return matches;
 }

 #ifndef NDEBUG

 void URL::print() const
 {
     // Write the URL string as UTF-8, followed by a newline, to stdout.
     auto utf8 = m_string.utf8();
     printf("%s\n", utf8.data());
 }

 #endif

 void URL::dump(PrintStream& out) const
 {
     // Print the URL string as-is.
     const String& urlString = m_string;
     out.print(urlString);
 }

 String URL::strippedForUseAsReferrer() const
 {
     if (!m_isValid)
         return m_string;

     unsigned end = credentialsEnd();

     // With neither credentials nor text after the query, the string can be returned unchanged.
     bool credentialsPresent = m_userStart != end;
     bool hasTextAfterQuery = m_queryEnd != m_string.length();
     if (!credentialsPresent && !hasTextAfterQuery)
         return m_string;

     // Otherwise keep the text before the credentials plus the text from after them up to the end of the query.
     StringView whole(m_string);
     return makeString(
         whole.substring(0, m_userStart),
         whole.substring(end, m_queryEnd - end)
     );
 }

 bool URL::isLocalFile() const
 {
     // Including feed here might be a bad idea since drag and drop uses this check
     // and including feed would allow feeds to potentially let someone's blog
     // read the contents of the clipboard on a drag, even without a drop.
     // Likewise with using the FrameLoader::shouldTreatURLAsLocal() function.
     const bool isFileScheme = protocolIs("file");
     return isFileScheme;
 }

 bool protocolIsJavaScript(StringView string)
 {
     // Works on the raw string, so it needs no parsed URL.
     const bool isJavaScript = protocolIsInternal(string, "javascript");
     return isJavaScript;
 }

 bool protocolIsInFTPFamily(StringView url)
 {
     // Do the comparison without making a new string object.
     auto length = url.length();
     if (length < 4)
         return false;

     bool startsWithFTP = isASCIIAlphaCaselessEqual(url[0], 'f')
         && isASCIIAlphaCaselessEqual(url[1], 't')
         && isASCIIAlphaCaselessEqual(url[2], 'p');
     if (!startsWithFTP)
         return false;

     // Accept "ftp:" and "ftps:".
     if (url[3] == ':')
         return true;
     return isASCIIAlphaCaselessEqual(url[3], 's') && length >= 5 && url[4] == ':';
 }

 bool protocolIsInHTTPFamily(StringView url)
 {
     // Do the comparison without making a new string object.
     auto length = url.length();
     if (length < 5)
         return false;

     bool startsWithHTTP = isASCIIAlphaCaselessEqual(url[0], 'h')
         && isASCIIAlphaCaselessEqual(url[1], 't')
         && isASCIIAlphaCaselessEqual(url[2], 't')
         && isASCIIAlphaCaselessEqual(url[3], 'p');
     if (!startsWithHTTP)
         return false;

     // Accept "http:" and "https:".
     if (url[4] == ':')
         return true;
     return isASCIIAlphaCaselessEqual(url[4], 's') && length >= 6 && url[5] == ':';
 }


 static StaticStringImpl aboutBlankString { "about:blank" };

 // Returns the shared "about:blank" URL, constructing it exactly once on first use.
 const URL& aboutBlankURL()
 {
     static LazyNeverDestroyed<URL> blankURL;
     static std::once_flag initializationFlag;
     std::call_once(initializationFlag, [&] {
         blankURL.construct(URL(), &aboutBlankString);
     });
     return blankURL;
 }

 static StaticStringImpl aboutSrcDocString { "about:srcdoc" };

 // Returns the shared "about:srcdoc" URL, constructing it exactly once on first use.
 const URL& aboutSrcDocURL()
 {
     static LazyNeverDestroyed<URL> srcDocURL;
     static std::once_flag initializationFlag;
     std::call_once(initializationFlag, [&] {
         srcDocURL.construct(URL(), &aboutSrcDocString);
     });
     return srcDocURL;
 }

 bool URL::protocolIsAbout() const
 {
     // True for valid URLs whose scheme is "about".
     const bool isAboutScheme = protocolIs("about");
     return isAboutScheme;
 }

 // Returns false when the URL's explicit port is on the blocked list and no scheme-specific exception applies.
 bool portAllowed(const URL& url)
 {
     // Since most URLs don't have a port, return early for the "no port" case.
     auto port = url.port();
     if (!port)
         return true;

     // This blocked port list matches the port blocking that Mozilla implements.
     // See http://www.mozilla.org/projects/netlib/PortBanning.html for more information.
     // The list must stay sorted because it is searched with std::binary_search below.
     static const uint16_t blockedPortList[] = {
         1,    // tcpmux
         7,    // echo
         9,    // discard
         11,   // systat
         13,   // daytime
         15,   // netstat
         17,   // qotd
         19,   // chargen
         20,   // FTP-data
         21,   // FTP-control
         22,   // SSH
         23,   // telnet
         25,   // SMTP
         37,   // time
         42,   // name
         43,   // nicname
         53,   // domain
         69,   // TFTP
         77,   // priv-rjs
         79,   // finger
         87,   // ttylink
         95,   // supdup
         101,  // hostname
         102,  // iso-tsap
         103,  // gppitnp
         104,  // acr-nema
         109,  // POP2
         110,  // POP3
         111,  // sunrpc
         113,  // auth
         115,  // SFTP
         117,  // uucp-path
         119,  // nntp
         123,  // NTP
         135,  // loc-srv / epmap
         137,  // NetBIOS
         139,  // netbios
         143,  // IMAP2
         161,  // SNMP
         179,  // BGP
         389,  // LDAP
         427,  // SLP (Also used by Apple Filing Protocol)
         465,  // SMTP+SSL
         512,  // print / exec
         513,  // login
         514,  // shell
         515,  // printer
         526,  // tempo
         530,  // courier
         531,  // Chat
         532,  // netnews
         540,  // UUCP
         548,  // afpovertcp [Apple addition]
         554,  // rtsp
         556,  // remotefs
         563,  // NNTP+SSL
         587,  // ESMTP
         601,  // syslog-conn
         636,  // LDAP+SSL
         989,  // ftps-data
         990,  // ftps
         993,  // IMAP+SSL
         995,  // POP3+SSL
         1719, // H323 (RAS)
         1720, // H323 (Q931)
         1723, // H323 (H245)
         2049, // NFS
         3659, // apple-sasl / PasswordServer [Apple addition]
         4045, // lockd
         4190, // ManageSieve [Apple addition]
         5060, // SIP
         5061, // SIPS
         6000, // X11
         6566, // SANE
         6665, // Alternate IRC [Apple addition]
         6666, // Alternate IRC [Apple addition]
         6667, // Standard IRC [Apple addition]
         6668, // Alternate IRC [Apple addition]
         6669, // Alternate IRC [Apple addition]
         6679, // Alternate IRC SSL [Apple addition]
         6697, // IRC+SSL [Apple addition]
         10080, // amanda
     };

     // If the port is not in the blocked port list, allow it.
     ASSERT(std::is_sorted(std::begin(blockedPortList), std::end(blockedPortList)));
     const uint16_t portNumber = port.value();
     const bool isBlocked = std::binary_search(std::begin(blockedPortList), std::end(blockedPortList), portNumber);
     if (!isBlocked)
         return true;

     // Allow ports 21 and 22 for FTP URLs, as Mozilla does.
     const bool isExemptForFTP = portNumber == 21 || portNumber == 22;
     if (isExemptForFTP && url.protocolIs("ftp"))
         return true;

     // Allow any port number in a file URL, since the port number is ignored.
     return url.protocolIs("file");
 }

 // Extracts the lowercased MIME type that follows "data:" in a data URL string.
 String mimeTypeFromDataURL(StringView dataURL)
 {
     ASSERT(protocolIsInternal(dataURL, "data"));

     // Number of characters in "data:"; the MIME type begins right after them.
     constexpr unsigned mimeTypeStart = 5;

     // FIXME: What's the right behavior when the URL has a comma first, but a semicolon later?
     // Currently this code will break at the semicolon in that case; should add a test.
     auto index = dataURL.find(';', mimeTypeStart);
     if (index == notFound)
         index = dataURL.find(',', mimeTypeStart);

     // FIXME: There was an old comment here that made it sound like this should be returning text/plain.
     // But we have been returning empty string here for some time, so not changing its behavior at this time.
     if (index == notFound)
         return emptyString();

     // A delimiter directly after "data:" means no type was given.
     if (index == mimeTypeStart)
         return "text/plain"_s;

     ASSERT(index >= 5);
     return dataURL.substring(mimeTypeStart, index - mimeTypeStart).convertToASCIILowercase();
 }

 // Returns the URL string unchanged when it is at most `length` code units long; otherwise
 // returns a leading and a trailing piece of it joined by "...".
 String URL::stringCenterEllipsizedToLength(unsigned length) const
 {
     if (m_string.length() <= length)
         return m_string;

     // Clamp both piece lengths at zero: the unsigned expressions `length / 2 - 1` and
     // `length / 2 - 2` would wrap around to huge values for length < 4.
     // For length >= 4 the values are exactly what the unclamped expressions produce.
     unsigned half = length / 2;
     unsigned prefixLength = half > 1 ? half - 1 : 0;
     unsigned suffixLength = half > 2 ? half - 2 : 0;

     return makeString(StringView(m_string).left(prefixLength), "...", StringView(m_string).right(suffixLength));
 }

 URL URL::fakeURLWithRelativePart(StringView relativePart)
 {
     // "webkit-fake-url://<freshly created UUID string>/<relativePart>"
     auto fakeURLString = makeString("webkit-fake-url://", createCanonicalUUIDString(), '/', relativePart);
     return URL(URL(), fakeURLString);
 }

 URL URL::fileURLWithFileSystemPath(StringView path)
 {
     // Make sure the path part starts with '/', escape it, and parse the resulting file URL string.
     const char* leadingSlash = path.startsWith('/') ? "" : "/";
     auto fileURLString = makeString("file://", leadingSlash, escapePathWithoutCopying(path));
     return URL(URL(), fileURLString);
 }

 // Returns the part of the URL string between m_pathEnd and m_queryEnd, or an empty view when
 // that range is empty.
 StringView URL::queryWithLeadingQuestionMark() const
 {
     const bool hasQuery = m_queryEnd > m_pathEnd;
     if (!hasQuery)
         return { };

     const auto queryLength = m_queryEnd - m_pathEnd;
     return StringView(m_string).substring(m_pathEnd, queryLength);
 }

 // Returns everything in the URL string from m_queryEnd onwards, or an empty view when the URL
 // is invalid or nothing follows m_queryEnd.
 StringView URL::fragmentIdentifierWithLeadingNumberSign() const
 {
     if (m_isValid && m_queryEnd < m_string.length())
         return StringView(m_string).substring(m_queryEnd);

     return { };
 }

 // True when protocolIsAbout() holds and the path is exactly "blank".
 bool URL::isAboutBlank() const
 {
     if (!protocolIsAbout())
         return false;

     return path() == "blank";
 }

 // True when protocolIsAbout() holds and the path is exactly "srcdoc".
 bool URL::isAboutSrcDoc() const
 {
     if (!protocolIsAbout())
         return false;

     return path() == "srcdoc";
 }

 // Writes the URL's string to the text stream and returns the stream so calls can be chained.
 TextStream& operator<<(TextStream& ts, const URL& url)
 {
     return ts << url.string();
 }

 #if !PLATFORM(COCOA) && !USE(SOUP)

 // Returns true when |string| consists of exactly four '.'-separated octets, each made of one
 // to three decimal digits with no leading zero and a value of at most 255.
 static bool isIPv4Address(StringView string)
 {
     constexpr int requiredOctetCount = 4;
     constexpr unsigned maximumOctetLength = 3;
     constexpr int maximumOctetValue = 255;

     int octetCount = 0;
     for (const auto octet : string.splitAllowingEmptyEntries('.')) {
         // More than four entries can never form a valid address.
         if (octetCount >= requiredOctetCount)
             return false;

         const auto octetLength = octet.length();
         if (!octetLength || octetLength > maximumOctetLength)
             return false;

         int octetValue = 0;
         for (unsigned index = 0; index < octetLength; ++index) {
             const auto digit = octet[index];

             // The first digit of a multi-digit octet may not be '0'.
             const bool isLeadingDigitOfLongerOctet = !index && octetLength > 1;
             const auto smallestAllowedDigit = isLeadingDigitOfLongerOctet ? '1' : '0';
             if (digit < smallestAllowedDigit || digit > '9')
                 return false;

             octetValue = octetValue * 10 + (digit - '0');
         }

         if (octetValue > maximumOctetValue)
             return false;

         ++octetCount;
     }

     return octetCount == requiredOctetCount;
 }

 // Returns true when |string| looks like a textual IPv6 address: ':'-separated hextets of one to
 // four hex digits, at most one "::" skip, and optionally a dotted IPv4 address as the last entry.
 // The string is walked entry by entry (split on ':', keeping empty entries) while a small state
 // machine tracks where a "::" has been seen.
 static bool isIPv6Address(StringView string)
 {
     // None:     no empty entry has been seen yet.
     // WillSkip: the very first entry was empty; another empty entry must follow immediately.
     // Skipping: the empty entry that makes up a "::" has been seen; waiting for what comes next.
     // Skipped:  a non-empty entry followed the "::"; no further empty entry is allowed.
     // Final:    nothing more may follow (an empty entry after the skip, or an IPv4 tail, was consumed).
     enum SkipState { None, WillSkip, Skipping, Skipped, Final };
     auto skipState = None;
     // Number of hex hextets accepted so far; an IPv4 tail is not counted here.
     auto count = 0;

     for (const auto hextet : string.splitAllowingEmptyEntries(':')) {
         // Any entry after eight hextets, or after the state became Final, is one too many.
         if (count >= 8 || skipState == Final)
             return false;

         const auto length = hextet.length();
         if (!length) {
             // :: may be used anywhere to skip 1 to 8 hextets, but only once.
             if (skipState == Skipped)
                 return false;

             if (skipState == None)
                 // An empty first entry (count is still 0) needs a second empty entry right after it.
                 skipState = !count ? WillSkip : Skipping;
             else if (skipState == WillSkip)
                 skipState = Skipping;
             else
                 // Only Skipping reaches this branch: another empty entry ends the address.
                 skipState = Final;
             continue;
         }

         // A non-empty entry directly after an empty first entry means there was only a single
         // leading ':' rather than "::".
         if (skipState == WillSkip)
             return false;

         if (skipState == Skipping)
             skipState = Skipped;

         if (length > 4) {
             // An IPv4 address may be used in place of the final two hextets.
             // Without a skip exactly six hextets must precede it; after a skip fewer than six.
             if ((skipState == None && count != 6) || (skipState == Skipped && count >= 6) || !isIPv4Address(hextet))
                 return false;

             skipState = Final;
             continue;
         }

         for (const auto codeUnit : hextet.codeUnits()) {
             // IPv6 allows leading zeroes.
             if (!isASCIIHexDigit(codeUnit))
                 return false;
         }

         count++;
     }

     // With no skip all eight hextets must be present; otherwise the walk must have ended in a
     // completed state. Ending in WillSkip or Skipping (a dangling ':') is rejected.
     return (count == 8 && skipState == None) || skipState == Skipped || skipState == Final;
 }

 // Checks |host| as an IPv6 address when it contains a ':', and as an IPv4 address otherwise.
 bool URL::hostIsIPAddress(StringView host)
 {
     if (host.contains(':'))
         return isIPv6Address(host);

     return isIPv4Address(host);
 }

 #endif

 // Returns the query parameters (key/value pairs) that are present in only one of the two URLs.
 // When either URL has no parameters, the other URL's parameters are returned as parsed.
 // Otherwise both lists are sorted by key and then by value and merged, so equal pairs cancel
 // one-to-one and the remaining pairs are returned in sorted order.
 Vector<KeyValuePair<String, String>> differingQueryParameters(const URL& firstURL, const URL& secondURL)
 {
     using QueryParameter = KeyValuePair<String, String>;

     auto firstParameters = URLParser::parseURLEncodedForm(firstURL.query());
     auto secondParameters = URLParser::parseURLEncodedForm(secondURL.query());

     if (firstParameters.isEmpty())
         return secondParameters;

     if (secondParameters.isEmpty())
         return firstParameters;

     // Three-way ordering: by key first, then by value for equal keys.
     auto order = [] (const QueryParameter& left, const QueryParameter& right) {
         int keyOrder = codePointCompare(left.key, right.key);
         return keyOrder ? keyOrder : codePointCompare(left.value, right.value);
     };
     auto precedes = [&order] (const QueryParameter& left, const QueryParameter& right) {
         return order(left, right) < 0;
     };

     std::sort(firstParameters.begin(), firstParameters.end(), precedes);
     std::sort(secondParameters.begin(), secondParameters.end(), precedes);

     Vector<QueryParameter> differences;
     auto firstIterator = firstParameters.begin();
     const auto firstEnd = firstParameters.end();
     auto secondIterator = secondParameters.begin();
     const auto secondEnd = secondParameters.end();

     // Walk both sorted lists in lockstep: equal pairs are dropped from both sides, and the
     // smaller of two unequal pairs can only exist on its own side.
     while (firstIterator != firstEnd && secondIterator != secondEnd) {
         const int ordering = order(*firstIterator, *secondIterator);
         if (!ordering) {
             ++firstIterator;
             ++secondIterator;
             continue;
         }

         if (ordering < 0)
             differences.append(*firstIterator++);
         else
             differences.append(*secondIterator++);
     }

     // Whatever is left on either side has no counterpart on the other.
     for (; firstIterator != firstEnd; ++firstIterator)
         differences.append(*firstIterator);

     for (; secondIterator != secondEnd; ++secondIterator)
         differences.append(*secondIterator);

     return differences;
 }

 // Returns the URL string up to pathEnd() for a valid URL, and the whole string for an invalid one.
 static StringView substringIgnoringQueryAndFragments(const URL& url)
 {
     StringView wholeString(url.string());
     if (url.isValid())
         return wholeString.left(url.pathEnd());

     return wholeString;
 }

 // Compares two URLs by the part of their strings that substringIgnoringQueryAndFragments() keeps.
 bool isEqualIgnoringQueryAndFragments(const URL& a, const URL& b)
 {
     auto prefixOfA = substringIgnoringQueryAndFragments(a);
     auto prefixOfB = substringIgnoringQueryAndFragments(b);
     return prefixOfA == prefixOfB;
 }

 // Removes every query parameter of |url| whose key is in |keysToRemove|.
 // The URL is left untouched when |keysToRemove| is empty or when none of its keys occur in the query.
 void removeQueryParameters(URL& url, const HashSet<String>& keysToRemove)
 {
     if (keysToRemove.isEmpty())
         return;

     auto queryParameters = URLParser::parseURLEncodedForm(url.query());
     auto removedCount = queryParameters.removeAllMatching([&keysToRemove] (const auto& parameter) {
         return keysToRemove.contains(parameter.key);
     });

     // Nothing matched: do not rewrite the query, so unrelated URLs keep their exact spelling.
     if (!removedCount)
         return;

     // The parsed keys and values are decoded, so joining them with raw '&' and '=' would corrupt
     // any pair containing reserved characters (e.g. a value of "1%262" would come back as "1&2").
     // Re-serialize the remaining pairs through the form serializer instead.
     // NOTE(review): assumes URLParser::serialize() yields an empty result for an empty list, so
     // that removing every parameter clears the query - confirm against URLParser.
     auto remainingQuery = URLParser::serialize(queryParameters);
     url.setQuery(remainingQuery);
 }

 } // namespace WTF