blob: 1ef5555020eb42e2f0f217d78e68d9c2acfa4123 [file] [log] [blame]
/*
* Copyright (C) 2004, 2007-2008, 2011-2013, 2015-2016 Apple Inc. All rights reserved.
* Copyright (C) 2012 Research In Motion Limited. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "URL.h"
#include "DecodeEscapeSequences.h"
#include "MIMETypeRegistry.h"
#include "TextEncoding.h"
#include "URLParser.h"
#include <stdio.h>
#include <unicode/uidna.h>
#include <wtf/HashMap.h>
#include <wtf/HexNumber.h>
#include <wtf/NeverDestroyed.h>
#include <wtf/StdLibExtras.h>
#include <wtf/UUID.h>
#include <wtf/text/CString.h>
#include <wtf/text/StringBuilder.h>
#include <wtf/text/StringHash.h>
// FIXME: This file makes too much use of the + operator on String.
// We either have to optimize that operator so it doesn't involve
// so many allocations, or change this to use StringBuffer instead.
using namespace WTF;
namespace WebCore {
// Scratch buffer types for narrow and UTF-16 text; 512 is the second
// template argument of WTF::Vector for both.
using CharBuffer = Vector<char, 512>;
using UCharBuffer = Vector<UChar, 512>;
// Marker value for "no valid port" (judging by the name; no use is visible in this part of the file).
static const unsigned invalidPortNumber = 0xFFFF;
// Bit flags naming the URL grammar classes a byte can belong to.
// characterClassTable stores the OR of these flags for every byte value,
// so all eight flags must fit in an unsigned char.
enum URLCharacterClasses {
    // alpha
    SchemeFirstChar = 0x01,
    // ( alpha | digit | "+" | "-" | "." )
    SchemeChar = 0x02,
    // mark = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
    // unreserved = alphanum | mark
    // ( unreserved | escaped | ";" | ":" | "&" | "=" | "+" | "$" | "," )
    UserInfoChar = 0x04,
    // alnum | "." | "-" | "%"
    // The above is what the specification says, but we are lenient to
    // match existing practice and also allow:
    // "_"
    HostnameChar = 0x08,
    // hexdigit | ":" | "%"
    IPv6Char = 0x10,
    // "#" | "?" | "/" | nul
    PathSegmentEndChar = 0x20,
    // not allowed in path
    BadChar = 0x40,
    // "\t" | "\n" | "\r"
    TabNewline = 0x80
};
// Lookup table indexed by byte value (0-255). Each entry is the bitwise OR of
// the URLCharacterClasses flags that apply to that byte; 0 means the byte
// belongs to none of the classes. Every byte from 127 (del) through 255 is
// marked BadChar only. Callers taking a UChar must range-check (c <= 0xff)
// before indexing, as the predicates that follow this table do.
static const unsigned char characterClassTable[256] = {
/* 0 nul */ PathSegmentEndChar, /* 1 soh */ BadChar,
/* 2 stx */ BadChar, /* 3 etx */ BadChar,
/* 4 eot */ BadChar, /* 5 enq */ BadChar, /* 6 ack */ BadChar, /* 7 bel */ BadChar,
/* 8 bs */ BadChar, /* 9 ht */ BadChar | TabNewline, /* 10 nl */ BadChar | TabNewline,
/* 11 vt */ BadChar, /* 12 np */ BadChar, /* 13 cr */ BadChar | TabNewline,
/* 14 so */ BadChar, /* 15 si */ BadChar,
/* 16 dle */ BadChar, /* 17 dc1 */ BadChar, /* 18 dc2 */ BadChar, /* 19 dc3 */ BadChar,
/* 20 dc4 */ BadChar, /* 21 nak */ BadChar, /* 22 syn */ BadChar, /* 23 etb */ BadChar,
/* 24 can */ BadChar, /* 25 em */ BadChar, /* 26 sub */ BadChar, /* 27 esc */ BadChar,
/* 28 fs */ BadChar, /* 29 gs */ BadChar, /* 30 rs */ BadChar, /* 31 us */ BadChar,
/* 32 sp */ BadChar, /* 33 ! */ UserInfoChar,
/* 34 " */ BadChar, /* 35 # */ PathSegmentEndChar | BadChar,
/* 36 $ */ UserInfoChar, /* 37 % */ UserInfoChar | HostnameChar | IPv6Char | BadChar,
/* 38 & */ UserInfoChar, /* 39 ' */ UserInfoChar,
/* 40 ( */ UserInfoChar, /* 41 ) */ UserInfoChar,
/* 42 * */ UserInfoChar, /* 43 + */ SchemeChar | UserInfoChar,
/* 44 , */ UserInfoChar,
/* 45 - */ SchemeChar | UserInfoChar | HostnameChar,
/* 46 . */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 47 / */ PathSegmentEndChar,
/* 48 0 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 49 1 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 50 2 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 51 3 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 52 4 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 53 5 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 54 6 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 55 7 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 56 8 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 57 9 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 58 : */ UserInfoChar | IPv6Char, /* 59 ; */ UserInfoChar,
/* 60 < */ BadChar, /* 61 = */ UserInfoChar,
/* 62 > */ BadChar, /* 63 ? */ PathSegmentEndChar | BadChar,
/* 64 @ */ 0,
/* 65 A */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 66 B */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 67 C */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 68 D */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 69 E */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 70 F */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 71 G */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 72 H */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 73 I */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 74 J */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 75 K */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 76 L */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 77 M */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 78 N */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 79 O */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 80 P */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 81 Q */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 82 R */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 83 S */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 84 T */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 85 U */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 86 V */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 87 W */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 88 X */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 89 Y */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 90 Z */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 91 [ */ 0,
/* 92 \ */ 0, /* 93 ] */ 0,
/* 94 ^ */ 0,
/* 95 _ */ UserInfoChar | HostnameChar,
/* 96 ` */ 0,
/* 97 a */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 98 b */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 99 c */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 100 d */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 101 e */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 102 f */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
/* 103 g */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 104 h */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 105 i */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 106 j */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 107 k */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 108 l */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 109 m */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 110 n */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 111 o */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 112 p */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 113 q */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 114 r */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 115 s */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 116 t */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 117 u */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 118 v */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 119 w */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 120 x */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 121 y */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 122 z */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
/* 123 { */ 0,
/* 124 | */ 0, /* 125 } */ 0, /* 126 ~ */ UserInfoChar, /* 127 del */ BadChar,
/* 128 */ BadChar, /* 129 */ BadChar, /* 130 */ BadChar, /* 131 */ BadChar,
/* 132 */ BadChar, /* 133 */ BadChar, /* 134 */ BadChar, /* 135 */ BadChar,
/* 136 */ BadChar, /* 137 */ BadChar, /* 138 */ BadChar, /* 139 */ BadChar,
/* 140 */ BadChar, /* 141 */ BadChar, /* 142 */ BadChar, /* 143 */ BadChar,
/* 144 */ BadChar, /* 145 */ BadChar, /* 146 */ BadChar, /* 147 */ BadChar,
/* 148 */ BadChar, /* 149 */ BadChar, /* 150 */ BadChar, /* 151 */ BadChar,
/* 152 */ BadChar, /* 153 */ BadChar, /* 154 */ BadChar, /* 155 */ BadChar,
/* 156 */ BadChar, /* 157 */ BadChar, /* 158 */ BadChar, /* 159 */ BadChar,
/* 160 */ BadChar, /* 161 */ BadChar, /* 162 */ BadChar, /* 163 */ BadChar,
/* 164 */ BadChar, /* 165 */ BadChar, /* 166 */ BadChar, /* 167 */ BadChar,
/* 168 */ BadChar, /* 169 */ BadChar, /* 170 */ BadChar, /* 171 */ BadChar,
/* 172 */ BadChar, /* 173 */ BadChar, /* 174 */ BadChar, /* 175 */ BadChar,
/* 176 */ BadChar, /* 177 */ BadChar, /* 178 */ BadChar, /* 179 */ BadChar,
/* 180 */ BadChar, /* 181 */ BadChar, /* 182 */ BadChar, /* 183 */ BadChar,
/* 184 */ BadChar, /* 185 */ BadChar, /* 186 */ BadChar, /* 187 */ BadChar,
/* 188 */ BadChar, /* 189 */ BadChar, /* 190 */ BadChar, /* 191 */ BadChar,
/* 192 */ BadChar, /* 193 */ BadChar, /* 194 */ BadChar, /* 195 */ BadChar,
/* 196 */ BadChar, /* 197 */ BadChar, /* 198 */ BadChar, /* 199 */ BadChar,
/* 200 */ BadChar, /* 201 */ BadChar, /* 202 */ BadChar, /* 203 */ BadChar,
/* 204 */ BadChar, /* 205 */ BadChar, /* 206 */ BadChar, /* 207 */ BadChar,
/* 208 */ BadChar, /* 209 */ BadChar, /* 210 */ BadChar, /* 211 */ BadChar,
/* 212 */ BadChar, /* 213 */ BadChar, /* 214 */ BadChar, /* 215 */ BadChar,
/* 216 */ BadChar, /* 217 */ BadChar, /* 218 */ BadChar, /* 219 */ BadChar,
/* 220 */ BadChar, /* 221 */ BadChar, /* 222 */ BadChar, /* 223 */ BadChar,
/* 224 */ BadChar, /* 225 */ BadChar, /* 226 */ BadChar, /* 227 */ BadChar,
/* 228 */ BadChar, /* 229 */ BadChar, /* 230 */ BadChar, /* 231 */ BadChar,
/* 232 */ BadChar, /* 233 */ BadChar, /* 234 */ BadChar, /* 235 */ BadChar,
/* 236 */ BadChar, /* 237 */ BadChar, /* 238 */ BadChar, /* 239 */ BadChar,
/* 240 */ BadChar, /* 241 */ BadChar, /* 242 */ BadChar, /* 243 */ BadChar,
/* 244 */ BadChar, /* 245 */ BadChar, /* 246 */ BadChar, /* 247 */ BadChar,
/* 248 */ BadChar, /* 249 */ BadChar, /* 250 */ BadChar, /* 251 */ BadChar,
/* 252 */ BadChar, /* 253 */ BadChar, /* 254 */ BadChar, /* 255 */ BadChar
};
// Percent-encoding classes, written as bit masks. Each mask is the previous
// one with its top bit dropped, so every later mask is contained in the
// earlier ones.
enum PercentEncodeCharacterClass {
    // Class names match the URL Standard; each class is a superset of the previous one.
    PercentEncodeSimple = 0xFF,
    PercentEncodeDefault = 0x7F,
    PercentEncodePassword = 0x3F,
    PercentEncodeUsername = 0x1F,
};
// Lookup table indexed by byte value (0-255). Each entry is either 0 or one of
// the PercentEncodeCharacterClass mask values. C0 controls (0-31), del (127)
// and all bytes >= 128 carry PercentEncodeSimple, the widest mask.
// NOTE(review): the consumer of this table is not visible in this part of the
// file; presumably a byte is escaped for class C when (entry & C) == C, which
// is what the nested masks suggest. Confirm against encodeWithURLEscapeSequences.
static const unsigned char percentEncodeClassTable[256] = {
/* 0 nul */ PercentEncodeSimple, /* 1 soh */ PercentEncodeSimple, /* 2 stx */ PercentEncodeSimple, /* 3 etx */ PercentEncodeSimple,
/* 4 eot */ PercentEncodeSimple, /* 5 enq */ PercentEncodeSimple, /* 6 ack */ PercentEncodeSimple, /* 7 bel */ PercentEncodeSimple,
/* 8 bs */ PercentEncodeSimple, /* 9 ht */ PercentEncodeSimple, /* 10 nl */ PercentEncodeSimple, /* 11 vt */ PercentEncodeSimple,
/* 12 np */ PercentEncodeSimple, /* 13 cr */ PercentEncodeSimple, /* 14 so */ PercentEncodeSimple, /* 15 si */ PercentEncodeSimple,
/* 16 dle */ PercentEncodeSimple, /* 17 dc1 */ PercentEncodeSimple, /* 18 dc2 */ PercentEncodeSimple, /* 19 dc3 */ PercentEncodeSimple,
/* 20 dc4 */ PercentEncodeSimple, /* 21 nak */ PercentEncodeSimple, /* 22 syn */ PercentEncodeSimple, /* 23 etb */ PercentEncodeSimple,
/* 24 can */ PercentEncodeSimple, /* 25 em */ PercentEncodeSimple, /* 26 sub */ PercentEncodeSimple, /* 27 esc */ PercentEncodeSimple,
/* 28 fs */ PercentEncodeSimple, /* 29 gs */ PercentEncodeSimple, /* 30 rs */ PercentEncodeSimple, /* 31 us */ PercentEncodeSimple,
/* 32 sp */ PercentEncodeDefault,
/* 33 ! */ 0,
/* 34 " */ PercentEncodeDefault,
/* 35 # */ PercentEncodeDefault,
/* 36 $ */ 0,
/* 37 % */ 0,
/* 38 & */ 0,
/* 39 ' */ 0,
/* 40 ( */ 0,
/* 41 ) */ 0,
/* 42 * */ 0,
/* 43 + */ 0,
/* 44 , */ 0,
/* 45 - */ 0,
/* 46 . */ 0,
/* 47 / */ PercentEncodePassword,
/* 48 0 */ 0, /* 49 1 */ 0, /* 50 2 */ 0, /* 51 3 */ 0,
/* 52 4 */ 0, /* 53 5 */ 0, /* 54 6 */ 0, /* 55 7 */ 0,
/* 56 8 */ 0, /* 57 9 */ 0,
/* 58 : */ PercentEncodeUsername,
/* 59 ; */ 0,
/* 60 < */ PercentEncodeDefault,
/* 61 = */ 0,
/* 62 > */ PercentEncodeDefault,
/* 63 ? */ PercentEncodeDefault,
/* 64 @ */ PercentEncodePassword,
/* 65 A */ 0, /* 66 B */ 0, /* 67 C */ 0, /* 68 D */ 0,
/* 69 E */ 0, /* 70 F */ 0, /* 71 G */ 0, /* 72 H */ 0,
/* 73 I */ 0, /* 74 J */ 0, /* 75 K */ 0, /* 76 L */ 0,
/* 77 M */ 0, /* 78 N */ 0, /* 79 O */ 0, /* 80 P */ 0,
/* 81 Q */ 0, /* 82 R */ 0, /* 83 S */ 0, /* 84 T */ 0,
/* 85 U */ 0, /* 86 V */ 0, /* 87 W */ 0, /* 88 X */ 0,
/* 89 Y */ 0, /* 90 Z */ 0,
/* 91 [ */ 0,
/* 92 \ */ PercentEncodePassword,
/* 93 ] */ 0,
/* 94 ^ */ 0,
/* 95 _ */ 0,
/* 96 ` */ PercentEncodeDefault,
/* 97 a */ 0, /* 98 b */ 0, /* 99 c */ 0, /* 100 d */ 0,
/* 101 e */ 0, /* 102 f */ 0, /* 103 g */ 0, /* 104 h */ 0,
/* 105 i */ 0, /* 106 j */ 0, /* 107 k */ 0, /* 108 l */ 0,
/* 109 m */ 0, /* 110 n */ 0, /* 111 o */ 0, /* 112 p */ 0,
/* 113 q */ 0, /* 114 r */ 0, /* 115 s */ 0, /* 116 t */ 0,
/* 117 u */ 0, /* 118 v */ 0, /* 119 w */ 0, /* 120 x */ 0,
/* 121 y */ 0, /* 122 z */ 0,
/* 123 { */ 0,
/* 124 | */ 0,
/* 125 } */ 0,
/* 126 ~ */ 0,
/* 127 del */ PercentEncodeSimple,
/* 128 */ PercentEncodeSimple, /* 129 */ PercentEncodeSimple, /* 130 */ PercentEncodeSimple, /* 131 */ PercentEncodeSimple,
/* 132 */ PercentEncodeSimple, /* 133 */ PercentEncodeSimple, /* 134 */ PercentEncodeSimple, /* 135 */ PercentEncodeSimple,
/* 136 */ PercentEncodeSimple, /* 137 */ PercentEncodeSimple, /* 138 */ PercentEncodeSimple, /* 139 */ PercentEncodeSimple,
/* 140 */ PercentEncodeSimple, /* 141 */ PercentEncodeSimple, /* 142 */ PercentEncodeSimple, /* 143 */ PercentEncodeSimple,
/* 144 */ PercentEncodeSimple, /* 145 */ PercentEncodeSimple, /* 146 */ PercentEncodeSimple, /* 147 */ PercentEncodeSimple,
/* 148 */ PercentEncodeSimple, /* 149 */ PercentEncodeSimple, /* 150 */ PercentEncodeSimple, /* 151 */ PercentEncodeSimple,
/* 152 */ PercentEncodeSimple, /* 153 */ PercentEncodeSimple, /* 154 */ PercentEncodeSimple, /* 155 */ PercentEncodeSimple,
/* 156 */ PercentEncodeSimple, /* 157 */ PercentEncodeSimple, /* 158 */ PercentEncodeSimple, /* 159 */ PercentEncodeSimple,
/* 160 */ PercentEncodeSimple, /* 161 */ PercentEncodeSimple, /* 162 */ PercentEncodeSimple, /* 163 */ PercentEncodeSimple,
/* 164 */ PercentEncodeSimple, /* 165 */ PercentEncodeSimple, /* 166 */ PercentEncodeSimple, /* 167 */ PercentEncodeSimple,
/* 168 */ PercentEncodeSimple, /* 169 */ PercentEncodeSimple, /* 170 */ PercentEncodeSimple, /* 171 */ PercentEncodeSimple,
/* 172 */ PercentEncodeSimple, /* 173 */ PercentEncodeSimple, /* 174 */ PercentEncodeSimple, /* 175 */ PercentEncodeSimple,
/* 176 */ PercentEncodeSimple, /* 177 */ PercentEncodeSimple, /* 178 */ PercentEncodeSimple, /* 179 */ PercentEncodeSimple,
/* 180 */ PercentEncodeSimple, /* 181 */ PercentEncodeSimple, /* 182 */ PercentEncodeSimple, /* 183 */ PercentEncodeSimple,
/* 184 */ PercentEncodeSimple, /* 185 */ PercentEncodeSimple, /* 186 */ PercentEncodeSimple, /* 187 */ PercentEncodeSimple,
/* 188 */ PercentEncodeSimple, /* 189 */ PercentEncodeSimple, /* 190 */ PercentEncodeSimple, /* 191 */ PercentEncodeSimple,
/* 192 */ PercentEncodeSimple, /* 193 */ PercentEncodeSimple, /* 194 */ PercentEncodeSimple, /* 195 */ PercentEncodeSimple,
/* 196 */ PercentEncodeSimple, /* 197 */ PercentEncodeSimple, /* 198 */ PercentEncodeSimple, /* 199 */ PercentEncodeSimple,
/* 200 */ PercentEncodeSimple, /* 201 */ PercentEncodeSimple, /* 202 */ PercentEncodeSimple, /* 203 */ PercentEncodeSimple,
/* 204 */ PercentEncodeSimple, /* 205 */ PercentEncodeSimple, /* 206 */ PercentEncodeSimple, /* 207 */ PercentEncodeSimple,
/* 208 */ PercentEncodeSimple, /* 209 */ PercentEncodeSimple, /* 210 */ PercentEncodeSimple, /* 211 */ PercentEncodeSimple,
/* 212 */ PercentEncodeSimple, /* 213 */ PercentEncodeSimple, /* 214 */ PercentEncodeSimple, /* 215 */ PercentEncodeSimple,
/* 216 */ PercentEncodeSimple, /* 217 */ PercentEncodeSimple, /* 218 */ PercentEncodeSimple, /* 219 */ PercentEncodeSimple,
/* 220 */ PercentEncodeSimple, /* 221 */ PercentEncodeSimple, /* 222 */ PercentEncodeSimple, /* 223 */ PercentEncodeSimple,
/* 224 */ PercentEncodeSimple, /* 225 */ PercentEncodeSimple, /* 226 */ PercentEncodeSimple, /* 227 */ PercentEncodeSimple,
/* 228 */ PercentEncodeSimple, /* 229 */ PercentEncodeSimple, /* 230 */ PercentEncodeSimple, /* 231 */ PercentEncodeSimple,
/* 232 */ PercentEncodeSimple, /* 233 */ PercentEncodeSimple, /* 234 */ PercentEncodeSimple, /* 235 */ PercentEncodeSimple,
/* 236 */ PercentEncodeSimple, /* 237 */ PercentEncodeSimple, /* 238 */ PercentEncodeSimple, /* 239 */ PercentEncodeSimple,
/* 240 */ PercentEncodeSimple, /* 241 */ PercentEncodeSimple, /* 242 */ PercentEncodeSimple, /* 243 */ PercentEncodeSimple,
/* 244 */ PercentEncodeSimple, /* 245 */ PercentEncodeSimple, /* 246 */ PercentEncodeSimple, /* 247 */ PercentEncodeSimple,
/* 248 */ PercentEncodeSimple, /* 249 */ PercentEncodeSimple, /* 250 */ PercentEncodeSimple, /* 251 */ PercentEncodeSimple,
/* 252 */ PercentEncodeSimple, /* 253 */ PercentEncodeSimple, /* 254 */ PercentEncodeSimple, /* 255 */ PercentEncodeSimple
};
// Character-class predicates backed by characterClassTable. The UChar
// variants reject code units above 0xff before indexing the 256-entry table.

// True when c may begin a scheme.
static inline bool isSchemeFirstChar(UChar c)
{
    if (c > 0xff)
        return false;
    return characterClassTable[c] & SchemeFirstChar;
}

// True when c may appear inside a scheme.
static inline bool isSchemeChar(UChar c)
{
    if (c > 0xff)
        return false;
    return characterClassTable[c] & SchemeChar;
}

// True when the byte carries the BadChar flag. The parameter is already a
// byte, so no range check is needed.
static inline bool isBadChar(unsigned char c)
{
    return characterClassTable[c] & BadChar;
}

// True for "\t", "\n" and "\r".
static inline bool isTabNewline(UChar c)
{
    if (c > 0xff)
        return false;
    return characterClassTable[c] & TabNewline;
}
// Forward declaration; the definition is not in this part of the file.
// URL::setUser() and URL::setPass() call it with PercentEncodeUsername and
// PercentEncodePassword respectively.
String encodeWithURLEscapeSequences(const String& notEncodedString, PercentEncodeCharacterClass whatToEncode);
// Copies the source to the destination, assuming all the source characters are
// ASCII. The destination buffer must be large enough. Null characters are allowed
// in the source string, and no attempt is made to null-terminate the result.
static void copyASCII(const String& string, char* dest)
{
if (string.isEmpty())
return;
if (string.is8Bit())
memcpy(dest, string.characters8(), string.length());
else {
const UChar* src = string.characters16();
size_t length = string.length();
for (size_t i = 0; i < length; i++)
dest[i] = static_cast<char>(src[i]);
}
}
// Puts the URL into the invalid state: clears the validity and scheme flags
// and zeroes every component offset. m_string is left untouched.
void URL::invalidate()
{
    // Flags.
    m_isValid = false;
    m_protocolIsInHTTPFamily = false;
    m_cannotBeABaseURL = false;
    // Component boundaries, from scheme through query.
    m_schemeEnd = m_userStart = m_userEnd = m_passwordEnd = 0;
    m_hostEnd = m_portEnd = 0;
    m_pathEnd = m_pathAfterLastSlash = m_queryEnd = 0;
}
// Builds a URL from a string the caller says is already in parsed form.
// The string is still run through URLParser; debug builds assert that
// parsing left it unchanged.
URL::URL(ParsedURLStringTag, const String& url)
{
    *this = URLParser(url).result();
#if OS(WINDOWS)
    // FIXME(148598): Work around Windows local file handling bug in CFNetwork
    ASSERT(isLocalFile() || url == m_string);
#else
    ASSERT(url == m_string);
#endif
}
// Resolves |relative| against |base| with URLParser and adopts the result.
URL::URL(const URL& base, const String& relative)
{
    *this = URLParser(relative, base).result();
}
// Resolves |relative| against |base|, passing the parser the encoding that
// would be used to submit a form in |encoding|.
URL::URL(const URL& base, const String& relative, const TextEncoding& encoding)
{
    // For UTF-{7,16,32}, we want to use UTF-8 for the query part as
    // we do when submitting a form. A form with GET method
    // has its contents added to a URL as query params and it makes sense
    // to be consistent.
    *this = URLParser(relative, base, encoding.encodingForFormSubmission()).result();
}
// True for the space character and everything below it (the C0 controls).
static bool shouldTrimFromURL(UChar c)
{
    // Browsers ignore leading/trailing whitespace and control
    // characters from URLs. UChar is unsigned, so nothing above
    // ' ' can slip through as a "negative" value.
    const UChar highestTrimmed = ' ';
    return !(c > highestTrimmed);
}
// Returns a copy of this URL whose string is an isolatedCopy() of m_string;
// all other members are copied as-is.
URL URL::isolatedCopy() const
{
    URL copy(*this);
    copy.m_string = m_string.isolatedCopy();
    return copy;
}
// Returns the text after the last '/' of the path, ignoring one trailing '/'
// (so a path of "/a/b/" yields "b"). Returns a null String when hasPath() is
// false or when there is no '/' inside the path to anchor the component.
String URL::lastPathComponent() const
{
    if (!hasPath())
        return String();
    unsigned end = m_pathEnd - 1;
    // Step over a single trailing slash so it is not part of the result.
    if (m_string[end] == '/')
        --end;
    size_t start = m_string.reverseFind('/', end);
    // reverseFind() returns notFound when no '/' exists at or before |end|
    // (for example "mailto:foo"). notFound is the largest size_t, so the
    // "before the path" comparison alone would not catch it, and ++start
    // would wrap to 0 and make the result begin at the scheme.
    if (start == notFound || start < static_cast<unsigned>(m_portEnd))
        return String();
    ++start;
    return m_string.substring(start, end - start + 1);
}
// Returns a view of the first m_schemeEnd characters of m_string (the scheme,
// without the ':'). The view refers to m_string rather than copying it.
StringView URL::protocol() const
{
    StringView whole { m_string };
    return whole.substring(0, m_schemeEnd);
}
// Returns the characters between hostStart() and m_hostEnd.
String URL::host() const
{
    const unsigned begin = hostStart();
    const unsigned length = m_hostEnd - begin;
    return m_string.substring(begin, length);
}
// Returns the port written in the URL, or std::nullopt when there is no port
// text after the host, when it does not parse strictly as an unsigned
// integer, or when it exceeds the uint16_t range.
std::optional<uint16_t> URL::port() const
{
    // Nothing to parse unless at least one character follows the ':' after the host.
    if (!m_portEnd || m_hostEnd >= m_portEnd - 1)
        return std::nullopt;
    const unsigned digitsStart = m_hostEnd + 1; // Skip the ':'.
    const unsigned digitsLength = m_portEnd - digitsStart;
    bool parsed = false;
    const unsigned value = m_string.is8Bit()
        ? charactersToUIntStrict(m_string.characters8() + digitsStart, digitsLength, &parsed)
        : charactersToUIntStrict(m_string.characters16() + digitsStart, digitsLength, &parsed);
    if (parsed && value <= std::numeric_limits<uint16_t>::max())
        return value;
    return std::nullopt;
}
// Returns "host:port" when port() yields a value, otherwise just the host.
String URL::hostAndPort() const
{
    const auto explicitPort = port();
    if (!explicitPort)
        return host();
    return host() + ':' + String::number(explicitPort.value());
}
// Returns the URL up to the end of the port, with the user/password section
// and the '@' that follows it cut out.
String URL::protocolHostAndPort() const
{
    String prefix = m_string.substring(0, m_portEnd);
    if (!(m_passwordEnd - m_userStart > 0))
        return prefix;
    // The extra character removed is the '@' that terminates the credentials.
    const int trailingAtSign = 1;
    prefix.remove(m_userStart, m_passwordEnd - m_userStart + trailingAtSign);
    return prefix;
}
// Returns the user name with URL escape sequences decoded.
String URL::user() const
{
    const unsigned length = m_userEnd - m_userStart;
    String encoded = m_string.substring(m_userStart, length);
    return decodeURLEscapeSequences(encoded);
}
// Returns the password with URL escape sequences decoded, or a null String
// when the password section is empty (m_passwordEnd == m_userEnd).
String URL::pass() const
{
    if (m_passwordEnd == m_userEnd)
        return String();
    const unsigned begin = m_userEnd + 1; // Skip the ':' after the user name.
    const unsigned length = m_passwordEnd - begin;
    return decodeURLEscapeSequences(m_string.substring(begin, length));
}
// Returns the user name exactly as stored in m_string, escapes intact.
String URL::encodedUser() const
{
    const unsigned length = m_userEnd - m_userStart;
    return m_string.substring(m_userStart, length);
}
// Returns the password exactly as stored in m_string, escapes intact, or a
// null String when the password section is empty.
String URL::encodedPass() const
{
    if (m_passwordEnd == m_userEnd)
        return String();
    const unsigned begin = m_userEnd + 1; // Skip the ':' after the user name.
    const unsigned length = m_passwordEnd - begin;
    return m_string.substring(begin, length);
}
// Returns everything after the '#', or a null String when the URL is invalid
// or nothing follows the query.
String URL::fragmentIdentifier() const
{
    const bool hasFragment = m_isValid && m_queryEnd != m_string.length();
    if (!hasFragment)
        return String();
    // m_queryEnd indexes the '#', so the fragment text starts one past it.
    return m_string.substring(m_queryEnd + 1);
}
// True when the URL is valid and m_string continues past the end of the query.
bool URL::hasFragmentIdentifier() const
{
    if (!m_isValid)
        return false;
    return m_queryEnd != m_string.length();
}
// Returns the URL up to m_pathAfterLastSlash, dropping whatever follows the
// last '/' of the path.
String URL::baseAsString() const
{
    const unsigned length = m_pathAfterLastSlash;
    return m_string.left(length);
}
#if !USE(CF)
// Returns the decoded path of a valid local-file URL, or a null String for
// any other URL. Only compiled when CF is not in use.
String URL::fileSystemPath() const
{
    if (isValid() && isLocalFile())
        return decodeURLEscapeSequences(path());
    return String();
}
#endif
// Debug-only sanity check for protocol strings passed to protocolIs():
// every code unit must be ASCII, above ' ', and not an uppercase letter.
// Release builds compile this to an empty function.
#ifdef NDEBUG
static inline void assertProtocolIsGood(StringView) { }
#else
static void assertProtocolIsGood(StringView protocol)
{
    // FIXME: We probably don't need this function any more.
    // The isASCIIAlphaCaselessEqual function asserts that passed-in characters
    // are ones it can handle; the older code did not and relied on these checks.
    for (auto codeUnit : protocol.codeUnits()) {
        ASSERT(isASCII(codeUnit));
        ASSERT(codeUnit > ' ');
        ASSERT(!isASCIIUpper(codeUnit));
        ASSERT(toASCIILowerUnchecked(codeUnit) == codeUnit);
    }
}
#endif
// Returns the lock that guards the testing-only default-port override map.
// The lock is a function-local NeverDestroyed static.
static Lock& defaultPortForProtocolMapForTestingLock()
{
    static NeverDestroyed<Lock> mapLock;
    return mapLock.get();
}
// Protocol -> default port overrides registered by tests.
using DefaultPortForProtocolMapForTesting = HashMap<String, uint16_t>;

// Returns a reference to the pointer holding the override map. The pointer
// stays null until ensureDefaultPortForProtocolMapForTesting() allocates it.
static DefaultPortForProtocolMapForTesting*& defaultPortForProtocolMapForTesting()
{
    static DefaultPortForProtocolMapForTesting* overrideMap;
    return overrideMap;
}
// Returns the override map, allocating it on first use. The map is never
// freed in the code visible here.
static DefaultPortForProtocolMapForTesting& ensureDefaultPortForProtocolMapForTesting()
{
    auto*& slot = defaultPortForProtocolMapForTesting();
    if (slot)
        return *slot;
    slot = new DefaultPortForProtocolMapForTesting;
    return *slot;
}
// Adds a protocol -> port override for tests, under the map lock.
// Uses HashMap::add on the (lazily created) override map.
void registerDefaultPortForProtocolForTesting(uint16_t port, const String& protocol)
{
    LockHolder locker(defaultPortForProtocolMapForTestingLock());
    auto& overrides = ensureDefaultPortForProtocolMapForTesting();
    overrides.add(protocol, port);
}
// Empties the override map under the map lock. The map object itself is
// kept; if it was never created this does nothing.
void clearDefaultPortForProtocolMapForTesting()
{
    LockHolder locker(defaultPortForProtocolMapForTestingLock());
    auto* overrides = defaultPortForProtocolMapForTesting();
    if (!overrides)
        return;
    overrides->clear();
}
// Returns the default port for |protocol|. A testing override, if one is
// registered for the protocol, wins; otherwise the answer comes from
// URLParser::defaultPortForProtocol().
std::optional<uint16_t> defaultPortForProtocol(StringView protocol)
{
    auto* overrideMap = defaultPortForProtocolMapForTesting();
    if (!overrideMap)
        return URLParser::defaultPortForProtocol(protocol);
    {
        // The lock is only taken once an override map exists.
        LockHolder locker(defaultPortForProtocolMapForTestingLock());
        ASSERT(overrideMap); // No need to null check again here since overrideMap cannot become null after being non-null.
        auto entry = overrideMap->find(protocol.toStringWithoutCopying());
        if (entry != overrideMap->end())
            return entry->value;
    }
    return URLParser::defaultPortForProtocol(protocol);
}
// True when |port| equals the default port for |protocol|. A protocol with
// no default port never matches.
bool isDefaultPortForProtocol(uint16_t port, StringView protocol)
{
    const std::optional<uint16_t> expected = defaultPortForProtocol(protocol);
    return expected == port;
}
// Compares this URL's scheme with the null-terminated, lowercase ASCII
// |protocol|, ignoring ASCII case. Always false for an invalid URL.
bool URL::protocolIs(const char* protocol) const
{
    assertProtocolIsGood(StringView(reinterpret_cast<const LChar*>(protocol), strlen(protocol)));
    // JavaScript URLs are "valid" and should be executed even if URL decides they are invalid.
    // The free function protocolIsJavaScript() should be used instead.
    ASSERT(!equalLettersIgnoringASCIICase(StringView(protocol), "javascript"));
    if (!m_isValid)
        return false;
    // Walk both strings in step; no temporary string is built.
    unsigned index = 0;
    for (; index < m_schemeEnd; ++index) {
        const char expected = protocol[index];
        if (!expected)
            return false; // The argument is shorter than the scheme.
        if (!isASCIIAlphaCaselessEqual(m_string[index], expected))
            return false;
    }
    // Equal only if the argument ends exactly where the scheme does.
    return !protocol[index];
}
// Compares this URL's scheme with the lowercase ASCII |protocol|, ignoring
// ASCII case. Always false for an invalid URL.
bool URL::protocolIs(StringView protocol) const
{
    assertProtocolIsGood(protocol);
    if (!m_isValid || m_schemeEnd != protocol.length())
        return false;
    // Lengths match; compare character by character without allocating.
    unsigned index = 0;
    while (index < m_schemeEnd) {
        if (!isASCIIAlphaCaselessEqual(m_string[index], protocol[index]))
            return false;
        ++index;
    }
    return true;
}
// Returns the query without its leading '?', or a null String when the
// query section is empty.
String URL::query() const
{
    if (m_queryEnd == m_pathEnd)
        return String();
    const unsigned begin = m_pathEnd + 1; // Skip the '?'.
    return m_string.substring(begin, m_queryEnd - begin);
}
// Returns the characters between the end of the port and the end of the path.
String URL::path() const
{
    const unsigned length = m_pathEnd - m_portEnd;
    return m_string.substring(m_portEnd, length);
}
bool URL::setProtocol(const String& s)
{
// Firefox and IE remove everything after the first ':'.
size_t separatorPosition = s.find(':');
String newProtocol = s.substring(0, separatorPosition);
if (!isValidProtocol(newProtocol))
return false;
if (!m_isValid) {
URLParser parser(makeString(newProtocol, ":", m_string));
*this = parser.result();
return true;
}
URLParser parser(makeString(newProtocol, m_string.substring(m_schemeEnd)));
*this = parser.result();
return true;
}
// True when every character of |string| is ASCII, for either storage width.
static bool containsOnlyASCII(StringView string)
{
    const auto length = string.length();
    return string.is8Bit()
        ? charactersAreAllASCII(string.characters8(), length)
        : charactersAreAllASCII(string.characters16(), length);
}
// Appends |string| to |buffer| as a host name. Names that are all ASCII, or
// longer than the local conversion buffer, are appended unchanged; anything
// else goes through ICU's uidna_IDNToASCII first. Nothing is null-terminated.
// Returns false when the IDN conversion reports any status other than
// U_ZERO_ERROR, in which case |buffer| is left untouched.
static bool appendEncodedHostname(UCharBuffer& buffer, StringView string)
{
    // Needs to be big enough to hold an IDN-encoded name.
    // For host names bigger than this, we won't do IDN encoding, which is almost certainly OK.
    const unsigned hostnameBufferLength = 2048;
    const bool needsIDNEncoding = string.length() <= hostnameBufferLength && !containsOnlyASCII(string);
    if (!needsIDNEncoding) {
        append(buffer, string);
        return true;
    }
    UChar hostnameBuffer[hostnameBufferLength];
    UErrorCode status = U_ZERO_ERROR;
    // uidna_IDNToASCII is deprecated in ICU; silence the warning around the call only.
#if COMPILER(GCC_OR_CLANG)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#endif
    int32_t convertedLength = uidna_IDNToASCII(string.upconvertedCharacters(), string.length(), hostnameBuffer,
        hostnameBufferLength, UIDNA_ALLOW_UNASSIGNED, 0, &status);
#if COMPILER(GCC_OR_CLANG)
#pragma GCC diagnostic pop
#endif
    if (status != U_ZERO_ERROR)
        return false;
    buffer.append(hostnameBuffer, convertedLength);
    return true;
}
void URL::setHost(const String& s)
{
if (!m_isValid)
return;
auto colonIndex = s.find(':');
if (colonIndex != notFound)
return;
UCharBuffer encodedHostName;
if (!appendEncodedHostname(encodedHostName, s))
return;
bool slashSlashNeeded = m_userStart == m_schemeEnd + 1;
StringBuilder builder;
builder.append(m_string.left(hostStart()));
if (slashSlashNeeded)
builder.appendLiteral("//");
builder.append(StringView(encodedHostName.data(), encodedHostName.size()));
builder.append(m_string.substring(m_hostEnd));
URLParser parser(builder.toString());
*this = parser.result();
}
void URL::removePort()
{
if (m_hostEnd == m_portEnd)
return;
URLParser parser(m_string.left(m_hostEnd) + m_string.substring(m_portEnd));
*this = parser.result();
}
void URL::setPort(unsigned short i)
{
if (!m_isValid)
return;
bool colonNeeded = m_portEnd == m_hostEnd;
unsigned portStart = (colonNeeded ? m_hostEnd : m_hostEnd + 1);
URLParser parser(makeString(m_string.left(portStart), (colonNeeded ? ":" : ""), String::number(i), m_string.substring(m_portEnd)));
*this = parser.result();
}
void URL::setHostAndPort(const String& hostAndPort)
{
if (!m_isValid)
return;
StringView hostName(hostAndPort);
StringView port;
auto colonIndex = hostName.find(':');
if (colonIndex != notFound) {
port = hostName.substring(colonIndex + 1);
bool ok;
int portInt = port.toIntStrict(ok);
if (!ok || portInt < 0)
return;
hostName = hostName.substring(0, colonIndex);
}
if (hostName.isEmpty())
return;
UCharBuffer encodedHostName;
if (!appendEncodedHostname(encodedHostName, hostName))
return;
bool slashSlashNeeded = m_userStart == m_schemeEnd + 1;
StringBuilder builder;
builder.append(m_string.left(hostStart()));
if (slashSlashNeeded)
builder.appendLiteral("//");
builder.append(StringView(encodedHostName.data(), encodedHostName.size()));
if (!port.isEmpty()) {
builder.appendLiteral(":");
builder.append(port);
}
builder.append(m_string.substring(m_portEnd));
URLParser parser(builder.toString());
*this = parser.result();
}
void URL::setUser(const String& user)
{
if (!m_isValid)
return;
// FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations,
// and to avoid changing more than just the user login.
unsigned end = m_userEnd;
if (!user.isEmpty()) {
String u = encodeWithURLEscapeSequences(user, PercentEncodeUsername);
if (m_userStart == m_schemeEnd + 1)
u = "//" + u;
// Add '@' if we didn't have one before.
if (end == m_hostEnd || (end == m_passwordEnd && m_string[end] != '@'))
u.append('@');
URLParser parser(makeString(m_string.left(m_userStart), u, m_string.substring(end)));
*this = parser.result();
} else {
// Remove '@' if we now have neither user nor password.
if (m_userEnd == m_passwordEnd && end != m_hostEnd && m_string[end] == '@')
end += 1;
// We don't want to parse in the extremely common case where we are not going to make a change.
if (m_userStart != end) {
URLParser parser(makeString(m_string.left(m_userStart), m_string.substring(end)));
*this = parser.result();
}
}
}
void URL::setPass(const String& password)
{
if (!m_isValid)
return;
unsigned end = m_passwordEnd;
if (!password.isEmpty()) {
String p = ":" + encodeWithURLEscapeSequences(password, PercentEncodePassword) + "@";
if (m_userEnd == m_schemeEnd + 1)
p = "//" + p;
// Eat the existing '@' since we are going to add our own.
if (end != m_hostEnd && m_string[end] == '@')
end += 1;
URLParser parser(makeString(m_string.left(m_userEnd), p, m_string.substring(end)));
*this = parser.result();
} else {
// Remove '@' if we now have neither user nor password.
if (m_userStart == m_userEnd && end != m_hostEnd && m_string[end] == '@')
end += 1;
// We don't want to parse in the extremely common case where we are not going to make a change.
if (m_userEnd != end) {
URLParser parser(makeString(m_string.left(m_userEnd), m_string.substring(end)));
*this = parser.result();
}
}
}
// Replaces everything after the query with '#' followed by |identifier| and reparses the result.
// Invalid URLs are left untouched.
void URL::setFragmentIdentifier(StringView identifier)
{
    if (!m_isValid)
        return;

    // FIXME: Optimize the case where the identifier already happens to be equal to what was passed?
    // FIXME: Is it correct to do this without encoding and escaping non-ASCII characters?
    StringView upToQueryEnd = StringView { m_string }.substring(0, m_queryEnd);
    URLParser parser(makeString(upToQueryEnd, '#', identifier));
    *this = parser.result();
}
// Truncates the URL string at the end of the query, dropping any fragment that follows it.
// No reparse happens; only m_string is shortened.
void URL::removeFragmentIdentifier()
{
    if (!m_isValid) {
        ASSERT(!m_queryEnd);
        return;
    }

    // Nothing follows the query, so there is nothing to strip.
    if (m_string.length() <= m_queryEnd)
        return;

    m_string = m_string.left(m_queryEnd);
}
// Replaces the query component (the text between m_pathEnd and m_queryEnd) and reparses the result.
// A non-null |query| that does not already start with '?' gets one prepended; a null |query|
// is spliced in as-is, which removes the query.
void URL::setQuery(const String& query)
{
    if (!m_isValid)
        return;

    // FIXME: '#' and non-ASCII characters must be encoded and escaped.
    // Usually, the query is encoded using document encoding, not UTF-8, but we don't have
    // access to the document in this function.
    // https://webkit.org/b/161176
    bool needsQuestionMark = !query.isNull() && (query.isEmpty() || query[0] != '?');

    if (!needsQuestionMark) {
        URLParser parser(makeString(m_string.left(m_pathEnd), query, m_string.substring(m_queryEnd)));
        *this = parser.result();
        return;
    }

    URLParser parser(makeString(m_string.left(m_pathEnd), "?", query, m_string.substring(m_queryEnd)));
    *this = parser.result();
}
void URL::setPath(const String& s)
{
if (!m_isValid)
return;
// FIXME: encodeWithURLEscapeSequences does not correctly escape '#' and '?', so fragment and query parts
// may be inadvertently affected.
String path = s;
if (path.isEmpty() || path[0] != '/')
path = "/" + path;
URLParser parser(makeString(m_string.left(m_portEnd), encodeWithURLEscapeSequences(path), m_string.substring(m_pathEnd)));
*this = parser.result();
}
// Decodes the URL escape sequences in |string|, passing UTF8Encoding() as the text encoding.
String decodeURLEscapeSequences(const String& string)
{
    const TextEncoding& utf8 = UTF8Encoding();
    return decodeEscapeSequences<URLEscapeSequence>(string, utf8);
}
// Decodes the URL escape sequences in |string|, passing the caller-supplied |encoding| through.
String decodeURLEscapeSequences(const String& string, const TextEncoding& encoding)
{
    String decoded = decodeEscapeSequences<URLEscapeSequence>(string, encoding);
    return decoded;
}
// Writes '%' at |buffer|, advances past it, then hands |c| to placeByteAsHex() to emit its hex form.
// Caution: This function does not bounds check.
static void appendEscapedChar(char*& buffer, unsigned char c)
{
    *buffer = '%';
    ++buffer;
    // NOTE(review): placeByteAsHex() presumably writes two hex digits and advances |buffer| — confirm in wtf/HexNumber.h.
    placeByteAsHex(c, buffer);
}
// Returns the URL string, truncated at the end of the query when |omitFragment| is true.
String URL::serialize(bool omitFragment) const
{
    if (!omitFragment)
        return m_string;

    return m_string.left(m_queryEnd);
}
#if PLATFORM(IOS)
// Consulted by equal(): while this flag is false, equal() lowercases its first argument
// before comparing instead of comparing the characters exactly.
static bool shouldCanonicalizeScheme { true };

// Sets the iOS-only scheme canonicalization flag.
void enableURLSchemeCanonicalization(bool enableSchemeCanonicalization)
{
    shouldCanonicalizeScheme = enableSchemeCanonicalization;
}
#endif
// Compares the first |length| characters of |a| against every element of the array |b|.
// All |length| elements of |b| take part in the comparison, including a trailing NUL if |b| has one.
// On iOS, when scheme canonicalization is disabled, |a| is lowercased before each comparison.
template<size_t length>
static inline bool equal(const char* a, const char (&b)[length])
{
#if PLATFORM(IOS)
    if (!shouldCanonicalizeScheme) {
        size_t index = 0;
        while (index < length && toASCIILower(a[index]) == b[index])
            ++index;
        return index == length;
    }
#endif
    size_t index = 0;
    while (index < length && a[index] == b[index])
        ++index;
    return index == length;
}
// Length-checked variant: the strings are equal only when |lengthA| matches the array extent of
// |stringB| and the element-wise equal() comparison succeeds.
template<size_t lengthB>
static inline bool equal(const char* stringA, size_t lengthA, const char (&stringB)[lengthB])
{
    if (lengthA != lengthB)
        return false;

    return equal(stringA, stringB);
}
bool equalIgnoringFragmentIdentifier(const URL& a, const URL& b)
{
if (a.m_queryEnd != b.m_queryEnd)
return false;
unsigned queryLength = a.m_queryEnd;
for (unsigned i = 0; i < queryLength; ++i)
if (a.string()[i] != b.string()[i])
return false;
return true;
}
bool protocolHostAndPortAreEqual(const URL& a, const URL& b)
{
if (a.m_schemeEnd != b.m_schemeEnd)
return false;
unsigned hostStartA = a.hostStart();
unsigned hostLengthA = a.hostEnd() - hostStartA;
unsigned hostStartB = b.hostStart();
unsigned hostLengthB = b.hostEnd() - b.hostStart();
if (hostLengthA != hostLengthB)
return false;
// Check the scheme
for (unsigned i = 0; i < a.m_schemeEnd; ++i) {
if (a.string()[i] != b.string()[i])
return false;
}
// And the host
for (unsigned i = 0; i < hostLengthA; ++i) {
if (a.string()[hostStartA + i] != b.string()[hostStartB + i])
return false;
}
if (a.port() != b.port())
return false;
return true;
}
bool hostsAreEqual(const URL& a, const URL& b)
{
unsigned hostStartA = a.hostStart();
unsigned hostLengthA = a.hostEnd() - hostStartA;
unsigned hostStartB = b.hostStart();
unsigned hostLengthB = b.hostEnd() - hostStartB;
if (hostLengthA != hostLengthB)
return false;
for (unsigned i = 0; i < hostLengthA; ++i) {
if (a.string()[hostStartA + i] != b.string()[hostStartB + i])
return false;
}
return true;
}
// Converts |notEncodedString| to UTF-8 and percent-escapes every byte whose entry in
// percentEncodeClassTable is at or above |whatToEncode|; all other bytes are copied unchanged.
String encodeWithURLEscapeSequences(const String& notEncodedString, PercentEncodeCharacterClass whatToEncode)
{
    CString utf8Bytes = notEncodedString.utf8();

    // Worst case every input byte is escaped and therefore occupies three output bytes.
    CharBuffer output(utf8Bytes.length() * 3 + 1);
    char* cursor = output.data();

    const char* inputEnd = utf8Bytes.data() + utf8Bytes.length();
    for (const char* input = utf8Bytes.data(); input < inputEnd; ++input) {
        unsigned char byte = *input;
        if (percentEncodeClassTable[byte] < whatToEncode) {
            *cursor++ = byte;
            continue;
        }
        appendEscapedChar(cursor, byte);
    }

    ASSERT(cursor - output.data() <= static_cast<int>(output.size()));
    return String(output.data(), cursor - output.data());
}
// Converts |notEncodedString| to UTF-8 and percent-escapes every byte for which isBadChar()
// returns true; all other bytes are copied unchanged.
String encodeWithURLEscapeSequences(const String& notEncodedString)
{
    CString utf8Bytes = notEncodedString.utf8();

    // Worst case every input byte is escaped and therefore occupies three output bytes.
    CharBuffer output(utf8Bytes.length() * 3 + 1);
    char* cursor = output.data();

    const char* inputEnd = utf8Bytes.data() + utf8Bytes.length();
    for (const char* input = utf8Bytes.data(); input < inputEnd; ++input) {
        unsigned char byte = *input;
        if (!isBadChar(byte)) {
            *cursor++ = byte;
            continue;
        }
        appendEscapedChar(cursor, byte);
    }

    ASSERT(cursor - output.data() <= static_cast<int>(output.size()));
    return String(output.data(), cursor - output.data());
}
bool URL::isHierarchical() const
{
if (!m_isValid)
return false;
ASSERT(m_string[m_schemeEnd] == ':');
return m_string[m_schemeEnd + 1] == '/';
}
// Resizes |buffer| to the length of the URL string and fills it via copyASCII().
void URL::copyToBuffer(Vector<char, 512>& buffer) const
{
    // FIXME: This throws away the high bytes of all the characters in the string!
    // That's fine for a valid URL, which is all ASCII, but not for invalid URLs.
    unsigned characterCount = m_string.length();
    buffer.resize(characterCount);
    copyASCII(m_string, buffer.data());
}
// Returns true when |url| begins with |protocol| (compared ASCII case-insensitively) immediately
// followed by ':'. Leading characters for which shouldTrimFromURL() is true are skipped, and
// characters for which isTabNewline() is true are ignored wherever they appear in the scanned prefix.
//
// The scan is bounded by url.length() rather than by looking for a zero character: StringClass may
// be a StringView, whose characters are not guaranteed to be null-terminated, so indexing it at or
// past its length must be avoided. As a consequence a NUL character inside |url| no longer ends the
// scan early; it is handled like any other character.
template<typename StringClass>
bool protocolIsInternal(const StringClass& url, const char* protocol)
{
    // Do the comparison without making a new string object.
    assertProtocolIsGood(StringView(reinterpret_cast<const LChar*>(protocol), strlen(protocol)));
    bool isLeading = true;
    unsigned urlLength = url.length();
    for (unsigned i = 0, j = 0; i < urlLength; ++i) {
        auto character = url[i];
        // Skip leading whitespace and control characters.
        if (isLeading && shouldTrimFromURL(character))
            continue;
        isLeading = false;
        // Skip any tabs and newlines.
        if (isTabNewline(character))
            continue;
        // The whole protocol matched; the next significant character must be the ':' separator.
        if (!protocol[j])
            return character == ':';
        if (!isASCIIAlphaCaselessEqual(character, protocol[j]))
            return false;
        ++j;
    }
    // Ran out of input before reaching the ':' separator.
    return false;
}
// Free-function entry point: reports whether the string |url| has the scheme |protocol|,
// as decided by protocolIsInternal().
bool protocolIs(const String& url, const char* protocol)
{
    bool hasProtocol = protocolIsInternal(url, protocol);
    return hasProtocol;
}
// String-based variant on URL: forwards to WebCore::protocolIsInternal() for |string|.
inline bool URL::protocolIs(const String& string, const char* protocol)
{
    bool hasProtocol = WebCore::protocolIsInternal(string, protocol);
    return hasProtocol;
}
// Returns true when |protocol| is non-empty, starts with a character accepted by
// isSchemeFirstChar(), and continues only with characters accepted by isSchemeChar().
bool isValidProtocol(const String& protocol)
{
    // RFC3986: ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
    if (protocol.isEmpty() || !isSchemeFirstChar(protocol[0]))
        return false;

    unsigned index = 1;
    unsigned protocolLength = protocol.length();
    while (index < protocolLength) {
        if (!isSchemeChar(protocol[index]))
            return false;
        ++index;
    }
    return true;
}
#ifndef NDEBUG
void URL::print() const
{
printf("%s\n", m_string.utf8().data());
}
#endif
// Returns the string of a copy of this URL with the user, password and fragment removed.
// The URL itself is not modified.
String URL::strippedForUseAsReferrer() const
{
    URL stripped = *this;
    stripped.setUser(String());
    stripped.setPass(String());
    stripped.removeFragmentIdentifier();
    return stripped.string();
}
// Returns true when the URL's scheme is "file".
bool URL::isLocalFile() const
{
    // Including feed here might be a bad idea since drag and drop uses this check
    // and including feed would allow feeds to potentially let someone's blog
    // read the contents of the clipboard on a drag, even without a drop.
    // Likewise with using the FrameLoader::shouldTreatURLAsLocal() function.
    bool hasFileScheme = protocolIs("file");
    return hasFileScheme;
}
// Reports whether the String |url| has the "javascript" scheme, as decided by protocolIsInternal().
bool protocolIsJavaScript(const String& url)
{
    bool isJavaScript = protocolIsInternal(url, "javascript");
    return isJavaScript;
}
// Reports whether the StringView |url| has the "javascript" scheme, as decided by protocolIsInternal().
bool protocolIsJavaScript(StringView url)
{
    bool isJavaScript = protocolIsInternal(url, "javascript");
    return isJavaScript;
}
// Returns true when |url| starts with "http:" or "https:", compared ASCII case-insensitively.
// Unlike protocolIsInternal(), no leading characters are skipped.
bool protocolIsInHTTPFamily(const String& url)
{
    // Do the comparison without making a new string object.
    bool startsWithHTTP = isASCIIAlphaCaselessEqual(url[0], 'h')
        && isASCIIAlphaCaselessEqual(url[1], 't')
        && isASCIIAlphaCaselessEqual(url[2], 't')
        && isASCIIAlphaCaselessEqual(url[3], 'p');
    if (!startsWithHTTP)
        return false;

    if (url[4] == ':')
        return true;

    return isASCIIAlphaCaselessEqual(url[4], 's') && url[5] == ':';
}
// Returns a reference to a function-local static URL constructed once from "about:blank".
const URL& blankURL()
{
    static NeverDestroyed<URL> sharedBlankURL(ParsedURLString, "about:blank");
    return sharedBlankURL.get();
}
// Returns true when the URL's scheme is "about". Only the scheme is checked,
// so every about: URL qualifies, not just about:blank.
bool URL::isBlankURL() const
{
    bool hasAboutScheme = protocolIs("about");
    return hasAboutScheme;
}
// Returns false when |url| carries an explicit port that appears in the blocked port list, with two
// exceptions: ports 21 and 22 are allowed for "ftp" URLs, and any port is allowed for "file" URLs.
// URLs without a port are always allowed.
bool portAllowed(const URL& url)
{
    auto port = url.port();

    // Since most URLs don't have a port, return early for the "no port" case.
    if (!port)
        return true;

    // This blocked port list matches the port blocking that Mozilla implements.
    // See http://www.mozilla.org/projects/netlib/PortBanning.html for more information.
    // The list must stay sorted because it is searched with std::binary_search below.
    static const uint16_t blockedPortList[] = {
        1, // tcpmux
        7, // echo
        9, // discard
        11, // systat
        13, // daytime
        15, // netstat
        17, // qotd
        19, // chargen
        20, // FTP-data
        21, // FTP-control
        22, // SSH
        23, // telnet
        25, // SMTP
        37, // time
        42, // name
        43, // nicname
        53, // domain
        77, // priv-rjs
        79, // finger
        87, // ttylink
        95, // supdup
        101, // hostname
        102, // iso-tsap
        103, // gppitnp
        104, // acr-nema
        109, // POP2
        110, // POP3
        111, // sunrpc
        113, // auth
        115, // SFTP
        117, // uucp-path
        119, // nntp
        123, // NTP
        135, // loc-srv / epmap
        139, // netbios
        143, // IMAP2
        179, // BGP
        389, // LDAP
        465, // SMTP+SSL
        512, // print / exec
        513, // login
        514, // shell
        515, // printer
        526, // tempo
        530, // courier
        531, // Chat
        532, // netnews
        540, // UUCP
        556, // remotefs
        563, // NNTP+SSL
        587, // ESMTP
        601, // syslog-conn
        636, // LDAP+SSL
        993, // IMAP+SSL
        995, // POP3+SSL
        2049, // NFS
        3659, // apple-sasl / PasswordServer [Apple addition]
        4045, // lockd
        4190, // ManageSieve [Apple addition]
        6000, // X11
        6665, // Alternate IRC [Apple addition]
        6666, // Alternate IRC [Apple addition]
        6667, // Standard IRC [Apple addition]
        6668, // Alternate IRC [Apple addition]
        6669, // Alternate IRC [Apple addition]
        invalidPortNumber, // Used to block all invalid port numbers
    };

    // If the port is not in the blocked port list, allow it.
    ASSERT(std::is_sorted(std::begin(blockedPortList), std::end(blockedPortList)));
    bool isBlocked = std::binary_search(std::begin(blockedPortList), std::end(blockedPortList), *port);
    if (!isBlocked)
        return true;

    // Allow ports 21 and 22 for FTP URLs, as Mozilla does.
    bool isFTPControlOrSSHPort = *port == 21 || *port == 22;
    if (isFTPControlOrSSHPort && url.protocolIs("ftp"))
        return true;

    // Allow any port number in a file URL, since the port number is ignored.
    return url.protocolIs("file");
}
// Extracts the MIME type from a data: URL. The type is the text between offset 5 (just past
// "data:") and the first ';', or the first ',' when there is no ';'. Returns the empty string
// when neither delimiter exists, "text/plain" when the type is empty, and otherwise the type
// converted to ASCII lowercase.
String mimeTypeFromDataURL(const String& url)
{
    ASSERT(protocolIsInternal(url, "data"));

    // Offset of the first character after the "data:" prefix.
    const unsigned mimeTypeStart = 5;

    // FIXME: What's the right behavior when the URL has a comma first, but a semicolon later?
    // Currently this code will break at the semicolon in that case. Not sure that's correct.
    auto mimeTypeEnd = url.find(';', mimeTypeStart);
    if (mimeTypeEnd == notFound)
        mimeTypeEnd = url.find(',', mimeTypeStart);

    if (mimeTypeEnd == notFound) {
        // FIXME: There was an old comment here that made it sound like this should be returning text/plain.
        // But we have been returning empty string here for some time, so not changing its behavior at this time.
        return emptyString();
    }

    if (mimeTypeEnd == mimeTypeStart)
        return ASCIILiteral("text/plain");

    ASSERT(mimeTypeEnd >= mimeTypeStart);
    return url.substring(mimeTypeStart, mimeTypeEnd - mimeTypeStart).convertToASCIILowercase();
}
// Looks up a MIME type from the text after the last '.' in the URL's percent-decoded path.
// When the path contains no '.', reverseFind() returns notFound and the + 1 wraps the start
// offset around to 0, so the whole decoded path is used as the extension.
String mimeTypeFromURL(const URL& url)
{
    String decodedPath = decodeURLEscapeSequences(url.path());
    auto extensionStart = decodedPath.reverseFind('.') + 1;
    String extension = decodedPath.substring(extensionStart);

    // We don't use MIMETypeRegistry::getMIMETypeForPath() because it returns "application/octet-stream" upon failure
    return MIMETypeRegistry::getMIMETypeForExtension(extension);
}
// Returns the URL string unchanged when it already fits in |length| characters. Otherwise returns
// its first (length / 2 - 1) characters, then "...", then its last (length / 2 - 2) characters.
// Both counts are clamped at zero, so for |length| below 4 the result is just "...".
String URL::stringCenterEllipsizedToLength(unsigned length) const
{
    if (string().length() <= length)
        return string();

    // The counts are unsigned: without clamping, length / 2 - 1 and length / 2 - 2 would wrap
    // around to huge values for small |length| instead of becoming negative.
    unsigned half = length / 2;
    unsigned leadingLength = half > 1 ? half - 1 : 0;
    unsigned trailingLength = half > 2 ? half - 2 : 0;

    return string().left(leadingLength) + "..." + string().right(trailingLength);
}
// Builds a URL of the form "webkit-fake-url://<uuid>/<relativePart>", where <uuid> comes from
// createCanonicalUUIDString().
URL URL::fakeURLWithRelativePart(const String& relativePart)
{
    String fakeURLString = makeString("webkit-fake-url://", createCanonicalUUIDString(), '/', relativePart);
    return URL(URL(), fakeURLString);
}
// Builds a URL by prefixing |filePath| with "file:///". The path is passed through as-is;
// no escaping is applied here before it reaches the URL constructor.
URL URL::fileURLWithFileSystemPath(const String& filePath)
{
    String fileURLString = makeString("file:///", filePath);
    return URL(URL(), fileURLString);
}
}