Ada 2.8.0
Fast spec-compliant URL parser
Loading...
Searching...
No Matches
url.cpp
Go to the documentation of this file.
1#include "ada.h"
2#include "ada/scheme.h"
3#include "ada/log.h"
4
5#include <numeric>
6#include <algorithm>
7#include <string>
8
9namespace ada {
10
11bool url::parse_opaque_host(std::string_view input) {
12 ada_log("parse_opaque_host ", input, " [", input.size(), " bytes]");
13 if (std::any_of(input.begin(), input.end(),
14 ada::unicode::is_forbidden_host_code_point)) {
15 return is_valid = false;
16 }
17
18 // Return the result of running UTF-8 percent-encode on input using the C0
19 // control percent-encode set.
20 host = ada::unicode::percent_encode(
22 return true;
23}
24
25bool url::parse_ipv4(std::string_view input) {
26 ada_log("parse_ipv4 ", input, " [", input.size(), " bytes]");
27 if (input.back() == '.') {
28 input.remove_suffix(1);
29 }
30 size_t digit_count{0};
31 int pure_decimal_count = 0; // entries that are decimal
32 std::string_view original_input =
33 input; // we might use this if pure_decimal_count == 4.
34 uint64_t ipv4{0};
35 // we could unroll for better performance?
36 for (; (digit_count < 4) && !(input.empty()); digit_count++) {
37 uint32_t
38 segment_result{}; // If any number exceeds 32 bits, we have an error.
39 bool is_hex = checkers::has_hex_prefix(input);
40 if (is_hex && ((input.length() == 2) ||
41 ((input.length() > 2) && (input[2] == '.')))) {
42 // special case
43 segment_result = 0;
44 input.remove_prefix(2);
45 } else {
46 std::from_chars_result r{};
47 if (is_hex) {
48 r = std::from_chars(input.data() + 2, input.data() + input.size(),
49 segment_result, 16);
50 } else if ((input.length() >= 2) && input[0] == '0' &&
51 checkers::is_digit(input[1])) {
52 r = std::from_chars(input.data() + 1, input.data() + input.size(),
53 segment_result, 8);
54 } else {
55 pure_decimal_count++;
56 r = std::from_chars(input.data(), input.data() + input.size(),
57 segment_result, 10);
58 }
59 if (r.ec != std::errc()) {
60 return is_valid = false;
61 }
62 input.remove_prefix(r.ptr - input.data());
63 }
64 if (input.empty()) {
65 // We have the last value.
66 // At this stage, ipv4 contains digit_count*8 bits.
67 // So we have 32-digit_count*8 bits left.
68 if (segment_result >= (uint64_t(1) << (32 - digit_count * 8))) {
69 return is_valid = false;
70 }
71 ipv4 <<= (32 - digit_count * 8);
72 ipv4 |= segment_result;
73 goto final;
74 } else {
75 // There is more, so that the value must no be larger than 255
76 // and we must have a '.'.
77 if ((segment_result > 255) || (input[0] != '.')) {
78 return is_valid = false;
79 }
80 ipv4 <<= 8;
81 ipv4 |= segment_result;
82 input.remove_prefix(1); // remove '.'
83 }
84 }
85 if ((digit_count != 4) || (!input.empty())) {
86 return is_valid = false;
87 }
88final:
89 // We could also check r.ptr to see where the parsing ended.
90 if (pure_decimal_count == 4) {
91 host = original_input; // The original input was already all decimal and we
92 // validated it.
93 } else {
94 host = ada::serializers::ipv4(ipv4); // We have to reserialize the address.
95 }
97 return true;
98}
99
100bool url::parse_ipv6(std::string_view input) {
101 ada_log("parse_ipv6 ", input, " [", input.size(), " bytes]");
102
103 if (input.empty()) {
104 return is_valid = false;
105 }
106 // Let address be a new IPv6 address whose IPv6 pieces are all 0.
107 std::array<uint16_t, 8> address{};
108
109 // Let pieceIndex be 0.
110 int piece_index = 0;
111
112 // Let compress be null.
113 std::optional<int> compress{};
114
115 // Let pointer be a pointer for input.
116 std::string_view::iterator pointer = input.begin();
117
118 // If c is U+003A (:), then:
119 if (input[0] == ':') {
120 // If remaining does not start with U+003A (:), validation error, return
121 // failure.
122 if (input.size() == 1 || input[1] != ':') {
123 ada_log("parse_ipv6 starts with : but the rest does not start with :");
124 return is_valid = false;
125 }
126
127 // Increase pointer by 2.
128 pointer += 2;
129
130 // Increase pieceIndex by 1 and then set compress to pieceIndex.
131 compress = ++piece_index;
132 }
133
134 // While c is not the EOF code point:
135 while (pointer != input.end()) {
136 // If pieceIndex is 8, validation error, return failure.
137 if (piece_index == 8) {
138 ada_log("parse_ipv6 piece_index == 8");
139 return is_valid = false;
140 }
141
142 // If c is U+003A (:), then:
143 if (*pointer == ':') {
144 // If compress is non-null, validation error, return failure.
145 if (compress.has_value()) {
146 ada_log("parse_ipv6 compress is non-null");
147 return is_valid = false;
148 }
149
150 // Increase pointer and pieceIndex by 1, set compress to pieceIndex, and
151 // then continue.
152 pointer++;
153 compress = ++piece_index;
154 continue;
155 }
156
157 // Let value and length be 0.
158 uint16_t value = 0, length = 0;
159
160 // While length is less than 4 and c is an ASCII hex digit,
161 // set value to value times 0x10 + c interpreted as hexadecimal number, and
162 // increase pointer and length by 1.
163 while (length < 4 && pointer != input.end() &&
164 unicode::is_ascii_hex_digit(*pointer)) {
165 // https://stackoverflow.com/questions/39060852/why-does-the-addition-of-two-shorts-return-an-int
166 value = uint16_t(value * 0x10 + unicode::convert_hex_to_binary(*pointer));
167 pointer++;
168 length++;
169 }
170
171 // If c is U+002E (.), then:
172 if (pointer != input.end() && *pointer == '.') {
173 // If length is 0, validation error, return failure.
174 if (length == 0) {
175 ada_log("parse_ipv6 length is 0");
176 return is_valid = false;
177 }
178
179 // Decrease pointer by length.
180 pointer -= length;
181
182 // If pieceIndex is greater than 6, validation error, return failure.
183 if (piece_index > 6) {
184 ada_log("parse_ipv6 piece_index > 6");
185 return is_valid = false;
186 }
187
188 // Let numbersSeen be 0.
189 int numbers_seen = 0;
190
191 // While c is not the EOF code point:
192 while (pointer != input.end()) {
193 // Let ipv4Piece be null.
194 std::optional<uint16_t> ipv4_piece{};
195
196 // If numbersSeen is greater than 0, then:
197 if (numbers_seen > 0) {
198 // If c is a U+002E (.) and numbersSeen is less than 4, then increase
199 // pointer by 1.
200 if (*pointer == '.' && numbers_seen < 4) {
201 pointer++;
202 }
203 // Otherwise, validation error, return failure.
204 else {
205 ada_log("parse_ipv6 Otherwise, validation error, return failure");
206 return is_valid = false;
207 }
208 }
209
210 // If c is not an ASCII digit, validation error, return failure.
211 if (pointer == input.end() || !checkers::is_digit(*pointer)) {
212 ada_log(
213 "parse_ipv6 If c is not an ASCII digit, validation error, return "
214 "failure");
215 return is_valid = false;
216 }
217
218 // While c is an ASCII digit:
219 while (pointer != input.end() && checkers::is_digit(*pointer)) {
220 // Let number be c interpreted as decimal number.
221 int number = *pointer - '0';
222
223 // If ipv4Piece is null, then set ipv4Piece to number.
224 if (!ipv4_piece.has_value()) {
225 ipv4_piece = number;
226 }
227 // Otherwise, if ipv4Piece is 0, validation error, return failure.
228 else if (ipv4_piece == 0) {
229 ada_log("parse_ipv6 if ipv4Piece is 0, validation error");
230 return is_valid = false;
231 }
232 // Otherwise, set ipv4Piece to ipv4Piece times 10 + number.
233 else {
234 ipv4_piece = *ipv4_piece * 10 + number;
235 }
236
237 // If ipv4Piece is greater than 255, validation error, return failure.
238 if (ipv4_piece > 255) {
239 ada_log("parse_ipv6 ipv4_piece > 255");
240 return is_valid = false;
241 }
242
243 // Increase pointer by 1.
244 pointer++;
245 }
246
247 // Set address[pieceIndex] to address[pieceIndex] times 0x100 +
248 // ipv4Piece.
249 // https://stackoverflow.com/questions/39060852/why-does-the-addition-of-two-shorts-return-an-int
250 address[piece_index] =
251 uint16_t(address[piece_index] * 0x100 + *ipv4_piece);
252
253 // Increase numbersSeen by 1.
254 numbers_seen++;
255
256 // If numbersSeen is 2 or 4, then increase pieceIndex by 1.
257 if (numbers_seen == 2 || numbers_seen == 4) {
258 piece_index++;
259 }
260 }
261
262 // If numbersSeen is not 4, validation error, return failure.
263 if (numbers_seen != 4) {
264 return is_valid = false;
265 }
266
267 // Break.
268 break;
269 }
270 // Otherwise, if c is U+003A (:):
271 else if ((pointer != input.end()) && (*pointer == ':')) {
272 // Increase pointer by 1.
273 pointer++;
274
275 // If c is the EOF code point, validation error, return failure.
276 if (pointer == input.end()) {
277 ada_log(
278 "parse_ipv6 If c is the EOF code point, validation error, return "
279 "failure");
280 return is_valid = false;
281 }
282 }
283 // Otherwise, if c is not the EOF code point, validation error, return
284 // failure.
285 else if (pointer != input.end()) {
286 ada_log(
287 "parse_ipv6 Otherwise, if c is not the EOF code point, validation "
288 "error, return failure");
289 return is_valid = false;
290 }
291
292 // Set address[pieceIndex] to value.
293 address[piece_index] = value;
294
295 // Increase pieceIndex by 1.
296 piece_index++;
297 }
298
299 // If compress is non-null, then:
300 if (compress.has_value()) {
301 // Let swaps be pieceIndex - compress.
302 int swaps = piece_index - *compress;
303
304 // Set pieceIndex to 7.
305 piece_index = 7;
306
307 // While pieceIndex is not 0 and swaps is greater than 0,
308 // swap address[pieceIndex] with address[compress + swaps - 1], and then
309 // decrease both pieceIndex and swaps by 1.
310 while (piece_index != 0 && swaps > 0) {
311 std::swap(address[piece_index], address[*compress + swaps - 1]);
312 piece_index--;
313 swaps--;
314 }
315 }
316 // Otherwise, if compress is null and pieceIndex is not 8, validation error,
317 // return failure.
318 else if (piece_index != 8) {
319 ada_log(
320 "parse_ipv6 if compress is null and pieceIndex is not 8, validation "
321 "error, return failure");
322 return is_valid = false;
323 }
324 host = ada::serializers::ipv6(address);
325 ada_log("parse_ipv6 ", *host);
326 host_type = IPV6;
327 return true;
328}
329
330template <bool has_state_override>
331ada_really_inline bool url::parse_scheme(const std::string_view input) {
332 auto parsed_type = ada::scheme::get_scheme_type(input);
333 bool is_input_special = (parsed_type != ada::scheme::NOT_SPECIAL);
338 if (is_input_special) { // fast path!!!
339 if (has_state_override) {
340 // If url's scheme is not a special scheme and buffer is a special scheme,
341 // then return.
342 if (is_special() != is_input_special) {
343 return true;
344 }
345
346 // If url includes credentials or has a non-null port, and buffer is
347 // "file", then return.
348 if ((has_credentials() || port.has_value()) &&
349 parsed_type == ada::scheme::type::FILE) {
350 return true;
351 }
352
353 // If url's scheme is "file" and its host is an empty host, then return.
354 // An empty host is the empty string.
355 if (type == ada::scheme::type::FILE && host.has_value() &&
356 host.value().empty()) {
357 return true;
358 }
359 }
360
361 type = parsed_type;
362
363 if (has_state_override) {
364 // This is uncommon.
365 uint16_t urls_scheme_port = get_special_port();
366
367 if (urls_scheme_port) {
368 // If url's port is url's scheme's default port, then set url's port to
369 // null.
370 if (port.has_value() && *port == urls_scheme_port) {
371 port = std::nullopt;
372 }
373 }
374 }
375 } else { // slow path
376 std::string _buffer(input);
377 // Next function is only valid if the input is ASCII and returns false
378 // otherwise, but it seems that we always have ascii content so we do not
379 // need to check the return value.
380 // bool is_ascii =
381 unicode::to_lower_ascii(_buffer.data(), _buffer.size());
382
383 if (has_state_override) {
384 // If url's scheme is a special scheme and buffer is not a special scheme,
385 // then return. If url's scheme is not a special scheme and buffer is a
386 // special scheme, then return.
387 if (is_special() != ada::scheme::is_special(_buffer)) {
388 return true;
389 }
390
391 // If url includes credentials or has a non-null port, and buffer is
392 // "file", then return.
393 if ((has_credentials() || port.has_value()) && _buffer == "file") {
394 return true;
395 }
396
397 // If url's scheme is "file" and its host is an empty host, then return.
398 // An empty host is the empty string.
399 if (type == ada::scheme::type::FILE && host.has_value() &&
400 host.value().empty()) {
401 return true;
402 }
403 }
404
405 set_scheme(std::move(_buffer));
406
407 if (has_state_override) {
408 // This is uncommon.
409 uint16_t urls_scheme_port = get_special_port();
410
411 if (urls_scheme_port) {
412 // If url's port is url's scheme's default port, then set url's port to
413 // null.
414 if (port.has_value() && *port == urls_scheme_port) {
415 port = std::nullopt;
416 }
417 }
418 }
419 }
420
421 return true;
422}
423
424ada_really_inline bool url::parse_host(std::string_view input) {
425 ada_log("parse_host ", input, " [", input.size(), " bytes]");
426 if (input.empty()) {
427 return is_valid = false;
428 } // technically unnecessary.
429 // If input starts with U+005B ([), then:
430 if (input[0] == '[') {
431 // If input does not end with U+005D (]), validation error, return failure.
432 if (input.back() != ']') {
433 return is_valid = false;
434 }
435 ada_log("parse_host ipv6");
436
437 // Return the result of IPv6 parsing input with its leading U+005B ([) and
438 // trailing U+005D (]) removed.
439 input.remove_prefix(1);
440 input.remove_suffix(1);
441 return parse_ipv6(input);
442 }
443
444 // If isNotSpecial is true, then return the result of opaque-host parsing
445 // input.
446 if (!is_special()) {
447 return parse_opaque_host(input);
448 }
449 // Let domain be the result of running UTF-8 decode without BOM on the
450 // percent-decoding of input. Let asciiDomain be the result of running domain
451 // to ASCII with domain and false. The most common case is an ASCII input, in
452 // which case we do not need to call the expensive 'to_ascii' if a few
453 // conditions are met: no '%' and no 'xn-' subsequence.
454 std::string buffer = std::string(input);
455 // This next function checks that the result is ascii, but we are going to
456 // to check anyhow with is_forbidden.
457 // bool is_ascii =
458 unicode::to_lower_ascii(buffer.data(), buffer.size());
459 bool is_forbidden = unicode::contains_forbidden_domain_code_point(
460 buffer.data(), buffer.size());
461 if (is_forbidden == 0 && buffer.find("xn-") == std::string_view::npos) {
462 // fast path
463 host = std::move(buffer);
464 if (checkers::is_ipv4(host.value())) {
465 ada_log("parse_host fast path ipv4");
466 return parse_ipv4(host.value());
467 }
468 ada_log("parse_host fast path ", *host);
469 return true;
470 }
471 ada_log("parse_host calling to_ascii");
472 is_valid = ada::unicode::to_ascii(host, input, input.find('%'));
473 if (!is_valid) {
474 ada_log("parse_host to_ascii returns false");
475 return is_valid = false;
476 }
477 ada_log("parse_host to_ascii succeeded ", *host, " [", host->size(),
478 " bytes]");
479
480 if (std::any_of(host.value().begin(), host.value().end(),
481 ada::unicode::is_forbidden_domain_code_point)) {
482 host = std::nullopt;
483 return is_valid = false;
484 }
485
486 // If asciiDomain ends in a number, then return the result of IPv4 parsing
487 // asciiDomain.
488 if (checkers::is_ipv4(host.value())) {
489 ada_log("parse_host got ipv4 ", *host);
490 return parse_ipv4(host.value());
491 }
492
493 return true;
494}
495
496ada_really_inline void url::parse_path(std::string_view input) {
497 ada_log("parse_path ", input);
498 std::string tmp_buffer;
499 std::string_view internal_input;
500 if (unicode::has_tabs_or_newline(input)) {
501 tmp_buffer = input;
502 // Optimization opportunity: Instead of copying and then pruning, we could
503 // just directly build the string from user_input.
504 helpers::remove_ascii_tab_or_newline(tmp_buffer);
505 internal_input = tmp_buffer;
506 } else {
507 internal_input = input;
508 }
509
510 // If url is special, then:
511 if (is_special()) {
512 if (internal_input.empty()) {
513 path = "/";
514 } else if ((internal_input[0] == '/') || (internal_input[0] == '\\')) {
515 helpers::parse_prepared_path(internal_input.substr(1), type, path);
516 return;
517 } else {
518 helpers::parse_prepared_path(internal_input, type, path);
519 return;
520 }
521 } else if (!internal_input.empty()) {
522 if (internal_input[0] == '/') {
523 helpers::parse_prepared_path(internal_input.substr(1), type, path);
524 return;
525 } else {
526 helpers::parse_prepared_path(internal_input, type, path);
527 return;
528 }
529 } else {
530 if (!host.has_value()) {
531 path = "/";
532 }
533 }
534}
535
536[[nodiscard]] std::string url::to_string() const {
537 if (!is_valid) {
538 return "null";
539 }
540 std::string answer;
541 auto back = std::back_insert_iterator(answer);
542 answer.append("{\n");
543 answer.append("\t\"protocol\":\"");
544 helpers::encode_json(get_protocol(), back);
545 answer.append("\",\n");
546 if (has_credentials()) {
547 answer.append("\t\"username\":\"");
548 helpers::encode_json(username, back);
549 answer.append("\",\n");
550 answer.append("\t\"password\":\"");
551 helpers::encode_json(password, back);
552 answer.append("\",\n");
553 }
554 if (host.has_value()) {
555 answer.append("\t\"host\":\"");
556 helpers::encode_json(host.value(), back);
557 answer.append("\",\n");
558 }
559 if (port.has_value()) {
560 answer.append("\t\"port\":\"");
561 answer.append(std::to_string(port.value()));
562 answer.append("\",\n");
563 }
564 answer.append("\t\"path\":\"");
565 helpers::encode_json(path, back);
566 answer.append("\",\n");
567 answer.append("\t\"opaque path\":");
568 answer.append((has_opaque_path ? "true" : "false"));
569 if (has_search()) {
570 answer.append(",\n");
571 answer.append("\t\"query\":\"");
572 helpers::encode_json(query.value(), back);
573 answer.append("\"");
574 }
575 if (hash.has_value()) {
576 answer.append(",\n");
577 answer.append("\t\"hash\":\"");
578 helpers::encode_json(hash.value(), back);
579 answer.append("\"");
580 }
581 answer.append("\n}");
582 return answer;
583}
584
585[[nodiscard]] bool url::has_valid_domain() const noexcept {
586 if (!host.has_value()) {
587 return false;
588 }
589 return checkers::verify_dns_length(host.value());
590}
591
592} // namespace ada
Includes all definitions for Ada.
#define ada_really_inline
Definition common_defs.h:84
constexpr uint8_t C0_CONTROL_PERCENT_ENCODE[32]
bool has_hex_prefix(std::string_view input)
constexpr bool is_digit(char x) noexcept
constexpr ada::scheme::type get_scheme_type(std::string_view scheme) noexcept
Definition scheme-inl.h:72
@ NOT_SPECIAL
Definition scheme.h:32
constexpr uint16_t get_special_port(std::string_view scheme) noexcept
Definition scheme-inl.h:57
std::string ipv6(const std::array< uint16_t, 8 > &address) noexcept
std::string ipv4(uint64_t address) noexcept
Definition ada_idna.h:13
@ IPV6
Definition url_base.h:32
@ IPV4
Definition url_base.h:27
Declarations for the URL scheme.
ada_really_inline bool is_special() const noexcept
url_host_type host_type
Definition url_base.h:60
bool is_valid
Definition url_base.h:50
bool has_opaque_path
Definition url_base.h:55
ada_really_inline bool has_credentials() const noexcept
Definition url-inl.h:19
std::string to_string() const override
Definition url.cpp:536
std::string get_protocol() const noexcept
bool has_valid_domain() const noexcept override
Definition url.cpp:585
bool has_search() const noexcept override
Definition url-inl.h:159