Ada 3.4.0
Fast spec-compliant URL parser
Loading...
Searching...
No Matches
url_aggregator.cpp
Go to the documentation of this file.
1#include "ada/checkers-inl.h"
2#include "ada/helpers.h"
4#include "ada/scheme.h"
5#include "ada/unicode-inl.h"
10
11#include <iterator>
12#include <ranges>
13#include <string>
14#include <string_view>
15
16namespace ada {
17template <bool has_state_override>
18[[nodiscard]] ada_really_inline bool url_aggregator::parse_scheme_with_colon(
19 const std::string_view input_with_colon) {
20 ada_log("url_aggregator::parse_scheme_with_colon ", input_with_colon);
22 ADA_ASSERT_TRUE(!helpers::overlaps(input_with_colon, buffer));
23 std::string_view input{input_with_colon};
24 input.remove_suffix(1);
25 auto parsed_type = ada::scheme::get_scheme_type(input);
26 const bool is_input_special = (parsed_type != ada::scheme::NOT_SPECIAL);
31 if (is_input_special) { // fast path!!!
32 if constexpr (has_state_override) {
33 // If url's scheme is not a special scheme and buffer is a special scheme,
34 // then return.
35 if (is_special() != is_input_special) {
36 return false;
37 }
38
39 // If url includes credentials or has a non-null port, and buffer is
40 // "file", then return.
41 if ((has_credentials() || components.port != url_components::omitted) &&
42 parsed_type == ada::scheme::type::FILE) {
43 return false;
44 }
45
46 // If url's scheme is "file" and its host is an empty host, then return.
47 // An empty host is the empty string.
48 if (type == ada::scheme::type::FILE &&
49 components.host_start == components.host_end) {
50 return false;
51 }
52 }
53
54 type = parsed_type;
55 set_scheme_from_view_with_colon(input_with_colon);
56
57 if constexpr (has_state_override) {
58 // This is uncommon.
59 uint16_t urls_scheme_port = get_special_port();
60
61 // If url's port is url's scheme's default port, then set url's port to
62 // null.
63 if (components.port == urls_scheme_port) {
64 clear_port();
65 }
66 }
67 } else { // slow path
68 std::string _buffer(input);
69 // Next function is only valid if the input is ASCII and returns false
70 // otherwise, but it seems that we always have ascii content so we do not
71 // need to check the return value.
72 unicode::to_lower_ascii(_buffer.data(), _buffer.size());
73
74 if constexpr (has_state_override) {
75 // If url's scheme is a special scheme and buffer is not a special scheme,
76 // then return. If url's scheme is not a special scheme and buffer is a
77 // special scheme, then return.
78 if (is_special() != ada::scheme::is_special(_buffer)) {
79 return true;
80 }
81
82 // If url includes credentials or has a non-null port, and buffer is
83 // "file", then return.
84 if ((has_credentials() || components.port != url_components::omitted) &&
85 _buffer == "file") {
86 return true;
87 }
88
89 // If url's scheme is "file" and its host is an empty host, then return.
90 // An empty host is the empty string.
91 if (type == ada::scheme::type::FILE &&
92 components.host_start == components.host_end) {
93 return true;
94 }
95 }
96
97 set_scheme(_buffer);
98
99 if constexpr (has_state_override) {
100 // This is uncommon.
101 uint16_t urls_scheme_port = get_special_port();
102
103 // If url's port is url's scheme's default port, then set url's port to
104 // null.
105 if (components.port == urls_scheme_port) {
106 clear_port();
107 }
108 }
109 }
111 return true;
112}
113
114inline void url_aggregator::copy_scheme(const url_aggregator& u) noexcept {
115 ada_log("url_aggregator::copy_scheme ", u.buffer);
116 ADA_ASSERT_TRUE(validate());
117 // next line could overflow but unsigned arithmetic has well-defined
118 // overflows.
119 uint32_t new_difference = u.components.protocol_end - components.protocol_end;
120 type = u.type;
121 buffer.erase(0, components.protocol_end);
122 buffer.insert(0, u.get_protocol());
123 components.protocol_end = u.components.protocol_end;
124
125 // No need to update the components
126 if (new_difference == 0) {
127 return;
128 }
129
130 // Update the rest of the components.
131 components.username_end += new_difference;
132 components.host_start += new_difference;
133 components.host_end += new_difference;
134 components.pathname_start += new_difference;
135 if (components.search_start != url_components::omitted) {
136 components.search_start += new_difference;
137 }
138 if (components.hash_start != url_components::omitted) {
139 components.hash_start += new_difference;
140 }
141 ADA_ASSERT_TRUE(validate());
142}
143
144inline void url_aggregator::set_scheme_from_view_with_colon(
145 std::string_view new_scheme_with_colon) noexcept {
146 ada_log("url_aggregator::set_scheme_from_view_with_colon ",
147 new_scheme_with_colon);
148 ADA_ASSERT_TRUE(validate());
149 ADA_ASSERT_TRUE(!new_scheme_with_colon.empty() &&
150 new_scheme_with_colon.back() == ':');
151 // next line could overflow but unsigned arithmetic has well-defined
152 // overflows.
153 uint32_t new_difference =
154 uint32_t(new_scheme_with_colon.size()) - components.protocol_end;
155
156 if (buffer.empty()) {
157 buffer.append(new_scheme_with_colon);
158 } else {
159 buffer.erase(0, components.protocol_end);
160 buffer.insert(0, new_scheme_with_colon);
161 }
162 components.protocol_end += new_difference;
163
164 // Update the rest of the components.
165 components.username_end += new_difference;
166 components.host_start += new_difference;
167 components.host_end += new_difference;
168 components.pathname_start += new_difference;
169 if (components.search_start != url_components::omitted) {
170 components.search_start += new_difference;
171 }
172 if (components.hash_start != url_components::omitted) {
173 components.hash_start += new_difference;
174 }
175 ADA_ASSERT_TRUE(validate());
176}
177
178inline void url_aggregator::set_scheme(std::string_view new_scheme) noexcept {
179 ada_log("url_aggregator::set_scheme ", new_scheme);
180 ADA_ASSERT_TRUE(validate());
181 ADA_ASSERT_TRUE(new_scheme.empty() || new_scheme.back() != ':');
182 // next line could overflow but unsigned arithmetic has well-defined
183 // overflows.
184 uint32_t new_difference =
185 uint32_t(new_scheme.size()) - components.protocol_end + 1;
186
188 if (buffer.empty()) {
189 buffer.append(helpers::concat(new_scheme, ":"));
190 } else {
191 buffer.erase(0, components.protocol_end);
192 buffer.insert(0, helpers::concat(new_scheme, ":"));
193 }
194 components.protocol_end = uint32_t(new_scheme.size() + 1);
195
196 // Update the rest of the components.
197 components.username_end += new_difference;
198 components.host_start += new_difference;
199 components.host_end += new_difference;
200 components.pathname_start += new_difference;
201 if (components.search_start != url_components::omitted) {
202 components.search_start += new_difference;
203 }
204 if (components.hash_start != url_components::omitted) {
205 components.hash_start += new_difference;
206 }
207 ADA_ASSERT_TRUE(validate());
208}
209
210bool url_aggregator::set_protocol(const std::string_view input) {
211 ada_log("url_aggregator::set_protocol ", input);
213 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
214 std::string view(input);
215 helpers::remove_ascii_tab_or_newline(view);
216 if (view.empty()) {
217 return true;
218 }
219
220 // Schemes should start with alpha values.
221 if (!checkers::is_alpha(view[0])) {
222 return false;
223 }
224
225 view.append(":");
226
227 std::string::iterator pointer =
228 std::ranges::find_if_not(view, unicode::is_alnum_plus);
229
230 if (pointer != view.end() && *pointer == ':') {
231 return parse_scheme_with_colon<true>(
232 view.substr(0, pointer - view.begin() + 1));
233 }
234 return false;
235}
236
237bool url_aggregator::set_username(const std::string_view input) {
238 ada_log("url_aggregator::set_username '", input, "' ");
240 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
241 if (cannot_have_credentials_or_port()) {
242 return false;
243 }
246 if (idx == input.size()) {
247 update_base_username(input);
248 } else {
249 // We only create a temporary string if we have to!
250 update_base_username(ada::unicode::percent_encode(
252 }
254 return true;
255}
256
257bool url_aggregator::set_password(const std::string_view input) {
258 ada_log("url_aggregator::set_password '", input, "'");
260 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
261 if (cannot_have_credentials_or_port()) {
262 return false;
263 }
266 if (idx == input.size()) {
267 update_base_password(input);
268 } else {
269 // We only create a temporary string if we have to!
270 update_base_password(ada::unicode::percent_encode(
272 }
274 return true;
275}
276
277bool url_aggregator::set_port(const std::string_view input) {
278 ada_log("url_aggregator::set_port ", input);
280 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
281 if (cannot_have_credentials_or_port()) {
282 return false;
283 }
284
285 if (input.empty()) {
286 clear_port();
287 return true;
288 }
289
290 std::string trimmed(input);
291 helpers::remove_ascii_tab_or_newline(trimmed);
292
293 if (trimmed.empty()) {
294 return true;
295 }
296
297 // Input should not start with a non-digit character.
298 if (!ada::unicode::is_ascii_digit(trimmed.front())) {
299 return false;
300 }
301
302 // Find the first non-digit character to determine the length of digits
303 auto first_non_digit =
304 std::ranges::find_if_not(trimmed, ada::unicode::is_ascii_digit);
305 std::string_view digits_to_parse =
306 std::string_view(trimmed.data(), first_non_digit - trimmed.begin());
307
308 // Revert changes if parse_port fails.
309 uint32_t previous_port = components.port;
310 parse_port(digits_to_parse);
311 if (is_valid) {
312 return true;
313 }
314 update_base_port(previous_port);
315 is_valid = true;
317 return false;
318}
319
320bool url_aggregator::set_pathname(const std::string_view input) {
321 ada_log("url_aggregator::set_pathname ", input);
323 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
324 if (has_opaque_path) {
325 return false;
326 }
327 clear_pathname();
328 parse_path(input);
329 if (get_pathname().starts_with("//") && !has_authority() && !has_dash_dot()) {
330 buffer.insert(components.pathname_start, "/.");
331 components.pathname_start += 2;
332 }
334 return true;
335}
336
337ada_really_inline void url_aggregator::parse_path(std::string_view input) {
338 ada_log("url_aggregator::parse_path ", input);
340 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
341 std::string tmp_buffer;
342 std::string_view internal_input;
343 if (unicode::has_tabs_or_newline(input)) {
344 tmp_buffer = input;
345 // Optimization opportunity: Instead of copying and then pruning, we could
346 // just directly build the string from user_input.
347 helpers::remove_ascii_tab_or_newline(tmp_buffer);
348 internal_input = tmp_buffer;
349 } else {
350 internal_input = input;
351 }
352
353 // If url is special, then:
354 if (is_special()) {
355 if (internal_input.empty()) {
356 update_base_pathname("/");
357 } else if ((internal_input[0] == '/') || (internal_input[0] == '\\')) {
358 consume_prepared_path(internal_input.substr(1));
359 } else {
360 consume_prepared_path(internal_input);
361 }
362 } else if (!internal_input.empty()) {
363 if (internal_input[0] == '/') {
364 consume_prepared_path(internal_input.substr(1));
365 } else {
366 consume_prepared_path(internal_input);
367 }
368 } else {
369 // Non-special URLs with an empty host can have their paths erased
370 // Path-only URLs cannot have their paths erased
371 if (components.host_start == components.host_end && !has_authority()) {
372 update_base_pathname("/");
373 }
374 }
376}
377
378void url_aggregator::set_search(const std::string_view input) {
379 ada_log("url_aggregator::set_search ", input);
381 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
382 if (input.empty()) {
383 clear_search();
384 helpers::strip_trailing_spaces_from_opaque_path(*this);
385 return;
386 }
387
388 std::string new_value;
389 new_value = input[0] == '?' ? input.substr(1) : input;
390 helpers::remove_ascii_tab_or_newline(new_value);
391
392 auto query_percent_encode_set =
395
396 update_base_search(new_value, query_percent_encode_set);
398}
399
400void url_aggregator::set_hash(const std::string_view input) {
401 ada_log("url_aggregator::set_hash ", input);
403 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
404 if (input.empty()) {
405 if (components.hash_start != url_components::omitted) {
406 buffer.resize(components.hash_start);
407 components.hash_start = url_components::omitted;
408 }
409 helpers::strip_trailing_spaces_from_opaque_path(*this);
410 return;
411 }
412
413 std::string new_value;
414 new_value = input[0] == '#' ? input.substr(1) : input;
415 helpers::remove_ascii_tab_or_newline(new_value);
416 update_unencoded_base_hash(new_value);
418}
419
420bool url_aggregator::set_href(const std::string_view input) {
421 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
422 ada_log("url_aggregator::set_href ", input, " [", input.size(), " bytes]");
424 ada_log("url_aggregator::set_href, success :", out.has_value());
425
426 if (out) {
427 ada_log("url_aggregator::set_href, parsed ", out->to_string());
428 // TODO: Figure out why the following line puts test to never finish.
429 *this = *out;
430 }
431
432 return out.has_value();
433}
434
435ada_really_inline bool url_aggregator::parse_host(std::string_view input) {
436 ada_log("url_aggregator:parse_host \"", input, "\" [", input.size(),
437 " bytes]");
439 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
440 if (input.empty()) {
441 return is_valid = false;
442 } // technically unnecessary.
443 // If input starts with U+005B ([), then:
444 if (input[0] == '[') {
445 // If input does not end with U+005D (]), validation error, return failure.
446 if (input.back() != ']') {
447 return is_valid = false;
448 }
449 ada_log("parse_host ipv6");
450
451 // Return the result of IPv6 parsing input with its leading U+005B ([) and
452 // trailing U+005D (]) removed.
453 input.remove_prefix(1);
454 input.remove_suffix(1);
455 return parse_ipv6(input);
456 }
457
458 // If isNotSpecial is true, then return the result of opaque-host parsing
459 // input.
460 if (!is_special()) {
461 return parse_opaque_host(input);
462 }
463 // Let domain be the result of running UTF-8 decode without BOM on the
464 // percent-decoding of input. Let asciiDomain be the result of running domain
465 // to ASCII with domain and false. The most common case is an ASCII input, in
466 // which case we do not need to call the expensive 'to_ascii' if a few
467 // conditions are met: no '%' and no 'xn-' subsequence.
468
469 // Often, the input does not contain any forbidden code points, and no upper
470 // case ASCII letter, then we can just copy it to the buffer. We want to
471 // optimize for such a common case.
472
473 // Fast path: try to parse as pure decimal IPv4(a.b.c.d) first.
474 const uint64_t fast_result = checkers::try_parse_ipv4_fast(input);
475 if (fast_result < checkers::ipv4_fast_fail) {
476 // Fast path succeeded - input is pure decimal IPv4
477 if (!input.empty() && input.back() == '.') {
478 update_base_hostname(input.substr(0, input.size() - 1));
479 } else {
480 update_base_hostname(input);
481 }
482 host_type = IPV4;
483 ada_log("parse_host fast path decimal ipv4");
485 return true;
486 }
487 uint8_t is_forbidden_or_upper =
488 unicode::contains_forbidden_domain_code_point_or_upper(input.data(),
489 input.size());
490 // Minor optimization opportunity:
491 // contains_forbidden_domain_code_point_or_upper could be extend to check for
492 // the presence of characters that cannot appear in the ipv4 address and we
493 // could also check whether x and n and - are present, and so we could skip
494 // some of the checks below. However, the gains are likely to be small, and
495 // the code would be more complex.
496 if (is_forbidden_or_upper == 0 &&
497 input.find("xn-") == std::string_view::npos) {
498 // fast path
499 update_base_hostname(input);
500
501 // Check for other IPv4 formats (hex, octal, etc.)
502 if (checkers::is_ipv4(get_hostname())) {
503 ada_log("parse_host fast path ipv4");
504 return parse_ipv4(get_hostname(), true);
505 }
506 ada_log("parse_host fast path ", get_hostname());
507 return true;
508 }
509 // We have encountered at least one forbidden code point or the input contains
510 // 'xn-' (case insensitive), so we need to call 'to_ascii' to perform the full
511 // conversion.
512
513 ada_log("parse_host calling to_ascii");
514 std::optional<std::string> host = std::string(get_hostname());
515 is_valid = ada::unicode::to_ascii(host, input, input.find('%'));
516 if (!is_valid) {
517 ada_log("parse_host to_ascii returns false");
518 return is_valid = false;
519 }
520 ada_log("parse_host to_ascii succeeded ", *host, " [", host->size(),
521 " bytes]");
522
523 if (std::ranges::any_of(host.value(),
524 ada::unicode::is_forbidden_domain_code_point)) {
525 return is_valid = false;
526 }
527
528 // If asciiDomain ends in a number, then return the result of IPv4 parsing
529 // asciiDomain.
530 if (checkers::is_ipv4(host.value())) {
531 ada_log("parse_host got ipv4 ", *host);
532 return parse_ipv4(host.value(), false);
533 }
534
535 update_base_hostname(host.value());
537 return true;
538}
539
540template <bool override_hostname>
541bool url_aggregator::set_host_or_hostname(const std::string_view input) {
542 ada_log("url_aggregator::set_host_or_hostname ", input);
544 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
545 if (has_opaque_path) {
546 return false;
547 }
548
549 std::string previous_host(get_hostname());
550 uint32_t previous_port = components.port;
551
552 size_t host_end_pos = input.find('#');
553 std::string _host(input.data(), host_end_pos != std::string_view::npos
554 ? host_end_pos
555 : input.size());
556 helpers::remove_ascii_tab_or_newline(_host);
557 std::string_view new_host(_host);
558
559 // If url's scheme is "file", then set state to file host state, instead of
560 // host state.
561 if (type != ada::scheme::type::FILE) {
562 std::string_view host_view(_host.data(), _host.length());
563 auto [location, found_colon] =
564 helpers::get_host_delimiter_location(is_special(), host_view);
565
566 // Otherwise, if c is U+003A (:) and insideBrackets is false, then:
567 // Note: the 'found_colon' value is true if and only if a colon was
568 // encountered while not inside brackets.
569 if (found_colon) {
570 // If buffer is the empty string, host-missing validation error, return
571 // failure.
572 std::string_view host_buffer = host_view.substr(0, location);
573 if (host_buffer.empty()) {
574 return false;
575 }
576
577 // If state override is given and state override is hostname state, then
578 // return failure.
579 if constexpr (override_hostname) {
580 return false;
581 }
582
583 // Let host be the result of host parsing buffer with url is not special.
584 bool succeeded = parse_host(host_buffer);
585 if (!succeeded) {
586 update_base_hostname(previous_host);
587 update_base_port(previous_port);
588 return false;
589 }
590
591 // Set url's host to host, buffer to the empty string, and state to port
592 // state.
593 std::string_view port_buffer = new_host.substr(location + 1);
594 if (!port_buffer.empty()) {
595 set_port(port_buffer);
596 }
597 return true;
598 }
599 // Otherwise, if one of the following is true:
600 // - c is the EOF code point, U+002F (/), U+003F (?), or U+0023 (#)
601 // - url is special and c is U+005C (\‍)
602 else {
603 // If url is special and host_view is the empty string, host-missing
604 // validation error, return failure.
605 if (host_view.empty() && is_special()) {
606 return false;
607 }
608
609 // Otherwise, if state override is given, host_view is the empty string,
610 // and either url includes credentials or url's port is non-null, then
611 // return failure.
612 if (host_view.empty() && (has_credentials() || has_port())) {
613 return false;
614 }
615
616 // Let host be the result of host parsing host_view with url is not
617 // special.
618 if (host_view.empty() && !is_special()) {
619 if (has_hostname()) {
620 clear_hostname(); // easy!
621 } else if (has_dash_dot()) {
622 add_authority_slashes_if_needed();
623 delete_dash_dot();
624 }
625 return true;
626 }
627
628 bool succeeded = parse_host(host_view);
629 if (!succeeded) {
630 update_base_hostname(previous_host);
631 update_base_port(previous_port);
632 return false;
633 } else if (has_dash_dot()) {
634 // Should remove dash_dot from pathname
635 delete_dash_dot();
636 }
637 return true;
638 }
639 }
640
641 size_t location = new_host.find_first_of("/\\?");
642 if (location != std::string_view::npos) {
643 new_host.remove_suffix(new_host.length() - location);
644 }
645
646 if (new_host.empty()) {
647 // Set url's host to the empty string.
648 clear_hostname();
649 } else {
650 // Let host be the result of host parsing buffer with url is not special.
651 if (!parse_host(new_host)) {
652 update_base_hostname(previous_host);
653 update_base_port(previous_port);
654 return false;
655 }
656
657 // If host is "localhost", then set host to the empty string.
658 if (helpers::substring(buffer, components.host_start,
659 components.host_end) == "localhost") {
660 clear_hostname();
661 }
662 }
664 return true;
665}
666
667bool url_aggregator::set_host(const std::string_view input) {
668 ada_log("url_aggregator::set_host '", input, "'");
670 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
671 return set_host_or_hostname<false>(input);
672}
673
674bool url_aggregator::set_hostname(const std::string_view input) {
675 ada_log("url_aggregator::set_hostname '", input, "'");
677 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
678 return set_host_or_hostname<true>(input);
679}
680
681[[nodiscard]] std::string url_aggregator::get_origin() const noexcept {
682 ada_log("url_aggregator::get_origin");
683 if (is_special()) {
684 // Return a new opaque origin.
685 if (type == scheme::FILE) {
686 return "null";
687 }
688
689 return helpers::concat(get_protocol(), "//", get_host());
690 }
691
692 if (get_protocol() == "blob:") {
693 std::string_view path = get_pathname();
694 if (!path.empty()) {
695 auto out = ada::parse<ada::url_aggregator>(path);
696 if (out && (out->type == scheme::HTTP || out->type == scheme::HTTPS)) {
697 // If pathURL's scheme is not "http" and not "https", then return a
698 // new opaque origin.
699 return helpers::concat(out->get_protocol(), "//", out->get_host());
700 }
701 }
702 }
703
704 // Return a new opaque origin.
705 return "null";
706}
707
708[[nodiscard]] std::string_view url_aggregator::get_username() const noexcept
710 ada_log("url_aggregator::get_username");
712 return helpers::substring(buffer, components.protocol_end + 2,
713 components.username_end);
714 }
715 return "";
716}
717
718[[nodiscard]] std::string_view url_aggregator::get_password() const noexcept
720 ada_log("url_aggregator::get_password");
722 return helpers::substring(buffer, components.username_end + 1,
723 components.host_start);
724 }
725 return "";
726}
727
728[[nodiscard]] std::string_view url_aggregator::get_port() const noexcept
730 ada_log("url_aggregator::get_port");
731 if (components.port == url_components::omitted) {
732 return "";
733 }
734 return helpers::substring(buffer, components.host_end + 1,
735 components.pathname_start);
736}
737
738[[nodiscard]] std::string_view url_aggregator::get_hash() const noexcept
740 ada_log("url_aggregator::get_hash");
741 // If this's URL's fragment is either null or the empty string, then return
742 // the empty string. Return U+0023 (#), followed by this's URL's fragment.
743 if (components.hash_start == url_components::omitted) {
744 return "";
745 }
746 if (buffer.size() - components.hash_start <= 1) {
747 return "";
748 }
749 return helpers::substring(buffer, components.hash_start);
750}
751
752[[nodiscard]] std::string_view url_aggregator::get_host() const noexcept
754 ada_log("url_aggregator::get_host");
755 // Technically, we should check if there is a hostname, but
756 // the code below works even if there isn't.
757 // if(!has_hostname()) { return ""; }
758 size_t start = components.host_start;
759 if (components.host_end > components.host_start &&
760 buffer[components.host_start] == '@') {
761 start++;
762 }
763 // if we have an empty host, then the space between components.host_end and
764 // components.pathname_start may be occupied by /.
765 if (start == components.host_end) {
766 return {};
767 }
768 return helpers::substring(buffer, start, components.pathname_start);
769}
770
771[[nodiscard]] std::string_view url_aggregator::get_hostname() const noexcept
773 ada_log("url_aggregator::get_hostname");
774 // Technically, we should check if there is a hostname, but
775 // the code below works even if there isn't.
776 // if(!has_hostname()) { return ""; }
777 size_t start = components.host_start;
778 // So host_start is not where the host begins.
779 if (components.host_end > components.host_start &&
780 buffer[components.host_start] == '@') {
781 start++;
782 }
783 return helpers::substring(buffer, start, components.host_end);
784}
785
786[[nodiscard]] std::string_view url_aggregator::get_search() const noexcept
788 ada_log("url_aggregator::get_search");
789 // If this's URL's query is either null or the empty string, then return the
790 // empty string. Return U+003F (?), followed by this's URL's query.
791 if (components.search_start == url_components::omitted) {
792 return "";
793 }
794 auto ending_index = uint32_t(buffer.size());
795 if (components.hash_start != url_components::omitted) {
796 ending_index = components.hash_start;
797 }
798 if (ending_index - components.search_start <= 1) {
799 return "";
800 }
801 return helpers::substring(buffer, components.search_start, ending_index);
802}
803
804[[nodiscard]] std::string_view url_aggregator::get_protocol() const noexcept
806 ada_log("url_aggregator::get_protocol");
807 return helpers::substring(buffer, 0, components.protocol_end);
808}
809
810[[nodiscard]] std::string ada::url_aggregator::to_string() const {
811 ada_log("url_aggregator::to_string buffer:", buffer, " [", buffer.size(),
812 " bytes]");
813 if (!is_valid) {
814 return "null";
815 }
816
817 std::string answer;
818 auto back = std::back_insert_iterator(answer);
819 answer.append("{\n");
820
821 answer.append("\t\"buffer\":\"");
822 helpers::encode_json(buffer, back);
823 answer.append("\",\n");
824
825 answer.append("\t\"protocol\":\"");
826 helpers::encode_json(get_protocol(), back);
827 answer.append("\",\n");
828
829 if (has_credentials()) {
830 answer.append("\t\"username\":\"");
831 helpers::encode_json(get_username(), back);
832 answer.append("\",\n");
833 answer.append("\t\"password\":\"");
834 helpers::encode_json(get_password(), back);
835 answer.append("\",\n");
836 }
837
838 answer.append("\t\"host\":\"");
839 helpers::encode_json(get_host(), back);
840 answer.append("\",\n");
841
842 answer.append("\t\"path\":\"");
843 helpers::encode_json(get_pathname(), back);
844 answer.append("\",\n");
845 answer.append("\t\"opaque path\":");
846 answer.append((has_opaque_path ? "true" : "false"));
847 answer.append(",\n");
848
849 if (components.search_start != url_components::omitted) {
850 answer.append("\t\"query\":\"");
851 helpers::encode_json(get_search(), back);
852 answer.append("\",\n");
853 }
854 if (components.hash_start != url_components::omitted) {
855 answer.append("\t\"fragment\":\"");
856 helpers::encode_json(get_hash(), back);
857 answer.append("\",\n");
858 }
859
860 auto convert_offset_to_string = [](uint32_t offset) -> std::string {
861 if (offset == url_components::omitted) {
862 return "null";
863 } else {
864 return std::to_string(offset);
865 }
866 };
867
868 answer.append("\t\"protocol_end\":");
869 answer.append(convert_offset_to_string(components.protocol_end));
870 answer.append(",\n");
871
872 answer.append("\t\"username_end\":");
873 answer.append(convert_offset_to_string(components.username_end));
874 answer.append(",\n");
875
876 answer.append("\t\"host_start\":");
877 answer.append(convert_offset_to_string(components.host_start));
878 answer.append(",\n");
879
880 answer.append("\t\"host_end\":");
881 answer.append(convert_offset_to_string(components.host_end));
882 answer.append(",\n");
883
884 answer.append("\t\"port\":");
885 answer.append(convert_offset_to_string(components.port));
886 answer.append(",\n");
887
888 answer.append("\t\"pathname_start\":");
889 answer.append(convert_offset_to_string(components.pathname_start));
890 answer.append(",\n");
891
892 answer.append("\t\"search_start\":");
893 answer.append(convert_offset_to_string(components.search_start));
894 answer.append(",\n");
895
896 answer.append("\t\"hash_start\":");
897 answer.append(convert_offset_to_string(components.hash_start));
898 answer.append("\n}");
899
900 return answer;
901}
902
903[[nodiscard]] bool url_aggregator::has_valid_domain() const noexcept {
904 if (components.host_start == components.host_end) {
905 return false;
906 }
907 return checkers::verify_dns_length(get_hostname());
908}
909
910bool url_aggregator::parse_ipv4(std::string_view input, bool in_place) {
911 ada_log("parse_ipv4 ", input, " [", input.size(),
912 " bytes], overlaps with buffer: ",
913 helpers::overlaps(input, buffer) ? "yes" : "no");
915 const bool trailing_dot = (input.back() == '.');
916 if (trailing_dot) {
917 input.remove_suffix(1);
918 }
919 size_t digit_count{0};
920 int pure_decimal_count = 0; // entries that are decimal
921 uint64_t ipv4{0};
922 // we could unroll for better performance?
923 for (; (digit_count < 4) && !(input.empty()); digit_count++) {
924 uint32_t
925 segment_result{}; // If any number exceeds 32 bits, we have an error.
926 bool is_hex = checkers::has_hex_prefix(input);
927 if (is_hex && ((input.length() == 2) ||
928 ((input.length() > 2) && (input[2] == '.')))) {
929 // special case
930 segment_result = 0;
931 input.remove_prefix(2);
932 } else {
933 std::from_chars_result r{};
934 if (is_hex) {
935 ada_log("parse_ipv4 trying to parse hex number");
936 r = std::from_chars(input.data() + 2, input.data() + input.size(),
937 segment_result, 16);
938 } else if ((input.length() >= 2) && input[0] == '0' &&
939 checkers::is_digit(input[1])) {
940 ada_log("parse_ipv4 trying to parse octal number");
941 r = std::from_chars(input.data() + 1, input.data() + input.size(),
942 segment_result, 8);
943 } else {
944 ada_log("parse_ipv4 trying to parse decimal number");
945 pure_decimal_count++;
946 r = std::from_chars(input.data(), input.data() + input.size(),
947 segment_result, 10);
948 }
949 if (r.ec != std::errc()) {
950 ada_log("parse_ipv4 parsing failed");
951 return is_valid = false;
952 }
953 ada_log("parse_ipv4 parsed ", segment_result);
954 input.remove_prefix(r.ptr - input.data());
955 }
956 if (input.empty()) {
957 // We have the last value.
958 // At this stage, ipv4 contains digit_count*8 bits.
959 // So we have 32-digit_count*8 bits left.
960 if (segment_result >= (uint64_t(1) << (32 - digit_count * 8))) {
961 return is_valid = false;
962 }
963 ipv4 <<= (32 - digit_count * 8);
964 ipv4 |= segment_result;
965 goto final;
966 } else {
967 // There is more, so that the value must no be larger than 255
968 // and we must have a '.'.
969 if ((segment_result > 255) || (input[0] != '.')) {
970 return is_valid = false;
971 }
972 ipv4 <<= 8;
973 ipv4 |= segment_result;
974 input.remove_prefix(1); // remove '.'
975 }
976 }
977 if ((digit_count != 4) || (!input.empty())) {
978 ada_log("parse_ipv4 found invalid (more than 4 numbers or empty) ");
979 return is_valid = false;
980 }
981final:
982 ada_log("url_aggregator::parse_ipv4 completed ", get_href(),
983 " host: ", get_host());
984
985 // We could also check r.ptr to see where the parsing ended.
986 if (in_place && pure_decimal_count == 4 && !trailing_dot) {
987 ada_log(
988 "url_aggregator::parse_ipv4 completed and was already correct in the "
989 "buffer");
990 // The original input was already all decimal and we validated it. So we
991 // don't need to do anything.
992 } else {
993 ada_log("url_aggregator::parse_ipv4 completed and we need to update it");
994 // Optimization opportunity: Get rid of unnecessary string return in ipv4
995 // serializer.
996 // TODO: This is likely a bug because it goes back update_base_hostname, not
997 // what we want to do.
998 update_base_hostname(
999 ada::serializers::ipv4(ipv4)); // We have to reserialize the address.
1000 }
1001 host_type = IPV4;
1003 return true;
1004}
1005
// Parses `input` as an IPv6 address and, on success, stores its serialized
// form as the hostname.
//
// @param input candidate address text; asserted not to overlap `buffer`.
//        NOTE(review): presumably the caller has already stripped the
//        surrounding '[' and ']' -- confirm at the call site.
// @return true on success, after calling update_base_hostname() with
//         ada::serializers::ipv6(address) and setting host_type to IPV6.
//         On any validation error, is_valid is set to false and false is
//         returned.
//
// The step-by-step comments below read like quotations from a specification
// of the IPv6 parsing algorithm (NOTE(review): presumably the WHATWG URL
// standard -- confirm).
1006bool url_aggregator::parse_ipv6(std::string_view input) {
1007 // TODO: Implement in_place optimization: we know that input points
1008 // in the buffer, so we can just check whether the buffer is already
1009 // well formatted.
1010 // TODO: Find a way to merge parse_ipv6 with url.cpp implementation.
1011 ada_log("parse_ipv6 ", input, " [", input.size(), " bytes]");
1013 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
1014 if (input.empty()) {
// `return is_valid = false` records the failure on the object and returns
// false in one statement; every validation error below uses this pattern.
1015 return is_valid = false;
1016 }
1017 // Let address be a new IPv6 address whose IPv6 pieces are all 0.
1018 std::array<uint16_t, 8> address{};
1019
1020 // Let pieceIndex be 0.
1021 int piece_index = 0;
1022
1023 // Let compress be null.
1024 std::optional<int> compress{};
1025
1026 // Let pointer be a pointer for input.
1027 std::string_view::iterator pointer = input.begin();
1028
1029 // If c is U+003A (:), then:
1030 if (input[0] == ':') {
1031 // If remaining does not start with U+003A (:), validation error, return
1032 // failure.
1033 if (input.size() == 1 || input[1] != ':') {
1034 ada_log("parse_ipv6 starts with : but the rest does not start with :");
1035 return is_valid = false;
1036 }
1037
1038 // Increase pointer by 2.
1039 pointer += 2;
1040
1041 // Increase pieceIndex by 1 and then set compress to pieceIndex.
// A leading "::" leaves address[0] at zero and records index 1 as the place
// where the compressed run of zero pieces begins.
1042 compress = ++piece_index;
1043 }
1044
1045 // While c is not the EOF code point:
1046 while (pointer != input.end()) {
1047 // If pieceIndex is 8, validation error, return failure.
1048 if (piece_index == 8) {
1049 ada_log("parse_ipv6 piece_index == 8");
1050 return is_valid = false;
1051 }
1052
1053 // If c is U+003A (:), then:
1054 if (*pointer == ':') {
1055 // If compress is non-null, validation error, return failure.
1056 if (compress.has_value()) {
1057 ada_log("parse_ipv6 compress is non-null");
1058 return is_valid = false;
1059 }
1060
1061 // Increase pointer and pieceIndex by 1, set compress to pieceIndex, and
1062 // then continue.
1063 pointer++;
1064 compress = ++piece_index;
1065 continue;
1066 }
1067
1068 // Let value and length be 0.
1069 uint16_t value = 0, length = 0;
1070
1071 // While length is less than 4 and c is an ASCII hex digit,
1072 // set value to value times 0x10 + c interpreted as hexadecimal number, and
1073 // increase pointer and length by 1.
1074 while (length < 4 && pointer != input.end() &&
1075 unicode::is_ascii_hex_digit(*pointer)) {
1076 // https://stackoverflow.com/questions/39060852/why-does-the-addition-of-two-shorts-return-an-int
1077 value = uint16_t(value * 0x10 + unicode::convert_hex_to_binary(*pointer));
1078 pointer++;
1079 length++;
1080 }
1081
1082 // If c is U+002E (.), then:
// A '.' right after the digits means the rest of the input is an embedded
// dotted-decimal IPv4 address rather than another hexadecimal piece.
1083 if (pointer != input.end() && *pointer == '.') {
1084 // If length is 0, validation error, return failure.
1085 if (length == 0) {
1086 ada_log("parse_ipv6 length is 0");
1087 return is_valid = false;
1088 }
1089
1090 // Decrease pointer by length.
// Rewind over the characters just consumed as hex digits so that the loop
// below re-reads them as decimal digits.
1091 pointer -= length;
1092
1093 // If pieceIndex is greater than 6, validation error, return failure.
// The four decimal numbers fill two 16-bit pieces (pieceIndex is increased
// after the 2nd and the 4th number), so they must start at piece 6 or earlier.
1094 if (piece_index > 6) {
1095 ada_log("parse_ipv6 piece_index > 6");
1096 return is_valid = false;
1097 }
1098
1099 // Let numbersSeen be 0.
1100 int numbers_seen = 0;
1101
1102 // While c is not the EOF code point:
1103 while (pointer != input.end()) {
1104 // Let ipv4Piece be null.
1105 std::optional<uint16_t> ipv4_piece{};
1106
1107 // If numbersSeen is greater than 0, then:
1108 if (numbers_seen > 0) {
1109 // If c is a U+002E (.) and numbersSeen is less than 4, then increase
1110 // pointer by 1.
1111 if (*pointer == '.' && numbers_seen < 4) {
1112 pointer++;
1113 } else {
1114 // Otherwise, validation error, return failure.
1115 ada_log("parse_ipv6 Otherwise, validation error, return failure");
1116 return is_valid = false;
1117 }
1118 }
1119
1120 // If c is not an ASCII digit, validation error, return failure.
1121 if (pointer == input.end() || !checkers::is_digit(*pointer)) {
1122 ada_log(
1123 "parse_ipv6 If c is not an ASCII digit, validation error, return "
1124 "failure");
1125 return is_valid = false;
1126 }
1127
1128 // While c is an ASCII digit:
1129 while (pointer != input.end() && checkers::is_digit(*pointer)) {
1130 // Let number be c interpreted as decimal number.
1131 int number = *pointer - '0';
1132
1133 // If ipv4Piece is null, then set ipv4Piece to number.
1134 if (!ipv4_piece.has_value()) {
1135 ipv4_piece = number;
1136 }
1137 // Otherwise, if ipv4Piece is 0, validation error, return failure.
// In effect, a number with more than one digit may not start with '0'.
1138 else if (ipv4_piece == 0) {
1139 ada_log("parse_ipv6 if ipv4Piece is 0, validation error");
1140 return is_valid = false;
1141 }
1142 // Otherwise, set ipv4Piece to ipv4Piece times 10 + number.
1143 else {
1144 ipv4_piece = *ipv4_piece * 10 + number;
1145 }
1146
1147 // If ipv4Piece is greater than 255, validation error, return failure.
1148 if (ipv4_piece > 255) {
1149 ada_log("parse_ipv6 ipv4_piece > 255");
1150 return is_valid = false;
1151 }
1152
1153 // Increase pointer by 1.
1154 pointer++;
1155 }
1156
1157 // Set address[pieceIndex] to address[pieceIndex] times 0x100 +
1158 // ipv4Piece.
1159 // https://stackoverflow.com/questions/39060852/why-does-the-addition-of-two-shorts-return-an-int
1160 address[piece_index] =
1161 uint16_t(address[piece_index] * 0x100 + *ipv4_piece);
1162
1163 // Increase numbersSeen by 1.
1164 numbers_seen++;
1165
1166 // If numbersSeen is 2 or 4, then increase pieceIndex by 1.
1167 if (numbers_seen == 2 || numbers_seen == 4) {
1168 piece_index++;
1169 }
1170 }
1171
1172 // If numbersSeen is not 4, validation error, return failure.
1173 if (numbers_seen != 4) {
1174 return is_valid = false;
1175 }
1176
1177 // Break.
1178 break;
1179 }
1180 // Otherwise, if c is U+003A (:):
1181 else if ((pointer != input.end()) && (*pointer == ':')) {
1182 // Increase pointer by 1.
1183 pointer++;
1184
1185 // If c is the EOF code point, validation error, return failure.
1186 if (pointer == input.end()) {
1187 ada_log(
1188 "parse_ipv6 If c is the EOF code point, validation error, return "
1189 "failure");
1190 return is_valid = false;
1191 }
1192 }
1193 // Otherwise, if c is not the EOF code point, validation error, return
1194 // failure.
1195 else if (pointer != input.end()) {
1196 ada_log(
1197 "parse_ipv6 Otherwise, if c is not the EOF code point, validation "
1198 "error, return failure");
1199 return is_valid = false;
1200 }
1201
1202 // Set address[pieceIndex] to value.
1203 address[piece_index] = value;
1204
1205 // Increase pieceIndex by 1.
1206 piece_index++;
1207 }
1208
1209 // If compress is non-null, then:
// The pieces parsed after the "::" are moved to the end of the array; the
// zero-initialized pieces they are swapped with end up filling the gap.
1210 if (compress.has_value()) {
1211 // Let swaps be pieceIndex - compress.
1212 int swaps = piece_index - *compress;
1213
1214 // Set pieceIndex to 7.
1215 piece_index = 7;
1216
1217 // While pieceIndex is not 0 and swaps is greater than 0,
1218 // swap address[pieceIndex] with address[compress + swaps - 1], and then
1219 // decrease both pieceIndex and swaps by 1.
1220 while (piece_index != 0 && swaps > 0) {
1221 std::swap(address[piece_index], address[*compress + swaps - 1]);
1222 piece_index--;
1223 swaps--;
1224 }
1225 }
1226 // Otherwise, if compress is null and pieceIndex is not 8, validation error,
1227 // return failure.
1228 else if (piece_index != 8) {
1229 ada_log(
1230 "parse_ipv6 if compress is null and pieceIndex is not 8, validation "
1231 "error, return failure");
1232 return is_valid = false;
1233 }
1234 // TODO: Optimization opportunity: Get rid of unnecessary string creation.
1235 // TODO: This is likely a bug because it goes back update_base_hostname, not
1236 // what we want to do.
1237 update_base_hostname(ada::serializers::ipv6(address));
1238 ada_log("parse_ipv6 ", get_hostname());
1240 host_type = IPV6;
1241 return true;
1242}
1243
1244bool url_aggregator::parse_opaque_host(std::string_view input) {
1245 ada_log("parse_opaque_host ", input, " [", input.size(), " bytes]");
1247 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
1248 if (std::ranges::any_of(input, ada::unicode::is_forbidden_host_code_point)) {
1249 return is_valid = false;
1250 }
1251
1252 // Return the result of running UTF-8 percent-encode on input using the C0
1253 // control percent-encode set.
1256 if (idx == input.size()) {
1257 update_base_hostname(input);
1258 } else {
1259 // We only create a temporary string if we need to.
1260 update_base_hostname(ada::unicode::percent_encode(
1262 }
1264 return true;
1265}
1266
1267[[nodiscard]] std::string url_aggregator::to_diagram() const {
1268 if (!is_valid) {
1269 return "invalid";
1270 }
1271 std::string answer;
1272 answer.append(buffer);
1273 answer.append(" [");
1274 answer.append(std::to_string(buffer.size()));
1275 answer.append(" bytes]");
1276 answer.append("\n");
1277 // first line
1278 std::string line1;
1279 line1.resize(buffer.size(), ' ');
1280 if (components.hash_start != url_components::omitted) {
1281 line1[components.hash_start] = '|';
1282 }
1283 if (components.search_start != url_components::omitted) {
1284 line1[components.search_start] = '|';
1285 }
1286 if (components.pathname_start != buffer.size()) {
1287 line1[components.pathname_start] = '|';
1288 }
1289 if (components.host_end != buffer.size()) {
1290 line1[components.host_end] = '|';
1291 }
1292 if (components.host_start != buffer.size()) {
1293 line1[components.host_start] = '|';
1294 }
1295 if (components.username_end != buffer.size()) {
1296 line1[components.username_end] = '|';
1297 }
1298 if (components.protocol_end != buffer.size()) {
1299 line1[components.protocol_end] = '|';
1300 }
1301 answer.append(line1);
1302 answer.append("\n");
1303
1304 std::string line2 = line1;
1305 if (components.hash_start != url_components::omitted) {
1306 line2[components.hash_start] = '`';
1307 line1[components.hash_start] = ' ';
1308
1309 for (size_t i = components.hash_start + 1; i < line2.size(); i++) {
1310 line2[i] = '-';
1311 }
1312 line2.append(" hash_start");
1313 answer.append(line2);
1314 answer.append("\n");
1315 }
1316
1317 std::string line3 = line1;
1318 if (components.search_start != url_components::omitted) {
1319 line3[components.search_start] = '`';
1320 line1[components.search_start] = ' ';
1321
1322 for (size_t i = components.search_start + 1; i < line3.size(); i++) {
1323 line3[i] = '-';
1324 }
1325 line3.append(" search_start ");
1326 line3.append(std::to_string(components.search_start));
1327 answer.append(line3);
1328 answer.append("\n");
1329 }
1330
1331 std::string line4 = line1;
1332 if (components.pathname_start != buffer.size()) {
1333 line4[components.pathname_start] = '`';
1334 line1[components.pathname_start] = ' ';
1335 for (size_t i = components.pathname_start + 1; i < line4.size(); i++) {
1336 line4[i] = '-';
1337 }
1338 line4.append(" pathname_start ");
1339 line4.append(std::to_string(components.pathname_start));
1340 answer.append(line4);
1341 answer.append("\n");
1342 }
1343
1344 std::string line5 = line1;
1345 if (components.host_end != buffer.size()) {
1346 line5[components.host_end] = '`';
1347 line1[components.host_end] = ' ';
1348
1349 for (size_t i = components.host_end + 1; i < line5.size(); i++) {
1350 line5[i] = '-';
1351 }
1352 line5.append(" host_end ");
1353 line5.append(std::to_string(components.host_end));
1354 answer.append(line5);
1355 answer.append("\n");
1356 }
1357
1358 std::string line6 = line1;
1359 if (components.host_start != buffer.size()) {
1360 line6[components.host_start] = '`';
1361 line1[components.host_start] = ' ';
1362
1363 for (size_t i = components.host_start + 1; i < line6.size(); i++) {
1364 line6[i] = '-';
1365 }
1366 line6.append(" host_start ");
1367 line6.append(std::to_string(components.host_start));
1368 answer.append(line6);
1369 answer.append("\n");
1370 }
1371
1372 std::string line7 = line1;
1373 if (components.username_end != buffer.size()) {
1374 line7[components.username_end] = '`';
1375 line1[components.username_end] = ' ';
1376
1377 for (size_t i = components.username_end + 1; i < line7.size(); i++) {
1378 line7[i] = '-';
1379 }
1380 line7.append(" username_end ");
1381 line7.append(std::to_string(components.username_end));
1382 answer.append(line7);
1383 answer.append("\n");
1384 }
1385
1386 std::string line8 = line1;
1387 if (components.protocol_end != buffer.size()) {
1388 line8[components.protocol_end] = '`';
1389 line1[components.protocol_end] = ' ';
1390
1391 for (size_t i = components.protocol_end + 1; i < line8.size(); i++) {
1392 line8[i] = '-';
1393 }
1394 line8.append(" protocol_end ");
1395 line8.append(std::to_string(components.protocol_end));
1396 answer.append(line8);
1397 answer.append("\n");
1398 }
1399
1400 if (components.hash_start == url_components::omitted) {
1401 answer.append("note: hash omitted\n");
1402 }
1403 if (components.search_start == url_components::omitted) {
1404 answer.append("note: search omitted\n");
1405 }
1406 if (components.protocol_end > buffer.size()) {
1407 answer.append("warning: protocol_end overflows\n");
1408 }
1409 if (components.username_end > buffer.size()) {
1410 answer.append("warning: username_end overflows\n");
1411 }
1412 if (components.host_start > buffer.size()) {
1413 answer.append("warning: host_start overflows\n");
1414 }
1415 if (components.host_end > buffer.size()) {
1416 answer.append("warning: host_end overflows\n");
1417 }
1418 if (components.pathname_start > buffer.size()) {
1419 answer.append("warning: pathname_start overflows\n");
1420 }
1421 return answer;
1422}
1423
1424void url_aggregator::delete_dash_dot() {
1425 ada_log("url_aggregator::delete_dash_dot");
1427 ADA_ASSERT_TRUE(has_dash_dot());
1428 buffer.erase(components.host_end, 2);
1429 components.pathname_start -= 2;
1430 if (components.search_start != url_components::omitted) {
1431 components.search_start -= 2;
1432 }
1433 if (components.hash_start != url_components::omitted) {
1434 components.hash_start -= 2;
1435 }
1437 ADA_ASSERT_TRUE(!has_dash_dot());
1438}
1439
1440inline void url_aggregator::consume_prepared_path(std::string_view input) {
1441 ada_log("url_aggregator::consume_prepared_path ", input);
1442
1451 uint8_t accumulator = checkers::path_signature(input);
1452 // Let us first detect a trivial case.
1453 // If it is special, we check that we have no dot, no %, no \ and no
1454 // character needing percent encoding. Otherwise, we check that we have no %,
1455 // no dot, and no character needing percent encoding.
1456 constexpr uint8_t need_encoding = 1;
1457 constexpr uint8_t backslash_char = 2;
1458 constexpr uint8_t dot_char = 4;
1459 constexpr uint8_t percent_char = 8;
1460 bool special = type != ada::scheme::NOT_SPECIAL;
1461 bool may_need_slow_file_handling = (type == ada::scheme::type::FILE &&
1463 bool trivial_path =
1464 (special ? (accumulator == 0)
1465 : ((accumulator & (need_encoding | dot_char | percent_char)) ==
1466 0)) &&
1467 (!may_need_slow_file_handling);
1468 if (accumulator == dot_char && !may_need_slow_file_handling) {
1469 // '4' means that we have at least one dot, but nothing that requires
1470 // percent encoding or decoding. The only part that is not trivial is
1471 // that we may have single dots and double dots path segments.
1472 // If we have such segments, then we either have a path that begins
1473 // with '.' (easy to check), or we have the sequence './'.
1474 // Note: input cannot be empty, it must at least contain one character ('.')
1475 // Note: we know that '\' is not present.
1476 if (input[0] != '.') {
1477 size_t slashdot = 0;
1478 bool dot_is_file = true;
1479 for (;;) {
1480 slashdot = input.find("/.", slashdot);
1481 if (slashdot == std::string_view::npos) { // common case
1482 break;
1483 } else { // uncommon
1484 // only three cases matter: /./, /.. or a final /
1485 slashdot += 2;
1486 dot_is_file &= !(slashdot == input.size() || input[slashdot] == '.' ||
1487 input[slashdot] == '/');
1488 }
1489 }
1490 trivial_path = dot_is_file;
1491 }
1492 }
1493 if (trivial_path && is_at_path()) {
1494 ada_log("parse_path trivial");
1495 buffer += '/';
1496 buffer += input;
1497 return;
1498 }
1499 std::string path = std::string(get_pathname());
1500 // We are going to need to look a bit at the path, but let us see if we can
1501 // ignore percent encoding *and* backslashes *and* percent characters.
1502 // Except for the trivial case, this is likely to capture 99% of paths out
1503 // there.
1504 bool fast_path =
1505 (special &&
1506 (accumulator & (need_encoding | backslash_char | percent_char)) == 0) &&
1507 (type != ada::scheme::type::FILE);
1508 if (fast_path) {
1509 ada_log("parse_prepared_path fast");
1510 // Here we don't need to worry about \ or percent encoding.
1511 // We also do not have a file protocol. We might have dots, however,
1512 // but dots must as appear as '.', and they cannot be encoded because
1513 // the symbol '%' is not present.
1514 size_t previous_location = 0; // We start at 0.
1515 do {
1516 size_t new_location = input.find('/', previous_location);
1517 // std::string_view path_view = input;
1518 // We process the last segment separately:
1519 if (new_location == std::string_view::npos) {
1520 std::string_view path_view = input.substr(previous_location);
1521 if (path_view == "..") { // The path ends with ..
1522 // e.g., if you receive ".." with an empty path, you go to "/".
1523 if (path.empty()) {
1524 path = '/';
1525 update_base_pathname(path);
1526 return;
1527 }
1528 // Fast case where we have nothing to do:
1529 if (path.back() == '/') {
1530 update_base_pathname(path);
1531 return;
1532 }
1533 // If you have the path "/joe/myfriend",
1534 // then you delete 'myfriend'.
1535 path.resize(path.rfind('/') + 1);
1536 update_base_pathname(path);
1537 return;
1538 }
1539 path += '/';
1540 if (path_view != ".") {
1541 path.append(path_view);
1542 }
1543 update_base_pathname(path);
1544 return;
1545 } else {
1546 // This is a non-final segment.
1547 std::string_view path_view =
1548 input.substr(previous_location, new_location - previous_location);
1549 previous_location = new_location + 1;
1550 if (path_view == "..") {
1551 size_t last_delimiter = path.rfind('/');
1552 if (last_delimiter != std::string::npos) {
1553 path.erase(last_delimiter);
1554 }
1555 } else if (path_view != ".") {
1556 path += '/';
1557 path.append(path_view);
1558 }
1559 }
1560 } while (true);
1561 } else {
1562 ada_log("parse_path slow");
1563 // we have reached the general case
1564 bool needs_percent_encoding = (accumulator & 1);
1565 std::string path_buffer_tmp;
1566 do {
1567 size_t location = (special && (accumulator & 2))
1568 ? input.find_first_of("/\\")
1569 : input.find('/');
1570 std::string_view path_view = input;
1571 if (location != std::string_view::npos) {
1572 path_view.remove_suffix(path_view.size() - location);
1573 input.remove_prefix(location + 1);
1574 }
1575 // path_buffer is either path_view or it might point at a percent encoded
1576 // temporary string.
1577 std::string_view path_buffer =
1578 (needs_percent_encoding &&
1579 ada::unicode::percent_encode<false>(
1580 path_view, character_sets::PATH_PERCENT_ENCODE, path_buffer_tmp))
1581 ? path_buffer_tmp
1582 : path_view;
1583 if (unicode::is_double_dot_path_segment(path_buffer)) {
1584 helpers::shorten_path(path, type);
1585 if (location == std::string_view::npos) {
1586 path += '/';
1587 }
1588 } else if (unicode::is_single_dot_path_segment(path_buffer) &&
1589 (location == std::string_view::npos)) {
1590 path += '/';
1591 }
1592 // Otherwise, if path_buffer is not a single-dot path segment, then:
1593 else if (!unicode::is_single_dot_path_segment(path_buffer)) {
1594 // If url's scheme is "file", url's path is empty, and path_buffer is a
1595 // Windows drive letter, then replace the second code point in
1596 // path_buffer with U+003A (:).
1597 if (type == ada::scheme::type::FILE && path.empty() &&
1598 checkers::is_windows_drive_letter(path_buffer)) {
1599 path += '/';
1600 path += path_buffer[0];
1601 path += ':';
1602 path_buffer.remove_prefix(2);
1603 path.append(path_buffer);
1604 } else {
1605 // Append path_buffer to url's path.
1606 path += '/';
1607 path.append(path_buffer);
1608 }
1609 }
1610 if (location == std::string_view::npos) {
1611 update_base_pathname(path);
1612 return;
1613 }
1614 } while (true);
1615 }
1616}
1617} // namespace ada
Definitions for URL specific checkers used within Ada.
#define ADA_ASSERT_TRUE(COND)
#define ada_lifetime_bound
#define ada_really_inline
Definition common_defs.h:85
Definitions for helper functions used within Ada.
User-facing functions for URL parsing and manipulation.
constexpr uint8_t QUERY_PERCENT_ENCODE[32]
constexpr uint8_t SPECIAL_QUERY_PERCENT_ENCODE[32]
constexpr uint8_t PATH_PERCENT_ENCODE[32]
constexpr uint8_t C0_CONTROL_PERCENT_ENCODE[32]
constexpr uint8_t USERINFO_PERCENT_ENCODE[32]
constexpr bool has_hex_prefix(std::string_view input)
constexpr bool is_windows_drive_letter(std::string_view input) noexcept
constexpr uint64_t ipv4_fast_fail
Definition checkers.h:129
constexpr bool is_alpha(char x) noexcept
constexpr bool is_digit(char x) noexcept
ada_really_inline constexpr uint64_t try_parse_ipv4_fast(std::string_view input) noexcept
constexpr ada::scheme::type get_scheme_type(std::string_view scheme) noexcept
Definition scheme-inl.h:72
type
Enumeration of URL scheme types.
Definition scheme.h:41
@ NOT_SPECIAL
Definition scheme.h:43
std::string ipv6(const std::array< uint16_t, 8 > &address) noexcept
std::string ipv4(uint64_t address) noexcept
ada_really_inline size_t percent_encode_index(const std::string_view input, const uint8_t character_set[])
Definition unicode-inl.h:19
Definition ada_idna.h:13
@ IPV6
Definition url_base.h:32
@ IPV4
Definition url_base.h:30
template ada::result< url_aggregator > parse< url_aggregator >(std::string_view input, const url_aggregator *base_url)
tl::expected< result_type, ada::errors > result
ada_warn_unused ada::result< result_type > parse(std::string_view input, const result_type *base_url=nullptr)
URL scheme type definitions and utilities.
Memory-efficient URL representation using a single buffer.
constexpr bool has_non_empty_password() const noexcept
void set_hash(std::string_view input)
constexpr bool validate() const noexcept
void clear_search() override
std::string_view get_hostname() const noexcept ada_lifetime_bound
std::string to_string() const override
std::string_view get_hash() const noexcept ada_lifetime_bound
std::string to_diagram() const
constexpr bool has_hostname() const noexcept
bool set_protocol(std::string_view input)
std::string get_origin() const noexcept override
constexpr std::string_view get_href() const noexcept ada_lifetime_bound
std::string_view get_search() const noexcept ada_lifetime_bound
bool has_valid_domain() const noexcept override
bool set_hostname(std::string_view input)
bool set_password(std::string_view input)
constexpr std::string_view get_pathname() const noexcept ada_lifetime_bound
bool set_pathname(std::string_view input)
std::string_view get_protocol() const noexcept ada_lifetime_bound
std::string_view get_password() const noexcept ada_lifetime_bound
bool set_href(std::string_view input)
void set_search(std::string_view input)
std::string_view get_port() const noexcept ada_lifetime_bound
constexpr bool has_port() const noexcept
ada_really_inline constexpr bool has_credentials() const noexcept
bool set_host(std::string_view input)
std::string_view get_host() const noexcept ada_lifetime_bound
bool set_port(std::string_view input)
constexpr bool has_non_empty_username() const noexcept
std::string_view get_username() const noexcept ada_lifetime_bound
bool set_username(std::string_view input)
ada_really_inline constexpr bool is_special() const noexcept
url_host_type host_type
Definition url_base.h:67
bool is_valid
Definition url_base.h:56
bool has_opaque_path
Definition url_base.h:62
static constexpr uint32_t omitted
Definitions for unicode operations.
Inline functions for url aggregator.
Declaration for the ada::url_aggregator class.
Declaration for the URL Components.