Ada 3.0.1
Fast spec-compliant URL parser
Loading...
Searching...
No Matches
url_aggregator.cpp
Go to the documentation of this file.
1#include "ada/checkers-inl.h"
2#include "ada/helpers.h"
4#include "ada/scheme.h"
5#include "ada/unicode-inl.h"
10
11#include <string>
12#include <string_view>
13
14namespace ada {
15template <bool has_state_override>
16[[nodiscard]] ada_really_inline bool url_aggregator::parse_scheme_with_colon(
17 const std::string_view input_with_colon) {
18 ada_log("url_aggregator::parse_scheme_with_colon ", input_with_colon);
20 ADA_ASSERT_TRUE(!helpers::overlaps(input_with_colon, buffer));
21 std::string_view input{input_with_colon};
22 input.remove_suffix(1);
23 auto parsed_type = ada::scheme::get_scheme_type(input);
24 const bool is_input_special = (parsed_type != ada::scheme::NOT_SPECIAL);
29 if (is_input_special) { // fast path!!!
30 if constexpr (has_state_override) {
31 // If url's scheme is not a special scheme and buffer is a special scheme,
32 // then return.
33 if (is_special() != is_input_special) {
34 return false;
35 }
36
37 // If url includes credentials or has a non-null port, and buffer is
38 // "file", then return.
39 if ((has_credentials() || components.port != url_components::omitted) &&
40 parsed_type == ada::scheme::type::FILE) {
41 return false;
42 }
43
44 // If url's scheme is "file" and its host is an empty host, then return.
45 // An empty host is the empty string.
46 if (type == ada::scheme::type::FILE &&
47 components.host_start == components.host_end) {
48 return false;
49 }
50 }
51
52 type = parsed_type;
53 set_scheme_from_view_with_colon(input_with_colon);
54
55 if constexpr (has_state_override) {
56 // This is uncommon.
57 uint16_t urls_scheme_port = get_special_port();
58
59 // If url's port is url's scheme's default port, then set url's port to
60 // null.
61 if (components.port == urls_scheme_port) {
62 clear_port();
63 }
64 }
65 } else { // slow path
66 std::string _buffer(input);
67 // Next function is only valid if the input is ASCII and returns false
68 // otherwise, but it seems that we always have ascii content so we do not
69 // need to check the return value.
70 unicode::to_lower_ascii(_buffer.data(), _buffer.size());
71
72 if constexpr (has_state_override) {
73 // If url's scheme is a special scheme and buffer is not a special scheme,
74 // then return. If url's scheme is not a special scheme and buffer is a
75 // special scheme, then return.
76 if (is_special() != ada::scheme::is_special(_buffer)) {
77 return true;
78 }
79
80 // If url includes credentials or has a non-null port, and buffer is
81 // "file", then return.
82 if ((has_credentials() || components.port != url_components::omitted) &&
83 _buffer == "file") {
84 return true;
85 }
86
87 // If url's scheme is "file" and its host is an empty host, then return.
88 // An empty host is the empty string.
89 if (type == ada::scheme::type::FILE &&
90 components.host_start == components.host_end) {
91 return true;
92 }
93 }
94
95 set_scheme(_buffer);
96
97 if constexpr (has_state_override) {
98 // This is uncommon.
99 uint16_t urls_scheme_port = get_special_port();
100
101 // If url's port is url's scheme's default port, then set url's port to
102 // null.
103 if (components.port == urls_scheme_port) {
104 clear_port();
105 }
106 }
107 }
109 return true;
110}
111
112inline void url_aggregator::copy_scheme(const url_aggregator& u) noexcept {
113 ada_log("url_aggregator::copy_scheme ", u.buffer);
114 ADA_ASSERT_TRUE(validate());
115 // next line could overflow but unsigned arithmetic has well-defined
116 // overflows.
117 uint32_t new_difference = u.components.protocol_end - components.protocol_end;
118 type = u.type;
119 buffer.erase(0, components.protocol_end);
120 buffer.insert(0, u.get_protocol());
121 components.protocol_end = u.components.protocol_end;
122
123 // No need to update the components
124 if (new_difference == 0) {
125 return;
126 }
127
128 // Update the rest of the components.
129 components.username_end += new_difference;
130 components.host_start += new_difference;
131 components.host_end += new_difference;
132 components.pathname_start += new_difference;
133 if (components.search_start != url_components::omitted) {
134 components.search_start += new_difference;
135 }
136 if (components.hash_start != url_components::omitted) {
137 components.hash_start += new_difference;
138 }
139 ADA_ASSERT_TRUE(validate());
140}
141
142inline void url_aggregator::set_scheme_from_view_with_colon(
143 std::string_view new_scheme_with_colon) noexcept {
144 ada_log("url_aggregator::set_scheme_from_view_with_colon ",
145 new_scheme_with_colon);
146 ADA_ASSERT_TRUE(validate());
147 ADA_ASSERT_TRUE(!new_scheme_with_colon.empty() &&
148 new_scheme_with_colon.back() == ':');
149 // next line could overflow but unsigned arithmetic has well-defined
150 // overflows.
151 uint32_t new_difference =
152 uint32_t(new_scheme_with_colon.size()) - components.protocol_end;
153
154 if (buffer.empty()) {
155 buffer.append(new_scheme_with_colon);
156 } else {
157 buffer.erase(0, components.protocol_end);
158 buffer.insert(0, new_scheme_with_colon);
159 }
160 components.protocol_end += new_difference;
161
162 // Update the rest of the components.
163 components.username_end += new_difference;
164 components.host_start += new_difference;
165 components.host_end += new_difference;
166 components.pathname_start += new_difference;
167 if (components.search_start != url_components::omitted) {
168 components.search_start += new_difference;
169 }
170 if (components.hash_start != url_components::omitted) {
171 components.hash_start += new_difference;
172 }
173 ADA_ASSERT_TRUE(validate());
174}
175
176inline void url_aggregator::set_scheme(std::string_view new_scheme) noexcept {
177 ada_log("url_aggregator::set_scheme ", new_scheme);
178 ADA_ASSERT_TRUE(validate());
179 ADA_ASSERT_TRUE(new_scheme.empty() || new_scheme.back() != ':');
180 // next line could overflow but unsigned arithmetic has well-defined
181 // overflows.
182 uint32_t new_difference =
183 uint32_t(new_scheme.size()) - components.protocol_end + 1;
184
186 if (buffer.empty()) {
187 buffer.append(helpers::concat(new_scheme, ":"));
188 } else {
189 buffer.erase(0, components.protocol_end);
190 buffer.insert(0, helpers::concat(new_scheme, ":"));
191 }
192 components.protocol_end = uint32_t(new_scheme.size() + 1);
193
194 // Update the rest of the components.
195 components.username_end += new_difference;
196 components.host_start += new_difference;
197 components.host_end += new_difference;
198 components.pathname_start += new_difference;
199 if (components.search_start != url_components::omitted) {
200 components.search_start += new_difference;
201 }
202 if (components.hash_start != url_components::omitted) {
203 components.hash_start += new_difference;
204 }
205 ADA_ASSERT_TRUE(validate());
206}
207
208bool url_aggregator::set_protocol(const std::string_view input) {
209 ada_log("url_aggregator::set_protocol ", input);
211 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
212 std::string view(input);
213 helpers::remove_ascii_tab_or_newline(view);
214 if (view.empty()) {
215 return true;
216 }
217
218 // Schemes should start with alpha values.
219 if (!checkers::is_alpha(view[0])) {
220 return false;
221 }
222
223 view.append(":");
224
225 std::string::iterator pointer =
226 std::ranges::find_if_not(view, unicode::is_alnum_plus);
227
228 if (pointer != view.end() && *pointer == ':') {
229 return parse_scheme_with_colon<true>(
230 std::string_view(view.data(), pointer - view.begin() + 1));
231 }
232 return false;
233}
234
// Sets the username of this URL from `input`, which must not overlap with the
// internal buffer.
// Returns false right away when cannot_have_credentials_or_port() holds;
// otherwise the username is stored through update_base_username() and true is
// returned.
// NOTE(review): the statement defining `idx` and the remaining arguments of
// the percent_encode call are not visible in this listing. Presumably `idx` is
// the position of the first character that needs percent-encoding — confirm.
235bool url_aggregator::set_username(const std::string_view input) {
236 ada_log("url_aggregator::set_username '", input, "' ");
238 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
239 if (cannot_have_credentials_or_port()) {
240 return false;
241 }
// When idx equals the input length the input is stored unchanged.
244 if (idx == input.size()) {
245 update_base_username(input);
246 } else {
247 // We only create a temporary string if we have to!
248 update_base_username(ada::unicode::percent_encode(
250 }
252 return true;
253}
254
// Sets the password of this URL from `input`, which must not overlap with the
// internal buffer.
// Returns false right away when cannot_have_credentials_or_port() holds;
// otherwise the password is stored through update_base_password() and true is
// returned.
// NOTE(review): the statement defining `idx` and the remaining arguments of
// the percent_encode call are not visible in this listing. Presumably `idx` is
// the position of the first character that needs percent-encoding — confirm.
255bool url_aggregator::set_password(const std::string_view input) {
256 ada_log("url_aggregator::set_password '", input, "'");
258 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
259 if (cannot_have_credentials_or_port()) {
260 return false;
261 }
// When idx equals the input length the input is stored unchanged.
264 if (idx == input.size()) {
265 update_base_password(input);
266 } else {
267 // We only create a temporary string if we have to!
268 update_base_password(ada::unicode::percent_encode(
270 }
272 return true;
273}
274
275bool url_aggregator::set_port(const std::string_view input) {
276 ada_log("url_aggregator::set_port ", input);
278 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
279 if (cannot_have_credentials_or_port()) {
280 return false;
281 }
282 std::string trimmed(input);
283 helpers::remove_ascii_tab_or_newline(trimmed);
284 if (trimmed.empty()) {
285 clear_port();
286 return true;
287 }
288
289 // Input should not start with a non-digit character.
290 if (!ada::unicode::is_ascii_digit(trimmed.front())) {
291 return false;
292 }
293
294 // Revert changes if parse_port fails.
295 uint32_t previous_port = components.port;
296 parse_port(trimmed);
297 if (is_valid) {
298 return true;
299 }
300 update_base_port(previous_port);
301 is_valid = true;
303 return false;
304}
305
306bool url_aggregator::set_pathname(const std::string_view input) {
307 ada_log("url_aggregator::set_pathname ", input);
309 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
310 if (has_opaque_path) {
311 return false;
312 }
313 clear_pathname();
314 parse_path(input);
315 if (get_pathname().starts_with("//") && !has_authority() && !has_dash_dot()) {
316 buffer.insert(components.pathname_start, "/.");
317 components.pathname_start += 2;
318 }
320 return true;
321}
322
323ada_really_inline void url_aggregator::parse_path(std::string_view input) {
324 ada_log("url_aggregator::parse_path ", input);
326 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
327 std::string tmp_buffer;
328 std::string_view internal_input;
329 if (unicode::has_tabs_or_newline(input)) {
330 tmp_buffer = input;
331 // Optimization opportunity: Instead of copying and then pruning, we could
332 // just directly build the string from user_input.
333 helpers::remove_ascii_tab_or_newline(tmp_buffer);
334 internal_input = tmp_buffer;
335 } else {
336 internal_input = input;
337 }
338
339 // If url is special, then:
340 if (is_special()) {
341 if (internal_input.empty()) {
342 update_base_pathname("/");
343 } else if ((internal_input[0] == '/') || (internal_input[0] == '\\')) {
344 consume_prepared_path(internal_input.substr(1));
345 } else {
346 consume_prepared_path(internal_input);
347 }
348 } else if (!internal_input.empty()) {
349 if (internal_input[0] == '/') {
350 consume_prepared_path(internal_input.substr(1));
351 } else {
352 consume_prepared_path(internal_input);
353 }
354 } else {
355 // Non-special URLs with an empty host can have their paths erased
356 // Path-only URLs cannot have their paths erased
357 if (components.host_start == components.host_end && !has_authority()) {
358 update_base_pathname("/");
359 }
360 }
362}
363
// Sets the query of this URL from `input`, which must not overlap with the
// internal buffer.
// An empty input clears the query and then calls
// helpers::strip_trailing_spaces_from_opaque_path. Otherwise one leading '?'
// is dropped, ASCII tabs and newlines are removed, and the value is stored
// through update_base_search().
// NOTE(review): the initializer of `query_percent_encode_set` is not visible
// in this listing — check the original file for how it is chosen.
364void url_aggregator::set_search(const std::string_view input) {
365 ada_log("url_aggregator::set_search ", input);
367 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
368 if (input.empty()) {
369 clear_search();
370 helpers::strip_trailing_spaces_from_opaque_path(*this);
371 return;
372 }
373
374 std::string new_value;
// input is known to be non-empty here, so input[0] is valid.
375 new_value = input[0] == '?' ? input.substr(1) : input;
376 helpers::remove_ascii_tab_or_newline(new_value);
377
378 auto query_percent_encode_set =
381
382 update_base_search(new_value, query_percent_encode_set);
384}
385
386void url_aggregator::set_hash(const std::string_view input) {
387 ada_log("url_aggregator::set_hash ", input);
389 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
390 if (input.empty()) {
391 if (components.hash_start != url_components::omitted) {
392 buffer.resize(components.hash_start);
393 components.hash_start = url_components::omitted;
394 }
395 helpers::strip_trailing_spaces_from_opaque_path(*this);
396 return;
397 }
398
399 std::string new_value;
400 new_value = input[0] == '#' ? input.substr(1) : input;
401 helpers::remove_ascii_tab_or_newline(new_value);
402 update_unencoded_base_hash(new_value);
404}
405
// Replaces the whole URL with the result held in `out`; `input` must not
// overlap with the internal buffer.
// Returns out.has_value(); this object is only assigned to when `out` holds a
// value.
// NOTE(review): the declaration of `out` is not visible in this listing;
// presumably it is the result of parsing `input` — confirm.
406bool url_aggregator::set_href(const std::string_view input) {
407 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
408 ada_log("url_aggregator::set_href ", input, " [", input.size(), " bytes]");
410 ada_log("url_aggregator::set_href, success :", out.has_value());
411
412 if (out) {
413 ada_log("url_aggregator::set_href, parsed ", out->to_string());
414 // TODO: Figure out why the following line puts test to never finish.
415 *this = *out;
416 }
417
418 return out.has_value();
419}
420
421ada_really_inline bool url_aggregator::parse_host(std::string_view input) {
422 ada_log("url_aggregator:parse_host \"", input, "\" [", input.size(),
423 " bytes]");
425 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
426 if (input.empty()) {
427 return is_valid = false;
428 } // technically unnecessary.
429 // If input starts with U+005B ([), then:
430 if (input[0] == '[') {
431 // If input does not end with U+005D (]), validation error, return failure.
432 if (input.back() != ']') {
433 return is_valid = false;
434 }
435 ada_log("parse_host ipv6");
436
437 // Return the result of IPv6 parsing input with its leading U+005B ([) and
438 // trailing U+005D (]) removed.
439 input.remove_prefix(1);
440 input.remove_suffix(1);
441 return parse_ipv6(input);
442 }
443
444 // If isNotSpecial is true, then return the result of opaque-host parsing
445 // input.
446 if (!is_special()) {
447 return parse_opaque_host(input);
448 }
449 // Let domain be the result of running UTF-8 decode without BOM on the
450 // percent-decoding of input. Let asciiDomain be the result of running domain
451 // to ASCII with domain and false. The most common case is an ASCII input, in
452 // which case we do not need to call the expensive 'to_ascii' if a few
453 // conditions are met: no '%' and no 'xn-' subsequence.
454
455 // Often, the input does not contain any forbidden code points, and no upper
456 // case ASCII letter, then we can just copy it to the buffer. We want to
457 // optimize for such a common case.
458 uint8_t is_forbidden_or_upper =
459 unicode::contains_forbidden_domain_code_point_or_upper(input.data(),
460 input.size());
461 // Minor optimization opportunity:
462 // contains_forbidden_domain_code_point_or_upper could be extend to check for
463 // the presence of characters that cannot appear in the ipv4 address and we
464 // could also check whether x and n and - are present, and so we could skip
465 // some of the checks below. However, the gains are likely to be small, and
466 // the code would be more complex.
467 if (is_forbidden_or_upper == 0 &&
468 input.find("xn-") == std::string_view::npos) {
469 // fast path
470 update_base_hostname(input);
471 if (checkers::is_ipv4(get_hostname())) {
472 ada_log("parse_host fast path ipv4");
473 return parse_ipv4(get_hostname(), true);
474 }
475 ada_log("parse_host fast path ", get_hostname());
476 return true;
477 }
478 // We have encountered at least one forbidden code point or the input contains
479 // 'xn-' (case insensitive), so we need to call 'to_ascii' to perform the full
480 // conversion.
481
482 ada_log("parse_host calling to_ascii");
483 std::optional<std::string> host = std::string(get_hostname());
484 is_valid = ada::unicode::to_ascii(host, input, input.find('%'));
485 if (!is_valid) {
486 ada_log("parse_host to_ascii returns false");
487 return is_valid = false;
488 }
489 ada_log("parse_host to_ascii succeeded ", *host, " [", host->size(),
490 " bytes]");
491
492 if (std::any_of(host.value().begin(), host.value().end(),
493 ada::unicode::is_forbidden_domain_code_point)) {
494 return is_valid = false;
495 }
496
497 // If asciiDomain ends in a number, then return the result of IPv4 parsing
498 // asciiDomain.
499 if (checkers::is_ipv4(host.value())) {
500 ada_log("parse_host got ipv4 ", *host);
501 return parse_ipv4(host.value(), false);
502 }
503
504 update_base_hostname(host.value());
506 return true;
507}
508
// Shared implementation of set_host (override_hostname == false) and
// set_hostname (override_hostname == true).
// Returns false without changing the host when the URL has an opaque path.
// When host parsing fails, the previous hostname and port are written back and
// false is returned.
509template <bool override_hostname>
510bool url_aggregator::set_host_or_hostname(const std::string_view input) {
511 ada_log("url_aggregator::set_host_or_hostname ", input);
513 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
514 if (has_opaque_path) {
515 return false;
516 }
517
// Remember the current host and port so they can be restored on failure.
518 std::string previous_host(get_hostname());
519 uint32_t previous_port = components.port;
520
// Only the part of the input before the first '#' is used.
521 size_t host_end_pos = input.find('#');
522 std::string _host(input.data(), host_end_pos != std::string_view::npos
523 ? host_end_pos
524 : input.size());
525 helpers::remove_ascii_tab_or_newline(_host);
526 std::string_view new_host(_host);
527
528 // If url's scheme is "file", then set state to file host state, instead of
529 // host state.
530 if (type != ada::scheme::type::FILE) {
531 std::string_view host_view(_host.data(), _host.length());
// NOTE(review): host_view is handed to the helper below and tested for
// emptiness afterwards; presumably the helper shortens it to the host
// part — confirm against helpers::get_host_delimiter_location.
532 auto [location, found_colon] =
533 helpers::get_host_delimiter_location(is_special(), host_view);
534
535 // Otherwise, if c is U+003A (:) and insideBrackets is false, then:
536 // Note: the 'found_colon' value is true if and only if a colon was
537 // encountered while not inside brackets.
538 if (found_colon) {
// A hostname-only update refuses input that carries a port separator.
539 if constexpr (override_hostname) {
540 return false;
541 }
// Whatever follows the colon is handed to set_port; its result is ignored.
542 std::string_view sub_buffer = new_host.substr(location + 1);
543 if (!sub_buffer.empty()) {
544 set_port(sub_buffer);
545 }
546 }
547 // If url is special and host_view is the empty string, validation error,
548 // return failure. Otherwise, if state override is given, host_view is the
549 // empty string, and either url includes credentials or url's port is
550 // non-null, return.
551 else if (host_view.empty() &&
552 (is_special() || has_credentials() || has_port())) {
553 return false;
554 }
555
556 // Let host be the result of host parsing host_view with url is not special.
557 if (host_view.empty() && !is_special()) {
558 if (has_hostname()) {
559 clear_hostname(); // easy!
560 } else if (has_dash_dot()) {
561 add_authority_slashes_if_needed();
562 delete_dash_dot();
563 }
564 return true;
565 }
566
567 bool succeeded = parse_host(host_view);
568 if (!succeeded) {
// Roll back to the host and port saved at the top.
569 update_base_hostname(previous_host);
570 update_base_port(previous_port);
571 } else if (has_dash_dot()) {
572 // Should remove dash_dot from pathname
573 delete_dash_dot();
574 }
575 return succeeded;
576 }
577
// "file" scheme from here on: the host stops at the first '/', '\' or '?'.
578 size_t location = new_host.find_first_of("/\\?");
579 if (location != std::string_view::npos) {
580 new_host.remove_suffix(new_host.length() - location);
581 }
582
583 if (new_host.empty()) {
584 // Set url's host to the empty string.
585 clear_hostname();
586 } else {
587 // Let host be the result of host parsing buffer with url is not special.
588 if (!parse_host(new_host)) {
// Roll back to the host and port saved at the top.
589 update_base_hostname(previous_host);
590 update_base_port(previous_port);
591 return false;
592 }
593
594 // If host is "localhost", then set host to the empty string.
595 if (helpers::substring(buffer, components.host_start,
596 components.host_end) == "localhost") {
597 clear_hostname();
598 }
599 }
601 return true;
602}
603
604bool url_aggregator::set_host(const std::string_view input) {
605 ada_log("url_aggregator::set_host '", input, "'");
607 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
608 return set_host_or_hostname<false>(input);
609}
610
611bool url_aggregator::set_hostname(const std::string_view input) {
612 ada_log("url_aggregator::set_hostname '", input, "'");
614 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
615 return set_host_or_hostname<true>(input);
616}
617
618[[nodiscard]] std::string url_aggregator::get_origin() const noexcept {
619 ada_log("url_aggregator::get_origin");
620 if (is_special()) {
621 // Return a new opaque origin.
622 if (type == scheme::FILE) {
623 return "null";
624 }
625
626 return helpers::concat(get_protocol(), "//", get_host());
627 }
628
629 if (get_protocol() == "blob:") {
630 std::string_view path = get_pathname();
631 if (!path.empty()) {
632 auto out = ada::parse<ada::url_aggregator>(path);
633 if (out && (out->type == scheme::HTTP || out->type == scheme::HTTPS)) {
634 // If pathURL's scheme is not "http" and not "https", then return a
635 // new opaque origin.
636 return helpers::concat(out->get_protocol(), "//", out->get_host());
637 }
638 }
639 }
640
641 // Return a new opaque origin.
642 return "null";
643}
644
// Returns the username as a view, or an empty view.
// The username is taken from `buffer` using the positions
// components.protocol_end + 2 and components.username_end.
// NOTE(review): the line after the signature and the condition guarding the
// substring are not visible in this listing; the "+ 2" presumably skips the
// "//" following the scheme — confirm.
645[[nodiscard]] std::string_view url_aggregator::get_username() const noexcept
647 ada_log("url_aggregator::get_username");
649 return helpers::substring(buffer, components.protocol_end + 2,
650 components.username_end);
651 }
652 return "";
653}
654
// Returns the password as a view, or an empty view.
// The password is taken from `buffer` using the positions
// components.username_end + 1 and components.host_start.
// NOTE(review): the line after the signature and the condition guarding the
// substring are not visible in this listing; the "+ 1" presumably skips the
// ':' separating username and password — confirm.
655[[nodiscard]] std::string_view url_aggregator::get_password() const noexcept
657 ada_log("url_aggregator::get_password");
659 return helpers::substring(buffer, components.username_end + 1,
660 components.host_start);
661 }
662 return "";
663}
664
// Returns the port as a view, or an empty view when no port is set
// (components.port == url_components::omitted).
// The port text is taken from `buffer` using the positions
// components.host_end + 1 and components.pathname_start.
// NOTE(review): the line after the signature is not visible in this listing;
// the "+ 1" presumably skips the ':' in front of the port — confirm.
665[[nodiscard]] std::string_view url_aggregator::get_port() const noexcept
667 ada_log("url_aggregator::get_port");
668 if (components.port == url_components::omitted) {
669 return "";
670 }
671 return helpers::substring(buffer, components.host_end + 1,
672 components.pathname_start);
673}
674
// Returns the fragment including its leading '#', or an empty view when there
// is no fragment or the fragment is empty.
// NOTE(review): the line after the signature is not visible in this listing.
675[[nodiscard]] std::string_view url_aggregator::get_hash() const noexcept
677 ada_log("url_aggregator::get_hash");
678 // If this's URL's fragment is either null or the empty string, then return
679 // the empty string. Return U+0023 (#), followed by this's URL's fragment.
680 if (components.hash_start == url_components::omitted) {
681 return "";
682 }
// At most one character from hash_start to the end of the buffer: nothing
// follows the '#', so the fragment is empty.
683 if (buffer.size() - components.hash_start <= 1) {
684 return "";
685 }
686 return helpers::substring(buffer, components.hash_start);
687}
688
// Returns the host as a view, or an empty view when the host is empty.
// The result runs up to components.pathname_start, not components.host_end
// (get_hostname() stops at host_end).
// NOTE(review): the line after the signature is not visible in this listing.
689[[nodiscard]] std::string_view url_aggregator::get_host() const noexcept
691 ada_log("url_aggregator::get_host");
692 // Technically, we should check if there is a hostname, but
693 // the code below works even if there isn't.
694 // if(!has_hostname()) { return ""; }
695 size_t start = components.host_start;
// A '@' stored at host_start is skipped; it is not part of the host.
696 if (components.host_end > components.host_start &&
697 buffer[components.host_start] == '@') {
698 start++;
699 }
700 // if we have an empty host, then the space between components.host_end and
701 // components.pathname_start may be occupied by /.
702 if (start == components.host_end) {
703 return {};
704 }
705 return helpers::substring(buffer, start, components.pathname_start);
706}
707
// Returns the hostname as a view: the part of `buffer` delimited by the start
// of the host (after a possible '@') and components.host_end.
// NOTE(review): the line after the signature is not visible in this listing.
708[[nodiscard]] std::string_view url_aggregator::get_hostname() const noexcept
710 ada_log("url_aggregator::get_hostname");
711 // Technically, we should check if there is a hostname, but
712 // the code below works even if there isn't.
713 // if(!has_hostname()) { return ""; }
714 size_t start = components.host_start;
715 // So host_start is not where the host begins.
// A '@' stored at host_start is skipped; it is not part of the hostname.
716 if (components.host_end > components.host_start &&
717 buffer[components.host_start] == '@') {
718 start++;
719 }
720 return helpers::substring(buffer, start, components.host_end);
721}
722
// Returns the query including its leading '?', or an empty view when there is
// no query or the query is empty.
// NOTE(review): the line after the signature is not visible in this listing.
723[[nodiscard]] std::string_view url_aggregator::get_search() const noexcept
725 ada_log("url_aggregator::get_search");
726 // If this's URL's query is either null or the empty string, then return the
727 // empty string. Return U+003F (?), followed by this's URL's query.
728 if (components.search_start == url_components::omitted) {
729 return "";
730 }
// The query ends where the fragment starts, or at the end of the buffer when
// there is no fragment.
731 auto ending_index = uint32_t(buffer.size());
732 if (components.hash_start != url_components::omitted) {
733 ending_index = components.hash_start;
734 }
// At most one character in the range: nothing follows the '?'.
735 if (ending_index - components.search_start <= 1) {
736 return "";
737 }
738 return helpers::substring(buffer, components.search_start, ending_index);
739}
740
// Returns the protocol as a view: the part of `buffer` from position 0 up to
// components.protocol_end.
// NOTE(review): the line after the signature is not visible in this listing.
741[[nodiscard]] std::string_view url_aggregator::get_protocol() const noexcept
743 ada_log("url_aggregator::get_protocol");
744 return helpers::substring(buffer, 0, components.protocol_end);
745}
746
747[[nodiscard]] std::string ada::url_aggregator::to_string() const {
748 ada_log("url_aggregator::to_string buffer:", buffer, " [", buffer.size(),
749 " bytes]");
750 if (!is_valid) {
751 return "null";
752 }
753
754 std::string answer;
755 auto back = std::back_insert_iterator(answer);
756 answer.append("{\n");
757
758 answer.append("\t\"buffer\":\"");
759 helpers::encode_json(buffer, back);
760 answer.append("\",\n");
761
762 answer.append("\t\"protocol\":\"");
763 helpers::encode_json(get_protocol(), back);
764 answer.append("\",\n");
765
766 if (has_credentials()) {
767 answer.append("\t\"username\":\"");
768 helpers::encode_json(get_username(), back);
769 answer.append("\",\n");
770 answer.append("\t\"password\":\"");
771 helpers::encode_json(get_password(), back);
772 answer.append("\",\n");
773 }
774
775 answer.append("\t\"host\":\"");
776 helpers::encode_json(get_host(), back);
777 answer.append("\",\n");
778
779 answer.append("\t\"path\":\"");
780 helpers::encode_json(get_pathname(), back);
781 answer.append("\",\n");
782 answer.append("\t\"opaque path\":");
783 answer.append((has_opaque_path ? "true" : "false"));
784 answer.append(",\n");
785
786 if (components.search_start != url_components::omitted) {
787 answer.append("\t\"query\":\"");
788 helpers::encode_json(get_search(), back);
789 answer.append("\",\n");
790 }
791 if (components.hash_start != url_components::omitted) {
792 answer.append("\t\"fragment\":\"");
793 helpers::encode_json(get_hash(), back);
794 answer.append("\",\n");
795 }
796
797 auto convert_offset_to_string = [](uint32_t offset) -> std::string {
798 if (offset == url_components::omitted) {
799 return "null";
800 } else {
801 return std::to_string(offset);
802 }
803 };
804
805 answer.append("\t\"protocol_end\":");
806 answer.append(convert_offset_to_string(components.protocol_end));
807 answer.append(",\n");
808
809 answer.append("\t\"username_end\":");
810 answer.append(convert_offset_to_string(components.username_end));
811 answer.append(",\n");
812
813 answer.append("\t\"host_start\":");
814 answer.append(convert_offset_to_string(components.host_start));
815 answer.append(",\n");
816
817 answer.append("\t\"host_end\":");
818 answer.append(convert_offset_to_string(components.host_end));
819 answer.append(",\n");
820
821 answer.append("\t\"port\":");
822 answer.append(convert_offset_to_string(components.port));
823 answer.append(",\n");
824
825 answer.append("\t\"pathname_start\":");
826 answer.append(convert_offset_to_string(components.pathname_start));
827 answer.append(",\n");
828
829 answer.append("\t\"search_start\":");
830 answer.append(convert_offset_to_string(components.search_start));
831 answer.append(",\n");
832
833 answer.append("\t\"hash_start\":");
834 answer.append(convert_offset_to_string(components.hash_start));
835 answer.append("\n}");
836
837 return answer;
838}
839
840[[nodiscard]] bool url_aggregator::has_valid_domain() const noexcept {
841 if (components.host_start == components.host_end) {
842 return false;
843 }
844 return checkers::verify_dns_length(get_hostname());
845}
846
847bool url_aggregator::parse_ipv4(std::string_view input, bool in_place) {
848 ada_log("parse_ipv4 ", input, " [", input.size(),
849 " bytes], overlaps with buffer: ",
850 helpers::overlaps(input, buffer) ? "yes" : "no");
852 const bool trailing_dot = (input.back() == '.');
853 if (trailing_dot) {
854 input.remove_suffix(1);
855 }
856 size_t digit_count{0};
857 int pure_decimal_count = 0; // entries that are decimal
858 uint64_t ipv4{0};
859 // we could unroll for better performance?
860 for (; (digit_count < 4) && !(input.empty()); digit_count++) {
861 uint32_t
862 segment_result{}; // If any number exceeds 32 bits, we have an error.
863 bool is_hex = checkers::has_hex_prefix(input);
864 if (is_hex && ((input.length() == 2) ||
865 ((input.length() > 2) && (input[2] == '.')))) {
866 // special case
867 segment_result = 0;
868 input.remove_prefix(2);
869 } else {
870 std::from_chars_result r{};
871 if (is_hex) {
872 ada_log("parse_ipv4 trying to parse hex number");
873 r = std::from_chars(input.data() + 2, input.data() + input.size(),
874 segment_result, 16);
875 } else if ((input.length() >= 2) && input[0] == '0' &&
876 checkers::is_digit(input[1])) {
877 ada_log("parse_ipv4 trying to parse octal number");
878 r = std::from_chars(input.data() + 1, input.data() + input.size(),
879 segment_result, 8);
880 } else {
881 ada_log("parse_ipv4 trying to parse decimal number");
882 pure_decimal_count++;
883 r = std::from_chars(input.data(), input.data() + input.size(),
884 segment_result, 10);
885 }
886 if (r.ec != std::errc()) {
887 ada_log("parse_ipv4 parsing failed");
888 return is_valid = false;
889 }
890 ada_log("parse_ipv4 parsed ", segment_result);
891 input.remove_prefix(r.ptr - input.data());
892 }
893 if (input.empty()) {
894 // We have the last value.
895 // At this stage, ipv4 contains digit_count*8 bits.
896 // So we have 32-digit_count*8 bits left.
897 if (segment_result >= (uint64_t(1) << (32 - digit_count * 8))) {
898 return is_valid = false;
899 }
900 ipv4 <<= (32 - digit_count * 8);
901 ipv4 |= segment_result;
902 goto final;
903 } else {
904 // There is more, so that the value must no be larger than 255
905 // and we must have a '.'.
906 if ((segment_result > 255) || (input[0] != '.')) {
907 return is_valid = false;
908 }
909 ipv4 <<= 8;
910 ipv4 |= segment_result;
911 input.remove_prefix(1); // remove '.'
912 }
913 }
914 if ((digit_count != 4) || (!input.empty())) {
915 ada_log("parse_ipv4 found invalid (more than 4 numbers or empty) ");
916 return is_valid = false;
917 }
918final:
919 ada_log("url_aggregator::parse_ipv4 completed ", get_href(),
920 " host: ", get_host());
921
922 // We could also check r.ptr to see where the parsing ended.
923 if (in_place && pure_decimal_count == 4 && !trailing_dot) {
924 ada_log(
925 "url_aggregator::parse_ipv4 completed and was already correct in the "
926 "buffer");
927 // The original input was already all decimal and we validated it. So we
928 // don't need to do anything.
929 } else {
930 ada_log("url_aggregator::parse_ipv4 completed and we need to update it");
931 // Optimization opportunity: Get rid of unnecessary string return in ipv4
932 // serializer.
933 // TODO: This is likely a bug because it goes back update_base_hostname, not
934 // what we want to do.
935 update_base_hostname(
936 ada::serializers::ipv4(ipv4)); // We have to reserialize the address.
937 }
938 host_type = IPV4;
940 return true;
941}
942
943bool url_aggregator::parse_ipv6(std::string_view input) {
944 // TODO: Implement in_place optimization: we know that input points
945 // in the buffer, so we can just check whether the buffer is already
946 // well formatted.
947 // TODO: Find a way to merge parse_ipv6 with url.cpp implementation.
948 ada_log("parse_ipv6 ", input, " [", input.size(), " bytes]");
950 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
951 if (input.empty()) {
952 return is_valid = false;
953 }
954 // Let address be a new IPv6 address whose IPv6 pieces are all 0.
955 std::array<uint16_t, 8> address{};
956
957 // Let pieceIndex be 0.
958 int piece_index = 0;
959
960 // Let compress be null.
961 std::optional<int> compress{};
962
963 // Let pointer be a pointer for input.
964 std::string_view::iterator pointer = input.begin();
965
966 // If c is U+003A (:), then:
967 if (input[0] == ':') {
968 // If remaining does not start with U+003A (:), validation error, return
969 // failure.
970 if (input.size() == 1 || input[1] != ':') {
971 ada_log("parse_ipv6 starts with : but the rest does not start with :");
972 return is_valid = false;
973 }
974
975 // Increase pointer by 2.
976 pointer += 2;
977
978 // Increase pieceIndex by 1 and then set compress to pieceIndex.
979 compress = ++piece_index;
980 }
981
982 // While c is not the EOF code point:
983 while (pointer != input.end()) {
984 // If pieceIndex is 8, validation error, return failure.
985 if (piece_index == 8) {
986 ada_log("parse_ipv6 piece_index == 8");
987 return is_valid = false;
988 }
989
990 // If c is U+003A (:), then:
991 if (*pointer == ':') {
992 // If compress is non-null, validation error, return failure.
993 if (compress.has_value()) {
994 ada_log("parse_ipv6 compress is non-null");
995 return is_valid = false;
996 }
997
998 // Increase pointer and pieceIndex by 1, set compress to pieceIndex, and
999 // then continue.
1000 pointer++;
1001 compress = ++piece_index;
1002 continue;
1003 }
1004
1005 // Let value and length be 0.
1006 uint16_t value = 0, length = 0;
1007
1008 // While length is less than 4 and c is an ASCII hex digit,
1009 // set value to value times 0x10 + c interpreted as hexadecimal number, and
1010 // increase pointer and length by 1.
1011 while (length < 4 && pointer != input.end() &&
1012 unicode::is_ascii_hex_digit(*pointer)) {
1013 // https://stackoverflow.com/questions/39060852/why-does-the-addition-of-two-shorts-return-an-int
1014 value = uint16_t(value * 0x10 + unicode::convert_hex_to_binary(*pointer));
1015 pointer++;
1016 length++;
1017 }
1018
1019 // If c is U+002E (.), then:
1020 if (pointer != input.end() && *pointer == '.') {
1021 // If length is 0, validation error, return failure.
1022 if (length == 0) {
1023 ada_log("parse_ipv6 length is 0");
1024 return is_valid = false;
1025 }
1026
1027 // Decrease pointer by length.
1028 pointer -= length;
1029
1030 // If pieceIndex is greater than 6, validation error, return failure.
1031 if (piece_index > 6) {
1032 ada_log("parse_ipv6 piece_index > 6");
1033 return is_valid = false;
1034 }
1035
1036 // Let numbersSeen be 0.
1037 int numbers_seen = 0;
1038
1039 // While c is not the EOF code point:
1040 while (pointer != input.end()) {
1041 // Let ipv4Piece be null.
1042 std::optional<uint16_t> ipv4_piece{};
1043
1044 // If numbersSeen is greater than 0, then:
1045 if (numbers_seen > 0) {
1046 // If c is a U+002E (.) and numbersSeen is less than 4, then increase
1047 // pointer by 1.
1048 if (*pointer == '.' && numbers_seen < 4) {
1049 pointer++;
1050 } else {
1051 // Otherwise, validation error, return failure.
1052 ada_log("parse_ipv6 Otherwise, validation error, return failure");
1053 return is_valid = false;
1054 }
1055 }
1056
1057 // If c is not an ASCII digit, validation error, return failure.
1058 if (pointer == input.end() || !checkers::is_digit(*pointer)) {
1059 ada_log(
1060 "parse_ipv6 If c is not an ASCII digit, validation error, return "
1061 "failure");
1062 return is_valid = false;
1063 }
1064
1065 // While c is an ASCII digit:
1066 while (pointer != input.end() && checkers::is_digit(*pointer)) {
1067 // Let number be c interpreted as decimal number.
1068 int number = *pointer - '0';
1069
1070 // If ipv4Piece is null, then set ipv4Piece to number.
1071 if (!ipv4_piece.has_value()) {
1072 ipv4_piece = number;
1073 }
1074 // Otherwise, if ipv4Piece is 0, validation error, return failure.
1075 else if (ipv4_piece == 0) {
1076 ada_log("parse_ipv6 if ipv4Piece is 0, validation error");
1077 return is_valid = false;
1078 }
1079 // Otherwise, set ipv4Piece to ipv4Piece times 10 + number.
1080 else {
1081 ipv4_piece = *ipv4_piece * 10 + number;
1082 }
1083
1084 // If ipv4Piece is greater than 255, validation error, return failure.
1085 if (ipv4_piece > 255) {
1086 ada_log("parse_ipv6 ipv4_piece > 255");
1087 return is_valid = false;
1088 }
1089
1090 // Increase pointer by 1.
1091 pointer++;
1092 }
1093
1094 // Set address[pieceIndex] to address[pieceIndex] times 0x100 +
1095 // ipv4Piece.
1096 // https://stackoverflow.com/questions/39060852/why-does-the-addition-of-two-shorts-return-an-int
1097 address[piece_index] =
1098 uint16_t(address[piece_index] * 0x100 + *ipv4_piece);
1099
1100 // Increase numbersSeen by 1.
1101 numbers_seen++;
1102
1103 // If numbersSeen is 2 or 4, then increase pieceIndex by 1.
1104 if (numbers_seen == 2 || numbers_seen == 4) {
1105 piece_index++;
1106 }
1107 }
1108
1109 // If numbersSeen is not 4, validation error, return failure.
1110 if (numbers_seen != 4) {
1111 return is_valid = false;
1112 }
1113
1114 // Break.
1115 break;
1116 }
1117 // Otherwise, if c is U+003A (:):
1118 else if ((pointer != input.end()) && (*pointer == ':')) {
1119 // Increase pointer by 1.
1120 pointer++;
1121
1122 // If c is the EOF code point, validation error, return failure.
1123 if (pointer == input.end()) {
1124 ada_log(
1125 "parse_ipv6 If c is the EOF code point, validation error, return "
1126 "failure");
1127 return is_valid = false;
1128 }
1129 }
1130 // Otherwise, if c is not the EOF code point, validation error, return
1131 // failure.
1132 else if (pointer != input.end()) {
1133 ada_log(
1134 "parse_ipv6 Otherwise, if c is not the EOF code point, validation "
1135 "error, return failure");
1136 return is_valid = false;
1137 }
1138
1139 // Set address[pieceIndex] to value.
1140 address[piece_index] = value;
1141
1142 // Increase pieceIndex by 1.
1143 piece_index++;
1144 }
1145
1146 // If compress is non-null, then:
1147 if (compress.has_value()) {
1148 // Let swaps be pieceIndex - compress.
1149 int swaps = piece_index - *compress;
1150
1151 // Set pieceIndex to 7.
1152 piece_index = 7;
1153
1154 // While pieceIndex is not 0 and swaps is greater than 0,
1155 // swap address[pieceIndex] with address[compress + swaps - 1], and then
1156 // decrease both pieceIndex and swaps by 1.
1157 while (piece_index != 0 && swaps > 0) {
1158 std::swap(address[piece_index], address[*compress + swaps - 1]);
1159 piece_index--;
1160 swaps--;
1161 }
1162 }
1163 // Otherwise, if compress is null and pieceIndex is not 8, validation error,
1164 // return failure.
1165 else if (piece_index != 8) {
1166 ada_log(
1167 "parse_ipv6 if compress is null and pieceIndex is not 8, validation "
1168 "error, return failure");
1169 return is_valid = false;
1170 }
1171 // TODO: Optimization opportunity: Get rid of unnecessary string creation.
1172 // TODO: This is likely a bug because it goes back update_base_hostname, not
1173 // what we want to do.
1174 update_base_hostname(ada::serializers::ipv6(address));
1175 ada_log("parse_ipv6 ", get_hostname());
1177 host_type = IPV6;
1178 return true;
1179}
1180
1181bool url_aggregator::parse_opaque_host(std::string_view input) {
1182 ada_log("parse_opaque_host ", input, " [", input.size(), " bytes]");
1184 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
1185 if (std::any_of(input.begin(), input.end(),
1186 ada::unicode::is_forbidden_host_code_point)) {
1187 return is_valid = false;
1188 }
1189
1190 // Return the result of running UTF-8 percent-encode on input using the C0
1191 // control percent-encode set.
1194 if (idx == input.size()) {
1195 update_base_hostname(input);
1196 } else {
1197 // We only create a temporary string if we need to.
1198 update_base_hostname(ada::unicode::percent_encode(
1200 }
1202 return true;
1203}
1204
1205[[nodiscard]] std::string url_aggregator::to_diagram() const {
1206 if (!is_valid) {
1207 return "invalid";
1208 }
1209 std::string answer;
1210 answer.append(buffer);
1211 answer.append(" [");
1212 answer.append(std::to_string(buffer.size()));
1213 answer.append(" bytes]");
1214 answer.append("\n");
1215 // first line
1216 std::string line1;
1217 line1.resize(buffer.size(), ' ');
1218 if (components.hash_start != url_components::omitted) {
1219 line1[components.hash_start] = '|';
1220 }
1221 if (components.search_start != url_components::omitted) {
1222 line1[components.search_start] = '|';
1223 }
1224 if (components.pathname_start != buffer.size()) {
1225 line1[components.pathname_start] = '|';
1226 }
1227 if (components.host_end != buffer.size()) {
1228 line1[components.host_end] = '|';
1229 }
1230 if (components.host_start != buffer.size()) {
1231 line1[components.host_start] = '|';
1232 }
1233 if (components.username_end != buffer.size()) {
1234 line1[components.username_end] = '|';
1235 }
1236 if (components.protocol_end != buffer.size()) {
1237 line1[components.protocol_end] = '|';
1238 }
1239 answer.append(line1);
1240 answer.append("\n");
1241
1242 std::string line2 = line1;
1243 if (components.hash_start != url_components::omitted) {
1244 line2[components.hash_start] = '`';
1245 line1[components.hash_start] = ' ';
1246
1247 for (size_t i = components.hash_start + 1; i < line2.size(); i++) {
1248 line2[i] = '-';
1249 }
1250 line2.append(" hash_start");
1251 answer.append(line2);
1252 answer.append("\n");
1253 }
1254
1255 std::string line3 = line1;
1256 if (components.search_start != url_components::omitted) {
1257 line3[components.search_start] = '`';
1258 line1[components.search_start] = ' ';
1259
1260 for (size_t i = components.search_start + 1; i < line3.size(); i++) {
1261 line3[i] = '-';
1262 }
1263 line3.append(" search_start ");
1264 line3.append(std::to_string(components.search_start));
1265 answer.append(line3);
1266 answer.append("\n");
1267 }
1268
1269 std::string line4 = line1;
1270 if (components.pathname_start != buffer.size()) {
1271 line4[components.pathname_start] = '`';
1272 line1[components.pathname_start] = ' ';
1273 for (size_t i = components.pathname_start + 1; i < line4.size(); i++) {
1274 line4[i] = '-';
1275 }
1276 line4.append(" pathname_start ");
1277 line4.append(std::to_string(components.pathname_start));
1278 answer.append(line4);
1279 answer.append("\n");
1280 }
1281
1282 std::string line5 = line1;
1283 if (components.host_end != buffer.size()) {
1284 line5[components.host_end] = '`';
1285 line1[components.host_end] = ' ';
1286
1287 for (size_t i = components.host_end + 1; i < line5.size(); i++) {
1288 line5[i] = '-';
1289 }
1290 line5.append(" host_end ");
1291 line5.append(std::to_string(components.host_end));
1292 answer.append(line5);
1293 answer.append("\n");
1294 }
1295
1296 std::string line6 = line1;
1297 if (components.host_start != buffer.size()) {
1298 line6[components.host_start] = '`';
1299 line1[components.host_start] = ' ';
1300
1301 for (size_t i = components.host_start + 1; i < line6.size(); i++) {
1302 line6[i] = '-';
1303 }
1304 line6.append(" host_start ");
1305 line6.append(std::to_string(components.host_start));
1306 answer.append(line6);
1307 answer.append("\n");
1308 }
1309
1310 std::string line7 = line1;
1311 if (components.username_end != buffer.size()) {
1312 line7[components.username_end] = '`';
1313 line1[components.username_end] = ' ';
1314
1315 for (size_t i = components.username_end + 1; i < line7.size(); i++) {
1316 line7[i] = '-';
1317 }
1318 line7.append(" username_end ");
1319 line7.append(std::to_string(components.username_end));
1320 answer.append(line7);
1321 answer.append("\n");
1322 }
1323
1324 std::string line8 = line1;
1325 if (components.protocol_end != buffer.size()) {
1326 line8[components.protocol_end] = '`';
1327 line1[components.protocol_end] = ' ';
1328
1329 for (size_t i = components.protocol_end + 1; i < line8.size(); i++) {
1330 line8[i] = '-';
1331 }
1332 line8.append(" protocol_end ");
1333 line8.append(std::to_string(components.protocol_end));
1334 answer.append(line8);
1335 answer.append("\n");
1336 }
1337
1338 if (components.hash_start == url_components::omitted) {
1339 answer.append("note: hash omitted\n");
1340 }
1341 if (components.search_start == url_components::omitted) {
1342 answer.append("note: search omitted\n");
1343 }
1344 if (components.protocol_end > buffer.size()) {
1345 answer.append("warning: protocol_end overflows\n");
1346 }
1347 if (components.username_end > buffer.size()) {
1348 answer.append("warning: username_end overflows\n");
1349 }
1350 if (components.host_start > buffer.size()) {
1351 answer.append("warning: host_start overflows\n");
1352 }
1353 if (components.host_end > buffer.size()) {
1354 answer.append("warning: host_end overflows\n");
1355 }
1356 if (components.pathname_start > buffer.size()) {
1357 answer.append("warning: pathname_start overflows\n");
1358 }
1359 return answer;
1360}
1361
1362void url_aggregator::delete_dash_dot() {
1363 ada_log("url_aggregator::delete_dash_dot");
1365 ADA_ASSERT_TRUE(has_dash_dot());
1366 buffer.erase(components.host_end, 2);
1367 components.pathname_start -= 2;
1368 if (components.search_start != url_components::omitted) {
1369 components.search_start -= 2;
1370 }
1371 if (components.hash_start != url_components::omitted) {
1372 components.hash_start -= 2;
1373 }
1375 ADA_ASSERT_TRUE(!has_dash_dot());
1376}
1377
1378inline void url_aggregator::consume_prepared_path(std::string_view input) {
1379 ada_log("url_aggregator::consume_prepared_path ", input);
1380
1389 uint8_t accumulator = checkers::path_signature(input);
1390 // Let us first detect a trivial case.
1391 // If it is special, we check that we have no dot, no %, no \ and no
1392 // character needing percent encoding. Otherwise, we check that we have no %,
1393 // no dot, and no character needing percent encoding.
1394 constexpr uint8_t need_encoding = 1;
1395 constexpr uint8_t backslash_char = 2;
1396 constexpr uint8_t dot_char = 4;
1397 constexpr uint8_t percent_char = 8;
1398 bool special = type != ada::scheme::NOT_SPECIAL;
1399 bool may_need_slow_file_handling = (type == ada::scheme::type::FILE &&
1401 bool trivial_path =
1402 (special ? (accumulator == 0)
1403 : ((accumulator & (need_encoding | dot_char | percent_char)) ==
1404 0)) &&
1405 (!may_need_slow_file_handling);
1406 if (accumulator == dot_char && !may_need_slow_file_handling) {
1407 // '4' means that we have at least one dot, but nothing that requires
1408 // percent encoding or decoding. The only part that is not trivial is
1409 // that we may have single dots and double dots path segments.
1410 // If we have such segments, then we either have a path that begins
1411 // with '.' (easy to check), or we have the sequence './'.
1412 // Note: input cannot be empty, it must at least contain one character ('.')
1413 // Note: we know that '\' is not present.
1414 if (input[0] != '.') {
1415 size_t slashdot = input.find("/.");
1416 if (slashdot == std::string_view::npos) { // common case
1417 trivial_path = true;
1418 } else { // uncommon
1419 // only three cases matter: /./, /.. or a final /
1420 trivial_path =
1421 !(slashdot + 2 == input.size() || input[slashdot + 2] == '.' ||
1422 input[slashdot + 2] == '/');
1423 }
1424 }
1425 }
1426 if (trivial_path && is_at_path()) {
1427 ada_log("parse_path trivial");
1428 buffer += '/';
1429 buffer += input;
1430 return;
1431 }
1432 std::string path = std::string(get_pathname());
1433 // We are going to need to look a bit at the path, but let us see if we can
1434 // ignore percent encoding *and* backslashes *and* percent characters.
1435 // Except for the trivial case, this is likely to capture 99% of paths out
1436 // there.
1437 bool fast_path =
1438 (special &&
1439 (accumulator & (need_encoding | backslash_char | percent_char)) == 0) &&
1440 (type != ada::scheme::type::FILE);
1441 if (fast_path) {
1442 ada_log("parse_prepared_path fast");
1443 // Here we don't need to worry about \ or percent encoding.
1444 // We also do not have a file protocol. We might have dots, however,
1445 // but dots must as appear as '.', and they cannot be encoded because
1446 // the symbol '%' is not present.
1447 size_t previous_location = 0; // We start at 0.
1448 do {
1449 size_t new_location = input.find('/', previous_location);
1450 // std::string_view path_view = input;
1451 // We process the last segment separately:
1452 if (new_location == std::string_view::npos) {
1453 std::string_view path_view = input.substr(previous_location);
1454 if (path_view == "..") { // The path ends with ..
1455 // e.g., if you receive ".." with an empty path, you go to "/".
1456 if (path.empty()) {
1457 path = '/';
1458 update_base_pathname(path);
1459 return;
1460 }
1461 // Fast case where we have nothing to do:
1462 if (path.back() == '/') {
1463 update_base_pathname(path);
1464 return;
1465 }
1466 // If you have the path "/joe/myfriend",
1467 // then you delete 'myfriend'.
1468 path.resize(path.rfind('/') + 1);
1469 update_base_pathname(path);
1470 return;
1471 }
1472 path += '/';
1473 if (path_view != ".") {
1474 path.append(path_view);
1475 }
1476 update_base_pathname(path);
1477 return;
1478 } else {
1479 // This is a non-final segment.
1480 std::string_view path_view =
1481 input.substr(previous_location, new_location - previous_location);
1482 previous_location = new_location + 1;
1483 if (path_view == "..") {
1484 size_t last_delimiter = path.rfind('/');
1485 if (last_delimiter != std::string::npos) {
1486 path.erase(last_delimiter);
1487 }
1488 } else if (path_view != ".") {
1489 path += '/';
1490 path.append(path_view);
1491 }
1492 }
1493 } while (true);
1494 } else {
1495 ada_log("parse_path slow");
1496 // we have reached the general case
1497 bool needs_percent_encoding = (accumulator & 1);
1498 std::string path_buffer_tmp;
1499 do {
1500 size_t location = (special && (accumulator & 2))
1501 ? input.find_first_of("/\\")
1502 : input.find('/');
1503 std::string_view path_view = input;
1504 if (location != std::string_view::npos) {
1505 path_view.remove_suffix(path_view.size() - location);
1506 input.remove_prefix(location + 1);
1507 }
1508 // path_buffer is either path_view or it might point at a percent encoded
1509 // temporary string.
1510 std::string_view path_buffer =
1511 (needs_percent_encoding &&
1512 ada::unicode::percent_encode<false>(
1513 path_view, character_sets::PATH_PERCENT_ENCODE, path_buffer_tmp))
1514 ? path_buffer_tmp
1515 : path_view;
1516 if (unicode::is_double_dot_path_segment(path_buffer)) {
1517 if ((helpers::shorten_path(path, type) || special) &&
1518 location == std::string_view::npos) {
1519 path += '/';
1520 }
1521 } else if (unicode::is_single_dot_path_segment(path_buffer) &&
1522 (location == std::string_view::npos)) {
1523 path += '/';
1524 }
1525 // Otherwise, if path_buffer is not a single-dot path segment, then:
1526 else if (!unicode::is_single_dot_path_segment(path_buffer)) {
1527 // If url's scheme is "file", url's path is empty, and path_buffer is a
1528 // Windows drive letter, then replace the second code point in
1529 // path_buffer with U+003A (:).
1530 if (type == ada::scheme::type::FILE && path.empty() &&
1531 checkers::is_windows_drive_letter(path_buffer)) {
1532 path += '/';
1533 path += path_buffer[0];
1534 path += ':';
1535 path_buffer.remove_prefix(2);
1536 path.append(path_buffer);
1537 } else {
1538 // Append path_buffer to url's path.
1539 path += '/';
1540 path.append(path_buffer);
1541 }
1542 }
1543 if (location == std::string_view::npos) {
1544 update_base_pathname(path);
1545 return;
1546 }
1547 } while (true);
1548 }
1549}
1550} // namespace ada
Definitions for URL specific checkers used within Ada.
#define ADA_ASSERT_TRUE(COND)
#define ada_lifetime_bound
#define ada_really_inline
Definition common_defs.h:81
Definitions for helper functions used within Ada.
Definitions for user-facing functions for parsing a URL and its components.
constexpr uint8_t QUERY_PERCENT_ENCODE[32]
constexpr uint8_t SPECIAL_QUERY_PERCENT_ENCODE[32]
constexpr uint8_t PATH_PERCENT_ENCODE[32]
constexpr uint8_t C0_CONTROL_PERCENT_ENCODE[32]
constexpr uint8_t USERINFO_PERCENT_ENCODE[32]
constexpr bool has_hex_prefix(std::string_view input)
constexpr bool is_windows_drive_letter(std::string_view input) noexcept
constexpr bool is_alpha(char x) noexcept
constexpr bool is_digit(char x) noexcept
constexpr ada::scheme::type get_scheme_type(std::string_view scheme) noexcept
Definition scheme-inl.h:72
@ NOT_SPECIAL
Definition scheme.h:30
std::string ipv6(const std::array< uint16_t, 8 > &address) noexcept
std::string ipv4(uint64_t address) noexcept
ada_really_inline size_t percent_encode_index(const std::string_view input, const uint8_t character_set[])
Definition unicode-inl.h:19
Definition ada_idna.h:13
@ IPV6
Definition url_base.h:31
@ IPV4
Definition url_base.h:26
template ada::result< url_aggregator > parse< url_aggregator >(std::string_view input, const url_aggregator *base_url)
tl::expected< result_type, ada::errors > result
ada_warn_unused ada::result< result_type > parse(std::string_view input, const result_type *base_url=nullptr)
Declarations for the URL scheme.
Lightweight URL struct.
constexpr bool has_non_empty_password() const noexcept
void set_hash(std::string_view input)
constexpr bool validate() const noexcept
void clear_search() override
std::string_view get_hostname() const noexcept ada_lifetime_bound
std::string to_string() const override
std::string_view get_hash() const noexcept ada_lifetime_bound
std::string to_diagram() const
constexpr bool has_hostname() const noexcept
bool set_protocol(std::string_view input)
std::string get_origin() const noexcept override
constexpr std::string_view get_href() const noexcept ada_lifetime_bound
std::string_view get_search() const noexcept ada_lifetime_bound
bool has_valid_domain() const noexcept override
bool set_hostname(std::string_view input)
bool set_password(std::string_view input)
constexpr std::string_view get_pathname() const noexcept ada_lifetime_bound
bool set_pathname(std::string_view input)
std::string_view get_protocol() const noexcept ada_lifetime_bound
std::string_view get_password() const noexcept ada_lifetime_bound
bool set_href(std::string_view input)
void set_search(std::string_view input)
std::string_view get_port() const noexcept ada_lifetime_bound
constexpr bool has_port() const noexcept
ada_really_inline constexpr bool has_credentials() const noexcept
bool set_host(std::string_view input)
std::string_view get_host() const noexcept ada_lifetime_bound
bool set_port(std::string_view input)
constexpr bool has_non_empty_username() const noexcept
std::string_view get_username() const noexcept ada_lifetime_bound
bool set_username(std::string_view input)
ada_really_inline constexpr bool is_special() const noexcept
url_host_type host_type
Definition url_base.h:59
bool is_valid
Definition url_base.h:49
bool has_opaque_path
Definition url_base.h:54
static constexpr uint32_t omitted
Definitions for unicode operations.
Inline functions for url aggregator.
Declaration for the basic URL definitions.
Declaration for the URL Components.