Ada 2.9.2
Fast spec-compliant URL parser
Loading...
Searching...
No Matches
url_aggregator.cpp
Go to the documentation of this file.
1#include "ada.h"
2#include "ada/checkers-inl.h"
3#include "ada/helpers.h"
5#include "ada/scheme.h"
6#include "ada/unicode-inl.h"
10
11#include <string>
12#include <string_view>
13
14namespace ada {
15template <bool has_state_override>
16[[nodiscard]] ada_really_inline bool url_aggregator::parse_scheme_with_colon(
17 const std::string_view input_with_colon) {
18 ada_log("url_aggregator::parse_scheme_with_colon ", input_with_colon);
20 ADA_ASSERT_TRUE(!helpers::overlaps(input_with_colon, buffer));
21 std::string_view input{input_with_colon};
22 input.remove_suffix(1);
23 auto parsed_type = ada::scheme::get_scheme_type(input);
24 bool is_input_special = (parsed_type != ada::scheme::NOT_SPECIAL);
29 if (is_input_special) { // fast path!!!
30 if (has_state_override) {
31 // If url's scheme is not a special scheme and buffer is a special scheme,
32 // then return.
33 if (is_special() != is_input_special) {
34 return false;
35 }
36
37 // If url includes credentials or has a non-null port, and buffer is
38 // "file", then return.
39 if ((has_credentials() || components.port != url_components::omitted) &&
40 parsed_type == ada::scheme::type::FILE) {
41 return false;
42 }
43
44 // If url's scheme is "file" and its host is an empty host, then return.
45 // An empty host is the empty string.
46 if (type == ada::scheme::type::FILE &&
47 components.host_start == components.host_end) {
48 return false;
49 }
50 }
51
52 type = parsed_type;
53 set_scheme_from_view_with_colon(input_with_colon);
54
55 if (has_state_override) {
56 // This is uncommon.
57 uint16_t urls_scheme_port = get_special_port();
58
59 // If url's port is url's scheme's default port, then set url's port to
60 // null.
61 if (components.port == urls_scheme_port) {
62 clear_port();
63 }
64 }
65 } else { // slow path
66 std::string _buffer(input);
67 // Next function is only valid if the input is ASCII and returns false
68 // otherwise, but it seems that we always have ascii content so we do not
69 // need to check the return value.
70 unicode::to_lower_ascii(_buffer.data(), _buffer.size());
71
72 if (has_state_override) {
73 // If url's scheme is a special scheme and buffer is not a special scheme,
74 // then return. If url's scheme is not a special scheme and buffer is a
75 // special scheme, then return.
76 if (is_special() != ada::scheme::is_special(_buffer)) {
77 return true;
78 }
79
80 // If url includes credentials or has a non-null port, and buffer is
81 // "file", then return.
82 if ((has_credentials() || components.port != url_components::omitted) &&
83 _buffer == "file") {
84 return true;
85 }
86
87 // If url's scheme is "file" and its host is an empty host, then return.
88 // An empty host is the empty string.
89 if (type == ada::scheme::type::FILE &&
90 components.host_start == components.host_end) {
91 return true;
92 }
93 }
94
95 set_scheme(_buffer);
96
97 if (has_state_override) {
98 // This is uncommon.
99 uint16_t urls_scheme_port = get_special_port();
100
101 // If url's port is url's scheme's default port, then set url's port to
102 // null.
103 if (components.port == urls_scheme_port) {
104 clear_port();
105 }
106 }
107 }
109 return true;
110}
111
112inline void url_aggregator::copy_scheme(const url_aggregator& u) noexcept {
113 ada_log("url_aggregator::copy_scheme ", u.buffer);
114 ADA_ASSERT_TRUE(validate());
115 // next line could overflow but unsigned arithmetic has well-defined
116 // overflows.
117 uint32_t new_difference = u.components.protocol_end - components.protocol_end;
118 type = u.type;
119 buffer.erase(0, components.protocol_end);
120 buffer.insert(0, u.get_protocol());
121 components.protocol_end = u.components.protocol_end;
122
123 // No need to update the components
124 if (new_difference == 0) {
125 return;
126 }
127
128 // Update the rest of the components.
129 components.username_end += new_difference;
130 components.host_start += new_difference;
131 components.host_end += new_difference;
132 components.pathname_start += new_difference;
133 if (components.search_start != url_components::omitted) {
134 components.search_start += new_difference;
135 }
136 if (components.hash_start != url_components::omitted) {
137 components.hash_start += new_difference;
138 }
139 ADA_ASSERT_TRUE(validate());
140}
141
142inline void url_aggregator::set_scheme_from_view_with_colon(
143 std::string_view new_scheme_with_colon) noexcept {
144 ada_log("url_aggregator::set_scheme_from_view_with_colon ",
145 new_scheme_with_colon);
146 ADA_ASSERT_TRUE(validate());
147 ADA_ASSERT_TRUE(!new_scheme_with_colon.empty() &&
148 new_scheme_with_colon.back() == ':');
149 // next line could overflow but unsigned arithmetic has well-defined
150 // overflows.
151 uint32_t new_difference =
152 uint32_t(new_scheme_with_colon.size()) - components.protocol_end;
153
154 if (buffer.empty()) {
155 buffer.append(new_scheme_with_colon);
156 } else {
157 buffer.erase(0, components.protocol_end);
158 buffer.insert(0, new_scheme_with_colon);
159 }
160 components.protocol_end += new_difference;
161
162 // Update the rest of the components.
163 components.username_end += new_difference;
164 components.host_start += new_difference;
165 components.host_end += new_difference;
166 components.pathname_start += new_difference;
167 if (components.search_start != url_components::omitted) {
168 components.search_start += new_difference;
169 }
170 if (components.hash_start != url_components::omitted) {
171 components.hash_start += new_difference;
172 }
173 ADA_ASSERT_TRUE(validate());
174}
175
176inline void url_aggregator::set_scheme(std::string_view new_scheme) noexcept {
177 ada_log("url_aggregator::set_scheme ", new_scheme);
178 ADA_ASSERT_TRUE(validate());
179 ADA_ASSERT_TRUE(new_scheme.empty() || new_scheme.back() != ':');
180 // next line could overflow but unsigned arithmetic has well-defined
181 // overflows.
182 uint32_t new_difference =
183 uint32_t(new_scheme.size()) - components.protocol_end + 1;
184
186 if (buffer.empty()) {
187 buffer.append(helpers::concat(new_scheme, ":"));
188 } else {
189 buffer.erase(0, components.protocol_end);
190 buffer.insert(0, helpers::concat(new_scheme, ":"));
191 }
192 components.protocol_end = uint32_t(new_scheme.size() + 1);
193
194 // Update the rest of the components.
195 components.username_end += new_difference;
196 components.host_start += new_difference;
197 components.host_end += new_difference;
198 components.pathname_start += new_difference;
199 if (components.search_start != url_components::omitted) {
200 components.search_start += new_difference;
201 }
202 if (components.hash_start != url_components::omitted) {
203 components.hash_start += new_difference;
204 }
205 ADA_ASSERT_TRUE(validate());
206}
207
208bool url_aggregator::set_protocol(const std::string_view input) {
209 ada_log("url_aggregator::set_protocol ", input);
211 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
212 std::string view(input);
213 helpers::remove_ascii_tab_or_newline(view);
214 if (view.empty()) {
215 return true;
216 }
217
218 // Schemes should start with alpha values.
219 if (!checkers::is_alpha(view[0])) {
220 return false;
221 }
222
223 view.append(":");
224
225 std::string::iterator pointer =
226 std::find_if_not(view.begin(), view.end(), unicode::is_alnum_plus);
227
228 if (pointer != view.end() && *pointer == ':') {
229 return parse_scheme_with_colon<true>(
230 std::string_view(view.data(), pointer - view.begin() + 1));
231 }
232 return false;
233}
234
// Setter for the username. Returns false, leaving the URL untouched, when
// cannot_have_credentials_or_port() is true; otherwise stores the value via
// update_base_username() and returns true. The input is stored verbatim when
// `idx == input.size()`, and in percent-encoded form otherwise.
// NOTE(review): this listing has dropped source lines (the embedded numbering
// jumps 241->244 and 248->250): `idx` is never declared here and the
// percent_encode(...) call is left unterminated. Restore both from the
// upstream file before building.
235bool url_aggregator::set_username(const std::string_view input) {
236 ada_log("url_aggregator::set_username '", input, "' ");
238 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
239 if (cannot_have_credentials_or_port()) {
240 return false;
241 }
244 if (idx == input.size()) {
245 update_base_username(input);
246 } else {
247 // We only create a temporary string if we have to!
248 update_base_username(ada::unicode::percent_encode(
250 }
252 return true;
253}
254
// Setter for the password. Returns false, leaving the URL untouched, when
// cannot_have_credentials_or_port() is true; otherwise stores the value via
// update_base_password() and returns true. The input is stored verbatim when
// `idx == input.size()`, and in percent-encoded form otherwise.
// NOTE(review): this listing has dropped source lines (the embedded numbering
// jumps 261->264 and 268->270): `idx` is never declared here and the
// percent_encode(...) call is left unterminated. Restore both from the
// upstream file before building.
255bool url_aggregator::set_password(const std::string_view input) {
256 ada_log("url_aggregator::set_password '", input, "'");
258 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
259 if (cannot_have_credentials_or_port()) {
260 return false;
261 }
264 if (idx == input.size()) {
265 update_base_password(input);
266 } else {
267 // We only create a temporary string if we have to!
268 update_base_password(ada::unicode::percent_encode(
270 }
272 return true;
273}
274
275bool url_aggregator::set_port(const std::string_view input) {
276 ada_log("url_aggregator::set_port ", input);
278 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
279 if (cannot_have_credentials_or_port()) {
280 return false;
281 }
282 std::string trimmed(input);
283 helpers::remove_ascii_tab_or_newline(trimmed);
284 if (trimmed.empty()) {
285 clear_port();
286 return true;
287 }
288 // Input should not start with control characters.
289 if (ada::unicode::is_c0_control_or_space(trimmed.front())) {
290 return false;
291 }
292 // Input should contain at least one ascii digit.
293 if (input.find_first_of("0123456789") == std::string_view::npos) {
294 return false;
295 }
296
297 // Revert changes if parse_port fails.
298 uint32_t previous_port = components.port;
299 parse_port(trimmed);
300 if (is_valid) {
301 return true;
302 }
303 update_base_port(previous_port);
304 is_valid = true;
306 return false;
307}
308
309bool url_aggregator::set_pathname(const std::string_view input) {
310 ada_log("url_aggregator::set_pathname ", input);
312 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
313 if (has_opaque_path) {
314 return false;
315 }
316 clear_pathname();
317 parse_path(input);
318 if (get_pathname().starts_with("//") && !has_authority() && !has_dash_dot()) {
319 buffer.insert(components.pathname_start, "/.");
320 components.pathname_start += 2;
321 }
323 return true;
324}
325
326ada_really_inline void url_aggregator::parse_path(std::string_view input) {
327 ada_log("url_aggregator::parse_path ", input);
329 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
330 std::string tmp_buffer;
331 std::string_view internal_input;
332 if (unicode::has_tabs_or_newline(input)) {
333 tmp_buffer = input;
334 // Optimization opportunity: Instead of copying and then pruning, we could
335 // just directly build the string from user_input.
336 helpers::remove_ascii_tab_or_newline(tmp_buffer);
337 internal_input = tmp_buffer;
338 } else {
339 internal_input = input;
340 }
341
342 // If url is special, then:
343 if (is_special()) {
344 if (internal_input.empty()) {
345 update_base_pathname("/");
346 } else if ((internal_input[0] == '/') || (internal_input[0] == '\\')) {
347 consume_prepared_path(internal_input.substr(1));
348 } else {
349 consume_prepared_path(internal_input);
350 }
351 } else if (!internal_input.empty()) {
352 if (internal_input[0] == '/') {
353 consume_prepared_path(internal_input.substr(1));
354 } else {
355 consume_prepared_path(internal_input);
356 }
357 } else {
358 // Non-special URLs with an empty host can have their paths erased
359 // Path-only URLs cannot have their paths erased
360 if (components.host_start == components.host_end && !has_authority()) {
361 update_base_pathname("/");
362 }
363 }
365}
366
// Setter for the query ("search") component.
// An empty input removes the query via clear_search() and then calls
// helpers::strip_trailing_spaces_from_opaque_path(). Otherwise a single
// leading '?' is dropped, tabs and newlines are removed, and the result is
// handed to update_base_search() together with a percent-encode set.
// NOTE(review): the initializer of `query_percent_encode_set` is missing from
// this listing (the embedded numbering jumps 381->384); restore it from the
// upstream file before building.
367void url_aggregator::set_search(const std::string_view input) {
368 ada_log("url_aggregator::set_search ", input);
370 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
371 if (input.empty()) {
372 clear_search();
373 helpers::strip_trailing_spaces_from_opaque_path(*this);
374 return;
375 }
376
377 std::string new_value;
// Drop one leading '?' if present.
378 new_value = input[0] == '?' ? input.substr(1) : input;
379 helpers::remove_ascii_tab_or_newline(new_value);
380
381 auto query_percent_encode_set =
384
385 update_base_search(new_value, query_percent_encode_set);
387}
388
389void url_aggregator::set_hash(const std::string_view input) {
390 ada_log("url_aggregator::set_hash ", input);
392 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
393 if (input.empty()) {
394 if (components.hash_start != url_components::omitted) {
395 buffer.resize(components.hash_start);
397 }
398 helpers::strip_trailing_spaces_from_opaque_path(*this);
399 return;
400 }
401
402 std::string new_value;
403 new_value = input[0] == '#' ? input.substr(1) : input;
404 helpers::remove_ascii_tab_or_newline(new_value);
405 update_unencoded_base_hash(new_value);
407}
408
409bool url_aggregator::set_href(const std::string_view input) {
410 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
411 ada_log("url_aggregator::set_href ", input, " [", input.size(), " bytes]");
413 ada_log("url_aggregator::set_href, success :", out.has_value());
414
415 if (out) {
416 ada_log("url_aggregator::set_href, parsed ", out->to_string());
417 // TODO: Figure out why the following line puts test to never finish.
418 *this = *out;
419 }
420
421 return out.has_value();
422}
423
424ada_really_inline bool url_aggregator::parse_host(std::string_view input) {
425 ada_log("url_aggregator:parse_host \"", input, "\" [", input.size(),
426 " bytes]");
428 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
429 if (input.empty()) {
430 return is_valid = false;
431 } // technically unnecessary.
432 // If input starts with U+005B ([), then:
433 if (input[0] == '[') {
434 // If input does not end with U+005D (]), validation error, return failure.
435 if (input.back() != ']') {
436 return is_valid = false;
437 }
438 ada_log("parse_host ipv6");
439
440 // Return the result of IPv6 parsing input with its leading U+005B ([) and
441 // trailing U+005D (]) removed.
442 input.remove_prefix(1);
443 input.remove_suffix(1);
444 return parse_ipv6(input);
445 }
446
447 // If isNotSpecial is true, then return the result of opaque-host parsing
448 // input.
449 if (!is_special()) {
450 return parse_opaque_host(input);
451 }
452 // Let domain be the result of running UTF-8 decode without BOM on the
453 // percent-decoding of input. Let asciiDomain be the result of running domain
454 // to ASCII with domain and false. The most common case is an ASCII input, in
455 // which case we do not need to call the expensive 'to_ascii' if a few
456 // conditions are met: no '%' and no 'xn-' subsequence.
457
458 // Often, the input does not contain any forbidden code points, and no upper
459 // case ASCII letter, then we can just copy it to the buffer. We want to
460 // optimize for such a common case.
461 uint8_t is_forbidden_or_upper =
462 unicode::contains_forbidden_domain_code_point_or_upper(input.data(),
463 input.size());
464 // Minor optimization opportunity:
465 // contains_forbidden_domain_code_point_or_upper could be extend to check for
466 // the presence of characters that cannot appear in the ipv4 address and we
467 // could also check whether x and n and - are present, and so we could skip
468 // some of the checks below. However, the gains are likely to be small, and
469 // the code would be more complex.
470 if (is_forbidden_or_upper == 0 &&
471 input.find("xn-") == std::string_view::npos) {
472 // fast path
473 update_base_hostname(input);
474 if (checkers::is_ipv4(get_hostname())) {
475 ada_log("parse_host fast path ipv4");
476 return parse_ipv4(get_hostname(), true);
477 }
478 ada_log("parse_host fast path ", get_hostname());
479 return true;
480 }
481 // We have encountered at least one forbidden code point or the input contains
482 // 'xn-' (case insensitive), so we need to call 'to_ascii' to perform the full
483 // conversion.
484
485 ada_log("parse_host calling to_ascii");
486 std::optional<std::string> host = std::string(get_hostname());
487 is_valid = ada::unicode::to_ascii(host, input, input.find('%'));
488 if (!is_valid) {
489 ada_log("parse_host to_ascii returns false");
490 return is_valid = false;
491 }
492 ada_log("parse_host to_ascii succeeded ", *host, " [", host->size(),
493 " bytes]");
494
495 if (std::any_of(host.value().begin(), host.value().end(),
496 ada::unicode::is_forbidden_domain_code_point)) {
497 return is_valid = false;
498 }
499
500 // If asciiDomain ends in a number, then return the result of IPv4 parsing
501 // asciiDomain.
502 if (checkers::is_ipv4(host.value())) {
503 ada_log("parse_host got ipv4 ", *host);
504 return parse_ipv4(host.value(), false);
505 }
506
507 update_base_hostname(host.value());
509 return true;
510}
511
512template <bool override_hostname>
513bool url_aggregator::set_host_or_hostname(const std::string_view input) {
514 ada_log("url_aggregator::set_host_or_hostname ", input);
516 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
517 if (has_opaque_path) {
518 return false;
519 }
520
521 std::string previous_host(get_hostname());
522 uint32_t previous_port = components.port;
523
524 size_t host_end_pos = input.find('#');
525 std::string _host(input.data(), host_end_pos != std::string_view::npos
526 ? host_end_pos
527 : input.size());
528 helpers::remove_ascii_tab_or_newline(_host);
529 std::string_view new_host(_host);
530
531 // If url's scheme is "file", then set state to file host state, instead of
532 // host state.
533 if (type != ada::scheme::type::FILE) {
534 std::string_view host_view(_host.data(), _host.length());
535 auto [location, found_colon] =
536 helpers::get_host_delimiter_location(is_special(), host_view);
537
538 // Otherwise, if c is U+003A (:) and insideBrackets is false, then:
539 // Note: the 'found_colon' value is true if and only if a colon was
540 // encountered while not inside brackets.
541 if (found_colon) {
542 if (override_hostname) {
543 return false;
544 }
545 std::string_view sub_buffer = new_host.substr(location + 1);
546 if (!sub_buffer.empty()) {
547 set_port(sub_buffer);
548 }
549 }
550 // If url is special and host_view is the empty string, validation error,
551 // return failure. Otherwise, if state override is given, host_view is the
552 // empty string, and either url includes credentials or url's port is
553 // non-null, return.
554 else if (host_view.empty() &&
555 (is_special() || has_credentials() || has_port())) {
556 return false;
557 }
558
559 // Let host be the result of host parsing host_view with url is not special.
560 if (host_view.empty() && !is_special()) {
561 if (has_hostname()) {
562 clear_hostname(); // easy!
563 } else if (has_dash_dot()) {
564 add_authority_slashes_if_needed();
565 delete_dash_dot();
566 }
567 return true;
568 }
569
570 bool succeeded = parse_host(host_view);
571 if (!succeeded) {
572 update_base_hostname(previous_host);
573 update_base_port(previous_port);
574 } else if (has_dash_dot()) {
575 // Should remove dash_dot from pathname
576 delete_dash_dot();
577 }
578 return succeeded;
579 }
580
581 size_t location = new_host.find_first_of("/\\?");
582 if (location != std::string_view::npos) {
583 new_host.remove_suffix(new_host.length() - location);
584 }
585
586 if (new_host.empty()) {
587 // Set url's host to the empty string.
588 clear_hostname();
589 } else {
590 // Let host be the result of host parsing buffer with url is not special.
591 if (!parse_host(new_host)) {
592 update_base_hostname(previous_host);
593 update_base_port(previous_port);
594 return false;
595 }
596
597 // If host is "localhost", then set host to the empty string.
598 if (helpers::substring(buffer, components.host_start,
599 components.host_end) == "localhost") {
600 clear_hostname();
601 }
602 }
604 return true;
605}
606
607bool url_aggregator::set_host(const std::string_view input) {
608 ada_log("url_aggregator::set_host '", input, "'");
610 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
611 return set_host_or_hostname<false>(input);
612}
613
614bool url_aggregator::set_hostname(const std::string_view input) {
615 ada_log("url_aggregator::set_hostname '", input, "'");
617 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
618 return set_host_or_hostname<true>(input);
619}
620
621[[nodiscard]] std::string url_aggregator::get_origin() const noexcept {
622 ada_log("url_aggregator::get_origin");
623 if (is_special()) {
624 // Return a new opaque origin.
625 if (type == scheme::FILE) {
626 return "null";
627 }
628
629 return helpers::concat(get_protocol(), "//", get_host());
630 }
631
632 if (get_protocol() == "blob:") {
633 std::string_view path = get_pathname();
634 if (!path.empty()) {
635 auto out = ada::parse<ada::url_aggregator>(path);
636 if (out && (out->type == scheme::HTTP || out->type == scheme::HTTPS)) {
637 // If pathURL's scheme is not "http" and not "https", then return a
638 // new opaque origin.
639 return helpers::concat(out->get_protocol(), "//", out->get_host());
640 }
641 }
642 }
643
644 // Return a new opaque origin.
645 return "null";
646}
647
// Getter for the username: a view into `buffer` spanning
// [protocol_end + 2, username_end), or "" on the fall-through path.
// NOTE(review): this listing has dropped source lines (the embedded numbering
// jumps 648->650->652): the opening brace of the body and the condition
// guarding the first return are missing. Restore them from the upstream file.
648[[nodiscard]] std::string_view url_aggregator::get_username() const noexcept
650 ada_log("url_aggregator::get_username");
652 return helpers::substring(buffer, components.protocol_end + 2,
653 components.username_end);
654 }
655 return "";
656}
657
// Getter for the password: a view into `buffer` spanning
// [username_end + 1, host_start), or "" on the fall-through path.
// NOTE(review): this listing has dropped source lines (the embedded numbering
// jumps 658->660->662): the opening brace of the body and the condition
// guarding the first return are missing. Restore them from the upstream file.
658[[nodiscard]] std::string_view url_aggregator::get_password() const noexcept
660 ada_log("url_aggregator::get_password");
662 return helpers::substring(buffer, components.username_end + 1,
663 components.host_start);
664 }
665 return "";
666}
667
// Getter for the port: "" when components.port is url_components::omitted,
// otherwise a view into `buffer` spanning [host_end + 1, pathname_start).
// NOTE(review): the line carrying the opening brace of the body is missing
// from this listing (the embedded numbering jumps 668->670).
668[[nodiscard]] std::string_view url_aggregator::get_port() const noexcept
670 ada_log("url_aggregator::get_port");
671 if (components.port == url_components::omitted) {
672 return "";
673 }
// host_end + 1 skips the single character between the host and the port.
674 return helpers::substring(buffer, components.host_end + 1,
675 components.pathname_start);
676}
677
// Getter for the fragment: a view into `buffer` from hash_start to the end of
// the buffer, or "" when there is no fragment or it is at most one character
// long (i.e. nothing follows the leading '#').
// NOTE(review): the line carrying the opening brace of the body is missing
// from this listing (the embedded numbering jumps 678->680).
678[[nodiscard]] std::string_view url_aggregator::get_hash() const noexcept
680 ada_log("url_aggregator::get_hash");
681 // If this's URL's fragment is either null or the empty string, then return
682 // the empty string. Return U+0023 (#), followed by this's URL's fragment.
683 if (components.hash_start == url_components::omitted) {
684 return "";
685 }
686 if (buffer.size() - components.hash_start <= 1) {
687 return "";
688 }
689 return helpers::substring(buffer, components.hash_start);
690}
691
// Getter for the host: a view into `buffer` running from the start of the
// host (skipping a leading '@') up to pathname_start, so it covers whatever
// lies between host_end and pathname_start as well. Returns an empty view
// when the host itself is empty.
// NOTE(review): the line carrying the opening brace of the body is missing
// from this listing (the embedded numbering jumps 692->694).
692[[nodiscard]] std::string_view url_aggregator::get_host() const noexcept
694 ada_log("url_aggregator::get_host");
695 // Technically, we should check if there is a hostname, but
696 // the code below works even if there isn't.
697 // if(!has_hostname()) { return ""; }
698 size_t start = components.host_start;
// host_start may point at the '@' that precedes the host; skip it.
699 if (components.host_end > components.host_start &&
700 buffer[components.host_start] == '@') {
701 start++;
702 }
703 // if we have an empty host, then the space between components.host_end and
704 // components.pathname_start may be occupied by /.
705 if (start == components.host_end) {
706 return {};
707 }
708 return helpers::substring(buffer, start, components.pathname_start);
709}
710
// Getter for the hostname: a view into `buffer` running from the start of the
// host (skipping a leading '@') up to host_end.
// NOTE(review): the line carrying the opening brace of the body is missing
// from this listing (the embedded numbering jumps 711->713).
711[[nodiscard]] std::string_view url_aggregator::get_hostname() const noexcept
713 ada_log("url_aggregator::get_hostname");
714 // Technically, we should check if there is a hostname, but
715 // the code below works even if there isn't.
716 // if(!has_hostname()) { return ""; }
717 size_t start = components.host_start;
718 // So host_start is not where the host begins.
719 if (components.host_end > components.host_start &&
720 buffer[components.host_start] == '@') {
721 start++;
722 }
723 return helpers::substring(buffer, start, components.host_end);
724}
725
// Getter for the path: a view into `buffer` starting at pathname_start and
// ending at the query if there is one, else at the fragment if there is one,
// else at the end of the buffer.
// NOTE(review): the line carrying the opening brace of the body is missing
// from this listing (the embedded numbering jumps 726->728).
726[[nodiscard]] std::string_view url_aggregator::get_pathname() const noexcept
728 ada_log("url_aggregator::get_pathname pathname_start = ",
729 components.pathname_start, " buffer.size() = ", buffer.size(),
730 " components.search_start = ", components.search_start,
731 " components.hash_start = ", components.hash_start);
732 auto ending_index = uint32_t(buffer.size());
733 if (components.search_start != url_components::omitted) {
734 ending_index = components.search_start;
735 } else if (components.hash_start != url_components::omitted) {
736 ending_index = components.hash_start;
737 }
738 return helpers::substring(buffer, components.pathname_start, ending_index);
739}
740
// Getter for the query: a view into `buffer` from search_start up to the
// fragment (or the end of the buffer), or "" when there is no query or it is
// at most one character long (i.e. nothing follows the leading '?').
// NOTE(review): the line carrying the opening brace of the body is missing
// from this listing (the embedded numbering jumps 741->743).
741[[nodiscard]] std::string_view url_aggregator::get_search() const noexcept
743 ada_log("url_aggregator::get_search");
744 // If this's URL's query is either null or the empty string, then return the
745 // empty string. Return U+003F (?), followed by this's URL's query.
746 if (components.search_start == url_components::omitted) {
747 return "";
748 }
749 auto ending_index = uint32_t(buffer.size());
750 if (components.hash_start != url_components::omitted) {
751 ending_index = components.hash_start;
752 }
753 if (ending_index - components.search_start <= 1) {
754 return "";
755 }
756 return helpers::substring(buffer, components.search_start, ending_index);
757}
758
// Getter for the protocol: a view into `buffer` spanning [0, protocol_end).
// NOTE(review): the line carrying the opening brace of the body is missing
// from this listing (the embedded numbering jumps 759->761).
759[[nodiscard]] std::string_view url_aggregator::get_protocol() const noexcept
761 ada_log("url_aggregator::get_protocol");
762 return helpers::substring(buffer, 0, components.protocol_end);
763}
764
765[[nodiscard]] std::string ada::url_aggregator::to_string() const {
766 ada_log("url_aggregator::to_string buffer:", buffer, " [", buffer.size(),
767 " bytes]");
768 if (!is_valid) {
769 return "null";
770 }
771
772 std::string answer;
773 auto back = std::back_insert_iterator(answer);
774 answer.append("{\n");
775
776 answer.append("\t\"buffer\":\"");
777 helpers::encode_json(buffer, back);
778 answer.append("\",\n");
779
780 answer.append("\t\"protocol\":\"");
781 helpers::encode_json(get_protocol(), back);
782 answer.append("\",\n");
783
784 if (has_credentials()) {
785 answer.append("\t\"username\":\"");
786 helpers::encode_json(get_username(), back);
787 answer.append("\",\n");
788 answer.append("\t\"password\":\"");
789 helpers::encode_json(get_password(), back);
790 answer.append("\",\n");
791 }
792
793 answer.append("\t\"host\":\"");
794 helpers::encode_json(get_host(), back);
795 answer.append("\",\n");
796
797 answer.append("\t\"path\":\"");
798 helpers::encode_json(get_pathname(), back);
799 answer.append("\",\n");
800 answer.append("\t\"opaque path\":");
801 answer.append((has_opaque_path ? "true" : "false"));
802 answer.append(",\n");
803
804 if (components.search_start != url_components::omitted) {
805 answer.append("\t\"query\":\"");
806 helpers::encode_json(get_search(), back);
807 answer.append("\",\n");
808 }
809 if (components.hash_start != url_components::omitted) {
810 answer.append("\t\"fragment\":\"");
811 helpers::encode_json(get_hash(), back);
812 answer.append("\",\n");
813 }
814
815 auto convert_offset_to_string = [](uint32_t offset) -> std::string {
816 if (offset == url_components::omitted) {
817 return "null";
818 } else {
819 return std::to_string(offset);
820 }
821 };
822
823 answer.append("\t\"protocol_end\":");
824 answer.append(convert_offset_to_string(components.protocol_end));
825 answer.append(",\n");
826
827 answer.append("\t\"username_end\":");
828 answer.append(convert_offset_to_string(components.username_end));
829 answer.append(",\n");
830
831 answer.append("\t\"host_start\":");
832 answer.append(convert_offset_to_string(components.host_start));
833 answer.append(",\n");
834
835 answer.append("\t\"host_end\":");
836 answer.append(convert_offset_to_string(components.host_end));
837 answer.append(",\n");
838
839 answer.append("\t\"port\":");
840 answer.append(convert_offset_to_string(components.port));
841 answer.append(",\n");
842
843 answer.append("\t\"pathname_start\":");
844 answer.append(convert_offset_to_string(components.pathname_start));
845 answer.append(",\n");
846
847 answer.append("\t\"search_start\":");
848 answer.append(convert_offset_to_string(components.search_start));
849 answer.append(",\n");
850
851 answer.append("\t\"hash_start\":");
852 answer.append(convert_offset_to_string(components.hash_start));
853 answer.append("\n}");
854
855 return answer;
856}
857
858[[nodiscard]] bool url_aggregator::has_valid_domain() const noexcept {
859 if (components.host_start == components.host_end) {
860 return false;
861 }
862 return checkers::verify_dns_length(get_hostname());
863}
864
// Parses `input` as an IPv4 address made of up to four '.'-separated numbers
// and stores the result as this URL's host.
//
// Each number may be hexadecimal ("0x" prefix; a bare "0x" counts as 0), octal
// (a leading '0' followed by a digit) or decimal. A single trailing '.' is
// accepted. Every number followed by a '.' must be <= 255; the last number
// may fill all of the remaining bytes of the 32-bit address.
//
// `input` must be non-empty (input.back() is read unconditionally).
// `in_place` tells the function that the hostname currently in the buffer is
// this very input: if it additionally consists of exactly four decimal
// numbers and has no trailing dot, the buffer is left untouched. In every
// other successful case the hostname is replaced by the output of
// ada::serializers::ipv4().
//
// Returns true on success (host_type is then set to IPV4). On failure
// `is_valid` is set to false and false is returned.
865bool url_aggregator::parse_ipv4(std::string_view input, bool in_place) {
866 ada_log("parse_ipv4 ", input, " [", input.size(),
867 " bytes], overlaps with buffer: ",
868 helpers::overlaps(input, buffer) ? "yes" : "no");
870 const bool trailing_dot = (input.back() == '.');
871 if (trailing_dot) {
872 input.remove_suffix(1);
873 }
874 size_t digit_count{0};
875 int pure_decimal_count = 0; // entries that are decimal
876 uint64_t ipv4{0};
877 // we could unroll for better performance?
878 for (; (digit_count < 4) && !(input.empty()); digit_count++) {
879 uint32_t
880 segment_result{}; // If any number exceeds 32 bits, we have an error.
881 bool is_hex = checkers::has_hex_prefix(input);
// A hex prefix with no digits after it ("0x" alone, or "0x.") means zero.
882 if (is_hex && ((input.length() == 2) ||
883 ((input.length() > 2) && (input[2] == '.')))) {
884 // special case
885 segment_result = 0;
886 input.remove_prefix(2);
887 } else {
888 std::from_chars_result r{};
889 if (is_hex) {
890 ada_log("parse_ipv4 trying to parse hex number");
891 r = std::from_chars(input.data() + 2, input.data() + input.size(),
892 segment_result, 16);
893 } else if ((input.length() >= 2) && input[0] == '0' &&
894 checkers::is_digit(input[1])) {
895 ada_log("parse_ipv4 trying to parse octal number");
896 r = std::from_chars(input.data() + 1, input.data() + input.size(),
897 segment_result, 8);
898 } else {
899 ada_log("parse_ipv4 trying to parse decimal number");
900 pure_decimal_count++;
901 r = std::from_chars(input.data(), input.data() + input.size(),
902 segment_result, 10);
903 }
// Any from_chars error (no digits, or a value that does not fit in 32 bits)
// invalidates the address.
904 if (r.ec != std::errc()) {
905 ada_log("parse_ipv4 parsing failed");
906 return is_valid = false;
907 }
908 ada_log("parse_ipv4 parsed ", segment_result);
// Drop everything that from_chars consumed (including any skipped prefix).
909 input.remove_prefix(r.ptr - input.data());
910 }
911 if (input.empty()) {
912 // We have the last value.
913 // At this stage, ipv4 contains digit_count*8 bits.
914 // So we have 32-digit_count*8 bits left.
915 if (segment_result >= (uint64_t(1) << (32 - digit_count * 8))) {
916 return is_valid = false;
917 }
918 ipv4 <<= (32 - digit_count * 8);
919 ipv4 |= segment_result;
920 goto final;
921 } else {
922 // There is more, so that the value must no be larger than 255
923 // and we must have a '.'.
924 if ((segment_result > 255) || (input[0] != '.')) {
925 return is_valid = false;
926 }
927 ipv4 <<= 8;
928 ipv4 |= segment_result;
929 input.remove_prefix(1); // remove '.'
930 }
931 }
// Reached only when the loop ended without consuming a final number: either
// four numbers were each followed by a '.', or there was nothing to parse.
932 if ((digit_count != 4) || (!input.empty())) {
933 ada_log("parse_ipv4 found invalid (more than 4 numbers or empty) ");
934 return is_valid = false;
935 }
936final:
937 ada_log("url_aggregator::parse_ipv4 completed ", get_href(),
938 " host: ", get_host());
939
940 // We could also check r.ptr to see where the parsing ended.
941 if (in_place && pure_decimal_count == 4 && !trailing_dot) {
942 ada_log(
943 "url_aggregator::parse_ipv4 completed and was already correct in the "
944 "buffer");
945 // The original input was already all decimal and we validated it. So we
946 // don't need to do anything.
947 } else {
948 ada_log("url_aggregator::parse_ipv4 completed and we need to update it");
949 // Optimization opportunity: Get rid of unnecessary string return in ipv4
950 // serializer.
951 // TODO: This is likely a bug because it goes back update_base_hostname, not
952 // what we want to do.
953 update_base_hostname(
954 ada::serializers::ipv4(ipv4)); // We have to reserialize the address.
955 }
956 host_type = IPV4;
958 return true;
959}
960
// Parses `input` as an IPv6 address, following the step-by-step algorithm
// quoted in the comments below (an optional "::" compression and an optional
// trailing dotted-decimal IPv4 part are supported).
//
// On success, the eight 16-bit pieces are serialized with
// ada::serializers::ipv6, the result is handed to update_base_hostname,
// host_type is set to IPV6 and true is returned. On any validation error,
// is_valid is set to false and false is returned.
//
// `input` must not overlap `buffer` (asserted below).
// NOTE(review): `input` is presumably the text found between the enclosing
// '[' and ']' -- confirm against the caller.
961bool url_aggregator::parse_ipv6(std::string_view input) {
962 // TODO: Implement in_place optimization: we know that input points
963 // in the buffer, so we can just check whether the buffer is already
964 // well formatted.
965 // TODO: Find a way to merge parse_ipv6 with url.cpp implementation.
966 ada_log("parse_ipv6 ", input, " [", input.size(), " bytes]");
968 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
 // An empty input is rejected up front; this also makes input[0] below safe.
969 if (input.empty()) {
970 return is_valid = false;
971 }
972 // Let address be a new IPv6 address whose IPv6 pieces are all 0.
973 std::array<uint16_t, 8> address{};
974
975 // Let pieceIndex be 0.
976 int piece_index = 0;
977
978 // Let compress be null.
979 std::optional<int> compress{};
980
981 // Let pointer be a pointer for input.
982 std::string_view::iterator pointer = input.begin();
983
984 // If c is U+003A (:), then:
985 if (input[0] == ':') {
986 // If remaining does not start with U+003A (:), validation error, return
987 // failure.
988 if (input.size() == 1 || input[1] != ':') {
989 ada_log("parse_ipv6 starts with : but the rest does not start with :");
990 return is_valid = false;
991 }
992
993 // Increase pointer by 2.
994 pointer += 2;
995
996 // Increase pieceIndex by 1 and then set compress to pieceIndex.
997 compress = ++piece_index;
998 }
999
1000 // While c is not the EOF code point:
1001 while (pointer != input.end()) {
1002 // If pieceIndex is 8, validation error, return failure.
1003 if (piece_index == 8) {
1004 ada_log("parse_ipv6 piece_index == 8");
1005 return is_valid = false;
1006 }
1007
1008 // If c is U+003A (:), then:
1009 if (*pointer == ':') {
1010 // If compress is non-null, validation error, return failure.
1011 if (compress.has_value()) {
1012 ada_log("parse_ipv6 compress is non-null");
1013 return is_valid = false;
1014 }
1015
1016 // Increase pointer and pieceIndex by 1, set compress to pieceIndex, and
1017 // then continue.
1018 pointer++;
1019 compress = ++piece_index;
1020 continue;
1021 }
1022
1023 // Let value and length be 0.
1024 uint16_t value = 0, length = 0;
1025
1026 // While length is less than 4 and c is an ASCII hex digit,
1027 // set value to value times 0x10 + c interpreted as hexadecimal number, and
1028 // increase pointer and length by 1.
1029 while (length < 4 && pointer != input.end() &&
1030 unicode::is_ascii_hex_digit(*pointer)) {
1031 // https://stackoverflow.com/questions/39060852/why-does-the-addition-of-two-shorts-return-an-int
1032 value = uint16_t(value * 0x10 + unicode::convert_hex_to_binary(*pointer));
1033 pointer++;
1034 length++;
1035 }
1036
1037 // If c is U+002E (.), then:
1038 if (pointer != input.end() && *pointer == '.') {
1039 // If length is 0, validation error, return failure.
1040 if (length == 0) {
1041 ada_log("parse_ipv6 length is 0");
1042 return is_valid = false;
1043 }
1044
1045 // Decrease pointer by length.
 // (The digits just consumed as hexadecimal are re-read as decimal below.)
1046 pointer -= length;
1047
1048 // If pieceIndex is greater than 6, validation error, return failure.
 // (The four IPv4 numbers fill two pieces, so at most index 6 can start.)
1049 if (piece_index > 6) {
1050 ada_log("parse_ipv6 piece_index > 6");
1051 return is_valid = false;
1052 }
1053
1054 // Let numbersSeen be 0.
1055 int numbers_seen = 0;
1056
1057 // While c is not the EOF code point:
1058 while (pointer != input.end()) {
1059 // Let ipv4Piece be null.
1060 std::optional<uint16_t> ipv4_piece{};
1061
1062 // If numbersSeen is greater than 0, then:
1063 if (numbers_seen > 0) {
1064 // If c is a U+002E (.) and numbersSeen is less than 4, then increase
1065 // pointer by 1.
1066 if (*pointer == '.' && numbers_seen < 4) {
1067 pointer++;
1068 } else {
1069 // Otherwise, validation error, return failure.
1070 ada_log("parse_ipv6 Otherwise, validation error, return failure");
1071 return is_valid = false;
1072 }
1073 }
1074
1075 // If c is not an ASCII digit, validation error, return failure.
1076 if (pointer == input.end() || !checkers::is_digit(*pointer)) {
1077 ada_log(
1078 "parse_ipv6 If c is not an ASCII digit, validation error, return "
1079 "failure");
1080 return is_valid = false;
1081 }
1082
1083 // While c is an ASCII digit:
1084 while (pointer != input.end() && checkers::is_digit(*pointer)) {
1085 // Let number be c interpreted as decimal number.
1086 int number = *pointer - '0';
1087
1088 // If ipv4Piece is null, then set ipv4Piece to number.
1089 if (!ipv4_piece.has_value()) {
1090 ipv4_piece = number;
1091 }
1092 // Otherwise, if ipv4Piece is 0, validation error, return failure.
 // (This rejects numbers written with a leading zero.)
1093 else if (ipv4_piece == 0) {
1094 ada_log("parse_ipv6 if ipv4Piece is 0, validation error");
1095 return is_valid = false;
1096 }
1097 // Otherwise, set ipv4Piece to ipv4Piece times 10 + number.
1098 else {
1099 ipv4_piece = *ipv4_piece * 10 + number;
1100 }
1101
1102 // If ipv4Piece is greater than 255, validation error, return failure.
1103 if (ipv4_piece > 255) {
1104 ada_log("parse_ipv6 ipv4_piece > 255");
1105 return is_valid = false;
1106 }
1107
1108 // Increase pointer by 1.
1109 pointer++;
1110 }
1111
1112 // Set address[pieceIndex] to address[pieceIndex] times 0x100 +
1113 // ipv4Piece.
1114 // https://stackoverflow.com/questions/39060852/why-does-the-addition-of-two-shorts-return-an-int
1115 address[piece_index] =
1116 uint16_t(address[piece_index] * 0x100 + *ipv4_piece);
1117
1118 // Increase numbersSeen by 1.
1119 numbers_seen++;
1120
1121 // If numbersSeen is 2 or 4, then increase pieceIndex by 1.
1122 if (numbers_seen == 2 || numbers_seen == 4) {
1123 piece_index++;
1124 }
1125 }
1126
1127 // If numbersSeen is not 4, validation error, return failure.
1128 if (numbers_seen != 4) {
1129 return is_valid = false;
1130 }
1131
1132 // Break.
1133 break;
1134 }
1135 // Otherwise, if c is U+003A (:):
1136 else if ((pointer != input.end()) && (*pointer == ':')) {
1137 // Increase pointer by 1.
1138 pointer++;
1139
1140 // If c is the EOF code point, validation error, return failure.
1141 if (pointer == input.end()) {
1142 ada_log(
1143 "parse_ipv6 If c is the EOF code point, validation error, return "
1144 "failure");
1145 return is_valid = false;
1146 }
1147 }
1148 // Otherwise, if c is not the EOF code point, validation error, return
1149 // failure.
1150 else if (pointer != input.end()) {
1151 ada_log(
1152 "parse_ipv6 Otherwise, if c is not the EOF code point, validation "
1153 "error, return failure");
1154 return is_valid = false;
1155 }
1156
1157 // Set address[pieceIndex] to value.
1158 address[piece_index] = value;
1159
1160 // Increase pieceIndex by 1.
1161 piece_index++;
1162 }
1163
 // `compress` holds the piece index at which "::" was seen. The pieces that
 // were parsed after it are moved to the end of the address; the slots they
 // vacate receive the zero-initialized pieces swapped in from the end.
1164 // If compress is non-null, then:
1165 if (compress.has_value()) {
1166 // Let swaps be pieceIndex - compress.
1167 int swaps = piece_index - *compress;
1168
1169 // Set pieceIndex to 7.
1170 piece_index = 7;
1171
1172 // While pieceIndex is not 0 and swaps is greater than 0,
1173 // swap address[pieceIndex] with address[compress + swaps - 1], and then
1174 // decrease both pieceIndex and swaps by 1.
1175 while (piece_index != 0 && swaps > 0) {
1176 std::swap(address[piece_index], address[*compress + swaps - 1]);
1177 piece_index--;
1178 swaps--;
1179 }
1180 }
1181 // Otherwise, if compress is null and pieceIndex is not 8, validation error,
1182 // return failure.
1183 else if (piece_index != 8) {
1184 ada_log(
1185 "parse_ipv6 if compress is null and pieceIndex is not 8, validation "
1186 "error, return failure");
1187 return is_valid = false;
1188 }
1189 // TODO: Optimization opportunity: Get rid of unnecessary string creation.
1190 // TODO: This is likely a bug because it goes back update_base_hostname, not
1191 // what we want to do.
1192 update_base_hostname(ada::serializers::ipv6(address));
1193 ada_log("parse_ipv6 ", get_hostname());
1195 host_type = IPV6;
1196 return true;
1197}
1198
// Validates `input` as an opaque host and stores it as the hostname.
//
// If `input` contains any forbidden host code point, is_valid is set to false
// and false is returned. Otherwise the host is passed to
// update_base_hostname, either verbatim or percent-encoded, and true is
// returned.
//
// `input` must not overlap `buffer` (asserted below).
1199bool url_aggregator::parse_opaque_host(std::string_view input) {
1200 ada_log("parse_opaque_host ", input, " [", input.size(), " bytes]");
1202 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
1203 if (std::any_of(input.begin(), input.end(),
1204 ada::unicode::is_forbidden_host_code_point)) {
1205 return is_valid = false;
1206 }
1207
1208 // Return the result of running UTF-8 percent-encode on input using the C0
1209 // control percent-encode set.
 // NOTE(review): the definition of `idx` is not part of the lines shown
 // here; it presumably holds the index of the first character that needs
 // percent-encoding, with input.size() meaning "none" -- confirm.
1212 if (idx == input.size()) {
 // Nothing to encode: the input can be stored as is.
1213 update_base_hostname(input);
1214 } else {
1215 // We only create a temporary string if we need to.
1216 update_base_hostname(ada::unicode::percent_encode(
1218 }
1220 return true;
1221}
1222
1223[[nodiscard]] std::string url_aggregator::to_diagram() const {
1224 if (!is_valid) {
1225 return "invalid";
1226 }
1227 std::string answer;
1228 answer.append(buffer);
1229 answer.append(" [");
1230 answer.append(std::to_string(buffer.size()));
1231 answer.append(" bytes]");
1232 answer.append("\n");
1233 // first line
1234 std::string line1;
1235 line1.resize(buffer.size(), ' ');
1236 if (components.hash_start != url_components::omitted) {
1237 line1[components.hash_start] = '|';
1238 }
1239 if (components.search_start != url_components::omitted) {
1240 line1[components.search_start] = '|';
1241 }
1242 if (components.pathname_start != buffer.size()) {
1243 line1[components.pathname_start] = '|';
1244 }
1245 if (components.host_end != buffer.size()) {
1246 line1[components.host_end] = '|';
1247 }
1248 if (components.host_start != buffer.size()) {
1249 line1[components.host_start] = '|';
1250 }
1251 if (components.username_end != buffer.size()) {
1252 line1[components.username_end] = '|';
1253 }
1254 if (components.protocol_end != buffer.size()) {
1255 line1[components.protocol_end] = '|';
1256 }
1257 answer.append(line1);
1258 answer.append("\n");
1259
1260 std::string line2 = line1;
1261 if (components.hash_start != url_components::omitted) {
1262 line2[components.hash_start] = '`';
1263 line1[components.hash_start] = ' ';
1264
1265 for (size_t i = components.hash_start + 1; i < line2.size(); i++) {
1266 line2[i] = '-';
1267 }
1268 line2.append(" hash_start");
1269 answer.append(line2);
1270 answer.append("\n");
1271 }
1272
1273 std::string line3 = line1;
1274 if (components.search_start != url_components::omitted) {
1275 line3[components.search_start] = '`';
1276 line1[components.search_start] = ' ';
1277
1278 for (size_t i = components.search_start + 1; i < line3.size(); i++) {
1279 line3[i] = '-';
1280 }
1281 line3.append(" search_start ");
1282 line3.append(std::to_string(components.search_start));
1283 answer.append(line3);
1284 answer.append("\n");
1285 }
1286
1287 std::string line4 = line1;
1288 if (components.pathname_start != buffer.size()) {
1289 line4[components.pathname_start] = '`';
1290 line1[components.pathname_start] = ' ';
1291 for (size_t i = components.pathname_start + 1; i < line4.size(); i++) {
1292 line4[i] = '-';
1293 }
1294 line4.append(" pathname_start ");
1295 line4.append(std::to_string(components.pathname_start));
1296 answer.append(line4);
1297 answer.append("\n");
1298 }
1299
1300 std::string line5 = line1;
1301 if (components.host_end != buffer.size()) {
1302 line5[components.host_end] = '`';
1303 line1[components.host_end] = ' ';
1304
1305 for (size_t i = components.host_end + 1; i < line5.size(); i++) {
1306 line5[i] = '-';
1307 }
1308 line5.append(" host_end ");
1309 line5.append(std::to_string(components.host_end));
1310 answer.append(line5);
1311 answer.append("\n");
1312 }
1313
1314 std::string line6 = line1;
1315 if (components.host_start != buffer.size()) {
1316 line6[components.host_start] = '`';
1317 line1[components.host_start] = ' ';
1318
1319 for (size_t i = components.host_start + 1; i < line6.size(); i++) {
1320 line6[i] = '-';
1321 }
1322 line6.append(" host_start ");
1323 line6.append(std::to_string(components.host_start));
1324 answer.append(line6);
1325 answer.append("\n");
1326 }
1327
1328 std::string line7 = line1;
1329 if (components.username_end != buffer.size()) {
1330 line7[components.username_end] = '`';
1331 line1[components.username_end] = ' ';
1332
1333 for (size_t i = components.username_end + 1; i < line7.size(); i++) {
1334 line7[i] = '-';
1335 }
1336 line7.append(" username_end ");
1337 line7.append(std::to_string(components.username_end));
1338 answer.append(line7);
1339 answer.append("\n");
1340 }
1341
1342 std::string line8 = line1;
1343 if (components.protocol_end != buffer.size()) {
1344 line8[components.protocol_end] = '`';
1345 line1[components.protocol_end] = ' ';
1346
1347 for (size_t i = components.protocol_end + 1; i < line8.size(); i++) {
1348 line8[i] = '-';
1349 }
1350 line8.append(" protocol_end ");
1351 line8.append(std::to_string(components.protocol_end));
1352 answer.append(line8);
1353 answer.append("\n");
1354 }
1355
1356 if (components.hash_start == url_components::omitted) {
1357 answer.append("note: hash omitted\n");
1358 }
1359 if (components.search_start == url_components::omitted) {
1360 answer.append("note: search omitted\n");
1361 }
1362 if (components.protocol_end > buffer.size()) {
1363 answer.append("warning: protocol_end overflows\n");
1364 }
1365 if (components.username_end > buffer.size()) {
1366 answer.append("warning: username_end overflows\n");
1367 }
1368 if (components.host_start > buffer.size()) {
1369 answer.append("warning: host_start overflows\n");
1370 }
1371 if (components.host_end > buffer.size()) {
1372 answer.append("warning: host_end overflows\n");
1373 }
1374 if (components.pathname_start > buffer.size()) {
1375 answer.append("warning: pathname_start overflows\n");
1376 }
1377 return answer;
1378}
1379
1380[[nodiscard]] bool url_aggregator::validate() const noexcept {
1381 if (!is_valid) {
1382 return true;
1383 }
1384 if (!components.check_offset_consistency()) {
1385 ada_log("url_aggregator::validate inconsistent components \n",
1386 to_diagram());
1387 return false;
1388 }
1389 // We have a credible components struct, but let us investivate more
1390 // carefully:
1403 if (components.protocol_end == url_components::omitted) {
1404 ada_log("url_aggregator::validate omitted protocol_end \n", to_diagram());
1405 return false;
1406 }
1407 if (components.username_end == url_components::omitted) {
1408 ada_log("url_aggregator::validate omitted username_end \n", to_diagram());
1409 return false;
1410 }
1411 if (components.host_start == url_components::omitted) {
1412 ada_log("url_aggregator::validate omitted host_start \n", to_diagram());
1413 return false;
1414 }
1415 if (components.host_end == url_components::omitted) {
1416 ada_log("url_aggregator::validate omitted host_end \n", to_diagram());
1417 return false;
1418 }
1419 if (components.pathname_start == url_components::omitted) {
1420 ada_log("url_aggregator::validate omitted pathname_start \n", to_diagram());
1421 return false;
1422 }
1423
1424 if (components.protocol_end > buffer.size()) {
1425 ada_log("url_aggregator::validate protocol_end overflow \n", to_diagram());
1426 return false;
1427 }
1428 if (components.username_end > buffer.size()) {
1429 ada_log("url_aggregator::validate username_end overflow \n", to_diagram());
1430 return false;
1431 }
1432 if (components.host_start > buffer.size()) {
1433 ada_log("url_aggregator::validate host_start overflow \n", to_diagram());
1434 return false;
1435 }
1436 if (components.host_end > buffer.size()) {
1437 ada_log("url_aggregator::validate host_end overflow \n", to_diagram());
1438 return false;
1439 }
1440 if (components.pathname_start > buffer.size()) {
1441 ada_log("url_aggregator::validate pathname_start overflow \n",
1442 to_diagram());
1443 return false;
1444 }
1445
1446 if (components.protocol_end > 0) {
1447 if (buffer[components.protocol_end - 1] != ':') {
1448 ada_log(
1449 "url_aggregator::validate missing : at the end of the protocol \n",
1450 to_diagram());
1451 return false;
1452 }
1453 }
1454
1455 if (components.username_end != buffer.size() &&
1456 components.username_end > components.protocol_end + 2) {
1457 if (buffer[components.username_end] != ':' &&
1458 buffer[components.username_end] != '@') {
1459 ada_log(
1460 "url_aggregator::validate missing : or @ at the end of the username "
1461 "\n",
1462 to_diagram());
1463 return false;
1464 }
1465 }
1466
1467 if (components.host_start != buffer.size()) {
1468 if (components.host_start > components.username_end) {
1469 if (buffer[components.host_start] != '@') {
1470 ada_log(
1471 "url_aggregator::validate missing @ at the end of the password \n",
1472 to_diagram());
1473 return false;
1474 }
1475 } else if (components.host_start == components.username_end &&
1476 components.host_end > components.host_start) {
1477 if (components.host_start == components.protocol_end + 2) {
1478 if (buffer[components.protocol_end] != '/' ||
1479 buffer[components.protocol_end + 1] != '/') {
1480 ada_log(
1481 "url_aggregator::validate missing // between protocol and host "
1482 "\n",
1483 to_diagram());
1484 return false;
1485 }
1486 } else {
1487 if (components.host_start > components.protocol_end &&
1488 buffer[components.host_start] != '@') {
1489 ada_log(
1490 "url_aggregator::validate missing @ at the end of the username "
1491 "\n",
1492 to_diagram());
1493 return false;
1494 }
1495 }
1496 } else {
1497 if (components.host_end != components.host_start) {
1498 ada_log("url_aggregator::validate expected omitted host \n",
1499 to_diagram());
1500 return false;
1501 }
1502 }
1503 }
1504 if (components.host_end != buffer.size() &&
1505 components.pathname_start > components.host_end) {
1506 if (components.pathname_start == components.host_end + 2 &&
1507 buffer[components.host_end] == '/' &&
1508 buffer[components.host_end + 1] == '.') {
1509 if (components.pathname_start + 1 >= buffer.size() ||
1510 buffer[components.pathname_start] != '/' ||
1511 buffer[components.pathname_start + 1] != '/') {
1512 ada_log(
1513 "url_aggregator::validate expected the path to begin with // \n",
1514 to_diagram());
1515 return false;
1516 }
1517 } else if (buffer[components.host_end] != ':') {
1518 ada_log("url_aggregator::validate missing : at the port \n",
1519 to_diagram());
1520 return false;
1521 }
1522 }
1523 if (components.pathname_start != buffer.size() &&
1524 components.pathname_start < components.search_start &&
1525 components.pathname_start < components.hash_start && !has_opaque_path) {
1526 if (buffer[components.pathname_start] != '/') {
1527 ada_log("url_aggregator::validate missing / at the path \n",
1528 to_diagram());
1529 return false;
1530 }
1531 }
1532 if (components.search_start != url_components::omitted) {
1533 if (buffer[components.search_start] != '?') {
1534 ada_log("url_aggregator::validate missing ? at the search \n",
1535 to_diagram());
1536 return false;
1537 }
1538 }
1539 if (components.hash_start != url_components::omitted) {
1540 if (buffer[components.hash_start] != '#') {
1541 ada_log("url_aggregator::validate missing # at the hash \n",
1542 to_diagram());
1543 return false;
1544 }
1545 }
1546
1547 return true;
1548}
1549
1550void url_aggregator::delete_dash_dot() {
1551 ada_log("url_aggregator::delete_dash_dot");
1553 ADA_ASSERT_TRUE(has_dash_dot());
1554 buffer.erase(components.host_end, 2);
1555 components.pathname_start -= 2;
1556 if (components.search_start != url_components::omitted) {
1557 components.search_start -= 2;
1558 }
1559 if (components.hash_start != url_components::omitted) {
1560 components.hash_start -= 2;
1561 }
1563 ADA_ASSERT_TRUE(!has_dash_dot());
1564}
1565
// Appends the path `input` to the URL, normalizing it on the way.
//
// Three strategies are tried, from the cheapest to the most general:
//  1. trivial: nothing in `input` needs rewriting and is_at_path() holds, so
//     '/' followed by `input` is appended to `buffer` directly;
//  2. fast: special, non-file scheme and no character needing
//     percent-encoding, no backslash and no '%': only "." and ".." segments
//     have to be resolved;
//  3. slow: the general case, handling percent-encoding, backslash
//     separators (special schemes only), dot segments and Windows drive
//     letters in file URLs.
// In cases 2 and 3 the new path starts from the current get_pathname() and is
// stored with update_base_pathname.
1566inline void url_aggregator::consume_prepared_path(std::string_view input) {
1567 ada_log("url_aggregator::consume_prepared_path ", input);
1568
 // `accumulator` is a bit set computed by checkers::path_signature; the
 // constants below name the bits that are tested in this function.
1577 uint8_t accumulator = checkers::path_signature(input);
1578 // Let us first detect a trivial case.
1579 // If it is special, we check that we have no dot, no %, no \ and no
1580 // character needing percent encoding. Otherwise, we check that we have no %,
1581 // no dot, and no character needing percent encoding.
1582 constexpr uint8_t need_encoding = 1;
1583 constexpr uint8_t backslash_char = 2;
1584 constexpr uint8_t dot_char = 4;
1585 constexpr uint8_t percent_char = 8;
1586 bool special = type != ada::scheme::NOT_SPECIAL;
 // NOTE(review): the rest of this initializer is not part of the lines
 // shown here -- confirm the second operand against the full source.
1587 bool may_need_slow_file_handling = (type == ada::scheme::type::FILE &&
1589 bool trivial_path =
1590 (special ? (accumulator == 0)
1591 : ((accumulator & (need_encoding | dot_char | percent_char)) ==
1592 0)) &&
1593 (!may_need_slow_file_handling);
1594 if (accumulator == dot_char && !may_need_slow_file_handling) {
1595 // '4' means that we have at least one dot, but nothing that requires
1596 // percent encoding or decoding. The only part that is not trivial is
1597 // that we may have single dots and double dots path segments.
1598 // If we have such segments, then we either have a path that begins
1599 // with '.' (easy to check), or we have the sequence './'.
1600 // Note: input cannot be empty, it must at least contain one character ('.')
1601 // Note: we know that '\' is not present.
1602 if (input[0] != '.') {
1603 size_t slashdot = input.find("/.");
1604 if (slashdot == std::string_view::npos) { // common case
1605 trivial_path = true;
1606 } else { // uncommon
1607 // only three cases matter: /./, /.. or a final /
 // Only the first "/." occurrence is inspected; if it could start a dot
 // segment, the path is conservatively treated as non-trivial.
1608 trivial_path =
1609 !(slashdot + 2 == input.size() || input[slashdot + 2] == '.' ||
1610 input[slashdot + 2] == '/');
1611 }
1612 }
1613 }
1614 if (trivial_path && is_at_path()) {
1615 ada_log("parse_path trivial");
1616 buffer += '/';
1617 buffer += input;
1618 return;
1619 }
 // From here on the path is rebuilt in a temporary string, starting from the
 // current pathname.
1620 std::string path = std::string(get_pathname());
1621 // We are going to need to look a bit at the path, but let us see if we can
1622 // ignore percent encoding *and* backslashes *and* percent characters.
1623 // Except for the trivial case, this is likely to capture 99% of paths out
1624 // there.
1625 bool fast_path =
1626 (special &&
1627 (accumulator & (need_encoding | backslash_char | percent_char)) == 0) &&
1628 (type != ada::scheme::type::FILE);
1629 if (fast_path) {
1630 ada_log("parse_prepared_path fast");
1631 // Here we don't need to worry about \ or percent encoding.
1632 // We also do not have a file protocol. We might have dots, however,
1633 // but dots must as appear as '.', and they cannot be encoded because
1634 // the symbol '%' is not present.
1635 size_t previous_location = 0; // We start at 0.
 // Each iteration consumes one '/'-delimited segment of `input`; the loop
 // is left through the `return` statements of the final-segment branch.
1636 do {
1637 size_t new_location = input.find('/', previous_location);
1638 // std::string_view path_view = input;
1639 // We process the last segment separately:
1640 if (new_location == std::string_view::npos) {
1641 std::string_view path_view = input.substr(previous_location);
1642 if (path_view == "..") { // The path ends with ..
1643 // e.g., if you receive ".." with an empty path, you go to "/".
1644 if (path.empty()) {
1645 path = '/';
1646 update_base_pathname(path);
1647 return;
1648 }
1649 // Fast case where we have nothing to do:
1650 if (path.back() == '/') {
1651 update_base_pathname(path);
1652 return;
1653 }
1654 // If you have the path "/joe/myfriend",
1655 // then you delete 'myfriend'.
1656 path.resize(path.rfind('/') + 1);
1657 update_base_pathname(path);
1658 return;
1659 }
 // A final "." segment only contributes the trailing '/'.
1660 path += '/';
1661 if (path_view != ".") {
1662 path.append(path_view);
1663 }
1664 update_base_pathname(path);
1665 return;
1666 } else {
1667 // This is a non-final segment.
1668 std::string_view path_view =
1669 input.substr(previous_location, new_location - previous_location);
1670 previous_location = new_location + 1;
1671 if (path_view == "..") {
 // Drop the last segment of `path`, if there is one.
1672 size_t last_delimiter = path.rfind('/');
1673 if (last_delimiter != std::string::npos) {
1674 path.erase(last_delimiter);
1675 }
1676 } else if (path_view != ".") {
1677 path += '/';
1678 path.append(path_view);
1679 }
1680 }
1681 } while (true);
1682 } else {
1683 ada_log("parse_path slow");
1684 // we have reached the general case
 // Bit 1 is need_encoding.
1685 bool needs_percent_encoding = (accumulator & 1);
1686 std::string path_buffer_tmp;
 // Each iteration consumes one segment from the front of `input`; the loop
 // ends once the segment without a trailing separator has been processed.
1687 do {
 // Bit 2 is backslash_char: for special schemes '\' also separates
 // segments.
1688 size_t location = (special && (accumulator & 2))
1689 ? input.find_first_of("/\\")
1690 : input.find('/');
1691 std::string_view path_view = input;
1692 if (location != std::string_view::npos) {
1693 path_view.remove_suffix(path_view.size() - location);
1694 input.remove_prefix(location + 1);
1695 }
1696 // path_buffer is either path_view or it might point at a percent encoded
1697 // temporary string.
1698 std::string_view path_buffer =
1699 (needs_percent_encoding &&
1700 ada::unicode::percent_encode<false>(
1701 path_view, character_sets::PATH_PERCENT_ENCODE, path_buffer_tmp))
1702 ? path_buffer_tmp
1703 : path_view;
1704 if (unicode::is_double_dot_path_segment(path_buffer)) {
 // shorten_path is always called for a double-dot segment; a trailing
 // '/' is added only when this is the last segment.
1705 if ((helpers::shorten_path(path, type) || special) &&
1706 location == std::string_view::npos) {
1707 path += '/';
1708 }
1709 } else if (unicode::is_single_dot_path_segment(path_buffer) &&
1710 (location == std::string_view::npos)) {
1711 path += '/';
1712 }
1713 // Otherwise, if path_buffer is not a single-dot path segment, then:
1714 else if (!unicode::is_single_dot_path_segment(path_buffer)) {
1715 // If url's scheme is "file", url's path is empty, and path_buffer is a
1716 // Windows drive letter, then replace the second code point in
1717 // path_buffer with U+003A (:).
1718 if (type == ada::scheme::type::FILE && path.empty() &&
1719 checkers::is_windows_drive_letter(path_buffer)) {
1720 path += '/';
1721 path += path_buffer[0];
1722 path += ':';
1723 path_buffer.remove_prefix(2);
1724 path.append(path_buffer);
1725 } else {
1726 // Append path_buffer to url's path.
1727 path += '/';
1728 path.append(path_buffer);
1729 }
1730 }
1731 if (location == std::string_view::npos) {
1732 update_base_pathname(path);
1733 return;
1734 }
1735 } while (true);
1736 }
1737}
1738} // namespace ada
Includes all definitions for Ada.
Definitions for URL specific checkers used within Ada.
#define ADA_ASSERT_TRUE(COND)
#define ada_lifetime_bound
#define ada_really_inline
Definition common_defs.h:84
Definitions for helper functions used within Ada.
Definitions for user-facing functions for parsing a URL and its components.
constexpr uint8_t QUERY_PERCENT_ENCODE[32]
constexpr uint8_t SPECIAL_QUERY_PERCENT_ENCODE[32]
constexpr uint8_t PATH_PERCENT_ENCODE[32]
constexpr uint8_t C0_CONTROL_PERCENT_ENCODE[32]
constexpr uint8_t USERINFO_PERCENT_ENCODE[32]
constexpr bool has_hex_prefix(std::string_view input)
constexpr bool is_windows_drive_letter(std::string_view input) noexcept
constexpr bool is_alpha(char x) noexcept
constexpr bool is_digit(char x) noexcept
constexpr ada::scheme::type get_scheme_type(std::string_view scheme) noexcept
Definition scheme-inl.h:72
@ NOT_SPECIAL
Definition scheme.h:32
constexpr uint16_t get_special_port(std::string_view scheme) noexcept
Definition scheme-inl.h:57
std::string ipv6(const std::array< uint16_t, 8 > &address) noexcept
std::string ipv4(uint64_t address) noexcept
ada_really_inline size_t percent_encode_index(const std::string_view input, const uint8_t character_set[])
Definition unicode-inl.h:19
Definition ada_idna.h:13
@ IPV6
Definition url_base.h:32
@ IPV4
Definition url_base.h:27
template ada::result< url_aggregator > parse< url_aggregator >(std::string_view input, const url_aggregator *base_url)
tl::expected< result_type, ada::errors > result
ada_warn_unused ada::result< result_type > parse(std::string_view input, const result_type *base_url=nullptr)
Declarations for the URL scheme.
bool has_non_empty_username() const noexcept
void set_hash(std::string_view input)
void clear_search() override
bool has_hostname() const noexcept
std::string_view get_hostname() const noexcept ada_lifetime_bound
bool has_non_empty_password() const noexcept
ada_really_inline bool has_credentials() const noexcept
std::string to_string() const override
std::string_view get_pathname() const noexcept ada_lifetime_bound
std::string_view get_hash() const noexcept ada_lifetime_bound
std::string to_diagram() const
bool set_protocol(std::string_view input)
std::string get_origin() const noexcept override
bool validate() const noexcept
std::string_view get_search() const noexcept ada_lifetime_bound
bool has_valid_domain() const noexcept override
bool set_hostname(std::string_view input)
bool set_password(std::string_view input)
bool set_pathname(std::string_view input)
std::string_view get_protocol() const noexcept ada_lifetime_bound
std::string_view get_password() const noexcept ada_lifetime_bound
bool set_href(std::string_view input)
void set_search(std::string_view input)
std::string_view get_port() const noexcept ada_lifetime_bound
bool has_port() const noexcept
std::string_view get_href() const noexcept ada_lifetime_bound
bool set_host(std::string_view input)
std::string_view get_host() const noexcept ada_lifetime_bound
bool set_port(std::string_view input)
std::string_view get_username() const noexcept ada_lifetime_bound
bool set_username(std::string_view input)
ada_really_inline bool is_special() const noexcept
url_host_type host_type
Definition url_base.h:60
bool is_valid
Definition url_base.h:50
bool has_opaque_path
Definition url_base.h:55
bool check_offset_consistency() const noexcept
static constexpr uint32_t omitted
Definitions for unicode operations.
Inline functions for url aggregator.
Declaration for the basic URL definitions.
Declaration for the URL Components.