Ada 2.9.0
Fast spec-compliant URL parser
Loading...
Searching...
No Matches
url_aggregator.cpp
Go to the documentation of this file.
1#include "ada.h"
2#include "ada/checkers-inl.h"
3#include "ada/helpers.h"
5#include "ada/scheme.h"
6#include "ada/unicode-inl.h"
10
11#include <string>
12#include <string_view>
13
14namespace ada {
// Applies a scheme, supplied together with one trailing character, to this
// URL. The trailing character is dropped before the scheme is classified
// (presumably it is the ':' the function name refers to).
//
// Template parameter has_state_override: when true, the early-return checks
// and the default-port clean-up guarded by it below are executed.
//
// Two paths exist: a fast path when get_scheme_type() classifies the input as
// special, and a slow path that lower-cases a copy of the input first.
// Every return statement visible here returns true.
//
// NOTE(review): the embedded listing numbers skip 19, 25-28 and 108, so a few
// original lines appear to be missing from this excerpt — confirm against the
// upstream file before relying on this text.
15template <bool has_state_override>
16[[nodiscard]] ada_really_inline bool url_aggregator::parse_scheme_with_colon(
17 const std::string_view input_with_colon) {
18 ada_log("url_aggregator::parse_scheme_with_colon ", input_with_colon);
20 ADA_ASSERT_TRUE(!helpers::overlaps(input_with_colon, buffer));
21 std::string_view input{input_with_colon};
22 input.remove_suffix(1);
23 auto parsed_type = ada::scheme::get_scheme_type(input);
24 bool is_input_special = (parsed_type != ada::scheme::NOT_SPECIAL);
29 if (is_input_special) { // fast path!!!
30 if (has_state_override) {
31 // If url's scheme is not a special scheme and buffer is a special scheme,
32 // then return.
33 if (is_special() != is_input_special) {
34 return true;
35 }
36
37 // If url includes credentials or has a non-null port, and buffer is
38 // "file", then return.
39 if ((has_credentials() || components.port != url_components::omitted) &&
40 parsed_type == ada::scheme::type::FILE) {
41 return true;
42 }
43
44 // If url's scheme is "file" and its host is an empty host, then return.
45 // An empty host is the empty string.
46 if (type == ada::scheme::type::FILE &&
47 components.host_start == components.host_end) {
48 return true;
49 }
50 }
51
52 type = parsed_type;
53 set_scheme_from_view_with_colon(input_with_colon);
54
55 if (has_state_override) {
56 // This is uncommon.
57 uint16_t urls_scheme_port = get_special_port();
58
59 // If url's port is url's scheme's default port, then set url's port to
60 // null.
61 if (components.port == urls_scheme_port) {
62 clear_port();
63 }
64 }
65 } else { // slow path
66 std::string _buffer(input);
67 // Next function is only valid if the input is ASCII and returns false
68 // otherwise, but it seems that we always have ascii content so we do not
69 // need to check the return value.
70 unicode::to_lower_ascii(_buffer.data(), _buffer.size());
71
72 if (has_state_override) {
73 // If url's scheme is a special scheme and buffer is not a special scheme,
74 // then return. If url's scheme is not a special scheme and buffer is a
75 // special scheme, then return.
76 if (is_special() != ada::scheme::is_special(_buffer)) {
77 return true;
78 }
79
80 // If url includes credentials or has a non-null port, and buffer is
81 // "file", then return.
82 if ((has_credentials() || components.port != url_components::omitted) &&
83 _buffer == "file") {
84 return true;
85 }
86
87 // If url's scheme is "file" and its host is an empty host, then return.
88 // An empty host is the empty string.
89 if (type == ada::scheme::type::FILE &&
90 components.host_start == components.host_end) {
91 return true;
92 }
93 }
94
95 set_scheme(_buffer);
96
97 if (has_state_override) {
98 // This is uncommon.
99 uint16_t urls_scheme_port = get_special_port();
100
101 // If url's port is url's scheme's default port, then set url's port to
102 // null.
103 if (components.port == urls_scheme_port) {
104 clear_port();
105 }
106 }
107 }
109 return true;
110}
111
112inline void url_aggregator::copy_scheme(const url_aggregator& u) noexcept {
113 ada_log("url_aggregator::copy_scheme ", u.buffer);
114 ADA_ASSERT_TRUE(validate());
115 // next line could overflow but unsigned arithmetic has well-defined
116 // overflows.
117 uint32_t new_difference = u.components.protocol_end - components.protocol_end;
118 type = u.type;
119 buffer.erase(0, components.protocol_end);
120 buffer.insert(0, u.get_protocol());
121 components.protocol_end = u.components.protocol_end;
122
123 // No need to update the components
124 if (new_difference == 0) {
125 return;
126 }
127
128 // Update the rest of the components.
129 components.username_end += new_difference;
130 components.host_start += new_difference;
131 components.host_end += new_difference;
132 components.pathname_start += new_difference;
133 if (components.search_start != url_components::omitted) {
134 components.search_start += new_difference;
135 }
136 if (components.hash_start != url_components::omitted) {
137 components.hash_start += new_difference;
138 }
139 ADA_ASSERT_TRUE(validate());
140}
141
142inline void url_aggregator::set_scheme_from_view_with_colon(
143 std::string_view new_scheme_with_colon) noexcept {
144 ada_log("url_aggregator::set_scheme_from_view_with_colon ",
145 new_scheme_with_colon);
146 ADA_ASSERT_TRUE(validate());
147 ADA_ASSERT_TRUE(!new_scheme_with_colon.empty() &&
148 new_scheme_with_colon.back() == ':');
149 // next line could overflow but unsigned arithmetic has well-defined
150 // overflows.
151 uint32_t new_difference =
152 uint32_t(new_scheme_with_colon.size()) - components.protocol_end;
153
154 if (buffer.empty()) {
155 buffer.append(new_scheme_with_colon);
156 } else {
157 buffer.erase(0, components.protocol_end);
158 buffer.insert(0, new_scheme_with_colon);
159 }
160 components.protocol_end += new_difference;
161
162 // Update the rest of the components.
163 components.username_end += new_difference;
164 components.host_start += new_difference;
165 components.host_end += new_difference;
166 components.pathname_start += new_difference;
167 if (components.search_start != url_components::omitted) {
168 components.search_start += new_difference;
169 }
170 if (components.hash_start != url_components::omitted) {
171 components.hash_start += new_difference;
172 }
173 ADA_ASSERT_TRUE(validate());
174}
175
// Replaces the scheme at the front of the buffer with `new_scheme` followed
// by ':' and shifts the stored component offsets by the change in length.
//
// new_scheme: scheme text without a trailing ':' (asserted below).
//
// NOTE(review): listing line 185 is missing from this excerpt, so a statement
// between the offset computation and the buffer update may have been dropped
// — confirm against the upstream file.
176inline void url_aggregator::set_scheme(std::string_view new_scheme) noexcept {
177 ada_log("url_aggregator::set_scheme ", new_scheme);
178 ADA_ASSERT_TRUE(validate());
179 ADA_ASSERT_TRUE(new_scheme.empty() || new_scheme.back() != ':');
180 // next line could overflow but unsigned arithmetic has well-defined
181 // overflows.
// The "+ 1" accounts for the ':' appended to the scheme below.
182 uint32_t new_difference =
183 uint32_t(new_scheme.size()) - components.protocol_end + 1;
184
186 if (buffer.empty()) {
187 buffer.append(helpers::concat(new_scheme, ":"));
188 } else {
189 buffer.erase(0, components.protocol_end);
190 buffer.insert(0, helpers::concat(new_scheme, ":"));
191 }
192 components.protocol_end = uint32_t(new_scheme.size() + 1);
193
194 // Update the rest of the components.
195 components.username_end += new_difference;
196 components.host_start += new_difference;
197 components.host_end += new_difference;
198 components.pathname_start += new_difference;
199 if (components.search_start != url_components::omitted) {
200 components.search_start += new_difference;
201 }
202 if (components.hash_start != url_components::omitted) {
203 components.hash_start += new_difference;
204 }
205 ADA_ASSERT_TRUE(validate());
206}
207
// Setter for the protocol (scheme).
//
// Works on a copy of `input` with ASCII tabs/newlines removed. Returns:
//  - true when the cleaned input is empty (nothing is modified here);
//  - false when the first character is not alphabetic;
//  - the result of parse_scheme_with_colon<true>() when the first character
//    rejected by unicode::is_alnum_plus is ':';
//  - false otherwise.
//
// NOTE(review): listing line 210 is missing from this excerpt.
208bool url_aggregator::set_protocol(const std::string_view input) {
209 ada_log("url_aggregator::set_protocol ", input);
211 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
212 std::string view(input);
213 helpers::remove_ascii_tab_or_newline(view);
214 if (view.empty()) {
215 return true;
216 }
217
218 // Schemes should start with alpha values.
219 if (!checkers::is_alpha(view[0])) {
220 return false;
221 }
222
// A ':' is appended so that an input without one still ends in a colon.
223 view.append(":");
224
225 std::string::iterator pointer =
226 std::find_if_not(view.begin(), view.end(), unicode::is_alnum_plus);
227
228 if (pointer != view.end() && *pointer == ':') {
// Only the scheme up to and including the first ':' is forwarded.
229 return parse_scheme_with_colon<true>(
230 std::string_view(view.data(), pointer - view.begin() + 1));
231 }
232 return false;
233}
234
// Setter for the username. Returns false when
// cannot_have_credentials_or_port() is true; the remaining visible paths
// return true after calling update_base_username().
//
// NOTE(review): listing lines 237, 242-243, 249 and 251 are missing from this
// excerpt. As a result `idx` has no visible declaration and the
// percent_encode(...) call below has no visible argument list — restore the
// missing lines from the upstream file before compiling.
235bool url_aggregator::set_username(const std::string_view input) {
236 ada_log("url_aggregator::set_username '", input, "' ");
238 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
239 if (cannot_have_credentials_or_port()) {
240 return false;
241 }
// When idx equals input.size() the input is stored unchanged.
244 if (idx == input.size()) {
245 update_base_username(input);
246 } else {
247 // We only create a temporary string if we have to!
248 update_base_username(ada::unicode::percent_encode(
250 }
252 return true;
253}
254
// Setter for the password. Returns false when
// cannot_have_credentials_or_port() is true; the remaining visible paths
// return true after calling update_base_password().
//
// NOTE(review): listing lines 257, 262-263, 269 and 271 are missing from this
// excerpt. As a result `idx` has no visible declaration and the
// percent_encode(...) call below has no visible argument list — restore the
// missing lines from the upstream file before compiling.
255bool url_aggregator::set_password(const std::string_view input) {
256 ada_log("url_aggregator::set_password '", input, "'");
258 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
259 if (cannot_have_credentials_or_port()) {
260 return false;
261 }
// When idx equals input.size() the input is stored unchanged.
264 if (idx == input.size()) {
265 update_base_password(input);
266 } else {
267 // We only create a temporary string if we have to!
268 update_base_password(ada::unicode::percent_encode(
270 }
272 return true;
273}
274
// Setter for the port.
//
// Returns false when cannot_have_credentials_or_port() is true, when the
// cleaned input starts with a C0 control or space, when the input has no
// ASCII digit, or when parse_port() leaves is_valid false. In that last case
// the previous port is restored and is_valid is reset to true.
// Returns true when the cleaned input is empty (the port is cleared) or when
// parse_port() leaves is_valid true.
//
// NOTE(review): listing lines 277 and 305 are missing from this excerpt.
275bool url_aggregator::set_port(const std::string_view input) {
276 ada_log("url_aggregator::set_port ", input);
278 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
279 if (cannot_have_credentials_or_port()) {
280 return false;
281 }
282 std::string trimmed(input);
283 helpers::remove_ascii_tab_or_newline(trimmed);
284 if (trimmed.empty()) {
285 clear_port();
286 return true;
287 }
288 // Input should not start with control characters.
289 if (ada::unicode::is_c0_control_or_space(trimmed.front())) {
290 return false;
291 }
292 // Input should contain at least one ascii digit.
// This check reads the untrimmed `input`, not `trimmed`.
293 if (input.find_first_of("0123456789") == std::string_view::npos) {
294 return false;
295 }
296
297 // Revert changes if parse_port fails.
298 uint32_t previous_port = components.port;
299 parse_port(trimmed);
300 if (is_valid) {
301 return true;
302 }
303 update_base_port(previous_port);
304 is_valid = true;
306 return false;
307}
308
// Setter for the pathname. Returns false when the URL has an opaque path;
// otherwise clears the current pathname, parses `input` as the new path and
// returns true.
//
// NOTE(review): listing lines 311 and 323 are missing from this excerpt.
309bool url_aggregator::set_pathname(const std::string_view input) {
310 ada_log("url_aggregator::set_pathname ", input);
312 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
313 if (has_opaque_path) {
314 return false;
315 }
316 clear_pathname();
317 parse_path(input);
// A path starting with "//" on a URL with no authority gets a "/." inserted
// in front of it, and pathname_start is moved past the inserted characters.
318 if (checkers::begins_with(get_pathname(), "//") && !has_authority() &&
319 !has_dash_dot()) {
320 buffer.insert(components.pathname_start, "/.");
321 components.pathname_start += 2;
322 }
324 return true;
325}
326
// Parses `input` as the path of this URL.
//
// Tabs and newlines are removed first (on a copy, only when present). Then:
//  - special URL: an empty path becomes "/"; one leading '/' or '\' is
//    dropped before the rest goes to consume_prepared_path();
//  - non-special URL, non-empty path: one leading '/' is dropped before the
//    rest goes to consume_prepared_path();
//  - non-special URL, empty path: the path is set to "/" only when the host
//    is empty and has_authority() is false.
//
// NOTE(review): listing lines 329 and 365 are missing from this excerpt.
327ada_really_inline void url_aggregator::parse_path(std::string_view input) {
328 ada_log("url_aggregator::parse_path ", input);
330 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
331 std::string tmp_buffer;
332 std::string_view internal_input;
333 if (unicode::has_tabs_or_newline(input)) {
334 tmp_buffer = input;
335 // Optimization opportunity: Instead of copying and then pruning, we could
336 // just directly build the string from user_input.
337 helpers::remove_ascii_tab_or_newline(tmp_buffer);
338 internal_input = tmp_buffer;
339 } else {
340 internal_input = input;
341 }
342
343 // If url is special, then:
344 if (is_special()) {
345 if (internal_input.empty()) {
346 update_base_pathname("/");
347 } else if ((internal_input[0] == '/') || (internal_input[0] == '\\')) {
348 consume_prepared_path(internal_input.substr(1));
349 } else {
350 consume_prepared_path(internal_input);
351 }
352 } else if (!internal_input.empty()) {
353 if (internal_input[0] == '/') {
354 consume_prepared_path(internal_input.substr(1));
355 } else {
356 consume_prepared_path(internal_input);
357 }
358 } else {
359 // Non-special URLs with an empty host can have their paths erased
360 // Path-only URLs cannot have their paths erased
361 if (components.host_start == components.host_end && !has_authority()) {
362 update_base_pathname("/");
363 }
364 }
366}
367
// Setter for the search (query) component.
//
// An empty input clears the search and strips trailing spaces from an opaque
// path. Otherwise one leading '?' is dropped, tabs/newlines are removed, and
// the result is passed to update_base_search() with a percent-encode set.
//
// NOTE(review): listing lines 370, 383-384 and 387 are missing from this
// excerpt; in particular the initializer of query_percent_encode_set is not
// visible — restore it from the upstream file before compiling.
368void url_aggregator::set_search(const std::string_view input) {
369 ada_log("url_aggregator::set_search ", input);
371 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
372 if (input.empty()) {
373 clear_search();
374 helpers::strip_trailing_spaces_from_opaque_path(*this);
375 return;
376 }
377
378 std::string new_value;
379 new_value = input[0] == '?' ? input.substr(1) : input;
380 helpers::remove_ascii_tab_or_newline(new_value);
381
382 auto query_percent_encode_set =
385
386 update_base_search(new_value, query_percent_encode_set);
388}
389
// Setter for the hash (fragment) component.
//
// An empty input truncates the buffer at hash_start (when a hash is present)
// and strips trailing spaces from an opaque path. Otherwise one leading '#'
// is dropped, tabs/newlines are removed, and the result is passed to
// update_unencoded_base_hash().
//
// NOTE(review): listing lines 392, 397 and 407 are missing from this excerpt.
// Line 397 sits right after the buffer.resize() call, so a statement in that
// branch may have been dropped — confirm against the upstream file.
390void url_aggregator::set_hash(const std::string_view input) {
391 ada_log("url_aggregator::set_hash ", input);
393 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
394 if (input.empty()) {
395 if (components.hash_start != url_components::omitted) {
396 buffer.resize(components.hash_start);
398 }
399 helpers::strip_trailing_spaces_from_opaque_path(*this);
400 return;
401 }
402
403 std::string new_value;
404 new_value = input[0] == '#' ? input.substr(1) : input;
405 helpers::remove_ascii_tab_or_newline(new_value);
406 update_unencoded_base_hash(new_value);
408}
409
// Setter for the whole URL. When `out` holds a value, *this is replaced by
// it. Returns out.has_value().
//
// NOTE(review): listing line 413 is missing from this excerpt, so the
// declaration of `out` is not visible — restore it from the upstream file
// before compiling.
410bool url_aggregator::set_href(const std::string_view input) {
411 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
412 ada_log("url_aggregator::set_href ", input, " [", input.size(), " bytes]");
414 ada_log("url_aggregator::set_href, success :", out.has_value());
415
416 if (out) {
417 ada_log("url_aggregator::set_href, parsed ", out->to_string());
418 // TODO: Figure out why the following line puts test to never finish.
419 *this = *out;
420 }
421
422 return out.has_value();
423}
424
// Parses `input` as the host of this URL and stores the result.
//
// Returns false (and sets is_valid to false) for an empty input, for a
// bracketed input that does not end with ']', when to_ascii() fails, or when
// the converted host contains a forbidden domain code point. Bracketed input
// is forwarded to parse_ipv6(), non-special URLs to parse_opaque_host(), and
// hosts recognised by checkers::is_ipv4() to parse_ipv4(). Otherwise the
// hostname is stored with update_base_hostname() and true is returned.
//
// NOTE(review): listing lines 428 and 509 are missing from this excerpt.
425ada_really_inline bool url_aggregator::parse_host(std::string_view input) {
426 ada_log("url_aggregator:parse_host \"", input, "\" [", input.size(),
427 " bytes]");
429 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
430 if (input.empty()) {
431 return is_valid = false;
432 } // technically unnecessary.
433 // If input starts with U+005B ([), then:
434 if (input[0] == '[') {
435 // If input does not end with U+005D (]), validation error, return failure.
436 if (input.back() != ']') {
437 return is_valid = false;
438 }
439 ada_log("parse_host ipv6");
440
441 // Return the result of IPv6 parsing input with its leading U+005B ([) and
442 // trailing U+005D (]) removed.
443 input.remove_prefix(1);
444 input.remove_suffix(1);
445 return parse_ipv6(input);
446 }
447
448 // If isNotSpecial is true, then return the result of opaque-host parsing
449 // input.
450 if (!is_special()) {
451 return parse_opaque_host(input);
452 }
453 // Let domain be the result of running UTF-8 decode without BOM on the
454 // percent-decoding of input. Let asciiDomain be the result of running domain
455 // to ASCII with domain and false. The most common case is an ASCII input, in
456 // which case we do not need to call the expensive 'to_ascii' if a few
457 // conditions are met: no '%' and no 'xn-' subsequence.
458
459 // Often, the input does not contain any forbidden code points, and no upper
460 // case ASCII letter, then we can just copy it to the buffer. We want to
461 // optimize for such a common case.
462 uint8_t is_forbidden_or_upper =
463 unicode::contains_forbidden_domain_code_point_or_upper(input.data(),
464 input.size());
465 // Minor optimization opportunity:
466 // contains_forbidden_domain_code_point_or_upper could be extended to check for
467 // the presence of characters that cannot appear in the ipv4 address and we
468 // could also check whether x and n and - are present, and so we could skip
469 // some of the checks below. However, the gains are likely to be small, and
470 // the code would be more complex.
471 if (is_forbidden_or_upper == 0 &&
472 input.find("xn-") == std::string_view::npos) {
473 // fast path
474 update_base_hostname(input);
475 if (checkers::is_ipv4(get_hostname())) {
476 ada_log("parse_host fast path ipv4");
// `true` is the in_place argument: the input is read back from the buffer.
477 return parse_ipv4(get_hostname(), true);
478 }
479 ada_log("parse_host fast path ", get_hostname());
480 return true;
481 }
482 // We have encountered at least one forbidden code point or the input contains
483 // 'xn-' (case insensitive), so we need to call 'to_ascii' to perform the full
484 // conversion.
485
486 ada_log("parse_host calling to_ascii");
487 std::optional<std::string> host = std::string(get_hostname());
488 is_valid = ada::unicode::to_ascii(host, input, input.find('%'));
489 if (!is_valid) {
490 ada_log("parse_host to_ascii returns false");
491 return is_valid = false;
492 }
493 ada_log("parse_host to_ascii succeeded ", *host, " [", host->size(),
494 " bytes]");
495
496 if (std::any_of(host.value().begin(), host.value().end(),
497 ada::unicode::is_forbidden_domain_code_point)) {
498 return is_valid = false;
499 }
500
501 // If asciiDomain ends in a number, then return the result of IPv4 parsing
502 // asciiDomain.
503 if (checkers::is_ipv4(host.value())) {
504 ada_log("parse_host got ipv4 ", *host);
505 return parse_ipv4(host.value(), false);
506 }
507
508 update_base_hostname(host.value());
510 return true;
511}
512
// Shared implementation of set_host() and set_hostname().
//
// Template parameter override_hostname: when true, an input in which
// get_host_delimiter_location() reports a colon is rejected (returns false);
// when false, the text after that colon is passed to set_port().
//
// Returns false when the URL has an opaque path. The input is cut at the
// first '#', tabs/newlines are removed, and the remainder is parsed as a
// host. When parse_host() fails, the previous hostname and port are restored
// and false is returned.
//
// NOTE(review): listing lines 516 and 604 are missing from this excerpt.
513template <bool override_hostname>
514bool url_aggregator::set_host_or_hostname(const std::string_view input) {
515 ada_log("url_aggregator::set_host_or_hostname ", input);
517 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
518 if (has_opaque_path) {
519 return false;
520 }
521
// Saved so that a failed parse can be rolled back below.
522 std::string previous_host(get_hostname());
523 uint32_t previous_port = components.port;
524
525 size_t host_end_pos = input.find('#');
526 std::string _host(input.data(), host_end_pos != std::string_view::npos
527 ? host_end_pos
528 : input.size());
529 helpers::remove_ascii_tab_or_newline(_host);
530 std::string_view new_host(_host);
531
532 // If url's scheme is "file", then set state to file host state, instead of
533 // host state.
534 if (type != ada::scheme::type::FILE) {
535 std::string_view host_view(_host.data(), _host.length());
536 auto [location, found_colon] =
537 helpers::get_host_delimiter_location(is_special(), host_view);
538
539 // Otherwise, if c is U+003A (:) and insideBrackets is false, then:
540 // Note: the 'found_colon' value is true if and only if a colon was
541 // encountered while not inside brackets.
542 if (found_colon) {
543 if (override_hostname) {
544 return false;
545 }
546 std::string_view sub_buffer = new_host.substr(location + 1);
547 if (!sub_buffer.empty()) {
// The return value of set_port() is not checked here.
548 set_port(sub_buffer);
549 }
550 }
551 // If url is special and host_view is the empty string, validation error,
552 // return failure. Otherwise, if state override is given, host_view is the
553 // empty string, and either url includes credentials or url's port is
554 // non-null, return.
555 else if (host_view.empty() &&
556 (is_special() || has_credentials() || has_port())) {
557 return false;
558 }
559
560 // Let host be the result of host parsing host_view with url is not special.
561 if (host_view.empty() && !is_special()) {
562 if (has_hostname()) {
563 clear_hostname(); // easy!
564 } else if (has_dash_dot()) {
565 add_authority_slashes_if_needed();
566 delete_dash_dot();
567 }
568 return true;
569 }
570
571 bool succeeded = parse_host(host_view);
572 if (!succeeded) {
573 update_base_hostname(previous_host);
574 update_base_port(previous_port);
575 } else if (has_dash_dot()) {
576 // Should remove dash_dot from pathname
577 delete_dash_dot();
578 }
579 return succeeded;
580 }
581
// From here on the scheme type is FILE: the host ends at the first '/', '\'
// or '?'.
582 size_t location = new_host.find_first_of("/\\?");
583 if (location != std::string_view::npos) {
584 new_host.remove_suffix(new_host.length() - location);
585 }
586
587 if (new_host.empty()) {
588 // Set url's host to the empty string.
589 clear_hostname();
590 } else {
591 // Let host be the result of host parsing buffer with url is not special.
592 if (!parse_host(new_host)) {
593 update_base_hostname(previous_host);
594 update_base_port(previous_port);
595 return false;
596 }
597
598 // If host is "localhost", then set host to the empty string.
599 if (helpers::substring(buffer, components.host_start,
600 components.host_end) == "localhost") {
601 clear_hostname();
602 }
603 }
605 return true;
606}
607
// Setter for the host. Forwards to set_host_or_hostname<false>() and returns
// its result.
// NOTE(review): listing line 610 is missing from this excerpt.
608bool url_aggregator::set_host(const std::string_view input) {
609 ada_log("url_aggregator::set_host '", input, "'");
611 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
612 return set_host_or_hostname<false>(input);
613}
614
// Setter for the hostname. Forwards to set_host_or_hostname<true>() and
// returns its result.
// NOTE(review): listing line 617 is missing from this excerpt.
615bool url_aggregator::set_hostname(const std::string_view input) {
616 ada_log("url_aggregator::set_hostname '", input, "'");
618 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
619 return set_host_or_hostname<true>(input);
620}
621
622[[nodiscard]] std::string url_aggregator::get_origin() const noexcept {
623 ada_log("url_aggregator::get_origin");
624 if (is_special()) {
625 // Return a new opaque origin.
626 if (type == scheme::FILE) {
627 return "null";
628 }
629
630 return helpers::concat(get_protocol(), "//", get_host());
631 }
632
633 if (get_protocol() == "blob:") {
634 std::string_view path = get_pathname();
635 if (!path.empty()) {
636 auto out = ada::parse<ada::url_aggregator>(path);
637 if (out && (out->type == scheme::HTTP || out->type == scheme::HTTPS)) {
638 // If pathURL's scheme is not "http" and not "https", then return a
639 // new opaque origin.
640 return helpers::concat(out->get_protocol(), "//", out->get_host());
641 }
642 }
643 }
644
645 // Return a new opaque origin.
646 return "null";
647}
648
// Getter for the username. The visible code returns the buffer slice from
// protocol_end + 2 up to username_end, and otherwise an empty string.
//
// NOTE(review): listing lines 650 and 652 are missing from this excerpt, so
// the opening brace of the function and the condition guarding the first
// return are not visible — restore them from the upstream file.
649[[nodiscard]] std::string_view url_aggregator::get_username() const noexcept
651 ada_log("url_aggregator::get_username");
653 return helpers::substring(buffer, components.protocol_end + 2,
654 components.username_end);
655 }
656 return "";
657}
658
// Getter for the password. The visible code returns the buffer slice from
// username_end + 1 up to host_start, and otherwise an empty string.
//
// NOTE(review): listing lines 660 and 662 are missing from this excerpt, so
// the opening brace of the function and the condition guarding the first
// return are not visible — restore them from the upstream file.
659[[nodiscard]] std::string_view url_aggregator::get_password() const noexcept
661 ada_log("url_aggregator::get_password");
663 return helpers::substring(buffer, components.username_end + 1,
664 components.host_start);
665 }
666 return "";
667}
668
// Getter for the port. Returns an empty string when the port is omitted;
// otherwise the buffer slice from host_end + 1 up to pathname_start.
//
// NOTE(review): listing line 670 is missing from this excerpt, so the line
// carrying the function's opening brace is not visible.
669[[nodiscard]] std::string_view url_aggregator::get_port() const noexcept
671 ada_log("url_aggregator::get_port");
672 if (components.port == url_components::omitted) {
673 return "";
674 }
675 return helpers::substring(buffer, components.host_end + 1,
676 components.pathname_start);
677}
678
// Getter for the hash. Returns an empty string when the hash is omitted or
// when at most one character follows hash_start; otherwise the buffer slice
// starting at hash_start.
//
// NOTE(review): listing line 680 is missing from this excerpt, so the line
// carrying the function's opening brace is not visible.
679[[nodiscard]] std::string_view url_aggregator::get_hash() const noexcept
681 ada_log("url_aggregator::get_hash");
682 // If this's URL's fragment is either null or the empty string, then return
683 // the empty string. Return U+0023 (#), followed by this's URL's fragment.
684 if (components.hash_start == url_components::omitted) {
685 return "";
686 }
687 if (buffer.size() - components.hash_start <= 1) {
688 return "";
689 }
690 return helpers::substring(buffer, components.hash_start);
691}
692
// Getter for the host. Skips a leading '@' at host_start, returns an empty
// view when the host is empty, and otherwise returns the buffer slice up to
// pathname_start (so it extends past host_end).
//
// NOTE(review): listing line 694 is missing from this excerpt, so the line
// carrying the function's opening brace is not visible.
693[[nodiscard]] std::string_view url_aggregator::get_host() const noexcept
695 ada_log("url_aggregator::get_host");
696 // Technically, we should check if there is a hostname, but
697 // the code below works even if there isn't.
698 // if(!has_hostname()) { return ""; }
699 size_t start = components.host_start;
700 if (components.host_end > components.host_start &&
701 buffer[components.host_start] == '@') {
702 start++;
703 }
704 // if we have an empty host, then the space between components.host_end and
705 // components.pathname_start may be occupied by /.
706 if (start == components.host_end) {
707 return {};
708 }
709 return helpers::substring(buffer, start, components.pathname_start);
710}
711
// Getter for the hostname. Skips a leading '@' at host_start and returns the
// buffer slice up to host_end.
//
// NOTE(review): listing line 713 is missing from this excerpt, so the line
// carrying the function's opening brace is not visible.
712[[nodiscard]] std::string_view url_aggregator::get_hostname() const noexcept
714 ada_log("url_aggregator::get_hostname");
715 // Technically, we should check if there is a hostname, but
716 // the code below works even if there isn't.
717 // if(!has_hostname()) { return ""; }
718 size_t start = components.host_start;
719 // So host_start is not where the host begins.
720 if (components.host_end > components.host_start &&
721 buffer[components.host_start] == '@') {
722 start++;
723 }
724 return helpers::substring(buffer, start, components.host_end);
725}
726
// Getter for the pathname. Returns the buffer slice that starts at
// pathname_start and ends at search_start when a search is present, else at
// hash_start when a hash is present, else at the end of the buffer.
//
// NOTE(review): listing line 728 is missing from this excerpt, so the line
// carrying the function's opening brace is not visible.
727[[nodiscard]] std::string_view url_aggregator::get_pathname() const noexcept
729 ada_log("url_aggregator::get_pathname pathname_start = ",
730 components.pathname_start, " buffer.size() = ", buffer.size(),
731 " components.search_start = ", components.search_start,
732 " components.hash_start = ", components.hash_start);
733 auto ending_index = uint32_t(buffer.size());
734 if (components.search_start != url_components::omitted) {
735 ending_index = components.search_start;
736 } else if (components.hash_start != url_components::omitted) {
737 ending_index = components.hash_start;
738 }
739 return helpers::substring(buffer, components.pathname_start, ending_index);
740}
741
// Getter for the search. Returns an empty string when the search is omitted
// or spans at most one character; otherwise the buffer slice from
// search_start up to hash_start (or the end of the buffer when no hash is
// present).
//
// NOTE(review): listing line 743 is missing from this excerpt, so the line
// carrying the function's opening brace is not visible.
742[[nodiscard]] std::string_view url_aggregator::get_search() const noexcept
744 ada_log("url_aggregator::get_search");
745 // If this's URL's query is either null or the empty string, then return the
746 // empty string. Return U+003F (?), followed by this's URL's query.
747 if (components.search_start == url_components::omitted) {
748 return "";
749 }
750 auto ending_index = uint32_t(buffer.size());
751 if (components.hash_start != url_components::omitted) {
752 ending_index = components.hash_start;
753 }
754 if (ending_index - components.search_start <= 1) {
755 return "";
756 }
757 return helpers::substring(buffer, components.search_start, ending_index);
758}
759
// Getter for the protocol. Returns the buffer slice from offset 0 up to
// protocol_end.
//
// NOTE(review): listing line 761 is missing from this excerpt, so the line
// carrying the function's opening brace is not visible.
760[[nodiscard]] std::string_view url_aggregator::get_protocol() const noexcept
762 ada_log("url_aggregator::get_protocol");
763 return helpers::substring(buffer, 0, components.protocol_end);
764}
765
766[[nodiscard]] std::string ada::url_aggregator::to_string() const {
767 ada_log("url_aggregator::to_string buffer:", buffer, " [", buffer.size(),
768 " bytes]");
769 if (!is_valid) {
770 return "null";
771 }
772
773 std::string answer;
774 auto back = std::back_insert_iterator(answer);
775 answer.append("{\n");
776
777 answer.append("\t\"buffer\":\"");
778 helpers::encode_json(buffer, back);
779 answer.append("\",\n");
780
781 answer.append("\t\"protocol\":\"");
782 helpers::encode_json(get_protocol(), back);
783 answer.append("\",\n");
784
785 if (has_credentials()) {
786 answer.append("\t\"username\":\"");
787 helpers::encode_json(get_username(), back);
788 answer.append("\",\n");
789 answer.append("\t\"password\":\"");
790 helpers::encode_json(get_password(), back);
791 answer.append("\",\n");
792 }
793
794 answer.append("\t\"host\":\"");
795 helpers::encode_json(get_host(), back);
796 answer.append("\",\n");
797
798 answer.append("\t\"path\":\"");
799 helpers::encode_json(get_pathname(), back);
800 answer.append("\",\n");
801 answer.append("\t\"opaque path\":");
802 answer.append((has_opaque_path ? "true" : "false"));
803 answer.append(",\n");
804
805 if (components.search_start != url_components::omitted) {
806 answer.append("\t\"query\":\"");
807 helpers::encode_json(get_search(), back);
808 answer.append("\",\n");
809 }
810 if (components.hash_start != url_components::omitted) {
811 answer.append("\t\"fragment\":\"");
812 helpers::encode_json(get_hash(), back);
813 answer.append("\",\n");
814 }
815
816 auto convert_offset_to_string = [](uint32_t offset) -> std::string {
817 if (offset == url_components::omitted) {
818 return "null";
819 } else {
820 return std::to_string(offset);
821 }
822 };
823
824 answer.append("\t\"protocol_end\":");
825 answer.append(convert_offset_to_string(components.protocol_end));
826 answer.append(",\n");
827
828 answer.append("\t\"username_end\":");
829 answer.append(convert_offset_to_string(components.username_end));
830 answer.append(",\n");
831
832 answer.append("\t\"host_start\":");
833 answer.append(convert_offset_to_string(components.host_start));
834 answer.append(",\n");
835
836 answer.append("\t\"host_end\":");
837 answer.append(convert_offset_to_string(components.host_end));
838 answer.append(",\n");
839
840 answer.append("\t\"port\":");
841 answer.append(convert_offset_to_string(components.port));
842 answer.append(",\n");
843
844 answer.append("\t\"pathname_start\":");
845 answer.append(convert_offset_to_string(components.pathname_start));
846 answer.append(",\n");
847
848 answer.append("\t\"search_start\":");
849 answer.append(convert_offset_to_string(components.search_start));
850 answer.append(",\n");
851
852 answer.append("\t\"hash_start\":");
853 answer.append(convert_offset_to_string(components.hash_start));
854 answer.append("\n}");
855
856 return answer;
857}
858
859[[nodiscard]] bool url_aggregator::has_valid_domain() const noexcept {
860 if (components.host_start == components.host_end) {
861 return false;
862 }
863 return checkers::verify_dns_length(get_hostname());
864}
865
// Parses `input` as an IPv4 address made of up to four '.'-separated numbers
// (hexadecimal with a hex prefix, octal with a leading '0' followed by a
// digit, decimal otherwise) and one optional trailing '.'.
//
// in_place: when true and all four parts were plain decimal with no trailing
// dot, the hostname is left as it is; in every other successful case the
// hostname is rewritten from the parsed value via ada::serializers::ipv4().
//
// Returns true on success and sets host_type to IPV4. On failure sets
// is_valid to false and returns false.
//
// NOTE(review): input.back() is read without an emptiness check, so callers
// presumably never pass an empty input — confirm. Listing lines 870 and 958
// are missing from this excerpt.
866bool url_aggregator::parse_ipv4(std::string_view input, bool in_place) {
867 ada_log("parse_ipv4 ", input, " [", input.size(),
868 " bytes], overlaps with buffer: ",
869 helpers::overlaps(input, buffer) ? "yes" : "no");
871 const bool trailing_dot = (input.back() == '.');
872 if (trailing_dot) {
873 input.remove_suffix(1);
874 }
875 size_t digit_count{0};
876 int pure_decimal_count = 0; // entries that are decimal
877 uint64_t ipv4{0};
878 // we could unroll for better performance?
879 for (; (digit_count < 4) && !(input.empty()); digit_count++) {
880 uint32_t
881 segment_result{}; // If any number exceeds 32 bits, we have an error.
882 bool is_hex = checkers::has_hex_prefix(input);
883 if (is_hex && ((input.length() == 2) ||
884 ((input.length() > 2) && (input[2] == '.')))) {
885 // special case
// A bare hex prefix with no digits after it counts as zero.
886 segment_result = 0;
887 input.remove_prefix(2);
888 } else {
889 std::from_chars_result r{};
890 if (is_hex) {
891 ada_log("parse_ipv4 trying to parse hex number");
892 r = std::from_chars(input.data() + 2, input.data() + input.size(),
893 segment_result, 16);
894 } else if ((input.length() >= 2) && input[0] == '0' &&
895 checkers::is_digit(input[1])) {
896 ada_log("parse_ipv4 trying to parse octal number");
897 r = std::from_chars(input.data() + 1, input.data() + input.size(),
898 segment_result, 8);
899 } else {
900 ada_log("parse_ipv4 trying to parse decimal number");
901 pure_decimal_count++;
902 r = std::from_chars(input.data(), input.data() + input.size(),
903 segment_result, 10);
904 }
905 if (r.ec != std::errc()) {
906 ada_log("parse_ipv4 parsing failed");
907 return is_valid = false;
908 }
909 ada_log("parse_ipv4 parsed ", segment_result);
910 input.remove_prefix(r.ptr - input.data());
911 }
912 if (input.empty()) {
913 // We have the last value.
914 // At this stage, ipv4 contains digit_count*8 bits.
915 // So we have 32-digit_count*8 bits left.
916 if (segment_result >= (uint64_t(1) << (32 - digit_count * 8))) {
917 return is_valid = false;
918 }
919 ipv4 <<= (32 - digit_count * 8);
920 ipv4 |= segment_result;
921 goto final;
922 } else {
923 // There is more, so that the value must not be larger than 255
924 // and we must have a '.'.
925 if ((segment_result > 255) || (input[0] != '.')) {
926 return is_valid = false;
927 }
928 ipv4 <<= 8;
929 ipv4 |= segment_result;
930 input.remove_prefix(1); // remove '.'
931 }
932 }
933 if ((digit_count != 4) || (!input.empty())) {
934 ada_log("parse_ipv4 found invalid (more than 4 numbers or empty) ");
935 return is_valid = false;
936 }
937final:
938 ada_log("url_aggregator::parse_ipv4 completed ", get_href(),
939 " host: ", get_host());
940
941 // We could also check r.ptr to see where the parsing ended.
942 if (in_place && pure_decimal_count == 4 && !trailing_dot) {
943 ada_log(
944 "url_aggregator::parse_ipv4 completed and was already correct in the "
945 "buffer");
946 // The original input was already all decimal and we validated it. So we
947 // don't need to do anything.
948 } else {
949 ada_log("url_aggregator::parse_ipv4 completed and we need to update it");
950 // Optimization opportunity: Get rid of unnecessary string return in ipv4
951 // serializer.
952 // TODO: This is likely a bug because it goes back update_base_hostname, not
953 // what we want to do.
954 update_base_hostname(
955 ada::serializers::ipv4(ipv4)); // We have to reserialize the address.
956 }
957 host_type = IPV4;
959 return true;
960}
961
962bool url_aggregator::parse_ipv6(std::string_view input) {
963 // TODO: Implement in_place optimization: we know that input points
964 // in the buffer, so we can just check whether the buffer is already
965 // well formatted.
966 // TODO: Find a way to merge parse_ipv6 with url.cpp implementation.
967 ada_log("parse_ipv6 ", input, " [", input.size(), " bytes]");
969 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
970 if (input.empty()) {
971 return is_valid = false;
972 }
973 // Let address be a new IPv6 address whose IPv6 pieces are all 0.
974 std::array<uint16_t, 8> address{};
975
976 // Let pieceIndex be 0.
977 int piece_index = 0;
978
979 // Let compress be null.
980 std::optional<int> compress{};
981
982 // Let pointer be a pointer for input.
983 std::string_view::iterator pointer = input.begin();
984
985 // If c is U+003A (:), then:
986 if (input[0] == ':') {
987 // If remaining does not start with U+003A (:), validation error, return
988 // failure.
989 if (input.size() == 1 || input[1] != ':') {
990 ada_log("parse_ipv6 starts with : but the rest does not start with :");
991 return is_valid = false;
992 }
993
994 // Increase pointer by 2.
995 pointer += 2;
996
997 // Increase pieceIndex by 1 and then set compress to pieceIndex.
998 compress = ++piece_index;
999 }
1000
1001 // While c is not the EOF code point:
1002 while (pointer != input.end()) {
1003 // If pieceIndex is 8, validation error, return failure.
1004 if (piece_index == 8) {
1005 ada_log("parse_ipv6 piece_index == 8");
1006 return is_valid = false;
1007 }
1008
1009 // If c is U+003A (:), then:
1010 if (*pointer == ':') {
1011 // If compress is non-null, validation error, return failure.
1012 if (compress.has_value()) {
1013 ada_log("parse_ipv6 compress is non-null");
1014 return is_valid = false;
1015 }
1016
1017 // Increase pointer and pieceIndex by 1, set compress to pieceIndex, and
1018 // then continue.
1019 pointer++;
1020 compress = ++piece_index;
1021 continue;
1022 }
1023
1024 // Let value and length be 0.
1025 uint16_t value = 0, length = 0;
1026
1027 // While length is less than 4 and c is an ASCII hex digit,
1028 // set value to value times 0x10 + c interpreted as hexadecimal number, and
1029 // increase pointer and length by 1.
1030 while (length < 4 && pointer != input.end() &&
1031 unicode::is_ascii_hex_digit(*pointer)) {
1032 // https://stackoverflow.com/questions/39060852/why-does-the-addition-of-two-shorts-return-an-int
1033 value = uint16_t(value * 0x10 + unicode::convert_hex_to_binary(*pointer));
1034 pointer++;
1035 length++;
1036 }
1037
1038 // If c is U+002E (.), then:
1039 if (pointer != input.end() && *pointer == '.') {
1040 // If length is 0, validation error, return failure.
1041 if (length == 0) {
1042 ada_log("parse_ipv6 length is 0");
1043 return is_valid = false;
1044 }
1045
1046 // Decrease pointer by length.
1047 pointer -= length;
1048
1049 // If pieceIndex is greater than 6, validation error, return failure.
1050 if (piece_index > 6) {
1051 ada_log("parse_ipv6 piece_index > 6");
1052 return is_valid = false;
1053 }
1054
1055 // Let numbersSeen be 0.
1056 int numbers_seen = 0;
1057
1058 // While c is not the EOF code point:
1059 while (pointer != input.end()) {
1060 // Let ipv4Piece be null.
1061 std::optional<uint16_t> ipv4_piece{};
1062
1063 // If numbersSeen is greater than 0, then:
1064 if (numbers_seen > 0) {
1065 // If c is a U+002E (.) and numbersSeen is less than 4, then increase
1066 // pointer by 1.
1067 if (*pointer == '.' && numbers_seen < 4) {
1068 pointer++;
1069 } else {
1070 // Otherwise, validation error, return failure.
1071 ada_log("parse_ipv6 Otherwise, validation error, return failure");
1072 return is_valid = false;
1073 }
1074 }
1075
1076 // If c is not an ASCII digit, validation error, return failure.
1077 if (pointer == input.end() || !checkers::is_digit(*pointer)) {
1078 ada_log(
1079 "parse_ipv6 If c is not an ASCII digit, validation error, return "
1080 "failure");
1081 return is_valid = false;
1082 }
1083
1084 // While c is an ASCII digit:
1085 while (pointer != input.end() && checkers::is_digit(*pointer)) {
1086 // Let number be c interpreted as decimal number.
1087 int number = *pointer - '0';
1088
1089 // If ipv4Piece is null, then set ipv4Piece to number.
1090 if (!ipv4_piece.has_value()) {
1091 ipv4_piece = number;
1092 }
1093 // Otherwise, if ipv4Piece is 0, validation error, return failure.
1094 else if (ipv4_piece == 0) {
1095 ada_log("parse_ipv6 if ipv4Piece is 0, validation error");
1096 return is_valid = false;
1097 }
1098 // Otherwise, set ipv4Piece to ipv4Piece times 10 + number.
1099 else {
1100 ipv4_piece = *ipv4_piece * 10 + number;
1101 }
1102
1103 // If ipv4Piece is greater than 255, validation error, return failure.
1104 if (ipv4_piece > 255) {
1105 ada_log("parse_ipv6 ipv4_piece > 255");
1106 return is_valid = false;
1107 }
1108
1109 // Increase pointer by 1.
1110 pointer++;
1111 }
1112
1113 // Set address[pieceIndex] to address[pieceIndex] times 0x100 +
1114 // ipv4Piece.
1115 // https://stackoverflow.com/questions/39060852/why-does-the-addition-of-two-shorts-return-an-int
1116 address[piece_index] =
1117 uint16_t(address[piece_index] * 0x100 + *ipv4_piece);
1118
1119 // Increase numbersSeen by 1.
1120 numbers_seen++;
1121
1122 // If numbersSeen is 2 or 4, then increase pieceIndex by 1.
1123 if (numbers_seen == 2 || numbers_seen == 4) {
1124 piece_index++;
1125 }
1126 }
1127
1128 // If numbersSeen is not 4, validation error, return failure.
1129 if (numbers_seen != 4) {
1130 return is_valid = false;
1131 }
1132
1133 // Break.
1134 break;
1135 }
1136 // Otherwise, if c is U+003A (:):
1137 else if ((pointer != input.end()) && (*pointer == ':')) {
1138 // Increase pointer by 1.
1139 pointer++;
1140
1141 // If c is the EOF code point, validation error, return failure.
1142 if (pointer == input.end()) {
1143 ada_log(
1144 "parse_ipv6 If c is the EOF code point, validation error, return "
1145 "failure");
1146 return is_valid = false;
1147 }
1148 }
1149 // Otherwise, if c is not the EOF code point, validation error, return
1150 // failure.
1151 else if (pointer != input.end()) {
1152 ada_log(
1153 "parse_ipv6 Otherwise, if c is not the EOF code point, validation "
1154 "error, return failure");
1155 return is_valid = false;
1156 }
1157
1158 // Set address[pieceIndex] to value.
1159 address[piece_index] = value;
1160
1161 // Increase pieceIndex by 1.
1162 piece_index++;
1163 }
1164
1165 // If compress is non-null, then:
1166 if (compress.has_value()) {
1167 // Let swaps be pieceIndex - compress.
1168 int swaps = piece_index - *compress;
1169
1170 // Set pieceIndex to 7.
1171 piece_index = 7;
1172
1173 // While pieceIndex is not 0 and swaps is greater than 0,
1174 // swap address[pieceIndex] with address[compress + swaps - 1], and then
1175 // decrease both pieceIndex and swaps by 1.
1176 while (piece_index != 0 && swaps > 0) {
1177 std::swap(address[piece_index], address[*compress + swaps - 1]);
1178 piece_index--;
1179 swaps--;
1180 }
1181 }
1182 // Otherwise, if compress is null and pieceIndex is not 8, validation error,
1183 // return failure.
1184 else if (piece_index != 8) {
1185 ada_log(
1186 "parse_ipv6 if compress is null and pieceIndex is not 8, validation "
1187 "error, return failure");
1188 return is_valid = false;
1189 }
1190 // TODO: Optimization opportunity: Get rid of unnecessary string creation.
1191 // TODO: This is likely a bug because it goes back update_base_hostname, not
1192 // what we want to do.
1193 update_base_hostname(ada::serializers::ipv6(address));
1194 ada_log("parse_ipv6 ", get_hostname());
1196 host_type = IPV6;
1197 return true;
1198}
1199
1200bool url_aggregator::parse_opaque_host(std::string_view input) {
1201 ada_log("parse_opaque_host ", input, " [", input.size(), " bytes]");
1203 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
1204 if (std::any_of(input.begin(), input.end(),
1205 ada::unicode::is_forbidden_host_code_point)) {
1206 return is_valid = false;
1207 }
1208
1209 // Return the result of running UTF-8 percent-encode on input using the C0
1210 // control percent-encode set.
1213 if (idx == input.size()) {
1214 update_base_hostname(input);
1215 } else {
1216 // We only create a temporary string if we need to.
1217 update_base_hostname(ada::unicode::percent_encode(
1219 }
1221 return true;
1222}
1223
1224[[nodiscard]] std::string url_aggregator::to_diagram() const {
1225 if (!is_valid) {
1226 return "invalid";
1227 }
1228 std::string answer;
1229 answer.append(buffer);
1230 answer.append(" [");
1231 answer.append(std::to_string(buffer.size()));
1232 answer.append(" bytes]");
1233 answer.append("\n");
1234 // first line
1235 std::string line1;
1236 line1.resize(buffer.size(), ' ');
1237 if (components.hash_start != url_components::omitted) {
1238 line1[components.hash_start] = '|';
1239 }
1240 if (components.search_start != url_components::omitted) {
1241 line1[components.search_start] = '|';
1242 }
1243 if (components.pathname_start != buffer.size()) {
1244 line1[components.pathname_start] = '|';
1245 }
1246 if (components.host_end != buffer.size()) {
1247 line1[components.host_end] = '|';
1248 }
1249 if (components.host_start != buffer.size()) {
1250 line1[components.host_start] = '|';
1251 }
1252 if (components.username_end != buffer.size()) {
1253 line1[components.username_end] = '|';
1254 }
1255 if (components.protocol_end != buffer.size()) {
1256 line1[components.protocol_end] = '|';
1257 }
1258 answer.append(line1);
1259 answer.append("\n");
1260
1261 std::string line2 = line1;
1262 if (components.hash_start != url_components::omitted) {
1263 line2[components.hash_start] = '`';
1264 line1[components.hash_start] = ' ';
1265
1266 for (size_t i = components.hash_start + 1; i < line2.size(); i++) {
1267 line2[i] = '-';
1268 }
1269 line2.append(" hash_start");
1270 answer.append(line2);
1271 answer.append("\n");
1272 }
1273
1274 std::string line3 = line1;
1275 if (components.search_start != url_components::omitted) {
1276 line3[components.search_start] = '`';
1277 line1[components.search_start] = ' ';
1278
1279 for (size_t i = components.search_start + 1; i < line3.size(); i++) {
1280 line3[i] = '-';
1281 }
1282 line3.append(" search_start ");
1283 line3.append(std::to_string(components.search_start));
1284 answer.append(line3);
1285 answer.append("\n");
1286 }
1287
1288 std::string line4 = line1;
1289 if (components.pathname_start != buffer.size()) {
1290 line4[components.pathname_start] = '`';
1291 line1[components.pathname_start] = ' ';
1292 for (size_t i = components.pathname_start + 1; i < line4.size(); i++) {
1293 line4[i] = '-';
1294 }
1295 line4.append(" pathname_start ");
1296 line4.append(std::to_string(components.pathname_start));
1297 answer.append(line4);
1298 answer.append("\n");
1299 }
1300
1301 std::string line5 = line1;
1302 if (components.host_end != buffer.size()) {
1303 line5[components.host_end] = '`';
1304 line1[components.host_end] = ' ';
1305
1306 for (size_t i = components.host_end + 1; i < line5.size(); i++) {
1307 line5[i] = '-';
1308 }
1309 line5.append(" host_end ");
1310 line5.append(std::to_string(components.host_end));
1311 answer.append(line5);
1312 answer.append("\n");
1313 }
1314
1315 std::string line6 = line1;
1316 if (components.host_start != buffer.size()) {
1317 line6[components.host_start] = '`';
1318 line1[components.host_start] = ' ';
1319
1320 for (size_t i = components.host_start + 1; i < line6.size(); i++) {
1321 line6[i] = '-';
1322 }
1323 line6.append(" host_start ");
1324 line6.append(std::to_string(components.host_start));
1325 answer.append(line6);
1326 answer.append("\n");
1327 }
1328
1329 std::string line7 = line1;
1330 if (components.username_end != buffer.size()) {
1331 line7[components.username_end] = '`';
1332 line1[components.username_end] = ' ';
1333
1334 for (size_t i = components.username_end + 1; i < line7.size(); i++) {
1335 line7[i] = '-';
1336 }
1337 line7.append(" username_end ");
1338 line7.append(std::to_string(components.username_end));
1339 answer.append(line7);
1340 answer.append("\n");
1341 }
1342
1343 std::string line8 = line1;
1344 if (components.protocol_end != buffer.size()) {
1345 line8[components.protocol_end] = '`';
1346 line1[components.protocol_end] = ' ';
1347
1348 for (size_t i = components.protocol_end + 1; i < line8.size(); i++) {
1349 line8[i] = '-';
1350 }
1351 line8.append(" protocol_end ");
1352 line8.append(std::to_string(components.protocol_end));
1353 answer.append(line8);
1354 answer.append("\n");
1355 }
1356
1357 if (components.hash_start == url_components::omitted) {
1358 answer.append("note: hash omitted\n");
1359 }
1360 if (components.search_start == url_components::omitted) {
1361 answer.append("note: search omitted\n");
1362 }
1363 if (components.protocol_end > buffer.size()) {
1364 answer.append("warning: protocol_end overflows\n");
1365 }
1366 if (components.username_end > buffer.size()) {
1367 answer.append("warning: username_end overflows\n");
1368 }
1369 if (components.host_start > buffer.size()) {
1370 answer.append("warning: host_start overflows\n");
1371 }
1372 if (components.host_end > buffer.size()) {
1373 answer.append("warning: host_end overflows\n");
1374 }
1375 if (components.pathname_start > buffer.size()) {
1376 answer.append("warning: pathname_start overflows\n");
1377 }
1378 return answer;
1379}
1380
1381[[nodiscard]] bool url_aggregator::validate() const noexcept {
1382 if (!is_valid) {
1383 return true;
1384 }
1385 if (!components.check_offset_consistency()) {
1386 ada_log("url_aggregator::validate inconsistent components \n",
1387 to_diagram());
1388 return false;
1389 }
1390 // We have a credible components struct, but let us investivate more
1391 // carefully:
1404 if (components.protocol_end == url_components::omitted) {
1405 ada_log("url_aggregator::validate omitted protocol_end \n", to_diagram());
1406 return false;
1407 }
1408 if (components.username_end == url_components::omitted) {
1409 ada_log("url_aggregator::validate omitted username_end \n", to_diagram());
1410 return false;
1411 }
1412 if (components.host_start == url_components::omitted) {
1413 ada_log("url_aggregator::validate omitted host_start \n", to_diagram());
1414 return false;
1415 }
1416 if (components.host_end == url_components::omitted) {
1417 ada_log("url_aggregator::validate omitted host_end \n", to_diagram());
1418 return false;
1419 }
1420 if (components.pathname_start == url_components::omitted) {
1421 ada_log("url_aggregator::validate omitted pathname_start \n", to_diagram());
1422 return false;
1423 }
1424
1425 if (components.protocol_end > buffer.size()) {
1426 ada_log("url_aggregator::validate protocol_end overflow \n", to_diagram());
1427 return false;
1428 }
1429 if (components.username_end > buffer.size()) {
1430 ada_log("url_aggregator::validate username_end overflow \n", to_diagram());
1431 return false;
1432 }
1433 if (components.host_start > buffer.size()) {
1434 ada_log("url_aggregator::validate host_start overflow \n", to_diagram());
1435 return false;
1436 }
1437 if (components.host_end > buffer.size()) {
1438 ada_log("url_aggregator::validate host_end overflow \n", to_diagram());
1439 return false;
1440 }
1441 if (components.pathname_start > buffer.size()) {
1442 ada_log("url_aggregator::validate pathname_start overflow \n",
1443 to_diagram());
1444 return false;
1445 }
1446
1447 if (components.protocol_end > 0) {
1448 if (buffer[components.protocol_end - 1] != ':') {
1449 ada_log(
1450 "url_aggregator::validate missing : at the end of the protocol \n",
1451 to_diagram());
1452 return false;
1453 }
1454 }
1455
1456 if (components.username_end != buffer.size() &&
1457 components.username_end > components.protocol_end + 2) {
1458 if (buffer[components.username_end] != ':' &&
1459 buffer[components.username_end] != '@') {
1460 ada_log(
1461 "url_aggregator::validate missing : or @ at the end of the username "
1462 "\n",
1463 to_diagram());
1464 return false;
1465 }
1466 }
1467
1468 if (components.host_start != buffer.size()) {
1469 if (components.host_start > components.username_end) {
1470 if (buffer[components.host_start] != '@') {
1471 ada_log(
1472 "url_aggregator::validate missing @ at the end of the password \n",
1473 to_diagram());
1474 return false;
1475 }
1476 } else if (components.host_start == components.username_end &&
1477 components.host_end > components.host_start) {
1478 if (components.host_start == components.protocol_end + 2) {
1479 if (buffer[components.protocol_end] != '/' ||
1480 buffer[components.protocol_end + 1] != '/') {
1481 ada_log(
1482 "url_aggregator::validate missing // between protocol and host "
1483 "\n",
1484 to_diagram());
1485 return false;
1486 }
1487 } else {
1488 if (components.host_start > components.protocol_end &&
1489 buffer[components.host_start] != '@') {
1490 ada_log(
1491 "url_aggregator::validate missing @ at the end of the username "
1492 "\n",
1493 to_diagram());
1494 return false;
1495 }
1496 }
1497 } else {
1498 if (components.host_end != components.host_start) {
1499 ada_log("url_aggregator::validate expected omitted host \n",
1500 to_diagram());
1501 return false;
1502 }
1503 }
1504 }
1505 if (components.host_end != buffer.size() &&
1506 components.pathname_start > components.host_end) {
1507 if (components.pathname_start == components.host_end + 2 &&
1508 buffer[components.host_end] == '/' &&
1509 buffer[components.host_end + 1] == '.') {
1510 if (components.pathname_start + 1 >= buffer.size() ||
1511 buffer[components.pathname_start] != '/' ||
1512 buffer[components.pathname_start + 1] != '/') {
1513 ada_log(
1514 "url_aggregator::validate expected the path to begin with // \n",
1515 to_diagram());
1516 return false;
1517 }
1518 } else if (buffer[components.host_end] != ':') {
1519 ada_log("url_aggregator::validate missing : at the port \n",
1520 to_diagram());
1521 return false;
1522 }
1523 }
1524 if (components.pathname_start != buffer.size() &&
1525 components.pathname_start < components.search_start &&
1526 components.pathname_start < components.hash_start && !has_opaque_path) {
1527 if (buffer[components.pathname_start] != '/') {
1528 ada_log("url_aggregator::validate missing / at the path \n",
1529 to_diagram());
1530 return false;
1531 }
1532 }
1533 if (components.search_start != url_components::omitted) {
1534 if (buffer[components.search_start] != '?') {
1535 ada_log("url_aggregator::validate missing ? at the search \n",
1536 to_diagram());
1537 return false;
1538 }
1539 }
1540 if (components.hash_start != url_components::omitted) {
1541 if (buffer[components.hash_start] != '#') {
1542 ada_log("url_aggregator::validate missing # at the hash \n",
1543 to_diagram());
1544 return false;
1545 }
1546 }
1547
1548 return true;
1549}
1550
1551void url_aggregator::delete_dash_dot() {
1552 ada_log("url_aggregator::delete_dash_dot");
1554 ADA_ASSERT_TRUE(has_dash_dot());
1555 buffer.erase(components.host_end, 2);
1556 components.pathname_start -= 2;
1557 if (components.search_start != url_components::omitted) {
1558 components.search_start -= 2;
1559 }
1560 if (components.hash_start != url_components::omitted) {
1561 components.hash_start -= 2;
1562 }
1564 ADA_ASSERT_TRUE(!has_dash_dot());
1565}
1566
1567inline void url_aggregator::consume_prepared_path(std::string_view input) {
1568 ada_log("url_aggregator::consume_prepared_path ", input);
1569
1578 uint8_t accumulator = checkers::path_signature(input);
1579 // Let us first detect a trivial case.
1580 // If it is special, we check that we have no dot, no %, no \ and no
1581 // character needing percent encoding. Otherwise, we check that we have no %,
1582 // no dot, and no character needing percent encoding.
1583 constexpr uint8_t need_encoding = 1;
1584 constexpr uint8_t backslash_char = 2;
1585 constexpr uint8_t dot_char = 4;
1586 constexpr uint8_t percent_char = 8;
1587 bool special = type != ada::scheme::NOT_SPECIAL;
1588 bool may_need_slow_file_handling = (type == ada::scheme::type::FILE &&
1590 bool trivial_path =
1591 (special ? (accumulator == 0)
1592 : ((accumulator & (need_encoding | dot_char | percent_char)) ==
1593 0)) &&
1594 (!may_need_slow_file_handling);
1595 if (accumulator == dot_char && !may_need_slow_file_handling) {
1596 // '4' means that we have at least one dot, but nothing that requires
1597 // percent encoding or decoding. The only part that is not trivial is
1598 // that we may have single dots and double dots path segments.
1599 // If we have such segments, then we either have a path that begins
1600 // with '.' (easy to check), or we have the sequence './'.
1601 // Note: input cannot be empty, it must at least contain one character ('.')
1602 // Note: we know that '\' is not present.
1603 if (input[0] != '.') {
1604 size_t slashdot = input.find("/.");
1605 if (slashdot == std::string_view::npos) { // common case
1606 trivial_path = true;
1607 } else { // uncommon
1608 // only three cases matter: /./, /.. or a final /
1609 trivial_path =
1610 !(slashdot + 2 == input.size() || input[slashdot + 2] == '.' ||
1611 input[slashdot + 2] == '/');
1612 }
1613 }
1614 }
1615 if (trivial_path && is_at_path()) {
1616 ada_log("parse_path trivial");
1617 buffer += '/';
1618 buffer += input;
1619 return;
1620 }
1621 std::string path = std::string(get_pathname());
1622 // We are going to need to look a bit at the path, but let us see if we can
1623 // ignore percent encoding *and* backslashes *and* percent characters.
1624 // Except for the trivial case, this is likely to capture 99% of paths out
1625 // there.
1626 bool fast_path =
1627 (special &&
1628 (accumulator & (need_encoding | backslash_char | percent_char)) == 0) &&
1629 (type != ada::scheme::type::FILE);
1630 if (fast_path) {
1631 ada_log("parse_prepared_path fast");
1632 // Here we don't need to worry about \ or percent encoding.
1633 // We also do not have a file protocol. We might have dots, however,
1634 // but dots must as appear as '.', and they cannot be encoded because
1635 // the symbol '%' is not present.
1636 size_t previous_location = 0; // We start at 0.
1637 do {
1638 size_t new_location = input.find('/', previous_location);
1639 // std::string_view path_view = input;
1640 // We process the last segment separately:
1641 if (new_location == std::string_view::npos) {
1642 std::string_view path_view = input.substr(previous_location);
1643 if (path_view == "..") { // The path ends with ..
1644 // e.g., if you receive ".." with an empty path, you go to "/".
1645 if (path.empty()) {
1646 path = '/';
1647 update_base_pathname(path);
1648 return;
1649 }
1650 // Fast case where we have nothing to do:
1651 if (path.back() == '/') {
1652 update_base_pathname(path);
1653 return;
1654 }
1655 // If you have the path "/joe/myfriend",
1656 // then you delete 'myfriend'.
1657 path.resize(path.rfind('/') + 1);
1658 update_base_pathname(path);
1659 return;
1660 }
1661 path += '/';
1662 if (path_view != ".") {
1663 path.append(path_view);
1664 }
1665 update_base_pathname(path);
1666 return;
1667 } else {
1668 // This is a non-final segment.
1669 std::string_view path_view =
1670 input.substr(previous_location, new_location - previous_location);
1671 previous_location = new_location + 1;
1672 if (path_view == "..") {
1673 size_t last_delimiter = path.rfind('/');
1674 if (last_delimiter != std::string::npos) {
1675 path.erase(last_delimiter);
1676 }
1677 } else if (path_view != ".") {
1678 path += '/';
1679 path.append(path_view);
1680 }
1681 }
1682 } while (true);
1683 } else {
1684 ada_log("parse_path slow");
1685 // we have reached the general case
1686 bool needs_percent_encoding = (accumulator & 1);
1687 std::string path_buffer_tmp;
1688 do {
1689 size_t location = (special && (accumulator & 2))
1690 ? input.find_first_of("/\\")
1691 : input.find('/');
1692 std::string_view path_view = input;
1693 if (location != std::string_view::npos) {
1694 path_view.remove_suffix(path_view.size() - location);
1695 input.remove_prefix(location + 1);
1696 }
1697 // path_buffer is either path_view or it might point at a percent encoded
1698 // temporary string.
1699 std::string_view path_buffer =
1700 (needs_percent_encoding &&
1701 ada::unicode::percent_encode<false>(
1702 path_view, character_sets::PATH_PERCENT_ENCODE, path_buffer_tmp))
1703 ? path_buffer_tmp
1704 : path_view;
1705 if (unicode::is_double_dot_path_segment(path_buffer)) {
1706 if ((helpers::shorten_path(path, type) || special) &&
1707 location == std::string_view::npos) {
1708 path += '/';
1709 }
1710 } else if (unicode::is_single_dot_path_segment(path_buffer) &&
1711 (location == std::string_view::npos)) {
1712 path += '/';
1713 }
1714 // Otherwise, if path_buffer is not a single-dot path segment, then:
1715 else if (!unicode::is_single_dot_path_segment(path_buffer)) {
1716 // If url's scheme is "file", url's path is empty, and path_buffer is a
1717 // Windows drive letter, then replace the second code point in
1718 // path_buffer with U+003A (:).
1719 if (type == ada::scheme::type::FILE && path.empty() &&
1720 checkers::is_windows_drive_letter(path_buffer)) {
1721 path += '/';
1722 path += path_buffer[0];
1723 path += ':';
1724 path_buffer.remove_prefix(2);
1725 path.append(path_buffer);
1726 } else {
1727 // Append path_buffer to url's path.
1728 path += '/';
1729 path.append(path_buffer);
1730 }
1731 }
1732 if (location == std::string_view::npos) {
1733 update_base_pathname(path);
1734 return;
1735 }
1736 } while (true);
1737 }
1738}
1739} // namespace ada
Includes all definitions for Ada.
Definitions for URL specific checkers used within Ada.
#define ADA_ASSERT_TRUE(COND)
#define ada_lifetime_bound
#define ada_really_inline
Definition common_defs.h:84
Definitions for helper functions used within Ada.
Definitions for user facing functions for parsing URL and its components.
constexpr uint8_t QUERY_PERCENT_ENCODE[32]
constexpr uint8_t SPECIAL_QUERY_PERCENT_ENCODE[32]
constexpr uint8_t PATH_PERCENT_ENCODE[32]
constexpr uint8_t C0_CONTROL_PERCENT_ENCODE[32]
constexpr uint8_t USERINFO_PERCENT_ENCODE[32]
constexpr bool is_windows_drive_letter(std::string_view input) noexcept
bool has_hex_prefix(std::string_view input)
constexpr bool is_alpha(char x) noexcept
constexpr bool is_digit(char x) noexcept
ada_really_inline bool begins_with(std::string_view view, std::string_view prefix)
constexpr ada::scheme::type get_scheme_type(std::string_view scheme) noexcept
Definition scheme-inl.h:72
@ NOT_SPECIAL
Definition scheme.h:32
constexpr uint16_t get_special_port(std::string_view scheme) noexcept
Definition scheme-inl.h:57
std::string ipv6(const std::array< uint16_t, 8 > &address) noexcept
std::string ipv4(uint64_t address) noexcept
ada_really_inline size_t percent_encode_index(const std::string_view input, const uint8_t character_set[])
Definition unicode-inl.h:19
Definition ada_idna.h:13
@ IPV6
Definition url_base.h:32
@ IPV4
Definition url_base.h:27
template ada::result< url_aggregator > parse< url_aggregator >(std::string_view input, const url_aggregator *base_url)
tl::expected< result_type, ada::errors > result
ada_warn_unused ada::result< result_type > parse(std::string_view input, const result_type *base_url=nullptr)
Declarations for the URL scheme.
bool has_non_empty_username() const noexcept
void set_hash(std::string_view input)
void clear_search() override
bool has_hostname() const noexcept
std::string_view get_hostname() const noexcept ada_lifetime_bound
bool has_non_empty_password() const noexcept
ada_really_inline bool has_credentials() const noexcept
std::string to_string() const override
std::string_view get_pathname() const noexcept ada_lifetime_bound
std::string_view get_hash() const noexcept ada_lifetime_bound
std::string to_diagram() const
bool set_protocol(std::string_view input)
std::string get_origin() const noexcept override
bool validate() const noexcept
std::string_view get_search() const noexcept ada_lifetime_bound
bool has_valid_domain() const noexcept override
bool set_hostname(std::string_view input)
bool set_password(std::string_view input)
bool set_pathname(std::string_view input)
std::string_view get_protocol() const noexcept ada_lifetime_bound
std::string_view get_password() const noexcept ada_lifetime_bound
bool set_href(std::string_view input)
void set_search(std::string_view input)
std::string_view get_port() const noexcept ada_lifetime_bound
bool has_port() const noexcept
std::string_view get_href() const noexcept ada_lifetime_bound
bool set_host(std::string_view input)
std::string_view get_host() const noexcept ada_lifetime_bound
bool set_port(std::string_view input)
std::string_view get_username() const noexcept ada_lifetime_bound
bool set_username(std::string_view input)
ada_really_inline bool is_special() const noexcept
url_host_type host_type
Definition url_base.h:60
bool is_valid
Definition url_base.h:50
bool has_opaque_path
Definition url_base.h:55
bool check_offset_consistency() const noexcept
static constexpr uint32_t omitted
Definitions for unicode operations.
Inline functions for url aggregator.
Declaration for the basic URL definitions.
Declaration for the URL Components.