5#ifndef ADA_URL_PATTERN_HELPERS_INL_H
6#define ADA_URL_PATTERN_HELPERS_INL_H
12#include "ada/expected.h"
16#if ADA_INCLUDE_URL_PATTERN
17namespace ada::url_pattern_helpers {
18#if defined(ADA_TESTING) || defined(ADA_LOGGING)
// Returns the enumerator name of a token_type as a string (for example
// token_type::INVALID_CHAR yields "INVALID_CHAR").
// NOTE(review): several case labels below have no visible return statement
// and neither the switch header nor its end is visible here; this listing
// appears to have dropped lines, so confirm against the original header.
19inline std::string
to_string(token_type type) {
21 case token_type::INVALID_CHAR:
22 return "INVALID_CHAR";
23 case token_type::OPEN:
25 case token_type::CLOSE:
27 case token_type::REGEXP:
29 case token_type::NAME:
31 case token_type::CHAR:
33 case token_type::ESCAPED_CHAR:
34 return "ESCAPED_CHAR";
35 case token_type::OTHER_MODIFIER:
36 return "OTHER_MODIFIER";
37 case token_type::ASTERISK:
// Moves the parser back to the beginning of the current component by
// resetting token_index to component_start.
// NOTE(review): the rest of the body is not visible in this listing.
47template <url_pattern_regex::regex_concept regex_prov
ider>
48constexpr void constructor_string_parser<regex_provider>::rewind() {
50 token_index = component_start;
55template <url_pattern_regex::regex_concept regex_prov
ider>
56constexpr bool constructor_string_parser<regex_provider>::is_hash_prefix() {
59 return is_non_special_pattern_char(token_index,
'#');
// Decides whether the current token starts the search component.
// Visible checks, in order: a non-special '?' pattern char at token_index;
// the current token's value compared against "?"; whether this is the very
// first token; and finally the type of the previous token.
// NOTE(review): the bodies of the first two `if`s are not visible here.
62template <url_pattern_regex::regex_concept regex_prov
ider>
63constexpr bool constructor_string_parser<regex_provider>::is_search_prefix() {
66 if (is_non_special_pattern_char(token_index,
'?')) {
72 if (token_list[token_index].value !=
"?") {
// With no previous token to inspect, the answer is true.
77 if (token_index == 0)
return true;
79 auto previous_index = token_index - 1;
82 auto previous_token = get_safe_token(previous_index);
// False when the previous token is a NAME, REGEXP, CLOSE or ASTERISK;
// true for every other previous token type.
89 return !(previous_token->type == token_type::NAME ||
90 previous_token->type == token_type::REGEXP ||
91 previous_token->type == token_type::CLOSE ||
92 previous_token->type == token_type::ASTERISK);
// Tests the token at `index` (fetched through get_safe_token) against the
// code point `value`.
// The final result is true when the token's type is CHAR, ESCAPED_CHAR or
// INVALID_CHAR.
// NOTE(review): the return-type line and the body of the `if` below are not
// visible in this listing.
95template <url_pattern_regex::regex_concept regex_prov
ider>
97constructor_string_parser<regex_provider>::is_non_special_pattern_char(
98 size_t index, uint32_t value)
const {
100 auto token = get_safe_token(index);
// Only the first character of a non-empty token value is compared to `value`.
106 if (!token->value.empty() &&
107 static_cast<uint32_t
>(token->value[0]) != value) {
116 return token->type == token_type::CHAR ||
117 token->type == token_type::ESCAPED_CHAR ||
118 token->type == token_type::INVALID_CHAR;
// Returns a pointer to token_list[index] when `index` is within bounds;
// the other visible return hands back a pointer to the last element of
// token_list instead.
// NOTE(review): lines between the two returns are not visible here.
121template <url_pattern_regex::regex_concept regex_prov
ider>
122constexpr const token*
123constructor_string_parser<regex_provider>::get_safe_token(
size_t index)
const {
// In-range access is annotated as the expected path.
126 if (index < token_list.size()) [[likely]] {
127 return &token_list[index];
138 return &token_list.back();
141template <url_pattern_regex::regex_concept regex_prov
ider>
142constexpr bool constructor_string_parser<regex_provider>::is_group_open()
146 return token_list[token_index].type == token_type::OPEN;
149template <url_pattern_regex::regex_concept regex_prov
ider>
150constexpr bool constructor_string_parser<regex_provider>::is_group_close()
154 return token_list[token_index].type == token_type::CLOSE;
// Examines the two tokens that follow the current one (token_index + 1 and
// token_index + 2), testing each for a non-special '/' pattern char.
// NOTE(review): the return-type line and every return statement are missing
// from this listing.
157template <url_pattern_regex::regex_concept regex_prov
ider>
159constructor_string_parser<regex_provider>::next_is_authority_slashes()
const {
162 if (!is_non_special_pattern_char(token_index + 1,
'/')) {
167 if (!is_non_special_pattern_char(token_index + 2,
'/')) {
173template <url_pattern_regex::regex_concept regex_prov
ider>
174constexpr bool constructor_string_parser<regex_provider>::is_protocol_suffix()
178 return is_non_special_pattern_char(token_index,
':');
// Switches the parser from its current `state` to `new_state`.
// NOTE(review): the remainder of the parameter list, the case bodies and the
// bodies of the conditionals below are not visible in this listing, so only
// the visible conditions are described.
181template <url_pattern_regex::regex_concept regex_prov
ider>
182void constructor_string_parser<regex_provider>::change_state(State new_state,
// When leaving any state other than INIT, AUTHORITY or DONE, build the text
// of the component that just ended.
187 if (state != State::INIT && state != State::AUTHORITY &&
188 state != State::DONE) {
189 auto value = make_component_string();
// Per-component handling keyed on a State value (case bodies not visible).
192 case State::PROTOCOL: {
196 case State::USERNAME: {
200 case State::PASSWORD: {
204 case State::HOSTNAME: {
212 case State::PATHNAME: {
216 case State::SEARCH: {
// Applies only when the old state is not INIT and the new one is not DONE.
230 if (state != State::INIT && new_state != State::DONE) {
// Coming from PROTOCOL/AUTHORITY/USERNAME/PASSWORD and going to
// PORT/PATHNAME/SEARCH/HASH (a further condition is not visible).
235 if ((state == State::PROTOCOL || state == State::AUTHORITY ||
236 state == State::USERNAME || state == State::PASSWORD) &&
237 (new_state == State::PORT || new_state == State::PATHNAME ||
238 new_state == State::SEARCH || new_state == State::HASH) &&
// Coming from any state up to and including PORT and going to SEARCH or HASH
// (a further condition is not visible).
246 if ((state == State::PROTOCOL || state == State::AUTHORITY ||
247 state == State::USERNAME || state == State::PASSWORD ||
248 state == State::HOSTNAME || state == State::PORT) &&
249 (new_state == State::SEARCH || new_state == State::HASH) &&
251 if (protocol_matches_a_special_scheme_flag) {
// Coming from any state up to and including PATHNAME and going to HASH while
// result.search has no value.
263 if ((state == State::PROTOCOL || state == State::AUTHORITY ||
264 state == State::USERNAME || state == State::PASSWORD ||
265 state == State::HOSTNAME || state == State::PORT ||
266 state == State::PATHNAME) &&
267 new_state == State::HASH && !
result.search) {
// The next component begins at the current token.
276 component_start = token_index;
// Returns a copy of the slice of `input` that starts at the input index of
// the component's first token (looked up via get_safe_token(component_start))
// and ends just before the input index of the current token.
281template <url_pattern_regex::regex_concept regex_prov
ider>
282std::string constructor_string_parser<regex_provider>::make_component_string() {
// Input offset of the current token: exclusive end of the slice.
288 const auto end_index = token_list[token_index].index;
291 const auto component_start_token = get_safe_token(component_start);
// Input offset of the first token of the component: start of the slice.
294 const auto component_start_input_index = component_start_token->index;
// substr takes (offset, length), hence the subtraction.
297 return std::string(input.substr(component_start_input_index,
298 end_index - component_start_input_index));
// Returns whether the token at the parser's current position is a
// non-special pattern character equal to '@'.
// NOTE(review): the return-type line is missing from this listing.
301template <url_pattern_regex::regex_concept regex_prov
ider>
303constructor_string_parser<regex_provider>::is_an_identity_terminator()
const {
306 return is_non_special_pattern_char(token_index,
'@');
309template <url_pattern_regex::regex_concept regex_prov
ider>
310constexpr bool constructor_string_parser<regex_provider>::is_pathname_start()
314 return is_non_special_pattern_char(token_index,
'/');
317template <url_pattern_regex::regex_concept regex_prov
ider>
318constexpr bool constructor_string_parser<regex_provider>::is_password_prefix()
322 return is_non_special_pattern_char(token_index,
':');
325template <url_pattern_regex::regex_concept regex_prov
ider>
326constexpr bool constructor_string_parser<regex_provider>::is_an_ipv6_open()
330 return is_non_special_pattern_char(token_index,
'[');
333template <url_pattern_regex::regex_concept regex_prov
ider>
334constexpr bool constructor_string_parser<regex_provider>::is_an_ipv6_close()
338 return is_non_special_pattern_char(token_index,
']');
341template <url_pattern_regex::regex_concept regex_prov
ider>
342constexpr bool constructor_string_parser<regex_provider>::is_port_prefix()
346 return is_non_special_pattern_char(token_index,
':');
// Reads the code point that starts at input[next_index] into `code_point`,
// decoding a UTF-8 style multi-byte sequence when the first byte has its high
// bit set.
// NOTE(review): the assignments to number_bytes, the ASCII-path update of
// next_index and several closing braces are not visible in this listing.
349constexpr void Tokenizer::get_next_code_point() {
350 ada_log(
"Tokenizer::get_next_code_point called with index=", next_index);
354 size_t number_bytes = 0;
355 unsigned char first_byte = input[next_index];
// High bit clear: the byte itself is the code point.
357 if ((first_byte & 0x80) == 0) {
360 code_point = first_byte;
361 ada_log(
"Tokenizer::get_next_code_point returning ASCII code point=",
362 uint32_t(code_point));
363 ada_log(
"Tokenizer::get_next_code_point next_index =", next_index,
364 " input.size()=", input.size());
367 ada_log(
"Tokenizer::get_next_code_point read first byte=",
368 uint32_t(first_byte));
// Lead byte 110xxxxx: keep the low 5 bits.
369 if ((first_byte & 0xE0) == 0xC0) {
370 code_point = first_byte & 0x1F;
372 ada_log(
"Tokenizer::get_next_code_point two bytes");
373 }
else if ((first_byte & 0xF0) == 0xE0) {
// Lead byte 1110xxxx: keep the low 4 bits.
374 code_point = first_byte & 0x0F;
376 ada_log(
"Tokenizer::get_next_code_point three bytes");
377 }
else if ((first_byte & 0xF8) == 0xF0) {
// Lead byte 11110xxx: keep the low 3 bits.
378 code_point = first_byte & 0x07;
380 ada_log(
"Tokenizer::get_next_code_point four bytes");
// Fold in the remaining bytes of the sequence, 6 payload bits at a time.
384 for (
size_t i = 1 + next_index; i < number_bytes + next_index; ++i) {
385 unsigned char byte = input[i];
386 ada_log(
"Tokenizer::get_next_code_point read byte=", uint32_t(
byte));
387 code_point = (code_point << 6) | (
byte & 0x3F);
389 ada_log(
"Tokenizer::get_next_code_point returning non-ASCII code point=",
390 uint32_t(code_point));
391 ada_log(
"Tokenizer::get_next_code_point next_index =", next_index,
392 " input.size()=", input.size());
// Step past the whole multi-byte sequence.
393 next_index += number_bytes;
// Repositions the tokenizer at `new_index` and then reads the code point
// found there via get_next_code_point().
// NOTE(review): the tail of the ada_log call is not visible in this listing.
396constexpr void Tokenizer::seek_and_get_next_code_point(
size_t new_index) {
397 ada_log(
"Tokenizer::seek_and_get_next_code_point called with new_index=",
400 next_index = new_index;
402 get_next_code_point();
// Appends a token to token_list and advances the tokenizer's `index`.
// The token is constructed from `type`, the current `index`, and the
// substring of `input` of length `value_length` starting at `value_position`.
// NOTE(review): the tail of the ada_log call and the lines before the
// emplace_back are not visible in this listing.
405inline void Tokenizer::add_token(token_type type,
size_t next_position,
406 size_t value_position,
size_t value_length) {
407 ada_log(
"Tokenizer::add_token called with type=",
to_string(type),
408 " next_position=", next_position,
" value_position=", value_position);
417 token_list.emplace_back(type, index,
418 input.substr(value_position, value_length));
// Continue tokenizing from next_position.
420 index = next_position;
423inline void Tokenizer::add_token_with_default_length(token_type type,
424 size_t next_position,
425 size_t value_position) {
427 auto computed_length = next_position - value_position;
430 add_token(type, next_position, value_position, computed_length);
// Adds a token of the given type using the tokenizer's own positions:
// next_index as the next position and index as the value position.
// NOTE(review): the tail of the ada_log call is not visible in this listing.
433inline void Tokenizer::add_token_with_defaults(token_type type) {
434 ada_log(
"Tokenizer::add_token_with_defaults called with type=",
438 add_token_with_default_length(type, next_index, index);
// Handles a tokenizing error at the given positions.
// Under token_policy::strict it logs and returns errors::type_error;
// otherwise it records an INVALID_CHAR token via
// add_token_with_default_length.
// NOTE(review): the return-type line, the remaining arguments of the final
// call and the trailing return are not visible in this listing.
442Tokenizer::process_tokenizing_error(
size_t next_position,
443 size_t value_position) {
445 if (policy == token_policy::strict) {
446 ada_log(
"process_tokenizing_error failed with next_position=",
447 next_position,
" value_position=", value_position);
448 return errors::type_error;
454 add_token_with_default_length(token_type::INVALID_CHAR, next_position,
459template <url_pattern_encoding_callback F>
460token* url_pattern_parser<F>::try_consume_modifier_token() {
463 auto token = try_consume_token(token_type::OTHER_MODIFIER);
465 if (token)
return token;
469 return try_consume_token(token_type::ASTERISK);
// Tries to consume a REGEXP token; when that fails and no name token was
// supplied, tries an ASTERISK token instead.
// NOTE(review): the return statement is not visible in this listing.
472template <url_pattern_encoding_callback F>
473token* url_pattern_parser<F>::try_consume_regexp_or_wildcard_token(
474 const token* name_token) {
477 auto token = try_consume_token(token_type::REGEXP);
// The ASTERISK fallback is only attempted when there is no name token.
480 if (!name_token && !token) {
481 token = try_consume_token(token_type::ASTERISK);
// Inspects the token at tokens[index] and returns nullptr when its type is
// not `type`.
// NOTE(review): the tail of the ada_log call and the success path are not
// visible in this listing.
487template <url_pattern_encoding_callback F>
488token* url_pattern_parser<F>::try_consume_token(token_type type) {
489 ada_log(
"url_pattern_parser::try_consume_token called with type=",
494 auto& next_token = tokens[index];
496 if (next_token.type != type)
return nullptr;
// Builds a string from consumed text tokens: a CHAR token is tried first,
// then an ESCAPED_CHAR token, and the token's value is appended to `result`.
// NOTE(review): the declaration of `result`, any surrounding loop and the
// return statement are not visible in this listing.
503template <url_pattern_encoding_callback F>
504std::string url_pattern_parser<F>::consume_text() {
511 auto token = try_consume_token(token_type::CHAR);
514 if (!token) token = try_consume_token(token_type::ESCAPED_CHAR);
518 result.append(token->value);
// Returns true when a token of the given type could be consumed, i.e. when
// try_consume_token(type) yields a non-null pointer.
// NOTE(review): the tail of the ada_log call is not visible in this listing.
524template <url_pattern_encoding_callback F>
525bool url_pattern_parser<F>::consume_required_token(token_type type) {
526 ada_log(
"url_pattern_parser::consume_required_token called with type=",
530 return try_consume_token(type) !=
nullptr;
// Flushes pending_fixed_value into `parts` as a FIXED_TEXT part.
// The pending text is passed through encoding_callback; an encoding failure
// is returned to the caller as the callback's error.
// NOTE(review): the return-type line, the early exit for the empty case and
// the final return are not visible in this listing.
533template <url_pattern_encoding_callback F>
535url_pattern_parser<F>::maybe_add_part_from_the_pending_fixed_value() {
537 if (pending_fixed_value.empty()) {
538 ada_log(
"pending_fixed_value is empty");
543 auto encoded_value = encoding_callback(pending_fixed_value);
544 if (!encoded_value) {
545 ada_log(
"failed to encode pending_fixed_value: ", pending_fixed_value);
546 return encoded_value.error();
// The pending buffer is emptied once its encoded form has been obtained.
549 pending_fixed_value.clear();
// The encoded text becomes a FIXED_TEXT part with no modifier.
553 parts.emplace_back(url_pattern_part_type::FIXED_TEXT,
554 std::move(*encoded_value),
555 url_pattern_part_modifier::none);
// Adds a part to `parts` from an optional prefix, name token, regexp or
// wildcard token, suffix and modifier token.
// Returns an error when encoding the prefix or suffix fails, or
// errors::type_error when a part with the same name already exists.
// NOTE(review): many lines (returns, closing braces, the declaration of
// `name`) are missing from this listing; comments below cover only what is
// visible.
559template <url_pattern_encoding_callback F>
560std::optional<errors> url_pattern_parser<F>::add_part(
561 std::string_view prefix, token* name_token, token* regexp_or_wildcard_token,
562 std::string_view suffix, token* modifier_token) {
564 auto modifier = url_pattern_part_modifier::none;
// Translate the modifier token's text: "?" optional, "*" zero_or_more,
// "+" one_or_more.
566 if (modifier_token) {
568 if (modifier_token->value ==
"?") {
569 modifier = url_pattern_part_modifier::optional;
570 }
else if (modifier_token->value ==
"*") {
573 modifier = url_pattern_part_modifier::zero_or_more;
574 }
else if (modifier_token->value ==
"+") {
577 modifier = url_pattern_part_modifier::one_or_more;
// Plain text with no name, no regexp/wildcard and no modifier is only
// accumulated into pending_fixed_value.
582 if (!name_token && !regexp_or_wildcard_token &&
583 modifier == url_pattern_part_modifier::none) {
585 pending_fixed_value.append(prefix);
// Flush any accumulated fixed text before handling this part.
589 if (
auto error = maybe_add_part_from_the_pending_fixed_value()) {
// No name and no regexp/wildcard: the prefix (if any) becomes a FIXED_TEXT
// part carrying the modifier.
593 if (!name_token && !regexp_or_wildcard_token) {
597 if (prefix.empty())
return std::nullopt;
600 auto encoded_value = encoding_callback(prefix);
601 if (!encoded_value) {
602 return encoded_value.error();
607 parts.emplace_back(url_pattern_part_type::FIXED_TEXT,
608 std::move(*encoded_value), modifier);
612 std::string regexp_value{};
// Without a regexp/wildcard token the segment wildcard regexp is used.
615 if (!regexp_or_wildcard_token) {
616 regexp_value = segment_wildcard_regexp;
617 }
else if (regexp_or_wildcard_token->type == token_type::ASTERISK) {
623 regexp_value = regexp_or_wildcard_token->value;
// Classify the part: REGEXP by default, SEGMENT_WILDCARD or FULL_WILDCARD
// when the regexp text equals the corresponding wildcard expression (the
// text is cleared in those two cases).
626 auto type = url_pattern_part_type::REGEXP;
628 if (regexp_value == segment_wildcard_regexp) {
630 type = url_pattern_part_type::SEGMENT_WILDCARD;
632 regexp_value.clear();
633 }
else if (regexp_value ==
".*") {
636 type = url_pattern_part_type::FULL_WILDCARD;
638 regexp_value.clear();
644 name = name_token->value;
645 }
else if (regexp_or_wildcard_token !=
nullptr) {
// Unnamed groups take their name from next_numeric_name.
648 name = std::to_string(next_numeric_name);
// A name already used by an existing part is a type error.
654 if (std::ranges::any_of(
655 parts, [&name](
const auto& part) {
return part.name == name; })) {
656 return errors::type_error;
660 auto encoded_prefix = encoding_callback(prefix);
661 if (!encoded_prefix)
return encoded_prefix.error();
664 auto encoded_suffix = encoding_callback(suffix);
665 if (!encoded_suffix)
return encoded_suffix.error();
670 parts.emplace_back(type, std::move(regexp_value), modifier, std::move(name),
671 std::move(*encoded_prefix), std::move(*encoded_suffix));
// Parses a pattern string into a list of url_pattern_part values.
// The input is tokenized with token_policy::strict and then consumed by a
// url_pattern_parser built from `encoding_callback` and the segment wildcard
// regexp generated from `options`. Failures are reported as tl::unexpected.
// NOTE(review): several lines (closing braces, `continue`s, parts of `if`
// headers, the final return) are missing from this listing.
675template <url_pattern_encoding_callback F>
676tl::expected<std::vector<url_pattern_part>,
errors> parse_pattern_string(
677 std::string_view input, url_pattern_compile_component_options& options,
678 F& encoding_callback) {
679 ada_log(
"parse_pattern_string input=", input);
683 auto parser = url_pattern_parser<F>(
684 encoding_callback, generate_segment_wildcard_regexp(options));
// Tokenizing errors are propagated to the caller.
687 auto tokenize_result = tokenize(input, token_policy::strict);
688 if (!tokenize_result) {
689 ada_log(
"parse_pattern_string tokenize failed");
690 return tl::unexpected(tokenize_result.error());
692 parser.tokens = std::move(*tokenize_result);
695 while (parser.can_continue()) {
// Look for an optional CHAR, then a NAME and/or a regexp/wildcard token.
698 auto char_token = parser.try_consume_token(token_type::CHAR);
701 auto name_token = parser.try_consume_token(token_type::NAME);
704 auto regexp_or_wildcard_token =
705 parser.try_consume_regexp_or_wildcard_token(name_token);
// A matching group that is not wrapped in an explicit OPEN/CLOSE pair.
707 if (name_token || regexp_or_wildcard_token) {
709 std::string prefix{};
711 if (char_token) prefix = char_token->value;
// A prefix that differs from the configured prefix is treated as fixed text.
713 if (!prefix.empty() && prefix != options.get_prefix()) {
715 parser.pending_fixed_value.append(prefix);
720 if (
auto error = parser.maybe_add_part_from_the_pending_fixed_value()) {
721 ada_log(
"maybe_add_part_from_the_pending_fixed_value failed");
722 return tl::unexpected(*error);
726 auto modifier_token = parser.try_consume_modifier_token();
730 parser.add_part(prefix, name_token, regexp_or_wildcard_token,
"",
732 ada_log(
"parser.add_part failed");
733 return tl::unexpected(*error);
// Otherwise try to treat the token as fixed text (CHAR or ESCAPED_CHAR).
740 auto fixed_token = char_token;
744 fixed_token = parser.try_consume_token(token_type::ESCAPED_CHAR);
748 parser.pending_fixed_value.append(fixed_token->value);
// Explicit group: OPEN, prefix text, name and/or regexp, suffix text, CLOSE.
754 auto open_token = parser.try_consume_token(token_type::OPEN);
758 auto prefix_ = parser.consume_text();
761 name_token = parser.try_consume_token(token_type::NAME);
764 regexp_or_wildcard_token =
765 parser.try_consume_regexp_or_wildcard_token(name_token);
767 auto suffix_ = parser.consume_text();
// A missing CLOSE token is a type error.
769 if (!parser.consume_required_token(token_type::CLOSE)) {
770 ada_log(
"parser.consume_required_token failed");
771 return tl::unexpected(errors::type_error);
775 auto modifier_token = parser.try_consume_modifier_token();
779 parser.add_part(prefix_, name_token, regexp_or_wildcard_token,
780 suffix_, modifier_token)) {
781 return tl::unexpected(*error);
787 if (
auto error = parser.maybe_add_part_from_the_pending_fixed_value()) {
788 ada_log(
"maybe_add_part_from_the_pending_fixed_value failed on line 992");
789 return tl::unexpected(*error);
// An END token is required here; its absence is a type error.
792 if (!parser.consume_required_token(token_type::END)) {
793 return tl::unexpected(errors::type_error);
796 ada_log(
"parser.parts size is: ", parser.parts.size());
// Determines whether a compiled protocol component matches one of the
// schemes "http", "https", "ws", "wss" or "ftp".
// EXACT_MATCH components compare exact_match_value against those strings;
// REGEXP components run regex_provider::regex_match on each of them.
// NOTE(review): the results for the EMPTY and FULL_WILDCARD cases are not
// visible in this listing.
801template <url_pattern_regex::regex_concept regex_prov
ider>
802bool protocol_component_matches_special_scheme(
803 url_pattern_component<regex_provider>& component) {
805 switch (component.type) {
806 case url_pattern_component_type::EMPTY:
809 case url_pattern_component_type::EXACT_MATCH:
811 return component.exact_match_value ==
"http" ||
812 component.exact_match_value ==
"https" ||
813 component.exact_match_value ==
"ws" ||
814 component.exact_match_value ==
"wss" ||
815 component.exact_match_value ==
"ftp";
816 case url_pattern_component_type::FULL_WILDCARD:
819 case url_pattern_component_type::REGEXP:
// Each scheme is matched against the component's compiled regexp.
821 auto& regex = component.regexp;
822 return regex_provider::regex_match(
"http", regex) ||
823 regex_provider::regex_match(
"https", regex) ||
824 regex_provider::regex_match(
"ws", regex) ||
825 regex_provider::regex_match(
"wss", regex) ||
826 regex_provider::regex_match(
"ftp", regex);
// Compiles the current component string as a protocol component and sets
// protocol_matches_a_special_scheme_flag to true when
// protocol_component_matches_special_scheme() accepts it. A compile failure
// is returned as the error.
// NOTE(review): most of the function header, the log call and the final
// return are missing from this listing.
831template <url_pattern_regex::regex_concept regex_prov
ider>
832inline std::optional<errors> constructor_string_parser<
835 "constructor_string_parser::compute_protocol_matches_special_scheme_"
839 auto protocol_string = make_component_string();
// Compile with the protocol canonicalizer and the DEFAULT component options.
842 auto protocol_component = url_pattern_component<regex_provider>::compile(
843 protocol_string, canonicalize_protocol,
844 url_pattern_compile_component_options::DEFAULT);
845 if (!protocol_component) {
846 ada_log(
"url_pattern_component::compile failed for protocol_string ",
848 return protocol_component.error();
853 if (protocol_component_matches_special_scheme(*protocol_component)) {
854 protocol_matches_a_special_scheme_flag =
true;
// Parses a URLPattern constructor string into a url_pattern_init.
// The input is tokenized with token_policy::lenient, then a state machine
// walks the tokens and calls change_state() whenever a component boundary
// is recognised. Errors are reported as tl::unexpected.
// NOTE(review): many lines (case labels, `continue`/`break`, rewind calls,
// closing braces, the declaration of `skip`) are missing from this listing;
// comments below describe only the visible statements.
859template <url_pattern_regex::regex_concept regex_prov
ider>
860tl::expected<url_pattern_init, errors>
861constructor_string_parser<regex_provider>::parse(std::string_view input) {
862 ada_log(
"constructor_string_parser::parse input=", input);
865 auto token_list = tokenize(input, token_policy::lenient);
867 return tl::unexpected(token_list.error());
869 auto parser = constructor_string_parser(input, std::move(*token_list));
872 while (parser.token_index < parser.token_list.size()) {
// Each iteration starts with a step of one token.
874 parser.token_increment = 1;
// Handling for the END token.
877 if (parser.token_list[parser.token_index].type == token_type::END) {
// END reached while still in INIT: pick the component from the prefix
// found at the current token (hash, search, otherwise pathname).
879 if (parser.state == State::INIT) {
884 if (parser.is_hash_prefix()) {
885 parser.change_state(State::HASH, 1);
886 }
else if (parser.is_search_prefix()) {
889 parser.change_state(State::SEARCH, 1);
892 parser.change_state(State::PATHNAME, 0);
895 parser.token_index += parser.token_increment;
// END reached while in AUTHORITY: switch to HOSTNAME.
900 if (parser.state == State::AUTHORITY) {
904 parser.change_state(State::HOSTNAME, 0);
906 parser.token_index += parser.token_increment;
912 parser.change_state(State::DONE, 0);
// Track nesting of groups: an OPEN token increases group_depth.
918 if (parser.is_group_open()) {
920 parser.group_depth += 1;
922 parser.token_index += parser.token_increment;
// Inside a group: a CLOSE token decreases group_depth.
926 if (parser.group_depth > 0) {
929 if (parser.is_group_close()) {
930 parser.group_depth -= 1;
933 parser.token_index += parser.token_increment;
939 switch (parser.state) {
// A protocol suffix (':') switches to PROTOCOL.
942 if (parser.is_protocol_suffix()) {
945 parser.change_state(State::PROTOCOL, 0);
949 case State::PROTOCOL: {
951 if (parser.is_protocol_suffix()) {
// Work out whether the protocol is a special scheme; failures are returned.
953 if (
const auto error =
954 parser.compute_protocol_matches_special_scheme_flag()) {
955 ada_log(
"compute_protocol_matches_special_scheme_flag failed");
956 return tl::unexpected(*error);
// PATHNAME by default; AUTHORITY when "//" follows or when the protocol
// matches a special scheme.
959 auto next_state = State::PATHNAME;
964 if (parser.next_is_authority_slashes()) {
966 next_state = State::AUTHORITY;
969 }
else if (parser.protocol_matches_a_special_scheme_flag) {
972 next_state = State::AUTHORITY;
976 parser.change_state(next_state, skip);
980 case State::AUTHORITY: {
// '@' means credentials are present: go to USERNAME. A pathname start,
// search prefix or hash prefix leads to HOSTNAME instead.
983 if (parser.is_an_identity_terminator()) {
985 parser.change_state(State::USERNAME, 0);
986 }
else if (parser.is_pathname_start() || parser.is_search_prefix() ||
987 parser.is_hash_prefix()) {
994 parser.change_state(State::HOSTNAME, 0);
998 case State::USERNAME: {
// ':' starts the password; '@' ends the credentials.
1001 if (parser.is_password_prefix()) {
1002 parser.change_state(State::PASSWORD, 1);
1003 }
else if (parser.is_an_identity_terminator()) {
1007 parser.change_state(State::HOSTNAME, 1);
1011 case State::PASSWORD: {
1014 if (parser.is_an_identity_terminator()) {
1015 parser.change_state(State::HOSTNAME, 1);
1019 case State::HOSTNAME: {
// '[' and ']' adjust hostname_ipv6_bracket_depth; a ':' only starts the port
// when that depth is zero.
1022 if (parser.is_an_ipv6_open()) {
1023 parser.hostname_ipv6_bracket_depth += 1;
1024 }
else if (parser.is_an_ipv6_close()) {
1027 parser.hostname_ipv6_bracket_depth -= 1;
1028 }
else if (parser.is_port_prefix() &&
1029 parser.hostname_ipv6_bracket_depth == 0) {
1033 parser.change_state(State::PORT, 1);
1034 }
else if (parser.is_pathname_start()) {
1037 parser.change_state(State::PATHNAME, 0);
1038 }
else if (parser.is_search_prefix()) {
1041 parser.change_state(State::SEARCH, 1);
1042 }
else if (parser.is_hash_prefix()) {
1045 parser.change_state(State::HASH, 1);
// NOTE(review): the case label for the following branch is not visible.
1053 if (parser.is_pathname_start()) {
1054 parser.change_state(State::PATHNAME, 0);
1055 }
else if (parser.is_search_prefix()) {
1058 parser.change_state(State::SEARCH, 1);
1059 }
else if (parser.is_hash_prefix()) {
1062 parser.change_state(State::HASH, 1);
1066 case State::PATHNAME: {
1069 if (parser.is_search_prefix()) {
1070 parser.change_state(State::SEARCH, 1);
1071 }
else if (parser.is_hash_prefix()) {
1074 parser.change_state(State::HASH, 1);
1078 case State::SEARCH: {
1081 if (parser.is_hash_prefix()) {
1082 parser.change_state(State::HASH, 1);
// Advance by the step chosen for this iteration.
1097 parser.token_index += parser.token_increment;
// A result with a hostname but no port gets an empty port string.
1102 if (parser.result.hostname && !parser.result.port) {
1103 parser.result.port =
"";
1107 return parser.result;
Cross-platform compiler macros and common definitions.
#define ADA_ASSERT_TRUE(COND)
User-facing functions for URL parsing and manipulation.
type
Enumeration of URL scheme types.
errors
Error codes for URL parsing operations.
state
States in the URL parsing state machine.
ada_warn_unused std::string_view to_string(encoding_type type)
tl::expected< result_type, ada::errors > result
ada::url_pattern_regex::std_regex_provider regex_provider
Declaration for the URLPattern helpers.