Ada 3.4.0
Fast spec-compliant URL parser
Loading...
Searching...
No Matches
url_pattern_helpers-inl.h
Go to the documentation of this file.
1
5#ifndef ADA_URL_PATTERN_HELPERS_INL_H
6#define ADA_URL_PATTERN_HELPERS_INL_H
7
8#include <optional>
9#include <string_view>
10
11#include "ada/common_defs.h"
12#include "ada/expected.h"
14#include "ada/implementation.h"
15
16#if ADA_INCLUDE_URL_PATTERN
17namespace ada::url_pattern_helpers {
18#if defined(ADA_TESTING) || defined(ADA_LOGGING)
19inline std::string to_string(token_type type) {
20 switch (type) {
21 case token_type::INVALID_CHAR:
22 return "INVALID_CHAR";
23 case token_type::OPEN:
24 return "OPEN";
25 case token_type::CLOSE:
26 return "CLOSE";
27 case token_type::REGEXP:
28 return "REGEXP";
29 case token_type::NAME:
30 return "NAME";
31 case token_type::CHAR:
32 return "CHAR";
33 case token_type::ESCAPED_CHAR:
34 return "ESCAPED_CHAR";
35 case token_type::OTHER_MODIFIER:
36 return "OTHER_MODIFIER";
37 case token_type::ASTERISK:
38 return "ASTERISK";
39 case token_type::END:
40 return "END";
41 default:
43 }
44}
45#endif // defined(ADA_TESTING) || defined(ADA_LOGGING)
46
47template <url_pattern_regex::regex_concept regex_provider>
48constexpr void constructor_string_parser<regex_provider>::rewind() {
49 // Set parser's token index to parser's component start.
50 token_index = component_start;
51 // Set parser's token increment to 0.
52 token_increment = 0;
53}
54
55template <url_pattern_regex::regex_concept regex_provider>
56constexpr bool constructor_string_parser<regex_provider>::is_hash_prefix() {
57 // Return the result of running is a non-special pattern char given parser,
58 // parser's token index and "#".
59 return is_non_special_pattern_char(token_index, '#');
60}
61
62template <url_pattern_regex::regex_concept regex_provider>
63constexpr bool constructor_string_parser<regex_provider>::is_search_prefix() {
64 // If result of running is a non-special pattern char given parser, parser's
65 // token index and "?" is true, then return true.
66 if (is_non_special_pattern_char(token_index, '?')) {
67 return true;
68 }
69
70 // If parser's token list[parser's token index]'s value is not "?", then
71 // return false.
72 if (token_list[token_index].value != "?") {
73 return false;
74 }
75
76 // If previous index is less than 0, then return true.
77 if (token_index == 0) return true;
78 // Let previous index be parser's token index - 1.
79 auto previous_index = token_index - 1;
80 // Let previous token be the result of running get a safe token given parser
81 // and previous index.
82 auto previous_token = get_safe_token(previous_index);
83 ADA_ASSERT_TRUE(previous_token);
84 // If any of the following are true, then return false:
85 // - previous token's type is "name".
86 // - previous token's type is "regexp".
87 // - previous token's type is "close".
88 // - previous token's type is "asterisk".
89 return !(previous_token->type == token_type::NAME ||
90 previous_token->type == token_type::REGEXP ||
91 previous_token->type == token_type::CLOSE ||
92 previous_token->type == token_type::ASTERISK);
93}
94
95template <url_pattern_regex::regex_concept regex_provider>
96constexpr bool
97constructor_string_parser<regex_provider>::is_non_special_pattern_char(
98 size_t index, uint32_t value) const {
99 // Let token be the result of running get a safe token given parser and index.
100 auto token = get_safe_token(index);
101 ADA_ASSERT_TRUE(token);
102
103 // If token's value is not value, then return false.
104 // TODO: Remove this once we make sure get_safe_token returns a non-empty
105 // string.
106 if (!token->value.empty() &&
107 static_cast<uint32_t>(token->value[0]) != value) {
108 return false;
109 }
110
111 // If any of the following are true:
112 // - token's type is "char";
113 // - token's type is "escaped-char"; or
114 // - token's type is "invalid-char",
115 // - then return true.
116 return token->type == token_type::CHAR ||
117 token->type == token_type::ESCAPED_CHAR ||
118 token->type == token_type::INVALID_CHAR;
119}
120
121template <url_pattern_regex::regex_concept regex_provider>
122constexpr const token*
123constructor_string_parser<regex_provider>::get_safe_token(size_t index) const {
124 // If index is less than parser's token list's size, then return parser's
125 // token list[index].
126 if (index < token_list.size()) [[likely]] {
127 return &token_list[index];
128 }
129
130 // Assert: parser's token list's size is greater than or equal to 1.
131 ADA_ASSERT_TRUE(!token_list.empty());
132
133 // Let token be parser's token list[last index].
134 // Assert: token's type is "end".
135 ADA_ASSERT_TRUE(token_list.back().type == token_type::END);
136
137 // Return token.
138 return &token_list.back();
139}
140
141template <url_pattern_regex::regex_concept regex_provider>
142constexpr bool constructor_string_parser<regex_provider>::is_group_open()
143 const {
144 // If parser's token list[parser's token index]'s type is "open", then return
145 // true.
146 return token_list[token_index].type == token_type::OPEN;
147}
148
149template <url_pattern_regex::regex_concept regex_provider>
150constexpr bool constructor_string_parser<regex_provider>::is_group_close()
151 const {
152 // If parser's token list[parser's token index]'s type is "close", then return
153 // true.
154 return token_list[token_index].type == token_type::CLOSE;
155}
156
157template <url_pattern_regex::regex_concept regex_provider>
158constexpr bool
159constructor_string_parser<regex_provider>::next_is_authority_slashes() const {
160 // If the result of running is a non-special pattern char given parser,
161 // parser's token index + 1, and "/" is false, then return false.
162 if (!is_non_special_pattern_char(token_index + 1, '/')) {
163 return false;
164 }
165 // If the result of running is a non-special pattern char given parser,
166 // parser's token index + 2, and "/" is false, then return false.
167 if (!is_non_special_pattern_char(token_index + 2, '/')) {
168 return false;
169 }
170 return true;
171}
172
173template <url_pattern_regex::regex_concept regex_provider>
174constexpr bool constructor_string_parser<regex_provider>::is_protocol_suffix()
175 const {
176 // Return the result of running is a non-special pattern char given parser,
177 // parser's token index, and ":".
178 return is_non_special_pattern_char(token_index, ':');
179}
180
181template <url_pattern_regex::regex_concept regex_provider>
182void constructor_string_parser<regex_provider>::change_state(State new_state,
183 size_t skip) {
184 // If parser's state is not "init", not "authority", and not "done", then set
185 // parser's result[parser's state] to the result of running make a component
186 // string given parser.
187 if (state != State::INIT && state != State::AUTHORITY &&
188 state != State::DONE) {
189 auto value = make_component_string();
190 // TODO: Simplify this.
191 switch (state) {
192 case State::PROTOCOL: {
193 result.protocol = value;
194 break;
195 }
196 case State::USERNAME: {
197 result.username = value;
198 break;
199 }
200 case State::PASSWORD: {
201 result.password = value;
202 break;
203 }
204 case State::HOSTNAME: {
205 result.hostname = value;
206 break;
207 }
208 case State::PORT: {
209 result.port = value;
210 break;
211 }
212 case State::PATHNAME: {
213 result.pathname = value;
214 break;
215 }
216 case State::SEARCH: {
217 result.search = value;
218 break;
219 }
220 case State::HASH: {
221 result.hash = value;
222 break;
223 }
224 default:
226 }
227 }
228
229 // If parser's state is not "init" and new state is not "done", then:
230 if (state != State::INIT && new_state != State::DONE) {
231 // If parser's state is "protocol", "authority", "username", or "password";
232 // new state is "port", "pathname", "search", or "hash"; and parser's
233 // result["hostname"] does not exist, then set parser's result["hostname"]
234 // to the empty string.
235 if ((state == State::PROTOCOL || state == State::AUTHORITY ||
236 state == State::USERNAME || state == State::PASSWORD) &&
237 (new_state == State::PORT || new_state == State::PATHNAME ||
238 new_state == State::SEARCH || new_state == State::HASH) &&
239 !result.hostname)
240 result.hostname = "";
241 }
242
243 // If parser's state is "protocol", "authority", "username", "password",
244 // "hostname", or "port"; new state is "search" or "hash"; and parser's
245 // result["pathname"] does not exist, then:
246 if ((state == State::PROTOCOL || state == State::AUTHORITY ||
247 state == State::USERNAME || state == State::PASSWORD ||
248 state == State::HOSTNAME || state == State::PORT) &&
249 (new_state == State::SEARCH || new_state == State::HASH) &&
250 !result.pathname) {
251 if (protocol_matches_a_special_scheme_flag) {
252 result.pathname = "/";
253 } else {
254 // Otherwise, set parser's result["pathname"] to the empty string.
255 result.pathname = "";
256 }
257 }
258
259 // If parser's state is "protocol", "authority", "username", "password",
260 // "hostname", "port", or "pathname"; new state is "hash"; and parser's
261 // result["search"] does not exist, then set parser's result["search"] to
262 // the empty string.
263 if ((state == State::PROTOCOL || state == State::AUTHORITY ||
264 state == State::USERNAME || state == State::PASSWORD ||
265 state == State::HOSTNAME || state == State::PORT ||
266 state == State::PATHNAME) &&
267 new_state == State::HASH && !result.search) {
268 result.search = "";
269 }
270
271 // Set parser's state to new state.
272 state = new_state;
273 // Increment parser's token index by skip.
274 token_index += skip;
275 // Set parser's component start to parser's token index.
276 component_start = token_index;
277 // Set parser's token increment to 0.
278 token_increment = 0;
279}
280
281template <url_pattern_regex::regex_concept regex_provider>
282std::string constructor_string_parser<regex_provider>::make_component_string() {
283 // Assert: parser's token index is less than parser's token list's size.
284 ADA_ASSERT_TRUE(token_index < token_list.size());
285
286 // Let token be parser's token list[parser's token index].
287 // Let end index be token's index.
288 const auto end_index = token_list[token_index].index;
289 // Let component start token be the result of running get a safe token given
290 // parser and parser's component start.
291 const auto component_start_token = get_safe_token(component_start);
292 ADA_ASSERT_TRUE(component_start_token);
293 // Let component start input index be component start token's index.
294 const auto component_start_input_index = component_start_token->index;
295 // Return the code point substring from component start input index to end
296 // index within parser's input.
297 return std::string(input.substr(component_start_input_index,
298 end_index - component_start_input_index));
299}
300
301template <url_pattern_regex::regex_concept regex_provider>
302constexpr bool
303constructor_string_parser<regex_provider>::is_an_identity_terminator() const {
304 // Return the result of running is a non-special pattern char given parser,
305 // parser's token index, and "@".
306 return is_non_special_pattern_char(token_index, '@');
307}
308
309template <url_pattern_regex::regex_concept regex_provider>
310constexpr bool constructor_string_parser<regex_provider>::is_pathname_start()
311 const {
312 // Return the result of running is a non-special pattern char given parser,
313 // parser's token index, and "/".
314 return is_non_special_pattern_char(token_index, '/');
315}
316
317template <url_pattern_regex::regex_concept regex_provider>
318constexpr bool constructor_string_parser<regex_provider>::is_password_prefix()
319 const {
320 // Return the result of running is a non-special pattern char given parser,
321 // parser's token index, and ":".
322 return is_non_special_pattern_char(token_index, ':');
323}
324
325template <url_pattern_regex::regex_concept regex_provider>
326constexpr bool constructor_string_parser<regex_provider>::is_an_ipv6_open()
327 const {
328 // Return the result of running is a non-special pattern char given parser,
329 // parser's token index, and "[".
330 return is_non_special_pattern_char(token_index, '[');
331}
332
333template <url_pattern_regex::regex_concept regex_provider>
334constexpr bool constructor_string_parser<regex_provider>::is_an_ipv6_close()
335 const {
336 // Return the result of running is a non-special pattern char given parser,
337 // parser's token index, and "]".
338 return is_non_special_pattern_char(token_index, ']');
339}
340
341template <url_pattern_regex::regex_concept regex_provider>
342constexpr bool constructor_string_parser<regex_provider>::is_port_prefix()
343 const {
344 // Return the result of running is a non-special pattern char given parser,
345 // parser's token index, and ":".
346 return is_non_special_pattern_char(token_index, ':');
347}
348
349constexpr void Tokenizer::get_next_code_point() {
350 ada_log("Tokenizer::get_next_code_point called with index=", next_index);
351 ADA_ASSERT_TRUE(next_index < input.size());
352 // this assumes that we have a valid, non-truncated UTF-8 stream.
353 code_point = 0;
354 size_t number_bytes = 0;
355 unsigned char first_byte = input[next_index];
356
357 if ((first_byte & 0x80) == 0) {
358 // 1-byte character (ASCII)
359 next_index++;
360 code_point = first_byte;
361 ada_log("Tokenizer::get_next_code_point returning ASCII code point=",
362 uint32_t(code_point));
363 ada_log("Tokenizer::get_next_code_point next_index =", next_index,
364 " input.size()=", input.size());
365 return;
366 }
367 ada_log("Tokenizer::get_next_code_point read first byte=",
368 uint32_t(first_byte));
369 if ((first_byte & 0xE0) == 0xC0) {
370 code_point = first_byte & 0x1F;
371 number_bytes = 2;
372 ada_log("Tokenizer::get_next_code_point two bytes");
373 } else if ((first_byte & 0xF0) == 0xE0) {
374 code_point = first_byte & 0x0F;
375 number_bytes = 3;
376 ada_log("Tokenizer::get_next_code_point three bytes");
377 } else if ((first_byte & 0xF8) == 0xF0) {
378 code_point = first_byte & 0x07;
379 number_bytes = 4;
380 ada_log("Tokenizer::get_next_code_point four bytes");
381 }
382 ADA_ASSERT_TRUE(number_bytes + next_index <= input.size());
383
384 for (size_t i = 1 + next_index; i < number_bytes + next_index; ++i) {
385 unsigned char byte = input[i];
386 ada_log("Tokenizer::get_next_code_point read byte=", uint32_t(byte));
387 code_point = (code_point << 6) | (byte & 0x3F);
388 }
389 ada_log("Tokenizer::get_next_code_point returning non-ASCII code point=",
390 uint32_t(code_point));
391 ada_log("Tokenizer::get_next_code_point next_index =", next_index,
392 " input.size()=", input.size());
393 next_index += number_bytes;
394}
395
396constexpr void Tokenizer::seek_and_get_next_code_point(size_t new_index) {
397 ada_log("Tokenizer::seek_and_get_next_code_point called with new_index=",
398 new_index);
399 // Set tokenizer's next index to index.
400 next_index = new_index;
401 // Run get the next code point given tokenizer.
402 get_next_code_point();
403}
404
405inline void Tokenizer::add_token(token_type type, size_t next_position,
406 size_t value_position, size_t value_length) {
407 ada_log("Tokenizer::add_token called with type=", to_string(type),
408 " next_position=", next_position, " value_position=", value_position);
409 ADA_ASSERT_TRUE(next_position >= value_position);
410
411 // Let token be a new token.
412 // Set token's type to type.
413 // Set token's index to tokenizer's index.
414 // Set token's value to the code point substring from value position with
415 // length value length within tokenizer's input.
416 // Append token to the back of tokenizer's token list.
417 token_list.emplace_back(type, index,
418 input.substr(value_position, value_length));
419 // Set tokenizer's index to next position.
420 index = next_position;
421}
422
423inline void Tokenizer::add_token_with_default_length(token_type type,
424 size_t next_position,
425 size_t value_position) {
426 // Let computed length be next position - value position.
427 auto computed_length = next_position - value_position;
428 // Run add a token given tokenizer, type, next position, value position, and
429 // computed length.
430 add_token(type, next_position, value_position, computed_length);
431}
432
433inline void Tokenizer::add_token_with_defaults(token_type type) {
434 ada_log("Tokenizer::add_token_with_defaults called with type=",
435 to_string(type));
436 // Run add a token with default length given tokenizer, type, tokenizer's next
437 // index, and tokenizer's index.
438 add_token_with_default_length(type, next_index, index);
439}
440
441inline ada_warn_unused std::optional<errors>
442Tokenizer::process_tokenizing_error(size_t next_position,
443 size_t value_position) {
444 // If tokenizer's policy is "strict", then throw a TypeError.
445 if (policy == token_policy::strict) {
446 ada_log("process_tokenizing_error failed with next_position=",
447 next_position, " value_position=", value_position);
448 return errors::type_error;
449 }
450 // Assert: tokenizer's policy is "lenient".
451 ADA_ASSERT_TRUE(policy == token_policy::lenient);
452 // Run add a token with default length given tokenizer, "invalid-char", next
453 // position, and value position.
454 add_token_with_default_length(token_type::INVALID_CHAR, next_position,
455 value_position);
456 return std::nullopt;
457}
458
459template <url_pattern_encoding_callback F>
460token* url_pattern_parser<F>::try_consume_modifier_token() {
461 // Let token be the result of running try to consume a token given parser and
462 // "other-modifier".
463 auto token = try_consume_token(token_type::OTHER_MODIFIER);
464 // If token is not null, then return token.
465 if (token) return token;
466 // Set token to the result of running try to consume a token given parser and
467 // "asterisk".
468 // Return token.
469 return try_consume_token(token_type::ASTERISK);
470}
471
472template <url_pattern_encoding_callback F>
473token* url_pattern_parser<F>::try_consume_regexp_or_wildcard_token(
474 const token* name_token) {
475 // Let token be the result of running try to consume a token given parser and
476 // "regexp".
477 auto token = try_consume_token(token_type::REGEXP);
478 // If name token is null and token is null, then set token to the result of
479 // running try to consume a token given parser and "asterisk".
480 if (!name_token && !token) {
481 token = try_consume_token(token_type::ASTERISK);
482 }
483 // Return token.
484 return token;
485}
486
487template <url_pattern_encoding_callback F>
488token* url_pattern_parser<F>::try_consume_token(token_type type) {
489 ada_log("url_pattern_parser::try_consume_token called with type=",
490 to_string(type));
491 // Assert: parser's index is less than parser's token list size.
492 ADA_ASSERT_TRUE(index < tokens.size());
493 // Let next token be parser's token list[parser's index].
494 auto& next_token = tokens[index];
495 // If next token's type is not type return null.
496 if (next_token.type != type) return nullptr;
497 // Increase parser's index by 1.
498 index++;
499 // Return next token.
500 return &next_token;
501}
502
503template <url_pattern_encoding_callback F>
504std::string url_pattern_parser<F>::consume_text() {
505 // Let result be the empty string.
506 std::string result{};
507 // While true:
508 while (true) {
509 // Let token be the result of running try to consume a token given parser
510 // and "char".
511 auto token = try_consume_token(token_type::CHAR);
512 // If token is null, then set token to the result of running try to consume
513 // a token given parser and "escaped-char".
514 if (!token) token = try_consume_token(token_type::ESCAPED_CHAR);
515 // If token is null, then break.
516 if (!token) break;
517 // Append token's value to the end of result.
518 result.append(token->value);
519 }
520 // Return result.
521 return result;
522}
523
524template <url_pattern_encoding_callback F>
525bool url_pattern_parser<F>::consume_required_token(token_type type) {
526 ada_log("url_pattern_parser::consume_required_token called with type=",
527 to_string(type));
528 // Let result be the result of running try to consume a token given parser and
529 // type.
530 return try_consume_token(type) != nullptr;
531}
532
533template <url_pattern_encoding_callback F>
534std::optional<errors>
535url_pattern_parser<F>::maybe_add_part_from_the_pending_fixed_value() {
536 // If parser's pending fixed value is the empty string, then return.
537 if (pending_fixed_value.empty()) {
538 ada_log("pending_fixed_value is empty");
539 return std::nullopt;
540 }
541 // Let encoded value be the result of running parser's encoding callback given
542 // parser's pending fixed value.
543 auto encoded_value = encoding_callback(pending_fixed_value);
544 if (!encoded_value) {
545 ada_log("failed to encode pending_fixed_value: ", pending_fixed_value);
546 return encoded_value.error();
547 }
548 // Set parser's pending fixed value to the empty string.
549 pending_fixed_value.clear();
550 // Let part be a new part whose type is "fixed-text", value is encoded value,
551 // and modifier is "none".
552 // Append part to parser's part list.
553 parts.emplace_back(url_pattern_part_type::FIXED_TEXT,
554 std::move(*encoded_value),
555 url_pattern_part_modifier::none);
556 return std::nullopt;
557}
558
559template <url_pattern_encoding_callback F>
560std::optional<errors> url_pattern_parser<F>::add_part(
561 std::string_view prefix, token* name_token, token* regexp_or_wildcard_token,
562 std::string_view suffix, token* modifier_token) {
563 // Let modifier be "none".
564 auto modifier = url_pattern_part_modifier::none;
565 // If modifier token is not null:
566 if (modifier_token) {
567 // If modifier token's value is "?" then set modifier to "optional".
568 if (modifier_token->value == "?") {
569 modifier = url_pattern_part_modifier::optional;
570 } else if (modifier_token->value == "*") {
571 // Otherwise if modifier token's value is "*" then set modifier to
572 // "zero-or-more".
573 modifier = url_pattern_part_modifier::zero_or_more;
574 } else if (modifier_token->value == "+") {
575 // Otherwise if modifier token's value is "+" then set modifier to
576 // "one-or-more".
577 modifier = url_pattern_part_modifier::one_or_more;
578 }
579 }
580 // If name token is null and regexp or wildcard token is null and modifier
581 // is "none":
582 if (!name_token && !regexp_or_wildcard_token &&
583 modifier == url_pattern_part_modifier::none) {
584 // Append prefix to the end of parser's pending fixed value.
585 pending_fixed_value.append(prefix);
586 return std::nullopt;
587 }
588 // Run maybe add a part from the pending fixed value given parser.
589 if (auto error = maybe_add_part_from_the_pending_fixed_value()) {
590 return *error;
591 }
592 // If name token is null and regexp or wildcard token is null:
593 if (!name_token && !regexp_or_wildcard_token) {
594 // Assert: suffix is the empty string.
595 ADA_ASSERT_TRUE(suffix.empty());
596 // If prefix is the empty string, then return.
597 if (prefix.empty()) return std::nullopt;
598 // Let encoded value be the result of running parser's encoding callback
599 // given prefix.
600 auto encoded_value = encoding_callback(prefix);
601 if (!encoded_value) {
602 return encoded_value.error();
603 }
604 // Let part be a new part whose type is "fixed-text", value is encoded
605 // value, and modifier is modifier.
606 // Append part to parser's part list.
607 parts.emplace_back(url_pattern_part_type::FIXED_TEXT,
608 std::move(*encoded_value), modifier);
609 return std::nullopt;
610 }
611 // Let regexp value be the empty string.
612 std::string regexp_value{};
613 // If regexp or wildcard token is null, then set regexp value to parser's
614 // segment wildcard regexp.
615 if (!regexp_or_wildcard_token) {
616 regexp_value = segment_wildcard_regexp;
617 } else if (regexp_or_wildcard_token->type == token_type::ASTERISK) {
618 // Otherwise if regexp or wildcard token's type is "asterisk", then set
619 // regexp value to the full wildcard regexp value.
620 regexp_value = ".*";
621 } else {
622 // Otherwise set regexp value to regexp or wildcard token's value.
623 regexp_value = regexp_or_wildcard_token->value;
624 }
625 // Let type be "regexp".
626 auto type = url_pattern_part_type::REGEXP;
627 // If regexp value is parser's segment wildcard regexp:
628 if (regexp_value == segment_wildcard_regexp) {
629 // Set type to "segment-wildcard".
630 type = url_pattern_part_type::SEGMENT_WILDCARD;
631 // Set regexp value to the empty string.
632 regexp_value.clear();
633 } else if (regexp_value == ".*") {
634 // Otherwise if regexp value is the full wildcard regexp value:
635 // Set type to "full-wildcard".
636 type = url_pattern_part_type::FULL_WILDCARD;
637 // Set regexp value to the empty string.
638 regexp_value.clear();
639 }
640 // Let name be the empty string.
641 std::string name{};
642 // If name token is not null, then set name to name token's value.
643 if (name_token) {
644 name = name_token->value;
645 } else if (regexp_or_wildcard_token != nullptr) {
646 // Otherwise if regexp or wildcard token is not null:
647 // Set name to parser's next numeric name, serialized.
648 name = std::to_string(next_numeric_name);
649 // Increment parser's next numeric name by 1.
650 next_numeric_name++;
651 }
652 // If the result of running is a duplicate name given parser and name is
653 // true, then throw a TypeError.
654 if (std::ranges::any_of(
655 parts, [&name](const auto& part) { return part.name == name; })) {
656 return errors::type_error;
657 }
658 // Let encoded prefix be the result of running parser's encoding callback
659 // given prefix.
660 auto encoded_prefix = encoding_callback(prefix);
661 if (!encoded_prefix) return encoded_prefix.error();
662 // Let encoded suffix be the result of running parser's encoding callback
663 // given suffix.
664 auto encoded_suffix = encoding_callback(suffix);
665 if (!encoded_suffix) return encoded_suffix.error();
666 // Let part be a new part whose type is type, value is regexp value,
667 // modifier is modifier, name is name, prefix is encoded prefix, and suffix
668 // is encoded suffix.
669 // Append part to parser's part list.
670 parts.emplace_back(type, std::move(regexp_value), modifier, std::move(name),
671 std::move(*encoded_prefix), std::move(*encoded_suffix));
672 return std::nullopt;
673}
674
675template <url_pattern_encoding_callback F>
676tl::expected<std::vector<url_pattern_part>, errors> parse_pattern_string(
677 std::string_view input, url_pattern_compile_component_options& options,
678 F& encoding_callback) {
679 ada_log("parse_pattern_string input=", input);
680 // Let parser be a new pattern parser whose encoding callback is encoding
681 // callback and segment wildcard regexp is the result of running generate a
682 // segment wildcard regexp given options.
683 auto parser = url_pattern_parser<F>(
684 encoding_callback, generate_segment_wildcard_regexp(options));
685 // Set parser's token list to the result of running tokenize given input and
686 // "strict".
687 auto tokenize_result = tokenize(input, token_policy::strict);
688 if (!tokenize_result) {
689 ada_log("parse_pattern_string tokenize failed");
690 return tl::unexpected(tokenize_result.error());
691 }
692 parser.tokens = std::move(*tokenize_result);
693
694 // While parser's index is less than parser's token list's size:
695 while (parser.can_continue()) {
696 // Let char token be the result of running try to consume a token given
697 // parser and "char".
698 auto char_token = parser.try_consume_token(token_type::CHAR);
699 // Let name token be the result of running try to consume a token given
700 // parser and "name".
701 auto name_token = parser.try_consume_token(token_type::NAME);
702 // Let regexp or wildcard token be the result of running try to consume a
703 // regexp or wildcard token given parser and name token.
704 auto regexp_or_wildcard_token =
705 parser.try_consume_regexp_or_wildcard_token(name_token);
706 // If name token is not null or regexp or wildcard token is not null:
707 if (name_token || regexp_or_wildcard_token) {
708 // Let prefix be the empty string.
709 std::string prefix{};
710 // If char token is not null then set prefix to char token's value.
711 if (char_token) prefix = char_token->value;
712 // If prefix is not the empty string and not options's prefix code point:
713 if (!prefix.empty() && prefix != options.get_prefix()) {
714 // Append prefix to the end of parser's pending fixed value.
715 parser.pending_fixed_value.append(prefix);
716 // Set prefix to the empty string.
717 prefix.clear();
718 }
719 // Run maybe add a part from the pending fixed value given parser.
720 if (auto error = parser.maybe_add_part_from_the_pending_fixed_value()) {
721 ada_log("maybe_add_part_from_the_pending_fixed_value failed");
722 return tl::unexpected(*error);
723 }
724 // Let modifier token be the result of running try to consume a modifier
725 // token given parser.
726 auto modifier_token = parser.try_consume_modifier_token();
727 // Run add a part given parser, prefix, name token, regexp or wildcard
728 // token, the empty string, and modifier token.
729 if (auto error =
730 parser.add_part(prefix, name_token, regexp_or_wildcard_token, "",
731 modifier_token)) {
732 ada_log("parser.add_part failed");
733 return tl::unexpected(*error);
734 }
735 // Continue.
736 continue;
737 }
738
739 // Let fixed token be char token.
740 auto fixed_token = char_token;
741 // If fixed token is null, then set fixed token to the result of running try
742 // to consume a token given parser and "escaped-char".
743 if (!fixed_token)
744 fixed_token = parser.try_consume_token(token_type::ESCAPED_CHAR);
745 // If fixed token is not null:
746 if (fixed_token) {
747 // Append fixed token's value to parser's pending fixed value.
748 parser.pending_fixed_value.append(fixed_token->value);
749 // Continue.
750 continue;
751 }
752 // Let open token be the result of running try to consume a token given
753 // parser and "open".
754 auto open_token = parser.try_consume_token(token_type::OPEN);
755 // If open token is not null:
756 if (open_token) {
757 // Set prefix be the result of running consume text given parser.
758 auto prefix_ = parser.consume_text();
759 // Set name token to the result of running try to consume a token given
760 // parser and "name".
761 name_token = parser.try_consume_token(token_type::NAME);
762 // Set regexp or wildcard token to the result of running try to consume a
763 // regexp or wildcard token given parser and name token.
764 regexp_or_wildcard_token =
765 parser.try_consume_regexp_or_wildcard_token(name_token);
766 // Let suffix be the result of running consume text given parser.
767 auto suffix_ = parser.consume_text();
768 // Run consume a required token given parser and "close".
769 if (!parser.consume_required_token(token_type::CLOSE)) {
770 ada_log("parser.consume_required_token failed");
771 return tl::unexpected(errors::type_error);
772 }
773 // Set modifier token to the result of running try to consume a modifier
774 // token given parser.
775 auto modifier_token = parser.try_consume_modifier_token();
776 // Run add a part given parser, prefix, name token, regexp or wildcard
777 // token, suffix, and modifier token.
778 if (auto error =
779 parser.add_part(prefix_, name_token, regexp_or_wildcard_token,
780 suffix_, modifier_token)) {
781 return tl::unexpected(*error);
782 }
783 // Continue.
784 continue;
785 }
786 // Run maybe add a part from the pending fixed value given parser.
787 if (auto error = parser.maybe_add_part_from_the_pending_fixed_value()) {
788 ada_log("maybe_add_part_from_the_pending_fixed_value failed on line 992");
789 return tl::unexpected(*error);
790 }
791 // Run consume a required token given parser and "end".
792 if (!parser.consume_required_token(token_type::END)) {
793 return tl::unexpected(errors::type_error);
794 }
795 }
796 ada_log("parser.parts size is: ", parser.parts.size());
797 // Return parser's part list.
798 return parser.parts;
799}
800
801template <url_pattern_regex::regex_concept regex_provider>
802bool protocol_component_matches_special_scheme(
803 url_pattern_component<regex_provider>& component) {
804 // Optimization: Use fast_test for simple patterns to avoid regex overhead
805 switch (component.type) {
806 case url_pattern_component_type::EMPTY:
807 // Empty pattern can't match any special scheme
808 return false;
809 case url_pattern_component_type::EXACT_MATCH:
810 // Direct string comparison for exact match patterns
811 return component.exact_match_value == "http" ||
812 component.exact_match_value == "https" ||
813 component.exact_match_value == "ws" ||
814 component.exact_match_value == "wss" ||
815 component.exact_match_value == "ftp";
816 case url_pattern_component_type::FULL_WILDCARD:
817 // Full wildcard matches everything including special schemes
818 return true;
819 case url_pattern_component_type::REGEXP:
820 // Fall back to regex matching for complex patterns
821 auto& regex = component.regexp;
822 return regex_provider::regex_match("http", regex) ||
823 regex_provider::regex_match("https", regex) ||
824 regex_provider::regex_match("ws", regex) ||
825 regex_provider::regex_match("wss", regex) ||
826 regex_provider::regex_match("ftp", regex);
827 }
829}
830
831template <url_pattern_regex::regex_concept regex_provider>
832inline std::optional<errors> constructor_string_parser<
833 regex_provider>::compute_protocol_matches_special_scheme_flag() {
834 ada_log(
835 "constructor_string_parser::compute_protocol_matches_special_scheme_"
836 "flag");
837 // Let protocol string be the result of running make a component string given
838 // parser.
839 auto protocol_string = make_component_string();
840 // Let protocol component be the result of compiling a component given
841 // protocol string, canonicalize a protocol, and default options.
842 auto protocol_component = url_pattern_component<regex_provider>::compile(
843 protocol_string, canonicalize_protocol,
844 url_pattern_compile_component_options::DEFAULT);
845 if (!protocol_component) {
846 ada_log("url_pattern_component::compile failed for protocol_string ",
847 protocol_string);
848 return protocol_component.error();
849 }
850 // If the result of running protocol component matches a special scheme given
851 // protocol component is true, then set parser's protocol matches a special
852 // scheme flag to true.
853 if (protocol_component_matches_special_scheme(*protocol_component)) {
854 protocol_matches_a_special_scheme_flag = true;
855 }
856 return std::nullopt;
857}
858
859template <url_pattern_regex::regex_concept regex_provider>
860tl::expected<url_pattern_init, errors>
861constructor_string_parser<regex_provider>::parse(std::string_view input) {
862 ada_log("constructor_string_parser::parse input=", input);
863 // Let parser be a new constructor string parser whose input is input and
864 // token list is the result of running tokenize given input and "lenient".
865 auto token_list = tokenize(input, token_policy::lenient);
866 if (!token_list) {
867 return tl::unexpected(token_list.error());
868 }
869 auto parser = constructor_string_parser(input, std::move(*token_list));
870
871 // While parser's token index is less than parser's token list size:
872 while (parser.token_index < parser.token_list.size()) {
873 // Set parser's token increment to 1.
874 parser.token_increment = 1;
875
876 // If parser's token list[parser's token index]'s type is "end" then:
877 if (parser.token_list[parser.token_index].type == token_type::END) {
878 // If parser's state is "init":
879 if (parser.state == State::INIT) {
880 // Run rewind given parser.
881 parser.rewind();
882 // If the result of running is a hash prefix given parser is true, then
883 // run change state given parser, "hash" and 1.
884 if (parser.is_hash_prefix()) {
885 parser.change_state(State::HASH, 1);
886 } else if (parser.is_search_prefix()) {
887 // Otherwise if the result of running is a search prefix given parser
888 // is true: Run change state given parser, "search" and 1.
889 parser.change_state(State::SEARCH, 1);
890 } else {
891 // Run change state given parser, "pathname" and 0.
892 parser.change_state(State::PATHNAME, 0);
893 }
894 // Increment parser's token index by parser's token increment.
895 parser.token_index += parser.token_increment;
896 // Continue.
897 continue;
898 }
899
900 if (parser.state == State::AUTHORITY) {
901 // If parser's state is "authority":
902 // Run rewind and set state given parser, and "hostname".
903 parser.rewind();
904 parser.change_state(State::HOSTNAME, 0);
905 // Increment parser's token index by parser's token increment.
906 parser.token_index += parser.token_increment;
907 // Continue.
908 continue;
909 }
910
911 // Run change state given parser, "done" and 0.
912 parser.change_state(State::DONE, 0);
913 // Break.
914 break;
915 }
916
917 // If the result of running is a group open given parser is true:
918 if (parser.is_group_open()) {
919 // Increment parser's group depth by 1.
920 parser.group_depth += 1;
921 // Increment parser's token index by parser's token increment.
922 parser.token_index += parser.token_increment;
923 }
924
925 // If parser's group depth is greater than 0:
926 if (parser.group_depth > 0) {
927 // If the result of running is a group close given parser is true, then
928 // decrement parser's group depth by 1.
929 if (parser.is_group_close()) {
930 parser.group_depth -= 1;
931 } else {
932 // Increment parser's token index by parser's token increment.
933 parser.token_index += parser.token_increment;
934 continue;
935 }
936 }
937
938 // Switch on parser's state and run the associated steps:
939 switch (parser.state) {
940 case State::INIT: {
941 // If the result of running is a protocol suffix given parser is true:
942 if (parser.is_protocol_suffix()) {
943 // Run rewind and set state given parser and "protocol".
944 parser.rewind();
945 parser.change_state(State::PROTOCOL, 0);
946 }
947 break;
948 }
949 case State::PROTOCOL: {
950 // If the result of running is a protocol suffix given parser is true:
951 if (parser.is_protocol_suffix()) {
952 // Run compute protocol matches a special scheme flag given parser.
953 if (const auto error =
954 parser.compute_protocol_matches_special_scheme_flag()) {
955 ada_log("compute_protocol_matches_special_scheme_flag failed");
956 return tl::unexpected(*error);
957 }
958 // Let next state be "pathname".
959 auto next_state = State::PATHNAME;
960 // Let skip be 1.
961 auto skip = 1;
962 // If the result of running next is authority slashes given parser is
963 // true:
964 if (parser.next_is_authority_slashes()) {
965 // Set next state to "authority".
966 next_state = State::AUTHORITY;
967 // Set skip to 3.
968 skip = 3;
969 } else if (parser.protocol_matches_a_special_scheme_flag) {
970 // Otherwise if parser's protocol matches a special scheme flag is
971 // true, then set next state to "authority".
972 next_state = State::AUTHORITY;
973 }
974
975 // Run change state given parser, next state, and skip.
976 parser.change_state(next_state, skip);
977 }
978 break;
979 }
980 case State::AUTHORITY: {
981 // If the result of running is an identity terminator given parser is
982 // true, then run rewind and set state given parser and "username".
983 if (parser.is_an_identity_terminator()) {
984 parser.rewind();
985 parser.change_state(State::USERNAME, 0);
986 } else if (parser.is_pathname_start() || parser.is_search_prefix() ||
987 parser.is_hash_prefix()) {
988 // Otherwise if any of the following are true:
989 // - the result of running is a pathname start given parser;
990 // - the result of running is a search prefix given parser; or
991 // - the result of running is a hash prefix given parser,
992 // then run rewind and set state given parser and "hostname".
993 parser.rewind();
994 parser.change_state(State::HOSTNAME, 0);
995 }
996 break;
997 }
998 case State::USERNAME: {
999 // If the result of running is a password prefix given parser is true,
1000 // then run change state given parser, "password", and 1.
1001 if (parser.is_password_prefix()) {
1002 parser.change_state(State::PASSWORD, 1);
1003 } else if (parser.is_an_identity_terminator()) {
1004 // Otherwise if the result of running is an identity terminator given
1005 // parser is true, then run change state given parser, "hostname",
1006 // and 1.
1007 parser.change_state(State::HOSTNAME, 1);
1008 }
1009 break;
1010 }
1011 case State::PASSWORD: {
1012 // If the result of running is an identity terminator given parser is
1013 // true, then run change state given parser, "hostname", and 1.
1014 if (parser.is_an_identity_terminator()) {
1015 parser.change_state(State::HOSTNAME, 1);
1016 }
1017 break;
1018 }
1019 case State::HOSTNAME: {
1020 // If the result of running is an IPv6 open given parser is true, then
1021 // increment parser's hostname IPv6 bracket depth by 1.
1022 if (parser.is_an_ipv6_open()) {
1023 parser.hostname_ipv6_bracket_depth += 1;
1024 } else if (parser.is_an_ipv6_close()) {
1025 // Otherwise if the result of running is an IPv6 close given parser is
1026 // true, then decrement parser's hostname IPv6 bracket depth by 1.
1027 parser.hostname_ipv6_bracket_depth -= 1;
1028 } else if (parser.is_port_prefix() &&
1029 parser.hostname_ipv6_bracket_depth == 0) {
1030 // Otherwise if the result of running is a port prefix given parser is
1031 // true and parser's hostname IPv6 bracket depth is zero, then run
1032 // change state given parser, "port", and 1.
1033 parser.change_state(State::PORT, 1);
1034 } else if (parser.is_pathname_start()) {
1035 // Otherwise if the result of running is a pathname start given parser
1036 // is true, then run change state given parser, "pathname", and 0.
1037 parser.change_state(State::PATHNAME, 0);
1038 } else if (parser.is_search_prefix()) {
1039 // Otherwise if the result of running is a search prefix given parser
1040 // is true, then run change state given parser, "search", and 1.
1041 parser.change_state(State::SEARCH, 1);
1042 } else if (parser.is_hash_prefix()) {
1043 // Otherwise if the result of running is a hash prefix given parser is
1044 // true, then run change state given parser, "hash", and 1.
1045 parser.change_state(State::HASH, 1);
1046 }
1047
1048 break;
1049 }
1050 case State::PORT: {
1051 // If the result of running is a pathname start given parser is true,
1052 // then run change state given parser, "pathname", and 0.
1053 if (parser.is_pathname_start()) {
1054 parser.change_state(State::PATHNAME, 0);
1055 } else if (parser.is_search_prefix()) {
1056 // Otherwise if the result of running is a search prefix given parser
1057 // is true, then run change state given parser, "search", and 1.
1058 parser.change_state(State::SEARCH, 1);
1059 } else if (parser.is_hash_prefix()) {
1060 // Otherwise if the result of running is a hash prefix given parser is
1061 // true, then run change state given parser, "hash", and 1.
1062 parser.change_state(State::HASH, 1);
1063 }
1064 break;
1065 }
1066 case State::PATHNAME: {
1067 // If the result of running is a search prefix given parser is true,
1068 // then run change state given parser, "search", and 1.
1069 if (parser.is_search_prefix()) {
1070 parser.change_state(State::SEARCH, 1);
1071 } else if (parser.is_hash_prefix()) {
1072 // Otherwise if the result of running is a hash prefix given parser is
1073 // true, then run change state given parser, "hash", and 1.
1074 parser.change_state(State::HASH, 1);
1075 }
1076 break;
1077 }
1078 case State::SEARCH: {
1079 // If the result of running is a hash prefix given parser is true, then
1080 // run change state given parser, "hash", and 1.
1081 if (parser.is_hash_prefix()) {
1082 parser.change_state(State::HASH, 1);
1083 }
1084 break;
1085 }
1086 case State::HASH: {
1087 // Do nothing
1088 break;
1089 }
1090 default: {
1091 // Assert: This step is never reached.
1092 unreachable();
1093 }
1094 }
1095
1096 // Increment parser's token index by parser's token increment.
1097 parser.token_index += parser.token_increment;
1098 }
1099
1100 // If parser's result contains "hostname" and not "port", then set parser's
1101 // result["port"] to the empty string.
1102 if (parser.result.hostname && !parser.result.port) {
1103 parser.result.port = "";
1104 }
1105
1106 // Return parser's result.
1107 return parser.result;
1108}
1109
1110} // namespace ada::url_pattern_helpers
1111#endif // ADA_INCLUDE_URL_PATTERN
1112#endif
Cross-platform compiler macros and common definitions.
#define ADA_ASSERT_TRUE(COND)
#define ada_warn_unused
Definition common_defs.h:89
User-facing functions for URL parsing and manipulation.
type
Enumeration of URL scheme types.
Definition scheme.h:41
errors
Error codes for URL parsing operations.
Definition errors.h:17
state
States in the URL parsing state machine.
Definition state.h:27
ada_warn_unused std::string_view to_string(encoding_type type)
void unreachable()
tl::expected< result_type, ada::errors > result
ada::url_pattern_regex::std_regex_provider regex_provider
Definition url_pattern.cc:9
Declaration for the URLPattern helpers.