Ada 3.0.1
Fast spec-compliant URL parser
Loading...
Searching...
No Matches
url_pattern_helpers.cpp
Go to the documentation of this file.
2
3#include <algorithm>
4#include <optional>
5#include <string>
6
8
9std::tuple<std::string, std::vector<std::string>>
11 const std::vector<url_pattern_part>& part_list,
13 // Let result be "^"
14 std::string result = "^";
15
16 // Let name list be a new list
17 std::vector<std::string> name_list{};
18 const std::string full_wildcard_regexp_value = ".*";
19
20 // For each part of part list:
21 for (const url_pattern_part& part : part_list) {
22 // If part's type is "fixed-text":
23 if (part.type == url_pattern_part_type::FIXED_TEXT) {
24 // If part's modifier is "none"
25 if (part.modifier == url_pattern_part_modifier::none) {
26 // Append the result of running escape a regexp string given part's
27 // value
28 result += escape_regexp_string(part.value);
29 } else {
30 // A "fixed-text" part with a modifier uses a non capturing group
31 // (?:<fixed text>)<modifier>
32 // Append "(?:" to the end of result.
33 result.append("(?:");
34 // Append the result of running escape a regexp string given part’s
35 // value to the end of result.
36 result.append(escape_regexp_string(part.value));
37 // Append ")" to the end of result.
38 result.append(")");
39 // Append the result of running convert a modifier to a string given
40 // part’s modifier to the end of result.
41 result.append(convert_modifier_to_string(part.modifier));
42 }
43 continue;
44 }
45
46 // Assert: part's name is not the empty string
47 ADA_ASSERT_TRUE(!part.name.empty());
48
49 // Append part's name to name list
50 name_list.push_back(part.name);
51
52 // Let regexp value be part's value
53 std::string regexp_value = part.value;
54
55 // If part's type is "segment-wildcard"
57 // then set regexp value to the result of running generate a segment
58 // wildcard regexp given options.
59 regexp_value = generate_segment_wildcard_regexp(options);
60 }
61 // Otherwise if part's type is "full-wildcard"
62 else if (part.type == url_pattern_part_type::FULL_WILDCARD) {
63 // then set regexp value to full wildcard regexp value.
64 regexp_value = full_wildcard_regexp_value;
65 }
66
67 // If part's prefix is the empty string and part's suffix is the empty
68 // string
69 if (part.prefix.empty() && part.suffix.empty()) {
70 // If part's modifier is "none" or "optional"
71 if (part.modifier == url_pattern_part_modifier::none ||
72 part.modifier == url_pattern_part_modifier::optional) {
73 // (<regexp value>)<modifier>
74 result += "(" + regexp_value + ")" +
75 convert_modifier_to_string(part.modifier);
76 } else {
77 // ((?:<regexp value>)<modifier>)
78 result += "((?:" + regexp_value + ")" +
79 convert_modifier_to_string(part.modifier) + ")";
80 }
81 continue;
82 }
83
84 // If part's modifier is "none" or "optional"
85 if (part.modifier == url_pattern_part_modifier::none ||
86 part.modifier == url_pattern_part_modifier::optional) {
87 // (?:<prefix>(<regexp value>)<suffix>)<modifier>
88 result += "(?:" + escape_regexp_string(part.prefix) + "(" + regexp_value +
89 ")" + escape_regexp_string(part.suffix) + ")" +
90 convert_modifier_to_string(part.modifier);
91 continue;
92 }
93
94 // Assert: part's modifier is "zero-or-more" or "one-or-more"
97
98 // Assert: part's prefix is not the empty string or part's suffix is not the
99 // empty string
100 ADA_ASSERT_TRUE(!part.prefix.empty() || !part.suffix.empty());
101
102 // (?:<prefix>((?:<regexp value>)(?:<suffix><prefix>(?:<regexp
103 // value>))*)<suffix>)?
104 // Append "(?:" to the end of result.
105 result.append("(?:");
106 // Append the result of running escape a regexp string given part’s prefix
107 // to the end of result.
108 result.append(escape_regexp_string(part.prefix));
109 // Append "((?:" to the end of result.
110 result.append("((?:");
111 // Append regexp value to the end of result.
112 result.append(regexp_value);
113 // Append ")(?:" to the end of result.
114 result.append(")(?:");
115 // Append the result of running escape a regexp string given part’s suffix
116 // to the end of result.
117 result.append(escape_regexp_string(part.suffix));
118 // Append the result of running escape a regexp string given part’s prefix
119 // to the end of result.
120 result.append(escape_regexp_string(part.prefix));
121 // Append "(?:" to the end of result.
122 result.append("(?:");
123 // Append regexp value to the end of result.
124 result.append(regexp_value);
125 // Append "))*)" to the end of result.
126 result.append("))*)");
127 // Append the result of running escape a regexp string given part’s suffix
128 // to the end of result.
129 result.append(escape_regexp_string(part.suffix));
130 // Append ")" to the end of result.
131 result.append(")");
132
133 // If part's modifier is "zero-or-more" then append "?" to the end of result
134 if (part.modifier == url_pattern_part_modifier::zero_or_more) {
135 result += "?";
136 }
137 }
138
139 // Append "$" to the end of result
140 result += "$";
141
142 // Return (result, name list)
143 return {result, name_list};
144}
145
/// Reports whether `input` begins like a bracketed IPv6 host in a URL
/// pattern: a leading "[", or "[" immediately preceded by "{" or "\".
/// Inputs shorter than two code units always yield false.
bool is_ipv6_address(std::string_view input) noexcept {
  // Fewer than two code points can never qualify.
  if (input.size() < 2) {
    return false;
  }

  const char first = input[0];
  const char second = input[1];

  // U+005B ([) in first position is sufficient on its own.
  if (first == '[') {
    return true;
  }
  // Otherwise U+005B ([) must directly follow U+007B ({) or U+005C (\).
  return second == '[' && (first == '{' || first == '\\');
}
160
162 // TODO: Optimize this.
163 switch (modifier) {
164 // If modifier is "zero-or-more", then return "*".
166 return "*";
167 // If modifier is "optional", then return "?".
169 return "?";
170 // If modifier is "one-or-more", then return "+".
172 return "+";
173 // Return the empty string.
174 default:
175 return "";
176 }
177}
178
181 // Let result be "[^".
182 std::string result = "[^";
183 // Append the result of running escape a regexp string given options’s
184 // delimiter code point to the end of result.
185 result.append(escape_regexp_string(options.get_delimiter()));
186 // Append "]+?" to the end of result.
187 result.append("]+?");
188 // Return result.
189 ada_log("generate_segment_wildcard_regexp result: ", result);
190 return result;
191}
192
193tl::expected<std::string, errors> canonicalize_protocol(
194 std::string_view input) {
195 ada_log("canonicalize_protocol called with input=", input);
196 // If value is the empty string, return value.
197 if (input.empty()) [[unlikely]] {
198 return "";
199 }
200
201 // IMPORTANT: Deviation from the spec. We remove the trailing ':' here.
202 if (input.ends_with(":")) {
203 input.remove_suffix(1);
204 }
205
206 // Let dummyURL be a new URL record.
207 // Let parseResult be the result of running the basic URL parser given value
208 // followed by "://dummy.test", with dummyURL as url.
209 if (auto dummy_url = ada::parse<url_aggregator>(
210 std::string(input) + "://dummy.test", nullptr)) {
211 // IMPORTANT: Deviation from the spec. We remove the trailing ':' here.
212 // Since URL parser always return protocols ending with `:`
213 auto protocol = dummy_url->get_protocol();
214 protocol.remove_suffix(1);
215 return std::string(protocol);
216 }
217 // If parseResult is failure, then throw a TypeError.
218 return tl::unexpected(errors::type_error);
219}
220
221tl::expected<std::string, errors> canonicalize_username(
222 std::string_view input) {
223 // If value is the empty string, return value.
224 if (input.empty()) [[unlikely]] {
225 return "";
226 }
227 // Let dummyURL be a new URL record.
228 auto url = ada::parse<url_aggregator>("fake://dummy.test", nullptr);
229 ADA_ASSERT_TRUE(url.has_value());
230 // Set the username given dummyURL and value.
231 if (!url->set_username(input)) {
232 return tl::unexpected(errors::type_error);
233 }
234 // Return dummyURL’s username.
235 return std::string(url->get_username());
236}
237
238tl::expected<std::string, errors> canonicalize_password(
239 std::string_view input) {
240 // If value is the empty string, return value.
241 if (input.empty()) [[unlikely]] {
242 return "";
243 }
244 // Let dummyURL be a new URL record.
245 // Set the password given dummyURL and value.
246 auto url = ada::parse<url_aggregator>("fake://dummy.test", nullptr);
247
248 ADA_ASSERT_TRUE(url.has_value());
249 if (!url->set_password(input)) {
250 return tl::unexpected(errors::type_error);
251 }
252 // Return dummyURL’s password.
253 return std::string(url->get_password());
254}
255
256tl::expected<std::string, errors> canonicalize_hostname(
257 std::string_view input) {
258 ada_log("canonicalize_hostname input=", input);
259 // If value is the empty string, return value.
260 if (input.empty()) [[unlikely]] {
261 return "";
262 }
263 // Let dummyURL be a new URL record.
264 // Let parseResult be the result of running the basic URL parser given value
265 // with dummyURL as url and hostname state as state override.
266
267 // IMPORTANT: The protocol needs to be a special protocol, otherwise the
268 // hostname will not be converted using IDNA.
269 auto url = ada::parse<url_aggregator>("https://dummy.test", nullptr);
271 // if (!isValidHostnameInput(hostname)) return kj::none;
272 if (!url->set_hostname(input)) {
273 // If parseResult is failure, then throw a TypeError.
274 return tl::unexpected(errors::type_error);
275 }
276 // Return dummyURL’s host, serialized, or empty string if it is null.
277 return std::string(url->get_hostname());
278}
279
280tl::expected<std::string, errors> canonicalize_ipv6_hostname(
281 std::string_view input) {
282 ada_log("canonicalize_ipv6_hostname input=", input);
283 // TODO: Optimization opportunity: Use lookup table to speed up checking
284 if (std::ranges::any_of(input, [](char c) {
285 return c != '[' && c != ']' && c != ':' &&
286 !unicode::is_ascii_hex_digit(c);
287 })) {
288 return tl::unexpected(errors::type_error);
289 }
290 // Append the result of running ASCII lowercase given code point to the end of
291 // result.
292 auto hostname = std::string(input);
293 unicode::to_lower_ascii(hostname.data(), hostname.size());
294 return hostname;
295}
296
297tl::expected<std::string, errors> canonicalize_port(
298 std::string_view port_value) {
299 // If portValue is the empty string, return portValue.
300 if (port_value.empty()) [[unlikely]] {
301 return "";
302 }
303 // Let dummyURL be a new URL record.
304 // If protocolValue was given, then set dummyURL’s scheme to protocolValue.
305 // Let parseResult be the result of running basic URL parser given portValue
306 // with dummyURL as url and port state as state override.
307 auto url = ada::parse<url_aggregator>("fake://dummy.test", nullptr);
309 if (url->set_port(port_value)) {
310 // Return dummyURL’s port, serialized, or empty string if it is null.
311 return std::string(url->get_port());
312 }
313 // If parseResult is failure, then throw a TypeError.
314 return tl::unexpected(errors::type_error);
315}
316
317tl::expected<std::string, errors> canonicalize_port_with_protocol(
318 std::string_view port_value, std::string_view protocol) {
319 // If portValue is the empty string, return portValue.
320 if (port_value.empty()) [[unlikely]] {
321 return "";
322 }
323
324 // TODO: Remove this
325 // We have an empty protocol because get_protocol() returns an empty string
326 // We should handle this in the caller rather than here.
327 if (protocol.empty()) {
328 protocol = "fake";
329 } else if (protocol.ends_with(":")) {
330 protocol.remove_suffix(1);
331 }
332 // Let dummyURL be a new URL record.
333 // If protocolValue was given, then set dummyURL’s scheme to protocolValue.
334 // Let parseResult be the result of running basic URL parser given portValue
335 // with dummyURL as url and port state as state override.
336 auto url = ada::parse<url_aggregator>(std::string(protocol) + "://dummy.test",
337 nullptr);
338 // TODO: Remove has_port() check.
339 // This is actually a bug with url parser where set_port() returns true for
340 // "invalid80" port value.
341 if (url && url->set_port(port_value) && url->has_port()) {
342 // Return dummyURL’s port, serialized, or empty string if it is null.
343 return std::string(url->get_port());
344 }
345 // TODO: Remove this once the previous has_port() check is removed.
346 if (url) {
347 if (scheme::is_special(protocol) && url->get_port().empty()) {
348 return "";
349 }
350 }
351 // If parseResult is failure, then throw a TypeError.
352 return tl::unexpected(errors::type_error);
353}
354
355tl::expected<std::string, errors> canonicalize_pathname(
356 std::string_view input) {
357 // If value is the empty string, then return value.
358 if (input.empty()) [[unlikely]] {
359 return "";
360 }
361 // Let leading slash be true if the first code point in value is U+002F (/)
362 // and otherwise false.
363 const bool leading_slash = input.starts_with("/");
364 // Let modified value be "/-" if leading slash is false and otherwise the
365 // empty string.
366 const auto modified_value = leading_slash ? "" : "/-";
367 const auto full_url =
368 std::string("fake://fake-url") + modified_value + std::string(input);
369 if (auto url = ada::parse<url_aggregator>(full_url, nullptr)) {
370 const auto pathname = url->get_pathname();
371 // If leading slash is false, then set result to the code point substring
372 // from 2 to the end of the string within result.
373 return leading_slash ? std::string(pathname)
374 : std::string(pathname.substr(2));
375 }
376 // If parseResult is failure, then throw a TypeError.
377 return tl::unexpected(errors::type_error);
378}
379
380tl::expected<std::string, errors> canonicalize_opaque_pathname(
381 std::string_view input) {
382 // If value is the empty string, return value.
383 if (input.empty()) [[unlikely]] {
384 return "";
385 }
386 // Let dummyURL be a new URL record.
387 // Set dummyURL’s path to the empty string.
388 // Let parseResult be the result of running URL parsing given value with
389 // dummyURL as url and opaque path state as state override.
390 if (auto url =
391 ada::parse<url_aggregator>("fake:" + std::string(input), nullptr)) {
392 // Return the result of URL path serializing dummyURL.
393 return std::string(url->get_pathname());
394 }
395 // If parseResult is failure, then throw a TypeError.
396 return tl::unexpected(errors::type_error);
397}
398
399tl::expected<std::string, errors> canonicalize_search(std::string_view input) {
400 // If value is the empty string, return value.
401 if (input.empty()) [[unlikely]] {
402 return "";
403 }
404 // Let dummyURL be a new URL record.
405 // Set dummyURL’s query to the empty string.
406 // Let parseResult be the result of running basic URL parser given value with
407 // dummyURL as url and query state as state override.
408 auto url = ada::parse<url_aggregator>("fake://dummy.test", nullptr);
409 ADA_ASSERT_TRUE(url.has_value());
410 url->set_search(input);
411 if (url->has_search()) {
412 const auto search = url->get_search();
413 return std::string(search.substr(1));
414 }
415 return tl::unexpected(errors::type_error);
416}
417
418tl::expected<std::string, errors> canonicalize_hash(std::string_view input) {
419 // If value is the empty string, return value.
420 if (input.empty()) [[unlikely]] {
421 return "";
422 }
423 // Let dummyURL be a new URL record.
424 // Set dummyURL’s fragment to the empty string.
425 // Let parseResult be the result of running basic URL parser given value with
426 // dummyURL as url and fragment state as state override.
427 auto url = ada::parse<url_aggregator>("fake://dummy.test", nullptr);
428 ADA_ASSERT_TRUE(url.has_value());
429 url->set_hash(input);
430 // Return dummyURL’s fragment.
431 if (url->has_hash()) {
432 const auto hash = url->get_hash();
433 return std::string(hash.substr(1));
434 }
435 return tl::unexpected(errors::type_error);
436}
437
438tl::expected<std::vector<token>, errors> tokenize(std::string_view input,
439 token_policy policy) {
440 ada_log("tokenize input: ", input);
441 // Let tokenizer be a new tokenizer.
442 // Set tokenizer’s input to input.
443 // Set tokenizer’s policy to policy.
444 auto tokenizer = Tokenizer(input, policy);
445 // While tokenizer’s index is less than tokenizer’s input's code point length:
446 while (tokenizer.index < tokenizer.input.size()) {
447 // Run seek and get the next code point given tokenizer and tokenizer’s
448 // index.
449 tokenizer.seek_and_get_next_code_point(tokenizer.index);
450
451 // If tokenizer’s code point is U+002A (*):
452 if (tokenizer.code_point == '*') {
453 // Run add a token with default position and length given tokenizer and
454 // "asterisk".
455 tokenizer.add_token_with_defaults(token_type::ASTERISK);
456 ada_log("add ASTERISK token");
457 // Continue.
458 continue;
459 }
460
461 // If tokenizer’s code point is U+002B (+) or U+003F (?):
462 if (tokenizer.code_point == '+' || tokenizer.code_point == '?') {
463 // Run add a token with default position and length given tokenizer and
464 // "other-modifier".
465 tokenizer.add_token_with_defaults(token_type::OTHER_MODIFIER);
466 // Continue.
467 continue;
468 }
469
470 // If tokenizer’s code point is U+005C (\‍):
471 if (tokenizer.code_point == '\\') {
472 // If tokenizer’s index is equal to tokenizer’s input's code point length
473 // − 1:
474 if (tokenizer.index == tokenizer.input.size() - 1) {
475 // Run process a tokenizing error given tokenizer, tokenizer’s next
476 // index, and tokenizer’s index.
477 if (auto error = tokenizer.process_tokenizing_error(
478 tokenizer.next_index, tokenizer.index)) {
479 ada_log("process_tokenizing_error failed");
480 return tl::unexpected(*error);
481 }
482 continue;
483 }
484
485 // Let escaped index be tokenizer’s next index.
486 auto escaped_index = tokenizer.next_index;
487 // Run get the next code point given tokenizer.
488 tokenizer.get_next_code_point();
489 // Run add a token with default length given tokenizer, "escaped-char",
490 // tokenizer’s next index, and escaped index.
491 tokenizer.add_token_with_default_length(
492 token_type::ESCAPED_CHAR, tokenizer.next_index, escaped_index);
493 ada_log("add ESCAPED_CHAR token on next_index ", tokenizer.next_index,
494 " with escaped index ", escaped_index);
495 // Continue.
496 continue;
497 }
498
499 // If tokenizer’s code point is U+007B ({):
500 if (tokenizer.code_point == '{') {
501 // Run add a token with default position and length given tokenizer and
502 // "open".
503 tokenizer.add_token_with_defaults(token_type::OPEN);
504 ada_log("add OPEN token");
505 continue;
506 }
507
508 // If tokenizer’s code point is U+007D (}):
509 if (tokenizer.code_point == '}') {
510 // Run add a token with default position and length given tokenizer and
511 // "close".
512 tokenizer.add_token_with_defaults(token_type::CLOSE);
513 ada_log("add CLOSE token");
514 continue;
515 }
516
517 // If tokenizer’s code point is U+003A (:):
518 if (tokenizer.code_point == ':') {
519 // Let name position be tokenizer’s next index.
520 auto name_position = tokenizer.next_index;
521 // Let name start be name position.
522 auto name_start = name_position;
523 // While name position is less than tokenizer’s input's code point length:
524 while (name_position < tokenizer.input.size()) {
525 // Run seek and get the next code point given tokenizer and name
526 // position.
527 tokenizer.seek_and_get_next_code_point(name_position);
528 // Let first code point be true if name position equals name start and
529 // false otherwise.
530 bool first_code_point = name_position == name_start;
531 // Let valid code point be the result of running is a valid name code
532 // point given tokenizer’s code point and first code point.
533 auto valid_code_point =
534 idna::valid_name_code_point(tokenizer.code_point, first_code_point);
535 ada_log("tokenizer.code_point=", uint32_t(tokenizer.code_point),
536 " first_code_point=", first_code_point,
537 " valid_code_point=", valid_code_point);
538 // If valid code point is false break.
539 if (!valid_code_point) break;
540 // Set name position to tokenizer’s next index.
541 name_position = tokenizer.next_index;
542 }
543
544 // If name position is less than or equal to name start:
545 if (name_position <= name_start) {
546 // Run process a tokenizing error given tokenizer, name start, and
547 // tokenizer’s index.
548 if (auto error = tokenizer.process_tokenizing_error(name_start,
549 tokenizer.index)) {
550 ada_log("process_tokenizing_error failed");
551 return tl::unexpected(*error);
552 }
553 // Continue
554 continue;
555 }
556
557 // Run add a token with default length given tokenizer, "name", name
558 // position, and name start.
559 tokenizer.add_token_with_default_length(token_type::NAME, name_position,
560 name_start);
561 continue;
562 }
563
564 // If tokenizer’s code point is U+0028 (():
565 if (tokenizer.code_point == '(') {
566 // Let depth be 1.
567 size_t depth = 1;
568 // Let regexp position be tokenizer’s next index.
569 auto regexp_position = tokenizer.next_index;
570 // Let regexp start be regexp position.
571 auto regexp_start = regexp_position;
572 // Let error be false.
573 bool error = false;
574
575 // While regexp position is less than tokenizer’s input's code point
576 // length:
577 while (regexp_position < tokenizer.input.size()) {
578 // Run seek and get the next code point given tokenizer and regexp
579 // position.
580 tokenizer.seek_and_get_next_code_point(regexp_position);
581
582 // TODO: Optimization opportunity: The next 2 if statements can be
583 // merged. If the result of running is ASCII given tokenizer’s code
584 // point is false:
585 if (!unicode::is_ascii(tokenizer.code_point)) {
586 // Run process a tokenizing error given tokenizer, regexp start, and
587 // tokenizer’s index.
588 if (auto process_error = tokenizer.process_tokenizing_error(
589 regexp_start, tokenizer.index)) {
590 return tl::unexpected(*process_error);
591 }
592 // Set error to true.
593 error = true;
594 break;
595 }
596
597 // If regexp position equals regexp start and tokenizer’s code point is
598 // U+003F (?):
599 if (regexp_position == regexp_start && tokenizer.code_point == '?') {
600 // Run process a tokenizing error given tokenizer, regexp start, and
601 // tokenizer’s index.
602 if (auto process_error = tokenizer.process_tokenizing_error(
603 regexp_start, tokenizer.index)) {
604 return tl::unexpected(*process_error);
605 }
606 // Set error to true;
607 error = true;
608 break;
609 }
610
611 // If tokenizer’s code point is U+005C (\‍):
612 if (tokenizer.code_point == '\\') {
613 // If regexp position equals tokenizer’s input's code point length − 1
614 if (regexp_position == tokenizer.input.size() - 1) {
615 // Run process a tokenizing error given tokenizer, regexp start, and
616 // tokenizer’s index.
617 if (auto process_error = tokenizer.process_tokenizing_error(
618 regexp_start, tokenizer.index)) {
619 return tl::unexpected(*process_error);
620 }
621 // Set error to true.
622 error = true;
623 break;
624 }
625 // Run get the next code point given tokenizer.
626 tokenizer.get_next_code_point();
627 // If the result of running is ASCII given tokenizer’s code point is
628 // false:
629 if (!unicode::is_ascii(tokenizer.code_point)) {
630 // Run process a tokenizing error given tokenizer, regexp start, and
631 // tokenizer’s index.
632 if (auto process_error = tokenizer.process_tokenizing_error(
633 regexp_start, tokenizer.index);
634 process_error.has_value()) {
635 return tl::unexpected(*process_error);
636 }
637 // Set error to true.
638 error = true;
639 break;
640 }
641 // Set regexp position to tokenizer’s next index.
642 regexp_position = tokenizer.next_index;
643 continue;
644 }
645
646 // If tokenizer’s code point is U+0029 ()):
647 if (tokenizer.code_point == ')') {
648 // Decrement depth by 1.
649 depth--;
650 // If depth is 0:
651 if (depth == 0) {
652 // Set regexp position to tokenizer’s next index.
653 regexp_position = tokenizer.next_index;
654 // Break.
655 break;
656 }
657 } else if (tokenizer.code_point == '(') {
658 // Otherwise if tokenizer’s code point is U+0028 (():
659 // Increment depth by 1.
660 depth++;
661 // If regexp position equals tokenizer’s input's code point length −
662 // 1:
663 if (regexp_position == tokenizer.input.size() - 1) {
664 // Run process a tokenizing error given tokenizer, regexp start, and
665 // tokenizer’s index.
666 if (auto process_error = tokenizer.process_tokenizing_error(
667 regexp_start, tokenizer.index)) {
668 return tl::unexpected(*process_error);
669 }
670 // Set error to true.
671 error = true;
672 break;
673 }
674 // Let temporary position be tokenizer’s next index.
675 auto temporary_position = tokenizer.next_index;
676 // Run get the next code point given tokenizer.
677 tokenizer.get_next_code_point();
678 // If tokenizer’s code point is not U+003F (?):
679 if (tokenizer.code_point != '?') {
680 // Run process a tokenizing error given tokenizer, regexp start, and
681 // tokenizer’s index.
682 if (auto process_error = tokenizer.process_tokenizing_error(
683 regexp_start, tokenizer.index)) {
684 return tl::unexpected(*process_error);
685 }
686 // Set error to true.
687 error = true;
688 break;
689 }
690 // Set tokenizer’s next index to temporary position.
691 tokenizer.next_index = temporary_position;
692 }
693 // Set regexp position to tokenizer’s next index.
694 regexp_position = tokenizer.next_index;
695 }
696
697 // If error is true continue.
698 if (error) continue;
699 // If depth is not zero:
700 if (depth != 0) {
701 // Run process a tokenizing error given tokenizer, regexp start, and
702 // tokenizer’s index.
703 if (auto process_error = tokenizer.process_tokenizing_error(
704 regexp_start, tokenizer.index)) {
705 return tl::unexpected(*process_error);
706 }
707 continue;
708 }
709 // Let regexp length be regexp position − regexp start − 1.
710 auto regexp_length = regexp_position - regexp_start - 1;
711 // If regexp length is zero:
712 if (regexp_length == 0) {
713 // Run process a tokenizing error given tokenizer, regexp start, and
714 // tokenizer’s index.
715 if (auto process_error = tokenizer.process_tokenizing_error(
716 regexp_start, tokenizer.index)) {
717 ada_log("process_tokenizing_error failed");
718 return tl::unexpected(*process_error);
719 }
720 continue;
721 }
722 // Run add a token given tokenizer, "regexp", regexp position, regexp
723 // start, and regexp length.
724 tokenizer.add_token(token_type::REGEXP, regexp_position, regexp_start,
725 regexp_length);
726 continue;
727 }
728 // Run add a token with default position and length given tokenizer and
729 // "char".
730 tokenizer.add_token_with_defaults(token_type::CHAR);
731 }
732 // Run add a token with default length given tokenizer, "end", tokenizer’s
733 // index, and tokenizer’s index.
734 tokenizer.add_token_with_default_length(token_type::END, tokenizer.index,
735 tokenizer.index);
736
737 ada_log("tokenizer.token_list size is: ", tokenizer.token_list.size());
738 // Return tokenizer’s token list.
739 return tokenizer.token_list;
740}
741
742std::string escape_pattern_string(std::string_view input) {
743 ada_log("escape_pattern_string called with input=", input);
744 if (input.empty()) [[unlikely]] {
745 return "";
746 }
747 // Assert: input is an ASCII string.
749 // Let result be the empty string.
750 std::string result{};
751 result.reserve(input.size());
752
753 // TODO: Optimization opportunity: Use a lookup table
754 constexpr auto should_escape = [](const char c) {
755 return c == '+' || c == '*' || c == '?' || c == ':' || c == '{' ||
756 c == '}' || c == '(' || c == ')' || c == '\\';
757 };
758
759 // While index is less than input’s length:
760 for (const auto& c : input) {
761 if (should_escape(c)) {
762 // then append U+005C (\‍) to the end of result.
763 result.append("\\");
764 }
765
766 // Append c to the end of result.
767 result += c;
768 }
769 // Return result.
770 return result;
771}
772
namespace {
/// Lookup table indexed by byte value: non-zero for every character that
/// must be backslash-escaped inside a regular expression. It is evaluated at
/// compile time because it initializes a constexpr variable.
constexpr std::array<uint8_t, 256> escape_regexp_table = [] {
  std::array<uint8_t, 256> out{};
  for (const char c : {'.', '+', '*', '?', '^', '$', '{', '}', '(', ')', '[',
                       ']', '|', '/', '\\'}) {
    // Index with the unsigned byte value, never with a (possibly signed) char.
    out[static_cast<uint8_t>(c)] = 1;
  }
  return out;
}();

/// Returns true when `c` must be escaped inside a regular expression.
constexpr bool should_escape_regexp_char(char c) {
  return escape_regexp_table[static_cast<uint8_t>(c)] != 0;
}
}  // namespace

/// Escapes a string for literal use inside a regular expression: each of
/// . + * ? ^ $ { } ( ) [ ] | / \ is prefixed with a backslash. The
/// specification asserts that input is ASCII; bytes outside the table's
/// marked set are copied through unchanged.
std::string escape_regexp_string(std::string_view input) {
  // Let result be the empty string.
  std::string result{};
  result.reserve(input.size());
  for (const char c : input) {
    // Push the backslash directly rather than building a temporary string
    // for every escaped character.
    if (should_escape_regexp_char(c)) {
      result.push_back('\\');
    }
    result.push_back(c);
  }
  return result;
}
804
805std::string process_base_url_string(std::string_view input,
806 std::string_view type) {
807 // If type is not "pattern" return input.
808 if (type != "pattern") {
809 return std::string(input);
810 }
811 // Return the result of escaping a pattern string given input.
812 return escape_pattern_string(input);
813}
814
/// Reports whether `input` is an absolute pathname. A leading '/' always
/// qualifies. For any type other than "url", a leading "\/" or "{/" also
/// qualifies.
constexpr bool is_absolute_pathname(std::string_view input,
                                    std::string_view type) noexcept {
  // The empty string is never absolute.
  if (input.empty()) [[unlikely]] {
    return false;
  }
  // U+002F (/) in first position.
  if (input.front() == '/') {
    return true;
  }
  // Type "url" accepts nothing else, and the remaining forms need at least
  // two code points.
  if (type == "url" || input.size() < 2) {
    return false;
  }
  // U+005C (\) or U+007B ({) immediately followed by U+002F (/).
  return input[1] == '/' && (input[0] == '\\' || input[0] == '{');
}
834
836 std::vector<url_pattern_part>& part_list,
838 // Let result be the empty string.
839 std::string result{};
840 // Let index list be the result of getting the indices for part list.
841 // For each index of index list:
842 for (size_t index = 0; index < part_list.size(); index++) {
843 // Let part be part list[index].
844 auto part = part_list[index];
845 // Let previous part be part list[index - 1] if index is greater than 0,
846 // otherwise let it be null.
847 // TODO: Optimization opportunity. Find a way to avoid making a copy here.
848 std::optional<url_pattern_part> previous_part =
849 index == 0 ? std::nullopt : std::optional(part_list[index - 1]);
850 // Let next part be part list[index + 1] if index is less than index list’s
851 // size - 1, otherwise let it be null.
852 std::optional<url_pattern_part> next_part =
853 index < part_list.size() - 1 ? std::optional(part_list[index + 1])
854 : std::nullopt;
855 // If part’s type is "fixed-text" then:
856 if (part.type == url_pattern_part_type::FIXED_TEXT) {
857 // If part’s modifier is "none" then:
858 if (part.modifier == url_pattern_part_modifier::none) {
859 // Append the result of running escape a pattern string given part’s
860 // value to the end of result.
861 result.append(escape_pattern_string(part.value));
862 continue;
863 }
864 // Append "{" to the end of result.
865 result += "{";
866 // Append the result of running escape a pattern string given part’s value
867 // to the end of result.
868 result.append(escape_pattern_string(part.value));
869 // Append "}" to the end of result.
870 result += "}";
871 // Append the result of running convert a modifier to a string given
872 // part’s modifier to the end of result.
873 result.append(convert_modifier_to_string(part.modifier));
874 continue;
875 }
876 // Let custom name be true if part’s name[0] is not an ASCII digit;
877 // otherwise false.
878 bool custom_name = !unicode::is_ascii_digit(part.name[0]);
879 // Let needs grouping be true if at least one of the following are true,
880 // otherwise let it be false:
881 // - part’s suffix is not the empty string.
882 // - part’s prefix is not the empty string and is not options’s prefix code
883 // point.
884 bool needs_grouping =
885 !part.suffix.empty() ||
886 (!part.prefix.empty() && part.prefix[0] != options.get_prefix()[0]);
887
888 // If all of the following are true:
889 // - needs grouping is false; and
890 // - custom name is true; and
891 // - part’s type is "segment-wildcard"; and
892 // - part’s modifier is "none"; and
893 // - next part is not null; and
894 // - next part’s prefix is the empty string; and
895 // - next part’s suffix is the empty string
896 if (!needs_grouping && custom_name &&
898 part.modifier == url_pattern_part_modifier::none &&
899 next_part.has_value() && next_part->prefix.empty() &&
900 next_part->suffix.empty()) {
901 // If next part’s type is "fixed-text":
902 if (next_part->type == url_pattern_part_type::FIXED_TEXT) {
903 // Set needs grouping to true if the result of running is a valid name
904 // code point given next part’s value's first code point and the boolean
905 // false is true.
906 if (idna::valid_name_code_point(next_part->value[0], false)) {
907 needs_grouping = true;
908 }
909 } else {
910 // Set needs grouping to true if next part’s name[0] is an ASCII digit.
911 needs_grouping = !next_part->name.empty() &&
912 unicode::is_ascii_digit(next_part->name[0]);
913 }
914 }
915
916 // If all of the following are true:
917 // - needs grouping is false; and
918 // - part’s prefix is the empty string; and
919 // - previous part is not null; and
920 // - previous part’s type is "fixed-text"; and
921 // - previous part’s value's last code point is options’s prefix code point.
922 // then set needs grouping to true.
923 if (!needs_grouping && part.prefix.empty() && previous_part.has_value() &&
924 previous_part->type == url_pattern_part_type::FIXED_TEXT &&
925 !options.get_prefix().empty() &&
926 previous_part->value.at(previous_part->value.size() - 1) ==
927 options.get_prefix()[0]) {
928 needs_grouping = true;
929 }
930
931 // Assert: part’s name is not the empty string or null.
932 ADA_ASSERT_TRUE(!part.name.empty());
933
934 // If needs grouping is true, then append "{" to the end of result.
935 if (needs_grouping) {
936 result.append("{");
937 }
938
939 // Append the result of running escape a pattern string given part’s prefix
940 // to the end of result.
941 result.append(escape_pattern_string(part.prefix));
942
943 // If custom name is true:
944 if (custom_name) {
945 // Append ":" to the end of result.
946 result.append(":");
947 // Append part’s name to the end of result.
948 result.append(part.name);
949 }
950
951 // If part’s type is "regexp" then:
952 if (part.type == url_pattern_part_type::REGEXP) {
953 // Append "(" to the end of result.
954 result.append("(");
955 // Append part’s value to the end of result.
956 result.append(part.value);
957 // Append ")" to the end of result.
958 result.append(")");
959 } else if (part.type == url_pattern_part_type::SEGMENT_WILDCARD &&
960 !custom_name) {
961 // Otherwise if part’s type is "segment-wildcard" and custom name is
962 // false: Append "(" to the end of result.
963 result.append("(");
964 // Append the result of running generate a segment wildcard regexp given
965 // options to the end of result.
967 // Append ")" to the end of result.
968 result.append(")");
969 } else if (part.type == url_pattern_part_type::FULL_WILDCARD) {
970 // Otherwise if part’s type is "full-wildcard":
971 // If custom name is false and one of the following is true:
972 // - previous part is null; or
973 // - previous part’s type is "fixed-text"; or
974 // - previous part’s modifier is not "none"; or
975 // - needs grouping is true; or
976 // - part’s prefix is not the empty string
977 // - then append "*" to the end of result.
978 if (!custom_name &&
979 (!previous_part.has_value() ||
980 previous_part->type == url_pattern_part_type::FIXED_TEXT ||
981 previous_part->modifier != url_pattern_part_modifier::none ||
982 needs_grouping || !part.prefix.empty())) {
983 result.append("*");
984 } else {
985 // Append "(" to the end of result.
986 // Append full wildcard regexp value to the end of result.
987 // Append ")" to the end of result.
988 result.append("(.*)");
989 }
990 }
991
992 // If all of the following are true:
993 // - part’s type is "segment-wildcard"; and
994 // - custom name is true; and
995 // - part’s suffix is not the empty string; and
996 // - The result of running is a valid name code point given part’s suffix's
997 // first code point and the boolean false is true then append U+005C (\‍) to
998 // the end of result.
999 if (part.type == url_pattern_part_type::SEGMENT_WILDCARD && custom_name &&
1000 !part.suffix.empty() &&
1001 idna::valid_name_code_point(part.suffix[0], false)) {
1002 result.append("\\");
1003 }
1004
1005 // Append the result of running escape a pattern string given part’s suffix
1006 // to the end of result.
1007 result.append(escape_pattern_string(part.suffix));
1008 // If needs grouping is true, then append "}" to the end of result.
1009 if (needs_grouping) result.append("}");
1010 // Append the result of running convert a modifier to a string given part’s
1011 // modifier to the end of result.
1012 result.append(convert_modifier_to_string(part.modifier));
1013 }
1014 // Return result.
1015 return result;
1016}
1017} // namespace ada::url_pattern_helpers
Tokenizer(std::string_view new_input, token_policy new_policy)
#define ADA_ASSERT_TRUE(COND)
bool valid_name_code_point(char32_t input, bool first)
bool constexpr is_ascii(std::u32string_view view)
tl::expected< std::string, errors > canonicalize_opaque_pathname(std::string_view input)
tl::expected< std::string, errors > canonicalize_pathname(std::string_view input)
std::string escape_pattern_string(std::string_view input)
constexpr bool is_absolute_pathname(std::string_view input, std::string_view type) noexcept
std::string convert_modifier_to_string(url_pattern_part_modifier modifier)
tl::expected< std::string, errors > canonicalize_password(std::string_view input)
tl::expected< std::vector< token >, errors > tokenize(std::string_view input, token_policy policy)
std::string process_base_url_string(std::string_view input, std::string_view type)
std::string generate_segment_wildcard_regexp(url_pattern_compile_component_options options)
tl::expected< std::string, errors > canonicalize_protocol(std::string_view input)
tl::expected< std::string, errors > canonicalize_hostname(std::string_view input)
std::string generate_pattern_string(std::vector< url_pattern_part > &part_list, url_pattern_compile_component_options &options)
tl::expected< std::string, errors > canonicalize_port_with_protocol(std::string_view input, std::string_view protocol)
std::string escape_regexp_string(std::string_view input)
tl::expected< std::string, errors > canonicalize_hash(std::string_view input)
tl::expected< std::string, errors > canonicalize_port(std::string_view input)
bool is_ipv6_address(std::string_view input) noexcept
tl::expected< std::string, errors > canonicalize_search(std::string_view input)
tl::expected< std::string, errors > canonicalize_ipv6_hostname(std::string_view input)
tl::expected< std::string, errors > canonicalize_username(std::string_view input)
std::tuple< std::string, std::vector< std::string > > generate_regular_expression_and_name_list(const std::vector< url_pattern_part > &part_list, url_pattern_compile_component_options options)
url_pattern_part_modifier
Definition url_pattern.h:38
errors
Definition errors.h:10
@ type_error
Definition errors.h:10
template ada::result< url_aggregator > parse< url_aggregator >(std::string_view input, const url_aggregator *base_url)
tl::expected< result_type, ada::errors > result
ada_warn_unused ada::result< result_type > parse(std::string_view input, const result_type *base_url=nullptr)
std::string_view get_prefix() const ada_warn_unused
std::string_view get_delimiter() const ada_warn_unused
Generic URL struct reliant on std::string instantiation.
Definition url.h:44
void set_hash(std::string_view input)
Definition url.cpp:802
std::string get_search() const noexcept
Definition url.cpp:641
bool set_hostname(std::string_view input)
Definition url.cpp:753
bool has_port() const noexcept
Definition url-inl.h:22
bool set_password(std::string_view input)
Definition url.cpp:766
void set_search(std::string_view input)
Definition url.cpp:816
bool set_username(std::string_view input)
Definition url.cpp:757
constexpr std::string_view get_pathname() const noexcept
Definition url-inl.h:46
std::string get_hash() const noexcept
Definition url.cpp:660
std::string get_hostname() const noexcept
Definition url.cpp:637
const std::string & get_password() const noexcept
Definition url.cpp:652
std::string get_port() const noexcept
Definition url.cpp:656
const std::string & get_username() const noexcept
Definition url.cpp:648
bool set_port(std::string_view input)
Definition url.cpp:775
constexpr bool has_search() const noexcept override
Definition url-inl.h:163
constexpr bool has_hash() const noexcept override
Definition url-inl.h:159
Declaration for the URLPattern helpers.