Ada 3.4.0
Fast spec-compliant URL parser
Loading...
Searching...
No Matches
url_pattern-inl.h
Go to the documentation of this file.
1
5#ifndef ADA_URL_PATTERN_INL_H
6#define ADA_URL_PATTERN_INL_H
7
8#include "ada/common_defs.h"
10#include "ada/url_pattern.h"
11
12#include <algorithm>
13#include <string_view>
14#include <utility>
15
16#if ADA_INCLUDE_URL_PATTERN
17namespace ada {
18
19inline bool url_pattern_init::operator==(const url_pattern_init& other) const {
20 return protocol == other.protocol && username == other.username &&
21 password == other.password && hostname == other.hostname &&
22 port == other.port && search == other.search && hash == other.hash &&
23 pathname == other.pathname;
24}
25
26inline bool url_pattern_component_result::operator==(
27 const url_pattern_component_result& other) const {
28 return input == other.input && groups == other.groups;
29}
30
31template <url_pattern_regex::regex_concept regex_provider>
32url_pattern_component_result
33url_pattern_component<regex_provider>::create_component_match_result(
34 std::string&& input,
35 std::vector<std::optional<std::string>>&& exec_result) {
36 // Let result be a new URLPatternComponentResult.
37 // Set result["input"] to input.
38 // Let groups be a record<USVString, (USVString or undefined)>.
39 auto result =
40 url_pattern_component_result{.input = std::move(input), .groups = {}};
41
42 // Optimization: Let's reserve the size.
43 result.groups.reserve(exec_result.size());
44
45 // We explicitly start iterating from 0 even though the spec
46 // says we should start from 1. This case is handled by the
47 // std_regex_provider.
48 for (size_t index = 0; index < exec_result.size(); index++) {
49 result.groups.emplace(group_name_list[index],
50 std::move(exec_result[index]));
51 }
52 return result;
53}
54
55template <url_pattern_regex::regex_concept regex_provider>
56std::string_view url_pattern<regex_provider>::get_protocol() const
58 // Return this's associated URL pattern's protocol component's pattern string.
59 return protocol_component.pattern;
60}
61template <url_pattern_regex::regex_concept regex_provider>
62std::string_view url_pattern<regex_provider>::get_username() const
64 // Return this's associated URL pattern's username component's pattern string.
65 return username_component.pattern;
66}
67template <url_pattern_regex::regex_concept regex_provider>
68std::string_view url_pattern<regex_provider>::get_password() const
70 // Return this's associated URL pattern's password component's pattern string.
71 return password_component.pattern;
72}
73template <url_pattern_regex::regex_concept regex_provider>
74std::string_view url_pattern<regex_provider>::get_hostname() const
76 // Return this's associated URL pattern's hostname component's pattern string.
77 return hostname_component.pattern;
78}
79template <url_pattern_regex::regex_concept regex_provider>
80std::string_view url_pattern<regex_provider>::get_port() const
82 // Return this's associated URL pattern's port component's pattern string.
83 return port_component.pattern;
84}
85template <url_pattern_regex::regex_concept regex_provider>
86std::string_view url_pattern<regex_provider>::get_pathname() const
88 // Return this's associated URL pattern's pathname component's pattern string.
89 return pathname_component.pattern;
90}
91template <url_pattern_regex::regex_concept regex_provider>
92std::string_view url_pattern<regex_provider>::get_search() const
94 // Return this's associated URL pattern's search component's pattern string.
95 return search_component.pattern;
96}
97template <url_pattern_regex::regex_concept regex_provider>
98std::string_view url_pattern<regex_provider>::get_hash() const
100 // Return this's associated URL pattern's hash component's pattern string.
101 return hash_component.pattern;
102}
103template <url_pattern_regex::regex_concept regex_provider>
104bool url_pattern<regex_provider>::ignore_case() const {
105 return ignore_case_;
106}
107template <url_pattern_regex::regex_concept regex_provider>
108bool url_pattern<regex_provider>::has_regexp_groups() const {
109 // If this's associated URL pattern's has regexp groups, then return true.
110 return protocol_component.has_regexp_groups ||
111 username_component.has_regexp_groups ||
112 password_component.has_regexp_groups ||
113 hostname_component.has_regexp_groups ||
114 port_component.has_regexp_groups ||
115 pathname_component.has_regexp_groups ||
116 search_component.has_regexp_groups || hash_component.has_regexp_groups;
117}
118
119inline bool url_pattern_part::is_regexp() const noexcept {
120 return type == url_pattern_part_type::REGEXP;
121}
122
123inline std::string_view url_pattern_compile_component_options::get_delimiter()
124 const {
125 if (delimiter) {
126 return {&delimiter.value(), 1};
127 }
128 return {};
129}
130
131inline std::string_view url_pattern_compile_component_options::get_prefix()
132 const {
133 if (prefix) {
134 return {&prefix.value(), 1};
135 }
136 return {};
137}
138
139template <url_pattern_regex::regex_concept regex_provider>
140template <url_pattern_encoding_callback F>
141tl::expected<url_pattern_component<regex_provider>, errors>
142url_pattern_component<regex_provider>::compile(
143 std::string_view input, F& encoding_callback,
144 url_pattern_compile_component_options& options) {
145 ada_log("url_pattern_component::compile input: ", input);
146 // Let part list be the result of running parse a pattern string given input,
147 // options, and encoding callback.
148 auto part_list = url_pattern_helpers::parse_pattern_string(input, options,
149 encoding_callback);
150
151 if (!part_list) {
152 ada_log("parse_pattern_string failed");
153 return tl::unexpected(part_list.error());
154 }
155
156 // Detect pattern type early to potentially skip expensive regex compilation
157 const auto has_regexp = [](const auto& part) { return part.is_regexp(); };
158 const bool has_regexp_groups = std::ranges::any_of(*part_list, has_regexp);
159
160 url_pattern_component_type component_type =
161 url_pattern_component_type::REGEXP;
162 std::string exact_match_value{};
163
164 if (part_list->empty()) {
165 component_type = url_pattern_component_type::EMPTY;
166 } else if (part_list->size() == 1) {
167 const auto& part = (*part_list)[0];
168 if (part.type == url_pattern_part_type::FIXED_TEXT &&
169 part.modifier == url_pattern_part_modifier::none &&
170 !options.ignore_case) {
171 component_type = url_pattern_component_type::EXACT_MATCH;
172 exact_match_value = part.value;
173 } else if (part.type == url_pattern_part_type::FULL_WILDCARD &&
174 part.modifier == url_pattern_part_modifier::none &&
175 part.prefix.empty() && part.suffix.empty()) {
176 component_type = url_pattern_component_type::FULL_WILDCARD;
177 }
178 }
179
180 // For simple patterns, skip regex generation and compilation entirely
181 if (component_type != url_pattern_component_type::REGEXP) {
182 auto pattern_string =
183 url_pattern_helpers::generate_pattern_string(*part_list, options);
184 // For FULL_WILDCARD, we need the group name from
185 // generate_regular_expression
186 std::vector<std::string> name_list;
187 if (component_type == url_pattern_component_type::FULL_WILDCARD &&
188 !part_list->empty()) {
189 name_list.push_back((*part_list)[0].name);
190 }
191 return url_pattern_component<regex_provider>(
192 std::move(pattern_string), typename regex_provider::regex_type{},
193 std::move(name_list), has_regexp_groups, component_type,
194 std::move(exact_match_value));
195 }
196
197 // Generate regex for complex patterns
198 auto [regular_expression_string, name_list] =
199 url_pattern_helpers::generate_regular_expression_and_name_list(*part_list,
200 options);
201 auto pattern_string =
202 url_pattern_helpers::generate_pattern_string(*part_list, options);
203
204 std::optional<typename regex_provider::regex_type> regular_expression =
205 regex_provider::create_instance(regular_expression_string,
206 options.ignore_case);
207 if (!regular_expression) {
208 return tl::unexpected(errors::type_error);
209 }
210
211 return url_pattern_component<regex_provider>(
212 std::move(pattern_string), std::move(*regular_expression),
213 std::move(name_list), has_regexp_groups, component_type,
214 std::move(exact_match_value));
215}
216
217template <url_pattern_regex::regex_concept regex_provider>
218bool url_pattern_component<regex_provider>::fast_test(
219 std::string_view input) const noexcept {
220 // Fast path for simple patterns - avoid regex evaluation
221 // Using if-else for better branch prediction on common cases
222 if (type == url_pattern_component_type::FULL_WILDCARD) {
223 return true;
224 }
225 if (type == url_pattern_component_type::EXACT_MATCH) {
226 return input == exact_match_value;
227 }
228 if (type == url_pattern_component_type::EMPTY) {
229 return input.empty();
230 }
231 // type == REGEXP
232 return regex_provider::regex_match(input, regexp);
233}
234
235template <url_pattern_regex::regex_concept regex_provider>
236std::optional<std::vector<std::optional<std::string>>>
237url_pattern_component<regex_provider>::fast_match(
238 std::string_view input) const {
239 // Handle each type directly without redundant checks
240 if (type == url_pattern_component_type::FULL_WILDCARD) {
241 // FULL_WILDCARD always matches
242 // Match regex_search behavior: empty input returns empty groups
243 if (input.empty() || group_name_list.empty()) {
244 return std::vector<std::optional<std::string>>{};
245 }
246 return std::vector<std::optional<std::string>>{std::string(input)};
247 }
248 if (type == url_pattern_component_type::EXACT_MATCH) {
249 if (input == exact_match_value) {
250 return std::vector<std::optional<std::string>>{};
251 }
252 return std::nullopt;
253 }
254 if (type == url_pattern_component_type::EMPTY) {
255 if (input.empty()) {
256 return std::vector<std::optional<std::string>>{};
257 }
258 return std::nullopt;
259 }
260 // type == REGEXP - use regex
261 return regex_provider::regex_search(input, regexp);
262}
263
264template <url_pattern_regex::regex_concept regex_provider>
265result<std::optional<url_pattern_result>> url_pattern<regex_provider>::exec(
266 const url_pattern_input& input, const std::string_view* base_url) {
267 // Return the result of match given this's associated URL pattern, input, and
268 // baseURL if given.
269 return match(input, base_url);
270}
271
272template <url_pattern_regex::regex_concept regex_provider>
273bool url_pattern<regex_provider>::test_components(
274 std::string_view protocol, std::string_view username,
275 std::string_view password, std::string_view hostname, std::string_view port,
276 std::string_view pathname, std::string_view search,
277 std::string_view hash) const {
278 return protocol_component.fast_test(protocol) &&
279 username_component.fast_test(username) &&
280 password_component.fast_test(password) &&
281 hostname_component.fast_test(hostname) &&
282 port_component.fast_test(port) &&
283 pathname_component.fast_test(pathname) &&
284 search_component.fast_test(search) && hash_component.fast_test(hash);
285}
286
287template <url_pattern_regex::regex_concept regex_provider>
288result<bool> url_pattern<regex_provider>::test(
289 const url_pattern_input& input, const std::string_view* base_url_string) {
290 // If input is a URLPatternInit
291 if (std::holds_alternative<url_pattern_init>(input)) {
292 if (base_url_string) {
293 return tl::unexpected(errors::type_error);
294 }
295
296 std::string protocol{}, username{}, password{}, hostname{};
297 std::string port{}, pathname{}, search{}, hash{};
298
299 auto apply_result = url_pattern_init::process(
300 std::get<url_pattern_init>(input), url_pattern_init::process_type::url,
301 protocol, username, password, hostname, port, pathname, search, hash);
302
303 if (!apply_result) {
304 return false;
305 }
306
307 std::string_view search_view = *apply_result->search;
308 if (search_view.starts_with("?")) {
309 search_view.remove_prefix(1);
310 }
311
312 return test_components(*apply_result->protocol, *apply_result->username,
313 *apply_result->password, *apply_result->hostname,
314 *apply_result->port, *apply_result->pathname,
315 search_view, *apply_result->hash);
316 }
317
318 // URL string input path
319 result<url_aggregator> base_url;
320 if (base_url_string) {
321 base_url = ada::parse<url_aggregator>(*base_url_string, nullptr);
322 if (!base_url) {
323 return false;
324 }
325 }
326
327 auto url =
328 ada::parse<url_aggregator>(std::get<std::string_view>(input),
329 base_url.has_value() ? &*base_url : nullptr);
330 if (!url) {
331 return false;
332 }
333
334 // Extract components as string_view
335 auto protocol_view = url->get_protocol();
336 if (protocol_view.ends_with(":")) {
337 protocol_view.remove_suffix(1);
338 }
339
340 auto search_view = url->get_search();
341 if (search_view.starts_with("?")) {
342 search_view.remove_prefix(1);
343 }
344
345 auto hash_view = url->get_hash();
346 if (hash_view.starts_with("#")) {
347 hash_view.remove_prefix(1);
348 }
349
350 return test_components(protocol_view, url->get_username(),
352 url->get_port(), url->get_pathname(), search_view,
353 hash_view);
354}
355
356template <url_pattern_regex::regex_concept regex_provider>
357result<std::optional<url_pattern_result>> url_pattern<regex_provider>::match(
358 const url_pattern_input& input, const std::string_view* base_url_string) {
359 std::string protocol{};
360 std::string username{};
361 std::string password{};
362 std::string hostname{};
363 std::string port{};
364 std::string pathname{};
365 std::string search{};
366 std::string hash{};
367
368 // Let inputs be an empty list.
369 // Append input to inputs.
370 std::vector inputs{input};
371
372 // If input is a URLPatternInit then:
373 if (std::holds_alternative<url_pattern_init>(input)) {
374 ada_log(
375 "url_pattern::match called with url_pattern_init and base_url_string=",
376 base_url_string);
377 // If baseURLString was given, throw a TypeError.
378 if (base_url_string) {
379 ada_log("failed to match because base_url_string was given");
380 return tl::unexpected(errors::type_error);
381 }
382
383 // Let applyResult be the result of process a URLPatternInit given input,
384 // "url", protocol, username, password, hostname, port, pathname, search,
385 // and hash.
386 auto apply_result = url_pattern_init::process(
387 std::get<url_pattern_init>(input), url_pattern_init::process_type::url,
388 protocol, username, password, hostname, port, pathname, search, hash);
389
390 // If this throws an exception, catch it, and return null.
391 if (!apply_result.has_value()) {
392 ada_log("match returned std::nullopt because process threw");
393 return std::nullopt;
394 }
395
396 // Set protocol to applyResult["protocol"].
397 ADA_ASSERT_TRUE(apply_result->protocol.has_value());
398 protocol = std::move(apply_result->protocol.value());
399
400 // Set username to applyResult["username"].
401 ADA_ASSERT_TRUE(apply_result->username.has_value());
402 username = std::move(apply_result->username.value());
403
404 // Set password to applyResult["password"].
405 ADA_ASSERT_TRUE(apply_result->password.has_value());
406 password = std::move(apply_result->password.value());
407
408 // Set hostname to applyResult["hostname"].
409 ADA_ASSERT_TRUE(apply_result->hostname.has_value());
410 hostname = std::move(apply_result->hostname.value());
411
412 // Set port to applyResult["port"].
413 ADA_ASSERT_TRUE(apply_result->port.has_value());
414 port = std::move(apply_result->port.value());
415
416 // Set pathname to applyResult["pathname"].
417 ADA_ASSERT_TRUE(apply_result->pathname.has_value());
418 pathname = std::move(apply_result->pathname.value());
419
420 // Set search to applyResult["search"].
421 ADA_ASSERT_TRUE(apply_result->search.has_value());
422 if (apply_result->search->starts_with("?")) {
423 search = apply_result->search->substr(1);
424 } else {
425 search = std::move(apply_result->search.value());
426 }
427
428 // Set hash to applyResult["hash"].
429 ADA_ASSERT_TRUE(apply_result->hash.has_value());
430 ADA_ASSERT_TRUE(!apply_result->hash->starts_with("#"));
431 hash = std::move(apply_result->hash.value());
432 } else {
433 ADA_ASSERT_TRUE(std::holds_alternative<std::string_view>(input));
434
435 // Let baseURL be null.
436 result<url_aggregator> base_url;
437
438 // If baseURLString was given, then:
439 if (base_url_string) {
440 // Let baseURL be the result of parsing baseURLString.
441 base_url = ada::parse<url_aggregator>(*base_url_string, nullptr);
442
443 // If baseURL is failure, return null.
444 if (!base_url) {
445 ada_log("match returned std::nullopt because failed to parse base_url=",
446 *base_url_string);
447 return std::nullopt;
448 }
449
450 // Append baseURLString to inputs.
451 inputs.emplace_back(*base_url_string);
452 }
453
454 url_aggregator* base_url_value =
455 base_url.has_value() ? &*base_url : nullptr;
456
457 // Set url to the result of parsing input given baseURL.
458 auto url = ada::parse<url_aggregator>(std::get<std::string_view>(input),
459 base_url_value);
460
461 // If url is failure, return null.
462 if (!url) {
463 ada_log("match returned std::nullopt because url failed");
464 return std::nullopt;
465 }
466
467 // Set protocol to url's scheme.
468 // IMPORTANT: Not documented on the URLPattern spec, but protocol suffix ':'
469 // is removed. Similar work was done on workerd:
470 // https://github.com/cloudflare/workerd/blob/8620d14012513a6ce04d079e401d3becac3c67bd/src/workerd/jsg/url.c%2B%2B#L2038
471 protocol = url->get_protocol().substr(0, url->get_protocol().size() - 1);
472 // Set username to url's username.
473 username = url->get_username();
474 // Set password to url's password.
475 password = url->get_password();
476 // Set hostname to url's host, serialized, or the empty string if the value
477 // is null.
478 hostname = url->get_hostname();
479 // Set port to url's port, serialized, or the empty string if the value is
480 // null.
481 port = url->get_port();
482 // Set pathname to the result of URL path serializing url.
483 pathname = url->get_pathname();
484 // Set search to url's query or the empty string if the value is null.
485 // IMPORTANT: Not documented on the URLPattern spec, but search prefix '?'
486 // is removed. Similar work was done on workerd:
487 // https://github.com/cloudflare/workerd/blob/8620d14012513a6ce04d079e401d3becac3c67bd/src/workerd/jsg/url.c%2B%2B#L2232
488 if (url->has_search()) {
489 auto view = url->get_search();
490 search = view.starts_with("?") ? url->get_search().substr(1) : view;
491 }
492 // Set hash to url's fragment or the empty string if the value is null.
493 // IMPORTANT: Not documented on the URLPattern spec, but hash prefix '#' is
494 // removed. Similar work was done on workerd:
495 // https://github.com/cloudflare/workerd/blob/8620d14012513a6ce04d079e401d3becac3c67bd/src/workerd/jsg/url.c%2B%2B#L2242
496 if (url->has_hash()) {
497 auto view = url->get_hash();
498 hash = view.starts_with("#") ? url->get_hash().substr(1) : view;
499 }
500 }
501
502 // Use fast_match which skips regex for simple patterns (EMPTY, EXACT_MATCH,
503 // FULL_WILDCARD) and only falls back to regex for complex REGEXP patterns.
504
505 // Let protocolExecResult be RegExpBuiltinExec(urlPattern's protocol
506 // component's regular expression, protocol).
507 auto protocol_exec_result = protocol_component.fast_match(protocol);
508 if (!protocol_exec_result) {
509 return std::nullopt;
510 }
511
512 // Let usernameExecResult be RegExpBuiltinExec(urlPattern's username
513 // component's regular expression, username).
514 auto username_exec_result = username_component.fast_match(username);
515 if (!username_exec_result) {
516 return std::nullopt;
517 }
518
519 // Let passwordExecResult be RegExpBuiltinExec(urlPattern's password
520 // component's regular expression, password).
521 auto password_exec_result = password_component.fast_match(password);
522 if (!password_exec_result) {
523 return std::nullopt;
524 }
525
526 // Let hostnameExecResult be RegExpBuiltinExec(urlPattern's hostname
527 // component's regular expression, hostname).
528 auto hostname_exec_result = hostname_component.fast_match(hostname);
529 if (!hostname_exec_result) {
530 return std::nullopt;
531 }
532
533 // Let portExecResult be RegExpBuiltinExec(urlPattern's port component's
534 // regular expression, port).
535 auto port_exec_result = port_component.fast_match(port);
536 if (!port_exec_result) {
537 return std::nullopt;
538 }
539
540 // Let pathnameExecResult be RegExpBuiltinExec(urlPattern's pathname
541 // component's regular expression, pathname).
542 auto pathname_exec_result = pathname_component.fast_match(pathname);
543 if (!pathname_exec_result) {
544 return std::nullopt;
545 }
546
547 // Let searchExecResult be RegExpBuiltinExec(urlPattern's search component's
548 // regular expression, search).
549 auto search_exec_result = search_component.fast_match(search);
550 if (!search_exec_result) {
551 return std::nullopt;
552 }
553
554 // Let hashExecResult be RegExpBuiltinExec(urlPattern's hash component's
555 // regular expression, hash).
556 auto hash_exec_result = hash_component.fast_match(hash);
557 if (!hash_exec_result) {
558 return std::nullopt;
559 }
560
561 // Let result be a new URLPatternResult.
562 auto result = url_pattern_result{};
563 // Set result["inputs"] to inputs.
564 result.inputs = std::move(inputs);
565 // Set result["protocol"] to the result of creating a component match result
566 // given urlPattern's protocol component, protocol, and protocolExecResult.
567 result.protocol = protocol_component.create_component_match_result(
568 std::move(protocol), std::move(*protocol_exec_result));
569
570 // Set result["username"] to the result of creating a component match result
571 // given urlPattern's username component, username, and usernameExecResult.
572 result.username = username_component.create_component_match_result(
573 std::move(username), std::move(*username_exec_result));
574
575 // Set result["password"] to the result of creating a component match result
576 // given urlPattern's password component, password, and passwordExecResult.
577 result.password = password_component.create_component_match_result(
578 std::move(password), std::move(*password_exec_result));
579
580 // Set result["hostname"] to the result of creating a component match result
581 // given urlPattern's hostname component, hostname, and hostnameExecResult.
582 result.hostname = hostname_component.create_component_match_result(
583 std::move(hostname), std::move(*hostname_exec_result));
584
585 // Set result["port"] to the result of creating a component match result given
586 // urlPattern's port component, port, and portExecResult.
587 result.port = port_component.create_component_match_result(
588 std::move(port), std::move(*port_exec_result));
589
590 // Set result["pathname"] to the result of creating a component match result
591 // given urlPattern's pathname component, pathname, and pathnameExecResult.
592 result.pathname = pathname_component.create_component_match_result(
593 std::move(pathname), std::move(*pathname_exec_result));
594
595 // Set result["search"] to the result of creating a component match result
596 // given urlPattern's search component, search, and searchExecResult.
597 result.search = search_component.create_component_match_result(
598 std::move(search), std::move(*search_exec_result));
599
600 // Set result["hash"] to the result of creating a component match result given
601 // urlPattern's hash component, hash, and hashExecResult.
602 result.hash = hash_component.create_component_match_result(
603 std::move(hash), std::move(*hash_exec_result));
604
605 return result;
606}
607
608} // namespace ada
609#endif // ADA_INCLUDE_URL_PATTERN
610#endif
Cross-platform compiler macros and common definitions.
#define ADA_ASSERT_TRUE(COND)
#define ada_lifetime_bound
type
Enumeration of URL scheme types.
Definition scheme.h:41
Definition ada_idna.h:13
errors
Error codes for URL parsing operations.
Definition errors.h:17
@ type_error
Definition errors.h:18
template ada::result< url_aggregator > parse< url_aggregator >(std::string_view input, const url_aggregator *base_url)
tl::expected< result_type, ada::errors > result
Memory-efficient URL representation using a single buffer.
Represents a parsed URL with individual string components.
Definition url.h:62
std::string get_search() const noexcept
Definition url.cpp:659
constexpr std::string_view get_pathname() const noexcept
Definition url-inl.h:46
std::string get_hash() const noexcept
Definition url.cpp:678
std::string get_hostname() const noexcept
Definition url.cpp:655
const std::string & get_password() const noexcept
Definition url.cpp:670
std::string get_port() const noexcept
Definition url.cpp:674
const std::string & get_username() const noexcept
Definition url.cpp:666
constexpr bool has_search() const noexcept override
Definition url-inl.h:164
std::string get_protocol() const noexcept
Definition url.cpp:633
constexpr bool has_hash() const noexcept override
Definition url-inl.h:160
URLPattern API implementation.
Declaration for the URLPattern helpers.