Ada 3.4.0
Fast spec-compliant URL parser
Loading...
Searching...
No Matches
url_pattern.h
Go to the documentation of this file.
1
12#ifndef ADA_URL_PATTERN_H
13#define ADA_URL_PATTERN_H
14
15#include "ada/implementation.h"
16#include "ada/expected.h"
17#include "ada/parser.h"
19
20#include <ostream>
21#include <string>
22#include <string_view>
23#include <unordered_map>
24#include <variant>
25#include <vector>
26
27#if ADA_TESTING
28#include <iostream>
29#endif // ADA_TESTING
30
31#if ADA_INCLUDE_URL_PATTERN
32namespace ada {
33
// Classifies what a single part of a parsed URL pattern stands for.
enum class url_pattern_part_type : uint8_t {
  // Plain literal text.
  FIXED_TEXT = 0,
  // A matching group driven by a custom regular expression.
  REGEXP = 1,
  // A matching group that consumes code points up to the next separator code
  // point. Typically produced by a named group such as ":foo" that carries no
  // custom regular expression.
  SEGMENT_WILDCARD = 2,
  // A matching group that greedily consumes every code point. Typically
  // produced by the "*" wildcard matching group.
  FULL_WILDCARD = 3,
};
47
// Shape of a compiled component pattern, cached so that matching can take a
// fast path and avoid costly regex evaluation for the common simple patterns.
enum class url_pattern_component_type : uint8_t {
  // "^$": matches the empty string only.
  EMPTY = 0,
  // "^<literal>$": a plain string comparison is enough, no regex needed.
  EXACT_MATCH = 1,
  // "^(.*)$": the full wildcard, matches anything.
  FULL_WILDCARD = 2,
  // Anything else: the regular expression really has to be evaluated.
  REGEXP = 3,
};
60
// Repetition modifier that may follow a part of a URL pattern.
enum class url_pattern_part_modifier : uint8_t {
  // No modifier is attached to the part.
  none = 0,
  // Optional modifier, written with the U+003F (?) code point.
  optional = 1,
  // "Zero or more" modifier, written with the U+002A (*) code point.
  zero_or_more = 2,
  // "One or more" modifier, written with the U+002B (+) code point.
  one_or_more = 3,
};
73
74// @see https://urlpattern.spec.whatwg.org/#part
75class url_pattern_part {
76 public:
77 url_pattern_part(url_pattern_part_type _type, std::string&& _value,
78 url_pattern_part_modifier _modifier)
79 : type(_type), value(std::move(_value)), modifier(_modifier) {}
80
81 url_pattern_part(url_pattern_part_type _type, std::string&& _value,
82 url_pattern_part_modifier _modifier, std::string&& _name,
83 std::string&& _prefix, std::string&& _suffix)
84 : type(_type),
85 value(std::move(_value)),
86 modifier(_modifier),
87 name(std::move(_name)),
88 prefix(std::move(_prefix)),
89 suffix(std::move(_suffix)) {}
90 // A part has an associated type, a string, which must be set upon creation.
91 url_pattern_part_type type;
92 // A part has an associated value, a string, which must be set upon creation.
93 std::string value;
94 // A part has an associated modifier a string, which must be set upon
95 // creation.
96 url_pattern_part_modifier modifier;
97 // A part has an associated name, a string, initially the empty string.
98 std::string name{};
99 // A part has an associated prefix, a string, initially the empty string.
100 std::string prefix{};
101 // A part has an associated suffix, a string, initially the empty string.
102 std::string suffix{};
103
104 inline bool is_regexp() const noexcept;
105};
106
// Options used while compiling a single URL pattern component.
// @see https://urlpattern.spec.whatwg.org/#options-header
struct url_pattern_compile_component_options {
  // Leaves delimiter and prefix unset and ignore_case false.
  url_pattern_compile_component_options() = default;

  // Creates options with the given delimiter and, optionally, prefix code
  // point.
  //
  // The delimiter deliberately has no default argument: if every parameter
  // were defaulted, this constructor would be a second default constructor
  // and `url_pattern_compile_component_options x;` (or `= {}`) would fail to
  // compile as an ambiguous call.
  explicit url_pattern_compile_component_options(
      std::optional<char> new_delimiter,
      std::optional<char> new_prefix = std::nullopt)
      : delimiter(new_delimiter), prefix(new_prefix) {}

  // Accessors for the private code points below, exposed as string views.
  // They are defined out of line.
  // NOTE(review): presumably an unset code point yields an empty view -
  // confirm against the definitions.
  inline std::string_view get_delimiter() const ada_warn_unused;
  inline std::string_view get_prefix() const ada_warn_unused;

  // @see https://urlpattern.spec.whatwg.org/#options-ignore-case
  bool ignore_case = false;

  // Shared presets; their definitions follow the struct.
  static url_pattern_compile_component_options DEFAULT;
  static url_pattern_compile_component_options HOSTNAME;
  static url_pattern_compile_component_options PATHNAME;

 private:
  // @see https://urlpattern.spec.whatwg.org/#options-delimiter-code-point
  std::optional<char> delimiter{};
  // @see https://urlpattern.spec.whatwg.org/#options-prefix-code-point
  std::optional<char> prefix{};
};
131
132// The default options is an options struct with delimiter code point set to
133// the empty string and prefix code point set to the empty string.
134inline url_pattern_compile_component_options
135 url_pattern_compile_component_options::DEFAULT(std::nullopt, std::nullopt);
136
137// The hostname options is an options struct with delimiter code point set
138// "." and prefix code point set to the empty string.
139inline url_pattern_compile_component_options
140 url_pattern_compile_component_options::HOSTNAME('.', std::nullopt);
141
142// The pathname options is an options struct with delimiter code point set
143// "/" and prefix code point set to "/".
144inline url_pattern_compile_component_options
145 url_pattern_compile_component_options::PATHNAME('/', '/');
146
// Matching result for one URL component. A URLPatternComponentResult is only
// ever used as a member attribute of a URLPatternResult struct; its API is
// defined as part of the URLPattern specification.
struct url_pattern_component_result {
  // The component input this result refers to.
  std::string input;
  // Captured groups keyed by name; a group may hold no value.
  std::unordered_map<std::string, std::optional<std::string>> groups;

  // Declared here, defined out of line.
  bool operator==(const url_pattern_component_result&) const;

#if ADA_TESTING
  // Test-only pretty printer. Groups without a value are shown as
  // "undefined".
  friend void PrintTo(const url_pattern_component_result& result,
                      std::ostream* os) {
    *os << "input: '" << result.input << "', group: ";
    for (const auto& [group_name, group_value] : result.groups) {
      *os << "(" << group_name << ", " << group_value.value_or("undefined")
          << ") ";
    }
  }
#endif  // ADA_TESTING
};
169
170template <url_pattern_regex::regex_concept regex_provider>
171class url_pattern_component {
172 public:
173 url_pattern_component() = default;
174
175 // This function explicitly takes a std::string because it is moved.
176 // To avoid unnecessary copy, move each value while calling the constructor.
177 url_pattern_component(std::string&& new_pattern,
178 typename regex_provider::regex_type&& new_regexp,
179 std::vector<std::string>&& new_group_name_list,
180 bool new_has_regexp_groups,
181 url_pattern_component_type new_type,
182 std::string&& new_exact_match_value = {})
183 : regexp(std::move(new_regexp)),
184 pattern(std::move(new_pattern)),
185 group_name_list(std::move(new_group_name_list)),
186 exact_match_value(std::move(new_exact_match_value)),
187 has_regexp_groups(new_has_regexp_groups),
188 type(new_type) {}
189
190 // @see https://urlpattern.spec.whatwg.org/#compile-a-component
191 template <url_pattern_encoding_callback F>
192 static tl::expected<url_pattern_component, errors> compile(
193 std::string_view input, F& encoding_callback,
194 url_pattern_compile_component_options& options);
195
196 // @see https://urlpattern.spec.whatwg.org/#create-a-component-match-result
197 url_pattern_component_result create_component_match_result(
198 std::string&& input,
199 std::vector<std::optional<std::string>>&& exec_result);
200
201 // Fast path test that returns true/false without constructing result groups.
202 // Uses cached pattern type to skip regex evaluation for simple patterns.
203 bool fast_test(std::string_view input) const noexcept;
204
205 // Fast path match that returns capture groups without regex for simple
206 // patterns. Returns nullopt if pattern doesn't match, otherwise returns
207 // capture groups.
208 std::optional<std::vector<std::optional<std::string>>> fast_match(
209 std::string_view input) const;
210
211#if ADA_TESTING
212 friend void PrintTo(const url_pattern_component& component,
213 std::ostream* os) {
214 *os << "pattern: '" << component.pattern
215 << "', has_regexp_groups: " << component.has_regexp_groups
216 << "group_name_list: ";
217 for (const auto& name : component.group_name_list) {
218 *os << name << ", ";
219 }
220 }
221#endif // ADA_TESTING
222
223 typename regex_provider::regex_type regexp{};
224 std::string pattern{};
225 std::vector<std::string> group_name_list{};
226 // For EXACT_MATCH type: the literal string to compare against
227 std::string exact_match_value{};
228 bool has_regexp_groups = false;
229 // Cached pattern type for fast-path optimization
230 url_pattern_component_type type = url_pattern_component_type::REGEXP;
231};
232
233// A URLPattern input can be either a string or a URLPatternInit object.
234// If it is a string, it must be a valid UTF-8 string.
235using url_pattern_input = std::variant<std::string_view, url_pattern_init>;
236
237// A struct providing the URLPattern matching results for all
238// components of a URL. The URLPatternResult API is defined as
239// part of the URLPattern specification.
240struct url_pattern_result {
241 std::vector<url_pattern_input> inputs;
242 url_pattern_component_result protocol;
243 url_pattern_component_result username;
244 url_pattern_component_result password;
245 url_pattern_component_result hostname;
246 url_pattern_component_result port;
247 url_pattern_component_result pathname;
248 url_pattern_component_result search;
249 url_pattern_component_result hash;
250};
251
// Options that apply to a whole URL pattern.
struct url_pattern_options {
  // Whether matching ignores case; off by default.
  bool ignore_case = false;

#if ADA_TESTING
  // Test-only pretty printer. The flag is streamed as 0 or 1; no quote is
  // opened before it because none would be closed after it.
  friend void PrintTo(const url_pattern_options& options, std::ostream* os) {
    *os << "ignore_case: " << options.ignore_case;
  }
#endif  // ADA_TESTING
};
261
284template <url_pattern_regex::regex_concept regex_provider>
285class url_pattern {
286 public:
287 url_pattern() = default;
288
293 result<std::optional<url_pattern_result>> exec(
294 const url_pattern_input& input,
295 const std::string_view* base_url = nullptr);
296
301 result<bool> test(const url_pattern_input& input,
302 const std::string_view* base_url = nullptr);
303
308 result<std::optional<url_pattern_result>> match(
309 const url_pattern_input& input,
310 const std::string_view* base_url_string = nullptr);
311
312 // @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-protocol
313 [[nodiscard]] std::string_view get_protocol() const ada_lifetime_bound;
314 // @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-username
315 [[nodiscard]] std::string_view get_username() const ada_lifetime_bound;
316 // @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-password
317 [[nodiscard]] std::string_view get_password() const ada_lifetime_bound;
318 // @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-hostname
319 [[nodiscard]] std::string_view get_hostname() const ada_lifetime_bound;
320 // @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-port
321 [[nodiscard]] std::string_view get_port() const ada_lifetime_bound;
322 // @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-pathname
323 [[nodiscard]] std::string_view get_pathname() const ada_lifetime_bound;
324 // @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-search
325 [[nodiscard]] std::string_view get_search() const ada_lifetime_bound;
326 // @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-hash
327 [[nodiscard]] std::string_view get_hash() const ada_lifetime_bound;
328
329 // If ignoreCase is true, the JavaScript regular expression created for each
330 // pattern must use the `vi` flag. Otherwise, they must use the `v` flag.
331 [[nodiscard]] bool ignore_case() const;
332
333 // @see https://urlpattern.spec.whatwg.org/#url-pattern-has-regexp-groups
334 [[nodiscard]] bool has_regexp_groups() const;
335
336 // Helper to test all components at once. Returns true if all match.
337 [[nodiscard]] bool test_components(
338 std::string_view protocol, std::string_view username,
339 std::string_view password, std::string_view hostname,
340 std::string_view port, std::string_view pathname, std::string_view search,
341 std::string_view hash) const;
342
343#if ADA_TESTING
344 friend void PrintTo(const url_pattern& c, std::ostream* os) {
345 *os << "protocol_component: '" << c.get_protocol() << ", ";
346 *os << "username_component: '" << c.get_username() << ", ";
347 *os << "password_component: '" << c.get_password() << ", ";
348 *os << "hostname_component: '" << c.get_hostname() << ", ";
349 *os << "port_component: '" << c.get_port() << ", ";
350 *os << "pathname_component: '" << c.get_pathname() << ", ";
351 *os << "search_component: '" << c.get_search() << ", ";
352 *os << "hash_component: '" << c.get_hash();
353 }
354#endif // ADA_TESTING
355
356 template <url_pattern_regex::regex_concept P>
357 friend tl::expected<url_pattern<P>, errors> parser::parse_url_pattern_impl(
358 std::variant<std::string_view, url_pattern_init>&& input,
359 const std::string_view* base_url, const url_pattern_options* options);
360
366 url_pattern_component<regex_provider> protocol_component{};
372 url_pattern_component<regex_provider> username_component{};
378 url_pattern_component<regex_provider> password_component{};
384 url_pattern_component<regex_provider> hostname_component{};
390 url_pattern_component<regex_provider> port_component{};
396 url_pattern_component<regex_provider> pathname_component{};
402 url_pattern_component<regex_provider> search_component{};
408 url_pattern_component<regex_provider> hash_component{};
414 bool ignore_case_ = false;
415};
416} // namespace ada
417#endif // ADA_INCLUDE_URL_PATTERN
418#endif
#define ada_lifetime_bound
#define ada_warn_unused
Definition common_defs.h:89
User-facing functions for URL parsing and manipulation.
type
Enumeration of URL scheme types.
Definition scheme.h:41
Definition ada_idna.h:13
url_search_params url_search_params_iter< T, Type >::EMPTY
errors
Error codes for URL parsing operations.
Definition errors.h:17
tl::expected< result_type, ada::errors > result
Low-level URL parsing functions.
Declaration for the url_pattern_init implementation.