Ada 3.4.0
Fast spec-compliant URL parser
Loading...
Searching...
No Matches
url_pattern_helpers.h
Go to the documentation of this file.
1
5#ifndef ADA_URL_PATTERN_HELPERS_H
6#define ADA_URL_PATTERN_HELPERS_H
7
#include "ada/expected.h"
#include "ada/common_defs.h"
#include "ada/url_pattern.h"

#include <cstddef>
#include <cstdint>
#include <optional>
#include <string>
#include <string_view>
#include <tuple>
#include <utility>
#include <vector>
15
16#if ADA_INCLUDE_URL_PATTERN
// Opaque declaration of ada::errors with a fixed uint8_t underlying type, so
// the helper declarations in this header can name the type.
namespace ada {
enum class errors : uint8_t;
}  // namespace ada
20
21namespace ada::url_pattern_helpers {
22
// Token kinds, stored in a single byte. The numeric value of every
// enumerator is spelled out explicitly.
// @see https://urlpattern.spec.whatwg.org/#token
enum class token_type : uint8_t {
  INVALID_CHAR = 0,
  OPEN = 1,
  CLOSE = 2,
  REGEXP = 3,
  NAME = 4,
  CHAR = 5,
  ESCAPED_CHAR = 6,
  OTHER_MODIFIER = 7,
  ASTERISK = 8,
  END = 9,
};
36
37#ifdef ADA_TESTING
38std::string to_string(token_type type);
39#endif // ADA_TESTING
40
// The two tokenize policies: `strict` (value 0) and `lenient` (value 1).
// @see https://urlpattern.spec.whatwg.org/#tokenize-policy
enum class token_policy {
  strict = 0,
  lenient = 1,
};
46
47// @see https://urlpattern.spec.whatwg.org/#tokens
48class token {
49 public:
50 token(token_type _type, size_t _index, std::string_view _value)
51 : type(_type), index(_index), value(_value) {}
52
53 // A token has an associated type, a string, initially "invalid-char".
54 token_type type = token_type::INVALID_CHAR;
55
56 // A token has an associated index, a number, initially 0. It is the position
57 // of the first code point in the pattern string represented by the token.
58 size_t index = 0;
59
60 // A token has an associated value, a string, initially the empty string. It
61 // contains the code points from the pattern string represented by the token.
62 std::string_view value{};
63};
64
65// @see https://urlpattern.spec.whatwg.org/#pattern-parser
66template <url_pattern_encoding_callback F>
67class url_pattern_parser {
68 public:
69 url_pattern_parser(F& encoding_callback_,
70 std::string_view segment_wildcard_regexp_)
71 : encoding_callback(encoding_callback_),
72 segment_wildcard_regexp(segment_wildcard_regexp_) {}
73
74 bool can_continue() const { return index < tokens.size(); }
75
76 // @see https://urlpattern.spec.whatwg.org/#try-to-consume-a-token
77 token* try_consume_token(token_type type);
78 // @see https://urlpattern.spec.whatwg.org/#try-to-consume-a-modifier-token
79 token* try_consume_modifier_token();
80 // @see
81 // https://urlpattern.spec.whatwg.org/#try-to-consume-a-regexp-or-wildcard-token
82 token* try_consume_regexp_or_wildcard_token(const token* name_token);
83 // @see https://urlpattern.spec.whatwg.org/#consume-text
84 std::string consume_text();
85 // @see https://urlpattern.spec.whatwg.org/#consume-a-required-token
86 bool consume_required_token(token_type type);
87 // @see
88 // https://urlpattern.spec.whatwg.org/#maybe-add-a-part-from-the-pending-fixed-value
89 std::optional<errors> maybe_add_part_from_the_pending_fixed_value()
91 // @see https://urlpattern.spec.whatwg.org/#add-a-part
92 std::optional<errors> add_part(std::string_view prefix, token* name_token,
93 token* regexp_or_wildcard_token,
94 std::string_view suyffix,
95 token* modifier_token) ada_warn_unused;
96
97 std::vector<token> tokens{};
98 F& encoding_callback;
99 std::string segment_wildcard_regexp;
100 std::vector<url_pattern_part> parts{};
101 std::string pending_fixed_value{};
102 size_t index = 0;
103 size_t next_numeric_name = 0;
104};
105
106// @see https://urlpattern.spec.whatwg.org/#tokenizer
107class Tokenizer {
108 public:
109 explicit Tokenizer(std::string_view new_input, token_policy new_policy)
110 : input(new_input), policy(new_policy) {}
111
112 // @see https://urlpattern.spec.whatwg.org/#get-the-next-code-point
113 constexpr void get_next_code_point();
114
115 // @see https://urlpattern.spec.whatwg.org/#seek-and-get-the-next-code-point
116 constexpr void seek_and_get_next_code_point(size_t index);
117
118 // @see https://urlpattern.spec.whatwg.org/#add-a-token
119
120 void add_token(token_type type, size_t next_position, size_t value_position,
121 size_t value_length);
122
123 // @see https://urlpattern.spec.whatwg.org/#add-a-token-with-default-length
124 void add_token_with_default_length(token_type type, size_t next_position,
125 size_t value_position);
126
127 // @see
128 // https://urlpattern.spec.whatwg.org/#add-a-token-with-default-position-and-length
129 void add_token_with_defaults(token_type type);
130
131 // @see https://urlpattern.spec.whatwg.org/#process-a-tokenizing-error
132 std::optional<errors> process_tokenizing_error(
133 size_t next_position, size_t value_position) ada_warn_unused;
134
135 friend tl::expected<std::vector<token>, errors> tokenize(
136 std::string_view input, token_policy policy);
137
138 private:
139 // has an associated input, a pattern string, initially the empty string.
140 std::string_view input;
141 // has an associated policy, a tokenize policy, initially "strict".
142 token_policy policy;
143 // has an associated token list, a token list, initially an empty list.
144 std::vector<token> token_list{};
145 // has an associated index, a number, initially 0.
146 size_t index = 0;
147 // has an associated next index, a number, initially 0.
148 size_t next_index = 0;
149 // has an associated code point, a Unicode code point, initially null.
150 char32_t code_point{};
151};
152
153// @see https://urlpattern.spec.whatwg.org/#constructor-string-parser
154template <url_pattern_regex::regex_concept regex_provider>
155struct constructor_string_parser {
156 explicit constructor_string_parser(std::string_view new_input,
157 std::vector<token>&& new_token_list)
158 : input(new_input), token_list(std::move(new_token_list)) {}
159 // @see https://urlpattern.spec.whatwg.org/#parse-a-constructor-string
160 static tl::expected<url_pattern_init, errors> parse(std::string_view input);
161
162 // @see https://urlpattern.spec.whatwg.org/#constructor-string-parser-state
163 enum class State {
164 INIT,
165 PROTOCOL,
166 AUTHORITY,
167 USERNAME,
168 PASSWORD,
169 HOSTNAME,
170 PORT,
171 PATHNAME,
172 SEARCH,
173 HASH,
174 DONE,
175 };
176
177 // @see
178 // https://urlpattern.spec.whatwg.org/#compute-protocol-matches-a-special-scheme-flag
179 std::optional<errors> compute_protocol_matches_special_scheme_flag();
180
181 private:
182 // @see https://urlpattern.spec.whatwg.org/#rewind
183 constexpr void rewind();
184
185 // @see https://urlpattern.spec.whatwg.org/#is-a-hash-prefix
186 constexpr bool is_hash_prefix();
187
188 // @see https://urlpattern.spec.whatwg.org/#is-a-search-prefix
189 constexpr bool is_search_prefix();
190
191 // @see https://urlpattern.spec.whatwg.org/#change-state
192 void change_state(State state, size_t skip);
193
194 // @see https://urlpattern.spec.whatwg.org/#is-a-group-open
195 constexpr bool is_group_open() const;
196
197 // @see https://urlpattern.spec.whatwg.org/#is-a-group-close
198 constexpr bool is_group_close() const;
199
200 // @see https://urlpattern.spec.whatwg.org/#is-a-protocol-suffix
201 constexpr bool is_protocol_suffix() const;
202
203 // @see https://urlpattern.spec.whatwg.org/#next-is-authority-slashes
204 constexpr bool next_is_authority_slashes() const;
205
206 // @see https://urlpattern.spec.whatwg.org/#is-an-identity-terminator
207 constexpr bool is_an_identity_terminator() const;
208
209 // @see https://urlpattern.spec.whatwg.org/#is-a-pathname-start
210 constexpr bool is_pathname_start() const;
211
212 // @see https://urlpattern.spec.whatwg.org/#is-a-password-prefix
213 constexpr bool is_password_prefix() const;
214
215 // @see https://urlpattern.spec.whatwg.org/#is-an-ipv6-open
216 constexpr bool is_an_ipv6_open() const;
217
218 // @see https://urlpattern.spec.whatwg.org/#is-an-ipv6-close
219 constexpr bool is_an_ipv6_close() const;
220
221 // @see https://urlpattern.spec.whatwg.org/#is-a-port-prefix
222 constexpr bool is_port_prefix() const;
223
224 // @see https://urlpattern.spec.whatwg.org/#is-a-non-special-pattern-char
225 constexpr bool is_non_special_pattern_char(size_t index,
226 uint32_t value) const;
227
228 // @see https://urlpattern.spec.whatwg.org/#get-a-safe-token
229 constexpr const token* get_safe_token(size_t index) const;
230
231 // @see https://urlpattern.spec.whatwg.org/#make-a-component-string
232 std::string make_component_string();
233 // has an associated input, a string, which must be set upon creation.
234 std::string_view input;
235 // has an associated token list, a token list, which must be set upon
236 // creation.
237 std::vector<token> token_list;
238 // has an associated result, a URLPatternInit, initially set to a new
239 // URLPatternInit.
240 url_pattern_init result{};
241 // has an associated component start, a number, initially set to 0.
242 size_t component_start = 0;
243 // has an associated token index, a number, initially set to 0.
244 size_t token_index = 0;
245 // has an associated token increment, a number, initially set to 1.
246 size_t token_increment = 1;
247 // has an associated group depth, a number, initially set to 0.
248 size_t group_depth = 0;
249 // has an associated hostname IPv6 bracket depth, a number, initially set to
250 // 0.
251 size_t hostname_ipv6_bracket_depth = 0;
252 // has an associated protocol matches a special scheme flag, a boolean,
253 // initially set to false.
254 bool protocol_matches_a_special_scheme_flag = false;
255 // has an associated state, a string, initially set to "init".
256 State state = State::INIT;
257};
258
259// @see https://urlpattern.spec.whatwg.org/#canonicalize-a-protocol
260tl::expected<std::string, errors> canonicalize_protocol(std::string_view input);
261
262// @see https://wicg.github.io/urlpattern/#canonicalize-a-username
263tl::expected<std::string, errors> canonicalize_username(std::string_view input);
264
265// @see https://wicg.github.io/urlpattern/#canonicalize-a-password
266tl::expected<std::string, errors> canonicalize_password(std::string_view input);
267
// @see https://wicg.github.io/urlpattern/#canonicalize-a-hostname
269tl::expected<std::string, errors> canonicalize_hostname(std::string_view input);
270
271// @see https://wicg.github.io/urlpattern/#canonicalize-an-ipv6-hostname
272tl::expected<std::string, errors> canonicalize_ipv6_hostname(
273 std::string_view input);
274
275// @see https://wicg.github.io/urlpattern/#canonicalize-a-port
276tl::expected<std::string, errors> canonicalize_port(std::string_view input);
277
278// @see https://wicg.github.io/urlpattern/#canonicalize-a-port
279tl::expected<std::string, errors> canonicalize_port_with_protocol(
280 std::string_view input, std::string_view protocol);
281
282// @see https://wicg.github.io/urlpattern/#canonicalize-a-pathname
283tl::expected<std::string, errors> canonicalize_pathname(std::string_view input);
284
285// @see https://wicg.github.io/urlpattern/#canonicalize-an-opaque-pathname
286tl::expected<std::string, errors> canonicalize_opaque_pathname(
287 std::string_view input);
288
289// @see https://wicg.github.io/urlpattern/#canonicalize-a-search
290tl::expected<std::string, errors> canonicalize_search(std::string_view input);
291
292// @see https://wicg.github.io/urlpattern/#canonicalize-a-hash
293tl::expected<std::string, errors> canonicalize_hash(std::string_view input);
294
295// @see https://urlpattern.spec.whatwg.org/#tokenize
296tl::expected<std::vector<token>, errors> tokenize(std::string_view input,
297 token_policy policy);
298
299// @see https://urlpattern.spec.whatwg.org/#process-a-base-url-string
300std::string process_base_url_string(std::string_view input,
301 url_pattern_init::process_type type);
302
// Takes a string view and returns a newly built std::string.
// @see https://urlpattern.spec.whatwg.org/#escape-a-pattern-string
auto escape_pattern_string(std::string_view input) -> std::string;
305
// Takes a string view and returns a newly built std::string.
// @see https://urlpattern.spec.whatwg.org/#escape-a-regexp-string
auto escape_regexp_string(std::string_view input) -> std::string;
308
309// @see https://urlpattern.spec.whatwg.org/#is-an-absolute-pathname
310constexpr bool is_absolute_pathname(
311 std::string_view input, url_pattern_init::process_type type) noexcept;
312
313// @see https://urlpattern.spec.whatwg.org/#parse-a-pattern-string
314template <url_pattern_encoding_callback F>
315tl::expected<std::vector<url_pattern_part>, errors> parse_pattern_string(
316 std::string_view input, url_pattern_compile_component_options& options,
317 F& encoding_callback);
318
319// @see https://urlpattern.spec.whatwg.org/#generate-a-pattern-string
320std::string generate_pattern_string(
321 std::vector<url_pattern_part>& part_list,
322 url_pattern_compile_component_options& options);
323
324// @see
325// https://urlpattern.spec.whatwg.org/#generate-a-regular-expression-and-name-list
326std::tuple<std::string, std::vector<std::string>>
327generate_regular_expression_and_name_list(
328 const std::vector<url_pattern_part>& part_list,
329 url_pattern_compile_component_options options);
330
// noexcept predicate over a string view.
// @see https://urlpattern.spec.whatwg.org/#hostname-pattern-is-an-ipv6-address
auto is_ipv6_address(std::string_view input) noexcept -> bool;
333
334// @see
335// https://urlpattern.spec.whatwg.org/#protocol-component-matches-a-special-scheme
336template <url_pattern_regex::regex_concept regex_provider>
337bool protocol_component_matches_special_scheme(
338 ada::url_pattern_component<regex_provider>& input);
339
340// @see https://urlpattern.spec.whatwg.org/#convert-a-modifier-to-a-string
341std::string_view convert_modifier_to_string(url_pattern_part_modifier modifier);
342
343// @see https://urlpattern.spec.whatwg.org/#generate-a-segment-wildcard-regexp
344std::string generate_segment_wildcard_regexp(
345 url_pattern_compile_component_options options);
346
347} // namespace ada::url_pattern_helpers
348#endif // ADA_INCLUDE_URL_PATTERN
349#endif
Cross-platform compiler macros and common definitions.
#define ada_warn_unused
Definition common_defs.h:89
type
Enumeration of URL scheme types.
Definition scheme.h:41
Definition ada_idna.h:13
errors
Error codes for URL parsing operations.
Definition errors.h:17
state
States in the URL parsing state machine.
Definition state.h:27
@ AUTHORITY
Definition state.h:31
ada_warn_unused std::string_view to_string(encoding_type type)
tl::expected< result_type, ada::errors > result
ada_warn_unused ada::result< result_type > parse(std::string_view input, const result_type *base_url=nullptr)
URLPattern API implementation.