Ada 3.0.1
Fast spec-compliant URL parser
Loading...
Searching...
No Matches
url_pattern_helpers.h
Go to the documentation of this file.
1
5#ifndef ADA_URL_PATTERN_HELPERS_H
6#define ADA_URL_PATTERN_HELPERS_H
7
8#include "ada/expected.h"
9#include "ada/common_defs.h"
10#include "ada/url_pattern.h"
11
12#include <string>
13#include <tuple>
14#include <vector>
15
16namespace ada {
17enum class errors : uint8_t;
18}
19
21
22// @see https://urlpattern.spec.whatwg.org/#token
23enum class token_type : uint8_t {
25 OPEN, // 1
26 CLOSE, // 2
27 REGEXP, // 3
28 NAME, // 4
29 CHAR, // 5
33 END, // 9
34};
35
36std::string to_string(token_type type);
37
38// @see https://urlpattern.spec.whatwg.org/#tokenize-policy
39enum class token_policy {
42};
43
44// @see https://urlpattern.spec.whatwg.org/#tokens
45class token {
46 public:
47 token(token_type _type, size_t _index, std::string&& _value)
48 : type(_type), index(_index), value(std::move(_value)) {}
49
50 // A token has an associated type, a string, initially "invalid-char".
52
53 // A token has an associated index, a number, initially 0. It is the position
54 // of the first code point in the pattern string represented by the token.
55 size_t index = 0;
56
57 // A token has an associated value, a string, initially the empty string. It
58 // contains the code points from the pattern string represented by the token.
59 std::string value{};
60};
61
62// @see https://urlpattern.spec.whatwg.org/#pattern-parser
63template <url_pattern_encoding_callback F>
65 public:
66 url_pattern_parser(F& encoding_callback_,
67 std::string_view segment_wildcard_regexp_)
68 : encoding_callback(encoding_callback_),
69 segment_wildcard_regexp(segment_wildcard_regexp_) {}
70
71 bool can_continue() const { return index < tokens.size(); }
72
73 // @see https://urlpattern.spec.whatwg.org/#try-to-consume-a-token
75 // @see https://urlpattern.spec.whatwg.org/#try-to-consume-a-modifier-token
77 // @see
78 // https://urlpattern.spec.whatwg.org/#try-to-consume-a-regexp-or-wildcard-token
80 // @see https://urlpattern.spec.whatwg.org/#consume-text
81 std::string consume_text();
82 // @see https://urlpattern.spec.whatwg.org/#consume-a-required-token
84 // @see
85 // https://urlpattern.spec.whatwg.org/#maybe-add-a-part-from-the-pending-fixed-value
88 // @see https://urlpattern.spec.whatwg.org/#add-a-part
// Adds a part built from the given prefix/suffix text and the name,
// regexp-or-wildcard and modifier tokens. Returns an optional `errors` value;
// the declaration is tagged `ada_warn_unused`, so callers are presumably
// expected to check the result. NOTE(review): the token pointers look like
// they may be null when the corresponding token is absent -- confirm against
// the definition.
89 std::optional<errors> add_part(std::string_view prefix, token* name_token,
90 token* regexp_or_wildcard_token,
91 std::string_view suffix,
92 token* modifier_token) ada_warn_unused;
93
94 std::vector<token> tokens{};
97 std::vector<url_pattern_part> parts{};
98 std::string pending_fixed_value{};
99 size_t index = 0;
101};
102
103// @see https://urlpattern.spec.whatwg.org/#tokenizer
105 public:
106 explicit Tokenizer(std::string_view new_input, token_policy new_policy)
107 : input(new_input), policy(new_policy) {}
108
109 // @see https://urlpattern.spec.whatwg.org/#get-the-next-code-point
110 void get_next_code_point();
111
112 // @see https://urlpattern.spec.whatwg.org/#seek-and-get-the-next-code-point
113 void seek_and_get_next_code_point(size_t index);
114
115 // @see https://urlpattern.spec.whatwg.org/#add-a-token
116
117 void add_token(token_type type, size_t next_position, size_t value_position,
118 size_t value_length);
119
120 // @see https://urlpattern.spec.whatwg.org/#add-a-token-with-default-length
121 void add_token_with_default_length(token_type type, size_t next_position,
122 size_t value_position);
123
124 // @see
125 // https://urlpattern.spec.whatwg.org/#add-a-token-with-default-position-and-length
127
128 // @see https://urlpattern.spec.whatwg.org/#process-a-tokenizing-error
129 std::optional<errors> process_tokenizing_error(
130 size_t next_position, size_t value_position) ada_warn_unused;
131
132 friend tl::expected<std::vector<token>, errors> tokenize(
133 std::string_view input, token_policy policy);
134
135 private:
136 // has an associated input, a pattern string, initially the empty string.
137 std::string input;
138 // has an associated policy, a tokenize policy, initially "strict".
139 token_policy policy;
140 // has an associated token list, a token list, initially an empty list.
141 std::vector<token> token_list{};
142 // has an associated index, a number, initially 0.
143 size_t index = 0;
144 // has an associated next index, a number, initially 0.
145 size_t next_index = 0;
146 // has an associated code point, a Unicode code point, initially null.
147 char32_t code_point{};
148};
149
150// @see https://urlpattern.spec.whatwg.org/#constructor-string-parser
151template <url_pattern_regex::regex_concept regex_provider>
153 explicit constructor_string_parser(std::string_view new_input,
154 std::vector<token>&& new_token_list)
155 : input(new_input), token_list(std::move(new_token_list)) {}
156
157 // @see https://urlpattern.spec.whatwg.org/#rewind
158 void rewind();
159
160 // @see https://urlpattern.spec.whatwg.org/#is-a-hash-prefix
161 bool is_hash_prefix();
162
163 // @see https://urlpattern.spec.whatwg.org/#is-a-search-prefix
164 bool is_search_prefix();
165
166 // @see https://urlpattern.spec.whatwg.org/#parse-a-constructor-string
167 static tl::expected<url_pattern_init, errors> parse(std::string_view input);
168
169 // @see https://urlpattern.spec.whatwg.org/#constructor-string-parser-state
183
184 // @see https://urlpattern.spec.whatwg.org/#change-state
185 void change_state(State state, size_t skip);
186
187 // @see https://urlpattern.spec.whatwg.org/#is-a-group-open
188 bool is_group_open() const;
189
190 // @see https://urlpattern.spec.whatwg.org/#is-a-group-close
191 bool is_group_close() const;
192
193 // @see https://urlpattern.spec.whatwg.org/#is-a-protocol-suffix
194 bool is_protocol_suffix();
195
196 // @see
197 // https://urlpattern.spec.whatwg.org/#compute-protocol-matches-a-special-scheme-flag
198 std::optional<errors> compute_protocol_matches_special_scheme_flag();
199
200 // @see https://urlpattern.spec.whatwg.org/#next-is-authority-slashes
202
203 // @see https://urlpattern.spec.whatwg.org/#is-an-identity-terminator
205
206 // @see https://urlpattern.spec.whatwg.org/#is-a-pathname-start
207 bool is_pathname_start();
208
209 // @see https://urlpattern.spec.whatwg.org/#is-a-password-prefix
210 bool is_password_prefix();
211
212 // @see https://urlpattern.spec.whatwg.org/#is-an-ipv6-open
213 bool is_an_ipv6_open();
214
215 // @see https://urlpattern.spec.whatwg.org/#is-an-ipv6-close
216 bool is_an_ipv6_close();
217
218 // @see https://urlpattern.spec.whatwg.org/#is-a-port-prefix
219 bool is_port_prefix();
220
221 private:
222 // @see https://urlpattern.spec.whatwg.org/#is-a-non-special-pattern-char
223 bool is_non_special_pattern_char(size_t index, std::string_view value);
224
225 // @see https://urlpattern.spec.whatwg.org/#get-a-safe-token
226 const token* get_safe_token(size_t index);
227
228 // @see https://urlpattern.spec.whatwg.org/#make-a-component-string
229 std::string make_component_string();
230 // has an associated input, a string, which must be set upon creation.
231 std::string input;
232 // has an associated token list, a token list, which must be set upon
233 // creation.
234 std::vector<token> token_list;
235 // has an associated result, a URLPatternInit, initially set to a new
236 // URLPatternInit.
238 // has an associated component start, a number, initially set to 0.
239 size_t component_start = 0;
240 // has an associated token index, a number, initially set to 0.
241 size_t token_index = 0;
242 // has an associated token increment, a number, initially set to 1.
243 size_t token_increment = 1;
244 // has an associated group depth, a number, initially set to 0.
245 size_t group_depth = 0;
246 // has an associated hostname IPv6 bracket depth, a number, initially set to
247 // 0.
248 size_t hostname_ipv6_bracket_depth = 0;
249 // has an associated protocol matches a special scheme flag, a boolean,
250 // initially set to false.
251 bool protocol_matches_a_special_scheme_flag = false;
252 // has an associated state, a string, initially set to "init".
254};
255
256// @see https://urlpattern.spec.whatwg.org/#canonicalize-a-protocol
257tl::expected<std::string, errors> canonicalize_protocol(std::string_view input);
258
259// @see https://wicg.github.io/urlpattern/#canonicalize-a-username
260tl::expected<std::string, errors> canonicalize_username(std::string_view input);
261
262// @see https://wicg.github.io/urlpattern/#canonicalize-a-password
263tl::expected<std::string, errors> canonicalize_password(std::string_view input);
264
265// @see https://wicg.github.io/urlpattern/#canonicalize-a-hostname
266tl::expected<std::string, errors> canonicalize_hostname(std::string_view input);
267
268// @see https://wicg.github.io/urlpattern/#canonicalize-an-ipv6-hostname
269tl::expected<std::string, errors> canonicalize_ipv6_hostname(
270 std::string_view input);
271
272// @see https://wicg.github.io/urlpattern/#canonicalize-a-port
273tl::expected<std::string, errors> canonicalize_port(std::string_view input);
274
275// @see https://wicg.github.io/urlpattern/#canonicalize-a-port
276tl::expected<std::string, errors> canonicalize_port_with_protocol(
277 std::string_view input, std::string_view protocol);
278
279// @see https://wicg.github.io/urlpattern/#canonicalize-a-pathname
280tl::expected<std::string, errors> canonicalize_pathname(std::string_view input);
281
282// @see https://wicg.github.io/urlpattern/#canonicalize-an-opaque-pathname
283tl::expected<std::string, errors> canonicalize_opaque_pathname(
284 std::string_view input);
285
286// @see https://wicg.github.io/urlpattern/#canonicalize-a-search
287tl::expected<std::string, errors> canonicalize_search(std::string_view input);
288
289// @see https://wicg.github.io/urlpattern/#canonicalize-a-hash
290tl::expected<std::string, errors> canonicalize_hash(std::string_view input);
291
292// @see https://urlpattern.spec.whatwg.org/#tokenize
293tl::expected<std::vector<token>, errors> tokenize(std::string_view input,
294 token_policy policy);
295
296// @see https://urlpattern.spec.whatwg.org/#process-a-base-url-string
297std::string process_base_url_string(std::string_view input,
298 std::string_view type);
299
300// @see https://urlpattern.spec.whatwg.org/#escape-a-pattern-string
301std::string escape_pattern_string(std::string_view input);
302
303// @see https://urlpattern.spec.whatwg.org/#escape-a-regexp-string
304std::string escape_regexp_string(std::string_view input);
305
306// @see https://urlpattern.spec.whatwg.org/#is-an-absolute-pathname
307constexpr bool is_absolute_pathname(std::string_view input,
308 std::string_view type) noexcept;
309
310// @see https://urlpattern.spec.whatwg.org/#parse-a-pattern-string
311template <url_pattern_encoding_callback F>
312tl::expected<std::vector<url_pattern_part>, errors> parse_pattern_string(
313 std::string_view input, url_pattern_compile_component_options& options,
314 F& encoding_callback);
315
316// @see https://urlpattern.spec.whatwg.org/#generate-a-pattern-string
317std::string generate_pattern_string(
318 std::vector<url_pattern_part>& part_list,
319 url_pattern_compile_component_options& options);
320
321// @see
322// https://urlpattern.spec.whatwg.org/#generate-a-regular-expression-and-name-list
323std::tuple<std::string, std::vector<std::string>>
325 const std::vector<url_pattern_part>& part_list,
326 url_pattern_compile_component_options options);
327
328// @see https://urlpattern.spec.whatwg.org/#hostname-pattern-is-an-ipv6-address
329bool is_ipv6_address(std::string_view input) noexcept;
330
331// @see
332// https://urlpattern.spec.whatwg.org/#protocol-component-matches-a-special-scheme
333template <url_pattern_regex::regex_concept regex_provider>
336
337// @see https://urlpattern.spec.whatwg.org/#convert-a-modifier-to-a-string
339
340// @see https://urlpattern.spec.whatwg.org/#generate-a-segment-wildcard-regexp
342 url_pattern_compile_component_options options);
343
344} // namespace ada::url_pattern_helpers
345
346#endif
void add_token_with_default_length(token_type type, size_t next_position, size_t value_position)
void add_token(token_type type, size_t next_position, size_t value_position, size_t value_length)
std::optional< errors > process_tokenizing_error(size_t next_position, size_t value_position) ada_warn_unused
Tokenizer(std::string_view new_input, token_policy new_policy)
friend tl::expected< std::vector< token >, errors > tokenize(std::string_view input, token_policy policy)
token(token_type _type, size_t _index, std::string &&_value)
std::optional< errors > add_part(std::string_view prefix, token *name_token, token *regexp_or_wildcard_token, std::string_view suyffix, token *modifier_token) ada_warn_unused
token * try_consume_regexp_or_wildcard_token(const token *name_token)
url_pattern_parser(F &encoding_callback_, std::string_view segment_wildcard_regexp_)
std::optional< errors > maybe_add_part_from_the_pending_fixed_value() ada_warn_unused
Common definitions for cross-platform compiler support.
#define ada_warn_unused
Definition common_defs.h:85
tl::expected< std::string, errors > canonicalize_opaque_pathname(std::string_view input)
tl::expected< std::string, errors > canonicalize_pathname(std::string_view input)
std::string escape_pattern_string(std::string_view input)
constexpr bool is_absolute_pathname(std::string_view input, std::string_view type) noexcept
std::string to_string(token_type type)
std::string convert_modifier_to_string(url_pattern_part_modifier modifier)
bool protocol_component_matches_special_scheme(url_pattern_component< regex_provider > &component)
tl::expected< std::string, errors > canonicalize_password(std::string_view input)
tl::expected< std::vector< token >, errors > tokenize(std::string_view input, token_policy policy)
std::string process_base_url_string(std::string_view input, std::string_view type)
std::string generate_segment_wildcard_regexp(url_pattern_compile_component_options options)
tl::expected< std::string, errors > canonicalize_protocol(std::string_view input)
tl::expected< std::vector< url_pattern_part >, errors > parse_pattern_string(std::string_view input, url_pattern_compile_component_options &options, F &encoding_callback)
tl::expected< std::string, errors > canonicalize_hostname(std::string_view input)
std::string generate_pattern_string(std::vector< url_pattern_part > &part_list, url_pattern_compile_component_options &options)
tl::expected< std::string, errors > canonicalize_port_with_protocol(std::string_view input, std::string_view protocol)
std::string escape_regexp_string(std::string_view input)
tl::expected< std::string, errors > canonicalize_hash(std::string_view input)
tl::expected< std::string, errors > canonicalize_port(std::string_view input)
bool is_ipv6_address(std::string_view input) noexcept
tl::expected< std::string, errors > canonicalize_search(std::string_view input)
tl::expected< std::string, errors > canonicalize_ipv6_hostname(std::string_view input)
tl::expected< std::string, errors > canonicalize_username(std::string_view input)
std::tuple< std::string, std::vector< std::string > > generate_regular_expression_and_name_list(const std::vector< url_pattern_part > &part_list, url_pattern_compile_component_options options)
Definition ada_idna.h:13
url_pattern_part_modifier
Definition url_pattern.h:38
errors
Definition errors.h:10
state
Definition state.h:17
tl::expected< result_type, ada::errors > result
constructor_string_parser(std::string_view new_input, std::vector< token > &&new_token_list)
static tl::expected< url_pattern_init, errors > parse(std::string_view input)
Declaration for the URLPattern implementation.