Ada 2.9.2
Fast spec-compliant URL parser
Loading...
Searching...
No Matches
parser.cpp
Go to the documentation of this file.
1#include "ada/parser.h"
2
3#include <limits>
4
5#include "ada.h"
7#include "ada/common_defs.h"
8#include "ada/log.h"
9#include "ada/unicode.h"
10#include "ada/url-inl.h"
11
12namespace ada::parser {
13
14template <class result_type, bool store_values>
15result_type parse_url_impl(std::string_view user_input,
16 const result_type* base_url) {
17 // We can specialize the implementation per type.
18 // Important: result_type_is_ada_url is evaluated at *compile time*. This
19 // means that doing if constexpr(result_type_is_ada_url) { something } else {
20 // something else } is free (at runtime). This means that ada::url_aggregator
21 // and ada::url **do not have to support the exact same API**.
22 constexpr bool result_type_is_ada_url =
23 std::is_same<ada::url, result_type>::value;
24 constexpr bool result_type_is_ada_url_aggregator =
25 std::is_same<ada::url_aggregator, result_type>::value;
26 static_assert(result_type_is_ada_url ||
27 result_type_is_ada_url_aggregator); // We don't support
28 // anything else for now.
29
30 ada_log("ada::parser::parse_url('", user_input, "' [", user_input.size(),
31 " bytes],", (base_url != nullptr ? base_url->to_string() : "null"),
32 ")");
33
35 result_type url{};
36
37 // We refuse to parse URL strings that exceed 4GB. Such strings are almost
38 // surely the result of a bug or are otherwise a security concern.
39 if (user_input.size() > std::numeric_limits<uint32_t>::max()) [[unlikely]] {
40 url.is_valid = false;
41 }
42 // Going forward, user_input.size() is in [0,
43 // std::numeric_limits<uint32_t>::max()]. If we are provided with an invalid
44 // base, or the input was rejected above as too long, we must return.
45 if (base_url != nullptr) {
46 url.is_valid &= base_url->is_valid;
47 }
48 if (!url.is_valid) {
49 return url;
50 }
51 if constexpr (result_type_is_ada_url_aggregator && store_values) {
52 // Most of the time, we just need user_input.size().
53 // In some instances, we may need a bit more.
55 // This is *very* important. This line should *not* be removed
56 // hastily. There are principled reasons why reserve is important
57 // for performance. If you have a benchmark with small inputs,
58 // it may not matter, but in other instances, it could.
60 // This rounds up to the next power of two strictly above the size
61 // (e.g., 4 -> 8). We know that user_input.size() is in [0,
62 // std::numeric_limits<uint32_t>::max()].
63 uint32_t reserve_capacity =
64 (0xFFFFFFFF >>
65 helpers::leading_zeroes(uint32_t(1 | user_input.size()))) +
66 1;
67 url.reserve(reserve_capacity);
68 }
69 std::string tmp_buffer;
70 std::string_view url_data;
71 if (unicode::has_tabs_or_newline(user_input)) [[unlikely]] {
72 tmp_buffer = user_input;
73 // Optimization opportunity: Instead of copying and then pruning, we could
74 // just directly build the string from user_input.
75 helpers::remove_ascii_tab_or_newline(tmp_buffer);
76 url_data = tmp_buffer;
77 } else [[likely]] {
78 url_data = user_input;
79 }
80
81 // Leading and trailing control characters are uncommon and easy to deal with
82 // (no performance concern).
83 helpers::trim_c0_whitespace(url_data);
84
85 // Optimization opportunity. Most websites do not have a fragment.
86 std::optional<std::string_view> fragment = helpers::prune_hash(url_data);
87 // We add it last so that an implementation like ada::url_aggregator
88 // can append it last to its internal buffer, thus improving performance.
89
90 // Here url_data no longer has its fragment.
91 // We are going to access the data from url_data (it is immutable).
92 // At any given time, we are pointing at byte 'input_position' in url_data.
93 // The input_position variable should range from 0 to input_size.
94 // It is illegal to access url_data at input_size.
95 size_t input_position = 0;
96 const size_t input_size = url_data.size();
97 // Keep running the following state machine by switching on state.
98 // If after a run pointer points to the EOF code point, go to the next step.
99 // Otherwise, increase pointer by 1 and continue with the state machine.
100 // We never decrement input_position.
101 while (input_position <= input_size) {
102 ada_log("In parsing at ", input_position, " out of ", input_size,
103 " in state ", ada::to_string(state));
104 switch (state) {
106 ada_log("SCHEME_START ", helpers::substring(url_data, input_position));
107 // If c is an ASCII alpha, append c, lowercased, to buffer, and set
108 // state to scheme state.
109 if ((input_position != input_size) &&
110 checkers::is_alpha(url_data[input_position])) {
112 input_position++;
113 } else {
114 // Otherwise, if state override is not given, set state to no scheme
115 // state and decrease pointer by 1.
117 }
118 break;
119 }
120 case ada::state::SCHEME: {
121 ada_log("SCHEME ", helpers::substring(url_data, input_position));
122 // If c is an ASCII alphanumeric, U+002B (+), U+002D (-), or U+002E (.),
123 // append c, lowercased, to buffer.
124 while ((input_position != input_size) &&
125 (ada::unicode::is_alnum_plus(url_data[input_position]))) {
126 input_position++;
127 }
128 // Otherwise, if c is U+003A (:), then:
129 if ((input_position != input_size) &&
130 (url_data[input_position] == ':')) {
131 ada_log("SCHEME the scheme should be ",
132 url_data.substr(0, input_position));
133 if constexpr (result_type_is_ada_url) {
134 if (!url.parse_scheme(url_data.substr(0, input_position))) {
135 return url;
136 }
137 } else {
138 // we pass the colon along instead of painfully adding it back.
139 if (!url.parse_scheme_with_colon(
140 url_data.substr(0, input_position + 1))) {
141 return url;
142 }
143 }
144 ada_log("SCHEME the scheme is ", url.get_protocol());
145
146 // If url's scheme is "file", then:
147 if (url.type == ada::scheme::type::FILE) {
148 // Set state to file state.
150 }
151 // Otherwise, if url is special, base is non-null, and base's scheme
152 // is url's scheme. Note: dereferencing base_url (base_url->type below)
153 // is unsafe when base_url == nullptr.
154 else if (url.is_special() && base_url != nullptr &&
155 base_url->type == url.type) {
156 // Set state to special relative or authority state.
158 }
159 // Otherwise, if url is special, set state to special authority
160 // slashes state.
161 else if (url.is_special()) {
163 }
164 // Otherwise, if remaining starts with an U+002F (/), set state to
165 // path or authority state and increase pointer by 1.
166 else if (input_position + 1 < input_size &&
167 url_data[input_position + 1] == '/') {
169 input_position++;
170 }
171 // Otherwise, set url's path to the empty string and set state to
172 // opaque path state.
173 else {
175 }
176 }
177 // Otherwise, if state override is not given, set buffer to the empty
178 // string, state to no scheme state, and start over (from the first code
179 // point in input).
180 else {
182 input_position = 0;
183 break;
184 }
185 input_position++;
186 break;
187 }
189 ada_log("NO_SCHEME ", helpers::substring(url_data, input_position));
190 // If base is null, or base has an opaque path and c is not U+0023 (#),
191 // validation error, return failure.
192 if (base_url == nullptr ||
193 (base_url->has_opaque_path && !fragment.has_value())) {
194 ada_log("NO_SCHEME validation error");
195 url.is_valid = false;
196 return url;
197 }
198 // Otherwise, if base has an opaque path and c is U+0023 (#),
199 // set url's scheme to base's scheme, url's path to base's path, url's
200 // query to base's query, and set state to fragment state.
201 else if (base_url->has_opaque_path && fragment.has_value() &&
202 input_position == input_size) {
203 ada_log("NO_SCHEME opaque base with fragment");
204 url.copy_scheme(*base_url);
205 url.has_opaque_path = base_url->has_opaque_path;
206
207 if constexpr (result_type_is_ada_url) {
208 url.path = base_url->path;
209 url.query = base_url->query;
210 } else {
211 url.update_base_pathname(base_url->get_pathname());
212 url.update_base_search(base_url->get_search());
213 }
214 url.update_unencoded_base_hash(*fragment);
215 return url;
216 }
217 // Otherwise, if base's scheme is not "file", set state to relative
218 // state and decrease pointer by 1.
219 else if (base_url->type != ada::scheme::type::FILE) {
220 ada_log("NO_SCHEME non-file relative path");
222 }
223 // Otherwise, set state to file state and decrease pointer by 1.
224 else {
225 ada_log("NO_SCHEME file base type");
227 }
228 break;
229 }
231 ada_log("AUTHORITY ", helpers::substring(url_data, input_position));
232 // most URLs have no @. Having no @ tells us that we don't have to worry
233 // about AUTHORITY. Of course, we could have @ and still not have to
234 // worry about AUTHORITY.
235 // TODO: Instead of just collecting a bool, collect the location of the
236 // '@' and do something useful with it.
237 // TODO: We could do various processing early on, using a single pass
238 // over the string to collect information about it, e.g., telling us
239 // whether there is a @ and if so, where (or how many).
240
241 // Check if url data contains an @.
242 if (url_data.find('@', input_position) == std::string_view::npos) {
244 break;
245 }
246 bool at_sign_seen{false};
247 bool password_token_seen{false};
253 do {
254 std::string_view view = url_data.substr(input_position);
255 // The delimiters are @, /, ? and \\.
256 size_t location =
257 url.is_special() ? helpers::find_authority_delimiter_special(view)
258 : helpers::find_authority_delimiter(view);
259 std::string_view authority_view = view.substr(0, location);
260 size_t end_of_authority = input_position + authority_view.size();
261 // If c is U+0040 (@), then:
262 if ((end_of_authority != input_size) &&
263 (url_data[end_of_authority] == '@')) {
264 // If atSignSeen is true, then prepend "%40" to buffer.
265 if (at_sign_seen) {
266 if (password_token_seen) {
267 if constexpr (result_type_is_ada_url) {
268 url.password += "%40";
269 } else {
270 url.append_base_password("%40");
271 }
272 } else {
273 if constexpr (result_type_is_ada_url) {
274 url.username += "%40";
275 } else {
276 url.append_base_username("%40");
277 }
278 }
279 }
280
281 at_sign_seen = true;
282
283 if (!password_token_seen) {
284 size_t password_token_location = authority_view.find(':');
285 password_token_seen =
286 password_token_location != std::string_view::npos;
287
288 if constexpr (store_values) {
289 if (!password_token_seen) {
290 if constexpr (result_type_is_ada_url) {
291 url.username += unicode::percent_encode(
292 authority_view,
294 } else {
295 url.append_base_username(unicode::percent_encode(
296 authority_view,
298 }
299 } else {
300 if constexpr (result_type_is_ada_url) {
301 url.username += unicode::percent_encode(
302 authority_view.substr(0, password_token_location),
304 url.password += unicode::percent_encode(
305 authority_view.substr(password_token_location + 1),
307 } else {
308 url.append_base_username(unicode::percent_encode(
309 authority_view.substr(0, password_token_location),
311 url.append_base_password(unicode::percent_encode(
312 authority_view.substr(password_token_location + 1),
314 }
315 }
316 }
317 } else if constexpr (store_values) {
318 if constexpr (result_type_is_ada_url) {
319 url.password += unicode::percent_encode(
321 } else {
322 url.append_base_password(unicode::percent_encode(
324 }
325 }
326 }
327 // Otherwise, if one of the following is true:
328 // - c is the EOF code point, U+002F (/), U+003F (?), or U+0023 (#)
329 // - url is special and c is U+005C (\‍)
330 else if (end_of_authority == input_size ||
331 url_data[end_of_authority] == '/' ||
332 url_data[end_of_authority] == '?' ||
333 (url.is_special() && url_data[end_of_authority] == '\\')) {
334 // If atSignSeen is true and authority_view is the empty string,
335 // validation error, return failure.
336 if (at_sign_seen && authority_view.empty()) {
337 url.is_valid = false;
338 return url;
339 }
341 break;
342 }
343 if (end_of_authority == input_size) {
344 if constexpr (store_values) {
345 if (fragment.has_value()) {
346 url.update_unencoded_base_hash(*fragment);
347 }
348 }
349 return url;
350 }
351 input_position = end_of_authority + 1;
352 } while (true);
353
354 break;
355 }
357 ada_log("SPECIAL_RELATIVE_OR_AUTHORITY ",
358 helpers::substring(url_data, input_position));
359
360 // If c is U+002F (/) and remaining starts with U+002F (/),
361 // then set state to special authority ignore slashes state and increase
362 // pointer by 1.
363 if (url_data.substr(input_position, 2) == "//") {
365 input_position += 2;
366 } else {
367 // Otherwise, validation error, set state to relative state and
368 // decrease pointer by 1.
370 }
371
372 break;
373 }
375 ada_log("PATH_OR_AUTHORITY ",
376 helpers::substring(url_data, input_position));
377
378 // If c is U+002F (/), then set state to authority state.
379 if ((input_position != input_size) &&
380 (url_data[input_position] == '/')) {
382 input_position++;
383 } else {
384 // Otherwise, set state to path state, and decrease pointer by 1.
386 }
387
388 break;
389 }
391 ada_log("RELATIVE_SCHEME ",
392 helpers::substring(url_data, input_position));
393
394 // Set url's scheme to base's scheme.
395 url.copy_scheme(*base_url);
396
397 // If c is U+002F (/), then set state to relative slash state.
398 if ((input_position != input_size) &&
399 (url_data[input_position] == '/')) {
400 ada_log(
401 "RELATIVE_SCHEME if c is U+002F (/), then set state to relative "
402 "slash state");
404 } else if (url.is_special() && (input_position != input_size) &&
405 (url_data[input_position] == '\\')) {
406 // Otherwise, if url is special and c is U+005C (\‍), validation error,
407 // set state to relative slash state.
408 ada_log(
409 "RELATIVE_SCHEME if url is special and c is U+005C, validation "
410 "error, set state to relative slash state");
412 } else {
413 ada_log("RELATIVE_SCHEME otherwise");
414 // Set url's username to base's username, url's password to base's
415 // password, url's host to base's host, url's port to base's port,
416 // url's path to a clone of base's path, and url's query to base's
417 // query.
418 if constexpr (result_type_is_ada_url) {
419 url.username = base_url->username;
420 url.password = base_url->password;
421 url.host = base_url->host;
422 url.port = base_url->port;
423 // cloning the base path includes cloning the has_opaque_path flag
424 url.has_opaque_path = base_url->has_opaque_path;
425 url.path = base_url->path;
426 url.query = base_url->query;
427 } else {
428 url.update_base_authority(base_url->get_href(),
429 base_url->get_components());
430 url.update_host_to_base_host(base_url->get_hostname());
431 url.update_base_port(base_url->retrieve_base_port());
432 // cloning the base path includes cloning the has_opaque_path flag
433 url.has_opaque_path = base_url->has_opaque_path;
434 url.update_base_pathname(base_url->get_pathname());
435 url.update_base_search(base_url->get_search());
436 }
437
438 url.has_opaque_path = base_url->has_opaque_path;
439
440 // If c is U+003F (?), then set url's query to the empty string, and
441 // state to query state.
442 if ((input_position != input_size) &&
443 (url_data[input_position] == '?')) {
445 }
446 // Otherwise, if c is not the EOF code point:
447 else if (input_position != input_size) {
448 // Set url's query to null.
449 url.clear_search();
450 if constexpr (result_type_is_ada_url) {
451 // Shorten url's path.
452 helpers::shorten_path(url.path, url.type);
453 } else {
454 std::string_view path = url.get_pathname();
455 if (helpers::shorten_path(path, url.type)) {
456 url.update_base_pathname(std::move(std::string(path)));
457 }
458 }
459 // Set state to path state and decrease pointer by 1.
461 break;
462 }
463 }
464 input_position++;
465 break;
466 }
468 ada_log("RELATIVE_SLASH ",
469 helpers::substring(url_data, input_position));
470
471 // If url is special and c is U+002F (/) or U+005C (\‍), then:
472 if (url.is_special() && (input_position != input_size) &&
473 (url_data[input_position] == '/' ||
474 url_data[input_position] == '\\')) {
475 // Set state to special authority ignore slashes state.
477 }
478 // Otherwise, if c is U+002F (/), then set state to authority state.
479 else if ((input_position != input_size) &&
480 (url_data[input_position] == '/')) {
482 }
483 // Otherwise, set
484 // - url's username to base's username,
485 // - url's password to base's password,
486 // - url's host to base's host,
487 // - url's port to base's port,
488 // - state to path state, and then, decrease pointer by 1.
489 else {
490 if constexpr (result_type_is_ada_url) {
491 url.username = base_url->username;
492 url.password = base_url->password;
493 url.host = base_url->host;
494 url.port = base_url->port;
495 } else {
496 url.update_base_authority(base_url->get_href(),
497 base_url->get_components());
498 url.update_host_to_base_host(base_url->get_hostname());
499 url.update_base_port(base_url->retrieve_base_port());
500 }
502 break;
503 }
504
505 input_position++;
506 break;
507 }
509 ada_log("SPECIAL_AUTHORITY_SLASHES ",
510 helpers::substring(url_data, input_position));
511
512 // If c is U+002F (/) and remaining starts with U+002F (/),
513 // then set state to special authority ignore slashes state and increase
514 // pointer by 1.
515 if (url_data.substr(input_position, 2) == "//") {
516 input_position += 2;
517 }
518
519 [[fallthrough]];
520 }
522 ada_log("SPECIAL_AUTHORITY_IGNORE_SLASHES ",
523 helpers::substring(url_data, input_position));
524
525 // If c is neither U+002F (/) nor U+005C (\‍), then set state to
526 // authority state and decrease pointer by 1.
527 while ((input_position != input_size) &&
528 ((url_data[input_position] == '/') ||
529 (url_data[input_position] == '\\'))) {
530 input_position++;
531 }
533
534 break;
535 }
536 case ada::state::QUERY: {
537 ada_log("QUERY ", helpers::substring(url_data, input_position));
538 if constexpr (store_values) {
539 // Let queryPercentEncodeSet be the special-query percent-encode set
540 // if url is special; otherwise the query percent-encode set.
541 const uint8_t* query_percent_encode_set =
545
546 // Percent-encode after encoding, with encoding, buffer, and
547 // queryPercentEncodeSet, and append the result to url's query.
548 url.update_base_search(url_data.substr(input_position),
549 query_percent_encode_set);
550 ada_log("QUERY update_base_search completed ");
551 if (fragment.has_value()) {
552 url.update_unencoded_base_hash(*fragment);
553 }
554 }
555 return url;
556 }
557 case ada::state::HOST: {
558 ada_log("HOST ", helpers::substring(url_data, input_position));
559
560 std::string_view host_view = url_data.substr(input_position);
561 auto [location, found_colon] =
562 helpers::get_host_delimiter_location(url.is_special(), host_view);
563 input_position = (location != std::string_view::npos)
564 ? input_position + location
565 : input_size;
566 // Otherwise, if c is U+003A (:) and insideBrackets is false, then:
567 // Note: the 'found_colon' value is true if and only if a colon was
568 // encountered while not inside brackets.
569 if (found_colon) {
570 // If buffer is the empty string, validation error, return failure.
571 // Let host be the result of host parsing buffer with url is not
572 // special.
573 ada_log("HOST parsing ", host_view);
574 if (!url.parse_host(host_view)) {
575 return url;
576 }
577 ada_log("HOST parsing results in ", url.get_hostname());
578 // Set url's host to host, buffer to the empty string, and state to
579 // port state.
581 input_position++;
582 }
583 // Otherwise, if one of the following is true:
584 // - c is the EOF code point, U+002F (/), U+003F (?), or U+0023 (#)
585 // - url is special and c is U+005C (\‍)
586 // The get_host_delimiter_location function either brings us to
587 // the colon outside of the bracket, or to one of those characters.
588 else {
589 // If url is special and host_view is the empty string, validation
590 // error, return failure.
591 if (host_view.empty() && url.is_special()) {
592 url.is_valid = false;
593 return url;
594 }
595 ada_log("HOST parsing ", host_view, " href=", url.get_href());
596 // Let host be the result of host parsing host_view with url is not
597 // special.
598 if (host_view.empty()) {
599 url.update_base_hostname("");
600 } else if (!url.parse_host(host_view)) {
601 return url;
602 }
603 ada_log("HOST parsing results in ", url.get_hostname(),
604 " href=", url.get_href());
605
606 // Set url's host to host, and state to path start state.
608 }
609
610 break;
611 }
613 ada_log("OPAQUE_PATH ", helpers::substring(url_data, input_position));
614 std::string_view view = url_data.substr(input_position);
615 // If c is U+003F (?), then set url's query to the empty string and
616 // state to query state.
617 size_t location = view.find('?');
618 if (location != std::string_view::npos) {
619 view.remove_suffix(view.size() - location);
621 input_position += location + 1;
622 } else {
623 input_position = input_size + 1;
624 }
625 url.has_opaque_path = true;
626 // This is a really unlikely scenario in real world. We should not seek
627 // to optimize it.
628 url.update_base_pathname(unicode::percent_encode(
630 break;
631 }
632 case ada::state::PORT: {
633 ada_log("PORT ", helpers::substring(url_data, input_position));
634 std::string_view port_view = url_data.substr(input_position);
635 input_position += url.parse_port(port_view, true);
636 if (!url.is_valid) {
637 return url;
638 }
640 [[fallthrough]];
641 }
643 ada_log("PATH_START ", helpers::substring(url_data, input_position));
644
645 // If url is special, then:
646 if (url.is_special()) {
647 // Set state to path state.
649
650 // Optimization: Avoiding going into PATH state improves the
651 // performance of urls ending with /.
652 if (input_position == input_size) {
653 if constexpr (store_values) {
654 url.update_base_pathname("/");
655 if (fragment.has_value()) {
656 url.update_unencoded_base_hash(*fragment);
657 }
658 }
659 return url;
660 }
661 // If c is neither U+002F (/) nor U+005C (\‍), then decrease pointer
662 // by 1. We know that (input_position == input_size) is impossible
663 // here, because of the previous if-check.
664 if ((url_data[input_position] != '/') &&
665 (url_data[input_position] != '\\')) {
666 break;
667 }
668 }
669 // Otherwise, if state override is not given and c is U+003F (?),
670 // set url's query to the empty string and state to query state.
671 else if ((input_position != input_size) &&
672 (url_data[input_position] == '?')) {
674 }
675 // Otherwise, if c is not the EOF code point:
676 else if (input_position != input_size) {
677 // Set state to path state.
679
680 // If c is not U+002F (/), then decrease pointer by 1.
681 if (url_data[input_position] != '/') {
682 break;
683 }
684 }
685
686 input_position++;
687 break;
688 }
689 case ada::state::PATH: {
690 ada_log("PATH ", helpers::substring(url_data, input_position));
691 std::string_view view = url_data.substr(input_position);
692
693 // Most of the time, we do not need percent encoding.
694 // Furthermore, we can immediately locate the '?'.
695 size_t locofquestionmark = view.find('?');
696 if (locofquestionmark != std::string_view::npos) {
698 view.remove_suffix(view.size() - locofquestionmark);
699 input_position += locofquestionmark + 1;
700 } else {
701 input_position = input_size + 1;
702 }
703 if constexpr (store_values) {
704 if constexpr (result_type_is_ada_url) {
705 helpers::parse_prepared_path(view, url.type, url.path);
706 } else {
707 url.consume_prepared_path(view);
708 ADA_ASSERT_TRUE(url.validate());
709 }
710 }
711 break;
712 }
714 ada_log("FILE_SLASH ", helpers::substring(url_data, input_position));
715
716 // If c is U+002F (/) or U+005C (\‍), then:
717 if ((input_position != input_size) &&
718 (url_data[input_position] == '/' ||
719 url_data[input_position] == '\\')) {
720 ada_log("FILE_SLASH c is U+002F or U+005C");
721 // Set state to file host state.
723 input_position++;
724 } else {
725 ada_log("FILE_SLASH otherwise");
726 // If base is non-null and base's scheme is "file", then:
727 // Note: it is unsafe to dereference base_url (e.g. base_url->type)
728 // unless you know that base_url != nullptr.
729 if (base_url != nullptr &&
730 base_url->type == ada::scheme::type::FILE) {
731 // Set url's host to base's host.
732 if constexpr (result_type_is_ada_url) {
733 url.host = base_url->host;
734 } else {
735 url.update_host_to_base_host(base_url->get_host());
736 }
737 // If the code point substring from pointer to the end of input does
738 // not start with a Windows drive letter and base's path[0] is a
739 // normalized Windows drive letter, then append base's path[0] to
740 // url's path.
741 if (!base_url->get_pathname().empty()) {
743 url_data.substr(input_position))) {
744 std::string_view first_base_url_path =
745 base_url->get_pathname().substr(1);
746 size_t loc = first_base_url_path.find('/');
747 if (loc != std::string_view::npos) {
748 helpers::resize(first_base_url_path, loc);
749 }
751 first_base_url_path)) {
752 if constexpr (result_type_is_ada_url) {
753 url.path += '/';
754 url.path += first_base_url_path;
755 } else {
756 url.append_base_pathname(
757 helpers::concat("/", first_base_url_path));
758 }
759 }
760 }
761 }
762 }
763
764 // Set state to path state, and decrease pointer by 1.
766 }
767
768 break;
769 }
771 ada_log("FILE_HOST ", helpers::substring(url_data, input_position));
772 std::string_view view = url_data.substr(input_position);
773
774 size_t location = view.find_first_of("/\\?");
775 std::string_view file_host_buffer(
776 view.data(),
777 (location != std::string_view::npos) ? location : view.size());
778
779 if (checkers::is_windows_drive_letter(file_host_buffer)) {
781 } else if (file_host_buffer.empty()) {
782 // Set url's host to the empty string.
783 if constexpr (result_type_is_ada_url) {
784 url.host = "";
785 } else {
786 url.update_base_hostname("");
787 }
788 // Set state to path start state.
790 } else {
791 size_t consumed_bytes = file_host_buffer.size();
792 input_position += consumed_bytes;
793 // Let host be the result of host parsing buffer with url is not
794 // special.
795 if (!url.parse_host(file_host_buffer)) {
796 return url;
797 }
798
799 if constexpr (result_type_is_ada_url) {
800 // If host is "localhost", then set host to the empty string.
801 if (url.host.has_value() && url.host.value() == "localhost") {
802 url.host = "";
803 }
804 } else {
805 if (url.get_hostname() == "localhost") {
806 url.update_base_hostname("");
807 }
808 }
809
810 // Set buffer to the empty string and state to path start state.
812 }
813
814 break;
815 }
816 case ada::state::FILE: {
817 ada_log("FILE ", helpers::substring(url_data, input_position));
818 std::string_view file_view = url_data.substr(input_position);
819
820 url.set_protocol_as_file();
821 if constexpr (result_type_is_ada_url) {
822 // Set url's host to the empty string.
823 url.host = "";
824 } else {
825 url.update_base_hostname("");
826 }
827 // If c is U+002F (/) or U+005C (\‍), then:
828 if (input_position != input_size &&
829 (url_data[input_position] == '/' ||
830 url_data[input_position] == '\\')) {
831 ada_log("FILE c is U+002F or U+005C");
832 // Set state to file slash state.
834 }
835 // Otherwise, if base is non-null and base's scheme is "file":
836 else if (base_url != nullptr &&
837 base_url->type == ada::scheme::type::FILE) {
838 // Set url's host to base's host, url's path to a clone of base's
839 // path, and url's query to base's query.
840 ada_log("FILE base non-null");
841 if constexpr (result_type_is_ada_url) {
842 url.host = base_url->host;
843 url.path = base_url->path;
844 url.query = base_url->query;
845 } else {
846 url.update_host_to_base_host(base_url->get_hostname());
847 url.update_base_pathname(base_url->get_pathname());
848 url.update_base_search(base_url->get_search());
849 }
850 url.has_opaque_path = base_url->has_opaque_path;
851
852 // If c is U+003F (?), then set url's query to the empty string and
853 // state to query state.
854 if (input_position != input_size && url_data[input_position] == '?') {
856 }
857 // Otherwise, if c is not the EOF code point:
858 else if (input_position != input_size) {
859 // Set url's query to null.
860 url.clear_search();
861 // If the code point substring from pointer to the end of input does
862 // not start with a Windows drive letter, then shorten url's path.
863 if (!checkers::is_windows_drive_letter(file_view)) {
864 if constexpr (result_type_is_ada_url) {
865 helpers::shorten_path(url.path, url.type);
866 } else {
867 std::string_view path = url.get_pathname();
868 if (helpers::shorten_path(path, url.type)) {
869 url.update_base_pathname(std::move(std::string(path)));
870 }
871 }
872 }
873 // Otherwise:
874 else {
875 // Set url's path to an empty list.
876 url.clear_pathname();
877 url.has_opaque_path = true;
878 }
879
880 // Set state to path state and decrease pointer by 1.
882 break;
883 }
884 }
885 // Otherwise, set state to path state, and decrease pointer by 1.
886 else {
887 ada_log("FILE go to path");
889 break;
890 }
891
892 input_position++;
893 break;
894 }
895 default:
897 }
898 }
899 if constexpr (store_values) {
900 if (fragment.has_value()) {
901 url.update_unencoded_base_hash(*fragment);
902 }
903 }
904 return url;
905}
906
// Explicit instantiation of parse_url_impl for ada::url, so the template
// defined above in this file is instantiated here for that result type.
// NOTE(review): store_values is not spelled out in this instantiation;
// presumably it comes from a default template argument on the declaration
// in ada/parser.h -- confirm.
907template url parse_url_impl(std::string_view user_input,
908 const url* base_url = nullptr);
910 std::string_view user_input, const url_aggregator* base_url = nullptr);
911
// Thin wrapper around parse_url_impl that fixes store_values to true.
//
// user_input: the URL string to parse.
// base_url:   base to resolve against; may be nullptr (parse_url_impl
//             checks for nullptr before dereferencing it).
// Returns the result_type produced by parse_url_impl, by value. Callers
// should inspect its is_valid flag: parse_url_impl reports failure by
// returning an object with is_valid == false.
912template <class result_type>
913result_type parse_url(std::string_view user_input,
914 const result_type* base_url) {
915 return parse_url_impl<result_type, true>(user_input, base_url);
916}
917
// Explicit instantiation of parse_url for ada::url, so the template defined
// just above is instantiated in this file for that result type.
// NOTE(review): presumably this is what lets other translation units link
// against parse_url<url> without seeing its definition -- confirm.
918template url parse_url<url>(std::string_view user_input,
919 const url* base_url = nullptr);
921 std::string_view user_input, const url_aggregator* base_url = nullptr);
922} // namespace ada::parser
Includes all definitions for Ada.
Definitions of the character sets used by unicode functions.
Common definitions for cross-platform compiler support.
#define ADA_ASSERT_TRUE(COND)
constexpr uint8_t QUERY_PERCENT_ENCODE[32]
constexpr uint8_t SPECIAL_QUERY_PERCENT_ENCODE[32]
constexpr uint8_t C0_CONTROL_PERCENT_ENCODE[32]
constexpr uint8_t USERINFO_PERCENT_ENCODE[32]
constexpr bool is_normalized_windows_drive_letter(std::string_view input) noexcept
constexpr bool is_windows_drive_letter(std::string_view input) noexcept
constexpr bool is_alpha(char x) noexcept
Includes the definitions for supported parsers.
template url parse_url< url >(std::string_view user_input, const url *base_url)
result_type parse_url(std::string_view user_input, const result_type *base_url=nullptr)
Definition parser.cpp:913
template url_aggregator parse_url< url_aggregator >(std::string_view user_input, const url_aggregator *base_url)
result_type parse_url_impl(std::string_view user_input, const result_type *base_url=nullptr)
Definition parser.cpp:15
ada_warn_unused std::string to_string(encoding_type type)
state
Definition state.h:17
@ SPECIAL_RELATIVE_OR_AUTHORITY
@ SPECIAL_AUTHORITY_SLASHES
@ SPECIAL_AUTHORITY_IGNORE_SLASHES
void unreachable()
Definitions for the parser.
Lightweight URL struct.
ada_really_inline constexpr bool is_special() const noexcept
bool is_valid
Definition url_base.h:50
bool has_opaque_path
Definition url_base.h:55
Generic URL struct reliant on std::string instantiation.
Definition url.h:38
ada_really_inline std::string get_href() const noexcept
Definition url-inl.h:187
constexpr std::string_view get_pathname() const noexcept
Definition url-inl.h:46
std::string get_hostname() const noexcept
Definition url.cpp:637
std::string get_protocol() const noexcept
Definition url.cpp:615
Definitions for all unicode specific functions.
Definitions for the URL.