Ada 2.8.0
Fast spec-compliant URL parser
Loading...
Searching...
No Matches
parser.cpp
Go to the documentation of this file.
1#include "ada.h"
2#include "ada/common_defs.h"
4#include "ada/unicode.h"
5#include "ada/url-inl.h"
6#include "ada/log.h"
7#include "ada/parser.h"
8
9#include <limits>
10
11namespace ada::parser {
12
/**
 * Parses `user_input` into a `result_type`, optionally relative to `base_url`.
 *
 * `result_type` must be ada::url or ada::url_aggregator (enforced by the
 * static_assert below); `if constexpr` selects the type-specific code paths.
 *
 * The input is prepared in three steps (tabs/newlines removed when present,
 * leading/trailing characters trimmed by helpers::trim_c0_whitespace, fragment
 * split off by helpers::prune_hash) and the remainder, `url_data`, is consumed
 * by a loop that switches on `state` and only ever moves `input_position`
 * forward or resets it to 0.
 *
 * @param user_input the string to parse.
 * @param base_url optional base URL (may be nullptr). When it is non-null and
 *        its `is_valid` is false, the result is returned early with
 *        `is_valid` false.
 * @return the resulting URL object. On the failure paths visible here the
 *         function returns `url` after `is_valid` has been set to false,
 *         either directly or (presumably) inside the helper that reported the
 *         failure; callers should check `is_valid`.
 *
 * NOTE(review): every line of this listing starts with the line number of the
 * documentation page it was taken from, and those numbers skip (for example
 * 107 -> 109 and 113 -> 115). The declaration of `state`, several `case`
 * labels and what appear to be `state = ...` assignments are therefore not
 * visible here. Confirm against the real source file before relying on this
 * text.
 */
13template <class result_type>
14result_type parse_url(std::string_view user_input,
15 const result_type* base_url) {
16 // We can specialize the implementation per type.
17 // Important: result_type_is_ada_url is evaluated at *compile time*. This
18 // means that doing if constexpr(result_type_is_ada_url) { something } else {
19 // something else } is free (at runtime). This means that ada::url_aggregator
20 // and ada::url **do not have to support the exact same API**.
21 constexpr bool result_type_is_ada_url =
22 std::is_same<ada::url, result_type>::value;
23 constexpr bool result_type_is_ada_url_aggregator =
24 std::is_same<ada::url_aggregator, result_type>::value;
25 static_assert(result_type_is_ada_url ||
26 result_type_is_ada_url_aggregator); // We don't support
27 // anything else for now.
28
29 ada_log("ada::parser::parse_url('", user_input, "' [", user_input.size(),
30 " bytes],", (base_url != nullptr ? base_url->to_string() : "null"),
31 ")");
32
34 result_type url{};
35
36 // We refuse to parse URL strings that exceed 4GB. Such strings are almost
37 // surely the result of a bug or are otherwise a security concern.
38 if (user_input.size() > std::numeric_limits<uint32_t>::max()) {
39 url.is_valid = false;
40 }
41 // Going forward, user_input.size() is in [0,
42 // std::numeric_limits<uint32_t>::max] (the guard above only rejects sizes
43 // strictly greater than the maximum). If we are provided with an invalid
// base, or the input was rejected above, we must return.
44 if (base_url != nullptr) {
45 url.is_valid &= base_url->is_valid;
46 }
47 if (!url.is_valid) {
48 return url;
49 }
50 if constexpr (result_type_is_ada_url_aggregator) {
51 // Most of the time, we just need user_input.size().
52 // In some instances, we may need a bit more.
54 // This is *very* important. This line should *not* be removed
55 // hastily. There are principled reasons why reserve is important
56 // for performance. If you have a benchmark with small inputs,
57 // it may not matter, but in other instances, it could.
59 // This rounds up to the next power of two.
60 // We know that user_input.size() is in [0,
61 // std::numeric_limits<uint32_t>::max].
62 uint32_t reserve_capacity =
63 (0xFFFFFFFF >>
64 helpers::leading_zeroes(uint32_t(1 | user_input.size()))) +
65 1;
66 url.reserve(reserve_capacity);
67 //
68 //
69 //
70 }
// tmp_buffer owns the cleaned copy only when the input actually contains
// tabs or newlines; otherwise internal_input views user_input directly.
71 std::string tmp_buffer;
72 std::string_view internal_input;
73 if (unicode::has_tabs_or_newline(user_input)) {
74 tmp_buffer = user_input;
75 // Optimization opportunity: Instead of copying and then pruning, we could
76 // just directly build the string from user_input.
77 helpers::remove_ascii_tab_or_newline(tmp_buffer);
78 internal_input = tmp_buffer;
79 } else {
80 internal_input = user_input;
81 }
82
83 // Leading and trailing control characters are uncommon and easy to deal with
84 // (no performance concern).
85 std::string_view url_data = internal_input;
86 helpers::trim_c0_whitespace(url_data);
87
88 // Optimization opportunity. Most websites do not have fragment.
89 std::optional<std::string_view> fragment = helpers::prune_hash(url_data);
90 // We add it last so that an implementation like ada::url_aggregator
91 // can append it last to its internal buffer, thus improving performance.
92
93 // Here url_data no longer has its fragment.
94 // We are going to access the data from url_data (it is immutable).
95 // At any given time, we are pointing at byte 'input_position' in url_data.
96 // The input_position variable should range from 0 to input_size.
97 // It is illegal to access url_data at input_size.
98 size_t input_position = 0;
99 const size_t input_size = url_data.size();
100 // Keep running the following state machine by switching on state.
101 // If after a run pointer points to the EOF code point, go to the next step.
102 // Otherwise, increase pointer by 1 and continue with the state machine.
103 // We never decrement input_position.
104 while (input_position <= input_size) {
105 ada_log("In parsing at ", input_position, " out of ", input_size,
106 " in state ", ada::to_string(state));
107 switch (state) {
109 ada_log("SCHEME_START ", helpers::substring(url_data, input_position));
110 // If c is an ASCII alpha, append c, lowercased, to buffer, and set
111 // state to scheme state.
112 if ((input_position != input_size) &&
113 checkers::is_alpha(url_data[input_position])) {
115 input_position++;
116 } else {
117 // Otherwise, if state override is not given, set state to no scheme
118 // state and decrease pointer by 1.
120 }
121 break;
122 }
123 case ada::state::SCHEME: {
124 ada_log("SCHEME ", helpers::substring(url_data, input_position));
125 // If c is an ASCII alphanumeric, U+002B (+), U+002D (-), or U+002E (.),
126 // append c, lowercased, to buffer.
127 while ((input_position != input_size) &&
128 (ada::unicode::is_alnum_plus(url_data[input_position]))) {
129 input_position++;
130 }
131 // Otherwise, if c is U+003A (:), then:
132 if ((input_position != input_size) &&
133 (url_data[input_position] == ':')) {
134 ada_log("SCHEME the scheme should be ",
135 url_data.substr(0, input_position));
136 if constexpr (result_type_is_ada_url) {
137 if (!url.parse_scheme(url_data.substr(0, input_position))) {
138 return url;
139 }
140 } else {
141 // we pass the colon along instead of painfully adding it back.
142 if (!url.parse_scheme_with_colon(
143 url_data.substr(0, input_position + 1))) {
144 return url;
145 }
146 }
147 ada_log("SCHEME the scheme is ", url.get_protocol());
148
149 // If url's scheme is "file", then:
150 if (url.type == ada::scheme::type::FILE) {
151 // Set state to file state.
153 }
154 // Otherwise, if url is special, base is non-null, and base's scheme
155 // is url's scheme: Note: Doing base_url->scheme is unsafe if base_url
156 // != nullptr is false.
157 else if (url.is_special() && base_url != nullptr &&
158 base_url->type == url.type) {
159 // Set state to special relative or authority state.
161 }
162 // Otherwise, if url is special, set state to special authority
163 // slashes state.
164 else if (url.is_special()) {
166 }
167 // Otherwise, if remaining starts with an U+002F (/), set state to
168 // path or authority state and increase pointer by 1.
169 else if (input_position + 1 < input_size &&
170 url_data[input_position + 1] == '/') {
172 input_position++;
173 }
174 // Otherwise, set url's path to the empty string and set state to
175 // opaque path state.
176 else {
178 }
179 }
180 // Otherwise, if state override is not given, set buffer to the empty
181 // string, state to no scheme state, and start over (from the first code
182 // point in input).
183 else {
185 input_position = 0;
186 break;
187 }
188 input_position++;
189 break;
190 }
192 ada_log("NO_SCHEME ", helpers::substring(url_data, input_position));
193 // If base is null, or base has an opaque path and c is not U+0023 (#),
194 // validation error, return failure.
195 if (base_url == nullptr ||
196 (base_url->has_opaque_path && !fragment.has_value())) {
197 ada_log("NO_SCHEME validation error");
198 url.is_valid = false;
199 return url;
200 }
201 // Otherwise, if base has an opaque path and c is U+0023 (#),
202 // set url's scheme to base's scheme, url's path to base's path, url's
203 // query to base's query, and set state to fragment state.
204 else if (base_url->has_opaque_path && fragment.has_value() &&
205 input_position == input_size) {
206 ada_log("NO_SCHEME opaque base with fragment");
207 url.copy_scheme(*base_url);
208 url.has_opaque_path = base_url->has_opaque_path;
209
210 if constexpr (result_type_is_ada_url) {
211 url.path = base_url->path;
212 url.query = base_url->query;
213 } else {
214 url.update_base_pathname(base_url->get_pathname());
215 url.update_base_search(base_url->get_search());
216 }
217 url.update_unencoded_base_hash(*fragment);
218 return url;
219 }
220 // Otherwise, if base's scheme is not "file", set state to relative
221 // state and decrease pointer by 1.
222 else if (base_url->type != ada::scheme::type::FILE) {
223 ada_log("NO_SCHEME non-file relative path");
225 }
226 // Otherwise, set state to file state and decrease pointer by 1.
227 else {
228 ada_log("NO_SCHEME file base type");
230 }
231 break;
232 }
234 ada_log("AUTHORITY ", helpers::substring(url_data, input_position));
235 // most URLs have no @. Having no @ tells us that we don't have to worry
236 // about AUTHORITY. Of course, we could have @ and still not have to
237 // worry about AUTHORITY.
238 // TODO: Instead of just collecting a bool, collect the location of the
239 // '@' and do something useful with it.
240 // TODO: We could do various processing early on, using a single pass
241 // over the string to collect information about it, e.g., telling us
242 // whether there is a @ and if so, where (or how many).
// NOTE(review): despite its name, this flag records whether an '@' (at
// sign, not an ampersand) occurs at or after input_position.
243 const bool contains_ampersand =
244 (url_data.find('@', input_position) != std::string_view::npos);
245
246 if (!contains_ampersand) {
248 break;
249 }
250 bool at_sign_seen{false};
251 bool password_token_seen{false};
257 do {
258 std::string_view view = helpers::substring(url_data, input_position);
259 // The delimiters are @, /, ? \\.
260 size_t location =
261 url.is_special() ? helpers::find_authority_delimiter_special(view)
262 : helpers::find_authority_delimiter(view);
263 std::string_view authority_view(view.data(), location);
264 size_t end_of_authority = input_position + authority_view.size();
265 // If c is U+0040 (@), then:
266 if ((end_of_authority != input_size) &&
267 (url_data[end_of_authority] == '@')) {
268 // If atSignSeen is true, then prepend "%40" to buffer.
269 if (at_sign_seen) {
270 if (password_token_seen) {
271 if constexpr (result_type_is_ada_url) {
272 url.password += "%40";
273 } else {
274 url.append_base_password("%40");
275 }
276 } else {
277 if constexpr (result_type_is_ada_url) {
278 url.username += "%40";
279 } else {
280 url.append_base_username("%40");
281 }
282 }
283 }
284
285 at_sign_seen = true;
286
287 if (!password_token_seen) {
288 size_t password_token_location = authority_view.find(':');
289 password_token_seen =
290 password_token_location != std::string_view::npos;
291
292 if (!password_token_seen) {
293 if constexpr (result_type_is_ada_url) {
294 url.username += unicode::percent_encode(
296 } else {
297 url.append_base_username(unicode::percent_encode(
299 }
300 } else {
301 if constexpr (result_type_is_ada_url) {
302 url.username += unicode::percent_encode(
303 authority_view.substr(0, password_token_location),
305 url.password += unicode::percent_encode(
306 authority_view.substr(password_token_location + 1),
308 } else {
309 url.append_base_username(unicode::percent_encode(
310 authority_view.substr(0, password_token_location),
312 url.append_base_password(unicode::percent_encode(
313 authority_view.substr(password_token_location + 1),
315 }
316 }
317 } else {
318 if constexpr (result_type_is_ada_url) {
319 url.password += unicode::percent_encode(
321 } else {
322 url.append_base_password(unicode::percent_encode(
324 }
325 }
326 }
327 // Otherwise, if one of the following is true:
328 // - c is the EOF code point, U+002F (/), U+003F (?), or U+0023 (#)
329 // - url is special and c is U+005C (\)
330 else if (end_of_authority == input_size ||
331 url_data[end_of_authority] == '/' ||
332 url_data[end_of_authority] == '?' ||
333 (url.is_special() && url_data[end_of_authority] == '\\')) {
334 // If atSignSeen is true and authority_view is the empty string,
335 // validation error, return failure.
336 if (at_sign_seen && authority_view.empty()) {
337 url.is_valid = false;
338 return url;
339 }
341 break;
342 }
343 if (end_of_authority == input_size) {
344 if (fragment.has_value()) {
345 url.update_unencoded_base_hash(*fragment);
346 }
347 return url;
348 }
349 input_position = end_of_authority + 1;
350 } while (true);
351
352 break;
353 }
355 ada_log("SPECIAL_RELATIVE_OR_AUTHORITY ",
356 helpers::substring(url_data, input_position));
357
358 // If c is U+002F (/) and remaining starts with U+002F (/),
359 // then set state to special authority ignore slashes state and increase
360 // pointer by 1.
361 std::string_view view = helpers::substring(url_data, input_position);
362 if (ada::checkers::begins_with(view, "//")) {
364 input_position += 2;
365 } else {
366 // Otherwise, validation error, set state to relative state and
367 // decrease pointer by 1.
369 }
370
371 break;
372 }
374 ada_log("PATH_OR_AUTHORITY ",
375 helpers::substring(url_data, input_position));
376
377 // If c is U+002F (/), then set state to authority state.
378 if ((input_position != input_size) &&
379 (url_data[input_position] == '/')) {
381 input_position++;
382 } else {
383 // Otherwise, set state to path state, and decrease pointer by 1.
385 }
386
387 break;
388 }
390 ada_log("RELATIVE_SCHEME ",
391 helpers::substring(url_data, input_position));
392
393 // Set url's scheme to base's scheme.
394 url.copy_scheme(*base_url);
395
396 // If c is U+002F (/), then set state to relative slash state.
397 if ((input_position != input_size) &&
398 (url_data[input_position] == '/')) {
399 ada_log(
400 "RELATIVE_SCHEME if c is U+002F (/), then set state to relative "
401 "slash state");
403 } else if (url.is_special() && (input_position != input_size) &&
404 (url_data[input_position] == '\\')) {
405 // Otherwise, if url is special and c is U+005C (\), validation error,
406 // set state to relative slash state.
407 ada_log(
408 "RELATIVE_SCHEME if url is special and c is U+005C, validation "
409 "error, set state to relative slash state");
411 } else {
412 ada_log("RELATIVE_SCHEME otherwise");
413 // Set url's username to base's username, url's password to base's
414 // password, url's host to base's host, url's port to base's port,
415 // url's path to a clone of base's path, and url's query to base's
416 // query.
417 if constexpr (result_type_is_ada_url) {
418 url.username = base_url->username;
419 url.password = base_url->password;
420 url.host = base_url->host;
421 url.port = base_url->port;
422 // cloning the base path includes cloning the has_opaque_path flag
423 url.has_opaque_path = base_url->has_opaque_path;
424 url.path = base_url->path;
425 url.query = base_url->query;
426 } else {
427 url.update_base_authority(base_url->get_href(),
428 base_url->get_components());
429 // TODO: Get rid of set_hostname and replace it with
430 // update_base_hostname
431 url.set_hostname(base_url->get_hostname());
432 url.update_base_port(base_url->retrieve_base_port());
433 // cloning the base path includes cloning the has_opaque_path flag
434 url.has_opaque_path = base_url->has_opaque_path;
435 url.update_base_pathname(base_url->get_pathname());
436 url.update_base_search(base_url->get_search());
437 }
438
// NOTE(review): both branches above already copy has_opaque_path from
// the base; this assignment repeats it.
439 url.has_opaque_path = base_url->has_opaque_path;
440
441 // If c is U+003F (?), then set url's query to the empty string, and
442 // state to query state.
443 if ((input_position != input_size) &&
444 (url_data[input_position] == '?')) {
446 }
447 // Otherwise, if c is not the EOF code point:
448 else if (input_position != input_size) {
449 // Set url's query to null.
450 url.clear_search();
451 if constexpr (result_type_is_ada_url) {
452 // Shorten url's path.
453 helpers::shorten_path(url.path, url.type);
454 } else {
455 std::string_view path = url.get_pathname();
456 if (helpers::shorten_path(path, url.type)) {
457 url.update_base_pathname(std::string(path));
458 }
459 }
460 // Set state to path state and decrease pointer by 1.
462 break;
463 }
464 }
465 input_position++;
466 break;
467 }
469 ada_log("RELATIVE_SLASH ",
470 helpers::substring(url_data, input_position));
471
472 // If url is special and c is U+002F (/) or U+005C (\), then:
473 if (url.is_special() && (input_position != input_size) &&
474 (url_data[input_position] == '/' ||
475 url_data[input_position] == '\\')) {
476 // Set state to special authority ignore slashes state.
478 }
479 // Otherwise, if c is U+002F (/), then set state to authority state.
480 else if ((input_position != input_size) &&
481 (url_data[input_position] == '/')) {
483 }
484 // Otherwise, set
485 // - url's username to base's username,
486 // - url's password to base's password,
487 // - url's host to base's host,
488 // - url's port to base's port,
489 // - state to path state, and then, decrease pointer by 1.
490 else {
491 if constexpr (result_type_is_ada_url) {
492 url.username = base_url->username;
493 url.password = base_url->password;
494 url.host = base_url->host;
495 url.port = base_url->port;
496 } else {
497 url.update_base_authority(base_url->get_href(),
498 base_url->get_components());
499 // TODO: Get rid of set_hostname and replace it with
500 // update_base_hostname
501 url.set_hostname(base_url->get_hostname());
502 url.update_base_port(base_url->retrieve_base_port());
503 }
505 break;
506 }
507
508 input_position++;
509 break;
510 }
512 ada_log("SPECIAL_AUTHORITY_SLASHES ",
513 helpers::substring(url_data, input_position));
514
515 // If c is U+002F (/) and remaining starts with U+002F (/),
516 // then set state to special authority ignore slashes state and increase
517 // pointer by 1.
518 std::string_view view = helpers::substring(url_data, input_position);
519 if (ada::checkers::begins_with(view, "//")) {
520 input_position += 2;
521 }
522
523 [[fallthrough]];
524 }
526 ada_log("SPECIAL_AUTHORITY_IGNORE_SLASHES ",
527 helpers::substring(url_data, input_position));
528
529 // If c is neither U+002F (/) nor U+005C (\), then set state to
530 // authority state and decrease pointer by 1.
531 while ((input_position != input_size) &&
532 ((url_data[input_position] == '/') ||
533 (url_data[input_position] == '\\'))) {
534 input_position++;
535 }
537
538 break;
539 }
540 case ada::state::QUERY: {
541 ada_log("QUERY ", helpers::substring(url_data, input_position));
542 // Let queryPercentEncodeSet be the special-query percent-encode set if
543 // url is special; otherwise the query percent-encode set.
544 const uint8_t* query_percent_encode_set =
547
548 // Percent-encode after encoding, with encoding, buffer, and
549 // queryPercentEncodeSet, and append the result to url's query.
550 url.update_base_search(helpers::substring(url_data, input_position),
551 query_percent_encode_set);
552 ada_log("QUERY update_base_search completed ");
553 if (fragment.has_value()) {
554 url.update_unencoded_base_hash(*fragment);
555 }
556 return url;
557 }
558 case ada::state::HOST: {
559 ada_log("HOST ", helpers::substring(url_data, input_position));
560
561 std::string_view host_view =
562 helpers::substring(url_data, input_position);
563 auto [location, found_colon] =
564 helpers::get_host_delimiter_location(url.is_special(), host_view);
565 input_position = (location != std::string_view::npos)
566 ? input_position + location
567 : input_size;
568 // Otherwise, if c is U+003A (:) and insideBrackets is false, then:
569 // Note: the 'found_colon' value is true if and only if a colon was
570 // encountered while not inside brackets.
571 if (found_colon) {
572 // If buffer is the empty string, validation error, return failure.
573 // Let host be the result of host parsing buffer with url is not
574 // special.
575 ada_log("HOST parsing ", host_view);
576 if (!url.parse_host(host_view)) {
577 return url;
578 }
579 ada_log("HOST parsing results in ", url.get_hostname());
580 // Set url's host to host, buffer to the empty string, and state to
581 // port state.
583 input_position++;
584 }
585 // Otherwise, if one of the following is true:
586 // - c is the EOF code point, U+002F (/), U+003F (?), or U+0023 (#)
587 // - url is special and c is U+005C (\)
588 // The get_host_delimiter_location function either brings us to
589 // the colon outside of the bracket, or to one of those characters.
590 else {
591 // If url is special and host_view is the empty string, validation
592 // error, return failure.
593 if (url.is_special() && host_view.empty()) {
594 url.is_valid = false;
595 return url;
596 }
597 ada_log("HOST parsing ", host_view, " href=", url.get_href());
598 // Let host be the result of host parsing host_view with url is not
599 // special.
600 if (host_view.empty()) {
601 url.update_base_hostname("");
602 } else if (!url.parse_host(host_view)) {
603 return url;
604 }
605 ada_log("HOST parsing results in ", url.get_hostname(),
606 " href=", url.get_href());
607
608 // Set url's host to host, and state to path start state.
610 }
611
612 break;
613 }
615 ada_log("OPAQUE_PATH ", helpers::substring(url_data, input_position));
616 std::string_view view = helpers::substring(url_data, input_position);
617 // If c is U+003F (?), then set url's query to the empty string and
618 // state to query state.
619 size_t location = view.find('?');
620 if (location != std::string_view::npos) {
621 view.remove_suffix(view.size() - location);
623 input_position += location + 1;
624 } else {
// No '?': move past input_size so the enclosing while loop terminates.
625 input_position = input_size + 1;
626 }
627 url.has_opaque_path = true;
628 // This is a really unlikely scenario in real world. We should not seek
629 // to optimize it.
630 url.update_base_pathname(unicode::percent_encode(
632 break;
633 }
634 case ada::state::PORT: {
635 ada_log("PORT ", helpers::substring(url_data, input_position));
636 std::string_view port_view =
637 helpers::substring(url_data, input_position);
638 size_t consumed_bytes = url.parse_port(port_view, true);
639 input_position += consumed_bytes;
640 if (!url.is_valid) {
641 return url;
642 }
644 [[fallthrough]];
645 }
647 ada_log("PATH_START ", helpers::substring(url_data, input_position));
648
649 // If url is special, then:
650 if (url.is_special()) {
651 // Set state to path state.
653
654 // Optimization: Avoiding going into PATH state improves the
655 // performance of urls ending with /.
656 if (input_position == input_size) {
657 url.update_base_pathname("/");
658 if (fragment.has_value()) {
659 url.update_unencoded_base_hash(*fragment);
660 }
661 return url;
662 }
663 // If c is neither U+002F (/) nor U+005C (\), then decrease pointer
664 // by 1. We know that (input_position == input_size) is impossible
665 // here, because of the previous if-check.
666 if ((url_data[input_position] != '/') &&
667 (url_data[input_position] != '\\')) {
668 break;
669 }
670 }
671 // Otherwise, if state override is not given and c is U+003F (?),
672 // set url's query to the empty string and state to query state.
673 else if ((input_position != input_size) &&
674 (url_data[input_position] == '?')) {
676 }
677 // Otherwise, if c is not the EOF code point:
678 else if (input_position != input_size) {
679 // Set state to path state.
681
682 // If c is not U+002F (/), then decrease pointer by 1.
683 if (url_data[input_position] != '/') {
684 break;
685 }
686 }
687
688 input_position++;
689 break;
690 }
691 case ada::state::PATH: {
692 std::string_view view = helpers::substring(url_data, input_position);
693 ada_log("PATH ", helpers::substring(url_data, input_position));
694
695 // Most of the time, we do not need percent encoding.
696 // Furthermore, we can immediately locate the '?'.
697 size_t locofquestionmark = view.find('?');
698 if (locofquestionmark != std::string_view::npos) {
700 view.remove_suffix(view.size() - locofquestionmark);
701 input_position += locofquestionmark + 1;
702 } else {
// No '?': move past input_size so the enclosing while loop terminates.
703 input_position = input_size + 1;
704 }
705 if constexpr (result_type_is_ada_url) {
706 helpers::parse_prepared_path(view, url.type, url.path);
707 } else {
708 url.consume_prepared_path(view);
709 ADA_ASSERT_TRUE(url.validate());
710 }
711 break;
712 }
714 ada_log("FILE_SLASH ", helpers::substring(url_data, input_position));
715
716 // If c is U+002F (/) or U+005C (\), then:
717 if ((input_position != input_size) &&
718 (url_data[input_position] == '/' ||
719 url_data[input_position] == '\\')) {
720 ada_log("FILE_SLASH c is U+002F or U+005C");
721 // Set state to file host state.
723 input_position++;
724 } else {
725 ada_log("FILE_SLASH otherwise");
726 // If base is non-null and base's scheme is "file", then:
727 // Note: it is unsafe to dereference base_url unless you know that
728 // base_url != nullptr.
729 if (base_url != nullptr &&
730 base_url->type == ada::scheme::type::FILE) {
731 // Set url's host to base's host.
732 if constexpr (result_type_is_ada_url) {
733 url.host = base_url->host;
734 } else {
735 // TODO: Optimization opportunity.
736 url.set_host(base_url->get_host());
737 }
738 // If the code point substring from pointer to the end of input does
739 // not start with a Windows drive letter and base's path[0] is a
740 // normalized Windows drive letter, then append base's path[0] to
741 // url's path.
742 if (!base_url->get_pathname().empty()) {
744 helpers::substring(url_data, input_position))) {
745 std::string_view first_base_url_path =
746 base_url->get_pathname().substr(1);
747 size_t loc = first_base_url_path.find('/');
748 if (loc != std::string_view::npos) {
749 helpers::resize(first_base_url_path, loc);
750 }
752 first_base_url_path)) {
753 if constexpr (result_type_is_ada_url) {
754 url.path += '/';
755 url.path += first_base_url_path;
756 } else {
757 url.append_base_pathname(
758 helpers::concat("/", first_base_url_path));
759 }
760 }
761 }
762 }
763 }
764
765 // Set state to path state, and decrease pointer by 1.
767 }
768
769 break;
770 }
772 std::string_view view = helpers::substring(url_data, input_position);
773 ada_log("FILE_HOST ", helpers::substring(url_data, input_position));
774
775 size_t location = view.find_first_of("/\\?");
776 std::string_view file_host_buffer(
777 view.data(),
778 (location != std::string_view::npos) ? location : view.size());
779
780 if (checkers::is_windows_drive_letter(file_host_buffer)) {
782 } else if (file_host_buffer.empty()) {
783 // Set url's host to the empty string.
784 if constexpr (result_type_is_ada_url) {
785 url.host = "";
786 } else {
787 url.update_base_hostname("");
788 }
789 // Set state to path start state.
791 } else {
792 size_t consumed_bytes = file_host_buffer.size();
793 input_position += consumed_bytes;
794 // Let host be the result of host parsing buffer with url is not
795 // special.
796 if (!url.parse_host(file_host_buffer)) {
797 return url;
798 }
799
800 if constexpr (result_type_is_ada_url) {
801 // If host is "localhost", then set host to the empty string.
802 if (url.host.has_value() && url.host.value() == "localhost") {
803 url.host = "";
804 }
805 } else {
806 if (url.get_hostname() == "localhost") {
807 url.update_base_hostname("");
808 }
809 }
810
811 // Set buffer to the empty string and state to path start state.
813 }
814
815 break;
816 }
817 case ada::state::FILE: {
818 ada_log("FILE ", helpers::substring(url_data, input_position));
819 std::string_view file_view =
820 helpers::substring(url_data, input_position);
821
822 url.set_protocol_as_file();
823 if constexpr (result_type_is_ada_url) {
824 // Set url's host to the empty string.
825 url.host = "";
826 } else {
827 url.update_base_hostname("");
828 }
829 // If c is U+002F (/) or U+005C (\), then:
830 if (input_position != input_size &&
831 (url_data[input_position] == '/' ||
832 url_data[input_position] == '\\')) {
833 ada_log("FILE c is U+002F or U+005C");
834 // Set state to file slash state.
836 }
837 // Otherwise, if base is non-null and base's scheme is "file":
838 else if (base_url != nullptr &&
839 base_url->type == ada::scheme::type::FILE) {
840 // Set url's host to base's host, url's path to a clone of base's
841 // path, and url's query to base's query.
842 ada_log("FILE base non-null");
843 if constexpr (result_type_is_ada_url) {
844 url.host = base_url->host;
845 url.path = base_url->path;
846 url.query = base_url->query;
847 } else {
848 // TODO: Get rid of set_hostname and replace it with
849 // update_base_hostname
850 url.set_hostname(base_url->get_hostname());
851 url.update_base_pathname(base_url->get_pathname());
852 url.update_base_search(base_url->get_search());
853 }
854 url.has_opaque_path = base_url->has_opaque_path;
855
856 // If c is U+003F (?), then set url's query to the empty string and
857 // state to query state.
858 if (input_position != input_size && url_data[input_position] == '?') {
860 }
861 // Otherwise, if c is not the EOF code point:
862 else if (input_position != input_size) {
863 // Set url's query to null.
864 url.clear_search();
865 // If the code point substring from pointer to the end of input does
866 // not start with a Windows drive letter, then shorten url's path.
867 if (!checkers::is_windows_drive_letter(file_view)) {
868 if constexpr (result_type_is_ada_url) {
869 helpers::shorten_path(url.path, url.type);
870 } else {
871 std::string_view path = url.get_pathname();
872 if (helpers::shorten_path(path, url.type)) {
873 url.update_base_pathname(std::string(path));
874 }
875 }
876 }
877 // Otherwise:
878 else {
879 // Set url's path to an empty list.
880 url.clear_pathname();
881 url.has_opaque_path = true;
882 }
883
884 // Set state to path state and decrease pointer by 1.
886 break;
887 }
888 }
889 // Otherwise, set state to path state, and decrease pointer by 1.
890 else {
891 ada_log("FILE go to path");
893 break;
894 }
895
896 input_position++;
897 break;
898 }
899 default:
901 }
902 }
// The loop ended without an early return: attach the fragment split off by
// helpers::prune_hash, if there was one, and hand back the result.
903 if (fragment.has_value()) {
904 url.update_unencoded_base_hash(*fragment);
905 }
906 return url;
907}
908
// Explicit instantiations of parse_url for the two result types accepted by
// the static_assert inside the template.
// Instantiation for ada::url:
909template url parse_url<url>(std::string_view user_input,
910 const url* base_url = nullptr);
// NOTE(review): the opening line of the next instantiation (embedded line 911)
// is not visible in this listing; from its parameter type it is presumably the
// ada::url_aggregator instantiation - confirm against the real source file.
912 std::string_view user_input, const url_aggregator* base_url = nullptr);
913
914} // namespace ada::parser
Includes all definitions for Ada.
Definitions of the character sets used by unicode functions.
Common definitions for cross-platform compiler support.
#define ADA_ASSERT_TRUE(COND)
constexpr uint8_t QUERY_PERCENT_ENCODE[32]
constexpr uint8_t SPECIAL_QUERY_PERCENT_ENCODE[32]
constexpr uint8_t C0_CONTROL_PERCENT_ENCODE[32]
constexpr uint8_t USERINFO_PERCENT_ENCODE[32]
constexpr bool is_normalized_windows_drive_letter(std::string_view input) noexcept
constexpr bool is_windows_drive_letter(std::string_view input) noexcept
constexpr bool is_alpha(char x) noexcept
ada_really_inline bool begins_with(std::string_view view, std::string_view prefix)
Includes the definitions for supported parsers.
template url parse_url< url >(std::string_view user_input, const url *base_url)
result_type parse_url(std::string_view user_input, const result_type *base_url=nullptr)
Definition parser.cpp:14
template url_aggregator parse_url< url_aggregator >(std::string_view user_input, const url_aggregator *base_url)
ada_warn_unused std::string to_string(encoding_type type)
state
Definition state.h:17
@ SPECIAL_RELATIVE_OR_AUTHORITY
@ SPECIAL_AUTHORITY_SLASHES
@ SPECIAL_AUTHORITY_IGNORE_SLASHES
void unreachable()
Definitions for the parser.
Lightweight URL struct.
ada_really_inline bool is_special() const noexcept
bool is_valid
Definition url_base.h:50
bool has_opaque_path
Definition url_base.h:55
Generic URL struct reliant on std::string instantiation.
Definition url.h:38
bool set_hostname(std::string_view input)
bool set_host(std::string_view input)
std::string_view get_pathname() const noexcept
ada_really_inline std::string get_href() const noexcept
Definition url-inl.h:183
std::string get_hostname() const noexcept
std::string get_protocol() const noexcept
Definitions for all unicode specific functions.
Definitions for the URL.