Ada 2.9.0
Fast spec-compliant URL parser
Loading...
Searching...
No Matches
parser.cpp
Go to the documentation of this file.
1#include "ada/parser.h"
2
3#include <limits>
4
5#include "ada.h"
7#include "ada/common_defs.h"
8#include "ada/log.h"
9#include "ada/unicode.h"
10#include "ada/url-inl.h"
11
12namespace ada::parser {
13
14template <class result_type, bool store_values>
15result_type parse_url_impl(std::string_view user_input,
16 const result_type* base_url) {
17 // We can specialize the implementation per type.
18 // Important: result_type_is_ada_url is evaluated at *compile time*. This
19 // means that doing if constexpr(result_type_is_ada_url) { something } else {
20 // something else } is free (at runtime). This means that ada::url_aggregator
21 // and ada::url **do not have to support the exact same API**.
22 constexpr bool result_type_is_ada_url =
23 std::is_same<ada::url, result_type>::value;
24 constexpr bool result_type_is_ada_url_aggregator =
25 std::is_same<ada::url_aggregator, result_type>::value;
26 static_assert(result_type_is_ada_url ||
27 result_type_is_ada_url_aggregator); // We don't support
28 // anything else for now.
29
30 ada_log("ada::parser::parse_url('", user_input, "' [", user_input.size(),
31 " bytes],", (base_url != nullptr ? base_url->to_string() : "null"),
32 ")");
33
35 result_type url{};
36
37 // We refuse to parse URL strings that exceed 4GB. Such strings are almost
38 // surely the result of a bug or are otherwise a security concern.
39 if (user_input.size() > std::numeric_limits<uint32_t>::max()) {
40 url.is_valid = false;
41 }
42 // Going forward, user_input.size() is in [0,
43 // std::numeric_limits<uint32_t>::max()]. If we are provided with an invalid
44 // base, or the input was rejected above for being too long, we must return.
45 if (base_url != nullptr) {
46 url.is_valid &= base_url->is_valid;
47 }
48 if (!url.is_valid) {
49 return url;
50 }
51 if constexpr (result_type_is_ada_url_aggregator && store_values) {
52 // Most of the time, we just need user_input.size().
53 // In some instances, we may need a bit more.
55 // This is *very* important. This line should *not* be removed
56 // hastily. There are principled reasons why reserve is important
57 // for performance. If you have a benchmark with small inputs,
58 // it may not matter, but in other instances, it could.
60 // This rounds up to the next power of two.
61 // We know that user_input.size() is in [0,
62 // std::numeric_limits<uint32_t>::max()].
63 uint32_t reserve_capacity =
64 (0xFFFFFFFF >>
65 helpers::leading_zeroes(uint32_t(1 | user_input.size()))) +
66 1;
67 url.reserve(reserve_capacity);
68 }
69 std::string tmp_buffer;
70 std::string_view internal_input;
71 if (unicode::has_tabs_or_newline(user_input)) {
72 tmp_buffer = user_input;
73 // Optimization opportunity: Instead of copying and then pruning, we could
74 // just directly build the string from user_input.
75 helpers::remove_ascii_tab_or_newline(tmp_buffer);
76 internal_input = tmp_buffer;
77 } else {
78 internal_input = user_input;
79 }
80
81 // Leading and trailing control characters are uncommon and easy to deal with
82 // (no performance concern).
83 std::string_view url_data = internal_input;
84 helpers::trim_c0_whitespace(url_data);
85
86 // Optimization opportunity. Most websites do not have a fragment.
87 std::optional<std::string_view> fragment = helpers::prune_hash(url_data);
88 // We add it last so that an implementation like ada::url_aggregator
89 // can append it last to its internal buffer, thus improving performance.
90
91 // Here url_data no longer has its fragment.
92 // We are going to access the data from url_data (it is immutable).
93 // At any given time, we are pointing at byte 'input_position' in url_data.
94 // The input_position variable should range from 0 to input_size.
95 // It is illegal to access url_data at input_size.
96 size_t input_position = 0;
97 const size_t input_size = url_data.size();
98 // Keep running the following state machine by switching on state.
99 // If after a run pointer points to the EOF code point, go to the next step.
100 // Otherwise, increase pointer by 1 and continue with the state machine.
101 // We never decrement input_position.
102 while (input_position <= input_size) {
103 ada_log("In parsing at ", input_position, " out of ", input_size,
104 " in state ", ada::to_string(state));
105 switch (state) {
107 ada_log("SCHEME_START ", helpers::substring(url_data, input_position));
108 // If c is an ASCII alpha, append c, lowercased, to buffer, and set
109 // state to scheme state.
110 if ((input_position != input_size) &&
111 checkers::is_alpha(url_data[input_position])) {
113 input_position++;
114 } else {
115 // Otherwise, if state override is not given, set state to no scheme
116 // state and decrease pointer by 1.
118 }
119 break;
120 }
121 case ada::state::SCHEME: {
122 ada_log("SCHEME ", helpers::substring(url_data, input_position));
123 // If c is an ASCII alphanumeric, U+002B (+), U+002D (-), or U+002E (.),
124 // append c, lowercased, to buffer.
125 while ((input_position != input_size) &&
126 (ada::unicode::is_alnum_plus(url_data[input_position]))) {
127 input_position++;
128 }
129 // Otherwise, if c is U+003A (:), then:
130 if ((input_position != input_size) &&
131 (url_data[input_position] == ':')) {
132 ada_log("SCHEME the scheme should be ",
133 url_data.substr(0, input_position));
134 if constexpr (result_type_is_ada_url) {
135 if (!url.parse_scheme(url_data.substr(0, input_position))) {
136 return url;
137 }
138 } else {
139 // we pass the colon along instead of painfully adding it back.
140 if (!url.parse_scheme_with_colon(
141 url_data.substr(0, input_position + 1))) {
142 return url;
143 }
144 }
145 ada_log("SCHEME the scheme is ", url.get_protocol());
146
147 // If url's scheme is "file", then:
148 if (url.type == ada::scheme::type::FILE) {
149 // Set state to file state.
151 }
152 // Otherwise, if url is special, base is non-null, and base's scheme
153 // is url's scheme. Note: reading base_url->type is unsafe when base_url
154 // is nullptr, so the null check must come first.
155 else if (url.is_special() && base_url != nullptr &&
156 base_url->type == url.type) {
157 // Set state to special relative or authority state.
159 }
160 // Otherwise, if url is special, set state to special authority
161 // slashes state.
162 else if (url.is_special()) {
164 }
165 // Otherwise, if remaining starts with an U+002F (/), set state to
166 // path or authority state and increase pointer by 1.
167 else if (input_position + 1 < input_size &&
168 url_data[input_position + 1] == '/') {
170 input_position++;
171 }
172 // Otherwise, set url's path to the empty string and set state to
173 // opaque path state.
174 else {
176 }
177 }
178 // Otherwise, if state override is not given, set buffer to the empty
179 // string, state to no scheme state, and start over (from the first code
180 // point in input).
181 else {
183 input_position = 0;
184 break;
185 }
186 input_position++;
187 break;
188 }
190 ada_log("NO_SCHEME ", helpers::substring(url_data, input_position));
191 // If base is null, or base has an opaque path and c is not U+0023 (#),
192 // validation error, return failure.
193 if (base_url == nullptr ||
194 (base_url->has_opaque_path && !fragment.has_value())) {
195 ada_log("NO_SCHEME validation error");
196 url.is_valid = false;
197 return url;
198 }
199 // Otherwise, if base has an opaque path and c is U+0023 (#),
200 // set url's scheme to base's scheme, url's path to base's path, url's
201 // query to base's query, and set state to fragment state.
202 else if (base_url->has_opaque_path && fragment.has_value() &&
203 input_position == input_size) {
204 ada_log("NO_SCHEME opaque base with fragment");
205 url.copy_scheme(*base_url);
206 url.has_opaque_path = base_url->has_opaque_path;
207
208 if constexpr (result_type_is_ada_url) {
209 url.path = base_url->path;
210 url.query = base_url->query;
211 } else {
212 url.update_base_pathname(base_url->get_pathname());
213 url.update_base_search(base_url->get_search());
214 }
215 url.update_unencoded_base_hash(*fragment);
216 return url;
217 }
218 // Otherwise, if base's scheme is not "file", set state to relative
219 // state and decrease pointer by 1.
220 else if (base_url->type != ada::scheme::type::FILE) {
221 ada_log("NO_SCHEME non-file relative path");
223 }
224 // Otherwise, set state to file state and decrease pointer by 1.
225 else {
226 ada_log("NO_SCHEME file base type");
228 }
229 break;
230 }
232 ada_log("AUTHORITY ", helpers::substring(url_data, input_position));
233 // most URLs have no @. Having no @ tells us that we don't have to worry
234 // about AUTHORITY. Of course, we could have @ and still not have to
235 // worry about AUTHORITY.
236 // TODO: Instead of just collecting a bool, collect the location of the
237 // '@' and do something useful with it.
238 // TODO: We could do various processing early on, using a single pass
239 // over the string to collect information about it, e.g., telling us
240 // whether there is a @ and if so, where (or how many).
241 const bool contains_ampersand =
242 (url_data.find('@', input_position) != std::string_view::npos);
243
244 if (!contains_ampersand) {
246 break;
247 }
248 bool at_sign_seen{false};
249 bool password_token_seen{false};
255 do {
256 std::string_view view = helpers::substring(url_data, input_position);
257 // The delimiters are '@', '/', '?' and '\\'.
258 size_t location =
259 url.is_special() ? helpers::find_authority_delimiter_special(view)
260 : helpers::find_authority_delimiter(view);
261 std::string_view authority_view(view.data(), location);
262 size_t end_of_authority = input_position + authority_view.size();
263 // If c is U+0040 (@), then:
264 if ((end_of_authority != input_size) &&
265 (url_data[end_of_authority] == '@')) {
266 // If atSignSeen is true, then prepend "%40" to buffer.
267 if (at_sign_seen) {
268 if (password_token_seen) {
269 if constexpr (result_type_is_ada_url) {
270 url.password += "%40";
271 } else {
272 url.append_base_password("%40");
273 }
274 } else {
275 if constexpr (result_type_is_ada_url) {
276 url.username += "%40";
277 } else {
278 url.append_base_username("%40");
279 }
280 }
281 }
282
283 at_sign_seen = true;
284
285 if (!password_token_seen) {
286 size_t password_token_location = authority_view.find(':');
287 password_token_seen =
288 password_token_location != std::string_view::npos;
289
290 if constexpr (store_values) {
291 if (!password_token_seen) {
292 if constexpr (result_type_is_ada_url) {
293 url.username += unicode::percent_encode(
294 authority_view,
296 } else {
297 url.append_base_username(unicode::percent_encode(
298 authority_view,
300 }
301 } else {
302 if constexpr (result_type_is_ada_url) {
303 url.username += unicode::percent_encode(
304 authority_view.substr(0, password_token_location),
306 url.password += unicode::percent_encode(
307 authority_view.substr(password_token_location + 1),
309 } else {
310 url.append_base_username(unicode::percent_encode(
311 authority_view.substr(0, password_token_location),
313 url.append_base_password(unicode::percent_encode(
314 authority_view.substr(password_token_location + 1),
316 }
317 }
318 }
319 } else if constexpr (store_values) {
320 if constexpr (result_type_is_ada_url) {
321 url.password += unicode::percent_encode(
323 } else {
324 url.append_base_password(unicode::percent_encode(
326 }
327 }
328 }
329 // Otherwise, if one of the following is true:
330 // - c is the EOF code point, U+002F (/), U+003F (?), or U+0023 (#)
331 // - url is special and c is U+005C (\)
332 else if (end_of_authority == input_size ||
333 url_data[end_of_authority] == '/' ||
334 url_data[end_of_authority] == '?' ||
335 (url.is_special() && url_data[end_of_authority] == '\\')) {
336 // If atSignSeen is true and authority_view is the empty string,
337 // validation error, return failure.
338 if (at_sign_seen && authority_view.empty()) {
339 url.is_valid = false;
340 return url;
341 }
343 break;
344 }
345 if (end_of_authority == input_size) {
346 if constexpr (store_values) {
347 if (fragment.has_value()) {
348 url.update_unencoded_base_hash(*fragment);
349 }
350 }
351 return url;
352 }
353 input_position = end_of_authority + 1;
354 } while (true);
355
356 break;
357 }
359 ada_log("SPECIAL_RELATIVE_OR_AUTHORITY ",
360 helpers::substring(url_data, input_position));
361
362 // If c is U+002F (/) and remaining starts with U+002F (/),
363 // then set state to special authority ignore slashes state and increase
364 // pointer by 1.
365 std::string_view view = helpers::substring(url_data, input_position);
366 if (ada::checkers::begins_with(view, "//")) {
368 input_position += 2;
369 } else {
370 // Otherwise, validation error, set state to relative state and
371 // decrease pointer by 1.
373 }
374
375 break;
376 }
378 ada_log("PATH_OR_AUTHORITY ",
379 helpers::substring(url_data, input_position));
380
381 // If c is U+002F (/), then set state to authority state.
382 if ((input_position != input_size) &&
383 (url_data[input_position] == '/')) {
385 input_position++;
386 } else {
387 // Otherwise, set state to path state, and decrease pointer by 1.
389 }
390
391 break;
392 }
394 ada_log("RELATIVE_SCHEME ",
395 helpers::substring(url_data, input_position));
396
397 // Set url's scheme to base's scheme.
398 url.copy_scheme(*base_url);
399
400 // If c is U+002F (/), then set state to relative slash state.
401 if ((input_position != input_size) &&
402 (url_data[input_position] == '/')) {
403 ada_log(
404 "RELATIVE_SCHEME if c is U+002F (/), then set state to relative "
405 "slash state");
407 } else if (url.is_special() && (input_position != input_size) &&
408 (url_data[input_position] == '\\')) {
409 // Otherwise, if url is special and c is U+005C (\), validation error,
410 // set state to relative slash state.
411 ada_log(
412 "RELATIVE_SCHEME if url is special and c is U+005C, validation "
413 "error, set state to relative slash state");
415 } else {
416 ada_log("RELATIVE_SCHEME otherwise");
417 // Set url's username to base's username, url's password to base's
418 // password, url's host to base's host, url's port to base's port,
419 // url's path to a clone of base's path, and url's query to base's
420 // query.
421 if constexpr (result_type_is_ada_url) {
422 url.username = base_url->username;
423 url.password = base_url->password;
424 url.host = base_url->host;
425 url.port = base_url->port;
426 // cloning the base path includes cloning the has_opaque_path flag
427 url.has_opaque_path = base_url->has_opaque_path;
428 url.path = base_url->path;
429 url.query = base_url->query;
430 } else {
431 url.update_base_authority(base_url->get_href(),
432 base_url->get_components());
433 // TODO: Get rid of set_hostname and replace it with
434 // update_base_hostname
435 url.set_hostname(base_url->get_hostname());
436 url.update_base_port(base_url->retrieve_base_port());
437 // cloning the base path includes cloning the has_opaque_path flag
438 url.has_opaque_path = base_url->has_opaque_path;
439 url.update_base_pathname(base_url->get_pathname());
440 url.update_base_search(base_url->get_search());
441 }
442
443 url.has_opaque_path = base_url->has_opaque_path;
444
445 // If c is U+003F (?), then set url's query to the empty string, and
446 // state to query state.
447 if ((input_position != input_size) &&
448 (url_data[input_position] == '?')) {
450 }
451 // Otherwise, if c is not the EOF code point:
452 else if (input_position != input_size) {
453 // Set url's query to null.
454 url.clear_search();
455 if constexpr (result_type_is_ada_url) {
456 // Shorten url's path.
457 helpers::shorten_path(url.path, url.type);
458 } else {
459 std::string_view path = url.get_pathname();
460 if (helpers::shorten_path(path, url.type)) {
461 url.update_base_pathname(std::string(path));
462 }
463 }
464 // Set state to path state and decrease pointer by 1.
466 break;
467 }
468 }
469 input_position++;
470 break;
471 }
473 ada_log("RELATIVE_SLASH ",
474 helpers::substring(url_data, input_position));
475
476 // If url is special and c is U+002F (/) or U+005C (\), then:
477 if (url.is_special() && (input_position != input_size) &&
478 (url_data[input_position] == '/' ||
479 url_data[input_position] == '\\')) {
480 // Set state to special authority ignore slashes state.
482 }
483 // Otherwise, if c is U+002F (/), then set state to authority state.
484 else if ((input_position != input_size) &&
485 (url_data[input_position] == '/')) {
487 }
488 // Otherwise, set
489 // - url's username to base's username,
490 // - url's password to base's password,
491 // - url's host to base's host,
492 // - url's port to base's port,
493 // - state to path state, and then, decrease pointer by 1.
494 else {
495 if constexpr (result_type_is_ada_url) {
496 url.username = base_url->username;
497 url.password = base_url->password;
498 url.host = base_url->host;
499 url.port = base_url->port;
500 } else {
501 url.update_base_authority(base_url->get_href(),
502 base_url->get_components());
503 // TODO: Get rid of set_hostname and replace it with
504 // update_base_hostname
505 url.set_hostname(base_url->get_hostname());
506 url.update_base_port(base_url->retrieve_base_port());
507 }
509 break;
510 }
511
512 input_position++;
513 break;
514 }
516 ada_log("SPECIAL_AUTHORITY_SLASHES ",
517 helpers::substring(url_data, input_position));
518
519 // If c is U+002F (/) and remaining starts with U+002F (/),
520 // then set state to special authority ignore slashes state and increase
521 // pointer by 1.
522 std::string_view view = helpers::substring(url_data, input_position);
523 if (ada::checkers::begins_with(view, "//")) {
524 input_position += 2;
525 }
526
527 [[fallthrough]];
528 }
530 ada_log("SPECIAL_AUTHORITY_IGNORE_SLASHES ",
531 helpers::substring(url_data, input_position));
532
533 // If c is neither U+002F (/) nor U+005C (\), then set state to
534 // authority state and decrease pointer by 1.
535 while ((input_position != input_size) &&
536 ((url_data[input_position] == '/') ||
537 (url_data[input_position] == '\\'))) {
538 input_position++;
539 }
541
542 break;
543 }
544 case ada::state::QUERY: {
545 ada_log("QUERY ", helpers::substring(url_data, input_position));
546 if constexpr (store_values) {
547 // Let queryPercentEncodeSet be the special-query percent-encode set
548 // if url is special; otherwise the query percent-encode set.
549 const uint8_t* query_percent_encode_set =
553
554 // Percent-encode after encoding, with encoding, buffer, and
555 // queryPercentEncodeSet, and append the result to url's query.
556 url.update_base_search(helpers::substring(url_data, input_position),
557 query_percent_encode_set);
558 ada_log("QUERY update_base_search completed ");
559 if (fragment.has_value()) {
560 url.update_unencoded_base_hash(*fragment);
561 }
562 }
563 return url;
564 }
565 case ada::state::HOST: {
566 ada_log("HOST ", helpers::substring(url_data, input_position));
567
568 std::string_view host_view =
569 helpers::substring(url_data, input_position);
570 auto [location, found_colon] =
571 helpers::get_host_delimiter_location(url.is_special(), host_view);
572 input_position = (location != std::string_view::npos)
573 ? input_position + location
574 : input_size;
575 // Otherwise, if c is U+003A (:) and insideBrackets is false, then:
576 // Note: the 'found_colon' value is true if and only if a colon was
577 // encountered while not inside brackets.
578 if (found_colon) {
579 // If buffer is the empty string, validation error, return failure.
580 // Let host be the result of host parsing buffer with url is not
581 // special.
582 ada_log("HOST parsing ", host_view);
583 if (!url.parse_host(host_view)) {
584 return url;
585 }
586 ada_log("HOST parsing results in ", url.get_hostname());
587 // Set url's host to host, buffer to the empty string, and state to
588 // port state.
590 input_position++;
591 }
592 // Otherwise, if one of the following is true:
593 // - c is the EOF code point, U+002F (/), U+003F (?), or U+0023 (#)
594 // - url is special and c is U+005C (\)
595 // The get_host_delimiter_location function either brings us to
596 // the colon outside of the bracket, or to one of those characters.
597 else {
598 // If url is special and host_view is the empty string, validation
599 // error, return failure.
600 if (url.is_special() && host_view.empty()) {
601 url.is_valid = false;
602 return url;
603 }
604 ada_log("HOST parsing ", host_view, " href=", url.get_href());
605 // Let host be the result of host parsing host_view with url is not
606 // special.
607 if (host_view.empty()) {
608 url.update_base_hostname("");
609 } else if (!url.parse_host(host_view)) {
610 return url;
611 }
612 ada_log("HOST parsing results in ", url.get_hostname(),
613 " href=", url.get_href());
614
615 // Set url's host to host, and state to path start state.
617 }
618
619 break;
620 }
622 ada_log("OPAQUE_PATH ", helpers::substring(url_data, input_position));
623 std::string_view view = helpers::substring(url_data, input_position);
624 // If c is U+003F (?), then set url's query to the empty string and
625 // state to query state.
626 size_t location = view.find('?');
627 if (location != std::string_view::npos) {
628 view.remove_suffix(view.size() - location);
630 input_position += location + 1;
631 } else {
632 input_position = input_size + 1;
633 }
634 url.has_opaque_path = true;
635 // This is a really unlikely scenario in the real world. We should not
636 // seek to optimize it.
637 url.update_base_pathname(unicode::percent_encode(
639 break;
640 }
641 case ada::state::PORT: {
642 ada_log("PORT ", helpers::substring(url_data, input_position));
643 std::string_view port_view =
644 helpers::substring(url_data, input_position);
645 size_t consumed_bytes = url.parse_port(port_view, true);
646 input_position += consumed_bytes;
647 if (!url.is_valid) {
648 return url;
649 }
651 [[fallthrough]];
652 }
654 ada_log("PATH_START ", helpers::substring(url_data, input_position));
655
656 // If url is special, then:
657 if (url.is_special()) {
658 // Set state to path state.
660
661 // Optimization: Avoiding going into PATH state improves the
662 // performance of urls ending with /.
663 if (input_position == input_size) {
664 if constexpr (store_values) {
665 url.update_base_pathname("/");
666 if (fragment.has_value()) {
667 url.update_unencoded_base_hash(*fragment);
668 }
669 }
670 return url;
671 }
672 // If c is neither U+002F (/) nor U+005C (\), then decrease pointer
673 // by 1. We know that (input_position == input_size) is impossible
674 // here, because of the previous if-check.
675 if ((url_data[input_position] != '/') &&
676 (url_data[input_position] != '\\')) {
677 break;
678 }
679 }
680 // Otherwise, if state override is not given and c is U+003F (?),
681 // set url's query to the empty string and state to query state.
682 else if ((input_position != input_size) &&
683 (url_data[input_position] == '?')) {
685 }
686 // Otherwise, if c is not the EOF code point:
687 else if (input_position != input_size) {
688 // Set state to path state.
690
691 // If c is not U+002F (/), then decrease pointer by 1.
692 if (url_data[input_position] != '/') {
693 break;
694 }
695 }
696
697 input_position++;
698 break;
699 }
700 case ada::state::PATH: {
701 std::string_view view = helpers::substring(url_data, input_position);
702 ada_log("PATH ", helpers::substring(url_data, input_position));
703
704 // Most of the time, we do not need percent encoding.
705 // Furthermore, we can immediately locate the '?'.
706 size_t locofquestionmark = view.find('?');
707 if (locofquestionmark != std::string_view::npos) {
709 view.remove_suffix(view.size() - locofquestionmark);
710 input_position += locofquestionmark + 1;
711 } else {
712 input_position = input_size + 1;
713 }
714 if constexpr (store_values) {
715 if constexpr (result_type_is_ada_url) {
716 helpers::parse_prepared_path(view, url.type, url.path);
717 } else {
718 url.consume_prepared_path(view);
719 ADA_ASSERT_TRUE(url.validate());
720 }
721 }
722 break;
723 }
725 ada_log("FILE_SLASH ", helpers::substring(url_data, input_position));
726
727 // If c is U+002F (/) or U+005C (\), then:
728 if ((input_position != input_size) &&
729 (url_data[input_position] == '/' ||
730 url_data[input_position] == '\\')) {
731 ada_log("FILE_SLASH c is U+002F or U+005C");
732 // Set state to file host state.
734 input_position++;
735 } else {
736 ada_log("FILE_SLASH otherwise");
737 // If base is non-null and base's scheme is "file", then:
738 // Note: it is unsafe to read base_url->type unless you know that
739 // base_url is non-null.
740 if (base_url != nullptr &&
741 base_url->type == ada::scheme::type::FILE) {
742 // Set url's host to base's host.
743 if constexpr (result_type_is_ada_url) {
744 url.host = base_url->host;
745 } else {
746 // TODO: Optimization opportunity.
747 url.set_host(base_url->get_host());
748 }
749 // If the code point substring from pointer to the end of input does
750 // not start with a Windows drive letter and base's path[0] is a
751 // normalized Windows drive letter, then append base's path[0] to
752 // url's path.
753 if (!base_url->get_pathname().empty()) {
755 helpers::substring(url_data, input_position))) {
756 std::string_view first_base_url_path =
757 base_url->get_pathname().substr(1);
758 size_t loc = first_base_url_path.find('/');
759 if (loc != std::string_view::npos) {
760 helpers::resize(first_base_url_path, loc);
761 }
763 first_base_url_path)) {
764 if constexpr (result_type_is_ada_url) {
765 url.path += '/';
766 url.path += first_base_url_path;
767 } else {
768 url.append_base_pathname(
769 helpers::concat("/", first_base_url_path));
770 }
771 }
772 }
773 }
774 }
775
776 // Set state to path state, and decrease pointer by 1.
778 }
779
780 break;
781 }
783 std::string_view view = helpers::substring(url_data, input_position);
784 ada_log("FILE_HOST ", helpers::substring(url_data, input_position));
785
786 size_t location = view.find_first_of("/\\?");
787 std::string_view file_host_buffer(
788 view.data(),
789 (location != std::string_view::npos) ? location : view.size());
790
791 if (checkers::is_windows_drive_letter(file_host_buffer)) {
793 } else if (file_host_buffer.empty()) {
794 // Set url's host to the empty string.
795 if constexpr (result_type_is_ada_url) {
796 url.host = "";
797 } else {
798 url.update_base_hostname("");
799 }
800 // Set state to path start state.
802 } else {
803 size_t consumed_bytes = file_host_buffer.size();
804 input_position += consumed_bytes;
805 // Let host be the result of host parsing buffer with url is not
806 // special.
807 if (!url.parse_host(file_host_buffer)) {
808 return url;
809 }
810
811 if constexpr (result_type_is_ada_url) {
812 // If host is "localhost", then set host to the empty string.
813 if (url.host.has_value() && url.host.value() == "localhost") {
814 url.host = "";
815 }
816 } else {
817 if (url.get_hostname() == "localhost") {
818 url.update_base_hostname("");
819 }
820 }
821
822 // Set buffer to the empty string and state to path start state.
824 }
825
826 break;
827 }
828 case ada::state::FILE: {
829 ada_log("FILE ", helpers::substring(url_data, input_position));
830 std::string_view file_view =
831 helpers::substring(url_data, input_position);
832
833 url.set_protocol_as_file();
834 if constexpr (result_type_is_ada_url) {
835 // Set url's host to the empty string.
836 url.host = "";
837 } else {
838 url.update_base_hostname("");
839 }
840 // If c is U+002F (/) or U+005C (\), then:
841 if (input_position != input_size &&
842 (url_data[input_position] == '/' ||
843 url_data[input_position] == '\\')) {
844 ada_log("FILE c is U+002F or U+005C");
845 // Set state to file slash state.
847 }
848 // Otherwise, if base is non-null and base's scheme is "file":
849 else if (base_url != nullptr &&
850 base_url->type == ada::scheme::type::FILE) {
851 // Set url's host to base's host, url's path to a clone of base's
852 // path, and url's query to base's query.
853 ada_log("FILE base non-null");
854 if constexpr (result_type_is_ada_url) {
855 url.host = base_url->host;
856 url.path = base_url->path;
857 url.query = base_url->query;
858 } else {
859 // TODO: Get rid of set_hostname and replace it with
860 // update_base_hostname
861 url.set_hostname(base_url->get_hostname());
862 url.update_base_pathname(base_url->get_pathname());
863 url.update_base_search(base_url->get_search());
864 }
865 url.has_opaque_path = base_url->has_opaque_path;
866
867 // If c is U+003F (?), then set url's query to the empty string and
868 // state to query state.
869 if (input_position != input_size && url_data[input_position] == '?') {
871 }
872 // Otherwise, if c is not the EOF code point:
873 else if (input_position != input_size) {
874 // Set url's query to null.
875 url.clear_search();
876 // If the code point substring from pointer to the end of input does
877 // not start with a Windows drive letter, then shorten url's path.
878 if (!checkers::is_windows_drive_letter(file_view)) {
879 if constexpr (result_type_is_ada_url) {
880 helpers::shorten_path(url.path, url.type);
881 } else {
882 std::string_view path = url.get_pathname();
883 if (helpers::shorten_path(path, url.type)) {
884 url.update_base_pathname(std::string(path));
885 }
886 }
887 }
888 // Otherwise:
889 else {
890 // Set url's path to an empty list.
891 url.clear_pathname();
892 url.has_opaque_path = true;
893 }
894
895 // Set state to path state and decrease pointer by 1.
897 break;
898 }
899 }
900 // Otherwise, set state to path state, and decrease pointer by 1.
901 else {
902 ada_log("FILE go to path");
904 break;
905 }
906
907 input_position++;
908 break;
909 }
910 default:
912 }
913 }
914 if constexpr (store_values) {
915 if (fragment.has_value()) {
916 url.update_unencoded_base_hash(*fragment);
917 }
918 }
919 return url;
920}
921
922template url parse_url_impl(std::string_view user_input,
923 const url* base_url = nullptr);
925 std::string_view user_input, const url_aggregator* base_url = nullptr);
926
927template <class result_type>
928result_type parse_url(std::string_view user_input,
929 const result_type* base_url) {
930 return parse_url_impl<result_type, true>(user_input, base_url);
931}
932
933template url parse_url<url>(std::string_view user_input,
934 const url* base_url = nullptr);
936 std::string_view user_input, const url_aggregator* base_url = nullptr);
937} // namespace ada::parser
Includes all definitions for Ada.
Definitions of the character sets used by unicode functions.
Common definitions for cross-platform compiler support.
#define ADA_ASSERT_TRUE(COND)
constexpr uint8_t QUERY_PERCENT_ENCODE[32]
constexpr uint8_t SPECIAL_QUERY_PERCENT_ENCODE[32]
constexpr uint8_t C0_CONTROL_PERCENT_ENCODE[32]
constexpr uint8_t USERINFO_PERCENT_ENCODE[32]
constexpr bool is_normalized_windows_drive_letter(std::string_view input) noexcept
constexpr bool is_windows_drive_letter(std::string_view input) noexcept
constexpr bool is_alpha(char x) noexcept
ada_really_inline bool begins_with(std::string_view view, std::string_view prefix)
Includes the definitions for supported parsers.
template url parse_url< url >(std::string_view user_input, const url *base_url)
result_type parse_url(std::string_view user_input, const result_type *base_url=nullptr)
Definition parser.cpp:928
template url_aggregator parse_url< url_aggregator >(std::string_view user_input, const url_aggregator *base_url)
result_type parse_url_impl(std::string_view user_input, const result_type *base_url=nullptr)
Definition parser.cpp:15
ada_warn_unused std::string to_string(encoding_type type)
state
Definition state.h:17
@ SPECIAL_RELATIVE_OR_AUTHORITY
@ SPECIAL_AUTHORITY_SLASHES
@ SPECIAL_AUTHORITY_IGNORE_SLASHES
void unreachable()
Definitions for the parser.
Lightweight URL struct.
ada_really_inline bool is_special() const noexcept
bool is_valid
Definition url_base.h:50
bool has_opaque_path
Definition url_base.h:55
Generic URL struct reliant on std::string instantiation.
Definition url.h:38
bool set_hostname(std::string_view input)
bool set_host(std::string_view input)
std::string_view get_pathname() const noexcept
ada_really_inline std::string get_href() const noexcept
Definition url-inl.h:183
std::string get_hostname() const noexcept
std::string get_protocol() const noexcept
Definitions for all unicode specific functions.
Definitions for the URL.