Ada 2.9.0
Fast spec-compliant URL parser
Loading...
Searching...
No Matches
helpers.cpp
Go to the documentation of this file.
1#include "ada.h"
2#include "ada/checkers-inl.h"
3#include "ada/common_defs.h" // make sure ADA_IS_BIG_ENDIAN gets defined.
4#include "ada/scheme.h"
5
6#include <algorithm>
7#include <charconv>
8#include <cstring>
9#include <sstream>
10
11namespace ada::helpers {
12
/**
 * Writes `view` to `out` escaped for use inside a JSON string literal.
 *
 * The surrounding double quotes are NOT emitted. A backslash becomes `\\`, a
 * double quote becomes `\"`, and every byte in the range 0x00..0x1f becomes
 * `\u00XX` with two lowercase hex digits. All other bytes (including bytes
 * >= 0x80) are copied through unchanged.
 *
 * @tparam out_iter output iterator accepting character values.
 * @param view bytes to escape.
 * @param out  destination iterator; it is taken by value, so the caller's copy
 *             is not advanced.
 */
template <typename out_iter>
void encode_json(std::string_view view, out_iter out) {
  // trivial implementation. could be faster.
  // Two hex digits per control byte: entry k lives at [2 * k, 2 * k + 1].
  static constexpr char hexvalues[] =
      "000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f";
  // Iterate as uint8_t so that bytes >= 0x80 never compare as negative.
  for (const uint8_t c : view) {
    if (c == '\\') {
      *out++ = '\\';
      *out++ = '\\';
    } else if (c == '"') {
      *out++ = '\\';
      *out++ = '"';
    } else if (c <= 0x1f) {
      *out++ = '\\';
      *out++ = 'u';
      *out++ = '0';
      *out++ = '0';
      *out++ = hexvalues[2 * c];
      *out++ = hexvalues[2 * c + 1];
    } else {
      *out++ = c;
    }
  }
}
37
// Body of get_state: switches on the parser state `s` and returns a
// human-readable name for it; any value not handled by an explicit case
// yields "unknown state".
// NOTE(review): in this listing the function signature and the `case` label
// that precedes each `return` are not visible (only every other source line of
// the switch is present) -- confirm the label-to-string mapping against the
// original source.
39 switch (s) {
41 return "Authority";
43 return "Scheme Start";
45 return "Scheme";
47 return "Host";
49 return "No Scheme";
51 return "Fragment";
53 return "Relative Scheme";
55 return "Relative Slash";
57 return "File";
59 return "File Host";
61 return "File Slash";
63 return "Path or Authority";
65 return "Special Authority Ignore Slashes";
67 return "Special Authority Slashes";
69 return "Special Relative or Authority";
71 return "Query";
73 return "Path";
75 return "Path Start";
77 return "Opaque Path";
79 return "Port";
// Fallback for any state without a dedicated case above.
80 default:
81 return "unknown state";
82 }
83}
84
85ada_really_inline std::optional<std::string_view> prune_hash(
86 std::string_view& input) noexcept {
87 // compiles down to 20--30 instructions including a class to memchr (C
88 // function). this function should be quite fast.
89 size_t location_of_first = input.find('#');
90 if (location_of_first == std::string_view::npos) {
91 return std::nullopt;
92 }
93 std::string_view hash = input;
94 hash.remove_prefix(location_of_first + 1);
95 input.remove_suffix(input.size() - location_of_first);
96 return hash;
97}
98
// Removes the last segment of `path` in place: everything from the final '/'
// onward is erased. Returns true when something was erased, false otherwise
// (no '/' in `path`, or the file-scheme guard below triggered).
// NOTE(review): the line that opens the inner `if` (the call whose argument is
// `helpers::substring(path, 1)`) is not visible in this listing; going by the
// comment above it, it is presumably the normalized-Windows-drive-letter
// check -- confirm against the original source.
99ada_really_inline bool shorten_path(std::string& path,
100 ada::scheme::type type) noexcept {
// Search starts at index 1, so a '/' at position 0 is not counted.
101 size_t first_delimiter = path.find_first_of('/', 1);
102
103 // Let path be url's path.
104 // If url's scheme is "file", path's size is 1, and path[0] is a normalized
105 // Windows drive letter, then return.
106 if (type == ada::scheme::type::FILE &&
107 first_delimiter == std::string_view::npos && !path.empty()) {
109 helpers::substring(path, 1))) {
110 return false;
111 }
112 }
113
114 // Remove path's last item, if any.
115 size_t last_delimiter = path.rfind('/');
116 if (last_delimiter != std::string::npos) {
117 path.erase(last_delimiter);
118 return true;
119 }
120
121 return false;
122}
123
// std::string_view overload: shrinks `path` so that it ends just before its
// final '/'. Returns true when the view was shortened, false otherwise (empty
// view, no '/' found, or the file-scheme guard below triggered). Only the view
// is adjusted; the underlying characters are not modified.
// NOTE(review): the line that opens the inner `if` (the call whose argument is
// `helpers::substring(path, 1)`) is not visible in this listing; going by the
// comment above it, it is presumably the normalized-Windows-drive-letter
// check -- confirm against the original source.
124ada_really_inline bool shorten_path(std::string_view& path,
125 ada::scheme::type type) noexcept {
// Search starts at index 1, so a '/' at position 0 is not counted.
126 size_t first_delimiter = path.find_first_of('/', 1);
127
128 // Let path be url's path.
129 // If url's scheme is "file", path's size is 1, and path[0] is a normalized
130 // Windows drive letter, then return.
131 if (type == ada::scheme::type::FILE &&
132 first_delimiter == std::string_view::npos && !path.empty()) {
134 helpers::substring(path, 1))) {
135 return false;
136 }
137 }
138
139 // Remove path's last item, if any.
140 if (!path.empty()) {
141 size_t slash_loc = path.rfind('/');
142 if (slash_loc != std::string_view::npos) {
143 path.remove_suffix(path.size() - slash_loc);
144 return true;
145 }
146 }
147
148 return false;
149}
150
151ada_really_inline void remove_ascii_tab_or_newline(
152 std::string& input) noexcept {
153 // if this ever becomes a performance issue, we could use an approach similar
154 // to has_tabs_or_newline
155 input.erase(std::remove_if(input.begin(), input.end(),
156 [](char c) {
157 return ada::unicode::is_ascii_tab_or_newline(c);
158 }),
159 input.end());
160}
161
162ada_really_inline std::string_view substring(std::string_view input,
163 size_t pos) noexcept {
164 ADA_ASSERT_TRUE(pos <= input.size());
165 // The following is safer but unneeded if we have the above line:
166 // return pos > input.size() ? std::string_view() : input.substr(pos);
167 return input.substr(pos);
168}
169
170ada_really_inline void resize(std::string_view& input, size_t pos) noexcept {
171 ADA_ASSERT_TRUE(pos <= input.size());
172 input.remove_suffix(input.size() - pos);
173}
174
175// computes the number of trailing zeroes
176// this is a private inline function only defined in this source file.
177ada_really_inline int trailing_zeroes(uint32_t input_num) noexcept {
178#ifdef ADA_REGULAR_VISUAL_STUDIO
179 unsigned long ret;
180 // Search the mask data from least significant bit (LSB)
181 // to the most significant bit (MSB) for a set bit (1).
182 _BitScanForward(&ret, input_num);
183 return (int)ret;
184#else // ADA_REGULAR_VISUAL_STUDIO
185 return __builtin_ctzl(input_num);
186#endif // ADA_REGULAR_VISUAL_STUDIO
187}
188
189// starting at index location, this finds the next location of a character
190// :, /, \\, ? or [. If none is found, view.size() is returned.
191// For use within get_host_delimiter_location.
192#if ADA_NEON
// The ada_make_uint8x16_t macro is necessary because Visual Studio does not
// support direct initialization of uint8x16_t. See
// https://developercommunity.visualstudio.com/t/error-C2078:-too-many-initializers-whe/402911?q=backend+neon
//
// It expands to an immediately-invoked lambda that stores the sixteen bytes in
// a function-local static array and loads them with vld1q_u8. Because the
// array is static, it is initialized only once per expansion site: the
// arguments are expected to be constants.
#ifndef ada_make_uint8x16_t
#define ada_make_uint8x16_t(x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, \
                            x13, x14, x15, x16)                                \
  ([=]() {                                                                     \
    static uint8_t array[16] = {x1, x2,  x3,  x4,  x5,  x6,  x7,  x8,          \
                                x9, x10, x11, x12, x13, x14, x15, x16};        \
    return vld1q_u8(array);                                                    \
  }())
#endif
205
// NEON variant: starting at index `location`, returns the index of the next
// ':', '/', '\\', '?' or '[' in `view`, or view.size() when there is none.
// NOTE(review): the first line of the signature (return type and function
// name) is not visible in this listing; the function is presumably
// find_next_host_delimiter_special -- confirm against the original source.
207 std::string_view view, size_t location) noexcept {
208 // first check for short strings in which case we do it naively.
209 if (view.size() - location < 16) { // slow path
210 for (size_t i = location; i < view.size(); i++) {
211 if (view[i] == ':' || view[i] == '/' || view[i] == '\\' ||
212 view[i] == '?' || view[i] == '[') {
213 return i;
214 }
215 }
216 return size_t(view.size());
217 }
// Folds a vector of 0x00/0xff lanes into 16 bits: each lane is masked with a
// distinct bit weight and three pairwise additions gather lanes 0-7 into byte
// 0 and lanes 8-15 into byte 1, so bit k of the result mirrors lane k.
218 auto to_bitmask = [](uint8x16_t input) -> uint16_t {
219 uint8x16_t bit_mask =
220 ada_make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, 0x01,
221 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
222 uint8x16_t minput = vandq_u8(input, bit_mask);
223 uint8x16_t tmp = vpaddq_u8(minput, minput);
224 tmp = vpaddq_u8(tmp, tmp);
225 tmp = vpaddq_u8(tmp, tmp);
226 return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
227 };
228
229 // fast path for long strings (expected to be common)
230 size_t i = location;
// Nibble lookup tables: low_mask is indexed by a byte's low nibble and
// high_mask by its high nibble; a byte is a delimiter exactly when the AND of
// the two looked-up values is non-zero.
231 uint8x16_t low_mask =
232 ada_make_uint8x16_t(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
233 0x00, 0x01, 0x04, 0x04, 0x00, 0x00, 0x03);
234 uint8x16_t high_mask =
235 ada_make_uint8x16_t(0x00, 0x00, 0x02, 0x01, 0x00, 0x04, 0x00, 0x00, 0x00,
236 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
237 uint8x16_t fmask = vmovq_n_u8(0xf);
238 uint8x16_t zero{0};
// Main loop: classify 16 bytes per iteration.
239 for (; i + 15 < view.size(); i += 16) {
240 uint8x16_t word = vld1q_u8((const uint8_t*)view.data() + i);
241 uint8x16_t lowpart = vqtbl1q_u8(low_mask, vandq_u8(word, fmask));
242 uint8x16_t highpart = vqtbl1q_u8(high_mask, vshrq_n_u8(word, 4));
243 uint8x16_t classify = vandq_u8(lowpart, highpart);
244 if (vmaxvq_u32(vreinterpretq_u32_u8(classify)) != 0) {
245 uint8x16_t is_zero = vceqq_u8(classify, zero);
246 uint16_t is_non_zero = ~to_bitmask(is_zero);
247 return i + trailing_zeroes(is_non_zero);
248 }
249 }
250
// Tail: re-examine the last 16 bytes of the view. This window may overlap
// bytes the loop already scanned; those contained no delimiter, so the first
// match found here is still the first match overall.
251 if (i < view.size()) {
252 uint8x16_t word =
253 vld1q_u8((const uint8_t*)view.data() + view.length() - 16);
254 uint8x16_t lowpart = vqtbl1q_u8(low_mask, vandq_u8(word, fmask));
255 uint8x16_t highpart = vqtbl1q_u8(high_mask, vshrq_n_u8(word, 4));
256 uint8x16_t classify = vandq_u8(lowpart, highpart);
257 if (vmaxvq_u32(vreinterpretq_u32_u8(classify)) != 0) {
258 uint8x16_t is_zero = vceqq_u8(classify, zero);
259 uint16_t is_non_zero = ~to_bitmask(is_zero);
260 return view.length() - 16 + trailing_zeroes(is_non_zero);
261 }
262 }
263 return size_t(view.size());
264}
265#elif ADA_SSE2
// SSE2 variant: starting at index `location`, returns the index of the next
// ':', '/', '\\', '?' or '[' in `view`, or view.size() when there is none.
// NOTE(review): the first line of the signature (return type and function
// name) is not visible in this listing; the function is presumably
// find_next_host_delimiter_special -- confirm against the original source.
267 std::string_view view, size_t location) noexcept {
268 // first check for short strings in which case we do it naively.
269 if (view.size() - location < 16) { // slow path
270 for (size_t i = location; i < view.size(); i++) {
271 if (view[i] == ':' || view[i] == '/' || view[i] == '\\' ||
272 view[i] == '?' || view[i] == '[') {
273 return i;
274 }
275 }
276 return size_t(view.size());
277 }
278 // fast path for long strings (expected to be common)
279 size_t i = location;
// One broadcast vector per delimiter character.
280 const __m128i mask1 = _mm_set1_epi8(':');
281 const __m128i mask2 = _mm_set1_epi8('/');
282 const __m128i mask3 = _mm_set1_epi8('\\');
283 const __m128i mask4 = _mm_set1_epi8('?');
284 const __m128i mask5 = _mm_set1_epi8('[');
285
// Main loop: compare 16 bytes per iteration against every delimiter, OR the
// results and turn them into a 16-bit mask (bit k set when byte k matched).
286 for (; i + 15 < view.size(); i += 16) {
287 __m128i word = _mm_loadu_si128((const __m128i*)(view.data() + i));
288 __m128i m1 = _mm_cmpeq_epi8(word, mask1);
289 __m128i m2 = _mm_cmpeq_epi8(word, mask2);
290 __m128i m3 = _mm_cmpeq_epi8(word, mask3);
291 __m128i m4 = _mm_cmpeq_epi8(word, mask4);
292 __m128i m5 = _mm_cmpeq_epi8(word, mask5);
293 __m128i m = _mm_or_si128(
294 _mm_or_si128(_mm_or_si128(m1, m2), _mm_or_si128(m3, m4)), m5);
295 int mask = _mm_movemask_epi8(m);
296 if (mask != 0) {
297 return i + trailing_zeroes(mask);
298 }
299 }
// Tail: re-examine the last 16 bytes of the view. This window may overlap
// bytes the loop already scanned; those contained no delimiter, so the first
// match found here is still the first match overall.
300 if (i < view.size()) {
301 __m128i word =
302 _mm_loadu_si128((const __m128i*)(view.data() + view.length() - 16));
303 __m128i m1 = _mm_cmpeq_epi8(word, mask1);
304 __m128i m2 = _mm_cmpeq_epi8(word, mask2);
305 __m128i m3 = _mm_cmpeq_epi8(word, mask3);
306 __m128i m4 = _mm_cmpeq_epi8(word, mask4);
307 __m128i m5 = _mm_cmpeq_epi8(word, mask5);
308 __m128i m = _mm_or_si128(
309 _mm_or_si128(_mm_or_si128(m1, m2), _mm_or_si128(m3, m4)), m5);
310 int mask = _mm_movemask_epi8(m);
311 if (mask != 0) {
312 return view.length() - 16 + trailing_zeroes(mask);
313 }
314 }
315 return size_t(view.length());
316}
317#else
// : / [ \\ ?
// Lookup table indexed by byte value: the entry is 1 for each of the five
// characters listed above and 0 for every other byte. Built at compile time.
static constexpr std::array<uint8_t, 256> special_host_delimiters =
    []() constexpr {
      std::array<uint8_t, 256> result{};
      for (uint8_t i : {':', '/', '[', '\\', '?'}) {
        result[i] = 1;
      }
      return result;
    }();
// Portable (table-based) variant: scans `view` from index `location` and
// returns the index of the first byte flagged in special_host_delimiters, or
// view.size() when none is found.
// NOTE(review): the first line of the signature (return type and function
// name) is not visible in this listing; the function is presumably
// find_next_host_delimiter_special -- confirm against the original source.
327// credit: @the-moisrex recommended a table-based approach
329 std::string_view view, size_t location) noexcept {
330 auto const str = view.substr(location);
331 for (auto pos = str.begin(); pos != str.end(); ++pos) {
// Cast to uint8_t so that bytes >= 0x80 index the table correctly.
332 if (special_host_delimiters[(uint8_t)*pos]) {
// Convert the offset inside `str` back to an index into `view`.
333 return pos - str.begin() + location;
334 }
335 }
336 return size_t(view.size());
337}
338#endif
339
340// starting at index location, this finds the next location of a character
341// :, /, ? or [. If none is found, view.size() is returned.
342// For use within get_host_delimiter_location.
343#if ADA_NEON
344ada_really_inline size_t find_next_host_delimiter(std::string_view view,
345 size_t location) noexcept {
346 // first check for short strings in which case we do it naively.
347 if (view.size() - location < 16) { // slow path
348 for (size_t i = location; i < view.size(); i++) {
349 if (view[i] == ':' || view[i] == '/' || view[i] == '?' ||
350 view[i] == '[') {
351 return i;
352 }
353 }
354 return size_t(view.size());
355 }
356 auto to_bitmask = [](uint8x16_t input) -> uint16_t {
357 uint8x16_t bit_mask =
358 ada_make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, 0x01,
359 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
360 uint8x16_t minput = vandq_u8(input, bit_mask);
361 uint8x16_t tmp = vpaddq_u8(minput, minput);
362 tmp = vpaddq_u8(tmp, tmp);
363 tmp = vpaddq_u8(tmp, tmp);
364 return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
365 };
366
367 // fast path for long strings (expected to be common)
368 size_t i = location;
369 uint8x16_t low_mask =
370 ada_make_uint8x16_t(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
371 0x00, 0x01, 0x04, 0x00, 0x00, 0x00, 0x03);
372 uint8x16_t high_mask =
373 ada_make_uint8x16_t(0x00, 0x00, 0x02, 0x01, 0x00, 0x04, 0x00, 0x00, 0x00,
374 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
375 uint8x16_t fmask = vmovq_n_u8(0xf);
376 uint8x16_t zero{0};
377 for (; i + 15 < view.size(); i += 16) {
378 uint8x16_t word = vld1q_u8((const uint8_t*)view.data() + i);
379 uint8x16_t lowpart = vqtbl1q_u8(low_mask, vandq_u8(word, fmask));
380 uint8x16_t highpart = vqtbl1q_u8(high_mask, vshrq_n_u8(word, 4));
381 uint8x16_t classify = vandq_u8(lowpart, highpart);
382 if (vmaxvq_u32(vreinterpretq_u32_u8(classify)) != 0) {
383 uint8x16_t is_zero = vceqq_u8(classify, zero);
384 uint16_t is_non_zero = ~to_bitmask(is_zero);
385 return i + trailing_zeroes(is_non_zero);
386 }
387 }
388
389 if (i < view.size()) {
390 uint8x16_t word =
391 vld1q_u8((const uint8_t*)view.data() + view.length() - 16);
392 uint8x16_t lowpart = vqtbl1q_u8(low_mask, vandq_u8(word, fmask));
393 uint8x16_t highpart = vqtbl1q_u8(high_mask, vshrq_n_u8(word, 4));
394 uint8x16_t classify = vandq_u8(lowpart, highpart);
395 if (vmaxvq_u32(vreinterpretq_u32_u8(classify)) != 0) {
396 uint8x16_t is_zero = vceqq_u8(classify, zero);
397 uint16_t is_non_zero = ~to_bitmask(is_zero);
398 return view.length() - 16 + trailing_zeroes(is_non_zero);
399 }
400 }
401 return size_t(view.size());
402}
403#elif ADA_SSE2
404ada_really_inline size_t find_next_host_delimiter(std::string_view view,
405 size_t location) noexcept {
406 // first check for short strings in which case we do it naively.
407 if (view.size() - location < 16) { // slow path
408 for (size_t i = location; i < view.size(); i++) {
409 if (view[i] == ':' || view[i] == '/' || view[i] == '?' ||
410 view[i] == '[') {
411 return i;
412 }
413 }
414 return size_t(view.size());
415 }
416 // fast path for long strings (expected to be common)
417 size_t i = location;
418 const __m128i mask1 = _mm_set1_epi8(':');
419 const __m128i mask2 = _mm_set1_epi8('/');
420 const __m128i mask4 = _mm_set1_epi8('?');
421 const __m128i mask5 = _mm_set1_epi8('[');
422
423 for (; i + 15 < view.size(); i += 16) {
424 __m128i word = _mm_loadu_si128((const __m128i*)(view.data() + i));
425 __m128i m1 = _mm_cmpeq_epi8(word, mask1);
426 __m128i m2 = _mm_cmpeq_epi8(word, mask2);
427 __m128i m4 = _mm_cmpeq_epi8(word, mask4);
428 __m128i m5 = _mm_cmpeq_epi8(word, mask5);
429 __m128i m = _mm_or_si128(_mm_or_si128(m1, m2), _mm_or_si128(m4, m5));
430 int mask = _mm_movemask_epi8(m);
431 if (mask != 0) {
432 return i + trailing_zeroes(mask);
433 }
434 }
435 if (i < view.size()) {
436 __m128i word =
437 _mm_loadu_si128((const __m128i*)(view.data() + view.length() - 16));
438 __m128i m1 = _mm_cmpeq_epi8(word, mask1);
439 __m128i m2 = _mm_cmpeq_epi8(word, mask2);
440 __m128i m4 = _mm_cmpeq_epi8(word, mask4);
441 __m128i m5 = _mm_cmpeq_epi8(word, mask5);
442 __m128i m = _mm_or_si128(_mm_or_si128(m1, m2), _mm_or_si128(m4, m5));
443 int mask = _mm_movemask_epi8(m);
444 if (mask != 0) {
445 return view.length() - 16 + trailing_zeroes(mask);
446 }
447 }
448 return size_t(view.length());
449}
450#else
// : / [ ?
// Lookup table indexed by byte value: the entry is 1 for each of the four
// characters listed above and 0 for every other byte. Built at compile time.
static constexpr std::array<uint8_t, 256> host_delimiters = []() constexpr {
  std::array<uint8_t, 256> result{};
  for (uint8_t i : {':', '/', '?', '['}) {
    result[i] = 1;
  }
  return result;
}();
459// credit: @the-moisrex recommended a table-based approach
460ada_really_inline size_t find_next_host_delimiter(std::string_view view,
461 size_t location) noexcept {
462 auto const str = view.substr(location);
463 for (auto pos = str.begin(); pos != str.end(); ++pos) {
464 if (host_delimiters[(uint8_t)*pos]) {
465 return pos - str.begin() + location;
466 }
467 }
468 return size_t(view.size());
469}
470#endif
471
472ada_really_inline std::pair<size_t, bool> get_host_delimiter_location(
473 const bool is_special, std::string_view& view) noexcept {
482 const size_t view_size = view.size();
483 size_t location = 0;
484 bool found_colon = false;
504 if (is_special) {
505 // We move to the next delimiter.
506 location = find_next_host_delimiter_special(view, location);
507 // Unless we find '[' then we are going only going to have to call
508 // find_next_host_delimiter_special once.
509 for (; location < view_size;
510 location = find_next_host_delimiter_special(view, location)) {
511 if (view[location] == '[') {
512 location = view.find(']', location);
513 if (location == std::string_view::npos) {
514 // performance: view.find might get translated to a memchr, which
515 // has no notion of std::string_view::npos, so the code does not
516 // reflect the assembly.
517 location = view_size;
518 break;
519 }
520 } else {
521 found_colon = view[location] == ':';
522 break;
523 }
524 }
525 } else {
526 // We move to the next delimiter.
527 location = find_next_host_delimiter(view, location);
528 // Unless we find '[' then we are going only going to have to call
529 // find_next_host_delimiter_special once.
530 for (; location < view_size;
531 location = find_next_host_delimiter(view, location)) {
532 if (view[location] == '[') {
533 location = view.find(']', location);
534 if (location == std::string_view::npos) {
535 // performance: view.find might get translated to a memchr, which
536 // has no notion of std::string_view::npos, so the code does not
537 // reflect the assembly.
538 location = view_size;
539 break;
540 }
541 } else {
542 found_colon = view[location] == ':';
543 break;
544 }
545 }
546 }
547 // performance: remove_suffix may translate into a single instruction.
548 view.remove_suffix(view_size - location);
549 return {location, found_colon};
550}
551
552ada_really_inline void trim_c0_whitespace(std::string_view& input) noexcept {
553 while (!input.empty() &&
554 ada::unicode::is_c0_control_or_space(input.front())) {
555 input.remove_prefix(1);
556 }
557 while (!input.empty() && ada::unicode::is_c0_control_or_space(input.back())) {
558 input.remove_suffix(1);
559 }
560}
561
// Appends the path described by `input` to `path`, one '/'-separated segment
// at a time, resolving "." and ".." segments along the way. Three strategies
// are used, from cheapest to most general:
//   1. trivial: the input can be appended verbatim after a '/';
//   2. fast: no percent-encoding or backslash handling is needed, only dot
//      segments have to be resolved;
//   3. slow: general case with percent-encoding, backslash separators (special
//      schemes) and Windows drive letter handling (file scheme).
// NOTE(review): three source lines are not visible in this listing: one in the
// parameter list (presumably the declaration of `type`, which the body uses),
// the end of the `may_need_slow_file_handling` initializer, and the end of the
// `if` condition that guards the drive-letter branch -- confirm against the
// original source.
562ada_really_inline void parse_prepared_path(std::string_view input,
564 std::string& path) {
565 ada_log("parse_prepared_path ", input);
// Bit set summarizing the characters of `input`; the bit meanings are the
// constants declared just below.
566 uint8_t accumulator = checkers::path_signature(input);
567 // Let us first detect a trivial case.
568 // If it is special, we check that we have no dot, no %, no \ and no
569 // character needing percent encoding. Otherwise, we check that we have no %,
570 // no dot, and no character needing percent encoding.
571 constexpr uint8_t need_encoding = 1;
572 constexpr uint8_t backslash_char = 2;
573 constexpr uint8_t dot_char = 4;
574 constexpr uint8_t percent_char = 8;
575 bool special = type != ada::scheme::NOT_SPECIAL;
576 bool may_need_slow_file_handling = (type == ada::scheme::type::FILE &&
578 bool trivial_path =
579 (special ? (accumulator == 0)
580 : ((accumulator & (need_encoding | dot_char | percent_char)) ==
581 0)) &&
582 (!may_need_slow_file_handling);
583 if (accumulator == dot_char && !may_need_slow_file_handling) {
584 // '4' means that we have at least one dot, but nothing that requires
585 // percent encoding or decoding. The only part that is not trivial is
586 // that we may have single dots and double dots path segments.
587 // If we have such segments, then we either have a path that begins
588 // with '.' (easy to check), or we have the sequence './'.
589 // Note: input cannot be empty, it must at least contain one character ('.')
590 // Note: we know that '\' is not present.
591 if (input[0] != '.') {
592 size_t slashdot = input.find("/.");
593 if (slashdot == std::string_view::npos) { // common case
594 trivial_path = true;
595 } else { // uncommon
596 // only three cases matter: /./, /.. or a final /
597 trivial_path =
598 !(slashdot + 2 == input.size() || input[slashdot + 2] == '.' ||
599 input[slashdot + 2] == '/');
600 }
601 }
602 }
603 if (trivial_path) {
604 ada_log("parse_path trivial");
605 path += '/';
606 path += input;
607 return;
608 }
609 // We are going to need to look a bit at the path, but let us see if we can
610 // ignore percent encoding *and* backslashes *and* percent characters.
611 // Except for the trivial case, this is likely to capture 99% of paths out
612 // there.
613 bool fast_path =
614 (special &&
615 (accumulator & (need_encoding | backslash_char | percent_char)) == 0) &&
616 (type != ada::scheme::type::FILE);
617 if (fast_path) {
618 ada_log("parse_prepared_path fast");
619 // Here we don't need to worry about \ or percent encoding.
620 // We also do not have a file protocol. We might have dots, however,
621 // but dots must as appear as '.', and they cannot be encoded because
622 // the symbol '%' is not present.
623 size_t previous_location = 0; // We start at 0.
// Each iteration consumes one segment; the loop exits through the `return`
// statements of the final-segment branch.
624 do {
625 size_t new_location = input.find('/', previous_location);
626 // std::string_view path_view = input;
627 // We process the last segment separately:
628 if (new_location == std::string_view::npos) {
629 std::string_view path_view = input.substr(previous_location);
630 if (path_view == "..") { // The path ends with ..
631 // e.g., if you receive ".." with an empty path, you go to "/".
632 if (path.empty()) {
633 path = '/';
634 return;
635 }
636 // Fast case where we have nothing to do:
637 if (path.back() == '/') {
638 return;
639 }
640 // If you have the path "/joe/myfriend",
641 // then you delete 'myfriend'.
642 path.resize(path.rfind('/') + 1);
643 return;
644 }
645 path += '/';
646 if (path_view != ".") {
647 path.append(path_view);
648 }
649 return;
650 } else {
651 // This is a non-final segment.
652 std::string_view path_view =
653 input.substr(previous_location, new_location - previous_location);
654 previous_location = new_location + 1;
655 if (path_view == "..") {
// ".." drops the last segment already stored in `path`, if there is one.
656 size_t last_delimiter = path.rfind('/');
657 if (last_delimiter != std::string::npos) {
658 path.erase(last_delimiter);
659 }
660 } else if (path_view != ".") {
661 path += '/';
662 path.append(path_view);
663 }
664 }
665 } while (true);
666 } else {
667 ada_log("parse_path slow");
668 // we have reached the general case
// `accumulator & 1` is the need_encoding bit.
669 bool needs_percent_encoding = (accumulator & 1);
670 std::string path_buffer_tmp;
671 do {
// `accumulator & 2` is the backslash_char bit: for special schemes a
// backslash also acts as a segment separator.
672 size_t location = (special && (accumulator & 2))
673 ? input.find_first_of("/\\")
674 : input.find('/');
675 std::string_view path_view = input;
676 if (location != std::string_view::npos) {
677 path_view.remove_suffix(path_view.size() - location);
678 input.remove_prefix(location + 1);
679 }
680 // path_buffer is either path_view or it might point at a percent encoded
681 // temporary file.
682 std::string_view path_buffer =
683 (needs_percent_encoding &&
684 ada::unicode::percent_encode<false>(
685 path_view, character_sets::PATH_PERCENT_ENCODE, path_buffer_tmp))
686 ? path_buffer_tmp
687 : path_view;
688 if (unicode::is_double_dot_path_segment(path_buffer)) {
689 if ((helpers::shorten_path(path, type) || special) &&
690 location == std::string_view::npos) {
691 path += '/';
692 }
693 } else if (unicode::is_single_dot_path_segment(path_buffer) &&
694 (location == std::string_view::npos)) {
695 path += '/';
696 }
697 // Otherwise, if path_buffer is not a single-dot path segment, then:
698 else if (!unicode::is_single_dot_path_segment(path_buffer)) {
699 // If url's scheme is "file", url's path is empty, and path_buffer is a
700 // Windows drive letter, then replace the second code point in
701 // path_buffer with U+003A (:).
702 if (type == ada::scheme::type::FILE && path.empty() &&
704 path += '/';
705 path += path_buffer[0];
706 path += ':';
707 path_buffer.remove_prefix(2);
708 path.append(path_buffer);
709 } else {
710 // Append path_buffer to url's path.
711 path += '/';
712 path.append(path_buffer);
713 }
714 }
// No separator was found, so this was the last segment.
715 if (location == std::string_view::npos) {
716 return;
717 }
718 } while (true);
719 }
720}
721
722bool overlaps(std::string_view input1, const std::string& input2) noexcept {
723 ada_log("helpers::overlaps check if string_view '", input1, "' [",
724 input1.size(), " bytes] is part of string '", input2, "' [",
725 input2.size(), " bytes]");
726 return !input1.empty() && !input2.empty() && input1.data() >= input2.data() &&
727 input1.data() < input2.data() + input2.size();
728}
729
730template <class url_type>
731ada_really_inline void strip_trailing_spaces_from_opaque_path(
732 url_type& url) noexcept {
733 ada_log("helpers::strip_trailing_spaces_from_opaque_path");
734 if (!url.has_opaque_path) return;
735 if (url.has_hash()) return;
736 if (url.has_search()) return;
737
738 auto path = std::string(url.get_pathname());
739 while (!path.empty() && path.back() == ' ') {
740 path.resize(path.size() - 1);
741 }
742 url.update_base_pathname(path);
743}
744
// @ / \\ ?
// Lookup table indexed by byte value: the entry is 1 for each of the four
// characters listed above and 0 for every other byte. Built at compile time.
static constexpr std::array<uint8_t, 256> authority_delimiter_special =
    []() constexpr {
      std::array<uint8_t, 256> result{};
      for (uint8_t i : {'@', '/', '\\', '?'}) {
        result[i] = 1;
      }
      return result;
    }();
// Returns the index of the first byte of `view` flagged in
// authority_delimiter_special, or view.size() when there is none.
// NOTE(review): the line carrying the return type (and any inline specifier)
// is not visible in this listing -- confirm against the original source.
754// credit: @the-moisrex recommended a table-based approach
756find_authority_delimiter_special(std::string_view view) noexcept {
757 // performance note: we might be able to gain further performance
758 // with SIMD instrinsics.
759 for (auto pos = view.begin(); pos != view.end(); ++pos) {
// Cast to uint8_t so that bytes >= 0x80 index the table correctly.
760 if (authority_delimiter_special[(uint8_t)*pos]) {
761 return pos - view.begin();
762 }
763 }
764 return size_t(view.size());
765}
766
// @ / ?
// Lookup table indexed by byte value: the entry is 1 for each of the three
// characters listed above and 0 for every other byte. Built at compile time.
static constexpr std::array<uint8_t, 256> authority_delimiter = []() constexpr {
  std::array<uint8_t, 256> result{};
  for (uint8_t i : {'@', '/', '?'}) {
    result[i] = 1;
  }
  return result;
}();
// Returns the index of the first byte of `view` flagged in
// authority_delimiter, or view.size() when there is none.
// NOTE(review): the line carrying the return type (and any inline specifier)
// is not visible in this listing -- confirm against the original source.
775// credit: @the-moisrex recommended a table-based approach
777find_authority_delimiter(std::string_view view) noexcept {
778 // performance note: we might be able to gain further performance
779 // with SIMD instrinsics.
780 for (auto pos = view.begin(); pos != view.end(); ++pos) {
// Cast to uint8_t so that bytes >= 0x80 index the table correctly.
781 if (authority_delimiter[(uint8_t)*pos]) {
782 return pos - view.begin();
783 }
784 }
785 return size_t(view.size());
786}
787
788} // namespace ada::helpers
789
790namespace ada {
794#undef ada_make_uint8x16_t
795} // namespace ada
Includes all definitions for Ada.
Definitions for URL specific checkers used within Ada.
Common definitions for cross-platform compiler support.
#define ADA_ASSERT_TRUE(COND)
#define ada_unused
Definition common_defs.h:87
#define ada_warn_unused
Definition common_defs.h:88
#define ada_really_inline
Definition common_defs.h:84
constexpr uint8_t PATH_PERCENT_ENCODE[32]
constexpr bool is_normalized_windows_drive_letter(std::string_view input) noexcept
constexpr bool is_windows_drive_letter(std::string_view input) noexcept
Includes the definitions for helper functions.
ada_really_inline size_t find_next_host_delimiter(std::string_view view, size_t location) noexcept
Definition helpers.cpp:460
static constexpr std::array< uint8_t, 256 > authority_delimiter_special
Definition helpers.cpp:746
static constexpr std::array< uint8_t, 256 > host_delimiters
Definition helpers.cpp:452
ada_really_inline size_t find_next_host_delimiter_special(std::string_view view, size_t location) noexcept
Definition helpers.cpp:328
ada_unused std::string get_state(ada::state s)
Definition helpers.cpp:38
static constexpr std::array< uint8_t, 256 > authority_delimiter
Definition helpers.cpp:768
static constexpr std::array< uint8_t, 256 > special_host_delimiters
Definition helpers.cpp:319
ada_really_inline int trailing_zeroes(uint32_t input_num) noexcept
Definition helpers.cpp:177
@ NOT_SPECIAL
Definition scheme.h:32
Definition ada_idna.h:13
ada_warn_unused std::string to_string(encoding_type type)
state
Definition state.h:17
@ SPECIAL_RELATIVE_OR_AUTHORITY
@ SPECIAL_AUTHORITY_SLASHES
@ SPECIAL_AUTHORITY_IGNORE_SLASHES
tl::expected< result_type, ada::errors > result
Declarations for the URL scheme.