Ada 2.9.2
Fast spec-compliant URL parser
Loading...
Searching...
No Matches
helpers.cpp
Go to the documentation of this file.
1#include "ada.h"
2#include "ada/checkers-inl.h"
3#include "ada/common_defs.h" // make sure ADA_IS_BIG_ENDIAN gets defined.
4#include "ada/scheme.h"
5
6#include <cstring>
7#include <sstream>
8
9namespace ada::helpers {
10
/**
 * Writes `view` to `out`, escaped for use inside a JSON string literal.
 *
 * A backslash is written as two backslashes, a double quote is written as a
 * backslash followed by the quote, and every byte in the range 0x00-0x1f is
 * written as a six character "\u00XX" sequence with two lowercase hexadecimal
 * digits. Every other byte is copied through unchanged. No surrounding quotes
 * are emitted.
 *
 * @param view bytes to escape
 * @param out  output iterator that receives the escaped characters
 */
template <typename out_iter>
void encode_json(std::string_view view, out_iter out) {
  constexpr char hex_digits[] = "0123456789abcdef";
  for (const uint8_t byte : view) {
    // The two printable characters that need a backslash in front of them.
    if (byte == '\\' || byte == '"') {
      *out++ = '\\';
      *out++ = static_cast<char>(byte);
      continue;
    }
    // Everything above the control range is emitted verbatim.
    if (byte > 0x1f) {
      *out++ = byte;
      continue;
    }
    // Control bytes: the value fits in two hex digits, so the first two
    // digits of the four-digit escape are always zero.
    *out++ = '\\';
    *out++ = 'u';
    *out++ = '0';
    *out++ = '0';
    *out++ = hex_digits[byte >> 4];
    *out++ = hex_digits[byte & 0x0f];
  }
}
35
37 switch (s) {
39 return "Authority";
41 return "Scheme Start";
43 return "Scheme";
45 return "Host";
47 return "No Scheme";
49 return "Fragment";
51 return "Relative Scheme";
53 return "Relative Slash";
55 return "File";
57 return "File Host";
59 return "File Slash";
61 return "Path or Authority";
63 return "Special Authority Ignore Slashes";
65 return "Special Authority Slashes";
67 return "Special Relative or Authority";
69 return "Query";
71 return "Path";
73 return "Path Start";
75 return "Opaque Path";
77 return "Port";
78 default:
79 return "unknown state";
80 }
81}
82
83ada_really_inline std::optional<std::string_view> prune_hash(
84 std::string_view& input) noexcept {
85 // compiles down to 20--30 instructions including a class to memchr (C
86 // function). this function should be quite fast.
87 size_t location_of_first = input.find('#');
88 if (location_of_first == std::string_view::npos) {
89 return std::nullopt;
90 }
91 std::string_view hash = input;
92 hash.remove_prefix(location_of_first + 1);
93 input.remove_suffix(input.size() - location_of_first);
94 return hash;
95}
96
97ada_really_inline bool shorten_path(std::string& path,
98 ada::scheme::type type) noexcept {
99 size_t first_delimiter = path.find_first_of('/', 1);
100
101 // Let path be url's path.
102 // If url's scheme is "file", path's size is 1, and path[0] is a normalized
103 // Windows drive letter, then return.
104 if (type == ada::scheme::type::FILE &&
105 first_delimiter == std::string_view::npos && !path.empty()) {
107 helpers::substring(path, 1))) {
108 return false;
109 }
110 }
111
112 // Remove path's last item, if any.
113 size_t last_delimiter = path.rfind('/');
114 if (last_delimiter != std::string::npos) {
115 path.erase(last_delimiter);
116 return true;
117 }
118
119 return false;
120}
121
122ada_really_inline bool shorten_path(std::string_view& path,
123 ada::scheme::type type) noexcept {
124 size_t first_delimiter = path.find_first_of('/', 1);
125
126 // Let path be url's path.
127 // If url's scheme is "file", path's size is 1, and path[0] is a normalized
128 // Windows drive letter, then return.
129 if (type == ada::scheme::type::FILE &&
130 first_delimiter == std::string_view::npos && !path.empty()) {
132 helpers::substring(path, 1))) {
133 return false;
134 }
135 }
136
137 // Remove path's last item, if any.
138 if (!path.empty()) {
139 size_t slash_loc = path.rfind('/');
140 if (slash_loc != std::string_view::npos) {
141 path.remove_suffix(path.size() - slash_loc);
142 return true;
143 }
144 }
145
146 return false;
147}
148
149ada_really_inline void remove_ascii_tab_or_newline(
150 std::string& input) noexcept {
151 // if this ever becomes a performance issue, we could use an approach similar
152 // to has_tabs_or_newline
153 input.erase(std::remove_if(input.begin(), input.end(),
154 [](char c) {
155 return ada::unicode::is_ascii_tab_or_newline(c);
156 }),
157 input.end());
158}
159
160ada_really_inline constexpr std::string_view substring(std::string_view input,
161 size_t pos) noexcept {
162 ADA_ASSERT_TRUE(pos <= input.size());
163 // The following is safer but unneeded if we have the above line:
164 // return pos > input.size() ? std::string_view() : input.substr(pos);
165 return input.substr(pos);
166}
167
168ada_really_inline void resize(std::string_view& input, size_t pos) noexcept {
169 ADA_ASSERT_TRUE(pos <= input.size());
170 input.remove_suffix(input.size() - pos);
171}
172
173// computes the number of trailing zeroes
174// this is a private inline function only defined in this source file.
175ada_really_inline int trailing_zeroes(uint32_t input_num) noexcept {
176#ifdef ADA_REGULAR_VISUAL_STUDIO
177 unsigned long ret;
178 // Search the mask data from least significant bit (LSB)
179 // to the most significant bit (MSB) for a set bit (1).
180 _BitScanForward(&ret, input_num);
181 return (int)ret;
182#else // ADA_REGULAR_VISUAL_STUDIO
183 return __builtin_ctzl(input_num);
184#endif // ADA_REGULAR_VISUAL_STUDIO
185}
186
187// starting at index location, this finds the next location of a character
188// :, /, \\, ? or [. If none is found, view.size() is returned.
189// For use within get_host_delimiter_location.
190#if ADA_NEON
191// The ada_make_uint8x16_t macro is necessary because Visual Studio does not
192// support direct initialization of uint8x16_t. See
193// https://developercommunity.visualstudio.com/t/error-C2078:-too-many-initializers-whe/402911?q=backend+neon
// The macro takes sixteen byte values and expands to an immediately invoked
// lambda. The lambda stores the values in a function-local static array and
// returns that array loaded with vld1q_u8.
// The #ifndef guard keeps any definition of the macro that already exists.
194#ifndef ada_make_uint8x16_t
195#define ada_make_uint8x16_t(x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, \
196 x13, x14, x15, x16) \
197 ([=]() { \
198 static uint8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8, \
199 x9, x10, x11, x12, x13, x14, x15, x16}; \
200 return vld1q_u8(array); \
201 }())
202#endif
203
205 std::string_view view, size_t location) noexcept {
206 // first check for short strings in which case we do it naively.
207 if (view.size() - location < 16) { // slow path
208 for (size_t i = location; i < view.size(); i++) {
209 if (view[i] == ':' || view[i] == '/' || view[i] == '\\' ||
210 view[i] == '?' || view[i] == '[') {
211 return i;
212 }
213 }
214 return size_t(view.size());
215 }
216 auto to_bitmask = [](uint8x16_t input) -> uint16_t {
217 uint8x16_t bit_mask =
218 ada_make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, 0x01,
219 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
220 uint8x16_t minput = vandq_u8(input, bit_mask);
221 uint8x16_t tmp = vpaddq_u8(minput, minput);
222 tmp = vpaddq_u8(tmp, tmp);
223 tmp = vpaddq_u8(tmp, tmp);
224 return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
225 };
226
227 // fast path for long strings (expected to be common)
228 size_t i = location;
229 uint8x16_t low_mask =
230 ada_make_uint8x16_t(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
231 0x00, 0x01, 0x04, 0x04, 0x00, 0x00, 0x03);
232 uint8x16_t high_mask =
233 ada_make_uint8x16_t(0x00, 0x00, 0x02, 0x01, 0x00, 0x04, 0x00, 0x00, 0x00,
234 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
235 uint8x16_t fmask = vmovq_n_u8(0xf);
236 uint8x16_t zero{0};
237 for (; i + 15 < view.size(); i += 16) {
238 uint8x16_t word = vld1q_u8((const uint8_t*)view.data() + i);
239 uint8x16_t lowpart = vqtbl1q_u8(low_mask, vandq_u8(word, fmask));
240 uint8x16_t highpart = vqtbl1q_u8(high_mask, vshrq_n_u8(word, 4));
241 uint8x16_t classify = vandq_u8(lowpart, highpart);
242 if (vmaxvq_u32(vreinterpretq_u32_u8(classify)) != 0) {
243 uint8x16_t is_zero = vceqq_u8(classify, zero);
244 uint16_t is_non_zero = ~to_bitmask(is_zero);
245 return i + trailing_zeroes(is_non_zero);
246 }
247 }
248
249 if (i < view.size()) {
250 uint8x16_t word =
251 vld1q_u8((const uint8_t*)view.data() + view.length() - 16);
252 uint8x16_t lowpart = vqtbl1q_u8(low_mask, vandq_u8(word, fmask));
253 uint8x16_t highpart = vqtbl1q_u8(high_mask, vshrq_n_u8(word, 4));
254 uint8x16_t classify = vandq_u8(lowpart, highpart);
255 if (vmaxvq_u32(vreinterpretq_u32_u8(classify)) != 0) {
256 uint8x16_t is_zero = vceqq_u8(classify, zero);
257 uint16_t is_non_zero = ~to_bitmask(is_zero);
258 return view.length() - 16 + trailing_zeroes(is_non_zero);
259 }
260 }
261 return size_t(view.size());
262}
263#elif ADA_SSE2
265 std::string_view view, size_t location) noexcept {
266 // first check for short strings in which case we do it naively.
267 if (view.size() - location < 16) { // slow path
268 for (size_t i = location; i < view.size(); i++) {
269 if (view[i] == ':' || view[i] == '/' || view[i] == '\\' ||
270 view[i] == '?' || view[i] == '[') {
271 return i;
272 }
273 }
274 return size_t(view.size());
275 }
276 // fast path for long strings (expected to be common)
277 size_t i = location;
278 const __m128i mask1 = _mm_set1_epi8(':');
279 const __m128i mask2 = _mm_set1_epi8('/');
280 const __m128i mask3 = _mm_set1_epi8('\\');
281 const __m128i mask4 = _mm_set1_epi8('?');
282 const __m128i mask5 = _mm_set1_epi8('[');
283
284 for (; i + 15 < view.size(); i += 16) {
285 __m128i word = _mm_loadu_si128((const __m128i*)(view.data() + i));
286 __m128i m1 = _mm_cmpeq_epi8(word, mask1);
287 __m128i m2 = _mm_cmpeq_epi8(word, mask2);
288 __m128i m3 = _mm_cmpeq_epi8(word, mask3);
289 __m128i m4 = _mm_cmpeq_epi8(word, mask4);
290 __m128i m5 = _mm_cmpeq_epi8(word, mask5);
291 __m128i m = _mm_or_si128(
292 _mm_or_si128(_mm_or_si128(m1, m2), _mm_or_si128(m3, m4)), m5);
293 int mask = _mm_movemask_epi8(m);
294 if (mask != 0) {
295 return i + trailing_zeroes(mask);
296 }
297 }
298 if (i < view.size()) {
299 __m128i word =
300 _mm_loadu_si128((const __m128i*)(view.data() + view.length() - 16));
301 __m128i m1 = _mm_cmpeq_epi8(word, mask1);
302 __m128i m2 = _mm_cmpeq_epi8(word, mask2);
303 __m128i m3 = _mm_cmpeq_epi8(word, mask3);
304 __m128i m4 = _mm_cmpeq_epi8(word, mask4);
305 __m128i m5 = _mm_cmpeq_epi8(word, mask5);
306 __m128i m = _mm_or_si128(
307 _mm_or_si128(_mm_or_si128(m1, m2), _mm_or_si128(m3, m4)), m5);
308 int mask = _mm_movemask_epi8(m);
309 if (mask != 0) {
310 return view.length() - 16 + trailing_zeroes(mask);
311 }
312 }
313 return size_t(view.length());
314}
315#else
316// : / [ \\ ?
317static constexpr std::array<uint8_t, 256> special_host_delimiters =
318 []() consteval {
319 std::array<uint8_t, 256> result{};
320 for (int i : {':', '/', '[', '\\', '?'}) {
321 result[i] = 1;
322 }
323 return result;
324 }();
325// credit: @the-moisrex recommended a table-based approach
327 std::string_view view, size_t location) noexcept {
328 auto const str = view.substr(location);
329 for (auto pos = str.begin(); pos != str.end(); ++pos) {
330 if (special_host_delimiters[(uint8_t)*pos]) {
331 return pos - str.begin() + location;
332 }
333 }
334 return size_t(view.size());
335}
336#endif
337
338// starting at index location, this finds the next location of a character
339// :, /, ? or [. If none is found, view.size() is returned.
340// For use within get_host_delimiter_location.
341#if ADA_NEON
// NEON implementation. Returns the index of the first ':', '/', '?' or '[' in
// `view` at or after `location`, or view.size() when there is none.
342ada_really_inline size_t find_next_host_delimiter(std::string_view view,
343 size_t location) noexcept {
344 // first check for short strings in which case we do it naively.
345 if (view.size() - location < 16) { // slow path
346 for (size_t i = location; i < view.size(); i++) {
347 if (view[i] == ':' || view[i] == '/' || view[i] == '?' ||
348 view[i] == '[') {
349 return i;
350 }
351 }
352 return size_t(view.size());
353 }
 // to_bitmask keeps one distinct bit per lane within each 8-lane half and
 // then sums each half with three pairwise additions, so byte 0 of the result
 // collects lanes 0-7 and byte 1 collects lanes 8-15. Those two bytes are
 // returned together as a 16-bit value.
354 auto to_bitmask = [](uint8x16_t input) -> uint16_t {
355 uint8x16_t bit_mask =
356 ada_make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, 0x01,
357 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
358 uint8x16_t minput = vandq_u8(input, bit_mask);
359 uint8x16_t tmp = vpaddq_u8(minput, minput);
360 tmp = vpaddq_u8(tmp, tmp);
361 tmp = vpaddq_u8(tmp, tmp);
362 return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
363 };
364
365 // fast path for long strings (expected to be common)
366 size_t i = location;
 // Two 16-entry tables, indexed by the low and the high nibble of each byte.
 // A byte is reported when the AND of its two table entries is nonzero:
 // ':' (0x3a) gives 0x01 & 0x01, '/' (0x2f) gives 0x03 & 0x02,
 // '?' (0x3f) gives 0x03 & 0x01 and '[' (0x5b) gives 0x04 & 0x04.
367 uint8x16_t low_mask =
368 ada_make_uint8x16_t(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
369 0x00, 0x01, 0x04, 0x00, 0x00, 0x00, 0x03);
370 uint8x16_t high_mask =
371 ada_make_uint8x16_t(0x00, 0x00, 0x02, 0x01, 0x00, 0x04, 0x00, 0x00, 0x00,
372 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
373 uint8x16_t fmask = vmovq_n_u8(0xf);
374 uint8x16_t zero{0};
 // Main loop: classify 16 bytes per iteration while a full block remains.
375 for (; i + 15 < view.size(); i += 16) {
376 uint8x16_t word = vld1q_u8((const uint8_t*)view.data() + i);
377 uint8x16_t lowpart = vqtbl1q_u8(low_mask, vandq_u8(word, fmask));
378 uint8x16_t highpart = vqtbl1q_u8(high_mask, vshrq_n_u8(word, 4));
379 uint8x16_t classify = vandq_u8(lowpart, highpart);
380 if (vmaxvq_u32(vreinterpretq_u32_u8(classify)) != 0) {
 // At least one lane matched: invert the "lane is zero" mask and take its
 // lowest set bit as the offset of the match inside this block.
381 uint8x16_t is_zero = vceqq_u8(classify, zero);
382 uint16_t is_non_zero = ~to_bitmask(is_zero);
383 return i + trailing_zeroes(is_non_zero);
384 }
385 }
386
 // Tail: fewer than 16 bytes are left, so the last 16 bytes of the view are
 // loaded instead. That block starts at or after `location` because at least
 // 16 bytes were available on entry, and any bytes it shares with the main
 // loop were already found to contain no match.
387 if (i < view.size()) {
388 uint8x16_t word =
389 vld1q_u8((const uint8_t*)view.data() + view.length() - 16);
390 uint8x16_t lowpart = vqtbl1q_u8(low_mask, vandq_u8(word, fmask));
391 uint8x16_t highpart = vqtbl1q_u8(high_mask, vshrq_n_u8(word, 4));
392 uint8x16_t classify = vandq_u8(lowpart, highpart);
393 if (vmaxvq_u32(vreinterpretq_u32_u8(classify)) != 0) {
394 uint8x16_t is_zero = vceqq_u8(classify, zero);
395 uint16_t is_non_zero = ~to_bitmask(is_zero);
396 return view.length() - 16 + trailing_zeroes(is_non_zero);
397 }
398 }
399 return size_t(view.size());
400}
401#elif ADA_SSE2
402ada_really_inline size_t find_next_host_delimiter(std::string_view view,
403 size_t location) noexcept {
404 // first check for short strings in which case we do it naively.
405 if (view.size() - location < 16) { // slow path
406 for (size_t i = location; i < view.size(); i++) {
407 if (view[i] == ':' || view[i] == '/' || view[i] == '?' ||
408 view[i] == '[') {
409 return i;
410 }
411 }
412 return size_t(view.size());
413 }
414 // fast path for long strings (expected to be common)
415 size_t i = location;
416 const __m128i mask1 = _mm_set1_epi8(':');
417 const __m128i mask2 = _mm_set1_epi8('/');
418 const __m128i mask4 = _mm_set1_epi8('?');
419 const __m128i mask5 = _mm_set1_epi8('[');
420
421 for (; i + 15 < view.size(); i += 16) {
422 __m128i word = _mm_loadu_si128((const __m128i*)(view.data() + i));
423 __m128i m1 = _mm_cmpeq_epi8(word, mask1);
424 __m128i m2 = _mm_cmpeq_epi8(word, mask2);
425 __m128i m4 = _mm_cmpeq_epi8(word, mask4);
426 __m128i m5 = _mm_cmpeq_epi8(word, mask5);
427 __m128i m = _mm_or_si128(_mm_or_si128(m1, m2), _mm_or_si128(m4, m5));
428 int mask = _mm_movemask_epi8(m);
429 if (mask != 0) {
430 return i + trailing_zeroes(mask);
431 }
432 }
433 if (i < view.size()) {
434 __m128i word =
435 _mm_loadu_si128((const __m128i*)(view.data() + view.length() - 16));
436 __m128i m1 = _mm_cmpeq_epi8(word, mask1);
437 __m128i m2 = _mm_cmpeq_epi8(word, mask2);
438 __m128i m4 = _mm_cmpeq_epi8(word, mask4);
439 __m128i m5 = _mm_cmpeq_epi8(word, mask5);
440 __m128i m = _mm_or_si128(_mm_or_si128(m1, m2), _mm_or_si128(m4, m5));
441 int mask = _mm_movemask_epi8(m);
442 if (mask != 0) {
443 return view.length() - 16 + trailing_zeroes(mask);
444 }
445 }
446 return size_t(view.length());
447}
448#else
449// : / [ ?
450static constexpr std::array<uint8_t, 256> host_delimiters = []() consteval {
451 std::array<uint8_t, 256> result{};
452 for (int i : {':', '/', '?', '['}) {
453 result[i] = 1;
454 }
455 return result;
456}();
457// credit: @the-moisrex recommended a table-based approach
458ada_really_inline size_t find_next_host_delimiter(std::string_view view,
459 size_t location) noexcept {
460 auto const str = view.substr(location);
461 for (auto pos = str.begin(); pos != str.end(); ++pos) {
462 if (host_delimiters[(uint8_t)*pos]) {
463 return pos - str.begin() + location;
464 }
465 }
466 return size_t(view.size());
467}
468#endif
469
470ada_really_inline std::pair<size_t, bool> get_host_delimiter_location(
471 const bool is_special, std::string_view& view) noexcept {
480 const size_t view_size = view.size();
481 size_t location = 0;
482 bool found_colon = false;
502 if (is_special) {
503 // We move to the next delimiter.
504 location = find_next_host_delimiter_special(view, location);
505 // Unless we find '[' then we are going only going to have to call
506 // find_next_host_delimiter_special once.
507 for (; location < view_size;
508 location = find_next_host_delimiter_special(view, location)) {
509 if (view[location] == '[') {
510 location = view.find(']', location);
511 if (location == std::string_view::npos) {
512 // performance: view.find might get translated to a memchr, which
513 // has no notion of std::string_view::npos, so the code does not
514 // reflect the assembly.
515 location = view_size;
516 break;
517 }
518 } else {
519 found_colon = view[location] == ':';
520 break;
521 }
522 }
523 } else {
524 // We move to the next delimiter.
525 location = find_next_host_delimiter(view, location);
526 // Unless we find '[' then we are going only going to have to call
527 // find_next_host_delimiter_special once.
528 for (; location < view_size;
529 location = find_next_host_delimiter(view, location)) {
530 if (view[location] == '[') {
531 location = view.find(']', location);
532 if (location == std::string_view::npos) {
533 // performance: view.find might get translated to a memchr, which
534 // has no notion of std::string_view::npos, so the code does not
535 // reflect the assembly.
536 location = view_size;
537 break;
538 }
539 } else {
540 found_colon = view[location] == ':';
541 break;
542 }
543 }
544 }
545 // performance: remove_suffix may translate into a single instruction.
546 view.remove_suffix(view_size - location);
547 return {location, found_colon};
548}
549
550ada_really_inline void trim_c0_whitespace(std::string_view& input) noexcept {
551 while (!input.empty() &&
552 ada::unicode::is_c0_control_or_space(input.front())) {
553 input.remove_prefix(1);
554 }
555 while (!input.empty() && ada::unicode::is_c0_control_or_space(input.back())) {
556 input.remove_suffix(1);
557 }
558}
559
560ada_really_inline void parse_prepared_path(std::string_view input,
562 std::string& path) {
563 ada_log("parse_prepared_path ", input);
564 uint8_t accumulator = checkers::path_signature(input);
565 // Let us first detect a trivial case.
566 // If it is special, we check that we have no dot, no %, no \ and no
567 // character needing percent encoding. Otherwise, we check that we have no %,
568 // no dot, and no character needing percent encoding.
569 constexpr uint8_t need_encoding = 1;
570 constexpr uint8_t backslash_char = 2;
571 constexpr uint8_t dot_char = 4;
572 constexpr uint8_t percent_char = 8;
573 bool special = type != ada::scheme::NOT_SPECIAL;
574 bool may_need_slow_file_handling = (type == ada::scheme::type::FILE &&
576 bool trivial_path =
577 (special ? (accumulator == 0)
578 : ((accumulator & (need_encoding | dot_char | percent_char)) ==
579 0)) &&
580 (!may_need_slow_file_handling);
581 if (accumulator == dot_char && !may_need_slow_file_handling) {
582 // '4' means that we have at least one dot, but nothing that requires
583 // percent encoding or decoding. The only part that is not trivial is
584 // that we may have single dots and double dots path segments.
585 // If we have such segments, then we either have a path that begins
586 // with '.' (easy to check), or we have the sequence './'.
587 // Note: input cannot be empty, it must at least contain one character ('.')
588 // Note: we know that '\' is not present.
589 if (input[0] != '.') {
590 size_t slashdot = input.find("/.");
591 if (slashdot == std::string_view::npos) { // common case
592 trivial_path = true;
593 } else { // uncommon
594 // only three cases matter: /./, /.. or a final /
595 trivial_path =
596 !(slashdot + 2 == input.size() || input[slashdot + 2] == '.' ||
597 input[slashdot + 2] == '/');
598 }
599 }
600 }
601 if (trivial_path) {
602 ada_log("parse_path trivial");
603 path += '/';
604 path += input;
605 return;
606 }
607 // We are going to need to look a bit at the path, but let us see if we can
608 // ignore percent encoding *and* backslashes *and* percent characters.
609 // Except for the trivial case, this is likely to capture 99% of paths out
610 // there.
611 bool fast_path =
612 (special &&
613 (accumulator & (need_encoding | backslash_char | percent_char)) == 0) &&
614 (type != ada::scheme::type::FILE);
615 if (fast_path) {
616 ada_log("parse_prepared_path fast");
617 // Here we don't need to worry about \ or percent encoding.
618 // We also do not have a file protocol. We might have dots, however,
619 // but dots must as appear as '.', and they cannot be encoded because
620 // the symbol '%' is not present.
621 size_t previous_location = 0; // We start at 0.
622 do {
623 size_t new_location = input.find('/', previous_location);
624 // std::string_view path_view = input;
625 // We process the last segment separately:
626 if (new_location == std::string_view::npos) {
627 std::string_view path_view = input.substr(previous_location);
628 if (path_view == "..") { // The path ends with ..
629 // e.g., if you receive ".." with an empty path, you go to "/".
630 if (path.empty()) {
631 path = '/';
632 return;
633 }
634 // Fast case where we have nothing to do:
635 if (path.back() == '/') {
636 return;
637 }
638 // If you have the path "/joe/myfriend",
639 // then you delete 'myfriend'.
640 path.resize(path.rfind('/') + 1);
641 return;
642 }
643 path += '/';
644 if (path_view != ".") {
645 path.append(path_view);
646 }
647 return;
648 } else {
649 // This is a non-final segment.
650 std::string_view path_view =
651 input.substr(previous_location, new_location - previous_location);
652 previous_location = new_location + 1;
653 if (path_view == "..") {
654 size_t last_delimiter = path.rfind('/');
655 if (last_delimiter != std::string::npos) {
656 path.erase(last_delimiter);
657 }
658 } else if (path_view != ".") {
659 path += '/';
660 path.append(path_view);
661 }
662 }
663 } while (true);
664 } else {
665 ada_log("parse_path slow");
666 // we have reached the general case
667 bool needs_percent_encoding = (accumulator & 1);
668 std::string path_buffer_tmp;
669 do {
670 size_t location = (special && (accumulator & 2))
671 ? input.find_first_of("/\\")
672 : input.find('/');
673 std::string_view path_view = input;
674 if (location != std::string_view::npos) {
675 path_view.remove_suffix(path_view.size() - location);
676 input.remove_prefix(location + 1);
677 }
678 // path_buffer is either path_view or it might point at a percent encoded
679 // temporary file.
680 std::string_view path_buffer =
681 (needs_percent_encoding &&
682 ada::unicode::percent_encode<false>(
683 path_view, character_sets::PATH_PERCENT_ENCODE, path_buffer_tmp))
684 ? path_buffer_tmp
685 : path_view;
686 if (unicode::is_double_dot_path_segment(path_buffer)) {
687 if ((helpers::shorten_path(path, type) || special) &&
688 location == std::string_view::npos) {
689 path += '/';
690 }
691 } else if (unicode::is_single_dot_path_segment(path_buffer) &&
692 (location == std::string_view::npos)) {
693 path += '/';
694 }
695 // Otherwise, if path_buffer is not a single-dot path segment, then:
696 else if (!unicode::is_single_dot_path_segment(path_buffer)) {
697 // If url's scheme is "file", url's path is empty, and path_buffer is a
698 // Windows drive letter, then replace the second code point in
699 // path_buffer with U+003A (:).
700 if (type == ada::scheme::type::FILE && path.empty() &&
702 path += '/';
703 path += path_buffer[0];
704 path += ':';
705 path_buffer.remove_prefix(2);
706 path.append(path_buffer);
707 } else {
708 // Append path_buffer to url's path.
709 path += '/';
710 path.append(path_buffer);
711 }
712 }
713 if (location == std::string_view::npos) {
714 return;
715 }
716 } while (true);
717 }
718}
719
720bool overlaps(std::string_view input1, const std::string& input2) noexcept {
721 ada_log("helpers::overlaps check if string_view '", input1, "' [",
722 input1.size(), " bytes] is part of string '", input2, "' [",
723 input2.size(), " bytes]");
724 return !input1.empty() && !input2.empty() && input1.data() >= input2.data() &&
725 input1.data() < input2.data() + input2.size();
726}
727
728template <class url_type>
729ada_really_inline void strip_trailing_spaces_from_opaque_path(
730 url_type& url) noexcept {
731 ada_log("helpers::strip_trailing_spaces_from_opaque_path");
732 if (!url.has_opaque_path) return;
733 if (url.has_hash()) return;
734 if (url.has_search()) return;
735
736 auto path = std::string(url.get_pathname());
737 while (!path.empty() && path.back() == ' ') {
738 path.resize(path.size() - 1);
739 }
740 url.update_base_pathname(path);
741}
742
743// @ / \\ ?
744static constexpr std::array<uint8_t, 256> authority_delimiter_special =
745 []() consteval {
746 std::array<uint8_t, 256> result{};
747 for (uint8_t i : {'@', '/', '\\', '?'}) {
748 result[i] = 1;
749 }
750 return result;
751 }();
752// credit: @the-moisrex recommended a table-based approach
754find_authority_delimiter_special(std::string_view view) noexcept {
755 // performance note: we might be able to gain further performance
756 // with SIMD intrinsics.
757 for (auto pos = view.begin(); pos != view.end(); ++pos) {
758 if (authority_delimiter_special[(uint8_t)*pos]) {
759 return pos - view.begin();
760 }
761 }
762 return size_t(view.size());
763}
764
765// @ / ?
766static constexpr std::array<uint8_t, 256> authority_delimiter = []() consteval {
767 std::array<uint8_t, 256> result{};
768 for (uint8_t i : {'@', '/', '?'}) {
769 result[i] = 1;
770 }
771 return result;
772}();
773// credit: @the-moisrex recommended a table-based approach
775find_authority_delimiter(std::string_view view) noexcept {
776 // performance note: we might be able to gain further performance
777 // with SIMD intrinsics.
778 for (auto pos = view.begin(); pos != view.end(); ++pos) {
779 if (authority_delimiter[(uint8_t)*pos]) {
780 return pos - view.begin();
781 }
782 }
783 return size_t(view.size());
784}
785
786} // namespace ada::helpers
787
788namespace ada {
792#undef ada_make_uint8x16_t
793} // namespace ada
Includes all definitions for Ada.
Definitions for URL specific checkers used within Ada.
Common definitions for cross-platform compiler support.
#define ADA_ASSERT_TRUE(COND)
#define ada_unused
Definition common_defs.h:87
#define ada_warn_unused
Definition common_defs.h:88
#define ada_really_inline
Definition common_defs.h:84
constexpr uint8_t PATH_PERCENT_ENCODE[32]
constexpr bool is_normalized_windows_drive_letter(std::string_view input) noexcept
constexpr bool is_windows_drive_letter(std::string_view input) noexcept
Includes the definitions for helper functions.
ada_really_inline size_t find_next_host_delimiter(std::string_view view, size_t location) noexcept
Definition helpers.cpp:458
static constexpr std::array< uint8_t, 256 > authority_delimiter_special
Definition helpers.cpp:744
static constexpr std::array< uint8_t, 256 > host_delimiters
Definition helpers.cpp:450
ada_really_inline size_t find_next_host_delimiter_special(std::string_view view, size_t location) noexcept
Definition helpers.cpp:326
ada_unused std::string get_state(ada::state s)
Definition helpers.cpp:36
static constexpr std::array< uint8_t, 256 > authority_delimiter
Definition helpers.cpp:766
static constexpr std::array< uint8_t, 256 > special_host_delimiters
Definition helpers.cpp:317
ada_really_inline int trailing_zeroes(uint32_t input_num) noexcept
Definition helpers.cpp:175
@ NOT_SPECIAL
Definition scheme.h:32
Definition ada_idna.h:13
ada_warn_unused std::string to_string(encoding_type type)
state
Definition state.h:17
@ SPECIAL_RELATIVE_OR_AUTHORITY
@ SPECIAL_AUTHORITY_SLASHES
@ SPECIAL_AUTHORITY_IGNORE_SLASHES
tl::expected< result_type, ada::errors > result
Declarations for the URL scheme.