Ada 3.4.0
Fast spec-compliant URL parser
Loading...
Searching...
No Matches
helpers.cpp
Go to the documentation of this file.
1#include <cstring>
2#include <sstream>
3
4#include "ada/checkers-inl.h"
5#include "ada/common_defs.h"
6#include "ada/scheme.h"
7
8#if ADA_SSSE3
9#include <tmmintrin.h>
10#endif
11
12namespace ada::helpers {
13
14template <typename out_iter>
15void encode_json(std::string_view view, out_iter out) {
16 // trivial implementation. could be faster.
17 const char* hexvalues =
18 "000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f";
19 for (uint8_t c : view) {
20 if (c == '\\') {
21 *out++ = '\\';
22 *out++ = '\\';
23 } else if (c == '"') {
24 *out++ = '\\';
25 *out++ = '"';
26 } else if (c <= 0x1f) {
27 *out++ = '\\';
28 *out++ = 'u';
29 *out++ = '0';
30 *out++ = '0';
31 *out++ = hexvalues[2 * c];
32 *out++ = hexvalues[2 * c + 1];
33 } else {
34 *out++ = c;
35 }
36 }
37}
38
40 switch (s) {
42 return "Authority";
44 return "Scheme Start";
46 return "Scheme";
48 return "Host";
50 return "No Scheme";
52 return "Fragment";
54 return "Relative Scheme";
56 return "Relative Slash";
58 return "File";
60 return "File Host";
62 return "File Slash";
64 return "Path or Authority";
66 return "Special Authority Ignore Slashes";
68 return "Special Authority Slashes";
70 return "Special Relative or Authority";
72 return "Query";
74 return "Path";
76 return "Path Start";
78 return "Opaque Path";
80 return "Port";
81 default:
82 return "unknown state";
83 }
84}
85
86ada_really_inline std::optional<std::string_view> prune_hash(
87 std::string_view& input) noexcept {
88 // compiles down to 20--30 instructions including a class to memchr (C
89 // function). this function should be quite fast.
90 size_t location_of_first = input.find('#');
91 if (location_of_first == std::string_view::npos) {
92 return std::nullopt;
93 }
94 std::string_view hash = input;
95 hash.remove_prefix(location_of_first + 1);
96 input.remove_suffix(input.size() - location_of_first);
97 return hash;
98}
99
100ada_really_inline bool shorten_path(std::string& path,
101 ada::scheme::type type) noexcept {
102 // Let path be url's path.
103 // If url's scheme is "file", path's size is 1, and path[0] is a normalized
104 // Windows drive letter, then return.
105 if (type == ada::scheme::type::FILE &&
106 path.find('/', 1) == std::string_view::npos && !path.empty()) {
108 helpers::substring(path, 1))) {
109 return false;
110 }
111 }
112
113 // Remove path's last item, if any.
114 size_t last_delimiter = path.rfind('/');
115 if (last_delimiter != std::string::npos) {
116 path.erase(last_delimiter);
117 return true;
118 }
119
120 return false;
121}
122
123ada_really_inline bool shorten_path(std::string_view& path,
124 ada::scheme::type type) noexcept {
125 // Let path be url's path.
126 // If url's scheme is "file", path's size is 1, and path[0] is a normalized
127 // Windows drive letter, then return.
128 if (type == ada::scheme::type::FILE &&
129 path.find('/', 1) == std::string_view::npos && !path.empty()) {
131 helpers::substring(path, 1))) {
132 return false;
133 }
134 }
135
136 // Remove path's last item, if any.
137 if (!path.empty()) {
138 size_t slash_loc = path.rfind('/');
139 if (slash_loc != std::string_view::npos) {
140 path.remove_suffix(path.size() - slash_loc);
141 return true;
142 }
143 }
144
145 return false;
146}
147
148ada_really_inline void remove_ascii_tab_or_newline(
149 std::string& input) noexcept {
150 // if this ever becomes a performance issue, we could use an approach similar
151 // to has_tabs_or_newline
152 std::erase_if(input, ada::unicode::is_ascii_tab_or_newline);
153}
154
155ada_really_inline constexpr std::string_view substring(std::string_view input,
156 size_t pos) noexcept {
157 ADA_ASSERT_TRUE(pos <= input.size());
158 // The following is safer but unneeded if we have the above line:
159 // return pos > input.size() ? std::string_view() : input.substr(pos);
160 return input.substr(pos);
161}
162
163ada_really_inline void resize(std::string_view& input, size_t pos) noexcept {
164 ADA_ASSERT_TRUE(pos <= input.size());
165 input.remove_suffix(input.size() - pos);
166}
167
168// computes the number of trailing zeroes
169// this is a private inline function only defined in this source file.
170ada_really_inline int trailing_zeroes(uint32_t input_num) noexcept {
171#ifdef ADA_REGULAR_VISUAL_STUDIO
172 unsigned long ret;
173 // Search the mask data from least significant bit (LSB)
174 // to the most significant bit (MSB) for a set bit (1).
175 _BitScanForward(&ret, input_num);
176 return (int)ret;
177#else // ADA_REGULAR_VISUAL_STUDIO
178 return __builtin_ctzl(input_num);
179#endif // ADA_REGULAR_VISUAL_STUDIO
180}
181
182// starting at index location, this finds the next location of a character
183// :, /, \\, ? or [. If none is found, view.size() is returned.
184// For use within get_host_delimiter_location.
185#if ADA_SSSE3
187 std::string_view view, size_t location) noexcept {
188 // first check for short strings in which case we do it naively.
189 if (view.size() - location < 16) { // slow path
190 for (size_t i = location; i < view.size(); i++) {
191 if (view[i] == ':' || view[i] == '/' || view[i] == '\\' ||
192 view[i] == '?' || view[i] == '[') {
193 return i;
194 }
195 }
196 return size_t(view.size());
197 }
198 // fast path for long strings (expected to be common)
199 // Using SSSE3's _mm_shuffle_epi8 for table lookup (same approach as NEON)
200 size_t i = location;
201 const __m128i low_mask =
202 _mm_setr_epi8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
203 0x01, 0x04, 0x04, 0x00, 0x00, 0x03);
204 const __m128i high_mask =
205 _mm_setr_epi8(0x00, 0x00, 0x02, 0x01, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00,
206 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
207 const __m128i fmask = _mm_set1_epi8(0xf);
208 const __m128i zero = _mm_setzero_si128();
209 for (; i + 15 < view.size(); i += 16) {
210 __m128i word = _mm_loadu_si128((const __m128i*)(view.data() + i));
211 __m128i lowpart = _mm_shuffle_epi8(low_mask, _mm_and_si128(word, fmask));
212 __m128i highpart = _mm_shuffle_epi8(
213 high_mask, _mm_and_si128(_mm_srli_epi16(word, 4), fmask));
214 __m128i classify = _mm_and_si128(lowpart, highpart);
215 __m128i is_zero = _mm_cmpeq_epi8(classify, zero);
216 // _mm_movemask_epi8 returns a 16-bit mask in bits 0-15, with bits 16-31
217 // zero. After NOT (~), bits 16-31 become 1. We must mask to 16 bits to
218 // avoid false positives.
219 int mask = ~_mm_movemask_epi8(is_zero) & 0xFFFF;
220 if (mask != 0) {
221 return i + trailing_zeroes(static_cast<uint32_t>(mask));
222 }
223 }
224 if (i < view.size()) {
225 __m128i word =
226 _mm_loadu_si128((const __m128i*)(view.data() + view.length() - 16));
227 __m128i lowpart = _mm_shuffle_epi8(low_mask, _mm_and_si128(word, fmask));
228 __m128i highpart = _mm_shuffle_epi8(
229 high_mask, _mm_and_si128(_mm_srli_epi16(word, 4), fmask));
230 __m128i classify = _mm_and_si128(lowpart, highpart);
231 __m128i is_zero = _mm_cmpeq_epi8(classify, zero);
232 // _mm_movemask_epi8 returns a 16-bit mask in bits 0-15, with bits 16-31
233 // zero. After NOT (~), bits 16-31 become 1. We must mask to 16 bits to
234 // avoid false positives.
235 int mask = ~_mm_movemask_epi8(is_zero) & 0xFFFF;
236 if (mask != 0) {
237 return view.length() - 16 + trailing_zeroes(static_cast<uint32_t>(mask));
238 }
239 }
240 return size_t(view.size());
241}
242#elif ADA_NEON
243// The ada_make_uint8x16_t macro is necessary because Visual Studio does not
244// support direct initialization of uint8x16_t. See
245// https://developercommunity.visualstudio.com/t/error-C2078:-too-many-initializers-whe/402911?q=backend+neon
246#ifndef ada_make_uint8x16_t
247#define ada_make_uint8x16_t(x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, \
248 x13, x14, x15, x16) \
249 ([=]() { \
250 static uint8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8, \
251 x9, x10, x11, x12, x13, x14, x15, x16}; \
252 return vld1q_u8(array); \
253 }())
254#endif
255
257 std::string_view view, size_t location) noexcept {
258 // first check for short strings in which case we do it naively.
259 if (view.size() - location < 16) { // slow path
260 for (size_t i = location; i < view.size(); i++) {
261 if (view[i] == ':' || view[i] == '/' || view[i] == '\\' ||
262 view[i] == '?' || view[i] == '[') {
263 return i;
264 }
265 }
266 return size_t(view.size());
267 }
268 auto to_bitmask = [](uint8x16_t input) -> uint16_t {
269 uint8x16_t bit_mask =
270 ada_make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, 0x01,
271 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
272 uint8x16_t minput = vandq_u8(input, bit_mask);
273 uint8x16_t tmp = vpaddq_u8(minput, minput);
274 tmp = vpaddq_u8(tmp, tmp);
275 tmp = vpaddq_u8(tmp, tmp);
276 return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
277 };
278
279 // fast path for long strings (expected to be common)
280 size_t i = location;
281 uint8x16_t low_mask =
282 ada_make_uint8x16_t(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
283 0x00, 0x01, 0x04, 0x04, 0x00, 0x00, 0x03);
284 uint8x16_t high_mask =
285 ada_make_uint8x16_t(0x00, 0x00, 0x02, 0x01, 0x00, 0x04, 0x00, 0x00, 0x00,
286 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
287 uint8x16_t fmask = vmovq_n_u8(0xf);
288 uint8x16_t zero{0};
289 for (; i + 15 < view.size(); i += 16) {
290 uint8x16_t word = vld1q_u8((const uint8_t*)view.data() + i);
291 uint8x16_t lowpart = vqtbl1q_u8(low_mask, vandq_u8(word, fmask));
292 uint8x16_t highpart = vqtbl1q_u8(high_mask, vshrq_n_u8(word, 4));
293 uint8x16_t classify = vandq_u8(lowpart, highpart);
294 if (vmaxvq_u32(vreinterpretq_u32_u8(classify)) != 0) {
295 uint8x16_t is_zero = vceqq_u8(classify, zero);
296 uint16_t is_non_zero = ~to_bitmask(is_zero);
297 return i + trailing_zeroes(is_non_zero);
298 }
299 }
300
301 if (i < view.size()) {
302 uint8x16_t word =
303 vld1q_u8((const uint8_t*)view.data() + view.length() - 16);
304 uint8x16_t lowpart = vqtbl1q_u8(low_mask, vandq_u8(word, fmask));
305 uint8x16_t highpart = vqtbl1q_u8(high_mask, vshrq_n_u8(word, 4));
306 uint8x16_t classify = vandq_u8(lowpart, highpart);
307 if (vmaxvq_u32(vreinterpretq_u32_u8(classify)) != 0) {
308 uint8x16_t is_zero = vceqq_u8(classify, zero);
309 uint16_t is_non_zero = ~to_bitmask(is_zero);
310 return view.length() - 16 + trailing_zeroes(is_non_zero);
311 }
312 }
313 return size_t(view.size());
314}
315#elif ADA_SSE2
317 std::string_view view, size_t location) noexcept {
318 // first check for short strings in which case we do it naively.
319 if (view.size() - location < 16) { // slow path
320 for (size_t i = location; i < view.size(); i++) {
321 if (view[i] == ':' || view[i] == '/' || view[i] == '\\' ||
322 view[i] == '?' || view[i] == '[') {
323 return i;
324 }
325 }
326 return size_t(view.size());
327 }
328 // fast path for long strings (expected to be common)
329 size_t i = location;
330 const __m128i mask1 = _mm_set1_epi8(':');
331 const __m128i mask2 = _mm_set1_epi8('/');
332 const __m128i mask3 = _mm_set1_epi8('\\');
333 const __m128i mask4 = _mm_set1_epi8('?');
334 const __m128i mask5 = _mm_set1_epi8('[');
335
336 for (; i + 15 < view.size(); i += 16) {
337 __m128i word = _mm_loadu_si128((const __m128i*)(view.data() + i));
338 __m128i m1 = _mm_cmpeq_epi8(word, mask1);
339 __m128i m2 = _mm_cmpeq_epi8(word, mask2);
340 __m128i m3 = _mm_cmpeq_epi8(word, mask3);
341 __m128i m4 = _mm_cmpeq_epi8(word, mask4);
342 __m128i m5 = _mm_cmpeq_epi8(word, mask5);
343 __m128i m = _mm_or_si128(
344 _mm_or_si128(_mm_or_si128(m1, m2), _mm_or_si128(m3, m4)), m5);
345 int mask = _mm_movemask_epi8(m);
346 if (mask != 0) {
347 return i + trailing_zeroes(mask);
348 }
349 }
350 if (i < view.size()) {
351 __m128i word =
352 _mm_loadu_si128((const __m128i*)(view.data() + view.length() - 16));
353 __m128i m1 = _mm_cmpeq_epi8(word, mask1);
354 __m128i m2 = _mm_cmpeq_epi8(word, mask2);
355 __m128i m3 = _mm_cmpeq_epi8(word, mask3);
356 __m128i m4 = _mm_cmpeq_epi8(word, mask4);
357 __m128i m5 = _mm_cmpeq_epi8(word, mask5);
358 __m128i m = _mm_or_si128(
359 _mm_or_si128(_mm_or_si128(m1, m2), _mm_or_si128(m3, m4)), m5);
360 int mask = _mm_movemask_epi8(m);
361 if (mask != 0) {
362 return view.length() - 16 + trailing_zeroes(mask);
363 }
364 }
365 return size_t(view.length());
366}
367#elif ADA_LSX
369 std::string_view view, size_t location) noexcept {
370 // first check for short strings in which case we do it naively.
371 if (view.size() - location < 16) { // slow path
372 for (size_t i = location; i < view.size(); i++) {
373 if (view[i] == ':' || view[i] == '/' || view[i] == '\\' ||
374 view[i] == '?' || view[i] == '[') {
375 return i;
376 }
377 }
378 return size_t(view.size());
379 }
380 // fast path for long strings (expected to be common)
381 size_t i = location;
382 const __m128i mask1 = __lsx_vrepli_b(':');
383 const __m128i mask2 = __lsx_vrepli_b('/');
384 const __m128i mask3 = __lsx_vrepli_b('\\');
385 const __m128i mask4 = __lsx_vrepli_b('?');
386 const __m128i mask5 = __lsx_vrepli_b('[');
387
388 for (; i + 15 < view.size(); i += 16) {
389 __m128i word = __lsx_vld((const __m128i*)(view.data() + i), 0);
390 __m128i m1 = __lsx_vseq_b(word, mask1);
391 __m128i m2 = __lsx_vseq_b(word, mask2);
392 __m128i m3 = __lsx_vseq_b(word, mask3);
393 __m128i m4 = __lsx_vseq_b(word, mask4);
394 __m128i m5 = __lsx_vseq_b(word, mask5);
395 __m128i m =
396 __lsx_vor_v(__lsx_vor_v(__lsx_vor_v(m1, m2), __lsx_vor_v(m3, m4)), m5);
397 int mask = __lsx_vpickve2gr_hu(__lsx_vmsknz_b(m), 0);
398 if (mask != 0) {
399 return i + trailing_zeroes(mask);
400 }
401 }
402 if (i < view.size()) {
403 __m128i word =
404 __lsx_vld((const __m128i*)(view.data() + view.length() - 16), 0);
405 __m128i m1 = __lsx_vseq_b(word, mask1);
406 __m128i m2 = __lsx_vseq_b(word, mask2);
407 __m128i m3 = __lsx_vseq_b(word, mask3);
408 __m128i m4 = __lsx_vseq_b(word, mask4);
409 __m128i m5 = __lsx_vseq_b(word, mask5);
410 __m128i m =
411 __lsx_vor_v(__lsx_vor_v(__lsx_vor_v(m1, m2), __lsx_vor_v(m3, m4)), m5);
412 int mask = __lsx_vpickve2gr_hu(__lsx_vmsknz_b(m), 0);
413 if (mask != 0) {
414 return view.length() - 16 + trailing_zeroes(mask);
415 }
416 }
417 return size_t(view.length());
418}
419#elif ADA_RVV
421 std::string_view view, size_t location) noexcept {
422 // The LUT approach was a bit slower on the SpacemiT X60, but I could see it
423 // being faster on future hardware.
424#if 0
425 // LUT generated using: s=":/\\?["; list(zip([((ord(c)>>2)&0xF)for c in s],s))
426 static const uint8_t tbl[16] = {
427 0xF, 0, 0, 0, 0, 0, '[', '\\', 0, 0, 0, '/', 0, 0, ':', '?'
428 };
429 vuint8m1_t vtbl = __riscv_vle8_v_u8m1(tbl, 16);
430#endif
431 uint8_t* src = (uint8_t*)view.data() + location;
432 for (size_t vl, n = view.size() - location; n > 0;
433 n -= vl, src += vl, location += vl) {
434 vl = __riscv_vsetvl_e8m1(n);
435 vuint8m1_t v = __riscv_vle8_v_u8m1(src, vl);
436#if 0
437 vuint8m1_t vidx = __riscv_vand(__riscv_vsrl(v, 2, vl), 0xF, vl);
438 vuint8m1_t vlut = __riscv_vrgather(vtbl, vidx, vl);
439 vbool8_t m = __riscv_vmseq(v, vlut, vl);
440#else
441 vbool8_t m1 = __riscv_vmseq(v, ':', vl);
442 vbool8_t m2 = __riscv_vmseq(v, '/', vl);
443 vbool8_t m3 = __riscv_vmseq(v, '?', vl);
444 vbool8_t m4 = __riscv_vmseq(v, '[', vl);
445 vbool8_t m5 = __riscv_vmseq(v, '\\', vl);
446 vbool8_t m = __riscv_vmor(
447 __riscv_vmor(__riscv_vmor(m1, m2, vl), __riscv_vmor(m3, m4, vl), vl),
448 m5, vl);
449#endif
450 long idx = __riscv_vfirst(m, vl);
451 if (idx >= 0) return location + idx;
452 }
453 return size_t(view.size());
454}
455#else
456// : / [ \\ ?
457static constexpr std::array<uint8_t, 256> special_host_delimiters =
458 []() consteval {
459 std::array<uint8_t, 256> result{};
460 for (int i : {':', '/', '[', '\\', '?'}) {
461 result[i] = 1;
462 }
463 return result;
464 }();
465// credit: @the-moisrex recommended a table-based approach
467 std::string_view view, size_t location) noexcept {
468 auto const str = view.substr(location);
469 for (auto pos = str.begin(); pos != str.end(); ++pos) {
470 if (special_host_delimiters[(uint8_t)*pos]) {
471 return pos - str.begin() + location;
472 }
473 }
474 return size_t(view.size());
475}
476#endif
477
478// starting at index location, this finds the next location of a character
479// :, /, ? or [. If none is found, view.size() is returned.
480// For use within get_host_delimiter_location.
481#if ADA_SSSE3
482ada_really_inline size_t find_next_host_delimiter(std::string_view view,
483 size_t location) noexcept {
484 // first check for short strings in which case we do it naively.
485 if (view.size() - location < 16) { // slow path
486 for (size_t i = location; i < view.size(); i++) {
487 if (view[i] == ':' || view[i] == '/' || view[i] == '?' ||
488 view[i] == '[') {
489 return i;
490 }
491 }
492 return size_t(view.size());
493 }
494 // fast path for long strings (expected to be common)
495 size_t i = location;
496 // Lookup tables for bit classification:
497 // ':' (0x3A): low[0xA]=0x01, high[0x3]=0x01 -> match
498 // '/' (0x2F): low[0xF]=0x02, high[0x2]=0x02 -> match
499 // '?' (0x3F): low[0xF]=0x01, high[0x3]=0x01 -> match
500 // '[' (0x5B): low[0xB]=0x04, high[0x5]=0x04 -> match
501 const __m128i low_mask =
502 _mm_setr_epi8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
503 0x01, 0x04, 0x00, 0x00, 0x00, 0x03);
504 const __m128i high_mask =
505 _mm_setr_epi8(0x00, 0x00, 0x02, 0x01, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00,
506 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
507 const __m128i fmask = _mm_set1_epi8(0xf);
508 const __m128i zero = _mm_setzero_si128();
509
510 for (; i + 15 < view.size(); i += 16) {
511 __m128i word = _mm_loadu_si128((const __m128i*)(view.data() + i));
512 __m128i lowpart = _mm_shuffle_epi8(low_mask, _mm_and_si128(word, fmask));
513 __m128i highpart = _mm_shuffle_epi8(
514 high_mask, _mm_and_si128(_mm_srli_epi16(word, 4), fmask));
515 __m128i classify = _mm_and_si128(lowpart, highpart);
516 __m128i is_zero = _mm_cmpeq_epi8(classify, zero);
517 // _mm_movemask_epi8 returns a 16-bit mask in bits 0-15, with bits 16-31
518 // zero. After NOT (~), bits 16-31 become 1. We must mask to 16 bits to
519 // avoid false positives.
520 int mask = ~_mm_movemask_epi8(is_zero) & 0xFFFF;
521 if (mask != 0) {
522 return i + trailing_zeroes(static_cast<uint32_t>(mask));
523 }
524 }
525
526 if (i < view.size()) {
527 __m128i word =
528 _mm_loadu_si128((const __m128i*)(view.data() + view.length() - 16));
529 __m128i lowpart = _mm_shuffle_epi8(low_mask, _mm_and_si128(word, fmask));
530 __m128i highpart = _mm_shuffle_epi8(
531 high_mask, _mm_and_si128(_mm_srli_epi16(word, 4), fmask));
532 __m128i classify = _mm_and_si128(lowpart, highpart);
533 __m128i is_zero = _mm_cmpeq_epi8(classify, zero);
534 // _mm_movemask_epi8 returns a 16-bit mask in bits 0-15, with bits 16-31
535 // zero. After NOT (~), bits 16-31 become 1. We must mask to 16 bits to
536 // avoid false positives.
537 int mask = ~_mm_movemask_epi8(is_zero) & 0xFFFF;
538 if (mask != 0) {
539 return view.length() - 16 + trailing_zeroes(static_cast<uint32_t>(mask));
540 }
541 }
542 return size_t(view.size());
543}
544#elif ADA_NEON
545ada_really_inline size_t find_next_host_delimiter(std::string_view view,
546 size_t location) noexcept {
547 // first check for short strings in which case we do it naively.
548 if (view.size() - location < 16) { // slow path
549 for (size_t i = location; i < view.size(); i++) {
550 if (view[i] == ':' || view[i] == '/' || view[i] == '?' ||
551 view[i] == '[') {
552 return i;
553 }
554 }
555 return size_t(view.size());
556 }
557 auto to_bitmask = [](uint8x16_t input) -> uint16_t {
558 uint8x16_t bit_mask =
559 ada_make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, 0x01,
560 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
561 uint8x16_t minput = vandq_u8(input, bit_mask);
562 uint8x16_t tmp = vpaddq_u8(minput, minput);
563 tmp = vpaddq_u8(tmp, tmp);
564 tmp = vpaddq_u8(tmp, tmp);
565 return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
566 };
567
568 // fast path for long strings (expected to be common)
569 size_t i = location;
570 uint8x16_t low_mask =
571 ada_make_uint8x16_t(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
572 0x00, 0x01, 0x04, 0x00, 0x00, 0x00, 0x03);
573 uint8x16_t high_mask =
574 ada_make_uint8x16_t(0x00, 0x00, 0x02, 0x01, 0x00, 0x04, 0x00, 0x00, 0x00,
575 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
576 uint8x16_t fmask = vmovq_n_u8(0xf);
577 uint8x16_t zero{0};
578 for (; i + 15 < view.size(); i += 16) {
579 uint8x16_t word = vld1q_u8((const uint8_t*)view.data() + i);
580 uint8x16_t lowpart = vqtbl1q_u8(low_mask, vandq_u8(word, fmask));
581 uint8x16_t highpart = vqtbl1q_u8(high_mask, vshrq_n_u8(word, 4));
582 uint8x16_t classify = vandq_u8(lowpart, highpart);
583 if (vmaxvq_u32(vreinterpretq_u32_u8(classify)) != 0) {
584 uint8x16_t is_zero = vceqq_u8(classify, zero);
585 uint16_t is_non_zero = ~to_bitmask(is_zero);
586 return i + trailing_zeroes(is_non_zero);
587 }
588 }
589
590 if (i < view.size()) {
591 uint8x16_t word =
592 vld1q_u8((const uint8_t*)view.data() + view.length() - 16);
593 uint8x16_t lowpart = vqtbl1q_u8(low_mask, vandq_u8(word, fmask));
594 uint8x16_t highpart = vqtbl1q_u8(high_mask, vshrq_n_u8(word, 4));
595 uint8x16_t classify = vandq_u8(lowpart, highpart);
596 if (vmaxvq_u32(vreinterpretq_u32_u8(classify)) != 0) {
597 uint8x16_t is_zero = vceqq_u8(classify, zero);
598 uint16_t is_non_zero = ~to_bitmask(is_zero);
599 return view.length() - 16 + trailing_zeroes(is_non_zero);
600 }
601 }
602 return size_t(view.size());
603}
604#elif ADA_SSE2
605ada_really_inline size_t find_next_host_delimiter(std::string_view view,
606 size_t location) noexcept {
607 // first check for short strings in which case we do it naively.
608 if (view.size() - location < 16) { // slow path
609 for (size_t i = location; i < view.size(); i++) {
610 if (view[i] == ':' || view[i] == '/' || view[i] == '?' ||
611 view[i] == '[') {
612 return i;
613 }
614 }
615 return size_t(view.size());
616 }
617 // fast path for long strings (expected to be common)
618 size_t i = location;
619 const __m128i mask1 = _mm_set1_epi8(':');
620 const __m128i mask2 = _mm_set1_epi8('/');
621 const __m128i mask4 = _mm_set1_epi8('?');
622 const __m128i mask5 = _mm_set1_epi8('[');
623
624 for (; i + 15 < view.size(); i += 16) {
625 __m128i word = _mm_loadu_si128((const __m128i*)(view.data() + i));
626 __m128i m1 = _mm_cmpeq_epi8(word, mask1);
627 __m128i m2 = _mm_cmpeq_epi8(word, mask2);
628 __m128i m4 = _mm_cmpeq_epi8(word, mask4);
629 __m128i m5 = _mm_cmpeq_epi8(word, mask5);
630 __m128i m = _mm_or_si128(_mm_or_si128(m1, m2), _mm_or_si128(m4, m5));
631 int mask = _mm_movemask_epi8(m);
632 if (mask != 0) {
633 return i + trailing_zeroes(mask);
634 }
635 }
636 if (i < view.size()) {
637 __m128i word =
638 _mm_loadu_si128((const __m128i*)(view.data() + view.length() - 16));
639 __m128i m1 = _mm_cmpeq_epi8(word, mask1);
640 __m128i m2 = _mm_cmpeq_epi8(word, mask2);
641 __m128i m4 = _mm_cmpeq_epi8(word, mask4);
642 __m128i m5 = _mm_cmpeq_epi8(word, mask5);
643 __m128i m = _mm_or_si128(_mm_or_si128(m1, m2), _mm_or_si128(m4, m5));
644 int mask = _mm_movemask_epi8(m);
645 if (mask != 0) {
646 return view.length() - 16 + trailing_zeroes(mask);
647 }
648 }
649 return size_t(view.length());
650}
651#elif ADA_LSX
652ada_really_inline size_t find_next_host_delimiter(std::string_view view,
653 size_t location) noexcept {
654 // first check for short strings in which case we do it naively.
655 if (view.size() - location < 16) { // slow path
656 for (size_t i = location; i < view.size(); i++) {
657 if (view[i] == ':' || view[i] == '/' || view[i] == '?' ||
658 view[i] == '[') {
659 return i;
660 }
661 }
662 return size_t(view.size());
663 }
664 // fast path for long strings (expected to be common)
665 size_t i = location;
666 const __m128i mask1 = __lsx_vrepli_b(':');
667 const __m128i mask2 = __lsx_vrepli_b('/');
668 const __m128i mask4 = __lsx_vrepli_b('?');
669 const __m128i mask5 = __lsx_vrepli_b('[');
670
671 for (; i + 15 < view.size(); i += 16) {
672 __m128i word = __lsx_vld((const __m128i*)(view.data() + i), 0);
673 __m128i m1 = __lsx_vseq_b(word, mask1);
674 __m128i m2 = __lsx_vseq_b(word, mask2);
675 __m128i m4 = __lsx_vseq_b(word, mask4);
676 __m128i m5 = __lsx_vseq_b(word, mask5);
677 __m128i m = __lsx_vor_v(__lsx_vor_v(m1, m2), __lsx_vor_v(m4, m5));
678 int mask = __lsx_vpickve2gr_hu(__lsx_vmsknz_b(m), 0);
679 if (mask != 0) {
680 return i + trailing_zeroes(mask);
681 }
682 }
683 if (i < view.size()) {
684 __m128i word =
685 __lsx_vld((const __m128i*)(view.data() + view.length() - 16), 0);
686 __m128i m1 = __lsx_vseq_b(word, mask1);
687 __m128i m2 = __lsx_vseq_b(word, mask2);
688 __m128i m4 = __lsx_vseq_b(word, mask4);
689 __m128i m5 = __lsx_vseq_b(word, mask5);
690 __m128i m = __lsx_vor_v(__lsx_vor_v(m1, m2), __lsx_vor_v(m4, m5));
691 int mask = __lsx_vpickve2gr_hu(__lsx_vmsknz_b(m), 0);
692 if (mask != 0) {
693 return view.length() - 16 + trailing_zeroes(mask);
694 }
695 }
696 return size_t(view.length());
697}
698#elif ADA_RVV
699ada_really_inline size_t find_next_host_delimiter(std::string_view view,
700 size_t location) noexcept {
701 uint8_t* src = (uint8_t*)view.data() + location;
702 for (size_t vl, n = view.size() - location; n > 0;
703 n -= vl, src += vl, location += vl) {
704 vl = __riscv_vsetvl_e8m1(n);
705 vuint8m1_t v = __riscv_vle8_v_u8m1(src, vl);
706 vbool8_t m1 = __riscv_vmseq(v, ':', vl);
707 vbool8_t m2 = __riscv_vmseq(v, '/', vl);
708 vbool8_t m3 = __riscv_vmseq(v, '?', vl);
709 vbool8_t m4 = __riscv_vmseq(v, '[', vl);
710 vbool8_t m =
711 __riscv_vmor(__riscv_vmor(m1, m2, vl), __riscv_vmor(m3, m4, vl), vl);
712 long idx = __riscv_vfirst(m, vl);
713 if (idx >= 0) return location + idx;
714 }
715 return size_t(view.size());
716}
717#else
718// : / [ ?
719static constexpr std::array<uint8_t, 256> host_delimiters = []() consteval {
720 std::array<uint8_t, 256> result{};
721 for (int i : {':', '/', '?', '['}) {
722 result[i] = 1;
723 }
724 return result;
725}();
726// credit: @the-moisrex recommended a table-based approach
727ada_really_inline size_t find_next_host_delimiter(std::string_view view,
728 size_t location) noexcept {
729 auto const str = view.substr(location);
730 for (auto pos = str.begin(); pos != str.end(); ++pos) {
731 if (host_delimiters[(uint8_t)*pos]) {
732 return pos - str.begin() + location;
733 }
734 }
735 return size_t(view.size());
736}
737#endif
738
739ada_really_inline std::pair<size_t, bool> get_host_delimiter_location(
740 const bool is_special, std::string_view& view) noexcept {
749 const size_t view_size = view.size();
750 size_t location = 0;
751 bool found_colon = false;
771 if (is_special) {
772 // We move to the next delimiter.
773 location = find_next_host_delimiter_special(view, location);
774 // Unless we find '[' then we are going only going to have to call
775 // find_next_host_delimiter_special once.
776 for (; location < view_size;
777 location = find_next_host_delimiter_special(view, location)) {
778 if (view[location] == '[') {
779 location = view.find(']', location);
780 if (location == std::string_view::npos) {
781 // performance: view.find might get translated to a memchr, which
782 // has no notion of std::string_view::npos, so the code does not
783 // reflect the assembly.
784 location = view_size;
785 break;
786 }
787 } else {
788 found_colon = view[location] == ':';
789 break;
790 }
791 }
792 } else {
793 // We move to the next delimiter.
794 location = find_next_host_delimiter(view, location);
795 // Unless we find '[' then we are going only going to have to call
796 // find_next_host_delimiter_special once.
797 for (; location < view_size;
798 location = find_next_host_delimiter(view, location)) {
799 if (view[location] == '[') {
800 location = view.find(']', location);
801 if (location == std::string_view::npos) {
802 // performance: view.find might get translated to a memchr, which
803 // has no notion of std::string_view::npos, so the code does not
804 // reflect the assembly.
805 location = view_size;
806 break;
807 }
808 } else {
809 found_colon = view[location] == ':';
810 break;
811 }
812 }
813 }
814 // performance: remove_suffix may translate into a single instruction.
815 view.remove_suffix(view_size - location);
816 return {location, found_colon};
817}
818
819void trim_c0_whitespace(std::string_view& input) noexcept {
820 while (!input.empty() &&
821 ada::unicode::is_c0_control_or_space(input.front())) {
822 input.remove_prefix(1);
823 }
824 while (!input.empty() && ada::unicode::is_c0_control_or_space(input.back())) {
825 input.remove_suffix(1);
826 }
827}
828
829ada_really_inline void parse_prepared_path(std::string_view input,
831 std::string& path) {
832 ada_log("parse_prepared_path ", input);
833 uint8_t accumulator = checkers::path_signature(input);
834 // Let us first detect a trivial case.
835 // If it is special, we check that we have no dot, no %, no \ and no
836 // character needing percent encoding. Otherwise, we check that we have no %,
837 // no dot, and no character needing percent encoding.
838 constexpr uint8_t need_encoding = 1;
839 constexpr uint8_t backslash_char = 2;
840 constexpr uint8_t dot_char = 4;
841 constexpr uint8_t percent_char = 8;
842 bool special = type != ada::scheme::NOT_SPECIAL;
843 bool may_need_slow_file_handling = (type == ada::scheme::type::FILE &&
845 bool trivial_path =
846 (special ? (accumulator == 0)
847 : ((accumulator & (need_encoding | dot_char | percent_char)) ==
848 0)) &&
849 (!may_need_slow_file_handling);
850 if (accumulator == dot_char && !may_need_slow_file_handling) {
851 // '4' means that we have at least one dot, but nothing that requires
852 // percent encoding or decoding. The only part that is not trivial is
853 // that we may have single dots and double dots path segments.
854 // If we have such segments, then we either have a path that begins
855 // with '.' (easy to check), or we have the sequence './'.
856 // Note: input cannot be empty, it must at least contain one character ('.')
857 // Note: we know that '\' is not present.
858 if (input[0] != '.') {
859 size_t slashdot = 0;
860 bool dot_is_file = true;
861 for (;;) {
862 slashdot = input.find("/.", slashdot);
863 if (slashdot == std::string_view::npos) { // common case
864 break;
865 } else { // uncommon
866 // only three cases matter: /./, /.. or a final /
867 slashdot += 2;
868 dot_is_file &= !(slashdot == input.size() || input[slashdot] == '.' ||
869 input[slashdot] == '/');
870 }
871 }
872 trivial_path = dot_is_file;
873 }
874 }
875 if (trivial_path) {
876 ada_log("parse_path trivial");
877 path += '/';
878 path += input;
879 return;
880 }
881 // We are going to need to look a bit at the path, but let us see if we can
882 // ignore percent encoding *and* backslashes *and* percent characters.
883 // Except for the trivial case, this is likely to capture 99% of paths out
884 // there.
885 bool fast_path =
886 (special &&
887 (accumulator & (need_encoding | backslash_char | percent_char)) == 0) &&
888 (type != ada::scheme::type::FILE);
889 if (fast_path) {
890 ada_log("parse_prepared_path fast");
891 // Here we don't need to worry about \ or percent encoding.
892 // We also do not have a file protocol. We might have dots, however,
893 // but dots must as appear as '.', and they cannot be encoded because
894 // the symbol '%' is not present.
895 size_t previous_location = 0; // We start at 0.
896 do {
897 size_t new_location = input.find('/', previous_location);
898 // std::string_view path_view = input;
899 // We process the last segment separately:
900 if (new_location == std::string_view::npos) {
901 std::string_view path_view = input.substr(previous_location);
902 if (path_view == "..") { // The path ends with ..
903 // e.g., if you receive ".." with an empty path, you go to "/".
904 if (path.empty()) {
905 path = '/';
906 return;
907 }
908 // Fast case where we have nothing to do:
909 if (path.back() == '/') {
910 return;
911 }
912 // If you have the path "/joe/myfriend",
913 // then you delete 'myfriend'.
914 path.resize(path.rfind('/') + 1);
915 return;
916 }
917 path += '/';
918 if (path_view != ".") {
919 path.append(path_view);
920 }
921 return;
922 } else {
923 // This is a non-final segment.
924 std::string_view path_view =
925 input.substr(previous_location, new_location - previous_location);
926 previous_location = new_location + 1;
927 if (path_view == "..") {
928 size_t last_delimiter = path.rfind('/');
929 if (last_delimiter != std::string::npos) {
930 path.erase(last_delimiter);
931 }
932 } else if (path_view != ".") {
933 path += '/';
934 path.append(path_view);
935 }
936 }
937 } while (true);
938 } else {
939 ada_log("parse_path slow");
940 // we have reached the general case
941 bool needs_percent_encoding = (accumulator & 1);
942 std::string path_buffer_tmp;
943 do {
944 size_t location = (special && (accumulator & 2))
945 ? input.find_first_of("/\\")
946 : input.find('/');
947 std::string_view path_view = input;
948 if (location != std::string_view::npos) {
949 path_view.remove_suffix(path_view.size() - location);
950 input.remove_prefix(location + 1);
951 }
952 // path_buffer is either path_view or it might point at a percent encoded
953 // temporary file.
954 std::string_view path_buffer =
955 (needs_percent_encoding &&
956 ada::unicode::percent_encode<false>(
957 path_view, character_sets::PATH_PERCENT_ENCODE, path_buffer_tmp))
958 ? path_buffer_tmp
959 : path_view;
960 if (unicode::is_double_dot_path_segment(path_buffer)) {
961 helpers::shorten_path(path, type);
962 if (location == std::string_view::npos) {
963 path += '/';
964 }
965 } else if (unicode::is_single_dot_path_segment(path_buffer) &&
966 (location == std::string_view::npos)) {
967 path += '/';
968 }
969 // Otherwise, if path_buffer is not a single-dot path segment, then:
970 else if (!unicode::is_single_dot_path_segment(path_buffer)) {
971 // If url's scheme is "file", url's path is empty, and path_buffer is a
972 // Windows drive letter, then replace the second code point in
973 // path_buffer with U+003A (:).
974 if (type == ada::scheme::type::FILE && path.empty() &&
976 path += '/';
977 path += path_buffer[0];
978 path += ':';
979 path_buffer.remove_prefix(2);
980 path.append(path_buffer);
981 } else {
982 // Append path_buffer to url's path.
983 path += '/';
984 path.append(path_buffer);
985 }
986 }
987 if (location == std::string_view::npos) {
988 return;
989 }
990 } while (true);
991 }
992}
993
994bool overlaps(std::string_view input1, const std::string& input2) noexcept {
995 ada_log("helpers::overlaps check if string_view '", input1, "' [",
996 input1.size(), " bytes] is part of string '", input2, "' [",
997 input2.size(), " bytes]");
998 return !input1.empty() && !input2.empty() && input1.data() >= input2.data() &&
999 input1.data() < input2.data() + input2.size();
1000}
1001
1002template <class url_type>
1003ada_really_inline void strip_trailing_spaces_from_opaque_path(
1004 url_type& url) noexcept {
1005 ada_log("helpers::strip_trailing_spaces_from_opaque_path");
1006 if (!url.has_opaque_path) return;
1007 if (url.has_hash()) return;
1008 if (url.has_search()) return;
1009
1010 auto path = std::string(url.get_pathname());
1011 while (!path.empty() && path.back() == ' ') {
1012 path.resize(path.size() - 1);
1013 }
1014 url.update_base_pathname(path);
1015}
1016
1017// @ / \\ ?
1018static constexpr std::array<uint8_t, 256> authority_delimiter_special =
1019 []() consteval {
1020 std::array<uint8_t, 256> result{};
1021 for (uint8_t i : {'@', '/', '\\', '?'}) {
1022 result[i] = 1;
1023 }
1024 return result;
1025 }();
1026// credit: @the-moisrex recommended a table-based approach
1027ada_really_inline size_t
1028find_authority_delimiter_special(std::string_view view) noexcept {
1029 // performance note: we might be able to gain further performance
1030 // with SIMD intrinsics.
1031 for (auto pos = view.begin(); pos != view.end(); ++pos) {
1032 if (authority_delimiter_special[(uint8_t)*pos]) {
1033 return pos - view.begin();
1034 }
1035 }
1036 return size_t(view.size());
1037}
1038
1039// @ / ?
1040static constexpr std::array<uint8_t, 256> authority_delimiter = []() consteval {
1041 std::array<uint8_t, 256> result{};
1042 for (uint8_t i : {'@', '/', '?'}) {
1043 result[i] = 1;
1044 }
1045 return result;
1046}();
1047// credit: @the-moisrex recommended a table-based approach
1048ada_really_inline size_t
1049find_authority_delimiter(std::string_view view) noexcept {
1050 // performance note: we might be able to gain further performance
1051 // with SIMD instrinsics.
1052 for (auto pos = view.begin(); pos != view.end(); ++pos) {
1053 if (authority_delimiter[(uint8_t)*pos]) {
1054 return pos - view.begin();
1055 }
1056 }
1057 return size_t(view.size());
1058}
1059
1060} // namespace ada::helpers
1061
1062namespace ada {
1066#undef ada_make_uint8x16_t
1067} // namespace ada
Definitions for URL specific checkers used within Ada.
Cross-platform compiler macros and common definitions.
#define ADA_ASSERT_TRUE(COND)
#define ada_unused
Definition common_defs.h:88
#define ada_warn_unused
Definition common_defs.h:89
#define ada_really_inline
Definition common_defs.h:85
constexpr uint8_t PATH_PERCENT_ENCODE[32]
constexpr bool is_normalized_windows_drive_letter(std::string_view input) noexcept
constexpr bool is_windows_drive_letter(std::string_view input) noexcept
Includes the definitions for helper functions.
ada_really_inline size_t find_next_host_delimiter(std::string_view view, size_t location) noexcept
Definition helpers.cpp:727
static constexpr std::array< uint8_t, 256 > authority_delimiter_special
Definition helpers.cpp:1018
static constexpr std::array< uint8_t, 256 > host_delimiters
Definition helpers.cpp:719
ada_really_inline size_t find_next_host_delimiter_special(std::string_view view, size_t location) noexcept
Definition helpers.cpp:466
ada_unused std::string get_state(ada::state s)
Definition helpers.cpp:39
static constexpr std::array< uint8_t, 256 > authority_delimiter
Definition helpers.cpp:1040
static constexpr std::array< uint8_t, 256 > special_host_delimiters
Definition helpers.cpp:457
ada_really_inline int trailing_zeroes(uint32_t input_num) noexcept
Definition helpers.cpp:170
type
Enumeration of URL scheme types.
Definition scheme.h:41
@ NOT_SPECIAL
Definition scheme.h:43
Definition ada_idna.h:13
state
States in the URL parsing state machine.
Definition state.h:27
@ SPECIAL_RELATIVE_OR_AUTHORITY
Definition state.h:101
@ FILE_SLASH
Definition state.h:81
@ SCHEME
Definition state.h:41
@ SPECIAL_AUTHORITY_SLASHES
Definition state.h:96
@ FRAGMENT
Definition state.h:56
@ FILE_HOST
Definition state.h:76
@ OPAQUE_PATH
Definition state.h:121
@ RELATIVE_SLASH
Definition state.h:66
@ NO_SCHEME
Definition state.h:51
@ PATH_START
Definition state.h:116
@ RELATIVE_SCHEME
Definition state.h:61
@ SPECIAL_AUTHORITY_IGNORE_SLASHES
Definition state.h:91
@ SCHEME_START
Definition state.h:36
@ AUTHORITY
Definition state.h:31
@ PATH_OR_AUTHORITY
Definition state.h:86
ada_warn_unused std::string_view to_string(encoding_type type)
tl::expected< result_type, ada::errors > result
URL scheme type definitions and utilities.