Ada 3.1.0
Fast spec-compliant URL parser
Loading...
Searching...
No Matches
unicode.cpp
Go to the documentation of this file.
1#include "ada/common_defs.h"
4#include "ada/unicode.h"
5#include "ada/log.h"
6
8#include "ada_idna.cpp"
10
11#include <algorithm>
12#if ADA_NEON
13#include <arm_neon.h>
14#elif ADA_SSE2
15#include <emmintrin.h>
16#endif
17
18namespace ada::unicode {
19
20constexpr bool is_tabs_or_newline(char c) noexcept {
21 return c == '\r' || c == '\n' || c == '\t';
22}
23
24constexpr uint64_t broadcast(uint8_t v) noexcept {
25 return 0x101010101010101ull * v;
26}
27
28constexpr bool to_lower_ascii(char* input, size_t length) noexcept {
29 uint64_t broadcast_80 = broadcast(0x80);
30 uint64_t broadcast_Ap = broadcast(128 - 'A');
31 uint64_t broadcast_Zp = broadcast(128 - 'Z' - 1);
32 uint64_t non_ascii = 0;
33 size_t i = 0;
34
35 for (; i + 7 < length; i += 8) {
36 uint64_t word{};
37 memcpy(&word, input + i, sizeof(word));
38 non_ascii |= (word & broadcast_80);
39 word ^=
40 (((word + broadcast_Ap) ^ (word + broadcast_Zp)) & broadcast_80) >> 2;
41 memcpy(input + i, &word, sizeof(word));
42 }
43 if (i < length) {
44 uint64_t word{};
45 memcpy(&word, input + i, length - i);
46 non_ascii |= (word & broadcast_80);
47 word ^=
48 (((word + broadcast_Ap) ^ (word + broadcast_Zp)) & broadcast_80) >> 2;
49 memcpy(input + i, &word, length - i);
50 }
51 return non_ascii == 0;
52}
53#if ADA_NEON
54ada_really_inline bool has_tabs_or_newline(
55 std::string_view user_input) noexcept {
56 // first check for short strings in which case we do it naively.
57 if (user_input.size() < 16) { // slow path
58 return std::any_of(user_input.begin(), user_input.end(),
60 }
61 // fast path for long strings (expected to be common)
62 size_t i = 0;
75 static uint8_t rnt_array[16] = {1, 0, 0, 0, 0, 0, 0, 0,
76 0, 9, 10, 0, 0, 13, 0, 0};
77 const uint8x16_t rnt = vld1q_u8(rnt_array);
78 // m['0xd', '0xa', '0x9']
79 uint8x16_t running{0};
80 for (; i + 15 < user_input.size(); i += 16) {
81 uint8x16_t word = vld1q_u8((const uint8_t*)user_input.data() + i);
82
83 running = vorrq_u8(running, vceqq_u8(vqtbl1q_u8(rnt, word), word));
84 }
85 if (i < user_input.size()) {
86 uint8x16_t word =
87 vld1q_u8((const uint8_t*)user_input.data() + user_input.length() - 16);
88 running = vorrq_u8(running, vceqq_u8(vqtbl1q_u8(rnt, word), word));
89 }
90 return vmaxvq_u32(vreinterpretq_u32_u8(running)) != 0;
91}
92#elif ADA_SSE2
93ada_really_inline bool has_tabs_or_newline(
94 std::string_view user_input) noexcept {
95 // first check for short strings in which case we do it naively.
96 if (user_input.size() < 16) { // slow path
97 return std::any_of(user_input.begin(), user_input.end(),
99 }
100 // fast path for long strings (expected to be common)
101 size_t i = 0;
102 const __m128i mask1 = _mm_set1_epi8('\r');
103 const __m128i mask2 = _mm_set1_epi8('\n');
104 const __m128i mask3 = _mm_set1_epi8('\t');
105 // If we supported SSSE3, we could use the algorithm that we use for NEON.
106 __m128i running{0};
107 for (; i + 15 < user_input.size(); i += 16) {
108 __m128i word = _mm_loadu_si128((const __m128i*)(user_input.data() + i));
109 running = _mm_or_si128(
110 _mm_or_si128(running, _mm_or_si128(_mm_cmpeq_epi8(word, mask1),
111 _mm_cmpeq_epi8(word, mask2))),
112 _mm_cmpeq_epi8(word, mask3));
113 }
114 if (i < user_input.size()) {
115 __m128i word = _mm_loadu_si128(
116 (const __m128i*)(user_input.data() + user_input.length() - 16));
117 running = _mm_or_si128(
118 _mm_or_si128(running, _mm_or_si128(_mm_cmpeq_epi8(word, mask1),
119 _mm_cmpeq_epi8(word, mask2))),
120 _mm_cmpeq_epi8(word, mask3));
121 }
122 return _mm_movemask_epi8(running) != 0;
123}
124#else
125ada_really_inline bool has_tabs_or_newline(
126 std::string_view user_input) noexcept {
127 auto has_zero_byte = [](uint64_t v) {
128 return ((v - 0x0101010101010101) & ~(v) & 0x8080808080808080);
129 };
130 size_t i = 0;
131 uint64_t mask1 = broadcast('\r');
132 uint64_t mask2 = broadcast('\n');
133 uint64_t mask3 = broadcast('\t');
134 uint64_t running{0};
135 for (; i + 7 < user_input.size(); i += 8) {
136 uint64_t word{};
137 memcpy(&word, user_input.data() + i, sizeof(word));
138 uint64_t xor1 = word ^ mask1;
139 uint64_t xor2 = word ^ mask2;
140 uint64_t xor3 = word ^ mask3;
141 running |= has_zero_byte(xor1) | has_zero_byte(xor2) | has_zero_byte(xor3);
142 }
143 if (i < user_input.size()) {
144 uint64_t word{};
145 memcpy(&word, user_input.data() + i, user_input.size() - i);
146 uint64_t xor1 = word ^ mask1;
147 uint64_t xor2 = word ^ mask2;
148 uint64_t xor3 = word ^ mask3;
149 running |= has_zero_byte(xor1) | has_zero_byte(xor2) | has_zero_byte(xor3);
150 }
151 return running;
152}
153#endif
154
155// A forbidden host code point is U+0000 NULL, U+0009 TAB, U+000A LF, U+000D CR,
156// U+0020 SPACE, U+0023 (#), U+002F (/), U+003A (:), U+003C (<), U+003E (>),
157// U+003F (?), U+0040 (@), U+005B ([), U+005C (\‍), U+005D (]), U+005E (^), or
158// U+007C (|).
159constexpr static std::array<uint8_t, 256> is_forbidden_host_code_point_table =
160 []() consteval {
161 std::array<uint8_t, 256> result{};
162 for (uint8_t c : {'\0', '\x09', '\x0a', '\x0d', ' ', '#', '/', ':', '<',
163 '>', '?', '@', '[', '\\', ']', '^', '|'}) {
164 result[c] = true;
165 }
166 return result;
167 }();
168
169ada_really_inline constexpr bool is_forbidden_host_code_point(
170 const char c) noexcept {
171 return is_forbidden_host_code_point_table[uint8_t(c)];
172}
173
174constexpr static std::array<uint8_t, 256> is_forbidden_domain_code_point_table =
175 []() consteval {
176 std::array<uint8_t, 256> result{};
177 for (uint8_t c : {'\0', '\x09', '\x0a', '\x0d', ' ', '#', '/', ':', '<',
178 '>', '?', '@', '[', '\\', ']', '^', '|', '%'}) {
179 result[c] = true;
180 }
181 for (uint8_t c = 0; c <= 32; c++) {
182 result[c] = true;
183 }
184 for (size_t c = 127; c < 255; c++) {
185 result[c] = true;
186 }
187 return result;
188 }();
189
190static_assert(sizeof(is_forbidden_domain_code_point_table) == 256);
191
192ada_really_inline constexpr bool is_forbidden_domain_code_point(
193 const char c) noexcept {
194 return is_forbidden_domain_code_point_table[uint8_t(c)];
195}
196
197ada_really_inline constexpr bool contains_forbidden_domain_code_point(
198 const char* input, size_t length) noexcept {
199 size_t i = 0;
200 uint8_t accumulator{};
201 for (; i + 4 <= length; i += 4) {
202 accumulator |= is_forbidden_domain_code_point_table[uint8_t(input[i])];
203 accumulator |= is_forbidden_domain_code_point_table[uint8_t(input[i + 1])];
204 accumulator |= is_forbidden_domain_code_point_table[uint8_t(input[i + 2])];
205 accumulator |= is_forbidden_domain_code_point_table[uint8_t(input[i + 3])];
206 }
207 for (; i < length; i++) {
208 accumulator |= is_forbidden_domain_code_point_table[uint8_t(input[i])];
209 }
210 return accumulator;
211}
212
213constexpr static std::array<uint8_t, 256>
215 std::array<uint8_t, 256> result{};
216 for (uint8_t c : {'\0', '\x09', '\x0a', '\x0d', ' ', '#', '/', ':', '<',
217 '>', '?', '@', '[', '\\', ']', '^', '|', '%'}) {
218 result[c] = 1;
219 }
220 for (uint8_t c = 'A'; c <= 'Z'; c++) {
221 result[c] = 2;
222 }
223 for (uint8_t c = 0; c <= 32; c++) {
224 result[c] = 1;
225 }
226 for (size_t c = 127; c < 255; c++) {
227 result[c] = 1;
228 }
229 return result;
230 }();
231
232ada_really_inline constexpr uint8_t
233contains_forbidden_domain_code_point_or_upper(const char* input,
234 size_t length) noexcept {
235 size_t i = 0;
236 uint8_t accumulator{};
237 for (; i + 4 <= length; i += 4) {
238 accumulator |=
240 accumulator |=
242 accumulator |=
244 accumulator |=
246 }
247 for (; i < length; i++) {
248 accumulator |=
250 }
251 return accumulator;
252}
253
254// std::isalnum(c) || c == '+' || c == '-' || c == '.') is true for
255constexpr static std::array<bool, 256> is_alnum_plus_table = []() consteval {
256 std::array<bool, 256> result{};
257 for (size_t c = 0; c < 256; c++) {
258 result[c] = (c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') ||
259 (c >= 'A' && c <= 'Z') || c == '+' || c == '-' || c == '.';
260 }
261 return result;
262}();
263
264ada_really_inline constexpr bool is_alnum_plus(const char c) noexcept {
265 return is_alnum_plus_table[uint8_t(c)];
266 // A table is almost surely much faster than the
267 // following under most compilers: return
268 // return (std::isalnum(c) || c == '+' || c == '-' || c == '.');
269}
270
271ada_really_inline constexpr bool is_ascii_hex_digit(const char c) noexcept {
272 return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') ||
273 (c >= 'a' && c <= 'f');
274}
275
276ada_really_inline constexpr bool is_ascii_digit(const char c) noexcept {
277 // An ASCII digit is a code point in the range U+0030 (0) to U+0039 (9),
278 // inclusive.
279 return (c >= '0' && c <= '9');
280}
281
282ada_really_inline constexpr bool is_ascii(const char32_t c) noexcept {
283 // If code point is between U+0000 and U+007F inclusive, then return true.
284 return c <= 0x7F;
285}
286
287ada_really_inline constexpr bool is_c0_control_or_space(const char c) noexcept {
288 return (unsigned char)c <= ' ';
289}
290
291ada_really_inline constexpr bool is_ascii_tab_or_newline(
292 const char c) noexcept {
293 return c == '\t' || c == '\n' || c == '\r';
294}
295
296constexpr std::string_view table_is_double_dot_path_segment[] = {
297 "..", "%2e.", ".%2e", "%2e%2e"};
298
299ada_really_inline constexpr bool is_double_dot_path_segment(
300 std::string_view input) noexcept {
301 // This will catch most cases:
302 // The length must be 2,4 or 6.
303 // We divide by two and require
304 // that the result be between 1 and 3 inclusively.
305 uint64_t half_length = uint64_t(input.size()) / 2;
306 if (half_length - 1 > 2) {
307 return false;
308 }
309 // We have a string of length 2, 4 or 6.
310 // We now check the first character:
311 if ((input[0] != '.') && (input[0] != '%')) {
312 return false;
313 }
314 // We are unlikely the get beyond this point.
315 int hash_value = (input.size() + (unsigned)(input[0])) & 3;
316 const std::string_view target = table_is_double_dot_path_segment[hash_value];
317 if (target.size() != input.size()) {
318 return false;
319 }
320 // We almost never get here.
321 // Optimizing the rest is relatively unimportant.
322 auto prefix_equal_unsafe = [](std::string_view a, std::string_view b) {
323 uint16_t A, B;
324 memcpy(&A, a.data(), sizeof(A));
325 memcpy(&B, b.data(), sizeof(B));
326 return A == B;
327 };
328 if (!prefix_equal_unsafe(input, target)) {
329 return false;
330 }
331 for (size_t i = 2; i < input.size(); i++) {
332 char c = input[i];
333 if ((uint8_t((c | 0x20) - 0x61) <= 25 ? (c | 0x20) : c) != target[i]) {
334 return false;
335 }
336 }
337 return true;
338 // The above code might be a bit better than the code below. Compilers
339 // are not stupid and may use the fact that these strings have length 2,4 and
340 // 6 and other tricks.
341 // return input == ".." ||
342 // input == ".%2e" || input == ".%2E" ||
343 // input == "%2e." || input == "%2E." ||
344 // input == "%2e%2e" || input == "%2E%2E" || input == "%2E%2e" || input ==
345 // "%2e%2E";
346}
347
348ada_really_inline constexpr bool is_single_dot_path_segment(
349 std::string_view input) noexcept {
350 return input == "." || input == "%2e" || input == "%2E";
351}
352
353ada_really_inline constexpr bool is_lowercase_hex(const char c) noexcept {
354 return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f');
355}
356
357constexpr static char hex_to_binary_table[] = {
358 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 0, 0, 0, 0, 0, 0, 10, 11,
359 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
360 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 11, 12, 13, 14, 15};
361unsigned constexpr convert_hex_to_binary(const char c) noexcept {
362 return hex_to_binary_table[c - '0'];
363}
364
365std::string percent_decode(const std::string_view input, size_t first_percent) {
366 // next line is for safety only, we expect users to avoid calling
367 // percent_decode when first_percent is outside the range.
368 if (first_percent == std::string_view::npos) {
369 return std::string(input);
370 }
371 std::string dest;
372 dest.reserve(input.length());
373 dest.append(input.substr(0, first_percent));
374 const char* pointer = input.data() + first_percent;
375 const char* end = input.data() + input.size();
376 // Optimization opportunity: if the following code gets
377 // called often, it can be optimized quite a bit.
378 while (pointer < end) {
379 const char ch = pointer[0];
380 size_t remaining = end - pointer - 1;
381 if (ch != '%' || remaining < 2 ||
382 ( // ch == '%' && // It is unnecessary to check that ch == '%'.
383 (!is_ascii_hex_digit(pointer[1]) ||
384 !is_ascii_hex_digit(pointer[2])))) {
385 dest += ch;
386 pointer++;
387 } else {
388 unsigned a = convert_hex_to_binary(pointer[1]);
389 unsigned b = convert_hex_to_binary(pointer[2]);
390 char c = static_cast<char>(a * 16 + b);
391 dest += c;
392 pointer += 3;
393 }
394 }
395 return dest;
396}
397
398std::string percent_encode(const std::string_view input,
399 const uint8_t character_set[]) {
400 auto pointer = std::ranges::find_if(input, [character_set](const char c) {
401 return character_sets::bit_at(character_set, c);
402 });
403 // Optimization: Don't iterate if percent encode is not required
404 if (pointer == input.end()) {
405 return std::string(input);
406 }
407
408 std::string result;
409 result.reserve(input.length()); // in the worst case, percent encoding might
410 // produce 3 characters.
411 result.append(input.substr(0, std::distance(input.begin(), pointer)));
412
413 for (; pointer != input.end(); pointer++) {
414 if (character_sets::bit_at(character_set, *pointer)) {
415 result.append(character_sets::hex + uint8_t(*pointer) * 4, 3);
416 } else {
417 result += *pointer;
418 }
419 }
420
421 return result;
422}
423
424template <bool append>
425bool percent_encode(const std::string_view input, const uint8_t character_set[],
426 std::string& out) {
427 ada_log("percent_encode ", input, " to output string while ",
428 append ? "appending" : "overwriting");
429 auto pointer =
430 std::find_if(input.begin(), input.end(), [character_set](const char c) {
431 return character_sets::bit_at(character_set, c);
432 });
433 ada_log("percent_encode done checking, moved to ",
434 std::distance(input.begin(), pointer));
435
436 // Optimization: Don't iterate if percent encode is not required
437 if (pointer == input.end()) {
438 ada_log("percent_encode encoding not needed.");
439 return false;
440 }
441 if constexpr (!append) {
442 out.clear();
443 }
444 ada_log("percent_encode appending ", std::distance(input.begin(), pointer),
445 " bytes");
446 out.append(input.data(), std::distance(input.begin(), pointer));
447 ada_log("percent_encode processing ", std::distance(pointer, input.end()),
448 " bytes");
449 for (; pointer != input.end(); pointer++) {
450 if (character_sets::bit_at(character_set, *pointer)) {
451 out.append(character_sets::hex + uint8_t(*pointer) * 4, 3);
452 } else {
453 out += *pointer;
454 }
455 }
456 return true;
457}
458
459bool to_ascii(std::optional<std::string>& out, const std::string_view plain,
460 size_t first_percent) {
461 std::string percent_decoded_buffer;
462 std::string_view input = plain;
463 if (first_percent != std::string_view::npos) {
464 percent_decoded_buffer = unicode::percent_decode(plain, first_percent);
465 input = percent_decoded_buffer;
466 }
467 // input is a non-empty UTF-8 string, must be percent decoded
468 std::string idna_ascii = ada::idna::to_ascii(input);
469 if (idna_ascii.empty() || contains_forbidden_domain_code_point(
470 idna_ascii.data(), idna_ascii.size())) {
471 return false;
472 }
473 out = std::move(idna_ascii);
474 return true;
475}
476
477std::string percent_encode(const std::string_view input,
478 const uint8_t character_set[], size_t index) {
479 std::string out;
480 out.append(input.data(), index);
481 auto pointer = input.begin() + index;
482 for (; pointer != input.end(); pointer++) {
483 if (character_sets::bit_at(character_set, *pointer)) {
484 out.append(character_sets::hex + uint8_t(*pointer) * 4, 3);
485 } else {
486 out += *pointer;
487 }
488 }
489 return out;
490}
491
492} // namespace ada::unicode
Definitions of the character sets used by unicode functions.
Declaration of the character sets used by unicode functions.
Common definitions for cross-platform compiler support.
#define ADA_PUSH_DISABLE_ALL_WARNINGS
Definition common_defs.h:90
#define ADA_POP_DISABLE_WARNINGS
#define ada_really_inline
Definition common_defs.h:81
ada_really_inline constexpr bool bit_at(const uint8_t a[], const uint8_t i)
constexpr char hex[1024]
std::string to_ascii(std::string_view ut8_string)
Includes the declarations for unicode operations.
static constexpr std::array< uint8_t, 256 > is_forbidden_domain_code_point_table
Definition unicode.cpp:174
static constexpr std::array< uint8_t, 256 > is_forbidden_domain_code_point_table_or_upper
Definition unicode.cpp:214
static constexpr char hex_to_binary_table[]
Definition unicode.cpp:357
constexpr uint64_t broadcast(uint8_t v) noexcept
Definition unicode.cpp:24
constexpr std::string_view table_is_double_dot_path_segment[]
Definition unicode.cpp:296
constexpr bool is_tabs_or_newline(char c) noexcept
Definition unicode.cpp:20
static constexpr std::array< uint8_t, 256 > is_forbidden_host_code_point_table
Definition unicode.cpp:159
static constexpr std::array< bool, 256 > is_alnum_plus_table
Definition unicode.cpp:255
tl::expected< result_type, ada::errors > result
Definitions for all unicode specific functions.