Ada 2.9.2
Fast spec-compliant URL parser
Loading...
Searching...
No Matches
unicode.cpp
Go to the documentation of this file.
1#include "ada.h"
3#include "ada/common_defs.h"
4#include "ada/unicode.h"
5
7#include "ada_idna.cpp"
9
10#include <algorithm>
11#if ADA_NEON
12#include <arm_neon.h>
13#elif ADA_SSE2
14#include <emmintrin.h>
15#endif
16
17namespace ada::unicode {
18
19constexpr bool is_tabs_or_newline(char c) noexcept {
20 return c == '\r' || c == '\n' || c == '\t';
21}
22
23constexpr uint64_t broadcast(uint8_t v) noexcept {
24 return 0x101010101010101ull * v;
25}
26
27constexpr bool to_lower_ascii(char* input, size_t length) noexcept {
28 uint64_t broadcast_80 = broadcast(0x80);
29 uint64_t broadcast_Ap = broadcast(128 - 'A');
30 uint64_t broadcast_Zp = broadcast(128 - 'Z' - 1);
31 uint64_t non_ascii = 0;
32 size_t i = 0;
33
34 for (; i + 7 < length; i += 8) {
35 uint64_t word{};
36 memcpy(&word, input + i, sizeof(word));
37 non_ascii |= (word & broadcast_80);
38 word ^=
39 (((word + broadcast_Ap) ^ (word + broadcast_Zp)) & broadcast_80) >> 2;
40 memcpy(input + i, &word, sizeof(word));
41 }
42 if (i < length) {
43 uint64_t word{};
44 memcpy(&word, input + i, length - i);
45 non_ascii |= (word & broadcast_80);
46 word ^=
47 (((word + broadcast_Ap) ^ (word + broadcast_Zp)) & broadcast_80) >> 2;
48 memcpy(input + i, &word, length - i);
49 }
50 return non_ascii == 0;
51}
52#if ADA_NEON
53ada_really_inline bool has_tabs_or_newline(
54 std::string_view user_input) noexcept {
55 // first check for short strings in which case we do it naively.
56 if (user_input.size() < 16) { // slow path
57 return std::any_of(user_input.begin(), user_input.end(),
59 }
60 // fast path for long strings (expected to be common)
61 size_t i = 0;
74 static uint8_t rnt_array[16] = {1, 0, 0, 0, 0, 0, 0, 0,
75 0, 9, 10, 0, 0, 13, 0, 0};
76 const uint8x16_t rnt = vld1q_u8(rnt_array);
77 // m['0xd', '0xa', '0x9']
78 uint8x16_t running{0};
79 for (; i + 15 < user_input.size(); i += 16) {
80 uint8x16_t word = vld1q_u8((const uint8_t*)user_input.data() + i);
81
82 running = vorrq_u8(running, vceqq_u8(vqtbl1q_u8(rnt, word), word));
83 }
84 if (i < user_input.size()) {
85 uint8x16_t word =
86 vld1q_u8((const uint8_t*)user_input.data() + user_input.length() - 16);
87 running = vorrq_u8(running, vceqq_u8(vqtbl1q_u8(rnt, word), word));
88 }
89 return vmaxvq_u32(vreinterpretq_u32_u8(running)) != 0;
90}
91#elif ADA_SSE2
92ada_really_inline bool has_tabs_or_newline(
93 std::string_view user_input) noexcept {
94 // first check for short strings in which case we do it naively.
95 if (user_input.size() < 16) { // slow path
96 return std::any_of(user_input.begin(), user_input.end(),
98 }
99 // fast path for long strings (expected to be common)
100 size_t i = 0;
101 const __m128i mask1 = _mm_set1_epi8('\r');
102 const __m128i mask2 = _mm_set1_epi8('\n');
103 const __m128i mask3 = _mm_set1_epi8('\t');
104 // If we supported SSSE3, we could use the algorithm that we use for NEON.
105 __m128i running{0};
106 for (; i + 15 < user_input.size(); i += 16) {
107 __m128i word = _mm_loadu_si128((const __m128i*)(user_input.data() + i));
108 running = _mm_or_si128(
109 _mm_or_si128(running, _mm_or_si128(_mm_cmpeq_epi8(word, mask1),
110 _mm_cmpeq_epi8(word, mask2))),
111 _mm_cmpeq_epi8(word, mask3));
112 }
113 if (i < user_input.size()) {
114 __m128i word = _mm_loadu_si128(
115 (const __m128i*)(user_input.data() + user_input.length() - 16));
116 running = _mm_or_si128(
117 _mm_or_si128(running, _mm_or_si128(_mm_cmpeq_epi8(word, mask1),
118 _mm_cmpeq_epi8(word, mask2))),
119 _mm_cmpeq_epi8(word, mask3));
120 }
121 return _mm_movemask_epi8(running) != 0;
122}
123#else
124ada_really_inline bool has_tabs_or_newline(
125 std::string_view user_input) noexcept {
126 auto has_zero_byte = [](uint64_t v) {
127 return ((v - 0x0101010101010101) & ~(v) & 0x8080808080808080);
128 };
129 size_t i = 0;
130 uint64_t mask1 = broadcast('\r');
131 uint64_t mask2 = broadcast('\n');
132 uint64_t mask3 = broadcast('\t');
133 uint64_t running{0};
134 for (; i + 7 < user_input.size(); i += 8) {
135 uint64_t word{};
136 memcpy(&word, user_input.data() + i, sizeof(word));
137 uint64_t xor1 = word ^ mask1;
138 uint64_t xor2 = word ^ mask2;
139 uint64_t xor3 = word ^ mask3;
140 running |= has_zero_byte(xor1) | has_zero_byte(xor2) | has_zero_byte(xor3);
141 }
142 if (i < user_input.size()) {
143 uint64_t word{};
144 memcpy(&word, user_input.data() + i, user_input.size() - i);
145 uint64_t xor1 = word ^ mask1;
146 uint64_t xor2 = word ^ mask2;
147 uint64_t xor3 = word ^ mask3;
148 running |= has_zero_byte(xor1) | has_zero_byte(xor2) | has_zero_byte(xor3);
149 }
150 return running;
151}
152#endif
153
154// A forbidden host code point is U+0000 NULL, U+0009 TAB, U+000A LF, U+000D CR,
155// U+0020 SPACE, U+0023 (#), U+002F (/), U+003A (:), U+003C (<), U+003E (>),
156// U+003F (?), U+0040 (@), U+005B ([), U+005C (\‍), U+005D (]), U+005E (^), or
157// U+007C (|).
158constexpr static std::array<uint8_t, 256> is_forbidden_host_code_point_table =
159 []() consteval {
160 std::array<uint8_t, 256> result{};
161 for (uint8_t c : {'\0', '\x09', '\x0a', '\x0d', ' ', '#', '/', ':', '<',
162 '>', '?', '@', '[', '\\', ']', '^', '|'}) {
163 result[c] = true;
164 }
165 return result;
166 }();
167
168ada_really_inline constexpr bool is_forbidden_host_code_point(
169 const char c) noexcept {
170 return is_forbidden_host_code_point_table[uint8_t(c)];
171}
172
173constexpr static std::array<uint8_t, 256> is_forbidden_domain_code_point_table =
174 []() consteval {
175 std::array<uint8_t, 256> result{};
176 for (uint8_t c : {'\0', '\x09', '\x0a', '\x0d', ' ', '#', '/', ':', '<',
177 '>', '?', '@', '[', '\\', ']', '^', '|', '%'}) {
178 result[c] = true;
179 }
180 for (uint8_t c = 0; c <= 32; c++) {
181 result[c] = true;
182 }
183 for (size_t c = 127; c < 255; c++) {
184 result[c] = true;
185 }
186 return result;
187 }();
188
189static_assert(sizeof(is_forbidden_domain_code_point_table) == 256);
190
191ada_really_inline constexpr bool is_forbidden_domain_code_point(
192 const char c) noexcept {
193 return is_forbidden_domain_code_point_table[uint8_t(c)];
194}
195
196ada_really_inline constexpr bool contains_forbidden_domain_code_point(
197 const char* input, size_t length) noexcept {
198 size_t i = 0;
199 uint8_t accumulator{};
200 for (; i + 4 <= length; i += 4) {
201 accumulator |= is_forbidden_domain_code_point_table[uint8_t(input[i])];
202 accumulator |= is_forbidden_domain_code_point_table[uint8_t(input[i + 1])];
203 accumulator |= is_forbidden_domain_code_point_table[uint8_t(input[i + 2])];
204 accumulator |= is_forbidden_domain_code_point_table[uint8_t(input[i + 3])];
205 }
206 for (; i < length; i++) {
207 accumulator |= is_forbidden_domain_code_point_table[uint8_t(input[i])];
208 }
209 return accumulator;
210}
211
212constexpr static std::array<uint8_t, 256>
214 std::array<uint8_t, 256> result{};
215 for (uint8_t c : {'\0', '\x09', '\x0a', '\x0d', ' ', '#', '/', ':', '<',
216 '>', '?', '@', '[', '\\', ']', '^', '|', '%'}) {
217 result[c] = 1;
218 }
219 for (uint8_t c = 'A'; c <= 'Z'; c++) {
220 result[c] = 2;
221 }
222 for (uint8_t c = 0; c <= 32; c++) {
223 result[c] = 1;
224 }
225 for (size_t c = 127; c < 255; c++) {
226 result[c] = 1;
227 }
228 return result;
229 }();
230
231ada_really_inline constexpr uint8_t
232contains_forbidden_domain_code_point_or_upper(const char* input,
233 size_t length) noexcept {
234 size_t i = 0;
235 uint8_t accumulator{};
236 for (; i + 4 <= length; i += 4) {
237 accumulator |=
239 accumulator |=
241 accumulator |=
243 accumulator |=
245 }
246 for (; i < length; i++) {
247 accumulator |=
249 }
250 return accumulator;
251}
252
253// std::isalnum(c) || c == '+' || c == '-' || c == '.') is true for
254constexpr static std::array<bool, 256> is_alnum_plus_table = []() consteval {
255 std::array<bool, 256> result{};
256 for (size_t c = 0; c < 256; c++) {
257 result[c] = (c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') ||
258 (c >= 'A' && c <= 'Z') || c == '+' || c == '-' || c == '.';
259 }
260 return result;
261}();
262
263ada_really_inline constexpr bool is_alnum_plus(const char c) noexcept {
264 return is_alnum_plus_table[uint8_t(c)];
265 // A table is almost surely much faster than the
266 // following under most compilers: return
267 // return (std::isalnum(c) || c == '+' || c == '-' || c == '.');
268}
269
270ada_really_inline constexpr bool is_ascii_hex_digit(const char c) noexcept {
271 return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') ||
272 (c >= 'a' && c <= 'f');
273}
274
275ada_really_inline constexpr bool is_c0_control_or_space(const char c) noexcept {
276 return (unsigned char)c <= ' ';
277}
278
279ada_really_inline constexpr bool is_ascii_tab_or_newline(
280 const char c) noexcept {
281 return c == '\t' || c == '\n' || c == '\r';
282}
283
284constexpr std::string_view table_is_double_dot_path_segment[] = {
285 "..", "%2e.", ".%2e", "%2e%2e"};
286
287ada_really_inline constexpr bool is_double_dot_path_segment(
288 std::string_view input) noexcept {
289 // This will catch most cases:
290 // The length must be 2,4 or 6.
291 // We divide by two and require
292 // that the result be between 1 and 3 inclusively.
293 uint64_t half_length = uint64_t(input.size()) / 2;
294 if (half_length - 1 > 2) {
295 return false;
296 }
297 // We have a string of length 2, 4 or 6.
298 // We now check the first character:
299 if ((input[0] != '.') && (input[0] != '%')) {
300 return false;
301 }
302 // We are unlikely the get beyond this point.
303 int hash_value = (input.size() + (unsigned)(input[0])) & 3;
304 const std::string_view target = table_is_double_dot_path_segment[hash_value];
305 if (target.size() != input.size()) {
306 return false;
307 }
308 // We almost never get here.
309 // Optimizing the rest is relatively unimportant.
310 auto prefix_equal_unsafe = [](std::string_view a, std::string_view b) {
311 uint16_t A, B;
312 memcpy(&A, a.data(), sizeof(A));
313 memcpy(&B, b.data(), sizeof(B));
314 return A == B;
315 };
316 if (!prefix_equal_unsafe(input, target)) {
317 return false;
318 }
319 for (size_t i = 2; i < input.size(); i++) {
320 char c = input[i];
321 if ((uint8_t((c | 0x20) - 0x61) <= 25 ? (c | 0x20) : c) != target[i]) {
322 return false;
323 }
324 }
325 return true;
326 // The above code might be a bit better than the code below. Compilers
327 // are not stupid and may use the fact that these strings have length 2,4 and
328 // 6 and other tricks.
329 // return input == ".." ||
330 // input == ".%2e" || input == ".%2E" ||
331 // input == "%2e." || input == "%2E." ||
332 // input == "%2e%2e" || input == "%2E%2E" || input == "%2E%2e" || input ==
333 // "%2e%2E";
334}
335
336ada_really_inline constexpr bool is_single_dot_path_segment(
337 std::string_view input) noexcept {
338 return input == "." || input == "%2e" || input == "%2E";
339}
340
341ada_really_inline constexpr bool is_lowercase_hex(const char c) noexcept {
342 return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f');
343}
344
345constexpr static char hex_to_binary_table[] = {
346 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 0, 0, 0, 0, 0, 0, 10, 11,
347 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
348 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 11, 12, 13, 14, 15};
349unsigned constexpr convert_hex_to_binary(const char c) noexcept {
350 return hex_to_binary_table[c - '0'];
351}
352
353std::string percent_decode(const std::string_view input, size_t first_percent) {
354 // next line is for safety only, we expect users to avoid calling
355 // percent_decode when first_percent is outside the range.
356 if (first_percent == std::string_view::npos) {
357 return std::string(input);
358 }
359 std::string dest;
360 dest.reserve(input.length());
361 dest.append(input.substr(0, first_percent));
362 const char* pointer = input.data() + first_percent;
363 const char* end = input.data() + input.size();
364 // Optimization opportunity: if the following code gets
365 // called often, it can be optimized quite a bit.
366 while (pointer < end) {
367 const char ch = pointer[0];
368 size_t remaining = end - pointer - 1;
369 if (ch != '%' || remaining < 2 ||
370 ( // ch == '%' && // It is unnecessary to check that ch == '%'.
371 (!is_ascii_hex_digit(pointer[1]) ||
372 !is_ascii_hex_digit(pointer[2])))) {
373 dest += ch;
374 pointer++;
375 } else {
376 unsigned a = convert_hex_to_binary(pointer[1]);
377 unsigned b = convert_hex_to_binary(pointer[2]);
378 char c = static_cast<char>(a * 16 + b);
379 dest += c;
380 pointer += 3;
381 }
382 }
383 return dest;
384}
385
386std::string percent_encode(const std::string_view input,
387 const uint8_t character_set[]) {
388 auto pointer = std::ranges::find_if(input, [character_set](const char c) {
389 return character_sets::bit_at(character_set, c);
390 });
391 // Optimization: Don't iterate if percent encode is not required
392 if (pointer == input.end()) {
393 return std::string(input);
394 }
395
396 std::string result;
397 result.reserve(input.length()); // in the worst case, percent encoding might
398 // produce 3 characters.
399 result.append(input.substr(0, std::distance(input.begin(), pointer)));
400
401 for (; pointer != input.end(); pointer++) {
402 if (character_sets::bit_at(character_set, *pointer)) {
403 result.append(character_sets::hex + uint8_t(*pointer) * 4, 3);
404 } else {
405 result += *pointer;
406 }
407 }
408
409 return result;
410}
411
412template <bool append>
413bool percent_encode(const std::string_view input, const uint8_t character_set[],
414 std::string& out) {
415 ada_log("percent_encode ", input, " to output string while ",
416 append ? "appending" : "overwriting");
417 auto pointer =
418 std::find_if(input.begin(), input.end(), [character_set](const char c) {
419 return character_sets::bit_at(character_set, c);
420 });
421 ada_log("percent_encode done checking, moved to ",
422 std::distance(input.begin(), pointer));
423
424 // Optimization: Don't iterate if percent encode is not required
425 if (pointer == input.end()) {
426 ada_log("percent_encode encoding not needed.");
427 return false;
428 }
429 if constexpr (!append) {
430 out.clear();
431 }
432 ada_log("percent_encode appending ", std::distance(input.begin(), pointer),
433 " bytes");
434 out.append(input.data(), std::distance(input.begin(), pointer));
435 ada_log("percent_encode processing ", std::distance(pointer, input.end()),
436 " bytes");
437 for (; pointer != input.end(); pointer++) {
438 if (character_sets::bit_at(character_set, *pointer)) {
439 out.append(character_sets::hex + uint8_t(*pointer) * 4, 3);
440 } else {
441 out += *pointer;
442 }
443 }
444 return true;
445}
446
447bool to_ascii(std::optional<std::string>& out, const std::string_view plain,
448 size_t first_percent) {
449 std::string percent_decoded_buffer;
450 std::string_view input = plain;
451 if (first_percent != std::string_view::npos) {
452 percent_decoded_buffer = unicode::percent_decode(plain, first_percent);
453 input = percent_decoded_buffer;
454 }
455 // input is a non-empty UTF-8 string, must be percent decoded
456 std::string idna_ascii = ada::idna::to_ascii(input);
457 if (idna_ascii.empty() || contains_forbidden_domain_code_point(
458 idna_ascii.data(), idna_ascii.size())) {
459 return false;
460 }
461 out = std::move(idna_ascii);
462 return true;
463}
464
465std::string percent_encode(const std::string_view input,
466 const uint8_t character_set[], size_t index) {
467 std::string out;
468 out.append(input.data(), index);
469 auto pointer = input.begin() + index;
470 for (; pointer != input.end(); pointer++) {
471 if (character_sets::bit_at(character_set, *pointer)) {
472 out.append(character_sets::hex + uint8_t(*pointer) * 4, 3);
473 } else {
474 out += *pointer;
475 }
476 }
477 return out;
478}
479
480} // namespace ada::unicode
Includes all definitions for Ada.
Definitions of the character sets used by unicode functions.
Common definitions for cross-platform compiler support.
#define ADA_PUSH_DISABLE_ALL_WARNINGS
Definition common_defs.h:86
#define ADA_POP_DISABLE_WARNINGS
#define ada_really_inline
Definition common_defs.h:77
ada_really_inline constexpr bool bit_at(const uint8_t a[], const uint8_t i)
constexpr char hex[1024]
std::string to_ascii(std::string_view ut8_string)
bool contains_forbidden_domain_code_point(std::string_view ascii_string)
Includes the declarations for unicode operations.
static constexpr std::array< uint8_t, 256 > is_forbidden_domain_code_point_table
Definition unicode.cpp:173
static constexpr std::array< uint8_t, 256 > is_forbidden_domain_code_point_table_or_upper
Definition unicode.cpp:213
static constexpr char hex_to_binary_table[]
Definition unicode.cpp:345
constexpr uint64_t broadcast(uint8_t v) noexcept
Definition unicode.cpp:23
constexpr std::string_view table_is_double_dot_path_segment[]
Definition unicode.cpp:284
constexpr bool is_tabs_or_newline(char c) noexcept
Definition unicode.cpp:19
static constexpr std::array< uint8_t, 256 > is_forbidden_host_code_point_table
Definition unicode.cpp:158
static constexpr std::array< bool, 256 > is_alnum_plus_table
Definition unicode.cpp:254
tl::expected< result_type, ada::errors > result
Definitions for all unicode specific functions.