1/* (c) Magnus Auvinen. See licence.txt in the root of the distribution for more information. */
2/* If you are missing that file, acquire a complete release at teeworlds.com. */
3
4#include "str.h"
5
6#include "dbg.h"
7#include "detect.h"
8#include "math.h"
9#include "mem.h"
10
11#include <cctype>
12#include <charconv> // std::to_chars
13#include <cstdarg>
14#include <cstdio>
15#include <cstdlib>
16#include <cstring>
17
18int str_copy(char *dst, const char *src, int dst_size)
19{
20 dst[0] = '\0';
21 strncat(dest: dst, src: src, n: dst_size - 1);
22 return str_utf8_fix_truncation(str: dst);
23}
24
25void str_append(char *dst, const char *src, int dst_size)
26{
27 int s = str_length(str: dst);
28 int i = 0;
29 while(s < dst_size)
30 {
31 dst[s] = src[i];
32 if(!src[i]) /* check for null termination */
33 break;
34 s++;
35 i++;
36 }
37
38 dst[dst_size - 1] = 0; /* assure null termination */
39 str_utf8_fix_truncation(str: dst);
40}
41
42void str_truncate(char *dst, int dst_size, const char *src, int truncation_len)
43{
44 int size = dst_size;
45 if(truncation_len < size)
46 {
47 size = truncation_len + 1;
48 }
49 str_copy(dst, src, dst_size: size);
50}
51
// Returns the number of bytes in str before the terminator, as an int.
int str_length(const char *str)
{
	const size_t length = strlen(str);
	return (int)length;
}
56
57int str_format_v(char *buffer, int buffer_size, const char *format, va_list args)
58{
59#if defined(CONF_FAMILY_WINDOWS)
60 _vsprintf_p(buffer, buffer_size, format, args);
61 buffer[buffer_size - 1] = 0; /* assure null termination */
62#else
63 vsnprintf(s: buffer, maxlen: buffer_size, format: format, arg: args);
64 /* null termination is assured by definition of vsnprintf */
65#endif
66 return str_utf8_fix_truncation(str: buffer);
67}
68
#if !defined(CONF_DEBUG)
// Writes the decimal representation of value into buffer and terminates it.
// Returns the number of characters written before the terminator.
int str_format_int(char *buffer, size_t buffer_size, int value)
{
	buffer[0] = '\0'; // Fix false positive clang-analyzer-core.UndefinedBinaryOperatorResult when using result
	// Leave the last byte free for the terminator.
	char *const last = buffer + buffer_size - 1;
	const std::to_chars_result result = std::to_chars(buffer, last, value);
	*result.ptr = '\0';
	return result.ptr - buffer;
}
#endif
78
79#undef str_format
80int str_format(char *buffer, int buffer_size, const char *format, ...)
81{
82 va_list args;
83 va_start(args, format);
84 int length = str_format_v(buffer, buffer_size, format, args);
85 va_end(args);
86 return length;
87}
88#if !defined(CONF_DEBUG)
89#define str_format str_format_opt
90#endif
91
// Maps 'a'..'z' to 'A'..'Z'; every other character is returned unchanged.
char str_uppercase(char c)
{
	return ('a' <= c && c <= 'z') ? 'A' + (c - 'a') : c;
}
98
// Returns true only for the characters '0' through '9'.
bool str_isnum(char c)
{
	if(c < '0')
		return false;
	return c <= '9';
}
103
104int str_isallnum(const char *str)
105{
106 while(*str)
107 {
108 if(!str_isnum(c: *str))
109 return 0;
110 str++;
111 }
112 return 1;
113}
114
115int str_isallnum_hex(const char *str)
116{
117 while(*str)
118 {
119 if(!str_isnum(c: *str) && !(*str >= 'a' && *str <= 'f') && !(*str >= 'A' && *str <= 'F'))
120 return 0;
121 str++;
122 }
123 return 1;
124}
125
// Returns 1 for space, line feed, carriage return and tab, otherwise 0.
int str_isspace(char c)
{
	switch(c)
	{
	case ' ':
	case '\n':
	case '\r':
	case '\t':
		return 1;
	default:
		return 0;
	}
}
130
131const char *str_trim_words(const char *str, int words)
132{
133 while(*str && str_isspace(c: *str))
134 str++;
135 while(words && *str)
136 {
137 if(str_isspace(c: *str) && !str_isspace(c: *(str + 1)))
138 words--;
139 str++;
140 }
141 return str;
142}
143
// Returns true if str contains any byte with a value below 32.
bool str_has_cc(const char *str)
{
	for(const unsigned char *s = (const unsigned char *)str; *s != 0; ++s)
	{
		if(*s < 32)
			return true;
	}
	return false;
}
157
/* Replaces every byte below 32 with a space, in place, so that only byte
   values between 32 and 255 remain. */
void str_sanitize_cc(char *str_in)
{
	for(unsigned char *cursor = (unsigned char *)str_in; *cursor != 0; ++cursor)
	{
		if(*cursor >= 32)
			continue;
		*cursor = ' ';
	}
}
169
/* Replaces every byte below 32 with a space, in place, except for \r, \n and
   \t which are kept. */
void str_sanitize(char *str_in)
{
	for(unsigned char *cursor = (unsigned char *)str_in; *cursor != 0; ++cursor)
	{
		const unsigned char c = *cursor;
		if(c >= 32 || c == '\r' || c == '\n' || c == '\t')
			continue;
		*cursor = ' ';
	}
}
181
// Replaces, in place, every byte that is a control character (<= 0x1F or
// 0x7F) or one of \ / | : * ? < > " with a space.
void str_sanitize_filename(char *str_in)
{
	for(unsigned char *cursor = (unsigned char *)str_in; *cursor != 0; ++cursor)
	{
		bool replace = *cursor <= 0x1F || *cursor == 0x7F;
		switch(*cursor)
		{
		case '\\':
		case '/':
		case '|':
		case ':':
		case '*':
		case '?':
		case '<':
		case '>':
		case '"':
			replace = true;
			break;
		default:
			break;
		}
		if(replace)
			*cursor = ' ';
	}
}
195
196bool str_valid_filename(const char *str)
197{
198 // References:
199 // - https://en.wikipedia.org/w/index.php?title=Filename&oldid=1281340521#Comparison_of_filename_limitations
200 // - https://learn.microsoft.com/en-us/windows/win32/fileio/naming-a-file (last update 2024-08-28)
201 if(str[0] == '\0')
202 {
203 return false; // empty name not allowed
204 }
205
206 bool prev_space = false;
207 bool prev_period = false;
208 bool first_space_checked = false;
209 const char *iterator = str;
210 while(*iterator)
211 {
212 const int code = str_utf8_decode(ptr: &iterator);
213 if(code <= 0x1F || code == 0x7F || code == '\\' || code == '/' || code == '|' || code == ':' ||
214 code == '*' || code == '?' || code == '<' || code == '>' || code == '"')
215 {
216 return false; // disallowed characters, mostly for Windows
217 }
218 else if(str_utf8_isspace(code) && code != ' ')
219 {
220 return false; // we only allow regular space characters
221 }
222 if(code == ' ')
223 {
224 if(!first_space_checked)
225 {
226 return false; // leading spaces not allowed
227 }
228 if(prev_space)
229 {
230 return false; // multiple consecutive spaces not allowed
231 }
232 prev_space = true;
233 prev_period = false;
234 }
235 else
236 {
237 prev_space = false;
238 prev_period = code == '.';
239 first_space_checked = true;
240 }
241 }
242 if(prev_space || prev_period)
243 {
244 return false; // trailing spaces and periods not allowed
245 }
246
247 static constexpr const char *RESERVED_NAMES[] = {
248 "CON", "PRN", "AUX", "NUL",
249 "COM0", "COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7", "COM8", "COM9", "COM¹", "COM²", "COM³",
250 "LPT0", "LPT1", "LPT2", "LPT3", "LPT4", "LPT5", "LPT6", "LPT7", "LPT8", "LPT9", "LPT¹", "LPT²", "LPT³"};
251 return std::none_of(first: std::begin(arr: RESERVED_NAMES), last: std::end(arr: RESERVED_NAMES), pred: [str](const char *reserved_name) {
252 const char *prefix = str_startswith_nocase(str, prefix: reserved_name);
253 // reserved name not allowed when it makes up the entire filename or when followed by period
254 return prefix != nullptr && (prefix[0] == '\0' || prefix[0] == '.');
255 });
256}
257
258int str_comp_filenames(const char *a, const char *b)
259{
260 int result;
261
262 for(; *a && *b; ++a, ++b)
263 {
264 if(str_isnum(c: *a) && str_isnum(c: *b))
265 {
266 result = 0;
267 do
268 {
269 if(!result)
270 result = *a - *b;
271 ++a;
272 ++b;
273 } while(str_isnum(c: *a) && str_isnum(c: *b));
274
275 if(str_isnum(c: *a))
276 return 1;
277 else if(str_isnum(c: *b))
278 return -1;
279 else if(result || *a == '\0' || *b == '\0')
280 return result;
281 }
282
283 result = tolower(c: *a) - tolower(c: *b);
284 if(result)
285 return result;
286 }
287 return *a - *b;
288}
289
// Normalizes space characters in place: leading and trailing spaces are
// removed and every inner run of spaces is collapsed to a single space.
// Only ' ' is treated as whitespace here.
void str_clean_whitespaces(char *str)
{
	const char *in = str;
	char *out = str;

	/* skip initial whitespace */
	while(*in == ' ')
		++in;

	bool pending_space = false;
	for(; *in != '\0'; ++in)
	{
		if(*in == ' ')
		{
			// Emit the space lazily so trailing runs are dropped.
			pending_space = true;
			continue;
		}
		if(pending_space)
		{
			*out++ = ' ';
			pending_space = false;
		}
		*out++ = *in;
	}
	*out = '\0';
}
320
321char *str_skip_to_whitespace(char *str)
322{
323 while(*str && !str_isspace(c: *str))
324 str++;
325 return str;
326}
327
328const char *str_skip_to_whitespace_const(const char *str)
329{
330 while(*str && !str_isspace(c: *str))
331 str++;
332 return str;
333}
334
335char *str_skip_whitespaces(char *str)
336{
337 while(*str && str_isspace(c: *str))
338 str++;
339 return str;
340}
341
342const char *str_skip_whitespaces_const(const char *str)
343{
344 while(*str && str_isspace(c: *str))
345 str++;
346 return str;
347}
348
349/* case */
// Case-insensitive comparison of two strings using the platform's routine.
// Returns <0, 0 or >0 like strcmp.
int str_comp_nocase(const char *a, const char *b)
{
#if defined(CONF_FAMILY_WINDOWS)
	const int order = _stricmp(a, b);
#else
	const int order = strcasecmp(a, b);
#endif
	return order;
}
358
// Case-insensitive comparison of at most num characters using the platform's
// routine. Returns <0, 0 or >0 like strncmp.
int str_comp_nocase_num(const char *a, const char *b, int num)
{
#if defined(CONF_FAMILY_WINDOWS)
	const int order = _strnicmp(a, b, num);
#else
	const int order = strncasecmp(a, b, num);
#endif
	return order;
}
367
// Case-sensitive comparison; forwards to strcmp and returns its result.
int str_comp(const char *a, const char *b)
{
	const int order = strcmp(a, b);
	return order;
}
372
// Case-sensitive comparison of at most num characters; forwards to strncmp
// and returns its result.
int str_comp_num(const char *a, const char *b, int num)
{
	const int order = strncmp(a, b, num);
	return order;
}
377
378const char *str_startswith_nocase(const char *str, const char *prefix)
379{
380 int prefixl = str_length(str: prefix);
381 if(str_comp_nocase_num(a: str, b: prefix, num: prefixl) == 0)
382 {
383 return str + prefixl;
384 }
385 else
386 {
387 return nullptr;
388 }
389}
390
391const char *str_startswith(const char *str, const char *prefix)
392{
393 int prefixl = str_length(str: prefix);
394 if(str_comp_num(a: str, b: prefix, num: prefixl) == 0)
395 {
396 return str + prefixl;
397 }
398 else
399 {
400 return nullptr;
401 }
402}
403
404const char *str_endswith_nocase(const char *str, const char *suffix)
405{
406 int strl = str_length(str);
407 int suffixl = str_length(str: suffix);
408 const char *strsuffix;
409 if(strl < suffixl)
410 {
411 return nullptr;
412 }
413 strsuffix = str + strl - suffixl;
414 if(str_comp_nocase(a: strsuffix, b: suffix) == 0)
415 {
416 return strsuffix;
417 }
418 else
419 {
420 return nullptr;
421 }
422}
423
424const char *str_endswith(const char *str, const char *suffix)
425{
426 int strl = str_length(str);
427 int suffixl = str_length(str: suffix);
428 const char *strsuffix;
429 if(strl < suffixl)
430 {
431 return nullptr;
432 }
433 strsuffix = str + strl - suffixl;
434 if(str_comp(a: strsuffix, b: suffix) == 0)
435 {
436 return strsuffix;
437 }
438 else
439 {
440 return nullptr;
441 }
442}
443
// Case-insensitive substring search (bytes are compared via tolower).
// Returns a pointer to the first match inside haystack, or nullptr if there
// is none. An empty haystack never matches, even for an empty needle.
const char *str_find_nocase(const char *haystack, const char *needle)
{
	for(const char *start = haystack; *start != '\0'; ++start)
	{
		const char *h = start;
		const char *n = needle;
		for(; *h != '\0' && *n != '\0'; ++h, ++n)
		{
			if(tolower((unsigned char)*h) != tolower((unsigned char)*n))
				break;
		}
		// The whole needle was consumed, so it matched at start.
		if(*n == '\0')
			return start;
	}

	return nullptr;
}
462
// Case-sensitive substring search.
// Returns a pointer to the first match inside haystack, or nullptr if there
// is none. An empty haystack never matches, even for an empty needle.
const char *str_find(const char *haystack, const char *needle)
{
	for(const char *start = haystack; *start != '\0'; ++start)
	{
		const char *h = start;
		const char *n = needle;
		for(; *h != '\0' && *n != '\0'; ++h, ++n)
		{
			if(*h != *n)
				break;
		}
		// The whole needle was consumed, so it matched at start.
		if(*n == '\0')
			return start;
	}

	return nullptr;
}
481
// Locates the next token in str.
// At most one leading delimiter character is skipped, so two adjacent
// delimiters produce an empty token. Returns nullptr when the end of the
// string is reached; otherwise returns the token start and stores its length
// in *length.
static const char *str_token_get(const char *str, const char *delim, int *length)
{
	const size_t leading = strspn(str, delim);
	str += leading > 1 ? 1 : leading;
	if(*str == '\0')
		return nullptr;

	*length = strcspn(str, delim);
	return str;
}
495
496const char *str_next_token(const char *str, const char *delim, char *buffer, int buffer_size)
497{
498 int len = 0;
499 const char *tok = str_token_get(str, delim, length: &len);
500 if(len < 0 || tok == nullptr)
501 {
502 buffer[0] = '\0';
503 return nullptr;
504 }
505
506 len = buffer_size > len ? len : buffer_size - 1;
507 mem_copy(dest: buffer, source: tok, size: len);
508 buffer[len] = '\0';
509
510 return tok + len;
511}
512
513int str_in_list(const char *list, const char *delim, const char *needle)
514{
515 const char *tok = list;
516 int len = 0, notfound = 1, needlelen = str_length(str: needle);
517
518 while(notfound && (tok = str_token_get(str: tok, delim, length: &len)))
519 {
520 notfound = needlelen != len || str_comp_num(a: tok, b: needle, num: len);
521 tok = tok + len;
522 }
523
524 return !notfound;
525}
526
527bool str_delimiters_around_offset(const char *haystack, const char *delim, int offset, int *start, int *end)
528{
529 bool found = true;
530 const char *search = haystack;
531 const int delim_len = str_length(str: delim);
532 *start = 0;
533 while(str_find(haystack: search, needle: delim))
534 {
535 const char *test = str_find(haystack: search, needle: delim) + delim_len;
536 int distance = test - haystack;
537 if(distance > offset)
538 break;
539
540 *start = distance;
541 search = test;
542 }
543 if(search == haystack)
544 found = false;
545
546 if(str_find(haystack: search, needle: delim))
547 {
548 *end = str_find(haystack: search, needle: delim) - haystack;
549 }
550 else
551 {
552 *end = str_length(str: haystack);
553 found = false;
554 }
555
556 return found;
557}
558
// Returns a pointer to the last occurrence of needle in haystack, as reported
// by strrchr (null if it does not occur).
const char *str_rchr(const char *haystack, char needle)
{
	const char *last = strrchr(haystack, needle);
	return last;
}
563
// Counts how many times needle occurs in haystack.
int str_countchr(const char *haystack, char needle)
{
	int count = 0;
	for(const char *cursor = haystack; *cursor != '\0'; ++cursor)
	{
		count += *cursor == needle ? 1 : 0;
	}
	return count;
}
575
// Writes data as uppercase hex pairs, each followed by a space ("01 AB ").
// Stops early when fewer than 4 bytes (3 characters plus terminator) remain
// in dst; the result is always terminated.
void str_hex(char *dst, int dst_size, const void *data, int data_size)
{
	static const char hex[] = "0123456789ABCDEF";
	const unsigned char *bytes = (const unsigned char *)data;
	int written = 0;
	for(int i = 0; i < data_size && written < dst_size - 3; i++)
	{
		const unsigned char byte_value = bytes[i];
		dst[written++] = hex[byte_value >> 4];
		dst[written++] = hex[byte_value & 0xf];
		dst[written++] = ' ';
	}
	dst[written] = '\0';
}
590
// Writes data as a C-style list of hex literals ("0x01, 0x02,\n0x03"),
// starting a new line after every bytes_per_line values. The separator after
// the last value is removed. Stops early when dst cannot hold another
// 6-character cell plus terminator.
void str_hex_cstyle(char *dst, int dst_size, const void *data, int data_size, int bytes_per_line)
{
	static const char hex[] = "0123456789ABCDEF";
	const unsigned char *bytes = (const unsigned char *)data;
	int left_on_line = bytes_per_line;
	int written = 0;
	for(int i = 0; i < data_size && written < dst_size - 6; i++)
	{
		char *cell = dst + written;
		cell[0] = '0';
		cell[1] = 'x';
		cell[2] = hex[bytes[i] >> 4];
		cell[3] = hex[bytes[i] & 0xf];
		cell[4] = ',';

		--left_on_line;
		if(left_on_line == 0)
		{
			cell[5] = '\n';
			left_on_line = bytes_per_line;
		}
		else
		{
			cell[5] = ' ';
		}
		written += 6;
	}
	dst[written] = '\0';
	// Remove trailing comma and space/newline
	if(written >= 1)
		dst[written - 1] = '\0';
	if(written >= 2)
		dst[written - 2] = '\0';
}
623
// Returns the value (0-15) of a hexadecimal digit in either case, or -1 if x
// is not a hexadecimal digit.
static int hexval(char x)
{
	if(x >= '0' && x <= '9')
		return x - '0';
	if(x >= 'a' && x <= 'f')
		return x - 'a' + 10;
	if(x >= 'A' && x <= 'F')
		return x - 'A' + 10;
	return -1;
}
653
654static int byteval(const char *hex, unsigned char *dst)
655{
656 int v1 = hexval(x: hex[0]);
657 int v2 = hexval(x: hex[1]);
658
659 if(v1 < 0 || v2 < 0)
660 return 1;
661
662 *dst = v1 * 16 + v2;
663 return 0;
664}
665
666int str_hex_decode(void *dst, int dst_size, const char *src)
667{
668 unsigned char *cdst = (unsigned char *)dst;
669 int slen = str_length(str: src);
670 int len = slen / 2;
671 int i;
672 if(slen != dst_size * 2)
673 return 2;
674
675 for(i = 0; i < len && dst_size; i++, dst_size--)
676 {
677 if(byteval(hex: src + i * 2, dst: cdst++))
678 return 1;
679 }
680 return 0;
681}
682
// Encodes data_raw as padded base64 into dst. The output is always
// terminated and silently cut off once dst_size - 1 characters were written.
void str_base64(char *dst, int dst_size, const void *data_raw, int data_size)
{
	static const char DIGITS[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

	const unsigned char *bytes = (const unsigned char *)data_raw;
	const int capacity = dst_size - 1; // characters available before the terminator
	unsigned accumulator = 0;
	int pending_bits = 0;
	int in_pos = 0;
	int out_pos = 0;

	dst[capacity] = '\0';
	for(;;)
	{
		// Refill the accumulator when it cannot provide a full 6-bit group.
		if(pending_bits < 6 && in_pos < data_size)
		{
			accumulator = (accumulator << 8) | bytes[in_pos];
			pending_bits += 8;
			in_pos += 1;
		}
		if(out_pos == capacity)
		{
			return;
		}
		if(pending_bits <= 0)
		{
			// All input emitted: pad to a multiple of four, then terminate.
			if(out_pos % 4 == 0)
			{
				dst[out_pos] = '\0';
				return;
			}
			dst[out_pos] = '=';
			out_pos += 1;
			continue;
		}

		// A final partial group is shifted up and padded with zero bits.
		const unsigned group = pending_bits >= 6 ? accumulator >> (pending_bits - 6) : accumulator << (6 - pending_bits);
		dst[out_pos] = DIGITS[group & 0x3f];
		pending_bits -= 6;
		out_pos += 1;
	}
}
734
// Returns the 6-bit value of a base64 digit, or -1 if digit is not part of
// the alphabet A-Z, a-z, 0-9, '+', '/'.
static int base64_digit_value(char digit)
{
	if(digit >= 'A' && digit <= 'Z')
		return digit - 'A';
	if(digit >= 'a' && digit <= 'z')
		return 26 + (digit - 'a');
	if(digit >= '0' && digit <= '9')
		return 52 + (digit - '0');

	switch(digit)
	{
	case '+':
		return 62;
	case '/':
		return 63;
	default:
		return -1;
	}
}
759
760int str_base64_decode(void *dst_raw, int dst_size, const char *data)
761{
762 unsigned char *dst = (unsigned char *)dst_raw;
763 int data_len = str_length(str: data);
764
765 int i;
766 int o = 0;
767
768 if(data_len % 4 != 0)
769 {
770 return -3;
771 }
772 if(data_len / 4 * 3 > dst_size)
773 {
774 // Output buffer too small.
775 return -2;
776 }
777 for(i = 0; i < data_len; i += 4)
778 {
779 int num_output_bytes = 3;
780 char copy[4];
781 int d[4];
782 int value;
783 int b;
784 mem_copy(dest: copy, source: data + i, size: sizeof(copy));
785 if(i == data_len - 4)
786 {
787 if(copy[3] == '=')
788 {
789 copy[3] = 'A';
790 num_output_bytes = 2;
791 if(copy[2] == '=')
792 {
793 copy[2] = 'A';
794 num_output_bytes = 1;
795 }
796 }
797 }
798 d[0] = base64_digit_value(digit: copy[0]);
799 d[1] = base64_digit_value(digit: copy[1]);
800 d[2] = base64_digit_value(digit: copy[2]);
801 d[3] = base64_digit_value(digit: copy[3]);
802 if(d[0] == -1 || d[1] == -1 || d[2] == -1 || d[3] == -1)
803 {
804 // Invalid digit.
805 return -1;
806 }
807 value = (d[0] << 18) | (d[1] << 12) | (d[2] << 6) | d[3];
808 for(b = 0; b < 3; b++)
809 {
810 unsigned char byte_value = (value >> (16 - 8 * b)) & 0xff;
811 if(b < num_output_bytes)
812 {
813 dst[o] = byte_value;
814 o += 1;
815 }
816 else
817 {
818 if(byte_value != 0)
819 {
820 // Padding not zeroed.
821 return -2;
822 }
823 }
824 }
825 }
826 return o;
827}
828
// Copies src to *dst, prefixing every '"' and '\' with a backslash.
// Writing stops before end so that the terminator always fits; an escaped
// pair is never split. On return *dst points at the written terminator.
void str_escape(char **dst, const char *src, const char *end)
{
	char *out = *dst;
	for(; *src != '\0' && out + 1 < end; ++src)
	{
		const bool needs_escape = *src == '"' || *src == '\\';
		if(needs_escape)
		{
			// Need room for backslash, character and terminator.
			if(out + 2 >= end)
				break;
			*out++ = '\\';
		}
		*out++ = *src;
	}
	*out = '\0';
	*dst = out;
}
844
845int str_toint(const char *str)
846{
847 return str_toint_base(str, base: 10);
848}
849
// Parses the whole of str as a base-10 integer.
// Returns true and stores the value in *out (if out is not null) only when at
// least one character was converted and nothing follows the number.
// An empty string or trailing garbage yields false and leaves *out untouched.
bool str_toint(const char *str, int *out)
{
	char *end;
	int value = strtol(str, &end, 10);
	// end == str means strtol performed no conversion at all (e.g. "").
	if(end == str || *end != '\0')
		return false;
	if(out)
		*out = value;
	return true;
}
861
// Parses str with strtol in the given base and returns the result converted
// to int. No error reporting.
int str_toint_base(const char *str, int base)
{
	const long parsed = strtol(str, nullptr, base);
	return (int)parsed;
}
866
// Parses str with strtoul in the given base. No error reporting.
unsigned long str_toulong_base(const char *str, int base)
{
	const unsigned long parsed = strtoul(str, nullptr, base);
	return parsed;
}
871
// Parses str with strtoll in the given base. No error reporting.
int64_t str_toint64_base(const char *str, int base)
{
	const long long parsed = strtoll(str, nullptr, base);
	return parsed;
}
876
// Parses str with strtod and returns the result converted to float.
// No error reporting.
float str_tofloat(const char *str)
{
	const double parsed = strtod(str, nullptr);
	return (float)parsed;
}
881
// Parses the whole of str as a floating point number.
// Returns true and stores the value in *out (if out is not null) only when at
// least one character was converted and nothing follows the number.
// An empty string or trailing garbage yields false and leaves *out untouched.
bool str_tofloat(const char *str, float *out)
{
	char *end;
	float value = strtod(str, &end);
	// end == str means strtod performed no conversion at all (e.g. "").
	if(end == str || *end != '\0')
		return false;
	if(out)
		*out = value;
	return true;
}
893
// Computes a hash of str: starting at 5381, each character updates the
// hash as hash * 33 + c. An empty string hashes to 5381.
unsigned str_quickhash(const char *str)
{
	unsigned hash = 5381;
	while(*str != '\0')
	{
		hash = hash * 33 + *str;
		++str;
	}
	return hash;
}
901
// Encodes the codepoint chr as UTF-8 into ptr (no terminator is written).
// Returns the number of bytes written (1-4), or 0 if chr is above 0x10FFFF.
int str_utf8_encode(char *ptr, int chr)
{
	if(chr > 0x10FFFF)
		return 0;

	if(chr <= 0x7F)
	{
		// single byte, stored as is
		ptr[0] = (char)chr;
		return 1;
	}
	if(chr <= 0x7FF)
	{
		// 110xxxxx 10xxxxxx
		ptr[0] = 0xC0 | ((chr >> 6) & 0x1F);
		ptr[1] = 0x80 | (chr & 0x3F);
		return 2;
	}
	if(chr <= 0xFFFF)
	{
		// 1110xxxx 10xxxxxx 10xxxxxx
		ptr[0] = 0xE0 | ((chr >> 12) & 0x0F);
		ptr[1] = 0x80 | ((chr >> 6) & 0x3F);
		ptr[2] = 0x80 | (chr & 0x3F);
		return 3;
	}

	// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	ptr[0] = 0xF0 | ((chr >> 18) & 0x07);
	ptr[1] = 0x80 | ((chr >> 12) & 0x3F);
	ptr[2] = 0x80 | ((chr >> 6) & 0x3F);
	ptr[3] = 0x80 | (chr & 0x3F);
	return 4;
}
934
// Returns the byte *ptr points at and advances *ptr by one.
static unsigned char str_byte_next(const char **ptr)
{
	const char *current = *ptr;
	*ptr = current + 1;
	return (unsigned char)*current;
}
941
// Moves *ptr back by one byte.
static void str_byte_rewind(const char **ptr)
{
	*ptr -= 1;
}
946
947int str_utf8_decode(const char **ptr)
948{
949 // As per https://encoding.spec.whatwg.org/#utf-8-decoder.
950 unsigned char utf8_lower_boundary = 0x80;
951 unsigned char utf8_upper_boundary = 0xBF;
952 int utf8_code_point = 0;
953 int utf8_bytes_seen = 0;
954 int utf8_bytes_needed = 0;
955 while(true)
956 {
957 unsigned char byte_value = str_byte_next(ptr);
958 if(utf8_bytes_needed == 0)
959 {
960 if(byte_value <= 0x7F)
961 {
962 return byte_value;
963 }
964 else if(0xC2 <= byte_value && byte_value <= 0xDF)
965 {
966 utf8_bytes_needed = 1;
967 utf8_code_point = byte_value - 0xC0;
968 }
969 else if(0xE0 <= byte_value && byte_value <= 0xEF)
970 {
971 if(byte_value == 0xE0)
972 utf8_lower_boundary = 0xA0;
973 if(byte_value == 0xED)
974 utf8_upper_boundary = 0x9F;
975 utf8_bytes_needed = 2;
976 utf8_code_point = byte_value - 0xE0;
977 }
978 else if(0xF0 <= byte_value && byte_value <= 0xF4)
979 {
980 if(byte_value == 0xF0)
981 utf8_lower_boundary = 0x90;
982 if(byte_value == 0xF4)
983 utf8_upper_boundary = 0x8F;
984 utf8_bytes_needed = 3;
985 utf8_code_point = byte_value - 0xF0;
986 }
987 else
988 {
989 return -1; // Error.
990 }
991 utf8_code_point = utf8_code_point << (6 * utf8_bytes_needed);
992 continue;
993 }
994 if(!(utf8_lower_boundary <= byte_value && byte_value <= utf8_upper_boundary))
995 {
996 // Resetting variables not necessary, will be done when
997 // the function is called again.
998 str_byte_rewind(ptr);
999 return -1;
1000 }
1001 utf8_lower_boundary = 0x80;
1002 utf8_upper_boundary = 0xBF;
1003 utf8_bytes_seen += 1;
1004 utf8_code_point = utf8_code_point + ((byte_value - 0x80) << (6 * (utf8_bytes_needed - utf8_bytes_seen)));
1005 if(utf8_bytes_seen != utf8_bytes_needed)
1006 {
1007 continue;
1008 }
1009 // Resetting variables not necessary, see above.
1010 return utf8_code_point;
1011 }
1012}
1013
1014void str_utf8_truncate(char *dst, int dst_size, const char *src, int truncation_len)
1015{
1016 int size = -1;
1017 const char *cursor = src;
1018 int pos = 0;
1019 while(pos <= truncation_len && cursor - src < dst_size && size != cursor - src)
1020 {
1021 size = cursor - src;
1022 if(str_utf8_decode(ptr: &cursor) == 0)
1023 {
1024 break;
1025 }
1026 pos++;
1027 }
1028 str_copy(dst, src, dst_size: size + 1);
1029}
1030
1031int str_utf8_fix_truncation(char *str)
1032{
1033 int len = str_length(str);
1034 if(len > 0)
1035 {
1036 int last_char_index = str_utf8_rewind(str, cursor: len);
1037 const char *last_char = str + last_char_index;
1038 // Fix truncated UTF-8.
1039 if(str_utf8_decode(ptr: &last_char) == -1)
1040 {
1041 str[last_char_index] = 0;
1042 return last_char_index;
1043 }
1044 }
1045 return len;
1046}
1047
1048void str_utf8_trim_right(char *param)
1049{
1050 const char *str = param;
1051 char *end = nullptr;
1052 while(*str)
1053 {
1054 char *str_old = (char *)str;
1055 int code = str_utf8_decode(ptr: &str);
1056
1057 // check if unicode is not empty
1058 if(!str_utf8_isspace(code))
1059 {
1060 end = nullptr;
1061 }
1062 else if(!end)
1063 {
1064 end = str_old;
1065 }
1066 }
1067 if(end)
1068 {
1069 *end = 0;
1070 }
1071}
1072
1073void str_utf8_tolower(const char *input, char *output, size_t size)
1074{
1075 size_t out_pos = 0;
1076 while(*input)
1077 {
1078 const int code = str_utf8_tolower_codepoint(code: str_utf8_decode(ptr: &input));
1079 char encoded_code[4];
1080 const int code_size = str_utf8_encode(ptr: encoded_code, chr: code);
1081 if(out_pos + code_size + 1 > size) // +1 for null termination
1082 {
1083 break;
1084 }
1085 mem_copy(dest: &output[out_pos], source: encoded_code, size: code_size);
1086 out_pos += code_size;
1087 }
1088 output[out_pos] = '\0';
1089}
1090
// Returns 1 if code is treated as whitespace/invisible by this module,
// otherwise 0. Everything up to and including 0x20 counts, which includes
// negative values.
int str_utf8_isspace(int code)
{
	if(code <= 0x0020)
		return 1;

	// individual codepoints
	switch(code)
	{
	case 0x0085:
	case 0x00A0:
	case 0x034F:
	case 0x115F:
	case 0x1160:
	case 0x1680:
	case 0x180E:
	case 0x2800:
	case 0x3000:
	case 0x3164:
	case 0xFEFF:
	case 0xFFA0:
		return 1;
	default:
		break;
	}

	// contiguous ranges
	return (code >= 0x2000 && code <= 0x200F) || (code >= 0x2028 && code <= 0x202F) ||
	       (code >= 0x205F && code <= 0x2064) || (code >= 0x206A && code <= 0x206F) ||
	       (code >= 0xFE00 && code <= 0xFE0F) || (code >= 0xFFF9 && code <= 0xFFFC);
}
1101
// Returns 0 if c is a UTF-8 continuation byte (10xxxxxx), otherwise 1.
int str_utf8_isstart(char c)
{
	return (c & 0xC0) != 0x80;
}
1108
1109int str_utf8_rewind(const char *str, int cursor)
1110{
1111 while(cursor)
1112 {
1113 cursor--;
1114 if(str_utf8_isstart(c: *(str + cursor)))
1115 break;
1116 }
1117 return cursor;
1118}
1119
1120const char *str_utf8_find_nocase(const char *haystack, const char *needle, const char **end)
1121{
1122 while(*haystack) /* native implementation */
1123 {
1124 const char *a = haystack;
1125 const char *b = needle;
1126 const char *a_next = a;
1127 const char *b_next = b;
1128 while(*a && *b && str_utf8_tolower_codepoint(code: str_utf8_decode(ptr: &a_next)) == str_utf8_tolower_codepoint(code: str_utf8_decode(ptr: &b_next)))
1129 {
1130 a = a_next;
1131 b = b_next;
1132 }
1133 if(!(*b))
1134 {
1135 if(end != nullptr)
1136 *end = a_next;
1137 return haystack;
1138 }
1139 str_utf8_decode(ptr: &haystack);
1140 }
1141
1142 if(end != nullptr)
1143 *end = nullptr;
1144 return nullptr;
1145}
1146
1147int str_utf8_comp_nocase(const char *a, const char *b)
1148{
1149 int code_a;
1150 int code_b;
1151
1152 while(*a && *b)
1153 {
1154 code_a = str_utf8_tolower_codepoint(code: str_utf8_decode(ptr: &a));
1155 code_b = str_utf8_tolower_codepoint(code: str_utf8_decode(ptr: &b));
1156
1157 if(code_a != code_b)
1158 return code_a - code_b;
1159 }
1160 return (unsigned char)*a - (unsigned char)*b;
1161}
1162
1163int str_utf8_comp_nocase_num(const char *a, const char *b, int num)
1164{
1165 int code_a;
1166 int code_b;
1167 const char *old_a = a;
1168
1169 if(num <= 0)
1170 return 0;
1171
1172 while(*a && *b)
1173 {
1174 code_a = str_utf8_tolower_codepoint(code: str_utf8_decode(ptr: &a));
1175 code_b = str_utf8_tolower_codepoint(code: str_utf8_decode(ptr: &b));
1176
1177 if(code_a != code_b)
1178 return code_a - code_b;
1179
1180 if(a - old_a >= num)
1181 return 0;
1182 }
1183
1184 return (unsigned char)*a - (unsigned char)*b;
1185}
1186
1187const char *str_utf8_skip_whitespaces(const char *str)
1188{
1189 const char *str_old;
1190 int code;
1191
1192 while(*str)
1193 {
1194 str_old = str;
1195 code = str_utf8_decode(ptr: &str);
1196
1197 // check if unicode is not empty
1198 if(!str_utf8_isspace(code))
1199 {
1200 return str_old;
1201 }
1202 }
1203
1204 return str;
1205}
1206
1207int str_utf8_forward(const char *str, int cursor)
1208{
1209 const char *ptr = str + cursor;
1210 if(str_utf8_decode(ptr: &ptr) == 0)
1211 {
1212 return cursor;
1213 }
1214 return ptr - str;
1215}
1216
1217int str_utf8_check(const char *str)
1218{
1219 int codepoint;
1220 while((codepoint = str_utf8_decode(ptr: &str)))
1221 {
1222 if(codepoint == -1)
1223 {
1224 return 0;
1225 }
1226 }
1227 return 1;
1228}
1229
1230void str_utf8_copy_num(char *dst, const char *src, int dst_size, int num)
1231{
1232 int new_cursor;
1233 int cursor = 0;
1234
1235 while(src[cursor] && num > 0)
1236 {
1237 new_cursor = str_utf8_forward(str: src, cursor);
1238 if(new_cursor >= dst_size) // reserve 1 byte for the null termination
1239 break;
1240 else
1241 cursor = new_cursor;
1242 --num;
1243 }
1244
1245 str_copy(dst, src, dst_size: cursor < dst_size ? cursor + 1 : dst_size);
1246}
1247
1248void str_utf8_stats(const char *str, size_t max_size, size_t max_count, size_t *size, size_t *count)
1249{
1250 const char *cursor = str;
1251 *size = 0;
1252 *count = 0;
1253 while(*size < max_size && *count < max_count)
1254 {
1255 if(str_utf8_decode(ptr: &cursor) == 0)
1256 {
1257 break;
1258 }
1259 if((size_t)(cursor - str) >= max_size)
1260 {
1261 break;
1262 }
1263 *size = cursor - str;
1264 ++(*count);
1265 }
1266}
1267
1268size_t str_utf8_offset_bytes_to_chars(const char *str, size_t byte_offset)
1269{
1270 size_t char_offset = 0;
1271 size_t current_offset = 0;
1272 while(current_offset < byte_offset)
1273 {
1274 const size_t prev_byte_offset = current_offset;
1275 current_offset = str_utf8_forward(str, cursor: current_offset);
1276 if(current_offset == prev_byte_offset)
1277 break;
1278 char_offset++;
1279 }
1280 return char_offset;
1281}
1282
1283size_t str_utf8_offset_chars_to_bytes(const char *str, size_t char_offset)
1284{
1285 size_t byte_offset = 0;
1286 for(size_t i = 0; i < char_offset; i++)
1287 {
1288 const size_t prev_byte_offset = byte_offset;
1289 byte_offset = str_utf8_forward(str, cursor: byte_offset);
1290 if(byte_offset == prev_byte_offset)
1291 break;
1292 }
1293 return byte_offset;
1294}
1295
1296int str_utf8_dist(const char *a, const char *b)
1297{
1298 int buf_len = 2 * (str_length(str: a) + 1 + str_length(str: b) + 1);
1299 int *buf = (int *)calloc(nmemb: buf_len, size: sizeof(*buf));
1300 int result = str_utf8_dist_buffer(a, b, buf, buf_len);
1301 free(ptr: buf);
1302 return result;
1303}
1304
1305static int str_to_utf32_unchecked(const char *str, int **out)
1306{
1307 int out_len = 0;
1308 while((**out = str_utf8_decode(ptr: &str)))
1309 {
1310 (*out)++;
1311 out_len++;
1312 }
1313 return out_len;
1314}
1315
1316int str_utf8_dist_buffer(const char *a_utf8, const char *b_utf8, int *buf, int buf_len)
1317{
1318 int a_utf8_len = str_length(str: a_utf8);
1319 int b_utf8_len = str_length(str: b_utf8);
1320 int *a, *b; // UTF-32
1321 int a_len, b_len; // UTF-32 length
1322 dbg_assert(buf_len >= 2 * (a_utf8_len + 1 + b_utf8_len + 1), "buffer too small");
1323 if(a_utf8_len > b_utf8_len)
1324 {
1325 const char *tmp2 = a_utf8;
1326 a_utf8 = b_utf8;
1327 b_utf8 = tmp2;
1328 }
1329 a = buf;
1330 a_len = str_to_utf32_unchecked(str: a_utf8, out: &buf);
1331 b = buf;
1332 b_len = str_to_utf32_unchecked(str: b_utf8, out: &buf);
1333 return str_utf32_dist_buffer(a, a_len, b, b_len, buf, buf_len: buf_len - b_len - a_len);
1334}
1335
1336int str_utf32_dist_buffer(const int *a, int a_len, const int *b, int b_len, int *buf, int buf_len)
1337{
1338 int i, j;
1339 dbg_assert(buf_len >= (a_len + 1) + (b_len + 1), "buffer too small");
1340 if(a_len > b_len)
1341 {
1342 int tmp1 = a_len;
1343 const int *tmp2 = a;
1344
1345 a_len = b_len;
1346 a = b;
1347
1348 b_len = tmp1;
1349 b = tmp2;
1350 }
1351#define B(i, j) buf[((j) & 1) * (a_len + 1) + (i)]
1352 for(i = 0; i <= a_len; i++)
1353 {
1354 B(i, 0) = i;
1355 }
1356 for(j = 1; j <= b_len; j++)
1357 {
1358 B(0, j) = j;
1359 for(i = 1; i <= a_len; i++)
1360 {
1361 int subst = (a[i - 1] != b[j - 1]);
1362 B(i, j) = minimum(
1363 B(i - 1, j) + 1,
1364 B(i, j - 1) + 1,
1365 B(i - 1, j - 1) + subst);
1366 }
1367 }
1368 return B(a_len, b_len);
1369#undef B
1370}
1371