#include <base/str.h>
#include <base/system.h>

#include <cerrno>
#include <climits>
#include <cstring>
5
6int str_copy(char *dst, const char *src, int dst_size)
7{
8 dst[0] = '\0';
9 strncat(dest: dst, src: src, n: dst_size - 1);
10 return str_utf8_fix_truncation(str: dst);
11}
12
13void str_append(char *dst, const char *src, int dst_size)
14{
15 int s = str_length(str: dst);
16 int i = 0;
17 while(s < dst_size)
18 {
19 dst[s] = src[i];
20 if(!src[i]) /* check for null termination */
21 break;
22 s++;
23 i++;
24 }
25
26 dst[dst_size - 1] = 0; /* assure null termination */
27 str_utf8_fix_truncation(str: dst);
28}
29
30void str_truncate(char *dst, int dst_size, const char *src, int truncation_len)
31{
32 int size = dst_size;
33 if(truncation_len < size)
34 {
35 size = truncation_len + 1;
36 }
37 str_copy(dst, src, dst_size: size);
38}
39
/* Returns the number of bytes before the terminator of str, as an int. */
int str_length(const char *str)
{
	const size_t byte_count = strlen(str);
	return (int)byte_count;
}
44
/* Maps 'a'..'z' to 'A'..'Z'; every other character is returned unchanged. */
char str_uppercase(char c)
{
	if(c < 'a' || c > 'z')
		return c;
	return 'A' + (c - 'a');
}
51
/* Returns true when c is one of the decimal digits '0'..'9'. */
bool str_isnum(char c)
{
	if(c < '0')
		return false;
	return c <= '9';
}
56
57int str_isallnum(const char *str)
58{
59 while(*str)
60 {
61 if(!str_isnum(c: *str))
62 return 0;
63 str++;
64 }
65 return 1;
66}
67
68int str_isallnum_hex(const char *str)
69{
70 while(*str)
71 {
72 if(!str_isnum(c: *str) && !(*str >= 'a' && *str <= 'f') && !(*str >= 'A' && *str <= 'F'))
73 return 0;
74 str++;
75 }
76 return 1;
77}
78
/* Returns 1 for space, newline, carriage return and tab; 0 otherwise. */
int str_isspace(char c)
{
	switch(c)
	{
	case ' ':
	case '\n':
	case '\r':
	case '\t':
		return 1;
	default:
		return 0;
	}
}
83
84const char *str_trim_words(const char *str, int words)
85{
86 while(*str && str_isspace(c: *str))
87 str++;
88 while(words && *str)
89 {
90 if(str_isspace(c: *str) && !str_isspace(c: *(str + 1)))
91 words--;
92 str++;
93 }
94 return str;
95}
96
/* Returns true when str contains at least one byte with a value below 32. */
bool str_has_cc(const char *str)
{
	for(const unsigned char *cursor = (const unsigned char *)str; *cursor != 0; ++cursor)
	{
		if(*cursor < 32)
			return true;
	}
	return false;
}
110
/*
 * Replaces every byte below 32 with a space, so that afterwards the string
 * only contains byte values in the range 32..255.
 */
void str_sanitize_cc(char *str_in)
{
	for(unsigned char *cursor = (unsigned char *)str_in; *cursor != 0; ++cursor)
	{
		if(*cursor >= 32)
			continue;
		*cursor = ' ';
	}
}
122
/*
 * Replaces every byte below 32 with a space, except for carriage return,
 * newline and tab, which are kept as they are.
 */
void str_sanitize(char *str_in)
{
	for(unsigned char *cursor = (unsigned char *)str_in; *cursor != 0; ++cursor)
	{
		const unsigned char c = *cursor;
		if(c >= 32 || c == '\r' || c == '\n' || c == '\t')
			continue;
		*cursor = ' ';
	}
}
134
/*
 * Replaces bytes that are not wanted in filenames with a space: everything
 * up to 0x1F, 0x7F, and the characters \ / | : * ? < > ".
 */
void str_sanitize_filename(char *str_in)
{
	for(unsigned char *cursor = (unsigned char *)str_in; *cursor != 0; ++cursor)
	{
		switch(*cursor)
		{
		case 0x7F:
		case '\\':
		case '/':
		case '|':
		case ':':
		case '*':
		case '?':
		case '<':
		case '>':
		case '"':
			*cursor = ' ';
			break;
		default:
			if(*cursor <= 0x1F)
				*cursor = ' ';
			break;
		}
	}
}
148
149bool str_valid_filename(const char *str)
150{
151 // References:
152 // - https://en.wikipedia.org/w/index.php?title=Filename&oldid=1281340521#Comparison_of_filename_limitations
153 // - https://learn.microsoft.com/en-us/windows/win32/fileio/naming-a-file (last update 2024-08-28)
154 if(str[0] == '\0')
155 {
156 return false; // empty name not allowed
157 }
158
159 bool prev_space = false;
160 bool prev_period = false;
161 bool first_space_checked = false;
162 const char *iterator = str;
163 while(*iterator)
164 {
165 const int code = str_utf8_decode(ptr: &iterator);
166 if(code <= 0x1F || code == 0x7F || code == '\\' || code == '/' || code == '|' || code == ':' ||
167 code == '*' || code == '?' || code == '<' || code == '>' || code == '"')
168 {
169 return false; // disallowed characters, mostly for Windows
170 }
171 else if(str_utf8_isspace(code) && code != ' ')
172 {
173 return false; // we only allow regular space characters
174 }
175 if(code == ' ')
176 {
177 if(!first_space_checked)
178 {
179 return false; // leading spaces not allowed
180 }
181 if(prev_space)
182 {
183 return false; // multiple consecutive spaces not allowed
184 }
185 prev_space = true;
186 prev_period = false;
187 }
188 else
189 {
190 prev_space = false;
191 prev_period = code == '.';
192 first_space_checked = true;
193 }
194 }
195 if(prev_space || prev_period)
196 {
197 return false; // trailing spaces and periods not allowed
198 }
199
200 static constexpr const char *RESERVED_NAMES[] = {
201 "CON", "PRN", "AUX", "NUL",
202 "COM0", "COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7", "COM8", "COM9", "COM¹", "COM²", "COM³",
203 "LPT0", "LPT1", "LPT2", "LPT3", "LPT4", "LPT5", "LPT6", "LPT7", "LPT8", "LPT9", "LPT¹", "LPT²", "LPT³"};
204 for(const char *reserved_name : RESERVED_NAMES)
205 {
206 const char *prefix = str_startswith_nocase(str, prefix: reserved_name);
207 if(prefix != nullptr && (prefix[0] == '\0' || prefix[0] == '.'))
208 {
209 return false; // reserved name not allowed when it makes up the entire filename or when followed by period
210 }
211 }
212
213 return true;
214}
215
216int str_comp_filenames(const char *a, const char *b)
217{
218 int result;
219
220 for(; *a && *b; ++a, ++b)
221 {
222 if(str_isnum(c: *a) && str_isnum(c: *b))
223 {
224 result = 0;
225 do
226 {
227 if(!result)
228 result = *a - *b;
229 ++a;
230 ++b;
231 } while(str_isnum(c: *a) && str_isnum(c: *b));
232
233 if(str_isnum(c: *a))
234 return 1;
235 else if(str_isnum(c: *b))
236 return -1;
237 else if(result || *a == '\0' || *b == '\0')
238 return result;
239 }
240
241 result = tolower(c: *a) - tolower(c: *b);
242 if(result)
243 return result;
244 }
245 return *a - *b;
246}
247
/*
 * Normalizes spaces in place: leading and trailing spaces are removed and
 * every inner run of spaces is collapsed to a single space. Only the
 * character ' ' is treated as whitespace here.
 */
void str_clean_whitespaces(char *str)
{
	const char *src = str;
	char *dst = str;

	// Drop leading spaces.
	while(*src == ' ')
		++src;

	while(*src != '\0')
	{
		if(*src != ' ')
		{
			*dst++ = *src++;
			continue;
		}

		// Skip the whole run; keep one space only if more text follows.
		do
		{
			++src;
		} while(*src == ' ');
		if(*src != '\0')
			*dst++ = ' ';
	}
	*dst = '\0';
}
278
279char *str_skip_to_whitespace(char *str)
280{
281 while(*str && !str_isspace(c: *str))
282 str++;
283 return str;
284}
285
286const char *str_skip_to_whitespace_const(const char *str)
287{
288 while(*str && !str_isspace(c: *str))
289 str++;
290 return str;
291}
292
293char *str_skip_whitespaces(char *str)
294{
295 while(*str && str_isspace(c: *str))
296 str++;
297 return str;
298}
299
300const char *str_skip_whitespaces_const(const char *str)
301{
302 while(*str && str_isspace(c: *str))
303 str++;
304 return str;
305}
306
307/* case */
/*
 * Case-insensitive comparison of two strings using the platform's C library
 * routine. Returns zero when equal, otherwise a negative or positive value.
 */
int str_comp_nocase(const char *a, const char *b)
{
	// The Windows CRT names this routine differently from POSIX.
#if defined(CONF_FAMILY_WINDOWS)
	const int order = _stricmp(a, b);
#else
	const int order = strcasecmp(a, b);
#endif
	return order;
}
316
/*
 * Case-insensitive comparison of at most num bytes of two strings using the
 * platform's C library routine.
 */
int str_comp_nocase_num(const char *a, const char *b, int num)
{
	// The Windows CRT names this routine differently from POSIX.
#if defined(CONF_FAMILY_WINDOWS)
	const int order = _strnicmp(a, b, num);
#else
	const int order = strncasecmp(a, b, num);
#endif
	return order;
}
325
/* Case-sensitive comparison of two strings; thin wrapper around strcmp. */
int str_comp(const char *a, const char *b)
{
	const int order = strcmp(a, b);
	return order;
}
330
/* Case-sensitive comparison of at most num bytes; thin wrapper around strncmp. */
int str_comp_num(const char *a, const char *b, int num)
{
	const int order = strncmp(a, b, num);
	return order;
}
335
336const char *str_startswith_nocase(const char *str, const char *prefix)
337{
338 int prefixl = str_length(str: prefix);
339 if(str_comp_nocase_num(a: str, b: prefix, num: prefixl) == 0)
340 {
341 return str + prefixl;
342 }
343 else
344 {
345 return nullptr;
346 }
347}
348
349const char *str_startswith(const char *str, const char *prefix)
350{
351 int prefixl = str_length(str: prefix);
352 if(str_comp_num(a: str, b: prefix, num: prefixl) == 0)
353 {
354 return str + prefixl;
355 }
356 else
357 {
358 return nullptr;
359 }
360}
361
362const char *str_endswith_nocase(const char *str, const char *suffix)
363{
364 int strl = str_length(str);
365 int suffixl = str_length(str: suffix);
366 const char *strsuffix;
367 if(strl < suffixl)
368 {
369 return nullptr;
370 }
371 strsuffix = str + strl - suffixl;
372 if(str_comp_nocase(a: strsuffix, b: suffix) == 0)
373 {
374 return strsuffix;
375 }
376 else
377 {
378 return nullptr;
379 }
380}
381
382const char *str_endswith(const char *str, const char *suffix)
383{
384 int strl = str_length(str);
385 int suffixl = str_length(str: suffix);
386 const char *strsuffix;
387 if(strl < suffixl)
388 {
389 return nullptr;
390 }
391 strsuffix = str + strl - suffixl;
392 if(str_comp(a: strsuffix, b: suffix) == 0)
393 {
394 return strsuffix;
395 }
396 else
397 {
398 return nullptr;
399 }
400}
401
/*
 * Finds the first occurrence of needle in haystack, comparing bytes
 * case-insensitively via tolower. Returns a pointer to the match inside
 * haystack or nullptr. An empty haystack never matches, even for an empty
 * needle.
 */
const char *str_find_nocase(const char *haystack, const char *needle)
{
	for(const char *start = haystack; *start != '\0'; ++start)
	{
		const char *hay = start;
		const char *pat = needle;
		for(; *hay != '\0' && *pat != '\0'; ++hay, ++pat)
		{
			if(tolower((unsigned char)*hay) != tolower((unsigned char)*pat))
				break;
		}
		// The whole needle was consumed, so it matches at this position.
		if(*pat == '\0')
			return start;
	}

	return nullptr;
}
420
/*
 * Finds the first occurrence of needle in haystack (case-sensitive). Returns
 * a pointer to the match inside haystack or nullptr. An empty haystack never
 * matches, even for an empty needle.
 */
const char *str_find(const char *haystack, const char *needle)
{
	for(const char *start = haystack; *start != '\0'; ++start)
	{
		const char *hay = start;
		const char *pat = needle;
		for(; *hay != '\0' && *pat != '\0'; ++hay, ++pat)
		{
			if(*hay != *pat)
				break;
		}
		// The whole needle was consumed, so it matches at this position.
		if(*pat == '\0')
			return start;
	}

	return nullptr;
}
439
/*
 * Locates the next token in str. Returns a pointer to its first byte and
 * stores its byte length in *length, or returns nullptr (leaving *length
 * untouched) when the string ends before a token starts.
 */
static const char *str_token_get(const char *str, const char *delim, int *length)
{
	const size_t leading_delims = strspn(str, delim);
	// With two or more delimiters in a row only one is stepped over, so the
	// position after it is reported as a token of length zero.
	str += leading_delims > 1 ? 1 : leading_delims;
	if(*str == '\0')
		return nullptr;

	*length = strcspn(str, delim);
	return str;
}
453
454const char *str_next_token(const char *str, const char *delim, char *buffer, int buffer_size)
455{
456 int len = 0;
457 const char *tok = str_token_get(str, delim, length: &len);
458 if(len < 0 || tok == nullptr)
459 {
460 buffer[0] = '\0';
461 return nullptr;
462 }
463
464 len = buffer_size > len ? len : buffer_size - 1;
465 mem_copy(dest: buffer, source: tok, size: len);
466 buffer[len] = '\0';
467
468 return tok + len;
469}
470
471int str_in_list(const char *list, const char *delim, const char *needle)
472{
473 const char *tok = list;
474 int len = 0, notfound = 1, needlelen = str_length(str: needle);
475
476 while(notfound && (tok = str_token_get(str: tok, delim, length: &len)))
477 {
478 notfound = needlelen != len || str_comp_num(a: tok, b: needle, num: len);
479 tok = tok + len;
480 }
481
482 return !notfound;
483}
484
485bool str_delimiters_around_offset(const char *haystack, const char *delim, int offset, int *start, int *end)
486{
487 bool found = true;
488 const char *search = haystack;
489 const int delim_len = str_length(str: delim);
490 *start = 0;
491 while(str_find(haystack: search, needle: delim))
492 {
493 const char *test = str_find(haystack: search, needle: delim) + delim_len;
494 int distance = test - haystack;
495 if(distance > offset)
496 break;
497
498 *start = distance;
499 search = test;
500 }
501 if(search == haystack)
502 found = false;
503
504 if(str_find(haystack: search, needle: delim))
505 {
506 *end = str_find(haystack: search, needle: delim) - haystack;
507 }
508 else
509 {
510 *end = str_length(str: haystack);
511 found = false;
512 }
513
514 return found;
515}
516
/*
 * Returns a pointer to the last occurrence of needle in haystack, or a null
 * pointer when it does not occur; thin wrapper around strrchr.
 */
const char *str_rchr(const char *haystack, char needle)
{
	const char *last_hit = strrchr(haystack, needle);
	return last_hit;
}
521
/* Counts how many bytes of haystack are equal to needle. */
int str_countchr(const char *haystack, char needle)
{
	int hits = 0;
	for(const char *cursor = haystack; *cursor != '\0'; ++cursor)
	{
		if(*cursor == needle)
			++hits;
	}
	return hits;
}
533
/*
 * Writes data as uppercase hexadecimal into dst, two digits plus one space
 * per byte ("01 AB "), followed by a terminator. Stops early when fewer than
 * four bytes would remain in dst for the next group and the terminator.
 */
void str_hex(char *dst, int dst_size, const void *data, int data_size)
{
	static const char hex[] = "0123456789ABCDEF";
	const unsigned char *bytes = (const unsigned char *)data;
	int out = 0;
	for(int i = 0; i < data_size && out < dst_size - 3; ++i)
	{
		const unsigned char byte = bytes[i];
		dst[out++] = hex[byte >> 4];
		dst[out++] = hex[byte & 0xf];
		dst[out++] = ' ';
	}
	dst[out] = '\0';
}
548
/*
 * Writes data as a list of C-style hex literals ("0x01, 0x02,\n0x03") into
 * dst. After every bytes_per_line values the separator is a newline instead
 * of a space. The trailing comma and separator of the last value are
 * removed. Stops early when dst has no room for another six-byte group.
 */
void str_hex_cstyle(char *dst, int dst_size, const void *data, int data_size, int bytes_per_line)
{
	static const char hex[] = "0123456789ABCDEF";
	const unsigned char *bytes = (const unsigned char *)data;
	int until_newline = bytes_per_line;
	int out = 0;
	for(int i = 0; i < data_size && out < dst_size - 6; ++i)
	{
		const unsigned char byte = bytes[i];
		--until_newline;
		const bool end_of_line = until_newline == 0;
		if(end_of_line)
			until_newline = bytes_per_line;

		dst[out++] = '0';
		dst[out++] = 'x';
		dst[out++] = hex[byte >> 4];
		dst[out++] = hex[byte & 0xf];
		dst[out++] = ',';
		dst[out++] = end_of_line ? '\n' : ' ';
	}
	dst[out] = '\0';
	// Remove trailing comma and space/newline
	for(int back = 1; back <= 2; ++back)
	{
		if(out >= back)
			dst[out - back] = '\0';
	}
}
581
/*
 * Returns the value 0..15 of the hexadecimal digit x (either case), or -1
 * when x is not a hexadecimal digit.
 */
static int hexval(char x)
{
	if(x >= '0' && x <= '9')
		return x - '0';
	if(x >= 'a' && x <= 'f')
		return x - 'a' + 10;
	if(x >= 'A' && x <= 'F')
		return x - 'A' + 10;
	return -1;
}
611
612static int byteval(const char *hex, unsigned char *dst)
613{
614 int v1 = hexval(x: hex[0]);
615 int v2 = hexval(x: hex[1]);
616
617 if(v1 < 0 || v2 < 0)
618 return 1;
619
620 *dst = v1 * 16 + v2;
621 return 0;
622}
623
624int str_hex_decode(void *dst, int dst_size, const char *src)
625{
626 unsigned char *cdst = (unsigned char *)dst;
627 int slen = str_length(str: src);
628 int len = slen / 2;
629 int i;
630 if(slen != dst_size * 2)
631 return 2;
632
633 for(i = 0; i < len && dst_size; i++, dst_size--)
634 {
635 if(byteval(hex: src + i * 2, dst: cdst++))
636 return 1;
637 }
638 return 0;
639}
640
/*
 * Encodes data_size bytes of data_raw as Base64 (with '=' padding) into dst.
 * The output is always terminated; when dst is too small the encoded text is
 * cut off after dst_size - 1 characters.
 */
void str_base64(char *dst, int dst_size, const void *data_raw, int data_size)
{
	static const char DIGITS[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

	const unsigned char *bytes = (const unsigned char *)data_raw;
	const int capacity = dst_size - 1; // characters available before the terminator
	unsigned accumulator = 0;
	int pending_bits = 0;
	int in_pos = 0;
	int out_pos = 0;

	dst[capacity] = '\0';
	for(;;)
	{
		// Refill: pull in another input byte once fewer than 6 bits are pending.
		if(pending_bits < 6 && in_pos < data_size)
		{
			accumulator = (accumulator << 8) | bytes[in_pos++];
			pending_bits += 8;
		}
		if(out_pos == capacity)
			return;

		if(pending_bits > 0)
		{
			// Take the top 6 pending bits; a final partial group is
			// shifted up so that its low bits are zero.
			const unsigned shifted = pending_bits >= 6 ? accumulator >> (pending_bits - 6) : accumulator << (6 - pending_bits);
			dst[out_pos++] = DIGITS[shifted & 0x3f];
			pending_bits -= 6;
			continue;
		}

		// All input consumed: pad to a multiple of four, then terminate.
		if(out_pos % 4 == 0)
		{
			dst[out_pos] = '\0';
			return;
		}
		dst[out_pos++] = '=';
	}
}
692
/*
 * Returns the 6-bit value (0..63) of a Base64 digit from the alphabet
 * A-Z, a-z, 0-9, '+', '/', or -1 for any other character.
 */
static int base64_digit_value(char digit)
{
	if(digit >= 'A' && digit <= 'Z')
		return digit - 'A';
	if(digit >= 'a' && digit <= 'z')
		return 26 + (digit - 'a');
	if(digit >= '0' && digit <= '9')
		return 52 + (digit - '0');

	switch(digit)
	{
	case '+':
		return 62;
	case '/':
		return 63;
	default:
		return -1;
	}
}
717
718int str_base64_decode(void *dst_raw, int dst_size, const char *data)
719{
720 unsigned char *dst = (unsigned char *)dst_raw;
721 int data_len = str_length(str: data);
722
723 int i;
724 int o = 0;
725
726 if(data_len % 4 != 0)
727 {
728 return -3;
729 }
730 if(data_len / 4 * 3 > dst_size)
731 {
732 // Output buffer too small.
733 return -2;
734 }
735 for(i = 0; i < data_len; i += 4)
736 {
737 int num_output_bytes = 3;
738 char copy[4];
739 int d[4];
740 int value;
741 int b;
742 mem_copy(dest: copy, source: data + i, size: sizeof(copy));
743 if(i == data_len - 4)
744 {
745 if(copy[3] == '=')
746 {
747 copy[3] = 'A';
748 num_output_bytes = 2;
749 if(copy[2] == '=')
750 {
751 copy[2] = 'A';
752 num_output_bytes = 1;
753 }
754 }
755 }
756 d[0] = base64_digit_value(digit: copy[0]);
757 d[1] = base64_digit_value(digit: copy[1]);
758 d[2] = base64_digit_value(digit: copy[2]);
759 d[3] = base64_digit_value(digit: copy[3]);
760 if(d[0] == -1 || d[1] == -1 || d[2] == -1 || d[3] == -1)
761 {
762 // Invalid digit.
763 return -1;
764 }
765 value = (d[0] << 18) | (d[1] << 12) | (d[2] << 6) | d[3];
766 for(b = 0; b < 3; b++)
767 {
768 unsigned char byte_value = (value >> (16 - 8 * b)) & 0xff;
769 if(b < num_output_bytes)
770 {
771 dst[o] = byte_value;
772 o += 1;
773 }
774 else
775 {
776 if(byte_value != 0)
777 {
778 // Padding not zeroed.
779 return -2;
780 }
781 }
782 }
783 }
784 return o;
785}
786
/*
 * Appends src at *dst, prefixing every '"' and '\' with a backslash. Writing
 * stops before end so that the terminator always fits; an escaped character
 * is never split. On return *dst points at the terminator that was written.
 */
void str_escape(char **dst, const char *src, const char *end)
{
	char *out = *dst;
	for(; *src != '\0' && out + 1 < end; ++src)
	{
		const bool needs_escape = *src == '"' || *src == '\\';
		if(needs_escape)
		{
			// Backslash, character and terminator must all still fit.
			if(out + 2 >= end)
				break;
			*out++ = '\\';
		}
		*out++ = *src;
	}
	*out = '\0';
	*dst = out;
}
802
803int str_toint(const char *str)
804{
805 return str_toint_base(str, base: 10);
806}
807
/*
 * Parses str as a base-10 integer with validation.
 *
 * Returns true and stores the value in *out (when out is not null) only if
 * the entire string is a number that fits into an int. Returns false,
 * leaving *out untouched, when str contains no digits (including the empty
 * string), has trailing characters, or is out of range. Leading whitespace
 * and a sign are accepted as by strtol. errno is reset as part of the range
 * check.
 */
bool str_toint(const char *str, int *out)
{
	char *end;
	errno = 0;
	const long value = strtol(str, &end, 10);
	// end == str means strtol did not consume anything, i.e. no conversion.
	if(end == str || *end != '\0')
		return false;
	// ERANGE covers platforms where long is as wide as int; the explicit
	// bounds cover platforms where long is wider.
	if(errno == ERANGE || value < INT_MIN || value > INT_MAX)
		return false;
	if(out != nullptr)
		*out = (int)value;
	return true;
}
819
/*
 * Parses str as an integer in the given base using strtol; the long result
 * is converted to int. No error reporting is done.
 */
int str_toint_base(const char *str, int base)
{
	const int parsed = strtol(str, nullptr, base);
	return parsed;
}
824
/*
 * Parses str as an unsigned long in the given base using strtoul. No error
 * reporting is done.
 */
unsigned long str_toulong_base(const char *str, int base)
{
	const unsigned long parsed = strtoul(str, nullptr, base);
	return parsed;
}
829
/*
 * Parses str as a 64-bit integer in the given base using strtoll. No error
 * reporting is done.
 */
int64_t str_toint64_base(const char *str, int base)
{
	const int64_t parsed = strtoll(str, nullptr, base);
	return parsed;
}
834
/*
 * Parses str as a floating point number using strtod; the double result is
 * converted to float. No error reporting is done.
 */
float str_tofloat(const char *str)
{
	const float parsed = strtod(str, nullptr);
	return parsed;
}
839
/*
 * Parses str as a floating point number with validation.
 *
 * Returns true and stores the value in *out (when out is not null) only if
 * the entire string was converted. Returns false, leaving *out untouched,
 * when nothing could be converted (including the empty string) or when
 * trailing characters follow the number.
 */
bool str_tofloat(const char *str, float *out)
{
	char *end;
	const float value = strtod(str, &end);
	// end == str means strtod did not consume anything, i.e. no conversion.
	if(end == str || *end != '\0')
		return false;
	if(out != nullptr)
		*out = value;
	return true;
}
851
/*
 * Computes a simple multiplicative hash of str: starting from 5381, each
 * character updates the hash as hash * 33 + c (unsigned wrap-around).
 */
unsigned str_quickhash(const char *str)
{
	unsigned hash = 5381;
	while(*str != '\0')
	{
		hash = hash * 33 + (*str);
		++str;
	}
	return hash;
}
859
/*
 * Encodes the code point chr as UTF-8 into ptr (no terminator is written).
 *
 * Returns the number of bytes written (1 to 4), or 0 when chr cannot be
 * encoded because it is negative or greater than 0x10FFFF; nothing is
 * written in that case. ptr must have room for up to 4 bytes.
 */
int str_utf8_encode(char *ptr, int chr)
{
	// Negative values (e.g. a decoder error code) are not code points. Without
	// this check they would pass the one-byte test and emit a garbage byte.
	if(chr < 0)
	{
		return 0;
	}

	if(chr <= 0x7F)
	{
		ptr[0] = (char)chr;
		return 1;
	}
	else if(chr <= 0x7FF)
	{
		ptr[0] = 0xC0 | ((chr >> 6) & 0x1F);
		ptr[1] = 0x80 | (chr & 0x3F);
		return 2;
	}
	else if(chr <= 0xFFFF)
	{
		ptr[0] = 0xE0 | ((chr >> 12) & 0x0F);
		ptr[1] = 0x80 | ((chr >> 6) & 0x3F);
		ptr[2] = 0x80 | (chr & 0x3F);
		return 3;
	}
	else if(chr <= 0x10FFFF)
	{
		ptr[0] = 0xF0 | ((chr >> 18) & 0x07);
		ptr[1] = 0x80 | ((chr >> 12) & 0x3F);
		ptr[2] = 0x80 | ((chr >> 6) & 0x3F);
		ptr[3] = 0x80 | (chr & 0x3F);
		return 4;
	}

	return 0;
}
892
/* Returns the byte *ptr points at and advances *ptr by one. */
static unsigned char str_byte_next(const char **ptr)
{
	const char *current = *ptr;
	*ptr = current + 1;
	return (unsigned char)*current;
}
899
/* Moves *ptr back by one byte, undoing one str_byte_next. */
static void str_byte_rewind(const char **ptr)
{
	*ptr = *ptr - 1;
}
904
905int str_utf8_decode(const char **ptr)
906{
907 // As per https://encoding.spec.whatwg.org/#utf-8-decoder.
908 unsigned char utf8_lower_boundary = 0x80;
909 unsigned char utf8_upper_boundary = 0xBF;
910 int utf8_code_point = 0;
911 int utf8_bytes_seen = 0;
912 int utf8_bytes_needed = 0;
913 while(true)
914 {
915 unsigned char byte_value = str_byte_next(ptr);
916 if(utf8_bytes_needed == 0)
917 {
918 if(byte_value <= 0x7F)
919 {
920 return byte_value;
921 }
922 else if(0xC2 <= byte_value && byte_value <= 0xDF)
923 {
924 utf8_bytes_needed = 1;
925 utf8_code_point = byte_value - 0xC0;
926 }
927 else if(0xE0 <= byte_value && byte_value <= 0xEF)
928 {
929 if(byte_value == 0xE0)
930 utf8_lower_boundary = 0xA0;
931 if(byte_value == 0xED)
932 utf8_upper_boundary = 0x9F;
933 utf8_bytes_needed = 2;
934 utf8_code_point = byte_value - 0xE0;
935 }
936 else if(0xF0 <= byte_value && byte_value <= 0xF4)
937 {
938 if(byte_value == 0xF0)
939 utf8_lower_boundary = 0x90;
940 if(byte_value == 0xF4)
941 utf8_upper_boundary = 0x8F;
942 utf8_bytes_needed = 3;
943 utf8_code_point = byte_value - 0xF0;
944 }
945 else
946 {
947 return -1; // Error.
948 }
949 utf8_code_point = utf8_code_point << (6 * utf8_bytes_needed);
950 continue;
951 }
952 if(!(utf8_lower_boundary <= byte_value && byte_value <= utf8_upper_boundary))
953 {
954 // Resetting variables not necessary, will be done when
955 // the function is called again.
956 str_byte_rewind(ptr);
957 return -1;
958 }
959 utf8_lower_boundary = 0x80;
960 utf8_upper_boundary = 0xBF;
961 utf8_bytes_seen += 1;
962 utf8_code_point = utf8_code_point + ((byte_value - 0x80) << (6 * (utf8_bytes_needed - utf8_bytes_seen)));
963 if(utf8_bytes_seen != utf8_bytes_needed)
964 {
965 continue;
966 }
967 // Resetting variables not necessary, see above.
968 return utf8_code_point;
969 }
970}
971
972void str_utf8_truncate(char *dst, int dst_size, const char *src, int truncation_len)
973{
974 int size = -1;
975 const char *cursor = src;
976 int pos = 0;
977 while(pos <= truncation_len && cursor - src < dst_size && size != cursor - src)
978 {
979 size = cursor - src;
980 if(str_utf8_decode(ptr: &cursor) == 0)
981 {
982 break;
983 }
984 pos++;
985 }
986 str_copy(dst, src, dst_size: size + 1);
987}
988
989int str_utf8_fix_truncation(char *str)
990{
991 int len = str_length(str);
992 if(len > 0)
993 {
994 int last_char_index = str_utf8_rewind(str, cursor: len);
995 const char *last_char = str + last_char_index;
996 // Fix truncated UTF-8.
997 if(str_utf8_decode(ptr: &last_char) == -1)
998 {
999 str[last_char_index] = 0;
1000 return last_char_index;
1001 }
1002 }
1003 return len;
1004}
1005
1006void str_utf8_trim_right(char *param)
1007{
1008 const char *str = param;
1009 char *end = nullptr;
1010 while(*str)
1011 {
1012 char *str_old = (char *)str;
1013 int code = str_utf8_decode(ptr: &str);
1014
1015 // check if unicode is not empty
1016 if(!str_utf8_isspace(code))
1017 {
1018 end = nullptr;
1019 }
1020 else if(!end)
1021 {
1022 end = str_old;
1023 }
1024 }
1025 if(end)
1026 {
1027 *end = 0;
1028 }
1029}
1030
1031void str_utf8_tolower(const char *input, char *output, size_t size)
1032{
1033 size_t out_pos = 0;
1034 while(*input)
1035 {
1036 const int code = str_utf8_tolower_codepoint(code: str_utf8_decode(ptr: &input));
1037 char encoded_code[4];
1038 const int code_size = str_utf8_encode(ptr: encoded_code, chr: code);
1039 if(out_pos + code_size + 1 > size) // +1 for null termination
1040 {
1041 break;
1042 }
1043 mem_copy(dest: &output[out_pos], source: encoded_code, size: code_size);
1044 out_pos += code_size;
1045 }
1046 output[out_pos] = '\0';
1047}
1048
/*
 * Returns 1 when code is treated as whitespace: everything up to 0x20
 * (including negative values) plus a fixed list of further code points and
 * ranges; otherwise 0.
 */
int str_utf8_isspace(int code)
{
	if(code <= 0x0020)
		return 1;

	// Individual code points.
	switch(code)
	{
	case 0x0085:
	case 0x00A0:
	case 0x034F:
	case 0x115F:
	case 0x1160:
	case 0x1680:
	case 0x180E:
	case 0x2800:
	case 0x3000:
	case 0x3164:
	case 0xFEFF:
	case 0xFFA0:
		return 1;
	default:
		break;
	}

	// Contiguous ranges.
	if(code >= 0x2000 && code <= 0x200F)
		return 1;
	if(code >= 0x2028 && code <= 0x202F)
		return 1;
	if(code >= 0x205F && code <= 0x2064)
		return 1;
	if(code >= 0x206A && code <= 0x206F)
		return 1;
	if(code >= 0xFE00 && code <= 0xFE0F)
		return 1;
	if(code >= 0xFFF9 && code <= 0xFFFC)
		return 1;
	return 0;
}
1059
/*
 * Returns 0 when c is a UTF-8 continuation byte (bit pattern 10xxxxxx) and
 * 1 for every other byte.
 */
int str_utf8_isstart(char c)
{
	const int top_two_bits = c & 0xC0;
	return top_two_bits == 0x80 ? 0 : 1;
}
1066
1067int str_utf8_rewind(const char *str, int cursor)
1068{
1069 while(cursor)
1070 {
1071 cursor--;
1072 if(str_utf8_isstart(c: *(str + cursor)))
1073 break;
1074 }
1075 return cursor;
1076}
1077
1078const char *str_utf8_find_nocase(const char *haystack, const char *needle, const char **end)
1079{
1080 while(*haystack) /* native implementation */
1081 {
1082 const char *a = haystack;
1083 const char *b = needle;
1084 const char *a_next = a;
1085 const char *b_next = b;
1086 while(*a && *b && str_utf8_tolower_codepoint(code: str_utf8_decode(ptr: &a_next)) == str_utf8_tolower_codepoint(code: str_utf8_decode(ptr: &b_next)))
1087 {
1088 a = a_next;
1089 b = b_next;
1090 }
1091 if(!(*b))
1092 {
1093 if(end != nullptr)
1094 *end = a_next;
1095 return haystack;
1096 }
1097 str_utf8_decode(ptr: &haystack);
1098 }
1099
1100 if(end != nullptr)
1101 *end = nullptr;
1102 return nullptr;
1103}
1104
1105int str_utf8_comp_nocase(const char *a, const char *b)
1106{
1107 int code_a;
1108 int code_b;
1109
1110 while(*a && *b)
1111 {
1112 code_a = str_utf8_tolower_codepoint(code: str_utf8_decode(ptr: &a));
1113 code_b = str_utf8_tolower_codepoint(code: str_utf8_decode(ptr: &b));
1114
1115 if(code_a != code_b)
1116 return code_a - code_b;
1117 }
1118 return (unsigned char)*a - (unsigned char)*b;
1119}
1120
1121int str_utf8_comp_nocase_num(const char *a, const char *b, int num)
1122{
1123 int code_a;
1124 int code_b;
1125 const char *old_a = a;
1126
1127 if(num <= 0)
1128 return 0;
1129
1130 while(*a && *b)
1131 {
1132 code_a = str_utf8_tolower_codepoint(code: str_utf8_decode(ptr: &a));
1133 code_b = str_utf8_tolower_codepoint(code: str_utf8_decode(ptr: &b));
1134
1135 if(code_a != code_b)
1136 return code_a - code_b;
1137
1138 if(a - old_a >= num)
1139 return 0;
1140 }
1141
1142 return (unsigned char)*a - (unsigned char)*b;
1143}
1144
1145const char *str_utf8_skip_whitespaces(const char *str)
1146{
1147 const char *str_old;
1148 int code;
1149
1150 while(*str)
1151 {
1152 str_old = str;
1153 code = str_utf8_decode(ptr: &str);
1154
1155 // check if unicode is not empty
1156 if(!str_utf8_isspace(code))
1157 {
1158 return str_old;
1159 }
1160 }
1161
1162 return str;
1163}
1164
1165int str_utf8_forward(const char *str, int cursor)
1166{
1167 const char *ptr = str + cursor;
1168 if(str_utf8_decode(ptr: &ptr) == 0)
1169 {
1170 return cursor;
1171 }
1172 return ptr - str;
1173}
1174
1175int str_utf8_check(const char *str)
1176{
1177 int codepoint;
1178 while((codepoint = str_utf8_decode(ptr: &str)))
1179 {
1180 if(codepoint == -1)
1181 {
1182 return 0;
1183 }
1184 }
1185 return 1;
1186}
1187
1188void str_utf8_copy_num(char *dst, const char *src, int dst_size, int num)
1189{
1190 int new_cursor;
1191 int cursor = 0;
1192
1193 while(src[cursor] && num > 0)
1194 {
1195 new_cursor = str_utf8_forward(str: src, cursor);
1196 if(new_cursor >= dst_size) // reserve 1 byte for the null termination
1197 break;
1198 else
1199 cursor = new_cursor;
1200 --num;
1201 }
1202
1203 str_copy(dst, src, dst_size: cursor < dst_size ? cursor + 1 : dst_size);
1204}
1205
1206void str_utf8_stats(const char *str, size_t max_size, size_t max_count, size_t *size, size_t *count)
1207{
1208 const char *cursor = str;
1209 *size = 0;
1210 *count = 0;
1211 while(*size < max_size && *count < max_count)
1212 {
1213 if(str_utf8_decode(ptr: &cursor) == 0)
1214 {
1215 break;
1216 }
1217 if((size_t)(cursor - str) >= max_size)
1218 {
1219 break;
1220 }
1221 *size = cursor - str;
1222 ++(*count);
1223 }
1224}
1225
1226size_t str_utf8_offset_bytes_to_chars(const char *str, size_t byte_offset)
1227{
1228 size_t char_offset = 0;
1229 size_t current_offset = 0;
1230 while(current_offset < byte_offset)
1231 {
1232 const size_t prev_byte_offset = current_offset;
1233 current_offset = str_utf8_forward(str, cursor: current_offset);
1234 if(current_offset == prev_byte_offset)
1235 break;
1236 char_offset++;
1237 }
1238 return char_offset;
1239}
1240
1241size_t str_utf8_offset_chars_to_bytes(const char *str, size_t char_offset)
1242{
1243 size_t byte_offset = 0;
1244 for(size_t i = 0; i < char_offset; i++)
1245 {
1246 const size_t prev_byte_offset = byte_offset;
1247 byte_offset = str_utf8_forward(str, cursor: byte_offset);
1248 if(byte_offset == prev_byte_offset)
1249 break;
1250 }
1251 return byte_offset;
1252}
1253