/* Correction constants indexed by trailing-byte count (0-5). The decoders in
 * this file sum the raw bytes of a sequence (shifting left 6 bits between
 * bytes) and then subtract offsetsFromUTF8[n]; e.g. for n == 1 the lead marker
 * 0xC0 << 6 plus the continuation marker 0x80 gives 0x3080. */
30static const uint32_t offsetsFromUTF8[6] = {
31 0x00000000UL, 0x00003080UL, 0x000E2080UL,
32 0x03C82080UL, 0xFA082080UL, 0x82082080UL
/* Lookup from a byte value to the number of bytes that follow it in a
 * sequence: 0x00-0xBF -> 0, 0xC0-0xDF -> 1, 0xE0-0xEF -> 2, 0xF0-0xF7 -> 3,
 * 0xF8-0xFB -> 4, 0xFC-0xFF -> 5. Callers cast the byte to unsigned char
 * before using it as an index. */
35static const char trailingBytesForUTF8[256] = {
36 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
37 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
38 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
39 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
40 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
41 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
42 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
43 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
/* Returns the total length in bytes (lead byte plus trailing bytes) of the
 * sequence whose first byte is s[0], judged from that byte alone via
 * trailingBytesForUTF8. The result is in 1..6; bytes 0x00-0xBF (including
 * continuation bytes) map to 1. s[0] is read unconditionally. */
47size_t u8_seqlen(
const char *s)
49 return trailingBytesForUTF8[(
unsigned int)(
unsigned char)s[0]] + 1;
/* Classifies code point ch by magnitude; the result is a size_t that
 * u8_codingsize adds up per character.
 * NOTE(review): presumably the number of bytes needed to encode ch in UTF-8
 * (the 0x10000 and 0x110000 cut-offs match the 3- and 4-byte ranges) --
 * confirm the value returned for ch >= 0x110000. */
54size_t u8_charlen(uint32_t ch)
60 else if (ch < 0x10000)
62 else if (ch < 0x110000)
/* Accumulates u8_charlen() of the elements of wcstr into a running total c.
 * NOTE(review): presumably iterates i over [0, n) and returns c, i.e. the
 * number of bytes needed to encode the first n code points -- confirm. */
67size_t u8_codingsize(uint32_t *wcstr,
size_t n)
72 c += u8_charlen(wcstr[i]);
/* Decodes UTF-8 bytes from src into 32-bit values in dest.
 *   dest  - output array
 *   sz    - size limit for dest (checked for 0 below; presumably in elements)
 *   src   - input bytes
 *   srcsz - number of input bytes; src_end is derived from it
 * Each character is rebuilt by summing its bytes with 6-bit shifts and then
 * subtracting offsetsFromUTF8[nb]. */
84size_t u8_toucs(uint32_t *dest,
size_t sz,
const char *src,
size_t srcsz)
/* One past the last readable input byte. */
87 const char *src_end = src + srcsz;
/* Guard for an empty destination or empty input. */
91 if (sz == 0 || srcsz == 0)
/* Leave the loop once src has reached the end of the input. */
98 if (src >= src_end)
break;
/* nb = number of bytes that follow the lead byte at *src. */
101 nb = trailingBytesForUTF8[(
unsigned char)*src];
/* True when the last byte of this sequence (src[nb]) would lie at or beyond
 * src_end, i.e. the input ends in the middle of a character. */
102 if (src + nb >= src_end)
/* NOTE(review): the case labels look designed to fall through so that
 * exactly nb + 1 bytes are folded into ch -- confirm no break intervenes. */
107 case 5: ch += (
unsigned char)*src++; ch <<= 6;
109 case 4: ch += (
unsigned char)*src++; ch <<= 6;
111 case 3: ch += (
unsigned char)*src++; ch <<= 6;
113 case 2: ch += (
unsigned char)*src++; ch <<= 6;
115 case 1: ch += (
unsigned char)*src++; ch <<= 6;
/* Final byte: added without a following shift. */
117 case 0: ch += (
unsigned char)*src++;
/* Remove the accumulated lead/continuation marker bits. */
119 ch -= offsetsFromUTF8[nb];
/* Encodes 32-bit code points from src as UTF-8 bytes into dest.
 *   dest  - output buffer; dest_end = dest + sz marks its end
 *   sz    - capacity of dest in bytes
 *   src   - input code points
 *   srcsz - size of src (NOTE(review): presumably an element count -- confirm)
 * Each branch first checks that enough room remains before dest_end, then
 * writes the lead byte followed by continuation bytes carrying 6 bits each.
 * NOTE(review): dest_end-1/-2/-3 is evaluated even when sz is smaller than
 * the subtrahend, which may form a pointer before the buffer; comparing the
 * remaining length (dest_end - dest) would avoid that. */
131size_t u8_toutf8(
char *dest,
size_t sz,
const uint32_t *src,
size_t srcsz)
/* One past the last writable byte of dest. */
136 char *dest_end = dest + sz;
/* No room left at all. */
141 if (dest >= dest_end)
145 else if (ch < 0x800) {
/* 2-byte form: fewer than two bytes left fails this check. */
146 if (dest >= dest_end-1)
/* 110xxxxx 10xxxxxx */
148 *dest++ = (ch>>6) | 0xC0;
149 *dest++ = (ch & 0x3F) | 0x80;
151 else if (ch < 0x10000) {
/* 3-byte form: fewer than three bytes left fails this check. */
152 if (dest >= dest_end-2)
/* 1110xxxx 10xxxxxx 10xxxxxx */
154 *dest++ = (ch>>12) | 0xE0;
155 *dest++ = ((ch>>6) & 0x3F) | 0x80;
156 *dest++ = (ch & 0x3F) | 0x80;
158 else if (ch < 0x110000) {
/* 4-byte form: fewer than four bytes left fails this check. */
159 if (dest >= dest_end-3)
/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
161 *dest++ = (ch>>18) | 0xF0;
162 *dest++ = ((ch>>12) & 0x3F) | 0x80;
163 *dest++ = ((ch>>6) & 0x3F) | 0x80;
164 *dest++ = (ch & 0x3F) | 0x80;
/* Writes the UTF-8 encoding of the single code point ch into dest. There is
 * no size parameter, so the caller must supply a buffer large enough for the
 * longest form written here (four bytes in the stores shown). u8_unescape
 * uses the return value as the number of bytes to copy out of dest.
 * NOTE(review): the range tests that select each form sit between the store
 * groups -- confirm they match u8_toutf8 (0x800 / 0x10000 / 0x110000). */
171size_t u8_wc_toutf8(
char *dest, uint32_t ch)
/* 2-byte form: 110xxxxx 10xxxxxx */
178 dest[0] = (ch>>6) | 0xC0;
179 dest[1] = (ch & 0x3F) | 0x80;
/* 3-byte form: 1110xxxx 10xxxxxx 10xxxxxx */
183 dest[0] = (ch>>12) | 0xE0;
184 dest[1] = ((ch>>6) & 0x3F) | 0x80;
185 dest[2] = (ch & 0x3F) | 0x80;
/* 4-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
189 dest[0] = (ch>>18) | 0xF0;
190 dest[1] = ((ch>>12) & 0x3F) | 0x80;
191 dest[2] = ((ch>>6) & 0x3F) | 0x80;
192 dest[3] = (ch & 0x3F) | 0x80;
/* Walks string s forward while charnum > 0, maintaining a byte index i.
 * NOTE(review): presumably converts a character index into a byte offset and
 * returns i -- confirm. s is not length-bounded here, so charnum must not
 * exceed the number of characters in s. */
199size_t u8_offset(
const char *s,
size_t charnum)
203 while (charnum > 0) {
/* Skip ahead: pre-increment i and stop at the first position for which
 * isutf() (defined outside this block) is true, testing at most two
 * positions; if neither qualifies, i is incremented a third time untested.
 * The (void) cast discards the value of the || chain. */
205 (void)(isutf(s[++i]) || isutf(s[++i]) || ++i);
/* Counterpart of u8_offset: scans s with byte index i while keeping a
 * character counter charnum, both starting at 0.
 * NOTE(review): presumably returns the character index that corresponds to
 * byte position offset -- confirm the loop bound and return value. */
213size_t u8_charnum(
const char *s,
size_t offset)
215 size_t charnum = 0, i=0;
/* Skip ahead: pre-increment i and stop at the first position for which
 * isutf() is true, testing at most two positions; otherwise i is incremented
 * a third time untested. */
219 (void)(isutf(s[++i]) || isutf(s[++i]) || ++i);
/* Scans the NUL-terminated string s one sequence at a time.
 * NOTE(review): presumably returns the number of characters (not bytes) in
 * s -- confirm against the counter and return statement. */
227size_t u8_strlen(
const char *s)
/* Stop at the NUL terminator; i is post-incremented whether or not the byte
 * was NUL. */
237 if (s[i++]==0)
break;
/* Skip ahead: pre-increment i and stop at the first position for which
 * isutf() is true, testing at most two positions; otherwise i is incremented
 * a third time untested. */
238 (void)(isutf(s[++i]) || isutf(s[++i]) || ++i);
/* Decodes the character of s that starts at byte index *i and advances *i.
 * Bytes are summed into ch and the marker bits are removed afterwards with
 * offsetsFromUTF8. The loop also stops on a NUL byte, so it does not run
 * past the terminator of s.
 * NOTE(review): sz presumably counts the bytes consumed; if it could be 0 or
 * exceed 6 the offsetsFromUTF8[sz-1] index would be out of range -- confirm. */
245uint32_t u8_nextchar(
const char *s,
size_t *i)
252 ch += (
unsigned char)s[(*i)];
/* Continue while s[*i] is non-NUL and, after advancing *i, the new byte is
 * rejected by isutf(). The ++(*i) term is there for its side effect. */
254 }
while (s[*i] && (++(*i)) && !isutf(s[*i]));
255 ch -= offsetsFromUTF8[sz-1];
/* Variant of u8_nextchar without the NUL test: consumes s[(*i)++] and keeps
 * going until isutf() accepts the byte at the new *i, then removes the
 * marker bits with offsetsFromUTF8.
 * NOTE(review): no length bound is visible, so the loop relies on reaching a
 * byte that isutf() accepts; a buffer ending in rejected bytes would be read
 * past its end -- confirm callers guarantee this cannot happen. */
261uint32_t u8_nextmemchar(
const char *s,
size_t *i)
268 ch += (
unsigned char)s[(*i)++];
270 }
while (!isutf(s[*i]));
271 ch -= offsetsFromUTF8[sz-1];
/* Advances *i by one to four bytes: *i is pre-incremented and the scan stops
 * at the first position for which isutf() is true (up to three tested
 * positions); if none qualifies, *i is incremented a fourth time untested.
 * No end-of-string check is made. */
276void u8_inc(
const char *s,
size_t *i)
278 (void)(isutf(s[++(*i)]) || isutf(s[++(*i)]) || isutf(s[++(*i)]) || ++(*i));
/* Moves *i back by one to four bytes: *i is pre-decremented and the scan
 * stops at the first position for which isutf() is true (up to three tested
 * positions); if none qualifies, *i is decremented a fourth time untested.
 * NOTE(review): there is no lower-bound check; *i is a size_t, so calling
 * this with *i == 0 wraps around and indexes far outside s. */
281void u8_dec(
const char *s,
size_t *i)
283 (void)(isutf(s[--(*i)]) || isutf(s[--(*i)]) || isutf(s[--(*i)]) || --(*i));
/* Returns 1 if c is an ASCII octal digit ('0' through '7'), otherwise 0. */
286int octal_digit(
char c)
288 return (
c >=
'0' &&
c <=
'7');
/* Hex-digit predicate: yields 1 when c is '0'-'9', 'A'-'F' or 'a'-'f',
 * otherwise 0.
 * NOTE(review): presumably the body of hex_digit(), which
 * u8_read_escape_sequence calls -- confirm against the function header. */
293 return ((
c >=
'0' &&
c <=
'9') ||
294 (
c >=
'A' &&
c <=
'F') ||
295 (
c >=
'a' &&
c <=
'f'));
/* Takes a single char and returns a char; u8_read_escape_sequence calls it
 * when the escape is neither octal nor an x/u/U hex form and widens the
 * result to uint32_t.
 * NOTE(review): presumably maps escape letters such as 'n' or 't' to their
 * control characters -- confirm the mapping and the fallback for other
 * input. */
298char read_escape_control_char(
char c)
/* Parses the body of one escape sequence from str, reading at most ssz
 * bytes, and computes its value in ch.
 *  - octal: up to 3 octal digits, converted with strtol(..., 8)
 *  - 'x' / 'u' / 'U': up to 2 / 4 / 8 hex digits, converted with
 *    strtol(..., 16); returns 0 when no hex digit follows
 *  - otherwise: read_escape_control_char(c0)
 * NOTE(review): presumably stores ch in *dest and returns the number of
 * bytes consumed -- confirm.
 * NOTE(review): strtol returns long; where long is 32 bits an 8-digit \U
 * value above 0x7FFFFFFF saturates at LONG_MAX -- strtoul would avoid that.
 * NOTE(review): digs must be NUL-terminated before each strtol call and hold
 * at least 9 bytes -- confirm. */
321size_t u8_read_escape_sequence(
const char *str,
size_t ssz, uint32_t *dest)
330 if (octal_digit(c0)) {
/* Collect digits while input remains, the byte is octal and fewer than
 * three digits have been taken. */
333 digs[dno++] = str[i++];
334 }
while (i<ssz && octal_digit(str[i]) && dno<3);
336 ch = strtol(digs, NULL, 8);
/* The assignments inside the condition select the maximum digit count; the
 * assigned values are non-zero, so they never make the && fail. */
338 else if ((c0==
'x' && (ndig=2)) ||
339 (c0==
'u' && (ndig=4)) ||
340 (c0==
'U' && (ndig=8))) {
341 while (i<ssz && hex_digit(str[i]) && dno<ndig) {
342 digs[dno++] = str[i++];
/* An x/u/U introducer with no hex digit after it is rejected. */
344 if (dno == 0)
return 0;
346 ch = strtol(digs, NULL, 16);
349 ch = (uint32_t)read_escape_control_char(c0);
/* Copies the NUL-terminated string src into buf (capacity sz bytes),
 * converting escape sequences via u8_read_escape_sequence and re-encoding
 * each resulting code point with u8_wc_toutf8. c is the write index into
 * buf.
 * NOTE(review): confirm that c + amt is checked against sz before the memcpy
 * and that buf is NUL-terminated on exit. */
359size_t u8_unescape(
char *buf,
size_t sz,
const char *src)
/* Run until the end of src or until buf is full. */
365 while (*src &&
c < sz) {
/* NOTE(review): 1000 is a hard-coded length limit rather than the remaining
 * length of src; the digit scanners stop at the NUL terminator because
 * octal_digit()/hex_digit() reject '\0'. */
368 amt = u8_read_escape_sequence(src, 1000, &ch);
/* Encode into the scratch buffer; amt becomes the encoded length. */
375 amt = u8_wc_toutf8(temp, ch);
378 memcpy(&buf[
c], temp, amt);
/* Searches the NUL-terminated string s for code point ch, decoding one
 * character at a time with u8_nextchar. On a match it returns a pointer to
 * s[lasti]; the cast drops the const qualifier of s.
 * NOTE(review): lasti presumably holds the byte offset at which the
 * just-decoded character started, and *charn presumably receives the
 * character index -- confirm, including the not-found return value. */
386char *u8_strchr(
const char *s, uint32_t ch,
size_t *charn)
388 size_t i = 0, lasti=0;
393 c = u8_nextchar(s, &i);
396 return (
char*)&s[lasti];
/* Length-bounded counterpart of u8_strchr: searches the sz bytes at s for
 * code point ch. Characters are decoded inline; the decode loop stops at
 * i == sz, so it does not read past the buffer. On a match it returns a
 * pointer to s[lasti]; the cast drops the const qualifier of s.
 * NOTE(review): csz presumably counts the bytes of the current character;
 * confirm it stays within 1..6 so offsetsFromUTF8[csz-1] is in range. */
404char *u8_memchr(
const char *s, uint32_t ch,
size_t sz,
size_t *charn)
406 size_t i = 0, lasti=0;
415 c += (
unsigned char)s[i++];
/* Keep consuming while inside the buffer and isutf() rejects the next
 * byte. */
417 }
while (i < sz && !isutf(s[i]));
/* Remove the accumulated marker bits. */
418 c -= offsetsFromUTF8[csz-1];
421 return (
char*)&s[lasti];
/* Searches the sz bytes at s for code point ch, starting from the last byte.
 * Returns NULL for an empty buffer.
 * NOTE(review): presumably returns a pointer to the last occurrence --
 * confirm the match and not-found return paths. */
429char *u8_memrchr(
const char *s, uint32_t ch,
size_t sz)
/* With sz == 0 this wraps to SIZE_MAX; the sz == 0 check below returns
 * before any use of i shown here. */
431 size_t i = sz-1, tempi=0;
434 if (sz == 0)
return NULL;
/* Back up to a byte that isutf() accepts, or to index 0. */
436 while (i && !isutf(s[i])) i--;
/* Decode via a temporary index so that i itself is left unchanged.
 * NOTE(review): u8_nextmemchar has no length bound -- confirm it cannot run
 * past s + sz on a truncated final sequence. */
440 c = u8_nextmemchar(s, &tempi);
/* Validates the length bytes at str as UTF-8. Every return shown here is
 * "return 0", used for a malformed or overlong sequence.
 * NOTE(review): confirm the success return value(s), and that a sequence
 * truncated by the end of the buffer is rejected before p is dereferenced
 * at or beyond pend. */
458int u8_isvalid(
const char *str,
size_t length)
/* pend is one past the last byte to examine. */
460 const unsigned char *p, *pend = (
unsigned char*)str + length;
465 for (p = (
unsigned char*)str; p < pend; p++) {
/* True when c does not have both top bits set, i.e. it is not the lead byte
 * of a multi-byte sequence. */
470 if ((
c & 0xc0) != 0xc0)
/* ab = number of bytes expected after lead byte c. */
472 ab = trailingBytesForUTF8[
c];
/* The byte at p must be a continuation byte (10xxxxxx). */
479 if ((*p & 0xc0) != 0x80)
/* For a lead byte this matches only 0xC0 and 0xC1, which would encode a
 * value below 0x80 in two bytes (overlong). */
486 if ((
c & 0x3e) == 0)
return 0;
/* Overlong 3-byte form: lead 0xE0 and bit 0x20 clear in the byte at p. */
491 if (
c == 0xe0 && (*p & 0x20) == 0)
return 0;
/* Overlong 4-byte form: lead 0xF0 and bits 0x30 clear in the byte at p. */
496 if (
c == 0xf0 && (*p & 0x30) == 0)
return 0;
/* Overlong 5-byte form: lead 0xF8 and bits 0x38 clear in the byte at p. */
501 if (
c == 0xf8 && (*p & 0x38) == 0)
return 0;
/* 0xFE and 0xFF are never valid; lead 0xFC with bits 0x3c clear in the byte
 * at p is an overlong 6-byte form. */
507 if (
c == 0xfe ||
c == 0xff ||
508 (
c == 0xfc && (*p & 0x3c) == 0))
return 0;
/* Advance and require a continuation byte. */
514 if ((*(++p) & 0xc0) != 0x80)
return 0;
/* Copies characters from src into dest, using si as the read index and di as
 * the write index. Multi-byte sequences are moved as units with memcpy;
 * sizeof(int16_t) and sizeof(int32_t) serve as the byte counts 2 and 4.
 * NOTE(review): presumably writes the characters of src in reverse order and
 * returns a status code -- confirm how di is derived from len and what the
 * return value means. src is only read in the lines shown, so it could be
 * const-qualified. */
521int u8_reverse(
char *dest,
char * src,
size_t len)
/* Current byte, read as unsigned so values >= 0x80 compare correctly. */
528 c = (
unsigned char)src[si];
/* Two bytes copied in one call. */
539 memcpy(&dest[di], &src[si],
sizeof(int16_t));
/* Two bytes copied starting one byte into the sequence.
 * NOTE(review): presumably the tail of a 3-byte sequence whose first byte is
 * stored separately -- confirm. */
545 memcpy(&dest[di+1], &src[si+1],
sizeof(int16_t));
/* Four bytes copied in one call. */
550 memcpy(&dest[di], &src[si],
sizeof(int32_t));