naev 0.11.5
utf8.c
1/*
2 Basic UTF-8 manipulation routines
3 by Jeff Bezanson
4 placed in the public domain Fall 2005
5
6 This code is designed to provide the utilities you need to manipulate
7 UTF-8 as an internal string encoding. These functions do not perform the
8 error checking normally needed when handling UTF-8 data, so if you happen
9 to be from the Unicode Consortium you will want to flay me alive.
10 I do this because error checking can be performed at the boundaries (I/O),
11 with these routines reserved for higher performance on data known to be
12 valid.
13 A UTF-8 validation routine is included.
14*/
15
17#if HAVE_ALLOCA_H
18# include <alloca.h> /* Not available in windows, necessary for linux. */
19#endif /* HAVE_ALLOCA_H */
20#include <assert.h>
21#if HAVE_MALLOC_H
22# include <malloc.h>
23#endif /* HAVE_MALLOC_H */
24#include <stdio.h>
25#include <string.h>
28#include "utf8.h"
29
/* Per-length correction constants: after the raw bytes of a sequence have
   been accumulated (shifting by 6 between bytes), subtracting the entry
   indexed by the number of trailing bytes yields the code point. */
static const uint32_t offsetsFromUTF8[6] = {
    0x00000000UL, /* 0 trailing bytes */
    0x00003080UL, /* 1 trailing byte  */
    0x000E2080UL, /* 2 trailing bytes */
    0x03C82080UL, /* 3 trailing bytes */
    0xFA082080UL, /* 4 trailing bytes */
    0x82082080UL  /* 5 trailing bytes */
};
34
/* Number of bytes that follow a given first byte of a sequence, indexed by
   the value of that first byte (one row per high nibble). */
static const char trailingBytesForUTF8[256] = {
    /* 0x00 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x10 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x20 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x30 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x40 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x50 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x60 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x70 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x80 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x90 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xA0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xB0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xD0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xE0 */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xF0 */ 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
45
46/* returns length of next utf-8 sequence */
47size_t u8_seqlen(const char *s)
48{
49 return trailingBytesForUTF8[(unsigned int)(unsigned char)s[0]] + 1;
50}
51
/* Number of bytes needed to encode code point ch as UTF-8.
   Returns 0 when ch is 0x110000 or above, i.e. the character cannot
   (or should not) be encoded. */
size_t u8_charlen(uint32_t ch)
{
    if (ch >= 0x110000)
        return 0;
    if (ch >= 0x10000)
        return 4;
    if (ch >= 0x800)
        return 3;
    if (ch >= 0x80)
        return 2;
    return 1;
}
66
/* Total number of bytes needed to encode the first n code points of wcstr.
   Code points for which u8_charlen() reports 0 contribute nothing. */
size_t u8_codingsize(uint32_t *wcstr, size_t n)
{
    size_t total = 0;
    size_t k = 0;

    while (k < n) {
        total += u8_charlen(wcstr[k]);
        k++;
    }
    return total;
}
75
76/* conversions without error checking
77 only works for valid UTF-8, i.e. no 5- or 6-byte sequences
78 srcsz = source size in bytes
79 sz = dest size in # of wide characters
80
81 returns # characters converted
82 if sz == srcsz+1 (i.e. 4*srcsz+4 bytes), there will always be enough space.
83*/
84size_t u8_toucs(uint32_t *dest, size_t sz, const char *src, size_t srcsz)
85{
86 uint32_t ch;
87 const char *src_end = src + srcsz;
88 size_t nb;
89 size_t i=0;
90
91 if (sz == 0 || srcsz == 0)
92 return 0;
93
94 while (i < sz) {
95 if (!isutf(*src)) { // invalid sequence
96 dest[i++] = 0xFFFD;
97 src++;
98 if (src >= src_end) break;
99 continue;
100 }
101 nb = trailingBytesForUTF8[(unsigned char)*src];
102 if (src + nb >= src_end)
103 break;
104 ch = 0;
105 switch (nb) {
106 /* these fall through deliberately */
107 case 5: ch += (unsigned char)*src++; ch <<= 6;
108 /* Falls through. */
109 case 4: ch += (unsigned char)*src++; ch <<= 6;
110 /* Falls through. */
111 case 3: ch += (unsigned char)*src++; ch <<= 6;
112 /* Falls through. */
113 case 2: ch += (unsigned char)*src++; ch <<= 6;
114 /* Falls through. */
115 case 1: ch += (unsigned char)*src++; ch <<= 6;
116 /* Falls through. */
117 case 0: ch += (unsigned char)*src++;
118 }
119 ch -= offsetsFromUTF8[nb];
120 dest[i++] = ch;
121 }
122 return i;
123}
124
/* srcsz = number of source characters
   sz = size of dest buffer in bytes

   returns # bytes stored in dest
   the destination string will never be bigger than the source string.

   Conversion stops at the first character that does not fit completely
   in the remaining space. Code points of 0x110000 and above are skipped
   without producing output. No terminator is written to dest.
*/
size_t u8_toutf8(char *dest, size_t sz, const uint32_t *src, size_t srcsz)
{
    size_t used = 0; /* bytes written so far; never exceeds sz */
    size_t i;

    for (i = 0; i < srcsz; i++) {
        const uint32_t ch = src[i];
        /* Space is tracked as a count so that no pointer before the start
           of dest is ever formed, even for sz smaller than 4. */
        const size_t room = sz - used;

        if (ch < 0x80) {
            if (room < 1)
                break;
            dest[used++] = (char)ch;
        }
        else if (ch < 0x800) {
            if (room < 2)
                break;
            dest[used++] = (char)((ch>>6) | 0xC0);
            dest[used++] = (char)((ch & 0x3F) | 0x80);
        }
        else if (ch < 0x10000) {
            if (room < 3)
                break;
            dest[used++] = (char)((ch>>12) | 0xE0);
            dest[used++] = (char)(((ch>>6) & 0x3F) | 0x80);
            dest[used++] = (char)((ch & 0x3F) | 0x80);
        }
        else if (ch < 0x110000) {
            if (room < 4)
                break;
            dest[used++] = (char)((ch>>18) | 0xF0);
            dest[used++] = (char)(((ch>>12) & 0x3F) | 0x80);
            dest[used++] = (char)(((ch>>6) & 0x3F) | 0x80);
            dest[used++] = (char)((ch & 0x3F) | 0x80);
        }
        /* else: not encodable, skipped */
    }
    return used;
}
170
/* Encode the single code point ch as UTF-8 into dest.
   Returns the number of bytes written (1 to 4), or 0 when ch is 0x110000
   or above, in which case dest is left untouched. No terminator is
   written and the function has no way to check the capacity of dest. */
size_t u8_wc_toutf8(char *dest, uint32_t ch)
{
    char *out = dest;

    if (ch >= 0x110000)
        return 0;

    if (ch < 0x80) {
        *out++ = (char)ch;
    }
    else if (ch < 0x800) {
        *out++ = (ch>>6) | 0xC0;
        *out++ = (ch & 0x3F) | 0x80;
    }
    else if (ch < 0x10000) {
        *out++ = (ch>>12) | 0xE0;
        *out++ = ((ch>>6) & 0x3F) | 0x80;
        *out++ = (ch & 0x3F) | 0x80;
    }
    else {
        *out++ = (ch>>18) | 0xF0;
        *out++ = ((ch>>12) & 0x3F) | 0x80;
        *out++ = ((ch>>6) & 0x3F) | 0x80;
        *out++ = (ch & 0x3F) | 0x80;
    }
    return (size_t)(out - dest);
}
197
/* charnum => byte offset
   Walks charnum characters from the start of s and returns the byte index
   reached. There is no check for the end of the string, and the byte
   directly after a non-ASCII lead byte is skipped without being
   inspected. */
size_t u8_offset(const char *s, size_t charnum)
{
    size_t pos = 0;
    size_t left;

    for (left = charnum; left > 0; left--) {
        if (s[pos++] & 0x80) {
            /* pos is on the 2nd byte; probe the 3rd and 4th for the
               start of the next character */
            pos++;
            if (!isutf(s[pos])) {
                pos++;
                if (!isutf(s[pos]))
                    pos++;
            }
        }
    }
    return pos;
}
211
/* byte offset => charnum
   Counts the characters that start before byte index offset in s. The
   byte directly after a non-ASCII lead byte is skipped without being
   inspected. */
size_t u8_charnum(const char *s, size_t offset)
{
    size_t pos = 0;
    size_t nchars;

    for (nchars = 0; pos < offset; nchars++) {
        if (s[pos++] & 0x80) {
            /* pos is on the 2nd byte; probe the 3rd and 4th for the
               start of the next character */
            pos++;
            if (!isutf(s[pos])) {
                pos++;
                if (!isutf(s[pos]))
                    pos++;
            }
        }
    }
    return nchars;
}
225
/* number of characters in NUL-terminated string

   Every byte that is not a continuation byte (bit pattern 10xxxxxx)
   starts a character, so counting those bytes gives the character count
   for well-formed UTF-8. The scan looks at each byte exactly once, never
   reads beyond the terminating NUL (even when the string ends in the
   middle of a sequence), and does not depend on whether plain char is
   signed. */
size_t u8_strlen(const char *s)
{
    size_t count = 0;

    for (; *s != '\0'; s++) {
        if (((unsigned char)*s & 0xC0) != 0x80)
            count++;
    }
    return count;
}
243
244/* reads the next utf-8 sequence out of a string, updating an index */
245uint32_t u8_nextchar(const char *s, size_t *i)
246{
247 uint32_t ch = 0;
248 size_t sz = 0;
249
250 do {
251 ch <<= 6;
252 ch += (unsigned char)s[(*i)];
253 sz++;
254 } while (s[*i] && (++(*i)) && !isutf(s[*i]));
255 ch -= offsetsFromUTF8[sz-1];
256
257 return ch;
258}
259
260/* next character without NUL character terminator */
261uint32_t u8_nextmemchar(const char *s, size_t *i)
262{
263 uint32_t ch = 0;
264 size_t sz = 0;
265
266 do {
267 ch <<= 6;
268 ch += (unsigned char)s[(*i)++];
269 sz++;
270 } while (!isutf(s[*i]));
271 ch -= offsetsFromUTF8[sz-1];
272
273 return ch;
274}
275
/* Advance *i to the next position in s for which isutf() holds, moving
   forward by at least one and at most four bytes. */
void u8_inc(const char *s, size_t *i)
{
    int probe;

    for (probe = 0; probe < 3; probe++) {
        ++(*i);
        if (isutf(s[*i]))
            return;
    }
    /* three bytes in a row failed the test: take the fourth unchecked */
    ++(*i);
}
280
/* Move *i back to the previous position in s for which isutf() holds,
   moving backward by at least one and at most four bytes. The index is
   decremented without a lower bound check. */
void u8_dec(const char *s, size_t *i)
{
    int probe;

    for (probe = 0; probe < 3; probe++) {
        --(*i);
        if (isutf(s[*i]))
            return;
    }
    /* three bytes in a row failed the test: take the fourth unchecked */
    --(*i);
}
285
/* Returns 1 when c is one of the characters '0'..'7', 0 otherwise. */
int octal_digit(char c)
{
    if (c < '0')
        return 0;
    if (c > '7')
        return 0;
    return 1;
}
290
/* Returns 1 when c is a hexadecimal digit ('0'..'9', 'A'..'F' or
   'a'..'f'), 0 otherwise. */
int hex_digit(char c)
{
    if (c >= '0' && c <= '9')
        return 1;
    if (c >= 'a' && c <= 'f')
        return 1;
    if (c >= 'A' && c <= 'F')
        return 1;
    return 0;
}
297
/* Maps the letter that follows a backslash to the control character it
   stands for (n, t, r, e, b, f, v, a). Any other character is returned
   unchanged. */
char read_escape_control_char(char c)
{
    switch (c) {
    case 'n': return '\n';
    case 't': return '\t';
    case 'r': return '\r';
    case 'e': return 033; // '\e'
    case 'b': return '\b';
    case 'f': return '\f';
    case 'v': return '\v';
    case 'a': return '\a';
    default:  return c;
    }
}
318
/* Parses one escape sequence; str points to the character after the
   backslash and ssz limits how many characters of str may be examined.

   Recognised forms: up to three octal digits, x + up to 2 hex digits,
   u + up to 4 hex digits, U + up to 8 hex digits; any other character is
   passed through read_escape_control_char().

   The decoded value is stored in *dest. Returns the number of input
   characters processed, or 0 (with *dest untouched) when x, u or U is
   not followed by at least one hex digit. */
size_t u8_read_escape_sequence(const char *str, size_t ssz, uint32_t *dest)
{
    char digits[10];
    int ndigits = 0;
    int maxdigits;
    size_t pos;
    char first = str[0];
    assert(ssz > 0);

    /* numeric escape in octal: the first digit is already known good */
    if (octal_digit(first)) {
        pos = 0;
        do {
            digits[ndigits++] = str[pos++];
        } while (pos<ssz && octal_digit(str[pos]) && ndigits<3);
        digits[ndigits] = '\0';
        *dest = strtol(digits, NULL, 8);
        return pos;
    }

    switch (first) {
    case 'x': maxdigits = 2; break;
    case 'u': maxdigits = 4; break;
    case 'U': maxdigits = 8; break;
    default:
        /* single-character escape */
        *dest = (uint32_t)read_escape_control_char(first);
        return 1;
    }

    /* hexadecimal escape: digits start right after the x/u/U letter */
    pos = 1;
    while (pos<ssz && hex_digit(str[pos]) && ndigits<maxdigits) {
        digits[ndigits++] = str[pos++];
    }
    if (ndigits == 0)
        return 0;
    digits[ndigits] = '\0';
    *dest = strtol(digits, NULL, 16);
    return pos;
}
355
/* convert a string with literal \uxxxx or \Uxxxxxxxx characters to UTF-8
   example: u8_unescape(mybuf, 256, "hello\\u220e")
   note the double backslash is needed if called on a C string literal

   buf = destination, sz = its size in bytes, src = NUL-terminated input

   Returns the number of bytes stored in buf. A terminating NUL is added
   only when there is room left for it (return value < sz).

   Bytes that are not part of an escape sequence are copied unchanged, so
   UTF-8 text already present in src survives as is. A backslash that is
   the last character of src is copied literally. When x, u or U is not
   followed by a hex digit the backslash is dropped and the letter is
   copied as an ordinary character. Escapes whose value cannot be encoded
   produce no output. */
size_t u8_unescape(char *buf, size_t sz, const char *src)
{
    size_t c=0, amt;
    uint32_t ch = 0;
    char temp[4];

    while (*src && c < sz) {
        if (*src != '\\' || src[1] == '\0') {
            /* plain byte; c < sz is guaranteed by the loop condition */
            buf[c++] = *src++;
            continue;
        }
        src++; /* step over the backslash */
        amt = u8_read_escape_sequence(src, 1000, &ch);
        if (amt == 0)
            continue; /* no digits: ch was not set, emit nothing for it */
        src += amt;
        amt = u8_wc_toutf8(temp, ch);
        if (amt > sz-c)
            break;
        memcpy(&buf[c], temp, amt);
        c += amt;
    }
    if (c < sz)
        buf[c] = '\0';
    return c;
}
385
/* Finds the first occurrence of code point ch in the NUL-terminated
   string s, decoding it with u8_nextchar().

   Returns a pointer to the first byte of the match, or NULL when ch does
   not occur. *charn receives the number of characters that precede the
   match (or the number of characters examined when nothing is found). */
char *u8_strchr(const char *s, uint32_t ch, size_t *charn)
{
    size_t pos = 0;

    *charn = 0;
    while (s[pos]) {
        const size_t start = pos;

        if (u8_nextchar(s, &pos) == ch) {
            /* it's const for us, but not necessarily the caller */
            return (char*)&s[start];
        }
        (*charn)++;
    }
    return NULL;
}
403
404char *u8_memchr(const char *s, uint32_t ch, size_t sz, size_t *charn)
405{
406 size_t i = 0, lasti=0;
407 uint32_t c;
408 int csz;
409
410 *charn = 0;
411 while (i < sz) {
412 c = csz = 0;
413 do {
414 c <<= 6;
415 c += (unsigned char)s[i++];
416 csz++;
417 } while (i < sz && !isutf(s[i]));
418 c -= offsetsFromUTF8[csz-1];
419
420 if (c == ch) {
421 return (char*)&s[lasti];
422 }
423 lasti = i;
424 (*charn)++;
425 }
426 return NULL;
427}
428
429char *u8_memrchr(const char *s, uint32_t ch, size_t sz)
430{
431 size_t i = sz-1, tempi=0;
432 uint32_t c;
433
434 if (sz == 0) return NULL;
435
436 while (i && !isutf(s[i])) i--;
437
438 while (1) {
439 tempi = i;
440 c = u8_nextmemchar(s, &tempi);
441 if (c == ch) {
442 return (char*)&s[i];
443 }
444 if (i == 0)
445 break;
446 tempi = i;
447 u8_dec(s, &i);
448 if (i > tempi)
449 break;
450 }
451 return NULL;
452}
453
454/* based on the valid_utf8 routine from the PCRE library by Philip Hazel
455
456 length is in bytes, since without knowing whether the string is valid
457 it's hard to know how many characters there are! */
458int u8_isvalid(const char *str, size_t length)
459{
460 const unsigned char *p, *pend = (unsigned char*)str + length;
461 unsigned char c;
462 int ret = 1; /* ASCII */
463 size_t ab;
464
465 for (p = (unsigned char*)str; p < pend; p++) {
466 c = *p;
467 if (c < 128)
468 continue;
469 ret = 2; /* non-ASCII UTF-8 */
470 if ((c & 0xc0) != 0xc0)
471 return 0;
472 ab = trailingBytesForUTF8[c];
473 if (length < ab)
474 return 0;
475 length -= ab;
476
477 p++;
478 /* Check top bits in the second byte */
479 if ((*p & 0xc0) != 0x80)
480 return 0;
481
482 /* Check for overlong sequences for each different length */
483 switch (ab) {
484 /* Check for xx00 000x */
485 case 1:
486 if ((c & 0x3e) == 0) return 0;
487 continue; /* We know there aren't any more bytes to check */
488
489 /* Check for 1110 0000, xx0x xxxx */
490 case 2:
491 if (c == 0xe0 && (*p & 0x20) == 0) return 0;
492 break;
493
494 /* Check for 1111 0000, xx00 xxxx */
495 case 3:
496 if (c == 0xf0 && (*p & 0x30) == 0) return 0;
497 break;
498
499 /* Check for 1111 1000, xx00 0xxx */
500 case 4:
501 if (c == 0xf8 && (*p & 0x38) == 0) return 0;
502 break;
503
504 /* Check for leading 0xfe or 0xff,
505 and then for 1111 1100, xx00 00xx */
506 case 5:
507 if (c == 0xfe || c == 0xff ||
508 (c == 0xfc && (*p & 0x3c) == 0)) return 0;
509 break;
510 }
511
512 /* Check for valid bytes after the 2nd, if any; all must start 10 */
513 while (--ab > 0) {
514 if ((*(++p) & 0xc0) != 0x80) return 0;
515 }
516 }
517
518 return ret;
519}
520
/* Writes the characters of src in reverse order to dest, keeping the
   bytes of each multi-byte sequence in their original order.

   src  = input, len bytes long
   dest = output; must have room for len+1 bytes (a terminating NUL is
          stored at dest[len]) and must not overlap src

   Returns 0 on success. Returns 1 when a character starts with a
   continuation byte or when a multi-byte sequence would extend past
   src+len; dest is then only partially filled. The length of a sequence
   is taken from its first byte alone; the following bytes are not
   validated. */
int u8_reverse(char *dest, char * src, size_t len)
{
    size_t si=0, di=len;
    size_t n;
    unsigned char c;

    dest[di] = '\0';
    while (si < len) {
        c = (unsigned char)src[si];
        if ((~c) & 0x80) {
            n = 1;
        }
        else {
            switch (c>>4) {
            case 0xC:
            case 0xD:
                n = 2;
                break;
            case 0xE:
                n = 3;
                break;
            case 0xF:
                n = 4;
                break;
            default:
                return 1;
            }
        }
        /* di == len - si holds throughout, so this one check guards both
           the read from src and the write position in dest */
        if (n > len - si)
            return 1;
        di -= n;
        memcpy(&dest[di], &src[si], n);
        si += n;
    }
    return 0;
}
static const double c[]
Definition rng.c:264