naev 0.11.5
lutf8lib.c
1/*
2 * From: https://github.com/starwing/luautf8/releases/tag/0.1.5
3
4MIT License
5
6Copyright (c) 2018 Xavier Wang
7
8Permission is hereby granted, free of charge, to any person obtaining a copy
9of this software and associated documentation files (the "Software"), to deal
10in the Software without restriction, including without limitation the rights
11to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12copies of the Software, and to permit persons to whom the Software is
13furnished to do so, subject to the following conditions:
14
15The above copyright notice and this permission notice shall be included in all
16copies or substantial portions of the Software.
17
18THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24SOFTWARE.
25*/
26#include "lutf8lib.h"
27
#include <assert.h>
#include <limits.h>
#include <stdint.h>
#include <string.h>
34#include "unidata.h"
35
36/* UTF-8 string operations */
37
/* Size of the scratch buffer handed to utf8_encode (filled from its end). */
#define UTF8_BUFFSZ 8
/* Largest value utf8_encode/utf8_decode accept (31 bits). */
#define UTF8_MAX 0x7FFFFFFFu
/* Upper bound used by the strict validity checks (U+10FFFF). */
#define UTF8_MAXCP 0x10FFFFu
/* True when the byte at p has the bit pattern 10xxxxxx (continuation byte). */
#define iscont(p) ((*(p) & 0xC0) == 0x80)
/* Explicit cast helper: CAST(type, expression). */
#define CAST(tp,expr) ((tp)(expr))

/* Fallback definition, used only when LUA_QL is not already defined:
 * wraps a string literal in single quotes for error messages. */
#ifndef LUA_QL
# define LUA_QL(x) "'" x "'"
#endif
47
48static int utf8_invalid (utfint ch)
49{ return (ch > UTF8_MAXCP || (0xD800u <= ch && ch <= 0xDFFFu)); }
50
51static size_t utf8_encode (char *buff, utfint x) {
52 int n = 1; /* number of bytes put in buffer (backwards) */
53 lua_assert(x <= UTF8_MAX);
54 if (x < 0x80) /* ascii? */
55 buff[UTF8_BUFFSZ - 1] = x & 0x7F;
56 else { /* need continuation bytes */
57 utfint mfb = 0x3f; /* maximum that fits in first byte */
58 do { /* add continuation bytes */
59 buff[UTF8_BUFFSZ - (n++)] = 0x80 | (x & 0x3f);
60 x >>= 6; /* remove added bits */
61 mfb >>= 1; /* now there is one less bit available in first byte */
62 } while (x > mfb); /* still needs continuation byte? */
63 buff[UTF8_BUFFSZ - n] = ((~mfb << 1) | x) & 0xFF; /* add first byte */
64 }
65 return n;
66}
67
68static const char *utf8_decode (const char *s, utfint *val, int strict) {
69 static const utfint limits[] =
70 {~0u, 0x80u, 0x800u, 0x10000u, 0x200000u, 0x4000000u};
71 unsigned int c = (unsigned char)s[0];
72 utfint res = 0; /* final result */
73 if (c < 0x80) /* ascii? */
74 res = c;
75 else {
76 int count = 0; /* to count number of continuation bytes */
77 for (; c & 0x40; c <<= 1) { /* while it needs continuation bytes... */
78 unsigned int cc = (unsigned char)s[++count]; /* read next byte */
79 if ((cc & 0xC0) != 0x80) /* not a continuation byte? */
80 return NULL; /* invalid byte sequence */
81 res = (res << 6) | (cc & 0x3F); /* add lower 6 bits from cont. byte */
82 }
83 res |= ((utfint)(c & 0x7F) << (count * 5)); /* add first byte */
84 if (count > 5 || res > UTF8_MAX || res < limits[count])
85 return NULL; /* invalid byte sequence */
86 s += count; /* skip continuation bytes read */
87 }
88 if (strict) {
89 /* check for invalid code points; too large or surrogates */
90 if (res > UTF8_MAXCP || (0xD800u <= res && res <= 0xDFFFu))
91 return NULL;
92 }
93 if (val) *val = res;
94 return s + 1; /* +1 to include first byte */
95}
96
/*
 * Step back one character from e inside [s, e): skip any continuation bytes
 * immediately before e, then one more byte.  Never returns a pointer before
 * s; returns s itself when nothing precedes it.
 */
static const char *utf8_prev (const char *s, const char *e) {
  const char *cur = e;
  while (cur > s && iscont(cur - 1))
    cur--;
  if (cur > s)
    return cur - 1;
  return s;
}
101
/*
 * Advance from s to the start of the next character inside [s, e): move one
 * byte forward, then skip continuation bytes.  Returns e when s is already
 * at or past e, and never returns a pointer beyond e.
 *
 * Only bytes strictly before e are inspected, so e does not need to point at
 * readable memory (the previous form peeked at the byte at e).
 */
static const char *utf8_next (const char *s, const char *e) {
  if (s >= e)
    return e;
  ++s;  /* step over the current (lead) byte */
  while (s < e && iscont(s))
    ++s;
  return s;
}
106
/* Count how many utf8_next steps it takes to walk from s to e. */
static size_t utf8_length (const char *s, const char *e) {
  size_t count = 0;
  while (s < e) {
    s = utf8_next(s, e);
    count++;
  }
  return count;
}
113
114static const char *utf8_offset (const char *s, const char *e, lua_Integer offset, lua_Integer idx) {
115 const char *p = s + offset - 1;
116 if (idx >= 0) {
117 while (p < e && idx > 0)
118 p = utf8_next(p, e), --idx;
119 return idx == 0 ? p : NULL;
120 } else {
121 while (s < p && idx < 0)
122 p = utf8_prev(s, p), ++idx;
123 return idx == 0 ? p : NULL;
124 }
125}
126
/*
 * Resolve a 1-based character index to a pointer inside [s, e).  Negative
 * indices count back from e.  Returns NULL when utf8_offset cannot reach the
 * requested character (this includes idx == 0).
 */
static const char *utf8_relat (const char *s, const char *e, int idx) {
  if (idx < 0)
    return utf8_offset(s, e, e-s+1, idx);
  return utf8_offset(s, e, 1, idx - 1);
}
132
133static int utf8_range(const char *s, const char *e, lua_Integer *i, lua_Integer *j) {
134 const char *ps = utf8_relat(s, e, CAST(int, *i));
135 const char *pe = utf8_relat(s, e, CAST(int, *j));
136 *i = (ps ? ps : (*i > 0 ? e : s)) - s;
137 *j = (pe ? utf8_next(pe, e) : (*j > 0 ? e : s)) - s;
138 return *i < *j;
139}
140
/* Sequence length indexed by the top nibble of the lead byte.  Entries for
 * nibbles 0x8..0xB (continuation bytes) are 0 and are never consulted,
 * because utf8_invalid_offset rejects those bytes before the lookup. */
static const uint8_t utf8_code_unit_len[] = {
  1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4
};

/*
 * Return a pointer to the first invalid UTF-8 sequence in [s, e), or NULL if
 * the whole range is valid.  Rejected: stray continuation bytes, overlong
 * encodings, surrogates (U+D800..U+DFFF), values above U+10FFFF, 5/6-byte
 * forms, bytes 0xFE/0xFF, and sequences truncated by e.
 */
static const char *utf8_invalid_offset(const char *s, const char *e) {
  while (s < e) {
    uint8_t c = (uint8_t)*s;
    uint8_t needed_bytes, c2;
    if (c < 0x80) {  /* ASCII */
      s++;
      continue;
    }
    /* c < 0xC0: continuation byte outside a sequence
     * 0xC0, 0xC1: overlong 2-byte form
     * c >= 0xF5: would encode > U+10FFFF, or is a 5/6-byte lead / 0xFE / 0xFF */
    if (c < 0xC2 || c >= 0xF5)
      return s;
    needed_bytes = utf8_code_unit_len[c >> 4];
    if (e - s < needed_bytes)
      return s;  /* sequence is truncated */
    c2 = (uint8_t)s[1];
    if ((c2 & 0xC0) != 0x80)
      return s;  /* 2nd byte is not a continuation byte */
    if (needed_bytes >= 3) {
      if (((uint8_t)s[2] & 0xC0) != 0x80)
        return s;  /* 3rd byte is not a continuation byte */
      if (needed_bytes == 3) {
        if (c == 0xE0 && c2 < 0xA0)
          return s;  /* overlong 3-byte form */
        if (c == 0xED && c2 >= 0xA0)
          return s;  /* surrogate U+D800..U+DFFF */
      } else {
        if (((uint8_t)s[3] & 0xC0) != 0x80)
          return s;  /* 4th byte is not a continuation byte */
        if (c == 0xF0 && c2 < 0x90)
          return s;  /* overlong 4-byte form */
        if (c == 0xF4 && c2 >= 0x90)
          return s;  /* above U+10FFFF */
      }
    }
    s += needed_bytes;
  }
  return NULL;
}
189
190/* Unicode character categories */
191
/* Number of elements in the array t (t must be a real array, not a pointer). */
#define table_size(t) (sizeof(t)/sizeof((t)[0]))

/* X-macro list of character categories.  Each entry is X(letter, name);
 * it is expanded below to generate the utf8_is<name> predicates and in
 * match_class to map the pattern class letter to its predicate. */
#define utf8_categories(X) \
  X('a', alpha) \
  X('c', cntrl) \
  X('d', digit) \
  X('l', lower) \
  X('p', punct) \
  X('s', space) \
  X('t', compose) \
  X('u', upper) \
  X('x', xdigit)

/* X-macro list of case converters.  Each entry X(name) is expanded below to
 * generate utf8_to<name> and the Lua binding Lutf8_<name>. */
#define utf8_converters(X) \
  X(lower) \
  X(upper) \
  X(title) \
  X(fold)
210
211static int find_in_range (range_table *t, size_t size, utfint ch) {
212 size_t begin, end;
213
214 begin = 0;
215 end = size;
216
217 while (begin < end) {
218 size_t mid = (begin + end) / 2;
219 if (t[mid].last < ch)
220 begin = mid + 1;
221 else if (t[mid].first > ch)
222 end = mid;
223 else
224 return (ch - t[mid].first) % t[mid].step == 0;
225 }
226
227 return 0;
228}
229
230static int convert_char (conv_table *t, size_t size, utfint ch) {
231 size_t begin, end;
232
233 begin = 0;
234 end = size;
235
236 while (begin < end) {
237 size_t mid = (begin + end) / 2;
238 if (t[mid].last < ch)
239 begin = mid + 1;
240 else if (t[mid].first > ch)
241 end = mid;
242 else if ((ch - t[mid].first) % t[mid].step == 0)
243 return ch + t[mid].offset;
244 else
245 return ch;
246 }
247
248 return ch;
249}
250
/* Generates `static int utf8_is<name>(utfint ch)`, which looks ch up in
 * <name>_table via find_in_range.  The class letter `cls` is not used here. */
#define define_category(cls, name) static int utf8_is##name (utfint ch)\
{ return find_in_range(name##_table, table_size(name##_table), ch); }
/* Generates `static utfint utf8_to<name>(utfint ch)`, which maps ch through
 * to<name>_table via convert_char. */
#define define_converter(name) static utfint utf8_to##name (utfint ch) \
{ return convert_char(to##name##_table, table_size(to##name##_table), ch); }
/* Instantiate one predicate per category and one converter per case mapping. */
utf8_categories(define_category)
utf8_converters(define_converter)
#undef define_category
#undef define_converter
259
260static int utf8_isgraph (utfint ch) {
261 if (find_in_range(space_table, table_size(space_table), ch))
262 return 0;
263 if (find_in_range(graph_table, table_size(graph_table), ch))
264 return 1;
265 if (find_in_range(compose_table, table_size(compose_table), ch))
266 return 1;
267 return 0;
268}
269
270static int utf8_isalnum (utfint ch) {
271 if (find_in_range(alpha_table, table_size(alpha_table), ch))
272 return 1;
273 if (find_in_range(alnum_extend_table, table_size(alnum_extend_table), ch))
274 return 1;
275 return 0;
276}
277
278static int utf8_width (utfint ch, int ambi_is_single) {
279 if (find_in_range(doublewidth_table, table_size(doublewidth_table), ch))
280 return 2;
281 if (find_in_range(ambiwidth_table, table_size(ambiwidth_table), ch))
282 return ambi_is_single ? 1 : 2;
283 if (find_in_range(compose_table, table_size(compose_table), ch))
284 return 0;
285 if (find_in_range(unprintable_table, table_size(unprintable_table), ch))
286 return 0;
287 return 1;
288}
289
290
291/* string module compatible interface */
292
293static int typeerror (lua_State *L, int idx, const char *tname)
294{ return luaL_error(L, "%s expected, got %s", tname, luaL_typename(L, idx)); }
295
296static const char *check_utf8 (lua_State *L, int idx, const char **end) {
297 size_t len;
298 const char *s = luaL_checklstring(L, idx, &len);
299 if (end) *end = s+len;
300 return s;
301}
302
303static const char *to_utf8 (lua_State *L, int idx, const char **end) {
304 size_t len;
305 const char *s = lua_tolstring(L, idx, &len);
306 if (end) *end = s+len;
307 return s;
308}
309
310static const char *utf8_safe_decode (lua_State *L, const char *p, utfint *pval) {
311 p = utf8_decode(p, pval, 0);
312 if (p == NULL) luaL_error(L, "invalid UTF-8 code");
313 return p;
314}
315
316static void add_utf8char (luaL_Buffer *b, utfint ch) {
317 char buff[UTF8_BUFFSZ];
318 size_t n = utf8_encode(buff, ch);
319 luaL_addlstring(b, buff+UTF8_BUFFSZ-n, n);
320}
321
322static lua_Integer byte_relat (lua_Integer pos, size_t len) {
323 if (pos >= 0) return pos;
324 else if (0u - (size_t)pos > len) return 0;
325 else return (lua_Integer)len + pos + 1;
326}
327
328static int Lutf8_len (lua_State *L) {
329 size_t len, n;
330 const char *s = luaL_checklstring(L, 1, &len), *p, *e;
331 lua_Integer posi = byte_relat(luaL_optinteger(L, 2, 1), len);
332 lua_Integer pose = byte_relat(luaL_optinteger(L, 3, -1), len);
333 int lax = lua_toboolean(L, 4);
334 luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 2,
335 "initial position out of string");
336 luaL_argcheck(L, --pose < (lua_Integer)len, 3,
337 "final position out of string");
338 for (n = 0, p=s+posi, e=s+pose+1; p < e; ++n) {
339 if (lax)
340 p = utf8_next(p, e);
341 else {
342 utfint ch;
343 const char *np = utf8_decode(p, &ch, !lax);
344 if (np == NULL || utf8_invalid(ch)) {
345 lua_pushnil(L);
346 lua_pushinteger(L, p - s + 1);
347 return 2;
348 }
349 p = np;
350 }
351 }
352 lua_pushinteger(L, n);
353 return 1;
354}
355
356static int Lutf8_sub (lua_State *L) {
357 const char *e, *s = check_utf8(L, 1, &e);
358 lua_Integer posi = luaL_checkinteger(L, 2);
359 lua_Integer pose = luaL_optinteger(L, 3, -1);
360 if (utf8_range(s, e, &posi, &pose))
361 lua_pushlstring(L, s+posi, pose-posi);
362 else
363 lua_pushliteral(L, "");
364 return 1;
365}
366
367static int Lutf8_reverse (lua_State *L) {
368 luaL_Buffer b;
369 const char *prev, *pprev, *ends, *e, *s = check_utf8(L, 1, &e);
370 (void) ends;
371 int lax = lua_toboolean(L, 2);
372 luaL_buffinit(L, &b);
373 if (lax) {
374 for (prev = e; s < prev; e = prev) {
375 prev = utf8_prev(s, prev);
376 luaL_addlstring(&b, prev, e-prev);
377 }
378 } else {
379 for (prev = e; s < prev; prev = pprev) {
380 utfint code = 0;
381 ends = utf8_safe_decode(L, pprev = utf8_prev(s, prev), &code);
382 assert(ends == prev);
383 if (utf8_invalid(code))
384 return luaL_error(L, "invalid UTF-8 code");
385 if (!utf8_iscompose(code)) {
386 luaL_addlstring(&b, pprev, e-pprev);
387 e = pprev;
388 }
389 }
390 }
391 luaL_pushresult(&b);
392 return 1;
393}
394
395static int Lutf8_byte (lua_State *L) {
396 size_t n = 0;
397 const char *e, *s = check_utf8(L, 1, &e);
398 lua_Integer posi = luaL_optinteger(L, 2, 1);
399 lua_Integer pose = luaL_optinteger(L, 3, posi);
400 if (utf8_range(s, e, &posi, &pose)) {
401 for (e = s + pose, s = s + posi; s < e; ++n) {
402 utfint ch = 0;
403 s = utf8_safe_decode(L, s, &ch);
404 lua_pushinteger(L, ch);
405 }
406 }
407 return CAST(int, n);
408}
409
410static int Lutf8_codepoint (lua_State *L) {
411 const char *e, *s = check_utf8(L, 1, &e);
412 size_t len = e-s;
413 lua_Integer posi = byte_relat(luaL_optinteger(L, 2, 1), len);
414 lua_Integer pose = byte_relat(luaL_optinteger(L, 3, posi), len);
415 int lax = lua_toboolean(L, 4);
416 int n;
417 const char *se;
418 luaL_argcheck(L, posi >= 1, 2, "out of range");
419 luaL_argcheck(L, pose <= (lua_Integer)len, 3, "out of range");
420 if (posi > pose) return 0; /* empty interval; return no values */
421 if (pose - posi >= INT_MAX) /* (lua_Integer -> int) overflow? */
422 return luaL_error(L, "string slice too long");
423 n = (int)(pose - posi + 1);
424 luaL_checkstack(L, n, "string slice too long");
425 n = 0; /* count the number of returns */
426 se = s + pose; /* string end */
427 for (n = 0, s += posi - 1; s < se;) {
428 utfint code = 0;
429 s = utf8_safe_decode(L, s, &code);
430 if (!lax && utf8_invalid(code))
431 return luaL_error(L, "invalid UTF-8 code");
432 lua_pushinteger(L, code);
433 n++;
434 }
435 return n;
436}
437
438static int Lutf8_char (lua_State *L) {
439 int i, n = lua_gettop(L); /* number of arguments */
440 luaL_Buffer b;
441 luaL_buffinit(L, &b);
442 for (i = 1; i <= n; ++i) {
443 lua_Integer code = luaL_checkinteger(L, i);
444 luaL_argcheck(L, code <= UTF8_MAXCP, i, "value out of range");
445 add_utf8char(&b, CAST(utfint, code));
446 }
447 luaL_pushresult(&b);
448 return 1;
449}
450
/*
 * Generates the Lua binding Lutf8_<name> for each case converter.
 * The generated function accepts either
 *   - a number: returns utf8_to<name> applied to that code point, or
 *   - a string: returns a new string with every decoded character mapped
 *     through utf8_to<name> (raises "invalid UTF-8 code" on bad input).
 * Any other argument type raises a "number/string expected" error.
 */
#define bind_converter(name) \
static int Lutf8_##name (lua_State *L) { \
  int t = lua_type(L, 1); \
  if (t == LUA_TNUMBER) \
    lua_pushinteger(L, utf8_to##name(CAST(utfint, lua_tointeger(L, 1)))); \
  else if (t == LUA_TSTRING) { \
    luaL_Buffer b; \
    const char *e, *s = to_utf8(L, 1, &e); \
    luaL_buffinit(L, &b); \
    while (s < e) { \
      utfint ch = 0; \
      s = utf8_safe_decode(L, s, &ch); \
      add_utf8char(&b, utf8_to##name(ch)); \
    } \
    luaL_pushresult(&b); \
  } \
  else return typeerror(L, 1, "number/string"); \
  return 1; \
}
/* Instantiate Lutf8_lower, Lutf8_upper, Lutf8_title and Lutf8_fold. */
utf8_converters(bind_converter)
#undef bind_converter
472
473
474/* unicode extra interface */
475
476static const char *parse_escape (lua_State *L, const char *s, const char *e, int hex, utfint *pch) {
477 utfint code = 0;
478 int in_bracket = 0;
479 if (*s == '{') ++s, in_bracket = 1;
480 for (; s < e; ++s) {
481 utfint ch = (unsigned char)*s;
482 if (ch >= '0' && ch <= '9') ch = ch - '0';
483 else if (hex && ch >= 'A' && ch <= 'F') ch = 10 + (ch - 'A');
484 else if (hex && ch >= 'a' && ch <= 'f') ch = 10 + (ch - 'a');
485 else if (!in_bracket) break;
486 else if (ch == '}') { ++s; break; }
487 else luaL_error(L, "invalid escape '%c'", ch);
488 code *= hex ? 16 : 10;
489 code += ch;
490 }
491 *pch = code;
492 return s;
493}
494
495static int Lutf8_escape (lua_State *L) {
496 const char *e, *s = check_utf8(L, 1, &e);
497 luaL_Buffer b;
498 luaL_buffinit(L, &b);
499 while (s < e) {
500 utfint ch = 0;
501 s = utf8_safe_decode(L, s, &ch);
502 if (ch == '%') {
503 int hex = 0;
504 switch (*s) {
505 case '0': case '1': case '2': case '3':
506 case '4': case '5': case '6': case '7':
507 case '8': case '9': case '{':
508 break;
509 case 'x': case 'X': hex = 1; /* fall through */
510 case 'u': case 'U': if (s+1 < e) { ++s; break; }
511 /* fall through */
512 default:
513 s = utf8_safe_decode(L, s, &ch);
514 goto next;
515 }
516 s = parse_escape(L, s, e, hex, &ch);
517 }
518next:
519 add_utf8char(&b, ch);
520 }
521 luaL_pushresult(&b);
522 return 1;
523}
524
525static int Lutf8_insert (lua_State *L) {
526 const char *e, *s = check_utf8(L, 1, &e);
527 size_t sublen;
528 const char *subs;
529 luaL_Buffer b;
530 int nargs = 2;
531 const char *first = e;
532 if (lua_type(L, 2) == LUA_TNUMBER) {
533 int idx = (int)lua_tointeger(L, 2);
534 if (idx != 0) first = utf8_relat(s, e, idx);
535 luaL_argcheck(L, first, 2, "invalid index");
536 ++nargs;
537 }
538 subs = luaL_checklstring(L, nargs, &sublen);
539 luaL_buffinit(L, &b);
540 luaL_addlstring(&b, s, first-s);
541 luaL_addlstring(&b, subs, sublen);
542 luaL_addlstring(&b, first, e-first);
543 luaL_pushresult(&b);
544 return 1;
545}
546
547static int Lutf8_remove (lua_State *L) {
548 const char *e, *s = check_utf8(L, 1, &e);
549 lua_Integer posi = luaL_optinteger(L, 2, -1);
550 lua_Integer pose = luaL_optinteger(L, 3, -1);
551 if (!utf8_range(s, e, &posi, &pose))
552 lua_settop(L, 1);
553 else {
554 luaL_Buffer b;
555 luaL_buffinit(L, &b);
556 luaL_addlstring(&b, s, posi);
557 luaL_addlstring(&b, s+pose, e-s-pose);
558 luaL_pushresult(&b);
559 }
560 return 1;
561}
562
563static int push_offset (lua_State *L, const char *s, const char *e, lua_Integer offset, lua_Integer idx) {
564 utfint ch = 0;
565 const char *p;
566 if (idx != 0)
567 p = utf8_offset(s, e, offset, idx);
568 else if (p = s+offset-1, iscont(p))
569 p = utf8_prev(s, p);
570 if (p == NULL || p == e) return 0;
571 utf8_decode(p, &ch, 0);
572 lua_pushinteger(L, p-s+1);
573 lua_pushinteger(L, ch);
574 return 2;
575}
576
577static int Lutf8_charpos (lua_State *L) {
578 const char *e, *s = check_utf8(L, 1, &e);
579 lua_Integer offset = 1;
580 if (lua_isnoneornil(L, 3)) {
581 lua_Integer idx = luaL_optinteger(L, 2, 0);
582 if (idx > 0) --idx;
583 else if (idx < 0) offset = e-s+1;
584 return push_offset(L, s, e, offset, idx);
585 }
586 offset = byte_relat(luaL_optinteger(L, 2, 1), e-s);
587 if (offset < 1) offset = 1;
588 return push_offset(L, s, e, offset, luaL_checkinteger(L, 3));
589}
590
591static int Lutf8_offset (lua_State *L) {
592 size_t len;
593 const char *s = luaL_checklstring(L, 1, &len);
594 lua_Integer n = luaL_checkinteger(L, 2);
595 lua_Integer posi = (n >= 0) ? 1 : len + 1;
596 posi = byte_relat(luaL_optinteger(L, 3, posi), len);
597 luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 3,
598 "position out of range");
599 if (n == 0) {
600 /* find beginning of current byte sequence */
601 while (posi > 0 && iscont(s + posi)) posi--;
602 } else {
603 if (iscont(s + posi))
604 return luaL_error(L, "initial position is a continuation byte");
605 if (n < 0) {
606 while (n < 0 && posi > 0) { /* move back */
607 do { /* find beginning of previous character */
608 posi--;
609 } while (posi > 0 && iscont(s + posi));
610 n++;
611 }
612 } else {
613 n--; /* do not move for 1st character */
614 while (n > 0 && posi < (lua_Integer)len) {
615 do { /* find beginning of next character */
616 posi++;
617 } while (iscont(s + posi)); /* (cannot pass final '\0') */
618 n--;
619 }
620 }
621 }
622 if (n == 0) /* did it find given character? */
623 lua_pushinteger(L, posi + 1);
624 else /* no such character */
625 lua_pushnil(L);
626 return 1;
627}
628
629static int Lutf8_next (lua_State *L) {
630 const char *e, *s = check_utf8(L, 1, &e);
631 lua_Integer offset = byte_relat(luaL_optinteger(L, 2, 1), e-s);
632 lua_Integer idx = luaL_optinteger(L, 3, !lua_isnoneornil(L, 2));
633 return push_offset(L, s, e, offset, idx);
634}
635
636static int iter_aux (lua_State *L, int strict) {
637 const char *e, *s = check_utf8(L, 1, &e);
638 int n = CAST(int, lua_tointeger(L, 2));
639 const char *p = n <= 0 ? s : utf8_next(s+n-1, e);
640 if (p < e) {
641 utfint code = 0;
642 utf8_safe_decode(L, p, &code);
643 if (strict && utf8_invalid(code))
644 return luaL_error(L, "invalid UTF-8 code");
645 lua_pushinteger(L, p-s+1);
646 lua_pushinteger(L, code);
647 return 2;
648 }
649 return 0; /* no more codepoints */
650}
651
652static int iter_auxstrict (lua_State *L) { return iter_aux(L, 1); }
653static int iter_auxlax (lua_State *L) { return iter_aux(L, 0); }
654
655static int Lutf8_codes (lua_State *L) {
656 int lax = lua_toboolean(L, 2);
657 luaL_checkstring(L, 1);
658 lua_pushcfunction(L, lax ? iter_auxlax : iter_auxstrict);
659 lua_pushvalue(L, 1);
660 lua_pushinteger(L, 0);
661 return 3;
662}
663
664static int Lutf8_width (lua_State *L) {
665 int t = lua_type(L, 1);
666 int ambi_is_single = !lua_toboolean(L, 2);
667 int default_width = CAST(int, luaL_optinteger(L, 3, 0));
668 if (t == LUA_TNUMBER) {
669 size_t chwidth = utf8_width(CAST(utfint, lua_tointeger(L, 1)), ambi_is_single);
670 if (chwidth == 0) chwidth = default_width;
671 lua_pushinteger(L, (lua_Integer)chwidth);
672 } else if (t != LUA_TSTRING)
673 return typeerror(L, 1, "number/string");
674 else {
675 const char *e, *s = to_utf8(L, 1, &e);
676 int width = 0;
677 while (s < e) {
678 utfint ch = 0;
679 int chwidth;
680 s = utf8_safe_decode(L, s, &ch);
681 chwidth = utf8_width(ch, ambi_is_single);
682 width += chwidth == 0 ? default_width : chwidth;
683 }
684 lua_pushinteger(L, (lua_Integer)width);
685 }
686 return 1;
687}
688
689static int Lutf8_widthindex (lua_State *L) {
690 const char *e, *s = check_utf8(L, 1, &e);
691 int width = CAST(int, luaL_checkinteger(L, 2));
692 int ambi_is_single = !lua_toboolean(L, 3);
693 int default_width = CAST(int, luaL_optinteger(L, 4, 0));
694 size_t idx = 1;
695 while (s < e) {
696 utfint ch = 0;
697 size_t chwidth;
698 s = utf8_safe_decode(L, s, &ch);
699 chwidth = utf8_width(ch, ambi_is_single);
700 if (chwidth == 0) chwidth = default_width;
701 width -= CAST(int, chwidth);
702 if (width <= 0) {
703 lua_pushinteger(L, idx);
704 lua_pushinteger(L, width + chwidth);
705 lua_pushinteger(L, chwidth);
706 return 3;
707 }
708 ++idx;
709 }
710 lua_pushinteger(L, (lua_Integer)idx);
711 return 1;
712}
713
714static int Lutf8_ncasecmp (lua_State *L) {
715 const char *e1, *s1 = check_utf8(L, 1, &e1);
716 const char *e2, *s2 = check_utf8(L, 2, &e2);
717 while (s1 < e1 || s2 < e2) {
718 utfint ch1 = 0, ch2 = 0;
719 if (s1 == e1)
720 ch2 = 1;
721 else if (s2 == e2)
722 ch1 = 1;
723 else {
724 s1 = utf8_safe_decode(L, s1, &ch1);
725 s2 = utf8_safe_decode(L, s2, &ch2);
726 ch1 = utf8_tofold(ch1);
727 ch2 = utf8_tofold(ch2);
728 }
729 if (ch1 != ch2) {
730 lua_pushinteger(L, ch1 > ch2 ? 1 : -1);
731 return 1;
732 }
733 }
734 lua_pushinteger(L, 0);
735 return 1;
736}
737
738
739/* utf8 pattern matching implement */
740
/* Capacity of MatchState.capture; defined here only when not already
 * defined elsewhere. */
#ifndef LUA_MAXCAPTURES
# define LUA_MAXCAPTURES 32
#endif /* LUA_MAXCAPTURES */

/* Special values stored in a capture's `len` field:
 * CAP_UNFINISHED marks a capture that has been opened but not yet closed,
 * CAP_POSITION marks a position capture `()`. */
#define CAP_UNFINISHED (-1)
#define CAP_POSITION (-2)
748
/* State shared by the pattern-matching functions during one match attempt. */
typedef struct MatchState {
  int matchdepth; /* control for recursive depth (to avoid C stack overflow) */
  const char *src_init; /* init of source string */
  const char *src_end; /* end ('\0') of source string */
  const char *p_end; /* end ('\0') of pattern */
  lua_State *L; /* Lua state used for raising errors and pushing results */
  int level; /* total number of captures (finished or unfinished) */
  struct {
    const char *init; /* where the capture starts in the source string */
    ptrdiff_t len; /* byte length, or CAP_UNFINISHED / CAP_POSITION */
  } capture[LUA_MAXCAPTURES];
} MatchState;
761
/* recursive function; declared early because the expand/capture helpers
 * below call back into it */
static const char *match (MatchState *ms, const char *s, const char *p);

/* maximum recursion depth for 'match' */
#if !defined(MAXCCALLS)
#define MAXCCALLS 200
#endif

/* Escape character used in patterns. */
#define L_ESC '%'
/* Characters with a special meaning in patterns. */
#define SPECIALS "^$*+?.([%-"
772
773static int check_capture (MatchState *ms, int l) {
774 l -= '1';
775 if (l < 0 || l >= ms->level || ms->capture[l].len == CAP_UNFINISHED)
776 return luaL_error(ms->L, "invalid capture index %%%d", l + 1);
777 return l;
778}
779
780static int capture_to_close (MatchState *ms) {
781 int level = ms->level;
782 while (--level >= 0)
783 if (ms->capture[level].len == CAP_UNFINISHED) return level;
784 return luaL_error(ms->L, "invalid pattern capture");
785}
786
787static const char *classend (MatchState *ms, const char *p) {
788 utfint ch = 0;
789 p = utf8_safe_decode(ms->L, p, &ch);
790 switch (ch) {
791 case L_ESC: {
792 if (p == ms->p_end)
793 luaL_error(ms->L, "malformed pattern (ends with " LUA_QL("%%") ")");
794 return utf8_next(p, ms->p_end);
795 }
796 case '[': {
797 if (*p == '^') p++;
798 do { /* look for a `]' */
799 if (p == ms->p_end)
800 luaL_error(ms->L, "malformed pattern (missing " LUA_QL("]") ")");
801 if (*(p++) == L_ESC && p < ms->p_end)
802 p++; /* skip escapes (e.g. `%]') */
803 } while (*p != ']');
804 return p+1;
805 }
806 default: {
807 return p;
808 }
809 }
810}
811
812static int match_class (utfint c, utfint cl) {
813 int res;
814 switch (utf8_tolower(cl)) {
815#define X(cls, name) case cls: res = utf8_is##name(c); break;
816 utf8_categories(X)
817#undef X
818 case 'g' : res = utf8_isgraph(c); break;
819 case 'w' : res = utf8_isalnum(c); break;
820 case 'z' : res = (c == 0); break; /* deprecated option */
821 default: return (cl == c);
822 }
823 return (utf8_islower(cl) ? res : !res);
824}
825
826static int matchbracketclass (MatchState *ms, utfint c, const char *p, const char *ec) {
827 int sig = 1;
828 assert(*p == '[');
829 if (*++p == '^') {
830 sig = 0;
831 p++; /* skip the `^' */
832 }
833 while (p < ec) {
834 utfint ch = 0;
835 p = utf8_safe_decode(ms->L, p, &ch);
836 if (ch == L_ESC) {
837 p = utf8_safe_decode(ms->L, p, &ch);
838 if (match_class(c, ch))
839 return sig;
840 } else {
841 utfint next = 0;
842 const char *np = utf8_safe_decode(ms->L, p, &next);
843 if (next == '-' && np < ec) {
844 p = utf8_safe_decode(ms->L, np, &next);
845 if (ch <= c && c <= next)
846 return sig;
847 }
848 else if (ch == c) return sig;
849 }
850 }
851 return !sig;
852}
853
854static int singlematch (MatchState *ms, const char *s, const char *p, const char *ep) {
855 if (s >= ms->src_end)
856 return 0;
857 else {
858 utfint ch=0, pch=0;
859 utf8_safe_decode(ms->L, s, &ch);
860 p = utf8_safe_decode(ms->L, p, &pch);
861 switch (pch) {
862 case '.': return 1; /* matches any char */
863 case L_ESC: utf8_safe_decode(ms->L, p, &pch);
864 return match_class(ch, pch);
865 case '[': return matchbracketclass(ms, ch, p-1, ep-1);
866 default: return pch == ch;
867 }
868 }
869}
870
871static const char *matchbalance (MatchState *ms, const char *s, const char **p) {
872 utfint ch=0, begin=0, end=0;
873 *p = utf8_safe_decode(ms->L, *p, &begin);
874 if (*p >= ms->p_end)
875 luaL_error(ms->L, "malformed pattern "
876 "(missing arguments to " LUA_QL("%%b") ")");
877 *p = utf8_safe_decode(ms->L, *p, &end);
878 s = utf8_safe_decode(ms->L, s, &ch);
879 if (ch != begin) return NULL;
880 else {
881 int cont = 1;
882 while (s < ms->src_end) {
883 s = utf8_safe_decode(ms->L, s, &ch);
884 if (ch == end) {
885 if (--cont == 0) return s;
886 }
887 else if (ch == begin) cont++;
888 }
889 }
890 return NULL; /* string ends out of balance */
891}
892
893static const char *max_expand (MatchState *ms, const char *s, const char *p, const char *ep) {
894 const char *m = s; /* matched end of single match p */
895 while (singlematch(ms, m, p, ep))
896 m = utf8_next(m, ms->src_end);
897 /* keeps trying to match with the maximum repetitions */
898 while (s <= m) {
899 const char *res = match(ms, m, ep+1);
900 if (res) return res;
901 /* else didn't match; reduce 1 repetition to try again */
902 if (s == m) break;
903 m = utf8_prev(s, m);
904 }
905 return NULL;
906}
907
908static const char *min_expand (MatchState *ms, const char *s, const char *p, const char *ep) {
909 for (;;) {
910 const char *res = match(ms, s, ep+1);
911 if (res != NULL)
912 return res;
913 else if (singlematch(ms, s, p, ep))
914 s = utf8_next(s, ms->src_end); /* try with one more repetition */
915 else return NULL;
916 }
917}
918
919static const char *start_capture (MatchState *ms, const char *s, const char *p, int what) {
920 const char *res;
921 int level = ms->level;
922 if (level >= LUA_MAXCAPTURES) luaL_error(ms->L, "too many captures");
923 ms->capture[level].init = s;
924 ms->capture[level].len = what;
925 ms->level = level+1;
926 if ((res=match(ms, s, p)) == NULL) /* match failed? */
927 ms->level--; /* undo capture */
928 return res;
929}
930
931static const char *end_capture (MatchState *ms, const char *s, const char *p) {
932 int l = capture_to_close(ms);
933 const char *res;
934 ms->capture[l].len = s - ms->capture[l].init; /* close capture */
935 if ((res = match(ms, s, p)) == NULL) /* match failed? */
936 ms->capture[l].len = CAP_UNFINISHED; /* undo capture */
937 return res;
938}
939
940static const char *match_capture (MatchState *ms, const char *s, int l) {
941 size_t len;
942 l = check_capture(ms, l);
943 len = ms->capture[l].len;
944 if ((size_t)(ms->src_end-s) >= len &&
945 memcmp(ms->capture[l].init, s, len) == 0)
946 return s+len;
947 else return NULL;
948}
949
/*
 * Core matcher: try to match pattern p against the subject starting at s.
 * Returns the position just past the match, or NULL when there is none.
 *
 * ms->matchdepth is decremented on entry and restored on normal exit; when
 * it reaches zero "pattern too complex" is raised.  Tail calls are replaced
 * by `goto init` so they do not count against that depth.
 */
static const char *match (MatchState *ms, const char *s, const char *p) {
  if (ms->matchdepth-- == 0)
    luaL_error(ms->L, "pattern too complex");
  init: /* using goto's to optimize tail recursion */
  if (p != ms->p_end) { /* end of pattern? */
    utfint ch = 0;
    /* ch is the pattern character at p; p itself is not advanced here */
    utf8_safe_decode(ms->L, p, &ch);
    switch (ch) {
      case '(': { /* start capture */
        if (*(p + 1) == ')') /* position capture? */
          s = start_capture(ms, s, p + 2, CAP_POSITION);
        else
          s = start_capture(ms, s, p + 1, CAP_UNFINISHED);
        break;
      }
      case ')': { /* end capture */
        s = end_capture(ms, s, p + 1);
        break;
      }
      case '$': {
        if ((p + 1) != ms->p_end) /* is the `$' the last char in pattern? */
          goto dflt; /* no; go to default */
        s = (s == ms->src_end) ? s : NULL; /* check end of string */
        break;
      }
      case L_ESC: { /* escaped sequence not in the format class[*+?-]? */
        /* remember where the escape started so the default case can re-read
         * it as an ordinary class item */
        const char *prev_p = p;
        p = utf8_safe_decode(ms->L, p+1, &ch);
        switch (ch) {
          case 'b': { /* balanced string? */
            s = matchbalance(ms, s, &p);
            if (s != NULL)
              goto init; /* return match(ms, s, p + 4); */
            /* else fail (s == NULL) */
            break;
          }
          case 'f': { /* frontier? */
            const char *ep; utfint previous = 0, current = 0;
            if (*p != '[')
              luaL_error(ms->L, "missing " LUA_QL("[") " after "
                                LUA_QL("%%f") " in pattern");
            ep = classend(ms, p); /* points to what is next */
            /* previous/current stay 0 at the subject boundaries */
            if (s != ms->src_init)
              utf8_decode(utf8_prev(ms->src_init, s), &previous, 0);
            if (s != ms->src_end)
              utf8_decode(s, &current, 0);
            if (!matchbracketclass(ms, previous, p, ep - 1) &&
                 matchbracketclass(ms, current, p, ep - 1)) {
              p = ep; goto init; /* return match(ms, s, ep); */
            }
            s = NULL; /* match failed */
            break;
          }
          case '0': case '1': case '2': case '3':
          case '4': case '5': case '6': case '7':
          case '8': case '9': { /* capture results (%0-%9)? */
            s = match_capture(ms, s, ch);
            if (s != NULL) goto init; /* return match(ms, s, p + 2) */
            break;
          }
          default: p = prev_p; goto dflt;
        }
        break;
      }
      default: dflt: { /* pattern class plus optional suffix */
        const char *ep = classend(ms, p); /* points to optional suffix */
        /* does not match at least once? */
        if (!singlematch(ms, s, p, ep)) {
          if (*ep == '*' || *ep == '?' || *ep == '-') { /* accept empty? */
            p = ep + 1; goto init; /* return match(ms, s, ep + 1); */
          } else /* '+' or no suffix */
            s = NULL; /* fail */
        } else { /* matched once */
          /* subject position after the one character just matched */
          const char *next_s = utf8_next(s, ms->src_end);
          switch (*ep) { /* handle optional suffix */
            case '?': { /* optional */
              const char *res;
              const char *next_ep = utf8_next(ep, ms->p_end);
              if ((res = match(ms, next_s, next_ep)) != NULL)
                s = res;
              else {
                p = next_ep; goto init; /* else return match(ms, s, ep + 1); */
              }
              break;
            }
            case '+': /* 1 or more repetitions */
              s = next_s; /* 1 match already done */
              /* fall through */
            case '*': /* 0 or more repetitions */
              s = max_expand(ms, s, p, ep);
              break;
            case '-': /* 0 or more repetitions (minimum) */
              s = min_expand(ms, s, p, ep);
              break;
            default: /* no suffix */
              s = next_s; p = ep; goto init; /* return match(ms, s + 1, ep); */
          }
        }
        break;
      }
    }
  }
  /* leaving this level: give the recursion budget back */
  ms->matchdepth++;
  return s;
}
1055
/*
** Locate the first occurrence of the byte string s2 (length l2) inside
** s1 (length l1).  Embedded zero bytes are handled because only memchr
** and memcmp are used.  Returns a pointer into s1, or NULL if s2 does
** not occur.  An empty s2 matches at s1 itself.
*/
static const char *lmemfind (const char *s1, size_t l1, const char *s2, size_t l2) {
  size_t tail;    /* bytes of s2 after its first byte */
  size_t window;  /* bytes of s1 where a match may still start */
  if (l2 == 0)
    return s1;  /* empty strings are everywhere */
  if (l2 > l1)
    return NULL;  /* needle longer than haystack */
  tail = l2 - 1;
  window = l1 - tail;
  while (window > 0) {
    /* memchr finds a candidate first byte; memcmp verifies the rest */
    const char *hit = (const char *)memchr(s1, *s2, window);
    if (hit == NULL)
      break;
    if (memcmp(hit + 1, s2 + 1, tail) == 0)
      return hit;
    /* resume the scan just after the rejected candidate */
    window -= (size_t)(hit + 1 - s1);
    s1 = hit + 1;
  }
  return NULL;  /* not found */
}
1075
/*
** Count how many utf8_next() steps, starting at s and bounded by e, are
** taken before reaching or passing p.  If the walk lands exactly on p
** the step count is returned; otherwise (p was overshot, or e was hit
** first) one less than the step count is returned.
*/
static int get_index (const char *p, const char *s, const char *e) {
  int steps = 0;
  while (s < p && s < e) {
    s = utf8_next(s, e);
    steps++;
  }
  if (s != p)
    return steps - 1;
  return steps;
}
1082
1083static void push_onecapture (MatchState *ms, int i, const char *s, const char *e) {
1084 if (i >= ms->level) {
1085 if (i == 0) /* ms->level == 0, too */
1086 lua_pushlstring(ms->L, s, e - s); /* add whole match */
1087 else
1088 luaL_error(ms->L, "invalid capture index");
1089 } else {
1090 ptrdiff_t l = ms->capture[i].len;
1091 if (l == CAP_UNFINISHED) luaL_error(ms->L, "unfinished capture");
1092 if (l == CAP_POSITION) {
1093 int idx = get_index(ms->capture[i].init, ms->src_init, ms->src_end);
1094 lua_pushinteger(ms->L, idx+1);
1095 } else
1096 lua_pushlstring(ms->L, ms->capture[i].init, l);
1097 }
1098}
1099
1100static int push_captures (MatchState *ms, const char *s, const char *e) {
1101 int i;
1102 int nlevels = (ms->level == 0 && s) ? 1 : ms->level;
1103 luaL_checkstack(ms->L, nlevels, "too many captures");
1104 for (i = 0; i < nlevels; i++)
1105 push_onecapture(ms, i, s, e);
1106 return nlevels; /* number of strings pushed */
1107}
1108
1109/* check whether pattern has no special characters */
1110static int nospecials (const char *p, const char * ep) {
1111 while (p < ep) {
1112 if (strpbrk(p, SPECIALS))
1113 return 0; /* pattern has a special character */
1114 p += strlen(p) + 1; /* may have more after \0 */
1115 }
1116 return 1; /* no special chars found */
1117}
1118
1119
1120/* utf8 pattern matching interface */
1121
1122static int find_aux (lua_State *L, int find) {
1123 const char *es, *s = check_utf8(L, 1, &es);
1124 const char *ep, *p = check_utf8(L, 2, &ep);
1125 lua_Integer idx = luaL_optinteger(L, 3, 1);
1126 const char *init;
1127 if (!idx) idx = 1;
1128 init = utf8_relat(s, es, CAST(int, idx));
1129 if (init == NULL) {
1130 if (idx > 0) {
1131 lua_pushnil(L); /* cannot find anything */
1132 return 1;
1133 }
1134 init = s;
1135 }
1136 /* explicit request or no special characters? */
1137 if (find && (lua_toboolean(L, 4) || nospecials(p, ep))) {
1138 /* do a plain search */
1139 const char *s2 = lmemfind(init, es-init, p, ep-p);
1140 if (s2) {
1141 const char *e2 = s2 + (ep - p);
1142 if (iscont(e2)) e2 = utf8_next(e2, es);
1143 lua_pushinteger(L, idx = get_index(s2, s, es) + 1);
1144 lua_pushinteger(L, idx + get_index(e2, s2, es) - 1);
1145 return 2;
1146 }
1147 } else {
1148 MatchState ms;
1149 int anchor = (*p == '^');
1150 if (anchor) p++; /* skip anchor character */
1151 if (idx < 0) idx += utf8_length(s, es)+1; /* TODO not very good */
1152 ms.L = L;
1153 ms.matchdepth = MAXCCALLS;
1154 ms.src_init = s;
1155 ms.src_end = es;
1156 ms.p_end = ep;
1157 do {
1158 const char *res;
1159 ms.level = 0;
1160 assert(ms.matchdepth == MAXCCALLS);
1161 if ((res=match(&ms, init, p)) != NULL) {
1162 if (find) {
1163 lua_pushinteger(L, idx); /* start */
1164 lua_pushinteger(L, idx + utf8_length(init, res) - 1); /* end */
1165 return push_captures(&ms, NULL, 0) + 2;
1166 } else
1167 return push_captures(&ms, init, res);
1168 }
1169 if (init == es) break;
1170 idx += 1;
1171 init = utf8_next(init, es);
1172 } while (init <= es && !anchor);
1173 }
1174 lua_pushnil(L); /* not found */
1175 return 1;
1176}
1177
1178static int Lutf8_find (lua_State *L) { return find_aux(L, 1); }
1179static int Lutf8_match (lua_State *L) { return find_aux(L, 0); }
1180
1181static int gmatch_aux (lua_State *L) {
1182 MatchState ms;
1183 const char *es, *s = check_utf8(L, lua_upvalueindex(1), &es);
1184 const char *ep, *p = check_utf8(L, lua_upvalueindex(2), &ep);
1185 const char *src;
1186 ms.L = L;
1187 ms.matchdepth = MAXCCALLS;
1188 ms.src_init = s;
1189 ms.src_end = es;
1190 ms.p_end = ep;
1191 for (src = s + (size_t)lua_tointeger(L, lua_upvalueindex(3));
1192 src <= ms.src_end;
1193 src = utf8_next(src, ms.src_end)) {
1194 const char *e;
1195 ms.level = 0;
1196 assert(ms.matchdepth == MAXCCALLS);
1197 if ((e = match(&ms, src, p)) != NULL) {
1198 lua_Integer newstart = e-s;
1199 if (e == src) newstart++; /* empty match? go at least one position */
1200 lua_pushinteger(L, newstart);
1201 lua_replace(L, lua_upvalueindex(3));
1202 return push_captures(&ms, src, e);
1203 }
1204 if (src == ms.src_end) break;
1205 }
1206 return 0; /* not found */
1207}
1208
1209static int Lutf8_gmatch (lua_State *L) {
1210 luaL_checkstring(L, 1);
1211 luaL_checkstring(L, 2);
1212 lua_settop(L, 2);
1213 lua_pushinteger(L, 0);
1214 lua_pushcclosure(L, gmatch_aux, 3);
1215 return 1;
1216}
1217
1218static void add_s (MatchState *ms, luaL_Buffer *b, const char *s, const char *e) {
1219 const char *new_end, *news = to_utf8(ms->L, 3, &new_end);
1220 while (news < new_end) {
1221 utfint ch = 0;
1222 news = utf8_safe_decode(ms->L, news, &ch);
1223 if (ch != L_ESC)
1224 add_utf8char(b, ch);
1225 else {
1226 news = utf8_safe_decode(ms->L, news, &ch); /* skip ESC */
1227 if (!utf8_isdigit(ch)) {
1228 if (ch != L_ESC)
1229 luaL_error(ms->L, "invalid use of " LUA_QL("%c")
1230 " in replacement string", L_ESC);
1231 add_utf8char(b, ch);
1232 } else if (ch == '0')
1233 luaL_addlstring(b, s, e-s);
1234 else {
1235 push_onecapture(ms, ch-'1', s, e);
1236 luaL_addvalue(b); /* add capture to accumulated result */
1237 }
1238 }
1239 }
1240}
1241
1242static void add_value (MatchState *ms, luaL_Buffer *b, const char *s, const char *e, int tr) {
1243 lua_State *L = ms->L;
1244 switch (tr) {
1245 case LUA_TFUNCTION: {
1246 int n;
1247 lua_pushvalue(L, 3);
1248 n = push_captures(ms, s, e);
1249 lua_call(L, n, 1);
1250 break;
1251 }
1252 case LUA_TTABLE: {
1253 push_onecapture(ms, 0, s, e);
1254 lua_gettable(L, 3);
1255 break;
1256 }
1257 default: { /* LUA_TNUMBER or LUA_TSTRING */
1258 add_s(ms, b, s, e);
1259 return;
1260 }
1261 }
1262 if (!lua_toboolean(L, -1)) { /* nil or false? */
1263 lua_pop(L, 1);
1264 lua_pushlstring(L, s, e - s); /* keep original text */
1265 } else if (!lua_isstring(L, -1))
1266 luaL_error(L, "invalid replacement value (a %s)", luaL_typename(L, -1));
1267 luaL_addvalue(b); /* add result to accumulator */
1268}
1269
1270static int Lutf8_gsub (lua_State *L) {
1271 const char *es, *s = check_utf8(L, 1, &es);
1272 const char *ep, *p = check_utf8(L, 2, &ep);
1273 int tr = lua_type(L, 3);
1274 lua_Integer max_s = luaL_optinteger(L, 4, (es-s)+1);
1275 int anchor = (*p == '^');
1276 lua_Integer n = 0;
1277 MatchState ms;
1278 luaL_Buffer b;
1279 luaL_argcheck(L, tr == LUA_TNUMBER || tr == LUA_TSTRING ||
1280 tr == LUA_TFUNCTION || tr == LUA_TTABLE, 3,
1281 "string/function/table expected");
1282 luaL_buffinit(L, &b);
1283 if (anchor) p++; /* skip anchor character */
1284 ms.L = L;
1285 ms.matchdepth = MAXCCALLS;
1286 ms.src_init = s;
1287 ms.src_end = es;
1288 ms.p_end = ep;
1289 while (n < max_s) {
1290 const char *e;
1291 ms.level = 0;
1292 assert(ms.matchdepth == MAXCCALLS);
1293 e = match(&ms, s, p);
1294 if (e) {
1295 n++;
1296 add_value(&ms, &b, s, e, tr);
1297 }
1298 if (e && e > s) /* non empty match? */
1299 s = e; /* skip it */
1300 else if (s < es) {
1301 utfint ch = 0;
1302 s = utf8_safe_decode(L, s, &ch);
1303 add_utf8char(&b, ch);
1304 } else break;
1305 if (anchor) break;
1306 }
1307 luaL_addlstring(&b, s, es-s);
1308 luaL_pushresult(&b);
1309 lua_pushinteger(L, n); /* number of substitutions */
1310 return 2;
1311}
1312
1313static int Lutf8_isvalid(lua_State *L) {
1314 const char *e, *s = check_utf8(L, 1, &e);
1315 const char *invalid = utf8_invalid_offset(s, e);
1316 lua_pushboolean(L, invalid == NULL);
1317 return 1;
1318}
1319
1320static int Lutf8_invalidoffset(lua_State *L) {
1321 const char *e, *s = check_utf8(L, 1, &e);
1322 const char *orig_s = s;
1323 int offset = luaL_optinteger(L, 2, 0);
1324 if (offset > 1) {
1325 offset--;
1326 s += offset;
1327 if (s >= e) {
1328 lua_pushnil(L);
1329 return 1;
1330 }
1331 } else if (offset < 0 && s - e < offset) {
1332 s = e + offset;
1333 }
1334 const char *invalid = utf8_invalid_offset(s, e);
1335 if (invalid == NULL) {
1336 lua_pushnil(L);
1337 } else {
1338 lua_pushinteger(L, invalid - orig_s + 1);
1339 }
1340 return 1;
1341}
1342
1343static int Lutf8_clean(lua_State *L) {
1344 const char *e, *s = check_utf8(L, 1, &e);
1345
1346 /* Default replacement string is REPLACEMENT CHARACTER U+FFFD */
1347 size_t repl_len;
1348 const char *r = luaL_optlstring(L, 2, "\xEF\xBF\xBD", &repl_len);
1349
1350 if (lua_gettop(L) > 1) {
1351 /* Check if replacement string is valid UTF-8 or not */
1352 if (utf8_invalid_offset(r, r + repl_len) != NULL) {
1353 lua_pushstring(L, "replacement string must be valid UTF-8");
1354 lua_error(L);
1355 }
1356 }
1357
1358 const char *invalid = utf8_invalid_offset(s, e);
1359 if (invalid == NULL) {
1360 lua_settop(L, 1); /* Return input string without modification */
1361 lua_pushboolean(L, 1); /* String was clean already */
1362 return 2;
1363 }
1364
1365 luaL_Buffer buff;
1366 luaL_buffinit(L, &buff);
1367
1368 while (1) {
1369 /* Invariant: 's' points to first GOOD byte not in output buffer,
1370 * 'invalid' points to first BAD byte after that */
1371 luaL_addlstring(&buff, s, invalid - s);
1372 luaL_addlstring(&buff, r, repl_len);
1373 /* We do not replace every bad byte with the replacement character,
1374 * but rather a contiguous sequence of bad bytes
1375 * Restore the invariant by stepping forward until we find at least
1376 * one good byte */
1377 s = invalid;
1378 while (s == invalid) {
1379 s++;
1380 invalid = utf8_invalid_offset(s, e);
1381 }
1382 if (invalid == NULL) {
1383 luaL_addlstring(&buff, s, e - s);
1384 luaL_pushresult(&buff);
1385 lua_pushboolean(L, 0); /* String was not clean */
1386 return 2;
1387 }
1388 }
1389}
1390
1391/* lua module import interface */
1392
/*
** Lua pattern exported as utf8.charpattern: one lead byte (0x00-0x7F or
** 0xC2-0xF4) followed by any number of continuation bytes (0x80-0xBF).
** Only the lead-byte class differs between Lua versions: before 5.2 the
** zero byte is written as %z instead of a literal '\0'.
*/
#if LUA_VERSION_NUM >= 502
# define UTF8PATT_LEAD "[\0-\x7F\xC2-\xF4]"
#else
# define UTF8PATT_LEAD "[%z\1-\x7F\xC2-\xF4]"
#endif
static const char UTF8PATT[] = UTF8PATT_LEAD "[\x80-\xBF]*";
#undef UTF8PATT_LEAD
1398
1399int luaopen_utf8 (lua_State *L) {
1400 luaL_Reg libs[] = {
1401#define ENTRY(name) { #name, Lutf8_##name }
1402 ENTRY(offset),
1403 ENTRY(codes),
1404 ENTRY(codepoint),
1405
1406 ENTRY(len),
1407 ENTRY(sub),
1408 ENTRY(reverse),
1409 ENTRY(lower),
1410 ENTRY(upper),
1411 ENTRY(title),
1412 ENTRY(fold),
1413 ENTRY(byte),
1414 ENTRY(char),
1415 ENTRY(escape),
1416 ENTRY(insert),
1417 ENTRY(remove),
1418 ENTRY(charpos),
1419 ENTRY(next),
1420 ENTRY(width),
1421 ENTRY(widthindex),
1422 ENTRY(ncasecmp),
1423 ENTRY(find),
1424 ENTRY(gmatch),
1425 ENTRY(gsub),
1426 ENTRY(match),
1427 ENTRY(isvalid),
1428 ENTRY(invalidoffset),
1429 ENTRY(clean),
1430#undef ENTRY
1431 { NULL, NULL }
1432 };
1433
1434#if LUA_VERSION_NUM >= 502
1435 luaL_newlib(L, libs);
1436#else
1437 luaL_register(L, "utf8", libs);
1438#endif
1439
1440 lua_pushlstring(L, UTF8PATT, sizeof(UTF8PATT)-1);
1441 lua_setfield(L, -2, "charpattern");
1442
1443 return 1;
1444}
1445
1446/* win32cc: flags+='-Wall -Wextra -s -O2 -mdll -DLUA_BUILD_AS_DLL'
1447 * win32cc: libs+='-llua54.dll' output='lua-utf8.dll'
1448 * win32cc: run='lua.exe test.lua'
1449 * maccc: run='lua -- test_compat.lua'
1450 * maccc: flags+='-g --coverage -bundle -undefined dynamic_lookup' output='lua-utf8.so' */
static const double c[]
Definition rng.c:264