/* Largest value the encoder accepts: utf8_encode asserts x <= UTF8_MAX, and
   utf8_decode compares its decoded result against the same bound. */
39#define UTF8_MAX 0x7FFFFFFFu
/* Upper bound used by the validity checks: utf8_invalid reports any value
   above it as invalid, and Lutf8_char rejects arguments above it. */
40#define UTF8_MAXCP 0x10FFFFu
/* Nonzero when the byte that p points at has the bit pattern 10xxxxxx
   (top two bits are 1 and 0).  p must be dereferenceable. */
41#define iscont(p) ((*(p) & 0xC0) == 0x80)
/* Plain C cast of expr to type tp. */
42#define CAST(tp,expr) ((tp)(expr))
/* Surrounds the string literal x with single quotes by adjacent-literal
   concatenation; used when composing luaL_error messages. */
45# define LUA_QL(x) "'" x "'"
48static int utf8_invalid (utfint ch)
49{
return (ch > UTF8_MAXCP || (0xD800u <= ch && ch <= 0xDFFFu)); }
51static size_t utf8_encode (
char *buff, utfint x) {
53 lua_assert(x <= UTF8_MAX);
55 buff[UTF8_BUFFSZ - 1] = x & 0x7F;
59 buff[UTF8_BUFFSZ - (n++)] = 0x80 | (x & 0x3f);
63 buff[UTF8_BUFFSZ - n] = ((~mfb << 1) | x) & 0xFF;
68static const char *utf8_decode (
const char *s, utfint *val,
int strict) {
69 static const utfint limits[] =
70 {~0u, 0x80u, 0x800u, 0x10000u, 0x200000u, 0x4000000u};
71 unsigned int c = (
unsigned char)s[0];
77 for (;
c & 0x40;
c <<= 1) {
78 unsigned int cc = (
unsigned char)s[++count];
79 if ((cc & 0xC0) != 0x80)
81 res = (res << 6) | (cc & 0x3F);
83 res |= ((utfint)(
c & 0x7F) << (count * 5));
84 if (count > 5 || res > UTF8_MAX || res < limits[count])
90 if (res > UTF8_MAXCP || (0xD800u <= res && res <= 0xDFFFu))
97static const char *utf8_prev (
const char *s,
const char *e) {
98 while (s < e && iscont(e - 1)) --e;
99 return s < e ? e - 1 : s;
102static const char *utf8_next (
const char *s,
const char *e) {
103 while (s < e && iscont(s + 1)) ++s;
104 return s < e ? s + 1 : e;
107static size_t utf8_length (
const char *s,
const char *e) {
109 for (i = 0; s < e; ++i)
114static const char *utf8_offset (
const char *s,
const char *e, lua_Integer offset, lua_Integer idx) {
115 const char *p = s + offset - 1;
117 while (p < e && idx > 0)
118 p = utf8_next(p, e), --idx;
119 return idx == 0 ? p : NULL;
121 while (s < p && idx < 0)
122 p = utf8_prev(s, p), ++idx;
123 return idx == 0 ? p : NULL;
127static const char *utf8_relat (
const char *s,
const char *e,
int idx) {
129 utf8_offset(s, e, 1, idx - 1) :
130 utf8_offset(s, e, e-s+1, idx);
133static int utf8_range(
const char *s,
const char *e, lua_Integer *i, lua_Integer *j) {
134 const char *ps = utf8_relat(s, e, CAST(
int, *i));
135 const char *pe = utf8_relat(s, e, CAST(
int, *j));
136 *i = (ps ? ps : (*i > 0 ? e : s)) - s;
137 *j = (pe ? utf8_next(pe, e) : (*j > 0 ? e : s)) - s;
142static uint8_t utf8_code_unit_len[] = {
143 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, 2, 2, 3, 4
147static const char *utf8_invalid_offset(
const char *s,
const char *e) {
157 uint8_t needed_bytes = utf8_code_unit_len[
c >> 4];
158 if (e - s < needed_bytes)
161 if ((c2 & 0xC0) != 0x80)
163 if (needed_bytes >= 3) {
165 if ((c3 & 0xC0) != 0x80)
167 if (needed_bytes == 3) {
168 if (
c == 0xE0 && c2 < 0xA0)
170 if (
c == 0xED && c2 >= 0xA0)
174 if ((c4 & 0xC0) != 0x80)
176 if (
c == 0xF0 && c2 < 0x90)
178 if (
c == 0xF4 && c2 >= 0x90)
/* Number of elements in the array t (total size divided by element size).
   Only meaningful when t is a real array, not a pointer; this file applies
   it to the *_table lookup arrays. */
192#define table_size(t) (sizeof(t)/sizeof((t)[0]))
194#define utf8_categories(X) \
205#define utf8_converters(X) \
211static int find_in_range (
range_table *t,
size_t size, utfint ch) {
217 while (begin < end) {
218 size_t mid = (begin + end) / 2;
219 if (t[mid].last < ch)
221 else if (t[mid].first > ch)
224 return (ch - t[mid].first) % t[mid].step == 0;
230static int convert_char (
conv_table *t,
size_t size, utfint ch) {
236 while (begin < end) {
237 size_t mid = (begin + end) / 2;
238 if (t[mid].last < ch)
240 else if (t[mid].first > ch)
242 else if ((ch - t[mid].first) % t[mid].step == 0)
243 return ch + t[mid].offset;
/* Generator for a category predicate: expands to a static function
   utf8_is<name>(ch) that returns find_in_range() over <name>_table.
   The cls argument is accepted but not used in this expansion. */
251#define define_category(cls, name) static int utf8_is##name (utfint ch)\
252{ return find_in_range(name##_table, table_size(name##_table), ch); }
/* Generator for a converter: expands to a static function utf8_to<name>(ch)
   that returns convert_char() over to<name>_table. */
253#define define_converter(name) static utfint utf8_to##name (utfint ch) \
254{ return convert_char(to##name##_table, table_size(to##name##_table), ch); }
/* Apply each generator to every entry of the corresponding X-macro list,
   emitting one function per entry. */
255utf8_categories(define_category)
256utf8_converters(define_converter)
/* The generators are only needed for the two expansions above. */
257#undef define_category
258#undef define_converter
260static int utf8_isgraph (utfint ch) {
261 if (find_in_range(space_table, table_size(space_table), ch))
263 if (find_in_range(graph_table, table_size(graph_table), ch))
265 if (find_in_range(compose_table, table_size(compose_table), ch))
270static int utf8_isalnum (utfint ch) {
271 if (find_in_range(alpha_table, table_size(alpha_table), ch))
273 if (find_in_range(alnum_extend_table, table_size(alnum_extend_table), ch))
278static int utf8_width (utfint ch,
int ambi_is_single) {
279 if (find_in_range(doublewidth_table, table_size(doublewidth_table), ch))
281 if (find_in_range(ambiwidth_table, table_size(ambiwidth_table), ch))
282 return ambi_is_single ? 1 : 2;
283 if (find_in_range(compose_table, table_size(compose_table), ch))
285 if (find_in_range(unprintable_table, table_size(unprintable_table), ch))
293static int typeerror (lua_State *L,
int idx,
const char *tname)
294{
return luaL_error(L,
"%s expected, got %s", tname, luaL_typename(L, idx)); }
296static const char *check_utf8 (lua_State *L,
int idx,
const char **end) {
298 const char *s = luaL_checklstring(L, idx, &len);
299 if (end) *end = s+len;
303static const char *to_utf8 (lua_State *L,
int idx,
const char **end) {
305 const char *s = lua_tolstring(L, idx, &len);
306 if (end) *end = s+len;
310static const char *utf8_safe_decode (lua_State *L,
const char *p, utfint *pval) {
311 p = utf8_decode(p, pval, 0);
312 if (p == NULL) luaL_error(L,
"invalid UTF-8 code");
316static void add_utf8char (luaL_Buffer *b, utfint ch) {
317 char buff[UTF8_BUFFSZ];
318 size_t n = utf8_encode(buff, ch);
319 luaL_addlstring(b, buff+UTF8_BUFFSZ-n, n);
322static lua_Integer byte_relat (lua_Integer pos,
size_t len) {
323 if (pos >= 0)
return pos;
324 else if (0u - (
size_t)pos > len)
return 0;
325 else return (lua_Integer)len + pos + 1;
328static int Lutf8_len (lua_State *L) {
330 const char *s = luaL_checklstring(L, 1, &len), *p, *e;
331 lua_Integer posi = byte_relat(luaL_optinteger(L, 2, 1), len);
332 lua_Integer pose = byte_relat(luaL_optinteger(L, 3, -1), len);
333 int lax = lua_toboolean(L, 4);
334 luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 2,
335 "initial position out of string");
336 luaL_argcheck(L, --pose < (lua_Integer)len, 3,
337 "final position out of string");
338 for (n = 0, p=s+posi, e=s+pose+1; p < e; ++n) {
343 const char *np = utf8_decode(p, &ch, !lax);
344 if (np == NULL || utf8_invalid(ch)) {
346 lua_pushinteger(L, p - s + 1);
352 lua_pushinteger(L, n);
356static int Lutf8_sub (lua_State *L) {
357 const char *e, *s = check_utf8(L, 1, &e);
358 lua_Integer posi = luaL_checkinteger(L, 2);
359 lua_Integer pose = luaL_optinteger(L, 3, -1);
360 if (utf8_range(s, e, &posi, &pose))
361 lua_pushlstring(L, s+posi, pose-posi);
363 lua_pushliteral(L,
"");
367static int Lutf8_reverse (lua_State *L) {
369 const char *prev, *pprev, *ends, *e, *s = check_utf8(L, 1, &e);
371 int lax = lua_toboolean(L, 2);
372 luaL_buffinit(L, &b);
374 for (prev = e; s < prev; e = prev) {
375 prev = utf8_prev(s, prev);
376 luaL_addlstring(&b, prev, e-prev);
379 for (prev = e; s < prev; prev = pprev) {
381 ends = utf8_safe_decode(L, pprev = utf8_prev(s, prev), &code);
382 assert(ends == prev);
383 if (utf8_invalid(code))
384 return luaL_error(L,
"invalid UTF-8 code");
385 if (!utf8_iscompose(code)) {
386 luaL_addlstring(&b, pprev, e-pprev);
395static int Lutf8_byte (lua_State *L) {
397 const char *e, *s = check_utf8(L, 1, &e);
398 lua_Integer posi = luaL_optinteger(L, 2, 1);
399 lua_Integer pose = luaL_optinteger(L, 3, posi);
400 if (utf8_range(s, e, &posi, &pose)) {
401 for (e = s + pose, s = s + posi; s < e; ++n) {
403 s = utf8_safe_decode(L, s, &ch);
404 lua_pushinteger(L, ch);
410static int Lutf8_codepoint (lua_State *L) {
411 const char *e, *s = check_utf8(L, 1, &e);
413 lua_Integer posi = byte_relat(luaL_optinteger(L, 2, 1), len);
414 lua_Integer pose = byte_relat(luaL_optinteger(L, 3, posi), len);
415 int lax = lua_toboolean(L, 4);
418 luaL_argcheck(L, posi >= 1, 2,
"out of range");
419 luaL_argcheck(L, pose <= (lua_Integer)len, 3,
"out of range");
420 if (posi > pose)
return 0;
421 if (pose - posi >= INT_MAX)
422 return luaL_error(L,
"string slice too long");
423 n = (int)(pose - posi + 1);
424 luaL_checkstack(L, n,
"string slice too long");
427 for (n = 0, s += posi - 1; s < se;) {
429 s = utf8_safe_decode(L, s, &code);
430 if (!lax && utf8_invalid(code))
431 return luaL_error(L,
"invalid UTF-8 code");
432 lua_pushinteger(L, code);
438static int Lutf8_char (lua_State *L) {
439 int i, n = lua_gettop(L);
441 luaL_buffinit(L, &b);
442 for (i = 1; i <= n; ++i) {
443 lua_Integer code = luaL_checkinteger(L, i);
444 luaL_argcheck(L, code <= UTF8_MAXCP, i,
"value out of range");
445 add_utf8char(&b, CAST(utfint, code));
451#define bind_converter(name) \
452static int Lutf8_##name (lua_State *L) { \
453 int t = lua_type(L, 1); \
454 if (t == LUA_TNUMBER) \
455 lua_pushinteger(L, utf8_to##name(CAST(utfint, lua_tointeger(L, 1)))); \
456 else if (t == LUA_TSTRING) { \
458 const char *e, *s = to_utf8(L, 1, &e); \
459 luaL_buffinit(L, &b); \
462 s = utf8_safe_decode(L, s, &ch); \
463 add_utf8char(&b, utf8_to##name(ch)); \
465 luaL_pushresult(&b); \
467 else return typeerror(L, 1, "number/string"); \
470utf8_converters(bind_converter)
476static const char *parse_escape (lua_State *L,
const char *s,
const char *e,
int hex, utfint *pch) {
479 if (*s ==
'{') ++s, in_bracket = 1;
481 utfint ch = (
unsigned char)*s;
482 if (ch >=
'0' && ch <=
'9') ch = ch -
'0';
483 else if (hex && ch >=
'A' && ch <=
'F') ch = 10 + (ch -
'A');
484 else if (hex && ch >=
'a' && ch <=
'f') ch = 10 + (ch -
'a');
485 else if (!in_bracket)
break;
486 else if (ch ==
'}') { ++s;
break; }
487 else luaL_error(L,
"invalid escape '%c'", ch);
488 code *= hex ? 16 : 10;
495static int Lutf8_escape (lua_State *L) {
496 const char *e, *s = check_utf8(L, 1, &e);
498 luaL_buffinit(L, &b);
501 s = utf8_safe_decode(L, s, &ch);
505 case '0':
case '1':
case '2':
case '3':
506 case '4':
case '5':
case '6':
case '7':
507 case '8':
case '9':
case '{':
509 case 'x':
case 'X': hex = 1;
510 case 'u':
case 'U':
if (s+1 < e) { ++s;
break; }
513 s = utf8_safe_decode(L, s, &ch);
516 s = parse_escape(L, s, e, hex, &ch);
519 add_utf8char(&b, ch);
525static int Lutf8_insert (lua_State *L) {
526 const char *e, *s = check_utf8(L, 1, &e);
531 const char *first = e;
532 if (lua_type(L, 2) == LUA_TNUMBER) {
533 int idx = (int)lua_tointeger(L, 2);
534 if (idx != 0) first = utf8_relat(s, e, idx);
535 luaL_argcheck(L, first, 2,
"invalid index");
538 subs = luaL_checklstring(L, nargs, &sublen);
539 luaL_buffinit(L, &b);
540 luaL_addlstring(&b, s, first-s);
541 luaL_addlstring(&b, subs, sublen);
542 luaL_addlstring(&b, first, e-first);
547static int Lutf8_remove (lua_State *L) {
548 const char *e, *s = check_utf8(L, 1, &e);
549 lua_Integer posi = luaL_optinteger(L, 2, -1);
550 lua_Integer pose = luaL_optinteger(L, 3, -1);
551 if (!utf8_range(s, e, &posi, &pose))
555 luaL_buffinit(L, &b);
556 luaL_addlstring(&b, s, posi);
557 luaL_addlstring(&b, s+pose, e-s-pose);
563static int push_offset (lua_State *L,
const char *s,
const char *e, lua_Integer offset, lua_Integer idx) {
567 p = utf8_offset(s, e, offset, idx);
568 else if (p = s+offset-1, iscont(p))
570 if (p == NULL || p == e)
return 0;
571 utf8_decode(p, &ch, 0);
572 lua_pushinteger(L, p-s+1);
573 lua_pushinteger(L, ch);
577static int Lutf8_charpos (lua_State *L) {
578 const char *e, *s = check_utf8(L, 1, &e);
579 lua_Integer offset = 1;
580 if (lua_isnoneornil(L, 3)) {
581 lua_Integer idx = luaL_optinteger(L, 2, 0);
583 else if (idx < 0) offset = e-s+1;
584 return push_offset(L, s, e, offset, idx);
586 offset = byte_relat(luaL_optinteger(L, 2, 1), e-s);
587 if (offset < 1) offset = 1;
588 return push_offset(L, s, e, offset, luaL_checkinteger(L, 3));
591static int Lutf8_offset (lua_State *L) {
593 const char *s = luaL_checklstring(L, 1, &len);
594 lua_Integer n = luaL_checkinteger(L, 2);
595 lua_Integer posi = (n >= 0) ? 1 : len + 1;
596 posi = byte_relat(luaL_optinteger(L, 3, posi), len);
597 luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 3,
598 "position out of range");
601 while (posi > 0 && iscont(s + posi)) posi--;
603 if (iscont(s + posi))
604 return luaL_error(L,
"initial position is a continuation byte");
606 while (n < 0 && posi > 0) {
609 }
while (posi > 0 && iscont(s + posi));
614 while (n > 0 && posi < (lua_Integer)len) {
617 }
while (iscont(s + posi));
623 lua_pushinteger(L, posi + 1);
629static int Lutf8_next (lua_State *L) {
630 const char *e, *s = check_utf8(L, 1, &e);
631 lua_Integer offset = byte_relat(luaL_optinteger(L, 2, 1), e-s);
632 lua_Integer idx = luaL_optinteger(L, 3, !lua_isnoneornil(L, 2));
633 return push_offset(L, s, e, offset, idx);
636static int iter_aux (lua_State *L,
int strict) {
637 const char *e, *s = check_utf8(L, 1, &e);
638 int n = CAST(
int, lua_tointeger(L, 2));
639 const char *p = n <= 0 ? s : utf8_next(s+n-1, e);
642 utf8_safe_decode(L, p, &code);
643 if (strict && utf8_invalid(code))
644 return luaL_error(L,
"invalid UTF-8 code");
645 lua_pushinteger(L, p-s+1);
646 lua_pushinteger(L, code);
652static int iter_auxstrict (lua_State *L) {
return iter_aux(L, 1); }
653static int iter_auxlax (lua_State *L) {
return iter_aux(L, 0); }
655static int Lutf8_codes (lua_State *L) {
656 int lax = lua_toboolean(L, 2);
657 luaL_checkstring(L, 1);
658 lua_pushcfunction(L, lax ? iter_auxlax : iter_auxstrict);
660 lua_pushinteger(L, 0);
664static int Lutf8_width (lua_State *L) {
665 int t = lua_type(L, 1);
666 int ambi_is_single = !lua_toboolean(L, 2);
667 int default_width = CAST(
int, luaL_optinteger(L, 3, 0));
668 if (t == LUA_TNUMBER) {
669 size_t chwidth = utf8_width(CAST(utfint, lua_tointeger(L, 1)), ambi_is_single);
670 if (chwidth == 0) chwidth = default_width;
671 lua_pushinteger(L, (lua_Integer)chwidth);
672 }
else if (t != LUA_TSTRING)
673 return typeerror(L, 1,
"number/string");
675 const char *e, *s = to_utf8(L, 1, &e);
680 s = utf8_safe_decode(L, s, &ch);
681 chwidth = utf8_width(ch, ambi_is_single);
682 width += chwidth == 0 ? default_width : chwidth;
684 lua_pushinteger(L, (lua_Integer)width);
689static int Lutf8_widthindex (lua_State *L) {
690 const char *e, *s = check_utf8(L, 1, &e);
691 int width = CAST(
int, luaL_checkinteger(L, 2));
692 int ambi_is_single = !lua_toboolean(L, 3);
693 int default_width = CAST(
int, luaL_optinteger(L, 4, 0));
698 s = utf8_safe_decode(L, s, &ch);
699 chwidth = utf8_width(ch, ambi_is_single);
700 if (chwidth == 0) chwidth = default_width;
701 width -= CAST(
int, chwidth);
703 lua_pushinteger(L, idx);
704 lua_pushinteger(L, width + chwidth);
705 lua_pushinteger(L, chwidth);
710 lua_pushinteger(L, (lua_Integer)idx);
714static int Lutf8_ncasecmp (lua_State *L) {
715 const char *e1, *s1 = check_utf8(L, 1, &e1);
716 const char *e2, *s2 = check_utf8(L, 2, &e2);
717 while (s1 < e1 || s2 < e2) {
718 utfint ch1 = 0, ch2 = 0;
724 s1 = utf8_safe_decode(L, s1, &ch1);
725 s2 = utf8_safe_decode(L, s2, &ch2);
726 ch1 = utf8_tofold(ch1);
727 ch2 = utf8_tofold(ch2);
730 lua_pushinteger(L, ch1 > ch2 ? 1 : -1);
734 lua_pushinteger(L, 0);
741#ifndef LUA_MAXCAPTURES
742# define LUA_MAXCAPTURES 32
/* Sentinel kept in a capture's len field while the capture is still open.
   start_capture stores it through its `what` argument, end_capture puts it
   back when the rest of the match fails, and check_capture,
   capture_to_close and push_onecapture all test for it. */
745#define CAP_UNFINISHED (-1)
/* Sentinel len marking a position capture: push_onecapture pushes a
   character index (from get_index, plus one) instead of a substring. */
746#define CAP_POSITION (-2)
751 const char *src_init;
759 } capture[LUA_MAXCAPTURES];
763static const char *match (
MatchState *ms,
const char *s,
const char *p);
766#if !defined(MAXCCALLS)
/* Set of pattern characters that nospecials() searches for with strpbrk;
   find_aux consults nospecials() when choosing the lmemfind search path. */
771#define SPECIALS "^$*+?.([%-"
773static int check_capture (
MatchState *ms,
int l) {
775 if (l < 0 || l >= ms->level || ms->capture[l].len == CAP_UNFINISHED)
776 return luaL_error(ms->L,
"invalid capture index %%%d", l + 1);
781 int level = ms->level;
783 if (ms->capture[level].len == CAP_UNFINISHED)
return level;
784 return luaL_error(ms->L,
"invalid pattern capture");
787static const char *classend (
MatchState *ms,
const char *p) {
789 p = utf8_safe_decode(ms->L, p, &ch);
793 luaL_error(ms->L,
"malformed pattern (ends with " LUA_QL(
"%%")
")");
794 return utf8_next(p, ms->p_end);
800 luaL_error(ms->L,
"malformed pattern (missing " LUA_QL(
"]")
")");
801 if (*(p++) == L_ESC && p < ms->p_end)
812static int match_class (utfint
c, utfint cl) {
814 switch (utf8_tolower(cl)) {
815#define X(cls, name) case cls: res = utf8_is##name(c); break;
818 case 'g' : res = utf8_isgraph(
c);
break;
819 case 'w' : res = utf8_isalnum(
c);
break;
820 case 'z' : res = (
c == 0);
break;
821 default:
return (cl ==
c);
823 return (utf8_islower(cl) ? res : !res);
826static int matchbracketclass (
MatchState *ms, utfint
c,
const char *p,
const char *ec) {
835 p = utf8_safe_decode(ms->L, p, &ch);
837 p = utf8_safe_decode(ms->L, p, &ch);
838 if (match_class(
c, ch))
842 const char *np = utf8_safe_decode(ms->L, p, &next);
843 if (next ==
'-' && np < ec) {
844 p = utf8_safe_decode(ms->L, np, &next);
845 if (ch <=
c &&
c <= next)
848 else if (ch ==
c)
return sig;
854static int singlematch (
MatchState *ms,
const char *s,
const char *p,
const char *ep) {
855 if (s >= ms->src_end)
859 utf8_safe_decode(ms->L, s, &ch);
860 p = utf8_safe_decode(ms->L, p, &pch);
863 case L_ESC: utf8_safe_decode(ms->L, p, &pch);
864 return match_class(ch, pch);
865 case '[':
return matchbracketclass(ms, ch, p-1, ep-1);
866 default:
return pch == ch;
871static const char *matchbalance (
MatchState *ms,
const char *s,
const char **p) {
872 utfint ch=0, begin=0, end=0;
873 *p = utf8_safe_decode(ms->L, *p, &begin);
875 luaL_error(ms->L,
"malformed pattern "
876 "(missing arguments to " LUA_QL(
"%%b")
")");
877 *p = utf8_safe_decode(ms->L, *p, &end);
878 s = utf8_safe_decode(ms->L, s, &ch);
879 if (ch != begin)
return NULL;
882 while (s < ms->src_end) {
883 s = utf8_safe_decode(ms->L, s, &ch);
885 if (--cont == 0)
return s;
887 else if (ch == begin) cont++;
893static const char *max_expand (
MatchState *ms,
const char *s,
const char *p,
const char *ep) {
895 while (singlematch(ms, m, p, ep))
896 m = utf8_next(m, ms->src_end);
899 const char *res = match(ms, m, ep+1);
908static const char *min_expand (
MatchState *ms,
const char *s,
const char *p,
const char *ep) {
910 const char *res = match(ms, s, ep+1);
913 else if (singlematch(ms, s, p, ep))
914 s = utf8_next(s, ms->src_end);
919static const char *start_capture (
MatchState *ms,
const char *s,
const char *p,
int what) {
921 int level = ms->level;
922 if (level >= LUA_MAXCAPTURES) luaL_error(ms->L,
"too many captures");
923 ms->capture[level].init = s;
924 ms->capture[level].len = what;
926 if ((res=match(ms, s, p)) == NULL)
931static const char *end_capture (
MatchState *ms,
const char *s,
const char *p) {
932 int l = capture_to_close(ms);
934 ms->capture[l].len = s - ms->capture[l].init;
935 if ((res = match(ms, s, p)) == NULL)
936 ms->capture[l].len = CAP_UNFINISHED;
940static const char *match_capture (
MatchState *ms,
const char *s,
int l) {
942 l = check_capture(ms, l);
943 len = ms->capture[l].len;
944 if ((
size_t)(ms->src_end-s) >= len &&
945 memcmp(ms->capture[l].init, s, len) == 0)
950static const char *match (
MatchState *ms,
const char *s,
const char *p) {
951 if (ms->matchdepth-- == 0)
952 luaL_error(ms->L,
"pattern too complex");
954 if (p != ms->p_end) {
956 utf8_safe_decode(ms->L, p, &ch);
960 s = start_capture(ms, s, p + 2, CAP_POSITION);
962 s = start_capture(ms, s, p + 1, CAP_UNFINISHED);
966 s = end_capture(ms, s, p + 1);
970 if ((p + 1) != ms->p_end)
972 s = (s == ms->src_end) ? s : NULL;
976 const char *prev_p = p;
977 p = utf8_safe_decode(ms->L, p+1, &ch);
980 s = matchbalance(ms, s, &p);
987 const char *ep; utfint previous = 0, current = 0;
989 luaL_error(ms->L,
"missing " LUA_QL(
"[")
" after "
990 LUA_QL(
"%%f")
" in pattern");
991 ep = classend(ms, p);
992 if (s != ms->src_init)
993 utf8_decode(utf8_prev(ms->src_init, s), &previous, 0);
994 if (s != ms->src_end)
995 utf8_decode(s, ¤t, 0);
996 if (!matchbracketclass(ms, previous, p, ep - 1) &&
997 matchbracketclass(ms, current, p, ep - 1)) {
1003 case '0':
case '1':
case '2':
case '3':
1004 case '4':
case '5':
case '6':
case '7':
1005 case '8':
case '9': {
1006 s = match_capture(ms, s, ch);
1007 if (s != NULL)
goto init;
1010 default: p = prev_p;
goto dflt;
1015 const char *ep = classend(ms, p);
1017 if (!singlematch(ms, s, p, ep)) {
1018 if (*ep ==
'*' || *ep ==
'?' || *ep ==
'-') {
1019 p = ep + 1;
goto init;
1023 const char *next_s = utf8_next(s, ms->src_end);
1027 const char *next_ep = utf8_next(ep, ms->p_end);
1028 if ((res = match(ms, next_s, next_ep)) != NULL)
1031 p = next_ep;
goto init;
1039 s = max_expand(ms, s, p, ep);
1042 s = min_expand(ms, s, p, ep);
1045 s = next_s; p = ep;
goto init;
1056static const char *lmemfind (
const char *s1,
size_t l1,
const char *s2,
size_t l2) {
1057 if (l2 == 0)
return s1;
1058 else if (l2 > l1)
return NULL;
1063 while (l1 > 0 && (init = (
const char *)memchr(s1, *s2, l1)) != NULL) {
1065 if (memcmp(init, s2+1, l2) == 0)
1076static int get_index (
const char *p,
const char *s,
const char *e) {
1078 for (idx = 0; s < e && s < p; ++idx)
1079 s = utf8_next(s, e);
1080 return s == p ? idx : idx - 1;
1083static void push_onecapture (
MatchState *ms,
int i,
const char *s,
const char *e) {
1084 if (i >= ms->level) {
1086 lua_pushlstring(ms->L, s, e - s);
1088 luaL_error(ms->L,
"invalid capture index");
1090 ptrdiff_t l = ms->capture[i].len;
1091 if (l == CAP_UNFINISHED) luaL_error(ms->L,
"unfinished capture");
1092 if (l == CAP_POSITION) {
1093 int idx = get_index(ms->capture[i].init, ms->src_init, ms->src_end);
1094 lua_pushinteger(ms->L, idx+1);
1096 lua_pushlstring(ms->L, ms->capture[i].init, l);
1100static int push_captures (
MatchState *ms,
const char *s,
const char *e) {
1102 int nlevels = (ms->level == 0 && s) ? 1 : ms->level;
1103 luaL_checkstack(ms->L, nlevels,
"too many captures");
1104 for (i = 0; i < nlevels; i++)
1105 push_onecapture(ms, i, s, e);
1110static int nospecials (
const char *p,
const char * ep) {
1112 if (strpbrk(p, SPECIALS))
1122static int find_aux (lua_State *L,
int find) {
1123 const char *es, *s = check_utf8(L, 1, &es);
1124 const char *ep, *p = check_utf8(L, 2, &ep);
1125 lua_Integer idx = luaL_optinteger(L, 3, 1);
1128 init = utf8_relat(s, es, CAST(
int, idx));
1137 if (find && (lua_toboolean(L, 4) || nospecials(p, ep))) {
1139 const char *s2 = lmemfind(init, es-init, p, ep-p);
1141 const char *e2 = s2 + (ep - p);
1142 if (iscont(e2)) e2 = utf8_next(e2, es);
1143 lua_pushinteger(L, idx = get_index(s2, s, es) + 1);
1144 lua_pushinteger(L, idx + get_index(e2, s2, es) - 1);
1149 int anchor = (*p ==
'^');
1151 if (idx < 0) idx += utf8_length(s, es)+1;
1153 ms.matchdepth = MAXCCALLS;
1160 assert(ms.matchdepth == MAXCCALLS);
1161 if ((res=match(&ms, init, p)) != NULL) {
1163 lua_pushinteger(L, idx);
1164 lua_pushinteger(L, idx + utf8_length(init, res) - 1);
1165 return push_captures(&ms, NULL, 0) + 2;
1167 return push_captures(&ms, init, res);
1169 if (init == es)
break;
1171 init = utf8_next(init, es);
1172 }
while (init <= es && !anchor);
1178static int Lutf8_find (lua_State *L) {
return find_aux(L, 1); }
1179static int Lutf8_match (lua_State *L) {
return find_aux(L, 0); }
1181static int gmatch_aux (lua_State *L) {
1183 const char *es, *s = check_utf8(L, lua_upvalueindex(1), &es);
1184 const char *ep, *p = check_utf8(L, lua_upvalueindex(2), &ep);
1187 ms.matchdepth = MAXCCALLS;
1191 for (src = s + (
size_t)lua_tointeger(L, lua_upvalueindex(3));
1193 src = utf8_next(src, ms.src_end)) {
1196 assert(ms.matchdepth == MAXCCALLS);
1197 if ((e = match(&ms, src, p)) != NULL) {
1198 lua_Integer newstart = e-s;
1199 if (e == src) newstart++;
1200 lua_pushinteger(L, newstart);
1201 lua_replace(L, lua_upvalueindex(3));
1202 return push_captures(&ms, src, e);
1204 if (src == ms.src_end)
break;
1209static int Lutf8_gmatch (lua_State *L) {
1210 luaL_checkstring(L, 1);
1211 luaL_checkstring(L, 2);
1213 lua_pushinteger(L, 0);
1214 lua_pushcclosure(L, gmatch_aux, 3);
1218static void add_s (
MatchState *ms, luaL_Buffer *b,
const char *s,
const char *e) {
1219 const char *new_end, *news = to_utf8(ms->L, 3, &new_end);
1220 while (news < new_end) {
1222 news = utf8_safe_decode(ms->L, news, &ch);
1224 add_utf8char(b, ch);
1226 news = utf8_safe_decode(ms->L, news, &ch);
1227 if (!utf8_isdigit(ch)) {
1229 luaL_error(ms->L,
"invalid use of " LUA_QL(
"%c")
1230 " in replacement string", L_ESC);
1231 add_utf8char(b, ch);
1232 }
else if (ch ==
'0')
1233 luaL_addlstring(b, s, e-s);
1235 push_onecapture(ms, ch-
'1', s, e);
1242static void add_value (
MatchState *ms, luaL_Buffer *b,
const char *s,
const char *e,
int tr) {
1243 lua_State *L = ms->L;
1245 case LUA_TFUNCTION: {
1247 lua_pushvalue(L, 3);
1248 n = push_captures(ms, s, e);
1253 push_onecapture(ms, 0, s, e);
1262 if (!lua_toboolean(L, -1)) {
1264 lua_pushlstring(L, s, e - s);
1265 }
else if (!lua_isstring(L, -1))
1266 luaL_error(L,
"invalid replacement value (a %s)", luaL_typename(L, -1));
1270static int Lutf8_gsub (lua_State *L) {
1271 const char *es, *s = check_utf8(L, 1, &es);
1272 const char *ep, *p = check_utf8(L, 2, &ep);
1273 int tr = lua_type(L, 3);
1274 lua_Integer max_s = luaL_optinteger(L, 4, (es-s)+1);
1275 int anchor = (*p ==
'^');
1279 luaL_argcheck(L, tr == LUA_TNUMBER || tr == LUA_TSTRING ||
1280 tr == LUA_TFUNCTION || tr == LUA_TTABLE, 3,
1281 "string/function/table expected");
1282 luaL_buffinit(L, &b);
1285 ms.matchdepth = MAXCCALLS;
1292 assert(ms.matchdepth == MAXCCALLS);
1293 e = match(&ms, s, p);
1296 add_value(&ms, &b, s, e, tr);
1302 s = utf8_safe_decode(L, s, &ch);
1303 add_utf8char(&b, ch);
1307 luaL_addlstring(&b, s, es-s);
1308 luaL_pushresult(&b);
1309 lua_pushinteger(L, n);
1313static int Lutf8_isvalid(lua_State *L) {
1314 const char *e, *s = check_utf8(L, 1, &e);
1315 const char *invalid = utf8_invalid_offset(s, e);
1316 lua_pushboolean(L, invalid == NULL);
1320static int Lutf8_invalidoffset(lua_State *L) {
1321 const char *e, *s = check_utf8(L, 1, &e);
1322 const char *orig_s = s;
1323 int offset = luaL_optinteger(L, 2, 0);
1331 }
else if (offset < 0 && s - e < offset) {
1334 const char *invalid = utf8_invalid_offset(s, e);
1335 if (invalid == NULL) {
1338 lua_pushinteger(L, invalid - orig_s + 1);
1343static int Lutf8_clean(lua_State *L) {
1344 const char *e, *s = check_utf8(L, 1, &e);
1348 const char *r = luaL_optlstring(L, 2,
"\xEF\xBF\xBD", &repl_len);
1350 if (lua_gettop(L) > 1) {
1352 if (utf8_invalid_offset(r, r + repl_len) != NULL) {
1353 lua_pushstring(L,
"replacement string must be valid UTF-8");
1358 const char *invalid = utf8_invalid_offset(s, e);
1359 if (invalid == NULL) {
1361 lua_pushboolean(L, 1);
1366 luaL_buffinit(L, &buff);
1371 luaL_addlstring(&buff, s, invalid - s);
1372 luaL_addlstring(&buff, r, repl_len);
1378 while (s == invalid) {
1380 invalid = utf8_invalid_offset(s, e);
1382 if (invalid == NULL) {
1383 luaL_addlstring(&buff, s, e - s);
1384 luaL_pushresult(&buff);
1385 lua_pushboolean(L, 0);
1393#if LUA_VERSION_NUM >= 502
1394static const char UTF8PATT[] =
"[\0-\x7F\xC2-\xF4][\x80-\xBF]*";
1396static const char UTF8PATT[] =
"[%z\1-\x7F\xC2-\xF4][\x80-\xBF]*";
1399int luaopen_utf8 (lua_State *L) {
1401#define ENTRY(name) { #name, Lutf8_##name }
1428 ENTRY(invalidoffset),
1434#if LUA_VERSION_NUM >= 502
1435 luaL_newlib(L, libs);
1437 luaL_register(L,
"utf8", libs);
1440 lua_pushlstring(L, UTF8PATT,
sizeof(UTF8PATT)-1);
1441 lua_setfield(L, -2,
"charpattern");