/* * Integration of https://github.com/starwing/luautf8 * * MIT License * * Copyright (c) 2018 Xavier Wang * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include #include #include #include #include #include #include #include "../unidata.h" /* UTF-8 string operations */ #define UTF8_BUFFSZ 8 #define UTF8_MAX 0x7FFFFFFFu #define UTF8_MAXCP 0x10FFFFu #define iscont(p) ((*(p) & 0xC0) == 0x80) #define CAST(tp,expr) ((tp)(expr)) #ifndef LUA_QL # define LUA_QL(x) "'" x "'" #endif static int utf8_invalid (utfint ch) { return (ch > UTF8_MAXCP || (0xD800u <= ch && ch <= 0xDFFFu)); } static size_t utf8_encode (char *buff, utfint x) { int n = 1; /* number of bytes put in buffer (backwards) */ lua_assert(x <= UTF8_MAX); if (x < 0x80) /* ascii? */ buff[UTF8_BUFFSZ - 1] = x & 0x7F; else { /* need continuation bytes */ utfint mfb = 0x3f; /* maximum that fits in first byte */ do { /* add continuation bytes */ buff[UTF8_BUFFSZ - (n++)] = 0x80 | (x & 0x3f); x >>= 6; /* remove added bits */ mfb >>= 1; /* now there is one less bit available in first byte */ } while (x > mfb); /* still needs continuation byte? */ buff[UTF8_BUFFSZ - n] = ((~mfb << 1) | x) & 0xFF; /* add first byte */ } return n; } static const char *utf8_decode (const char *s, utfint *val, int strict) { static const utfint limits[] = {~0u, 0x80u, 0x800u, 0x10000u, 0x200000u, 0x4000000u}; unsigned int c = (unsigned char)s[0]; utfint res = 0; /* final result */ if (c < 0x80) /* ascii? */ res = c; else { int count = 0; /* to count number of continuation bytes */ for (; c & 0x40; c <<= 1) { /* while it needs continuation bytes... */ unsigned int cc = (unsigned char)s[++count]; /* read next byte */ if ((cc & 0xC0) != 0x80) /* not a continuation byte? */ return NULL; /* invalid byte sequence */ res = (res << 6) | (cc & 0x3F); /* add lower 6 bits from cont. byte */ } res |= ((utfint)(c & 0x7F) << (count * 5)); /* add first byte */ if (count > 5 || res > UTF8_MAX || res < limits[count]) return NULL; /* invalid byte sequence */ s += count; /* skip continuation bytes read */ } if (strict) { /* check for invalid code points; too large or surrogates */ if (res > UTF8_MAXCP || (0xD800u <= res && res <= 0xDFFFu)) return NULL; } if (val) *val = res; return s + 1; /* +1 to include first byte */ } static const char *utf8_prev (const char *s, const char *e) { while (s < e && iscont(e - 1)) --e; return s < e ? e - 1 : s; } static const char *utf8_next (const char *s, const char *e) { while (s < e && iscont(s + 1)) ++s; return s < e ? s + 1 : e; } static size_t utf8_length (const char *s, const char *e) { size_t i; for (i = 0; s < e; ++i) s = utf8_next(s, e); return i; } static const char *utf8_offset (const char *s, const char *e, lua_Integer offset, lua_Integer idx) { const char *p = s + offset - 1; if (idx >= 0) { while (p < e && idx > 0) p = utf8_next(p, e), --idx; return idx == 0 ? p : NULL; } else { while (s < p && idx < 0) p = utf8_prev(s, p), ++idx; return idx == 0 ? p : NULL; } } static const char *utf8_relat (const char *s, const char *e, int idx) { return idx >= 0 ? utf8_offset(s, e, 1, idx - 1) : utf8_offset(s, e, e-s+1, idx); } static int utf8_range(const char *s, const char *e, lua_Integer *i, lua_Integer *j) { const char *ps = utf8_relat(s, e, CAST(int, *i)); const char *pe = utf8_relat(s, e, CAST(int, *j)); *i = (ps ? ps : (*i > 0 ? e : s)) - s; *j = (pe ? utf8_next(pe, e) : (*j > 0 ? e : s)) - s; return *i < *j; } /* Indexed by top nibble of first byte in code unit */ static uint8_t utf8_code_unit_len[] = { 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, 2, 2, 3, 4 }; /* Return pointer to first invalid UTF-8 sequence in 's', or NULL if valid */ static const char *utf8_invalid_offset(const char *s, const char *e) { while (s < e) { uint8_t c = *s; if (c >= 0x80) { /* c < 0xC0 means a continuation byte, but we are not in the middle of a multi-byte code unit * c >= 0xC0 && c < 0xC2 means an overlong 2-byte code unit * c >= 0xF8 means a 5-byte or 6-byte code unit, which is illegal, or else illegal byte 0xFE/0xFF * c >= 0xF5 && c < 0xF8 means a 4-byte code unit encoding invalid codepoint > U+10FFFF */ if (c < 0xC2 || c >= 0xF5) return s; uint8_t needed_bytes = utf8_code_unit_len[c >> 4]; if (e - s < needed_bytes) return s; /* String is truncated */ uint8_t c2 = *(s+1); if ((c2 & 0xC0) != 0x80) return s; /* 2nd byte of code unit is not a continuation byte */ if (needed_bytes >= 3) { uint8_t c3 = *(s+2); if ((c3 & 0xC0) != 0x80) return s; /* 3rd byte of code unit is not a continuation byte */ if (needed_bytes == 3) { if (c == 0xE0 && c2 < 0xA0) return s; /* Overlong 3-byte code unit */ if (c == 0xED && c2 >= 0xA0) return s; /* Reserved codepoint from U+D800-U+DFFF */ } else { uint8_t c4 = *(s+3); if ((c4 & 0xC0) != 0x80) return s; /* 4th byte of code unit is not a continuation byte */ if (c == 0xF0 && c2 < 0x90) return s; /* Overlong 4-byte code unit */ if (c == 0xF4 && c2 >= 0x90) return s; /* Illegal codepoint > U+10FFFF */ } } s += needed_bytes; } else { s++; } } return NULL; } /* Unicode character categories */ #define table_size(t) (sizeof(t)/sizeof((t)[0])) #define utf8_categories(X) \ X('a', alpha) \ X('c', cntrl) \ X('d', digit) \ X('l', lower) \ X('p', punct) \ X('s', space) \ X('t', compose) \ X('u', upper) \ X('x', xdigit) #define utf8_converters(X) \ X(lower) \ X(upper) \ X(title) \ X(fold) static int find_in_range (range_table *t, size_t size, utfint ch) { size_t begin, end; begin = 0; end = size; while (begin < end) { size_t mid = (begin + end) / 2; if (t[mid].last < ch) begin = mid + 1; else if (t[mid].first > ch) end = mid; else return (ch - t[mid].first) % t[mid].step == 0; } return 0; } static int convert_char (conv_table *t, size_t size, utfint ch) { size_t begin, end; begin = 0; end = size; while (begin < end) { size_t mid = (begin + end) / 2; if (t[mid].last < ch) begin = mid + 1; else if (t[mid].first > ch) end = mid; else if ((ch - t[mid].first) % t[mid].step == 0) return ch + t[mid].offset; else return ch; } return ch; } /* Normalization */ static int lookup_canon_cls (utfint ch) { /* The first codepoint with canonicalization class != 0 is U+0300 COMBINING GRAVE ACCENT */ if (ch < 0x300) { return 0; } size_t begin = 0, end = table_size(nfc_combining_table); while (begin < end) { size_t mid = (begin + end) / 2; if (nfc_combining_table[mid].last < ch) begin = mid + 1; else if (nfc_combining_table[mid].first > ch) end = mid; else return nfc_combining_table[mid].canon_cls; } return 0; } static nfc_table *nfc_quickcheck (utfint ch) { /* The first character which needs to be checked for possible NFC violations * is U+0300 COMBINING GRAVE ACCENT */ if (ch < 0x300) { return NULL; } size_t begin = 0, end = table_size(nfc_quickcheck_table); while (begin < end) { size_t mid = (begin + end) / 2; utfint found = nfc_quickcheck_table[mid].cp; if (found < ch) begin = mid + 1; else if (found > ch) end = mid; else return &nfc_quickcheck_table[mid]; } return NULL; } static int nfc_combine (utfint cp1, utfint cp2, utfint *dest) { size_t begin = 0, end = table_size(nfc_composite_table); unsigned int hash = (cp1 * 213) + cp2; while (begin < end) { size_t mid = (begin + end) / 2; utfint val = nfc_composite_table[mid].hash; if (val < hash) { begin = mid + 1; } else if (val > hash) { end = mid; } else if (nfc_composite_table[mid].cp1 == cp1 && nfc_composite_table[mid].cp2 == cp2) { if (dest) *dest = nfc_composite_table[mid].dest; return 1; } else { return 0; } } return 0; } static decompose_table *nfc_decompose (utfint ch) { size_t begin = 0, end = table_size(nfc_decompose_table); while (begin < end) { size_t mid = (begin + end) / 2; utfint found = nfc_decompose_table[mid].cp; if (found < ch) begin = mid + 1; else if (found > ch) end = mid; else return &nfc_decompose_table[mid]; } return NULL; } static int nfc_check (utfint ch, nfc_table *entry, utfint starter, unsigned int canon_cls, unsigned int prev_canon_cls) { int reason = entry->reason; if (reason == REASON_MUST_CONVERT_1 || reason == REASON_MUST_CONVERT_2) { /* This codepoint has a different, canonical form, so this string is not NFC */ return 0; } else if (reason == REASON_STARTER_CAN_COMBINE) { /* It is possible that this 'starter' codepoint should have been combined with the * preceding 'starter' codepoint; if so, this string is not NFC */ if (!prev_canon_cls && nfc_combine(starter, ch, NULL)) { /* These codepoints should have been combined */ return 0; } } else if (reason == REASON_COMBINING_MARK) { /* Combining mark; check if it should have been combined with preceding starter codepoint */ if (canon_cls > prev_canon_cls && nfc_combine(starter, ch, NULL)) { /* Yes, they should have been combined. This string is not NFC */ return 0; } /* Could it be that preceding 'starter' codepoint is already combined, but with a * combining mark which is out of order with this one? */ decompose_table *decomp = nfc_decompose(starter); if (decomp && decomp->canon_cls2 > canon_cls && nfc_combine(decomp->to1, ch, NULL)) { return 0; } } else if (reason == REASON_JAMO_VOWEL) { if (!prev_canon_cls && starter >= 0x1100 && starter <= 0x115F) { /* Preceding codepoint was a leading jamo; they should have been combined */ return 0; } } else if (reason == REASON_JAMO_TRAILING) { if (!prev_canon_cls && starter >= 0xAC00 && starter <= 0xD7A3) { /* Preceding codepoint was a precomposed Hangul syllable; check if it had no trailing jamo */ if ((starter - 0xAC00) % 28 == 0) { /* It didn't have a trailing jamo, so this trailing jamo should have been combined */ return 0; } } } return 1; } static void merge_combining_marks (uint32_t *src1, uint32_t *src2, uint32_t *dest, size_t size1, size_t size2) { while (size1 && size2) { if ((*src1 & 0xFF) > (*src2 & 0xFF)) { *dest++ = *src2++; size2--; } else { *dest++ = *src1++; size1--; } } while (size1) { *dest++ = *src1++; size1--; } while (size2) { *dest++ = *src2++; size2--; } } static void stable_sort_combining_marks (uint32_t *vector, uint32_t *scratch, size_t size) { /* We need to use a stable sort for sorting combining marks which are in the wrong order * when doing NFC normalization; bottom-up merge sort is fast and stable */ size_t limit = size - 1; for (unsigned int i = 0; i < limit; i += 2) { if ((vector[i] & 0xFF) > (vector[i+1] & 0xFF)) { uint32_t temp = vector[i]; vector[i] = vector[i+1]; vector[i+1] = temp; } } if (size <= 2) return; uint32_t *src = vector, *dest = scratch; unsigned int runsize = 2; /* Every consecutive slice of this size is sorted */ while (runsize < size) { unsigned int blocksize = runsize * 2; /* We will now sort slices of this size */ limit = size & ~(blocksize - 1); for (unsigned int i = 0; i < limit; i += blocksize) merge_combining_marks(&src[i], &src[i+runsize], &dest[i], runsize, runsize); if (size - limit > runsize) { merge_combining_marks(&src[limit], &src[limit+runsize], &dest[limit], runsize, size - limit - runsize); } else { memcpy(&dest[limit], &src[limit], (size - limit) * sizeof(uint32_t)); } /* After each series of (progressively larger) merges, we swap src & dest to * avoid memcpy'ing the partially sorted results from dest back into src */ uint32_t *temp = src; src = dest; dest = temp; runsize = blocksize; } if (dest == vector) { /* Since src & dest are swapped on each iteration of the above loop, * this actually means the last buffer which was written into * was 'scratch' */ memcpy(vector, scratch, size * sizeof(uint32_t)); } } static void add_utf8char (luaL_Buffer *b, utfint ch); static void string_to_nfc (lua_State *L, luaL_Buffer *buff, const char *s, const char *e) { /* Converting a string to Normal Form C involves: * 1) Ensuring that codepoints with "built-in" accents are used whenever possible * rather than separate codepoints for a base character and combining mark * 2) Where combining marks must be used, putting them into canonical order * 3) Converting some deprecated codepoints to the recommended variant * 4) Ensuring that Korean Hangul are represented as precomposed syllable * codepoints whenever possible, rather than sequences of Jamo codepoints * * (Combining marks are accents which appear on top of or below the preceding * character. Starter codepoints are the base characters which combining marks can * 'combine' with. Almost all codepoints are starters, including all the Latin alphabet. * Every Unicode codepoint has a numeric 'canonicalization class'; starters have class = 0. * Combining marks must be sorted in order of their canonicalization class. Since the * canonicalization class numbers are not unique, the sort must be stable.) * * When converting to NFC, the largest scope which we need to work on at once * consists of a 'starter' codepoint and either 1 or more ensuing combining marks, * OR else a directly following starter codepoint. * * As we walk through the string, whenever we pass by a complete sequence of starter + * combining marks or starter + starter, we process that sequence to see if it is NFC or not. * If it is, we memcpy the bytes verbatim into the output buffer. If it is not, then we * convert the codepoints to NFC and then emit those codepoints as UTF-8 bytes. */ utfint starter = -1, ch; /* 'starter' is last starter codepoint seen */ const char *to_copy = s; /* pointer to next bytes we might need to memcpy into output buffer */ unsigned int prev_canon_cls = 0; /*, canon_cls = 0; */ int fixedup = 0; /* has the sequence currently under consideration been modified to make it NFC? */ /* Temporary storage for a sequence of consecutive combining marks * In the vast majority of cases, this small on-stack array will provide enough * space; if not, we will switch to a malloc'd buffer */ uint32_t onstack[8]; size_t vec_size = 0, vec_max = sizeof(onstack)/sizeof(uint32_t); uint32_t *vector = onstack; while (s < e) { const char *new_s = utf8_decode(s, &ch, 1); if (new_s == NULL) { lua_pushstring(L, "string is not valid UTF-8"); lua_error(L); } unsigned int canon_cls = lookup_canon_cls(ch); if (!canon_cls) { /* This is a starter codepoint */ nfc_table *entry = nfc_quickcheck(ch); /* Handle preceding starter and optional sequence of combining marks which may have followed it */ if (prev_canon_cls) { /* Before this starter, there was a sequence of combining marks. * Check those over and emit output to 'buff' */ process_combining_marks: /* Check if accumulated combining marks were in correct order */ for (unsigned int i = 1; i < vec_size; i++) { if ((vector[i-1] & 0xFF) > (vector[i] & 0xFF)) { /* Order is incorrect, we need to sort */ uint32_t *scratch = malloc(vec_size * sizeof(uint32_t)); stable_sort_combining_marks(vector, scratch, vec_size); free(scratch); fixedup = 1; } } /* Check if any of those combining marks are in violation of NFC */ unsigned int i = 0; while (i < vec_size) { utfint combine_mark = vector[i] >> 8; nfc_table *mark_entry = nfc_quickcheck(combine_mark); if (mark_entry) { if (mark_entry->reason == REASON_MUST_CONVERT_1) { /* This combining mark must be converted to a different one */ vector[i] = (mark_entry->data1 << 8) | mark_entry->data2; fixedup = 1; continue; } else if (mark_entry->reason == REASON_MUST_CONVERT_2) { /* This combining mark must be converted to two others */ if (vec_size == vec_max) { vec_max *= 2; vector = realloc((vector == onstack) ? NULL : vector, vec_max * sizeof(uint32_t)); } memmove(&vector[i+2], &vector[i+1], sizeof(uint32_t) * (vec_size - i - 1)); vector[i] = (mark_entry->data1 << 8) | lookup_canon_cls(mark_entry->data1); vector[i+1] = (mark_entry->data2 << 8) | lookup_canon_cls(mark_entry->data2); vec_size++; fixedup = 1; continue; } else if (mark_entry->reason == REASON_COMBINING_MARK) { if ((i == 0 || (vector[i] & 0xFF) > (vector[i-1] & 0xFF)) && nfc_combine(starter, combine_mark, &starter)) { /* This combining mark must be combined with preceding starter */ vec_size--; memmove(&vector[i], &vector[i+1], sizeof(uint32_t) * (vec_size - i)); /* Remove element i */ fixedup = 1; continue; } decompose_table *decomp = nfc_decompose(starter); if (decomp && decomp->canon_cls2 > (vector[i] & 0xFF) && nfc_combine(decomp->to1, combine_mark, &starter)) { /* The preceding starter already included an accent, but when represented as a combining * mark, that accent has a HIGHER canonicalization class than this one * Further, this one is able to combine with the same base character * In other words, the base character was wrongly combined with a "lower-priority" * combining mark; fix that up */ vector[i] = (decomp->to2 << 8) | lookup_canon_cls(decomp->to2); fixedup = 1; continue; } } } i++; } if (fixedup) { /* The preceding starter/combining mark sequence was bad; convert fixed-up codepoints * to UTF-8 bytes */ if (starter != -1) add_utf8char(buff, starter); for (unsigned int i = 0; i < vec_size; i++) add_utf8char(buff, vector[i] >> 8); } else { /* The preceding starter/combining mark sequence was good; copy raw bytes to output */ luaL_addlstring(buff, to_copy, s - to_copy); } if (s >= e) { /* We jumped in to the middle of the main loop to finish processing trailing * combining marks... we are actually done now */ if (vector != onstack) free(vector); return; } vec_size = 0; /* Clear vector of combining marks in readiness for next such sequence */ fixedup = 0; } else if (starter != -1) { /* This starter was preceded immediately by another starter * Check if this one should combine with it */ fixedup = 0; if (entry) { if (entry->reason == REASON_STARTER_CAN_COMBINE && nfc_combine(starter, ch, &ch)) { fixedup = 1; } else if (entry->reason == REASON_JAMO_VOWEL && starter >= 0x1100 && starter <= 0x115F) { ch = 0xAC00 + ((starter - 0x1100) * 588) + ((ch - 0x1161) * 28); fixedup = 1; } else if (entry->reason == REASON_JAMO_TRAILING) { if (starter >= 0xAC00 && starter <= 0xD7A3 && (starter - 0xAC00) % 28 == 0) { ch = starter + ch - 0x11A7; fixedup = 1; } } } if (!fixedup) add_utf8char(buff, starter); /* Emit previous starter to output */ } starter = ch; to_copy = s; /* We are finished processing the preceding starter and optional sequence of combining marks * Now check if this (possibly deprecated) starter needs to be converted to a canonical variant */ if (entry) { if (entry->reason == REASON_MUST_CONVERT_1) { starter = entry->data1; fixedup = 1; } else if (entry->reason == REASON_MUST_CONVERT_2) { utfint conv1 = entry->data1; /* It is possible that after converting 'ch' to two other codepoints, * the first one might also need to convert to two codepoints */ nfc_table *conv_entry = nfc_quickcheck(conv1); if (conv_entry && conv_entry->reason == REASON_MUST_CONVERT_2) { add_utf8char(buff, conv_entry->data1); add_utf8char(buff, conv_entry->data2); } else { add_utf8char(buff, conv1); } starter = entry->data2; fixedup = 1; } } } else { /* Accumulate combining marks in vector */ if (vec_size == vec_max) { vec_max *= 2; vector = realloc((vector == onstack) ? NULL : vector, vec_max * sizeof(uint32_t)); } vector[vec_size++] = (ch << 8) | (canon_cls & 0xFF); } s = new_s; prev_canon_cls = canon_cls; } if (vec_size) goto process_combining_marks; /* Finish processing trailing combining marks */ if (starter != -1) add_utf8char(buff, starter); if (vector != onstack) free(vector); } /* Grapheme cluster support */ static int hangul_type (utfint ch) { /* The first Hangul codepoint is U+1100 */ if (ch < 0x1100) { return 0; } size_t begin = 0, end = table_size(hangul_table); while (begin < end) { size_t mid = (begin + end) / 2; if (hangul_table[mid].last < ch) begin = mid + 1; else if (hangul_table[mid].first > ch) end = mid; else return hangul_table[mid].type; } return 0; } static int indic_conjunct_type (utfint ch) { /* The first Indic conjunct codepoint is U+0300 */ if (ch < 0x300) { return 0; } size_t begin = 0, end = table_size(indic_table); while (begin < end) { size_t mid = (begin + end) / 2; if (indic_table[mid].last < ch) begin = mid + 1; else if (indic_table[mid].first > ch) end = mid; else return indic_table[mid].type; } return 0; } #define define_category(cls, name) static int utf8_is##name (utfint ch)\ { return find_in_range(name##_table, table_size(name##_table), ch); } #define define_converter(name) static utfint utf8_to##name (utfint ch) \ { return convert_char(to##name##_table, table_size(to##name##_table), ch); } utf8_categories(define_category) utf8_converters(define_converter) #undef define_category #undef define_converter static int utf8_isgraph (utfint ch) { if (find_in_range(space_table, table_size(space_table), ch)) return 0; if (find_in_range(graph_table, table_size(graph_table), ch)) return 1; if (find_in_range(compose_table, table_size(compose_table), ch)) return 1; return 0; } static int utf8_isalnum (utfint ch) { if (find_in_range(alpha_table, table_size(alpha_table), ch)) return 1; if (find_in_range(alnum_extend_table, table_size(alnum_extend_table), ch)) return 1; return 0; } static int utf8_width (utfint ch, int ambi_is_single) { if (find_in_range(doublewidth_table, table_size(doublewidth_table), ch)) return 2; if (find_in_range(ambiwidth_table, table_size(ambiwidth_table), ch)) return ambi_is_single ? 1 : 2; if (find_in_range(compose_table, table_size(compose_table), ch)) return 0; if (find_in_range(unprintable_table, table_size(unprintable_table), ch)) return 0; return 1; } /* string module compatible interface */ static int typeerror (lua_State *L, int idx, const char *tname) { return luaL_error(L, "%s expected, got %s", tname, luaL_typename(L, idx)); } static const char *check_utf8 (lua_State *L, int idx, const char **end) { size_t len; const char *s = luaL_checklstring(L, idx, &len); if (end) *end = s+len; return s; } static const char *to_utf8 (lua_State *L, int idx, const char **end) { size_t len; const char *s = lua_tolstring(L, idx, &len); if (end) *end = s+len; return s; } static const char *utf8_safe_decode (lua_State *L, const char *p, utfint *pval) { p = utf8_decode(p, pval, 0); if (p == NULL) luaL_error(L, "invalid UTF-8 code"); return p; } static void add_utf8char (luaL_Buffer *b, utfint ch) { char buff[UTF8_BUFFSZ]; size_t n = utf8_encode(buff, ch); luaL_addlstring(b, buff+UTF8_BUFFSZ-n, n); } static lua_Integer byte_relat (lua_Integer pos, size_t len) { if (pos >= 0) return pos; else if (0u - (size_t)pos > len) return 0; else return (lua_Integer)len + pos + 1; } static int Lutf8_len (lua_State *L) { size_t len, n; const char *s = luaL_checklstring(L, 1, &len), *p, *e; lua_Integer posi = byte_relat(luaL_optinteger(L, 2, 1), len); lua_Integer pose = byte_relat(luaL_optinteger(L, 3, -1), len); int lax = lua_toboolean(L, 4); luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 2, "initial position out of string"); luaL_argcheck(L, --pose < (lua_Integer)len, 3, "final position out of string"); for (n = 0, p=s+posi, e=s+pose+1; p < e; ++n) { if (lax) p = utf8_next(p, e); else { utfint ch; const char *np = utf8_decode(p, &ch, !lax); if (np == NULL || utf8_invalid(ch)) { lua_pushnil(L); lua_pushinteger(L, p - s + 1); return 2; } p = np; } } lua_pushinteger(L, n); return 1; } static int Lutf8_sub (lua_State *L) { const char *e, *s = check_utf8(L, 1, &e); lua_Integer posi = luaL_checkinteger(L, 2); lua_Integer pose = luaL_optinteger(L, 3, -1); if (utf8_range(s, e, &posi, &pose)) lua_pushlstring(L, s+posi, pose-posi); else lua_pushliteral(L, ""); return 1; } static int Lutf8_reverse (lua_State *L) { luaL_Buffer b; const char *prev, *pprev, *ends, *e, *s = check_utf8(L, 1, &e); (void) ends; int lax = lua_toboolean(L, 2); luaL_buffinit(L, &b); if (lax) { for (prev = e; s < prev; e = prev) { prev = utf8_prev(s, prev); luaL_addlstring(&b, prev, e-prev); } } else { for (prev = e; s < prev; prev = pprev) { utfint code = 0; ends = utf8_safe_decode(L, pprev = utf8_prev(s, prev), &code); assert(ends == prev); if (utf8_invalid(code)) return luaL_error(L, "invalid UTF-8 code"); if (!utf8_iscompose(code)) { luaL_addlstring(&b, pprev, e-pprev); e = pprev; } } } luaL_pushresult(&b); return 1; } static int Lutf8_byte (lua_State *L) { size_t n = 0; const char *e, *s = check_utf8(L, 1, &e); lua_Integer posi = luaL_optinteger(L, 2, 1); lua_Integer pose = luaL_optinteger(L, 3, posi); if (utf8_range(s, e, &posi, &pose)) { for (e = s + pose, s = s + posi; s < e; ++n) { utfint ch = 0; s = utf8_safe_decode(L, s, &ch); lua_pushinteger(L, ch); } } return CAST(int, n); } static int Lutf8_codepoint (lua_State *L) { const char *e, *s = check_utf8(L, 1, &e); size_t len = e-s; lua_Integer posi = byte_relat(luaL_optinteger(L, 2, 1), len); lua_Integer pose = byte_relat(luaL_optinteger(L, 3, posi), len); int lax = lua_toboolean(L, 4); int n; const char *se; luaL_argcheck(L, posi >= 1, 2, "out of range"); luaL_argcheck(L, pose <= (lua_Integer)len, 3, "out of range"); if (posi > pose) return 0; /* empty interval; return no values */ if (pose - posi >= INT_MAX) /* (lua_Integer -> int) overflow? */ return luaL_error(L, "string slice too long"); n = (int)(pose - posi + 1); luaL_checkstack(L, n, "string slice too long"); n = 0; /* count the number of returns */ se = s + pose; /* string end */ for (n = 0, s += posi - 1; s < se;) { utfint code = 0; s = utf8_safe_decode(L, s, &code); if (!lax && utf8_invalid(code)) return luaL_error(L, "invalid UTF-8 code"); lua_pushinteger(L, code); n++; } return n; } static int Lutf8_char (lua_State *L) { int i, n = lua_gettop(L); /* number of arguments */ luaL_Buffer b; luaL_buffinit(L, &b); for (i = 1; i <= n; ++i) { lua_Integer code = luaL_checkinteger(L, i); luaL_argcheck(L, code <= UTF8_MAXCP, i, "value out of range"); add_utf8char(&b, CAST(utfint, code)); } luaL_pushresult(&b); return 1; } #define bind_converter(name) \ static int Lutf8_##name (lua_State *L) { \ int t = lua_type(L, 1); \ if (t == LUA_TNUMBER) \ lua_pushinteger(L, utf8_to##name(CAST(utfint, lua_tointeger(L, 1)))); \ else if (t == LUA_TSTRING) { \ luaL_Buffer b; \ const char *e, *s = to_utf8(L, 1, &e); \ luaL_buffinit(L, &b); \ while (s < e) { \ utfint ch = 0; \ s = utf8_safe_decode(L, s, &ch); \ add_utf8char(&b, utf8_to##name(ch)); \ } \ luaL_pushresult(&b); \ } \ else return typeerror(L, 1, "number/string"); \ return 1; \ } utf8_converters(bind_converter) #undef bind_converter /* unicode extra interface */ static const char *parse_escape (lua_State *L, const char *s, const char *e, int hex, utfint *pch) { utfint code = 0; int in_bracket = 0; if (*s == '{') ++s, in_bracket = 1; for (; s < e; ++s) { utfint ch = (unsigned char)*s; if (ch >= '0' && ch <= '9') ch = ch - '0'; else if (hex && ch >= 'A' && ch <= 'F') ch = 10 + (ch - 'A'); else if (hex && ch >= 'a' && ch <= 'f') ch = 10 + (ch - 'a'); else if (!in_bracket) break; else if (ch == '}') { ++s; break; } else luaL_error(L, "invalid escape '%c'", ch); code *= hex ? 16 : 10; code += ch; } *pch = code; return s; } static int Lutf8_escape (lua_State *L) { const char *e, *s = check_utf8(L, 1, &e); luaL_Buffer b; luaL_buffinit(L, &b); while (s < e) { utfint ch = 0; s = utf8_safe_decode(L, s, &ch); if (ch == '%') { int hex = 0; switch (*s) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': case '{': break; case 'x': case 'X': hex = 1; /* fall through */ case 'u': case 'U': if (s+1 < e) { ++s; break; } /* fall through */ default: s = utf8_safe_decode(L, s, &ch); goto next; } s = parse_escape(L, s, e, hex, &ch); } next: add_utf8char(&b, ch); } luaL_pushresult(&b); return 1; } static int Lutf8_insert (lua_State *L) { const char *e, *s = check_utf8(L, 1, &e); size_t sublen; const char *subs; luaL_Buffer b; int nargs = 2; const char *first = e; if (lua_type(L, 2) == LUA_TNUMBER) { int idx = (int)lua_tointeger(L, 2); if (idx != 0) first = utf8_relat(s, e, idx); luaL_argcheck(L, first, 2, "invalid index"); ++nargs; } subs = luaL_checklstring(L, nargs, &sublen); luaL_buffinit(L, &b); luaL_addlstring(&b, s, first-s); luaL_addlstring(&b, subs, sublen); luaL_addlstring(&b, first, e-first); luaL_pushresult(&b); return 1; } static int Lutf8_remove (lua_State *L) { const char *e, *s = check_utf8(L, 1, &e); lua_Integer posi = luaL_optinteger(L, 2, -1); lua_Integer pose = luaL_optinteger(L, 3, -1); if (!utf8_range(s, e, &posi, &pose)) lua_settop(L, 1); else { luaL_Buffer b; luaL_buffinit(L, &b); luaL_addlstring(&b, s, posi); luaL_addlstring(&b, s+pose, e-s-pose); luaL_pushresult(&b); } return 1; } static int push_offset (lua_State *L, const char *s, const char *e, lua_Integer offset, lua_Integer idx) { utfint ch = 0; const char *p; if (idx != 0) p = utf8_offset(s, e, offset, idx); else if (p = s+offset-1, iscont(p)) p = utf8_prev(s, p); if (p == NULL || p == e) return 0; utf8_decode(p, &ch, 0); lua_pushinteger(L, p-s+1); lua_pushinteger(L, ch); return 2; } static int Lutf8_charpos (lua_State *L) { const char *e, *s = check_utf8(L, 1, &e); lua_Integer offset = 1; if (lua_isnoneornil(L, 3)) { lua_Integer idx = luaL_optinteger(L, 2, 0); if (idx > 0) --idx; else if (idx < 0) offset = e-s+1; return push_offset(L, s, e, offset, idx); } offset = byte_relat(luaL_optinteger(L, 2, 1), e-s); if (offset < 1) offset = 1; return push_offset(L, s, e, offset, luaL_checkinteger(L, 3)); } static int Lutf8_offset (lua_State *L) { size_t len; const char *s = luaL_checklstring(L, 1, &len); lua_Integer n = luaL_checkinteger(L, 2); lua_Integer posi = (n >= 0) ? 1 : len + 1; posi = byte_relat(luaL_optinteger(L, 3, posi), len); luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 3, "position out of range"); if (n == 0) { /* find beginning of current byte sequence */ while (posi > 0 && iscont(s + posi)) posi--; } else { if (iscont(s + posi)) return luaL_error(L, "initial position is a continuation byte"); if (n < 0) { while (n < 0 && posi > 0) { /* move back */ do { /* find beginning of previous character */ posi--; } while (posi > 0 && iscont(s + posi)); n++; } } else { n--; /* do not move for 1st character */ while (n > 0 && posi < (lua_Integer)len) { do { /* find beginning of next character */ posi++; } while (iscont(s + posi)); /* (cannot pass final '\0') */ n--; } } } if (n == 0) /* did it find given character? */ lua_pushinteger(L, posi + 1); else /* no such character */ lua_pushnil(L); return 1; } static int Lutf8_next (lua_State *L) { const char *e, *s = check_utf8(L, 1, &e); lua_Integer offset = byte_relat(luaL_optinteger(L, 2, 1), e-s); lua_Integer idx = luaL_optinteger(L, 3, !lua_isnoneornil(L, 2)); return push_offset(L, s, e, offset, idx); } static int iter_aux (lua_State *L, int strict) { const char *e, *s = check_utf8(L, 1, &e); int n = CAST(int, lua_tointeger(L, 2)); const char *p = n <= 0 ? s : utf8_next(s+n-1, e); if (p < e) { utfint code = 0; utf8_safe_decode(L, p, &code); if (strict && utf8_invalid(code)) return luaL_error(L, "invalid UTF-8 code"); lua_pushinteger(L, p-s+1); lua_pushinteger(L, code); return 2; } return 0; /* no more codepoints */ } static int iter_auxstrict (lua_State *L) { return iter_aux(L, 1); } static int iter_auxlax (lua_State *L) { return iter_aux(L, 0); } static int Lutf8_codes (lua_State *L) { int lax = lua_toboolean(L, 2); luaL_checkstring(L, 1); lua_pushcfunction(L, lax ? iter_auxlax : iter_auxstrict); lua_pushvalue(L, 1); lua_pushinteger(L, 0); return 3; } static int Lutf8_width (lua_State *L) { int t = lua_type(L, 1); int ambi_is_single = !lua_toboolean(L, 2); int default_width = CAST(int, luaL_optinteger(L, 3, 0)); if (t == LUA_TNUMBER) { size_t chwidth = utf8_width(CAST(utfint, lua_tointeger(L, 1)), ambi_is_single); if (chwidth == 0) chwidth = default_width; lua_pushinteger(L, (lua_Integer)chwidth); } else if (t != LUA_TSTRING) return typeerror(L, 1, "number/string"); else { const char *e, *s = to_utf8(L, 1, &e); int width = 0; while (s < e) { utfint ch = 0; int chwidth; s = utf8_safe_decode(L, s, &ch); chwidth = utf8_width(ch, ambi_is_single); width += chwidth == 0 ? default_width : chwidth; } lua_pushinteger(L, (lua_Integer)width); } return 1; } static int Lutf8_widthindex (lua_State *L) { const char *e, *s = check_utf8(L, 1, &e); int width = CAST(int, luaL_checkinteger(L, 2)); int ambi_is_single = !lua_toboolean(L, 3); int default_width = CAST(int, luaL_optinteger(L, 4, 0)); size_t idx = 1; while (s < e) { utfint ch = 0; size_t chwidth; s = utf8_safe_decode(L, s, &ch); chwidth = utf8_width(ch, ambi_is_single); if (chwidth == 0) chwidth = default_width; width -= CAST(int, chwidth); if (width <= 0) { lua_pushinteger(L, idx); lua_pushinteger(L, width + chwidth); lua_pushinteger(L, chwidth); return 3; } ++idx; } lua_pushinteger(L, (lua_Integer)idx); return 1; } static int Lutf8_ncasecmp (lua_State *L) { const char *e1, *s1 = check_utf8(L, 1, &e1); const char *e2, *s2 = check_utf8(L, 2, &e2); while (s1 < e1 || s2 < e2) { utfint ch1 = 0, ch2 = 0; if (s1 == e1) ch2 = 1; else if (s2 == e2) ch1 = 1; else { s1 = utf8_safe_decode(L, s1, &ch1); s2 = utf8_safe_decode(L, s2, &ch2); ch1 = utf8_tofold(ch1); ch2 = utf8_tofold(ch2); } if (ch1 != ch2) { lua_pushinteger(L, ch1 > ch2 ? 1 : -1); return 1; } } lua_pushinteger(L, 0); return 1; } /* utf8 pattern matching implement */ #ifndef LUA_MAXCAPTURES # define LUA_MAXCAPTURES 32 #endif /* LUA_MAXCAPTURES */ #define CAP_UNFINISHED (-1) #define CAP_POSITION (-2) typedef struct MatchState { int matchdepth; /* control for recursive depth (to avoid C stack overflow) */ const char *src_init; /* init of source string */ const char *src_end; /* end ('\0') of source string */ const char *p_end; /* end ('\0') of pattern */ lua_State *L; int level; /* total number of captures (finished or unfinished) */ struct { const char *init; ptrdiff_t len; } capture[LUA_MAXCAPTURES]; } MatchState; /* recursive function */ static const char *match (MatchState *ms, const char *s, const char *p); /* maximum recursion depth for 'match' */ #if !defined(MAXCCALLS) #define MAXCCALLS 200 #endif #define L_ESC '%' #define SPECIALS "^$*+?.([%-" static int check_capture (MatchState *ms, int l) { l -= '1'; if (l < 0 || l >= ms->level || ms->capture[l].len == CAP_UNFINISHED) return luaL_error(ms->L, "invalid capture index %%%d", l + 1); return l; } static int capture_to_close (MatchState *ms) { int level = ms->level; while (--level >= 0) if (ms->capture[level].len == CAP_UNFINISHED) return level; return luaL_error(ms->L, "invalid pattern capture"); } static const char *classend (MatchState *ms, const char *p) { utfint ch = 0; p = utf8_safe_decode(ms->L, p, &ch); switch (ch) { case L_ESC: { if (p == ms->p_end) luaL_error(ms->L, "malformed pattern (ends with " LUA_QL("%%") ")"); return utf8_next(p, ms->p_end); } case '[': { if (*p == '^') p++; do { /* look for a `]' */ if (p == ms->p_end) luaL_error(ms->L, "malformed pattern (missing " LUA_QL("]") ")"); if (*(p++) == L_ESC && p < ms->p_end) p++; /* skip escapes (e.g. `%]') */ } while (*p != ']'); return p+1; } default: { return p; } } } static int match_class (utfint c, utfint cl) { int res; switch (utf8_tolower(cl)) { #define X(cls, name) case cls: res = utf8_is##name(c); break; utf8_categories(X) #undef X case 'g' : res = utf8_isgraph(c); break; case 'w' : res = utf8_isalnum(c); break; case 'z' : res = (c == 0); break; /* deprecated option */ default: return (cl == c); } return (utf8_islower(cl) ? res : !res); } static int matchbracketclass (MatchState *ms, utfint c, const char *p, const char *ec) { int sig = 1; assert(*p == '['); if (*++p == '^') { sig = 0; p++; /* skip the `^' */ } while (p < ec) { utfint ch = 0; p = utf8_safe_decode(ms->L, p, &ch); if (ch == L_ESC) { p = utf8_safe_decode(ms->L, p, &ch); if (match_class(c, ch)) return sig; } else { utfint next = 0; const char *np = utf8_safe_decode(ms->L, p, &next); if (next == '-' && np < ec) { p = utf8_safe_decode(ms->L, np, &next); if (ch <= c && c <= next) return sig; } else if (ch == c) return sig; } } return !sig; } static int singlematch (MatchState *ms, const char *s, const char *p, const char *ep) { if (s >= ms->src_end) return 0; else { utfint ch=0, pch=0; utf8_safe_decode(ms->L, s, &ch); p = utf8_safe_decode(ms->L, p, &pch); switch (pch) { case '.': return 1; /* matches any char */ case L_ESC: utf8_safe_decode(ms->L, p, &pch); return match_class(ch, pch); case '[': return matchbracketclass(ms, ch, p-1, ep-1); default: return pch == ch; } } } static const char *matchbalance (MatchState *ms, const char *s, const char **p) { utfint ch=0, begin=0, end=0; *p = utf8_safe_decode(ms->L, *p, &begin); if (*p >= ms->p_end) luaL_error(ms->L, "malformed pattern " "(missing arguments to " LUA_QL("%%b") ")"); *p = utf8_safe_decode(ms->L, *p, &end); s = utf8_safe_decode(ms->L, s, &ch); if (ch != begin) return NULL; else { int cont = 1; while (s < ms->src_end) { s = utf8_safe_decode(ms->L, s, &ch); if (ch == end) { if (--cont == 0) return s; } else if (ch == begin) cont++; } } return NULL; /* string ends out of balance */ } static const char *max_expand (MatchState *ms, const char *s, const char *p, const char *ep) { const char *m = s; /* matched end of single match p */ while (singlematch(ms, m, p, ep)) m = utf8_next(m, ms->src_end); /* keeps trying to match with the maximum repetitions */ while (s <= m) { const char *res = match(ms, m, ep+1); if (res) return res; /* else didn't match; reduce 1 repetition to try again */ if (s == m) break; m = utf8_prev(s, m); } return NULL; } static const char *min_expand (MatchState *ms, const char *s, const char *p, const char *ep) { for (;;) { const char *res = match(ms, s, ep+1); if (res != NULL) return res; else if (singlematch(ms, s, p, ep)) s = utf8_next(s, ms->src_end); /* try with one more repetition */ else return NULL; } } static const char *start_capture (MatchState *ms, const char *s, const char *p, int what) { const char *res; int level = ms->level; if (level >= LUA_MAXCAPTURES) luaL_error(ms->L, "too many captures"); ms->capture[level].init = s; ms->capture[level].len = what; ms->level = level+1; if ((res=match(ms, s, p)) == NULL) /* match failed? */ ms->level--; /* undo capture */ return res; } static const char *end_capture (MatchState *ms, const char *s, const char *p) { int l = capture_to_close(ms); const char *res; ms->capture[l].len = s - ms->capture[l].init; /* close capture */ if ((res = match(ms, s, p)) == NULL) /* match failed? */ ms->capture[l].len = CAP_UNFINISHED; /* undo capture */ return res; } static const char *match_capture (MatchState *ms, const char *s, int l) { size_t len; l = check_capture(ms, l); len = ms->capture[l].len; if ((size_t)(ms->src_end-s) >= len && memcmp(ms->capture[l].init, s, len) == 0) return s+len; else return NULL; } static const char *match (MatchState *ms, const char *s, const char *p) { if (ms->matchdepth-- == 0) luaL_error(ms->L, "pattern too complex"); init: /* using goto's to optimize tail recursion */ if (p != ms->p_end) { /* end of pattern? */ utfint ch = 0; utf8_safe_decode(ms->L, p, &ch); switch (ch) { case '(': { /* start capture */ if (*(p + 1) == ')') /* position capture? */ s = start_capture(ms, s, p + 2, CAP_POSITION); else s = start_capture(ms, s, p + 1, CAP_UNFINISHED); break; } case ')': { /* end capture */ s = end_capture(ms, s, p + 1); break; } case '$': { if ((p + 1) != ms->p_end) /* is the `$' the last char in pattern? */ goto dflt; /* no; go to default */ s = (s == ms->src_end) ? s : NULL; /* check end of string */ break; } case L_ESC: { /* escaped sequence not in the format class[*+?-]? */ const char *prev_p = p; p = utf8_safe_decode(ms->L, p+1, &ch); switch (ch) { case 'b': { /* balanced string? */ s = matchbalance(ms, s, &p); if (s != NULL) goto init; /* return match(ms, s, p + 4); */ /* else fail (s == NULL) */ break; } case 'f': { /* frontier? */ const char *ep; utfint previous = 0, current = 0; if (*p != '[') luaL_error(ms->L, "missing " LUA_QL("[") " after " LUA_QL("%%f") " in pattern"); ep = classend(ms, p); /* points to what is next */ if (s != ms->src_init) utf8_decode(utf8_prev(ms->src_init, s), &previous, 0); if (s != ms->src_end) utf8_decode(s, ¤t, 0); if (!matchbracketclass(ms, previous, p, ep - 1) && matchbracketclass(ms, current, p, ep - 1)) { p = ep; goto init; /* return match(ms, s, ep); */ } s = NULL; /* match failed */ break; } case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': { /* capture results (%0-%9)? */ s = match_capture(ms, s, ch); if (s != NULL) goto init; /* return match(ms, s, p + 2) */ break; } default: p = prev_p; goto dflt; } break; } default: dflt: { /* pattern class plus optional suffix */ const char *ep = classend(ms, p); /* points to optional suffix */ /* does not match at least once? */ if (!singlematch(ms, s, p, ep)) { if (*ep == '*' || *ep == '?' || *ep == '-') { /* accept empty? */ p = ep + 1; goto init; /* return match(ms, s, ep + 1); */ } else /* '+' or no suffix */ s = NULL; /* fail */ } else { /* matched once */ const char *next_s = utf8_next(s, ms->src_end); switch (*ep) { /* handle optional suffix */ case '?': { /* optional */ const char *res; const char *next_ep = utf8_next(ep, ms->p_end); if ((res = match(ms, next_s, next_ep)) != NULL) s = res; else { p = next_ep; goto init; /* else return match(ms, s, ep + 1); */ } break; } case '+': /* 1 or more repetitions */ s = next_s; /* 1 match already done */ /* fall through */ case '*': /* 0 or more repetitions */ s = max_expand(ms, s, p, ep); break; case '-': /* 0 or more repetitions (minimum) */ s = min_expand(ms, s, p, ep); break; default: /* no suffix */ s = next_s; p = ep; goto init; /* return match(ms, s + 1, ep); */ } } break; } } } ms->matchdepth++; return s; } static const char *lmemfind (const char *s1, size_t l1, const char *s2, size_t l2) { if (l2 == 0) return s1; /* empty strings are everywhere */ else if (l2 > l1) return NULL; /* avoids a negative `l1' */ else { const char *init; /* to search for a `*s2' inside `s1' */ l2--; /* 1st char will be checked by `memchr' */ l1 = l1-l2; /* `s2' cannot be found after that */ while (l1 > 0 && (init = (const char *)memchr(s1, *s2, l1)) != NULL) { init++; /* 1st char is already checked */ if (memcmp(init, s2+1, l2) == 0) return init-1; else { /* correct `l1' and `s1' to try again */ l1 -= init-s1; s1 = init; } } return NULL; /* not found */ } } static int get_index (const char *p, const char *s, const char *e) { int idx; for (idx = 0; s < e && s < p; ++idx) s = utf8_next(s, e); return s == p ? idx : idx - 1; } static void push_onecapture (MatchState *ms, int i, const char *s, const char *e) { if (i >= ms->level) { if (i == 0) /* ms->level == 0, too */ lua_pushlstring(ms->L, s, e - s); /* add whole match */ else luaL_error(ms->L, "invalid capture index"); } else { ptrdiff_t l = ms->capture[i].len; if (l == CAP_UNFINISHED) luaL_error(ms->L, "unfinished capture"); if (l == CAP_POSITION) { int idx = get_index(ms->capture[i].init, ms->src_init, ms->src_end); lua_pushinteger(ms->L, idx+1); } else lua_pushlstring(ms->L, ms->capture[i].init, l); } } static int push_captures (MatchState *ms, const char *s, const char *e) { int i; int nlevels = (ms->level == 0 && s) ? 1 : ms->level; luaL_checkstack(ms->L, nlevels, "too many captures"); for (i = 0; i < nlevels; i++) push_onecapture(ms, i, s, e); return nlevels; /* number of strings pushed */ } /* check whether pattern has no special characters */ static int nospecials (const char *p, const char * ep) { while (p < ep) { if (strpbrk(p, SPECIALS)) return 0; /* pattern has a special character */ p += strlen(p) + 1; /* may have more after \0 */ } return 1; /* no special chars found */ } /* utf8 pattern matching interface */ static int find_aux (lua_State *L, int find) { const char *es, *s = check_utf8(L, 1, &es); const char *ep, *p = check_utf8(L, 2, &ep); lua_Integer idx = luaL_optinteger(L, 3, 1); const char *init; if (!idx) idx = 1; init = utf8_relat(s, es, CAST(int, idx)); if (init == NULL) { if (idx > 0) { lua_pushnil(L); /* cannot find anything */ return 1; } init = s; } /* explicit request or no special characters? */ if (find && (lua_toboolean(L, 4) || nospecials(p, ep))) { /* do a plain search */ const char *s2 = lmemfind(init, es-init, p, ep-p); if (s2) { const char *e2 = s2 + (ep - p); if (iscont(e2)) e2 = utf8_next(e2, es); lua_pushinteger(L, idx = get_index(s2, s, es) + 1); lua_pushinteger(L, idx + get_index(e2, s2, es) - 1); return 2; } } else { MatchState ms; int anchor = (*p == '^'); if (anchor) p++; /* skip anchor character */ if (idx < 0) idx += utf8_length(s, es)+1; /* TODO not very good */ ms.L = L; ms.matchdepth = MAXCCALLS; ms.src_init = s; ms.src_end = es; ms.p_end = ep; do { const char *res; ms.level = 0; assert(ms.matchdepth == MAXCCALLS); if ((res=match(&ms, init, p)) != NULL) { if (find) { lua_pushinteger(L, idx); /* start */ lua_pushinteger(L, idx + utf8_length(init, res) - 1); /* end */ return push_captures(&ms, NULL, 0) + 2; } else return push_captures(&ms, init, res); } if (init == es) break; idx += 1; init = utf8_next(init, es); } while (init <= es && !anchor); } lua_pushnil(L); /* not found */ return 1; } static int Lutf8_find (lua_State *L) { return find_aux(L, 1); } static int Lutf8_match (lua_State *L) { return find_aux(L, 0); } static int gmatch_aux (lua_State *L) { MatchState ms; const char *es, *s = check_utf8(L, lua_upvalueindex(1), &es); const char *ep, *p = check_utf8(L, lua_upvalueindex(2), &ep); const char *src; ms.L = L; ms.matchdepth = MAXCCALLS; ms.src_init = s; ms.src_end = es; ms.p_end = ep; for (src = s + (size_t)lua_tointeger(L, lua_upvalueindex(3)); src <= ms.src_end; src = utf8_next(src, ms.src_end)) { const char *e; ms.level = 0; assert(ms.matchdepth == MAXCCALLS); if ((e = match(&ms, src, p)) != NULL) { lua_Integer newstart = e-s; if (e == src) newstart++; /* empty match? go at least one position */ lua_pushinteger(L, newstart); lua_replace(L, lua_upvalueindex(3)); return push_captures(&ms, src, e); } if (src == ms.src_end) break; } return 0; /* not found */ } static int Lutf8_gmatch (lua_State *L) { luaL_checkstring(L, 1); luaL_checkstring(L, 2); lua_settop(L, 2); lua_pushinteger(L, 0); lua_pushcclosure(L, gmatch_aux, 3); return 1; } static void add_s (MatchState *ms, luaL_Buffer *b, const char *s, const char *e) { const char *new_end, *news = to_utf8(ms->L, 3, &new_end); while (news < new_end) { utfint ch = 0; news = utf8_safe_decode(ms->L, news, &ch); if (ch != L_ESC) add_utf8char(b, ch); else { news = utf8_safe_decode(ms->L, news, &ch); /* skip ESC */ if (!utf8_isdigit(ch)) { if (ch != L_ESC) luaL_error(ms->L, "invalid use of " LUA_QL("%c") " in replacement string", L_ESC); add_utf8char(b, ch); } else if (ch == '0') luaL_addlstring(b, s, e-s); else { push_onecapture(ms, ch-'1', s, e); luaL_addvalue(b); /* add capture to accumulated result */ } } } } static void add_value (MatchState *ms, luaL_Buffer *b, const char *s, const char *e, int tr) { lua_State *L = ms->L; switch (tr) { case LUA_TFUNCTION: { int n; lua_pushvalue(L, 3); n = push_captures(ms, s, e); lua_call(L, n, 1); break; } case LUA_TTABLE: { push_onecapture(ms, 0, s, e); lua_gettable(L, 3); break; } default: { /* LUA_TNUMBER or LUA_TSTRING */ add_s(ms, b, s, e); return; } } if (!lua_toboolean(L, -1)) { /* nil or false? */ lua_pop(L, 1); lua_pushlstring(L, s, e - s); /* keep original text */ } else if (!lua_isstring(L, -1)) luaL_error(L, "invalid replacement value (a %s)", luaL_typename(L, -1)); luaL_addvalue(b); /* add result to accumulator */ } static int Lutf8_gsub (lua_State *L) { const char *es, *s = check_utf8(L, 1, &es); const char *ep, *p = check_utf8(L, 2, &ep); int tr = lua_type(L, 3); lua_Integer max_s = luaL_optinteger(L, 4, (es-s)+1); int anchor = (*p == '^'); lua_Integer n = 0; MatchState ms; luaL_Buffer b; luaL_argcheck(L, tr == LUA_TNUMBER || tr == LUA_TSTRING || tr == LUA_TFUNCTION || tr == LUA_TTABLE, 3, "string/function/table expected"); luaL_buffinit(L, &b); if (anchor) p++; /* skip anchor character */ ms.L = L; ms.matchdepth = MAXCCALLS; ms.src_init = s; ms.src_end = es; ms.p_end = ep; while (n < max_s) { const char *e; ms.level = 0; assert(ms.matchdepth == MAXCCALLS); e = match(&ms, s, p); if (e) { n++; add_value(&ms, &b, s, e, tr); } if (e && e > s) /* non empty match? */ s = e; /* skip it */ else if (s < es) { utfint ch = 0; s = utf8_safe_decode(L, s, &ch); add_utf8char(&b, ch); } else break; if (anchor) break; } luaL_addlstring(&b, s, es-s); luaL_pushresult(&b); lua_pushinteger(L, n); /* number of substitutions */ return 2; } static int Lutf8_isvalid(lua_State *L) { const char *e, *s = check_utf8(L, 1, &e); const char *invalid = utf8_invalid_offset(s, e); lua_pushboolean(L, invalid == NULL); return 1; } static int Lutf8_invalidoffset(lua_State *L) { const char *e, *s = check_utf8(L, 1, &e); const char *orig_s = s; int offset = luaL_optinteger(L, 2, 0); if (offset > 1) { offset--; s += offset; if (s >= e) { lua_pushnil(L); return 1; } } else if (offset < 0 && s - e < offset) { s = e + offset; } const char *invalid = utf8_invalid_offset(s, e); if (invalid == NULL) { lua_pushnil(L); } else { lua_pushinteger(L, invalid - orig_s + 1); } return 1; } static int Lutf8_clean(lua_State *L) { const char *e, *s = check_utf8(L, 1, &e); /* Default replacement string is REPLACEMENT CHARACTER U+FFFD */ size_t repl_len; const char *r = luaL_optlstring(L, 2, "\xEF\xBF\xBD", &repl_len); int continuous = !lua_toboolean(L, 3); if (lua_gettop(L) > 1) { /* Check if replacement string is valid UTF-8 or not */ if (utf8_invalid_offset(r, r + repl_len) != NULL) { lua_pushstring(L, "replacement string must be valid UTF-8"); lua_error(L); } } const char *invalid = utf8_invalid_offset(s, e); if (invalid == NULL) { lua_settop(L, 1); /* Return input string without modification */ lua_pushboolean(L, 1); /* String was clean already */ return 2; } luaL_Buffer buff; luaL_buffinit(L, &buff); while (1) { /* Invariant: 's' points to first GOOD byte not in output buffer, * 'invalid' points to first BAD byte after that */ luaL_addlstring(&buff, s, invalid - s); luaL_addlstring(&buff, r, repl_len); /* We do not replace every bad byte with the replacement character, * but rather a contiguous sequence of bad bytes * Restore the invariant by stepping forward until we find at least * one good byte */ s = invalid; while (s == invalid) { s++; invalid = utf8_invalid_offset(s, e); if (!continuous) break; } if (invalid == NULL) { luaL_addlstring(&buff, s, e - s); luaL_pushresult(&buff); lua_pushboolean(L, 0); /* String was not clean */ return 2; } } } static int Lutf8_isnfc(lua_State *L) { const char *e, *s = check_utf8(L, 1, &e); utfint starter = 0, ch; unsigned int prev_canon_cls = 0; while (s < e) { s = utf8_decode(s, &ch, 1); if (s == NULL) { lua_pushstring(L, "string is not valid UTF-8"); lua_error(L); } if (ch < 0x300) { starter = ch; /* Fast path */ prev_canon_cls = 0; continue; } unsigned int canon_cls = lookup_canon_cls(ch); if (canon_cls && canon_cls < prev_canon_cls) { /* Combining marks are out of order; this string is not NFC */ lua_pushboolean(L, 0); /* Return false */ return 1; } nfc_table *entry = nfc_quickcheck(ch); if (entry && !nfc_check(ch, entry, starter, canon_cls, prev_canon_cls)) { lua_pushboolean(L, 0); /* Return false */ return 1; } prev_canon_cls = canon_cls; if (!canon_cls) starter = ch; } lua_pushboolean(L, 1); /* Return true */ return 1; } static int Lutf8_normalize_nfc(lua_State *L) { const char *e, *s = check_utf8(L, 1, &e), *p = s, *starter_p = s; utfint starter = 0, ch; unsigned int prev_canon_cls = 0; /* First scan to see if we can find any problems... if not, we may just return the * input string unchanged */ while (p < e) { const char *new_p = utf8_decode(p, &ch, 1); if (new_p == NULL) { lua_pushstring(L, "string is not valid UTF-8"); lua_error(L); } unsigned int canon_cls = lookup_canon_cls(ch); if (canon_cls && canon_cls < prev_canon_cls) { goto build_string; /* Combining marks are out of order; this string is not NFC */ } nfc_table *entry = nfc_quickcheck(ch); if (entry && !nfc_check(ch, entry, starter, canon_cls, prev_canon_cls)) { goto build_string; } prev_canon_cls = canon_cls; if (!canon_cls) { starter = ch; starter_p = p; } p = new_p; } lua_settop(L, 1); /* Return input string without modification */ lua_pushboolean(L, 1); /* String was in normal form already, so 2nd return value is 'true' */ return 2; build_string: ; /* We will need to build a new string, this one is not NFC */ luaL_Buffer buff; luaL_buffinit(L, &buff); luaL_addlstring(&buff, s, starter_p - s); string_to_nfc(L, &buff, starter_p, e); luaL_pushresult(&buff); lua_pushboolean(L, 0); return 2; } static int iterate_grapheme_indices(lua_State *L) { const char *s = luaL_checkstring(L, lua_upvalueindex(1)); lua_Integer pos = luaL_checkinteger(L, lua_upvalueindex(2)); lua_Integer end = luaL_checkinteger(L, lua_upvalueindex(3)); if (pos > end) { lua_pushnil(L); return 1; } const char *e = s + end; utfint ch, next_ch; const char *p = utf8_safe_decode(L, s + pos - 1, &ch); while (1) { const char *next_p = utf8_safe_decode(L, p, &next_ch); int bind = 0; if (ch == '\r') { if (next_ch == '\n') { /* CR binds to following LF */ bind = 1; } else { break; } } else if (ch == '\n' || next_ch == '\r' || next_ch == '\n') { /* CR/LF do not bind to any other codepoint or in any other way */ break; } else if (find_in_range(cntrl_table, table_size(cntrl_table), ch) && !find_in_range(prepend_table, table_size(prepend_table), ch) && ch != 0x200D) { /* Control characters do not bind to anything */ break; } else if (next_ch == 0x200D) { /* U+200D is ZERO WIDTH JOINER, it always binds to preceding char */ if (next_p < e && find_in_range(pictographic_table, table_size(pictographic_table), ch)) { /* After an Extended_Pictographic codepoint and ZWJ, we bind to a following Extended_Pictographic */ utfint nextnext_ch; const char *probe_ep = utf8_safe_decode(L, next_p, &nextnext_ch); if (find_in_range(pictographic_table, table_size(pictographic_table), nextnext_ch)) { p = probe_ep; ch = nextnext_ch; continue; } } bind = 1; } else if (find_in_range(cntrl_table, table_size(cntrl_table), next_ch) && !find_in_range(prepend_table, table_size(prepend_table), next_ch)) { /* Control characters do not bind to anything */ break; } else { if (indic_conjunct_type(ch) == INDIC_CONSONANT) { utfint probed_ch = next_ch; const char *probe = next_p; int indic_type = indic_conjunct_type(probed_ch); int saw_linker = 0; while (indic_type) { /* Consume any number of Extend or Linker codepoints, followed by a single Consonant * The sequence must contain at least one Linker, however! */ if (indic_type == INDIC_LINKER) { saw_linker = 1; } else if (indic_type == INDIC_CONSONANT) { if (!saw_linker) break; p = probe; ch = probed_ch; goto next_iteration; } if (probe >= e) break; probe = utf8_safe_decode(L, probe, &probed_ch); indic_type = indic_conjunct_type(probed_ch); } } if (find_in_range(compose_table, table_size(compose_table), next_ch) || (next_ch >= 0x1F3FB && next_ch <= 0x1F3FF)) { /* The 2nd codepoint has property Grapheme_Extend, or is an Emoji_Modifier codepoint */ if (next_p < e && find_in_range(pictographic_table, table_size(pictographic_table), ch)) { /* Consume any number of 'extend' codepoints, one ZWJ, and following Extended_Pictographic codepoint */ utfint probed_ch; const char *probe = next_p; while (probe < e) { probe = utf8_safe_decode(L, probe, &probed_ch); if (probed_ch == 0x200D) { if (probe < e) { probe = utf8_safe_decode(L, probe, &probed_ch); if (find_in_range(pictographic_table, table_size(pictographic_table), probed_ch)) { next_p = probe; next_ch = probed_ch; } } break; } else if (find_in_range(compose_table, table_size(compose_table), probed_ch) || (probed_ch >= 0x1F3FB && probed_ch <= 0x1F3FF)) { next_p = probe; next_ch = probed_ch; } else { break; } } } bind = 1; } else if (find_in_range(spacing_mark_table, table_size(spacing_mark_table), next_ch)) { /* The 2nd codepoint is in general category Spacing_Mark */ bind = 1; } else if (find_in_range(prepend_table, table_size(prepend_table), ch)) { /* The 1st codepoint has property Prepend_Concatenation_Mark, or is a type of * Indic Syllable which binds to the following codepoint */ bind = 1; } else if (ch >= 0x1F1E6 && ch <= 0x1F1FF && next_ch >= 0x1F1E6 && next_ch <= 0x1F1FF) { /* Regional Indicator (flag) emoji bind together; but only in twos */ p = next_p; ch = 0xFFFE; /* Set 'ch' to bogus value so we will not re-enter this branch on next iteration */ continue; } else { /* Korean Hangul codepoints have their own special rules about when they * are considered a single grapheme cluster */ int hangul1 = hangul_type(ch); if (hangul1) { int hangul2 = hangul_type(next_ch); if (hangul2) { if (hangul1 == HANGUL_L) { bind = (hangul2 != HANGUL_T); } else if (hangul1 == HANGUL_LV || hangul1 == HANGUL_V) { bind = (hangul2 == HANGUL_V || hangul2 == HANGUL_T); } else if (hangul1 == HANGUL_LVT || hangul1 == HANGUL_T) { bind = (hangul2 == HANGUL_T); } } } } } if (!bind) break; p = next_p; ch = next_ch; next_iteration: ; } lua_pushinteger(L, (p - s) + 1); lua_replace(L, lua_upvalueindex(2)); lua_pushinteger(L, pos); lua_pushinteger(L, p - s); return 2; } static int Lutf8_grapheme_indices(lua_State *L) { size_t len; luaL_checklstring(L, 1, &len); lua_Integer start = byte_relat(luaL_optinteger(L, 2, 1), len); lua_Integer end = byte_relat(luaL_optinteger(L, 3, len), len); luaL_argcheck(L, start >= 1, 2, "out of range"); luaL_argcheck(L, end <= (lua_Integer)len, 3, "out of range"); lua_settop(L, 1); lua_pushinteger(L, start); lua_pushinteger(L, end); lua_pushcclosure(L, iterate_grapheme_indices, 3); return 1; } /* lua module import interface */ #if LUA_VERSION_NUM >= 502 static const char UTF8PATT[] = "[\0-\x7F\xC2-\xF4][\x80-\xBF]*"; #else static const char UTF8PATT[] = "[%z\1-\x7F\xC2-\xF4][\x80-\xBF]*"; #endif int luaopen_utf8extra (lua_State *L) { luaL_Reg libs[] = { #define ENTRY(name) { #name, Lutf8_##name } ENTRY(offset), ENTRY(codes), ENTRY(codepoint), ENTRY(len), ENTRY(sub), ENTRY(reverse), ENTRY(lower), ENTRY(upper), ENTRY(title), ENTRY(fold), ENTRY(byte), ENTRY(char), ENTRY(escape), ENTRY(insert), ENTRY(remove), ENTRY(charpos), ENTRY(next), ENTRY(width), ENTRY(widthindex), ENTRY(ncasecmp), ENTRY(find), ENTRY(gmatch), ENTRY(gsub), ENTRY(match), ENTRY(isvalid), ENTRY(invalidoffset), ENTRY(clean), ENTRY(isnfc), ENTRY(normalize_nfc), ENTRY(grapheme_indices), #undef ENTRY { NULL, NULL } }; luaL_newlib(L, libs); lua_pushlstring(L, UTF8PATT, sizeof(UTF8PATT)-1); lua_setfield(L, -2, "charpattern"); return 1; }