18e3e3a7aSWarner Losh /* 2*0495ed39SKyle Evans ** $Id: lutf8lib.c $ 38e3e3a7aSWarner Losh ** Standard library for UTF-8 manipulation 48e3e3a7aSWarner Losh ** See Copyright Notice in lua.h 58e3e3a7aSWarner Losh */ 68e3e3a7aSWarner Losh 78e3e3a7aSWarner Losh #define lutf8lib_c 88e3e3a7aSWarner Losh #define LUA_LIB 98e3e3a7aSWarner Losh 108e3e3a7aSWarner Losh #include "lprefix.h" 118e3e3a7aSWarner Losh 128e3e3a7aSWarner Losh 138e3e3a7aSWarner Losh #include <assert.h> 148e3e3a7aSWarner Losh #include <limits.h> 158e3e3a7aSWarner Losh #include <stdlib.h> 168e3e3a7aSWarner Losh #include <string.h> 178e3e3a7aSWarner Losh 188e3e3a7aSWarner Losh #include "lua.h" 198e3e3a7aSWarner Losh 208e3e3a7aSWarner Losh #include "lauxlib.h" 218e3e3a7aSWarner Losh #include "lualib.h" 228e3e3a7aSWarner Losh 23*0495ed39SKyle Evans 24*0495ed39SKyle Evans #define MAXUNICODE 0x10FFFFu 25*0495ed39SKyle Evans 26*0495ed39SKyle Evans #define MAXUTF 0x7FFFFFFFu 27*0495ed39SKyle Evans 28*0495ed39SKyle Evans /* 29*0495ed39SKyle Evans ** Integer type for decoded UTF-8 values; MAXUTF needs 31 bits. 30*0495ed39SKyle Evans */ 31*0495ed39SKyle Evans #if (UINT_MAX >> 30) >= 1 32*0495ed39SKyle Evans typedef unsigned int utfint; 33*0495ed39SKyle Evans #else 34*0495ed39SKyle Evans typedef unsigned long utfint; 35*0495ed39SKyle Evans #endif 36*0495ed39SKyle Evans 378e3e3a7aSWarner Losh 388e3e3a7aSWarner Losh #define iscont(p) ((*(p) & 0xC0) == 0x80) 398e3e3a7aSWarner Losh 408e3e3a7aSWarner Losh 418e3e3a7aSWarner Losh /* from strlib */ 428e3e3a7aSWarner Losh /* translate a relative string position: negative means back from end */ 438e3e3a7aSWarner Losh static lua_Integer u_posrelat (lua_Integer pos, size_t len) { 448e3e3a7aSWarner Losh if (pos >= 0) return pos; 458e3e3a7aSWarner Losh else if (0u - (size_t)pos > len) return 0; 468e3e3a7aSWarner Losh else return (lua_Integer)len + pos + 1; 478e3e3a7aSWarner Losh } 488e3e3a7aSWarner Losh 498e3e3a7aSWarner Losh 508e3e3a7aSWarner Losh /* 51*0495ed39SKyle Evans ** Decode one UTF-8 sequence, returning NULL if byte sequence is 52*0495ed39SKyle Evans ** invalid. The array 'limits' stores the minimum value for each 53*0495ed39SKyle Evans ** sequence length, to check for overlong representations. Its first 54*0495ed39SKyle Evans ** entry forces an error for non-ascii bytes with no continuation 55*0495ed39SKyle Evans ** bytes (count == 0). 568e3e3a7aSWarner Losh */ 57*0495ed39SKyle Evans static const char *utf8_decode (const char *s, utfint *val, int strict) { 58*0495ed39SKyle Evans static const utfint limits[] = 59*0495ed39SKyle Evans {~(utfint)0, 0x80, 0x800, 0x10000u, 0x200000u, 0x4000000u}; 60*0495ed39SKyle Evans unsigned int c = (unsigned char)s[0]; 61*0495ed39SKyle Evans utfint res = 0; /* final result */ 628e3e3a7aSWarner Losh if (c < 0x80) /* ascii? */ 638e3e3a7aSWarner Losh res = c; 648e3e3a7aSWarner Losh else { 658e3e3a7aSWarner Losh int count = 0; /* to count number of continuation bytes */ 66*0495ed39SKyle Evans for (; c & 0x40; c <<= 1) { /* while it needs continuation bytes... */ 67*0495ed39SKyle Evans unsigned int cc = (unsigned char)s[++count]; /* read next byte */ 688e3e3a7aSWarner Losh if ((cc & 0xC0) != 0x80) /* not a continuation byte? */ 698e3e3a7aSWarner Losh return NULL; /* invalid byte sequence */ 708e3e3a7aSWarner Losh res = (res << 6) | (cc & 0x3F); /* add lower 6 bits from cont. byte */ 718e3e3a7aSWarner Losh } 72*0495ed39SKyle Evans res |= ((utfint)(c & 0x7F) << (count * 5)); /* add first byte */ 73*0495ed39SKyle Evans if (count > 5 || res > MAXUTF || res < limits[count]) 748e3e3a7aSWarner Losh return NULL; /* invalid byte sequence */ 758e3e3a7aSWarner Losh s += count; /* skip continuation bytes read */ 768e3e3a7aSWarner Losh } 77*0495ed39SKyle Evans if (strict) { 78*0495ed39SKyle Evans /* check for invalid code points; too large or surrogates */ 79*0495ed39SKyle Evans if (res > MAXUNICODE || (0xD800u <= res && res <= 0xDFFFu)) 80*0495ed39SKyle Evans return NULL; 81*0495ed39SKyle Evans } 828e3e3a7aSWarner Losh if (val) *val = res; 83*0495ed39SKyle Evans return s + 1; /* +1 to include first byte */ 848e3e3a7aSWarner Losh } 858e3e3a7aSWarner Losh 868e3e3a7aSWarner Losh 878e3e3a7aSWarner Losh /* 88*0495ed39SKyle Evans ** utf8len(s [, i [, j [, lax]]]) --> number of characters that 89*0495ed39SKyle Evans ** start in the range [i,j], or nil + current position if 's' is not 90*0495ed39SKyle Evans ** well formed in that interval 918e3e3a7aSWarner Losh */ 928e3e3a7aSWarner Losh static int utflen (lua_State *L) { 93*0495ed39SKyle Evans lua_Integer n = 0; /* counter for the number of characters */ 94*0495ed39SKyle Evans size_t len; /* string length in bytes */ 958e3e3a7aSWarner Losh const char *s = luaL_checklstring(L, 1, &len); 968e3e3a7aSWarner Losh lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len); 978e3e3a7aSWarner Losh lua_Integer posj = u_posrelat(luaL_optinteger(L, 3, -1), len); 98*0495ed39SKyle Evans int lax = lua_toboolean(L, 4); 998e3e3a7aSWarner Losh luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 2, 100*0495ed39SKyle Evans "initial position out of bounds"); 1018e3e3a7aSWarner Losh luaL_argcheck(L, --posj < (lua_Integer)len, 3, 102*0495ed39SKyle Evans "final position out of bounds"); 1038e3e3a7aSWarner Losh while (posi <= posj) { 104*0495ed39SKyle Evans const char *s1 = utf8_decode(s + posi, NULL, !lax); 1058e3e3a7aSWarner Losh if (s1 == NULL) { /* conversion error? */ 106*0495ed39SKyle Evans luaL_pushfail(L); /* return fail ... */ 1078e3e3a7aSWarner Losh lua_pushinteger(L, posi + 1); /* ... and current position */ 1088e3e3a7aSWarner Losh return 2; 1098e3e3a7aSWarner Losh } 1108e3e3a7aSWarner Losh posi = s1 - s; 1118e3e3a7aSWarner Losh n++; 1128e3e3a7aSWarner Losh } 1138e3e3a7aSWarner Losh lua_pushinteger(L, n); 1148e3e3a7aSWarner Losh return 1; 1158e3e3a7aSWarner Losh } 1168e3e3a7aSWarner Losh 1178e3e3a7aSWarner Losh 1188e3e3a7aSWarner Losh /* 119*0495ed39SKyle Evans ** codepoint(s, [i, [j [, lax]]]) -> returns codepoints for all 120*0495ed39SKyle Evans ** characters that start in the range [i,j] 1218e3e3a7aSWarner Losh */ 1228e3e3a7aSWarner Losh static int codepoint (lua_State *L) { 1238e3e3a7aSWarner Losh size_t len; 1248e3e3a7aSWarner Losh const char *s = luaL_checklstring(L, 1, &len); 1258e3e3a7aSWarner Losh lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len); 1268e3e3a7aSWarner Losh lua_Integer pose = u_posrelat(luaL_optinteger(L, 3, posi), len); 127*0495ed39SKyle Evans int lax = lua_toboolean(L, 4); 1288e3e3a7aSWarner Losh int n; 1298e3e3a7aSWarner Losh const char *se; 130*0495ed39SKyle Evans luaL_argcheck(L, posi >= 1, 2, "out of bounds"); 131*0495ed39SKyle Evans luaL_argcheck(L, pose <= (lua_Integer)len, 3, "out of bounds"); 1328e3e3a7aSWarner Losh if (posi > pose) return 0; /* empty interval; return no values */ 1338e3e3a7aSWarner Losh if (pose - posi >= INT_MAX) /* (lua_Integer -> int) overflow? */ 1348e3e3a7aSWarner Losh return luaL_error(L, "string slice too long"); 135*0495ed39SKyle Evans n = (int)(pose - posi) + 1; /* upper bound for number of returns */ 1368e3e3a7aSWarner Losh luaL_checkstack(L, n, "string slice too long"); 137*0495ed39SKyle Evans n = 0; /* count the number of returns */ 138*0495ed39SKyle Evans se = s + pose; /* string end */ 1398e3e3a7aSWarner Losh for (s += posi - 1; s < se;) { 140*0495ed39SKyle Evans utfint code; 141*0495ed39SKyle Evans s = utf8_decode(s, &code, !lax); 1428e3e3a7aSWarner Losh if (s == NULL) 1438e3e3a7aSWarner Losh return luaL_error(L, "invalid UTF-8 code"); 1448e3e3a7aSWarner Losh lua_pushinteger(L, code); 1458e3e3a7aSWarner Losh n++; 1468e3e3a7aSWarner Losh } 1478e3e3a7aSWarner Losh return n; 1488e3e3a7aSWarner Losh } 1498e3e3a7aSWarner Losh 1508e3e3a7aSWarner Losh 1518e3e3a7aSWarner Losh static void pushutfchar (lua_State *L, int arg) { 152*0495ed39SKyle Evans lua_Unsigned code = (lua_Unsigned)luaL_checkinteger(L, arg); 153*0495ed39SKyle Evans luaL_argcheck(L, code <= MAXUTF, arg, "value out of range"); 1548e3e3a7aSWarner Losh lua_pushfstring(L, "%U", (long)code); 1558e3e3a7aSWarner Losh } 1568e3e3a7aSWarner Losh 1578e3e3a7aSWarner Losh 1588e3e3a7aSWarner Losh /* 1598e3e3a7aSWarner Losh ** utfchar(n1, n2, ...) -> char(n1)..char(n2)... 1608e3e3a7aSWarner Losh */ 1618e3e3a7aSWarner Losh static int utfchar (lua_State *L) { 1628e3e3a7aSWarner Losh int n = lua_gettop(L); /* number of arguments */ 1638e3e3a7aSWarner Losh if (n == 1) /* optimize common case of single char */ 1648e3e3a7aSWarner Losh pushutfchar(L, 1); 1658e3e3a7aSWarner Losh else { 1668e3e3a7aSWarner Losh int i; 1678e3e3a7aSWarner Losh luaL_Buffer b; 1688e3e3a7aSWarner Losh luaL_buffinit(L, &b); 1698e3e3a7aSWarner Losh for (i = 1; i <= n; i++) { 1708e3e3a7aSWarner Losh pushutfchar(L, i); 1718e3e3a7aSWarner Losh luaL_addvalue(&b); 1728e3e3a7aSWarner Losh } 1738e3e3a7aSWarner Losh luaL_pushresult(&b); 1748e3e3a7aSWarner Losh } 1758e3e3a7aSWarner Losh return 1; 1768e3e3a7aSWarner Losh } 1778e3e3a7aSWarner Losh 1788e3e3a7aSWarner Losh 1798e3e3a7aSWarner Losh /* 1808e3e3a7aSWarner Losh ** offset(s, n, [i]) -> index where n-th character counting from 1818e3e3a7aSWarner Losh ** position 'i' starts; 0 means character at 'i'. 1828e3e3a7aSWarner Losh */ 1838e3e3a7aSWarner Losh static int byteoffset (lua_State *L) { 1848e3e3a7aSWarner Losh size_t len; 1858e3e3a7aSWarner Losh const char *s = luaL_checklstring(L, 1, &len); 1868e3e3a7aSWarner Losh lua_Integer n = luaL_checkinteger(L, 2); 1878e3e3a7aSWarner Losh lua_Integer posi = (n >= 0) ? 1 : len + 1; 1888e3e3a7aSWarner Losh posi = u_posrelat(luaL_optinteger(L, 3, posi), len); 1898e3e3a7aSWarner Losh luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 3, 190*0495ed39SKyle Evans "position out of bounds"); 1918e3e3a7aSWarner Losh if (n == 0) { 1928e3e3a7aSWarner Losh /* find beginning of current byte sequence */ 1938e3e3a7aSWarner Losh while (posi > 0 && iscont(s + posi)) posi--; 1948e3e3a7aSWarner Losh } 1958e3e3a7aSWarner Losh else { 1968e3e3a7aSWarner Losh if (iscont(s + posi)) 197e112e9d2SKyle Evans return luaL_error(L, "initial position is a continuation byte"); 1988e3e3a7aSWarner Losh if (n < 0) { 1998e3e3a7aSWarner Losh while (n < 0 && posi > 0) { /* move back */ 2008e3e3a7aSWarner Losh do { /* find beginning of previous character */ 2018e3e3a7aSWarner Losh posi--; 2028e3e3a7aSWarner Losh } while (posi > 0 && iscont(s + posi)); 2038e3e3a7aSWarner Losh n++; 2048e3e3a7aSWarner Losh } 2058e3e3a7aSWarner Losh } 2068e3e3a7aSWarner Losh else { 2078e3e3a7aSWarner Losh n--; /* do not move for 1st character */ 2088e3e3a7aSWarner Losh while (n > 0 && posi < (lua_Integer)len) { 2098e3e3a7aSWarner Losh do { /* find beginning of next character */ 2108e3e3a7aSWarner Losh posi++; 2118e3e3a7aSWarner Losh } while (iscont(s + posi)); /* (cannot pass final '\0') */ 2128e3e3a7aSWarner Losh n--; 2138e3e3a7aSWarner Losh } 2148e3e3a7aSWarner Losh } 2158e3e3a7aSWarner Losh } 2168e3e3a7aSWarner Losh if (n == 0) /* did it find given character? */ 2178e3e3a7aSWarner Losh lua_pushinteger(L, posi + 1); 2188e3e3a7aSWarner Losh else /* no such character */ 219*0495ed39SKyle Evans luaL_pushfail(L); 2208e3e3a7aSWarner Losh return 1; 2218e3e3a7aSWarner Losh } 2228e3e3a7aSWarner Losh 2238e3e3a7aSWarner Losh 224*0495ed39SKyle Evans static int iter_aux (lua_State *L, int strict) { 2258e3e3a7aSWarner Losh size_t len; 2268e3e3a7aSWarner Losh const char *s = luaL_checklstring(L, 1, &len); 2278e3e3a7aSWarner Losh lua_Integer n = lua_tointeger(L, 2) - 1; 2288e3e3a7aSWarner Losh if (n < 0) /* first iteration? */ 2298e3e3a7aSWarner Losh n = 0; /* start from here */ 2308e3e3a7aSWarner Losh else if (n < (lua_Integer)len) { 2318e3e3a7aSWarner Losh n++; /* skip current byte */ 2328e3e3a7aSWarner Losh while (iscont(s + n)) n++; /* and its continuations */ 2338e3e3a7aSWarner Losh } 2348e3e3a7aSWarner Losh if (n >= (lua_Integer)len) 2358e3e3a7aSWarner Losh return 0; /* no more codepoints */ 2368e3e3a7aSWarner Losh else { 237*0495ed39SKyle Evans utfint code; 238*0495ed39SKyle Evans const char *next = utf8_decode(s + n, &code, strict); 239*0495ed39SKyle Evans if (next == NULL) 2408e3e3a7aSWarner Losh return luaL_error(L, "invalid UTF-8 code"); 2418e3e3a7aSWarner Losh lua_pushinteger(L, n + 1); 2428e3e3a7aSWarner Losh lua_pushinteger(L, code); 2438e3e3a7aSWarner Losh return 2; 2448e3e3a7aSWarner Losh } 2458e3e3a7aSWarner Losh } 2468e3e3a7aSWarner Losh 2478e3e3a7aSWarner Losh 248*0495ed39SKyle Evans static int iter_auxstrict (lua_State *L) { 249*0495ed39SKyle Evans return iter_aux(L, 1); 250*0495ed39SKyle Evans } 251*0495ed39SKyle Evans 252*0495ed39SKyle Evans static int iter_auxlax (lua_State *L) { 253*0495ed39SKyle Evans return iter_aux(L, 0); 254*0495ed39SKyle Evans } 255*0495ed39SKyle Evans 256*0495ed39SKyle Evans 2578e3e3a7aSWarner Losh static int iter_codes (lua_State *L) { 258*0495ed39SKyle Evans int lax = lua_toboolean(L, 2); 2598e3e3a7aSWarner Losh luaL_checkstring(L, 1); 260*0495ed39SKyle Evans lua_pushcfunction(L, lax ? iter_auxlax : iter_auxstrict); 2618e3e3a7aSWarner Losh lua_pushvalue(L, 1); 2628e3e3a7aSWarner Losh lua_pushinteger(L, 0); 2638e3e3a7aSWarner Losh return 3; 2648e3e3a7aSWarner Losh } 2658e3e3a7aSWarner Losh 2668e3e3a7aSWarner Losh 2678e3e3a7aSWarner Losh /* pattern to match a single UTF-8 character */ 268*0495ed39SKyle Evans #define UTF8PATT "[\0-\x7F\xC2-\xFD][\x80-\xBF]*" 2698e3e3a7aSWarner Losh 2708e3e3a7aSWarner Losh 2718e3e3a7aSWarner Losh static const luaL_Reg funcs[] = { 2728e3e3a7aSWarner Losh {"offset", byteoffset}, 2738e3e3a7aSWarner Losh {"codepoint", codepoint}, 2748e3e3a7aSWarner Losh {"char", utfchar}, 2758e3e3a7aSWarner Losh {"len", utflen}, 2768e3e3a7aSWarner Losh {"codes", iter_codes}, 2778e3e3a7aSWarner Losh /* placeholders */ 2788e3e3a7aSWarner Losh {"charpattern", NULL}, 2798e3e3a7aSWarner Losh {NULL, NULL} 2808e3e3a7aSWarner Losh }; 2818e3e3a7aSWarner Losh 2828e3e3a7aSWarner Losh 2838e3e3a7aSWarner Losh LUAMOD_API int luaopen_utf8 (lua_State *L) { 2848e3e3a7aSWarner Losh luaL_newlib(L, funcs); 2858e3e3a7aSWarner Losh lua_pushlstring(L, UTF8PATT, sizeof(UTF8PATT)/sizeof(char) - 1); 2868e3e3a7aSWarner Losh lua_setfield(L, -2, "charpattern"); 2878e3e3a7aSWarner Losh return 1; 2888e3e3a7aSWarner Losh } 2898e3e3a7aSWarner Losh 290