18e3e3a7aSWarner Losh /* 20495ed39SKyle Evans ** $Id: lutf8lib.c $ 38e3e3a7aSWarner Losh ** Standard library for UTF-8 manipulation 48e3e3a7aSWarner Losh ** See Copyright Notice in lua.h 58e3e3a7aSWarner Losh */ 68e3e3a7aSWarner Losh 78e3e3a7aSWarner Losh #define lutf8lib_c 88e3e3a7aSWarner Losh #define LUA_LIB 98e3e3a7aSWarner Losh 108e3e3a7aSWarner Losh #include "lprefix.h" 118e3e3a7aSWarner Losh 128e3e3a7aSWarner Losh 138e3e3a7aSWarner Losh #include <assert.h> 148e3e3a7aSWarner Losh #include <limits.h> 158e3e3a7aSWarner Losh #include <stdlib.h> 168e3e3a7aSWarner Losh #include <string.h> 178e3e3a7aSWarner Losh 188e3e3a7aSWarner Losh #include "lua.h" 198e3e3a7aSWarner Losh 208e3e3a7aSWarner Losh #include "lauxlib.h" 218e3e3a7aSWarner Losh #include "lualib.h" 228e3e3a7aSWarner Losh 230495ed39SKyle Evans 240495ed39SKyle Evans #define MAXUNICODE 0x10FFFFu 250495ed39SKyle Evans 260495ed39SKyle Evans #define MAXUTF 0x7FFFFFFFu 270495ed39SKyle Evans 280495ed39SKyle Evans /* 290495ed39SKyle Evans ** Integer type for decoded UTF-8 values; MAXUTF needs 31 bits. 300495ed39SKyle Evans */ 310495ed39SKyle Evans #if (UINT_MAX >> 30) >= 1 320495ed39SKyle Evans typedef unsigned int utfint; 330495ed39SKyle Evans #else 340495ed39SKyle Evans typedef unsigned long utfint; 350495ed39SKyle Evans #endif 360495ed39SKyle Evans 378e3e3a7aSWarner Losh 388e3e3a7aSWarner Losh #define iscont(p) ((*(p) & 0xC0) == 0x80) 398e3e3a7aSWarner Losh 408e3e3a7aSWarner Losh 418e3e3a7aSWarner Losh /* from strlib */ 428e3e3a7aSWarner Losh /* translate a relative string position: negative means back from end */ 438e3e3a7aSWarner Losh static lua_Integer u_posrelat (lua_Integer pos, size_t len) { 448e3e3a7aSWarner Losh if (pos >= 0) return pos; 458e3e3a7aSWarner Losh else if (0u - (size_t)pos > len) return 0; 468e3e3a7aSWarner Losh else return (lua_Integer)len + pos + 1; 478e3e3a7aSWarner Losh } 488e3e3a7aSWarner Losh 498e3e3a7aSWarner Losh 508e3e3a7aSWarner Losh /* 510495ed39SKyle Evans ** Decode one UTF-8 sequence, returning NULL if byte sequence is 520495ed39SKyle Evans ** invalid. The array 'limits' stores the minimum value for each 530495ed39SKyle Evans ** sequence length, to check for overlong representations. Its first 540495ed39SKyle Evans ** entry forces an error for non-ascii bytes with no continuation 550495ed39SKyle Evans ** bytes (count == 0). 568e3e3a7aSWarner Losh */ 570495ed39SKyle Evans static const char *utf8_decode (const char *s, utfint *val, int strict) { 580495ed39SKyle Evans static const utfint limits[] = 590495ed39SKyle Evans {~(utfint)0, 0x80, 0x800, 0x10000u, 0x200000u, 0x4000000u}; 600495ed39SKyle Evans unsigned int c = (unsigned char)s[0]; 610495ed39SKyle Evans utfint res = 0; /* final result */ 628e3e3a7aSWarner Losh if (c < 0x80) /* ascii? */ 638e3e3a7aSWarner Losh res = c; 648e3e3a7aSWarner Losh else { 658e3e3a7aSWarner Losh int count = 0; /* to count number of continuation bytes */ 660495ed39SKyle Evans for (; c & 0x40; c <<= 1) { /* while it needs continuation bytes... */ 670495ed39SKyle Evans unsigned int cc = (unsigned char)s[++count]; /* read next byte */ 688e3e3a7aSWarner Losh if ((cc & 0xC0) != 0x80) /* not a continuation byte? */ 698e3e3a7aSWarner Losh return NULL; /* invalid byte sequence */ 708e3e3a7aSWarner Losh res = (res << 6) | (cc & 0x3F); /* add lower 6 bits from cont. byte */ 718e3e3a7aSWarner Losh } 720495ed39SKyle Evans res |= ((utfint)(c & 0x7F) << (count * 5)); /* add first byte */ 730495ed39SKyle Evans if (count > 5 || res > MAXUTF || res < limits[count]) 748e3e3a7aSWarner Losh return NULL; /* invalid byte sequence */ 758e3e3a7aSWarner Losh s += count; /* skip continuation bytes read */ 768e3e3a7aSWarner Losh } 770495ed39SKyle Evans if (strict) { 780495ed39SKyle Evans /* check for invalid code points; too large or surrogates */ 790495ed39SKyle Evans if (res > MAXUNICODE || (0xD800u <= res && res <= 0xDFFFu)) 800495ed39SKyle Evans return NULL; 810495ed39SKyle Evans } 828e3e3a7aSWarner Losh if (val) *val = res; 830495ed39SKyle Evans return s + 1; /* +1 to include first byte */ 848e3e3a7aSWarner Losh } 858e3e3a7aSWarner Losh 868e3e3a7aSWarner Losh 878e3e3a7aSWarner Losh /* 880495ed39SKyle Evans ** utf8len(s [, i [, j [, lax]]]) --> number of characters that 890495ed39SKyle Evans ** start in the range [i,j], or nil + current position if 's' is not 900495ed39SKyle Evans ** well formed in that interval 918e3e3a7aSWarner Losh */ 928e3e3a7aSWarner Losh static int utflen (lua_State *L) { 930495ed39SKyle Evans lua_Integer n = 0; /* counter for the number of characters */ 940495ed39SKyle Evans size_t len; /* string length in bytes */ 958e3e3a7aSWarner Losh const char *s = luaL_checklstring(L, 1, &len); 968e3e3a7aSWarner Losh lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len); 978e3e3a7aSWarner Losh lua_Integer posj = u_posrelat(luaL_optinteger(L, 3, -1), len); 980495ed39SKyle Evans int lax = lua_toboolean(L, 4); 998e3e3a7aSWarner Losh luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 2, 1000495ed39SKyle Evans "initial position out of bounds"); 1018e3e3a7aSWarner Losh luaL_argcheck(L, --posj < (lua_Integer)len, 3, 1020495ed39SKyle Evans "final position out of bounds"); 1038e3e3a7aSWarner Losh while (posi <= posj) { 1040495ed39SKyle Evans const char *s1 = utf8_decode(s + posi, NULL, !lax); 1058e3e3a7aSWarner Losh if (s1 == NULL) { /* conversion error? */ 1060495ed39SKyle Evans luaL_pushfail(L); /* return fail ... */ 1078e3e3a7aSWarner Losh lua_pushinteger(L, posi + 1); /* ... and current position */ 1088e3e3a7aSWarner Losh return 2; 1098e3e3a7aSWarner Losh } 1108e3e3a7aSWarner Losh posi = s1 - s; 1118e3e3a7aSWarner Losh n++; 1128e3e3a7aSWarner Losh } 1138e3e3a7aSWarner Losh lua_pushinteger(L, n); 1148e3e3a7aSWarner Losh return 1; 1158e3e3a7aSWarner Losh } 1168e3e3a7aSWarner Losh 1178e3e3a7aSWarner Losh 1188e3e3a7aSWarner Losh /* 1190495ed39SKyle Evans ** codepoint(s, [i, [j [, lax]]]) -> returns codepoints for all 1200495ed39SKyle Evans ** characters that start in the range [i,j] 1218e3e3a7aSWarner Losh */ 1228e3e3a7aSWarner Losh static int codepoint (lua_State *L) { 1238e3e3a7aSWarner Losh size_t len; 1248e3e3a7aSWarner Losh const char *s = luaL_checklstring(L, 1, &len); 1258e3e3a7aSWarner Losh lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len); 1268e3e3a7aSWarner Losh lua_Integer pose = u_posrelat(luaL_optinteger(L, 3, posi), len); 1270495ed39SKyle Evans int lax = lua_toboolean(L, 4); 1288e3e3a7aSWarner Losh int n; 1298e3e3a7aSWarner Losh const char *se; 1300495ed39SKyle Evans luaL_argcheck(L, posi >= 1, 2, "out of bounds"); 1310495ed39SKyle Evans luaL_argcheck(L, pose <= (lua_Integer)len, 3, "out of bounds"); 1328e3e3a7aSWarner Losh if (posi > pose) return 0; /* empty interval; return no values */ 1338e3e3a7aSWarner Losh if (pose - posi >= INT_MAX) /* (lua_Integer -> int) overflow? */ 1348e3e3a7aSWarner Losh return luaL_error(L, "string slice too long"); 1350495ed39SKyle Evans n = (int)(pose - posi) + 1; /* upper bound for number of returns */ 1368e3e3a7aSWarner Losh luaL_checkstack(L, n, "string slice too long"); 1370495ed39SKyle Evans n = 0; /* count the number of returns */ 1380495ed39SKyle Evans se = s + pose; /* string end */ 1398e3e3a7aSWarner Losh for (s += posi - 1; s < se;) { 1400495ed39SKyle Evans utfint code; 1410495ed39SKyle Evans s = utf8_decode(s, &code, !lax); 1428e3e3a7aSWarner Losh if (s == NULL) 1438e3e3a7aSWarner Losh return luaL_error(L, "invalid UTF-8 code"); 1448e3e3a7aSWarner Losh lua_pushinteger(L, code); 1458e3e3a7aSWarner Losh n++; 1468e3e3a7aSWarner Losh } 1478e3e3a7aSWarner Losh return n; 1488e3e3a7aSWarner Losh } 1498e3e3a7aSWarner Losh 1508e3e3a7aSWarner Losh 1518e3e3a7aSWarner Losh static void pushutfchar (lua_State *L, int arg) { 1520495ed39SKyle Evans lua_Unsigned code = (lua_Unsigned)luaL_checkinteger(L, arg); 1530495ed39SKyle Evans luaL_argcheck(L, code <= MAXUTF, arg, "value out of range"); 1548e3e3a7aSWarner Losh lua_pushfstring(L, "%U", (long)code); 1558e3e3a7aSWarner Losh } 1568e3e3a7aSWarner Losh 1578e3e3a7aSWarner Losh 1588e3e3a7aSWarner Losh /* 1598e3e3a7aSWarner Losh ** utfchar(n1, n2, ...) -> char(n1)..char(n2)... 1608e3e3a7aSWarner Losh */ 1618e3e3a7aSWarner Losh static int utfchar (lua_State *L) { 1628e3e3a7aSWarner Losh int n = lua_gettop(L); /* number of arguments */ 1638e3e3a7aSWarner Losh if (n == 1) /* optimize common case of single char */ 1648e3e3a7aSWarner Losh pushutfchar(L, 1); 1658e3e3a7aSWarner Losh else { 1668e3e3a7aSWarner Losh int i; 1678e3e3a7aSWarner Losh luaL_Buffer b; 1688e3e3a7aSWarner Losh luaL_buffinit(L, &b); 1698e3e3a7aSWarner Losh for (i = 1; i <= n; i++) { 1708e3e3a7aSWarner Losh pushutfchar(L, i); 1718e3e3a7aSWarner Losh luaL_addvalue(&b); 1728e3e3a7aSWarner Losh } 1738e3e3a7aSWarner Losh luaL_pushresult(&b); 1748e3e3a7aSWarner Losh } 1758e3e3a7aSWarner Losh return 1; 1768e3e3a7aSWarner Losh } 1778e3e3a7aSWarner Losh 1788e3e3a7aSWarner Losh 1798e3e3a7aSWarner Losh /* 1808e3e3a7aSWarner Losh ** offset(s, n, [i]) -> index where n-th character counting from 1818e3e3a7aSWarner Losh ** position 'i' starts; 0 means character at 'i'. 1828e3e3a7aSWarner Losh */ 1838e3e3a7aSWarner Losh static int byteoffset (lua_State *L) { 1848e3e3a7aSWarner Losh size_t len; 1858e3e3a7aSWarner Losh const char *s = luaL_checklstring(L, 1, &len); 1868e3e3a7aSWarner Losh lua_Integer n = luaL_checkinteger(L, 2); 1878e3e3a7aSWarner Losh lua_Integer posi = (n >= 0) ? 1 : len + 1; 1888e3e3a7aSWarner Losh posi = u_posrelat(luaL_optinteger(L, 3, posi), len); 1898e3e3a7aSWarner Losh luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 3, 1900495ed39SKyle Evans "position out of bounds"); 1918e3e3a7aSWarner Losh if (n == 0) { 1928e3e3a7aSWarner Losh /* find beginning of current byte sequence */ 1938e3e3a7aSWarner Losh while (posi > 0 && iscont(s + posi)) posi--; 1948e3e3a7aSWarner Losh } 1958e3e3a7aSWarner Losh else { 1968e3e3a7aSWarner Losh if (iscont(s + posi)) 197e112e9d2SKyle Evans return luaL_error(L, "initial position is a continuation byte"); 1988e3e3a7aSWarner Losh if (n < 0) { 1998e3e3a7aSWarner Losh while (n < 0 && posi > 0) { /* move back */ 2008e3e3a7aSWarner Losh do { /* find beginning of previous character */ 2018e3e3a7aSWarner Losh posi--; 2028e3e3a7aSWarner Losh } while (posi > 0 && iscont(s + posi)); 2038e3e3a7aSWarner Losh n++; 2048e3e3a7aSWarner Losh } 2058e3e3a7aSWarner Losh } 2068e3e3a7aSWarner Losh else { 2078e3e3a7aSWarner Losh n--; /* do not move for 1st character */ 2088e3e3a7aSWarner Losh while (n > 0 && posi < (lua_Integer)len) { 2098e3e3a7aSWarner Losh do { /* find beginning of next character */ 2108e3e3a7aSWarner Losh posi++; 2118e3e3a7aSWarner Losh } while (iscont(s + posi)); /* (cannot pass final '\0') */ 2128e3e3a7aSWarner Losh n--; 2138e3e3a7aSWarner Losh } 2148e3e3a7aSWarner Losh } 2158e3e3a7aSWarner Losh } 2168e3e3a7aSWarner Losh if (n == 0) /* did it find given character? */ 2178e3e3a7aSWarner Losh lua_pushinteger(L, posi + 1); 2188e3e3a7aSWarner Losh else /* no such character */ 2190495ed39SKyle Evans luaL_pushfail(L); 2208e3e3a7aSWarner Losh return 1; 2218e3e3a7aSWarner Losh } 2228e3e3a7aSWarner Losh 2238e3e3a7aSWarner Losh 2240495ed39SKyle Evans static int iter_aux (lua_State *L, int strict) { 2258e3e3a7aSWarner Losh size_t len; 2268e3e3a7aSWarner Losh const char *s = luaL_checklstring(L, 1, &len); 227*8c784bb8SWarner Losh lua_Unsigned n = (lua_Unsigned)lua_tointeger(L, 2); 228*8c784bb8SWarner Losh if (n < len) { 229*8c784bb8SWarner Losh while (iscont(s + n)) n++; /* skip continuation bytes */ 2308e3e3a7aSWarner Losh } 231*8c784bb8SWarner Losh if (n >= len) /* (also handles original 'n' being negative) */ 2328e3e3a7aSWarner Losh return 0; /* no more codepoints */ 2338e3e3a7aSWarner Losh else { 2340495ed39SKyle Evans utfint code; 2350495ed39SKyle Evans const char *next = utf8_decode(s + n, &code, strict); 2360495ed39SKyle Evans if (next == NULL) 2378e3e3a7aSWarner Losh return luaL_error(L, "invalid UTF-8 code"); 2388e3e3a7aSWarner Losh lua_pushinteger(L, n + 1); 2398e3e3a7aSWarner Losh lua_pushinteger(L, code); 2408e3e3a7aSWarner Losh return 2; 2418e3e3a7aSWarner Losh } 2428e3e3a7aSWarner Losh } 2438e3e3a7aSWarner Losh 2448e3e3a7aSWarner Losh 2450495ed39SKyle Evans static int iter_auxstrict (lua_State *L) { 2460495ed39SKyle Evans return iter_aux(L, 1); 2470495ed39SKyle Evans } 2480495ed39SKyle Evans 2490495ed39SKyle Evans static int iter_auxlax (lua_State *L) { 2500495ed39SKyle Evans return iter_aux(L, 0); 2510495ed39SKyle Evans } 2520495ed39SKyle Evans 2530495ed39SKyle Evans 2548e3e3a7aSWarner Losh static int iter_codes (lua_State *L) { 2550495ed39SKyle Evans int lax = lua_toboolean(L, 2); 2568e3e3a7aSWarner Losh luaL_checkstring(L, 1); 2570495ed39SKyle Evans lua_pushcfunction(L, lax ? iter_auxlax : iter_auxstrict); 2588e3e3a7aSWarner Losh lua_pushvalue(L, 1); 2598e3e3a7aSWarner Losh lua_pushinteger(L, 0); 2608e3e3a7aSWarner Losh return 3; 2618e3e3a7aSWarner Losh } 2628e3e3a7aSWarner Losh 2638e3e3a7aSWarner Losh 2648e3e3a7aSWarner Losh /* pattern to match a single UTF-8 character */ 2650495ed39SKyle Evans #define UTF8PATT "[\0-\x7F\xC2-\xFD][\x80-\xBF]*" 2668e3e3a7aSWarner Losh 2678e3e3a7aSWarner Losh 2688e3e3a7aSWarner Losh static const luaL_Reg funcs[] = { 2698e3e3a7aSWarner Losh {"offset", byteoffset}, 2708e3e3a7aSWarner Losh {"codepoint", codepoint}, 2718e3e3a7aSWarner Losh {"char", utfchar}, 2728e3e3a7aSWarner Losh {"len", utflen}, 2738e3e3a7aSWarner Losh {"codes", iter_codes}, 2748e3e3a7aSWarner Losh /* placeholders */ 2758e3e3a7aSWarner Losh {"charpattern", NULL}, 2768e3e3a7aSWarner Losh {NULL, NULL} 2778e3e3a7aSWarner Losh }; 2788e3e3a7aSWarner Losh 2798e3e3a7aSWarner Losh 2808e3e3a7aSWarner Losh LUAMOD_API int luaopen_utf8 (lua_State *L) { 2818e3e3a7aSWarner Losh luaL_newlib(L, funcs); 2828e3e3a7aSWarner Losh lua_pushlstring(L, UTF8PATT, sizeof(UTF8PATT)/sizeof(char) - 1); 2838e3e3a7aSWarner Losh lua_setfield(L, -2, "charpattern"); 2848e3e3a7aSWarner Losh return 1; 2858e3e3a7aSWarner Losh } 2868e3e3a7aSWarner Losh 287