/*
** $Id: llex.c $
** Lexical Analyzer
** See Copyright Notice in lua.h
*/

#define llex_c
#define LUA_CORE

#include "lprefix.h"


#include <locale.h>
#include <string.h>

#include "lua.h"

#include "lctype.h"
#include "ldebug.h"
#include "ldo.h"
#include "lgc.h"
#include "llex.h"
#include "lobject.h"
#include "lparser.h"
#include "lstate.h"
#include "lstring.h"
#include "ltable.h"
#include "lzio.h"



/*
** Advance the input: read the next character from stream 'ls->z'
** (through 'zgetc') and store it in 'ls->current'. The previous
** current character is discarded, not saved.
*/
#define next(ls)	(ls->current = zgetc(ls->z))



/* true when the current character is '\n' or '\r' */
#define currIsNewline(ls)	(ls->current == '\n' || ls->current == '\r')


/*
** Textual form of each token, indexed by 'token - FIRST_RESERVED'
** (see 'luaX_token2str'). The first NUM_RESERVED entries are the
** ones registered as reserved words by 'luaX_init'.
*/
/* ORDER RESERVED */
static const char *const luaX_tokens [] = {
    "and", "break", "do", "else", "elseif",
    "end", "false", "for", "function", "goto", "if",
    "in", "local", "nil", "not", "or", "repeat",
    "return", "then", "true", "until", "while",
    "//", "..", "...", "==", ">=", "<=", "~=",
    "<<", ">>", "::", "<eof>",
    "<number>", "<integer>", "<name>", "<string>"
};


/*
** Append the current character to the token buffer and then advance
** the input. The order matters: 'save' runs before 'next'.
*/
#define save_and_next(ls) (save(ls, ls->current), next(ls))


static l_noret lexerror (LexState *ls, const char *msg, int token);


/*
** Append character 'c' to the token buffer 'ls->buff'. When the buffer
** is full its size is doubled; if its current size is already at least
** MAX_SIZE/2, a lexical error ("lexical element too long") is raised
** instead of growing it.
*/
static void save (LexState *ls, int c) {
  Mbuffer *b = ls->buff;
  if (luaZ_bufflen(b) + 1 > luaZ_sizebuffer(b)) {
    size_t newsize;
    if (luaZ_sizebuffer(b) >= MAX_SIZE/2)
      lexerror(ls, "lexical element too long", 0);
    newsize = luaZ_sizebuffer(b) * 2;
    luaZ_resizebuffer(ls->L, b, newsize);
  }
  b->buffer[luaZ_bufflen(b)++] = cast_char(c);
}


/*
** Create the strings used by the lexer: the environment name (LUA_ENV)
** and one string for each of the first NUM_RESERVED entries of
** 'luaX_tokens'. All of them are passed to 'luaC_fix'. Each reserved
** word has its 'extra' field set to its index plus one; 'llex' maps
** that value back to a token with 'ts->extra - 1 + FIRST_RESERVED'.
*/
void luaX_init (lua_State *L) {
  int i;
  TString *e = luaS_newliteral(L, LUA_ENV);  /* create env name */
  luaC_fix(L, obj2gco(e));  /* never collect this name */
  for (i=0; i<NUM_RESERVED; i++) {
    TString *ts = luaS_new(L, luaX_tokens[i]);
    luaC_fix(L, obj2gco(ts));  /* reserved words are never collected */
    ts->extra = cast_byte(i+1);  /* reserved word */
  }
}


/*
** Return a printable description of 'token' for use in messages:
**  - tokens below FIRST_RESERVED are single characters, shown quoted
**    ('%c') when printable and as '<\N>' (decimal code) otherwise;
**  - tokens below TK_EOS are shown as their quoted text from
**    'luaX_tokens';
**  - the remaining tokens return the unquoted table entry itself
**    (e.g. "<eof>", "<name>").
** Except in the last case, the result is built by 'luaO_pushfstring'.
*/
const char *luaX_token2str (LexState *ls, int token) {
  if (token < FIRST_RESERVED) {  /* single-byte symbols? */
    if (lisprint(token))
      return luaO_pushfstring(ls->L, "'%c'", token);
    else  /* control character */
      return luaO_pushfstring(ls->L, "'<\\%d>'", token);
  }
  else {
    const char *s = luaX_tokens[token - FIRST_RESERVED];
    if (token < TK_EOS)  /* fixed format (symbols and reserved words)? */
      return luaO_pushfstring(ls->L, "'%s'", s);
    else  /* names, strings, and numerals */
      return s;
  }
}


/*
** Text to show for 'token' in an error message. For names, strings,
** and numerals it is the quoted content of the token buffer (which is
** first terminated with a '\0'); any other token is described by
** 'luaX_token2str'.
*/
static const char *txtToken (LexState *ls, int token) {
  switch (token) {
    case TK_NAME: case TK_STRING:
    case TK_FLT: case TK_INT:
      save(ls, '\0');
      return luaO_pushfstring(ls->L, "'%s'", luaZ_buffer(ls->buff));
    default:
      return luaX_token2str(ls, token);
  }
}


/*
** Raise a syntax error. 'msg' is first extended by 'luaG_addinfo' with
** the current source and line number; when 'token' is non-zero, the
** text " near <token>" (see 'txtToken') is appended. The function then
** calls 'luaD_throw' with LUA_ERRSYNTAX and is declared as not
** returning.
*/
static l_noret lexerror (LexState *ls, const char *msg, int token) {
  msg = luaG_addinfo(ls->L, msg, ls->source, ls->linenumber);
  if (token)
    luaO_pushfstring(ls->L, "%s near %s", msg, txtToken(ls, token));
  luaD_throw(ls->L, LUA_ERRSYNTAX);
}


/*
** Raise a syntax error with message 'msg', reported near the current
** token ('ls->t.token').
*/
l_noret luaX_syntaxerror (LexState *ls, const char *msg) {
  lexerror(ls, msg, ls->t.token);
}


/*
** Creates a new string and anchors it in scanner's table so that it
** will not be collected until the end of the compilation; by that time
** it should be anchored somewhere. It also internalizes long strings,
** ensuring there is only one copy of each unique string.  The table
** here is used as a set: the string enters as the key, while its value
** is irrelevant. We use the string itself as the value only because it
** is a TValue readily available. Later, the code generation can change
** this value.
*/
TString *luaX_newstring (LexState *ls, const char *str, size_t l) {
  lua_State *L = ls->L;
  TString *ts = luaS_newlstr(L, str, l);  /* create new string */
  const TValue *o = luaH_getstr(ls->h, ts);
  if (!ttisnil(o))  /* string already present? */
    ts = keystrval(nodefromval(o));  /* get saved copy */
  else {  /* not in use yet */
    TValue *stv = s2v(L->top++);  /* reserve stack space for string */
    setsvalue(L, stv, ts);  /* temporarily anchor the string */
    luaH_finishset(L, ls->h, stv, o, stv);  /* t[string] = string */
    /* table is not a metatable, so it does not need to invalidate cache */
    luaC_checkGC(L);
    L->top--;  /* remove string from stack */
  }
  return ts;
}


/*
** increment line number and skips newline sequence (any of
** \n, \r, \n\r, or \r\n)
** A second newline character is skipped only when it differs from the
** first one, so "\n\n" counts as two lines. Raises a lexical error if
** the incremented line number reaches MAX_INT.
*/
static void inclinenumber (LexState *ls) {
  int old = ls->current;
  lua_assert(currIsNewline(ls));
  next(ls);  /* skip '\n' or '\r' */
  if (currIsNewline(ls) && ls->current != old)
    next(ls);  /* skip '\n\r' or '\r\n' */
  if (++ls->linenumber >= MAX_INT)
    lexerror(ls, "chunk has too many lines", 0);
}


/*
** Prepare 'ls' to scan input stream 'z' of chunk 'source'. 'firstchar'
** becomes the current character. There is no current token (0) and no
** look-ahead token (TK_EOS); line counters start at 1; the token buffer
** is resized to LUA_MINBUFFER.
** NOTE(review): 'ls->buff' and 'ls->h' are used but not assigned here;
** presumably the caller sets them before calling -- confirm.
*/
void luaX_setinput (lua_State *L, LexState *ls, ZIO *z, TString *source,
                    int firstchar) {
  ls->t.token = 0;
  ls->L = L;
  ls->current = firstchar;
  ls->lookahead.token = TK_EOS;  /* no look-ahead token */
  ls->z = z;
  ls->fs = NULL;
  ls->linenumber = 1;
  ls->lastline = 1;
  ls->source = source;
  ls->envn = luaS_newliteral(L, LUA_ENV);  /* get env name */
  luaZ_resizebuffer(ls->L, ls->buff, LUA_MINBUFFER);  /* initialize buffer */
}



/*
** =======================================================
** LEXICAL ANALYZER
** =======================================================
*/


/*
** If the current char is 'c', skip it (without saving it in the
** buffer) and return 1; otherwise return 0 and leave the input
** untouched.
*/
static int check_next1 (LexState *ls, int c) {
  if (ls->current == c) {
    next(ls);
    return 1;
  }
  else return 0;
}


/*
** Check whether current char is in set 'set' (with two chars) and
** saves it
** Returns 1 when the char matched (it is then saved and skipped),
** 0 otherwise.
*/
static int check_next2 (LexState *ls, const char *set) {
  lua_assert(set[2] == '\0');
  if (ls->current == set[0] || ls->current == set[1]) {
    save_and_next(ls);
    return 1;
  }
  else return 0;
}


/* LUA_NUMBER */
/*
** This function is quite liberal in what it accepts, as 'luaO_str2num'
** will reject ill-formed numerals. Roughly, it accepts the following
** pattern:
**
**   %d(%x|%.|([Ee][+-]?))* | 0[Xx](%x|%.|([Pp][+-]?))*
**
** The only tricky part is to accept [+-] only after a valid exponent
** mark, to avoid reading '3-4' or '0xe+1' as a single number.
**
** The caller might have already read an initial dot.
**
** Returns TK_INT (value in 'seminfo->i') or TK_FLT (value in
** 'seminfo->r'); raises "malformed number" when the conversion fails.
*/
static int read_numeral (LexState *ls, SemInfo *seminfo) {
  TValue obj;
  const char *expo = "Ee";
  int first = ls->current;
  lua_assert(lisdigit(ls->current));
  save_and_next(ls);
  if (first == '0' && check_next2(ls, "xX"))  /* hexadecimal? */
    expo = "Pp";
  for (;;) {
    if (check_next2(ls, expo))  /* exponent mark? */
      check_next2(ls, "-+");  /* optional exponent sign */
    else if (lisxdigit(ls->current) || ls->current == '.')  /* '%x|%.' */
      save_and_next(ls);
    else break;
  }
  if (lislalpha(ls->current))  /* is numeral touching a letter? */
    save_and_next(ls);  /* force an error */
  save(ls, '\0');
  if (luaO_str2num(luaZ_buffer(ls->buff), &obj) == 0)  /* format error? */
    lexerror(ls, "malformed number", TK_FLT);
  if (ttisinteger(&obj)) {
    seminfo->i = ivalue(&obj);
    return TK_INT;
  }
  else {
    lua_assert(ttisfloat(&obj));
    seminfo->r = fltvalue(&obj);
    return TK_FLT;
  }
}


/*
** read a sequence '[=*[' or ']=*]', leaving the last bracket. If
** sequence is well formed, return its number of '='s + 2; otherwise,
** return 1 if it is a single bracket (no '='s and no 2nd bracket);
** otherwise (an unfinished '[==...') return 0.
** The first bracket and all '='s read are saved in the buffer.
*/
static size_t skip_sep (LexState *ls) {
  size_t count = 0;
  int s = ls->current;
  lua_assert(s == '[' || s == ']');
  save_and_next(ls);
  while (ls->current == '=') {
    save_and_next(ls);
    count++;
  }
  return (ls->current == s) ? count + 2
         : (count == 0) ? 1
         : 0;
}


/*
** Read the body of a long string or long comment whose opening
** delimiter has already been scanned by 'skip_sep' (which returned
** 'sep' and left the second '[' as the current char). A NULL 'seminfo'
** means a comment: its content is not accumulated and nothing is
** produced. Otherwise the resulting string is stored in 'seminfo->ts',
** excluding 'sep' characters (the delimiter) at each end of the buffer.
** Reaching the end of the stream raises an "unfinished long ..." error
** that mentions the line where the string/comment started.
*/
static void read_long_string (LexState *ls, SemInfo *seminfo, size_t sep) {
  int line = ls->linenumber;  /* initial line (for error message) */
  save_and_next(ls);  /* skip 2nd '[' */
  if (currIsNewline(ls))  /* string starts with a newline? */
    inclinenumber(ls);  /* skip it */
  for (;;) {
    switch (ls->current) {
      case EOZ: {  /* error */
        const char *what = (seminfo ? "string" : "comment");
        const char *msg = luaO_pushfstring(ls->L,
                     "unfinished long %s (starting at line %d)", what, line);
        lexerror(ls, msg, TK_EOS);
        break;  /* to avoid warnings */
      }
      case ']': {
        /* closing delimiter must have the same level as the opening one */
        if (skip_sep(ls) == sep) {
          save_and_next(ls);  /* skip 2nd ']' */
          goto endloop;
        }
        break;
      }
      case '\n': case '\r': {
        /* any newline sequence is stored as a single '\n' */
        save(ls, '\n');
        inclinenumber(ls);
        if (!seminfo) luaZ_resetbuffer(ls->buff);  /* avoid wasting space */
        break;
      }
      default: {
        if (seminfo) save_and_next(ls);
        else next(ls);
      }
    }
  } endloop:
  if (seminfo)
    seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + sep,
                                     luaZ_bufflen(ls->buff) - 2 * sep);
}


/*
** Check condition 'c' while reading an escape sequence. When it is
** false, the current char is added to the buffer (unless it is the end
** of the stream) and a lexical error with message 'msg' is raised near
** the string read so far.
*/
static void esccheck (LexState *ls, int c, const char *msg) {
  if (!c) {
    if (ls->current != EOZ)
      save_and_next(ls);  /* add current to buffer for error message */
    lexerror(ls, msg, TK_STRING);
  }
}


/*
** Save and skip the current char, then require the new current char to
** be a hexadecimal digit and return its value. The digit itself is
** left as the current char (not yet saved nor skipped).
*/
static int gethexa (LexState *ls) {
  save_and_next(ls);
  esccheck (ls, lisxdigit(ls->current), "hexadecimal digit expected");
  return luaO_hexavalue(ls->current);
}


/*
** Read the two hexadecimal digits of a '\xXX' escape and return their
** value. The two chars saved by the 'gethexa' calls are removed from
** the buffer; the second digit is left as the current char.
*/
static int readhexaesc (LexState *ls) {
  int r = gethexa(ls);
  r = (r << 4) + gethexa(ls);
  luaZ_buffremove(ls->buff, 2);  /* remove saved chars from buffer */
  return r;
}


/*
** Read a '\u{XXX}' escape (current char is the 'u') and return the
** value of its hexadecimal digits. At least one digit is required.
** Before each additional digit is accumulated, the value so far must
** not exceed 0x7FFFFFFF >> 4, otherwise "UTF-8 value too large" is
** raised. All chars saved while reading the escape (counted in 'i')
** are removed from the buffer; the closing '}' is skipped unsaved.
*/
static unsigned long readutf8esc (LexState *ls) {
  unsigned long r;
  int i = 4;  /* chars to be removed: '\', 'u', '{', and first digit */
  save_and_next(ls);  /* skip 'u' */
  esccheck(ls, ls->current == '{', "missing '{'");
  r = gethexa(ls);  /* must have at least one digit */
  while (cast_void(save_and_next(ls)), lisxdigit(ls->current)) {
    i++;
    esccheck(ls, r <= (0x7FFFFFFFu >> 4), "UTF-8 value too large");
    r = (r << 4) + luaO_hexavalue(ls->current);
  }
  esccheck(ls, ls->current == '}', "missing '}'");
  next(ls);  /* skip '}' */
  luaZ_buffremove(ls->buff, i);  /* remove saved chars from buffer */
  return r;
}


/*
** Read a '\u{XXX}' escape and append its encoding, as produced by
** 'luaO_utf8esc', to the token buffer. The 'n' bytes returned are
** taken from the end of 'buff' (positions UTF8BUFFSZ - n onwards).
*/
static void utf8esc (LexState *ls) {
  char buff[UTF8BUFFSZ];
  int n = luaO_utf8esc(buff, readutf8esc(ls));
  for (; n > 0; n--)  /* add 'buff' to string */
    save(ls, buff[UTF8BUFFSZ - n]);
}


/*
** Read a decimal escape '\ddd' (one to three digits; the caller has
** already checked that the current char is a digit) and return its
** value. Raises "decimal escape too large" when the value exceeds
** UCHAR_MAX. The digits are saved while being read (so that they can
** appear in an error message) and removed from the buffer afterwards.
*/
static int readdecesc (LexState *ls) {
  int i;
  int r = 0;  /* result accumulator */
  for (i = 0; i < 3 && lisdigit(ls->current); i++) {  /* read up to 3 digits */
    r = 10*r + ls->current - '0';
    save_and_next(ls);
  }
  esccheck(ls, r <= UCHAR_MAX, "decimal escape too large");
  luaZ_buffremove(ls->buff, i);  /* remove read digits from buffer */
  return r;
}


/*
** Read a short literal string delimited by 'del' (the current char on
** entry), translating escape sequences. Both delimiters are kept in
** the buffer while scanning (for error messages) and excluded from the
** final string stored in 'seminfo->ts'. An unescaped newline or the
** end of the stream inside the string raises "unfinished string".
**
** Inside an escape, the '\\' is saved first; the labels at the end of
** the escape handling then decide what else is needed:
**   read_save: skip the current char, then fall into 'only_save';
**   only_save: remove the saved '\\' and save the final char 'c';
**   no_save:   nothing else to do.
*/
static void read_string (LexState *ls, int del, SemInfo *seminfo) {
  save_and_next(ls);  /* keep delimiter (for error messages) */
  while (ls->current != del) {
    switch (ls->current) {
      case EOZ:
        lexerror(ls, "unfinished string", TK_EOS);
        break;  /* to avoid warnings */
      case '\n':
      case '\r':
        lexerror(ls, "unfinished string", TK_STRING);
        break;  /* to avoid warnings */
      case '\\': {  /* escape sequences */
        int c;  /* final character to be saved */
        save_and_next(ls);  /* keep '\\' for error messages */
        switch (ls->current) {
          case 'a': c = '\a'; goto read_save;
          case 'b': c = '\b'; goto read_save;
          case 'f': c = '\f'; goto read_save;
          case 'n': c = '\n'; goto read_save;
          case 'r': c = '\r'; goto read_save;
          case 't': c = '\t'; goto read_save;
          case 'v': c = '\v'; goto read_save;
          case 'x': c = readhexaesc(ls); goto read_save;
          case 'u': utf8esc(ls);  goto no_save;
          case '\n': case '\r':  /* escaped newline is stored as '\n' */
            inclinenumber(ls); c = '\n'; goto only_save;
          case '\\': case '\"': case '\'':
            c = ls->current; goto read_save;
          case EOZ: goto no_save;  /* will raise an error next loop */
          case 'z': {  /* zap following span of spaces */
            luaZ_buffremove(ls->buff, 1);  /* remove '\\' */
            next(ls);  /* skip the 'z' */
            while (lisspace(ls->current)) {
              if (currIsNewline(ls)) inclinenumber(ls);
              else next(ls);
            }
            goto no_save;
          }
          default: {
            esccheck(ls, lisdigit(ls->current), "invalid escape sequence");
            c = readdecesc(ls);  /* digital escape '\ddd' */
            goto only_save;
          }
        }
       read_save:
         next(ls);
         /* go through */
       only_save:
         luaZ_buffremove(ls->buff, 1);  /* remove '\\' */
         save(ls, c);
         /* go through */
       no_save: break;
      }
      default:
        save_and_next(ls);
    }
  }
  save_and_next(ls);  /* skip delimiter */
  seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + 1,
                                   luaZ_bufflen(ls->buff) - 2);
}


/*
** Main scanner: read and return the next token from the input. The
** token buffer is reset on entry. Line breaks, spaces, and comments
** are skipped (the loop only ends through a 'return'). Single-char
** tokens are returned as the character itself; names, strings, and
** numerals also fill 'seminfo'. At the end of the stream the function
** returns TK_EOS.
*/
static int llex (LexState *ls, SemInfo *seminfo) {
  luaZ_resetbuffer(ls->buff);
  for (;;) {
    switch (ls->current) {
      case '\n': case '\r': {  /* line breaks */
        inclinenumber(ls);
        break;
      }
      case ' ': case '\f': case '\t': case '\v': {  /* spaces */
        next(ls);
        break;
      }
      case '-': {  /* '-' or '--' (comment) */
        next(ls);
        if (ls->current != '-') return '-';
        /* else is a comment */
        next(ls);
        if (ls->current == '[') {  /* long comment? */
          size_t sep = skip_sep(ls);
          luaZ_resetbuffer(ls->buff);  /* 'skip_sep' may dirty the buffer */
          if (sep >= 2) {
            read_long_string(ls, NULL, sep);  /* skip long comment */
            luaZ_resetbuffer(ls->buff);  /* previous call may dirty the buff. */
            break;
          }
        }
        /* else short comment */
        while (!currIsNewline(ls) && ls->current != EOZ)
          next(ls);  /* skip until end of line (or end of file) */
        break;
      }
      case '[': {  /* long string or simply '[' */
        size_t sep = skip_sep(ls);
        if (sep >= 2) {
          read_long_string(ls, seminfo, sep);
          return TK_STRING;
        }
        else if (sep == 0)  /* '[=...' missing second bracket? */
          lexerror(ls, "invalid long string delimiter", TK_STRING);
        return '[';
      }
      case '=': {
        next(ls);
        if (check_next1(ls, '=')) return TK_EQ;  /* '==' */
        else return '=';
      }
      case '<': {
        next(ls);
        if (check_next1(ls, '=')) return TK_LE;  /* '<=' */
        else if (check_next1(ls, '<')) return TK_SHL;  /* '<<' */
        else return '<';
      }
      case '>': {
        next(ls);
        if (check_next1(ls, '=')) return TK_GE;  /* '>=' */
        else if (check_next1(ls, '>')) return TK_SHR;  /* '>>' */
        else return '>';
      }
      case '/': {
        next(ls);
        if (check_next1(ls, '/')) return TK_IDIV;  /* '//' */
        else return '/';
      }
      case '~': {
        next(ls);
        if (check_next1(ls, '=')) return TK_NE;  /* '~=' */
        else return '~';
      }
      case ':': {
        next(ls);
        if (check_next1(ls, ':')) return TK_DBCOLON;  /* '::' */
        else return ':';
      }
      case '"': case '\'': {  /* short literal strings */
        read_string(ls, ls->current, seminfo);
        return TK_STRING;
      }
      case '.': {  /* '.', '..', '...', or number */
        /* the dot is saved because it may be the start of a numeral,
           which 'read_numeral' converts from the buffer contents */
        save_and_next(ls);
        if (check_next1(ls, '.')) {
          if (check_next1(ls, '.'))
            return TK_DOTS;   /* '...' */
          else return TK_CONCAT;   /* '..' */
        }
        else if (!lisdigit(ls->current)) return '.';
        else return read_numeral(ls, seminfo);
      }
      case '0': case '1': case '2': case '3': case '4':
      case '5': case '6': case '7': case '8': case '9': {
        return read_numeral(ls, seminfo);
      }
      case EOZ: {
        return TK_EOS;
      }
      default: {
        if (lislalpha(ls->current)) {  /* identifier or reserved word? */
          TString *ts;
          do {
            save_and_next(ls);
          } while (lislalnum(ls->current));
          ts = luaX_newstring(ls, luaZ_buffer(ls->buff),
                                  luaZ_bufflen(ls->buff));
          seminfo->ts = ts;
          if (isreserved(ts))  /* reserved word? */
            /* 'extra' was set to index + 1 by 'luaX_init' */
            return ts->extra - 1 + FIRST_RESERVED;
          else {
            return TK_NAME;
          }
        }
        else {  /* single-char tokens ('+', '*', '%', '{', '}', ...) */
          int c = ls->current;
          next(ls);
          return c;
        }
      }
    }
  }
}


/*
** Advance to the next token, storing it in 'ls->t'. The line of the
** token being left is recorded in 'ls->lastline'. A pending look-ahead
** token, if any, is consumed instead of scanning the input again.
*/
void luaX_next (LexState *ls) {
  ls->lastline = ls->linenumber;
  if (ls->lookahead.token != TK_EOS) {  /* is there a look-ahead token? */
    ls->t = ls->lookahead;  /* use this one */
    ls->lookahead.token = TK_EOS;  /* and discharge it */
  }
  else
    ls->t.token = llex(ls, &ls->t.seminfo);  /* read next token */
}


/*
** Scan one token ahead, store it in 'ls->lookahead', and return its
** token code. There must be no look-ahead token already pending (see
** the assertion), so only one token of look-ahead is supported.
*/
int luaX_lookahead (LexState *ls) {
  lua_assert(ls->lookahead.token == TK_EOS);
  ls->lookahead.token = llex(ls, &ls->lookahead.seminfo);
  return ls->lookahead.token;
}
