/*
 * *****************************************************************************
 *
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2018-2021 Gavin D. Howard and contributors.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * * Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 *
 * * Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * *****************************************************************************
 *
 * The lexer for bc.
 *
 */

#if BC_ENABLED

#include <assert.h>
#include <ctype.h>
#include <string.h>

#include <bc.h>
#include <vm.h>

/**
 * Lexes an identifier, which may be a keyword.
 *
 * On entry, l->i is one past the identifier's first character. If the text
 * matches a keyword (and that keyword is not being used as a redefined name),
 * l->t is set to the keyword's token and l->i is advanced past it. Otherwise
 * the work is handed to bc_lex_name().
 * @param l  The lexer.
 */
static void bc_lex_identifier(BcLex *l) {

	// We already passed the first character, so we need to be sure to include
	// it.
	const char *buf = l->buf + l->i - 1;
	size_t i;

	// This loop is simply checking for keywords.
	for (i = 0; i < bc_lex_kws_len; ++i) {

		const BcLexKeyword *kw = bc_lex_kws + i;
		size_t n = BC_LEX_KW_LEN(kw);

		// The character after the keyword text must not continue an
		// identifier, or this is just a name with a keyword as its prefix.
		//
		// The cast to unsigned char is required: handing isalnum() a negative
		// plain char (any byte >= 0x80 where char is signed) is undefined
		// behavior.
		if (!strncmp(buf, kw->name, n) &&
		    !isalnum((unsigned char) buf[n]) && buf[n] != '_')
		{
			// If the keyword has been redefined, and redefinition is allowed
			// (it is not allowed for builtin libraries), break out of the loop
			// and use it as a name. This depends on the argument parser to
			// ensure that only non-POSIX keywords get redefined.
			if (!vm.no_redefine && vm.redefined_kws[i]) break;

			// The keyword tokens are laid out in the same order as the
			// keyword table, starting at BC_LEX_KW_AUTO.
			l->t = BC_LEX_KW_AUTO + (BcLexType) i;

			// Warn or error, as appropriate for the mode, if the keyword is not
			// in the POSIX standard.
			if (!BC_LEX_KW_POSIX(kw)) bc_lex_verr(l, BC_ERR_POSIX_KW, kw->name);

			// We minus 1 because the index has already been incremented.
			l->i += n - 1;

			// Already have the token; bail.
			return;
		}
	}

	// If not a keyword, parse the name.
	bc_lex_name(l);

	// POSIX doesn't allow identifiers that are more than one character, so we
	// might have to warn or error here too.
	if (BC_ERR(l->str.len - 1 > 1))
		bc_lex_verr(l, BC_ERR_POSIX_NAME_LEN, l->str.v);
}

/**
 * Parses a bc string. This is separate from dc strings because dc strings need
 * to be balanced.
 *
 * On entry, l->i is one past the opening quote. On success, the string's
 * contents (without the quotes) are stored in l->str, l->i is moved one past
 * the closing quote, and l->line is advanced by the number of newlines inside
 * the string.
 * @param l  The lexer.
 */
static void bc_lex_string(BcLex *l) {

	// We need to keep track of newlines to increment them properly.
	size_t len, nlines, i;
	const char *buf;
	char c;
	bool got_more;

	l->t = BC_LEX_STR;

	do {

		// Every pass rescans from l->i, so the newline count and the buffer
		// pointer have to be reset; the buffer is re-read from the lexer
		// because bc_lex_readLine() may have run on the previous pass.
		nlines = 0;
		buf = l->buf;
		got_more = false;

		assert(!vm.is_stdin || buf == vm.buffer.v);

		// Fortunately for us, bc doesn't escape quotes. Instead, the equivalent
		// is '\q', which makes this loop simpler.
		for (i = l->i; (c = buf[i]) && c != '"'; ++i) nlines += (c == '\n');

		// Ran out of data before the closing quote: when reading stdin and not
		// yet at EOF, try to pull in another line and scan again.
		if (BC_ERR(c == '\0') && !vm.eof && l->is_stdin)
			got_more = bc_lex_readLine(l);

	} while (got_more && c != '"');

	// If the string did not end properly, barf.
	// NOTE(review): the code below reads as if bc_lex_err() does not return
	// here (presumably it jumps out) -- confirm against its definition.
	if (c != '"') {
		l->i = i;
		bc_lex_err(l, BC_ERR_PARSE_STRING);
	}

	// Set the temp string to the parsed string.
	len = i - l->i;
	bc_vec_string(&l->str, len, l->buf + l->i);

	l->i = i + 1;
	l->line += nlines;
}

/**
 * This function takes a lexed operator and checks to see if it's the assignment
 * version, setting the token appropriately. The '=' is consumed if present.
 * @param l        The lexer.
 * @param with     The token to assign if it is an assignment operator.
 * @param without  The token to assign if it is not an assignment operator.
 */
static void bc_lex_assign(BcLex *l, BcLexType with, BcLexType without) {
	if (l->buf[l->i] == '=') {
		l->i += 1;
		l->t = with;
	}
	else l->t = without;
}

/**
 * Lexes the next token for bc.
 *
 * Reads the character at l->buf[l->i], advances l->i past it, and dispatches
 * on that character. Each branch either sets l->t directly (consuming any
 * further characters of a multi-character operator) or delegates to a helper
 * that takes the lexer.
 * @param l  The lexer.
 */
void bc_lex_token(BcLex *l) {

	// We increment here. This means that all lexing needs to take that into
	// account, such as when parsing an identifier. If we don't, the first
	// character of every identifier would be missing.
	char c = l->buf[l->i++], c2;

	// This is the workhorse of the lexer.
	switch (c) {

		case '\0':
		case '\n':
		case '\t':
		case '\v':
		case '\f':
		case '\r':
		case ' ':
		{
			bc_lex_commonTokens(l, c);
			break;
		}

		case '!':
		{
			// Even though it's not an assignment, we can use this.
			bc_lex_assign(l, BC_LEX_OP_REL_NE, BC_LEX_OP_BOOL_NOT);

			// POSIX doesn't allow boolean not.
			if (l->t == BC_LEX_OP_BOOL_NOT)
				bc_lex_verr(l, BC_ERR_POSIX_BOOL, "!");

			break;
		}

		case '"':
		{
			bc_lex_string(l);
			break;
		}

		case '#':
		{
			// POSIX does not allow line comments.
			bc_lex_err(l, BC_ERR_POSIX_COMMENT);
			bc_lex_lineComment(l);
			break;
		}

		case '%':
		{
			bc_lex_assign(l, BC_LEX_OP_ASSIGN_MODULUS, BC_LEX_OP_MODULUS);
			break;
		}

		case '&':
		{
			c2 = l->buf[l->i];

			// Either we have boolean and or an error. And boolean and is not
			// allowed by POSIX.
			if (BC_NO_ERR(c2 == '&')) {

				bc_lex_verr(l, BC_ERR_POSIX_BOOL, "&&");

				l->i += 1;
				l->t = BC_LEX_OP_BOOL_AND;
			}
			else bc_lex_invalidChar(l, c);

			break;
		}
#if BC_ENABLE_EXTRA_MATH
		case '$':
		{
			l->t = BC_LEX_OP_TRUNC;
			break;
		}

		case '@':
		{
			bc_lex_assign(l, BC_LEX_OP_ASSIGN_PLACES, BC_LEX_OP_PLACES);
			break;
		}
#endif // BC_ENABLE_EXTRA_MATH
		case '(':
		case ')':
		{
			// '(' and ')' are adjacent characters, so the offset from '('
			// selects between BC_LEX_LPAREN and the token right after it.
			l->t = (BcLexType) (c - '(' + BC_LEX_LPAREN);
			break;
		}

		case '*':
		{
			bc_lex_assign(l, BC_LEX_OP_ASSIGN_MULTIPLY, BC_LEX_OP_MULTIPLY);
			break;
		}

		case '+':
		{
			c2 = l->buf[l->i];

			// Have to check for increment first.
			if (c2 == '+') {
				l->i += 1;
				l->t = BC_LEX_OP_INC;
			}
			else bc_lex_assign(l, BC_LEX_OP_ASSIGN_PLUS, BC_LEX_OP_PLUS);
			break;
		}

		case ',':
		{
			l->t = BC_LEX_COMMA;
			break;
		}

		case '-':
		{
			c2 = l->buf[l->i];

			// Have to check for decrement first.
			if (c2 == '-') {
				l->i += 1;
				l->t = BC_LEX_OP_DEC;
			}
			else bc_lex_assign(l, BC_LEX_OP_ASSIGN_MINUS, BC_LEX_OP_MINUS);
			break;
		}

		case '.':
		{
			c2 = l->buf[l->i];

			// If it's alone, it's an alias for last.
			if (BC_LEX_NUM_CHAR(c2, true, false)) bc_lex_number(l, c);
			else {
				l->t = BC_LEX_KW_LAST;
				bc_lex_err(l, BC_ERR_POSIX_DOT);
			}

			break;
		}

		case '/':
		{
			// A slash either opens a block comment or is a divide operator.
			c2 = l->buf[l->i];
			if (c2 == '*') bc_lex_comment(l);
			else bc_lex_assign(l, BC_LEX_OP_ASSIGN_DIVIDE, BC_LEX_OP_DIVIDE);
			break;
		}

		case '0':
		case '1':
		case '2':
		case '3':
		case '4':
		case '5':
		case '6':
		case '7':
		case '8':
		case '9':
		case 'A':
		case 'B':
		case 'C':
		case 'D':
		case 'E':
		case 'F':
		// Apparently, GNU bc (and maybe others) allows any uppercase letter as
		// a number. When single digits, they act like the ones above. When
		// multi-digit, any letter above the input base is automatically set to
		// the biggest allowable digit in the input base.
		case 'G':
		case 'H':
		case 'I':
		case 'J':
		case 'K':
		case 'L':
		case 'M':
		case 'N':
		case 'O':
		case 'P':
		case 'Q':
		case 'R':
		case 'S':
		case 'T':
		case 'U':
		case 'V':
		case 'W':
		case 'X':
		case 'Y':
		case 'Z':
		{
			bc_lex_number(l, c);
			break;
		}

		case ';':
		{
			l->t = BC_LEX_SCOLON;
			break;
		}

		case '<':
		{
#if BC_ENABLE_EXTRA_MATH
			c2 = l->buf[l->i];

			// Check for shift.
			if (c2 == '<') {
				l->i += 1;
				bc_lex_assign(l, BC_LEX_OP_ASSIGN_LSHIFT, BC_LEX_OP_LSHIFT);
				break;
			}
#endif // BC_ENABLE_EXTRA_MATH
			bc_lex_assign(l, BC_LEX_OP_REL_LE, BC_LEX_OP_REL_LT);
			break;
		}

		case '=':
		{
			bc_lex_assign(l, BC_LEX_OP_REL_EQ, BC_LEX_OP_ASSIGN);
			break;
		}

		case '>':
		{
#if BC_ENABLE_EXTRA_MATH
			c2 = l->buf[l->i];

			// Check for shift.
			if (c2 == '>') {
				l->i += 1;
				bc_lex_assign(l, BC_LEX_OP_ASSIGN_RSHIFT, BC_LEX_OP_RSHIFT);
				break;
			}
#endif // BC_ENABLE_EXTRA_MATH
			bc_lex_assign(l, BC_LEX_OP_REL_GE, BC_LEX_OP_REL_GT);
			break;
		}

		case '[':
		case ']':
		{
			// Maps the character's offset from '[' onto the token enum,
			// starting at BC_LEX_LBRACKET.
			l->t = (BcLexType) (c - '[' + BC_LEX_LBRACKET);
			break;
		}

		case '\\':
		{
			// In bc, a backslash+newline is whitespace.
			if (BC_NO_ERR(l->buf[l->i] == '\n')) {
				l->i += 1;
				l->t = BC_LEX_WHITESPACE;
			}
			else bc_lex_invalidChar(l, c);
			break;
		}

		case '^':
		{
			bc_lex_assign(l, BC_LEX_OP_ASSIGN_POWER, BC_LEX_OP_POWER);
			break;
		}

		case 'a':
		case 'b':
		case 'c':
		case 'd':
		case 'e':
		case 'f':
		case 'g':
		case 'h':
		case 'i':
		case 'j':
		case 'k':
		case 'l':
		case 'm':
		case 'n':
		case 'o':
		case 'p':
		case 'q':
		case 'r':
		case 's':
		case 't':
		case 'u':
		case 'v':
		case 'w':
		case 'x':
		case 'y':
		case 'z':
		{
			bc_lex_identifier(l);
			break;
		}

		case '{':
		case '}':
		{
			// Maps the character's offset from '{' onto the token enum,
			// starting at BC_LEX_LBRACE.
			l->t = (BcLexType) (c - '{' + BC_LEX_LBRACE);
			break;
		}

		case '|':
		{
			c2 = l->buf[l->i];

			// Once again, boolean or is not allowed by POSIX.
			if (BC_NO_ERR(c2 == '|')) {

				bc_lex_verr(l, BC_ERR_POSIX_BOOL, "||");

				l->i += 1;
				l->t = BC_LEX_OP_BOOL_OR;
			}
			else bc_lex_invalidChar(l, c);

			break;
		}

		default:
		{
			bc_lex_invalidChar(l, c);
			break;
		}
	}
}
#endif // BC_ENABLED