/*
 * *****************************************************************************
 *
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2018-2024 Gavin D. Howard and contributors.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * * Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 *
 * * Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * *****************************************************************************
 *
 * The lexer for bc.
 *
 */

#if BC_ENABLED

#include <assert.h>
#include <ctype.h>
#include <string.h>

#include <bc.h>
#include <vm.h>

/**
 * Lexes an identifier, which may be a keyword. If the text matches an entry in
 * bc_lex_kws (and that keyword has not been redefined), the token is set to
 * the corresponding keyword token; otherwise, the text is lexed as a name with
 * bc_lex_name().
 * @param l  The lexer.
 */
static void
bc_lex_identifier(BcLex* l)
{
    // We already passed the first character, so we need to be sure to include
    // it.
    const char* buf = l->buf + l->i - 1;
    size_t i;

    // This loop is simply checking for keywords.
    for (i = 0; i < bc_lex_kws_len; ++i)
    {
        const BcLexKeyword* kw = bc_lex_kws + i;
        size_t n = BC_LEX_KW_LEN(kw);

        // The keyword only matches if the character right after it cannot
        // continue an identifier. The cast to unsigned char is required: a
        // plain char may be negative for bytes >= 0x80, and passing such a
        // value (other than EOF) to a <ctype.h> function is undefined
        // behavior.
        if (!strncmp(buf, kw->name, n) && !isalnum((unsigned char) buf[n]) &&
            buf[n] != '_')
        {
            // If the keyword has been redefined, and redefinition is allowed
            // (it is not allowed for builtin libraries), break out of the loop
            // and use it as a name. This depends on the argument parser to
            // ensure that only non-POSIX keywords get redefined.
            if (!vm->no_redefine && vm->redefined_kws[i]) break;

            // Keyword tokens are computed as an offset from BC_LEX_KW_AUTO,
            // using the keyword's index in bc_lex_kws.
            l->t = BC_LEX_KW_AUTO + (BcLexType) i;

            // Warn or error, as appropriate for the mode, if the keyword is not
            // in the POSIX standard.
            if (!BC_LEX_KW_POSIX(kw)) bc_lex_verr(l, BC_ERR_POSIX_KW, kw->name);

            // We minus 1 because the index has already been incremented.
            l->i += n - 1;

            // Already have the token; bail.
            return;
        }
    }

    // If not a keyword, parse the name.
    bc_lex_name(l);

    // POSIX doesn't allow identifiers that are more than one character, so we
    // might have to warn or error here too.
    if (BC_ERR(l->str.len - 1 > 1))
    {
        bc_lex_verr(l, BC_ERR_POSIX_NAME_LEN, l->str.v);
    }
}

/**
 * Parses a bc string. This is separate from dc strings because dc strings need
 * to be balanced.
 *
 * On entry, l->i is the index just past the opening quote. On success, the
 * token is BC_LEX_STR, the text between the quotes is stored in l->str, l->i
 * is the index just past the closing quote, and l->line has been increased by
 * the number of newlines inside the string.
 * @param l  The lexer.
 */
static void
bc_lex_string(BcLex* l)
{
    // We need to keep track of newlines to increment them properly.
    size_t len, nlines, i;
    const char* buf;
    char c;
    bool got_more;

    l->t = BC_LEX_STR;

    do
    {
        // Every pass rescans from l->i, so the newline count starts over and
        // the buffer pointer is fetched again from the lexer.
        nlines = 0;
        buf = l->buf;
        got_more = false;

#if !BC_ENABLE_OSSFUZZ
        assert(vm->mode != BC_MODE_STDIN || buf == vm->buffer.v);
#endif // !BC_ENABLE_OSSFUZZ

        // Fortunately for us, bc doesn't escape quotes. Instead, the equivalent
        // is '\q', which makes this loop simpler.
        for (i = l->i; (c = buf[i]) && c != '"'; ++i)
        {
            nlines += (c == '\n');
        }

        // If the data ran out before the closing quote, try to get more, but
        // only when not at EOF and not lexing a file.
        if (BC_ERR(c == '\0') && !vm->eof && l->mode != BC_MODE_FILE)
        {
            got_more = bc_lex_readLine(l);
        }
    }
    while (got_more && c != '"');

    // If the string did not end properly, barf.
    // NOTE(review): the code after this block appears to assume that
    // bc_lex_err() does not return normally; confirm against its definition.
    if (c != '"')
    {
        l->i = i;
        bc_lex_err(l, BC_ERR_PARSE_STRING);
    }

    // Set the temp string to the parsed string.
    len = i - l->i;
    bc_vec_string(&l->str, len, l->buf + l->i);

    // Skip the closing quote.
    l->i = i + 1;
    l->line += nlines;
}

/**
 * This function takes a lexed operator and checks to see if it's the assignment
 * version, setting the token appropriately. If the next character is '=', it
 * is consumed.
 * @param l        The lexer.
 * @param with     The token to assign if it is an assignment operator.
 * @param without  The token to assign if it is not an assignment operator.
 */
static void
bc_lex_assign(BcLex* l, BcLexType with, BcLexType without)
{
    if (l->buf[l->i] == '=')
    {
        l->i += 1;
        l->t = with;
    }
    else l->t = without;
}

/**
 * Lexes the next bc token. This reads the character at the lexer's current
 * index, advances the index past it, and dispatches on that character to set
 * the token (l->t) or to report an error.
 * @param l  The lexer.
 */
void
bc_lex_token(BcLex* l)
{
    // We increment here. This means that all lexing needs to take that into
    // account, such as when parsing an identifier. If we don't, the first
    // character of every identifier would be missing.
    char c = l->buf[l->i++], c2;

    BC_SIG_ASSERT_LOCKED;

    // This is the workhorse of the lexer.
    switch (c)
    {
        case '\0':
        case '\n':
        case '\t':
        case '\v':
        case '\f':
        case '\r':
        case ' ':
        {
            bc_lex_commonTokens(l, c);
            break;
        }

        case '!':
        {
            // Even though it's not an assignment, we can use this.
            bc_lex_assign(l, BC_LEX_OP_REL_NE, BC_LEX_OP_BOOL_NOT);

            // POSIX doesn't allow boolean not.
            if (l->t == BC_LEX_OP_BOOL_NOT)
            {
                bc_lex_verr(l, BC_ERR_POSIX_BOOL, "!");
            }

            break;
        }

        case '"':
        {
            bc_lex_string(l);
            break;
        }

        case '#':
        {
            // POSIX does not allow line comments.
            bc_lex_err(l, BC_ERR_POSIX_COMMENT);
            bc_lex_lineComment(l);
            break;
        }

        case '%':
        {
            bc_lex_assign(l, BC_LEX_OP_ASSIGN_MODULUS, BC_LEX_OP_MODULUS);
            break;
        }

        case '&':
        {
            c2 = l->buf[l->i];

            // Either we have boolean and or an error. And boolean and is not
            // allowed by POSIX.
            if (BC_NO_ERR(c2 == '&'))
            {
                bc_lex_verr(l, BC_ERR_POSIX_BOOL, "&&");

                l->i += 1;
                l->t = BC_LEX_OP_BOOL_AND;
            }
            else bc_lex_invalidChar(l, c);

            break;
        }
#if BC_ENABLE_EXTRA_MATH
        case '$':
        {
            l->t = BC_LEX_OP_TRUNC;
            break;
        }

        case '@':
        {
            bc_lex_assign(l, BC_LEX_OP_ASSIGN_PLACES, BC_LEX_OP_PLACES);
            break;
        }
#endif // BC_ENABLE_EXTRA_MATH
        case '(':
        case ')':
        {
            // The token is the character's offset from '(' added to
            // BC_LEX_LPAREN; presumably the token enum is laid out to match.
            l->t = (BcLexType) (c - '(' + BC_LEX_LPAREN);
            break;
        }

        case '*':
        {
            bc_lex_assign(l, BC_LEX_OP_ASSIGN_MULTIPLY, BC_LEX_OP_MULTIPLY);
            break;
        }

        case '+':
        {
            c2 = l->buf[l->i];

            // Have to check for increment first.
            if (c2 == '+')
            {
                l->i += 1;
                l->t = BC_LEX_OP_INC;
            }
            else bc_lex_assign(l, BC_LEX_OP_ASSIGN_PLUS, BC_LEX_OP_PLUS);
            break;
        }

        case ',':
        {
            l->t = BC_LEX_COMMA;
            break;
        }

        case '-':
        {
            c2 = l->buf[l->i];

            // Have to check for decrement first.
            if (c2 == '-')
            {
                l->i += 1;
                l->t = BC_LEX_OP_DEC;
            }
            else bc_lex_assign(l, BC_LEX_OP_ASSIGN_MINUS, BC_LEX_OP_MINUS);
            break;
        }

        case '.':
        {
            c2 = l->buf[l->i];

            // A dot followed by a number character starts a number. If it's
            // alone, it's an alias for last.
            if (BC_LEX_NUM_CHAR(c2, true, false)) bc_lex_number(l, c);
            else
            {
                l->t = BC_LEX_KW_LAST;
                bc_lex_err(l, BC_ERR_POSIX_DOT);
            }

            break;
        }

        case '/':
        {
            // A slash followed by a star starts a comment; otherwise, this is
            // division or division assignment.
            c2 = l->buf[l->i];
            if (c2 == '*') bc_lex_comment(l);
            else bc_lex_assign(l, BC_LEX_OP_ASSIGN_DIVIDE, BC_LEX_OP_DIVIDE);
            break;
        }

        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
        case 'A':
        case 'B':
        case 'C':
        case 'D':
        case 'E':
        case 'F':
        // Apparently, GNU bc (and maybe others) allows any uppercase letter as
        // a number. When single digits, they act like the ones above. When
        // multi-digit, any letter above the input base is automatically set to
        // the biggest allowable digit in the input base.
        case 'G':
        case 'H':
        case 'I':
        case 'J':
        case 'K':
        case 'L':
        case 'M':
        case 'N':
        case 'O':
        case 'P':
        case 'Q':
        case 'R':
        case 'S':
        case 'T':
        case 'U':
        case 'V':
        case 'W':
        case 'X':
        case 'Y':
        case 'Z':
        {
            bc_lex_number(l, c);
            break;
        }

        case ';':
        {
            l->t = BC_LEX_SCOLON;
            break;
        }

        case '<':
        {
#if BC_ENABLE_EXTRA_MATH
            c2 = l->buf[l->i];

            // Check for shift.
            if (c2 == '<')
            {
                l->i += 1;
                bc_lex_assign(l, BC_LEX_OP_ASSIGN_LSHIFT, BC_LEX_OP_LSHIFT);
                break;
            }
#endif // BC_ENABLE_EXTRA_MATH
            bc_lex_assign(l, BC_LEX_OP_REL_LE, BC_LEX_OP_REL_LT);
            break;
        }

        case '=':
        {
            bc_lex_assign(l, BC_LEX_OP_REL_EQ, BC_LEX_OP_ASSIGN);
            break;
        }

        case '>':
        {
#if BC_ENABLE_EXTRA_MATH
            c2 = l->buf[l->i];

            // Check for shift.
            if (c2 == '>')
            {
                l->i += 1;
                bc_lex_assign(l, BC_LEX_OP_ASSIGN_RSHIFT, BC_LEX_OP_RSHIFT);
                break;
            }
#endif // BC_ENABLE_EXTRA_MATH
            bc_lex_assign(l, BC_LEX_OP_REL_GE, BC_LEX_OP_REL_GT);
            break;
        }

        case '[':
        case ']':
        {
            // The token is the character's offset from '[' added to
            // BC_LEX_LBRACKET; presumably the token enum is laid out to match.
            l->t = (BcLexType) (c - '[' + BC_LEX_LBRACKET);
            break;
        }

        case '\\':
        {
            // In bc, a backslash+newline is whitespace.
            if (BC_NO_ERR(l->buf[l->i] == '\n'))
            {
                l->i += 1;
                l->t = BC_LEX_WHITESPACE;
            }
            else bc_lex_invalidChar(l, c);
            break;
        }

        case '^':
        {
            bc_lex_assign(l, BC_LEX_OP_ASSIGN_POWER, BC_LEX_OP_POWER);
            break;
        }

        case 'a':
        case 'b':
        case 'c':
        case 'd':
        case 'e':
        case 'f':
        case 'g':
        case 'h':
        case 'i':
        case 'j':
        case 'k':
        case 'l':
        case 'm':
        case 'n':
        case 'o':
        case 'p':
        case 'q':
        case 'r':
        case 's':
        case 't':
        case 'u':
        case 'v':
        case 'w':
        case 'x':
        case 'y':
        case 'z':
        {
            bc_lex_identifier(l);
            break;
        }

        case '{':
        case '}':
        {
            // The token is the character's offset from '{' added to
            // BC_LEX_LBRACE; presumably the token enum is laid out to match.
            l->t = (BcLexType) (c - '{' + BC_LEX_LBRACE);
            break;
        }

        case '|':
        {
            c2 = l->buf[l->i];

            // Once again, boolean or is not allowed by POSIX.
            if (BC_NO_ERR(c2 == '|'))
            {
                bc_lex_verr(l, BC_ERR_POSIX_BOOL, "||");

                l->i += 1;
                l->t = BC_LEX_OP_BOOL_OR;
            }
            else bc_lex_invalidChar(l, c);

            break;
        }

        default:
        {
            bc_lex_invalidChar(l, c);
        }
    }
}
#endif // BC_ENABLED