1 /* 2 * ***************************************************************************** 3 * 4 * SPDX-License-Identifier: BSD-2-Clause 5 * 6 * Copyright (c) 2018-2024 Gavin D. Howard and contributors. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions are met: 10 * 11 * * Redistributions of source code must retain the above copyright notice, this 12 * list of conditions and the following disclaimer. 13 * 14 * * Redistributions in binary form must reproduce the above copyright notice, 15 * this list of conditions and the following disclaimer in the documentation 16 * and/or other materials provided with the distribution. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 22 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 23 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 24 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 25 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 26 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 27 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 * POSSIBILITY OF SUCH DAMAGE. 29 * 30 * ***************************************************************************** 31 * 32 * The lexer for bc. 33 * 34 */ 35 36 #if BC_ENABLED 37 38 #include <assert.h> 39 #include <ctype.h> 40 #include <string.h> 41 42 #include <bc.h> 43 #include <vm.h> 44 45 /** 46 * Lexes an identifier, which may be a keyword. 47 * @param l The lexer. 48 */ 49 static void 50 bc_lex_identifier(BcLex* l) 51 { 52 // We already passed the first character, so we need to be sure to include 53 // it. 54 const char* buf = l->buf + l->i - 1; 55 size_t i; 56 57 // This loop is simply checking for keywords. 58 for (i = 0; i < bc_lex_kws_len; ++i) 59 { 60 const BcLexKeyword* kw = bc_lex_kws + i; 61 size_t n = BC_LEX_KW_LEN(kw); 62 63 if (!strncmp(buf, kw->name, n) && !isalnum(buf[n]) && buf[n] != '_') 64 { 65 // If the keyword has been redefined, and redefinition is allowed 66 // (it is not allowed for builtin libraries), break out of the loop 67 // and use it as a name. This depends on the argument parser to 68 // ensure that only non-POSIX keywords get redefined. 69 if (!vm->no_redefine && vm->redefined_kws[i]) break; 70 71 l->t = BC_LEX_KW_AUTO + (BcLexType) i; 72 73 // Warn or error, as appropriate for the mode, if the keyword is not 74 // in the POSIX standard. 75 if (!BC_LEX_KW_POSIX(kw)) bc_lex_verr(l, BC_ERR_POSIX_KW, kw->name); 76 77 // We minus 1 because the index has already been incremented. 78 l->i += n - 1; 79 80 // Already have the token; bail. 81 return; 82 } 83 } 84 85 // If not a keyword, parse the name. 86 bc_lex_name(l); 87 88 // POSIX doesn't allow identifiers that are more than one character, so we 89 // might have to warn or error here too. 90 if (BC_ERR(l->str.len - 1 > 1)) 91 { 92 bc_lex_verr(l, BC_ERR_POSIX_NAME_LEN, l->str.v); 93 } 94 } 95 96 /** 97 * Parses a bc string. This is separate from dc strings because dc strings need 98 * to be balanced. 99 * @param l The lexer. 100 */ 101 static void 102 bc_lex_string(BcLex* l) 103 { 104 // We need to keep track of newlines to increment them properly. 105 size_t len, nlines, i; 106 const char* buf; 107 char c; 108 bool got_more; 109 110 l->t = BC_LEX_STR; 111 112 do 113 { 114 nlines = 0; 115 buf = l->buf; 116 got_more = false; 117 118 assert(vm->mode != BC_MODE_STDIN || buf == vm->buffer.v); 119 120 // Fortunately for us, bc doesn't escape quotes. Instead, the equivalent 121 // is '\q', which makes this loop simpler. 122 for (i = l->i; (c = buf[i]) && c != '"'; ++i) 123 { 124 nlines += (c == '\n'); 125 } 126 127 if (BC_ERR(c == '\0') && !vm->eof && l->mode != BC_MODE_FILE) 128 { 129 got_more = bc_lex_readLine(l); 130 } 131 } 132 while (got_more && c != '"'); 133 134 // If the string did not end properly, barf. 135 if (c != '"') 136 { 137 l->i = i; 138 bc_lex_err(l, BC_ERR_PARSE_STRING); 139 } 140 141 // Set the temp string to the parsed string. 142 len = i - l->i; 143 bc_vec_string(&l->str, len, l->buf + l->i); 144 145 l->i = i + 1; 146 l->line += nlines; 147 } 148 149 /** 150 * This function takes a lexed operator and checks to see if it's the assignment 151 * version, setting the token appropriately. 152 * @param l The lexer. 153 * @param with The token to assign if it is an assignment operator. 154 * @param without The token to assign if it is not an assignment operator. 155 */ 156 static void 157 bc_lex_assign(BcLex* l, BcLexType with, BcLexType without) 158 { 159 if (l->buf[l->i] == '=') 160 { 161 l->i += 1; 162 l->t = with; 163 } 164 else l->t = without; 165 } 166 167 void 168 bc_lex_token(BcLex* l) 169 { 170 // We increment here. This means that all lexing needs to take that into 171 // account, such as when parsing an identifier. If we don't, the first 172 // character of every identifier would be missing. 173 char c = l->buf[l->i++], c2; 174 175 BC_SIG_ASSERT_LOCKED; 176 177 // This is the workhorse of the lexer. 178 switch (c) 179 { 180 case '\0': 181 case '\n': 182 case '\t': 183 case '\v': 184 case '\f': 185 case '\r': 186 case ' ': 187 { 188 bc_lex_commonTokens(l, c); 189 break; 190 } 191 192 case '!': 193 { 194 // Even though it's not an assignment, we can use this. 195 bc_lex_assign(l, BC_LEX_OP_REL_NE, BC_LEX_OP_BOOL_NOT); 196 197 // POSIX doesn't allow boolean not. 198 if (l->t == BC_LEX_OP_BOOL_NOT) 199 { 200 bc_lex_verr(l, BC_ERR_POSIX_BOOL, "!"); 201 } 202 203 break; 204 } 205 206 case '"': 207 { 208 bc_lex_string(l); 209 break; 210 } 211 212 case '#': 213 { 214 // POSIX does not allow line comments. 215 bc_lex_err(l, BC_ERR_POSIX_COMMENT); 216 bc_lex_lineComment(l); 217 break; 218 } 219 220 case '%': 221 { 222 bc_lex_assign(l, BC_LEX_OP_ASSIGN_MODULUS, BC_LEX_OP_MODULUS); 223 break; 224 } 225 226 case '&': 227 { 228 c2 = l->buf[l->i]; 229 230 // Either we have boolean and or an error. And boolean and is not 231 // allowed by POSIX. 232 if (BC_NO_ERR(c2 == '&')) 233 { 234 bc_lex_verr(l, BC_ERR_POSIX_BOOL, "&&"); 235 236 l->i += 1; 237 l->t = BC_LEX_OP_BOOL_AND; 238 } 239 else bc_lex_invalidChar(l, c); 240 241 break; 242 } 243 #if BC_ENABLE_EXTRA_MATH 244 case '$': 245 { 246 l->t = BC_LEX_OP_TRUNC; 247 break; 248 } 249 250 case '@': 251 { 252 bc_lex_assign(l, BC_LEX_OP_ASSIGN_PLACES, BC_LEX_OP_PLACES); 253 break; 254 } 255 #endif // BC_ENABLE_EXTRA_MATH 256 case '(': 257 case ')': 258 { 259 l->t = (BcLexType) (c - '(' + BC_LEX_LPAREN); 260 break; 261 } 262 263 case '*': 264 { 265 bc_lex_assign(l, BC_LEX_OP_ASSIGN_MULTIPLY, BC_LEX_OP_MULTIPLY); 266 break; 267 } 268 269 case '+': 270 { 271 c2 = l->buf[l->i]; 272 273 // Have to check for increment first. 274 if (c2 == '+') 275 { 276 l->i += 1; 277 l->t = BC_LEX_OP_INC; 278 } 279 else bc_lex_assign(l, BC_LEX_OP_ASSIGN_PLUS, BC_LEX_OP_PLUS); 280 break; 281 } 282 283 case ',': 284 { 285 l->t = BC_LEX_COMMA; 286 break; 287 } 288 289 case '-': 290 { 291 c2 = l->buf[l->i]; 292 293 // Have to check for decrement first. 294 if (c2 == '-') 295 { 296 l->i += 1; 297 l->t = BC_LEX_OP_DEC; 298 } 299 else bc_lex_assign(l, BC_LEX_OP_ASSIGN_MINUS, BC_LEX_OP_MINUS); 300 break; 301 } 302 303 case '.': 304 { 305 c2 = l->buf[l->i]; 306 307 // If it's alone, it's an alias for last. 308 if (BC_LEX_NUM_CHAR(c2, true, false)) bc_lex_number(l, c); 309 else 310 { 311 l->t = BC_LEX_KW_LAST; 312 bc_lex_err(l, BC_ERR_POSIX_DOT); 313 } 314 315 break; 316 } 317 318 case '/': 319 { 320 c2 = l->buf[l->i]; 321 if (c2 == '*') bc_lex_comment(l); 322 else bc_lex_assign(l, BC_LEX_OP_ASSIGN_DIVIDE, BC_LEX_OP_DIVIDE); 323 break; 324 } 325 326 case '0': 327 case '1': 328 case '2': 329 case '3': 330 case '4': 331 case '5': 332 case '6': 333 case '7': 334 case '8': 335 case '9': 336 case 'A': 337 case 'B': 338 case 'C': 339 case 'D': 340 case 'E': 341 case 'F': 342 // Apparently, GNU bc (and maybe others) allows any uppercase letter as 343 // a number. When single digits, they act like the ones above. When 344 // multi-digit, any letter above the input base is automatically set to 345 // the biggest allowable digit in the input base. 346 case 'G': 347 case 'H': 348 case 'I': 349 case 'J': 350 case 'K': 351 case 'L': 352 case 'M': 353 case 'N': 354 case 'O': 355 case 'P': 356 case 'Q': 357 case 'R': 358 case 'S': 359 case 'T': 360 case 'U': 361 case 'V': 362 case 'W': 363 case 'X': 364 case 'Y': 365 case 'Z': 366 { 367 bc_lex_number(l, c); 368 break; 369 } 370 371 case ';': 372 { 373 l->t = BC_LEX_SCOLON; 374 break; 375 } 376 377 case '<': 378 { 379 #if BC_ENABLE_EXTRA_MATH 380 c2 = l->buf[l->i]; 381 382 // Check for shift. 383 if (c2 == '<') 384 { 385 l->i += 1; 386 bc_lex_assign(l, BC_LEX_OP_ASSIGN_LSHIFT, BC_LEX_OP_LSHIFT); 387 break; 388 } 389 #endif // BC_ENABLE_EXTRA_MATH 390 bc_lex_assign(l, BC_LEX_OP_REL_LE, BC_LEX_OP_REL_LT); 391 break; 392 } 393 394 case '=': 395 { 396 bc_lex_assign(l, BC_LEX_OP_REL_EQ, BC_LEX_OP_ASSIGN); 397 break; 398 } 399 400 case '>': 401 { 402 #if BC_ENABLE_EXTRA_MATH 403 c2 = l->buf[l->i]; 404 405 // Check for shift. 406 if (c2 == '>') 407 { 408 l->i += 1; 409 bc_lex_assign(l, BC_LEX_OP_ASSIGN_RSHIFT, BC_LEX_OP_RSHIFT); 410 break; 411 } 412 #endif // BC_ENABLE_EXTRA_MATH 413 bc_lex_assign(l, BC_LEX_OP_REL_GE, BC_LEX_OP_REL_GT); 414 break; 415 } 416 417 case '[': 418 case ']': 419 { 420 l->t = (BcLexType) (c - '[' + BC_LEX_LBRACKET); 421 break; 422 } 423 424 case '\\': 425 { 426 // In bc, a backslash+newline is whitespace. 427 if (BC_NO_ERR(l->buf[l->i] == '\n')) 428 { 429 l->i += 1; 430 l->t = BC_LEX_WHITESPACE; 431 } 432 else bc_lex_invalidChar(l, c); 433 break; 434 } 435 436 case '^': 437 { 438 bc_lex_assign(l, BC_LEX_OP_ASSIGN_POWER, BC_LEX_OP_POWER); 439 break; 440 } 441 442 case 'a': 443 case 'b': 444 case 'c': 445 case 'd': 446 case 'e': 447 case 'f': 448 case 'g': 449 case 'h': 450 case 'i': 451 case 'j': 452 case 'k': 453 case 'l': 454 case 'm': 455 case 'n': 456 case 'o': 457 case 'p': 458 case 'q': 459 case 'r': 460 case 's': 461 case 't': 462 case 'u': 463 case 'v': 464 case 'w': 465 case 'x': 466 case 'y': 467 case 'z': 468 { 469 bc_lex_identifier(l); 470 break; 471 } 472 473 case '{': 474 case '}': 475 { 476 l->t = (BcLexType) (c - '{' + BC_LEX_LBRACE); 477 break; 478 } 479 480 case '|': 481 { 482 c2 = l->buf[l->i]; 483 484 // Once again, boolean or is not allowed by POSIX. 485 if (BC_NO_ERR(c2 == '|')) 486 { 487 bc_lex_verr(l, BC_ERR_POSIX_BOOL, "||"); 488 489 l->i += 1; 490 l->t = BC_LEX_OP_BOOL_OR; 491 } 492 else bc_lex_invalidChar(l, c); 493 494 break; 495 } 496 497 default: 498 { 499 bc_lex_invalidChar(l, c); 500 } 501 } 502 } 503 #endif // BC_ENABLED 504