/*
 * *****************************************************************************
 *
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2018-2021 Gavin D. Howard and contributors.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * * Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 *
 * * Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * *****************************************************************************
 *
 * The lexer for bc.
 *
 */

#if BC_ENABLED

#include <assert.h>
#include <ctype.h>
#include <string.h>

#include <bc.h>
#include <vm.h>

/**
 * Lexes an identifier, which may be a keyword. On entry, l->i is already one
 * past the identifier's first character. If the text matches an entry in the
 * keyword table (and that keyword has not been redefined), l->t is set to the
 * keyword's token and l->i is moved past the keyword. Otherwise, the text is
 * lexed as a name by bc_lex_name().
 * @param l  The lexer.
 */
static void bc_lex_identifier(BcLex *l) {

	// We already passed the first character, so we need to be sure to include
	// it.
	const char *buf = l->buf + l->i - 1;
	size_t i;

	// This loop is simply checking for keywords.
	for (i = 0; i < bc_lex_kws_len; ++i) {

		const BcLexKeyword *kw = bc_lex_kws + i;
		size_t n = BC_LEX_KW_LEN(kw);

		// The keyword only matches if it is not a prefix of a longer name, so
		// the character right after it must not be a name character. The cast
		// to unsigned char is required: passing a negative plain char (any
		// byte >= 0x80 where char is signed) to isalnum() is undefined
		// behavior.
		if (!strncmp(buf, kw->name, n) &&
		    !isalnum((unsigned char) buf[n]) && buf[n] != '_')
		{
			// If the keyword has been redefined, and redefinition is allowed
			// (it is not allowed for builtin libraries), break out of the loop
			// and use it as a name. This depends on the argument parser to
			// ensure that only non-POSIX keywords get redefined.
			if (!vm.no_redefine && vm.redefined_kws[i]) break;

			// Keyword tokens are laid out in the same order as the keyword
			// table, starting at BC_LEX_KW_AUTO.
			l->t = BC_LEX_KW_AUTO + (BcLexType) i;

			// Warn or error, as appropriate for the mode, if the keyword is not
			// in the POSIX standard.
			if (!BC_LEX_KW_POSIX(kw)) bc_lex_verr(l, BC_ERR_POSIX_KW, kw->name);

			// We minus 1 because the index has already been incremented.
			l->i += n - 1;

			// Already have the token; bail.
			return;
		}
	}

	// If not a keyword, parse the name.
	bc_lex_name(l);

	// POSIX doesn't allow identifiers that are more than one character, so we
	// might have to warn or error here too.
	if (BC_ERR(l->str.len - 1 > 1))
		bc_lex_verr(l, BC_ERR_POSIX_NAME_LEN, l->str.v);
}

/**
 * Parses a bc string. This is separate from dc strings because dc strings need
 * to be balanced. On entry, l->i is one past the opening quote. On success,
 * the characters between the quotes are copied into l->str, l->i is moved one
 * past the closing quote, and l->line is advanced by the number of newlines
 * inside the string.
 * @param l  The lexer.
 */
static void bc_lex_string(BcLex *l) {

	// We need to keep track of newlines to increment them properly.
	size_t len, nlines, i;
	const char *buf;
	char c;
	bool got_more;

	l->t = BC_LEX_STR;

	do {

		// Everything is reset on each pass because the scan below always
		// restarts from l->i, and l->buf is re-read in case reading another
		// line changed it.
		nlines = 0;
		buf = l->buf;
		got_more = false;

		assert(!vm.is_stdin || buf == vm.buffer.v);

		// Fortunately for us, bc doesn't escape quotes. Instead, the equivalent
		// is '\q', which makes this loop simpler.
		for (i = l->i; (c = buf[i]) && c != '"'; ++i) nlines += (c == '\n');

		// If we ran out of data before the closing quote, and we are not at
		// EOF while lexing stdin or expressions, try to read another line and
		// scan again.
		if (BC_ERR(c == '\0') && !vm.eof && (l->is_stdin || l->is_exprs))
			got_more = bc_lex_readLine(l);

	} while (got_more && c != '"');

	// If the string did not end properly, barf.
	// NOTE(review): the code below assumes bc_lex_err() does not return here;
	// confirm against its definition.
	if (c != '"') {
		l->i = i;
		bc_lex_err(l, BC_ERR_PARSE_STRING);
	}

	// Set the temp string to the parsed string.
	len = i - l->i;
	bc_vec_string(&l->str, len, l->buf + l->i);

	// Skip the closing quote.
	l->i = i + 1;
	l->line += nlines;
}

/**
 * This function takes a lexed operator and checks to see if it's the assignment
 * version, setting the token appropriately. If the next character is '=', it
 * is consumed.
 * @param l        The lexer.
 * @param with     The token to assign if it is an assignment operator.
 * @param without  The token to assign if it is not an assignment operator.
 */
static void bc_lex_assign(BcLex *l, BcLexType with, BcLexType without) {
	if (l->buf[l->i] == '=') {
		l->i += 1;
		l->t = with;
	}
	else l->t = without;
}

/**
 * Lexes the next token from the lexer's buffer. The first character is
 * consumed (l->i is incremented past it) before dispatching on it. Each case
 * either sets l->t itself or hands off to a helper; characters that do not
 * start any token are passed to bc_lex_invalidChar().
 * @param l  The lexer.
 */
void bc_lex_token(BcLex *l) {

	// We increment here. This means that all lexing needs to take that into
	// account, such as when parsing an identifier. If we don't, the first
	// character of every identifier would be missing.
	char c = l->buf[l->i++], c2;

	BC_SIG_ASSERT_LOCKED;

	// This is the workhorse of the lexer.
	switch (c) {

		case '\0':
		case '\n':
		case '\t':
		case '\v':
		case '\f':
		case '\r':
		case ' ':
		{
			bc_lex_commonTokens(l, c);
			break;
		}

		case '!':
		{
			// Even though it's not an assignment, we can use this.
			bc_lex_assign(l, BC_LEX_OP_REL_NE, BC_LEX_OP_BOOL_NOT);

			// POSIX doesn't allow boolean not.
			if (l->t == BC_LEX_OP_BOOL_NOT)
				bc_lex_verr(l, BC_ERR_POSIX_BOOL, "!");

			break;
		}

		case '"':
		{
			bc_lex_string(l);
			break;
		}

		case '#':
		{
			// POSIX does not allow line comments.
			bc_lex_err(l, BC_ERR_POSIX_COMMENT);
			bc_lex_lineComment(l);
			break;
		}

		case '%':
		{
			bc_lex_assign(l, BC_LEX_OP_ASSIGN_MODULUS, BC_LEX_OP_MODULUS);
			break;
		}

		case '&':
		{
			c2 = l->buf[l->i];

			// Either we have boolean and or an error. And boolean and is not
			// allowed by POSIX.
			if (BC_NO_ERR(c2 == '&')) {

				bc_lex_verr(l, BC_ERR_POSIX_BOOL, "&&");

				l->i += 1;
				l->t = BC_LEX_OP_BOOL_AND;
			}
			else bc_lex_invalidChar(l, c);

			break;
		}
#if BC_ENABLE_EXTRA_MATH
		case '$':
		{
			l->t = BC_LEX_OP_TRUNC;
			break;
		}

		case '@':
		{
			bc_lex_assign(l, BC_LEX_OP_ASSIGN_PLACES, BC_LEX_OP_PLACES);
			break;
		}
#endif // BC_ENABLE_EXTRA_MATH
		case '(':
		case ')':
		{
			// This relies on the right paren token immediately following the
			// left paren token, just as ')' immediately follows '('.
			l->t = (BcLexType) (c - '(' + BC_LEX_LPAREN);
			break;
		}

		case '*':
		{
			bc_lex_assign(l, BC_LEX_OP_ASSIGN_MULTIPLY, BC_LEX_OP_MULTIPLY);
			break;
		}

		case '+':
		{
			c2 = l->buf[l->i];

			// Have to check for increment first.
			if (c2 == '+') {
				l->i += 1;
				l->t = BC_LEX_OP_INC;
			}
			else bc_lex_assign(l, BC_LEX_OP_ASSIGN_PLUS, BC_LEX_OP_PLUS);
			break;
		}

		case ',':
		{
			l->t = BC_LEX_COMMA;
			break;
		}

		case '-':
		{
			c2 = l->buf[l->i];

			// Have to check for decrement first.
			if (c2 == '-') {
				l->i += 1;
				l->t = BC_LEX_OP_DEC;
			}
			else bc_lex_assign(l, BC_LEX_OP_ASSIGN_MINUS, BC_LEX_OP_MINUS);
			break;
		}

		case '.':
		{
			c2 = l->buf[l->i];

			// If it's alone, it's an alias for last.
			if (BC_LEX_NUM_CHAR(c2, true, false)) bc_lex_number(l, c);
			else {
				l->t = BC_LEX_KW_LAST;
				bc_lex_err(l, BC_ERR_POSIX_DOT);
			}

			break;
		}

		case '/':
		{
			c2 = l->buf[l->i];

			// A slash followed by a star starts a comment; anything else is
			// division or divide-assign.
			if (c2 =='*') bc_lex_comment(l);
			else bc_lex_assign(l, BC_LEX_OP_ASSIGN_DIVIDE, BC_LEX_OP_DIVIDE);
			break;
		}

		case '0':
		case '1':
		case '2':
		case '3':
		case '4':
		case '5':
		case '6':
		case '7':
		case '8':
		case '9':
		case 'A':
		case 'B':
		case 'C':
		case 'D':
		case 'E':
		case 'F':
		// Apparently, GNU bc (and maybe others) allows any uppercase letter as
		// a number. When single digits, they act like the ones above. When
		// multi-digit, any letter above the input base is automatically set to
		// the biggest allowable digit in the input base.
		case 'G':
		case 'H':
		case 'I':
		case 'J':
		case 'K':
		case 'L':
		case 'M':
		case 'N':
		case 'O':
		case 'P':
		case 'Q':
		case 'R':
		case 'S':
		case 'T':
		case 'U':
		case 'V':
		case 'W':
		case 'X':
		case 'Y':
		case 'Z':
		{
			bc_lex_number(l, c);
			break;
		}

		case ';':
		{
			l->t = BC_LEX_SCOLON;
			break;
		}

		case '<':
		{
#if BC_ENABLE_EXTRA_MATH
			c2 = l->buf[l->i];

			// Check for shift.
			if (c2 == '<') {
				l->i += 1;
				bc_lex_assign(l, BC_LEX_OP_ASSIGN_LSHIFT, BC_LEX_OP_LSHIFT);
				break;
			}
#endif // BC_ENABLE_EXTRA_MATH
			bc_lex_assign(l, BC_LEX_OP_REL_LE, BC_LEX_OP_REL_LT);
			break;
		}

		case '=':
		{
			bc_lex_assign(l, BC_LEX_OP_REL_EQ, BC_LEX_OP_ASSIGN);
			break;
		}

		case '>':
		{
#if BC_ENABLE_EXTRA_MATH
			c2 = l->buf[l->i];

			// Check for shift.
			if (c2 == '>') {
				l->i += 1;
				bc_lex_assign(l, BC_LEX_OP_ASSIGN_RSHIFT, BC_LEX_OP_RSHIFT);
				break;
			}
#endif // BC_ENABLE_EXTRA_MATH
			bc_lex_assign(l, BC_LEX_OP_REL_GE, BC_LEX_OP_REL_GT);
			break;
		}

		case '[':
		case ']':
		{
			// '[' and ']' are two apart in ASCII, so this relies on the
			// bracket tokens being spaced the same way.
			l->t = (BcLexType) (c - '[' + BC_LEX_LBRACKET);
			break;
		}

		case '\\':
		{
			// In bc, a backslash+newline is whitespace.
			if (BC_NO_ERR(l->buf[l->i] == '\n')) {
				l->i += 1;
				l->t = BC_LEX_WHITESPACE;
			}
			else bc_lex_invalidChar(l, c);
			break;
		}

		case '^':
		{
			bc_lex_assign(l, BC_LEX_OP_ASSIGN_POWER, BC_LEX_OP_POWER);
			break;
		}

		case 'a':
		case 'b':
		case 'c':
		case 'd':
		case 'e':
		case 'f':
		case 'g':
		case 'h':
		case 'i':
		case 'j':
		case 'k':
		case 'l':
		case 'm':
		case 'n':
		case 'o':
		case 'p':
		case 'q':
		case 'r':
		case 's':
		case 't':
		case 'u':
		case 'v':
		case 'w':
		case 'x':
		case 'y':
		case 'z':
		{
			bc_lex_identifier(l);
			break;
		}

		case '{':
		case '}':
		{
			// '{' and '}' are two apart in ASCII, so this relies on the brace
			// tokens being spaced the same way.
			l->t = (BcLexType) (c - '{' + BC_LEX_LBRACE);
			break;
		}

		case '|':
		{
			c2 = l->buf[l->i];

			// Once again, boolean or is not allowed by POSIX.
			if (BC_NO_ERR(c2 == '|')) {

				bc_lex_verr(l, BC_ERR_POSIX_BOOL, "||");

				l->i += 1;
				l->t = BC_LEX_OP_BOOL_OR;
			}
			else bc_lex_invalidChar(l, c);

			break;
		}

		default:
		{
			bc_lex_invalidChar(l, c);
		}
	}
}
#endif // BC_ENABLED