1 /* 2 * ***************************************************************************** 3 * 4 * SPDX-License-Identifier: BSD-2-Clause 5 * 6 * Copyright (c) 2018-2024 Gavin D. Howard and contributors. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions are met: 10 * 11 * * Redistributions of source code must retain the above copyright notice, this 12 * list of conditions and the following disclaimer. 13 * 14 * * Redistributions in binary form must reproduce the above copyright notice, 15 * this list of conditions and the following disclaimer in the documentation 16 * and/or other materials provided with the distribution. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 22 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 23 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 24 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 25 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 26 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 27 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 * POSSIBILITY OF SUCH DAMAGE. 29 * 30 * ***************************************************************************** 31 * 32 * Common code for the lexers. 33 * 34 */ 35 36 #include <assert.h> 37 #include <ctype.h> 38 #include <stdbool.h> 39 #include <string.h> 40 41 #include <lex.h> 42 #include <vm.h> 43 #include <bc.h> 44 45 void 46 bc_lex_invalidChar(BcLex* l, char c) 47 { 48 l->t = BC_LEX_INVALID; 49 bc_lex_verr(l, BC_ERR_PARSE_CHAR, c); 50 } 51 52 void 53 bc_lex_lineComment(BcLex* l) 54 { 55 l->t = BC_LEX_WHITESPACE; 56 while (l->i < l->len && l->buf[l->i] != '\n') 57 { 58 l->i += 1; 59 } 60 } 61 62 void 63 bc_lex_comment(BcLex* l) 64 { 65 size_t i, nlines = 0; 66 const char* buf; 67 bool end = false, got_more; 68 char c; 69 70 l->i += 1; 71 l->t = BC_LEX_WHITESPACE; 72 73 // This loop is complex because it might need to request more data from 74 // stdin if the comment is not ended. This loop is taken until the comment 75 // is finished or we have EOF. 76 do 77 { 78 buf = l->buf; 79 got_more = false; 80 81 // If we are in stdin mode, the buffer must be the one used for stdin. 82 #if !BC_ENABLE_OSSFUZZ 83 assert(vm->mode != BC_MODE_STDIN || buf == vm->buffer.v); 84 #endif // !BC_ENABLE_OSSFUZZ 85 86 // Find the end of the comment. 87 for (i = l->i; !end; i += !end) 88 { 89 // While we don't have an asterisk, eat, but increment nlines. 90 for (; (c = buf[i]) && c != '*'; ++i) 91 { 92 nlines += (c == '\n'); 93 } 94 95 // If this is true, we need to request more data. 96 if (BC_ERR(!c || buf[i + 1] == '\0')) 97 { 98 #if !BC_ENABLE_OSSFUZZ 99 // Read more, if possible. 100 if (!vm->eof && l->mode != BC_MODE_FILE) 101 { 102 got_more = bc_lex_readLine(l); 103 } 104 #endif // !BC_ENABLE_OSSFUZZ 105 106 break; 107 } 108 109 // If this turns true, we found the end. Yay! 110 end = (buf[i + 1] == '/'); 111 } 112 } 113 while (got_more && !end); 114 115 // If we didn't find the end, barf. 116 if (!end) 117 { 118 l->i = i; 119 bc_lex_err(l, BC_ERR_PARSE_COMMENT); 120 } 121 122 l->i = i + 2; 123 l->line += nlines; 124 } 125 126 void 127 bc_lex_whitespace(BcLex* l) 128 { 129 char c; 130 131 l->t = BC_LEX_WHITESPACE; 132 133 // Eat. We don't eat newlines because they can be special. 134 for (c = l->buf[l->i]; c != '\n' && isspace(c); c = l->buf[++l->i]) 135 { 136 continue; 137 } 138 } 139 140 void 141 bc_lex_commonTokens(BcLex* l, char c) 142 { 143 if (!c) l->t = BC_LEX_EOF; 144 else if (c == '\n') l->t = BC_LEX_NLINE; 145 else bc_lex_whitespace(l); 146 } 147 148 /** 149 * Parses a number. 150 * @param l The lexer. 151 * @param start The start character. 152 * @param int_only Whether this function should only look for an integer. This 153 * is used to implement the exponent of scientific notation. 154 */ 155 static size_t 156 bc_lex_num(BcLex* l, char start, bool int_only) 157 { 158 const char* buf = l->buf + l->i; 159 size_t i; 160 char c; 161 bool last_pt, pt = (start == '.'); 162 163 // This loop looks complex. It is not. It is asking if the character is not 164 // a nul byte and it if it a valid num character based on what we have found 165 // thus far, or whether it is a backslash followed by a newline. I can do 166 // i+1 on the buffer because the buffer must have a nul byte. 167 for (i = 0; (c = buf[i]) && (BC_LEX_NUM_CHAR(c, pt, int_only) || 168 (c == '\\' && buf[i + 1] == '\n')); 169 ++i) 170 { 171 // I don't need to test that the next character is a newline because 172 // the loop condition above ensures that. 173 if (c == '\\') 174 { 175 i += 2; 176 177 // Make sure to eat whitespace at the beginning of the line. 178 while (isspace(buf[i]) && buf[i] != '\n') 179 { 180 i += 1; 181 } 182 183 c = buf[i]; 184 185 // If the next character is not a number character, bail. 186 if (!BC_LEX_NUM_CHAR(c, pt, int_only)) break; 187 } 188 189 // Did we find the radix point? 190 last_pt = (c == '.'); 191 192 // If we did, and we already have one, then break because it's not part 193 // of this number. 194 if (pt && last_pt) break; 195 196 // Set whether we have found a radix point. 197 pt = pt || last_pt; 198 199 bc_vec_push(&l->str, &c); 200 } 201 202 return i; 203 } 204 205 void 206 bc_lex_number(BcLex* l, char start) 207 { 208 l->t = BC_LEX_NUMBER; 209 210 // Make sure the string is clear. 211 bc_vec_popAll(&l->str); 212 bc_vec_push(&l->str, &start); 213 214 // Parse the number. 215 l->i += bc_lex_num(l, start, false); 216 217 #if BC_ENABLE_EXTRA_MATH 218 { 219 char c = l->buf[l->i]; 220 221 // Do we have a number in scientific notation? 222 if (c == 'e') 223 { 224 #if BC_ENABLED 225 // Barf for POSIX. 226 if (BC_IS_POSIX) bc_lex_err(l, BC_ERR_POSIX_EXP_NUM); 227 #endif // BC_ENABLED 228 229 // Push the e. 230 bc_vec_push(&l->str, &c); 231 l->i += 1; 232 c = l->buf[l->i]; 233 234 // Check for negative specifically because bc_lex_num() does not. 235 if (c == BC_LEX_NEG_CHAR) 236 { 237 bc_vec_push(&l->str, &c); 238 l->i += 1; 239 c = l->buf[l->i]; 240 } 241 242 // We must have a number character, so barf if not. 243 if (BC_ERR(!BC_LEX_NUM_CHAR(c, false, true))) 244 { 245 bc_lex_verr(l, BC_ERR_PARSE_CHAR, c); 246 } 247 248 // Parse the exponent. 249 l->i += bc_lex_num(l, 0, true); 250 } 251 } 252 #endif // BC_ENABLE_EXTRA_MATH 253 254 bc_vec_pushByte(&l->str, '\0'); 255 } 256 257 void 258 bc_lex_name(BcLex* l) 259 { 260 size_t i = 0; 261 const char* buf = l->buf + l->i - 1; 262 char c = buf[i]; 263 264 l->t = BC_LEX_NAME; 265 266 // Should be obvious. It's looking for valid characters. 267 while ((c >= 'a' && c <= 'z') || isdigit(c) || c == '_') 268 { 269 c = buf[++i]; 270 } 271 272 // Set the string to the identifier. 273 bc_vec_string(&l->str, i, buf); 274 275 // Increment the index. We minus 1 because it has already been incremented. 276 l->i += i - 1; 277 } 278 279 void 280 bc_lex_init(BcLex* l) 281 { 282 BC_SIG_ASSERT_LOCKED; 283 assert(l != NULL); 284 bc_vec_init(&l->str, sizeof(char), BC_DTOR_NONE); 285 } 286 287 void 288 bc_lex_free(BcLex* l) 289 { 290 BC_SIG_ASSERT_LOCKED; 291 assert(l != NULL); 292 bc_vec_free(&l->str); 293 } 294 295 void 296 bc_lex_file(BcLex* l, const char* file) 297 { 298 assert(l != NULL && file != NULL); 299 l->line = 1; 300 vm->file = file; 301 } 302 303 void 304 bc_lex_next(BcLex* l) 305 { 306 BC_SIG_ASSERT_LOCKED; 307 308 assert(l != NULL); 309 310 l->last = l->t; 311 312 // If this wasn't here, the line number would be off. 313 l->line += (l->i != 0 && l->buf[l->i - 1] == '\n'); 314 315 // If the last token was EOF, someone called this one too many times. 316 if (BC_ERR(l->last == BC_LEX_EOF)) bc_lex_err(l, BC_ERR_PARSE_EOF); 317 318 l->t = BC_LEX_EOF; 319 320 // We are done if this is true. 321 if (l->i == l->len) return; 322 323 // Loop until failure or we don't have whitespace. This 324 // is so the parser doesn't get inundated with whitespace. 325 do 326 { 327 vm->next(l); 328 } 329 while (l->t == BC_LEX_WHITESPACE); 330 } 331 332 /** 333 * Updates the buffer and len so that they are not invalidated when the stdin 334 * buffer grows. 335 * @param l The lexer. 336 * @param text The text. 337 * @param len The length of the text. 338 */ 339 static void 340 bc_lex_fixText(BcLex* l, const char* text, size_t len) 341 { 342 l->buf = text; 343 l->len = len; 344 } 345 346 bool 347 bc_lex_readLine(BcLex* l) 348 { 349 bool good; 350 351 // These are reversed because they should be already locked, but 352 // bc_vm_readLine() needs them to be unlocked. 353 BC_SIG_UNLOCK; 354 355 // Make sure we read from the appropriate place. 356 switch (l->mode) 357 { 358 case BC_MODE_EXPRS: 359 { 360 good = bc_vm_readBuf(false); 361 break; 362 } 363 364 case BC_MODE_FILE: 365 { 366 good = false; 367 break; 368 } 369 370 #if !BC_ENABLE_OSSFUZZ 371 372 case BC_MODE_STDIN: 373 { 374 good = bc_vm_readLine(false); 375 break; 376 } 377 378 #endif // !BC_ENABLE_OSSFUZZ 379 380 #ifdef __GNUC__ 381 #ifndef __clang__ 382 default: 383 { 384 // We should never get here. 385 abort(); 386 } 387 #endif // __clang__ 388 #endif // __GNUC__ 389 } 390 391 BC_SIG_LOCK; 392 393 bc_lex_fixText(l, vm->buffer.v, vm->buffer.len - 1); 394 395 return good; 396 } 397 398 void 399 bc_lex_text(BcLex* l, const char* text, BcMode mode) 400 { 401 BC_SIG_ASSERT_LOCKED; 402 403 assert(l != NULL && text != NULL); 404 405 bc_lex_fixText(l, text, strlen(text)); 406 l->i = 0; 407 l->t = l->last = BC_LEX_INVALID; 408 l->mode = mode; 409 410 bc_lex_next(l); 411 } 412