/* * ***************************************************************************** * * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2018-2024 Gavin D. Howard and contributors. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * * Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * ***************************************************************************** * * Common code for the lexers. * */ #include #include #include #include #include #include #include void bc_lex_invalidChar(BcLex* l, char c) { l->t = BC_LEX_INVALID; bc_lex_verr(l, BC_ERR_PARSE_CHAR, c); } void bc_lex_lineComment(BcLex* l) { l->t = BC_LEX_WHITESPACE; while (l->i < l->len && l->buf[l->i] != '\n') { l->i += 1; } } void bc_lex_comment(BcLex* l) { size_t i, nlines = 0; const char* buf; bool end = false, got_more; char c; l->i += 1; l->t = BC_LEX_WHITESPACE; // This loop is complex because it might need to request more data from // stdin if the comment is not ended. This loop is taken until the comment // is finished or we have EOF. do { buf = l->buf; got_more = false; // If we are in stdin mode, the buffer must be the one used for stdin. #if !BC_ENABLE_OSSFUZZ assert(vm->mode != BC_MODE_STDIN || buf == vm->buffer.v); #endif // !BC_ENABLE_OSSFUZZ // Find the end of the comment. for (i = l->i; !end; i += !end) { // While we don't have an asterisk, eat, but increment nlines. for (; (c = buf[i]) && c != '*'; ++i) { nlines += (c == '\n'); } // If this is true, we need to request more data. if (BC_ERR(!c || buf[i + 1] == '\0')) { #if !BC_ENABLE_OSSFUZZ // Read more, if possible. if (!vm->eof && l->mode != BC_MODE_FILE) { got_more = bc_lex_readLine(l); } #endif // !BC_ENABLE_OSSFUZZ break; } // If this turns true, we found the end. Yay! end = (buf[i + 1] == '/'); } } while (got_more && !end); // If we didn't find the end, barf. if (!end) { l->i = i; bc_lex_err(l, BC_ERR_PARSE_COMMENT); } l->i = i + 2; l->line += nlines; } void bc_lex_whitespace(BcLex* l) { char c; l->t = BC_LEX_WHITESPACE; // Eat. We don't eat newlines because they can be special. for (c = l->buf[l->i]; c != '\n' && isspace(c); c = l->buf[++l->i]) { continue; } } void bc_lex_commonTokens(BcLex* l, char c) { if (!c) l->t = BC_LEX_EOF; else if (c == '\n') l->t = BC_LEX_NLINE; else bc_lex_whitespace(l); } /** * Parses a number. * @param l The lexer. * @param start The start character. * @param int_only Whether this function should only look for an integer. This * is used to implement the exponent of scientific notation. */ static size_t bc_lex_num(BcLex* l, char start, bool int_only) { const char* buf = l->buf + l->i; size_t i; char c; bool last_pt, pt = (start == '.'); // This loop looks complex. It is not. It is asking if the character is not // a nul byte and it if it a valid num character based on what we have found // thus far, or whether it is a backslash followed by a newline. I can do // i+1 on the buffer because the buffer must have a nul byte. for (i = 0; (c = buf[i]) && (BC_LEX_NUM_CHAR(c, pt, int_only) || (c == '\\' && buf[i + 1] == '\n')); ++i) { // I don't need to test that the next character is a newline because // the loop condition above ensures that. if (c == '\\') { i += 2; // Make sure to eat whitespace at the beginning of the line. while (isspace(buf[i]) && buf[i] != '\n') { i += 1; } c = buf[i]; // If the next character is not a number character, bail. if (!BC_LEX_NUM_CHAR(c, pt, int_only)) break; } // Did we find the radix point? last_pt = (c == '.'); // If we did, and we already have one, then break because it's not part // of this number. if (pt && last_pt) break; // Set whether we have found a radix point. pt = pt || last_pt; bc_vec_push(&l->str, &c); } return i; } void bc_lex_number(BcLex* l, char start) { l->t = BC_LEX_NUMBER; // Make sure the string is clear. bc_vec_popAll(&l->str); bc_vec_push(&l->str, &start); // Parse the number. l->i += bc_lex_num(l, start, false); #if BC_ENABLE_EXTRA_MATH { char c = l->buf[l->i]; // Do we have a number in scientific notation? if (c == 'e') { #if BC_ENABLED // Barf for POSIX. if (BC_IS_POSIX) bc_lex_err(l, BC_ERR_POSIX_EXP_NUM); #endif // BC_ENABLED // Push the e. bc_vec_push(&l->str, &c); l->i += 1; c = l->buf[l->i]; // Check for negative specifically because bc_lex_num() does not. if (c == BC_LEX_NEG_CHAR) { bc_vec_push(&l->str, &c); l->i += 1; c = l->buf[l->i]; } // We must have a number character, so barf if not. if (BC_ERR(!BC_LEX_NUM_CHAR(c, false, true))) { bc_lex_verr(l, BC_ERR_PARSE_CHAR, c); } // Parse the exponent. l->i += bc_lex_num(l, 0, true); } } #endif // BC_ENABLE_EXTRA_MATH bc_vec_pushByte(&l->str, '\0'); } void bc_lex_name(BcLex* l) { size_t i = 0; const char* buf = l->buf + l->i - 1; char c = buf[i]; l->t = BC_LEX_NAME; // Should be obvious. It's looking for valid characters. while ((c >= 'a' && c <= 'z') || isdigit(c) || c == '_') { c = buf[++i]; } // Set the string to the identifier. bc_vec_string(&l->str, i, buf); // Increment the index. We minus 1 because it has already been incremented. l->i += i - 1; } void bc_lex_init(BcLex* l) { BC_SIG_ASSERT_LOCKED; assert(l != NULL); bc_vec_init(&l->str, sizeof(char), BC_DTOR_NONE); } void bc_lex_free(BcLex* l) { BC_SIG_ASSERT_LOCKED; assert(l != NULL); bc_vec_free(&l->str); } void bc_lex_file(BcLex* l, const char* file) { assert(l != NULL && file != NULL); l->line = 1; vm->file = file; } void bc_lex_next(BcLex* l) { BC_SIG_ASSERT_LOCKED; assert(l != NULL); l->last = l->t; // If this wasn't here, the line number would be off. l->line += (l->i != 0 && l->buf[l->i - 1] == '\n'); // If the last token was EOF, someone called this one too many times. if (BC_ERR(l->last == BC_LEX_EOF)) bc_lex_err(l, BC_ERR_PARSE_EOF); l->t = BC_LEX_EOF; // We are done if this is true. if (l->i == l->len) return; // Loop until failure or we don't have whitespace. This // is so the parser doesn't get inundated with whitespace. do { vm->next(l); } while (l->t == BC_LEX_WHITESPACE); } /** * Updates the buffer and len so that they are not invalidated when the stdin * buffer grows. * @param l The lexer. * @param text The text. * @param len The length of the text. */ static void bc_lex_fixText(BcLex* l, const char* text, size_t len) { l->buf = text; l->len = len; } bool bc_lex_readLine(BcLex* l) { bool good; // These are reversed because they should be already locked, but // bc_vm_readLine() needs them to be unlocked. BC_SIG_UNLOCK; // Make sure we read from the appropriate place. switch (l->mode) { case BC_MODE_EXPRS: { good = bc_vm_readBuf(false); break; } case BC_MODE_FILE: { good = false; break; } #if !BC_ENABLE_OSSFUZZ case BC_MODE_STDIN: { good = bc_vm_readLine(false); break; } #endif // !BC_ENABLE_OSSFUZZ #ifdef __GNUC__ #ifndef __clang__ default: { // We should never get here. abort(); } #endif // __clang__ #endif // __GNUC__ } BC_SIG_LOCK; bc_lex_fixText(l, vm->buffer.v, vm->buffer.len - 1); return good; } void bc_lex_text(BcLex* l, const char* text, BcMode mode) { BC_SIG_ASSERT_LOCKED; assert(l != NULL && text != NULL); bc_lex_fixText(l, text, strlen(text)); l->i = 0; l->t = l->last = BC_LEX_INVALID; l->mode = mode; bc_lex_next(l); }