1 /* 2 * ***************************************************************************** 3 * 4 * SPDX-License-Identifier: BSD-2-Clause 5 * 6 * Copyright (c) 2018-2021 Gavin D. Howard and contributors. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions are met: 10 * 11 * * Redistributions of source code must retain the above copyright notice, this 12 * list of conditions and the following disclaimer. 13 * 14 * * Redistributions in binary form must reproduce the above copyright notice, 15 * this list of conditions and the following disclaimer in the documentation 16 * and/or other materials provided with the distribution. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 22 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 23 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 24 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 25 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 26 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 27 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 * POSSIBILITY OF SUCH DAMAGE. 29 * 30 * ***************************************************************************** 31 * 32 * Common code for the lexers. 33 * 34 */ 35 36 #include <assert.h> 37 #include <ctype.h> 38 #include <stdbool.h> 39 #include <string.h> 40 41 #include <lex.h> 42 #include <vm.h> 43 #include <bc.h> 44 45 void bc_lex_invalidChar(BcLex *l, char c) { 46 l->t = BC_LEX_INVALID; 47 bc_lex_verr(l, BC_ERR_PARSE_CHAR, c); 48 } 49 50 void bc_lex_lineComment(BcLex *l) { 51 l->t = BC_LEX_WHITESPACE; 52 while (l->i < l->len && l->buf[l->i] != '\n') l->i += 1; 53 } 54 55 void bc_lex_comment(BcLex *l) { 56 57 size_t i, nlines = 0; 58 const char *buf; 59 bool end = false, got_more; 60 char c; 61 62 l->i += 1; 63 l->t = BC_LEX_WHITESPACE; 64 65 // This loop is complex because it might need to request more data from 66 // stdin if the comment is not ended. This loop is taken until the comment 67 // is finished or we have EOF. 68 do { 69 70 buf = l->buf; 71 got_more = false; 72 73 // If we are in stdin mode, the buffer must be the one used for stdin. 74 assert(!vm.is_stdin || buf == vm.buffer.v); 75 76 // Find the end of the comment. 77 for (i = l->i; !end; i += !end) { 78 79 // While we don't have an asterisk, eat, but increment nlines. 80 for (; (c = buf[i]) && c != '*'; ++i) nlines += (c == '\n'); 81 82 // If this is true, we need to request more data. 83 if (BC_ERR(!c || buf[i + 1] == '\0')) { 84 85 // Read more, if possible. 86 if (!vm.eof && (l->is_stdin || l->is_exprs)) 87 got_more = bc_lex_readLine(l); 88 89 break; 90 } 91 92 // If this turns true, we found the end. Yay! 93 end = (buf[i + 1] == '/'); 94 } 95 96 } while (got_more && !end); 97 98 // If we didn't find the end, barf. 99 if (!end) { 100 l->i = i; 101 bc_lex_err(l, BC_ERR_PARSE_COMMENT); 102 } 103 104 l->i = i + 2; 105 l->line += nlines; 106 } 107 108 void bc_lex_whitespace(BcLex *l) { 109 110 char c; 111 112 l->t = BC_LEX_WHITESPACE; 113 114 // Eat. We don't eat newlines because they can be special. 115 for (c = l->buf[l->i]; c != '\n' && isspace(c); c = l->buf[++l->i]); 116 } 117 118 void bc_lex_commonTokens(BcLex *l, char c) { 119 if (!c) l->t = BC_LEX_EOF; 120 else if (c == '\n') l->t = BC_LEX_NLINE; 121 else bc_lex_whitespace(l); 122 } 123 124 /** 125 * Parses a number. 126 * @param l The lexer. 127 * @param start The start character. 128 * @param int_only Whether this function should only look for an integer. This 129 * is used to implement the exponent of scientific notation. 130 */ 131 static size_t bc_lex_num(BcLex *l, char start, bool int_only) { 132 133 const char *buf = l->buf + l->i; 134 size_t i; 135 char c; 136 bool last_pt, pt = (start == '.'); 137 138 // This loop looks complex. It is not. It is asking if the character is not 139 // a nul byte and it if it a valid num character based on what we have found 140 // thus far, or whether it is a backslash followed by a newline. I can do 141 // i+1 on the buffer because the buffer must have a nul byte. 142 for (i = 0; (c = buf[i]) && (BC_LEX_NUM_CHAR(c, pt, int_only) || 143 (c == '\\' && buf[i + 1] == '\n')); ++i) 144 { 145 // I don't need to test that the next character is a newline because 146 // the loop condition above ensures that. 147 if (c == '\\') { 148 149 i += 2; 150 151 // Make sure to eat whitespace at the beginning of the line. 152 while(isspace(buf[i]) && buf[i] != '\n') i += 1; 153 154 c = buf[i]; 155 156 // If the next character is not a number character, bail. 157 if (!BC_LEX_NUM_CHAR(c, pt, int_only)) break; 158 } 159 160 // Did we find the radix point? 161 last_pt = (c == '.'); 162 163 // If we did, and we already have one, then break because it's not part 164 // of this number. 165 if (pt && last_pt) break; 166 167 // Set whether we have found a radix point. 168 pt = pt || last_pt; 169 170 bc_vec_push(&l->str, &c); 171 } 172 173 return i; 174 } 175 176 void bc_lex_number(BcLex *l, char start) { 177 178 l->t = BC_LEX_NUMBER; 179 180 // Make sure the string is clear. 181 bc_vec_popAll(&l->str); 182 bc_vec_push(&l->str, &start); 183 184 // Parse the number. 185 l->i += bc_lex_num(l, start, false); 186 187 #if BC_ENABLE_EXTRA_MATH 188 { 189 char c = l->buf[l->i]; 190 191 // Do we have a number in scientific notation? 192 if (c == 'e') { 193 194 #if BC_ENABLED 195 // Barf for POSIX. 196 if (BC_IS_POSIX) bc_lex_err(l, BC_ERR_POSIX_EXP_NUM); 197 #endif // BC_ENABLED 198 199 // Push the e. 200 bc_vec_push(&l->str, &c); 201 l->i += 1; 202 c = l->buf[l->i]; 203 204 // Check for negative specifically because bc_lex_num() does not. 205 if (c == BC_LEX_NEG_CHAR) { 206 bc_vec_push(&l->str, &c); 207 l->i += 1; 208 c = l->buf[l->i]; 209 } 210 211 // We must have a number character, so barf if not. 212 if (BC_ERR(!BC_LEX_NUM_CHAR(c, false, true))) 213 bc_lex_verr(l, BC_ERR_PARSE_CHAR, c); 214 215 // Parse the exponent. 216 l->i += bc_lex_num(l, 0, true); 217 } 218 } 219 #endif // BC_ENABLE_EXTRA_MATH 220 221 bc_vec_pushByte(&l->str, '\0'); 222 } 223 224 void bc_lex_name(BcLex *l) { 225 226 size_t i = 0; 227 const char *buf = l->buf + l->i - 1; 228 char c = buf[i]; 229 230 l->t = BC_LEX_NAME; 231 232 // Should be obvious. It's looking for valid characters. 233 while ((c >= 'a' && c <= 'z') || isdigit(c) || c == '_') c = buf[++i]; 234 235 // Set the string to the identifier. 236 bc_vec_string(&l->str, i, buf); 237 238 // Increment the index. We minus 1 because it has already been incremented. 239 l->i += i - 1; 240 } 241 242 void bc_lex_init(BcLex *l) { 243 BC_SIG_ASSERT_LOCKED; 244 assert(l != NULL); 245 bc_vec_init(&l->str, sizeof(char), BC_DTOR_NONE); 246 } 247 248 void bc_lex_free(BcLex *l) { 249 BC_SIG_ASSERT_LOCKED; 250 assert(l != NULL); 251 bc_vec_free(&l->str); 252 } 253 254 void bc_lex_file(BcLex *l, const char *file) { 255 assert(l != NULL && file != NULL); 256 l->line = 1; 257 vm.file = file; 258 } 259 260 void bc_lex_next(BcLex *l) { 261 262 BC_SIG_ASSERT_LOCKED; 263 264 assert(l != NULL); 265 266 l->last = l->t; 267 268 // If this wasn't here, the line number would be off. 269 l->line += (l->i != 0 && l->buf[l->i - 1] == '\n'); 270 271 // If the last token was EOF, someone called this one too many times. 272 if (BC_ERR(l->last == BC_LEX_EOF)) bc_lex_err(l, BC_ERR_PARSE_EOF); 273 274 l->t = BC_LEX_EOF; 275 276 // We are done if this is true. 277 if (l->i == l->len) return; 278 279 // Loop until failure or we don't have whitespace. This 280 // is so the parser doesn't get inundated with whitespace. 281 do { 282 vm.next(l); 283 } while (l->t == BC_LEX_WHITESPACE); 284 } 285 286 /** 287 * Updates the buffer and len so that they are not invalidated when the stdin 288 * buffer grows. 289 * @param l The lexer. 290 * @param text The text. 291 * @param len The length of the text. 292 */ 293 static void bc_lex_fixText(BcLex *l, const char *text, size_t len) { 294 l->buf = text; 295 l->len = len; 296 } 297 298 bool bc_lex_readLine(BcLex *l) { 299 300 bool good; 301 302 // These are reversed because they should be already locked, but 303 // bc_vm_readLine() needs them to be unlocked. 304 BC_SIG_UNLOCK; 305 306 // Make sure we read from the appropriate place. 307 if (l->is_stdin) good = bc_vm_readLine(false); 308 else { 309 assert(l->is_exprs); 310 good = bc_vm_readBuf(false); 311 } 312 313 BC_SIG_LOCK; 314 315 bc_lex_fixText(l, vm.buffer.v, vm.buffer.len - 1); 316 317 return good; 318 } 319 320 void bc_lex_text(BcLex *l, const char *text, bool is_stdin, bool is_exprs) { 321 322 BC_SIG_ASSERT_LOCKED; 323 324 assert(l != NULL && text != NULL); 325 326 bc_lex_fixText(l, text, strlen(text)); 327 l->i = 0; 328 l->t = l->last = BC_LEX_INVALID; 329 l->is_stdin = is_stdin; 330 l->is_exprs = is_exprs; 331 332 assert(!l->is_stdin || !l->is_exprs); 333 334 bc_lex_next(l); 335 } 336