/*
 * *****************************************************************************
 *
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2018-2024 Gavin D. Howard and contributors.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * * Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 *
 * * Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * *****************************************************************************
 *
 * The lexer for dc.
 *
 */

#if DC_ENABLED

#include <ctype.h>

#include <dc.h>
#include <vm.h>

/**
 * Reports whether the character at the lexer's current index is something
 * other than a number character, as decided by BC_LEX_NUM_CHAR(c, false,
 * false).
 * NOTE(review): judging by the name, callers presumably use this to decide
 * whether a preceding '_' is a negation command rather than the sign of a
 * number literal -- confirm against the parser.
 * @param l  The lexer.
 * @return   True if the current character cannot be part of a number.
 */
bool
dc_lex_negCommand(BcLex* l)
{
	char c = l->buf[l->i];
	return !BC_LEX_NUM_CHAR(c, false, false);
}

/**
 * Processes a dc command that needs a register. This is where the
 * extended-register extension is implemented.
 *
 * On entry, the register character has already been consumed by
 * dc_lex_token(), so it lives at l->buf[l->i - 1].
 * @param l  The lexer.
 */
static void
dc_lex_register(BcLex* l)
{
	// The <ctype.h> classifiers require an argument representable as an
	// unsigned char (or EOF). Plain char may be signed, so bytes >= 0x80 must
	// be converted first or the call is undefined behavior.

	// If extended register is enabled and the character is whitespace...
	if (DC_X && isspace((unsigned char) l->buf[l->i - 1]))
	{
		char c;

		// Eat the whitespace.
		bc_lex_whitespace(l);
		c = l->buf[l->i];

		// Check for a letter or underscore.
		if (BC_ERR(!isalpha((unsigned char) c) && c != '_'))
		{
			bc_lex_verr(l, BC_ERR_PARSE_CHAR, c);
		}

		// Parse a normal identifier.
		l->i += 1;
		bc_lex_name(l);
	}
	else
	{
		// I don't allow newlines because newlines are used for controlling when
		// execution happens, and allowing newlines would just be complex.
		if (BC_ERR(l->buf[l->i - 1] == '\n'))
		{
			bc_lex_verr(l, BC_ERR_PARSE_CHAR, l->buf[l->i - 1]);
		}

		// Set the lexer string and token. The register name is the single
		// character that was just consumed.
		bc_vec_popAll(&l->str);
		bc_vec_pushByte(&l->str, (uchar) l->buf[l->i - 1]);
		bc_vec_pushByte(&l->str, '\0');
		l->t = BC_LEX_NAME;
	}
}

/**
 * Parses a dc string. Since dc's strings need to check for balanced brackets,
 * we can't just parse bc and dc strings with different start and end
 * characters. Oh, and dc strings need to check for escaped brackets.
 *
 * On return, l->str holds the NUL-terminated string contents (without the
 * outer brackets and with escaping backslashes removed), l->i points just past
 * the closing bracket, and l->line has been advanced by the number of newlines
 * in the string.
 * @param l  The lexer.
 */
static void
dc_lex_string(BcLex* l)
{
	size_t depth, nls, i;
	char c;
	bool got_more;

	// Set the token and clear the string.
	l->t = BC_LEX_STR;
	bc_vec_popAll(&l->str);

	do
	{
		// Every pass rescans from l->i, so the counters start fresh.
		depth = 1;
		nls = 0;
		got_more = false;

#if !BC_ENABLE_OSSFUZZ
		assert(l->mode != BC_MODE_STDIN || l->buf == vm->buffer.v);
#endif // !BC_ENABLE_OSSFUZZ

		// This is the meat. As long as we don't run into the NUL byte, and we
		// have "depth", which means we haven't completely balanced brackets
		// yet, we continue eating the string.
		for (i = l->i; (c = l->buf[i]) && depth; ++i)
		{
			// Check for escaped brackets and set the depths as appropriate.
			if (c == '\\')
			{
				// The escaped character is taken literally and does not
				// affect the bracket depth.
				c = l->buf[++i];
				if (!c) break;
			}
			else
			{
				depth += (c == '[');
				depth -= (c == ']');
			}

			// We want to adjust the line in the lexer as necessary.
			nls += (c == '\n');

			// The final closing bracket (depth == 0) is not part of the
			// string.
			if (depth) bc_vec_push(&l->str, &c);
		}

		// Ran out of input with brackets still open: try to read more, and if
		// that works, throw away the partial string and rescan.
		if (BC_ERR(c == '\0' && depth))
		{
			if (!vm->eof && l->mode != BC_MODE_FILE)
			{
				got_more = bc_lex_readLine(l);
			}

			if (got_more)
			{
				bc_vec_popAll(&l->str);
			}
		}
	}
	while (got_more && depth);

	// Obviously, if we didn't balance, that's an error.
	if (BC_ERR(c == '\0' && depth))
	{
		l->i = i;
		bc_lex_err(l, BC_ERR_PARSE_STRING);
	}

	bc_vec_pushByte(&l->str, '\0');

	l->i = i;
	l->line += nls;
}

/**
 * Lexes a dc token. This is the dc implementation of BcLexNext.
 * @param l  The lexer.
 */
void
dc_lex_token(BcLex* l)
{
	// Consume the next character up front; helpers that need it again read it
	// back from l->buf[l->i - 1].
	char c = l->buf[l->i++], c2;
	size_t i;

	BC_SIG_ASSERT_LOCKED;

	// If the last token was a command that needs a register, we need to parse a
	// register, so do so.
	for (i = 0; i < dc_lex_regs_len; ++i)
	{
		// If the token is a register token, take care of it and return.
		if (l->last == dc_lex_regs[i])
		{
			dc_lex_register(l);
			return;
		}
	}

	// These lines are for tokens that easily correspond to one character. We
	// just set the token. The table is indexed from '"', and entries that need
	// more work are marked BC_LEX_INVALID so they fall through to the switch.
	if (c >= '"' && c <= '~' &&
	    (l->t = dc_lex_tokens[(c - '"')]) != BC_LEX_INVALID)
	{
		return;
	}

	// This is the workhorse of the lexer when more complicated things are
	// needed.
	switch (c)
	{
		case '\0':
		case '\n':
		case '\t':
		case '\v':
		case '\f':
		case '\r':
		case ' ':
		{
			bc_lex_commonTokens(l, c);
			break;
		}

		// We don't have the ! command, so we always expect certain things
		// after the exclamation point.
		case '!':
		{
			c2 = l->buf[l->i];

			if (c2 == '=') l->t = BC_LEX_OP_REL_NE;
			else if (c2 == '<') l->t = BC_LEX_OP_REL_LE;
			else if (c2 == '>') l->t = BC_LEX_OP_REL_GE;
			else bc_lex_invalidChar(l, c);

			// Skip the second character of the two-character operator.
			l->i += 1;

			break;
		}

		case '#':
		{
			bc_lex_lineComment(l);
			break;
		}

		case '.':
		{
			c2 = l->buf[l->i];

			// If the character after is a number, this dot is part of a number.
			// Otherwise, a lone dot is not a valid dc command.
			if (BC_NO_ERR(BC_LEX_NUM_CHAR(c2, true, false)))
			{
				bc_lex_number(l, c);
			}
			else bc_lex_invalidChar(l, c);

			break;
		}

		case '0':
		case '1':
		case '2':
		case '3':
		case '4':
		case '5':
		case '6':
		case '7':
		case '8':
		case '9':
		case 'A':
		case 'B':
		case 'C':
		case 'D':
		case 'E':
		case 'F':
		{
			bc_lex_number(l, c);
			break;
		}

		// 'g' introduces two-character commands; the second character picks
		// which one.
		case 'g':
		{
			c2 = l->buf[l->i];

			if (c2 == 'l') l->t = BC_LEX_KW_LINE_LENGTH;
			else if (c2 == 'x') l->t = BC_LEX_EXTENDED_REGISTERS;
			else if (c2 == 'z') l->t = BC_LEX_KW_LEADING_ZERO;
			else bc_lex_invalidChar(l, c2);

			// Skip the second character of the command.
			l->i += 1;

			break;
		}

		case '[':
		{
			dc_lex_string(l);
			break;
		}

		default:
		{
			bc_lex_invalidChar(l, c);
		}
	}
}
#endif // DC_ENABLED