1 /*
2 * *****************************************************************************
3 *
4 * SPDX-License-Identifier: BSD-2-Clause
5 *
6 * Copyright (c) 2018-2024 Gavin D. Howard and contributors.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions are met:
10 *
11 * * Redistributions of source code must retain the above copyright notice, this
12 * list of conditions and the following disclaimer.
13 *
14 * * Redistributions in binary form must reproduce the above copyright notice,
15 * this list of conditions and the following disclaimer in the documentation
16 * and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
22 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 * POSSIBILITY OF SUCH DAMAGE.
29 *
30 * *****************************************************************************
31 *
32 * The lexer for dc.
33 *
34 */
35
36 #if DC_ENABLED
37
38 #include <ctype.h>
39
40 #include <dc.h>
41 #include <vm.h>
42
43 bool
dc_lex_negCommand(BcLex * l)44 dc_lex_negCommand(BcLex* l)
45 {
46 char c = l->buf[l->i];
47 return !BC_LEX_NUM_CHAR(c, false, false);
48 }
49
50 /**
51 * Processes a dc command that needs a register. This is where the
52 * extended-register extension is implemented.
53 * @param l The lexer.
54 */
55 static void
dc_lex_register(BcLex * l)56 dc_lex_register(BcLex* l)
57 {
58 // If extended register is enabled and the character is whitespace...
59 if (DC_X && isspace(l->buf[l->i - 1]))
60 {
61 char c;
62
63 // Eat the whitespace.
64 bc_lex_whitespace(l);
65 c = l->buf[l->i];
66
67 // Check for a letter or underscore.
68 if (BC_ERR(!isalpha(c) && c != '_'))
69 {
70 bc_lex_verr(l, BC_ERR_PARSE_CHAR, c);
71 }
72
73 // Parse a normal identifier.
74 l->i += 1;
75 bc_lex_name(l);
76 }
77 else
78 {
79 // I don't allow newlines because newlines are used for controlling when
80 // execution happens, and allowing newlines would just be complex.
81 if (BC_ERR(l->buf[l->i - 1] == '\n'))
82 {
83 bc_lex_verr(l, BC_ERR_PARSE_CHAR, l->buf[l->i - 1]);
84 }
85
86 // Set the lexer string and token.
87 bc_vec_popAll(&l->str);
88 bc_vec_pushByte(&l->str, (uchar) l->buf[l->i - 1]);
89 bc_vec_pushByte(&l->str, '\0');
90 l->t = BC_LEX_NAME;
91 }
92 }
93
94 /**
95 * Parses a dc string. Since dc's strings need to check for balanced brackets,
96 * we can't just parse bc and dc strings with different start and end
97 * characters. Oh, and dc strings need to check for escaped brackets.
98 * @param l The lexer.
99 */
100 static void
dc_lex_string(BcLex * l)101 dc_lex_string(BcLex* l)
102 {
103 size_t depth, nls, i;
104 char c;
105 bool got_more;
106
107 // Set the token and clear the string.
108 l->t = BC_LEX_STR;
109 bc_vec_popAll(&l->str);
110
111 do
112 {
113 depth = 1;
114 nls = 0;
115 got_more = false;
116
117 #if !BC_ENABLE_OSSFUZZ
118 assert(l->mode != BC_MODE_STDIN || l->buf == vm->buffer.v);
119 #endif // !BC_ENABLE_OSSFUZZ
120
121 // This is the meat. As long as we don't run into the NUL byte, and we
122 // have "depth", which means we haven't completely balanced brackets
123 // yet, we continue eating the string.
124 for (i = l->i; (c = l->buf[i]) && depth; ++i)
125 {
126 // Check for escaped brackets and set the depths as appropriate.
127 if (c == '\\')
128 {
129 c = l->buf[++i];
130 if (!c) break;
131 }
132 else
133 {
134 depth += (c == '[');
135 depth -= (c == ']');
136 }
137
138 // We want to adjust the line in the lexer as necessary.
139 nls += (c == '\n');
140
141 if (depth) bc_vec_push(&l->str, &c);
142 }
143
144 if (BC_ERR(c == '\0' && depth))
145 {
146 if (!vm->eof && l->mode != BC_MODE_FILE)
147 {
148 got_more = bc_lex_readLine(l);
149 }
150
151 if (got_more)
152 {
153 bc_vec_popAll(&l->str);
154 }
155 }
156 }
157 while (got_more && depth);
158
159 // Obviously, if we didn't balance, that's an error.
160 if (BC_ERR(c == '\0' && depth))
161 {
162 l->i = i;
163 bc_lex_err(l, BC_ERR_PARSE_STRING);
164 }
165
166 bc_vec_pushByte(&l->str, '\0');
167
168 l->i = i;
169 l->line += nls;
170 }
171
172 /**
173 * Lexes a dc token. This is the dc implementation of BcLexNext.
174 * @param l The lexer.
175 */
176 void
dc_lex_token(BcLex * l)177 dc_lex_token(BcLex* l)
178 {
179 char c = l->buf[l->i++], c2;
180 size_t i;
181
182 BC_SIG_ASSERT_LOCKED;
183
184 // If the last token was a command that needs a register, we need to parse a
185 // register, so do so.
186 for (i = 0; i < dc_lex_regs_len; ++i)
187 {
188 // If the token is a register token, take care of it and return.
189 if (l->last == dc_lex_regs[i])
190 {
191 dc_lex_register(l);
192 return;
193 }
194 }
195
196 // These lines are for tokens that easily correspond to one character. We
197 // just set the token.
198 if (c >= '"' && c <= '~' &&
199 (l->t = dc_lex_tokens[(c - '"')]) != BC_LEX_INVALID)
200 {
201 return;
202 }
203
204 // This is the workhorse of the lexer when more complicated things are
205 // needed.
206 switch (c)
207 {
208 case '\0':
209 case '\n':
210 case '\t':
211 case '\v':
212 case '\f':
213 case '\r':
214 case ' ':
215 {
216 bc_lex_commonTokens(l, c);
217 break;
218 }
219
220 // We don't have the ! command, so we always expect certain things
221 // after the exclamation point.
222 case '!':
223 {
224 c2 = l->buf[l->i];
225
226 if (c2 == '=') l->t = BC_LEX_OP_REL_NE;
227 else if (c2 == '<') l->t = BC_LEX_OP_REL_LE;
228 else if (c2 == '>') l->t = BC_LEX_OP_REL_GE;
229 else bc_lex_invalidChar(l, c);
230
231 l->i += 1;
232
233 break;
234 }
235
236 case '#':
237 {
238 bc_lex_lineComment(l);
239 break;
240 }
241
242 case '.':
243 {
244 c2 = l->buf[l->i];
245
246 // If the character after is a number, this dot is part of a number.
247 // Otherwise, it's the BSD dot (equivalent to last).
248 if (BC_NO_ERR(BC_LEX_NUM_CHAR(c2, true, false)))
249 {
250 bc_lex_number(l, c);
251 }
252 else bc_lex_invalidChar(l, c);
253
254 break;
255 }
256
257 case '0':
258 case '1':
259 case '2':
260 case '3':
261 case '4':
262 case '5':
263 case '6':
264 case '7':
265 case '8':
266 case '9':
267 case 'A':
268 case 'B':
269 case 'C':
270 case 'D':
271 case 'E':
272 case 'F':
273 {
274 bc_lex_number(l, c);
275 break;
276 }
277
278 case 'g':
279 {
280 c2 = l->buf[l->i];
281
282 if (c2 == 'l') l->t = BC_LEX_KW_LINE_LENGTH;
283 else if (c2 == 'x') l->t = BC_LEX_EXTENDED_REGISTERS;
284 else if (c2 == 'z') l->t = BC_LEX_KW_LEADING_ZERO;
285 else bc_lex_invalidChar(l, c2);
286
287 l->i += 1;
288
289 break;
290 }
291
292 case '[':
293 {
294 dc_lex_string(l);
295 break;
296 }
297
298 default:
299 {
300 bc_lex_invalidChar(l, c);
301 }
302 }
303 }
304 #endif // DC_ENABLED
305