1 /*
2 * *****************************************************************************
3 *
4 * SPDX-License-Identifier: BSD-2-Clause
5 *
6 * Copyright (c) 2018-2024 Gavin D. Howard and contributors.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions are met:
10 *
11 * * Redistributions of source code must retain the above copyright notice, this
12 * list of conditions and the following disclaimer.
13 *
14 * * Redistributions in binary form must reproduce the above copyright notice,
15 * this list of conditions and the following disclaimer in the documentation
16 * and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
22 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 * POSSIBILITY OF SUCH DAMAGE.
29 *
30 * *****************************************************************************
31 *
32 * The lexer for bc.
33 *
34 */
35
36 #if BC_ENABLED
37
38 #include <assert.h>
39 #include <ctype.h>
40 #include <string.h>
41
42 #include <bc.h>
43 #include <vm.h>
44
45 /**
46 * Lexes an identifier, which may be a keyword.
47 * @param l The lexer.
48 */
49 static void
bc_lex_identifier(BcLex * l)50 bc_lex_identifier(BcLex* l)
51 {
52 // We already passed the first character, so we need to be sure to include
53 // it.
54 const char* buf = l->buf + l->i - 1;
55 size_t i;
56
57 // This loop is simply checking for keywords.
58 for (i = 0; i < bc_lex_kws_len; ++i)
59 {
60 const BcLexKeyword* kw = bc_lex_kws + i;
61 size_t n = BC_LEX_KW_LEN(kw);
62
63 if (!strncmp(buf, kw->name, n) && !isalnum(buf[n]) && buf[n] != '_')
64 {
65 // If the keyword has been redefined, and redefinition is allowed
66 // (it is not allowed for builtin libraries), break out of the loop
67 // and use it as a name. This depends on the argument parser to
68 // ensure that only non-POSIX keywords get redefined.
69 if (!vm->no_redefine && vm->redefined_kws[i]) break;
70
71 l->t = BC_LEX_KW_AUTO + (BcLexType) i;
72
73 // Warn or error, as appropriate for the mode, if the keyword is not
74 // in the POSIX standard.
75 if (!BC_LEX_KW_POSIX(kw)) bc_lex_verr(l, BC_ERR_POSIX_KW, kw->name);
76
77 // We minus 1 because the index has already been incremented.
78 l->i += n - 1;
79
80 // Already have the token; bail.
81 return;
82 }
83 }
84
85 // If not a keyword, parse the name.
86 bc_lex_name(l);
87
88 // POSIX doesn't allow identifiers that are more than one character, so we
89 // might have to warn or error here too.
90 if (BC_ERR(l->str.len - 1 > 1))
91 {
92 bc_lex_verr(l, BC_ERR_POSIX_NAME_LEN, l->str.v);
93 }
94 }
95
96 /**
97 * Parses a bc string. This is separate from dc strings because dc strings need
98 * to be balanced.
99 * @param l The lexer.
100 */
101 static void
bc_lex_string(BcLex * l)102 bc_lex_string(BcLex* l)
103 {
104 // We need to keep track of newlines to increment them properly.
105 size_t len, nlines, i;
106 const char* buf;
107 char c;
108 bool got_more;
109
110 l->t = BC_LEX_STR;
111
112 do
113 {
114 nlines = 0;
115 buf = l->buf;
116 got_more = false;
117
118 #if !BC_ENABLE_OSSFUZZ
119 assert(vm->mode != BC_MODE_STDIN || buf == vm->buffer.v);
120 #endif // !BC_ENABLE_OSSFUZZ
121
122 // Fortunately for us, bc doesn't escape quotes. Instead, the equivalent
123 // is '\q', which makes this loop simpler.
124 for (i = l->i; (c = buf[i]) && c != '"'; ++i)
125 {
126 nlines += (c == '\n');
127 }
128
129 if (BC_ERR(c == '\0') && !vm->eof && l->mode != BC_MODE_FILE)
130 {
131 got_more = bc_lex_readLine(l);
132 }
133 }
134 while (got_more && c != '"');
135
136 // If the string did not end properly, barf.
137 if (c != '"')
138 {
139 l->i = i;
140 bc_lex_err(l, BC_ERR_PARSE_STRING);
141 }
142
143 // Set the temp string to the parsed string.
144 len = i - l->i;
145 bc_vec_string(&l->str, len, l->buf + l->i);
146
147 l->i = i + 1;
148 l->line += nlines;
149 }
150
151 /**
152 * This function takes a lexed operator and checks to see if it's the assignment
153 * version, setting the token appropriately.
154 * @param l The lexer.
155 * @param with The token to assign if it is an assignment operator.
156 * @param without The token to assign if it is not an assignment operator.
157 */
158 static void
bc_lex_assign(BcLex * l,BcLexType with,BcLexType without)159 bc_lex_assign(BcLex* l, BcLexType with, BcLexType without)
160 {
161 if (l->buf[l->i] == '=')
162 {
163 l->i += 1;
164 l->t = with;
165 }
166 else l->t = without;
167 }
168
169 void
bc_lex_token(BcLex * l)170 bc_lex_token(BcLex* l)
171 {
172 // We increment here. This means that all lexing needs to take that into
173 // account, such as when parsing an identifier. If we don't, the first
174 // character of every identifier would be missing.
175 char c = l->buf[l->i++], c2;
176
177 BC_SIG_ASSERT_LOCKED;
178
179 // This is the workhorse of the lexer.
180 switch (c)
181 {
182 case '\0':
183 case '\n':
184 case '\t':
185 case '\v':
186 case '\f':
187 case '\r':
188 case ' ':
189 {
190 bc_lex_commonTokens(l, c);
191 break;
192 }
193
194 case '!':
195 {
196 // Even though it's not an assignment, we can use this.
197 bc_lex_assign(l, BC_LEX_OP_REL_NE, BC_LEX_OP_BOOL_NOT);
198
199 // POSIX doesn't allow boolean not.
200 if (l->t == BC_LEX_OP_BOOL_NOT)
201 {
202 bc_lex_verr(l, BC_ERR_POSIX_BOOL, "!");
203 }
204
205 break;
206 }
207
208 case '"':
209 {
210 bc_lex_string(l);
211 break;
212 }
213
214 case '#':
215 {
216 // POSIX does not allow line comments.
217 bc_lex_err(l, BC_ERR_POSIX_COMMENT);
218 bc_lex_lineComment(l);
219 break;
220 }
221
222 case '%':
223 {
224 bc_lex_assign(l, BC_LEX_OP_ASSIGN_MODULUS, BC_LEX_OP_MODULUS);
225 break;
226 }
227
228 case '&':
229 {
230 c2 = l->buf[l->i];
231
232 // Either we have boolean and or an error. And boolean and is not
233 // allowed by POSIX.
234 if (BC_NO_ERR(c2 == '&'))
235 {
236 bc_lex_verr(l, BC_ERR_POSIX_BOOL, "&&");
237
238 l->i += 1;
239 l->t = BC_LEX_OP_BOOL_AND;
240 }
241 else bc_lex_invalidChar(l, c);
242
243 break;
244 }
245 #if BC_ENABLE_EXTRA_MATH
246 case '$':
247 {
248 l->t = BC_LEX_OP_TRUNC;
249 break;
250 }
251
252 case '@':
253 {
254 bc_lex_assign(l, BC_LEX_OP_ASSIGN_PLACES, BC_LEX_OP_PLACES);
255 break;
256 }
257 #endif // BC_ENABLE_EXTRA_MATH
258 case '(':
259 case ')':
260 {
261 l->t = (BcLexType) (c - '(' + BC_LEX_LPAREN);
262 break;
263 }
264
265 case '*':
266 {
267 bc_lex_assign(l, BC_LEX_OP_ASSIGN_MULTIPLY, BC_LEX_OP_MULTIPLY);
268 break;
269 }
270
271 case '+':
272 {
273 c2 = l->buf[l->i];
274
275 // Have to check for increment first.
276 if (c2 == '+')
277 {
278 l->i += 1;
279 l->t = BC_LEX_OP_INC;
280 }
281 else bc_lex_assign(l, BC_LEX_OP_ASSIGN_PLUS, BC_LEX_OP_PLUS);
282 break;
283 }
284
285 case ',':
286 {
287 l->t = BC_LEX_COMMA;
288 break;
289 }
290
291 case '-':
292 {
293 c2 = l->buf[l->i];
294
295 // Have to check for decrement first.
296 if (c2 == '-')
297 {
298 l->i += 1;
299 l->t = BC_LEX_OP_DEC;
300 }
301 else bc_lex_assign(l, BC_LEX_OP_ASSIGN_MINUS, BC_LEX_OP_MINUS);
302 break;
303 }
304
305 case '.':
306 {
307 c2 = l->buf[l->i];
308
309 // If it's alone, it's an alias for last.
310 if (BC_LEX_NUM_CHAR(c2, true, false)) bc_lex_number(l, c);
311 else
312 {
313 l->t = BC_LEX_KW_LAST;
314 bc_lex_err(l, BC_ERR_POSIX_DOT);
315 }
316
317 break;
318 }
319
320 case '/':
321 {
322 c2 = l->buf[l->i];
323 if (c2 == '*') bc_lex_comment(l);
324 else bc_lex_assign(l, BC_LEX_OP_ASSIGN_DIVIDE, BC_LEX_OP_DIVIDE);
325 break;
326 }
327
328 case '0':
329 case '1':
330 case '2':
331 case '3':
332 case '4':
333 case '5':
334 case '6':
335 case '7':
336 case '8':
337 case '9':
338 case 'A':
339 case 'B':
340 case 'C':
341 case 'D':
342 case 'E':
343 case 'F':
344 // Apparently, GNU bc (and maybe others) allows any uppercase letter as
345 // a number. When single digits, they act like the ones above. When
346 // multi-digit, any letter above the input base is automatically set to
347 // the biggest allowable digit in the input base.
348 case 'G':
349 case 'H':
350 case 'I':
351 case 'J':
352 case 'K':
353 case 'L':
354 case 'M':
355 case 'N':
356 case 'O':
357 case 'P':
358 case 'Q':
359 case 'R':
360 case 'S':
361 case 'T':
362 case 'U':
363 case 'V':
364 case 'W':
365 case 'X':
366 case 'Y':
367 case 'Z':
368 {
369 bc_lex_number(l, c);
370 break;
371 }
372
373 case ';':
374 {
375 l->t = BC_LEX_SCOLON;
376 break;
377 }
378
379 case '<':
380 {
381 #if BC_ENABLE_EXTRA_MATH
382 c2 = l->buf[l->i];
383
384 // Check for shift.
385 if (c2 == '<')
386 {
387 l->i += 1;
388 bc_lex_assign(l, BC_LEX_OP_ASSIGN_LSHIFT, BC_LEX_OP_LSHIFT);
389 break;
390 }
391 #endif // BC_ENABLE_EXTRA_MATH
392 bc_lex_assign(l, BC_LEX_OP_REL_LE, BC_LEX_OP_REL_LT);
393 break;
394 }
395
396 case '=':
397 {
398 bc_lex_assign(l, BC_LEX_OP_REL_EQ, BC_LEX_OP_ASSIGN);
399 break;
400 }
401
402 case '>':
403 {
404 #if BC_ENABLE_EXTRA_MATH
405 c2 = l->buf[l->i];
406
407 // Check for shift.
408 if (c2 == '>')
409 {
410 l->i += 1;
411 bc_lex_assign(l, BC_LEX_OP_ASSIGN_RSHIFT, BC_LEX_OP_RSHIFT);
412 break;
413 }
414 #endif // BC_ENABLE_EXTRA_MATH
415 bc_lex_assign(l, BC_LEX_OP_REL_GE, BC_LEX_OP_REL_GT);
416 break;
417 }
418
419 case '[':
420 case ']':
421 {
422 l->t = (BcLexType) (c - '[' + BC_LEX_LBRACKET);
423 break;
424 }
425
426 case '\\':
427 {
428 // In bc, a backslash+newline is whitespace.
429 if (BC_NO_ERR(l->buf[l->i] == '\n'))
430 {
431 l->i += 1;
432 l->t = BC_LEX_WHITESPACE;
433 }
434 else bc_lex_invalidChar(l, c);
435 break;
436 }
437
438 case '^':
439 {
440 bc_lex_assign(l, BC_LEX_OP_ASSIGN_POWER, BC_LEX_OP_POWER);
441 break;
442 }
443
444 case 'a':
445 case 'b':
446 case 'c':
447 case 'd':
448 case 'e':
449 case 'f':
450 case 'g':
451 case 'h':
452 case 'i':
453 case 'j':
454 case 'k':
455 case 'l':
456 case 'm':
457 case 'n':
458 case 'o':
459 case 'p':
460 case 'q':
461 case 'r':
462 case 's':
463 case 't':
464 case 'u':
465 case 'v':
466 case 'w':
467 case 'x':
468 case 'y':
469 case 'z':
470 {
471 bc_lex_identifier(l);
472 break;
473 }
474
475 case '{':
476 case '}':
477 {
478 l->t = (BcLexType) (c - '{' + BC_LEX_LBRACE);
479 break;
480 }
481
482 case '|':
483 {
484 c2 = l->buf[l->i];
485
486 // Once again, boolean or is not allowed by POSIX.
487 if (BC_NO_ERR(c2 == '|'))
488 {
489 bc_lex_verr(l, BC_ERR_POSIX_BOOL, "||");
490
491 l->i += 1;
492 l->t = BC_LEX_OP_BOOL_OR;
493 }
494 else bc_lex_invalidChar(l, c);
495
496 break;
497 }
498
499 default:
500 {
501 bc_lex_invalidChar(l, c);
502 }
503 }
504 }
505 #endif // BC_ENABLED
506