xref: /freebsd/contrib/bc/src/bc_lex.c (revision b64c5a0ace59af62eff52bfe110a521dc73c937b)
1 /*
2  * *****************************************************************************
3  *
4  * SPDX-License-Identifier: BSD-2-Clause
5  *
6  * Copyright (c) 2018-2024 Gavin D. Howard and contributors.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions are met:
10  *
11  * * Redistributions of source code must retain the above copyright notice, this
12  *   list of conditions and the following disclaimer.
13  *
14  * * Redistributions in binary form must reproduce the above copyright notice,
15  *   this list of conditions and the following disclaimer in the documentation
16  *   and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
22  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28  * POSSIBILITY OF SUCH DAMAGE.
29  *
30  * *****************************************************************************
31  *
32  * The lexer for bc.
33  *
34  */
35 
36 #if BC_ENABLED
37 
38 #include <assert.h>
39 #include <ctype.h>
40 #include <string.h>
41 
42 #include <bc.h>
43 #include <vm.h>
44 
45 /**
46  * Lexes an identifier, which may be a keyword.
47  * @param l  The lexer.
48  */
49 static void
50 bc_lex_identifier(BcLex* l)
51 {
52 	// We already passed the first character, so we need to be sure to include
53 	// it.
54 	const char* buf = l->buf + l->i - 1;
55 	size_t i;
56 
57 	// This loop is simply checking for keywords.
58 	for (i = 0; i < bc_lex_kws_len; ++i)
59 	{
60 		const BcLexKeyword* kw = bc_lex_kws + i;
61 		size_t n = BC_LEX_KW_LEN(kw);
62 
63 		if (!strncmp(buf, kw->name, n) && !isalnum(buf[n]) && buf[n] != '_')
64 		{
65 			// If the keyword has been redefined, and redefinition is allowed
66 			// (it is not allowed for builtin libraries), break out of the loop
67 			// and use it as a name. This depends on the argument parser to
68 			// ensure that only non-POSIX keywords get redefined.
69 			if (!vm->no_redefine && vm->redefined_kws[i]) break;
70 
71 			l->t = BC_LEX_KW_AUTO + (BcLexType) i;
72 
73 			// Warn or error, as appropriate for the mode, if the keyword is not
74 			// in the POSIX standard.
75 			if (!BC_LEX_KW_POSIX(kw)) bc_lex_verr(l, BC_ERR_POSIX_KW, kw->name);
76 
77 			// We minus 1 because the index has already been incremented.
78 			l->i += n - 1;
79 
80 			// Already have the token; bail.
81 			return;
82 		}
83 	}
84 
85 	// If not a keyword, parse the name.
86 	bc_lex_name(l);
87 
88 	// POSIX doesn't allow identifiers that are more than one character, so we
89 	// might have to warn or error here too.
90 	if (BC_ERR(l->str.len - 1 > 1))
91 	{
92 		bc_lex_verr(l, BC_ERR_POSIX_NAME_LEN, l->str.v);
93 	}
94 }
95 
96 /**
97  * Parses a bc string. This is separate from dc strings because dc strings need
98  * to be balanced.
99  * @param l  The lexer.
100  */
101 static void
102 bc_lex_string(BcLex* l)
103 {
104 	// We need to keep track of newlines to increment them properly.
105 	size_t len, nlines, i;
106 	const char* buf;
107 	char c;
108 	bool got_more;
109 
110 	l->t = BC_LEX_STR;
111 
112 	do
113 	{
114 		nlines = 0;
115 		buf = l->buf;
116 		got_more = false;
117 
118 #if !BC_ENABLE_OSSFUZZ
119 		assert(vm->mode != BC_MODE_STDIN || buf == vm->buffer.v);
120 #endif // !BC_ENABLE_OSSFUZZ
121 
122 		// Fortunately for us, bc doesn't escape quotes. Instead, the equivalent
123 		// is '\q', which makes this loop simpler.
124 		for (i = l->i; (c = buf[i]) && c != '"'; ++i)
125 		{
126 			nlines += (c == '\n');
127 		}
128 
129 		if (BC_ERR(c == '\0') && !vm->eof && l->mode != BC_MODE_FILE)
130 		{
131 			got_more = bc_lex_readLine(l);
132 		}
133 	}
134 	while (got_more && c != '"');
135 
136 	// If the string did not end properly, barf.
137 	if (c != '"')
138 	{
139 		l->i = i;
140 		bc_lex_err(l, BC_ERR_PARSE_STRING);
141 	}
142 
143 	// Set the temp string to the parsed string.
144 	len = i - l->i;
145 	bc_vec_string(&l->str, len, l->buf + l->i);
146 
147 	l->i = i + 1;
148 	l->line += nlines;
149 }
150 
151 /**
152  * This function takes a lexed operator and checks to see if it's the assignment
153  * version, setting the token appropriately.
154  * @param l        The lexer.
155  * @param with     The token to assign if it is an assignment operator.
156  * @param without  The token to assign if it is not an assignment operator.
157  */
158 static void
159 bc_lex_assign(BcLex* l, BcLexType with, BcLexType without)
160 {
161 	if (l->buf[l->i] == '=')
162 	{
163 		l->i += 1;
164 		l->t = with;
165 	}
166 	else l->t = without;
167 }
168 
169 void
170 bc_lex_token(BcLex* l)
171 {
172 	// We increment here. This means that all lexing needs to take that into
173 	// account, such as when parsing an identifier. If we don't, the first
174 	// character of every identifier would be missing.
175 	char c = l->buf[l->i++], c2;
176 
177 	BC_SIG_ASSERT_LOCKED;
178 
179 	// This is the workhorse of the lexer.
180 	switch (c)
181 	{
182 		case '\0':
183 		case '\n':
184 		case '\t':
185 		case '\v':
186 		case '\f':
187 		case '\r':
188 		case ' ':
189 		{
190 			bc_lex_commonTokens(l, c);
191 			break;
192 		}
193 
194 		case '!':
195 		{
196 			// Even though it's not an assignment, we can use this.
197 			bc_lex_assign(l, BC_LEX_OP_REL_NE, BC_LEX_OP_BOOL_NOT);
198 
199 			// POSIX doesn't allow boolean not.
200 			if (l->t == BC_LEX_OP_BOOL_NOT)
201 			{
202 				bc_lex_verr(l, BC_ERR_POSIX_BOOL, "!");
203 			}
204 
205 			break;
206 		}
207 
208 		case '"':
209 		{
210 			bc_lex_string(l);
211 			break;
212 		}
213 
214 		case '#':
215 		{
216 			// POSIX does not allow line comments.
217 			bc_lex_err(l, BC_ERR_POSIX_COMMENT);
218 			bc_lex_lineComment(l);
219 			break;
220 		}
221 
222 		case '%':
223 		{
224 			bc_lex_assign(l, BC_LEX_OP_ASSIGN_MODULUS, BC_LEX_OP_MODULUS);
225 			break;
226 		}
227 
228 		case '&':
229 		{
230 			c2 = l->buf[l->i];
231 
232 			// Either we have boolean and or an error. And boolean and is not
233 			// allowed by POSIX.
234 			if (BC_NO_ERR(c2 == '&'))
235 			{
236 				bc_lex_verr(l, BC_ERR_POSIX_BOOL, "&&");
237 
238 				l->i += 1;
239 				l->t = BC_LEX_OP_BOOL_AND;
240 			}
241 			else bc_lex_invalidChar(l, c);
242 
243 			break;
244 		}
245 #if BC_ENABLE_EXTRA_MATH
246 		case '$':
247 		{
248 			l->t = BC_LEX_OP_TRUNC;
249 			break;
250 		}
251 
252 		case '@':
253 		{
254 			bc_lex_assign(l, BC_LEX_OP_ASSIGN_PLACES, BC_LEX_OP_PLACES);
255 			break;
256 		}
257 #endif // BC_ENABLE_EXTRA_MATH
258 		case '(':
259 		case ')':
260 		{
261 			l->t = (BcLexType) (c - '(' + BC_LEX_LPAREN);
262 			break;
263 		}
264 
265 		case '*':
266 		{
267 			bc_lex_assign(l, BC_LEX_OP_ASSIGN_MULTIPLY, BC_LEX_OP_MULTIPLY);
268 			break;
269 		}
270 
271 		case '+':
272 		{
273 			c2 = l->buf[l->i];
274 
275 			// Have to check for increment first.
276 			if (c2 == '+')
277 			{
278 				l->i += 1;
279 				l->t = BC_LEX_OP_INC;
280 			}
281 			else bc_lex_assign(l, BC_LEX_OP_ASSIGN_PLUS, BC_LEX_OP_PLUS);
282 			break;
283 		}
284 
285 		case ',':
286 		{
287 			l->t = BC_LEX_COMMA;
288 			break;
289 		}
290 
291 		case '-':
292 		{
293 			c2 = l->buf[l->i];
294 
295 			// Have to check for decrement first.
296 			if (c2 == '-')
297 			{
298 				l->i += 1;
299 				l->t = BC_LEX_OP_DEC;
300 			}
301 			else bc_lex_assign(l, BC_LEX_OP_ASSIGN_MINUS, BC_LEX_OP_MINUS);
302 			break;
303 		}
304 
305 		case '.':
306 		{
307 			c2 = l->buf[l->i];
308 
309 			// If it's alone, it's an alias for last.
310 			if (BC_LEX_NUM_CHAR(c2, true, false)) bc_lex_number(l, c);
311 			else
312 			{
313 				l->t = BC_LEX_KW_LAST;
314 				bc_lex_err(l, BC_ERR_POSIX_DOT);
315 			}
316 
317 			break;
318 		}
319 
320 		case '/':
321 		{
322 			c2 = l->buf[l->i];
323 			if (c2 == '*') bc_lex_comment(l);
324 			else bc_lex_assign(l, BC_LEX_OP_ASSIGN_DIVIDE, BC_LEX_OP_DIVIDE);
325 			break;
326 		}
327 
328 		case '0':
329 		case '1':
330 		case '2':
331 		case '3':
332 		case '4':
333 		case '5':
334 		case '6':
335 		case '7':
336 		case '8':
337 		case '9':
338 		case 'A':
339 		case 'B':
340 		case 'C':
341 		case 'D':
342 		case 'E':
343 		case 'F':
344 		// Apparently, GNU bc (and maybe others) allows any uppercase letter as
345 		// a number. When single digits, they act like the ones above. When
346 		// multi-digit, any letter above the input base is automatically set to
347 		// the biggest allowable digit in the input base.
348 		case 'G':
349 		case 'H':
350 		case 'I':
351 		case 'J':
352 		case 'K':
353 		case 'L':
354 		case 'M':
355 		case 'N':
356 		case 'O':
357 		case 'P':
358 		case 'Q':
359 		case 'R':
360 		case 'S':
361 		case 'T':
362 		case 'U':
363 		case 'V':
364 		case 'W':
365 		case 'X':
366 		case 'Y':
367 		case 'Z':
368 		{
369 			bc_lex_number(l, c);
370 			break;
371 		}
372 
373 		case ';':
374 		{
375 			l->t = BC_LEX_SCOLON;
376 			break;
377 		}
378 
379 		case '<':
380 		{
381 #if BC_ENABLE_EXTRA_MATH
382 			c2 = l->buf[l->i];
383 
384 			// Check for shift.
385 			if (c2 == '<')
386 			{
387 				l->i += 1;
388 				bc_lex_assign(l, BC_LEX_OP_ASSIGN_LSHIFT, BC_LEX_OP_LSHIFT);
389 				break;
390 			}
391 #endif // BC_ENABLE_EXTRA_MATH
392 			bc_lex_assign(l, BC_LEX_OP_REL_LE, BC_LEX_OP_REL_LT);
393 			break;
394 		}
395 
396 		case '=':
397 		{
398 			bc_lex_assign(l, BC_LEX_OP_REL_EQ, BC_LEX_OP_ASSIGN);
399 			break;
400 		}
401 
402 		case '>':
403 		{
404 #if BC_ENABLE_EXTRA_MATH
405 			c2 = l->buf[l->i];
406 
407 			// Check for shift.
408 			if (c2 == '>')
409 			{
410 				l->i += 1;
411 				bc_lex_assign(l, BC_LEX_OP_ASSIGN_RSHIFT, BC_LEX_OP_RSHIFT);
412 				break;
413 			}
414 #endif // BC_ENABLE_EXTRA_MATH
415 			bc_lex_assign(l, BC_LEX_OP_REL_GE, BC_LEX_OP_REL_GT);
416 			break;
417 		}
418 
419 		case '[':
420 		case ']':
421 		{
422 			l->t = (BcLexType) (c - '[' + BC_LEX_LBRACKET);
423 			break;
424 		}
425 
426 		case '\\':
427 		{
428 			// In bc, a backslash+newline is whitespace.
429 			if (BC_NO_ERR(l->buf[l->i] == '\n'))
430 			{
431 				l->i += 1;
432 				l->t = BC_LEX_WHITESPACE;
433 			}
434 			else bc_lex_invalidChar(l, c);
435 			break;
436 		}
437 
438 		case '^':
439 		{
440 			bc_lex_assign(l, BC_LEX_OP_ASSIGN_POWER, BC_LEX_OP_POWER);
441 			break;
442 		}
443 
444 		case 'a':
445 		case 'b':
446 		case 'c':
447 		case 'd':
448 		case 'e':
449 		case 'f':
450 		case 'g':
451 		case 'h':
452 		case 'i':
453 		case 'j':
454 		case 'k':
455 		case 'l':
456 		case 'm':
457 		case 'n':
458 		case 'o':
459 		case 'p':
460 		case 'q':
461 		case 'r':
462 		case 's':
463 		case 't':
464 		case 'u':
465 		case 'v':
466 		case 'w':
467 		case 'x':
468 		case 'y':
469 		case 'z':
470 		{
471 			bc_lex_identifier(l);
472 			break;
473 		}
474 
475 		case '{':
476 		case '}':
477 		{
478 			l->t = (BcLexType) (c - '{' + BC_LEX_LBRACE);
479 			break;
480 		}
481 
482 		case '|':
483 		{
484 			c2 = l->buf[l->i];
485 
486 			// Once again, boolean or is not allowed by POSIX.
487 			if (BC_NO_ERR(c2 == '|'))
488 			{
489 				bc_lex_verr(l, BC_ERR_POSIX_BOOL, "||");
490 
491 				l->i += 1;
492 				l->t = BC_LEX_OP_BOOL_OR;
493 			}
494 			else bc_lex_invalidChar(l, c);
495 
496 			break;
497 		}
498 
499 		default:
500 		{
501 			bc_lex_invalidChar(l, c);
502 		}
503 	}
504 }
505 #endif // BC_ENABLED
506