xref: /freebsd/contrib/bc/src/bc_lex.c (revision 6549718b70f0e660a15685369afb4f9caf2215ce)
1 /*
2  * *****************************************************************************
3  *
4  * SPDX-License-Identifier: BSD-2-Clause
5  *
6  * Copyright (c) 2018-2024 Gavin D. Howard and contributors.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions are met:
10  *
11  * * Redistributions of source code must retain the above copyright notice, this
12  *   list of conditions and the following disclaimer.
13  *
14  * * Redistributions in binary form must reproduce the above copyright notice,
15  *   this list of conditions and the following disclaimer in the documentation
16  *   and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
22  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28  * POSSIBILITY OF SUCH DAMAGE.
29  *
30  * *****************************************************************************
31  *
32  * The lexer for bc.
33  *
34  */
35 
36 #if BC_ENABLED
37 
38 #include <assert.h>
39 #include <ctype.h>
40 #include <string.h>
41 
42 #include <bc.h>
43 #include <vm.h>
44 
45 /**
46  * Lexes an identifier, which may be a keyword.
47  * @param l  The lexer.
48  */
49 static void
50 bc_lex_identifier(BcLex* l)
51 {
52 	// We already passed the first character, so we need to be sure to include
53 	// it.
54 	const char* buf = l->buf + l->i - 1;
55 	size_t i;
56 
57 	// This loop is simply checking for keywords.
58 	for (i = 0; i < bc_lex_kws_len; ++i)
59 	{
60 		const BcLexKeyword* kw = bc_lex_kws + i;
61 		size_t n = BC_LEX_KW_LEN(kw);
62 
63 		if (!strncmp(buf, kw->name, n) && !isalnum(buf[n]) && buf[n] != '_')
64 		{
65 			// If the keyword has been redefined, and redefinition is allowed
66 			// (it is not allowed for builtin libraries), break out of the loop
67 			// and use it as a name. This depends on the argument parser to
68 			// ensure that only non-POSIX keywords get redefined.
69 			if (!vm->no_redefine && vm->redefined_kws[i]) break;
70 
71 			l->t = BC_LEX_KW_AUTO + (BcLexType) i;
72 
73 			// Warn or error, as appropriate for the mode, if the keyword is not
74 			// in the POSIX standard.
75 			if (!BC_LEX_KW_POSIX(kw)) bc_lex_verr(l, BC_ERR_POSIX_KW, kw->name);
76 
77 			// We minus 1 because the index has already been incremented.
78 			l->i += n - 1;
79 
80 			// Already have the token; bail.
81 			return;
82 		}
83 	}
84 
85 	// If not a keyword, parse the name.
86 	bc_lex_name(l);
87 
88 	// POSIX doesn't allow identifiers that are more than one character, so we
89 	// might have to warn or error here too.
90 	if (BC_ERR(l->str.len - 1 > 1))
91 	{
92 		bc_lex_verr(l, BC_ERR_POSIX_NAME_LEN, l->str.v);
93 	}
94 }
95 
96 /**
97  * Parses a bc string. This is separate from dc strings because dc strings need
98  * to be balanced.
99  * @param l  The lexer.
100  */
101 static void
102 bc_lex_string(BcLex* l)
103 {
104 	// We need to keep track of newlines to increment them properly.
105 	size_t len, nlines, i;
106 	const char* buf;
107 	char c;
108 	bool got_more;
109 
110 	l->t = BC_LEX_STR;
111 
112 	do
113 	{
114 		nlines = 0;
115 		buf = l->buf;
116 		got_more = false;
117 
118 		assert(vm->mode != BC_MODE_STDIN || buf == vm->buffer.v);
119 
120 		// Fortunately for us, bc doesn't escape quotes. Instead, the equivalent
121 		// is '\q', which makes this loop simpler.
122 		for (i = l->i; (c = buf[i]) && c != '"'; ++i)
123 		{
124 			nlines += (c == '\n');
125 		}
126 
127 		if (BC_ERR(c == '\0') && !vm->eof && l->mode != BC_MODE_FILE)
128 		{
129 			got_more = bc_lex_readLine(l);
130 		}
131 	}
132 	while (got_more && c != '"');
133 
134 	// If the string did not end properly, barf.
135 	if (c != '"')
136 	{
137 		l->i = i;
138 		bc_lex_err(l, BC_ERR_PARSE_STRING);
139 	}
140 
141 	// Set the temp string to the parsed string.
142 	len = i - l->i;
143 	bc_vec_string(&l->str, len, l->buf + l->i);
144 
145 	l->i = i + 1;
146 	l->line += nlines;
147 }
148 
149 /**
150  * This function takes a lexed operator and checks to see if it's the assignment
151  * version, setting the token appropriately.
152  * @param l        The lexer.
153  * @param with     The token to assign if it is an assignment operator.
154  * @param without  The token to assign if it is not an assignment operator.
155  */
156 static void
157 bc_lex_assign(BcLex* l, BcLexType with, BcLexType without)
158 {
159 	if (l->buf[l->i] == '=')
160 	{
161 		l->i += 1;
162 		l->t = with;
163 	}
164 	else l->t = without;
165 }
166 
167 void
168 bc_lex_token(BcLex* l)
169 {
170 	// We increment here. This means that all lexing needs to take that into
171 	// account, such as when parsing an identifier. If we don't, the first
172 	// character of every identifier would be missing.
173 	char c = l->buf[l->i++], c2;
174 
175 	BC_SIG_ASSERT_LOCKED;
176 
177 	// This is the workhorse of the lexer.
178 	switch (c)
179 	{
180 		case '\0':
181 		case '\n':
182 		case '\t':
183 		case '\v':
184 		case '\f':
185 		case '\r':
186 		case ' ':
187 		{
188 			bc_lex_commonTokens(l, c);
189 			break;
190 		}
191 
192 		case '!':
193 		{
194 			// Even though it's not an assignment, we can use this.
195 			bc_lex_assign(l, BC_LEX_OP_REL_NE, BC_LEX_OP_BOOL_NOT);
196 
197 			// POSIX doesn't allow boolean not.
198 			if (l->t == BC_LEX_OP_BOOL_NOT)
199 			{
200 				bc_lex_verr(l, BC_ERR_POSIX_BOOL, "!");
201 			}
202 
203 			break;
204 		}
205 
206 		case '"':
207 		{
208 			bc_lex_string(l);
209 			break;
210 		}
211 
212 		case '#':
213 		{
214 			// POSIX does not allow line comments.
215 			bc_lex_err(l, BC_ERR_POSIX_COMMENT);
216 			bc_lex_lineComment(l);
217 			break;
218 		}
219 
220 		case '%':
221 		{
222 			bc_lex_assign(l, BC_LEX_OP_ASSIGN_MODULUS, BC_LEX_OP_MODULUS);
223 			break;
224 		}
225 
226 		case '&':
227 		{
228 			c2 = l->buf[l->i];
229 
230 			// Either we have boolean and or an error. And boolean and is not
231 			// allowed by POSIX.
232 			if (BC_NO_ERR(c2 == '&'))
233 			{
234 				bc_lex_verr(l, BC_ERR_POSIX_BOOL, "&&");
235 
236 				l->i += 1;
237 				l->t = BC_LEX_OP_BOOL_AND;
238 			}
239 			else bc_lex_invalidChar(l, c);
240 
241 			break;
242 		}
243 #if BC_ENABLE_EXTRA_MATH
244 		case '$':
245 		{
246 			l->t = BC_LEX_OP_TRUNC;
247 			break;
248 		}
249 
250 		case '@':
251 		{
252 			bc_lex_assign(l, BC_LEX_OP_ASSIGN_PLACES, BC_LEX_OP_PLACES);
253 			break;
254 		}
255 #endif // BC_ENABLE_EXTRA_MATH
256 		case '(':
257 		case ')':
258 		{
259 			l->t = (BcLexType) (c - '(' + BC_LEX_LPAREN);
260 			break;
261 		}
262 
263 		case '*':
264 		{
265 			bc_lex_assign(l, BC_LEX_OP_ASSIGN_MULTIPLY, BC_LEX_OP_MULTIPLY);
266 			break;
267 		}
268 
269 		case '+':
270 		{
271 			c2 = l->buf[l->i];
272 
273 			// Have to check for increment first.
274 			if (c2 == '+')
275 			{
276 				l->i += 1;
277 				l->t = BC_LEX_OP_INC;
278 			}
279 			else bc_lex_assign(l, BC_LEX_OP_ASSIGN_PLUS, BC_LEX_OP_PLUS);
280 			break;
281 		}
282 
283 		case ',':
284 		{
285 			l->t = BC_LEX_COMMA;
286 			break;
287 		}
288 
289 		case '-':
290 		{
291 			c2 = l->buf[l->i];
292 
293 			// Have to check for decrement first.
294 			if (c2 == '-')
295 			{
296 				l->i += 1;
297 				l->t = BC_LEX_OP_DEC;
298 			}
299 			else bc_lex_assign(l, BC_LEX_OP_ASSIGN_MINUS, BC_LEX_OP_MINUS);
300 			break;
301 		}
302 
303 		case '.':
304 		{
305 			c2 = l->buf[l->i];
306 
307 			// If it's alone, it's an alias for last.
308 			if (BC_LEX_NUM_CHAR(c2, true, false)) bc_lex_number(l, c);
309 			else
310 			{
311 				l->t = BC_LEX_KW_LAST;
312 				bc_lex_err(l, BC_ERR_POSIX_DOT);
313 			}
314 
315 			break;
316 		}
317 
318 		case '/':
319 		{
320 			c2 = l->buf[l->i];
321 			if (c2 == '*') bc_lex_comment(l);
322 			else bc_lex_assign(l, BC_LEX_OP_ASSIGN_DIVIDE, BC_LEX_OP_DIVIDE);
323 			break;
324 		}
325 
326 		case '0':
327 		case '1':
328 		case '2':
329 		case '3':
330 		case '4':
331 		case '5':
332 		case '6':
333 		case '7':
334 		case '8':
335 		case '9':
336 		case 'A':
337 		case 'B':
338 		case 'C':
339 		case 'D':
340 		case 'E':
341 		case 'F':
342 		// Apparently, GNU bc (and maybe others) allows any uppercase letter as
343 		// a number. When single digits, they act like the ones above. When
344 		// multi-digit, any letter above the input base is automatically set to
345 		// the biggest allowable digit in the input base.
346 		case 'G':
347 		case 'H':
348 		case 'I':
349 		case 'J':
350 		case 'K':
351 		case 'L':
352 		case 'M':
353 		case 'N':
354 		case 'O':
355 		case 'P':
356 		case 'Q':
357 		case 'R':
358 		case 'S':
359 		case 'T':
360 		case 'U':
361 		case 'V':
362 		case 'W':
363 		case 'X':
364 		case 'Y':
365 		case 'Z':
366 		{
367 			bc_lex_number(l, c);
368 			break;
369 		}
370 
371 		case ';':
372 		{
373 			l->t = BC_LEX_SCOLON;
374 			break;
375 		}
376 
377 		case '<':
378 		{
379 #if BC_ENABLE_EXTRA_MATH
380 			c2 = l->buf[l->i];
381 
382 			// Check for shift.
383 			if (c2 == '<')
384 			{
385 				l->i += 1;
386 				bc_lex_assign(l, BC_LEX_OP_ASSIGN_LSHIFT, BC_LEX_OP_LSHIFT);
387 				break;
388 			}
389 #endif // BC_ENABLE_EXTRA_MATH
390 			bc_lex_assign(l, BC_LEX_OP_REL_LE, BC_LEX_OP_REL_LT);
391 			break;
392 		}
393 
394 		case '=':
395 		{
396 			bc_lex_assign(l, BC_LEX_OP_REL_EQ, BC_LEX_OP_ASSIGN);
397 			break;
398 		}
399 
400 		case '>':
401 		{
402 #if BC_ENABLE_EXTRA_MATH
403 			c2 = l->buf[l->i];
404 
405 			// Check for shift.
406 			if (c2 == '>')
407 			{
408 				l->i += 1;
409 				bc_lex_assign(l, BC_LEX_OP_ASSIGN_RSHIFT, BC_LEX_OP_RSHIFT);
410 				break;
411 			}
412 #endif // BC_ENABLE_EXTRA_MATH
413 			bc_lex_assign(l, BC_LEX_OP_REL_GE, BC_LEX_OP_REL_GT);
414 			break;
415 		}
416 
417 		case '[':
418 		case ']':
419 		{
420 			l->t = (BcLexType) (c - '[' + BC_LEX_LBRACKET);
421 			break;
422 		}
423 
424 		case '\\':
425 		{
426 			// In bc, a backslash+newline is whitespace.
427 			if (BC_NO_ERR(l->buf[l->i] == '\n'))
428 			{
429 				l->i += 1;
430 				l->t = BC_LEX_WHITESPACE;
431 			}
432 			else bc_lex_invalidChar(l, c);
433 			break;
434 		}
435 
436 		case '^':
437 		{
438 			bc_lex_assign(l, BC_LEX_OP_ASSIGN_POWER, BC_LEX_OP_POWER);
439 			break;
440 		}
441 
442 		case 'a':
443 		case 'b':
444 		case 'c':
445 		case 'd':
446 		case 'e':
447 		case 'f':
448 		case 'g':
449 		case 'h':
450 		case 'i':
451 		case 'j':
452 		case 'k':
453 		case 'l':
454 		case 'm':
455 		case 'n':
456 		case 'o':
457 		case 'p':
458 		case 'q':
459 		case 'r':
460 		case 's':
461 		case 't':
462 		case 'u':
463 		case 'v':
464 		case 'w':
465 		case 'x':
466 		case 'y':
467 		case 'z':
468 		{
469 			bc_lex_identifier(l);
470 			break;
471 		}
472 
473 		case '{':
474 		case '}':
475 		{
476 			l->t = (BcLexType) (c - '{' + BC_LEX_LBRACE);
477 			break;
478 		}
479 
480 		case '|':
481 		{
482 			c2 = l->buf[l->i];
483 
484 			// Once again, boolean or is not allowed by POSIX.
485 			if (BC_NO_ERR(c2 == '|'))
486 			{
487 				bc_lex_verr(l, BC_ERR_POSIX_BOOL, "||");
488 
489 				l->i += 1;
490 				l->t = BC_LEX_OP_BOOL_OR;
491 			}
492 			else bc_lex_invalidChar(l, c);
493 
494 			break;
495 		}
496 
497 		default:
498 		{
499 			bc_lex_invalidChar(l, c);
500 		}
501 	}
502 }
503 #endif // BC_ENABLED
504