xref: /freebsd/contrib/bc/src/bc_lex.c (revision 349cc55c9796c4596a5b9904cd3281af295f878f)
1 /*
2  * *****************************************************************************
3  *
4  * SPDX-License-Identifier: BSD-2-Clause
5  *
6  * Copyright (c) 2018-2021 Gavin D. Howard and contributors.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions are met:
10  *
11  * * Redistributions of source code must retain the above copyright notice, this
12  *   list of conditions and the following disclaimer.
13  *
14  * * Redistributions in binary form must reproduce the above copyright notice,
15  *   this list of conditions and the following disclaimer in the documentation
16  *   and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
22  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28  * POSSIBILITY OF SUCH DAMAGE.
29  *
30  * *****************************************************************************
31  *
32  * The lexer for bc.
33  *
34  */
35 
36 #if BC_ENABLED
37 
38 #include <assert.h>
39 #include <ctype.h>
40 #include <string.h>
41 
42 #include <bc.h>
43 #include <vm.h>
44 
45 /**
46  * Lexes an identifier, which may be a keyword.
47  * @param l  The lexer.
48  */
49 static void bc_lex_identifier(BcLex *l) {
50 
51 	// We already passed the first character, so we need to be sure to include
52 	// it.
53 	const char *buf = l->buf + l->i - 1;
54 	size_t i;
55 
56 	// This loop is simply checking for keywords.
57 	for (i = 0; i < bc_lex_kws_len; ++i) {
58 
59 		const BcLexKeyword *kw = bc_lex_kws + i;
60 		size_t n = BC_LEX_KW_LEN(kw);
61 
62 		if (!strncmp(buf, kw->name, n) && !isalnum(buf[n]) && buf[n] != '_') {
63 
64 			// If the keyword has been redefined, and redefinition is allowed
65 			// (it is not allowed for builtin libraries), break out of the loop
66 			// and use it as a name. This depends on the argument parser to
67 			// ensure that only non-POSIX keywords get redefined.
68 			if (!vm.no_redefine && vm.redefined_kws[i]) break;
69 
70 			l->t = BC_LEX_KW_AUTO + (BcLexType) i;
71 
72 			// Warn or error, as appropriate for the mode, if the keyword is not
73 			// in the POSIX standard.
74 			if (!BC_LEX_KW_POSIX(kw)) bc_lex_verr(l, BC_ERR_POSIX_KW, kw->name);
75 
76 			// We minus 1 because the index has already been incremented.
77 			l->i += n - 1;
78 
79 			// Already have the token; bail.
80 			return;
81 		}
82 	}
83 
84 	// If not a keyword, parse the name.
85 	bc_lex_name(l);
86 
87 	// POSIX doesn't allow identifiers that are more than one character, so we
88 	// might have to warn or error here too.
89 	if (BC_ERR(l->str.len - 1 > 1))
90 		bc_lex_verr(l, BC_ERR_POSIX_NAME_LEN, l->str.v);
91 }
92 
93 /**
94  * Parses a bc string. This is separate from dc strings because dc strings need
95  * to be balanced.
96  * @param l  The lexer.
97  */
98 static void bc_lex_string(BcLex *l) {
99 
100 	// We need to keep track of newlines to increment them properly.
101 	size_t len, nlines, i;
102 	const char *buf;
103 	char c;
104 	bool got_more;
105 
106 	l->t = BC_LEX_STR;
107 
108 	do {
109 
110 		nlines = 0;
111 		buf = l->buf;
112 		got_more = false;
113 
114 		assert(!vm.is_stdin || buf == vm.buffer.v);
115 
116 		// Fortunately for us, bc doesn't escape quotes. Instead, the equivalent
117 		// is '\q', which makes this loop simpler.
118 		for (i = l->i; (c = buf[i]) && c != '"'; ++i) nlines += (c == '\n');
119 
120 		if (BC_ERR(c == '\0') && !vm.eof && (l->is_stdin || l->is_exprs))
121 			got_more = bc_lex_readLine(l);
122 
123 	} while (got_more && c != '"');
124 
125 	// If the string did not end properly, barf.
126 	if (c != '"') {
127 		l->i = i;
128 		bc_lex_err(l, BC_ERR_PARSE_STRING);
129 	}
130 
131 	// Set the temp string to the parsed string.
132 	len = i - l->i;
133 	bc_vec_string(&l->str, len, l->buf + l->i);
134 
135 	l->i = i + 1;
136 	l->line += nlines;
137 }
138 
139 /**
140  * This function takes a lexed operator and checks to see if it's the assignment
141  * version, setting the token appropriately.
142  * @param l        The lexer.
143  * @param with     The token to assign if it is an assignment operator.
144  * @param without  The token to assign if it is not an assignment operator.
145  */
146 static void bc_lex_assign(BcLex *l, BcLexType with, BcLexType without) {
147 	if (l->buf[l->i] == '=') {
148 		l->i += 1;
149 		l->t = with;
150 	}
151 	else l->t = without;
152 }
153 
/**
 * Lexes the next bc token. Consumes one character from the lexer's buffer and
 * dispatches on it, either setting the token type (l->t) directly or handing
 * off to a helper that lexes the rest of the token. Characters that cannot
 * start a token are reported through bc_lex_invalidChar().
 * @param l  The lexer.
 */
void bc_lex_token(BcLex *l) {

	// We increment here. This means that all lexing needs to take that into
	// account, such as when parsing an identifier. If we don't, the first
	// character of every identifier would be missing.
	char c = l->buf[l->i++], c2;

	// NOTE(review): presumably asserts that signals are locked on entry;
	// confirm against the macro's definition.
	BC_SIG_ASSERT_LOCKED;

	// This is the workhorse of the lexer.
	switch (c) {

		// End of text and whitespace are handled by code shared with other
		// lexers.
		case '\0':
		case '\n':
		case '\t':
		case '\v':
		case '\f':
		case '\r':
		case ' ':
		{
			bc_lex_commonTokens(l, c);
			break;
		}

		case '!':
		{
			// Even though it's not an assignment, we can use this.
			bc_lex_assign(l, BC_LEX_OP_REL_NE, BC_LEX_OP_BOOL_NOT);

			// POSIX doesn't allow boolean not.
			if (l->t == BC_LEX_OP_BOOL_NOT)
				bc_lex_verr(l, BC_ERR_POSIX_BOOL, "!");

			break;
		}

		case '"':
		{
			bc_lex_string(l);
			break;
		}

		case '#':
		{
			// POSIX does not allow line comments.
			bc_lex_err(l, BC_ERR_POSIX_COMMENT);
			bc_lex_lineComment(l);
			break;
		}

		case '%':
		{
			bc_lex_assign(l, BC_LEX_OP_ASSIGN_MODULUS, BC_LEX_OP_MODULUS);
			break;
		}

		case '&':
		{
			c2 = l->buf[l->i];

			// Either we have boolean and or an error. And boolean and is not
			// allowed by POSIX.
			if (BC_NO_ERR(c2 == '&')) {

				bc_lex_verr(l, BC_ERR_POSIX_BOOL, "&&");

				l->i += 1;
				l->t = BC_LEX_OP_BOOL_AND;
			}
			else bc_lex_invalidChar(l, c);

			break;
		}
#if BC_ENABLE_EXTRA_MATH
		// These operators only exist when extra math is compiled in;
		// otherwise, the characters fall through to the default case.
		case '$':
		{
			l->t = BC_LEX_OP_TRUNC;
			break;
		}

		case '@':
		{
			bc_lex_assign(l, BC_LEX_OP_ASSIGN_PLACES, BC_LEX_OP_PLACES);
			break;
		}
#endif // BC_ENABLE_EXTRA_MATH
		case '(':
		case ')':
		{
			// The token is computed from the character's offset from '('.
			// NOTE(review): this presumably relies on BC_LEX_RPAREN directly
			// following BC_LEX_LPAREN in BcLexType; confirm in the header.
			l->t = (BcLexType) (c - '(' + BC_LEX_LPAREN);
			break;
		}

		case '*':
		{
			bc_lex_assign(l, BC_LEX_OP_ASSIGN_MULTIPLY, BC_LEX_OP_MULTIPLY);
			break;
		}

		case '+':
		{
			c2 = l->buf[l->i];

			// Have to check for increment first.
			if (c2 == '+') {
				l->i += 1;
				l->t = BC_LEX_OP_INC;
			}
			else bc_lex_assign(l, BC_LEX_OP_ASSIGN_PLUS, BC_LEX_OP_PLUS);
			break;
		}

		case ',':
		{
			l->t = BC_LEX_COMMA;
			break;
		}

		case '-':
		{
			c2 = l->buf[l->i];

			// Have to check for decrement first.
			if (c2 == '-') {
				l->i += 1;
				l->t = BC_LEX_OP_DEC;
			}
			else bc_lex_assign(l, BC_LEX_OP_ASSIGN_MINUS, BC_LEX_OP_MINUS);
			break;
		}

		case '.':
		{
			c2 = l->buf[l->i];

			// If it's alone, it's an alias for last.
			if (BC_LEX_NUM_CHAR(c2, true, false)) bc_lex_number(l, c);
			else {
				l->t = BC_LEX_KW_LAST;
				bc_lex_err(l, BC_ERR_POSIX_DOT);
			}

			break;
		}

		case '/':
		{
			// A slash followed by a star opens a block comment; otherwise,
			// this is division or divide-assign.
			c2 = l->buf[l->i];
			if (c2 =='*') bc_lex_comment(l);
			else bc_lex_assign(l, BC_LEX_OP_ASSIGN_DIVIDE, BC_LEX_OP_DIVIDE);
			break;
		}

		case '0':
		case '1':
		case '2':
		case '3':
		case '4':
		case '5':
		case '6':
		case '7':
		case '8':
		case '9':
		case 'A':
		case 'B':
		case 'C':
		case 'D':
		case 'E':
		case 'F':
		// Apparently, GNU bc (and maybe others) allows any uppercase letter as
		// a number. When single digits, they act like the ones above. When
		// multi-digit, any letter above the input base is automatically set to
		// the biggest allowable digit in the input base.
		case 'G':
		case 'H':
		case 'I':
		case 'J':
		case 'K':
		case 'L':
		case 'M':
		case 'N':
		case 'O':
		case 'P':
		case 'Q':
		case 'R':
		case 'S':
		case 'T':
		case 'U':
		case 'V':
		case 'W':
		case 'X':
		case 'Y':
		case 'Z':
		{
			bc_lex_number(l, c);
			break;
		}

		case ';':
		{
			l->t = BC_LEX_SCOLON;
			break;
		}

		case '<':
		{
#if BC_ENABLE_EXTRA_MATH
			c2 = l->buf[l->i];

			// Check for shift.
			if (c2 == '<') {
				l->i += 1;
				bc_lex_assign(l, BC_LEX_OP_ASSIGN_LSHIFT, BC_LEX_OP_LSHIFT);
				break;
			}
#endif // BC_ENABLE_EXTRA_MATH
			// Not a shift: less-than-or-equal if '=' follows, else less-than.
			bc_lex_assign(l, BC_LEX_OP_REL_LE, BC_LEX_OP_REL_LT);
			break;
		}

		case '=':
		{
			// The roles are flipped here: a following '=' makes this the
			// equality operator, and a lone '=' is plain assignment.
			bc_lex_assign(l, BC_LEX_OP_REL_EQ, BC_LEX_OP_ASSIGN);
			break;
		}

		case '>':
		{
#if BC_ENABLE_EXTRA_MATH
			c2 = l->buf[l->i];

			// Check for shift.
			if (c2 == '>') {
				l->i += 1;
				bc_lex_assign(l, BC_LEX_OP_ASSIGN_RSHIFT, BC_LEX_OP_RSHIFT);
				break;
			}
#endif // BC_ENABLE_EXTRA_MATH
			// Not a shift: greater-than-or-equal if '=' follows, else
			// greater-than.
			bc_lex_assign(l, BC_LEX_OP_REL_GE, BC_LEX_OP_REL_GT);
			break;
		}

		case '[':
		case ']':
		{
			// The token is computed from the character's offset from '['. In
			// ASCII, ']' is two past '[' (the backslash sits between them), so
			// ']' yields BC_LEX_LBRACKET + 2.
			// NOTE(review): presumably BcLexType is laid out to match; confirm
			// in the header before reordering that enum.
			l->t = (BcLexType) (c - '[' + BC_LEX_LBRACKET);
			break;
		}

		case '\\':
		{
			// In bc, a backslash+newline is whitespace.
			if (BC_NO_ERR(l->buf[l->i] == '\n')) {
				l->i += 1;
				l->t = BC_LEX_WHITESPACE;
			}
			else bc_lex_invalidChar(l, c);
			break;
		}

		case '^':
		{
			bc_lex_assign(l, BC_LEX_OP_ASSIGN_POWER, BC_LEX_OP_POWER);
			break;
		}

		// Any lowercase letter starts an identifier, which may turn out to be
		// a keyword.
		case 'a':
		case 'b':
		case 'c':
		case 'd':
		case 'e':
		case 'f':
		case 'g':
		case 'h':
		case 'i':
		case 'j':
		case 'k':
		case 'l':
		case 'm':
		case 'n':
		case 'o':
		case 'p':
		case 'q':
		case 'r':
		case 's':
		case 't':
		case 'u':
		case 'v':
		case 'w':
		case 'x':
		case 'y':
		case 'z':
		{
			bc_lex_identifier(l);
			break;
		}

		case '{':
		case '}':
		{
			// The token is computed from the character's offset from '{'. In
			// ASCII, '}' is two past '{' (the pipe sits between them), so '}'
			// yields BC_LEX_LBRACE + 2.
			// NOTE(review): presumably BcLexType is laid out to match; confirm
			// in the header before reordering that enum.
			l->t = (BcLexType) (c - '{' + BC_LEX_LBRACE);
			break;
		}

		case '|':
		{
			c2 = l->buf[l->i];

			// Once again, boolean or is not allowed by POSIX.
			if (BC_NO_ERR(c2 == '|')) {

				bc_lex_verr(l, BC_ERR_POSIX_BOOL, "||");

				l->i += 1;
				l->t = BC_LEX_OP_BOOL_OR;
			}
			else bc_lex_invalidChar(l, c);

			break;
		}

		// Anything else cannot start a bc token.
		default:
		{
			bc_lex_invalidChar(l, c);
		}
	}
}
481 #endif // BC_ENABLED
482