xref: /freebsd/contrib/bc/src/lex.c (revision d59a76183470685bdf0b88013d2baad1f04f030f)
1 /*
2  * *****************************************************************************
3  *
4  * SPDX-License-Identifier: BSD-2-Clause
5  *
6  * Copyright (c) 2018-2024 Gavin D. Howard and contributors.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions are met:
10  *
11  * * Redistributions of source code must retain the above copyright notice, this
12  *   list of conditions and the following disclaimer.
13  *
14  * * Redistributions in binary form must reproduce the above copyright notice,
15  *   this list of conditions and the following disclaimer in the documentation
16  *   and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
22  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28  * POSSIBILITY OF SUCH DAMAGE.
29  *
30  * *****************************************************************************
31  *
32  * Common code for the lexers.
33  *
34  */
35 
36 #include <assert.h>
37 #include <ctype.h>
38 #include <stdbool.h>
39 #include <string.h>
40 
41 #include <lex.h>
42 #include <vm.h>
43 #include <bc.h>
44 
/**
 * Marks the current token as invalid and reports an invalid-character parse
 * error for @a c.
 * @param l  The lexer.
 * @param c  The offending character; it is forwarded to the error report.
 */
void
bc_lex_invalidChar(BcLex* l, char c)
{
	// Set the token first, then report the character.
	l->t = BC_LEX_INVALID;
	bc_lex_verr(l, BC_ERR_PARSE_CHAR, c);
}
51 
52 void
53 bc_lex_lineComment(BcLex* l)
54 {
55 	l->t = BC_LEX_WHITESPACE;
56 	while (l->i < l->len && l->buf[l->i] != '\n')
57 	{
58 		l->i += 1;
59 	}
60 }
61 
/**
 * Skips a block comment. The token is set to whitespace, the text is scanned
 * for the closing asterisk-slash pair, and the lexer's line count is bumped by
 * the newlines seen. If the text runs out first, more input is requested
 * (unless the lexer is in file mode or the vm is at EOF). If the terminator is
 * never found, BC_ERR_PARSE_COMMENT is reported.
 * @param l  The lexer.
 */
void
bc_lex_comment(BcLex* l)
{
	size_t i, nlines = 0;
	const char* buf;
	bool end = false, got_more;
	char c;

	// Step over one character before scanning; presumably this is the '*' of
	// the opening delimiter -- confirm against the caller.
	l->i += 1;
	l->t = BC_LEX_WHITESPACE;

	// This loop is complex because it might need to request more data from
	// stdin if the comment is not ended. This loop is taken until the comment
	// is finished or we have EOF.
	do
	{
		// Re-read the buffer pointer on every pass because bc_lex_readLine()
		// below re-points l->buf at vm->buffer.v.
		buf = l->buf;
		got_more = false;

		// If we are in stdin mode, the buffer must be the one used for stdin.
#if !BC_ENABLE_OSSFUZZ
		assert(vm->mode != BC_MODE_STDIN || buf == vm->buffer.v);
#endif // !BC_ENABLE_OSSFUZZ

		// Find the end of the comment. When the terminator is found, i is left
		// on the asterisk (the increment is skipped once end is true).
		// NOTE(review): every pass restarts at l->i, which this function does
		// not advance inside the loop, so newlines seen on an earlier pass
		// look like they get added to nlines again -- confirm this is intended.
		for (i = l->i; !end; i += !end)
		{
			// While we don't have an asterisk, eat, but increment nlines.
			for (; (c = buf[i]) && c != '*'; ++i)
			{
				nlines += (c == '\n');
			}

			// If this is true, we need to request more data. Either we hit the
			// nul byte, or we have an asterisk that is the last character
			// before the nul byte.
			if (BC_ERR(!c || buf[i + 1] == '\0'))
			{
#if !BC_ENABLE_OSSFUZZ
				// Read more, if possible.
				if (!vm->eof && l->mode != BC_MODE_FILE)
				{
					got_more = bc_lex_readLine(l);
				}
#endif // !BC_ENABLE_OSSFUZZ

				break;
			}

			// If this turns true, we found the end. Yay!
			end = (buf[i + 1] == '/');
		}
	}
	while (got_more && !end);

	// If we didn't find the end, barf.
	if (!end)
	{
		// Record where scanning stopped before reporting the error.
		// NOTE(review): the code after this block assumes bc_lex_err() does
		// not return here -- confirm.
		l->i = i;
		bc_lex_err(l, BC_ERR_PARSE_COMMENT);
	}

	// i is on the closing asterisk; skip it and the slash.
	l->i = i + 2;
	l->line += nlines;
}
125 
126 void
127 bc_lex_whitespace(BcLex* l)
128 {
129 	char c;
130 
131 	l->t = BC_LEX_WHITESPACE;
132 
133 	// Eat. We don't eat newlines because they can be special.
134 	for (c = l->buf[l->i]; c != '\n' && isspace(c); c = l->buf[++l->i])
135 	{
136 		continue;
137 	}
138 }
139 
140 void
141 bc_lex_commonTokens(BcLex* l, char c)
142 {
143 	if (!c) l->t = BC_LEX_EOF;
144 	else if (c == '\n') l->t = BC_LEX_NLINE;
145 	else bc_lex_whitespace(l);
146 }
147 
148 /**
149  * Parses a number.
150  * @param l         The lexer.
151  * @param start     The start character.
152  * @param int_only  Whether this function should only look for an integer. This
153  *                  is used to implement the exponent of scientific notation.
154  */
155 static size_t
156 bc_lex_num(BcLex* l, char start, bool int_only)
157 {
158 	const char* buf = l->buf + l->i;
159 	size_t i;
160 	char c;
161 	bool last_pt, pt = (start == '.');
162 
163 	// This loop looks complex. It is not. It is asking if the character is not
164 	// a nul byte and it if it a valid num character based on what we have found
165 	// thus far, or whether it is a backslash followed by a newline. I can do
166 	// i+1 on the buffer because the buffer must have a nul byte.
167 	for (i = 0; (c = buf[i]) && (BC_LEX_NUM_CHAR(c, pt, int_only) ||
168 	                             (c == '\\' && buf[i + 1] == '\n'));
169 	     ++i)
170 	{
171 		// I don't need to test that the next character is a newline because
172 		// the loop condition above ensures that.
173 		if (c == '\\')
174 		{
175 			i += 2;
176 
177 			// Make sure to eat whitespace at the beginning of the line.
178 			while (isspace(buf[i]) && buf[i] != '\n')
179 			{
180 				i += 1;
181 			}
182 
183 			c = buf[i];
184 
185 			// If the next character is not a number character, bail.
186 			if (!BC_LEX_NUM_CHAR(c, pt, int_only)) break;
187 		}
188 
189 		// Did we find the radix point?
190 		last_pt = (c == '.');
191 
192 		// If we did, and we already have one, then break because it's not part
193 		// of this number.
194 		if (pt && last_pt) break;
195 
196 		// Set whether we have found a radix point.
197 		pt = pt || last_pt;
198 
199 		bc_vec_push(&l->str, &c);
200 	}
201 
202 	return i;
203 }
204 
205 void
206 bc_lex_number(BcLex* l, char start)
207 {
208 	l->t = BC_LEX_NUMBER;
209 
210 	// Make sure the string is clear.
211 	bc_vec_popAll(&l->str);
212 	bc_vec_push(&l->str, &start);
213 
214 	// Parse the number.
215 	l->i += bc_lex_num(l, start, false);
216 
217 #if BC_ENABLE_EXTRA_MATH
218 	{
219 		char c = l->buf[l->i];
220 
221 		// Do we have a number in scientific notation?
222 		if (c == 'e')
223 		{
224 #if BC_ENABLED
225 			// Barf for POSIX.
226 			if (BC_IS_POSIX) bc_lex_err(l, BC_ERR_POSIX_EXP_NUM);
227 #endif // BC_ENABLED
228 
229 			// Push the e.
230 			bc_vec_push(&l->str, &c);
231 			l->i += 1;
232 			c = l->buf[l->i];
233 
234 			// Check for negative specifically because bc_lex_num() does not.
235 			if (c == BC_LEX_NEG_CHAR)
236 			{
237 				bc_vec_push(&l->str, &c);
238 				l->i += 1;
239 				c = l->buf[l->i];
240 			}
241 
242 			// We must have a number character, so barf if not.
243 			if (BC_ERR(!BC_LEX_NUM_CHAR(c, false, true)))
244 			{
245 				bc_lex_verr(l, BC_ERR_PARSE_CHAR, c);
246 			}
247 
248 			// Parse the exponent.
249 			l->i += bc_lex_num(l, 0, true);
250 		}
251 	}
252 #endif // BC_ENABLE_EXTRA_MATH
253 
254 	bc_vec_pushByte(&l->str, '\0');
255 }
256 
257 void
258 bc_lex_name(BcLex* l)
259 {
260 	size_t i = 0;
261 	const char* buf = l->buf + l->i - 1;
262 	char c = buf[i];
263 
264 	l->t = BC_LEX_NAME;
265 
266 	// Should be obvious. It's looking for valid characters.
267 	while ((c >= 'a' && c <= 'z') || isdigit(c) || c == '_')
268 	{
269 		c = buf[++i];
270 	}
271 
272 	// Set the string to the identifier.
273 	bc_vec_string(&l->str, i, buf);
274 
275 	// Increment the index. We minus 1 because it has already been incremented.
276 	l->i += i - 1;
277 }
278 
/**
 * Initializes the lexer's string vector as a vector of chars with no
 * destructor.
 * @param l  The lexer. Must not be NULL.
 */
void
bc_lex_init(BcLex* l)
{
	BC_SIG_ASSERT_LOCKED;
	assert(l != NULL);
	bc_vec_init(&l->str, sizeof(char), BC_DTOR_NONE);
}
286 
/**
 * Frees the lexer's string vector.
 * @param l  The lexer. Must not be NULL.
 */
void
bc_lex_free(BcLex* l)
{
	BC_SIG_ASSERT_LOCKED;
	assert(l != NULL);
	bc_vec_free(&l->str);
}
294 
295 void
296 bc_lex_file(BcLex* l, const char* file)
297 {
298 	assert(l != NULL && file != NULL);
299 	l->line = 1;
300 	vm->file = file;
301 }
302 
303 void
304 bc_lex_next(BcLex* l)
305 {
306 	BC_SIG_ASSERT_LOCKED;
307 
308 	assert(l != NULL);
309 
310 	l->last = l->t;
311 
312 	// If this wasn't here, the line number would be off.
313 	l->line += (l->i != 0 && l->buf[l->i - 1] == '\n');
314 
315 	// If the last token was EOF, someone called this one too many times.
316 	if (BC_ERR(l->last == BC_LEX_EOF)) bc_lex_err(l, BC_ERR_PARSE_EOF);
317 
318 	l->t = BC_LEX_EOF;
319 
320 	// We are done if this is true.
321 	if (l->i == l->len) return;
322 
323 	// Loop until failure or we don't have whitespace. This
324 	// is so the parser doesn't get inundated with whitespace.
325 	do
326 	{
327 		vm->next(l);
328 	}
329 	while (l->t == BC_LEX_WHITESPACE);
330 }
331 
332 /**
333  * Updates the buffer and len so that they are not invalidated when the stdin
334  * buffer grows.
335  * @param l     The lexer.
336  * @param text  The text.
337  * @param len   The length of the text.
338  */
339 static void
340 bc_lex_fixText(BcLex* l, const char* text, size_t len)
341 {
342 	l->buf = text;
343 	l->len = len;
344 }
345 
346 bool
347 bc_lex_readLine(BcLex* l)
348 {
349 	bool good;
350 
351 	// These are reversed because they should be already locked, but
352 	// bc_vm_readLine() needs them to be unlocked.
353 	BC_SIG_UNLOCK;
354 
355 	// Make sure we read from the appropriate place.
356 	switch (l->mode)
357 	{
358 		case BC_MODE_EXPRS:
359 		{
360 			good = bc_vm_readBuf(false);
361 			break;
362 		}
363 
364 		case BC_MODE_FILE:
365 		{
366 			good = false;
367 			break;
368 		}
369 
370 #if !BC_ENABLE_OSSFUZZ
371 
372 		case BC_MODE_STDIN:
373 		{
374 			good = bc_vm_readLine(false);
375 			break;
376 		}
377 
378 #endif // !BC_ENABLE_OSSFUZZ
379 
380 #ifdef __GNUC__
381 #ifndef __clang__
382 		default:
383 		{
384 			// We should never get here.
385 			abort();
386 		}
387 #endif // __clang__
388 #endif // __GNUC__
389 	}
390 
391 	BC_SIG_LOCK;
392 
393 	bc_lex_fixText(l, vm->buffer.v, vm->buffer.len - 1);
394 
395 	return good;
396 }
397 
398 void
399 bc_lex_text(BcLex* l, const char* text, BcMode mode)
400 {
401 	BC_SIG_ASSERT_LOCKED;
402 
403 	assert(l != NULL && text != NULL);
404 
405 	bc_lex_fixText(l, text, strlen(text));
406 	l->i = 0;
407 	l->t = l->last = BC_LEX_INVALID;
408 	l->mode = mode;
409 
410 	bc_lex_next(l);
411 }
412