xref: /freebsd/contrib/bc/src/lex.c (revision 349cc55c9796c4596a5b9904cd3281af295f878f)
1 /*
2  * *****************************************************************************
3  *
4  * SPDX-License-Identifier: BSD-2-Clause
5  *
6  * Copyright (c) 2018-2021 Gavin D. Howard and contributors.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions are met:
10  *
11  * * Redistributions of source code must retain the above copyright notice, this
12  *   list of conditions and the following disclaimer.
13  *
14  * * Redistributions in binary form must reproduce the above copyright notice,
15  *   this list of conditions and the following disclaimer in the documentation
16  *   and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
22  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28  * POSSIBILITY OF SUCH DAMAGE.
29  *
30  * *****************************************************************************
31  *
32  * Common code for the lexers.
33  *
34  */
35 
36 #include <assert.h>
37 #include <ctype.h>
38 #include <stdbool.h>
39 #include <string.h>
40 
41 #include <lex.h>
42 #include <vm.h>
43 #include <bc.h>
44 
45 void bc_lex_invalidChar(BcLex *l, char c) {
46 	l->t = BC_LEX_INVALID;
47 	bc_lex_verr(l, BC_ERR_PARSE_CHAR, c);
48 }
49 
50 void bc_lex_lineComment(BcLex *l) {
51 	l->t = BC_LEX_WHITESPACE;
52 	while (l->i < l->len && l->buf[l->i] != '\n') l->i += 1;
53 }
54 
55 void bc_lex_comment(BcLex *l) {
56 
57 	size_t i, nlines = 0;
58 	const char *buf;
59 	bool end = false, got_more;
60 	char c;
61 
62 	l->i += 1;
63 	l->t = BC_LEX_WHITESPACE;
64 
65 	// This loop is complex because it might need to request more data from
66 	// stdin if the comment is not ended. This loop is taken until the comment
67 	// is finished or we have EOF.
68 	do {
69 
70 		buf = l->buf;
71 		got_more = false;
72 
73 		// If we are in stdin mode, the buffer must be the one used for stdin.
74 		assert(!vm.is_stdin || buf == vm.buffer.v);
75 
76 		// Find the end of the comment.
77 		for (i = l->i; !end; i += !end) {
78 
79 			// While we don't have an asterisk, eat, but increment nlines.
80 			for (; (c = buf[i]) && c != '*'; ++i) nlines += (c == '\n');
81 
82 			// If this is true, we need to request more data.
83 			if (BC_ERR(!c || buf[i + 1] == '\0')) {
84 
85 				// Read more, if possible.
86 				if (!vm.eof && (l->is_stdin || l->is_exprs))
87 					got_more = bc_lex_readLine(l);
88 
89 				break;
90 			}
91 
92 			// If this turns true, we found the end. Yay!
93 			end = (buf[i + 1] == '/');
94 		}
95 
96 	} while (got_more && !end);
97 
98 	// If we didn't find the end, barf.
99 	if (!end) {
100 		l->i = i;
101 		bc_lex_err(l, BC_ERR_PARSE_COMMENT);
102 	}
103 
104 	l->i = i + 2;
105 	l->line += nlines;
106 }
107 
108 void bc_lex_whitespace(BcLex *l) {
109 
110 	char c;
111 
112 	l->t = BC_LEX_WHITESPACE;
113 
114 	// Eat. We don't eat newlines because they can be special.
115 	for (c = l->buf[l->i]; c != '\n' && isspace(c); c = l->buf[++l->i]);
116 }
117 
118 void bc_lex_commonTokens(BcLex *l, char c) {
119 	if (!c) l->t = BC_LEX_EOF;
120 	else if (c == '\n') l->t = BC_LEX_NLINE;
121 	else bc_lex_whitespace(l);
122 }
123 
124 /**
125  * Parses a number.
126  * @param l         The lexer.
127  * @param start     The start character.
128  * @param int_only  Whether this function should only look for an integer. This
129  *                  is used to implement the exponent of scientific notation.
130  */
131 static size_t bc_lex_num(BcLex *l, char start, bool int_only) {
132 
133 	const char *buf = l->buf + l->i;
134 	size_t i;
135 	char c;
136 	bool last_pt, pt = (start == '.');
137 
138 	// This loop looks complex. It is not. It is asking if the character is not
139 	// a nul byte and it if it a valid num character based on what we have found
140 	// thus far, or whether it is a backslash followed by a newline. I can do
141 	// i+1 on the buffer because the buffer must have a nul byte.
142 	for (i = 0; (c = buf[i]) && (BC_LEX_NUM_CHAR(c, pt, int_only) ||
143 	                             (c == '\\' && buf[i + 1] == '\n')); ++i)
144 	{
145 		// I don't need to test that the next character is a newline because
146 		// the loop condition above ensures that.
147 		if (c == '\\') {
148 
149 			i += 2;
150 
151 			// Make sure to eat whitespace at the beginning of the line.
152 			while(isspace(buf[i]) && buf[i] != '\n') i += 1;
153 
154 			c = buf[i];
155 
156 			// If the next character is not a number character, bail.
157 			if (!BC_LEX_NUM_CHAR(c, pt, int_only)) break;
158 		}
159 
160 		// Did we find the radix point?
161 		last_pt = (c == '.');
162 
163 		// If we did, and we already have one, then break because it's not part
164 		// of this number.
165 		if (pt && last_pt) break;
166 
167 		// Set whether we have found a radix point.
168 		pt = pt || last_pt;
169 
170 		bc_vec_push(&l->str, &c);
171 	}
172 
173 	return i;
174 }
175 
176 void bc_lex_number(BcLex *l, char start) {
177 
178 	l->t = BC_LEX_NUMBER;
179 
180 	// Make sure the string is clear.
181 	bc_vec_popAll(&l->str);
182 	bc_vec_push(&l->str, &start);
183 
184 	// Parse the number.
185 	l->i += bc_lex_num(l, start, false);
186 
187 #if BC_ENABLE_EXTRA_MATH
188 	{
189 		char c = l->buf[l->i];
190 
191 		// Do we have a number in scientific notation?
192 		if (c == 'e') {
193 
194 #if BC_ENABLED
195 			// Barf for POSIX.
196 			if (BC_IS_POSIX) bc_lex_err(l, BC_ERR_POSIX_EXP_NUM);
197 #endif // BC_ENABLED
198 
199 			// Push the e.
200 			bc_vec_push(&l->str, &c);
201 			l->i += 1;
202 			c = l->buf[l->i];
203 
204 			// Check for negative specifically because bc_lex_num() does not.
205 			if (c == BC_LEX_NEG_CHAR) {
206 				bc_vec_push(&l->str, &c);
207 				l->i += 1;
208 				c = l->buf[l->i];
209 			}
210 
211 			// We must have a number character, so barf if not.
212 			if (BC_ERR(!BC_LEX_NUM_CHAR(c, false, true)))
213 				bc_lex_verr(l, BC_ERR_PARSE_CHAR, c);
214 
215 			// Parse the exponent.
216 			l->i += bc_lex_num(l, 0, true);
217 		}
218 	}
219 #endif // BC_ENABLE_EXTRA_MATH
220 
221 	bc_vec_pushByte(&l->str, '\0');
222 }
223 
224 void bc_lex_name(BcLex *l) {
225 
226 	size_t i = 0;
227 	const char *buf = l->buf + l->i - 1;
228 	char c = buf[i];
229 
230 	l->t = BC_LEX_NAME;
231 
232 	// Should be obvious. It's looking for valid characters.
233 	while ((c >= 'a' && c <= 'z') || isdigit(c) || c == '_') c = buf[++i];
234 
235 	// Set the string to the identifier.
236 	bc_vec_string(&l->str, i, buf);
237 
238 	// Increment the index. We minus 1 because it has already been incremented.
239 	l->i += i - 1;
240 }
241 
242 void bc_lex_init(BcLex *l) {
243 	BC_SIG_ASSERT_LOCKED;
244 	assert(l != NULL);
245 	bc_vec_init(&l->str, sizeof(char), BC_DTOR_NONE);
246 }
247 
248 void bc_lex_free(BcLex *l) {
249 	BC_SIG_ASSERT_LOCKED;
250 	assert(l != NULL);
251 	bc_vec_free(&l->str);
252 }
253 
254 void bc_lex_file(BcLex *l, const char *file) {
255 	assert(l != NULL && file != NULL);
256 	l->line = 1;
257 	vm.file = file;
258 }
259 
260 void bc_lex_next(BcLex *l) {
261 
262 	BC_SIG_ASSERT_LOCKED;
263 
264 	assert(l != NULL);
265 
266 	l->last = l->t;
267 
268 	// If this wasn't here, the line number would be off.
269 	l->line += (l->i != 0 && l->buf[l->i - 1] == '\n');
270 
271 	// If the last token was EOF, someone called this one too many times.
272 	if (BC_ERR(l->last == BC_LEX_EOF)) bc_lex_err(l, BC_ERR_PARSE_EOF);
273 
274 	l->t = BC_LEX_EOF;
275 
276 	// We are done if this is true.
277 	if (l->i == l->len) return;
278 
279 	// Loop until failure or we don't have whitespace. This
280 	// is so the parser doesn't get inundated with whitespace.
281 	do {
282 		vm.next(l);
283 	} while (l->t == BC_LEX_WHITESPACE);
284 }
285 
286 /**
287  * Updates the buffer and len so that they are not invalidated when the stdin
288  * buffer grows.
289  * @param l     The lexer.
290  * @param text  The text.
291  * @param len   The length of the text.
292  */
293 static void bc_lex_fixText(BcLex *l, const char *text, size_t len) {
294 	l->buf = text;
295 	l->len = len;
296 }
297 
298 bool bc_lex_readLine(BcLex *l) {
299 
300 	bool good;
301 
302 	// These are reversed because they should be already locked, but
303 	// bc_vm_readLine() needs them to be unlocked.
304 	BC_SIG_UNLOCK;
305 
306 	// Make sure we read from the appropriate place.
307 	if (l->is_stdin) good = bc_vm_readLine(false);
308 	else {
309 		assert(l->is_exprs);
310 		good = bc_vm_readBuf(false);
311 	}
312 
313 	BC_SIG_LOCK;
314 
315 	bc_lex_fixText(l, vm.buffer.v, vm.buffer.len - 1);
316 
317 	return good;
318 }
319 
320 void bc_lex_text(BcLex *l, const char *text, bool is_stdin, bool is_exprs) {
321 
322 	BC_SIG_ASSERT_LOCKED;
323 
324 	assert(l != NULL && text != NULL);
325 
326 	bc_lex_fixText(l, text, strlen(text));
327 	l->i = 0;
328 	l->t = l->last = BC_LEX_INVALID;
329 	l->is_stdin = is_stdin;
330 	l->is_exprs = is_exprs;
331 
332 	assert(!l->is_stdin || !l->is_exprs);
333 
334 	bc_lex_next(l);
335 }
336