1 /*
2 * *****************************************************************************
3 *
4 * SPDX-License-Identifier: BSD-2-Clause
5 *
6 * Copyright (c) 2018-2024 Gavin D. Howard and contributors.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions are met:
10 *
11 * * Redistributions of source code must retain the above copyright notice, this
12 * list of conditions and the following disclaimer.
13 *
14 * * Redistributions in binary form must reproduce the above copyright notice,
15 * this list of conditions and the following disclaimer in the documentation
16 * and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
22 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 * POSSIBILITY OF SUCH DAMAGE.
29 *
30 * *****************************************************************************
31 *
32 * Common code for the lexers.
33 *
34 */
35
36 #include <assert.h>
37 #include <ctype.h>
38 #include <stdbool.h>
39 #include <string.h>
40
41 #include <lex.h>
42 #include <vm.h>
43 #include <bc.h>
44
45 void
bc_lex_invalidChar(BcLex * l,char c)46 bc_lex_invalidChar(BcLex* l, char c)
47 {
48 l->t = BC_LEX_INVALID;
49 bc_lex_verr(l, BC_ERR_PARSE_CHAR, c);
50 }
51
52 void
bc_lex_lineComment(BcLex * l)53 bc_lex_lineComment(BcLex* l)
54 {
55 l->t = BC_LEX_WHITESPACE;
56 while (l->i < l->len && l->buf[l->i] != '\n')
57 {
58 l->i += 1;
59 }
60 }
61
62 void
bc_lex_comment(BcLex * l)63 bc_lex_comment(BcLex* l)
64 {
65 size_t i, nlines = 0;
66 const char* buf;
67 bool end = false, got_more;
68 char c;
69
70 l->i += 1;
71 l->t = BC_LEX_WHITESPACE;
72
73 // This loop is complex because it might need to request more data from
74 // stdin if the comment is not ended. This loop is taken until the comment
75 // is finished or we have EOF.
76 do
77 {
78 buf = l->buf;
79 got_more = false;
80
81 // If we are in stdin mode, the buffer must be the one used for stdin.
82 #if !BC_ENABLE_OSSFUZZ
83 assert(vm->mode != BC_MODE_STDIN || buf == vm->buffer.v);
84 #endif // !BC_ENABLE_OSSFUZZ
85
86 // Find the end of the comment.
87 for (i = l->i; !end; i += !end)
88 {
89 // While we don't have an asterisk, eat, but increment nlines.
90 for (; (c = buf[i]) && c != '*'; ++i)
91 {
92 nlines += (c == '\n');
93 }
94
95 // If this is true, we need to request more data.
96 if (BC_ERR(!c || buf[i + 1] == '\0'))
97 {
98 #if !BC_ENABLE_OSSFUZZ
99 // Read more, if possible.
100 if (!vm->eof && l->mode != BC_MODE_FILE)
101 {
102 got_more = bc_lex_readLine(l);
103 }
104 #endif // !BC_ENABLE_OSSFUZZ
105
106 break;
107 }
108
109 // If this turns true, we found the end. Yay!
110 end = (buf[i + 1] == '/');
111 }
112 }
113 while (got_more && !end);
114
115 // If we didn't find the end, barf.
116 if (!end)
117 {
118 l->i = i;
119 bc_lex_err(l, BC_ERR_PARSE_COMMENT);
120 }
121
122 l->i = i + 2;
123 l->line += nlines;
124 }
125
126 void
bc_lex_whitespace(BcLex * l)127 bc_lex_whitespace(BcLex* l)
128 {
129 char c;
130
131 l->t = BC_LEX_WHITESPACE;
132
133 // Eat. We don't eat newlines because they can be special.
134 for (c = l->buf[l->i]; c != '\n' && isspace(c); c = l->buf[++l->i])
135 {
136 continue;
137 }
138 }
139
140 void
bc_lex_commonTokens(BcLex * l,char c)141 bc_lex_commonTokens(BcLex* l, char c)
142 {
143 if (!c) l->t = BC_LEX_EOF;
144 else if (c == '\n') l->t = BC_LEX_NLINE;
145 else bc_lex_whitespace(l);
146 }
147
148 /**
149 * Parses a number.
150 * @param l The lexer.
151 * @param start The start character.
152 * @param int_only Whether this function should only look for an integer. This
153 * is used to implement the exponent of scientific notation.
154 */
155 static size_t
bc_lex_num(BcLex * l,char start,bool int_only)156 bc_lex_num(BcLex* l, char start, bool int_only)
157 {
158 const char* buf = l->buf + l->i;
159 size_t i;
160 char c;
161 bool last_pt, pt = (start == '.');
162
163 // This loop looks complex. It is not. It is asking if the character is not
164 // a nul byte and it if it a valid num character based on what we have found
165 // thus far, or whether it is a backslash followed by a newline. I can do
166 // i+1 on the buffer because the buffer must have a nul byte.
167 for (i = 0; (c = buf[i]) && (BC_LEX_NUM_CHAR(c, pt, int_only) ||
168 (c == '\\' && buf[i + 1] == '\n'));
169 ++i)
170 {
171 // I don't need to test that the next character is a newline because
172 // the loop condition above ensures that.
173 if (c == '\\')
174 {
175 i += 2;
176
177 // Make sure to eat whitespace at the beginning of the line.
178 while (isspace(buf[i]) && buf[i] != '\n')
179 {
180 i += 1;
181 }
182
183 c = buf[i];
184
185 // If the next character is not a number character, bail.
186 if (!BC_LEX_NUM_CHAR(c, pt, int_only)) break;
187 }
188
189 // Did we find the radix point?
190 last_pt = (c == '.');
191
192 // If we did, and we already have one, then break because it's not part
193 // of this number.
194 if (pt && last_pt) break;
195
196 // Set whether we have found a radix point.
197 pt = pt || last_pt;
198
199 bc_vec_push(&l->str, &c);
200 }
201
202 return i;
203 }
204
205 void
bc_lex_number(BcLex * l,char start)206 bc_lex_number(BcLex* l, char start)
207 {
208 l->t = BC_LEX_NUMBER;
209
210 // Make sure the string is clear.
211 bc_vec_popAll(&l->str);
212 bc_vec_push(&l->str, &start);
213
214 // Parse the number.
215 l->i += bc_lex_num(l, start, false);
216
217 #if BC_ENABLE_EXTRA_MATH
218 {
219 char c = l->buf[l->i];
220
221 // Do we have a number in scientific notation?
222 if (c == 'e')
223 {
224 #if BC_ENABLED
225 // Barf for POSIX.
226 if (BC_IS_POSIX) bc_lex_err(l, BC_ERR_POSIX_EXP_NUM);
227 #endif // BC_ENABLED
228
229 // Push the e.
230 bc_vec_push(&l->str, &c);
231 l->i += 1;
232 c = l->buf[l->i];
233
234 // Check for negative specifically because bc_lex_num() does not.
235 if (c == BC_LEX_NEG_CHAR)
236 {
237 bc_vec_push(&l->str, &c);
238 l->i += 1;
239 c = l->buf[l->i];
240 }
241
242 // We must have a number character, so barf if not.
243 if (BC_ERR(!BC_LEX_NUM_CHAR(c, false, true)))
244 {
245 bc_lex_verr(l, BC_ERR_PARSE_CHAR, c);
246 }
247
248 // Parse the exponent.
249 l->i += bc_lex_num(l, 0, true);
250 }
251 }
252 #endif // BC_ENABLE_EXTRA_MATH
253
254 bc_vec_pushByte(&l->str, '\0');
255 }
256
257 void
bc_lex_name(BcLex * l)258 bc_lex_name(BcLex* l)
259 {
260 size_t i = 0;
261 const char* buf = l->buf + l->i - 1;
262 char c = buf[i];
263
264 l->t = BC_LEX_NAME;
265
266 // Should be obvious. It's looking for valid characters.
267 while ((c >= 'a' && c <= 'z') || isdigit(c) || c == '_')
268 {
269 c = buf[++i];
270 }
271
272 // Set the string to the identifier.
273 bc_vec_string(&l->str, i, buf);
274
275 // Increment the index. We minus 1 because it has already been incremented.
276 l->i += i - 1;
277 }
278
279 void
bc_lex_init(BcLex * l)280 bc_lex_init(BcLex* l)
281 {
282 BC_SIG_ASSERT_LOCKED;
283 assert(l != NULL);
284 bc_vec_init(&l->str, sizeof(char), BC_DTOR_NONE);
285 }
286
287 void
bc_lex_free(BcLex * l)288 bc_lex_free(BcLex* l)
289 {
290 BC_SIG_ASSERT_LOCKED;
291 assert(l != NULL);
292 bc_vec_free(&l->str);
293 }
294
295 void
bc_lex_file(BcLex * l,const char * file)296 bc_lex_file(BcLex* l, const char* file)
297 {
298 assert(l != NULL && file != NULL);
299 l->line = 1;
300 vm->file = file;
301 }
302
303 void
bc_lex_next(BcLex * l)304 bc_lex_next(BcLex* l)
305 {
306 BC_SIG_ASSERT_LOCKED;
307
308 assert(l != NULL);
309
310 l->last = l->t;
311
312 // If this wasn't here, the line number would be off.
313 l->line += (l->i != 0 && l->buf[l->i - 1] == '\n');
314
315 // If the last token was EOF, someone called this one too many times.
316 if (BC_ERR(l->last == BC_LEX_EOF)) bc_lex_err(l, BC_ERR_PARSE_EOF);
317
318 l->t = BC_LEX_EOF;
319
320 // We are done if this is true.
321 if (l->i == l->len) return;
322
323 // Loop until failure or we don't have whitespace. This
324 // is so the parser doesn't get inundated with whitespace.
325 do
326 {
327 vm->next(l);
328 }
329 while (l->t == BC_LEX_WHITESPACE);
330 }
331
332 /**
333 * Updates the buffer and len so that they are not invalidated when the stdin
334 * buffer grows.
335 * @param l The lexer.
336 * @param text The text.
337 * @param len The length of the text.
338 */
339 static void
bc_lex_fixText(BcLex * l,const char * text,size_t len)340 bc_lex_fixText(BcLex* l, const char* text, size_t len)
341 {
342 l->buf = text;
343 l->len = len;
344 }
345
346 bool
bc_lex_readLine(BcLex * l)347 bc_lex_readLine(BcLex* l)
348 {
349 bool good;
350
351 // These are reversed because they should be already locked, but
352 // bc_vm_readLine() needs them to be unlocked.
353 BC_SIG_UNLOCK;
354
355 // Make sure we read from the appropriate place.
356 switch (l->mode)
357 {
358 case BC_MODE_EXPRS:
359 {
360 good = bc_vm_readBuf(false);
361 break;
362 }
363
364 case BC_MODE_FILE:
365 {
366 good = false;
367 break;
368 }
369
370 #if !BC_ENABLE_OSSFUZZ
371
372 case BC_MODE_STDIN:
373 {
374 good = bc_vm_readLine(false);
375 break;
376 }
377
378 #endif // !BC_ENABLE_OSSFUZZ
379
380 #ifdef __GNUC__
381 #ifndef __clang__
382 default:
383 {
384 // We should never get here.
385 abort();
386 }
387 #endif // __clang__
388 #endif // __GNUC__
389 }
390
391 BC_SIG_LOCK;
392
393 bc_lex_fixText(l, vm->buffer.v, vm->buffer.len - 1);
394
395 return good;
396 }
397
398 void
bc_lex_text(BcLex * l,const char * text,BcMode mode)399 bc_lex_text(BcLex* l, const char* text, BcMode mode)
400 {
401 BC_SIG_ASSERT_LOCKED;
402
403 assert(l != NULL && text != NULL);
404
405 bc_lex_fixText(l, text, strlen(text));
406 l->i = 0;
407 l->t = l->last = BC_LEX_INVALID;
408 l->mode = mode;
409
410 bc_lex_next(l);
411 }
412