xref: /illumos-gate/usr/src/tools/smatch/src/tokenize.c (revision c65ebfc7045424bd04a6c7719a27b0ad3399ad54)
1 /*
2  * This is a really stupid C tokenizer. It doesn't do any include
3  * files or anything complex at all. That's the preprocessor.
4  *
5  * Copyright (C) 2003 Transmeta Corp.
6  *               2003 Linus Torvalds
7  *
8  * Permission is hereby granted, free of charge, to any person obtaining a copy
9  * of this software and associated documentation files (the "Software"), to deal
10  * in the Software without restriction, including without limitation the rights
11  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12  * copies of the Software, and to permit persons to whom the Software is
13  * furnished to do so, subject to the following conditions:
14  *
15  * The above copyright notice and this permission notice shall be included in
16  * all copies or substantial portions of the Software.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24  * THE SOFTWARE.
25  */
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <stdarg.h>
29 #include <stddef.h>
30 #include <string.h>
31 #include <ctype.h>
32 #include <unistd.h>
33 #include <stdint.h>
34 
35 #include "lib.h"
36 #include "allocate.h"
37 #include "token.h"
38 #include "symbol.h"
39 
40 #define EOF (-1)
41 
42 int input_stream_nr = 0;
43 struct stream *input_streams;
44 static int input_streams_allocated;
45 unsigned int tabstop = 8;
46 int no_lineno = 0;
47 
48 #define BUFSIZE (8192)
49 
50 typedef struct {
51 	int fd, offset, size;
52 	int pos, line, nr;
53 	int newline, whitespace;
54 	struct token **tokenlist;
55 	struct token *token;
56 	unsigned char *buffer;
57 } stream_t;
58 
59 const char *stream_name(int stream)
60 {
61 	if (stream < 0 || stream > input_stream_nr)
62 		return "<bad stream>";
63 	return input_streams[stream].name;
64 }
65 
66 static struct position stream_pos(stream_t *stream)
67 {
68 	struct position pos;
69 	pos.type = 0;
70 	pos.stream = stream->nr;
71 	pos.newline = stream->newline;
72 	pos.whitespace = stream->whitespace;
73 	pos.pos = stream->pos;
74 
75 	pos.line = stream->line;
76 	if (no_lineno)
77 		pos.line = 123456;
78 
79 	pos.noexpand = 0;
80 	return pos;
81 }
82 
83 const char *show_special(int val)
84 {
85 	static char buffer[4];
86 
87 	buffer[0] = val;
88 	buffer[1] = 0;
89 	if (val >= SPECIAL_BASE)
90 		strcpy(buffer, (char *) combinations[val - SPECIAL_BASE]);
91 	return buffer;
92 }
93 
94 const char *show_ident(const struct ident *ident)
95 {
96 	static char buffer[256];
97 	if (!ident)
98 		return "<noident>";
99 	sprintf(buffer, "%.*s", ident->len, ident->name);
100 	return buffer;
101 }
102 
103 static char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
104 {
105 	if (isprint(c)) {
106 		if (c == escape || c == '\\')
107 			*ptr++ = '\\';
108 		*ptr++ = c;
109 		return ptr;
110 	}
111 	*ptr++ = '\\';
112 	switch (c) {
113 	case '\n':
114 		*ptr++ = 'n';
115 		return ptr;
116 	case '\t':
117 		*ptr++ = 't';
118 		return ptr;
119 	}
120 	if (!isdigit(next))
121 		return ptr + sprintf(ptr, "%o", c);
122 
123 	return ptr + sprintf(ptr, "%03o", c);
124 }
125 
126 const char *show_string(const struct string *string)
127 {
128 	static char buffer[4 * MAX_STRING + 3];
129 	char *ptr;
130 	int i;
131 
132 	if (!string->length)
133 		return "<bad_string>";
134 	ptr = buffer;
135 	*ptr++ = '"';
136 	for (i = 0; i < string->length-1; i++) {
137 		const char *p = string->data + i;
138 		ptr = charstr(ptr, p[0], '"', p[1]);
139 	}
140 	*ptr++ = '"';
141 	*ptr = '\0';
142 	return buffer;
143 }
144 
145 static const char *show_char(const char *s, size_t len, char prefix, char delim)
146 {
147 	static char buffer[MAX_STRING + 4];
148 	char *p = buffer;
149 	if (prefix)
150 		*p++ = prefix;
151 	*p++ = delim;
152 	memcpy(p, s, len);
153 	p += len;
154 	*p++ = delim;
155 	*p++ = '\0';
156 	return buffer;
157 }
158 
159 static const char *quote_char(const char *s, size_t len, char prefix, char delim)
160 {
161 	static char buffer[2*MAX_STRING + 6];
162 	size_t i;
163 	char *p = buffer;
164 	if (prefix)
165 		*p++ = prefix;
166 	if (delim == '"')
167 		*p++ = '\\';
168 	*p++ = delim;
169 	for (i = 0; i < len; i++) {
170 		if (s[i] == '"' || s[i] == '\\')
171 			*p++ = '\\';
172 		*p++ = s[i];
173 	}
174 	if (delim == '"')
175 		*p++ = '\\';
176 	*p++ = delim;
177 	*p++ = '\0';
178 	return buffer;
179 }
180 
181 const char *show_token(const struct token *token)
182 {
183 	static char buffer[256];
184 
185 	if (!token)
186 		return "<no token>";
187 	switch (token_type(token)) {
188 	case TOKEN_ERROR:
189 		return "syntax error";
190 
191 	case TOKEN_EOF:
192 		return "end-of-input";
193 
194 	case TOKEN_IDENT:
195 		return show_ident(token->ident);
196 
197 	case TOKEN_NUMBER:
198 		return token->number;
199 
200 	case TOKEN_SPECIAL:
201 		return show_special(token->special);
202 
203 	case TOKEN_CHAR:
204 		return show_char(token->string->data,
205 			token->string->length - 1, 0, '\'');
206 	case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
207 		return show_char(token->embedded,
208 			token_type(token) - TOKEN_CHAR, 0, '\'');
209 	case TOKEN_WIDE_CHAR:
210 		return show_char(token->string->data,
211 			token->string->length - 1, 'L', '\'');
212 	case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
213 		return show_char(token->embedded,
214 			token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
215 	case TOKEN_STRING:
216 		return show_char(token->string->data,
217 			token->string->length - 1, 0, '"');
218 	case TOKEN_WIDE_STRING:
219 		return show_char(token->string->data,
220 			token->string->length - 1, 'L', '"');
221 
222 	case TOKEN_STREAMBEGIN:
223 		sprintf(buffer, "<beginning of '%s'>", stream_name(token->pos.stream));
224 		return buffer;
225 
226 	case TOKEN_STREAMEND:
227 		sprintf(buffer, "<end of '%s'>", stream_name(token->pos.stream));
228 		return buffer;
229 
230 	case TOKEN_UNTAINT:
231 		sprintf(buffer, "<untaint>");
232 		return buffer;
233 
234 	case TOKEN_ARG_COUNT:
235 		sprintf(buffer, "<argcnt>");
236 		return buffer;
237 
238 	default:
239 		sprintf(buffer, "unhandled token type '%d' ", token_type(token));
240 		return buffer;
241 	}
242 }
243 
244 const char *quote_token(const struct token *token)
245 {
246 	static char buffer[256];
247 
248 	switch (token_type(token)) {
249 	case TOKEN_ERROR:
250 		return "syntax error";
251 
252 	case TOKEN_IDENT:
253 		return show_ident(token->ident);
254 
255 	case TOKEN_NUMBER:
256 		return token->number;
257 
258 	case TOKEN_SPECIAL:
259 		return show_special(token->special);
260 
261 	case TOKEN_CHAR:
262 		return quote_char(token->string->data,
263 			token->string->length - 1, 0, '\'');
264 	case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
265 		return quote_char(token->embedded,
266 			token_type(token) - TOKEN_CHAR, 0, '\'');
267 	case TOKEN_WIDE_CHAR:
268 		return quote_char(token->string->data,
269 			token->string->length - 1, 'L', '\'');
270 	case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
271 		return quote_char(token->embedded,
272 			token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
273 	case TOKEN_STRING:
274 		return quote_char(token->string->data,
275 			token->string->length - 1, 0, '"');
276 	case TOKEN_WIDE_STRING:
277 		return quote_char(token->string->data,
278 			token->string->length - 1, 'L', '"');
279 	default:
280 		sprintf(buffer, "unhandled token type '%d' ", token_type(token));
281 		return buffer;
282 	}
283 }
284 
285 #define HASHED_INPUT_BITS (6)
286 #define HASHED_INPUT (1 << HASHED_INPUT_BITS)
287 #define HASH_PRIME 0x9e370001UL
288 
289 static int input_stream_hashes[HASHED_INPUT] = { [0 ... HASHED_INPUT-1] = -1 };
290 
291 int *hash_stream(const char *name)
292 {
293 	uint32_t hash = 0;
294 	unsigned char c;
295 
296 	while ((c = *name++) != 0)
297 		hash = (hash + (c << 4) + (c >> 4)) * 11;
298 
299 	hash *= HASH_PRIME;
300 	hash >>= 32 - HASHED_INPUT_BITS;
301 	return input_stream_hashes + hash;
302 }
303 
304 int init_stream(const char *name, int fd, const char **next_path)
305 {
306 	int stream = input_stream_nr, *hash;
307 	struct stream *current;
308 
309 	if (stream >= input_streams_allocated) {
310 		int newalloc = stream * 4 / 3 + 10;
311 		input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
312 		if (!input_streams)
313 			die("Unable to allocate more streams space");
314 		input_streams_allocated = newalloc;
315 	}
316 	current = input_streams + stream;
317 	memset(current, 0, sizeof(*current));
318 	current->name = name;
319 	current->fd = fd;
320 	current->next_path = next_path;
321 	current->path = NULL;
322 	current->constant = CONSTANT_FILE_MAYBE;
323 	input_stream_nr = stream+1;
324 	hash = hash_stream(name);
325 	current->next_stream = *hash;
326 	*hash = stream;
327 	return stream;
328 }
329 
330 static struct token * alloc_token(stream_t *stream)
331 {
332 	struct token *token = __alloc_token(0);
333 	token->pos = stream_pos(stream);
334 	return token;
335 }
336 
337 /*
338  *  Argh...  That was surprisingly messy - handling '\r' complicates the
339  *  things a _lot_.
340  */
341 static int nextchar_slow(stream_t *stream)
342 {
343 	int offset = stream->offset;
344 	int size = stream->size;
345 	int c;
346 	int spliced = 0, had_cr, had_backslash;
347 
348 restart:
349 	had_cr = had_backslash = 0;
350 
351 repeat:
352 	if (offset >= size) {
353 		if (stream->fd < 0)
354 			goto got_eof;
355 		size = read(stream->fd, stream->buffer, BUFSIZE);
356 		if (size <= 0)
357 			goto got_eof;
358 		stream->size = size;
359 		stream->offset = offset = 0;
360 	}
361 
362 	c = stream->buffer[offset++];
363 	if (had_cr)
364 		goto check_lf;
365 
366 	if (c == '\r') {
367 		had_cr = 1;
368 		goto repeat;
369 	}
370 
371 norm:
372 	if (!had_backslash) {
373 		switch (c) {
374 		case '\t':
375 			stream->pos += tabstop - stream->pos % tabstop;
376 			break;
377 		case '\n':
378 			stream->line++;
379 			stream->pos = 0;
380 			stream->newline = 1;
381 			break;
382 		case '\\':
383 			had_backslash = 1;
384 			stream->pos++;
385 			goto repeat;
386 		default:
387 			stream->pos++;
388 		}
389 	} else {
390 		if (c == '\n') {
391 			stream->line++;
392 			stream->pos = 0;
393 			spliced = 1;
394 			goto restart;
395 		}
396 		offset--;
397 		c = '\\';
398 	}
399 out:
400 	stream->offset = offset;
401 
402 	return c;
403 
404 check_lf:
405 	if (c != '\n')
406 		offset--;
407 	c = '\n';
408 	goto norm;
409 
410 got_eof:
411 	if (had_backslash) {
412 		c = '\\';
413 		goto out;
414 	}
415 	if (stream->pos)
416 		warning(stream_pos(stream), "no newline at end of file");
417 	else if (spliced)
418 		warning(stream_pos(stream), "backslash-newline at end of file");
419 	return EOF;
420 }
421 
422 /*
423  *  We want that as light as possible while covering all normal cases.
424  *  Slow path (including the logics with line-splicing and EOF sanity
425  *  checks) is in nextchar_slow().
426  */
427 static inline int nextchar(stream_t *stream)
428 {
429 	int offset = stream->offset;
430 
431 	if (offset < stream->size) {
432 		int c = stream->buffer[offset++];
433 		static const char special[256] = {
434 			['\t'] = 1, ['\r'] = 1, ['\n'] = 1, ['\\'] = 1
435 		};
436 		if (!special[c]) {
437 			stream->offset = offset;
438 			stream->pos++;
439 			return c;
440 		}
441 	}
442 	return nextchar_slow(stream);
443 }
444 
445 struct token eof_token_entry;
446 
447 static struct token *mark_eof(stream_t *stream)
448 {
449 	struct token *end;
450 
451 	end = alloc_token(stream);
452 	token_type(end) = TOKEN_STREAMEND;
453 	end->pos.newline = 1;
454 
455 	eof_token_entry.next = &eof_token_entry;
456 	eof_token_entry.pos.newline = 1;
457 
458 	end->next =  &eof_token_entry;
459 	*stream->tokenlist = end;
460 	stream->tokenlist = NULL;
461 	return end;
462 }
463 
464 static void add_token(stream_t *stream)
465 {
466 	struct token *token = stream->token;
467 
468 	stream->token = NULL;
469 	token->next = NULL;
470 	*stream->tokenlist = token;
471 	stream->tokenlist = &token->next;
472 }
473 
474 static void drop_token(stream_t *stream)
475 {
476 	stream->newline |= stream->token->pos.newline;
477 	stream->whitespace |= stream->token->pos.whitespace;
478 	stream->token = NULL;
479 }
480 
481 enum {
482 	Letter = 1,
483 	Digit = 2,
484 	Hex = 4,
485 	Exp = 8,
486 	Dot = 16,
487 	ValidSecond = 32,
488 	Quote = 64,
489 };
490 
491 static const long cclass[257] = {
492 	['0' + 1 ... '7' + 1] = Digit | Hex,	/* \<octal> */
493 	['8' + 1 ... '9' + 1] = Digit | Hex,
494 	['A' + 1 ... 'D' + 1] = Letter | Hex,
495 	['E' + 1] = Letter | Hex | Exp,	/* E<exp> */
496 	['F' + 1] = Letter | Hex,
497 	['G' + 1 ... 'O' + 1] = Letter,
498 	['P' + 1] = Letter | Exp,	/* P<exp> */
499 	['Q' + 1 ... 'Z' + 1] = Letter,
500 	['a' + 1 ... 'b' + 1] = Letter | Hex, /* \a, \b */
501 	['c' + 1 ... 'd' + 1] = Letter | Hex,
502 	['e' + 1] = Letter | Hex | Exp,/* \e, e<exp> */
503 	['f' + 1] = Letter | Hex,	/* \f */
504 	['g' + 1 ... 'm' + 1] = Letter,
505 	['n' + 1] = Letter,	/* \n */
506 	['o' + 1] = Letter,
507 	['p' + 1] = Letter | Exp,	/* p<exp> */
508 	['q' + 1] = Letter,
509 	['r' + 1] = Letter,	/* \r */
510 	['s' + 1] = Letter,
511 	['t' + 1] = Letter,	/* \t */
512 	['u' + 1] = Letter,
513 	['v' + 1] = Letter,	/* \v */
514 	['w' + 1] = Letter,
515 	['x' + 1] = Letter,	/* \x<hex> */
516 	['y' + 1 ... 'z' + 1] = Letter,
517 	['_' + 1] = Letter,
518 	['.' + 1] = Dot | ValidSecond,
519 	['=' + 1] = ValidSecond,
520 	['+' + 1] = ValidSecond,
521 	['-' + 1] = ValidSecond,
522 	['>' + 1] = ValidSecond,
523 	['<' + 1] = ValidSecond,
524 	['&' + 1] = ValidSecond,
525 	['|' + 1] = ValidSecond,
526 	['#' + 1] = ValidSecond,
527 	['\'' + 1] = Quote,
528 	['"' + 1] = Quote,
529 };
530 
531 /*
532  * pp-number:
533  *	digit
534  *	. digit
535  *	pp-number digit
536  *	pp-number identifier-nodigit
537  *	pp-number e sign
538  *	pp-number E sign
539  *	pp-number p sign
540  *	pp-number P sign
541  *	pp-number .
542  */
543 static int get_one_number(int c, int next, stream_t *stream)
544 {
545 	struct token *token;
546 	static char buffer[4095];
547 	char *p = buffer, *buf, *buffer_end = buffer + sizeof (buffer);
548 	int len;
549 
550 	*p++ = c;
551 	for (;;) {
552 		long class =  cclass[next + 1];
553 		if (!(class & (Dot | Digit | Letter)))
554 			break;
555 		if (p != buffer_end)
556 			*p++ = next;
557 		next = nextchar(stream);
558 		if (class & Exp) {
559 			if (next == '-' || next == '+') {
560 				if (p != buffer_end)
561 					*p++ = next;
562 				next = nextchar(stream);
563 			}
564 		}
565 	}
566 
567 	if (p == buffer_end) {
568 		sparse_error(stream_pos(stream), "number token exceeds %td characters",
569 		      buffer_end - buffer);
570 		// Pretend we saw just "1".
571 		buffer[0] = '1';
572 		p = buffer + 1;
573 	}
574 
575 	*p++ = 0;
576 	len = p - buffer;
577 	buf = __alloc_bytes(len);
578 	memcpy(buf, buffer, len);
579 
580 	token = stream->token;
581 	token_type(token) = TOKEN_NUMBER;
582 	token->number = buf;
583 	add_token(stream);
584 
585 	return next;
586 }
587 
588 static int eat_string(int next, stream_t *stream, enum token_type type)
589 {
590 	static char buffer[MAX_STRING];
591 	struct string *string;
592 	struct token *token = stream->token;
593 	int len = 0;
594 	int escape;
595 	int want_hex = 0;
596 	char delim = type < TOKEN_STRING ? '\'' : '"';
597 
598 	for (escape = 0; escape || next != delim; next = nextchar(stream)) {
599 		if (len < MAX_STRING)
600 			buffer[len] = next;
601 		len++;
602 		if (next == '\n') {
603 			warning(stream_pos(stream),
604 				"Newline in string or character constant");
605 			if (delim == '\'') /* assume it's lost ' */
606 				break;
607 		}
608 		if (next == EOF) {
609 			warning(stream_pos(stream),
610 				"End of file in middle of string");
611 			return next;
612 		}
613 		if (!escape) {
614 			if (want_hex && !(cclass[next + 1] & Hex))
615 				warning(stream_pos(stream),
616 					"\\x used with no following hex digits");
617 			want_hex = 0;
618 			escape = next == '\\';
619 		} else {
620 			escape = 0;
621 			want_hex = next == 'x';
622 		}
623 	}
624 	if (want_hex)
625 		warning(stream_pos(stream),
626 			"\\x used with no following hex digits");
627 	if (len > MAX_STRING) {
628 		warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
629 		len = MAX_STRING;
630 	}
631 	if (delim == '\'' && len <= 4) {
632 		if (len == 0) {
633 			sparse_error(stream_pos(stream),
634 				"empty character constant");
635 			return nextchar(stream);
636 		}
637 		token_type(token) = type + len;
638 		memset(buffer + len, '\0', 4 - len);
639 		memcpy(token->embedded, buffer, 4);
640 	} else {
641 		token_type(token) = type;
642 		string = __alloc_string(len+1);
643 		memcpy(string->data, buffer, len);
644 		string->data[len] = '\0';
645 		string->length = len+1;
646 		token->string = string;
647 	}
648 
649 	/* Pass it on.. */
650 	token = stream->token;
651 	add_token(stream);
652 	return nextchar(stream);
653 }
654 
655 static int drop_stream_eoln(stream_t *stream)
656 {
657 	drop_token(stream);
658 	for (;;) {
659 		switch (nextchar(stream)) {
660 		case EOF:
661 			return EOF;
662 		case '\n':
663 			return nextchar(stream);
664 		}
665 	}
666 }
667 
668 static int drop_stream_comment(stream_t *stream)
669 {
670 	int newline;
671 	int next;
672 	drop_token(stream);
673 	newline = stream->newline;
674 
675 	next = nextchar(stream);
676 	for (;;) {
677 		int curr = next;
678 		if (curr == EOF) {
679 			warning(stream_pos(stream), "End of file in the middle of a comment");
680 			return curr;
681 		}
682 		next = nextchar(stream);
683 		if (curr == '*' && next == '/')
684 			break;
685 	}
686 	stream->newline = newline;
687 	return nextchar(stream);
688 }
689 
690 unsigned char combinations[][4] = COMBINATION_STRINGS;
691 
692 #define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE)
693 
694 /* hash function for two-character punctuators - all give unique values */
695 #define special_hash(c0, c1) (((c0*8+c1*2)+((c0*8+c1*2)>>5))&31)
696 
697 /*
698  * note that we won't get false positives - special_hash(0,0) is 0 and
699  * entry 0 is filled (by +=), so all the missing ones are OK.
700  */
701 static unsigned char hash_results[32][2] = {
702 #define RES(c0, c1) [special_hash(c0, c1)] = {c0, c1}
703 	RES('+', '='), /* 00 */
704 	RES('/', '='), /* 01 */
705 	RES('^', '='), /* 05 */
706 	RES('&', '&'), /* 07 */
707 	RES('#', '#'), /* 08 */
708 	RES('<', '<'), /* 0a */
709 	RES('<', '='), /* 0c */
710 	RES('!', '='), /* 0e */
711 	RES('%', '='), /* 0f */
712 	RES('-', '-'), /* 10 */
713 	RES('-', '='), /* 11 */
714 	RES('-', '>'), /* 13 */
715 	RES('=', '='), /* 15 */
716 	RES('&', '='), /* 17 */
717 	RES('*', '='), /* 18 */
718 	RES('.', '.'), /* 1a */
719 	RES('+', '+'), /* 1b */
720 	RES('|', '='), /* 1c */
721 	RES('>', '='), /* 1d */
722 	RES('|', '|'), /* 1e */
723 	RES('>', '>')  /* 1f */
724 #undef RES
725 };
726 static int code[32] = {
727 #define CODE(c0, c1, value) [special_hash(c0, c1)] = value
728 	CODE('+', '=', SPECIAL_ADD_ASSIGN), /* 00 */
729 	CODE('/', '=', SPECIAL_DIV_ASSIGN), /* 01 */
730 	CODE('^', '=', SPECIAL_XOR_ASSIGN), /* 05 */
731 	CODE('&', '&', SPECIAL_LOGICAL_AND), /* 07 */
732 	CODE('#', '#', SPECIAL_HASHHASH), /* 08 */
733 	CODE('<', '<', SPECIAL_LEFTSHIFT), /* 0a */
734 	CODE('<', '=', SPECIAL_LTE), /* 0c */
735 	CODE('!', '=', SPECIAL_NOTEQUAL), /* 0e */
736 	CODE('%', '=', SPECIAL_MOD_ASSIGN), /* 0f */
737 	CODE('-', '-', SPECIAL_DECREMENT), /* 10 */
738 	CODE('-', '=', SPECIAL_SUB_ASSIGN), /* 11 */
739 	CODE('-', '>', SPECIAL_DEREFERENCE), /* 13 */
740 	CODE('=', '=', SPECIAL_EQUAL), /* 15 */
741 	CODE('&', '=', SPECIAL_AND_ASSIGN), /* 17 */
742 	CODE('*', '=', SPECIAL_MUL_ASSIGN), /* 18 */
743 	CODE('.', '.', SPECIAL_DOTDOT), /* 1a */
744 	CODE('+', '+', SPECIAL_INCREMENT), /* 1b */
745 	CODE('|', '=', SPECIAL_OR_ASSIGN), /* 1c */
746 	CODE('>', '=', SPECIAL_GTE), /* 1d */
747 	CODE('|', '|', SPECIAL_LOGICAL_OR), /* 1e */
748 	CODE('>', '>', SPECIAL_RIGHTSHIFT)  /* 1f */
749 #undef CODE
750 };
751 
752 static int get_one_special(int c, stream_t *stream)
753 {
754 	struct token *token;
755 	int next, value, i;
756 
757 	next = nextchar(stream);
758 
759 	/*
760 	 * Check for numbers, strings, character constants, and comments
761 	 */
762 	switch (c) {
763 	case '.':
764 		if (next >= '0' && next <= '9')
765 			return get_one_number(c, next, stream);
766 		break;
767 	case '"':
768 		return eat_string(next, stream, TOKEN_STRING);
769 	case '\'':
770 		return eat_string(next, stream, TOKEN_CHAR);
771 	case '/':
772 		if (next == '/')
773 			return drop_stream_eoln(stream);
774 		if (next == '*')
775 			return drop_stream_comment(stream);
776 	}
777 
778 	/*
779 	 * Check for combinations
780 	 */
781 	value = c;
782 	if (cclass[next + 1] & ValidSecond) {
783 		i = special_hash(c, next);
784 		if (hash_results[i][0] == c && hash_results[i][1] == next) {
785 			value = code[i];
786 			next = nextchar(stream);
787 			if (value >= SPECIAL_LEFTSHIFT &&
788 			    next == "==."[value - SPECIAL_LEFTSHIFT]) {
789 				value += 3;
790 				next = nextchar(stream);
791 			}
792 		}
793 	}
794 
795 	/* Pass it on.. */
796 	token = stream->token;
797 	token_type(token) = TOKEN_SPECIAL;
798 	token->special = value;
799 	add_token(stream);
800 	return next;
801 }
802 
803 #define IDENT_HASH_BITS (13)
804 #define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
805 #define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)
806 
807 #define ident_hash_init(c)		(c)
808 #define ident_hash_add(oldhash,c)	((oldhash)*11 + (c))
809 #define ident_hash_end(hash)		((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)
810 
811 static struct ident *hash_table[IDENT_HASH_SIZE];
812 static int ident_hit, ident_miss, idents;
813 
814 void show_identifier_stats(void)
815 {
816 	int i;
817 	int distribution[100];
818 
819 	fprintf(stderr, "identifiers: %d hits, %d misses\n",
820 		ident_hit, ident_miss);
821 
822 	for (i = 0; i < 100; i++)
823 		distribution[i] = 0;
824 
825 	for (i = 0; i < IDENT_HASH_SIZE; i++) {
826 		struct ident * ident = hash_table[i];
827 		int count = 0;
828 
829 		while (ident) {
830 			count++;
831 			ident = ident->next;
832 		}
833 		if (count > 99)
834 			count = 99;
835 		distribution[count]++;
836 	}
837 
838 	for (i = 0; i < 100; i++) {
839 		if (distribution[i])
840 			fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
841 	}
842 }
843 
844 static struct ident *alloc_ident(const char *name, int len)
845 {
846 	struct ident *ident = __alloc_ident(len);
847 	ident->symbols = NULL;
848 	ident->len = len;
849 	ident->tainted = 0;
850 	memcpy(ident->name, name, len);
851 	return ident;
852 }
853 
854 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
855 {
856 	ident->next = hash_table[hash];
857 	hash_table[hash] = ident;
858 	ident_miss++;
859 	return ident;
860 }
861 
862 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
863 {
864 	struct ident *ident;
865 	struct ident **p;
866 
867 	p = &hash_table[hash];
868 	while ((ident = *p) != NULL) {
869 		if (ident->len == (unsigned char) len) {
870 			if (strncmp(name, ident->name, len) != 0)
871 				goto next;
872 
873 			ident_hit++;
874 			return ident;
875 		}
876 next:
877 		//misses++;
878 		p = &ident->next;
879 	}
880 	ident = alloc_ident(name, len);
881 	*p = ident;
882 	ident->next = NULL;
883 	ident_miss++;
884 	idents++;
885 	return ident;
886 }
887 
888 static unsigned long hash_name(const char *name, int len)
889 {
890 	unsigned long hash;
891 	const unsigned char *p = (const unsigned char *)name;
892 
893 	hash = ident_hash_init(*p++);
894 	while (--len) {
895 		unsigned int i = *p++;
896 		hash = ident_hash_add(hash, i);
897 	}
898 	return ident_hash_end(hash);
899 }
900 
901 struct ident *hash_ident(struct ident *ident)
902 {
903 	return insert_hash(ident, hash_name(ident->name, ident->len));
904 }
905 
906 struct ident *built_in_ident(const char *name)
907 {
908 	int len = strlen(name);
909 	return create_hashed_ident(name, len, hash_name(name, len));
910 }
911 
912 struct token *built_in_token(int stream, struct ident *ident)
913 {
914 	struct token *token;
915 
916 	token = __alloc_token(0);
917 	token->pos.stream = stream;
918 	token_type(token) = TOKEN_IDENT;
919 	token->ident = ident;
920 	return token;
921 }
922 
923 static int get_one_identifier(int c, stream_t *stream)
924 {
925 	struct token *token;
926 	struct ident *ident;
927 	unsigned long hash;
928 	char buf[256];
929 	int len = 1;
930 	int next;
931 
932 	hash = ident_hash_init(c);
933 	buf[0] = c;
934 	for (;;) {
935 		next = nextchar(stream);
936 		if (!(cclass[next + 1] & (Letter | Digit)))
937 			break;
938 		if (len >= sizeof(buf))
939 			break;
940 		hash = ident_hash_add(hash, next);
941 		buf[len] = next;
942 		len++;
943 	};
944 	if (cclass[next + 1] & Quote) {
945 		if (len == 1 && buf[0] == 'L') {
946 			if (next == '\'')
947 				return eat_string(nextchar(stream), stream,
948 							TOKEN_WIDE_CHAR);
949 			else
950 				return eat_string(nextchar(stream), stream,
951 							TOKEN_WIDE_STRING);
952 		}
953 	}
954 	hash = ident_hash_end(hash);
955 	ident = create_hashed_ident(buf, len, hash);
956 
957 	/* Pass it on.. */
958 	token = stream->token;
959 	token_type(token) = TOKEN_IDENT;
960 	token->ident = ident;
961 	add_token(stream);
962 	return next;
963 }
964 
965 static int get_one_token(int c, stream_t *stream)
966 {
967 	long class = cclass[c + 1];
968 	if (class & Digit)
969 		return get_one_number(c, nextchar(stream), stream);
970 	if (class & Letter)
971 		return get_one_identifier(c, stream);
972 	return get_one_special(c, stream);
973 }
974 
975 static struct token *setup_stream(stream_t *stream, int idx, int fd,
976 	unsigned char *buf, unsigned int buf_size)
977 {
978 	struct token *begin;
979 
980 	stream->nr = idx;
981 	stream->line = 1;
982 	stream->newline = 1;
983 	stream->whitespace = 0;
984 	stream->pos = 0;
985 
986 	stream->token = NULL;
987 	stream->fd = fd;
988 	stream->offset = 0;
989 	stream->size = buf_size;
990 	stream->buffer = buf;
991 
992 	begin = alloc_token(stream);
993 	token_type(begin) = TOKEN_STREAMBEGIN;
994 	stream->tokenlist = &begin->next;
995 	return begin;
996 }
997 
998 static struct token *tokenize_stream(stream_t *stream)
999 {
1000 	int c = nextchar(stream);
1001 	while (c != EOF) {
1002 		if (!isspace(c)) {
1003 			struct token *token = alloc_token(stream);
1004 			stream->token = token;
1005 			stream->newline = 0;
1006 			stream->whitespace = 0;
1007 			c = get_one_token(c, stream);
1008 			continue;
1009 		}
1010 		stream->whitespace = 1;
1011 		c = nextchar(stream);
1012 	}
1013 	return mark_eof(stream);
1014 }
1015 
1016 struct token * tokenize_buffer(void *buffer, unsigned long size, struct token **endtoken)
1017 {
1018 	stream_t stream;
1019 	struct token *begin;
1020 
1021 	begin = setup_stream(&stream, 0, -1, buffer, size);
1022 	*endtoken = tokenize_stream(&stream);
1023 	return begin;
1024 }
1025 
1026 struct token * tokenize(const char *name, int fd, struct token *endtoken, const char **next_path)
1027 {
1028 	struct token *begin, *end;
1029 	stream_t stream;
1030 	unsigned char buffer[BUFSIZE];
1031 	int idx;
1032 
1033 	idx = init_stream(name, fd, next_path);
1034 	if (idx < 0) {
1035 		// info(endtoken->pos, "File %s is const", name);
1036 		return endtoken;
1037 	}
1038 
1039 	begin = setup_stream(&stream, idx, fd, buffer, 0);
1040 	end = tokenize_stream(&stream);
1041 	if (endtoken)
1042 		end->next = endtoken;
1043 	return begin;
1044 }
1045