1 /*- 2 * SPDX-License-Identifier: BSD-4-Clause 3 * 4 * Copyright (c) 1985 Sun Microsystems, Inc. 5 * Copyright (c) 1980, 1993 6 * The Regents of the University of California. All rights reserved. 7 * All rights reserved. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. All advertising materials mentioning features or use of this software 18 * must display the following acknowledgement: 19 * This product includes software developed by the University of 20 * California, Berkeley and its contributors. 21 * 4. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 36 */ 37 38 #if 0 39 #ifndef lint 40 static char sccsid[] = "@(#)lexi.c 8.1 (Berkeley) 6/6/93"; 41 #endif /* not lint */ 42 #endif 43 #include <sys/cdefs.h> 44 __FBSDID("$FreeBSD$"); 45 46 /* 47 * Here we have the token scanner for indent. It scans off one token and puts 48 * it in the global variable "token". It returns a code, indicating the type 49 * of token scanned. 50 */ 51 52 #include <err.h> 53 #include <stdio.h> 54 #include <ctype.h> 55 #include <stdlib.h> 56 #include <string.h> 57 #include "indent_globs.h" 58 #include "indent_codes.h" 59 #include "indent.h" 60 61 #define alphanum 1 62 #ifdef undef 63 #define opchar 3 64 #endif 65 66 struct templ { 67 const char *rwd; 68 int rwcode; 69 }; 70 71 /* 72 * This table has to be sorted alphabetically, because it'll be used in binary 73 * search. For the same reason, string must be the first thing in struct templ. 74 */ 75 struct templ specials[] = 76 { 77 {"_Bool", 4}, 78 {"_Complex", 4}, 79 {"_Imaginary", 4}, 80 {"auto", 10}, 81 {"bool", 4}, 82 {"break", 9}, 83 {"case", 8}, 84 {"char", 4}, 85 {"complex", 4}, 86 {"const", 4}, 87 {"continue", 12}, 88 {"default", 8}, 89 {"do", 6}, 90 {"double", 4}, 91 {"else", 6}, 92 {"enum", 3}, 93 {"extern", 10}, 94 {"float", 4}, 95 {"for", 5}, 96 {"global", 4}, 97 {"goto", 9}, 98 {"if", 5}, 99 {"imaginary", 4}, 100 {"inline", 12}, 101 {"int", 4}, 102 {"long", 4}, 103 {"offsetof", 1}, 104 {"register", 10}, 105 {"restrict", 12}, 106 {"return", 9}, 107 {"short", 4}, 108 {"signed", 4}, 109 {"sizeof", 2}, 110 {"static", 10}, 111 {"struct", 3}, 112 {"switch", 7}, 113 {"typedef", 11}, 114 {"union", 3}, 115 {"unsigned", 4}, 116 {"void", 4}, 117 {"volatile", 4}, 118 {"while", 5} 119 }; 120 121 const char **typenames; 122 int typename_count; 123 int typename_top = -1; 124 125 char chartype[128] = 126 { /* this is used to facilitate the decision of 127 * what type (alphanumeric, operator) each 128 * character is */ 129 0, 0, 0, 0, 0, 0, 0, 0, 130 0, 0, 0, 0, 0, 0, 0, 0, 131 0, 0, 0, 0, 0, 0, 0, 0, 132 0, 0, 0, 0, 0, 0, 0, 0, 133 0, 3, 0, 0, 1, 3, 3, 0, 134 0, 0, 3, 3, 0, 3, 0, 3, 135 1, 1, 1, 1, 1, 1, 1, 1, 136 1, 1, 0, 0, 3, 3, 3, 3, 137 0, 1, 1, 1, 1, 1, 1, 1, 138 1, 1, 1, 1, 1, 1, 1, 1, 139 1, 1, 1, 1, 1, 1, 1, 1, 140 1, 1, 1, 0, 0, 0, 3, 1, 141 0, 1, 1, 1, 1, 1, 1, 1, 142 1, 1, 1, 1, 1, 1, 1, 1, 143 1, 1, 1, 1, 1, 1, 1, 1, 144 1, 1, 1, 0, 3, 0, 3, 0 145 }; 146 147 static int 148 strcmp_type(const void *e1, const void *e2) 149 { 150 return (strcmp(e1, *(const char * const *)e2)); 151 } 152 153 int 154 lexi(struct parser_state *state) 155 { 156 int unary_delim; /* this is set to 1 if the current token 157 * forces a following operator to be unary */ 158 int code; /* internal code to be returned */ 159 char qchar; /* the delimiter character for a string */ 160 161 e_token = s_token; /* point to start of place to save token */ 162 unary_delim = false; 163 state->col_1 = state->last_nl; /* tell world that this token started 164 * in column 1 iff the last thing 165 * scanned was a newline */ 166 state->last_nl = false; 167 168 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 169 state->col_1 = false; /* leading blanks imply token is not in column 170 * 1 */ 171 if (++buf_ptr >= buf_end) 172 fill_buffer(); 173 } 174 175 /* Scan an alphanumeric token */ 176 if (chartype[*buf_ptr & 127] == alphanum || 177 (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) { 178 /* 179 * we have a character or number 180 */ 181 struct templ *p; 182 183 if (isdigit((unsigned char)*buf_ptr) || 184 (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) { 185 int seendot = 0, 186 seenexp = 0, 187 seensfx = 0; 188 189 /* 190 * base 2, base 8, base 16: 191 */ 192 if (buf_ptr[0] == '0' && buf_ptr[1] != '.') { 193 int len; 194 195 if (buf_ptr[1] == 'b' || buf_ptr[1] == 'B') 196 len = strspn(buf_ptr + 2, "01") + 2; 197 else if (buf_ptr[1] == 'x' || buf_ptr[1] == 'X') 198 len = strspn(buf_ptr + 2, "0123456789ABCDEFabcdef") + 2; 199 else 200 len = strspn(buf_ptr + 1, "012345678") + 1; 201 if (len > 0) { 202 CHECK_SIZE_TOKEN(len); 203 memcpy(e_token, buf_ptr, len); 204 e_token += len; 205 buf_ptr += len; 206 } 207 else 208 diag2(1, "Unterminated literal"); 209 } 210 else /* base 10: */ 211 while (1) { 212 if (*buf_ptr == '.') { 213 if (seendot) 214 break; 215 else 216 seendot++; 217 } 218 CHECK_SIZE_TOKEN(3); 219 *e_token++ = *buf_ptr++; 220 if (!isdigit((unsigned char)*buf_ptr) && *buf_ptr != '.') { 221 if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp) 222 break; 223 else { 224 seenexp++; 225 seendot++; 226 *e_token++ = *buf_ptr++; 227 if (*buf_ptr == '+' || *buf_ptr == '-') 228 *e_token++ = *buf_ptr++; 229 } 230 } 231 } 232 233 while (1) { 234 CHECK_SIZE_TOKEN(2); 235 if (!(seensfx & 1) && (*buf_ptr == 'U' || *buf_ptr == 'u')) { 236 *e_token++ = *buf_ptr++; 237 seensfx |= 1; 238 continue; 239 } 240 if (!(seensfx & 2) && (strchr("fFlL", *buf_ptr) != NULL)) { 241 if (buf_ptr[1] == buf_ptr[0]) 242 *e_token++ = *buf_ptr++; 243 *e_token++ = *buf_ptr++; 244 seensfx |= 2; 245 continue; 246 } 247 break; 248 } 249 } 250 else 251 while (chartype[*buf_ptr & 127] == alphanum || *buf_ptr == BACKSLASH) { 252 /* fill_buffer() terminates buffer with newline */ 253 if (*buf_ptr == BACKSLASH) { 254 if (*(buf_ptr + 1) == '\n') { 255 buf_ptr += 2; 256 if (buf_ptr >= buf_end) 257 fill_buffer(); 258 } else 259 break; 260 } 261 CHECK_SIZE_TOKEN(1); 262 /* copy it over */ 263 *e_token++ = *buf_ptr++; 264 if (buf_ptr >= buf_end) 265 fill_buffer(); 266 } 267 *e_token = '\0'; 268 269 if (s_token[0] == 'L' && s_token[1] == '\0' && 270 (*buf_ptr == '"' || *buf_ptr == '\'')) 271 return (strpfx); 272 273 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 274 if (++buf_ptr >= buf_end) 275 fill_buffer(); 276 } 277 state->keyword = 0; 278 if (state->last_token == structure && !state->p_l_follow) { 279 /* if last token was 'struct' and we're not 280 * in parentheses, then this token 281 * should be treated as a declaration */ 282 state->last_u_d = true; 283 return (decl); 284 } 285 /* 286 * Operator after identifier is binary unless last token was 'struct' 287 */ 288 state->last_u_d = (state->last_token == structure); 289 290 p = bsearch(s_token, 291 specials, 292 sizeof(specials) / sizeof(specials[0]), 293 sizeof(specials[0]), 294 strcmp_type); 295 if (p == NULL) { /* not a special keyword... */ 296 char *u; 297 298 /* ... so maybe a type_t or a typedef */ 299 if ((opt.auto_typedefs && ((u = strrchr(s_token, '_')) != NULL) && 300 strcmp(u, "_t") == 0) || (typename_top >= 0 && 301 bsearch(s_token, typenames, typename_top + 1, 302 sizeof(typenames[0]), strcmp_type))) { 303 state->keyword = 4; /* a type name */ 304 state->last_u_d = true; 305 goto found_typename; 306 } 307 } else { /* we have a keyword */ 308 state->keyword = p->rwcode; 309 state->last_u_d = true; 310 switch (p->rwcode) { 311 case 7: /* it is a switch */ 312 return (swstmt); 313 case 8: /* a case or default */ 314 return (casestmt); 315 316 case 3: /* a "struct" */ 317 /* FALLTHROUGH */ 318 case 4: /* one of the declaration keywords */ 319 found_typename: 320 if (state->p_l_follow) { 321 /* inside parens: cast, param list, offsetof or sizeof */ 322 state->cast_mask |= (1 << state->p_l_follow) & ~state->not_cast_mask; 323 } 324 if (state->last_token == period || state->last_token == unary_op) { 325 state->keyword = 0; 326 break; 327 } 328 if (p != NULL && p->rwcode == 3) 329 return (structure); 330 if (state->p_l_follow) 331 break; 332 return (decl); 333 334 case 5: /* if, while, for */ 335 return (sp_paren); 336 337 case 6: /* do, else */ 338 return (sp_nparen); 339 340 case 10: /* storage class specifier */ 341 return (storage); 342 343 case 11: /* typedef */ 344 return (type_def); 345 346 default: /* all others are treated like any other 347 * identifier */ 348 return (ident); 349 } /* end of switch */ 350 } /* end of if (found_it) */ 351 if (*buf_ptr == '(' && state->tos <= 1 && state->ind_level == 0 && 352 state->in_parameter_declaration == 0 && state->block_init == 0) { 353 char *tp = buf_ptr; 354 while (tp < buf_end) 355 if (*tp++ == ')' && (*tp == ';' || *tp == ',')) 356 goto not_proc; 357 strncpy(state->procname, token, sizeof state->procname - 1); 358 if (state->in_decl) 359 state->in_parameter_declaration = 1; 360 return (funcname); 361 not_proc:; 362 } 363 /* 364 * The following hack attempts to guess whether or not the current 365 * token is in fact a declaration keyword -- one that has been 366 * typedefd 367 */ 368 else if (!state->p_l_follow && !state->block_init && 369 !state->in_stmt && 370 ((*buf_ptr == '*' && buf_ptr[1] != '=') || 371 isalpha((unsigned char)*buf_ptr)) && 372 (state->last_token == semicolon || state->last_token == lbrace || 373 state->last_token == rbrace)) { 374 state->keyword = 4; /* a type name */ 375 state->last_u_d = true; 376 return decl; 377 } 378 if (state->last_token == decl) /* if this is a declared variable, 379 * then following sign is unary */ 380 state->last_u_d = true; /* will make "int a -1" work */ 381 return (ident); /* the ident is not in the list */ 382 } /* end of procesing for alpanum character */ 383 384 /* Scan a non-alphanumeric token */ 385 386 CHECK_SIZE_TOKEN(3); /* things like "<<=" */ 387 *e_token++ = *buf_ptr; /* if it is only a one-character token, it is 388 * moved here */ 389 *e_token = '\0'; 390 if (++buf_ptr >= buf_end) 391 fill_buffer(); 392 393 switch (*token) { 394 case '\n': 395 unary_delim = state->last_u_d; 396 state->last_nl = true; /* remember that we just had a newline */ 397 code = (had_eof ? 0 : newline); 398 399 /* 400 * if data has been exhausted, the newline is a dummy, and we should 401 * return code to stop 402 */ 403 break; 404 405 case '\'': /* start of quoted character */ 406 case '"': /* start of string */ 407 qchar = *token; 408 do { /* copy the string */ 409 while (1) { /* move one character or [/<char>]<char> */ 410 if (*buf_ptr == '\n') { 411 diag2(1, "Unterminated literal"); 412 goto stop_lit; 413 } 414 CHECK_SIZE_TOKEN(2); 415 *e_token = *buf_ptr++; 416 if (buf_ptr >= buf_end) 417 fill_buffer(); 418 if (*e_token == BACKSLASH) { /* if escape, copy extra char */ 419 if (*buf_ptr == '\n') /* check for escaped newline */ 420 ++line_no; 421 *++e_token = *buf_ptr++; 422 ++e_token; /* we must increment this again because we 423 * copied two chars */ 424 if (buf_ptr >= buf_end) 425 fill_buffer(); 426 } 427 else 428 break; /* we copied one character */ 429 } /* end of while (1) */ 430 } while (*e_token++ != qchar); 431 stop_lit: 432 code = ident; 433 break; 434 435 case ('('): 436 case ('['): 437 unary_delim = true; 438 code = lparen; 439 break; 440 441 case (')'): 442 case (']'): 443 code = rparen; 444 break; 445 446 case '#': 447 unary_delim = state->last_u_d; 448 code = preesc; 449 break; 450 451 case '?': 452 unary_delim = true; 453 code = question; 454 break; 455 456 case (':'): 457 code = colon; 458 unary_delim = true; 459 break; 460 461 case (';'): 462 unary_delim = true; 463 code = semicolon; 464 break; 465 466 case ('{'): 467 unary_delim = true; 468 469 /* 470 * if (state->in_or_st) state->block_init = 1; 471 */ 472 /* ? code = state->block_init ? lparen : lbrace; */ 473 code = lbrace; 474 break; 475 476 case ('}'): 477 unary_delim = true; 478 /* ? code = state->block_init ? rparen : rbrace; */ 479 code = rbrace; 480 break; 481 482 case 014: /* a form feed */ 483 unary_delim = state->last_u_d; 484 state->last_nl = true; /* remember this so we can set 'state->col_1' 485 * right */ 486 code = form_feed; 487 break; 488 489 case (','): 490 unary_delim = true; 491 code = comma; 492 break; 493 494 case '.': 495 unary_delim = false; 496 code = period; 497 break; 498 499 case '-': 500 case '+': /* check for -, +, --, ++ */ 501 code = (state->last_u_d ? unary_op : binary_op); 502 unary_delim = true; 503 504 if (*buf_ptr == token[0]) { 505 /* check for doubled character */ 506 *e_token++ = *buf_ptr++; 507 /* buffer overflow will be checked at end of loop */ 508 if (state->last_token == ident || state->last_token == rparen) { 509 code = (state->last_u_d ? unary_op : postop); 510 /* check for following ++ or -- */ 511 unary_delim = false; 512 } 513 } 514 else if (*buf_ptr == '=') 515 /* check for operator += */ 516 *e_token++ = *buf_ptr++; 517 else if (*buf_ptr == '>') { 518 /* check for operator -> */ 519 *e_token++ = *buf_ptr++; 520 unary_delim = false; 521 code = unary_op; 522 state->want_blank = false; 523 } 524 break; /* buffer overflow will be checked at end of 525 * switch */ 526 527 case '=': 528 if (state->in_or_st) 529 state->block_init = 1; 530 #ifdef undef 531 if (chartype[*buf_ptr & 127] == opchar) { /* we have two char assignment */ 532 e_token[-1] = *buf_ptr++; 533 if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr) 534 *e_token++ = *buf_ptr++; 535 *e_token++ = '='; /* Flip =+ to += */ 536 *e_token = 0; 537 } 538 #else 539 if (*buf_ptr == '=') {/* == */ 540 *e_token++ = '='; /* Flip =+ to += */ 541 buf_ptr++; 542 *e_token = 0; 543 } 544 #endif 545 code = binary_op; 546 unary_delim = true; 547 break; 548 /* can drop thru!!! */ 549 550 case '>': 551 case '<': 552 case '!': /* ops like <, <<, <=, !=, etc */ 553 if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') { 554 *e_token++ = *buf_ptr; 555 if (++buf_ptr >= buf_end) 556 fill_buffer(); 557 } 558 if (*buf_ptr == '=') 559 *e_token++ = *buf_ptr++; 560 code = (state->last_u_d ? unary_op : binary_op); 561 unary_delim = true; 562 break; 563 564 case '*': 565 unary_delim = true; 566 if (!state->last_u_d) { 567 if (*buf_ptr == '=') 568 *e_token++ = *buf_ptr++; 569 code = binary_op; 570 break; 571 } 572 while (*buf_ptr == '*' || isspace((unsigned char)*buf_ptr)) { 573 if (*buf_ptr == '*') { 574 CHECK_SIZE_TOKEN(1); 575 *e_token++ = *buf_ptr; 576 } 577 if (++buf_ptr >= buf_end) 578 fill_buffer(); 579 } 580 if (ps.in_decl) { 581 char *tp = buf_ptr; 582 583 while (isalpha((unsigned char)*tp) || 584 isspace((unsigned char)*tp)) { 585 if (++tp >= buf_end) 586 fill_buffer(); 587 } 588 if (*tp == '(') 589 ps.procname[0] = ' '; 590 } 591 code = unary_op; 592 break; 593 594 default: 595 if (token[0] == '/' && *buf_ptr == '*') { 596 /* it is start of comment */ 597 *e_token++ = '*'; 598 599 if (++buf_ptr >= buf_end) 600 fill_buffer(); 601 602 code = comment; 603 unary_delim = state->last_u_d; 604 break; 605 } 606 while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') { 607 /* 608 * handle ||, &&, etc, and also things as in int *****i 609 */ 610 CHECK_SIZE_TOKEN(1); 611 *e_token++ = *buf_ptr; 612 if (++buf_ptr >= buf_end) 613 fill_buffer(); 614 } 615 code = (state->last_u_d ? unary_op : binary_op); 616 unary_delim = true; 617 618 619 } /* end of switch */ 620 if (buf_ptr >= buf_end) /* check for input buffer empty */ 621 fill_buffer(); 622 state->last_u_d = unary_delim; 623 CHECK_SIZE_TOKEN(1); 624 *e_token = '\0'; /* null terminate the token */ 625 return (code); 626 } 627 628 void 629 alloc_typenames(void) 630 { 631 632 typenames = (const char **)malloc(sizeof(typenames[0]) * 633 (typename_count = 16)); 634 if (typenames == NULL) 635 err(1, NULL); 636 } 637 638 void 639 add_typename(const char *key) 640 { 641 int comparison; 642 const char *copy; 643 644 if (typename_top + 1 >= typename_count) { 645 typenames = realloc((void *)typenames, 646 sizeof(typenames[0]) * (typename_count *= 2)); 647 if (typenames == NULL) 648 err(1, NULL); 649 } 650 if (typename_top == -1) 651 typenames[++typename_top] = copy = strdup(key); 652 else if ((comparison = strcmp(key, typenames[typename_top])) >= 0) { 653 /* take advantage of sorted input */ 654 if (comparison == 0) /* remove duplicates */ 655 return; 656 typenames[++typename_top] = copy = strdup(key); 657 } 658 else { 659 int p; 660 661 for (p = 0; (comparison = strcmp(key, typenames[p])) > 0; p++) 662 /* find place for the new key */; 663 if (comparison == 0) /* remove duplicates */ 664 return; 665 memmove(&typenames[p + 1], &typenames[p], 666 sizeof(typenames[0]) * (++typename_top - p)); 667 typenames[p] = copy = strdup(key); 668 } 669 670 if (copy == NULL) 671 err(1, NULL); 672 } 673