1 /*- 2 * SPDX-License-Identifier: BSD-4-Clause 3 * 4 * Copyright (c) 1985 Sun Microsystems, Inc. 5 * Copyright (c) 1980, 1993 6 * The Regents of the University of California. All rights reserved. 7 * All rights reserved. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. All advertising materials mentioning features or use of this software 18 * must display the following acknowledgement: 19 * This product includes software developed by the University of 20 * California, Berkeley and its contributors. 21 * 4. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 36 */ 37 38 #if 0 39 #ifndef lint 40 static char sccsid[] = "@(#)lexi.c 8.1 (Berkeley) 6/6/93"; 41 #endif /* not lint */ 42 #endif 43 #include <sys/cdefs.h> 44 __FBSDID("$FreeBSD$"); 45 46 /* 47 * Here we have the token scanner for indent. It scans off one token and puts 48 * it in the global variable "token". It returns a code, indicating the type 49 * of token scanned. 50 */ 51 52 #include <err.h> 53 #include <stdio.h> 54 #include <ctype.h> 55 #include <stdlib.h> 56 #include <string.h> 57 #include "indent_globs.h" 58 #include "indent_codes.h" 59 #include "indent.h" 60 61 #define alphanum 1 62 #ifdef undef 63 #define opchar 3 64 #endif 65 66 struct templ { 67 const char *rwd; 68 int rwcode; 69 }; 70 71 /* 72 * This table has to be sorted alphabetically, because it'll be used in binary 73 * search. For the same reason, string must be the first thing in struct templ. 74 */ 75 struct templ specials[] = 76 { 77 {"auto", 10}, 78 {"break", 9}, 79 {"case", 8}, 80 {"char", 4}, 81 {"const", 4}, 82 {"default", 8}, 83 {"do", 6}, 84 {"double", 4}, 85 {"else", 6}, 86 {"enum", 3}, 87 {"extern", 10}, 88 {"float", 4}, 89 {"for", 5}, 90 {"global", 4}, 91 {"goto", 9}, 92 {"if", 5}, 93 {"int", 4}, 94 {"long", 4}, 95 {"offsetof", 1}, 96 {"register", 10}, 97 {"return", 9}, 98 {"short", 4}, 99 {"sizeof", 2}, 100 {"static", 10}, 101 {"struct", 3}, 102 {"switch", 7}, 103 {"typedef", 11}, 104 {"union", 3}, 105 {"unsigned", 4}, 106 {"void", 4}, 107 {"volatile", 4}, 108 {"while", 5} 109 }; 110 111 const char **typenames; 112 int typename_count; 113 int typename_top = -1; 114 115 char chartype[128] = 116 { /* this is used to facilitate the decision of 117 * what type (alphanumeric, operator) each 118 * character is */ 119 0, 0, 0, 0, 0, 0, 0, 0, 120 0, 0, 0, 0, 0, 0, 0, 0, 121 0, 0, 0, 0, 0, 0, 0, 0, 122 0, 0, 0, 0, 0, 0, 0, 0, 123 0, 3, 0, 0, 1, 3, 3, 0, 124 0, 0, 3, 3, 0, 3, 0, 3, 125 1, 1, 1, 1, 1, 1, 1, 1, 126 1, 1, 0, 0, 3, 3, 3, 3, 127 0, 1, 1, 1, 1, 1, 1, 1, 128 1, 1, 1, 1, 1, 1, 1, 1, 129 1, 1, 1, 1, 1, 1, 1, 1, 130 1, 1, 1, 0, 0, 0, 3, 1, 131 0, 1, 1, 1, 1, 1, 1, 1, 132 1, 1, 1, 1, 1, 1, 1, 1, 133 1, 1, 1, 1, 1, 1, 1, 1, 134 1, 1, 1, 0, 3, 0, 3, 0 135 }; 136 137 static int 138 strcmp_type(const void *e1, const void *e2) 139 { 140 return (strcmp(e1, *(const char * const *)e2)); 141 } 142 143 int 144 lexi(struct parser_state *state) 145 { 146 int unary_delim; /* this is set to 1 if the current token 147 * forces a following operator to be unary */ 148 static int last_code; /* the last token type returned */ 149 static int l_struct; /* set to 1 if the last token was 'struct' */ 150 int code; /* internal code to be returned */ 151 char qchar; /* the delimiter character for a string */ 152 153 e_token = s_token; /* point to start of place to save token */ 154 unary_delim = false; 155 state->col_1 = state->last_nl; /* tell world that this token started 156 * in column 1 iff the last thing 157 * scanned was a newline */ 158 state->last_nl = false; 159 160 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 161 state->col_1 = false; /* leading blanks imply token is not in column 162 * 1 */ 163 if (++buf_ptr >= buf_end) 164 fill_buffer(); 165 } 166 167 /* Scan an alphanumeric token */ 168 if (chartype[(int)*buf_ptr] == alphanum || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) { 169 /* 170 * we have a character or number 171 */ 172 struct templ *p; 173 174 if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) { 175 enum base { 176 BASE_2, BASE_8, BASE_10, BASE_16 177 }; 178 int seendot = 0, 179 seenexp = 0, 180 seensfx = 0; 181 enum base in_base = BASE_10; 182 183 if (*buf_ptr == '0') { 184 if (buf_ptr[1] == 'b' || buf_ptr[1] == 'B') 185 in_base = BASE_2; 186 else if (buf_ptr[1] == 'x' || buf_ptr[1] == 'X') 187 in_base = BASE_16; 188 else if (isdigit(buf_ptr[1])) 189 in_base = BASE_8; 190 } 191 switch (in_base) { 192 case BASE_2: 193 *e_token++ = *buf_ptr++; 194 *e_token++ = *buf_ptr++; 195 while (*buf_ptr == '0' || *buf_ptr == '1') { 196 CHECK_SIZE_TOKEN; 197 *e_token++ = *buf_ptr++; 198 } 199 break; 200 case BASE_8: 201 *e_token++ = *buf_ptr++; 202 while (*buf_ptr >= '0' && *buf_ptr <= '8') { 203 CHECK_SIZE_TOKEN; 204 *e_token++ = *buf_ptr++; 205 } 206 break; 207 case BASE_16: 208 *e_token++ = *buf_ptr++; 209 *e_token++ = *buf_ptr++; 210 while (isxdigit(*buf_ptr)) { 211 CHECK_SIZE_TOKEN; 212 *e_token++ = *buf_ptr++; 213 } 214 break; 215 case BASE_10: 216 while (1) { 217 if (*buf_ptr == '.') { 218 if (seendot) 219 break; 220 else 221 seendot++; 222 } 223 CHECK_SIZE_TOKEN; 224 *e_token++ = *buf_ptr++; 225 if (!isdigit(*buf_ptr) && *buf_ptr != '.') { 226 if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp) 227 break; 228 else { 229 seenexp++; 230 seendot++; 231 CHECK_SIZE_TOKEN; 232 *e_token++ = *buf_ptr++; 233 if (*buf_ptr == '+' || *buf_ptr == '-') 234 *e_token++ = *buf_ptr++; 235 } 236 } 237 } 238 break; 239 } 240 while (1) { 241 if (!(seensfx & 1) && (*buf_ptr == 'U' || *buf_ptr == 'u')) { 242 CHECK_SIZE_TOKEN; 243 *e_token++ = *buf_ptr++; 244 seensfx |= 1; 245 continue; 246 } 247 if (!(seensfx & 2) && (strchr("fFlL", *buf_ptr) != NULL)) { 248 CHECK_SIZE_TOKEN; 249 if (buf_ptr[1] == buf_ptr[0]) 250 *e_token++ = *buf_ptr++; 251 *e_token++ = *buf_ptr++; 252 seensfx |= 2; 253 continue; 254 } 255 break; 256 } 257 } 258 else 259 while (chartype[(int)*buf_ptr] == alphanum || *buf_ptr == BACKSLASH) { 260 /* fill_buffer() terminates buffer with newline */ 261 if (*buf_ptr == BACKSLASH) { 262 if (*(buf_ptr + 1) == '\n') { 263 buf_ptr += 2; 264 if (buf_ptr >= buf_end) 265 fill_buffer(); 266 } else 267 break; 268 } 269 CHECK_SIZE_TOKEN; 270 /* copy it over */ 271 *e_token++ = *buf_ptr++; 272 if (buf_ptr >= buf_end) 273 fill_buffer(); 274 } 275 *e_token++ = '\0'; 276 277 if (s_token[0] == 'L' && s_token[1] == '\0' && 278 (*buf_ptr == '"' || *buf_ptr == '\'')) 279 return (strpfx); 280 281 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 282 if (++buf_ptr >= buf_end) 283 fill_buffer(); 284 } 285 state->keyword = 0; 286 if (l_struct && !state->p_l_follow) { 287 /* if last token was 'struct' and we're not 288 * in parentheses, then this token 289 * should be treated as a declaration */ 290 l_struct = false; 291 last_code = ident; 292 state->last_u_d = true; 293 return (decl); 294 } 295 state->last_u_d = l_struct; /* Operator after identifier is 296 * binary unless last token was 297 * 'struct' */ 298 l_struct = false; 299 last_code = ident; /* Remember that this is the code we will 300 * return */ 301 302 p = bsearch(s_token, 303 specials, 304 sizeof(specials) / sizeof(specials[0]), 305 sizeof(specials[0]), 306 strcmp_type); 307 if (p == NULL) { /* not a special keyword... */ 308 char *u; 309 310 /* ... so maybe a type_t or a typedef */ 311 if ((auto_typedefs && ((u = strrchr(s_token, '_')) != NULL) && 312 strcmp(u, "_t") == 0) || (typename_top >= 0 && 313 bsearch(s_token, typenames, typename_top + 1, 314 sizeof(typenames[0]), strcmp_type))) { 315 state->keyword = 4; /* a type name */ 316 state->last_u_d = true; 317 goto found_typename; 318 } 319 } else { /* we have a keyword */ 320 state->keyword = p->rwcode; 321 state->last_u_d = true; 322 switch (p->rwcode) { 323 case 7: /* it is a switch */ 324 return (swstmt); 325 case 8: /* a case or default */ 326 return (casestmt); 327 328 case 3: /* a "struct" */ 329 /* 330 * Next time around, we will want to know that we have had a 331 * 'struct' 332 */ 333 l_struct = true; 334 /* FALLTHROUGH */ 335 336 case 4: /* one of the declaration keywords */ 337 found_typename: 338 if (state->p_l_follow) { 339 /* inside parens: cast, param list, offsetof or sizeof */ 340 state->cast_mask |= (1 << state->p_l_follow) & ~state->not_cast_mask; 341 break; 342 } 343 last_code = decl; 344 return (decl); 345 346 case 5: /* if, while, for */ 347 return (sp_paren); 348 349 case 6: /* do, else */ 350 return (sp_nparen); 351 352 case 10: /* storage class specifier */ 353 return (storage); 354 355 case 11: /* typedef */ 356 return (type_def); 357 358 default: /* all others are treated like any other 359 * identifier */ 360 return (ident); 361 } /* end of switch */ 362 } /* end of if (found_it) */ 363 if (*buf_ptr == '(' && state->tos <= 1 && state->ind_level == 0 && 364 state->in_parameter_declaration == 0 && state->block_init == 0) { 365 char *tp = buf_ptr; 366 while (tp < buf_end) 367 if (*tp++ == ')' && (*tp == ';' || *tp == ',')) 368 goto not_proc; 369 strncpy(state->procname, token, sizeof state->procname - 1); 370 if (state->in_decl) 371 state->in_parameter_declaration = 1; 372 return (last_code = funcname); 373 not_proc:; 374 } 375 /* 376 * The following hack attempts to guess whether or not the current 377 * token is in fact a declaration keyword -- one that has been 378 * typedefd 379 */ 380 if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_') 381 && !state->p_l_follow 382 && !state->block_init 383 && (state->last_token == rparen || state->last_token == semicolon || 384 state->last_token == decl || 385 state->last_token == lbrace || state->last_token == rbrace)) { 386 state->keyword = 4; /* a type name */ 387 state->last_u_d = true; 388 last_code = decl; 389 return decl; 390 } 391 if (last_code == decl) /* if this is a declared variable, then 392 * following sign is unary */ 393 state->last_u_d = true; /* will make "int a -1" work */ 394 last_code = ident; 395 return (ident); /* the ident is not in the list */ 396 } /* end of procesing for alpanum character */ 397 398 /* Scan a non-alphanumeric token */ 399 400 *e_token++ = *buf_ptr; /* if it is only a one-character token, it is 401 * moved here */ 402 *e_token = '\0'; 403 if (++buf_ptr >= buf_end) 404 fill_buffer(); 405 406 switch (*token) { 407 case '\n': 408 unary_delim = state->last_u_d; 409 state->last_nl = true; /* remember that we just had a newline */ 410 code = (had_eof ? 0 : newline); 411 412 /* 413 * if data has been exhausted, the newline is a dummy, and we should 414 * return code to stop 415 */ 416 break; 417 418 case '\'': /* start of quoted character */ 419 case '"': /* start of string */ 420 qchar = *token; 421 if (troff) { 422 e_token[-1] = '`'; 423 if (qchar == '"') 424 *e_token++ = '`'; 425 e_token = chfont(&bodyf, &stringf, e_token); 426 } 427 do { /* copy the string */ 428 while (1) { /* move one character or [/<char>]<char> */ 429 if (*buf_ptr == '\n') { 430 diag2(1, "Unterminated literal"); 431 goto stop_lit; 432 } 433 CHECK_SIZE_TOKEN; /* Only have to do this once in this loop, 434 * since CHECK_SIZE guarantees that there 435 * are at least 5 entries left */ 436 *e_token = *buf_ptr++; 437 if (buf_ptr >= buf_end) 438 fill_buffer(); 439 if (*e_token == BACKSLASH) { /* if escape, copy extra char */ 440 if (*buf_ptr == '\n') /* check for escaped newline */ 441 ++line_no; 442 if (troff) { 443 *++e_token = BACKSLASH; 444 if (*buf_ptr == BACKSLASH) 445 *++e_token = BACKSLASH; 446 } 447 *++e_token = *buf_ptr++; 448 ++e_token; /* we must increment this again because we 449 * copied two chars */ 450 if (buf_ptr >= buf_end) 451 fill_buffer(); 452 } 453 else 454 break; /* we copied one character */ 455 } /* end of while (1) */ 456 } while (*e_token++ != qchar); 457 if (troff) { 458 e_token = chfont(&stringf, &bodyf, e_token - 1); 459 if (qchar == '"') 460 *e_token++ = '\''; 461 } 462 stop_lit: 463 code = ident; 464 break; 465 466 case ('('): 467 case ('['): 468 unary_delim = true; 469 code = lparen; 470 break; 471 472 case (')'): 473 case (']'): 474 code = rparen; 475 break; 476 477 case '#': 478 unary_delim = state->last_u_d; 479 code = preesc; 480 break; 481 482 case '?': 483 unary_delim = true; 484 code = question; 485 break; 486 487 case (':'): 488 code = colon; 489 unary_delim = true; 490 break; 491 492 case (';'): 493 unary_delim = true; 494 code = semicolon; 495 break; 496 497 case ('{'): 498 unary_delim = true; 499 500 /* 501 * if (state->in_or_st) state->block_init = 1; 502 */ 503 /* ? code = state->block_init ? lparen : lbrace; */ 504 code = lbrace; 505 break; 506 507 case ('}'): 508 unary_delim = true; 509 /* ? code = state->block_init ? rparen : rbrace; */ 510 code = rbrace; 511 break; 512 513 case 014: /* a form feed */ 514 unary_delim = state->last_u_d; 515 state->last_nl = true; /* remember this so we can set 'state->col_1' 516 * right */ 517 code = form_feed; 518 break; 519 520 case (','): 521 unary_delim = true; 522 code = comma; 523 break; 524 525 case '.': 526 unary_delim = false; 527 code = period; 528 break; 529 530 case '-': 531 case '+': /* check for -, +, --, ++ */ 532 code = (state->last_u_d ? unary_op : binary_op); 533 unary_delim = true; 534 535 if (*buf_ptr == token[0]) { 536 /* check for doubled character */ 537 *e_token++ = *buf_ptr++; 538 /* buffer overflow will be checked at end of loop */ 539 if (last_code == ident || last_code == rparen) { 540 code = (state->last_u_d ? unary_op : postop); 541 /* check for following ++ or -- */ 542 unary_delim = false; 543 } 544 } 545 else if (*buf_ptr == '=') 546 /* check for operator += */ 547 *e_token++ = *buf_ptr++; 548 else if (*buf_ptr == '>') { 549 /* check for operator -> */ 550 *e_token++ = *buf_ptr++; 551 unary_delim = false; 552 code = unary_op; 553 state->want_blank = false; 554 } 555 break; /* buffer overflow will be checked at end of 556 * switch */ 557 558 case '=': 559 if (state->in_or_st) 560 state->block_init = 1; 561 #ifdef undef 562 if (chartype[*buf_ptr] == opchar) { /* we have two char assignment */ 563 e_token[-1] = *buf_ptr++; 564 if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr) 565 *e_token++ = *buf_ptr++; 566 *e_token++ = '='; /* Flip =+ to += */ 567 *e_token = 0; 568 } 569 #else 570 if (*buf_ptr == '=') {/* == */ 571 *e_token++ = '='; /* Flip =+ to += */ 572 buf_ptr++; 573 *e_token = 0; 574 } 575 #endif 576 code = binary_op; 577 unary_delim = true; 578 break; 579 /* can drop thru!!! */ 580 581 case '>': 582 case '<': 583 case '!': /* ops like <, <<, <=, !=, etc */ 584 if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') { 585 *e_token++ = *buf_ptr; 586 if (++buf_ptr >= buf_end) 587 fill_buffer(); 588 } 589 if (*buf_ptr == '=') 590 *e_token++ = *buf_ptr++; 591 code = (state->last_u_d ? unary_op : binary_op); 592 unary_delim = true; 593 break; 594 595 default: 596 if (token[0] == '/' && *buf_ptr == '*') { 597 /* it is start of comment */ 598 *e_token++ = '*'; 599 600 if (++buf_ptr >= buf_end) 601 fill_buffer(); 602 603 code = comment; 604 unary_delim = state->last_u_d; 605 break; 606 } 607 while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') { 608 /* 609 * handle ||, &&, etc, and also things as in int *****i 610 */ 611 *e_token++ = *buf_ptr; 612 if (++buf_ptr >= buf_end) 613 fill_buffer(); 614 } 615 code = (state->last_u_d ? unary_op : binary_op); 616 unary_delim = true; 617 618 619 } /* end of switch */ 620 if (code != newline) { 621 l_struct = false; 622 last_code = code; 623 } 624 if (buf_ptr >= buf_end) /* check for input buffer empty */ 625 fill_buffer(); 626 state->last_u_d = unary_delim; 627 *e_token = '\0'; /* null terminate the token */ 628 return (code); 629 } 630 631 void 632 alloc_typenames(void) 633 { 634 635 typenames = (const char **)malloc(sizeof(typenames[0]) * 636 (typename_count = 16)); 637 if (typenames == NULL) 638 err(1, NULL); 639 } 640 641 void 642 add_typename(const char *key) 643 { 644 int comparison; 645 const char *copy; 646 647 if (typename_top + 1 >= typename_count) { 648 typenames = realloc((void *)typenames, 649 sizeof(typenames[0]) * (typename_count *= 2)); 650 if (typenames == NULL) 651 err(1, NULL); 652 } 653 if (typename_top == -1) 654 typenames[++typename_top] = copy = strdup(key); 655 else if ((comparison = strcmp(key, typenames[typename_top])) >= 0) { 656 /* take advantage of sorted input */ 657 if (comparison == 0) /* remove duplicates */ 658 return; 659 typenames[++typename_top] = copy = strdup(key); 660 } 661 else { 662 int p; 663 664 for (p = 0; (comparison = strcmp(key, typenames[p])) > 0; p++) 665 /* find place for the new key */; 666 if (comparison == 0) /* remove duplicates */ 667 return; 668 memmove(&typenames[p + 1], &typenames[p], 669 sizeof(typenames[0]) * (++typename_top - p)); 670 typenames[p] = copy = strdup(key); 671 } 672 673 if (copy == NULL) 674 err(1, NULL); 675 } 676