1 /*- 2 * SPDX-License-Identifier: BSD-4-Clause 3 * 4 * Copyright (c) 1985 Sun Microsystems, Inc. 5 * Copyright (c) 1980, 1993 6 * The Regents of the University of California. All rights reserved. 7 * All rights reserved. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. All advertising materials mentioning features or use of this software 18 * must display the following acknowledgement: 19 * This product includes software developed by the University of 20 * California, Berkeley and its contributors. 21 * 4. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 36 */ 37 38 #if 0 39 #ifndef lint 40 static char sccsid[] = "@(#)lexi.c 8.1 (Berkeley) 6/6/93"; 41 #endif /* not lint */ 42 #endif 43 #include <sys/cdefs.h> 44 __FBSDID("$FreeBSD$"); 45 46 /* 47 * Here we have the token scanner for indent. It scans off one token and puts 48 * it in the global variable "token". It returns a code, indicating the type 49 * of token scanned. 50 */ 51 52 #include <err.h> 53 #include <stdio.h> 54 #include <ctype.h> 55 #include <stdlib.h> 56 #include <string.h> 57 #include "indent_globs.h" 58 #include "indent_codes.h" 59 #include "indent.h" 60 61 #define alphanum 1 62 #ifdef undef 63 #define opchar 3 64 #endif 65 66 struct templ { 67 const char *rwd; 68 int rwcode; 69 }; 70 71 /* 72 * This table has to be sorted alphabetically, because it'll be used in binary 73 * search. For the same reason, string must be the first thing in struct templ. 74 */ 75 struct templ specials[] = 76 { 77 {"auto", 10}, 78 {"break", 9}, 79 {"case", 8}, 80 {"char", 4}, 81 {"const", 4}, 82 {"default", 8}, 83 {"do", 6}, 84 {"double", 4}, 85 {"else", 6}, 86 {"enum", 3}, 87 {"extern", 10}, 88 {"float", 4}, 89 {"for", 5}, 90 {"global", 4}, 91 {"goto", 9}, 92 {"if", 5}, 93 {"int", 4}, 94 {"long", 4}, 95 {"offsetof", 1}, 96 {"register", 10}, 97 {"return", 9}, 98 {"short", 4}, 99 {"sizeof", 2}, 100 {"static", 10}, 101 {"struct", 3}, 102 {"switch", 7}, 103 {"typedef", 11}, 104 {"union", 3}, 105 {"unsigned", 4}, 106 {"void", 4}, 107 {"volatile", 4}, 108 {"while", 5} 109 }; 110 111 const char **typenames; 112 int typename_count; 113 int typename_top = -1; 114 115 char chartype[128] = 116 { /* this is used to facilitate the decision of 117 * what type (alphanumeric, operator) each 118 * character is */ 119 0, 0, 0, 0, 0, 0, 0, 0, 120 0, 0, 0, 0, 0, 0, 0, 0, 121 0, 0, 0, 0, 0, 0, 0, 0, 122 0, 0, 0, 0, 0, 0, 0, 0, 123 0, 3, 0, 0, 1, 3, 3, 0, 124 0, 0, 3, 3, 0, 3, 0, 3, 125 1, 1, 1, 1, 1, 1, 1, 1, 126 1, 1, 0, 0, 3, 3, 3, 3, 127 0, 1, 1, 1, 1, 1, 1, 1, 128 1, 1, 1, 1, 1, 1, 1, 1, 129 1, 1, 1, 1, 1, 1, 1, 1, 130 1, 1, 1, 0, 0, 0, 3, 1, 131 0, 1, 1, 1, 1, 1, 1, 1, 132 1, 1, 1, 1, 1, 1, 1, 1, 133 1, 1, 1, 1, 1, 1, 1, 1, 134 1, 1, 1, 0, 3, 0, 3, 0 135 }; 136 137 static int 138 strcmp_type(const void *e1, const void *e2) 139 { 140 return (strcmp(e1, *(const char * const *)e2)); 141 } 142 143 int 144 lexi(void) 145 { 146 int unary_delim; /* this is set to 1 if the current token 147 * forces a following operator to be unary */ 148 static int last_code; /* the last token type returned */ 149 static int l_struct; /* set to 1 if the last token was 'struct' */ 150 int code; /* internal code to be returned */ 151 char qchar; /* the delimiter character for a string */ 152 153 e_token = s_token; /* point to start of place to save token */ 154 unary_delim = false; 155 ps.col_1 = ps.last_nl; /* tell world that this token started in 156 * column 1 iff the last thing scanned was nl */ 157 ps.last_nl = false; 158 159 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 160 ps.col_1 = false; /* leading blanks imply token is not in column 161 * 1 */ 162 if (++buf_ptr >= buf_end) 163 fill_buffer(); 164 } 165 166 /* Scan an alphanumeric token */ 167 if (chartype[(int)*buf_ptr] == alphanum || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) { 168 /* 169 * we have a character or number 170 */ 171 struct templ *p; 172 173 if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) { 174 enum base { 175 BASE_2, BASE_8, BASE_10, BASE_16 176 }; 177 int seendot = 0, 178 seenexp = 0, 179 seensfx = 0; 180 enum base in_base = BASE_10; 181 182 if (*buf_ptr == '0') { 183 if (buf_ptr[1] == 'b' || buf_ptr[1] == 'B') 184 in_base = BASE_2; 185 else if (buf_ptr[1] == 'x' || buf_ptr[1] == 'X') 186 in_base = BASE_16; 187 else if (isdigit(buf_ptr[1])) 188 in_base = BASE_8; 189 } 190 switch (in_base) { 191 case BASE_2: 192 *e_token++ = *buf_ptr++; 193 *e_token++ = *buf_ptr++; 194 while (*buf_ptr == '0' || *buf_ptr == '1') { 195 CHECK_SIZE_TOKEN; 196 *e_token++ = *buf_ptr++; 197 } 198 break; 199 case BASE_8: 200 *e_token++ = *buf_ptr++; 201 while (*buf_ptr >= '0' && *buf_ptr <= '8') { 202 CHECK_SIZE_TOKEN; 203 *e_token++ = *buf_ptr++; 204 } 205 break; 206 case BASE_16: 207 *e_token++ = *buf_ptr++; 208 *e_token++ = *buf_ptr++; 209 while (isxdigit(*buf_ptr)) { 210 CHECK_SIZE_TOKEN; 211 *e_token++ = *buf_ptr++; 212 } 213 break; 214 case BASE_10: 215 while (1) { 216 if (*buf_ptr == '.') { 217 if (seendot) 218 break; 219 else 220 seendot++; 221 } 222 CHECK_SIZE_TOKEN; 223 *e_token++ = *buf_ptr++; 224 if (!isdigit(*buf_ptr) && *buf_ptr != '.') { 225 if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp) 226 break; 227 else { 228 seenexp++; 229 seendot++; 230 CHECK_SIZE_TOKEN; 231 *e_token++ = *buf_ptr++; 232 if (*buf_ptr == '+' || *buf_ptr == '-') 233 *e_token++ = *buf_ptr++; 234 } 235 } 236 } 237 break; 238 } 239 while (1) { 240 if (!(seensfx & 1) && (*buf_ptr == 'U' || *buf_ptr == 'u')) { 241 CHECK_SIZE_TOKEN; 242 *e_token++ = *buf_ptr++; 243 seensfx |= 1; 244 continue; 245 } 246 if (!(seensfx & 2) && (strchr("fFlL", *buf_ptr) != NULL)) { 247 CHECK_SIZE_TOKEN; 248 if (buf_ptr[1] == buf_ptr[0]) 249 *e_token++ = *buf_ptr++; 250 *e_token++ = *buf_ptr++; 251 seensfx |= 2; 252 continue; 253 } 254 break; 255 } 256 } 257 else 258 while (chartype[(int)*buf_ptr] == alphanum || *buf_ptr == BACKSLASH) { 259 /* fill_buffer() terminates buffer with newline */ 260 if (*buf_ptr == BACKSLASH) { 261 if (*(buf_ptr + 1) == '\n') { 262 buf_ptr += 2; 263 if (buf_ptr >= buf_end) 264 fill_buffer(); 265 } else 266 break; 267 } 268 CHECK_SIZE_TOKEN; 269 /* copy it over */ 270 *e_token++ = *buf_ptr++; 271 if (buf_ptr >= buf_end) 272 fill_buffer(); 273 } 274 *e_token++ = '\0'; 275 276 if (s_token[0] == 'L' && s_token[1] == '\0' && 277 (*buf_ptr == '"' || *buf_ptr == '\'')) 278 return (strpfx); 279 280 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 281 if (++buf_ptr >= buf_end) 282 fill_buffer(); 283 } 284 ps.keyword = 0; 285 if (l_struct && !ps.p_l_follow) { 286 /* if last token was 'struct' and we're not 287 * in parentheses, then this token 288 * should be treated as a declaration */ 289 l_struct = false; 290 last_code = ident; 291 ps.last_u_d = true; 292 return (decl); 293 } 294 ps.last_u_d = l_struct; /* Operator after identifier is binary 295 * unless last token was 'struct' */ 296 l_struct = false; 297 last_code = ident; /* Remember that this is the code we will 298 * return */ 299 300 p = bsearch(s_token, 301 specials, 302 sizeof(specials) / sizeof(specials[0]), 303 sizeof(specials[0]), 304 strcmp_type); 305 if (p == NULL) { /* not a special keyword... */ 306 char *u; 307 308 /* ... so maybe a type_t or a typedef */ 309 if ((auto_typedefs && ((u = strrchr(s_token, '_')) != NULL) && 310 strcmp(u, "_t") == 0) || (typename_top >= 0 && 311 bsearch(s_token, typenames, typename_top + 1, 312 sizeof(typenames[0]), strcmp_type))) { 313 ps.keyword = 4; /* a type name */ 314 ps.last_u_d = true; 315 goto found_typename; 316 } 317 } else { /* we have a keyword */ 318 ps.keyword = p->rwcode; 319 ps.last_u_d = true; 320 switch (p->rwcode) { 321 case 7: /* it is a switch */ 322 return (swstmt); 323 case 8: /* a case or default */ 324 return (casestmt); 325 326 case 3: /* a "struct" */ 327 /* 328 * Next time around, we will want to know that we have had a 329 * 'struct' 330 */ 331 l_struct = true; 332 /* FALLTHROUGH */ 333 334 case 4: /* one of the declaration keywords */ 335 found_typename: 336 if (ps.p_l_follow) { 337 /* inside parens: cast, param list, offsetof or sizeof */ 338 ps.cast_mask |= (1 << ps.p_l_follow) & ~ps.not_cast_mask; 339 break; 340 } 341 last_code = decl; 342 return (decl); 343 344 case 5: /* if, while, for */ 345 return (sp_paren); 346 347 case 6: /* do, else */ 348 return (sp_nparen); 349 350 case 10: /* storage class specifier */ 351 return (storage); 352 353 case 11: /* typedef */ 354 return (type_def); 355 356 default: /* all others are treated like any other 357 * identifier */ 358 return (ident); 359 } /* end of switch */ 360 } /* end of if (found_it) */ 361 if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0 && 362 ps.in_parameter_declaration == 0 && ps.block_init == 0) { 363 char *tp = buf_ptr; 364 while (tp < buf_end) 365 if (*tp++ == ')' && (*tp == ';' || *tp == ',')) 366 goto not_proc; 367 strncpy(ps.procname, token, sizeof ps.procname - 1); 368 if (ps.in_decl) 369 ps.in_parameter_declaration = 1; 370 return (last_code = funcname); 371 not_proc:; 372 } 373 /* 374 * The following hack attempts to guess whether or not the current 375 * token is in fact a declaration keyword -- one that has been 376 * typedefd 377 */ 378 if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_') 379 && !ps.p_l_follow 380 && !ps.block_init 381 && (ps.last_token == rparen || ps.last_token == semicolon || 382 ps.last_token == decl || 383 ps.last_token == lbrace || ps.last_token == rbrace)) { 384 ps.keyword = 4; /* a type name */ 385 ps.last_u_d = true; 386 last_code = decl; 387 return decl; 388 } 389 if (last_code == decl) /* if this is a declared variable, then 390 * following sign is unary */ 391 ps.last_u_d = true; /* will make "int a -1" work */ 392 last_code = ident; 393 return (ident); /* the ident is not in the list */ 394 } /* end of procesing for alpanum character */ 395 396 /* Scan a non-alphanumeric token */ 397 398 *e_token++ = *buf_ptr; /* if it is only a one-character token, it is 399 * moved here */ 400 *e_token = '\0'; 401 if (++buf_ptr >= buf_end) 402 fill_buffer(); 403 404 switch (*token) { 405 case '\n': 406 unary_delim = ps.last_u_d; 407 ps.last_nl = true; /* remember that we just had a newline */ 408 code = (had_eof ? 0 : newline); 409 410 /* 411 * if data has been exhausted, the newline is a dummy, and we should 412 * return code to stop 413 */ 414 break; 415 416 case '\'': /* start of quoted character */ 417 case '"': /* start of string */ 418 qchar = *token; 419 if (troff) { 420 e_token[-1] = '`'; 421 if (qchar == '"') 422 *e_token++ = '`'; 423 e_token = chfont(&bodyf, &stringf, e_token); 424 } 425 do { /* copy the string */ 426 while (1) { /* move one character or [/<char>]<char> */ 427 if (*buf_ptr == '\n') { 428 diag2(1, "Unterminated literal"); 429 goto stop_lit; 430 } 431 CHECK_SIZE_TOKEN; /* Only have to do this once in this loop, 432 * since CHECK_SIZE guarantees that there 433 * are at least 5 entries left */ 434 *e_token = *buf_ptr++; 435 if (buf_ptr >= buf_end) 436 fill_buffer(); 437 if (*e_token == BACKSLASH) { /* if escape, copy extra char */ 438 if (*buf_ptr == '\n') /* check for escaped newline */ 439 ++line_no; 440 if (troff) { 441 *++e_token = BACKSLASH; 442 if (*buf_ptr == BACKSLASH) 443 *++e_token = BACKSLASH; 444 } 445 *++e_token = *buf_ptr++; 446 ++e_token; /* we must increment this again because we 447 * copied two chars */ 448 if (buf_ptr >= buf_end) 449 fill_buffer(); 450 } 451 else 452 break; /* we copied one character */ 453 } /* end of while (1) */ 454 } while (*e_token++ != qchar); 455 if (troff) { 456 e_token = chfont(&stringf, &bodyf, e_token - 1); 457 if (qchar == '"') 458 *e_token++ = '\''; 459 } 460 stop_lit: 461 code = ident; 462 break; 463 464 case ('('): 465 case ('['): 466 unary_delim = true; 467 code = lparen; 468 break; 469 470 case (')'): 471 case (']'): 472 code = rparen; 473 break; 474 475 case '#': 476 unary_delim = ps.last_u_d; 477 code = preesc; 478 break; 479 480 case '?': 481 unary_delim = true; 482 code = question; 483 break; 484 485 case (':'): 486 code = colon; 487 unary_delim = true; 488 break; 489 490 case (';'): 491 unary_delim = true; 492 code = semicolon; 493 break; 494 495 case ('{'): 496 unary_delim = true; 497 498 /* 499 * if (ps.in_or_st) ps.block_init = 1; 500 */ 501 /* ? code = ps.block_init ? lparen : lbrace; */ 502 code = lbrace; 503 break; 504 505 case ('}'): 506 unary_delim = true; 507 /* ? code = ps.block_init ? rparen : rbrace; */ 508 code = rbrace; 509 break; 510 511 case 014: /* a form feed */ 512 unary_delim = ps.last_u_d; 513 ps.last_nl = true; /* remember this so we can set 'ps.col_1' 514 * right */ 515 code = form_feed; 516 break; 517 518 case (','): 519 unary_delim = true; 520 code = comma; 521 break; 522 523 case '.': 524 unary_delim = false; 525 code = period; 526 break; 527 528 case '-': 529 case '+': /* check for -, +, --, ++ */ 530 code = (ps.last_u_d ? unary_op : binary_op); 531 unary_delim = true; 532 533 if (*buf_ptr == token[0]) { 534 /* check for doubled character */ 535 *e_token++ = *buf_ptr++; 536 /* buffer overflow will be checked at end of loop */ 537 if (last_code == ident || last_code == rparen) { 538 code = (ps.last_u_d ? unary_op : postop); 539 /* check for following ++ or -- */ 540 unary_delim = false; 541 } 542 } 543 else if (*buf_ptr == '=') 544 /* check for operator += */ 545 *e_token++ = *buf_ptr++; 546 else if (*buf_ptr == '>') { 547 /* check for operator -> */ 548 *e_token++ = *buf_ptr++; 549 unary_delim = false; 550 code = unary_op; 551 ps.want_blank = false; 552 } 553 break; /* buffer overflow will be checked at end of 554 * switch */ 555 556 case '=': 557 if (ps.in_or_st) 558 ps.block_init = 1; 559 #ifdef undef 560 if (chartype[*buf_ptr] == opchar) { /* we have two char assignment */ 561 e_token[-1] = *buf_ptr++; 562 if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr) 563 *e_token++ = *buf_ptr++; 564 *e_token++ = '='; /* Flip =+ to += */ 565 *e_token = 0; 566 } 567 #else 568 if (*buf_ptr == '=') {/* == */ 569 *e_token++ = '='; /* Flip =+ to += */ 570 buf_ptr++; 571 *e_token = 0; 572 } 573 #endif 574 code = binary_op; 575 unary_delim = true; 576 break; 577 /* can drop thru!!! */ 578 579 case '>': 580 case '<': 581 case '!': /* ops like <, <<, <=, !=, etc */ 582 if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') { 583 *e_token++ = *buf_ptr; 584 if (++buf_ptr >= buf_end) 585 fill_buffer(); 586 } 587 if (*buf_ptr == '=') 588 *e_token++ = *buf_ptr++; 589 code = (ps.last_u_d ? unary_op : binary_op); 590 unary_delim = true; 591 break; 592 593 default: 594 if (token[0] == '/' && *buf_ptr == '*') { 595 /* it is start of comment */ 596 *e_token++ = '*'; 597 598 if (++buf_ptr >= buf_end) 599 fill_buffer(); 600 601 code = comment; 602 unary_delim = ps.last_u_d; 603 break; 604 } 605 while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') { 606 /* 607 * handle ||, &&, etc, and also things as in int *****i 608 */ 609 *e_token++ = *buf_ptr; 610 if (++buf_ptr >= buf_end) 611 fill_buffer(); 612 } 613 code = (ps.last_u_d ? unary_op : binary_op); 614 unary_delim = true; 615 616 617 } /* end of switch */ 618 if (code != newline) { 619 l_struct = false; 620 last_code = code; 621 } 622 if (buf_ptr >= buf_end) /* check for input buffer empty */ 623 fill_buffer(); 624 ps.last_u_d = unary_delim; 625 *e_token = '\0'; /* null terminate the token */ 626 return (code); 627 } 628 629 void 630 alloc_typenames(void) 631 { 632 633 typenames = (const char **)malloc(sizeof(typenames[0]) * 634 (typename_count = 16)); 635 if (typenames == NULL) 636 err(1, NULL); 637 } 638 639 void 640 add_typename(const char *key) 641 { 642 int comparison; 643 const char *copy; 644 645 if (typename_top + 1 >= typename_count) { 646 typenames = realloc((void *)typenames, 647 sizeof(typenames[0]) * (typename_count *= 2)); 648 if (typenames == NULL) 649 err(1, NULL); 650 } 651 if (typename_top == -1) 652 typenames[++typename_top] = copy = strdup(key); 653 else if ((comparison = strcmp(key, typenames[typename_top])) >= 0) { 654 /* take advantage of sorted input */ 655 if (comparison == 0) /* remove duplicates */ 656 return; 657 typenames[++typename_top] = copy = strdup(key); 658 } 659 else { 660 int p; 661 662 for (p = 0; (comparison = strcmp(key, typenames[p])) > 0; p++) 663 /* find place for the new key */; 664 if (comparison == 0) /* remove duplicates */ 665 return; 666 memmove(&typenames[p + 1], &typenames[p], 667 sizeof(typenames[0]) * (++typename_top - p)); 668 typenames[p] = copy = strdup(key); 669 } 670 671 if (copy == NULL) 672 err(1, NULL); 673 } 674