1 /*- 2 * Copyright (c) 1985 Sun Microsystems, Inc. 3 * Copyright (c) 1980, 1993 4 * The Regents of the University of California. All rights reserved. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. All advertising materials mentioning features or use of this software 16 * must display the following acknowledgement: 17 * This product includes software developed by the University of 18 * California, Berkeley and its contributors. 19 * 4. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 */ 35 36 #if 0 37 #ifndef lint 38 static char sccsid[] = "@(#)lexi.c 8.1 (Berkeley) 6/6/93"; 39 #endif /* not lint */ 40 #endif 41 #include <sys/cdefs.h> 42 __FBSDID("$FreeBSD$"); 43 44 /* 45 * Here we have the token scanner for indent. It scans off one token and puts 46 * it in the global variable "token". It returns a code, indicating the type 47 * of token scanned. 48 */ 49 50 #include <err.h> 51 #include <stdio.h> 52 #include <ctype.h> 53 #include <stdlib.h> 54 #include <string.h> 55 #include "indent_globs.h" 56 #include "indent_codes.h" 57 #include "indent.h" 58 59 #define alphanum 1 60 #ifdef undef 61 #define opchar 3 62 #endif 63 64 struct templ { 65 const char *rwd; 66 int rwcode; 67 }; 68 69 /* 70 * This table has to be sorted alphabetically, because it'll be used in binary 71 * search. For the same reason, string must be the first thing in struct templ. 72 */ 73 struct templ specials[] = 74 { 75 {"auto", 10}, 76 {"break", 9}, 77 {"case", 8}, 78 {"char", 4}, 79 {"const", 4}, 80 {"default", 8}, 81 {"do", 6}, 82 {"double", 4}, 83 {"else", 6}, 84 {"enum", 3}, 85 {"extern", 10}, 86 {"float", 4}, 87 {"for", 5}, 88 {"global", 4}, 89 {"goto", 9}, 90 {"if", 5}, 91 {"int", 4}, 92 {"long", 4}, 93 {"offsetof", 1}, 94 {"register", 10}, 95 {"return", 9}, 96 {"short", 4}, 97 {"sizeof", 2}, 98 {"static", 10}, 99 {"struct", 3}, 100 {"switch", 7}, 101 {"typedef", 10}, 102 {"union", 3}, 103 {"unsigned", 4}, 104 {"void", 4}, 105 {"volatile", 4}, 106 {"while", 5} 107 }; 108 109 const char **typenames; 110 int typename_count; 111 int typename_top = -1; 112 113 char chartype[128] = 114 { /* this is used to facilitate the decision of 115 * what type (alphanumeric, operator) each 116 * character is */ 117 0, 0, 0, 0, 0, 0, 0, 0, 118 0, 0, 0, 0, 0, 0, 0, 0, 119 0, 0, 0, 0, 0, 0, 0, 0, 120 0, 0, 0, 0, 0, 0, 0, 0, 121 0, 3, 0, 0, 1, 3, 3, 0, 122 0, 0, 3, 3, 0, 3, 0, 3, 123 1, 1, 1, 1, 1, 1, 1, 1, 124 1, 1, 0, 0, 3, 3, 3, 3, 125 0, 1, 1, 1, 1, 1, 1, 1, 126 1, 1, 1, 1, 1, 1, 1, 1, 127 1, 1, 1, 1, 1, 1, 1, 1, 128 1, 1, 1, 0, 0, 0, 3, 1, 129 0, 1, 1, 1, 1, 1, 1, 1, 130 1, 1, 1, 1, 1, 1, 1, 1, 131 1, 1, 1, 1, 1, 1, 1, 1, 132 1, 1, 1, 0, 3, 0, 3, 0 133 }; 134 135 static int 136 strcmp_type(const void *e1, const void *e2) 137 { 138 return (strcmp(e1, *(const char * const *)e2)); 139 } 140 141 int 142 lexi(void) 143 { 144 int unary_delim; /* this is set to 1 if the current token 145 * forces a following operator to be unary */ 146 static int last_code; /* the last token type returned */ 147 static int l_struct; /* set to 1 if the last token was 'struct' */ 148 int code; /* internal code to be returned */ 149 char qchar; /* the delimiter character for a string */ 150 151 e_token = s_token; /* point to start of place to save token */ 152 unary_delim = false; 153 ps.col_1 = ps.last_nl; /* tell world that this token started in 154 * column 1 iff the last thing scanned was nl */ 155 ps.last_nl = false; 156 157 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 158 ps.col_1 = false; /* leading blanks imply token is not in column 159 * 1 */ 160 if (++buf_ptr >= buf_end) 161 fill_buffer(); 162 } 163 164 /* Scan an alphanumeric token */ 165 if (chartype[(int)*buf_ptr] == alphanum || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) { 166 /* 167 * we have a character or number 168 */ 169 struct templ *p; 170 171 if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) { 172 enum base { 173 BASE_2, BASE_8, BASE_10, BASE_16 174 }; 175 int seendot = 0, 176 seenexp = 0, 177 seensfx = 0; 178 enum base in_base = BASE_10; 179 180 if (*buf_ptr == '0') { 181 if (buf_ptr[1] == 'b' || buf_ptr[1] == 'B') 182 in_base = BASE_2; 183 else if (buf_ptr[1] == 'x' || buf_ptr[1] == 'X') 184 in_base = BASE_16; 185 else if (isdigit(buf_ptr[1])) 186 in_base = BASE_8; 187 } 188 switch (in_base) { 189 case BASE_2: 190 *e_token++ = *buf_ptr++; 191 *e_token++ = *buf_ptr++; 192 while (*buf_ptr == '0' || *buf_ptr == '1') { 193 CHECK_SIZE_TOKEN; 194 *e_token++ = *buf_ptr++; 195 } 196 break; 197 case BASE_8: 198 *e_token++ = *buf_ptr++; 199 while (*buf_ptr >= '0' && *buf_ptr <= '8') { 200 CHECK_SIZE_TOKEN; 201 *e_token++ = *buf_ptr++; 202 } 203 break; 204 case BASE_16: 205 *e_token++ = *buf_ptr++; 206 *e_token++ = *buf_ptr++; 207 while (isxdigit(*buf_ptr)) { 208 CHECK_SIZE_TOKEN; 209 *e_token++ = *buf_ptr++; 210 } 211 break; 212 case BASE_10: 213 while (1) { 214 if (*buf_ptr == '.') { 215 if (seendot) 216 break; 217 else 218 seendot++; 219 } 220 CHECK_SIZE_TOKEN; 221 *e_token++ = *buf_ptr++; 222 if (!isdigit(*buf_ptr) && *buf_ptr != '.') { 223 if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp) 224 break; 225 else { 226 seenexp++; 227 seendot++; 228 CHECK_SIZE_TOKEN; 229 *e_token++ = *buf_ptr++; 230 if (*buf_ptr == '+' || *buf_ptr == '-') 231 *e_token++ = *buf_ptr++; 232 } 233 } 234 } 235 break; 236 } 237 while (1) { 238 if (!(seensfx & 1) && (*buf_ptr == 'U' || *buf_ptr == 'u')) { 239 CHECK_SIZE_TOKEN; 240 *e_token++ = *buf_ptr++; 241 seensfx |= 1; 242 continue; 243 } 244 if (!(seensfx & 2) && (strchr("fFlL", *buf_ptr) != NULL)) { 245 CHECK_SIZE_TOKEN; 246 if (buf_ptr[1] == buf_ptr[0]) 247 *e_token++ = *buf_ptr++; 248 *e_token++ = *buf_ptr++; 249 seensfx |= 2; 250 continue; 251 } 252 break; 253 } 254 } 255 else 256 while (chartype[(int)*buf_ptr] == alphanum || *buf_ptr == BACKSLASH) { 257 /* fill_buffer() terminates buffer with newline */ 258 if (*buf_ptr == BACKSLASH) { 259 if (*(buf_ptr + 1) == '\n') { 260 buf_ptr += 2; 261 if (buf_ptr >= buf_end) 262 fill_buffer(); 263 } else 264 break; 265 } 266 CHECK_SIZE_TOKEN; 267 /* copy it over */ 268 *e_token++ = *buf_ptr++; 269 if (buf_ptr >= buf_end) 270 fill_buffer(); 271 } 272 *e_token++ = '\0'; 273 274 if (s_token[0] == 'L' && s_token[1] == '\0' && 275 (*buf_ptr == '"' || *buf_ptr == '\'')) 276 return (strpfx); 277 278 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 279 if (++buf_ptr >= buf_end) 280 fill_buffer(); 281 } 282 ps.keyword = 0; 283 if (l_struct && !ps.p_l_follow) { 284 /* if last token was 'struct' and we're not 285 * in parentheses, then this token 286 * should be treated as a declaration */ 287 l_struct = false; 288 last_code = ident; 289 ps.last_u_d = true; 290 return (decl); 291 } 292 ps.last_u_d = l_struct; /* Operator after identifier is binary 293 * unless last token was 'struct' */ 294 l_struct = false; 295 last_code = ident; /* Remember that this is the code we will 296 * return */ 297 298 p = bsearch(s_token, 299 specials, 300 sizeof(specials) / sizeof(specials[0]), 301 sizeof(specials[0]), 302 strcmp_type); 303 if (p == NULL) { /* not a special keyword... */ 304 char *u; 305 306 /* ... so maybe a type_t or a typedef */ 307 if ((auto_typedefs && ((u = strrchr(s_token, '_')) != NULL) && 308 strcmp(u, "_t") == 0) || (typename_top >= 0 && 309 bsearch(s_token, typenames, typename_top + 1, 310 sizeof(typenames[0]), strcmp_type))) { 311 ps.keyword = 4; /* a type name */ 312 ps.last_u_d = true; 313 goto found_typename; 314 } 315 } else { /* we have a keyword */ 316 ps.keyword = p->rwcode; 317 ps.last_u_d = true; 318 switch (p->rwcode) { 319 case 7: /* it is a switch */ 320 return (swstmt); 321 case 8: /* a case or default */ 322 return (casestmt); 323 324 case 3: /* a "struct" */ 325 /* 326 * Next time around, we will want to know that we have had a 327 * 'struct' 328 */ 329 l_struct = true; 330 /* FALLTHROUGH */ 331 332 case 4: /* one of the declaration keywords */ 333 found_typename: 334 if (ps.p_l_follow) { 335 /* inside parens: cast, param list, offsetof or sizeof */ 336 ps.cast_mask |= (1 << ps.p_l_follow) & ~ps.not_cast_mask; 337 break; 338 } 339 last_code = decl; 340 return (decl); 341 342 case 5: /* if, while, for */ 343 return (sp_paren); 344 345 case 6: /* do, else */ 346 return (sp_nparen); 347 348 case 10: /* storage class specifier */ 349 return (storage); 350 351 default: /* all others are treated like any other 352 * identifier */ 353 return (ident); 354 } /* end of switch */ 355 } /* end of if (found_it) */ 356 if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0 && 357 ps.in_parameter_declaration == 0 && ps.block_init == 0) { 358 char *tp = buf_ptr; 359 while (tp < buf_end) 360 if (*tp++ == ')' && (*tp == ';' || *tp == ',')) 361 goto not_proc; 362 strncpy(ps.procname, token, sizeof ps.procname - 1); 363 if (ps.in_decl) 364 ps.in_parameter_declaration = 1; 365 return (last_code = funcname); 366 not_proc:; 367 } 368 /* 369 * The following hack attempts to guess whether or not the current 370 * token is in fact a declaration keyword -- one that has been 371 * typedefd 372 */ 373 if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_') 374 && !ps.p_l_follow 375 && !ps.block_init 376 && (ps.last_token == rparen || ps.last_token == semicolon || 377 ps.last_token == decl || 378 ps.last_token == lbrace || ps.last_token == rbrace)) { 379 ps.keyword = 4; /* a type name */ 380 ps.last_u_d = true; 381 last_code = decl; 382 return decl; 383 } 384 if (last_code == decl) /* if this is a declared variable, then 385 * following sign is unary */ 386 ps.last_u_d = true; /* will make "int a -1" work */ 387 last_code = ident; 388 return (ident); /* the ident is not in the list */ 389 } /* end of procesing for alpanum character */ 390 391 /* Scan a non-alphanumeric token */ 392 393 *e_token++ = *buf_ptr; /* if it is only a one-character token, it is 394 * moved here */ 395 *e_token = '\0'; 396 if (++buf_ptr >= buf_end) 397 fill_buffer(); 398 399 switch (*token) { 400 case '\n': 401 unary_delim = ps.last_u_d; 402 ps.last_nl = true; /* remember that we just had a newline */ 403 code = (had_eof ? 0 : newline); 404 405 /* 406 * if data has been exhausted, the newline is a dummy, and we should 407 * return code to stop 408 */ 409 break; 410 411 case '\'': /* start of quoted character */ 412 case '"': /* start of string */ 413 qchar = *token; 414 if (troff) { 415 e_token[-1] = '`'; 416 if (qchar == '"') 417 *e_token++ = '`'; 418 e_token = chfont(&bodyf, &stringf, e_token); 419 } 420 do { /* copy the string */ 421 while (1) { /* move one character or [/<char>]<char> */ 422 if (*buf_ptr == '\n') { 423 diag2(1, "Unterminated literal"); 424 goto stop_lit; 425 } 426 CHECK_SIZE_TOKEN; /* Only have to do this once in this loop, 427 * since CHECK_SIZE guarantees that there 428 * are at least 5 entries left */ 429 *e_token = *buf_ptr++; 430 if (buf_ptr >= buf_end) 431 fill_buffer(); 432 if (*e_token == BACKSLASH) { /* if escape, copy extra char */ 433 if (*buf_ptr == '\n') /* check for escaped newline */ 434 ++line_no; 435 if (troff) { 436 *++e_token = BACKSLASH; 437 if (*buf_ptr == BACKSLASH) 438 *++e_token = BACKSLASH; 439 } 440 *++e_token = *buf_ptr++; 441 ++e_token; /* we must increment this again because we 442 * copied two chars */ 443 if (buf_ptr >= buf_end) 444 fill_buffer(); 445 } 446 else 447 break; /* we copied one character */ 448 } /* end of while (1) */ 449 } while (*e_token++ != qchar); 450 if (troff) { 451 e_token = chfont(&stringf, &bodyf, e_token - 1); 452 if (qchar == '"') 453 *e_token++ = '\''; 454 } 455 stop_lit: 456 code = ident; 457 break; 458 459 case ('('): 460 case ('['): 461 unary_delim = true; 462 code = lparen; 463 break; 464 465 case (')'): 466 case (']'): 467 code = rparen; 468 break; 469 470 case '#': 471 unary_delim = ps.last_u_d; 472 code = preesc; 473 break; 474 475 case '?': 476 unary_delim = true; 477 code = question; 478 break; 479 480 case (':'): 481 code = colon; 482 unary_delim = true; 483 break; 484 485 case (';'): 486 unary_delim = true; 487 code = semicolon; 488 break; 489 490 case ('{'): 491 unary_delim = true; 492 493 /* 494 * if (ps.in_or_st) ps.block_init = 1; 495 */ 496 /* ? code = ps.block_init ? lparen : lbrace; */ 497 code = lbrace; 498 break; 499 500 case ('}'): 501 unary_delim = true; 502 /* ? code = ps.block_init ? rparen : rbrace; */ 503 code = rbrace; 504 break; 505 506 case 014: /* a form feed */ 507 unary_delim = ps.last_u_d; 508 ps.last_nl = true; /* remember this so we can set 'ps.col_1' 509 * right */ 510 code = form_feed; 511 break; 512 513 case (','): 514 unary_delim = true; 515 code = comma; 516 break; 517 518 case '.': 519 unary_delim = false; 520 code = period; 521 break; 522 523 case '-': 524 case '+': /* check for -, +, --, ++ */ 525 code = (ps.last_u_d ? unary_op : binary_op); 526 unary_delim = true; 527 528 if (*buf_ptr == token[0]) { 529 /* check for doubled character */ 530 *e_token++ = *buf_ptr++; 531 /* buffer overflow will be checked at end of loop */ 532 if (last_code == ident || last_code == rparen) { 533 code = (ps.last_u_d ? unary_op : postop); 534 /* check for following ++ or -- */ 535 unary_delim = false; 536 } 537 } 538 else if (*buf_ptr == '=') 539 /* check for operator += */ 540 *e_token++ = *buf_ptr++; 541 else if (*buf_ptr == '>') { 542 /* check for operator -> */ 543 *e_token++ = *buf_ptr++; 544 if (!pointer_as_binop) { 545 unary_delim = false; 546 code = unary_op; 547 ps.want_blank = false; 548 } 549 } 550 break; /* buffer overflow will be checked at end of 551 * switch */ 552 553 case '=': 554 if (ps.in_or_st) 555 ps.block_init = 1; 556 #ifdef undef 557 if (chartype[*buf_ptr] == opchar) { /* we have two char assignment */ 558 e_token[-1] = *buf_ptr++; 559 if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr) 560 *e_token++ = *buf_ptr++; 561 *e_token++ = '='; /* Flip =+ to += */ 562 *e_token = 0; 563 } 564 #else 565 if (*buf_ptr == '=') {/* == */ 566 *e_token++ = '='; /* Flip =+ to += */ 567 buf_ptr++; 568 *e_token = 0; 569 } 570 #endif 571 code = binary_op; 572 unary_delim = true; 573 break; 574 /* can drop thru!!! */ 575 576 case '>': 577 case '<': 578 case '!': /* ops like <, <<, <=, !=, etc */ 579 if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') { 580 *e_token++ = *buf_ptr; 581 if (++buf_ptr >= buf_end) 582 fill_buffer(); 583 } 584 if (*buf_ptr == '=') 585 *e_token++ = *buf_ptr++; 586 code = (ps.last_u_d ? unary_op : binary_op); 587 unary_delim = true; 588 break; 589 590 default: 591 if (token[0] == '/' && *buf_ptr == '*') { 592 /* it is start of comment */ 593 *e_token++ = '*'; 594 595 if (++buf_ptr >= buf_end) 596 fill_buffer(); 597 598 code = comment; 599 unary_delim = ps.last_u_d; 600 break; 601 } 602 while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') { 603 /* 604 * handle ||, &&, etc, and also things as in int *****i 605 */ 606 *e_token++ = *buf_ptr; 607 if (++buf_ptr >= buf_end) 608 fill_buffer(); 609 } 610 code = (ps.last_u_d ? unary_op : binary_op); 611 unary_delim = true; 612 613 614 } /* end of switch */ 615 if (code != newline) { 616 l_struct = false; 617 last_code = code; 618 } 619 if (buf_ptr >= buf_end) /* check for input buffer empty */ 620 fill_buffer(); 621 ps.last_u_d = unary_delim; 622 *e_token = '\0'; /* null terminate the token */ 623 return (code); 624 } 625 626 void 627 alloc_typenames(void) 628 { 629 630 typenames = (const char **)malloc(sizeof(typenames[0]) * 631 (typename_count = 16)); 632 if (typenames == NULL) 633 err(1, NULL); 634 } 635 636 void 637 add_typename(const char *key) 638 { 639 int comparison; 640 const char *copy; 641 642 if (typename_top + 1 >= typename_count) { 643 typenames = realloc((void *)typenames, 644 sizeof(typenames[0]) * (typename_count *= 2)); 645 if (typenames == NULL) 646 err(1, NULL); 647 } 648 if (typename_top == -1) 649 typenames[++typename_top] = copy = strdup(key); 650 else if ((comparison = strcmp(key, typenames[typename_top])) >= 0) { 651 /* take advantage of sorted input */ 652 if (comparison == 0) /* remove duplicates */ 653 return; 654 typenames[++typename_top] = copy = strdup(key); 655 } 656 else { 657 int p; 658 659 for (p = 0; (comparison = strcmp(key, typenames[p])) > 0; p++) 660 /* find place for the new key */; 661 if (comparison == 0) /* remove duplicates */ 662 return; 663 memmove(&typenames[p + 1], &typenames[p], 664 sizeof(typenames[0]) * (++typename_top - p)); 665 typenames[p] = copy = strdup(key); 666 } 667 668 if (copy == NULL) 669 err(1, NULL); 670 } 671