1 /* 2 * Copyright (c) 1985 Sun Microsystems, Inc. 3 * Copyright (c) 1980, 1993 4 * The Regents of the University of California. All rights reserved. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. All advertising materials mentioning features or use of this software 16 * must display the following acknowledgement: 17 * This product includes software developed by the University of 18 * California, Berkeley and its contributors. 19 * 4. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 */ 35 36 #if 0 37 #ifndef lint 38 static char sccsid[] = "@(#)lexi.c 8.1 (Berkeley) 6/6/93"; 39 #endif /* not lint */ 40 #endif 41 #include <sys/cdefs.h> 42 __FBSDID("$FreeBSD$"); 43 44 /* 45 * Here we have the token scanner for indent. It scans off one token and puts 46 * it in the global variable "token". It returns a code, indicating the type 47 * of token scanned. 48 */ 49 50 #include <err.h> 51 #include <stdio.h> 52 #include <ctype.h> 53 #include <stdlib.h> 54 #include <string.h> 55 #include "indent_globs.h" 56 #include "indent_codes.h" 57 #include "indent.h" 58 59 #define alphanum 1 60 #define opchar 3 61 62 struct templ { 63 const char *rwd; 64 int rwcode; 65 }; 66 67 struct templ specials[1000] = 68 { 69 {"switch", 1}, 70 {"case", 2}, 71 {"break", 0}, 72 {"struct", 3}, 73 {"union", 3}, 74 {"enum", 3}, 75 {"default", 2}, 76 {"int", 4}, 77 {"char", 4}, 78 {"float", 4}, 79 {"double", 4}, 80 {"long", 4}, 81 {"short", 4}, 82 {"typdef", 4}, 83 {"unsigned", 4}, 84 {"register", 4}, 85 {"static", 4}, 86 {"global", 4}, 87 {"extern", 4}, 88 {"void", 4}, 89 {"const", 4}, 90 {"volatile", 4}, 91 {"goto", 0}, 92 {"return", 0}, 93 {"if", 5}, 94 {"while", 5}, 95 {"for", 5}, 96 {"else", 6}, 97 {"do", 6}, 98 {"sizeof", 7}, 99 {0, 0} 100 }; 101 102 char chartype[128] = 103 { /* this is used to facilitate the decision of 104 * what type (alphanumeric, operator) each 105 * character is */ 106 0, 0, 0, 0, 0, 0, 0, 0, 107 0, 0, 0, 0, 0, 0, 0, 0, 108 0, 0, 0, 0, 0, 0, 0, 0, 109 0, 0, 0, 0, 0, 0, 0, 0, 110 0, 3, 0, 0, 1, 3, 3, 0, 111 0, 0, 3, 3, 0, 3, 0, 3, 112 1, 1, 1, 1, 1, 1, 1, 1, 113 1, 1, 0, 0, 3, 3, 3, 3, 114 0, 1, 1, 1, 1, 1, 1, 1, 115 1, 1, 1, 1, 1, 1, 1, 1, 116 1, 1, 1, 1, 1, 1, 1, 1, 117 1, 1, 1, 0, 0, 0, 3, 1, 118 0, 1, 1, 1, 1, 1, 1, 1, 119 1, 1, 1, 1, 1, 1, 1, 1, 120 1, 1, 1, 1, 1, 1, 1, 1, 121 1, 1, 1, 0, 3, 0, 3, 0 122 }; 123 124 int 125 lexi(void) 126 { 127 int unary_delim; /* this is set to 1 if the current token 128 * forces a following operator to be unary */ 129 static int last_code; /* the last token type returned */ 130 static int l_struct; /* set to 1 if the last token was 'struct' */ 131 int code; /* internal code to be returned */ 132 char qchar; /* the delimiter character for a string */ 133 134 e_token = s_token; /* point to start of place to save token */ 135 unary_delim = false; 136 ps.col_1 = ps.last_nl; /* tell world that this token started in 137 * column 1 iff the last thing scanned was nl */ 138 ps.last_nl = false; 139 140 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 141 ps.col_1 = false; /* leading blanks imply token is not in column 142 * 1 */ 143 if (++buf_ptr >= buf_end) 144 fill_buffer(); 145 } 146 147 /* Scan an alphanumeric token */ 148 if (chartype[(int)*buf_ptr] == alphanum || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) { 149 /* 150 * we have a character or number 151 */ 152 const char *j; /* used for searching thru list of 153 * 154 * reserved words */ 155 struct templ *p; 156 157 if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) { 158 int seendot = 0, 159 seenexp = 0, 160 seensfx = 0; 161 if (*buf_ptr == '0' && 162 (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) { 163 *e_token++ = *buf_ptr++; 164 *e_token++ = *buf_ptr++; 165 while (isxdigit(*buf_ptr)) { 166 CHECK_SIZE_TOKEN; 167 *e_token++ = *buf_ptr++; 168 } 169 } 170 else 171 while (1) { 172 if (*buf_ptr == '.') { 173 if (seendot) 174 break; 175 else 176 seendot++; 177 } 178 CHECK_SIZE_TOKEN; 179 *e_token++ = *buf_ptr++; 180 if (!isdigit(*buf_ptr) && *buf_ptr != '.') { 181 if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp) 182 break; 183 else { 184 seenexp++; 185 seendot++; 186 CHECK_SIZE_TOKEN; 187 *e_token++ = *buf_ptr++; 188 if (*buf_ptr == '+' || *buf_ptr == '-') 189 *e_token++ = *buf_ptr++; 190 } 191 } 192 } 193 while (1) { 194 if (!(seensfx & 1) && 195 (*buf_ptr == 'U' || *buf_ptr == 'u')) { 196 CHECK_SIZE_TOKEN; 197 *e_token++ = *buf_ptr++; 198 seensfx |= 1; 199 continue; 200 } 201 if (!(seensfx & 2) && 202 (*buf_ptr == 'L' || *buf_ptr == 'l')) { 203 CHECK_SIZE_TOKEN; 204 if (buf_ptr[1] == buf_ptr[0]) 205 *e_token++ = *buf_ptr++; 206 *e_token++ = *buf_ptr++; 207 seensfx |= 2; 208 continue; 209 } 210 break; 211 } 212 } 213 else 214 while (chartype[(int)*buf_ptr] == alphanum || *buf_ptr == BACKSLASH) { 215 /* fill_buffer() terminates buffer with newline */ 216 if (*buf_ptr == BACKSLASH) { 217 if (*(buf_ptr + 1) == '\n') { 218 buf_ptr += 2; 219 if (buf_ptr >= buf_end) 220 fill_buffer(); 221 } else 222 break; 223 } 224 CHECK_SIZE_TOKEN; 225 /* copy it over */ 226 *e_token++ = *buf_ptr++; 227 if (buf_ptr >= buf_end) 228 fill_buffer(); 229 } 230 *e_token++ = '\0'; 231 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */ 232 if (++buf_ptr >= buf_end) 233 fill_buffer(); 234 } 235 ps.its_a_keyword = false; 236 ps.sizeof_keyword = false; 237 if (l_struct && !ps.p_l_follow) { 238 /* if last token was 'struct' and we're not 239 * in parentheses, then this token 240 * should be treated as a declaration */ 241 l_struct = false; 242 last_code = ident; 243 ps.last_u_d = true; 244 return (decl); 245 } 246 ps.last_u_d = l_struct; /* Operator after identifier is binary 247 * unless last token was 'struct' */ 248 l_struct = false; 249 last_code = ident; /* Remember that this is the code we will 250 * return */ 251 252 /* 253 * This loop will check if the token is a keyword. 254 */ 255 for (p = specials; (j = p->rwd) != 0; p++) { 256 const char *q = s_token; /* point at scanned token */ 257 if (*j++ != *q++ || *j++ != *q++) 258 continue; /* This test depends on the fact that 259 * identifiers are always at least 1 character 260 * long (ie. the first two bytes of the 261 * identifier are always meaningful) */ 262 if (q[-1] == 0) 263 break; /* If its a one-character identifier */ 264 while (*q++ == *j) 265 if (*j++ == 0) 266 goto found_keyword; /* I wish that C had a multi-level 267 * break... */ 268 } 269 if (p->rwd) { /* we have a keyword */ 270 found_keyword: 271 ps.its_a_keyword = true; 272 ps.last_u_d = true; 273 switch (p->rwcode) { 274 case 1: /* it is a switch */ 275 return (swstmt); 276 case 2: /* a case or default */ 277 return (casestmt); 278 279 case 3: /* a "struct" */ 280 /* 281 * Next time around, we will want to know that we have had a 282 * 'struct' 283 */ 284 l_struct = true; 285 /* FALLTHROUGH */ 286 287 case 4: /* one of the declaration keywords */ 288 if (ps.p_l_follow) { 289 ps.cast_mask |= (1 << ps.p_l_follow) & ~ps.sizeof_mask; 290 break; /* inside parens: cast, param list or sizeof */ 291 } 292 last_code = decl; 293 return (decl); 294 295 case 5: /* if, while, for */ 296 return (sp_paren); 297 298 case 6: /* do, else */ 299 return (sp_nparen); 300 301 case 7: 302 ps.sizeof_keyword = true; 303 default: /* all others are treated like any other 304 * identifier */ 305 return (ident); 306 } /* end of switch */ 307 } /* end of if (found_it) */ 308 if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) { 309 char *tp = buf_ptr; 310 while (tp < buf_end) 311 if (*tp++ == ')' && (*tp == ';' || *tp == ',')) 312 goto not_proc; 313 strncpy(ps.procname, token, sizeof ps.procname - 1); 314 ps.in_parameter_declaration = 1; 315 rparen_count = 1; 316 not_proc:; 317 } 318 /* 319 * The following hack attempts to guess whether or not the current 320 * token is in fact a declaration keyword -- one that has been 321 * typedefd 322 */ 323 if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_') 324 && !ps.p_l_follow 325 && !ps.block_init 326 && (ps.last_token == rparen || ps.last_token == semicolon || 327 ps.last_token == decl || 328 ps.last_token == lbrace || ps.last_token == rbrace)) { 329 ps.its_a_keyword = true; 330 ps.last_u_d = true; 331 last_code = decl; 332 return decl; 333 } 334 if (last_code == decl) /* if this is a declared variable, then 335 * following sign is unary */ 336 ps.last_u_d = true; /* will make "int a -1" work */ 337 last_code = ident; 338 return (ident); /* the ident is not in the list */ 339 } /* end of procesing for alpanum character */ 340 341 /* Scan a non-alphanumeric token */ 342 343 *e_token++ = *buf_ptr; /* if it is only a one-character token, it is 344 * moved here */ 345 *e_token = '\0'; 346 if (++buf_ptr >= buf_end) 347 fill_buffer(); 348 349 switch (*token) { 350 case '\n': 351 unary_delim = ps.last_u_d; 352 ps.last_nl = true; /* remember that we just had a newline */ 353 code = (had_eof ? 0 : newline); 354 355 /* 356 * if data has been exhausted, the newline is a dummy, and we should 357 * return code to stop 358 */ 359 break; 360 361 case '\'': /* start of quoted character */ 362 case '"': /* start of string */ 363 qchar = *token; 364 if (troff) { 365 e_token[-1] = '`'; 366 if (qchar == '"') 367 *e_token++ = '`'; 368 e_token = chfont(&bodyf, &stringf, e_token); 369 } 370 do { /* copy the string */ 371 while (1) { /* move one character or [/<char>]<char> */ 372 if (*buf_ptr == '\n') { 373 diag2(1, "Unterminated literal"); 374 goto stop_lit; 375 } 376 CHECK_SIZE_TOKEN; /* Only have to do this once in this loop, 377 * since CHECK_SIZE guarantees that there 378 * are at least 5 entries left */ 379 *e_token = *buf_ptr++; 380 if (buf_ptr >= buf_end) 381 fill_buffer(); 382 if (*e_token == BACKSLASH) { /* if escape, copy extra char */ 383 if (*buf_ptr == '\n') /* check for escaped newline */ 384 ++line_no; 385 if (troff) { 386 *++e_token = BACKSLASH; 387 if (*buf_ptr == BACKSLASH) 388 *++e_token = BACKSLASH; 389 } 390 *++e_token = *buf_ptr++; 391 ++e_token; /* we must increment this again because we 392 * copied two chars */ 393 if (buf_ptr >= buf_end) 394 fill_buffer(); 395 } 396 else 397 break; /* we copied one character */ 398 } /* end of while (1) */ 399 } while (*e_token++ != qchar); 400 if (troff) { 401 e_token = chfont(&stringf, &bodyf, e_token - 1); 402 if (qchar == '"') 403 *e_token++ = '\''; 404 } 405 stop_lit: 406 code = ident; 407 break; 408 409 case ('('): 410 case ('['): 411 unary_delim = true; 412 code = lparen; 413 break; 414 415 case (')'): 416 case (']'): 417 code = rparen; 418 break; 419 420 case '#': 421 unary_delim = ps.last_u_d; 422 code = preesc; 423 break; 424 425 case '?': 426 unary_delim = true; 427 code = question; 428 break; 429 430 case (':'): 431 code = colon; 432 unary_delim = true; 433 break; 434 435 case (';'): 436 unary_delim = true; 437 code = semicolon; 438 break; 439 440 case ('{'): 441 unary_delim = true; 442 443 /* 444 * if (ps.in_or_st) ps.block_init = 1; 445 */ 446 /* ? code = ps.block_init ? lparen : lbrace; */ 447 code = lbrace; 448 break; 449 450 case ('}'): 451 unary_delim = true; 452 /* ? code = ps.block_init ? rparen : rbrace; */ 453 code = rbrace; 454 break; 455 456 case 014: /* a form feed */ 457 unary_delim = ps.last_u_d; 458 ps.last_nl = true; /* remember this so we can set 'ps.col_1' 459 * right */ 460 code = form_feed; 461 break; 462 463 case (','): 464 unary_delim = true; 465 code = comma; 466 break; 467 468 case '.': 469 unary_delim = false; 470 code = period; 471 break; 472 473 case '-': 474 case '+': /* check for -, +, --, ++ */ 475 code = (ps.last_u_d ? unary_op : binary_op); 476 unary_delim = true; 477 478 if (*buf_ptr == token[0]) { 479 /* check for doubled character */ 480 *e_token++ = *buf_ptr++; 481 /* buffer overflow will be checked at end of loop */ 482 if (last_code == ident || last_code == rparen) { 483 code = (ps.last_u_d ? unary_op : postop); 484 /* check for following ++ or -- */ 485 unary_delim = false; 486 } 487 } 488 else if (*buf_ptr == '=') 489 /* check for operator += */ 490 *e_token++ = *buf_ptr++; 491 else if (*buf_ptr == '>') { 492 /* check for operator -> */ 493 *e_token++ = *buf_ptr++; 494 if (!pointer_as_binop) { 495 unary_delim = false; 496 code = unary_op; 497 ps.want_blank = false; 498 } 499 } 500 break; /* buffer overflow will be checked at end of 501 * switch */ 502 503 case '=': 504 if (ps.in_or_st) 505 ps.block_init = 1; 506 #ifdef undef 507 if (chartype[*buf_ptr] == opchar) { /* we have two char assignment */ 508 e_token[-1] = *buf_ptr++; 509 if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr) 510 *e_token++ = *buf_ptr++; 511 *e_token++ = '='; /* Flip =+ to += */ 512 *e_token = 0; 513 } 514 #else 515 if (*buf_ptr == '=') {/* == */ 516 *e_token++ = '='; /* Flip =+ to += */ 517 buf_ptr++; 518 *e_token = 0; 519 } 520 #endif 521 code = binary_op; 522 unary_delim = true; 523 break; 524 /* can drop thru!!! */ 525 526 case '>': 527 case '<': 528 case '!': /* ops like <, <<, <=, !=, etc */ 529 if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') { 530 *e_token++ = *buf_ptr; 531 if (++buf_ptr >= buf_end) 532 fill_buffer(); 533 } 534 if (*buf_ptr == '=') 535 *e_token++ = *buf_ptr++; 536 code = (ps.last_u_d ? unary_op : binary_op); 537 unary_delim = true; 538 break; 539 540 default: 541 if (token[0] == '/' && *buf_ptr == '*') { 542 /* it is start of comment */ 543 *e_token++ = '*'; 544 545 if (++buf_ptr >= buf_end) 546 fill_buffer(); 547 548 code = comment; 549 unary_delim = ps.last_u_d; 550 break; 551 } 552 while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') { 553 /* 554 * handle ||, &&, etc, and also things as in int *****i 555 */ 556 *e_token++ = *buf_ptr; 557 if (++buf_ptr >= buf_end) 558 fill_buffer(); 559 } 560 code = (ps.last_u_d ? unary_op : binary_op); 561 unary_delim = true; 562 563 564 } /* end of switch */ 565 if (code != newline) { 566 l_struct = false; 567 last_code = code; 568 } 569 if (buf_ptr >= buf_end) /* check for input buffer empty */ 570 fill_buffer(); 571 ps.last_u_d = unary_delim; 572 *e_token = '\0'; /* null terminate the token */ 573 return (code); 574 } 575 576 /* 577 * Add the given keyword to the keyword table, using val as the keyword type 578 */ 579 void 580 addkey(char *key, int val) 581 { 582 struct templ *p = specials; 583 while (p->rwd) 584 if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0) 585 return; 586 else 587 p++; 588 if (p >= specials + sizeof specials / sizeof specials[0]) 589 return; /* For now, table overflows are silently 590 * ignored */ 591 p->rwd = key; 592 p->rwcode = val; 593 p[1].rwd = 0; 594 p[1].rwcode = 0; 595 } 596