1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 23 /* All Rights Reserved */ 24 25 26 /* 27 * Copyright 2004 Sun Microsystems, Inc. All rights reserved. 28 * Use is subject to license terms. 29 */ 30 31 #pragma ident "%Z%%M% %I% %E% SMI" 32 33 #include <stdlib.h> 34 #include <unistd.h> 35 #include <limits.h> 36 #include <string.h> 37 #include <stdio.h> 38 #include <ctype.h> 39 #include <locale.h> 40 #include "hash.h" 41 42 #define Tolower(c) (isupper(c)?tolower(c):c) 43 #define DLEV 2 44 45 /* 46 * ANSI prototypes 47 */ 48 static int ily(char *, char *, char *, int); 49 static int s(char *, char *, char *, int); 50 static int es(char *, char *, char *, int); 51 static int subst(char *, char *, char *, int); 52 static int nop(void); 53 static int bility(char *, char *, char *, int); 54 static int i_to_y(char *, char *, char *, int); 55 static int CCe(char *, char *, char *, int); 56 static int y_to_e(char *, char *, char *, int); 57 static int strip(char *, char *, char *, int); 58 static int ize(char *, char *, char *, int); 59 static int tion(char *, char *, char *, int); 60 static int an(char *, char *, char *, int); 61 int prime(char *); 62 static void ise(void); 63 static int tryword(char *, char *, int); 64 static int trypref(char *, char *, int); 65 static int trysuff(char *, int); 66 static int vowel(int); 67 static int dict(char *, char *); 68 static int monosyl(char *, char *); 69 static int VCe(char *, char *, char *, int); 70 static char *skipv(char *); 71 static void ztos(char *); 72 73 static struct suftab { 74 char *suf; 75 int (*p1)(); 76 int n1; 77 char *d1; 78 char *a1; 79 int (*p2)(); 80 int n2; 81 char *d2; 82 char *a2; 83 } suftab[] = { 84 {"ssen", ily, 4, "-y+iness", "+ness" }, 85 {"ssel", ily, 4, "-y+i+less", "+less" }, 86 {"se", s, 1, "", "+s", es, 2, "-y+ies", "+es" }, 87 {"s'", s, 2, "", "+'s"}, 88 {"s", s, 1, "", "+s"}, 89 {"ecn", subst, 1, "-t+ce", ""}, 90 {"ycn", subst, 1, "-t+cy", ""}, 91 {"ytilb", nop, 0, "", ""}, 92 {"ytilib", bility, 5, "-le+ility", ""}, 93 {"elbaif", i_to_y, 4, "-y+iable", ""}, 94 {"elba", CCe, 4, "-e+able", "+able"}, 95 {"yti", CCe, 3, "-e+ity", "+ity"}, 96 {"ylb", y_to_e, 1, "-e+y", ""}, 97 {"yl", ily, 2, "-y+ily", "+ly"}, 98 {"laci", strip, 2, "", "+al"}, 99 {"latnem", strip, 2, "", "+al"}, 100 {"lanoi", strip, 2, "", "+al"}, 101 {"tnem", strip, 4, "", "+ment"}, 102 {"gni", CCe, 3, "-e+ing", "+ing"}, 103 {"reta", nop, 0, "", ""}, 104 {"retc", nop, 0, "", ""}, 105 {"re", strip, 1, "", "+r", i_to_y, 2, "-y+ier", "+er"}, 106 {"de", strip, 1, "", "+d", i_to_y, 2, "-y+ied", "+ed"}, 107 {"citsi", strip, 2, "", "+ic"}, 108 {"citi", ize, 1, "-ic+e", ""}, 109 {"cihparg", i_to_y, 1, "-y+ic", ""}, 110 {"tse", strip, 2, "", "+st", i_to_y, 3, "-y+iest", "+est"}, 111 {"cirtem", i_to_y, 1, "-y+ic", ""}, 112 {"yrtem", subst, 0, "-er+ry", ""}, 113 {"cigol", i_to_y, 1, "-y+ic", ""}, 114 {"tsigol", i_to_y, 2, "-y+ist", ""}, 115 {"tsi", CCe, 3, "-e+ist", "+ist"}, 116 {"msi", CCe, 3, "-e+ism", "+ist"}, 117 {"noitacifi", i_to_y, 6, "-y+ication", ""}, 118 {"noitazi", ize, 4, "-e+ation", ""}, 119 {"rota", tion, 2, "-e+or", ""}, 120 {"rotc", tion, 2, "", "+or"}, 121 {"noit", tion, 3, "-e+ion", "+ion"}, 122 {"naino", an, 3, "", "+ian"}, 123 {"na", an, 1, "", "+n"}, 124 {"evi", subst, 0, "-ion+ive", ""}, 125 {"ezi", CCe, 3, "-e+ize", "+ize"}, 126 {"pihs", strip, 4, "", "+ship"}, 127 {"dooh", ily, 4, "-y+ihood", "+hood"}, 128 {"luf", ily, 3, "-y+iful", "+ful"}, 129 {"ekil", strip, 4, "", "+like"}, 130 0 131 }; 132 133 static char *preftab[] = { 134 "anti", 135 "auto", 136 "bio", 137 "counter", 138 "dis", 139 "electro", 140 "en", 141 "fore", 142 "geo", 143 "hyper", 144 "intra", 145 "inter", 146 "iso", 147 "kilo", 148 "magneto", 149 "meta", 150 "micro", 151 "mid", 152 "milli", 153 "mis", 154 "mono", 155 "multi", 156 "non", 157 "out", 158 "over", 159 "photo", 160 "poly", 161 "pre", 162 "pseudo", 163 "psycho", 164 "re", 165 "semi", 166 "stereo", 167 "sub", 168 "super", 169 "tele", 170 "thermo", 171 "ultra", 172 "under", /* must precede un */ 173 "un", 174 0 175 }; 176 177 static int vflag; 178 static int xflag; 179 static char *prog; 180 static char word[LINE_MAX]; 181 static char original[LINE_MAX]; 182 static char *deriv[LINE_MAX]; 183 static char affix[LINE_MAX]; 184 static FILE *file, *found; 185 /* 186 * deriv is stack of pointers to notes like +micro +ed 187 * affix is concatenated string of notes 188 * the buffer size 141 stems from the sizes of original and affix. 189 */ 190 191 /* 192 * in an attempt to defray future maintenance misunderstandings, here is 193 * an attempt to describe the input/output expectations of the spell 194 * program. 195 * 196 * spellprog is intended to be called from the shell file spell. 197 * because of this, there is little error checking (this is historical, not 198 * necessarily advisable). 199 * 200 * spellprog options hashed-list pass 201 * 202 * the hashed-list is a list of the form made by spellin. 203 * there are 2 types of hashed lists: 204 * 1. a stop list: this specifies words that by the rules embodied 205 * in spellprog would be recognized as correct, BUT are really 206 * errors. 207 * 2. a dictionary of correctly spelled words. 208 * the pass number determines how the words found in the specified 209 * hashed-list are treated. If the pass number is 1, the hashed-list is 210 * treated as the stop-list, otherwise, it is treated as the regular 211 * dictionary list. in this case, the value of "pass" is a filename. Found 212 * words are written to this file. 213 * 214 * In the normal case, the filename = /dev/null. However, if the v option 215 * is specified, the derivations are written to this file. 216 * The spellprog looks up words in the hashed-list; if a word is found, it 217 * is printed to the stdout. If the hashed-list was the stop-list, the 218 * words found are presumed to be misspellings. in this case, 219 * a control character is printed ( a "-" is appended to the word. 220 * a hyphen will never occur naturally in the input list because deroff 221 * is used in the shell file before calling spellprog.) 222 * If the regualar spelling list was used (hlista or hlistb), the words 223 * are correct, and may be ditched. (unless the -v option was used - 224 * see the manual page). 225 * 226 * spellprog should be called twice : first with the stop-list, to flag all 227 * a priori incorrectly spelled words; second with the dictionary. 228 * 229 * spellprog hstop 1 |\ 230 * spellprog hlista /dev/null 231 * 232 * for a complete scenario, see the shell file: spell. 233 * 234 */ 235 236 void 237 main(int argc, char **argv) 238 { 239 register char *ep, *cp; 240 register char *dp; 241 int fold; 242 int c, j; 243 int pass; 244 245 /* Set locale environment variables local definitions */ 246 (void) setlocale(LC_ALL, ""); 247 #if !defined(TEXT_DOMAIN) /* Should be defined by cc -D */ 248 #define TEXT_DOMAIN "SYS_TEST" /* Use this only if it wasn't */ 249 #endif 250 (void) textdomain(TEXT_DOMAIN); 251 252 253 prog = argv[0]; 254 while ((c = getopt(argc, argv, "bvx")) != EOF) { 255 switch (c) { 256 case 'b': 257 ise(); 258 break; 259 case 'v': 260 vflag++; 261 break; 262 case 'x': 263 xflag++; 264 break; 265 } 266 } 267 268 argc -= optind; 269 argv = &argv[optind]; 270 271 if ((argc < 2) || !prime(*argv)) { 272 (void) fprintf(stderr, 273 gettext("%s: cannot initialize hash table\n"), prog); 274 exit(1); 275 } 276 argc--; 277 argv++; 278 279 /* 280 * if pass is not 1, it is assumed to be a filename. 281 * found words are written to this file. 282 */ 283 pass = **argv; 284 if (pass != '1') 285 found = fopen(*argv, "w"); 286 287 for (;;) { 288 affix[0] = 0; 289 file = stdout; 290 for (ep = word; (*ep = j = getchar()) != '\n'; ep++) 291 if (j == EOF) 292 exit(0); 293 /* 294 * here is the hyphen processing. these words were found in the stop 295 * list. however, if they exist as is, (no derivations tried) in the 296 * dictionary, let them through as correct. 297 * 298 */ 299 if (ep[-1] == '-') { 300 *--ep = 0; 301 if (!tryword(word, ep, 0)) 302 (void) fprintf(file, "%s\n", word); 303 continue; 304 } 305 for (cp = word, dp = original; cp < ep; ) 306 *dp++ = *cp++; 307 *dp = 0; 308 fold = 0; 309 for (cp = word; cp < ep; cp++) 310 if (islower(*cp)) 311 goto lcase; 312 if (((ep - word) == 1) && 313 ((word[0] == 'A') || (word[0] == 'I'))) 314 continue; 315 if (trypref(ep, ".", 0)) 316 goto foundit; 317 ++fold; 318 for (cp = original+1, dp = word+1; dp < ep; dp++, cp++) 319 *dp = Tolower(*cp); 320 lcase: 321 if (((ep - word) == 1) && (word[0] == 'a')) 322 continue; 323 if (trypref(ep, ".", 0)||trysuff(ep, 0)) 324 goto foundit; 325 if (isupper(word[0])) { 326 for (cp = original, dp = word; *dp = *cp++; dp++) 327 if (fold) *dp = Tolower(*dp); 328 word[0] = Tolower(word[0]); 329 goto lcase; 330 } 331 (void) fprintf(file, "%s\n", original); 332 continue; 333 334 foundit: 335 if (pass == '1') 336 (void) fprintf(file, "%s-\n", original); 337 else if (affix[0] != 0 && affix[0] != '.') { 338 file = found; 339 (void) fprintf(file, "%s\t%s\n", affix, 340 original); 341 } 342 } 343 } 344 345 /* 346 * strip exactly one suffix and do 347 * indicated routine(s), which may recursively 348 * strip suffixes 349 */ 350 351 static int 352 trysuff(char *ep, int lev) 353 { 354 register struct suftab *t; 355 register char *cp, *sp; 356 357 lev += DLEV; 358 deriv[lev] = deriv[lev-1] = 0; 359 for (t = &suftab[0]; (sp = t->suf) != 0; t++) { 360 cp = ep; 361 while (*sp) 362 if (*--cp != *sp++) 363 goto next; 364 for (sp = cp; --sp >= word && !vowel(*sp); ); 365 if (sp < word) 366 return (0); 367 if ((*t->p1)(ep-t->n1, t->d1, t->a1, lev+1)) 368 return (1); 369 if (t->p2 != 0) { 370 deriv[lev] = deriv[lev+1] = 0; 371 return ((*t->p2)(ep-t->n2, t->d2, t->a2, lev)); 372 } 373 return (0); 374 next:; 375 } 376 return (0); 377 } 378 379 static int 380 nop(void) 381 { 382 return (0); 383 } 384 385 /* ARGSUSED */ 386 static int 387 strip(char *ep, char *d, char *a, int lev) 388 { 389 return (trypref(ep, a, lev)||trysuff(ep, lev)); 390 } 391 392 static int 393 s(char *ep, char *d, char *a, int lev) 394 { 395 if (lev > DLEV+1) 396 return (0); 397 if (*ep == 's' && ep[-1] == 's') 398 return (0); 399 return (strip(ep, d, a, lev)); 400 } 401 402 /* ARGSUSED */ 403 static int 404 an(char *ep, char *d, char *a, int lev) 405 { 406 if (!isupper(*word)) /* must be proper name */ 407 return (0); 408 return (trypref(ep, a, lev)); 409 } 410 411 /* ARGSUSED */ 412 static int 413 ize(char *ep, char *d, char *a, int lev) 414 { 415 ep[-1] = 'e'; 416 return (strip(ep, "", d, lev)); 417 } 418 419 /* ARGSUSED */ 420 static int 421 y_to_e(char *ep, char *d, char *a, int lev) 422 { 423 *ep++ = 'e'; 424 return (strip(ep, "", d, lev)); 425 } 426 427 static int 428 ily(char *ep, char *d, char *a, int lev) 429 { 430 if (ep[-1] == 'i') 431 return (i_to_y(ep, d, a, lev)); 432 else 433 return (strip(ep, d, a, lev)); 434 } 435 436 static int 437 bility(char *ep, char *d, char *a, int lev) 438 { 439 *ep++ = 'l'; 440 return (y_to_e(ep, d, a, lev)); 441 } 442 443 static int 444 i_to_y(char *ep, char *d, char *a, int lev) 445 { 446 if (ep[-1] == 'i') { 447 ep[-1] = 'y'; 448 a = d; 449 } 450 return (strip(ep, "", a, lev)); 451 } 452 453 static int 454 es(char *ep, char *d, char *a, int lev) 455 { 456 if (lev > DLEV) 457 return (0); 458 switch (ep[-1]) { 459 default: 460 return (0); 461 case 'i': 462 return (i_to_y(ep, d, a, lev)); 463 case 's': 464 case 'h': 465 case 'z': 466 case 'x': 467 return (strip(ep, d, a, lev)); 468 } 469 } 470 471 /* ARGSUSED */ 472 static int 473 subst(char *ep, char *d, char *a, int lev) 474 { 475 char *u, *t; 476 477 if (skipv(skipv(ep-1)) < word) 478 return (0); 479 for (t = d; *t != '+'; t++) 480 continue; 481 for (u = ep; *--t != '-'; ) 482 *--u = *t; 483 return (strip(ep, "", d, lev)); 484 } 485 486 487 static int 488 tion(char *ep, char *d, char *a, int lev) 489 { 490 switch (ep[-2]) { 491 case 'c': 492 case 'r': 493 return (trypref(ep, a, lev)); 494 case 'a': 495 return (y_to_e(ep, d, a, lev)); 496 } 497 return (0); 498 } 499 500 /* possible consonant-consonant-e ending */ 501 static int 502 CCe(char *ep, char *d, char *a, int lev) 503 { 504 switch (ep[-1]) { 505 case 'r': 506 if (ep[-2] == 't') 507 return (y_to_e(ep, d, a, lev)); 508 break; 509 case 'l': 510 if (vowel(ep[-2])) 511 break; 512 switch (ep[-2]) { 513 case 'l': 514 case 'r': 515 case 'w': 516 break; 517 default: 518 return (y_to_e(ep, d, a, lev)); 519 } 520 break; 521 case 's': 522 if (ep[-2] == 's') 523 break; 524 if (*ep == 'a') 525 return (0); 526 if (vowel(ep[-2])) 527 break; 528 if (y_to_e(ep, d, a, lev)) 529 return (1); 530 if (!(ep[-2] == 'n' && ep[-1] == 'g')) 531 return (0); 532 break; 533 case 'c': 534 case 'g': 535 if (*ep == 'a') 536 return (0); 537 if (vowel(ep[-2])) 538 break; 539 if (y_to_e(ep, d, a, lev)) 540 return (1); 541 if (!(ep[-2] == 'n' && ep[-1] == 'g')) 542 return (0); 543 break; 544 case 'v': 545 case 'z': 546 if (vowel(ep[-2])) 547 break; 548 if (y_to_e(ep, d, a, lev)) 549 return (1); 550 if (!(ep[-2] == 'n' && ep[-1] == 'g')) 551 return (0); 552 break; 553 case 'u': 554 if (y_to_e(ep, d, a, lev)) 555 return (1); 556 if (!(ep[-2] == 'n' && ep[-1] == 'g')) 557 return (0); 558 break; 559 } 560 return (VCe(ep, d, a, lev)); 561 } 562 563 /* possible consonant-vowel-consonant-e ending */ 564 static int 565 VCe(char *ep, char *d, char *a, int lev) 566 { 567 char c; 568 c = ep[-1]; 569 if (c == 'e') 570 return (0); 571 if (!vowel(c) && vowel(ep[-2])) { 572 c = *ep; 573 *ep++ = 'e'; 574 if (trypref(ep, d, lev)||trysuff(ep, lev)) 575 return (1); 576 ep--; 577 *ep = c; 578 } 579 return (strip(ep, d, a, lev)); 580 } 581 582 static char * 583 lookuppref(char **wp, char *ep) 584 { 585 register char **sp; 586 register char *bp, *cp; 587 588 for (sp = preftab; *sp; sp++) { 589 bp = *wp; 590 for (cp = *sp; *cp; cp++, bp++) 591 if (Tolower(*bp) != *cp) 592 goto next; 593 for (cp = bp; cp < ep; cp++) 594 if (vowel(*cp)) { 595 *wp = bp; 596 return (*sp); 597 } 598 next:; 599 } 600 return (0); 601 } 602 603 /* 604 * while word is not in dictionary try stripping 605 * prefixes. Fail if no more prefixes. 606 */ 607 static int 608 trypref(char *ep, char *a, int lev) 609 { 610 register char *cp; 611 char *bp; 612 register char *pp; 613 int val = 0; 614 char space[LINE_MAX * 2]; 615 deriv[lev] = a; 616 if (tryword(word, ep, lev)) 617 return (1); 618 bp = word; 619 pp = space; 620 deriv[lev+1] = pp; 621 while (cp = lookuppref(&bp, ep)) { 622 *pp++ = '+'; 623 while (*pp = *cp++) 624 pp++; 625 if (tryword(bp, ep, lev+1)) { 626 val = 1; 627 break; 628 } 629 } 630 deriv[lev+1] = deriv[lev+2] = 0; 631 return (val); 632 } 633 634 static int 635 tryword(char *bp, char *ep, int lev) 636 { 637 register i, j; 638 char duple[3]; 639 if (ep-bp <= 1) 640 return (0); 641 if (vowel(*ep)) { 642 if (monosyl(bp, ep)) 643 return (0); 644 } 645 i = dict(bp, ep); 646 if (i == 0 && vowel(*ep) && ep[-1] == ep[-2] && monosyl(bp, ep-1)) { 647 ep--; 648 deriv[++lev] = duple; 649 duple[0] = '+'; 650 duple[1] = *ep; 651 duple[2] = 0; 652 i = dict(bp, ep); 653 } 654 if (vflag == 0 || i == 0) 655 return (i); 656 /* 657 * when derivations are wanted, collect them 658 * for printing 659 */ 660 j = lev; 661 do { 662 if (deriv[j]) 663 (void) strcat(affix, deriv[j]); 664 } while (--j > 0); 665 return (i); 666 } 667 668 669 static int 670 monosyl(char *bp, char *ep) 671 { 672 if (ep < bp+2) 673 return (0); 674 if (vowel(*--ep) || !vowel(*--ep) || ep[1] == 'x' || ep[1] == 'w') 675 return (0); 676 while (--ep >= bp) 677 if (vowel(*ep)) 678 return (0); 679 return (1); 680 } 681 682 static char * 683 skipv(char *s) 684 { 685 if (s >= word&&vowel(*s)) 686 s--; 687 while (s >= word && !vowel(*s)) 688 s--; 689 return (s); 690 } 691 692 static int 693 vowel(int c) 694 { 695 switch (Tolower(c)) { 696 case 'a': 697 case 'e': 698 case 'i': 699 case 'o': 700 case 'u': 701 case 'y': 702 return (1); 703 } 704 return (0); 705 } 706 707 /* crummy way to Britishise */ 708 static void 709 ise(void) 710 { 711 register struct suftab *p; 712 713 for (p = suftab; p->suf; p++) { 714 ztos(p->suf); 715 ztos(p->d1); 716 ztos(p->a1); 717 } 718 } 719 720 static void 721 ztos(char *s) 722 { 723 for (; *s; s++) 724 if (*s == 'z') 725 *s = 's'; 726 } 727 728 static int 729 dict(char *bp, char *ep) 730 { 731 register temp, result; 732 if (xflag) 733 (void) fprintf(stdout, "=%.*s\n", ep-bp, bp); 734 temp = *ep; 735 *ep = 0; 736 result = hashlook(bp); 737 *ep = temp; 738 return (result); 739 } 740