xref: /titanic_51/usr/src/cmd/spell/spellprog.c (revision d3cf9c7d3cb6a89c5ee679d866610bc6baaf2c9a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
23 /*	  All Rights Reserved  	*/
24 
25 
26 /*
27  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
28  * Use is subject to license terms.
29  */
30 
31 #pragma ident	"%Z%%M%	%I%	%E% SMI"
32 
33 #include <stdlib.h>
34 #include <unistd.h>
35 #include <limits.h>
36 #include <string.h>
37 #include <stdio.h>
38 #include <ctype.h>
39 #include <locale.h>
40 #include "hash.h"
41 
/*
 * Tolower(c): lower-case c if it is an upper-case letter, otherwise yield
 * c unchanged.  The argument is evaluated more than once, so it must not
 * have side effects.  The ctype calls are given an unsigned char value so
 * that a plain char holding a negative value is not passed to them.
 */
#define	Tolower(c) \
	(isupper((unsigned char)(c)) ? tolower((unsigned char)(c)) : (c))

/* amount by which trysuff() bumps the derivation level for each suffix */
#define	DLEV 2
44 
/*
 * Forward declarations.
 *
 * The suffix actions share one calling shape: ep is the proposed end of
 * the stem inside word[], d and a are the two derivation notes taken from
 * the suffix table entry, and lev is the current derivation level.
 */
static int	ily(char *ep, char *d, char *a, int lev);
static int	s(char *ep, char *d, char *a, int lev);
static int	es(char *ep, char *d, char *a, int lev);
static int	subst(char *ep, char *d, char *a, int lev);
static int	nop(void);
static int	bility(char *ep, char *d, char *a, int lev);
static int	i_to_y(char *ep, char *d, char *a, int lev);
static int	CCe(char *ep, char *d, char *a, int lev);
static int	y_to_e(char *ep, char *d, char *a, int lev);
static int	strip(char *ep, char *d, char *a, int lev);
static int	ize(char *ep, char *d, char *a, int lev);
static int	tion(char *ep, char *d, char *a, int lev);
static int	an(char *ep, char *d, char *a, int lev);
int		prime(char *);
static void	ise(void);
static int	tryword(char *bp, char *ep, int lev);
static int	trypref(char *ep, char *a, int lev);
static int	trysuff(char *ep, int lev);
static int	vowel(int c);
static int	dict(char *bp, char *ep);
static int	monosyl(char *bp, char *ep);
static int	VCe(char *ep, char *d, char *a, int lev);
static char	*skipv(char *s);
static void	ztos(char *s);
72 
73 static struct suftab {
74 	char *suf;
75 	int (*p1)();
76 	int n1;
77 	char *d1;
78 	char *a1;
79 	int (*p2)();
80 	int n2;
81 	char *d2;
82 	char *a2;
83 } suftab[] = {
84 	{"ssen", ily, 4, "-y+iness", "+ness" },
85 	{"ssel", ily, 4, "-y+i+less", "+less" },
86 	{"se", s, 1, "", "+s", 	es, 2, "-y+ies", "+es" },
87 	{"s'", s, 2, "", "+'s"},
88 	{"s", s, 1, "", "+s"},
89 	{"ecn", subst, 1, "-t+ce", ""},
90 	{"ycn", subst, 1, "-t+cy", ""},
91 	{"ytilb", nop, 0, "", ""},
92 	{"ytilib", bility, 5, "-le+ility", ""},
93 	{"elbaif", i_to_y, 4, "-y+iable", ""},
94 	{"elba", CCe, 4, "-e+able", "+able"},
95 	{"yti", CCe, 3, "-e+ity", "+ity"},
96 	{"ylb", y_to_e, 1, "-e+y", ""},
97 	{"yl", ily, 2, "-y+ily", "+ly"},
98 	{"laci", strip, 2, "", "+al"},
99 	{"latnem", strip, 2, "", "+al"},
100 	{"lanoi", strip, 2, "", "+al"},
101 	{"tnem", strip, 4, "", "+ment"},
102 	{"gni", CCe, 3, "-e+ing", "+ing"},
103 	{"reta", nop, 0, "", ""},
104 	{"retc", nop, 0, "", ""},
105 	{"re", strip, 1, "", "+r", i_to_y, 2, "-y+ier", "+er"},
106 	{"de", strip, 1, "", "+d", i_to_y, 2, "-y+ied", "+ed"},
107 	{"citsi", strip, 2, "", "+ic"},
108 	{"citi", ize, 1, "-ic+e", ""},
109 	{"cihparg", i_to_y, 1, "-y+ic", ""},
110 	{"tse", strip, 2, "", "+st", 	i_to_y, 3, "-y+iest", "+est"},
111 	{"cirtem", i_to_y, 1, "-y+ic", ""},
112 	{"yrtem", subst, 0, "-er+ry", ""},
113 	{"cigol", i_to_y, 1, "-y+ic", ""},
114 	{"tsigol", i_to_y, 2, "-y+ist", ""},
115 	{"tsi", CCe, 3, "-e+ist", "+ist"},
116 	{"msi", CCe, 3, "-e+ism", "+ist"},
117 	{"noitacifi", i_to_y, 6, "-y+ication", ""},
118 	{"noitazi", ize, 4, "-e+ation", ""},
119 	{"rota", tion, 2, "-e+or", ""},
120 	{"rotc", tion, 2, "", "+or"},
121 	{"noit", tion, 3, "-e+ion", "+ion"},
122 	{"naino", an, 3, "", "+ian"},
123 	{"na", an, 1, "", "+n"},
124 	{"evi", subst, 0, "-ion+ive", ""},
125 	{"ezi", CCe, 3, "-e+ize", "+ize"},
126 	{"pihs", strip, 4, "", "+ship"},
127 	{"dooh", ily, 4, "-y+ihood", "+hood"},
128 	{"luf", ily, 3, "-y+iful", "+ful"},
129 	{"ekil", strip, 4, "", "+like"},
130 	0
131 };
132 
/*
 * Prefixes that lookuppref() may strip from the front of a word.  The
 * table is scanned in order and the first usable match wins, so a prefix
 * must be listed before any shorter prefix it begins with.  The list is
 * terminated by a null pointer.
 */
static char *preftab[] = {
	"anti", "auto", "bio", "counter", "dis",
	"electro", "en", "fore", "geo", "hyper",
	"intra", "inter", "iso", "kilo", "magneto",
	"meta", "micro", "mid", "milli", "mis",
	"mono", "multi", "non", "out", "over",
	"photo", "poly", "pre", "pseudo", "psycho",
	"re", "semi", "stereo", "sub", "super",
	"tele", "thermo", "ultra",
	"under",	/* must precede un */
	"un",
	NULL
};
176 
/* -v given: tryword() collects derivation notes into affix */
static int vflag;
/* -x given: dict() prints every candidate it looks up */
static int xflag;
/* argv[0], used in error messages */
static char *prog;
/* the word being examined; suffix routines edit it in place */
static char word[LINE_MAX];
/* untouched copy of the input word, used for output */
static char original[LINE_MAX];
/* derivation notes, indexed by derivation level */
static char *deriv[LINE_MAX];
/* notes for the current word, concatenated by tryword() */
static char affix[LINE_MAX];
/* stream the current result is written to */
static FILE *file;
/* stream opened on the "pass" argument when it is not "1" */
static FILE *found;
185 /*
186  *	deriv is stack of pointers to notes like +micro +ed
187  *	affix is concatenated string of notes
188  *	buffers are sized by LINE_MAX; the old size 141 stemmed from original and affix.
189  */
190 
191 /*
192  *	in an attempt to defray future maintenance misunderstandings, here is
193  *	an attempt to describe the input/output expectations of the spell
194  *	program.
195  *
196  *	spellprog is intended to be called from the shell file spell.
197  *	because of this, there is little error checking (this is historical, not
198  *	necessarily advisable).
199  *
200  *	spellprog options hashed-list pass
201  *
202  *	the hashed-list is a list of the form made by spellin.
203  *	there are 2 types of hashed lists:
204  *		1. a stop list: this specifies words that by the rules embodied
205  *		   in spellprog would be recognized as correct, BUT are really
206  *		   errors.
207  *		2. a dictionary of correctly spelled words.
208  *	the pass number determines how the words found in the specified
209  *	hashed-list are treated. If the pass number is 1, the hashed-list is
210  *	treated as the stop-list, otherwise, it is treated as the regular
211  *	dictionary list. in this case, the value of "pass" is a filename. Found
212  *	words are written to this file.
213  *
214  *	In the normal case, the filename = /dev/null. However, if the v option
215  *	is specified, the derivations are written to this file.
216  *	The spellprog looks up words in the hashed-list; if a word is found, it
217  *	is printed to the stdout. If the hashed-list was the stop-list, the
218  *	words found are presumed to be misspellings. in this case,
219  *	a control character is printed ( a "-" is appended to the word.
220  *	a hyphen will never occur naturally in the input list because deroff
221  *	is used in the shell file before calling spellprog.)
222  *	If the regular spelling list was used (hlista or hlistb), the words
223  *	are correct, and may be ditched. (unless the -v option was used -
224  *	see the manual page).
225  *
226  *	spellprog should be called twice : first with the stop-list, to flag all
227  *	a priori incorrectly spelled words; second with the dictionary.
228  *
229  *	spellprog hstop 1 |\
230  *	spellprog hlista /dev/null
231  *
232  *	for a complete scenario, see the shell file: spell.
233  *
234  */
235 
236 void
237 main(int argc, char **argv)
238 {
239 	register char *ep, *cp;
240 	register char *dp;
241 	int fold;
242 	int c, j;
243 	int pass;
244 
245 	/* Set locale environment variables local definitions */
246 	(void) setlocale(LC_ALL, "");
247 #if !defined(TEXT_DOMAIN)	/* Should be defined by cc -D */
248 #define	TEXT_DOMAIN "SYS_TEST"	/* Use this only if it wasn't */
249 #endif
250 	(void) textdomain(TEXT_DOMAIN);
251 
252 
253 	prog = argv[0];
254 	while ((c = getopt(argc, argv, "bvx")) != EOF) {
255 		switch (c) {
256 		case 'b':
257 			ise();
258 			break;
259 		case 'v':
260 			vflag++;
261 			break;
262 		case 'x':
263 			xflag++;
264 			break;
265 		}
266 	}
267 
268 	argc -= optind;
269 	argv = &argv[optind];
270 
271 	if ((argc < 2) || !prime(*argv)) {
272 		(void) fprintf(stderr,
273 		    gettext("%s: cannot initialize hash table\n"), prog);
274 		exit(1);
275 	}
276 	argc--;
277 	argv++;
278 
279 /*
280  *	if pass is not 1, it is assumed to be a filename.
281  *	found words are written to this file.
282  */
283 	pass = **argv;
284 	if (pass != '1')
285 		found = fopen(*argv, "w");
286 
287 	for (;;) {
288 		affix[0] = 0;
289 		file = stdout;
290 		for (ep = word; (*ep = j = getchar()) != '\n'; ep++)
291 			if (j == EOF)
292 				exit(0);
293 /*
294  *	here is the hyphen processing. these words were found in the stop
295  *	list. however, if they exist as is, (no derivations tried) in the
296  *	dictionary, let them through as correct.
297  *
298  */
299 		if (ep[-1] == '-') {
300 			*--ep = 0;
301 			if (!tryword(word, ep, 0))
302 				(void) fprintf(file, "%s\n", word);
303 			continue;
304 		}
305 		for (cp = word, dp = original; cp < ep; )
306 			*dp++ = *cp++;
307 		*dp = 0;
308 		fold = 0;
309 		for (cp = word; cp < ep; cp++)
310 			if (islower(*cp))
311 				goto lcase;
312 		if (((ep - word) == 1) &&
313 		    ((word[0] == 'A') || (word[0] == 'I')))
314 			continue;
315 		if (trypref(ep, ".", 0))
316 			goto foundit;
317 		++fold;
318 		for (cp = original+1, dp = word+1; dp < ep; dp++, cp++)
319 			*dp = Tolower(*cp);
320 lcase:
321 		if (((ep - word) == 1) && (word[0] == 'a'))
322 			continue;
323 		if (trypref(ep, ".", 0)||trysuff(ep, 0))
324 			goto foundit;
325 		if (isupper(word[0])) {
326 			for (cp = original, dp = word; *dp = *cp++; dp++)
327 				if (fold) *dp = Tolower(*dp);
328 			word[0] = Tolower(word[0]);
329 			goto lcase;
330 		}
331 		(void) fprintf(file, "%s\n", original);
332 		continue;
333 
334 foundit:
335 		if (pass == '1')
336 			(void) fprintf(file, "%s-\n", original);
337 		else if (affix[0] != 0 && affix[0] != '.') {
338 			file = found;
339 			(void) fprintf(file, "%s\t%s\n", affix,
340 			    original);
341 		}
342 	}
343 }
344 
345 /*
346  *	strip exactly one suffix and do
347  *	indicated routine(s), which may recursively
348  *	strip suffixes
349  */
350 
351 static int
352 trysuff(char *ep, int lev)
353 {
354 	register struct suftab	*t;
355 	register char *cp, *sp;
356 
357 	lev += DLEV;
358 	deriv[lev] = deriv[lev-1] = 0;
359 	for (t = &suftab[0]; (sp = t->suf) != 0; t++) {
360 		cp = ep;
361 		while (*sp)
362 			if (*--cp != *sp++)
363 				goto next;
364 		for (sp = cp; --sp >= word && !vowel(*sp); );
365 		if (sp < word)
366 			return (0);
367 		if ((*t->p1)(ep-t->n1, t->d1, t->a1, lev+1))
368 			return (1);
369 		if (t->p2 != 0) {
370 			deriv[lev] = deriv[lev+1] = 0;
371 			return ((*t->p2)(ep-t->n2, t->d2, t->a2, lev));
372 		}
373 		return (0);
374 next:;
375 	}
376 	return (0);
377 }
378 
/*
 * Suffix action that never succeeds.  Because trysuff() tries only the
 * first matching table entry, listing an ending with this action stops
 * any further suffix stripping for words with that ending.
 */
static int
nop(void)
{
	return (0);
}
384 
/*
 * Accept the word ending at ep if it is found as is (possibly after
 * prefix stripping, recording note a) or after stripping a further
 * suffix.  d is unused.  Returns 1 if found, else 0.
 */
/* ARGSUSED */
static int
strip(char *ep, char *d, char *a, int lev)
{
	if (trypref(ep, a, lev))
		return (1);
	if (trysuff(ep, lev))
		return (1);
	return (0);
}
391 
392 static int
393 s(char *ep, char *d, char *a, int lev)
394 {
395 	if (lev > DLEV+1)
396 		return (0);
397 	if (*ep == 's' && ep[-1] == 's')
398 		return (0);
399 	return (strip(ep, d, a, lev));
400 }
401 
402 /* ARGSUSED */
403 static int
404 an(char *ep, char *d, char *a, int lev)
405 {
406 	if (!isupper(*word))	/* must be proper name */
407 		return (0);
408 	return (trypref(ep, a, lev));
409 }
410 
/*
 * Overwrite the last character of the stem with 'e' and look the result
 * up via strip(), recording d as the derivation note.  a is unused.
 */
/* ARGSUSED */
static int
ize(char *ep, char *d, char *a, int lev)
{
	char *last = ep - 1;

	*last = 'e';
	return (strip(ep, "", d, lev));
}
418 
/*
 * Append an 'e' to the stem (overwriting the character at ep) and look
 * the lengthened stem up via strip(), recording d as the derivation
 * note.  a is unused.
 */
/* ARGSUSED */
static int
y_to_e(char *ep, char *d, char *a, int lev)
{
	ep[0] = 'e';
	return (strip(ep + 1, "", d, lev));
}
426 
/*
 * If the stem ends in 'i', treat it as a 'y' that was changed to 'i'
 * (i_to_y()); otherwise just strip the suffix (strip()).
 */
static int
ily(char *ep, char *d, char *a, int lev)
{
	return ((ep[-1] == 'i') ?
	    i_to_y(ep, d, a, lev) : strip(ep, d, a, lev));
}
435 
/*
 * Put an 'l' at ep and let y_to_e() append the 'e' after it, so that the
 * stem ends in "le" before it is looked up.
 */
static int
bility(char *ep, char *d, char *a, int lev)
{
	ep[0] = 'l';
	return (y_to_e(ep + 1, d, a, lev));
}
442 
/*
 * If the stem ends in 'i', change that letter to 'y' and use d as the
 * derivation note; otherwise leave the stem alone and use a.  The stem
 * is then looked up via strip().
 */
static int
i_to_y(char *ep, char *d, char *a, int lev)
{
	char *note = a;

	if (ep[-1] == 'i') {
		ep[-1] = 'y';
		note = d;
	}
	return (strip(ep, "", note, lev));
}
452 
453 static int
454 es(char *ep, char *d, char *a, int lev)
455 {
456 	if (lev > DLEV)
457 		return (0);
458 	switch (ep[-1]) {
459 	default:
460 		return (0);
461 	case 'i':
462 		return (i_to_y(ep, d, a, lev));
463 	case 's':
464 	case 'h':
465 	case 'z':
466 	case 'x':
467 		return (strip(ep, d, a, lev));
468 	}
469 }
470 
471 /* ARGSUSED */
472 static int
473 subst(char *ep, char *d, char *a, int lev)
474 {
475 	char *u, *t;
476 
477 	if (skipv(skipv(ep-1)) < word)
478 		return (0);
479 	for (t = d; *t != '+'; t++)
480 		continue;
481 	for (u = ep; *--t != '-'; )
482 		*--u = *t;
483 	return (strip(ep, "", d, lev));
484 }
485 
486 
/*
 * Action for "-tion"/"-tor" style endings, chosen by the letter two
 * positions before ep: 'c' or 'r' looks the stem up with prefix
 * stripping only, 'a' retries with an 'e' appended, anything else fails.
 */
static int
tion(char *ep, char *d, char *a, int lev)
{
	char key = ep[-2];

	if (key == 'c' || key == 'r')
		return (trypref(ep, a, lev));
	if (key == 'a')
		return (y_to_e(ep, d, a, lev));
	return (0);
}
499 
/*
 * possible consonant-consonant-e ending
 *
 * Decides, from the last two letters of the stem, whether to retry the
 * stem with an 'e' appended (y_to_e()), to reject it outright, or to hand
 * it on to VCe().  The cases below fall through on purpose: each later
 * label performs a tail of the checks needed by the labels above it.
 */
static int
CCe(char *ep, char *d, char *a, int lev)
{
	switch (ep[-1]) {
	case 'r':
		if (ep[-2] == 't')
			return (y_to_e(ep, d, a, lev));
		break;
	case 'l':
		/* consonant other than l, r or w before the 'l' */
		if (!vowel(ep[-2]) && ep[-2] != 'l' && ep[-2] != 'r' &&
		    ep[-2] != 'w')
			return (y_to_e(ep, d, a, lev));
		break;
	case 's':
		if (ep[-2] == 's')
			break;
		/* FALLTHROUGH */
	case 'c':
	case 'g':
		/* the suffix that follows must not begin with 'a' */
		if (*ep == 'a')
			return (0);
		/* FALLTHROUGH */
	case 'v':
	case 'z':
		if (vowel(ep[-2]))
			break;
		/* FALLTHROUGH */
	case 'u':
		if (y_to_e(ep, d, a, lev))
			return (1);
		/* only a stem ending in "ng" gets a further try */
		if (ep[-2] != 'n' || ep[-1] != 'g')
			return (0);
		break;
	}
	return (VCe(ep, d, a, lev));
}
562 
/*
 * possible consonant-vowel-consonant-e ending
 *
 * A stem already ending in 'e' is rejected.  If the stem ends in a vowel
 * followed by a consonant, it is first tried with an 'e' appended (note
 * d); if that fails the overwritten character is put back.  Finally the
 * stem is tried unchanged via strip().
 */
static int
VCe(char *ep, char *d, char *a, int lev)
{
	char saved;

	if (ep[-1] == 'e')
		return (0);
	if (!vowel(ep[-1]) && vowel(ep[-2])) {
		saved = *ep;
		*ep = 'e';
		if (trypref(ep + 1, d, lev) || trysuff(ep + 1, lev))
			return (1);
		*ep = saved;
	}
	return (strip(ep, d, a, lev));
}
581 
582 static char *
583 lookuppref(char **wp, char *ep)
584 {
585 	register char **sp;
586 	register char *bp, *cp;
587 
588 	for (sp = preftab; *sp; sp++) {
589 		bp = *wp;
590 		for (cp = *sp; *cp; cp++, bp++)
591 			if (Tolower(*bp) != *cp)
592 				goto next;
593 		for (cp = bp; cp < ep; cp++)
594 			if (vowel(*cp)) {
595 				*wp = bp;
596 				return (*sp);
597 			}
598 next:;
599 	}
600 	return (0);
601 }
602 
603 /*
604  *	while word is not in dictionary try stripping
605  *	prefixes. Fail if no more prefixes.
606  */
607 static int
608 trypref(char *ep, char *a, int lev)
609 {
610 	register char *cp;
611 	char *bp;
612 	register char *pp;
613 	int val = 0;
614 	char space[LINE_MAX * 2];
615 	deriv[lev] = a;
616 	if (tryword(word, ep, lev))
617 		return (1);
618 	bp = word;
619 	pp = space;
620 	deriv[lev+1] = pp;
621 	while (cp = lookuppref(&bp, ep)) {
622 		*pp++ = '+';
623 		while (*pp = *cp++)
624 			pp++;
625 		if (tryword(bp, ep, lev+1)) {
626 			val = 1;
627 			break;
628 		}
629 	}
630 	deriv[lev+1] = deriv[lev+2] = 0;
631 	return (val);
632 }
633 
634 static int
635 tryword(char *bp, char *ep, int lev)
636 {
637 	register i, j;
638 	char duple[3];
639 	if (ep-bp <= 1)
640 		return (0);
641 	if (vowel(*ep)) {
642 		if (monosyl(bp, ep))
643 			return (0);
644 	}
645 	i = dict(bp, ep);
646 	if (i == 0 && vowel(*ep) && ep[-1] == ep[-2] && monosyl(bp, ep-1)) {
647 		ep--;
648 		deriv[++lev] = duple;
649 		duple[0] = '+';
650 		duple[1] = *ep;
651 		duple[2] = 0;
652 		i = dict(bp, ep);
653 	}
654 	if (vflag == 0 || i == 0)
655 		return (i);
656 	/*
657 	 *	when derivations are wanted, collect them
658 	 *	for printing
659 	 */
660 	j = lev;
661 	do {
662 		if (deriv[j])
663 			(void) strcat(affix, deriv[j]);
664 	} while (--j > 0);
665 	return (i);
666 }
667 
668 
/*
 * Return 1 if the text from bp up to (not including) ep consists of
 * zero or more consonants, then exactly one vowel, then a single final
 * consonant other than 'x' or 'w'; otherwise return 0.  Text shorter
 * than two characters yields 0.
 */
static int
monosyl(char *bp, char *ep)
{
	char *p;

	if (ep - bp < 2)
		return (0);
	if (vowel(ep[-1]))
		return (0);
	if (!vowel(ep[-2]))
		return (0);
	if (ep[-1] == 'x' || ep[-1] == 'w')
		return (0);
	/* nothing before the final vowel may be a vowel */
	for (p = ep - 3; p >= bp; p--) {
		if (vowel(*p))
			return (0);
	}
	return (1);
}
681 
682 static char *
683 skipv(char *s)
684 {
685 	if (s >= word&&vowel(*s))
686 		s--;
687 	while (s >= word && !vowel(*s))
688 		s--;
689 	return (s);
690 }
691 
/*
 * Return 1 if c is one of a, e, i, o, u or y in either case, else 0.
 */
static int
vowel(int c)
{
	int lc = isupper(c) ? tolower(c) : c;

	return (lc == 'a' || lc == 'e' || lc == 'i' ||
	    lc == 'o' || lc == 'u' || lc == 'y');
}
706 
707 /* crummy way to Britishise */
708 static void
709 ise(void)
710 {
711 	register struct suftab *p;
712 
713 	for (p = suftab; p->suf; p++) {
714 		ztos(p->suf);
715 		ztos(p->d1);
716 		ztos(p->a1);
717 	}
718 }
719 
/*
 * Replace every 'z' in the NUL-terminated string s with 's', in place.
 */
static void
ztos(char *s)
{
	char *z;

	for (z = strchr(s, 'z'); z != NULL; z = strchr(z + 1, 'z'))
		*z = 's';
}
727 
728 static int
729 dict(char *bp, char *ep)
730 {
731 	register temp, result;
732 	if (xflag)
733 		(void) fprintf(stdout, "=%.*s\n", ep-bp, bp);
734 	temp = *ep;
735 	*ep = 0;
736 	result = hashlook(bp);
737 	*ep = temp;
738 	return (result);
739 }
740