xref: /illumos-gate/usr/src/cmd/spell/spellprog.c (revision 814a60b13c0ad90e5d2edfd29a7a84bbf416cc1a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved  	*/
29 
30 #pragma ident	"%Z%%M%	%I%	%E% SMI"
31 
32 #include <stdlib.h>
33 #include <unistd.h>
34 #include <limits.h>
35 #include <string.h>
36 #include <stdio.h>
37 #include <ctype.h>
38 #include <locale.h>
39 #include "hash.h"
40 
/*
 * Tolower(c) yields the lower-case form of c when c is an upper-case
 * letter and c itself otherwise.  The argument is evaluated more than
 * once, so it must not have side effects.  The value handed to the
 * <ctype.h> routines is converted to unsigned char first, because those
 * routines are only defined for EOF and unsigned char values and the
 * words processed here are held in plain (possibly signed) char.
 */
#define	Tolower(c) \
	(isupper((unsigned char)(c)) ? tolower((unsigned char)(c)) : (c))

/* amount by which trysuff() advances the derivation level per suffix */
#define	DLEV 2
43 
/*
 * Forward declarations.
 *
 * The suffix routines all share one shape: ep points just past the
 * candidate stem inside word[], d and a are the two derivation notes
 * taken from the suffix table, and lev is the current derivation level.
 */

/* suffix routines referenced from suftab[] */
static int	ily(char *ep, char *d, char *a, int lev);
static int	s(char *ep, char *d, char *a, int lev);
static int	es(char *ep, char *d, char *a, int lev);
static int	subst(char *ep, char *d, char *a, int lev);
static int	nop(void);
static int	bility(char *ep, char *d, char *a, int lev);
static int	i_to_y(char *ep, char *d, char *a, int lev);
static int	CCe(char *ep, char *d, char *a, int lev);
static int	y_to_e(char *ep, char *d, char *a, int lev);
static int	strip(char *ep, char *d, char *a, int lev);
static int	ize(char *ep, char *d, char *a, int lev);
static int	tion(char *ep, char *d, char *a, int lev);
static int	an(char *ep, char *d, char *a, int lev);
static int	VCe(char *ep, char *d, char *a, int lev);

/* hash table initialisation (not defined in this file) */
int		prime(char *);

/* option handling */
static void	ise(void);
static void	ztos(char *s);

/* lookup machinery */
static int	tryword(char *bp, char *ep, int lev);
static int	trypref(char *ep, char *a, int lev);
static int	trysuff(char *ep, int lev);
static int	dict(char *bp, char *ep);

/* small helpers */
static int	vowel(int c);
static int	monosyl(char *bp, char *ep);
static char	*skipv(char *s);
71 
72 static struct suftab {
73 	char *suf;
74 	int (*p1)();
75 	int n1;
76 	char *d1;
77 	char *a1;
78 	int (*p2)();
79 	int n2;
80 	char *d2;
81 	char *a2;
82 } suftab[] = {
83 	{"ssen", ily, 4, "-y+iness", "+ness" },
84 	{"ssel", ily, 4, "-y+i+less", "+less" },
85 	{"se", s, 1, "", "+s", 	es, 2, "-y+ies", "+es" },
86 	{"s'", s, 2, "", "+'s"},
87 	{"s", s, 1, "", "+s"},
88 	{"ecn", subst, 1, "-t+ce", ""},
89 	{"ycn", subst, 1, "-t+cy", ""},
90 	{"ytilb", nop, 0, "", ""},
91 	{"ytilib", bility, 5, "-le+ility", ""},
92 	{"elbaif", i_to_y, 4, "-y+iable", ""},
93 	{"elba", CCe, 4, "-e+able", "+able"},
94 	{"yti", CCe, 3, "-e+ity", "+ity"},
95 	{"ylb", y_to_e, 1, "-e+y", ""},
96 	{"yl", ily, 2, "-y+ily", "+ly"},
97 	{"laci", strip, 2, "", "+al"},
98 	{"latnem", strip, 2, "", "+al"},
99 	{"lanoi", strip, 2, "", "+al"},
100 	{"tnem", strip, 4, "", "+ment"},
101 	{"gni", CCe, 3, "-e+ing", "+ing"},
102 	{"reta", nop, 0, "", ""},
103 	{"retc", nop, 0, "", ""},
104 	{"re", strip, 1, "", "+r", i_to_y, 2, "-y+ier", "+er"},
105 	{"de", strip, 1, "", "+d", i_to_y, 2, "-y+ied", "+ed"},
106 	{"citsi", strip, 2, "", "+ic"},
107 	{"citi", ize, 1, "-ic+e", ""},
108 	{"cihparg", i_to_y, 1, "-y+ic", ""},
109 	{"tse", strip, 2, "", "+st", 	i_to_y, 3, "-y+iest", "+est"},
110 	{"cirtem", i_to_y, 1, "-y+ic", ""},
111 	{"yrtem", subst, 0, "-er+ry", ""},
112 	{"cigol", i_to_y, 1, "-y+ic", ""},
113 	{"tsigol", i_to_y, 2, "-y+ist", ""},
114 	{"tsi", CCe, 3, "-e+ist", "+ist"},
115 	{"msi", CCe, 3, "-e+ism", "+ist"},
116 	{"noitacifi", i_to_y, 6, "-y+ication", ""},
117 	{"noitazi", ize, 4, "-e+ation", ""},
118 	{"rota", tion, 2, "-e+or", ""},
119 	{"rotc", tion, 2, "", "+or"},
120 	{"noit", tion, 3, "-e+ion", "+ion"},
121 	{"naino", an, 3, "", "+ian"},
122 	{"na", an, 1, "", "+n"},
123 	{"evi", subst, 0, "-ion+ive", ""},
124 	{"ezi", CCe, 3, "-e+ize", "+ize"},
125 	{"pihs", strip, 4, "", "+ship"},
126 	{"dooh", ily, 4, "-y+ihood", "+hood"},
127 	{"luf", ily, 3, "-y+iful", "+ful"},
128 	{"ekil", strip, 4, "", "+like"},
129 	0
130 };
131 
/*
 * Prefixes that lookuppref() may remove from the front of a word.
 * The list is scanned in order and ends with a null pointer.
 */
static char *preftab[] = {
	"anti", "auto",
	"bio",
	"counter",
	"dis",
	"electro", "en",
	"fore",
	"geo",
	"hyper",
	"intra", "inter", "iso",
	"kilo",
	"magneto", "meta", "micro", "mid", "milli", "mis", "mono", "multi",
	"non",
	"out", "over",
	"photo", "poly", "pre", "pseudo", "psycho",
	"re",
	"semi", "stereo", "sub", "super",
	"tele", "thermo",
	"ultra",
	"under",	/* must precede un */
	"un",
	NULL
};
175 
176 static int vflag;
177 static int xflag;
178 static char *prog;
179 static char word[LINE_MAX];
180 static char original[LINE_MAX];
181 static char *deriv[LINE_MAX];
182 static char affix[LINE_MAX];
183 static FILE *file, *found;
184 /*
185  *	deriv is stack of pointers to notes like +micro +ed
186  *	affix is concatenated string of notes
187  *	the buffer size 141 stems from the sizes of original and affix.
188  */
189 
190 /*
191  *	in an attempt to defray future maintenance misunderstandings, here is
192  *	an attempt to describe the input/output expectations of the spell
193  *	program.
194  *
195  *	spellprog is intended to be called from the shell file spell.
196  *	because of this, there is little error checking (this is historical, not
197  *	necessarily advisable).
198  *
199  *	spellprog options hashed-list pass
200  *
201  *	the hashed-list is a list of the form made by spellin.
202  *	there are 2 types of hashed lists:
203  *		1. a stop list: this specifies words that by the rules embodied
204  *		   in spellprog would be recognized as correct, BUT are really
205  *		   errors.
206  *		2. a dictionary of correctly spelled words.
207  *	the pass number determines how the words found in the specified
208  *	hashed-list are treated. If the pass number is 1, the hashed-list is
209  *	treated as the stop-list, otherwise, it is treated as the regular
210  *	dictionary list. in this case, the value of "pass" is a filename. Found
211  *	words are written to this file.
212  *
213  *	In the normal case, the filename = /dev/null. However, if the v option
214  *	is specified, the derivations are written to this file.
215  *	The spellprog looks up words in the hashed-list; if a word is found, it
216  *	is printed to the stdout. If the hashed-list was the stop-list, the
 *	words found are presumed to be misspellings. In this case a marker
 *	is printed: a "-" is appended to the word. (A hyphen will never occur
 *	naturally in the input list because deroff is used in the shell file
 *	before calling spellprog.)
 *	If the regular spelling list was used (hlista or hlistb), the words
 *	are correct, and may be ditched (unless the -v option was used -
 *	see the manual page).
224  *
225  *	spellprog should be called twice : first with the stop-list, to flag all
226  *	a priori incorrectly spelled words; second with the dictionary.
227  *
228  *	spellprog hstop 1 |\
229  *	spellprog hlista /dev/null
230  *
231  *	for a complete scenario, see the shell file: spell.
232  *
233  */
234 
235 int
236 main(int argc, char **argv)
237 {
238 	char *ep, *cp;
239 	char *dp;
240 	int fold;
241 	int c, j;
242 	int pass;
243 
244 	/* Set locale environment variables local definitions */
245 	(void) setlocale(LC_ALL, "");
246 #if !defined(TEXT_DOMAIN)	/* Should be defined by cc -D */
247 #define	TEXT_DOMAIN "SYS_TEST"	/* Use this only if it wasn't */
248 #endif
249 	(void) textdomain(TEXT_DOMAIN);
250 
251 
252 	prog = argv[0];
253 	while ((c = getopt(argc, argv, "bvx")) != EOF) {
254 		switch (c) {
255 		case 'b':
256 			ise();
257 			break;
258 		case 'v':
259 			vflag++;
260 			break;
261 		case 'x':
262 			xflag++;
263 			break;
264 		}
265 	}
266 
267 	argc -= optind;
268 	argv = &argv[optind];
269 
270 	if ((argc < 2) || !prime(*argv)) {
271 		(void) fprintf(stderr,
272 		    gettext("%s: cannot initialize hash table\n"), prog);
273 		exit(1);
274 	}
275 	argc--;
276 	argv++;
277 
278 /*
279  *	if pass is not 1, it is assumed to be a filename.
280  *	found words are written to this file.
281  */
282 	pass = **argv;
283 	if (pass != '1')
284 		found = fopen(*argv, "w");
285 
286 	for (;;) {
287 		affix[0] = 0;
288 		file = stdout;
289 		for (ep = word; (*ep = j = getchar()) != '\n'; ep++)
290 			if (j == EOF)
291 				exit(0);
292 /*
293  *	here is the hyphen processing. these words were found in the stop
294  *	list. however, if they exist as is, (no derivations tried) in the
295  *	dictionary, let them through as correct.
296  *
297  */
298 		if (ep[-1] == '-') {
299 			*--ep = 0;
300 			if (!tryword(word, ep, 0))
301 				(void) fprintf(file, "%s\n", word);
302 			continue;
303 		}
304 		for (cp = word, dp = original; cp < ep; )
305 			*dp++ = *cp++;
306 		*dp = 0;
307 		fold = 0;
308 		for (cp = word; cp < ep; cp++)
309 			if (islower(*cp))
310 				goto lcase;
311 		if (((ep - word) == 1) &&
312 		    ((word[0] == 'A') || (word[0] == 'I')))
313 			continue;
314 		if (trypref(ep, ".", 0))
315 			goto foundit;
316 		++fold;
317 		for (cp = original+1, dp = word+1; dp < ep; dp++, cp++)
318 			*dp = Tolower(*cp);
319 lcase:
320 		if (((ep - word) == 1) && (word[0] == 'a'))
321 			continue;
322 		if (trypref(ep, ".", 0)||trysuff(ep, 0))
323 			goto foundit;
324 		if (isupper(word[0])) {
325 			for (cp = original, dp = word; *dp = *cp++; dp++)
326 				if (fold) *dp = Tolower(*dp);
327 			word[0] = Tolower(word[0]);
328 			goto lcase;
329 		}
330 		(void) fprintf(file, "%s\n", original);
331 		continue;
332 
333 foundit:
334 		if (pass == '1')
335 			(void) fprintf(file, "%s-\n", original);
336 		else if (affix[0] != 0 && affix[0] != '.') {
337 			file = found;
338 			(void) fprintf(file, "%s\t%s\n", affix,
339 			    original);
340 		}
341 	}
342 }
343 
344 /*
345  *	strip exactly one suffix and do
346  *	indicated routine(s), which may recursively
347  *	strip suffixes
348  */
349 
350 static int
351 trysuff(char *ep, int lev)
352 {
353 	struct suftab	*t;
354 	char *cp, *sp;
355 
356 	lev += DLEV;
357 	deriv[lev] = deriv[lev-1] = 0;
358 	for (t = &suftab[0]; (sp = t->suf) != 0; t++) {
359 		cp = ep;
360 		while (*sp)
361 			if (*--cp != *sp++)
362 				goto next;
363 		for (sp = cp; --sp >= word && !vowel(*sp); );
364 		if (sp < word)
365 			return (0);
366 		if ((*t->p1)(ep-t->n1, t->d1, t->a1, lev+1))
367 			return (1);
368 		if (t->p2 != 0) {
369 			deriv[lev] = deriv[lev+1] = 0;
370 			return ((*t->p2)(ep-t->n2, t->d2, t->a2, lev));
371 		}
372 		return (0);
373 next:;
374 	}
375 	return (0);
376 }
377 
/*
 * Suffix routine that never accepts: table entries naming it simply
 * report "no match" for their suffix.
 */
static int
nop(void)
{
	int accepted = 0;

	return (accepted);
}
383 
/*
 * Accept the stem ending at ep if trypref() finds it (recording note a
 * at level lev); failing that, try to remove a further suffix from it.
 * The d argument is not used.  Returns 1 on success, 0 otherwise.
 */
/* ARGSUSED */
static int
strip(char *ep, char *d, char *a, int lev)
{
	if (trypref(ep, a, lev))
		return (1);
	return (trysuff(ep, lev) ? 1 : 0);
}
390 
391 static int
392 s(char *ep, char *d, char *a, int lev)
393 {
394 	if (lev > DLEV+1)
395 		return (0);
396 	if (*ep == 's' && ep[-1] == 's')
397 		return (0);
398 	return (strip(ep, d, a, lev));
399 }
400 
401 /* ARGSUSED */
402 static int
403 an(char *ep, char *d, char *a, int lev)
404 {
405 	if (!isupper(*word))	/* must be proper name */
406 		return (0);
407 	return (trypref(ep, a, lev));
408 }
409 
/*
 * Overwrite the last character of the stem with 'e' and hand the result
 * to strip(), recording d as the derivation note.  The a argument is
 * not used.
 */
/* ARGSUSED */
static int
ize(char *ep, char *d, char *a, int lev)
{
	char *last = ep - 1;

	*last = 'e';
	return (strip(ep, "", d, lev));
}
417 
/*
 * Store an 'e' at ep, making the stem one character longer, and hand
 * the lengthened stem to strip() with d as the derivation note.  The a
 * argument is not used.
 */
/* ARGSUSED */
static int
y_to_e(char *ep, char *d, char *a, int lev)
{
	ep[0] = 'e';
	return (strip(ep + 1, "", d, lev));
}
425 
/*
 * Suffix routine for endings that may follow a y->i change: a stem
 * ending in 'i' goes through i_to_y(), any other stem through strip().
 */
static int
ily(char *ep, char *d, char *a, int lev)
{
	return (ep[-1] == 'i' ? i_to_y(ep, d, a, lev) : strip(ep, d, a, lev));
}
434 
/*
 * Suffix routine for "-ibility": store an 'l' at ep and let y_to_e()
 * append the 'e' after it, so the stem is retried ending in "le".
 */
static int
bility(char *ep, char *d, char *a, int lev)
{
	ep[0] = 'l';
	return (y_to_e(ep + 1, d, a, lev));
}
441 
/*
 * If the stem ends in 'i', turn that 'i' into 'y' and record note d;
 * otherwise leave the stem alone and record note a.  Either way the
 * stem is then handed to strip().
 */
static int
i_to_y(char *ep, char *d, char *a, int lev)
{
	char *note = a;

	if (ep[-1] == 'i') {
		ep[-1] = 'y';
		note = d;
	}
	return (strip(ep, "", note, lev));
}
451 
452 static int
453 es(char *ep, char *d, char *a, int lev)
454 {
455 	if (lev > DLEV)
456 		return (0);
457 	switch (ep[-1]) {
458 	default:
459 		return (0);
460 	case 'i':
461 		return (i_to_y(ep, d, a, lev));
462 	case 's':
463 	case 'h':
464 	case 'z':
465 	case 'x':
466 		return (strip(ep, d, a, lev));
467 	}
468 }
469 
470 /* ARGSUSED */
471 static int
472 subst(char *ep, char *d, char *a, int lev)
473 {
474 	char *u, *t;
475 
476 	if (skipv(skipv(ep-1)) < word)
477 		return (0);
478 	for (t = d; *t != '+'; t++)
479 		continue;
480 	for (u = ep; *--t != '-'; )
481 		*--u = *t;
482 	return (strip(ep, "", d, lev));
483 }
484 
485 
/*
 * Suffix routine for the "-tion"/"-tor" family, selected by the
 * character two places before ep: 'c' or 'r' looks the stem up with
 * trypref(), 'a' retries it with an 'e' added, anything else fails.
 */
static int
tion(char *ep, char *d, char *a, int lev)
{
	char key = ep[-2];

	if (key == 'c' || key == 'r')
		return (trypref(ep, a, lev));
	if (key == 'a')
		return (y_to_e(ep, d, a, lev));
	return (0);
}
498 
/*
 *	possible consonant-consonant-e ending
 *
 *	The last two stem characters decide whether to retry the stem with
 *	an 'e' added (y_to_e) before falling back to VCe().
 */
static int
CCe(char *ep, char *d, char *a, int lev)
{
	char last = ep[-1];
	char prev = ep[-2];
	int restore_e = 0;	/* try y_to_e(), then insist on "ng" */

	switch (last) {
	case 'r':
		if (prev == 't')
			return (y_to_e(ep, d, a, lev));
		break;
	case 'l':
		if (!vowel(prev) && prev != 'l' && prev != 'r' && prev != 'w')
			return (y_to_e(ep, d, a, lev));
		break;
	case 's':
		if (prev == 's')
			break;
		if (*ep == 'a')
			return (0);
		restore_e = !vowel(prev);
		break;
	case 'c':
	case 'g':
		if (*ep == 'a')
			return (0);
		restore_e = !vowel(prev);
		break;
	case 'v':
	case 'z':
		restore_e = !vowel(prev);
		break;
	case 'u':
		restore_e = 1;
		break;
	}

	if (restore_e) {
		if (y_to_e(ep, d, a, lev))
			return (1);
		/* the stem is re-read here because y_to_e() edits word[] */
		if (!(ep[-2] == 'n' && ep[-1] == 'g'))
			return (0);
	}
	return (VCe(ep, d, a, lev));
}
561 
/*
 *	possible consonant-vowel-consonant-e ending
 *
 *	A stem already ending in 'e' is rejected.  When the stem ends in
 *	vowel + non-vowel, it is first tried with an 'e' added (note d);
 *	if that fails the overwritten character is put back and the plain
 *	stem is handed to strip().
 */
static int
VCe(char *ep, char *d, char *a, int lev)
{
	char saved;

	if (ep[-1] == 'e')
		return (0);

	if (!vowel(ep[-1]) && vowel(ep[-2])) {
		saved = *ep;
		*ep = 'e';
		if (trypref(ep + 1, d, lev) || trysuff(ep + 1, lev))
			return (1);
		*ep = saved;
	}
	return (strip(ep, d, a, lev));
}
580 
581 static char *
582 lookuppref(char **wp, char *ep)
583 {
584 	char **sp;
585 	char *bp, *cp;
586 
587 	for (sp = preftab; *sp; sp++) {
588 		bp = *wp;
589 		for (cp = *sp; *cp; cp++, bp++)
590 			if (Tolower(*bp) != *cp)
591 				goto next;
592 		for (cp = bp; cp < ep; cp++)
593 			if (vowel(*cp)) {
594 				*wp = bp;
595 				return (*sp);
596 			}
597 next:;
598 	}
599 	return (0);
600 }
601 
602 /*
603  *	while word is not in dictionary try stripping
604  *	prefixes. Fail if no more prefixes.
605  */
606 static int
607 trypref(char *ep, char *a, int lev)
608 {
609 	char *cp;
610 	char *bp;
611 	char *pp;
612 	int val = 0;
613 	char space[LINE_MAX * 2];
614 	deriv[lev] = a;
615 	if (tryword(word, ep, lev))
616 		return (1);
617 	bp = word;
618 	pp = space;
619 	deriv[lev+1] = pp;
620 	while (cp = lookuppref(&bp, ep)) {
621 		*pp++ = '+';
622 		while (*pp = *cp++)
623 			pp++;
624 		if (tryword(bp, ep, lev+1)) {
625 			val = 1;
626 			break;
627 		}
628 	}
629 	deriv[lev+1] = deriv[lev+2] = 0;
630 	return (val);
631 }
632 
633 static int
634 tryword(char *bp, char *ep, int lev)
635 {
636 	int i, j;
637 	char duple[3];
638 	if (ep-bp <= 1)
639 		return (0);
640 	if (vowel(*ep)) {
641 		if (monosyl(bp, ep))
642 			return (0);
643 	}
644 	i = dict(bp, ep);
645 	if (i == 0 && vowel(*ep) && ep[-1] == ep[-2] && monosyl(bp, ep-1)) {
646 		ep--;
647 		deriv[++lev] = duple;
648 		duple[0] = '+';
649 		duple[1] = *ep;
650 		duple[2] = 0;
651 		i = dict(bp, ep);
652 	}
653 	if (vflag == 0 || i == 0)
654 		return (i);
655 	/*
656 	 *	when derivations are wanted, collect them
657 	 *	for printing
658 	 */
659 	j = lev;
660 	do {
661 		if (deriv[j])
662 			(void) strcat(affix, deriv[j]);
663 	} while (--j > 0);
664 	return (i);
665 }
666 
667 
/*
 * Return 1 when the text between bp and ep is at least two characters
 * long, ends in a vowel followed by a single non-vowel other than 'x'
 * or 'w', and has no vowel anywhere before that vowel; 0 otherwise.
 */
static int
monosyl(char *bp, char *ep)
{
	char *last, *mid, *p;

	if (ep - bp < 2)
		return (0);

	last = ep - 1;
	mid = ep - 2;
	if (vowel(*last))
		return (0);
	if (!vowel(*mid))
		return (0);
	if (*last == 'x' || *last == 'w')
		return (0);

	for (p = mid; p > bp; p--) {
		if (vowel(p[-1]))
			return (0);
	}
	return (1);
}
680 
681 static char *
682 skipv(char *s)
683 {
684 	if (s >= word&&vowel(*s))
685 		s--;
686 	while (s >= word && !vowel(*s))
687 		s--;
688 	return (s);
689 }
690 
/*
 * Return 1 if c, ignoring case, is one of a, e, i, o, u or y; else 0.
 */
static int
vowel(int c)
{
	int lc = Tolower(c);

	if (lc == 'a' || lc == 'e' || lc == 'i')
		return (1);
	if (lc == 'o' || lc == 'u' || lc == 'y')
		return (1);
	return (0);
}
705 
706 /* crummy way to Britishise */
707 static void
708 ise(void)
709 {
710 	struct suftab *p;
711 
712 	for (p = suftab; p->suf; p++) {
713 		ztos(p->suf);
714 		ztos(p->d1);
715 		ztos(p->a1);
716 	}
717 }
718 
/*
 * Replace every lower-case 'z' in the NUL-terminated string s with 's',
 * in place.  All other characters are left untouched.
 */
static void
ztos(char *s)
{
	char *p = s;

	while (*p != '\0') {
		if (*p == 'z')
			*p = 's';
		p++;
	}
}
726 
727 static int
728 dict(char *bp, char *ep)
729 {
730 	int temp, result;
731 	if (xflag)
732 		(void) fprintf(stdout, "=%.*s\n", ep-bp, bp);
733 	temp = *ep;
734 	*ep = 0;
735 	result = hashlook(bp);
736 	*ep = temp;
737 	return (result);
738 }
739