xref: /illumos-gate/usr/src/tools/onbld/Checks/SpellCheck.py (revision e07d85f87c3920e032adb855fdc500e4616c7718)
1#
2# CDDL HEADER START
3#
4# The contents of this file are subject to the terms of the
5# Common Development and Distribution License (the "License").
6# You may not use this file except in compliance with the License.
7#
8# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9# or http://www.opensolaris.org/os/licensing.
10# See the License for the specific language governing permissions
11# and limitations under the License.
12#
13# When distributing Covered Code, include this CDDL HEADER in each
14# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15# If applicable, add the following below this CDDL HEADER, with the
16# fields enclosed by brackets "[]" replaced with your own identifying
17# information: Portions Copyright [yyyy] [name of copyright owner]
18#
19# CDDL HEADER END
20#
21
22#
23# Copyright 2016 Joyent, Inc.
24#
25
26import re, sys
27
# Report templates.  Each takes four values, in order: the file name, the
# line number, the word that was searched for, and the suggested
# replacement text.
spellMsg = (
	'%s: Line %d contains "%s", '
	'a common misspelling of "%s"\n'
)
altMsg = (
	'%s: Line %d contains "%s"; please use "%s" instead '
	'for consistency with other documentation\n'
)
30
# Table of known misspellings.  Each key is the text to look for and each
# value is the text shown to the user as the correction; a value holding
# several comma-separated words lists alternative corrections.  Keys that
# contain a space (e.g. 'the the') are phrases rather than single words.
# The keys are interpolated into regular expressions further down in this
# file, so they are used as pattern text.
misspellings = {
	'absense': 'absence',
	'accessable': 'accessible',
	'accomodate': 'accommodate',
	'accomodation': 'accommodation',
	'accross': 'across',
	'acheive': 'achieve',
	'addional': 'additional',
	'addres': 'address',
	'admininistrative': 'administrative',
	'adminstered': 'administered',
	'adminstrate': 'administrate',
	'adminstration': 'administration',
	'adminstrative': 'administrative',
	'adminstrator': 'administrator',
	'admissability': 'admissibility',
	'adress': 'address',
	'adressable': 'addressable',
	'adressed': 'addressed',
	'adressing': 'addressing, dressing',
	'aginst': 'against',
	'agression': 'aggression',
	'agressive': 'aggressive',
	'alot': 'a lot, allot',
	'and and': 'and',
	'apparantly': 'apparently',
	'appearence': 'appearance',
	'arguement': 'argument',
	'assasination': 'assassination',
	'auxilliary': 'auxiliary',
	'basicly': 'basically',
	'begining': 'beginning',
	'belive': 'believe',
	'beteen': 'between',
	'betwen': 'between',
	'beween': 'between',
	'bewteen': 'between',
	'bizzare': 'bizarre',
	'buisness': 'business',
	'calender': 'calendar',
	'cemetary': 'cemetery',
	'chauffer': 'chauffeur',
	'collegue': 'colleague',
	'comming': 'coming',
	'commited': 'committed',
	'commitee': 'committee',
	'commiting': 'committing',
	'comparision': 'comparison',
	'comparisions': 'comparisons',
	'compatability': 'compatibility',
	'compatable': 'compatible',
	'compatablity': 'compatibility',
	'compatiable': 'compatible',
	'compatiblity': 'compatibility',
	'completly': 'completely',
	'concious': 'conscious',
	'condidtion': 'condition',
	'conected': 'connected',
	'conjuction': 'conjunction',
	'continous': 'continuous',
	'curiousity': 'curiosity',
	'deamon': 'daemon',
	'definately': 'definitely',
	'desireable': 'desirable',
	'diffrent': 'different',
	'dilemna': 'dilemma',
	'dissapear': 'disappear',
	'dissapoint': 'disappoint',
	'ecstacy': 'ecstasy',
	'embarass': 'embarrass',
	'enviroment': 'environment',
	'exept': 'except',
	'existance': 'existence',
	'familar': 'familiar',
	'finaly': 'finally',
	'folowing': 'following',
	'foriegn': 'foreign',
	'forseeable': 'foreseeable',
	'fourty': 'forty',
	'foward': 'forward',
	'freind': 'friend',
	'futher': 'further',
	'gaurd': 'guard',
	'glamourous': 'glamorous',
	'goverment': 'government',
	'happend': 'happened',
	'harrassment': 'harassment',
	'hierachical': 'hierarchical',
	'hierachies': 'hierarchies',
	'hierachy': 'hierarchy',
	'hierarcical': 'hierarchical',
	'hierarcy': 'hierarchy',
	'honourary': 'honorary',
	'humourous': 'humorous',
	'idiosyncracy': 'idiosyncrasy',
	'immediatly': 'immediately',
	'inaccessable': 'inaccessible',
	'inbetween': 'between',
	'incidently': 'incidentally',
	'independant': 'independent',
	'infomation': 'information',
	'interupt': 'interrupt',
	'intial': 'initial',
	'intially': 'initially',
	'irresistable': 'irresistible',
	'jist': 'gist',
	'knowlege': 'knowledge',
	'lenght': 'length',
	'liase': 'liaise',
	'liason': 'liaison',
	'libary': 'library',
	'maching': 'machine, marching, matching',
	'millenia': 'millennia',
	'millenium': 'millennium',
	'neccessary': 'necessary',
	'negotation': 'negotiation',
	'nontheless': 'nonetheless',
	'noticable': 'noticeable',
	'occassion': 'occasion',
	'occassional': 'occasional',
	'occassionally': 'occasionally',
	'occurance': 'occurrence',
	'occured': 'occurred',
	'occurence': 'occurrence',
	'occuring': 'occurring',
	'ommision': 'omission',
	'orginal': 'original',
	'orginally': 'originally',
	'pavillion': 'pavilion',
	'peice': 'piece',
	'persistant': 'persistent',
	'politican': 'politician',
	'posession': 'possession',
	'possiblity': 'possibility',
	'preceed': 'precede',
	'preceeded': 'preceded',
	'preceeding': 'preceding',
	'preceeds': 'precedes',
	'prefered': 'preferred',
	'prefering': 'preferring',
	'presense': 'presence',
	'proces': 'process',
	'propoganda': 'propaganda',
	'psuedo': 'pseudo',
	'publically': 'publicly',
	'realy': 'really',
	'reciept': 'receipt',
	'recieve': 'receive',
	'recieved': 'received',
	'reciever': 'receiver',
	'recievers': 'receivers',
	'recieves': 'receives',
	'recieving': 'receiving',
	'recomend': 'recommend',
	'recomended': 'recommended',
	'recomending': 'recommending',
	'recomends': 'recommends',
	'recurse': 'recur',
	'recurses': 'recurs',
	'recursing': 'recurring',
	'refered': 'referred',
	'refering': 'referring',
	'religous': 'religious',
	'rember': 'remember',
	'remeber': 'remember',
	'repetion': 'repetition',
	'reponsible': 'responsible',
	'resistence': 'resistance',
	'retreive': 'retrieve',
	'seige': 'siege',
	'sence': 'since',
	'seperate': 'separate',
	'seperated': 'separated',
	'seperately': 'separately',
	'seperates': 'separates',
	'similiar': 'similar',
	'somwhere': 'somewhere',
	'sould': 'could, should, sold, soul',
	'sturcture': 'structure',
	'succesful': 'successful',
	'succesfully': 'successfully',
	'successfull': 'successful',
	'sucessful': 'successful',
	'supercede': 'supersede',
	'supress': 'suppress',
	'supressed': 'suppressed',
	'suprise': 'surprise',
	'suprisingly': 'surprisingly',
	'sytem': 'system',
	'tendancy': 'tendency',
	'the the': 'the',
	'the these': 'these',
	'therefor': 'therefore',
	'threshhold': 'threshold',
	'tolerence': 'tolerance',
	'tommorow': 'tomorrow',
	'tommorrow': 'tomorrow',
	'tounge': 'tongue',
	'tranformed': 'transformed',
	'transfered': 'transferred',
	'truely': 'truly',
	'trustworthyness': 'trustworthiness',
	'uncommited': 'uncommitted',
	'unforseen': 'unforeseen',
	'unfortunatly': 'unfortunately',
	'unsuccessfull': 'unsuccessful',
	'untill': 'until',
	'upto': 'up to',
	'whereever': 'wherever',
	'wich': 'which',
	'wierd': 'weird',
	'wtih': 'with',
}
244
# Spellings that are reported with a "please use ... instead" style message
# rather than as misspellings.  Maps the discouraged form to the preferred
# form.
alternates = {
	'parseable': 'parsable',
	'sub-command': 'subcommand',
	'sub-commands': 'subcommands',
	'writeable': 'writable',
}
251
def _compile_entries(words):
	"""Build the search table for one word list.

	words is a dict mapping the text to look for to its replacement
	text.  Returns a list of (regex, word, replacement) tuples, one per
	dict item.  Each regex matches the word case-insensitively and is
	anchored with \\b on both sides so that it only matches at word
	boundaries.
	"""
	# dict.items() exists on both Python 2 and Python 3, whereas
	# dict.iteritems() was removed in Python 3 and made this module fail
	# at import time there.  re.escape() keeps every table entry literal
	# even if one should ever contain a regex metacharacter.
	return [
		(re.compile(r'\b%s\b' % re.escape(word), re.IGNORECASE),
		    word, replacement)
		for word, replacement in words.items()
	]

# Compiled once, when the module is imported, so the patterns are not
# rebuilt for every line that gets scanned.
misspellingREs = _compile_entries(misspellings)
alternateREs = _compile_entries(alternates)
264
def check(errmsg, output, filename, line, lineno, entry):
	"""Test one line against one table entry and report a hit.

	entry is indexed as (compiled regex, word, replacement).  When the
	regex is found anywhere in line, errmsg is formatted with
	(filename, lineno, word, replacement) and written to output.

	Returns 1 if a report was written, 0 otherwise.
	"""
	# Nothing to report: leave output untouched.
	if not entry[0].search(line):
		return 0

	word = entry[1]
	replacement = entry[2]
	output.write(errmsg % (filename, lineno, word, replacement))
	return 1
271
def spellcheck(fh, filename=None, output=sys.stderr, **opts):
	"""Scan a file object for listed misspellings and discouraged forms.

	fh is rewound with seek(0) and then read line by line, with lines
	numbered from 1.  Each line is tested against every entry of
	misspellingREs (reported with spellMsg) and then every entry of
	alternateREs (reported with altMsg); reports are written to output
	through check().  filename is the name printed in reports; when it
	is empty or None, fh.name is used.  Additional keyword arguments
	are accepted and not used.

	Returns 1 if any report was written, 0 otherwise.
	"""
	name = filename or fh.name

	# Order matters for the output: misspellings are reported before
	# alternates for each line.
	passes = (
		(spellMsg, misspellingREs),
		(altMsg, alternateREs),
	)

	found = 0
	fh.seek(0)
	for lineno, line in enumerate(fh, 1):
		for template, table in passes:
			for entry in table:
				found |= check(template, output, name,
				    line, lineno, entry)

	return found
290