xref: /titanic_41/usr/src/cmd/sort/common/options.c (revision 7c478bd95313f5f23a4c958a745db2134aa03244)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include "options.h"
30 
31 /*
32  * options
33  *
34  * Overview
35  *   sort(1) supports two methods for specifying the sort key:  the original,
36  *   now-obsolete, +n -m form and the POSIX -k n,m form.  We refer to the former
37  *   as "old specifiers" and the latter as "new specifiers".  The options()
38  *   function parses the command line arguments given to sort, placing the sort
39  *   key specifiers in the internal representation used in fields.c.
40  *
41  * Equivalence of specifiers
42  *   One of sort(1)'s standard peculiarities is the transformation of the
43  *   character offsets and field numbering between the new and old style field
44  *   specifications.  We simply quote from the Single Unix standard:
45  *
46  *	+w.xT -y.zU
47  *
48  *   is equivalent to
49  *
50  * 	undefined		when z == 0, U contains b, and -t is set
51  * 	-k w+1.x+1T,y.0U	when z == 0 otherwise
52  * 	-k w+1.x+1T,y+1.zU	when z > 0
53  *
54  *   Undoubtedly, this seemed logical at the time.  (Using only the field head
55  *   as the coordinate, as done in the obsolete version, seems much simpler.)
56  *   The reverse map is where the key specifier
57  *
58  *	-k w.xT,y.zU
59  *
60  *   is equivalent to
61  *
62  * 	undefined		when z == 0, U contains b, and -t is set
63  *	+w-1.x-1T -y.0U		when z == 0 otherwise
64  *	+w-1.x-1T -y-1.zU	when z > 0
65  *
66  *   in the obsolete syntax.  Because the original key specifiers lead to a
67  *   simpler implementation, the internal representation of a field in this
68  *   implementation of sort is mostly that given by the obsolete syntax.
69  */
70 
71 /*
72  * While a key specifier in the obsolete +m ... -n form is being defined (that
73  * is, before the closing -n is seen), a narrower set of options is permitted.
74  * We specify this smaller set of options in OLD_SPEC_OPTIONS_STRING.
75  */
/*
 * getopt() option strings.  OPTIONS_STRING is the set used by options() for
 * ordinary parsing; it lists the digits so that a bare -NNN argument is
 * returned as an option character, which options() answers with usage().
 * OLD_SPEC_OPTIONS_STRING is the set used by parse_old_field_spec() while an
 * obsolete +pos1 key is still open; it contains no digits.
 */
#define	OPTIONS_STRING		"cmuo:T:z:dfiMnrbt:k:S:0123456789"
#define	OLD_SPEC_OPTIONS_STRING	"bdfiMnrcmuo:T:z:t:k:S:"

/*
 * Bits for the flags argument of parse_field_spec().  A clear bit selects the
 * alternative named in the trailing comment.
 */
#define	OPTIONS_OLDSPEC		0x1	/* else new-style spec */
#define	OPTIONS_STARTSPEC	0x2	/* else end spec */
81 
/*
 * is_number() reports whether the string C consists solely of decimal digits.
 *
 * Returns 1 if every character of C is a digit, and 0 as soon as a character
 * that is not a digit is found.  The empty string contains no such character
 * and therefore yields 1.
 *
 * The string is walked once up to its terminating NUL, rather than calling
 * strlen() again for every iteration of the loop.  Each character is examined
 * as an unsigned char so that isdigit() is never handed a negative value.
 */
static int
is_number(char *C)
{
	const unsigned char *p;

	for (p = (const unsigned char *)C; *p != '\0'; p++) {
		if (!isdigit(*p))
			return (0);
	}

	return (1);
}
93 
/*
 * field_spec_has_modifiers() tells the caller whether a key specification
 * carries anything other than field numbers, character offsets and the comma
 * separating start from end.  The callers use the answer to decide whether the
 * new field inherits the current global field modifiers: a specification with
 * modifiers of its own does not.
 *
 * C is the specification text and length the number of characters it is
 * expected to occupy.  The result is 0 when the leading run of characters
 * drawn from ",." and the decimal digits is exactly length characters long,
 * and 1 otherwise.
 */
static int
field_spec_has_modifiers(char *C, int length)
{
	int plain_span = strspn(C, ",.1234567890");

	return (plain_span != length);
}
108 
109 static void
field_apply_all(field_t * fc,flag_t flags)110 field_apply_all(field_t *fc, flag_t flags)
111 {
112 	field_t *f;
113 
114 	for (f = fc; f; f = f->f_next)
115 		if ((f->f_options & FIELD_MODIFIERS_DEFINED) == 0)
116 			f->f_options |= flags;
117 }
118 
119 static int
parse_field_spec(field_t * F,char * C,int flags,int length)120 parse_field_spec(field_t *F, char *C, int flags, int length)
121 {
122 	int p_period = MIN(length, strcspn(C, "."));
123 	int p_modifiers = MIN(length, strspn(C, ".1234567890"));
124 	int p_boundary = MIN(p_period, p_modifiers);
125 	int field = 0;
126 	int offset = 0;
127 	int offset_seen = 0;
128 	int i;
129 	int blanks_flag = 0;
130 
131 	for (i = 0; i < p_boundary; i++) {
132 		if (isdigit((uchar_t)C[i]))
133 			field = (10 * field) + (C[i] - '0');
134 		else
135 			return (1);
136 	}
137 
138 	if (p_period < p_modifiers) {
139 		for (i = p_period + 1; i < p_modifiers; i++) {
140 			if (isdigit((uchar_t)C[i])) {
141 				offset_seen++;
142 				offset = (10 * offset) + (C[i] - '0');
143 			} else {
144 				return (1);
145 			}
146 		}
147 	}
148 
149 	if (p_modifiers < length) {
150 		for (i = p_modifiers; i < length; i++) {
151 			switch (C[i]) {
152 				case 'b':
153 					blanks_flag = 1;
154 					break;
155 				case 'd':
156 					F->f_options |= FIELD_DICTIONARY_ORDER;
157 					break;
158 				case 'f':
159 					F->f_options |= FIELD_FOLD_UPPERCASE;
160 					break;
161 				case 'i':
162 					F->f_options |=
163 					    FIELD_IGNORE_NONPRINTABLES;
164 					break;
165 				case 'M':
166 					F->f_species = MONTH;
167 					break;
168 				case 'n':
169 					F->f_species = NUMERIC;
170 					break;
171 				case 'r':
172 					F->f_options |=
173 					    FIELD_REVERSE_COMPARISONS;
174 					break;
175 				default:
176 					usage();
177 					break;
178 			}
179 		}
180 	}
181 
182 	if (flags & OPTIONS_STARTSPEC) {
183 		F->f_start_field = field;
184 		F->f_start_offset = offset;
185 		if ((flags & OPTIONS_OLDSPEC) != OPTIONS_OLDSPEC) {
186 			F->f_start_field--;
187 			if (offset_seen)
188 				F->f_start_offset--;
189 		}
190 		F->f_options |= blanks_flag ? FIELD_IGNORE_BLANKS_START : 0;
191 	} else {
192 		F->f_end_field = field;
193 		F->f_end_offset = offset;
194 		if ((flags & OPTIONS_OLDSPEC) != OPTIONS_OLDSPEC &&
195 		    offset_seen && offset != 0)
196 			F->f_end_field--;
197 		F->f_options |= blanks_flag ? FIELD_IGNORE_BLANKS_END : 0;
198 	}
199 
200 	return (0);
201 }
202 
203 static void
parse_new_field_spec(sort_t * S,char * arg)204 parse_new_field_spec(sort_t *S, char *arg)
205 {
206 	int length = strlen(arg);
207 	int p_comma = MIN(length, strcspn(arg, ","));
208 	field_t *nF;
209 	int p;
210 
211 	/*
212 	 * New field specifiers do not inherit from the general specifier if
213 	 * they have any modifiers set.  (This is specifically tested in the VSC
214 	 * test suite, assertion 32 for POSIX.cmd/sort.)
215 	 */
216 	if (field_spec_has_modifiers(arg, length)) {
217 		nF = field_new(NULL);
218 		nF->f_options = FIELD_MODIFIERS_DEFINED;
219 	} else {
220 		nF = field_new(S);
221 	}
222 	p = parse_field_spec(nF, arg, OPTIONS_STARTSPEC, p_comma);
223 
224 	if (p != 0)
225 		usage();
226 
227 	if (p_comma < length) {
228 		p = parse_field_spec(nF, &(arg[p_comma + 1]), 0,
229 		    strlen(&(arg[p_comma + 1])));
230 		if (p != 0)
231 			usage();
232 	}
233 
234 	if (nF->f_start_field < 0 || nF->f_start_offset < 0) {
235 		if (S->m_verbose)
236 			warn("-k %s is not a supported field specifier\n", arg);
237 	}
238 	nF->f_start_field = MAX(nF->f_start_field, 0);
239 	nF->f_start_offset = MAX(nF->f_start_offset, 0);
240 
241 	/*
242 	 * If the starting field exceeds a defined ending field, convention
243 	 * dictates that the field is ignored.
244 	 */
245 	if (nF->f_end_field == -1 || nF->f_start_field < nF->f_end_field ||
246 	    (nF->f_start_field == nF->f_end_field &&
247 	    nF->f_start_offset < nF->f_end_offset)) {
248 		field_add_to_chain(&(S->m_fields_head), nF);
249 	} else if (S->m_verbose) {
250 		warn("illegal field -k %s omitted", arg);
251 	}
252 }
253 
254 /*
255  * parse_old_field_spec() is getopt()-aware; it may modify the values of optind,
256  * optarg, and so forth, to correctly determine the characteristics being
257  * assigned to the current field.
258  */
259 static int
parse_old_field_spec(sort_t * S,int argc,char * argv[])260 parse_old_field_spec(sort_t *S, int argc, char *argv[])
261 {
262 	field_t *nF;
263 	int c, p;
264 	char *arg = argv[optind];
265 
266 	if (field_spec_has_modifiers(arg + 1, strlen(arg + 1))) {
267 		nF = field_new(NULL);
268 		nF->f_options = FIELD_MODIFIERS_DEFINED;
269 	} else {
270 		nF = field_new(S);
271 	}
272 
273 	p = parse_field_spec(nF, arg + 1, OPTIONS_OLDSPEC | OPTIONS_STARTSPEC,
274 	    strlen(arg + 1));
275 
276 	if (p != 0) {
277 		field_delete(nF);
278 		return (0);
279 	}
280 
281 	/*
282 	 * In the case that getopt() returns '?' (unrecognized option) or EOF
283 	 * (non-option argument), the field is considered closed.
284 	 */
285 	for (arg = argv[++optind]; optind < argc; arg = argv[optind]) {
286 		if (strlen(arg) >= 2 && *arg == '-' &&
287 		    isdigit(*(uchar_t *)(arg + 1))) {
288 			(void) parse_field_spec(nF, arg + 1,
289 			    OPTIONS_OLDSPEC, strlen(arg) - 1);
290 			field_add_to_chain(&(S->m_fields_head), nF);
291 			optind++;
292 			return (1);
293 		}
294 
295 		if ((c = getopt(argc, argv, OLD_SPEC_OPTIONS_STRING)) != EOF) {
296 			switch (c) {
297 			case 'b':
298 				nF->f_options |= FIELD_IGNORE_BLANKS_START;
299 				break;
300 			case 'd':
301 				nF->f_options |= FIELD_DICTIONARY_ORDER;
302 				break;
303 			case 'f':
304 				nF->f_options |= FIELD_FOLD_UPPERCASE;
305 				break;
306 			case 'i':
307 				nF->f_options |= FIELD_IGNORE_NONPRINTABLES;
308 				break;
309 			case 'M':
310 				nF->f_species = MONTH;
311 				break;
312 			case 'n':
313 				nF->f_species = NUMERIC;
314 				break;
315 			case 'r':
316 				nF->f_options |= FIELD_REVERSE_COMPARISONS;
317 				break;
318 			case '?':
319 			case 'c':
320 			case 'm':
321 			case 'u':
322 				/*
323 				 * Options without arguments.
324 				 */
325 				optind -= 1;
326 				field_add_to_chain(&(S->m_fields_head), nF);
327 				return (1);
328 				/*NOTREACHED*/
329 			case 'o':
330 			case 'T':
331 			case 'z':
332 			case 't':
333 			case 'k':
334 			case 'S':
335 				/*
336 				 * Options with arguments.
337 				 */
338 				if (optarg == argv[optind - 1] + 2) {
339 					optind -= 1;
340 				} else {
341 					optind -= 2;
342 				}
343 				field_add_to_chain(&(S->m_fields_head), nF);
344 				return (1);
345 				/*NOTREACHED*/
346 			default:
347 				die(EMSG_UNKN_OPTION);
348 				/*NOTREACHED*/
349 			}
350 		} else {
351 			break;
352 		}
353 	}
354 
355 	field_add_to_chain(&(S->m_fields_head), nF);
356 	return (1);
357 }
358 
359 int
options(sort_t * S,int argc,char * argv[])360 options(sort_t *S, int argc, char *argv[])
361 {
362 	int c;
363 
364 	optind = 1;
365 	while (optind < argc) {
366 		if (strncmp("-y", argv[optind], strlen("-y")) == 0) {
367 			/*
368 			 * The -y [kmem] option violates the standard syntax
369 			 * outlined in intro(1).  we have to be a little fancy
370 			 * to determine if the next argument is a valid integer.
371 			 * (note, of course, that the previous sort(1) had no
372 			 * mechanism to resolve a final
373 			 *	-y 99999
374 			 * into
375 			 *	-y, file 99999
376 			 * or
377 			 *	-y 99999, file stdin
378 			 *
379 			 * Now one can unambiguously use
380 			 *	-y -- 99999
381 			 * and
382 			 *	-y 99999 -
383 			 * to distinguish these cases.
384 			 *
385 			 * That said, we do not use the information passed using
386 			 * -y option in sort(1); we provide the argument to
387 			 * preserve compatibility for existing scripts.
388 			 */
389 			if (strlen(argv[optind]) == strlen("-y") &&
390 			    optind + 1 < argc &&
391 			    is_number(argv[optind + 1]))
392 				optind += 2;
393 			else
394 				optind += 1;
395 		}
396 
397 		if ((c = getopt(argc, argv, OPTIONS_STRING)) != EOF) {
398 			switch (c) {
399 			case 'c':
400 				S->m_check_if_sorted_only = 1;
401 				break;
402 
403 			case 'm':
404 				S->m_merge_only = 1;
405 				break;
406 
407 			case 'u':
408 				S->m_unique_lines = 1;
409 				break;
410 
411 			case 'o':
412 				S->m_output_filename = optarg;
413 				break;
414 
415 			case 'T':
416 				S->m_tmpdir_template = optarg;
417 				break;
418 
419 			case 'z':
420 				/*
421 				 * ignore optarg -- obsolete
422 				 */
423 				break;
424 
425 			case 'd':
426 				S->m_field_options |= FIELD_DICTIONARY_ORDER;
427 				field_apply_all(S->m_fields_head,
428 				    FIELD_DICTIONARY_ORDER);
429 				break;
430 
431 			case 'f':
432 				S->m_field_options |= FIELD_FOLD_UPPERCASE;
433 				field_apply_all(S->m_fields_head,
434 				    FIELD_FOLD_UPPERCASE);
435 				break;
436 
437 			case 'i':
438 				S->m_field_options |=
439 				    FIELD_IGNORE_NONPRINTABLES;
440 				field_apply_all(S->m_fields_head,
441 				    FIELD_IGNORE_NONPRINTABLES);
442 				break;
443 
444 			case 'M':
445 				S->m_default_species = MONTH;
446 				S->m_field_options &=
447 				    ~FIELD_IGNORE_BLANKS_START;
448 				break;
449 
450 			case 'n':
451 				S->m_default_species = NUMERIC;
452 				{
453 					field_t *f;
454 
455 					for (f = S->m_fields_head; f;
456 					    f = f->f_next)
457 						if ((f->f_options &
458 						    FIELD_MODIFIERS_DEFINED) ==
459 						    0)
460 							f->f_species = NUMERIC;
461 				}
462 				break;
463 
464 			case 'b':
465 				S->m_field_options |=
466 				    FIELD_IGNORE_BLANKS_START |
467 				    FIELD_IGNORE_BLANKS_END;
468 				break;
469 
470 			case 'r':
471 				S->m_field_options |=
472 				    FIELD_REVERSE_COMPARISONS;
473 				field_apply_all(S->m_fields_head,
474 				    FIELD_REVERSE_COMPARISONS);
475 				break;
476 
477 			case 't':
478 				/*
479 				 * delimiter
480 				 */
481 				if (S->m_single_byte_locale) {
482 					/*
483 					 * Most debuggers can't take tabs as
484 					 * input arguments, so we provide an
485 					 * escape sequence to allow testing of
486 					 * this special case for the DEBUG
487 					 * version.
488 					 */
489 					S->m_field_separator.sc =
490 #ifdef DEBUG
491 					    xstreql(optarg, "\\t") ? '\t' :
492 #endif
493 					    optarg[0];
494 				} else
495 					(void) mbtowc(&S->m_field_separator.wc,
496 					    optarg, MB_CUR_MAX);
497 				break;
498 
499 			case 'k':
500 				/*
501 				 * key
502 				 */
503 				(void) parse_new_field_spec(S, optarg);
504 				break;
505 
506 			case 'S':
507 				S->m_memory_limit = strtomem(optarg);
508 #ifdef DEBUG
509 				(void) fprintf(stderr, CMDNAME
510 				    ": limiting size to %d bytes\n",
511 				    S->m_memory_limit);
512 #endif /* DEBUG */
513 				break;
514 
515 			/*
516 			 * We never take a naked -999; these should always be
517 			 * associated with a preceding +000.
518 			 */
519 			case '0':
520 			case '1':
521 			case '2':
522 			case '3':
523 			case '4':
524 			case '5':
525 			case '6':
526 			case '7':
527 			case '8':
528 			case '9':
529 				usage();
530 				break;
531 			case '?':
532 				/* error case */
533 				usage();
534 				break;
535 			}
536 
537 			/*
538 			 * Go back for next argument.
539 			 */
540 			continue;
541 		}
542 
543 		/*
544 		 * There are three (interpretable) possibilities for getopt() to
545 		 * return EOF with arguments on the command line: we have seen
546 		 * the "end-of-options" token, --, we have encountered the
547 		 * old-style field definition, +NNN, or we have found a
548 		 * filename.
549 		 *
550 		 * In the second case, we must also search for the optional -NNN
551 		 * field terminal definition.  (since "+joe", for instance, is
552 		 * a valid filename, we must handle this pattern as well.)  This
553 		 * is performed by parse_old_field_spec().
554 		 */
555 		if (xstreql(argv[optind - 1], "--")) {
556 			/*
557 			 * Process all arguments following end-of-options token
558 			 * as filenames.
559 			 */
560 			while (optind < argc) {
561 				if (xstreql(argv[optind], "-"))
562 					S->m_input_from_stdin = 1;
563 				else
564 					stream_add_file_to_chain(
565 					    &(S->m_input_streams),
566 					    argv[optind]);
567 				optind++;
568 			}
569 
570 			break;
571 		}
572 
573 		if (optind < argc) {
574 			if (xstreql(argv[optind], "-")) {
575 				S->m_input_from_stdin = 1;
576 				optind++;
577 			} else if (*(argv[optind]) != '+' ||
578 			    !parse_old_field_spec(S, argc, argv)) {
579 				/*
580 				 * It's a filename, because it either doesn't
581 				 * start with '+', or if it did, it wasn't an
582 				 * actual field specifier.
583 				 */
584 				stream_add_file_to_chain(&(S->m_input_streams),
585 				    argv[optind]);
586 				optind++;
587 			}
588 		}
589 	}
590 
591 	if (S->m_input_streams == NULL)
592 		S->m_input_from_stdin = 1;
593 
594 	if (S->m_output_filename == NULL)
595 		S->m_output_to_stdout = 1;
596 
597 	/*
598 	 * If no fields, then one great field.  However, if the -b option was
599 	 * set globally, be sure to ignore it, as per UNIX98.
600 	 */
601 	if (S->m_fields_head == NULL) {
602 		S->m_field_options &= ~FIELD_IGNORE_BLANKS_START;
603 
604 		(void) parse_new_field_spec(S, "1");
605 		/*
606 		 * "Entire line" fast path is only valid if no delimiter has
607 		 * been set and no modifiers have been applied.
608 		 */
609 		if (S->m_field_separator.wc == 0 &&
610 		    S->m_default_species == ALPHA &&
611 		    S->m_field_options == 0)
612 			S->m_entire_line = 1;
613 	}
614 
615 	return (0);
616 }
617