1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License"). You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22 /*
23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 #include "options.h"
28
29 /*
30 * options
31 *
32 * Overview
33 * sort(1) supports two methods for specifying the sort key: the original,
34 * now-obsolete, +n -m form and the POSIX -k n,m form. We refer to the former
35 * as "old specifiers" and the latter as "new specifiers". The options()
36 * function parses the command line arguments given to sort, placing the sort
37 * key specifiers in the internal representation used in fields.c.
38 *
39 * Equivalence of specifiers
40 * One of sort(1)'s standard peculiarities is the transformation of the
41 * character offsets and field numbering between the new and old style field
42 * specifications. We simply quote from the Single Unix standard:
43 *
44 * +w.xT -y.zU
45 *
46 * is equivalent to
47 *
48 * undefined when z == 0, U contains b, and -t is set
49 * -k w+1.x+1T,y.0U when z == 0 otherwise
50 * -k w+1.x+1T,y+1.zU when z > 0
51 *
52 * Undoubtedly, this seemed logical at the time. (Using only the field head
53 * as the coordinate, as done in the obsolete version, seems much simpler.)
54 * The reverse map is where the key specifier
55 *
56 * -k w.xT,y.zU
57 *
58 * is equivalent to
59 *
60 * undefined when z == 0, U contains b, and -t is set
 *		+w-1.x-1T -y.0U		when z == 0 otherwise
 *		+w-1.x-1T -y-1.zU	when z > 0
63 *
64 * in the obsolete syntax. Because the original key specifiers lead to a
65 * simpler implementation, the internal representation of a field in this
66 * implementation of sort is mostly that given by the obsolete syntax.
67 */
68
/*
 * Option strings passed to getopt().
 *
 * OPTIONS_STRING is the full set used by options().  The digits are listed so
 * that a naked -NNN argument is handed back as an option character, which
 * options() then rejects through usage().
 *
 * OLD_SPEC_OPTIONS_STRING is the narrower set used by parse_old_field_spec()
 * while a key in the obsolete +m ... -n form is still open, that is, before
 * its closing -n has been seen.  It lists the same letters without the digits.
 */
#define	OPTIONS_STRING		"cmuo:T:z:dfiMnrbt:k:S:0123456789"
#define	OLD_SPEC_OPTIONS_STRING	"bdfiMnrcmuo:T:z:t:k:S:"

/*
 * Bits for the flags argument of parse_field_spec().
 */
#define	OPTIONS_OLDSPEC		(1 << 0)	/* set: old-style; clear: new-style */
#define	OPTIONS_STARTSPEC	(1 << 1)	/* set: start spec; clear: end spec */
79
/*
 * is_number() reports whether the string C consists solely of decimal digits.
 *
 * Returns 1 if no character of C is a non-digit, and 0 as soon as a non-digit
 * is found.  Note that an empty string contains no non-digit and therefore
 * also yields 1; a leading sign is not accepted.
 *
 * The string is walked once up to its terminator instead of re-evaluating
 * strlen() on every iteration.
 */
static int
is_number(char *C)
{
	const unsigned char *p;

	/* isdigit() requires a value representable as unsigned char. */
	for (p = (const unsigned char *)C; *p != '\0'; p++) {
		if (!isdigit(*p))
			return (0);
	}

	return (1);
}
91
/*
 * A key given through -k or through the +n syntax that carries any modifiers
 * of its own does not inherit the current global field modifiers.
 *
 * field_spec_has_modifiers() measures the leading run of digits, periods and
 * commas in C.  It returns 0 if that run is exactly length characters long
 * (the specifier is purely numeric) and 1 otherwise.
 */
static int
field_spec_has_modifiers(char *C, int length)
{
	int numeric_prefix = strspn(C, ",.1234567890");

	return (numeric_prefix != length);
}
106
107 static void
field_apply_all(field_t * fc,flag_t flags)108 field_apply_all(field_t *fc, flag_t flags)
109 {
110 field_t *f;
111
112 for (f = fc; f; f = f->f_next)
113 if ((f->f_options & FIELD_MODIFIERS_DEFINED) == 0)
114 f->f_options |= flags;
115 }
116
117 static int
parse_field_spec(field_t * F,char * C,int flags,int length)118 parse_field_spec(field_t *F, char *C, int flags, int length)
119 {
120 int p_period = MIN(length, strcspn(C, "."));
121 int p_modifiers = MIN(length, strspn(C, ".1234567890"));
122 int p_boundary = MIN(p_period, p_modifiers);
123 int field = 0;
124 int offset = 0;
125 int offset_seen = 0;
126 int i;
127 int blanks_flag = 0;
128
129 for (i = 0; i < p_boundary; i++) {
130 if (isdigit((uchar_t)C[i]))
131 field = (10 * field) + (C[i] - '0');
132 else
133 return (1);
134 }
135
136 if (p_period < p_modifiers) {
137 for (i = p_period + 1; i < p_modifiers; i++) {
138 if (isdigit((uchar_t)C[i])) {
139 offset_seen++;
140 offset = (10 * offset) + (C[i] - '0');
141 } else {
142 return (1);
143 }
144 }
145 }
146
147 if (p_modifiers < length) {
148 for (i = p_modifiers; i < length; i++) {
149 switch (C[i]) {
150 case 'b':
151 blanks_flag = 1;
152 break;
153 case 'd':
154 F->f_options |= FIELD_DICTIONARY_ORDER;
155 break;
156 case 'f':
157 F->f_options |= FIELD_FOLD_UPPERCASE;
158 break;
159 case 'i':
160 F->f_options |=
161 FIELD_IGNORE_NONPRINTABLES;
162 break;
163 case 'M':
164 F->f_species = MONTH;
165 break;
166 case 'n':
167 F->f_species = NUMERIC;
168 break;
169 case 'r':
170 F->f_options |=
171 FIELD_REVERSE_COMPARISONS;
172 break;
173 default:
174 usage();
175 break;
176 }
177 }
178 }
179
180 if (flags & OPTIONS_STARTSPEC) {
181 F->f_start_field = field;
182 F->f_start_offset = offset;
183 if ((flags & OPTIONS_OLDSPEC) != OPTIONS_OLDSPEC) {
184 F->f_start_field--;
185 if (offset_seen)
186 F->f_start_offset--;
187 }
188 F->f_options |= blanks_flag ? FIELD_IGNORE_BLANKS_START : 0;
189 } else {
190 F->f_end_field = field;
191 F->f_end_offset = offset;
192 if ((flags & OPTIONS_OLDSPEC) != OPTIONS_OLDSPEC &&
193 offset_seen && offset != 0)
194 F->f_end_field--;
195 F->f_options |= blanks_flag ? FIELD_IGNORE_BLANKS_END : 0;
196 }
197
198 return (0);
199 }
200
201 static void
parse_new_field_spec(sort_t * S,char * arg)202 parse_new_field_spec(sort_t *S, char *arg)
203 {
204 int length = strlen(arg);
205 int p_comma = MIN(length, strcspn(arg, ","));
206 field_t *nF;
207 int p;
208
209 /*
210 * New field specifiers do not inherit from the general specifier if
211 * they have any modifiers set. (This is specifically tested in the VSC
212 * test suite, assertion 32 for POSIX.cmd/sort.)
213 */
214 if (field_spec_has_modifiers(arg, length)) {
215 nF = field_new(NULL);
216 nF->f_options = FIELD_MODIFIERS_DEFINED;
217 } else {
218 nF = field_new(S);
219 }
220 p = parse_field_spec(nF, arg, OPTIONS_STARTSPEC, p_comma);
221
222 if (p != 0)
223 usage();
224
225 if (p_comma < length) {
226 p = parse_field_spec(nF, &(arg[p_comma + 1]), 0,
227 strlen(&(arg[p_comma + 1])));
228 if (p != 0)
229 usage();
230 }
231
232 if (nF->f_start_field < 0 || nF->f_start_offset < 0) {
233 if (S->m_verbose)
234 warn("-k %s is not a supported field specifier\n", arg);
235 }
236 nF->f_start_field = MAX(nF->f_start_field, 0);
237 nF->f_start_offset = MAX(nF->f_start_offset, 0);
238
239 /*
240 * If the starting field exceeds a defined ending field, convention
241 * dictates that the field is ignored.
242 */
243 if (nF->f_end_field == -1 || nF->f_start_field < nF->f_end_field ||
244 (nF->f_start_field == nF->f_end_field &&
245 nF->f_start_offset < nF->f_end_offset)) {
246 field_add_to_chain(&(S->m_fields_head), nF);
247 } else if (S->m_verbose) {
248 warn("illegal field -k %s omitted", arg);
249 }
250 }
251
252 /*
253 * parse_old_field_spec() is getopt()-aware; it may modify the values of optind,
254 * optarg, and so forth, to correctly determine the characteristics being
255 * assigned to the current field.
256 */
257 static int
parse_old_field_spec(sort_t * S,int argc,char * argv[])258 parse_old_field_spec(sort_t *S, int argc, char *argv[])
259 {
260 field_t *nF;
261 int c, p;
262 char *arg = argv[optind];
263
264 if (field_spec_has_modifiers(arg + 1, strlen(arg + 1))) {
265 nF = field_new(NULL);
266 nF->f_options = FIELD_MODIFIERS_DEFINED;
267 } else {
268 nF = field_new(S);
269 }
270
271 p = parse_field_spec(nF, arg + 1, OPTIONS_OLDSPEC | OPTIONS_STARTSPEC,
272 strlen(arg + 1));
273
274 if (p != 0) {
275 field_delete(nF);
276 return (0);
277 }
278
279 /*
280 * In the case that getopt() returns '?' (unrecognized option) or EOF
281 * (non-option argument), the field is considered closed.
282 */
283 for (arg = argv[++optind]; optind < argc; arg = argv[optind]) {
284 if (strlen(arg) >= 2 && *arg == '-' &&
285 isdigit(*(uchar_t *)(arg + 1))) {
286 (void) parse_field_spec(nF, arg + 1,
287 OPTIONS_OLDSPEC, strlen(arg) - 1);
288 field_add_to_chain(&(S->m_fields_head), nF);
289 optind++;
290 return (1);
291 }
292
293 if ((c = getopt(argc, argv, OLD_SPEC_OPTIONS_STRING)) != EOF) {
294 switch (c) {
295 case 'b':
296 nF->f_options |= FIELD_IGNORE_BLANKS_START;
297 break;
298 case 'd':
299 nF->f_options |= FIELD_DICTIONARY_ORDER;
300 break;
301 case 'f':
302 nF->f_options |= FIELD_FOLD_UPPERCASE;
303 break;
304 case 'i':
305 nF->f_options |= FIELD_IGNORE_NONPRINTABLES;
306 break;
307 case 'M':
308 nF->f_species = MONTH;
309 break;
310 case 'n':
311 nF->f_species = NUMERIC;
312 break;
313 case 'r':
314 nF->f_options |= FIELD_REVERSE_COMPARISONS;
315 break;
316 case '?':
317 case 'c':
318 case 'm':
319 case 'u':
320 /*
321 * Options without arguments.
322 */
323 optind -= 1;
324 field_add_to_chain(&(S->m_fields_head), nF);
325 return (1);
326 /*NOTREACHED*/
327 case 'o':
328 case 'T':
329 case 'z':
330 case 't':
331 case 'k':
332 case 'S':
333 /*
334 * Options with arguments.
335 */
336 if (optarg == argv[optind - 1] + 2) {
337 optind -= 1;
338 } else {
339 optind -= 2;
340 }
341 field_add_to_chain(&(S->m_fields_head), nF);
342 return (1);
343 /*NOTREACHED*/
344 default:
345 die(EMSG_UNKN_OPTION);
346 /*NOTREACHED*/
347 }
348 } else {
349 break;
350 }
351 }
352
353 field_add_to_chain(&(S->m_fields_head), nF);
354 return (1);
355 }
356
357 int
options(sort_t * S,int argc,char * argv[])358 options(sort_t *S, int argc, char *argv[])
359 {
360 int c;
361
362 optind = 1;
363 while (optind < argc) {
364 if (strncmp("-y", argv[optind], strlen("-y")) == 0) {
365 /*
366 * The -y [kmem] option violates the standard syntax
367 * outlined in intro(1). we have to be a little fancy
368 * to determine if the next argument is a valid integer.
369 * (note, of course, that the previous sort(1) had no
370 * mechanism to resolve a final
371 * -y 99999
372 * into
373 * -y, file 99999
374 * or
375 * -y 99999, file stdin
376 *
377 * Now one can unambiguously use
378 * -y -- 99999
379 * and
380 * -y 99999 -
381 * to distinguish these cases.
382 *
383 * That said, we do not use the information passed using
384 * -y option in sort(1); we provide the argument to
385 * preserve compatibility for existing scripts.
386 */
387 if (strlen(argv[optind]) == strlen("-y") &&
388 optind + 1 < argc &&
389 is_number(argv[optind + 1]))
390 optind += 2;
391 else
392 optind += 1;
393 }
394
395 if ((c = getopt(argc, argv, OPTIONS_STRING)) != EOF) {
396 switch (c) {
397 case 'c':
398 S->m_check_if_sorted_only = 1;
399 break;
400
401 case 'm':
402 S->m_merge_only = 1;
403 break;
404
405 case 'u':
406 S->m_unique_lines = 1;
407 break;
408
409 case 'o':
410 S->m_output_filename = optarg;
411 break;
412
413 case 'T':
414 S->m_tmpdir_template = optarg;
415 break;
416
417 case 'z':
418 /*
419 * ignore optarg -- obsolete
420 */
421 break;
422
423 case 'd':
424 S->m_field_options |= FIELD_DICTIONARY_ORDER;
425 field_apply_all(S->m_fields_head,
426 FIELD_DICTIONARY_ORDER);
427 break;
428
429 case 'f':
430 S->m_field_options |= FIELD_FOLD_UPPERCASE;
431 field_apply_all(S->m_fields_head,
432 FIELD_FOLD_UPPERCASE);
433 break;
434
435 case 'i':
436 S->m_field_options |=
437 FIELD_IGNORE_NONPRINTABLES;
438 field_apply_all(S->m_fields_head,
439 FIELD_IGNORE_NONPRINTABLES);
440 break;
441
442 case 'M':
443 S->m_default_species = MONTH;
444 S->m_field_options &=
445 ~FIELD_IGNORE_BLANKS_START;
446 break;
447
448 case 'n':
449 S->m_default_species = NUMERIC;
450 {
451 field_t *f;
452
453 for (f = S->m_fields_head; f;
454 f = f->f_next)
455 if ((f->f_options &
456 FIELD_MODIFIERS_DEFINED) ==
457 0)
458 f->f_species = NUMERIC;
459 }
460 break;
461
462 case 'b':
463 S->m_field_options |=
464 FIELD_IGNORE_BLANKS_START |
465 FIELD_IGNORE_BLANKS_END;
466 break;
467
468 case 'r':
469 S->m_field_options |=
470 FIELD_REVERSE_COMPARISONS;
471 field_apply_all(S->m_fields_head,
472 FIELD_REVERSE_COMPARISONS);
473 break;
474
475 case 't':
476 /*
477 * delimiter
478 */
479 if (S->m_single_byte_locale) {
480 /*
481 * Most debuggers can't take tabs as
482 * input arguments, so we provide an
483 * escape sequence to allow testing of
484 * this special case for the DEBUG
485 * version.
486 */
487 S->m_field_separator.sc =
488 #ifdef DEBUG
489 xstreql(optarg, "\\t") ? '\t' :
490 #endif
491 optarg[0];
492 } else
493 (void) mbtowc(&S->m_field_separator.wc,
494 optarg, MB_CUR_MAX);
495 break;
496
497 case 'k':
498 /*
499 * key
500 */
501 (void) parse_new_field_spec(S, optarg);
502 break;
503
504 case 'S':
505 S->m_memory_limit = strtomem(optarg);
506 #ifdef DEBUG
507 (void) fprintf(stderr, CMDNAME
508 ": limiting size to %d bytes\n",
509 S->m_memory_limit);
510 #endif /* DEBUG */
511 break;
512
513 /*
514 * We never take a naked -999; these should always be
515 * associated with a preceding +000.
516 */
517 case '0':
518 case '1':
519 case '2':
520 case '3':
521 case '4':
522 case '5':
523 case '6':
524 case '7':
525 case '8':
526 case '9':
527 usage();
528 break;
529 case '?':
530 /* error case */
531 usage();
532 break;
533 }
534
535 /*
536 * Go back for next argument.
537 */
538 continue;
539 }
540
541 /*
542 * There are three (interpretable) possibilities for getopt() to
543 * return EOF with arguments on the command line: we have seen
544 * the "end-of-options" token, --, we have encountered the
545 * old-style field definition, +NNN, or we have found a
546 * filename.
547 *
548 * In the second case, we must also search for the optional -NNN
549 * field terminal definition. (since "+joe", for instance, is
550 * a valid filename, we must handle this pattern as well.) This
551 * is performed by parse_old_field_spec().
552 */
553 if (xstreql(argv[optind - 1], "--")) {
554 /*
555 * Process all arguments following end-of-options token
556 * as filenames.
557 */
558 while (optind < argc) {
559 if (xstreql(argv[optind], "-"))
560 S->m_input_from_stdin = 1;
561 else
562 stream_add_file_to_chain(
563 &(S->m_input_streams),
564 argv[optind]);
565 optind++;
566 }
567
568 break;
569 }
570
571 if (optind < argc) {
572 if (xstreql(argv[optind], "-")) {
573 S->m_input_from_stdin = 1;
574 optind++;
575 } else if (*(argv[optind]) != '+' ||
576 !parse_old_field_spec(S, argc, argv)) {
577 /*
578 * It's a filename, because it either doesn't
579 * start with '+', or if it did, it wasn't an
580 * actual field specifier.
581 */
582 stream_add_file_to_chain(&(S->m_input_streams),
583 argv[optind]);
584 optind++;
585 }
586 }
587 }
588
589 if (S->m_input_streams == NULL)
590 S->m_input_from_stdin = 1;
591
592 if (S->m_output_filename == NULL)
593 S->m_output_to_stdout = 1;
594
595 /*
596 * If no fields, then one great field. However, if the -b option was
597 * set globally, be sure to ignore it, as per UNIX98.
598 */
599 if (S->m_fields_head == NULL) {
600 S->m_field_options &= ~FIELD_IGNORE_BLANKS_START;
601
602 (void) parse_new_field_spec(S, "1");
603 /*
604 * "Entire line" fast path is only valid if no delimiter has
605 * been set and no modifiers have been applied.
606 */
607 if (S->m_field_separator.wc == 0 &&
608 S->m_default_species == ALPHA &&
609 S->m_field_options == 0)
610 S->m_entire_line = 1;
611 }
612
613 return (0);
614 }
615