1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License"). You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22 /*
23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 #pragma ident "%Z%%M% %I% %E% SMI"
28
29 #include "options.h"
30
31 /*
32 * options
33 *
34 * Overview
35 * sort(1) supports two methods for specifying the sort key: the original,
36 * now-obsolete, +n -m form and the POSIX -k n,m form. We refer to the former
37 * as "old specifiers" and the latter as "new specifiers". The options()
38 * function parses the command line arguments given to sort, placing the sort
39 * key specifiers in the internal representation used in fields.c.
40 *
41 * Equivalence of specifiers
42 * One of sort(1)'s standard peculiarities is the transformation of the
43 * character offsets and field numbering between the new and old style field
44 * specifications. We simply quote from the Single Unix standard:
45 *
46 * +w.xT -y.zU
47 *
48 * is equivalent to
49 *
50 * undefined when z == 0, U contains b, and -t is set
51 * -k w+1.x+1T,y.0U when z == 0 otherwise
52 * -k w+1.x+1T,y+1.zU when z > 0
53 *
54 * Undoubtedly, this seemed logical at the time. (Using only the field head
55 * as the coordinate, as done in the obsolete version, seems much simpler.)
56 * The reverse map is where the key specifier
57 *
58 * -k w.xT,y.zU
59 *
60 * is equivalent to
61 *
62 * undefined when z == 0, U contains b, and -t is set
 *     +w-1.x-1T -y.0U          when z == 0 otherwise
 *     +w-1.x-1T -y-1.zU        when z > 0
65 *
66 * in the obsolete syntax. Because the original key specifiers lead to a
67 * simpler implementation, the internal representation of a field in this
68 * implementation of sort is mostly that given by the obsolete syntax.
69 */
70
/*
 * While a key specifier in the obsolete +m ... -n form is being defined (that
 * is, before the closing -n is seen), a narrower set of options is permitted.
 * We specify this smaller set of options in OLD_SPEC_OPTIONS_STRING.
 *
 * Both strings are getopt(3C) option strings: a letter followed by ':' takes
 * an argument.  OPTIONS_STRING additionally lists the digits 0-9 so that
 * getopt() hands a naked -NNN back to options(), which rejects it via usage().
 * OLD_SPEC_OPTIONS_STRING omits the digits because parse_old_field_spec()
 * checks for a closing -NNN itself before it calls getopt().
 */
#define	OPTIONS_STRING		"cmuo:T:z:dfiMnrbt:k:S:0123456789"
#define	OLD_SPEC_OPTIONS_STRING	"bdfiMnrcmuo:T:z:t:k:S:"

/*
 * Bits for the flags argument of parse_field_spec().  A clear bit selects the
 * alternative named in the trailing comment.
 */
#define	OPTIONS_OLDSPEC		0x1	/* else new-style spec */
#define	OPTIONS_STARTSPEC	0x2	/* else end spec */
81
/*
 * is_number() reports whether the NUL-terminated string C consists solely of
 * decimal digits.
 *
 * Returns 1 if no character of C is a non-digit, and 0 as soon as a non-digit
 * is found.  Note that the empty string therefore yields 1, and that a sign
 * character ('-' or '+') makes the string a non-number.
 */
static int
is_number(char *C)
{
	const char *p;

	/*
	 * Walk straight to the terminator.  Calling strlen() in the loop
	 * condition would rescan the whole string on every iteration.
	 */
	for (p = C; *p != '\0'; p++) {
		/* <ctype.h> functions require an unsigned char value. */
		if (!isdigit((unsigned char)*p))
			return (0);
	}

	return (1);
}
93
/*
 * A key given with -k or with the +n syntax that carries any modifier letters
 * does not inherit the current global field modifiers.  This predicate tells
 * the caller which case applies.
 *
 * C is the specifier text and length the number of characters the caller
 * considers part of it.  The leading run of C made up only of digits, '.' and
 * ',' is measured; if that run is exactly length characters long the
 * specifier is purely positional and 0 is returned, otherwise 1.
 */
static int
field_spec_has_modifiers(char *C, int length)
{
	int positional_run = strspn(C, ",.1234567890");

	return (positional_run == length ? 0 : 1);
}
108
109 static void
field_apply_all(field_t * fc,flag_t flags)110 field_apply_all(field_t *fc, flag_t flags)
111 {
112 field_t *f;
113
114 for (f = fc; f; f = f->f_next)
115 if ((f->f_options & FIELD_MODIFIERS_DEFINED) == 0)
116 f->f_options |= flags;
117 }
118
119 static int
parse_field_spec(field_t * F,char * C,int flags,int length)120 parse_field_spec(field_t *F, char *C, int flags, int length)
121 {
122 int p_period = MIN(length, strcspn(C, "."));
123 int p_modifiers = MIN(length, strspn(C, ".1234567890"));
124 int p_boundary = MIN(p_period, p_modifiers);
125 int field = 0;
126 int offset = 0;
127 int offset_seen = 0;
128 int i;
129 int blanks_flag = 0;
130
131 for (i = 0; i < p_boundary; i++) {
132 if (isdigit((uchar_t)C[i]))
133 field = (10 * field) + (C[i] - '0');
134 else
135 return (1);
136 }
137
138 if (p_period < p_modifiers) {
139 for (i = p_period + 1; i < p_modifiers; i++) {
140 if (isdigit((uchar_t)C[i])) {
141 offset_seen++;
142 offset = (10 * offset) + (C[i] - '0');
143 } else {
144 return (1);
145 }
146 }
147 }
148
149 if (p_modifiers < length) {
150 for (i = p_modifiers; i < length; i++) {
151 switch (C[i]) {
152 case 'b':
153 blanks_flag = 1;
154 break;
155 case 'd':
156 F->f_options |= FIELD_DICTIONARY_ORDER;
157 break;
158 case 'f':
159 F->f_options |= FIELD_FOLD_UPPERCASE;
160 break;
161 case 'i':
162 F->f_options |=
163 FIELD_IGNORE_NONPRINTABLES;
164 break;
165 case 'M':
166 F->f_species = MONTH;
167 break;
168 case 'n':
169 F->f_species = NUMERIC;
170 break;
171 case 'r':
172 F->f_options |=
173 FIELD_REVERSE_COMPARISONS;
174 break;
175 default:
176 usage();
177 break;
178 }
179 }
180 }
181
182 if (flags & OPTIONS_STARTSPEC) {
183 F->f_start_field = field;
184 F->f_start_offset = offset;
185 if ((flags & OPTIONS_OLDSPEC) != OPTIONS_OLDSPEC) {
186 F->f_start_field--;
187 if (offset_seen)
188 F->f_start_offset--;
189 }
190 F->f_options |= blanks_flag ? FIELD_IGNORE_BLANKS_START : 0;
191 } else {
192 F->f_end_field = field;
193 F->f_end_offset = offset;
194 if ((flags & OPTIONS_OLDSPEC) != OPTIONS_OLDSPEC &&
195 offset_seen && offset != 0)
196 F->f_end_field--;
197 F->f_options |= blanks_flag ? FIELD_IGNORE_BLANKS_END : 0;
198 }
199
200 return (0);
201 }
202
203 static void
parse_new_field_spec(sort_t * S,char * arg)204 parse_new_field_spec(sort_t *S, char *arg)
205 {
206 int length = strlen(arg);
207 int p_comma = MIN(length, strcspn(arg, ","));
208 field_t *nF;
209 int p;
210
211 /*
212 * New field specifiers do not inherit from the general specifier if
213 * they have any modifiers set. (This is specifically tested in the VSC
214 * test suite, assertion 32 for POSIX.cmd/sort.)
215 */
216 if (field_spec_has_modifiers(arg, length)) {
217 nF = field_new(NULL);
218 nF->f_options = FIELD_MODIFIERS_DEFINED;
219 } else {
220 nF = field_new(S);
221 }
222 p = parse_field_spec(nF, arg, OPTIONS_STARTSPEC, p_comma);
223
224 if (p != 0)
225 usage();
226
227 if (p_comma < length) {
228 p = parse_field_spec(nF, &(arg[p_comma + 1]), 0,
229 strlen(&(arg[p_comma + 1])));
230 if (p != 0)
231 usage();
232 }
233
234 if (nF->f_start_field < 0 || nF->f_start_offset < 0) {
235 if (S->m_verbose)
236 warn("-k %s is not a supported field specifier\n", arg);
237 }
238 nF->f_start_field = MAX(nF->f_start_field, 0);
239 nF->f_start_offset = MAX(nF->f_start_offset, 0);
240
241 /*
242 * If the starting field exceeds a defined ending field, convention
243 * dictates that the field is ignored.
244 */
245 if (nF->f_end_field == -1 || nF->f_start_field < nF->f_end_field ||
246 (nF->f_start_field == nF->f_end_field &&
247 nF->f_start_offset < nF->f_end_offset)) {
248 field_add_to_chain(&(S->m_fields_head), nF);
249 } else if (S->m_verbose) {
250 warn("illegal field -k %s omitted", arg);
251 }
252 }
253
/*
 * parse_old_field_spec() is getopt()-aware; it may modify the values of optind,
 * optarg, and so forth, to correctly determine the characteristics being
 * assigned to the current field.
 *
 * On entry argv[optind] is an argument beginning with '+'.  The text after
 * the '+' is parsed as the start of an obsolete-form key.  If it does not
 * parse, the tentative field is deleted and 0 is returned with optind
 * unchanged, so that the caller can treat the argument as a filename.
 *
 * Otherwise the following arguments are examined until the key is closed,
 * the field is appended to S->m_fields_head, and 1 is returned.  On return
 * optind indexes the first argument that the caller still has to process.
 */
static int
parse_old_field_spec(sort_t *S, int argc, char *argv[])
{
	field_t *nF;
	int c, p;
	char *arg = argv[optind];

	/*
	 * As with -k, a specifier carrying its own modifier letters does not
	 * inherit from S; it is created with field_new(NULL) and flagged.
	 */
	if (field_spec_has_modifiers(arg + 1, strlen(arg + 1))) {
		nF = field_new(NULL);
		nF->f_options = FIELD_MODIFIERS_DEFINED;
	} else {
		nF = field_new(S);
	}

	p = parse_field_spec(nF, arg + 1, OPTIONS_OLDSPEC | OPTIONS_STARTSPEC,
	    strlen(arg + 1));

	if (p != 0) {
		/* Not a field specifier after all (e.g. a file named +joe). */
		field_delete(nF);
		return (0);
	}

	/*
	 * In the case that getopt() returns '?' (unrecognized option) or EOF
	 * (non-option argument), the field is considered closed.
	 */
	for (arg = argv[++optind]; optind < argc; arg = argv[optind]) {
		/*
		 * A '-' followed by a digit is the closing -NNN of this key.
		 * The return value of parse_field_spec() is ignored here.
		 */
		if (strlen(arg) >= 2 && *arg == '-' &&
		    isdigit(*(uchar_t *)(arg + 1))) {
			(void) parse_field_spec(nF, arg + 1,
			    OPTIONS_OLDSPEC, strlen(arg) - 1);
			field_add_to_chain(&(S->m_fields_head), nF);
			optind++;
			return (1);
		}

		if ((c = getopt(argc, argv, OLD_SPEC_OPTIONS_STRING)) != EOF) {
			switch (c) {
			/*
			 * Modifier letters seen before the closing -NNN are
			 * applied to this field only; the loop then continues.
			 */
			case 'b':
				nF->f_options |= FIELD_IGNORE_BLANKS_START;
				break;
			case 'd':
				nF->f_options |= FIELD_DICTIONARY_ORDER;
				break;
			case 'f':
				nF->f_options |= FIELD_FOLD_UPPERCASE;
				break;
			case 'i':
				nF->f_options |= FIELD_IGNORE_NONPRINTABLES;
				break;
			case 'M':
				nF->f_species = MONTH;
				break;
			case 'n':
				nF->f_species = NUMERIC;
				break;
			case 'r':
				nF->f_options |= FIELD_REVERSE_COMPARISONS;
				break;
			case '?':
			case 'c':
			case 'm':
			case 'u':
				/*
				 * Options without arguments.
				 *
				 * These close the field.  optind is stepped
				 * back by one so that the caller sees the
				 * option again.
				 */
				optind -= 1;
				field_add_to_chain(&(S->m_fields_head), nF);
				return (1);
				/*NOTREACHED*/
			case 'o':
			case 'T':
			case 'z':
			case 't':
			case 'k':
			case 'S':
				/*
				 * Options with arguments.
				 *
				 * These also close the field.  If optarg points
				 * just past the two-character option in the
				 * previous argv element, the argument was
				 * attached (as in -ofile) and one element is
				 * given back; otherwise the argument was a
				 * separate element and two are given back.
				 */
				if (optarg == argv[optind - 1] + 2) {
					optind -= 1;
				} else {
					optind -= 2;
				}
				field_add_to_chain(&(S->m_fields_head), nF);
				return (1);
				/*NOTREACHED*/
			default:
				die(EMSG_UNKN_OPTION);
				/*NOTREACHED*/
			}
		} else {
			/* Non-option argument: the key has no closing -NNN. */
			break;
		}
	}

	/* Reached at end of arguments or on EOF from getopt(). */
	field_add_to_chain(&(S->m_fields_head), nF);
	return (1);
}
358
/*
 * options() parses the sort(1) command line in argv[1..argc-1] and records the
 * result in S: global flags and modifiers, the chain of key fields
 * (S->m_fields_head), the input files (S->m_input_streams) and the output
 * file name.
 *
 * Options, obsolete +NNN/-NNN keys and filenames may be interleaved, so the
 * function drives getopt() itself and handles each case where getopt()
 * returns EOF.  Invalid usage is reported through usage().
 *
 * Always returns 0.
 */
int
options(sort_t *S, int argc, char *argv[])
{
	int c;

	optind = 1;
	while (optind < argc) {
		if (strncmp("-y", argv[optind], strlen("-y")) == 0) {
			/*
			 * The -y [kmem] option violates the standard syntax
			 * outlined in intro(1).  We have to be a little fancy
			 * to determine if the next argument is a valid integer.
			 * (Note, of course, that the previous sort(1) had no
			 * mechanism to resolve a final
			 *	-y 99999
			 * into
			 *	-y, file 99999
			 * or
			 *	-y 99999, file stdin.)
			 *
			 * Now one can unambiguously use
			 *	-y -- 99999
			 * and
			 *	-y 99999 -
			 * to distinguish these cases.
			 *
			 * That said, we do not use the information passed using
			 * -y option in sort(1); we provide the argument to
			 * preserve compatibility for existing scripts.
			 */
			if (strlen(argv[optind]) == strlen("-y") &&
			    optind + 1 < argc &&
			    is_number(argv[optind + 1]))
				optind += 2;
			else
				optind += 1;
		}

		if ((c = getopt(argc, argv, OPTIONS_STRING)) != EOF) {
			switch (c) {
			case 'c':
				S->m_check_if_sorted_only = 1;
				break;

			case 'm':
				S->m_merge_only = 1;
				break;

			case 'u':
				S->m_unique_lines = 1;
				break;

			case 'o':
				S->m_output_filename = optarg;
				break;

			case 'T':
				S->m_tmpdir_template = optarg;
				break;

			case 'z':
				/*
				 * ignore optarg -- obsolete
				 */
				break;

			/*
			 * The global modifiers d, f, i and r are recorded in
			 * S->m_field_options and also applied to every field
			 * already on the chain that has no modifiers of its
			 * own.
			 */
			case 'd':
				S->m_field_options |= FIELD_DICTIONARY_ORDER;
				field_apply_all(S->m_fields_head,
				    FIELD_DICTIONARY_ORDER);
				break;

			case 'f':
				S->m_field_options |= FIELD_FOLD_UPPERCASE;
				field_apply_all(S->m_fields_head,
				    FIELD_FOLD_UPPERCASE);
				break;

			case 'i':
				S->m_field_options |=
				    FIELD_IGNORE_NONPRINTABLES;
				field_apply_all(S->m_fields_head,
				    FIELD_IGNORE_NONPRINTABLES);
				break;

			case 'M':
				/*
				 * Month ordering becomes the default species;
				 * the global "ignore leading blanks" bit is
				 * cleared.  Existing fields are not touched.
				 */
				S->m_default_species = MONTH;
				S->m_field_options &=
				    ~FIELD_IGNORE_BLANKS_START;
				break;

			case 'n':
				/*
				 * Numeric ordering becomes the default species
				 * and is applied to every existing field
				 * without modifiers of its own.
				 */
				S->m_default_species = NUMERIC;
				{
					field_t *f;

					for (f = S->m_fields_head; f;
					    f = f->f_next)
						if ((f->f_options &
						    FIELD_MODIFIERS_DEFINED) ==
						    0)
							f->f_species = NUMERIC;
				}
				break;

			case 'b':
				/*
				 * Only the global options are updated here;
				 * unlike d, f, i and r, fields already on the
				 * chain are left as they are.
				 */
				S->m_field_options |=
				    FIELD_IGNORE_BLANKS_START |
				    FIELD_IGNORE_BLANKS_END;
				break;

			case 'r':
				S->m_field_options |=
				    FIELD_REVERSE_COMPARISONS;
				field_apply_all(S->m_fields_head,
				    FIELD_REVERSE_COMPARISONS);
				break;

			case 't':
				/*
				 * delimiter
				 *
				 * In a single-byte locale the first byte of
				 * optarg is the separator; otherwise the first
				 * multibyte character is converted with
				 * mbtowc(), whose return value is not checked.
				 */
				if (S->m_single_byte_locale) {
					/*
					 * Most debuggers can't take tabs as
					 * input arguments, so we provide an
					 * escape sequence to allow testing of
					 * this special case for the DEBUG
					 * version.
					 */
					S->m_field_separator.sc =
#ifdef DEBUG
					    xstreql(optarg, "\\t") ? '\t' :
#endif
					    optarg[0];
				} else
					(void) mbtowc(&S->m_field_separator.wc,
					    optarg, MB_CUR_MAX);
				break;

			case 'k':
				/*
				 * key
				 */
				(void) parse_new_field_spec(S, optarg);
				break;

			case 'S':
				S->m_memory_limit = strtomem(optarg);
#ifdef DEBUG
				(void) fprintf(stderr, CMDNAME
				    ": limiting size to %d bytes\n",
				    S->m_memory_limit);
#endif /* DEBUG */
				break;

			/*
			 * We never take a naked -999; these should always be
			 * associated with a preceding +000.
			 */
			case '0':
			case '1':
			case '2':
			case '3':
			case '4':
			case '5':
			case '6':
			case '7':
			case '8':
			case '9':
				usage();
				break;
			case '?':
				/* error case */
				usage();
				break;
			}

			/*
			 * Go back for next argument.
			 */
			continue;
		}

		/*
		 * There are three (interpretable) possibilities for getopt() to
		 * return EOF with arguments on the command line: we have seen
		 * the "end-of-options" token, --, we have encountered the
		 * old-style field definition, +NNN, or we have found a
		 * filename.
		 *
		 * In the second case, we must also search for the optional -NNN
		 * field terminal definition.  (Since "+joe", for instance, is
		 * a valid filename, we must handle this pattern as well.)  This
		 * is performed by parse_old_field_spec().
		 */
		if (xstreql(argv[optind - 1], "--")) {
			/*
			 * Process all arguments following end-of-options token
			 * as filenames.
			 */
			while (optind < argc) {
				if (xstreql(argv[optind], "-"))
					S->m_input_from_stdin = 1;
				else
					stream_add_file_to_chain(
					    &(S->m_input_streams),
					    argv[optind]);
				optind++;
			}

			break;
		}

		if (optind < argc) {
			if (xstreql(argv[optind], "-")) {
				/* A lone "-" names standard input. */
				S->m_input_from_stdin = 1;
				optind++;
			} else if (*(argv[optind]) != '+' ||
			    !parse_old_field_spec(S, argc, argv)) {
				/*
				 * It's a filename, because it either doesn't
				 * start with '+', or if it did, it wasn't an
				 * actual field specifier.
				 */
				stream_add_file_to_chain(&(S->m_input_streams),
				    argv[optind]);
				optind++;
			}
			/*
			 * Otherwise parse_old_field_spec() consumed the key
			 * and has already advanced optind.
			 */
		}
	}

	/* With no input files named, read standard input. */
	if (S->m_input_streams == NULL)
		S->m_input_from_stdin = 1;

	/* With no -o given, write standard output. */
	if (S->m_output_filename == NULL)
		S->m_output_to_stdout = 1;

	/*
	 * If no fields, then one great field.  However, if the -b option was
	 * set globally, be sure to ignore it, as per UNIX98.
	 */
	if (S->m_fields_head == NULL) {
		S->m_field_options &= ~FIELD_IGNORE_BLANKS_START;

		(void) parse_new_field_spec(S, "1");
		/*
		 * "Entire line" fast path is only valid if no delimiter has
		 * been set and no modifiers have been applied.
		 */
		if (S->m_field_separator.wc == 0 &&
		    S->m_default_species == ALPHA &&
		    S->m_field_options == 0)
			S->m_entire_line = 1;
	}

	return (0);
}
617