1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include "options.h" 30 31 /* 32 * options 33 * 34 * Overview 35 * sort(1) supports two methods for specifying the sort key: the original, 36 * now-obsolete, +n -m form and the POSIX -k n,m form. We refer to the former 37 * as "old specifiers" and the latter as "new specifiers". The options() 38 * function parses the command line arguments given to sort, placing the sort 39 * key specifiers in the internal representation used in fields.c. 40 * 41 * Equivalence of specifiers 42 * One of sort(1)'s standard peculiarities is the transformation of the 43 * character offsets and field numbering between the new and old style field 44 * specifications. We simply quote from the Single Unix standard: 45 * 46 * +w.xT -y.zU 47 * 48 * is equivalent to 49 * 50 * undefined when z == 0, U contains b, and -t is set 51 * -k w+1.x+1T,y.0U when z == 0 otherwise 52 * -k w+1.x+1T,y+1.zU when z > 0 53 * 54 * Undoubtedly, this seemed logical at the time. (Using only the field head 55 * as the coordinate, as done in the obsolete version, seems much simpler.) 56 * The reverse map is where the key specifier 57 * 58 * -k w.xT,y.zU 59 * 60 * is equivalent to 61 * 62 * undefined when z == 0, U contains b, and -t is set 63 * +w-1.x-1T,y.0U when z == 0 otherwise 64 * +w-1.x-1T,y-1.z when z > 0 65 * 66 * in the obsolete syntax. Because the original key specifiers lead to a 67 * simpler implementation, the internal representation of a field in this 68 * implementation of sort is mostly that given by the obsolete syntax. 69 */ 70 71 /* 72 * While a key specifier in the obsolete +m ... -n form is being defined (that 73 * is, before the closing -n is seen), a narrower set of options is permitted. 74 * We specify this smaller set of options in OLD_SPEC_OPTIONS_STRING. 75 */ 76 #define OPTIONS_STRING "cmuo:T:z:dfiMnrbt:k:S:0123456789" 77 #define OLD_SPEC_OPTIONS_STRING "bdfiMnrcmuo:T:z:t:k:S:" 78 79 #define OPTIONS_OLDSPEC 0x1 /* else new-style spec */ 80 #define OPTIONS_STARTSPEC 0x2 /* else end spec */ 81 82 static int 83 is_number(char *C) 84 { 85 size_t i; 86 87 for (i = 0; i < strlen(C); i++) 88 if (!isdigit((uchar_t)C[i])) 89 return (0); 90 91 return (1); 92 } 93 94 /* 95 * If a field specified by the -k option or by the +n syntax contains any 96 * modifiers, then the current global field modifiers are not inherited. 97 */ 98 static int 99 field_spec_has_modifiers(char *C, int length) 100 { 101 int p_nonmodifiers = strspn(C, ",.1234567890"); 102 103 if (p_nonmodifiers == length) 104 return (0); 105 106 return (1); 107 } 108 109 static void 110 field_apply_all(field_t *fc, flag_t flags) 111 { 112 field_t *f; 113 114 for (f = fc; f; f = f->f_next) 115 if ((f->f_options & FIELD_MODIFIERS_DEFINED) == 0) 116 f->f_options |= flags; 117 } 118 119 static int 120 parse_field_spec(field_t *F, char *C, int flags, int length) 121 { 122 int p_period = MIN(length, strcspn(C, ".")); 123 int p_modifiers = MIN(length, strspn(C, ".1234567890")); 124 int p_boundary = MIN(p_period, p_modifiers); 125 int field = 0; 126 int offset = 0; 127 int offset_seen = 0; 128 int i; 129 int blanks_flag = 0; 130 131 for (i = 0; i < p_boundary; i++) { 132 if (isdigit((uchar_t)C[i])) 133 field = (10 * field) + (C[i] - '0'); 134 else 135 return (1); 136 } 137 138 if (p_period < p_modifiers) { 139 for (i = p_period + 1; i < p_modifiers; i++) { 140 if (isdigit((uchar_t)C[i])) { 141 offset_seen++; 142 offset = (10 * offset) + (C[i] - '0'); 143 } else { 144 return (1); 145 } 146 } 147 } 148 149 if (p_modifiers < length) { 150 for (i = p_modifiers; i < length; i++) { 151 switch (C[i]) { 152 case 'b': 153 blanks_flag = 1; 154 break; 155 case 'd': 156 F->f_options |= FIELD_DICTIONARY_ORDER; 157 break; 158 case 'f': 159 F->f_options |= FIELD_FOLD_UPPERCASE; 160 break; 161 case 'i': 162 F->f_options |= 163 FIELD_IGNORE_NONPRINTABLES; 164 break; 165 case 'M': 166 F->f_species = MONTH; 167 break; 168 case 'n': 169 F->f_species = NUMERIC; 170 break; 171 case 'r': 172 F->f_options |= 173 FIELD_REVERSE_COMPARISONS; 174 break; 175 default: 176 usage(); 177 break; 178 } 179 } 180 } 181 182 if (flags & OPTIONS_STARTSPEC) { 183 F->f_start_field = field; 184 F->f_start_offset = offset; 185 if ((flags & OPTIONS_OLDSPEC) != OPTIONS_OLDSPEC) { 186 F->f_start_field--; 187 if (offset_seen) 188 F->f_start_offset--; 189 } 190 F->f_options |= blanks_flag ? FIELD_IGNORE_BLANKS_START : 0; 191 } else { 192 F->f_end_field = field; 193 F->f_end_offset = offset; 194 if ((flags & OPTIONS_OLDSPEC) != OPTIONS_OLDSPEC && 195 offset_seen && offset != 0) 196 F->f_end_field--; 197 F->f_options |= blanks_flag ? FIELD_IGNORE_BLANKS_END : 0; 198 } 199 200 return (0); 201 } 202 203 static void 204 parse_new_field_spec(sort_t *S, char *arg) 205 { 206 int length = strlen(arg); 207 int p_comma = MIN(length, strcspn(arg, ",")); 208 field_t *nF; 209 int p; 210 211 /* 212 * New field specifiers do not inherit from the general specifier if 213 * they have any modifiers set. (This is specifically tested in the VSC 214 * test suite, assertion 32 for POSIX.cmd/sort.) 215 */ 216 if (field_spec_has_modifiers(arg, length)) { 217 nF = field_new(NULL); 218 nF->f_options = FIELD_MODIFIERS_DEFINED; 219 } else { 220 nF = field_new(S); 221 } 222 p = parse_field_spec(nF, arg, OPTIONS_STARTSPEC, p_comma); 223 224 if (p != 0) 225 usage(); 226 227 if (p_comma < length) { 228 p = parse_field_spec(nF, &(arg[p_comma + 1]), 0, 229 strlen(&(arg[p_comma + 1]))); 230 if (p != 0) 231 usage(); 232 } 233 234 if (nF->f_start_field < 0 || nF->f_start_offset < 0) { 235 if (S->m_verbose) 236 warn("-k %s is not a supported field specifier\n", arg); 237 } 238 nF->f_start_field = MAX(nF->f_start_field, 0); 239 nF->f_start_offset = MAX(nF->f_start_offset, 0); 240 241 /* 242 * If the starting field exceeds a defined ending field, convention 243 * dictates that the field is ignored. 244 */ 245 if (nF->f_end_field == -1 || nF->f_start_field < nF->f_end_field || 246 (nF->f_start_field == nF->f_end_field && 247 nF->f_start_offset < nF->f_end_offset)) { 248 field_add_to_chain(&(S->m_fields_head), nF); 249 } else if (S->m_verbose) { 250 warn("illegal field -k %s omitted", arg); 251 } 252 } 253 254 /* 255 * parse_old_field_spec() is getopt()-aware; it may modify the values of optind, 256 * optarg, and so forth, to correctly determine the characteristics being 257 * assigned to the current field. 258 */ 259 static int 260 parse_old_field_spec(sort_t *S, int argc, char *argv[]) 261 { 262 field_t *nF; 263 int c, p; 264 char *arg = argv[optind]; 265 266 if (field_spec_has_modifiers(arg + 1, strlen(arg + 1))) { 267 nF = field_new(NULL); 268 nF->f_options = FIELD_MODIFIERS_DEFINED; 269 } else { 270 nF = field_new(S); 271 } 272 273 p = parse_field_spec(nF, arg + 1, OPTIONS_OLDSPEC | OPTIONS_STARTSPEC, 274 strlen(arg + 1)); 275 276 if (p != 0) { 277 field_delete(nF); 278 return (0); 279 } 280 281 /* 282 * In the case that getopt() returns '?' (unrecognized option) or EOF 283 * (non-option argument), the field is considered closed. 284 */ 285 for (arg = argv[++optind]; optind < argc; arg = argv[optind]) { 286 if (strlen(arg) >= 2 && *arg == '-' && 287 isdigit(*(uchar_t *)(arg + 1))) { 288 (void) parse_field_spec(nF, arg + 1, 289 OPTIONS_OLDSPEC, strlen(arg) - 1); 290 field_add_to_chain(&(S->m_fields_head), nF); 291 optind++; 292 return (1); 293 } 294 295 if ((c = getopt(argc, argv, OLD_SPEC_OPTIONS_STRING)) != EOF) { 296 switch (c) { 297 case 'b': 298 nF->f_options |= FIELD_IGNORE_BLANKS_START; 299 break; 300 case 'd': 301 nF->f_options |= FIELD_DICTIONARY_ORDER; 302 break; 303 case 'f': 304 nF->f_options |= FIELD_FOLD_UPPERCASE; 305 break; 306 case 'i': 307 nF->f_options |= FIELD_IGNORE_NONPRINTABLES; 308 break; 309 case 'M': 310 nF->f_species = MONTH; 311 break; 312 case 'n': 313 nF->f_species = NUMERIC; 314 break; 315 case 'r': 316 nF->f_options |= FIELD_REVERSE_COMPARISONS; 317 break; 318 case '?': 319 case 'c': 320 case 'm': 321 case 'u': 322 /* 323 * Options without arguments. 324 */ 325 optind -= 1; 326 field_add_to_chain(&(S->m_fields_head), nF); 327 return (1); 328 /*NOTREACHED*/ 329 case 'o': 330 case 'T': 331 case 'z': 332 case 't': 333 case 'k': 334 case 'S': 335 /* 336 * Options with arguments. 337 */ 338 if (optarg == argv[optind - 1] + 2) { 339 optind -= 1; 340 } else { 341 optind -= 2; 342 } 343 field_add_to_chain(&(S->m_fields_head), nF); 344 return (1); 345 /*NOTREACHED*/ 346 default: 347 die(EMSG_UNKN_OPTION); 348 /*NOTREACHED*/ 349 } 350 } else { 351 break; 352 } 353 } 354 355 field_add_to_chain(&(S->m_fields_head), nF); 356 return (1); 357 } 358 359 int 360 options(sort_t *S, int argc, char *argv[]) 361 { 362 int c; 363 364 optind = 1; 365 while (optind < argc) { 366 if (strncmp("-y", argv[optind], strlen("-y")) == 0) { 367 /* 368 * The -y [kmem] option violates the standard syntax 369 * outlined in intro(1). we have to be a little fancy 370 * to determine if the next argument is a valid integer. 371 * (note, of course, that the previous sort(1) had no 372 * mechanism to resolve a final 373 * -y 99999 374 * into 375 * -y, file 99999 376 * or 377 * -y 99999, file stdin 378 * 379 * Now one can unambiguously use 380 * -y -- 99999 381 * and 382 * -y 99999 - 383 * to distinguish these cases. 384 * 385 * That said, we do not use the information passed using 386 * -y option in sort(1); we provide the argument to 387 * preserve compatibility for existing scripts. 388 */ 389 if (strlen(argv[optind]) == strlen("-y") && 390 optind + 1 < argc && 391 is_number(argv[optind + 1])) 392 optind += 2; 393 else 394 optind += 1; 395 } 396 397 if ((c = getopt(argc, argv, OPTIONS_STRING)) != EOF) { 398 switch (c) { 399 case 'c': 400 S->m_check_if_sorted_only = 1; 401 break; 402 403 case 'm': 404 S->m_merge_only = 1; 405 break; 406 407 case 'u': 408 S->m_unique_lines = 1; 409 break; 410 411 case 'o': 412 S->m_output_filename = optarg; 413 break; 414 415 case 'T': 416 S->m_tmpdir_template = optarg; 417 break; 418 419 case 'z': 420 /* 421 * ignore optarg -- obsolete 422 */ 423 break; 424 425 case 'd': 426 S->m_field_options |= FIELD_DICTIONARY_ORDER; 427 field_apply_all(S->m_fields_head, 428 FIELD_DICTIONARY_ORDER); 429 break; 430 431 case 'f': 432 S->m_field_options |= FIELD_FOLD_UPPERCASE; 433 field_apply_all(S->m_fields_head, 434 FIELD_FOLD_UPPERCASE); 435 break; 436 437 case 'i': 438 S->m_field_options |= 439 FIELD_IGNORE_NONPRINTABLES; 440 field_apply_all(S->m_fields_head, 441 FIELD_IGNORE_NONPRINTABLES); 442 break; 443 444 case 'M': 445 S->m_default_species = MONTH; 446 S->m_field_options &= 447 ~FIELD_IGNORE_BLANKS_START; 448 break; 449 450 case 'n': 451 S->m_default_species = NUMERIC; 452 { 453 field_t *f; 454 455 for (f = S->m_fields_head; f; 456 f = f->f_next) 457 if ((f->f_options & 458 FIELD_MODIFIERS_DEFINED) == 459 0) 460 f->f_species = NUMERIC; 461 } 462 break; 463 464 case 'b': 465 S->m_field_options |= 466 FIELD_IGNORE_BLANKS_START | 467 FIELD_IGNORE_BLANKS_END; 468 break; 469 470 case 'r': 471 S->m_field_options |= 472 FIELD_REVERSE_COMPARISONS; 473 field_apply_all(S->m_fields_head, 474 FIELD_REVERSE_COMPARISONS); 475 break; 476 477 case 't': 478 /* 479 * delimiter 480 */ 481 if (S->m_single_byte_locale) { 482 /* 483 * Most debuggers can't take tabs as 484 * input arguments, so we provide an 485 * escape sequence to allow testing of 486 * this special case for the DEBUG 487 * version. 488 */ 489 S->m_field_separator.sc = 490 #ifdef DEBUG 491 xstreql(optarg, "\\t") ? '\t' : 492 #endif 493 optarg[0]; 494 } else 495 (void) mbtowc(&S->m_field_separator.wc, 496 optarg, MB_CUR_MAX); 497 break; 498 499 case 'k': 500 /* 501 * key 502 */ 503 (void) parse_new_field_spec(S, optarg); 504 break; 505 506 case 'S': 507 S->m_memory_limit = strtomem(optarg); 508 #ifdef DEBUG 509 (void) fprintf(stderr, CMDNAME 510 ": limiting size to %d bytes\n", 511 S->m_memory_limit); 512 #endif /* DEBUG */ 513 break; 514 515 /* 516 * We never take a naked -999; these should always be 517 * associated with a preceding +000. 518 */ 519 case '0': 520 case '1': 521 case '2': 522 case '3': 523 case '4': 524 case '5': 525 case '6': 526 case '7': 527 case '8': 528 case '9': 529 usage(); 530 break; 531 case '?': 532 /* error case */ 533 usage(); 534 break; 535 } 536 537 /* 538 * Go back for next argument. 539 */ 540 continue; 541 } 542 543 /* 544 * There are three (interpretable) possibilities for getopt() to 545 * return EOF with arguments on the command line: we have seen 546 * the "end-of-options" token, --, we have encountered the 547 * old-style field definition, +NNN, or we have found a 548 * filename. 549 * 550 * In the second case, we must also search for the optional -NNN 551 * field terminal definition. (since "+joe", for instance, is 552 * a valid filename, we must handle this pattern as well.) This 553 * is performed by parse_old_field_spec(). 554 */ 555 if (xstreql(argv[optind - 1], "--")) { 556 /* 557 * Process all arguments following end-of-options token 558 * as filenames. 559 */ 560 while (optind < argc) { 561 if (xstreql(argv[optind], "-")) 562 S->m_input_from_stdin = 1; 563 else 564 stream_add_file_to_chain( 565 &(S->m_input_streams), 566 argv[optind]); 567 optind++; 568 } 569 570 break; 571 } 572 573 if (optind < argc) { 574 if (xstreql(argv[optind], "-")) { 575 S->m_input_from_stdin = 1; 576 optind++; 577 } else if (*(argv[optind]) != '+' || 578 !parse_old_field_spec(S, argc, argv)) { 579 /* 580 * It's a filename, because it either doesn't 581 * start with '+', or if it did, it wasn't an 582 * actual field specifier. 583 */ 584 stream_add_file_to_chain(&(S->m_input_streams), 585 argv[optind]); 586 optind++; 587 } 588 } 589 } 590 591 if (S->m_input_streams == NULL) 592 S->m_input_from_stdin = 1; 593 594 if (S->m_output_filename == NULL) 595 S->m_output_to_stdout = 1; 596 597 /* 598 * If no fields, then one great field. However, if the -b option was 599 * set globally, be sure to ignore it, as per UNIX98. 600 */ 601 if (S->m_fields_head == NULL) { 602 S->m_field_options &= ~FIELD_IGNORE_BLANKS_START; 603 604 (void) parse_new_field_spec(S, "1"); 605 /* 606 * "Entire line" fast path is only valid if no delimiter has 607 * been set and no modifiers have been applied. 608 */ 609 if (S->m_field_separator.wc == 0 && 610 S->m_default_species == ALPHA && 611 S->m_field_options == 0) 612 S->m_entire_line = 1; 613 } 614 615 return (0); 616 } 617