1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 26 #include "fields.h" 27 28 /* 29 * fields 30 * 31 * Overview 32 * By a field, we mean the various delimited character sequences within each 33 * line of the input files. The sort key consists of an ordered sequence of 34 * fields, which need not include all possible fields for the given line. 35 * (Furthermore, not every line need contain sufficient fields for the fields 36 * given within the sort key. In fact, none of the lines in the input stream 37 * need contain sufficient fields.) 38 * 39 * There are two methods for specifying fields for sort(1); these are 40 * discussed in options.c. Here we discuss only the internal representation 41 * of fields, as used for constructing the collation vector for each line as 42 * defined by the sort key. 43 * 44 * Representation 45 * The sort key is a singly-linked list of field specifiers. At present, 46 * fields may belong to one of three species: alphabetical, numerical, or 47 * monthly; the species (f_species) then indicates the conversion function 48 * (f_convert) used to transform the raw characters of the character sequence 49 * to a collatable form. (In principle, this allows us to consider future 50 * field species such as hexadecimal.) 51 * 52 * Fields and offsets are numbered such that zero refers to the first field or 53 * character, respectively. Thus, the interpretation of a key specifier, m.n, 54 * is that the field begins at the nth character beyond the mth occurence of 55 * the key separator. If the blanks flag has been specified, then the field 56 * begins at the nth non-blank character past the mth key separator. If the 57 * key separator is unspecified, then the key separator is defined as one or 58 * more blank characters. 59 * 60 * In general, the various options afforded by sort may be broken into two 61 * categories: field species and field modifiers. For each field species, 62 * there is one or more conversion routines that take a delimited character 63 * sequence and convert it to a character sequence collatable by strcmp() or 64 * memcmp(). For field species that may be further modified, such as the 65 * fold-to-uppercase option for alphabetic fields, the conversion routine may 66 * be aware of how the modifier affects collation. Finally, the no-modifiers 67 * case may present an opportunity for a simplified, faster version. 68 * 69 * Code Structure 70 * The code paths for single-byte and multi-byte locales diverge significantly 71 * in fields.c. Most routines have an *_wide() version, which produces an 72 * equivalent effect for line records whose data field is composed of wide 73 * characters (wchar_t). However, the l_collated field of a line record is 74 * always composed of characters, so that the radix sorts provided in 75 * internal.c can work in both single- and multi-byte locales. Thus, in the 76 * various convert_*_wide() routines, the output is placed in l_collated, with 77 * a length multiplier of 4. 78 */ 79 80 #define BEFORE_NUMBER 0x0 81 #define IN_NUMBER 0x1 82 83 static char numerical_separator; 84 static char numerical_decimal; 85 static char monetary_separator; 86 static char monetary_decimal; 87 88 static wchar_t w_numerical_separator; 89 static wchar_t w_numerical_decimal; 90 static wchar_t w_monetary_separator; 91 static wchar_t w_monetary_decimal; 92 93 #define MONTHS_IN_YEAR 12 94 #define MAX_MON_LEN 20 95 96 enum { MO_NONE = 1, MO_OFFSET = 2 }; 97 98 static char *months[MONTHS_IN_YEAR]; 99 static size_t month_lengths[MONTHS_IN_YEAR]; 100 static wchar_t *w_months[MONTHS_IN_YEAR]; 101 static size_t w_month_lengths[MONTHS_IN_YEAR]; 102 103 #define DECIMAL_CHAR (numerical_decimal) 104 #define IS_BLANK(x) (isspace((uchar_t)(x)) && (x) != '\n') 105 #define IS_SEPARATOR(x) \ 106 ((numerical_separator != '\0' && (x) == numerical_separator) || \ 107 (monetary_separator != '\0' && (x) == monetary_separator)) 108 #define IS_DECIMAL(x) \ 109 ((x) == numerical_decimal || \ 110 (monetary_decimal != '\0' && (x) == monetary_decimal)) 111 #define W_DECIMAL_CHAR (w_numerical_decimal) 112 #define W_IS_BLANK(x) (iswspace(x) && (x) != L'\n') 113 #define W_IS_SEPARATOR(x) \ 114 ((numerical_separator != '\0' && (x) == w_numerical_separator) || \ 115 (monetary_separator != '\0' && (x) == w_monetary_separator)) 116 #define W_IS_DECIMAL(x) \ 117 (((x) == w_numerical_decimal) || \ 118 (monetary_decimal != '\0' && (x) == w_monetary_decimal)) 119 120 #define INTERFIELD_SEPARATOR '\0' 121 #define W_INTERFIELD_SEPARATOR L'\0' 122 123 #define INT_SIGN_FLIP_MASK 0x80000000 124 #define INT_SIGN_PASS_MASK 0x00000000 125 126 /* 127 * strx_ops_t, xfrm_len, and xfrm_cpy: In the case where we are sorting in the 128 * C locale, we want to avoid the expense of transforming strings to collatable 129 * forms since, by definition, an arbitrary string in the C locale is already in 130 * its collatable form. Therefore, we construct a small ops vector (the 131 * strx_ops) and two wrappers: xfrm_len() to massage the strxfrm(NULL, ...) into 132 * strlen()-like behaviour, and xfrm_cpy() to make strncpy() appear 133 * strxfrm()-like. 134 */ 135 /*ARGSUSED*/ 136 static size_t 137 xfrm_len(const char *s2, size_t len) 138 { 139 return (strxfrm(NULL, s2, 0) + 1); 140 } 141 142 /* 143 * The length represented by n includes a null character, so to return the 144 * correct length we subtract 1. Note that this function is only used by 145 * field_convert_alpha, and isn't for general use, as it assumes that n is the 146 * length of s2 plus a null character. 147 */ 148 static size_t 149 C_ncpy(char *s1, const char *s2, size_t n) 150 { 151 (void) strncpy(s1, s2, n); 152 return (n - 1); 153 } 154 155 /*ARGSUSED*/ 156 static size_t 157 C_len(const char *s, size_t len) 158 { 159 ASSERT(s != NULL); 160 return (len); 161 } 162 163 typedef struct _strx_ops { 164 size_t (*sx_len)(const char *, size_t); 165 size_t (*sx_xfrm)(char *, const char *, size_t); 166 } strx_ops_t; 167 168 static const strx_ops_t C_ops = { C_len, C_ncpy }; 169 static const strx_ops_t SB_ops = { xfrm_len, strxfrm }; 170 171 static const strx_ops_t *xfrm_ops; 172 173 static void 174 field_initialize_separator(void) 175 { 176 /* 177 * A locale need not define all of the cases below: only decimal_point 178 * must be defined. Furthermore, sort(1) has traditionally not used the 179 * positive_sign and negative_sign, grouping, or currency_symbols (or 180 * their numeric counterparts, if any). 181 */ 182 struct lconv *conv = localeconv(); 183 184 if (!xstreql(conv->thousands_sep, "")) { 185 numerical_separator = *conv->thousands_sep; 186 (void) mbtowc(&w_numerical_separator, conv->thousands_sep, 187 MB_CUR_MAX); 188 } else 189 numerical_separator = '\0'; 190 191 if (!xstreql(conv->mon_thousands_sep, "")) { 192 monetary_separator = *conv->mon_thousands_sep; 193 (void) mbtowc(&w_monetary_separator, conv->mon_thousands_sep, 194 MB_CUR_MAX); 195 } else 196 monetary_separator = '\0'; 197 198 if (!xstreql(conv->mon_decimal_point, "")) { 199 monetary_decimal = *conv->mon_decimal_point; 200 (void) mbtowc(&w_monetary_decimal, conv->mon_decimal_point, 201 MB_CUR_MAX); 202 } else 203 monetary_decimal = '\0'; 204 205 numerical_decimal = *conv->decimal_point; 206 (void) mbtowc(&w_numerical_decimal, conv->decimal_point, MB_CUR_MAX); 207 } 208 209 static void 210 field_initialize_month(int is_c_locale) 211 { 212 int i; 213 int j; 214 struct tm this_month; 215 const char *c_months[MONTHS_IN_YEAR] = { 216 "JAN", "FEB", "MAR", "APR", "MAY", "JUN", 217 "JUL", "AUG", "SEP", "OCT", "NOV", "DEC" 218 }; 219 220 char month_name[MAX_MON_LEN * MB_LEN_MAX]; 221 wchar_t w_month_name[MAX_MON_LEN]; 222 223 if (is_c_locale) { 224 for (i = 0; i < MONTHS_IN_YEAR; i++) { 225 months[i] = (char *)c_months[i]; 226 month_lengths[i] = strlen(c_months[i]); 227 } 228 /* 229 * We don't need to initialize the wide version of the month 230 * names. 231 */ 232 return; 233 } 234 235 (void) memset(&this_month, 0, sizeof (this_month)); 236 237 for (i = 0; i < MONTHS_IN_YEAR; i++) { 238 this_month.tm_mon = i; 239 240 (void) strftime(month_name, sizeof (month_name), 241 "%b", &this_month); 242 243 for (j = 0; j < strlen(month_name); j++) 244 month_name[j] = toupper(month_name[j]); 245 (void) mbstowcs(w_month_name, month_name, MAX_MON_LEN); 246 247 months[i] = strdup(month_name); 248 month_lengths[i] = strlen(month_name); 249 w_months[i] = wsdup(w_month_name); 250 w_month_lengths[i] = wslen(w_month_name); 251 } 252 } 253 254 void 255 field_initialize(sort_t *S) 256 { 257 field_initialize_month(S->m_c_locale); 258 field_initialize_separator(); 259 260 if (S->m_c_locale) 261 xfrm_ops = &C_ops; 262 else 263 xfrm_ops = &SB_ops; 264 } 265 266 field_t * 267 field_new(sort_t *S) 268 { 269 field_t *F = safe_realloc(NULL, sizeof (field_t)); 270 271 F->f_start_field = -1; 272 F->f_start_offset = -1; 273 F->f_end_field = -1; 274 F->f_end_offset = -1; 275 F->f_next = NULL; 276 277 if (S == NULL) { 278 F->f_species = ALPHA; 279 F->f_options = 0; 280 } else { 281 F->f_species = S->m_default_species; 282 F->f_options = S->m_field_options; 283 } 284 285 return (F); 286 } 287 288 void 289 field_delete(field_t *F) 290 { 291 free(F); 292 } 293 294 /* 295 * The recursive implementation of field_add_to_chain() given below is 296 * inappropriate if function calls are expensive, or a truly large number of 297 * fields are anticipated. 298 */ 299 void 300 field_add_to_chain(field_t **F, field_t *A) 301 { 302 if (*F == NULL) 303 *F = A; 304 else 305 field_add_to_chain(&((*F)->f_next), A); 306 } 307 308 #ifdef DEBUG 309 #ifndef _LP64 310 #define FIELD_FMT \ 311 "\nStart field: %d\tStart offset: %d\nEnd field: %d\tEnd offset: %d\n" 312 #else /* !_LP64 */ 313 #define FIELD_FMT \ 314 "\nStart field: %ld\tStart offset: %ld\nEnd field: %ld\tEnd offset: %ld\n" 315 #endif /* !_LP64 */ 316 317 /* 318 * field_print is used only for debugging purposes. 319 */ 320 void 321 field_print(field_t *F) 322 { 323 char *field_names[] = {"ALPHA", "MONTH", "NUMERIC"}; 324 int status = 0; 325 326 (void) fprintf(stderr, "Type: %s", field_names[F->f_species]); 327 (void) fprintf(stderr, "\tOptions: "); 328 329 if (F->f_options & FIELD_REVERSE_COMPARISONS) { 330 (void) fprintf(stderr, "REVERSE"); 331 status++; 332 } 333 if (F->f_options & FIELD_DICTIONARY_ORDER) { 334 (void) fprintf(stderr, "DICTIONARY "); 335 status++; 336 } 337 if (F->f_options & FIELD_FOLD_UPPERCASE) { 338 (void) fprintf(stderr, "UPPERCASE "); 339 status++; 340 } 341 if (F->f_options & FIELD_IGNORE_NONPRINTABLES) { 342 (void) fprintf(stderr, "PRINTABLES "); 343 status++; 344 } 345 if (F->f_options & FIELD_IGNORE_BLANKS_START) { 346 (void) fprintf(stderr, "BLANKS_START "); 347 status++; 348 } 349 if (F->f_options & FIELD_IGNORE_BLANKS_END) { 350 (void) fprintf(stderr, "BLANKS_END "); 351 status++; 352 } 353 354 if (status == 0) 355 (void) fprintf(stderr, "NO_MODIFIERS"); 356 357 (void) fprintf(stderr, FIELD_FMT, F->f_start_field, F->f_start_offset, 358 F->f_end_field, F->f_end_offset); 359 } 360 #endif /* DEBUG */ 361 362 static ssize_t 363 field_boundary(field_t *F, line_rec_t *L, int is_end, int is_blanks) 364 { 365 char *S = L->l_data.sp; 366 char *T = S; 367 char *eol = S + L->l_data_length; 368 ssize_t field = is_end ? F->f_end_field : F->f_start_field; 369 ssize_t offset = is_end ? F->f_end_offset : F->f_start_offset; 370 ssize_t ret; 371 372 ASSERT(is_end || field > -1); 373 374 if (is_end && field == -1) 375 return (L->l_data_length); 376 377 while (field-- > 0) { 378 while (T < eol && IS_BLANK(*T)) 379 T++; 380 381 while (T < eol && !IS_BLANK(*T)) 382 T++; 383 } 384 385 if ((!is_end || offset > 0) && is_blanks) { 386 while (IS_BLANK(*T)) 387 T++; 388 } 389 390 if ((ret = MAX(T - S, 0) + offset) >= L->l_data_length) 391 return (L->l_data_length); 392 393 return (ret); 394 } 395 396 static void 397 field_delimit(field_t *F, line_rec_t *L, ssize_t *start, ssize_t *end) 398 { 399 ASSERT(F->f_start_field > -1); 400 401 *start = field_boundary(F, L, 0, 402 F->f_options & FIELD_IGNORE_BLANKS_START); 403 *end = field_boundary(F, L, 1, 404 F->f_options & FIELD_IGNORE_BLANKS_END); 405 } 406 407 static ssize_t 408 field_boundary_wide(field_t *F, line_rec_t *L, int is_end, int is_blanks) 409 { 410 wchar_t *S = L->l_data.wp; 411 wchar_t *T = S; 412 wchar_t *eol = S + L->l_data_length; 413 ssize_t field = is_end ? F->f_end_field : F->f_start_field; 414 ssize_t offset = is_end ? F->f_end_offset : F->f_start_offset; 415 ssize_t ret; 416 417 ASSERT(is_end || field > -1); 418 419 if (is_end && field == -1) 420 return (L->l_data_length); 421 422 while (field-- > 0) { 423 while (T < eol && W_IS_BLANK(*T)) 424 T++; 425 426 while (T < eol && !W_IS_BLANK(*T)) 427 T++; 428 } 429 430 if ((!is_end || offset > 0) && is_blanks) { 431 while (W_IS_BLANK(*T)) 432 T++; 433 } 434 435 if ((ret = MAX(T - S, 0) + offset) >= L->l_data_length) 436 return (L->l_data_length); 437 438 return (ret); 439 } 440 441 static void 442 field_delimit_wide(field_t *F, line_rec_t *L, ssize_t *start, ssize_t *end) 443 { 444 ASSERT(F->f_start_field > -1); 445 446 *start = field_boundary_wide(F, L, 0, 447 F->f_options & FIELD_IGNORE_BLANKS_START); 448 *end = field_boundary_wide(F, L, 1, 449 F->f_options & FIELD_IGNORE_BLANKS_END); 450 } 451 452 static ssize_t 453 field_boundary_tabbed(field_t *F, line_rec_t *L, int is_end, int is_blanks, 454 vchar_t delimiter) 455 { 456 char *S = L->l_data.sp; 457 char *T = S; 458 char *eol = S + L->l_data_length; 459 ssize_t field = is_end ? F->f_end_field : F->f_start_field; 460 ssize_t offset = is_end ? F->f_end_offset : F->f_start_offset; 461 ssize_t ret; 462 463 ASSERT(is_end || field > -1); 464 465 if (is_end && field == -1) 466 return (L->l_data_length); 467 468 while (field-- > 0) { 469 T = xstrnchr(T, delimiter.sc, eol - T); 470 if (T == NULL || T > eol) 471 return (L->l_data_length); 472 473 T++; 474 } 475 476 if ((!is_end || offset != 0) && is_blanks) { 477 while (IS_BLANK(*T)) 478 T++; 479 } 480 481 if ((ret = MAX(T - S, 0) + offset) >= L->l_data_length) { 482 if (L->l_data_length <= 0) 483 return (0); 484 if (S[L->l_data_length - 1] == delimiter.sc) { 485 return (L->l_data_length - 1); 486 } else { 487 return (L->l_data_length); 488 } 489 } 490 491 if (is_end && offset == 0) 492 ret--; 493 494 return (ret); 495 } 496 497 /* 498 * field_delimit_tabbed() is called when a field separator has been defined 499 * using the -t option. The character at the offset, start, is either one or 500 * more character positions past the delimiter marking the start of the 501 * field, or at the end of the line. 502 */ 503 static void 504 field_delimit_tabbed(field_t *F, line_rec_t *L, ssize_t *start, ssize_t *end, 505 vchar_t delimiter) 506 { 507 ASSERT(F->f_start_field > -1); 508 509 *start = field_boundary_tabbed(F, L, 0, F->f_options & 510 FIELD_IGNORE_BLANKS_START, delimiter); 511 *end = field_boundary_tabbed(F, L, 1, F->f_options & 512 FIELD_IGNORE_BLANKS_END, delimiter); 513 } 514 515 static ssize_t 516 field_boundary_tabbed_wide(field_t *F, line_rec_t *L, int is_end, int is_blanks, 517 vchar_t delimiter) 518 { 519 wchar_t *S = L->l_data.wp; 520 wchar_t *T = S; 521 wchar_t *eol = S + L->l_data_length; 522 ssize_t field = is_end ? F->f_end_field : F->f_start_field; 523 ssize_t offset = is_end ? F->f_end_offset : F->f_start_offset; 524 ssize_t ret; 525 526 ASSERT(is_end || field > -1); 527 528 if (is_end && field == -1) 529 return (L->l_data_length); 530 531 while (field-- > 0) { 532 T = xwsnchr(T, delimiter.wc, eol - T); 533 if (T == NULL || T > eol) 534 return (L->l_data_length); 535 536 T++; 537 } 538 539 if ((!is_end || offset != 0) && is_blanks) { 540 while (W_IS_BLANK(*T)) 541 T++; 542 } 543 544 if ((ret = MAX(T - S, 0) + offset) >= L->l_data_length) { 545 if (L->l_data_length <= 0) 546 return (0); 547 if (S[L->l_data_length - 1] == delimiter.wc) { 548 return (L->l_data_length - 1); 549 } else { 550 return (L->l_data_length); 551 } 552 } 553 554 if (is_end && offset == 0) 555 ret--; 556 557 return (ret); 558 } 559 560 static void 561 field_delimit_tabbed_wide(field_t *F, line_rec_t *L, ssize_t *start, 562 ssize_t *end, vchar_t delimiter) 563 { 564 ASSERT(F->f_start_field > -1); 565 566 *start = field_boundary_tabbed_wide(F, L, 0, F->f_options & 567 FIELD_IGNORE_BLANKS_START, delimiter); 568 *end = field_boundary_tabbed_wide(F, L, 1, F->f_options & 569 FIELD_IGNORE_BLANKS_END, delimiter); 570 } 571 572 /*ARGSUSED*/ 573 ssize_t 574 field_convert_month(field_t *F, line_rec_t *L, vchar_t delimiter, 575 ssize_t data_offset, ssize_t data_length, ssize_t coll_offset) 576 { 577 int j; 578 ssize_t val; 579 char month_candidate[MAX_MON_LEN * MB_LEN_MAX]; 580 ssize_t month_length = data_length; 581 ssize_t month_offset = data_offset; 582 583 if (sizeof (char) > L->l_collate_bufsize - coll_offset) 584 return (-1); 585 586 (void) memset(month_candidate, 0, MAX_MON_LEN * MB_LEN_MAX); 587 588 589 /* 590 * The month field formally begins with the first non-blank character. 591 */ 592 while (IS_BLANK(*(L->l_data.sp + month_offset))) { 593 month_offset++; 594 month_length--; 595 } 596 597 for (j = 0; j < MAX_MON_LEN && j < month_length; j++) 598 month_candidate[j] = toupper((L->l_data.sp + month_offset)[j]); 599 600 for (j = 0; j < MONTHS_IN_YEAR; j++) { 601 if (xstrneql(month_candidate, months[j], month_lengths[j])) { 602 *(L->l_collate.sp + coll_offset) = '\0' + j + MO_OFFSET; 603 return (1); 604 } 605 } 606 607 /* 608 * no matching month; copy string into field. required behaviour is 609 * that "month-free" keys sort before month-sortable keys, so insert 610 * a "will sort first" token. 611 */ 612 *(L->l_collate.sp + coll_offset) = '\0' + MO_NONE; 613 614 val = field_convert_alpha_simple(F, L, delimiter, data_offset, 615 data_length, coll_offset + 1); 616 617 if (val < 0) 618 return (-1); 619 else 620 return (val + 1); 621 } 622 623 /*ARGSUSED*/ 624 ssize_t 625 field_convert_month_wide(field_t *F, line_rec_t *L, vchar_t delimiter, 626 ssize_t data_offset, ssize_t data_length, ssize_t coll_offset) 627 { 628 ssize_t j; 629 ssize_t val; 630 wchar_t month_candidate[MAX_MON_LEN]; 631 wchar_t *month; 632 wchar_t *buffer = L->l_collate.wp + coll_offset; 633 ssize_t month_length = data_length; 634 ssize_t month_offset = data_offset; 635 636 if (L->l_collate_bufsize - coll_offset * sizeof (wchar_t) < 637 sizeof (wchar_t)) 638 return (-1); 639 640 (void) memset(month_candidate, 0, MAX_MON_LEN * sizeof (wchar_t)); 641 642 643 while (W_IS_BLANK(*(L->l_data.wp + month_offset))) { 644 month_offset++; 645 month_length--; 646 } 647 648 month = L->l_data.wp + month_offset; 649 650 for (j = 0; j < MAX_MON_LEN && j < month_length; j++) 651 month_candidate[j] = towupper(month[j]); 652 653 for (j = 0; j < MONTHS_IN_YEAR; j++) 654 if (xwcsneql(month_candidate, w_months[j], 655 w_month_lengths[j])) { 656 *buffer = L'\0' + j + MO_OFFSET; 657 return (1); 658 } 659 660 *buffer = L'\0' + MO_NONE; 661 662 val = field_convert_alpha_wide(F, L, delimiter, data_offset, 663 data_length, coll_offset + sizeof (wchar_t)); 664 665 if (val < 0) 666 return (-1); 667 else 668 return (val + 1); 669 } 670 671 /* 672 * field_convert_alpha() always fails with return value -1 if the converted 673 * string would cause l_collate_length to exceed l_collate_bufsize 674 */ 675 /*ARGSUSED*/ 676 ssize_t 677 field_convert_alpha(field_t *F, line_rec_t *L, vchar_t delimiter, 678 ssize_t data_offset, ssize_t data_length, ssize_t coll_offset) 679 { 680 static char *compose; 681 static ssize_t compose_length; 682 683 ssize_t clength = 0; 684 ssize_t dlength; 685 ssize_t i; 686 687 if (compose_length < (data_length + 1)) { 688 compose_length = data_length + 1; 689 compose = safe_realloc(compose, compose_length * sizeof (char)); 690 } 691 692 for (i = data_offset; i < data_offset + data_length; i++) { 693 char t = (L->l_data.sp)[i]; 694 695 if ((F->f_options & FIELD_IGNORE_NONPRINTABLES) && 696 !isprint((uchar_t)t)) 697 continue; 698 699 if ((F->f_options & FIELD_DICTIONARY_ORDER) && 700 !isalnum((uchar_t)t) && !isspace((uchar_t)t)) 701 continue; 702 703 if (F->f_options & FIELD_FOLD_UPPERCASE) 704 t = toupper(t); 705 706 compose[clength++] = t; 707 } 708 compose[clength] = '\0'; 709 710 if ((dlength = xfrm_ops->sx_len(compose, clength)) < 711 L->l_collate_bufsize - coll_offset) 712 return (xfrm_ops->sx_xfrm(L->l_collate.sp + coll_offset, 713 compose, dlength + 1)); 714 else 715 return ((ssize_t)-1); 716 } 717 718 /*ARGSUSED*/ 719 ssize_t 720 field_convert_alpha_simple(field_t *F, line_rec_t *L, vchar_t delimiter, 721 ssize_t data_offset, ssize_t data_length, ssize_t coll_offset) 722 { 723 static char *compose; 724 static ssize_t compose_length; 725 726 ssize_t clength; 727 ssize_t dlength; 728 729 if (compose_length < (data_length + 1)) { 730 compose_length = data_length + 1; 731 compose = safe_realloc(compose, compose_length * sizeof (char)); 732 } 733 734 (void) memcpy(compose, L->l_data.sp + data_offset, data_length); 735 clength = data_length; 736 compose[clength] = '\0'; 737 738 if ((dlength = xfrm_ops->sx_len(compose, clength)) < 739 L->l_collate_bufsize - coll_offset) 740 return (xfrm_ops->sx_xfrm(L->l_collate.sp + coll_offset, 741 compose, dlength + 1)); 742 else 743 return ((ssize_t)-1); 744 } 745 746 /*ARGSUSED*/ 747 ssize_t 748 field_convert_alpha_wide(field_t *F, line_rec_t *L, vchar_t delimiter, 749 ssize_t data_offset, ssize_t data_length, ssize_t coll_offset) 750 { 751 wchar_t *compose = safe_realloc(NULL, (data_length + 1) * 752 sizeof (wchar_t)); 753 ssize_t clength = 0; 754 ssize_t dlength; 755 ssize_t i; 756 ssize_t ret; 757 758 for (i = data_offset; i < data_offset + data_length; i++) { 759 wchar_t t = (L->l_data.wp)[i]; 760 761 if ((F->f_options & FIELD_IGNORE_NONPRINTABLES) && !iswprint(t)) 762 continue; 763 764 if ((F->f_options & FIELD_DICTIONARY_ORDER) && !iswalnum(t) && 765 !iswspace(t)) 766 continue; 767 768 if (F->f_options & FIELD_FOLD_UPPERCASE) 769 t = towupper(t); 770 771 compose[clength++] = t; 772 } 773 compose[clength] = L'\0'; 774 775 dlength = wcsxfrm(NULL, compose, (size_t)0); 776 if ((dlength * sizeof (wchar_t)) < L->l_collate_bufsize - 777 coll_offset * sizeof (wchar_t)) { 778 ret = (ssize_t)wcsxfrm(L->l_collate.wp + coll_offset, compose, 779 (size_t)dlength + 1); 780 } else { 781 ret = (ssize_t)-1; 782 } 783 784 safe_free(compose); 785 786 return (ret); 787 } 788 789 /* 790 * field_convert_numeric() converts the given field into a collatable numerical 791 * sequence. The sequence is ordered as { log, integer, separator, fraction }, 792 * with an optional sentinel component at the sequence end. 793 */ 794 /*ARGSUSED*/ 795 ssize_t 796 field_convert_numeric(field_t *F, line_rec_t *L, vchar_t delimiter, 797 ssize_t data_offset, ssize_t data_length, ssize_t coll_offset) 798 { 799 char *number; 800 char *buffer = L->l_collate.sp + coll_offset; 801 ssize_t length; 802 803 char sign = '2'; 804 int log_ten; 805 char *digits = buffer + 1 + sizeof (int) / sizeof (char); 806 size_t j = 0; 807 size_t i; 808 809 int state = BEFORE_NUMBER; 810 811 number = L->l_data.sp + data_offset; 812 length = data_length; 813 814 /* 815 * Eat leading blanks, if any. 816 */ 817 for (i = 0; i < length; i++) 818 if (!IS_BLANK(number[i])) 819 break; 820 821 /* 822 * Test that there is sufficient size in the collation buffer for our 823 * number. In addition to the possible remaining characters in the 824 * field, we also require space for the sign (char), logarithm (int), 825 * separator (char), and as many as two string terminators (for reverse 826 * sorts). 827 */ 828 if (((length - i) + 4 * sizeof (char) + sizeof (int)) > 829 (L->l_collate_bufsize - coll_offset)) 830 return ((ssize_t)-1); 831 832 /* 833 * If negative, set sign. 834 */ 835 if (number[i] == '-') { 836 i++; 837 sign = '0'; 838 } 839 840 /* 841 * Scan integer part; eat leading zeros. 842 */ 843 for (; i < length; i++) { 844 if (IS_SEPARATOR(number[i])) 845 continue; 846 847 if (number[i] == '0' && !(state & IN_NUMBER)) 848 continue; 849 850 if (!isdigit((uchar_t)number[i])) 851 break; 852 853 state |= IN_NUMBER; 854 if (sign == '0') 855 digits[j++] = '0' + '9' - number[i]; 856 else 857 digits[j++] = number[i]; 858 } 859 860 if (i < length && IS_DECIMAL(number[i])) { 861 /* 862 * Integer part terminated by decimal. 863 */ 864 digits[j] = DECIMAL_CHAR; 865 log_ten = j++; 866 867 /* 868 * Scan fractional part. 869 */ 870 for (++i; i < length; i++) { 871 if (IS_SEPARATOR(number[i])) 872 continue; 873 874 if (!isdigit((uchar_t)number[i])) 875 break; 876 877 if (number[i] != '0') 878 state |= IN_NUMBER; 879 880 if (sign == '0') 881 digits[j++] = '0' + '9' - number[i]; 882 else 883 digits[j++] = number[i]; 884 } 885 886 if (sign == '0') 887 digits[j++] = (char)(UCHAR_MAX - INTERFIELD_SEPARATOR); 888 } else { 889 /* 890 * Nondigit or end of string seen. 891 */ 892 log_ten = (int)j; 893 if (sign == '0') 894 digits[j++] = (char)(UCHAR_MAX - INTERFIELD_SEPARATOR); 895 else 896 digits[j] = INTERFIELD_SEPARATOR; 897 } 898 899 if ((state & IN_NUMBER) == 0) { 900 /* 901 * A non-zero number was not detected; treat as defined zero. 902 */ 903 sign = '1'; 904 log_ten = 0; 905 digits[0] = '0'; 906 j = 1; 907 } 908 909 /* 910 * We subtract a constant from the log of negative values so that 911 * they will correctly precede positive values with a zero logarithm. 912 */ 913 if (sign == '0') { 914 if (j != 0) 915 log_ten = -log_ten - 2; 916 else 917 /* 918 * Special case for -0. 919 */ 920 log_ten = -1; 921 } 922 923 buffer[0] = sign; 924 925 /* 926 * Place logarithm in big-endian form. 927 */ 928 for (i = 0; i < sizeof (int); i++) 929 buffer[i + 1] = (log_ten << (i * NBBY)) 930 >> ((sizeof (int) - 1) * NBBY); 931 932 if (j + sizeof (char) + sizeof (int) < 933 L->l_collate_bufsize - coll_offset) 934 return (j + 1 + sizeof (int)); 935 else 936 return ((ssize_t)-1); 937 } 938 939 /*ARGSUSED*/ 940 ssize_t 941 field_convert_numeric_wide(field_t *F, line_rec_t *L, vchar_t delimiter, 942 ssize_t data_offset, ssize_t data_length, ssize_t coll_offset) 943 { 944 wchar_t *number; 945 wchar_t *buffer = L->l_collate.wp + coll_offset; 946 char *lbuffer; 947 ssize_t length; 948 949 wchar_t sign = L'2'; 950 int log_ten; 951 wchar_t *digits = buffer + 1 + sizeof (int)/sizeof (wchar_t); 952 size_t j = 0; 953 size_t i; 954 955 int state = BEFORE_NUMBER; 956 957 number = L->l_data.wp + data_offset; 958 length = data_length; 959 960 for (i = 0; i < length; i++) 961 if (!W_IS_BLANK(number[i])) 962 break; 963 964 if (((length - i) * sizeof (wchar_t) + 4 * sizeof (wchar_t) + 965 sizeof (int)) > (L->l_collate_bufsize - coll_offset)) 966 return ((ssize_t)-1); 967 968 if (number[i] == L'-') { 969 i++; 970 sign = L'0'; 971 } 972 973 for (; i < length; i++) { 974 if (W_IS_SEPARATOR(number[i])) 975 continue; 976 977 if (number[i] == L'0' && !(state & IN_NUMBER)) 978 continue; 979 980 if (!iswdigit(number[i])) 981 break; 982 983 state |= IN_NUMBER; 984 if (sign == L'0') 985 digits[j++] = L'0' + L'9' - number[i]; 986 else 987 digits[j++] = number[i]; 988 } 989 990 if (i < length && W_IS_DECIMAL(number[i])) { 991 digits[j] = W_DECIMAL_CHAR; 992 log_ten = j++; 993 994 for (++i; i < length; i++) { 995 if (W_IS_SEPARATOR(number[i])) 996 continue; 997 998 if (!iswdigit(number[i])) 999 break; 1000 1001 if (number[i] != L'0') 1002 state |= IN_NUMBER; 1003 1004 if (sign == L'0') 1005 digits[j++] = L'0' + L'9' - number[i]; 1006 else 1007 digits[j++] = number[i]; 1008 } 1009 1010 if (sign == L'0') 1011 digits[j++] = (wchar_t)(WCHAR_MAX - 1012 W_INTERFIELD_SEPARATOR); 1013 } else { 1014 log_ten = (int)j; 1015 if (sign == L'0') 1016 digits[j++] = (wchar_t)(WCHAR_MAX - 1017 W_INTERFIELD_SEPARATOR); 1018 else 1019 digits[j] = W_INTERFIELD_SEPARATOR; 1020 } 1021 1022 if ((state & IN_NUMBER) == 0) { 1023 sign = L'1'; 1024 log_ten = 0; 1025 digits[0] = L'0'; 1026 j = 1; 1027 } 1028 1029 if (sign == L'0') { 1030 if (j != 0) 1031 log_ten = -log_ten - 2; 1032 else 1033 log_ten = -1; 1034 } 1035 1036 buffer[0] = sign; 1037 /* 1038 * Place logarithm in big-endian form. 1039 */ 1040 lbuffer = (char *)(buffer + 1); 1041 for (i = 0; i < sizeof (int); i++) 1042 lbuffer[i] = (log_ten << (i * NBBY)) 1043 >> ((sizeof (int) - 1) * NBBY); 1044 1045 if ((j + 1 + sizeof (int)/sizeof (wchar_t)) * sizeof (wchar_t) < 1046 L->l_collate_bufsize - coll_offset * sizeof (wchar_t)) 1047 return (j + 1 + sizeof (int) / sizeof (wchar_t)); 1048 else 1049 return ((ssize_t)-1); 1050 } 1051 1052 /* 1053 * flags contains one of CV_REALLOC, CV_FAIL, specifying the preferred behaviour 1054 * when coll_offset exceeds l_collate_bufsize. 1055 */ 1056 ssize_t 1057 field_convert(field_t *F, line_rec_t *L, int flags, vchar_t field_separator) 1058 { 1059 ssize_t coll_offset = 0; 1060 ssize_t start, end, distance; 1061 field_t *cur_fieldp = F; 1062 1063 while (cur_fieldp != NULL) { 1064 /* 1065 * delimit field 1066 */ 1067 if (!field_separator.sc) 1068 field_delimit(cur_fieldp, L, &start, &end); 1069 else 1070 field_delimit_tabbed(cur_fieldp, L, &start, &end, 1071 field_separator); 1072 1073 distance = 0; 1074 if (end - start > 0 || 1075 (end - start == 0 && F->f_species == NUMERIC)) { 1076 /* 1077 * Convert field, appending to collated field of line 1078 * record. 1079 */ 1080 distance = cur_fieldp->f_convert(cur_fieldp, L, 1081 field_separator, start, end - start, coll_offset); 1082 1083 /* 1084 * branch should execute comparatively rarely 1085 */ 1086 if (distance == -1) { 1087 if (flags & FCV_REALLOC) { 1088 ASSERT(L->l_collate_bufsize > 0); 1089 L->l_collate_bufsize *= 2; 1090 L->l_collate.sp = 1091 safe_realloc(L->l_collate.sp, 1092 L->l_collate_bufsize); 1093 1094 __S(stats_incr_convert_reallocs()); 1095 continue; 1096 } else { 1097 /* 1098 * FCV_FAIL has been set. 1099 */ 1100 return (-1); 1101 } 1102 } 1103 } 1104 1105 if (cur_fieldp->f_options & FIELD_REVERSE_COMPARISONS) { 1106 xstrninv(L->l_collate.sp, coll_offset, distance); 1107 *(L->l_collate.sp + coll_offset + distance) = 1108 (char)(UCHAR_MAX - INTERFIELD_SEPARATOR); 1109 distance++; 1110 } 1111 1112 ASSERT(distance >= 0); 1113 coll_offset += distance; 1114 if (coll_offset >= L->l_collate_bufsize) { 1115 if (flags & FCV_REALLOC) { 1116 ASSERT(L->l_collate_bufsize > 0); 1117 L->l_collate_bufsize *= 2; 1118 L->l_collate.sp = safe_realloc(L->l_collate.sp, 1119 L->l_collate_bufsize); 1120 1121 __S(stats_incr_convert_reallocs()); 1122 } else { 1123 return (-1); 1124 } 1125 } 1126 *(L->l_collate.sp + coll_offset) = INTERFIELD_SEPARATOR; 1127 coll_offset++; 1128 1129 cur_fieldp = cur_fieldp->f_next; 1130 } 1131 1132 L->l_collate_length = coll_offset; 1133 1134 return (L->l_collate_length); 1135 } 1136 1137 ssize_t 1138 field_convert_wide(field_t *F, line_rec_t *L, int flags, 1139 vchar_t field_separator) 1140 { 1141 ssize_t coll_offset = 0; 1142 ssize_t start, end, distance; 1143 field_t *cur_fieldp = F; 1144 1145 while (cur_fieldp != NULL) { 1146 if (!field_separator.wc) 1147 field_delimit_wide(cur_fieldp, L, &start, &end); 1148 else 1149 field_delimit_tabbed_wide(cur_fieldp, L, &start, &end, 1150 field_separator); 1151 1152 distance = 0; 1153 if (end - start > 0 || 1154 end - start == 0 && F->f_species == NUMERIC) { 1155 distance = cur_fieldp->f_convert(cur_fieldp, L, 1156 field_separator, start, end - start, coll_offset); 1157 1158 if (distance == -1) { 1159 if (flags & FCV_REALLOC) { 1160 ASSERT(L->l_collate_bufsize > 0); 1161 L->l_collate_bufsize *= 2; 1162 L->l_collate.wp = safe_realloc( 1163 L->l_collate.wp, 1164 L->l_collate_bufsize); 1165 1166 __S(stats_incr_convert_reallocs()); 1167 continue; 1168 } else { 1169 return (-1); 1170 } 1171 } 1172 } 1173 1174 if (cur_fieldp->f_options & FIELD_REVERSE_COMPARISONS) { 1175 xwcsninv(L->l_collate.wp, coll_offset, distance); 1176 *(L->l_collate.wp + coll_offset + distance) = 1177 WCHAR_MAX - INTERFIELD_SEPARATOR; 1178 distance++; 1179 } 1180 1181 ASSERT(distance >= 0); 1182 coll_offset += distance; 1183 if (coll_offset * sizeof (wchar_t) >= L->l_collate_bufsize) { 1184 if (flags & FCV_REALLOC) { 1185 ASSERT(L->l_collate_bufsize > 0); 1186 L->l_collate_bufsize *= 2; 1187 L->l_collate.wp = safe_realloc(L->l_collate.wp, 1188 L->l_collate_bufsize); 1189 1190 __S(stats_incr_convert_reallocs()); 1191 } else { 1192 return (-1); 1193 } 1194 } 1195 *(L->l_collate.wp + coll_offset) = W_INTERFIELD_SEPARATOR; 1196 coll_offset++; 1197 1198 cur_fieldp = cur_fieldp->f_next; 1199 } 1200 1201 L->l_collate_length = coll_offset * sizeof (wchar_t); 1202 #ifdef _LITTLE_ENDIAN 1203 xwcsntomsb(L->l_collate.wp, coll_offset); 1204 #endif /* _LITTLE_ENDIAN */ 1205 1206 return (L->l_collate_length); 1207 } 1208 1209 /* 1210 * line_convert() and line_convert_wide() are called when the collation vector 1211 * of a given line has been exhausted, and we are performing the final, 1212 * full-line comparison required by the sort specification. Because we do not 1213 * have a guarantee that l_data is null-terminated, we create an explicitly 1214 * null-terminated copy suitable for transformation to a collatable form for the 1215 * current locale. 1216 */ 1217 static void 1218 line_convert(line_rec_t *L) 1219 { 1220 static ssize_t bufsize; 1221 static char *buffer; 1222 1223 if (L->l_raw_collate.sp != NULL) 1224 return; 1225 1226 if (L->l_data_length + 1 > bufsize) { 1227 buffer = safe_realloc(buffer, L->l_data_length + 1); 1228 bufsize = L->l_data_length + 1; 1229 } 1230 1231 (void) strncpy(buffer, L->l_data.sp, L->l_data_length); 1232 buffer[L->l_data_length] = '\0'; 1233 1234 L->l_raw_collate.sp = safe_realloc(L->l_raw_collate.sp, 1235 xfrm_ops->sx_len(buffer, L->l_data_length) + 1); 1236 xfrm_ops->sx_xfrm(L->l_raw_collate.sp, buffer, 1237 xfrm_ops->sx_len(buffer, L->l_data_length) + 1); 1238 1239 __S(stats_incr_line_conversions()); 1240 } 1241 1242 static void 1243 line_convert_wide(line_rec_t *L) 1244 { 1245 static wchar_t *buffer; 1246 static ssize_t bufsize; 1247 1248 ssize_t dlength; 1249 1250 if (L->l_raw_collate.wp != NULL) 1251 return; 1252 1253 if (L->l_data_length + 1 > bufsize) { 1254 buffer = safe_realloc(buffer, (L->l_data_length + 1) * 1255 sizeof (wchar_t)); 1256 bufsize = L->l_data_length + 1; 1257 } 1258 1259 (void) wcsncpy(buffer, L->l_data.wp, L->l_data_length); 1260 buffer[L->l_data_length] = L'\0'; 1261 1262 dlength = wcsxfrm(NULL, buffer, 0) + 1; 1263 L->l_raw_collate.wp = safe_realloc(L->l_raw_collate.wp, dlength * 1264 sizeof (wchar_t)); 1265 (void) wcsxfrm(L->l_raw_collate.wp, buffer, dlength); 1266 1267 __S(stats_incr_line_conversions()); 1268 } 1269 1270 /* 1271 * Our convention for collation is 1272 * 1273 * A > B => r > 0, 1274 * A == B => r = 0, 1275 * A < B => r < 0 1276 * 1277 * This convention is consistent with the definition of memcmp(), strcmp(), and 1278 * strncmp() in the C locale. collated() and collated_wide() have two optional 1279 * behaviours, which can be activated by setting the appropriate values in 1280 * coll_flag: COLL_UNIQUE, which returns 0 if the l_collate fields of the line 1281 * records being compared are identical; COLL_DATA_ONLY, which ignores the 1282 * l_collate field for the current comparison; and COLL_REVERSE, which flips the 1283 * result for comparisons that fall through to an actual data comparison (since 1284 * the collated vector should already reflect reverse ordering from field 1285 * conversion). 1286 */ 1287 int 1288 collated(line_rec_t *A, line_rec_t *B, ssize_t depth, flag_t coll_flag) 1289 { 1290 ssize_t ml = MIN(A->l_collate_length, B->l_collate_length) - depth; 1291 int r; 1292 int mask = (coll_flag & COLL_REVERSE) ? INT_SIGN_FLIP_MASK : 1293 INT_SIGN_PASS_MASK; 1294 ssize_t la, lb; 1295 1296 if (!(coll_flag & COLL_DATA_ONLY)) { 1297 if (ml > 0) { 1298 r = memcmp(A->l_collate.sp + depth, 1299 B->l_collate.sp + depth, ml); 1300 1301 if (r) 1302 return (r); 1303 } 1304 1305 if (A->l_collate_length < B->l_collate_length) 1306 return (-1); 1307 1308 if (A->l_collate_length > B->l_collate_length) 1309 return (1); 1310 } 1311 1312 /* 1313 * This is where we cut out, if we know that the current sort is over 1314 * the entire line. 1315 */ 1316 if (coll_flag & COLL_UNIQUE) 1317 return (0); 1318 1319 line_convert(A); 1320 line_convert(B); 1321 1322 la = strlen(A->l_raw_collate.sp); 1323 lb = strlen(B->l_raw_collate.sp); 1324 1325 r = memcmp(A->l_raw_collate.sp, B->l_raw_collate.sp, MIN(la, lb)); 1326 1327 if (r) 1328 return (r ^ mask); 1329 1330 if (la < lb) 1331 return (-1 ^ mask); 1332 1333 if (la > lb) 1334 return (1 ^ mask); 1335 1336 return (0); 1337 } 1338 1339 int 1340 collated_wide(line_rec_t *A, line_rec_t *B, ssize_t depth, flag_t coll_flag) 1341 { 1342 ssize_t ml = MIN(A->l_collate_length, B->l_collate_length) - depth; 1343 int r; 1344 int mask = (coll_flag & COLL_REVERSE) ? INT_SIGN_FLIP_MASK : 1345 INT_SIGN_PASS_MASK; 1346 ssize_t la, lb; 1347 1348 if (!(coll_flag & COLL_DATA_ONLY)) { 1349 if (ml > 0) { 1350 r = memcmp(A->l_collate.sp + depth, 1351 B->l_collate.sp + depth, ml); 1352 1353 if (r) 1354 return (r); 1355 } 1356 if (A->l_collate_length < B->l_collate_length) 1357 return (-1); 1358 1359 if (A->l_collate_length > B->l_collate_length) 1360 return (1); 1361 } 1362 1363 if (coll_flag & COLL_UNIQUE) 1364 return (0); 1365 1366 line_convert_wide(A); 1367 line_convert_wide(B); 1368 1369 la = wcslen(A->l_raw_collate.wp); 1370 lb = wcslen(B->l_raw_collate.wp); 1371 1372 r = wmemcmp(A->l_raw_collate.wp, B->l_raw_collate.wp, 1373 (size_t)MIN(la, lb)); 1374 1375 if (r) 1376 return (r ^ mask); 1377 1378 if (la < lb) 1379 return (-1 ^ mask); 1380 1381 if (la > lb) 1382 return (1 ^ mask); 1383 1384 return (0); 1385 } 1386