1 /*- 2 * Copyright (c) 1991, 1993, 1994 3 * The Regents of the University of California. All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * Steve Hayman of the Computer Science Department, Indiana University, 7 * Michiro Hikida and David Goodenough. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. All advertising materials mentioning features or use of this software 18 * must display the following acknowledgement: 19 * This product includes software developed by the University of 20 * California, Berkeley and its contributors. 21 * 4. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 36 */ 37 38 #ifndef lint 39 static const char copyright[] = 40 "@(#) Copyright (c) 1991, 1993, 1994\n\ 41 The Regents of the University of California. All rights reserved.\n"; 42 #endif /* not lint */ 43 44 #ifndef lint 45 #if 0 46 static char sccsid[] = "@(#)join.c 8.6 (Berkeley) 5/4/95"; 47 #endif 48 #endif /* not lint */ 49 #include <sys/cdefs.h> 50 __FBSDID("$FreeBSD$"); 51 52 #include <sys/param.h> 53 54 #include <err.h> 55 #include <errno.h> 56 #include <locale.h> 57 #include <stdio.h> 58 #include <stdlib.h> 59 #include <string.h> 60 #include <unistd.h> 61 62 /* 63 * There's a structure per input file which encapsulates the state of the 64 * file. We repeatedly read lines from each file until we've read in all 65 * the consecutive lines from the file with a common join field. Then we 66 * compare the set of lines with an equivalent set from the other file. 67 */ 68 typedef struct { 69 char *line; /* line */ 70 u_long linealloc; /* line allocated count */ 71 char **fields; /* line field(s) */ 72 u_long fieldcnt; /* line field(s) count */ 73 u_long fieldalloc; /* line field(s) allocated count */ 74 } LINE; 75 76 typedef struct { 77 FILE *fp; /* file descriptor */ 78 u_long joinf; /* join field (-1, -2, -j) */ 79 int unpair; /* output unpairable lines (-a) */ 80 u_long number; /* 1 for file 1, 2 for file 2 */ 81 82 LINE *set; /* set of lines with same field */ 83 int pushbool; /* if pushback is set */ 84 u_long pushback; /* line on the stack */ 85 u_long setcnt; /* set count */ 86 u_long setalloc; /* set allocated count */ 87 } INPUT; 88 INPUT input1 = { NULL, 0, 0, 1, NULL, 0, 0, 0, 0 }, 89 input2 = { NULL, 0, 0, 2, NULL, 0, 0, 0, 0 }; 90 91 typedef struct { 92 u_long filenum; /* file number */ 93 u_long fieldno; /* field number */ 94 } OLIST; 95 OLIST *olist; /* output field list */ 96 u_long olistcnt; /* output field list count */ 97 u_long olistalloc; /* output field allocated count */ 98 99 int joinout = 1; /* show lines with matched join fields (-v) */ 100 int needsep; /* need separator character */ 101 int spans = 1; /* span multiple delimiters (-t) */ 102 char *empty; /* empty field replacement string (-e) */ 103 static char default_tabchar[] = " \t"; 104 char *tabchar = default_tabchar;/* delimiter characters (-t) */ 105 106 int cmp(LINE *, u_long, LINE *, u_long); 107 void fieldarg(char *); 108 void joinlines(INPUT *, INPUT *); 109 void obsolete(char **); 110 void outfield(LINE *, u_long, int); 111 void outoneline(INPUT *, LINE *); 112 void outtwoline(INPUT *, LINE *, INPUT *, LINE *); 113 void slurp(INPUT *); 114 void usage(void); 115 116 int 117 main(int argc, char *argv[]) 118 { 119 INPUT *F1, *F2; 120 int aflag, ch, cval, vflag; 121 char *end; 122 123 setlocale(LC_ALL, ""); 124 125 F1 = &input1; 126 F2 = &input2; 127 128 aflag = vflag = 0; 129 obsolete(argv); 130 while ((ch = getopt(argc, argv, "\01a:e:j:1:2:o:t:v:")) != -1) { 131 switch (ch) { 132 case '\01': /* See comment in obsolete(). */ 133 aflag = 1; 134 F1->unpair = F2->unpair = 1; 135 break; 136 case '1': 137 if ((F1->joinf = strtol(optarg, &end, 10)) < 1) 138 errx(1, "-1 option field number less than 1"); 139 if (*end) 140 errx(1, "illegal field number -- %s", optarg); 141 --F1->joinf; 142 break; 143 case '2': 144 if ((F2->joinf = strtol(optarg, &end, 10)) < 1) 145 errx(1, "-2 option field number less than 1"); 146 if (*end) 147 errx(1, "illegal field number -- %s", optarg); 148 --F2->joinf; 149 break; 150 case 'a': 151 aflag = 1; 152 switch(strtol(optarg, &end, 10)) { 153 case 1: 154 F1->unpair = 1; 155 break; 156 case 2: 157 F2->unpair = 1; 158 break; 159 default: 160 errx(1, "-a option file number not 1 or 2"); 161 break; 162 } 163 if (*end) 164 errx(1, "illegal file number -- %s", optarg); 165 break; 166 case 'e': 167 empty = optarg; 168 break; 169 case 'j': 170 if ((F1->joinf = F2->joinf = 171 strtol(optarg, &end, 10)) < 1) 172 errx(1, "-j option field number less than 1"); 173 if (*end) 174 errx(1, "illegal field number -- %s", optarg); 175 --F1->joinf; 176 --F2->joinf; 177 break; 178 case 'o': 179 fieldarg(optarg); 180 break; 181 case 't': 182 spans = 0; 183 if (strlen(tabchar = optarg) != 1) 184 errx(1, "illegal tab character specification"); 185 break; 186 case 'v': 187 vflag = 1; 188 joinout = 0; 189 switch (strtol(optarg, &end, 10)) { 190 case 1: 191 F1->unpair = 1; 192 break; 193 case 2: 194 F2->unpair = 1; 195 break; 196 default: 197 errx(1, "-v option file number not 1 or 2"); 198 break; 199 } 200 if (*end) 201 errx(1, "illegal file number -- %s", optarg); 202 break; 203 case '?': 204 default: 205 usage(); 206 } 207 } 208 argc -= optind; 209 argv += optind; 210 211 if (aflag && vflag) 212 errx(1, "the -a and -v options are mutually exclusive"); 213 214 if (argc != 2) 215 usage(); 216 217 /* Open the files; "-" means stdin. */ 218 if (!strcmp(*argv, "-")) 219 F1->fp = stdin; 220 else if ((F1->fp = fopen(*argv, "r")) == NULL) 221 err(1, "%s", *argv); 222 ++argv; 223 if (!strcmp(*argv, "-")) 224 F2->fp = stdin; 225 else if ((F2->fp = fopen(*argv, "r")) == NULL) 226 err(1, "%s", *argv); 227 if (F1->fp == stdin && F2->fp == stdin) 228 errx(1, "only one input file may be stdin"); 229 230 slurp(F1); 231 slurp(F2); 232 while (F1->setcnt && F2->setcnt) { 233 cval = cmp(F1->set, F1->joinf, F2->set, F2->joinf); 234 if (cval == 0) { 235 /* Oh joy, oh rapture, oh beauty divine! */ 236 if (joinout) 237 joinlines(F1, F2); 238 slurp(F1); 239 slurp(F2); 240 } else if (cval < 0) { 241 /* File 1 takes the lead... */ 242 if (F1->unpair) 243 joinlines(F1, NULL); 244 slurp(F1); 245 } else { 246 /* File 2 takes the lead... */ 247 if (F2->unpair) 248 joinlines(F2, NULL); 249 slurp(F2); 250 } 251 } 252 253 /* 254 * Now that one of the files is used up, optionally output any 255 * remaining lines from the other file. 256 */ 257 if (F1->unpair) 258 while (F1->setcnt) { 259 joinlines(F1, NULL); 260 slurp(F1); 261 } 262 if (F2->unpair) 263 while (F2->setcnt) { 264 joinlines(F2, NULL); 265 slurp(F2); 266 } 267 exit(0); 268 } 269 270 void 271 slurp(INPUT *F) 272 { 273 LINE *lp, *lastlp, tmp; 274 size_t len; 275 int cnt; 276 char *bp, *fieldp; 277 278 /* 279 * Read all of the lines from an input file that have the same 280 * join field. 281 */ 282 F->setcnt = 0; 283 for (lastlp = NULL;; ++F->setcnt) { 284 /* 285 * If we're out of space to hold line structures, allocate 286 * more. Initialize the structure so that we know that this 287 * is new space. 288 */ 289 if (F->setcnt == F->setalloc) { 290 cnt = F->setalloc; 291 F->setalloc += 50; 292 if ((F->set = realloc(F->set, 293 F->setalloc * sizeof(LINE))) == NULL) 294 err(1, NULL); 295 memset(F->set + cnt, 0, 50 * sizeof(LINE)); 296 297 /* re-set lastlp in case it moved */ 298 if (lastlp != NULL) 299 lastlp = &F->set[F->setcnt - 1]; 300 } 301 302 /* 303 * Get any pushed back line, else get the next line. Allocate 304 * space as necessary. If taking the line from the stack swap 305 * the two structures so that we don't lose space allocated to 306 * either structure. This could be avoided by doing another 307 * level of indirection, but it's probably okay as is. 308 */ 309 lp = &F->set[F->setcnt]; 310 if (F->setcnt) 311 lastlp = &F->set[F->setcnt - 1]; 312 if (F->pushbool) { 313 tmp = F->set[F->setcnt]; 314 F->set[F->setcnt] = F->set[F->pushback]; 315 F->set[F->pushback] = tmp; 316 F->pushbool = 0; 317 continue; 318 } 319 if ((bp = fgetln(F->fp, &len)) == NULL) 320 return; 321 if (lp->linealloc <= len + 1) { 322 lp->linealloc += MAX(100, len + 1 - lp->linealloc); 323 if ((lp->line = 324 realloc(lp->line, lp->linealloc)) == NULL) 325 err(1, NULL); 326 } 327 memmove(lp->line, bp, len); 328 329 /* Replace trailing newline, if it exists. */ 330 if (bp[len - 1] == '\n') 331 lp->line[len - 1] = '\0'; 332 else 333 lp->line[len] = '\0'; 334 bp = lp->line; 335 336 /* Split the line into fields, allocate space as necessary. */ 337 lp->fieldcnt = 0; 338 while ((fieldp = strsep(&bp, tabchar)) != NULL) { 339 if (spans && *fieldp == '\0') 340 continue; 341 if (lp->fieldcnt == lp->fieldalloc) { 342 lp->fieldalloc += 50; 343 if ((lp->fields = realloc(lp->fields, 344 lp->fieldalloc * sizeof(char *))) == NULL) 345 err(1, NULL); 346 } 347 lp->fields[lp->fieldcnt++] = fieldp; 348 } 349 350 /* See if the join field value has changed. */ 351 if (lastlp != NULL && cmp(lp, F->joinf, lastlp, F->joinf)) { 352 F->pushbool = 1; 353 F->pushback = F->setcnt; 354 break; 355 } 356 } 357 } 358 359 int 360 cmp(LINE *lp1, u_long fieldno1, LINE *lp2, u_long fieldno2) 361 { 362 if (lp1->fieldcnt <= fieldno1) 363 return (lp2->fieldcnt <= fieldno2 ? 0 : 1); 364 if (lp2->fieldcnt <= fieldno2) 365 return (-1); 366 return (strcoll(lp1->fields[fieldno1], lp2->fields[fieldno2])); 367 } 368 369 void 370 joinlines(INPUT *F1, INPUT *F2) 371 { 372 u_long cnt1, cnt2; 373 374 /* 375 * Output the results of a join comparison. The output may be from 376 * either file 1 or file 2 (in which case the first argument is the 377 * file from which to output) or from both. 378 */ 379 if (F2 == NULL) { 380 for (cnt1 = 0; cnt1 < F1->setcnt; ++cnt1) 381 outoneline(F1, &F1->set[cnt1]); 382 return; 383 } 384 for (cnt1 = 0; cnt1 < F1->setcnt; ++cnt1) 385 for (cnt2 = 0; cnt2 < F2->setcnt; ++cnt2) 386 outtwoline(F1, &F1->set[cnt1], F2, &F2->set[cnt2]); 387 } 388 389 void 390 outoneline(INPUT *F, LINE *lp) 391 { 392 u_long cnt; 393 394 /* 395 * Output a single line from one of the files, according to the 396 * join rules. This happens when we are writing unmatched single 397 * lines. Output empty fields in the right places. 398 */ 399 if (olist) 400 for (cnt = 0; cnt < olistcnt; ++cnt) { 401 if (olist[cnt].filenum == (unsigned)F->number) 402 outfield(lp, olist[cnt].fieldno, 0); 403 else if (olist[cnt].filenum == 0) 404 outfield(lp, F->joinf, 0); 405 else 406 outfield(lp, 0, 1); 407 } 408 else 409 for (cnt = 0; cnt < lp->fieldcnt; ++cnt) 410 outfield(lp, cnt, 0); 411 (void)printf("\n"); 412 if (ferror(stdout)) 413 err(1, "stdout"); 414 needsep = 0; 415 } 416 417 void 418 outtwoline(INPUT *F1, LINE *lp1, INPUT *F2, LINE *lp2) 419 { 420 u_long cnt; 421 422 /* Output a pair of lines according to the join list (if any). */ 423 if (olist) 424 for (cnt = 0; cnt < olistcnt; ++cnt) 425 if (olist[cnt].filenum == 0) { 426 if (lp1->fieldcnt >= F1->joinf) 427 outfield(lp1, F1->joinf, 0); 428 else 429 outfield(lp2, F2->joinf, 0); 430 } else if (olist[cnt].filenum == 1) 431 outfield(lp1, olist[cnt].fieldno, 0); 432 else /* if (olist[cnt].filenum == 2) */ 433 outfield(lp2, olist[cnt].fieldno, 0); 434 else { 435 /* 436 * Output the join field, then the remaining fields from F1 437 * and F2. 438 */ 439 outfield(lp1, F1->joinf, 0); 440 for (cnt = 0; cnt < lp1->fieldcnt; ++cnt) 441 if (F1->joinf != cnt) 442 outfield(lp1, cnt, 0); 443 for (cnt = 0; cnt < lp2->fieldcnt; ++cnt) 444 if (F2->joinf != cnt) 445 outfield(lp2, cnt, 0); 446 } 447 (void)printf("\n"); 448 if (ferror(stdout)) 449 err(1, "stdout"); 450 needsep = 0; 451 } 452 453 void 454 outfield(LINE *lp, u_long fieldno, int out_empty) 455 { 456 if (needsep++) 457 (void)printf("%c", *tabchar); 458 if (!ferror(stdout)) { 459 if (lp->fieldcnt <= fieldno || out_empty) { 460 if (empty != NULL) 461 (void)printf("%s", empty); 462 } else { 463 if (*lp->fields[fieldno] == '\0') 464 return; 465 (void)printf("%s", lp->fields[fieldno]); 466 } 467 } 468 if (ferror(stdout)) 469 err(1, "stdout"); 470 } 471 472 /* 473 * Convert an output list argument "2.1, 1.3, 2.4" into an array of output 474 * fields. 475 */ 476 void 477 fieldarg(char *option) 478 { 479 u_long fieldno, filenum; 480 char *end, *token; 481 482 while ((token = strsep(&option, ", \t")) != NULL) { 483 if (*token == '\0') 484 continue; 485 if (token[0] == '0') 486 filenum = fieldno = 0; 487 else if ((token[0] == '1' || token[0] == '2') && 488 token[1] == '.') { 489 filenum = token[0] - '0'; 490 fieldno = strtol(token + 2, &end, 10); 491 if (*end) 492 errx(1, "malformed -o option field"); 493 if (fieldno == 0) 494 errx(1, "field numbers are 1 based"); 495 --fieldno; 496 } else 497 errx(1, "malformed -o option field"); 498 if (olistcnt == olistalloc) { 499 olistalloc += 50; 500 if ((olist = realloc(olist, 501 olistalloc * sizeof(OLIST))) == NULL) 502 err(1, NULL); 503 } 504 olist[olistcnt].filenum = filenum; 505 olist[olistcnt].fieldno = fieldno; 506 ++olistcnt; 507 } 508 } 509 510 void 511 obsolete(char **argv) 512 { 513 size_t len; 514 char **p, *ap, *t; 515 516 while ((ap = *++argv) != NULL) { 517 /* Return if "--". */ 518 if (ap[0] == '-' && ap[1] == '-') 519 return; 520 /* skip if not an option */ 521 if (ap[0] != '-') 522 continue; 523 switch (ap[1]) { 524 case 'a': 525 /* 526 * The original join allowed "-a", which meant the 527 * same as -a1 plus -a2. POSIX 1003.2, Draft 11.2 528 * only specifies this as "-a 1" and "a -2", so we 529 * have to use another option flag, one that is 530 * unlikely to ever be used or accidentally entered 531 * on the command line. (Well, we could reallocate 532 * the argv array, but that hardly seems worthwhile.) 533 */ 534 if (ap[2] == '\0' && (argv[1] == NULL || 535 (strcmp(argv[1], "1") != 0 && 536 strcmp(argv[1], "2") != 0))) { 537 ap[1] = '\01'; 538 warnx("-a option used without an argument; " 539 "reverting to historical behavior"); 540 } 541 break; 542 case 'j': 543 /* 544 * The original join allowed "-j[12] arg" and "-j arg". 545 * Convert the former to "-[12] arg". Don't convert 546 * the latter since getopt(3) can handle it. 547 */ 548 switch(ap[2]) { 549 case '1': 550 if (ap[3] != '\0') 551 goto jbad; 552 ap[1] = '1'; 553 ap[2] = '\0'; 554 break; 555 case '2': 556 if (ap[3] != '\0') 557 goto jbad; 558 ap[1] = '2'; 559 ap[2] = '\0'; 560 break; 561 case '\0': 562 break; 563 default: 564 jbad: errx(1, "illegal option -- %s", ap); 565 usage(); 566 } 567 break; 568 case 'o': 569 /* 570 * The original join allowed "-o arg arg". 571 * Convert to "-o arg -o arg". 572 */ 573 if (ap[2] != '\0') 574 break; 575 for (p = argv + 2; *p; ++p) { 576 if (p[0][0] == '0' || ((p[0][0] != '1' && 577 p[0][0] != '2') || p[0][1] != '.')) 578 break; 579 len = strlen(*p); 580 if (len - 2 != strspn(*p + 2, "0123456789")) 581 break; 582 if ((t = malloc(len + 3)) == NULL) 583 err(1, NULL); 584 t[0] = '-'; 585 t[1] = 'o'; 586 memmove(t + 2, *p, len + 1); 587 *p = t; 588 } 589 argv = p - 1; 590 break; 591 } 592 } 593 } 594 595 void 596 usage(void) 597 { 598 (void)fprintf(stderr, "%s %s\n%s\n", 599 "usage: join [-a fileno | -v fileno ] [-e string] [-1 field]", 600 "[-2 field]", 601 " [-o list] [-t char] file1 file2"); 602 exit(1); 603 } 604