1 /*- 2 * Copyright (c) 1991, 1993, 1994 3 * The Regents of the University of California. All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * Steve Hayman of the Computer Science Department, Indiana University, 7 * Michiro Hikida and David Goodenough. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. All advertising materials mentioning features or use of this software 18 * must display the following acknowledgement: 19 * This product includes software developed by the University of 20 * California, Berkeley and its contributors. 21 * 4. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 36 */ 37 38 #ifndef lint 39 static char copyright[] = 40 "@(#) Copyright (c) 1991, 1993, 1994\n\ 41 The Regents of the University of California. All rights reserved.\n"; 42 #endif /* not lint */ 43 44 #ifndef lint 45 static char sccsid[] = "@(#)join.c 8.6 (Berkeley) 5/4/95"; 46 #endif /* not lint */ 47 48 #include <sys/param.h> 49 50 #include <ctype.h> 51 #include <err.h> 52 #include <errno.h> 53 #include <stdio.h> 54 #include <stdlib.h> 55 #include <string.h> 56 #include <unistd.h> 57 58 /* 59 * There's a structure per input file which encapsulates the state of the 60 * file. We repeatedly read lines from each file until we've read in all 61 * the consecutive lines from the file with a common join field. Then we 62 * compare the set of lines with an equivalent set from the other file. 63 */ 64 typedef struct { 65 char *line; /* line */ 66 u_long linealloc; /* line allocated count */ 67 char **fields; /* line field(s) */ 68 u_long fieldcnt; /* line field(s) count */ 69 u_long fieldalloc; /* line field(s) allocated count */ 70 } LINE; 71 72 typedef struct { 73 FILE *fp; /* file descriptor */ 74 u_long joinf; /* join field (-1, -2, -j) */ 75 int unpair; /* output unpairable lines (-a) */ 76 int number; /* 1 for file 1, 2 for file 2 */ 77 78 LINE *set; /* set of lines with same field */ 79 int pushbool; /* if pushback is set */ 80 u_long pushback; /* line on the stack */ 81 u_long setcnt; /* set count */ 82 u_long setalloc; /* set allocated count */ 83 } INPUT; 84 INPUT input1 = { NULL, 0, 0, 1, NULL, 0, 0, 0, }, 85 input2 = { NULL, 0, 0, 2, NULL, 0, 0, 0, }; 86 87 typedef struct { 88 u_long filenum; /* file number */ 89 u_long fieldno; /* field number */ 90 } OLIST; 91 OLIST *olist; /* output field list */ 92 u_long olistcnt; /* output field list count */ 93 u_long olistalloc; /* output field allocated count */ 94 95 int joinout = 1; /* show lines with matched join fields (-v) */ 96 int needsep; /* need separator character */ 97 int spans = 1; /* span multiple delimiters (-t) */ 98 char *empty; /* empty field replacement string (-e) */ 99 char *tabchar = " \t"; /* delimiter characters (-t) */ 100 101 int cmp __P((LINE *, u_long, LINE *, u_long)); 102 void fieldarg __P((char *)); 103 void joinlines __P((INPUT *, INPUT *)); 104 void obsolete __P((char **)); 105 void outfield __P((LINE *, u_long, int)); 106 void outoneline __P((INPUT *, LINE *)); 107 void outtwoline __P((INPUT *, LINE *, INPUT *, LINE *)); 108 void slurp __P((INPUT *)); 109 void usage __P((void)); 110 111 int 112 main(argc, argv) 113 int argc; 114 char *argv[]; 115 { 116 INPUT *F1, *F2; 117 int aflag, ch, cval, vflag; 118 char *end; 119 120 F1 = &input1; 121 F2 = &input2; 122 123 aflag = vflag = 0; 124 obsolete(argv); 125 while ((ch = getopt(argc, argv, "\01a:e:j:1:2:o:t:v:")) != -1) { 126 switch (ch) { 127 case '\01': /* See comment in obsolete(). */ 128 aflag = 1; 129 F1->unpair = F2->unpair = 1; 130 break; 131 case '1': 132 if ((F1->joinf = strtol(optarg, &end, 10)) < 1) 133 errx(1, "-1 option field number less than 1"); 134 if (*end) 135 errx(1, "illegal field number -- %s", optarg); 136 --F1->joinf; 137 break; 138 case '2': 139 if ((F2->joinf = strtol(optarg, &end, 10)) < 1) 140 errx(1, "-2 option field number less than 1"); 141 if (*end) 142 errx(1, "illegal field number -- %s", optarg); 143 --F2->joinf; 144 break; 145 case 'a': 146 aflag = 1; 147 switch(strtol(optarg, &end, 10)) { 148 case 1: 149 F1->unpair = 1; 150 break; 151 case 2: 152 F2->unpair = 1; 153 break; 154 default: 155 errx(1, "-a option file number not 1 or 2"); 156 break; 157 } 158 if (*end) 159 errx(1, "illegal file number -- %s", optarg); 160 break; 161 case 'e': 162 empty = optarg; 163 break; 164 case 'j': 165 if ((F1->joinf = F2->joinf = 166 strtol(optarg, &end, 10)) < 1) 167 errx(1, "-j option field number less than 1"); 168 if (*end) 169 errx(1, "illegal field number -- %s", optarg); 170 --F1->joinf; 171 --F2->joinf; 172 break; 173 case 'o': 174 fieldarg(optarg); 175 break; 176 case 't': 177 spans = 0; 178 if (strlen(tabchar = optarg) != 1) 179 errx(1, "illegal tab character specification"); 180 break; 181 case 'v': 182 vflag = 1; 183 joinout = 0; 184 switch (strtol(optarg, &end, 10)) { 185 case 1: 186 F1->unpair = 1; 187 break; 188 case 2: 189 F2->unpair = 1; 190 break; 191 default: 192 errx(1, "-v option file number not 1 or 2"); 193 break; 194 } 195 if (*end) 196 errx(1, "illegal file number -- %s", optarg); 197 break; 198 case '?': 199 default: 200 usage(); 201 } 202 } 203 argc -= optind; 204 argv += optind; 205 206 if (aflag && vflag) 207 errx(1, "the -a and -v options are mutually exclusive"); 208 209 if (argc != 2) 210 usage(); 211 212 /* Open the files; "-" means stdin. */ 213 if (!strcmp(*argv, "-")) 214 F1->fp = stdin; 215 else if ((F1->fp = fopen(*argv, "r")) == NULL) 216 err(1, "%s", *argv); 217 ++argv; 218 if (!strcmp(*argv, "-")) 219 F2->fp = stdin; 220 else if ((F2->fp = fopen(*argv, "r")) == NULL) 221 err(1, "%s", *argv); 222 if (F1->fp == stdin && F2->fp == stdin) 223 errx(1, "only one input file may be stdin"); 224 225 slurp(F1); 226 slurp(F2); 227 while (F1->setcnt && F2->setcnt) { 228 cval = cmp(F1->set, F1->joinf, F2->set, F2->joinf); 229 if (cval == 0) { 230 /* Oh joy, oh rapture, oh beauty divine! */ 231 if (joinout) 232 joinlines(F1, F2); 233 slurp(F1); 234 slurp(F2); 235 } else if (cval < 0) { 236 /* File 1 takes the lead... */ 237 if (F1->unpair) 238 joinlines(F1, NULL); 239 slurp(F1); 240 } else { 241 /* File 2 takes the lead... */ 242 if (F2->unpair) 243 joinlines(F2, NULL); 244 slurp(F2); 245 } 246 } 247 248 /* 249 * Now that one of the files is used up, optionally output any 250 * remaining lines from the other file. 251 */ 252 if (F1->unpair) 253 while (F1->setcnt) { 254 joinlines(F1, NULL); 255 slurp(F1); 256 } 257 if (F2->unpair) 258 while (F2->setcnt) { 259 joinlines(F2, NULL); 260 slurp(F2); 261 } 262 exit(0); 263 } 264 265 void 266 slurp(F) 267 INPUT *F; 268 { 269 LINE *lp, *lastlp, tmp; 270 size_t len; 271 int cnt; 272 char *bp, *fieldp; 273 274 /* 275 * Read all of the lines from an input file that have the same 276 * join field. 277 */ 278 F->setcnt = 0; 279 for (lastlp = NULL;; ++F->setcnt) { 280 /* 281 * If we're out of space to hold line structures, allocate 282 * more. Initialize the structure so that we know that this 283 * is new space. 284 */ 285 if (F->setcnt == F->setalloc) { 286 cnt = F->setalloc; 287 F->setalloc += 50; 288 if ((F->set = realloc(F->set, 289 F->setalloc * sizeof(LINE))) == NULL) 290 err(1, NULL); 291 memset(F->set + cnt, 0, 50 * sizeof(LINE)); 292 293 /* re-set lastlp in case it moved */ 294 if (lastlp != NULL) 295 lastlp = &F->set[F->setcnt - 1]; 296 } 297 298 /* 299 * Get any pushed back line, else get the next line. Allocate 300 * space as necessary. If taking the line from the stack swap 301 * the two structures so that we don't lose space allocated to 302 * either structure. This could be avoided by doing another 303 * level of indirection, but it's probably okay as is. 304 */ 305 lp = &F->set[F->setcnt]; 306 if (F->setcnt) 307 lastlp = &F->set[F->setcnt - 1]; 308 if (F->pushbool) { 309 tmp = F->set[F->setcnt]; 310 F->set[F->setcnt] = F->set[F->pushback]; 311 F->set[F->pushback] = tmp; 312 F->pushbool = 0; 313 continue; 314 } 315 if ((bp = fgetln(F->fp, &len)) == NULL) 316 return; 317 if (lp->linealloc <= len + 1) { 318 lp->linealloc += MAX(100, len + 1 - lp->linealloc); 319 if ((lp->line = 320 realloc(lp->line, lp->linealloc)) == NULL) 321 err(1, NULL); 322 } 323 memmove(lp->line, bp, len); 324 325 /* Replace trailing newline, if it exists. */ 326 if (bp[len - 1] == '\n') 327 lp->line[len - 1] = '\0'; 328 else 329 lp->line[len] = '\0'; 330 bp = lp->line; 331 332 /* Split the line into fields, allocate space as necessary. */ 333 lp->fieldcnt = 0; 334 while ((fieldp = strsep(&bp, tabchar)) != NULL) { 335 if (spans && *fieldp == '\0') 336 continue; 337 if (lp->fieldcnt == lp->fieldalloc) { 338 lp->fieldalloc += 50; 339 if ((lp->fields = realloc(lp->fields, 340 lp->fieldalloc * sizeof(char *))) == NULL) 341 err(1, NULL); 342 } 343 lp->fields[lp->fieldcnt++] = fieldp; 344 } 345 346 /* See if the join field value has changed. */ 347 if (lastlp != NULL && cmp(lp, F->joinf, lastlp, F->joinf)) { 348 F->pushbool = 1; 349 F->pushback = F->setcnt; 350 break; 351 } 352 } 353 } 354 355 int 356 cmp(lp1, fieldno1, lp2, fieldno2) 357 LINE *lp1, *lp2; 358 u_long fieldno1, fieldno2; 359 { 360 if (lp1->fieldcnt <= fieldno1) 361 return (lp2->fieldcnt <= fieldno2 ? 0 : 1); 362 if (lp2->fieldcnt <= fieldno2) 363 return (-1); 364 return (strcmp(lp1->fields[fieldno1], lp2->fields[fieldno2])); 365 } 366 367 void 368 joinlines(F1, F2) 369 INPUT *F1, *F2; 370 { 371 int cnt1, cnt2; 372 373 /* 374 * Output the results of a join comparison. The output may be from 375 * either file 1 or file 2 (in which case the first argument is the 376 * file from which to output) or from both. 377 */ 378 if (F2 == NULL) { 379 for (cnt1 = 0; cnt1 < F1->setcnt; ++cnt1) 380 outoneline(F1, &F1->set[cnt1]); 381 return; 382 } 383 for (cnt1 = 0; cnt1 < F1->setcnt; ++cnt1) 384 for (cnt2 = 0; cnt2 < F2->setcnt; ++cnt2) 385 outtwoline(F1, &F1->set[cnt1], F2, &F2->set[cnt2]); 386 } 387 388 void 389 outoneline(F, lp) 390 INPUT *F; 391 LINE *lp; 392 { 393 int cnt; 394 395 /* 396 * Output a single line from one of the files, according to the 397 * join rules. This happens when we are writing unmatched single 398 * lines. Output empty fields in the right places. 399 */ 400 if (olist) 401 for (cnt = 0; cnt < olistcnt; ++cnt) { 402 if (olist[cnt].filenum == F->number) 403 outfield(lp, olist[cnt].fieldno, 0); 404 else 405 outfield(lp, 0, 1); 406 } 407 else 408 for (cnt = 0; cnt < lp->fieldcnt; ++cnt) 409 outfield(lp, cnt, 0); 410 (void)printf("\n"); 411 if (ferror(stdout)) 412 err(1, "stdout"); 413 needsep = 0; 414 } 415 416 void 417 outtwoline(F1, lp1, F2, lp2) 418 INPUT *F1, *F2; 419 LINE *lp1, *lp2; 420 { 421 int cnt; 422 423 /* Output a pair of lines according to the join list (if any). */ 424 if (olist) 425 for (cnt = 0; cnt < olistcnt; ++cnt) 426 if (olist[cnt].filenum == 1) 427 outfield(lp1, olist[cnt].fieldno, 0); 428 else /* if (olist[cnt].filenum == 2) */ 429 outfield(lp2, olist[cnt].fieldno, 0); 430 else { 431 /* 432 * Output the join field, then the remaining fields from F1 433 * and F2. 434 */ 435 outfield(lp1, F1->joinf, 0); 436 for (cnt = 0; cnt < lp1->fieldcnt; ++cnt) 437 if (F1->joinf != cnt) 438 outfield(lp1, cnt, 0); 439 for (cnt = 0; cnt < lp2->fieldcnt; ++cnt) 440 if (F2->joinf != cnt) 441 outfield(lp2, cnt, 0); 442 } 443 (void)printf("\n"); 444 if (ferror(stdout)) 445 err(1, "stdout"); 446 needsep = 0; 447 } 448 449 void 450 outfield(lp, fieldno, out_empty) 451 LINE *lp; 452 u_long fieldno; 453 int out_empty; 454 { 455 if (needsep++) 456 (void)printf("%c", *tabchar); 457 if (!ferror(stdout)) 458 if (lp->fieldcnt <= fieldno || out_empty) { 459 if (empty != NULL) 460 (void)printf("%s", empty); 461 } else { 462 if (*lp->fields[fieldno] == '\0') 463 return; 464 (void)printf("%s", lp->fields[fieldno]); 465 } 466 if (ferror(stdout)) 467 err(1, "stdout"); 468 } 469 470 /* 471 * Convert an output list argument "2.1, 1.3, 2.4" into an array of output 472 * fields. 473 */ 474 void 475 fieldarg(option) 476 char *option; 477 { 478 u_long fieldno; 479 char *end, *token; 480 481 while ((token = strsep(&option, ", \t")) != NULL) { 482 if (*token == '\0') 483 continue; 484 if (token[0] != '1' && token[0] != '2' || token[1] != '.') 485 errx(1, "malformed -o option field"); 486 fieldno = strtol(token + 2, &end, 10); 487 if (*end) 488 errx(1, "malformed -o option field"); 489 if (fieldno == 0) 490 errx(1, "field numbers are 1 based"); 491 if (olistcnt == olistalloc) { 492 olistalloc += 50; 493 if ((olist = realloc(olist, 494 olistalloc * sizeof(OLIST))) == NULL) 495 err(1, NULL); 496 } 497 olist[olistcnt].filenum = token[0] - '0'; 498 olist[olistcnt].fieldno = fieldno - 1; 499 ++olistcnt; 500 } 501 } 502 503 void 504 obsolete(argv) 505 char **argv; 506 { 507 int len; 508 char **p, *ap, *t; 509 510 while ((ap = *++argv) != NULL) { 511 /* Return if "--". */ 512 if (ap[0] == '-' && ap[1] == '-') 513 return; 514 switch (ap[1]) { 515 case 'a': 516 /* 517 * The original join allowed "-a", which meant the 518 * same as -a1 plus -a2. POSIX 1003.2, Draft 11.2 519 * only specifies this as "-a 1" and "a -2", so we 520 * have to use another option flag, one that is 521 * unlikely to ever be used or accidentally entered 522 * on the command line. (Well, we could reallocate 523 * the argv array, but that hardly seems worthwhile.) 524 */ 525 if (ap[2] == '\0') 526 ap[1] = '\01'; 527 break; 528 case 'j': 529 /* 530 * The original join allowed "-j[12] arg" and "-j arg". 531 * Convert the former to "-[12] arg". Don't convert 532 * the latter since getopt(3) can handle it. 533 */ 534 switch(ap[2]) { 535 case '1': 536 if (ap[3] != '\0') 537 goto jbad; 538 ap[1] = '1'; 539 ap[2] = '\0'; 540 break; 541 case '2': 542 if (ap[3] != '\0') 543 goto jbad; 544 ap[1] = '2'; 545 ap[2] = '\0'; 546 break; 547 case '\0': 548 break; 549 default: 550 jbad: errx(1, "illegal option -- %s", ap); 551 usage(); 552 } 553 break; 554 case 'o': 555 /* 556 * The original join allowed "-o arg arg". 557 * Convert to "-o arg -o arg". 558 */ 559 if (ap[2] != '\0') 560 break; 561 for (p = argv + 2; *p; ++p) { 562 if (p[0][0] != '1' && 563 p[0][0] != '2' || p[0][1] != '.') 564 break; 565 len = strlen(*p); 566 if (len - 2 != strspn(*p + 2, "0123456789")) 567 break; 568 if ((t = malloc(len + 3)) == NULL) 569 err(1, NULL); 570 t[0] = '-'; 571 t[1] = 'o'; 572 memmove(t + 2, *p, len + 1); 573 *p = t; 574 } 575 argv = p - 1; 576 break; 577 } 578 } 579 } 580 581 void 582 usage() 583 { 584 (void)fprintf(stderr, "%s%s\n", 585 "usage: join [-a fileno | -v fileno ] [-e string] [-1 field] ", 586 "[-2 field]\n [-o list] [-t char] file1 file2"); 587 exit(1); 588 } 589