1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* Copyright (c) 1988 AT&T */ 23 /* All Rights Reserved */ 24 25 26 /* 27 * Copyright 2004 Sun Microsystems, Inc. All rights reserved. 28 * Use is subject to license terms. 29 */ 30 31 #include <ctype.h> 32 #include <stdio.h> 33 #include <sys/types.h> 34 #include <sys/sysmacros.h> 35 #include <sys/byteorder.h> 36 #if SHARE 37 #include <sys/ipc.h> 38 #include <sys/shm.h> 39 #define ERR -1 40 #endif 41 #include "invlib.h" 42 #include "library.h" 43 44 #define DEBUG 0 /* debugging code and realloc messages */ 45 #define BLOCKSIZE 2 * BUFSIZ /* logical block size */ 46 #define LINEMAX 1000 /* sorted posting line max size */ 47 #define POSTINC 10000 /* posting buffer size increment */ 48 #define SEP ' ' /* sorted posting field separator */ 49 #define SETINC 100 /* posting set size increment */ 50 #define STATS 0 /* print statistics */ 51 #define SUPERINC 10000 /* super index size increment */ 52 #define TERMMAX 512 /* term max size */ 53 #define VERSION 1 /* inverted index format version */ 54 #define ZIPFSIZE 200 /* zipf curve size */ 55 #define FREAD "r" /* fopen for reading */ 56 #define FREADP "r+" /* fopen for update */ 57 #define FWRITE "w" /* fopen truncate or create for writing */ 58 #define FWRITEP "w+" /* fopen truncate or create for update */ 59 60 extern char *argv0; /* command name (must be set in main function) */ 61 62 int invbreak; 63 64 #if STATS 65 int showzipf; /* show postings per term distribution */ 66 #endif 67 68 static POSTING *item, *enditem, *item1 = NULL, *item2 = NULL; 69 static unsigned setsize1, setsize2; 70 static long numitems, totterm, zerolong; 71 static char *indexfile, *postingfile; 72 static FILE *outfile, *fpost; 73 static unsigned supersize = SUPERINC, supintsize; 74 static int numpost, numlogblk, amtused, nextpost, 75 lastinblk, numinvitems; 76 static POSTING *POST, *postptr; 77 static unsigned long *SUPINT, *supint, nextsupfing; 78 static char *SUPFING, *supfing; 79 static char thisterm[TERMMAX]; 80 static union { 81 long invblk[BLOCKSIZE / sizeof (long)]; 82 char chrblk[BLOCKSIZE]; 83 } logicalblk; 84 85 #if DEBUG || STATS 86 static long totpost; 87 #endif 88 89 #if STATS 90 static int zipf[ZIPFSIZE + 1]; 91 #endif 92 93 static void invcannotalloc(size_t n); 94 static void invcannotopen(char *file); 95 static void invcannotwrite(char *file); 96 static int invnewterm(void); 97 static int boolready(void); 98 99 long 100 invmake(char *invname, char *invpost, FILE *infile) 101 { 102 unsigned char *s; 103 long num; 104 int i; 105 long fileindex; 106 unsigned postsize = POSTINC * sizeof (POSTING); 107 unsigned long *intptr; 108 char line[LINEMAX]; 109 long tlong; 110 PARAM param; 111 POSTING posting; 112 #if STATS 113 int j; 114 unsigned maxtermlen = 0; 115 #endif 116 /* output file */ 117 if ((outfile = vpfopen(invname, FWRITEP)) == NULL) { 118 invcannotopen(invname); 119 return (0); 120 } 121 indexfile = invname; 122 (void) fseek(outfile, (long)BUFSIZ, 0); 123 124 /* posting file */ 125 if ((fpost = vpfopen(invpost, FWRITE)) == NULL) { 126 invcannotopen(invpost); 127 return (0); 128 } 129 postingfile = invpost; 130 nextpost = 0; 131 /* get space for the postings list */ 132 if ((POST = (POSTING *)malloc(postsize)) == NULL) { 133 invcannotalloc(postsize); 134 return (0); 135 } 136 postptr = POST; 137 /* get space for the superfinger (superindex) */ 138 if ((SUPFING = malloc(supersize)) == NULL) { 139 invcannotalloc(supersize); 140 return (0); 141 } 142 supfing = SUPFING; 143 supintsize = supersize / 40; 144 /* also for the superfinger index */ 145 if ((SUPINT = malloc(supintsize * sizeof (long))) == NULL) { 146 invcannotalloc(supintsize * sizeof (long)); 147 return (0); 148 } 149 supint = SUPINT; 150 supint++; /* leave first term open for a count */ 151 /* initialize using an empty term */ 152 (void) strcpy(thisterm, ""); 153 *supint++ = 0; 154 *supfing++ = ' '; 155 *supfing++ = '\0'; 156 nextsupfing = 2; 157 #if DEBUG || STATS 158 totpost = 0L; 159 #endif 160 totterm = 0L; 161 numpost = 1; 162 163 /* 164 * set up as though a block had come and gone, i.e., set up for 165 * new block 166 */ 167 amtused = 16; /* leave no space - init 3 words + one for luck */ 168 numinvitems = 0; 169 numlogblk = 0; 170 lastinblk = BLOCKSIZE; 171 172 /* now loop as long as more to read (till eof) */ 173 while (fgets(line, LINEMAX, infile) != NULL) { 174 #if DEBUG || STATS 175 ++totpost; 176 #endif 177 s = (unsigned char *) strchr(line, SEP); 178 if (s == NULL) /* where did this line come from ??? */ 179 continue; /* workaround: just skip it */ 180 *s = '\0'; 181 #if STATS 182 if ((i = strlen(line)) > maxtermlen) { 183 maxtermlen = i; 184 } 185 #endif 186 #if DEBUG 187 (void) printf("%ld: %s ", totpost, line); 188 (void) fflush(stdout); 189 #endif 190 if (strcmp(thisterm, line) == 0) { 191 if (postptr + 10 > POST + postsize / sizeof (POSTING)) { 192 i = postptr - POST; 193 postsize += POSTINC * sizeof (POSTING); 194 if ((POST = realloc(POST, postsize)) == NULL) { 195 invcannotalloc(postsize); 196 return (0); 197 } 198 postptr = i + POST; 199 #if DEBUG 200 (void) printf("reallocated post space to %u, " 201 "totpost=%ld\n", postsize, totpost); 202 #endif 203 } 204 numpost++; 205 } else { 206 /* have a new term */ 207 if (!invnewterm()) { 208 return (0); 209 } 210 (void) strcpy(thisterm, line); 211 numpost = 1; 212 postptr = POST; 213 fileindex = 0; 214 } 215 /* get the new posting */ 216 num = *++s - '!'; 217 i = 1; 218 do { 219 num = BASE * num + *++s - '!'; 220 } while (++i < PRECISION); 221 posting.lineoffset = num; 222 while (++fileindex < nsrcoffset && num > srcoffset[fileindex]) { 223 ; 224 } 225 posting.fileindex = --fileindex; 226 posting.type = *++s; 227 num = *++s - '!'; 228 if (*s != '\n') { 229 num = *++s - '!'; 230 while (*++s != '\n') { 231 num = BASE * num + *s - '!'; 232 } 233 posting.fcnoffset = num; 234 } else { 235 posting.fcnoffset = 0; 236 } 237 *postptr++ = posting; 238 #if DEBUG 239 (void) printf("%ld %ld %ld %ld\n", posting.fileindex, 240 posting.fcnoffset, posting.lineoffset, posting.type); 241 (void) fflush(stdout); 242 #endif 243 } 244 if (!invnewterm()) { 245 return (0); 246 } 247 /* now clean up final block */ 248 logicalblk.invblk[0] = numinvitems; 249 /* loops pointer around to start */ 250 logicalblk.invblk[1] = 0; 251 logicalblk.invblk[2] = numlogblk - 1; 252 if (fwrite((char *)&logicalblk, BLOCKSIZE, 1, outfile) == 0) { 253 goto cannotwrite; 254 } 255 numlogblk++; 256 /* write out block to save space. what in it doesn't matter */ 257 if (fwrite((char *)&logicalblk, BLOCKSIZE, 1, outfile) == 0) { 258 goto cannotwrite; 259 } 260 /* finish up the super finger */ 261 *SUPINT = numlogblk; 262 /* add to the offsets the size of the offset pointers */ 263 intptr = (SUPINT + 1); 264 i = (char *)supint - (char *)SUPINT; 265 while (intptr < supint) 266 *intptr++ += i; 267 /* write out the offsets (1 for the N at start) and the super finger */ 268 if (fwrite((char *)SUPINT, sizeof (*SUPINT), numlogblk + 1, 269 outfile) == 0 || 270 fwrite(SUPFING, 1, supfing - SUPFING, outfile) == 0) { 271 goto cannotwrite; 272 } 273 /* save the size for reference later */ 274 nextsupfing = sizeof (long) + sizeof (long) * numlogblk + 275 (supfing - SUPFING); 276 /* 277 * make sure the file ends at a logical block boundary. This is 278 * necessary for invinsert to correctly create extended blocks 279 */ 280 i = nextsupfing % BLOCKSIZE; 281 /* write out junk to fill log blk */ 282 if (fwrite(SUPFING, BLOCKSIZE - i, 1, outfile) == 0 || 283 fflush(outfile) == EOF) { 284 /* rewind doesn't check for write failure */ 285 goto cannotwrite; 286 } 287 /* write the control area */ 288 rewind(outfile); 289 param.version = VERSION; 290 param.filestat = 0; 291 param.sizeblk = BLOCKSIZE; 292 param.startbyte = (numlogblk + 1) * BLOCKSIZE + BUFSIZ; 293 param.supsize = nextsupfing; 294 param.cntlsize = BUFSIZ; 295 param.share = 0; 296 if (fwrite((char *)¶m, sizeof (param), 1, outfile) == 0) { 297 goto cannotwrite; 298 } 299 for (i = 0; i < 10; i++) /* for future use */ 300 if (fwrite((char *)&zerolong, sizeof (zerolong), 301 1, outfile) == 0) { 302 goto cannotwrite; 303 } 304 305 /* make first block loop backwards to last block */ 306 if (fflush(outfile) == EOF) { 307 /* fseek doesn't check for write failure */ 308 goto cannotwrite; 309 } 310 /* get to second word first block */ 311 (void) fseek(outfile, (long)BUFSIZ + 8, 0); 312 tlong = numlogblk - 1; 313 if (fwrite((char *)&tlong, sizeof (tlong), 1, outfile) == 0 || 314 fclose(outfile) == EOF) { 315 cannotwrite: 316 invcannotwrite(invname); 317 return (0); 318 } 319 if (fclose(fpost) == EOF) { 320 invcannotwrite(postingfile); 321 return (0); 322 } 323 --totterm; /* don't count null term */ 324 #if STATS 325 (void) printf("logical blocks = %d, postings = %ld, terms = %ld, " 326 "max term length = %d\n", numlogblk, totpost, totterm, maxtermlen); 327 if (showzipf) { 328 (void) printf( 329 "\n************* ZIPF curve ****************\n"); 330 for (j = ZIPFSIZE; j > 1; j--) 331 if (zipf[j]) 332 break; 333 for (i = 1; i < j; ++i) { 334 (void) printf("%3d -%6d ", i, zipf[i]); 335 if (i % 6 == 0) (void) putchar('\n'); 336 } 337 (void) printf(">%d-%6d\n", ZIPFSIZE, zipf[0]); 338 } 339 #endif 340 /* free all malloc'd memory */ 341 free(POST); 342 free(SUPFING); 343 free(SUPINT); 344 return (totterm); 345 } 346 347 /* add a term to the data base */ 348 349 static int 350 invnewterm(void) 351 { 352 int backupflag, i, j, maxback, holditems, gooditems, howfar; 353 int len, numwilluse, wdlen; 354 char *tptr, *tptr2, *tptr3; 355 union { 356 unsigned long packword[2]; 357 ENTRY e; 358 } iteminfo; 359 360 totterm++; 361 #if STATS 362 /* keep zipfian info on the distribution */ 363 if (numpost <= ZIPFSIZE) 364 zipf[numpost]++; 365 else 366 zipf[0]++; 367 #endif 368 len = strlen(thisterm); 369 wdlen = (len + (sizeof (long) - 1)) / sizeof (long); 370 numwilluse = (wdlen + 3) * sizeof (long); 371 /* new block if at least 1 item in block */ 372 if (numinvitems && numwilluse + amtused > BLOCKSIZE) { 373 /* set up new block */ 374 if (supfing + 500 > SUPFING + supersize) { 375 i = supfing - SUPFING; 376 supersize += 20000; 377 if ((SUPFING = realloc(SUPFING, supersize)) == NULL) { 378 invcannotalloc(supersize); 379 return (0); 380 } 381 supfing = i + SUPFING; 382 #if DEBUG 383 (void) printf("reallocated superfinger space to %d, " 384 "totpost=%ld\n", supersize, totpost); 385 #endif 386 } 387 /* check that room for the offset as well */ 388 if ((numlogblk + 10) > supintsize) { 389 i = supint - SUPINT; 390 supintsize += SUPERINC; 391 if ((SUPINT = realloc((char *)SUPINT, 392 supintsize * sizeof (long))) == NULL) { 393 invcannotalloc(supintsize * sizeof (long)); 394 return (0); 395 } 396 supint = i + SUPINT; 397 #if DEBUG 398 (void) printf("reallocated superfinger offset to %d, " 399 "totpost = %ld\n", supintsize * sizeof (long), 400 totpost); 401 #endif 402 } 403 /* See if backup is efficatious */ 404 backupflag = 0; 405 maxback = strlen(thisterm) / 10; 406 holditems = numinvitems; 407 if (maxback > numinvitems) 408 maxback = numinvitems - 2; 409 howfar = 0; 410 while (--maxback > 0) { 411 howfar++; 412 iteminfo.packword[0] = 413 logicalblk.invblk[--holditems * 2 + 414 (sizeof (long) - 1)]; 415 if ((i = iteminfo.e.size / 10) < maxback) { 416 maxback = i; 417 backupflag = howfar; 418 gooditems = holditems; 419 tptr2 = logicalblk.chrblk + iteminfo.e.offset; 420 } 421 } 422 /* see if backup will occur */ 423 if (backupflag) { 424 numinvitems = gooditems; 425 } 426 logicalblk.invblk[0] = numinvitems; 427 /* set forward pointer pointing to next */ 428 logicalblk.invblk[1] = numlogblk + 1; 429 /* set back pointer to last block */ 430 logicalblk.invblk[2] = numlogblk - 1; 431 if (fwrite((char *)logicalblk.chrblk, 1, 432 BLOCKSIZE, outfile) == 0) { 433 invcannotwrite(indexfile); 434 return (0); 435 } 436 amtused = 16; 437 numlogblk++; 438 /* check if had to back up, if so do it */ 439 if (backupflag) { 440 /* find out where the end of the new block is */ 441 iteminfo.packword[0] = 442 logicalblk.invblk[numinvitems * 2 + 1]; 443 tptr3 = logicalblk.chrblk + iteminfo.e.offset; 444 /* move the index for this block */ 445 for (i = 3; i <= (backupflag * 2 + 2); i++) { 446 logicalblk.invblk[i] = 447 logicalblk.invblk[numinvitems * 2+i]; 448 } 449 /* move the word into the super index */ 450 iteminfo.packword[0] = logicalblk.invblk[3]; 451 iteminfo.packword[1] = logicalblk.invblk[4]; 452 tptr2 = logicalblk.chrblk + iteminfo.e.offset; 453 (void) strncpy(supfing, tptr2, (int)iteminfo.e.size); 454 *(supfing + iteminfo.e.size) = '\0'; 455 #if DEBUG 456 (void) printf("backup %d at term=%s to term=%s\n", 457 backupflag, thisterm, supfing); 458 #endif 459 *supint++ = nextsupfing; 460 nextsupfing += strlen(supfing) + 1; 461 supfing += strlen(supfing) + 1; 462 /* now fix up the logical block */ 463 tptr = logicalblk.chrblk + lastinblk; 464 lastinblk = BLOCKSIZE; 465 tptr2 = logicalblk.chrblk + lastinblk; 466 j = tptr3 - tptr; 467 while (tptr3 > tptr) 468 *--tptr2 = *--tptr3; 469 lastinblk -= j; 470 amtused += (8 * backupflag + j); 471 for (i = 3; i < (backupflag * 2 + 2); i += 2) { 472 iteminfo.packword[0] = logicalblk.invblk[i]; 473 iteminfo.e.offset += (tptr2 - tptr3); 474 logicalblk.invblk[i] = iteminfo.packword[0]; 475 } 476 numinvitems = backupflag; 477 } else { /* no backup needed */ 478 numinvitems = 0; 479 lastinblk = BLOCKSIZE; 480 /* add new term to superindex */ 481 (void) strcpy(supfing, thisterm); 482 supfing += strlen(thisterm) + 1; 483 *supint++ = nextsupfing; 484 nextsupfing += strlen(thisterm) + 1; 485 } 486 } 487 lastinblk -= (numwilluse - 8); 488 iteminfo.e.offset = lastinblk; 489 iteminfo.e.size = (char)len; 490 iteminfo.e.space = 0; 491 iteminfo.e.post = numpost; 492 (void) strncpy(logicalblk.chrblk + lastinblk, thisterm, len); 493 amtused += numwilluse; 494 logicalblk.invblk[(lastinblk/sizeof (long))+wdlen] = nextpost; 495 if ((i = postptr - POST) > 0) { 496 if (fwrite((char *)POST, sizeof (POSTING), i, fpost) == 0) { 497 invcannotwrite(postingfile); 498 return (0); 499 } 500 nextpost += i * sizeof (POSTING); 501 } 502 logicalblk.invblk[3+2*numinvitems++] = iteminfo.packword[0]; 503 logicalblk.invblk[2+2*numinvitems] = iteminfo.packword[1]; 504 return (1); 505 } 506 507 static void 508 swap_ints(void *p, size_t sz) 509 { 510 uint32_t *s; 511 uint32_t *e = (uint32_t *)p + (sz / sizeof (uint32_t)); 512 513 for (s = p; s < e; s++) 514 *s = BSWAP_32(*s); 515 } 516 517 static void 518 write_param(INVCONTROL *invcntl) 519 { 520 if (invcntl->swap) 521 swap_ints(&invcntl->param, sizeof (invcntl->param)); 522 523 rewind(invcntl->invfile); 524 (void) fwrite((char *)&invcntl->param, sizeof (invcntl->param), 1, 525 invcntl->invfile); 526 527 if (invcntl->swap) 528 swap_ints(&invcntl->param, sizeof (invcntl->param)); 529 } 530 531 static void 532 read_superfinger(INVCONTROL *invcntl) 533 { 534 size_t count; 535 536 (void) fseek(invcntl->invfile, invcntl->param.startbyte, SEEK_SET); 537 (void) fread(invcntl->iindex, (int)invcntl->param.supsize, 538 1, invcntl->invfile); 539 540 if (invcntl->swap) { 541 /* 542 * The superfinger consists of a count, followed by 543 * count offsets, followed by a string table (which 544 * the offsets reference). 545 * 546 * We need to swap the count and the offsets. 547 */ 548 count = 1 + BSWAP_32(*(uint32_t *)invcntl->iindex); 549 swap_ints(invcntl->iindex, count * sizeof (unsigned long)); 550 } 551 } 552 553 static void 554 read_logblock(INVCONTROL *invcntl, int block) 555 { 556 /* note always fetch it if the file is busy */ 557 if ((block != invcntl->numblk) || 558 (invcntl->param.filestat >= INVBUSY)) { 559 (void) fseek(invcntl->invfile, 560 (block * invcntl->param.sizeblk) + invcntl->param.cntlsize, 561 SEEK_SET); 562 invcntl->numblk = block; 563 (void) fread(invcntl->logblk, (int)invcntl->param.sizeblk, 564 1, invcntl->invfile); 565 566 if (invcntl->swap) { 567 size_t count; 568 ENTRY *ecur, *eend; 569 uint32_t *postptr; 570 571 /* 572 * A logblock consists of a count, a next block id, 573 * and a previous block id, followed by count 574 * ENTRYs, followed by alternating strings and 575 * offsets. 576 */ 577 swap_ints(invcntl->logblk, 3 * sizeof (unsigned long)); 578 579 count = *(uint32_t *)invcntl->logblk; 580 581 ecur = (ENTRY *)((uint32_t *)invcntl->logblk + 3); 582 eend = ecur + count; 583 584 for (; ecur < eend; ecur++) { 585 ecur->offset = BSWAP_16(ecur->offset); 586 ecur->post = BSWAP_32(ecur->post); 587 588 /* 589 * After the string is the posting offset. 590 */ 591 postptr = (uint32_t *)(invcntl->logblk + 592 ecur->offset + 593 P2ROUNDUP(ecur->size, sizeof (long))); 594 595 *postptr = BSWAP_32(*postptr); 596 } 597 } 598 } 599 } 600 601 void 602 read_next_posting(INVCONTROL *invcntl, POSTING *posting) 603 { 604 (void) fread((char *)posting, sizeof (*posting), 1, invcntl->postfile); 605 if (invcntl->swap) { 606 posting->lineoffset = BSWAP_32(posting->lineoffset); 607 posting->fcnoffset = BSWAP_32(posting->fcnoffset); 608 /* 609 * fileindex is a 24-bit field, so shift it before swapping 610 */ 611 posting->fileindex = BSWAP_32(posting->fileindex << 8); 612 } 613 } 614 615 int 616 invopen(INVCONTROL *invcntl, char *invname, char *invpost, int stat) 617 { 618 int read_index; 619 620 if ((invcntl->invfile = vpfopen(invname, 621 ((stat == 0) ? FREAD : FREADP))) == NULL) { 622 invcannotopen(invname); 623 return (-1); 624 } 625 if (fread((char *)&invcntl->param, sizeof (invcntl->param), 1, 626 invcntl->invfile) == 0) { 627 (void) fprintf(stderr, "%s: empty inverted file\n", argv0); 628 goto closeinv; 629 } 630 if (invcntl->param.version != VERSION && 631 BSWAP_32(invcntl->param.version) != VERSION) { 632 (void) fprintf(stderr, 633 "%s: cannot read old index format; use -U option to " 634 "force database to rebuild\n", argv0); 635 goto closeinv; 636 } 637 invcntl->swap = (invcntl->param.version != VERSION); 638 if (invcntl->swap) 639 swap_ints(&invcntl->param, sizeof (invcntl->param)); 640 641 if (stat == 0 && invcntl->param.filestat == INVALONE) { 642 (void) fprintf(stderr, "%s: inverted file is locked\n", argv0); 643 goto closeinv; 644 } 645 if ((invcntl->postfile = vpfopen(invpost, 646 ((stat == 0) ? FREAD : FREADP))) == NULL) { 647 invcannotopen(invpost); 648 goto closeinv; 649 } 650 /* allocate core for a logical block */ 651 if ((invcntl->logblk = malloc(invcntl->param.sizeblk)) == NULL) { 652 invcannotalloc((unsigned)invcntl->param.sizeblk); 653 goto closeboth; 654 } 655 /* allocate for and read in superfinger */ 656 read_index = 1; 657 invcntl->iindex = NULL; 658 #if SHARE 659 if (invcntl->param.share == 1) { 660 key_t ftok(), shm_key; 661 struct shmid_ds shm_buf; 662 char *shmat(); 663 int shm_id; 664 665 /* see if the shared segment exists */ 666 shm_key = ftok(invname, 2); 667 shm_id = shmget(shm_key, 0, 0); 668 /* 669 * Failure simply means (hopefully) that segment doesn't 670 * exist 671 */ 672 if (shm_id == -1) { 673 /* 674 * Have to give general write permission due to AMdahl 675 * not having protected segments 676 */ 677 shm_id = shmget(shm_key, 678 invcntl->param.supsize + sizeof (long), 679 IPC_CREAT | 0666); 680 if (shm_id == -1) 681 perror("Could not create shared " 682 "memory segment"); 683 } else 684 read_index = 0; 685 686 if (shm_id != -1) { 687 invcntl->iindex = shmat(shm_id, 0, 688 ((read_index) ? 0 : SHM_RDONLY)); 689 if (invcntl->iindex == (char *)ERR) { 690 (void) fprintf(stderr, 691 "%s: shared memory link failed\n", argv0); 692 invcntl->iindex = NULL; 693 read_index = 1; 694 } 695 } 696 } 697 #endif 698 if (invcntl->iindex == NULL) 699 invcntl->iindex = malloc(invcntl->param.supsize + 16); 700 if (invcntl->iindex == NULL) { 701 invcannotalloc((unsigned)invcntl->param.supsize); 702 free(invcntl->logblk); 703 goto closeboth; 704 } 705 if (read_index) { 706 read_superfinger(invcntl); 707 } 708 invcntl->numblk = -1; 709 if (boolready() == -1) { 710 closeboth: 711 (void) fclose(invcntl->postfile); 712 closeinv: 713 (void) fclose(invcntl->invfile); 714 return (-1); 715 } 716 /* write back out the control block if anything changed */ 717 invcntl->param.filestat = stat; 718 if (stat > invcntl->param.filestat) 719 write_param(invcntl); 720 return (1); 721 } 722 723 /* invclose must be called to wrap things up and deallocate core */ 724 void 725 invclose(INVCONTROL *invcntl) 726 { 727 /* write out the control block in case anything changed */ 728 if (invcntl->param.filestat > 0) { 729 invcntl->param.filestat = 0; 730 write_param(invcntl); 731 } 732 (void) fclose(invcntl->invfile); 733 (void) fclose(invcntl->postfile); 734 #if SHARE 735 if (invcntl->param.share > 0) { 736 shmdt(invcntl->iindex); 737 invcntl->iindex = NULL; 738 } 739 #endif 740 if (invcntl->iindex != NULL) 741 free(invcntl->iindex); 742 free(invcntl->logblk); 743 } 744 745 /* invstep steps the inverted file forward one item */ 746 void 747 invstep(INVCONTROL *invcntl) 748 { 749 if (invcntl->keypnt < (*(int *)invcntl->logblk - 1)) { 750 invcntl->keypnt++; 751 return; 752 } 753 754 /* move forward a block else wrap */ 755 read_logblock(invcntl, *(int *)(invcntl->logblk + sizeof (long))); 756 757 invcntl->keypnt = 0; 758 } 759 760 /* invforward moves forward one term in the inverted file */ 761 int 762 invforward(INVCONTROL *invcntl) 763 { 764 invstep(invcntl); 765 /* skip things with 0 postings */ 766 while (((ENTRY *)(invcntl->logblk + 12) + invcntl->keypnt)->post == 0) { 767 invstep(invcntl); 768 } 769 /* Check for having wrapped - reached start of inverted file! */ 770 if ((invcntl->numblk == 0) && (invcntl->keypnt == 0)) 771 return (0); 772 return (1); 773 } 774 775 /* invterm gets the present term from the present logical block */ 776 int 777 invterm(INVCONTROL *invcntl, char *term) 778 { 779 ENTRY * entryptr; 780 781 entryptr = (ENTRY *)(invcntl->logblk + 12) + invcntl->keypnt; 782 (void) strncpy(term, entryptr->offset + invcntl->logblk, 783 (int)entryptr->size); 784 *(term + entryptr->size) = '\0'; 785 return (entryptr->post); 786 } 787 788 /* invfind searches for an individual item in the inverted file */ 789 long 790 invfind(INVCONTROL *invcntl, char *searchterm) 791 { 792 int imid, ilow, ihigh; 793 long num; 794 int i; 795 unsigned long *intptr, *intptr2; 796 ENTRY *entryptr; 797 798 /* make sure it is initialized via invready */ 799 if (invcntl->invfile == 0) 800 return (-1L); 801 802 /* now search for the appropriate finger block */ 803 intptr = (unsigned long *)invcntl->iindex; 804 805 ilow = 0; 806 ihigh = *intptr++ - 1; 807 while (ilow <= ihigh) { 808 imid = (ilow + ihigh) / 2; 809 intptr2 = intptr + imid; 810 i = strcmp(searchterm, (invcntl->iindex + *intptr2)); 811 if (i < 0) 812 ihigh = imid - 1; 813 else if (i > 0) 814 ilow = ++imid; 815 else { 816 ilow = imid + 1; 817 break; 818 } 819 } 820 /* be careful about case where searchterm is after last in this block */ 821 imid = (ilow) ? ilow - 1 : 0; 822 823 /* fetch the appropriate logical block if not in core */ 824 read_logblock(invcntl, imid); 825 826 srch_ext: 827 /* now find the term in this block. tricky this */ 828 intptr = (unsigned long *)invcntl->logblk; 829 830 ilow = 0; 831 ihigh = *intptr - 1; 832 intptr += 3; 833 num = 0; 834 while (ilow <= ihigh) { 835 imid = (ilow + ihigh) / 2; 836 entryptr = (ENTRY *)intptr + imid; 837 i = strncmp(searchterm, (invcntl->logblk + entryptr->offset), 838 (int)entryptr->size); 839 if (i == 0) 840 i = strlen(searchterm) - entryptr->size; 841 if (i < 0) 842 ihigh = imid - 1; 843 else if (i > 0) 844 ilow = ++imid; 845 else { 846 num = entryptr->post; 847 break; 848 } 849 } 850 /* be careful about case where searchterm is after last in this block */ 851 if (imid >= *(long *)invcntl->logblk) { 852 invcntl->keypnt = *(long *)invcntl->logblk; 853 invstep(invcntl); 854 /* note if this happens the term could be in extended block */ 855 if (invcntl->param.startbyte < 856 invcntl->numblk * invcntl->param.sizeblk) 857 goto srch_ext; 858 } else 859 invcntl->keypnt = imid; 860 return (num); 861 } 862 863 #if DEBUG 864 865 /* invdump dumps the block the term parameter is in */ 866 void 867 invdump(INVCONTROL *invcntl, char *term) 868 { 869 long i, j, n, *longptr; 870 ENTRY * entryptr; 871 char temp[512], *ptr; 872 873 /* dump superindex if term is "-" */ 874 if (*term == '-') { 875 j = atoi(term + 1); 876 longptr = (long *)invcntl->iindex; 877 n = *longptr++; 878 (void) printf("Superindex dump, num blocks=%ld\n", n); 879 longptr += j; 880 while ((longptr <= ((long *)invcntl->iindex) + n) && 881 invbreak == 0) { 882 (void) printf("%2ld %6ld %s\n", j++, *longptr, 883 invcntl->iindex + *longptr); 884 longptr++; 885 } 886 return; 887 } else if (*term == '#') { 888 j = atoi(term + 1); 889 /* fetch the appropriate logical block */ 890 read_logblock(invcntl, j); 891 } else 892 i = abs((int)invfind(invcntl, term)); 893 longptr = (long *)invcntl->logblk; 894 n = *longptr++; 895 (void) printf("Entry term to invdump=%s, postings=%ld, " 896 "forward ptr=%ld, back ptr=%ld\n", term, i, *(longptr), 897 *(longptr + 1)); 898 entryptr = (ENTRY *)(invcntl->logblk + 12); 899 (void) printf("%ld terms in this block, block=%ld\n", n, 900 invcntl->numblk); 901 (void) printf("\tterm\t\t\tposts\tsize\toffset\tspace\t1st word\n"); 902 for (j = 0; j < n && invbreak == 0; j++) { 903 ptr = invcntl->logblk + entryptr->offset; 904 (void) strncpy(temp, ptr, (int)entryptr->size); 905 temp[entryptr->size] = '\0'; 906 ptr += (sizeof (long) * 907 (long)((entryptr->size + 908 (sizeof (long) - 1)) / sizeof (long))); 909 (void) printf("%2ld %-24s\t%5ld\t%3d\t%d\t%d\t%ld\n", j, temp, 910 entryptr->post, entryptr->size, entryptr->offset, 911 entryptr->space, *(long *)ptr); 912 entryptr++; 913 } 914 } 915 #endif 916 917 static int 918 boolready(void) 919 { 920 numitems = 0; 921 if (item1 != NULL) 922 free(item1); 923 setsize1 = SETINC; 924 if ((item1 = (POSTING *)malloc(SETINC * sizeof (POSTING))) == NULL) { 925 invcannotalloc(SETINC); 926 return (-1); 927 } 928 if (item2 != NULL) 929 free(item2); 930 setsize2 = SETINC; 931 if ((item2 = (POSTING *)malloc(SETINC * sizeof (POSTING))) == NULL) { 932 invcannotalloc(SETINC); 933 return (-1); 934 } 935 item = item1; 936 enditem = item; 937 return (0); 938 } 939 940 void 941 boolclear(void) 942 { 943 numitems = 0; 944 item = item1; 945 enditem = item; 946 } 947 948 POSTING * 949 boolfile(INVCONTROL *invcntl, long *num, int bool) 950 { 951 ENTRY *entryptr; 952 FILE *file; 953 char *ptr; 954 unsigned long *ptr2; 955 POSTING *newitem; 956 POSTING posting; 957 unsigned u; 958 POSTING *newsetp, *set1p; 959 long newsetc, set1c, set2c; 960 961 entryptr = (ENTRY *) (invcntl->logblk + 12) + invcntl->keypnt; 962 ptr = invcntl->logblk + entryptr->offset; 963 ptr2 = ((unsigned long *)ptr) + 964 (entryptr->size + (sizeof (long) - 1)) / sizeof (long); 965 *num = entryptr->post; 966 switch (bool) { 967 case OR: 968 case NOT: 969 if (*num == 0) { 970 *num = numitems; 971 return (item); 972 } 973 } 974 /* make room for the new set */ 975 u = 0; 976 switch (bool) { 977 case AND: 978 case NOT: 979 newsetp = set1p = item; 980 break; 981 982 case OR: 983 u = enditem - item; 984 /* FALLTHROUGH */ 985 case REVERSENOT: 986 u += *num; 987 if (item == item2) { 988 if (u > setsize1) { 989 u += SETINC; 990 if ((item1 = (POSTING *) realloc(item1, 991 u * sizeof (POSTING))) == NULL) { 992 goto cannotalloc; 993 } 994 setsize1 = u; 995 } 996 newitem = item1; 997 } else { 998 if (u > setsize2) { 999 u += SETINC; 1000 if ((item2 = (POSTING *)realloc(item2, 1001 u * sizeof (POSTING))) == NULL) { 1002 cannotalloc: 1003 invcannotalloc(u * sizeof (POSTING)); 1004 (void) boolready(); 1005 *num = -1; 1006 return (NULL); 1007 } 1008 setsize2 = u; 1009 } 1010 newitem = item2; 1011 } 1012 set1p = item; 1013 newsetp = newitem; 1014 } 1015 file = invcntl->postfile; 1016 (void) fseek(file, (long)*ptr2, SEEK_SET); 1017 read_next_posting(invcntl, &posting); 1018 newsetc = 0; 1019 switch (bool) { 1020 case OR: 1021 /* while something in both sets */ 1022 set1p = item; 1023 newsetp = newitem; 1024 for (set1c = 0, set2c = 0; 1025 set1c < numitems && set2c < *num; newsetc++) { 1026 if (set1p->lineoffset < posting.lineoffset) { 1027 *newsetp++ = *set1p++; 1028 set1c++; 1029 } else if (set1p->lineoffset > posting.lineoffset) { 1030 *newsetp++ = posting; 1031 read_next_posting(invcntl, &posting); 1032 set2c++; 1033 } else if (set1p->type < posting.type) { 1034 *newsetp++ = *set1p++; 1035 set1c++; 1036 } else if (set1p->type > posting.type) { 1037 *newsetp++ = posting; 1038 read_next_posting(invcntl, &posting); 1039 set2c++; 1040 } else { /* identical postings */ 1041 *newsetp++ = *set1p++; 1042 set1c++; 1043 read_next_posting(invcntl, &posting); 1044 set2c++; 1045 } 1046 } 1047 /* find out what ran out and move the rest in */ 1048 if (set1c < numitems) { 1049 newsetc += numitems - set1c; 1050 while (set1c++ < numitems) { 1051 *newsetp++ = *set1p++; 1052 } 1053 } else { 1054 while (set2c++ < *num) { 1055 *newsetp++ = posting; 1056 newsetc++; 1057 read_next_posting(invcntl, &posting); 1058 } 1059 } 1060 item = newitem; 1061 break; /* end of OR */ 1062 #if 0 1063 case AND: 1064 set1c = 0; 1065 set2c = 0; 1066 while (set1c < numitems && set2c < *num) { 1067 if (set1p->lineoffset < posting.lineoffset) { 1068 set1p++; 1069 set1c++; 1070 } else if (set1p->lineoffset > posting.lineoffset) { 1071 read_next_posting(invcntl, &posting); 1072 set2c++; 1073 } else if (set1p->type < posting.type) { 1074 *set1p++; 1075 set1c++; 1076 } else if (set1p->type > posting.type) { 1077 read_next_posting(invcntl, &posting); 1078 set2c++; 1079 } else { /* identical postings */ 1080 *newsetp++ = *set1p++; 1081 newsetc++; 1082 set1c++; 1083 read_next_posting(invcntl, &posting); 1084 set2c++; 1085 } 1086 } 1087 break; /* end of AND */ 1088 1089 case NOT: 1090 set1c = 0; 1091 set2c = 0; 1092 while (set1c < numitems && set2c < *num) { 1093 if (set1p->lineoffset < posting.lineoffset) { 1094 *newsetp++ = *set1p++; 1095 newsetc++; 1096 set1c++; 1097 } else if (set1p->lineoffset > posting.lineoffset) { 1098 read_next_posting(invcntl, &posting); 1099 set2c++; 1100 } else if (set1p->type < posting.type) { 1101 *newsetp++ = *set1p++; 1102 newsetc++; 1103 set1c++; 1104 } else if (set1p->type > posting.type) { 1105 read_next_posting(invcntl, &posting); 1106 set2c++; 1107 } else { /* identical postings */ 1108 set1c++; 1109 set1p++; 1110 read_next_posting(invcntl, &posting); 1111 set2c++; 1112 } 1113 } 1114 newsetc += numitems - set1c; 1115 while (set1c++ < numitems) { 1116 *newsetp++ = *set1p++; 1117 } 1118 break; /* end of NOT */ 1119 1120 case REVERSENOT: /* core NOT incoming set */ 1121 set1c = 0; 1122 set2c = 0; 1123 while (set1c < numitems && set2c < *num) { 1124 if (set1p->lineoffset < posting.lineoffset) { 1125 set1p++; 1126 set1c++; 1127 } else if (set1p->lineoffset > posting.lineoffset) { 1128 *newsetp++ = posting; 1129 read_next_posting(invcntl, &posting); 1130 set2c++; 1131 } else if (set1p->type < posting.type) { 1132 set1p++; 1133 set1c++; 1134 } else if (set1p->type > posting.type) { 1135 *newsetp++ = posting; 1136 read_next_posting(invcntl, &posting); 1137 set2c++; 1138 } else { /* identical postings */ 1139 set1c++; 1140 set1p++; 1141 read_next_posting(invcntl, &posting); 1142 set2c++; 1143 } 1144 } 1145 while (set2c++ < *num) { 1146 *newsetp++ = posting; 1147 newsetc++; 1148 read_next_posting(invcntl, &posting); 1149 } 1150 item = newitem; 1151 break; /* end of REVERSENOT */ 1152 #endif 1153 } 1154 numitems = newsetc; 1155 *num = newsetc; 1156 enditem = (POSTING *)newsetp; 1157 return ((POSTING *)item); 1158 } 1159 1160 #if 0 1161 POSTING * 1162 boolsave(int clear) 1163 { 1164 int i; 1165 POSTING *ptr; 1166 POSTING *oldstuff, *newstuff; 1167 1168 if (numitems == 0) { 1169 if (clear) 1170 boolclear(); 1171 return (NULL); 1172 } 1173 /* 1174 * if clear then give them what we have and use (void) 1175 * boolready to realloc 1176 */ 1177 if (clear) { 1178 ptr = item; 1179 /* free up the space we didn't give them */ 1180 if (item == item1) 1181 item1 = NULL; 1182 else 1183 item2 = NULL; 1184 (void) boolready(); 1185 return (ptr); 1186 } 1187 i = (enditem - item) * sizeof (POSTING) + 100; 1188 if ((ptr = (POSTING *)malloc(i))r == NULL) { 1189 invcannotalloc(i); 1190 return (ptr); 1191 } 1192 /* move present set into place */ 1193 oldstuff = item; 1194 newstuff = ptr; 1195 while (oldstuff < enditem) 1196 *newstuff++ = *oldstuff++; 1197 return (ptr); 1198 } 1199 #endif 1200 1201 static void 1202 invcannotalloc(size_t n) 1203 { 1204 (void) fprintf(stderr, "%s: cannot allocate %u bytes\n", argv0, n); 1205 } 1206 1207 static void 1208 invcannotopen(char *file) 1209 { 1210 (void) fprintf(stderr, "%s: cannot open file %s\n", argv0, file); 1211 } 1212 1213 static void 1214 invcannotwrite(char *file) 1215 { 1216 (void) perror(argv0); /* must be first to preserve errno */ 1217 (void) fprintf(stderr, "%s: write to file %s failed\n", argv0, file); 1218 } 1219