1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License"). You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22 /* Copyright (c) 1988 AT&T */
23 /* All Rights Reserved */
24
25
26 /*
27 * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
28 * Use is subject to license terms.
29 */
30
31 #include <ctype.h>
32 #include <stdio.h>
33 #include <sys/types.h>
34 #include <sys/sysmacros.h>
35 #include <sys/byteorder.h>
36 #if SHARE
37 #include <sys/ipc.h>
38 #include <sys/shm.h>
39 #define ERR -1
40 #endif
41 #include "invlib.h"
42 #include "library.h"
43
44 #define DEBUG 0 /* debugging code and realloc messages */
45 #define BLOCKSIZE 2 * BUFSIZ /* logical block size */
46 #define LINEMAX 1000 /* sorted posting line max size */
47 #define POSTINC 10000 /* posting buffer size increment */
48 #define SEP ' ' /* sorted posting field separator */
49 #define SETINC 100 /* posting set size increment */
50 #define STATS 0 /* print statistics */
51 #define SUPERINC 10000 /* super index size increment */
52 #define TERMMAX 512 /* term max size */
53 #define VERSION 1 /* inverted index format version */
54 #define ZIPFSIZE 200 /* zipf curve size */
55 #define FREAD "r" /* fopen for reading */
56 #define FREADP "r+" /* fopen for update */
57 #define FWRITE "w" /* fopen truncate or create for writing */
58 #define FWRITEP "w+" /* fopen truncate or create for update */
59
60 extern char *argv0; /* command name (must be set in main function) */
61
62 int invbreak;
63
64 #if STATS
65 int showzipf; /* show postings per term distribution */
66 #endif
67
68 static POSTING *item, *enditem, *item1 = NULL, *item2 = NULL;
69 static unsigned setsize1, setsize2;
70 static long numitems, totterm, zerolong;
71 static char *indexfile, *postingfile;
72 static FILE *outfile, *fpost;
73 static unsigned supersize = SUPERINC, supintsize;
74 static int numpost, numlogblk, amtused, nextpost,
75 lastinblk, numinvitems;
76 static POSTING *POST, *postptr;
77 static unsigned long *SUPINT, *supint, nextsupfing;
78 static char *SUPFING, *supfing;
79 static char thisterm[TERMMAX];
80 static union {
81 long invblk[BLOCKSIZE / sizeof (long)];
82 char chrblk[BLOCKSIZE];
83 } logicalblk;
84
85 #if DEBUG || STATS
86 static long totpost;
87 #endif
88
89 #if STATS
90 static int zipf[ZIPFSIZE + 1];
91 #endif
92
93 static void invcannotalloc(size_t n);
94 static void invcannotopen(char *file);
95 static void invcannotwrite(char *file);
96 static int invnewterm(void);
97 static int boolready(void);
98
99 long
invmake(char * invname,char * invpost,FILE * infile)100 invmake(char *invname, char *invpost, FILE *infile)
101 {
102 unsigned char *s;
103 long num;
104 int i;
105 long fileindex;
106 unsigned postsize = POSTINC * sizeof (POSTING);
107 unsigned long *intptr;
108 char line[LINEMAX];
109 long tlong;
110 PARAM param;
111 POSTING posting;
112 #if STATS
113 int j;
114 unsigned maxtermlen = 0;
115 #endif
116 /* output file */
117 if ((outfile = vpfopen(invname, FWRITEP)) == NULL) {
118 invcannotopen(invname);
119 return (0);
120 }
121 indexfile = invname;
122 (void) fseek(outfile, (long)BUFSIZ, 0);
123
124 /* posting file */
125 if ((fpost = vpfopen(invpost, FWRITE)) == NULL) {
126 invcannotopen(invpost);
127 return (0);
128 }
129 postingfile = invpost;
130 nextpost = 0;
131 /* get space for the postings list */
132 if ((POST = (POSTING *)malloc(postsize)) == NULL) {
133 invcannotalloc(postsize);
134 return (0);
135 }
136 postptr = POST;
137 /* get space for the superfinger (superindex) */
138 if ((SUPFING = malloc(supersize)) == NULL) {
139 invcannotalloc(supersize);
140 return (0);
141 }
142 supfing = SUPFING;
143 supintsize = supersize / 40;
144 /* also for the superfinger index */
145 if ((SUPINT = malloc(supintsize * sizeof (long))) == NULL) {
146 invcannotalloc(supintsize * sizeof (long));
147 return (0);
148 }
149 supint = SUPINT;
150 supint++; /* leave first term open for a count */
151 /* initialize using an empty term */
152 (void) strcpy(thisterm, "");
153 *supint++ = 0;
154 *supfing++ = ' ';
155 *supfing++ = '\0';
156 nextsupfing = 2;
157 #if DEBUG || STATS
158 totpost = 0L;
159 #endif
160 totterm = 0L;
161 numpost = 1;
162
163 /*
164 * set up as though a block had come and gone, i.e., set up for
165 * new block
166 */
167 amtused = 16; /* leave no space - init 3 words + one for luck */
168 numinvitems = 0;
169 numlogblk = 0;
170 lastinblk = BLOCKSIZE;
171
172 /* now loop as long as more to read (till eof) */
173 while (fgets(line, LINEMAX, infile) != NULL) {
174 #if DEBUG || STATS
175 ++totpost;
176 #endif
177 s = (unsigned char *) strchr(line, SEP);
178 if (s == NULL) /* where did this line come from ??? */
179 continue; /* workaround: just skip it */
180 *s = '\0';
181 #if STATS
182 if ((i = strlen(line)) > maxtermlen) {
183 maxtermlen = i;
184 }
185 #endif
186 #if DEBUG
187 (void) printf("%ld: %s ", totpost, line);
188 (void) fflush(stdout);
189 #endif
190 if (strcmp(thisterm, line) == 0) {
191 if (postptr + 10 > POST + postsize / sizeof (POSTING)) {
192 i = postptr - POST;
193 postsize += POSTINC * sizeof (POSTING);
194 if ((POST = realloc(POST, postsize)) == NULL) {
195 invcannotalloc(postsize);
196 return (0);
197 }
198 postptr = i + POST;
199 #if DEBUG
200 (void) printf("reallocated post space to %u, "
201 "totpost=%ld\n", postsize, totpost);
202 #endif
203 }
204 numpost++;
205 } else {
206 /* have a new term */
207 if (!invnewterm()) {
208 return (0);
209 }
210 (void) strcpy(thisterm, line);
211 numpost = 1;
212 postptr = POST;
213 fileindex = 0;
214 }
215 /* get the new posting */
216 num = *++s - '!';
217 i = 1;
218 do {
219 num = BASE * num + *++s - '!';
220 } while (++i < PRECISION);
221 posting.lineoffset = num;
222 while (++fileindex < nsrcoffset && num > srcoffset[fileindex]) {
223 ;
224 }
225 posting.fileindex = --fileindex;
226 posting.type = *++s;
227 num = *++s - '!';
228 if (*s != '\n') {
229 num = *++s - '!';
230 while (*++s != '\n') {
231 num = BASE * num + *s - '!';
232 }
233 posting.fcnoffset = num;
234 } else {
235 posting.fcnoffset = 0;
236 }
237 *postptr++ = posting;
238 #if DEBUG
239 (void) printf("%ld %ld %ld %ld\n", posting.fileindex,
240 posting.fcnoffset, posting.lineoffset, posting.type);
241 (void) fflush(stdout);
242 #endif
243 }
244 if (!invnewterm()) {
245 return (0);
246 }
247 /* now clean up final block */
248 logicalblk.invblk[0] = numinvitems;
249 /* loops pointer around to start */
250 logicalblk.invblk[1] = 0;
251 logicalblk.invblk[2] = numlogblk - 1;
252 if (fwrite((char *)&logicalblk, BLOCKSIZE, 1, outfile) == 0) {
253 goto cannotwrite;
254 }
255 numlogblk++;
256 /* write out block to save space. what in it doesn't matter */
257 if (fwrite((char *)&logicalblk, BLOCKSIZE, 1, outfile) == 0) {
258 goto cannotwrite;
259 }
260 /* finish up the super finger */
261 *SUPINT = numlogblk;
262 /* add to the offsets the size of the offset pointers */
263 intptr = (SUPINT + 1);
264 i = (char *)supint - (char *)SUPINT;
265 while (intptr < supint)
266 *intptr++ += i;
267 /* write out the offsets (1 for the N at start) and the super finger */
268 if (fwrite((char *)SUPINT, sizeof (*SUPINT), numlogblk + 1,
269 outfile) == 0 ||
270 fwrite(SUPFING, 1, supfing - SUPFING, outfile) == 0) {
271 goto cannotwrite;
272 }
273 /* save the size for reference later */
274 nextsupfing = sizeof (long) + sizeof (long) * numlogblk +
275 (supfing - SUPFING);
276 /*
277 * make sure the file ends at a logical block boundary. This is
278 * necessary for invinsert to correctly create extended blocks
279 */
280 i = nextsupfing % BLOCKSIZE;
281 /* write out junk to fill log blk */
282 if (fwrite(SUPFING, BLOCKSIZE - i, 1, outfile) == 0 ||
283 fflush(outfile) == EOF) {
284 /* rewind doesn't check for write failure */
285 goto cannotwrite;
286 }
287 /* write the control area */
288 rewind(outfile);
289 param.version = VERSION;
290 param.filestat = 0;
291 param.sizeblk = BLOCKSIZE;
292 param.startbyte = (numlogblk + 1) * BLOCKSIZE + BUFSIZ;
293 param.supsize = nextsupfing;
294 param.cntlsize = BUFSIZ;
295 param.share = 0;
296 if (fwrite((char *)¶m, sizeof (param), 1, outfile) == 0) {
297 goto cannotwrite;
298 }
299 for (i = 0; i < 10; i++) /* for future use */
300 if (fwrite((char *)&zerolong, sizeof (zerolong),
301 1, outfile) == 0) {
302 goto cannotwrite;
303 }
304
305 /* make first block loop backwards to last block */
306 if (fflush(outfile) == EOF) {
307 /* fseek doesn't check for write failure */
308 goto cannotwrite;
309 }
310 /* get to second word first block */
311 (void) fseek(outfile, (long)BUFSIZ + 8, 0);
312 tlong = numlogblk - 1;
313 if (fwrite((char *)&tlong, sizeof (tlong), 1, outfile) == 0 ||
314 fclose(outfile) == EOF) {
315 cannotwrite:
316 invcannotwrite(invname);
317 return (0);
318 }
319 if (fclose(fpost) == EOF) {
320 invcannotwrite(postingfile);
321 return (0);
322 }
323 --totterm; /* don't count null term */
324 #if STATS
325 (void) printf("logical blocks = %d, postings = %ld, terms = %ld, "
326 "max term length = %d\n", numlogblk, totpost, totterm, maxtermlen);
327 if (showzipf) {
328 (void) printf(
329 "\n************* ZIPF curve ****************\n");
330 for (j = ZIPFSIZE; j > 1; j--)
331 if (zipf[j])
332 break;
333 for (i = 1; i < j; ++i) {
334 (void) printf("%3d -%6d ", i, zipf[i]);
335 if (i % 6 == 0) (void) putchar('\n');
336 }
337 (void) printf(">%d-%6d\n", ZIPFSIZE, zipf[0]);
338 }
339 #endif
340 /* free all malloc'd memory */
341 free(POST);
342 free(SUPFING);
343 free(SUPINT);
344 return (totterm);
345 }
346
347 /* add a term to the data base */
348
349 static int
invnewterm(void)350 invnewterm(void)
351 {
352 int backupflag, i, j, maxback, holditems, gooditems, howfar;
353 int len, numwilluse, wdlen;
354 char *tptr, *tptr2, *tptr3;
355 union {
356 unsigned long packword[2];
357 ENTRY e;
358 } iteminfo;
359
360 totterm++;
361 #if STATS
362 /* keep zipfian info on the distribution */
363 if (numpost <= ZIPFSIZE)
364 zipf[numpost]++;
365 else
366 zipf[0]++;
367 #endif
368 len = strlen(thisterm);
369 wdlen = (len + (sizeof (long) - 1)) / sizeof (long);
370 numwilluse = (wdlen + 3) * sizeof (long);
371 /* new block if at least 1 item in block */
372 if (numinvitems && numwilluse + amtused > BLOCKSIZE) {
373 /* set up new block */
374 if (supfing + 500 > SUPFING + supersize) {
375 i = supfing - SUPFING;
376 supersize += 20000;
377 if ((SUPFING = realloc(SUPFING, supersize)) == NULL) {
378 invcannotalloc(supersize);
379 return (0);
380 }
381 supfing = i + SUPFING;
382 #if DEBUG
383 (void) printf("reallocated superfinger space to %d, "
384 "totpost=%ld\n", supersize, totpost);
385 #endif
386 }
387 /* check that room for the offset as well */
388 if ((numlogblk + 10) > supintsize) {
389 i = supint - SUPINT;
390 supintsize += SUPERINC;
391 if ((SUPINT = realloc((char *)SUPINT,
392 supintsize * sizeof (long))) == NULL) {
393 invcannotalloc(supintsize * sizeof (long));
394 return (0);
395 }
396 supint = i + SUPINT;
397 #if DEBUG
398 (void) printf("reallocated superfinger offset to %d, "
399 "totpost = %ld\n", supintsize * sizeof (long),
400 totpost);
401 #endif
402 }
403 /* See if backup is efficatious */
404 backupflag = 0;
405 maxback = strlen(thisterm) / 10;
406 holditems = numinvitems;
407 if (maxback > numinvitems)
408 maxback = numinvitems - 2;
409 howfar = 0;
410 while (--maxback > 0) {
411 howfar++;
412 iteminfo.packword[0] =
413 logicalblk.invblk[--holditems * 2 +
414 (sizeof (long) - 1)];
415 if ((i = iteminfo.e.size / 10) < maxback) {
416 maxback = i;
417 backupflag = howfar;
418 gooditems = holditems;
419 tptr2 = logicalblk.chrblk + iteminfo.e.offset;
420 }
421 }
422 /* see if backup will occur */
423 if (backupflag) {
424 numinvitems = gooditems;
425 }
426 logicalblk.invblk[0] = numinvitems;
427 /* set forward pointer pointing to next */
428 logicalblk.invblk[1] = numlogblk + 1;
429 /* set back pointer to last block */
430 logicalblk.invblk[2] = numlogblk - 1;
431 if (fwrite((char *)logicalblk.chrblk, 1,
432 BLOCKSIZE, outfile) == 0) {
433 invcannotwrite(indexfile);
434 return (0);
435 }
436 amtused = 16;
437 numlogblk++;
438 /* check if had to back up, if so do it */
439 if (backupflag) {
440 /* find out where the end of the new block is */
441 iteminfo.packword[0] =
442 logicalblk.invblk[numinvitems * 2 + 1];
443 tptr3 = logicalblk.chrblk + iteminfo.e.offset;
444 /* move the index for this block */
445 for (i = 3; i <= (backupflag * 2 + 2); i++) {
446 logicalblk.invblk[i] =
447 logicalblk.invblk[numinvitems * 2+i];
448 }
449 /* move the word into the super index */
450 iteminfo.packword[0] = logicalblk.invblk[3];
451 iteminfo.packword[1] = logicalblk.invblk[4];
452 tptr2 = logicalblk.chrblk + iteminfo.e.offset;
453 (void) strncpy(supfing, tptr2, (int)iteminfo.e.size);
454 *(supfing + iteminfo.e.size) = '\0';
455 #if DEBUG
456 (void) printf("backup %d at term=%s to term=%s\n",
457 backupflag, thisterm, supfing);
458 #endif
459 *supint++ = nextsupfing;
460 nextsupfing += strlen(supfing) + 1;
461 supfing += strlen(supfing) + 1;
462 /* now fix up the logical block */
463 tptr = logicalblk.chrblk + lastinblk;
464 lastinblk = BLOCKSIZE;
465 tptr2 = logicalblk.chrblk + lastinblk;
466 j = tptr3 - tptr;
467 while (tptr3 > tptr)
468 *--tptr2 = *--tptr3;
469 lastinblk -= j;
470 amtused += (8 * backupflag + j);
471 for (i = 3; i < (backupflag * 2 + 2); i += 2) {
472 iteminfo.packword[0] = logicalblk.invblk[i];
473 iteminfo.e.offset += (tptr2 - tptr3);
474 logicalblk.invblk[i] = iteminfo.packword[0];
475 }
476 numinvitems = backupflag;
477 } else { /* no backup needed */
478 numinvitems = 0;
479 lastinblk = BLOCKSIZE;
480 /* add new term to superindex */
481 (void) strcpy(supfing, thisterm);
482 supfing += strlen(thisterm) + 1;
483 *supint++ = nextsupfing;
484 nextsupfing += strlen(thisterm) + 1;
485 }
486 }
487 lastinblk -= (numwilluse - 8);
488 iteminfo.e.offset = lastinblk;
489 iteminfo.e.size = (char)len;
490 iteminfo.e.space = 0;
491 iteminfo.e.post = numpost;
492 (void) strncpy(logicalblk.chrblk + lastinblk, thisterm, len);
493 amtused += numwilluse;
494 logicalblk.invblk[(lastinblk/sizeof (long))+wdlen] = nextpost;
495 if ((i = postptr - POST) > 0) {
496 if (fwrite((char *)POST, sizeof (POSTING), i, fpost) == 0) {
497 invcannotwrite(postingfile);
498 return (0);
499 }
500 nextpost += i * sizeof (POSTING);
501 }
502 logicalblk.invblk[3+2*numinvitems++] = iteminfo.packword[0];
503 logicalblk.invblk[2+2*numinvitems] = iteminfo.packword[1];
504 return (1);
505 }
506
507 static void
swap_ints(void * p,size_t sz)508 swap_ints(void *p, size_t sz)
509 {
510 uint32_t *s;
511 uint32_t *e = (uint32_t *)p + (sz / sizeof (uint32_t));
512
513 for (s = p; s < e; s++)
514 *s = BSWAP_32(*s);
515 }
516
517 static void
write_param(INVCONTROL * invcntl)518 write_param(INVCONTROL *invcntl)
519 {
520 if (invcntl->swap)
521 swap_ints(&invcntl->param, sizeof (invcntl->param));
522
523 rewind(invcntl->invfile);
524 (void) fwrite((char *)&invcntl->param, sizeof (invcntl->param), 1,
525 invcntl->invfile);
526
527 if (invcntl->swap)
528 swap_ints(&invcntl->param, sizeof (invcntl->param));
529 }
530
531 static void
read_superfinger(INVCONTROL * invcntl)532 read_superfinger(INVCONTROL *invcntl)
533 {
534 size_t count;
535
536 (void) fseek(invcntl->invfile, invcntl->param.startbyte, SEEK_SET);
537 (void) fread(invcntl->iindex, (int)invcntl->param.supsize,
538 1, invcntl->invfile);
539
540 if (invcntl->swap) {
541 /*
542 * The superfinger consists of a count, followed by
543 * count offsets, followed by a string table (which
544 * the offsets reference).
545 *
546 * We need to swap the count and the offsets.
547 */
548 count = 1 + BSWAP_32(*(uint32_t *)invcntl->iindex);
549 swap_ints(invcntl->iindex, count * sizeof (unsigned long));
550 }
551 }
552
553 static void
read_logblock(INVCONTROL * invcntl,int block)554 read_logblock(INVCONTROL *invcntl, int block)
555 {
556 /* note always fetch it if the file is busy */
557 if ((block != invcntl->numblk) ||
558 (invcntl->param.filestat >= INVBUSY)) {
559 (void) fseek(invcntl->invfile,
560 (block * invcntl->param.sizeblk) + invcntl->param.cntlsize,
561 SEEK_SET);
562 invcntl->numblk = block;
563 (void) fread(invcntl->logblk, (int)invcntl->param.sizeblk,
564 1, invcntl->invfile);
565
566 if (invcntl->swap) {
567 size_t count;
568 ENTRY *ecur, *eend;
569 uint32_t *postptr;
570
571 /*
572 * A logblock consists of a count, a next block id,
573 * and a previous block id, followed by count
574 * ENTRYs, followed by alternating strings and
575 * offsets.
576 */
577 swap_ints(invcntl->logblk, 3 * sizeof (unsigned long));
578
579 count = *(uint32_t *)invcntl->logblk;
580
581 ecur = (ENTRY *)((uint32_t *)invcntl->logblk + 3);
582 eend = ecur + count;
583
584 for (; ecur < eend; ecur++) {
585 ecur->offset = BSWAP_16(ecur->offset);
586 ecur->post = BSWAP_32(ecur->post);
587
588 /*
589 * After the string is the posting offset.
590 */
591 postptr = (uint32_t *)(invcntl->logblk +
592 ecur->offset +
593 P2ROUNDUP(ecur->size, sizeof (long)));
594
595 *postptr = BSWAP_32(*postptr);
596 }
597 }
598 }
599 }
600
601 void
read_next_posting(INVCONTROL * invcntl,POSTING * posting)602 read_next_posting(INVCONTROL *invcntl, POSTING *posting)
603 {
604 (void) fread((char *)posting, sizeof (*posting), 1, invcntl->postfile);
605 if (invcntl->swap) {
606 posting->lineoffset = BSWAP_32(posting->lineoffset);
607 posting->fcnoffset = BSWAP_32(posting->fcnoffset);
608 /*
609 * fileindex is a 24-bit field, so shift it before swapping
610 */
611 posting->fileindex = BSWAP_32(posting->fileindex << 8);
612 }
613 }
614
615 int
invopen(INVCONTROL * invcntl,char * invname,char * invpost,int stat)616 invopen(INVCONTROL *invcntl, char *invname, char *invpost, int stat)
617 {
618 int read_index;
619
620 if ((invcntl->invfile = vpfopen(invname,
621 ((stat == 0) ? FREAD : FREADP))) == NULL) {
622 invcannotopen(invname);
623 return (-1);
624 }
625 if (fread((char *)&invcntl->param, sizeof (invcntl->param), 1,
626 invcntl->invfile) == 0) {
627 (void) fprintf(stderr, "%s: empty inverted file\n", argv0);
628 goto closeinv;
629 }
630 if (invcntl->param.version != VERSION &&
631 BSWAP_32(invcntl->param.version) != VERSION) {
632 (void) fprintf(stderr,
633 "%s: cannot read old index format; use -U option to "
634 "force database to rebuild\n", argv0);
635 goto closeinv;
636 }
637 invcntl->swap = (invcntl->param.version != VERSION);
638 if (invcntl->swap)
639 swap_ints(&invcntl->param, sizeof (invcntl->param));
640
641 if (stat == 0 && invcntl->param.filestat == INVALONE) {
642 (void) fprintf(stderr, "%s: inverted file is locked\n", argv0);
643 goto closeinv;
644 }
645 if ((invcntl->postfile = vpfopen(invpost,
646 ((stat == 0) ? FREAD : FREADP))) == NULL) {
647 invcannotopen(invpost);
648 goto closeinv;
649 }
650 /* allocate core for a logical block */
651 if ((invcntl->logblk = malloc(invcntl->param.sizeblk)) == NULL) {
652 invcannotalloc((unsigned)invcntl->param.sizeblk);
653 goto closeboth;
654 }
655 /* allocate for and read in superfinger */
656 read_index = 1;
657 invcntl->iindex = NULL;
658 #if SHARE
659 if (invcntl->param.share == 1) {
660 key_t ftok(), shm_key;
661 struct shmid_ds shm_buf;
662 char *shmat();
663 int shm_id;
664
665 /* see if the shared segment exists */
666 shm_key = ftok(invname, 2);
667 shm_id = shmget(shm_key, 0, 0);
668 /*
669 * Failure simply means (hopefully) that segment doesn't
670 * exist
671 */
672 if (shm_id == -1) {
673 /*
674 * Have to give general write permission due to AMdahl
675 * not having protected segments
676 */
677 shm_id = shmget(shm_key,
678 invcntl->param.supsize + sizeof (long),
679 IPC_CREAT | 0666);
680 if (shm_id == -1)
681 perror("Could not create shared "
682 "memory segment");
683 } else
684 read_index = 0;
685
686 if (shm_id != -1) {
687 invcntl->iindex = shmat(shm_id, 0,
688 ((read_index) ? 0 : SHM_RDONLY));
689 if (invcntl->iindex == (char *)ERR) {
690 (void) fprintf(stderr,
691 "%s: shared memory link failed\n", argv0);
692 invcntl->iindex = NULL;
693 read_index = 1;
694 }
695 }
696 }
697 #endif
698 if (invcntl->iindex == NULL)
699 invcntl->iindex = malloc(invcntl->param.supsize + 16);
700 if (invcntl->iindex == NULL) {
701 invcannotalloc((unsigned)invcntl->param.supsize);
702 free(invcntl->logblk);
703 goto closeboth;
704 }
705 if (read_index) {
706 read_superfinger(invcntl);
707 }
708 invcntl->numblk = -1;
709 if (boolready() == -1) {
710 closeboth:
711 (void) fclose(invcntl->postfile);
712 closeinv:
713 (void) fclose(invcntl->invfile);
714 return (-1);
715 }
716 /* write back out the control block if anything changed */
717 invcntl->param.filestat = stat;
718 if (stat > invcntl->param.filestat)
719 write_param(invcntl);
720 return (1);
721 }
722
723 /* invclose must be called to wrap things up and deallocate core */
724 void
invclose(INVCONTROL * invcntl)725 invclose(INVCONTROL *invcntl)
726 {
727 /* write out the control block in case anything changed */
728 if (invcntl->param.filestat > 0) {
729 invcntl->param.filestat = 0;
730 write_param(invcntl);
731 }
732 (void) fclose(invcntl->invfile);
733 (void) fclose(invcntl->postfile);
734 #if SHARE
735 if (invcntl->param.share > 0) {
736 shmdt(invcntl->iindex);
737 invcntl->iindex = NULL;
738 }
739 #endif
740 if (invcntl->iindex != NULL)
741 free(invcntl->iindex);
742 free(invcntl->logblk);
743 }
744
745 /* invstep steps the inverted file forward one item */
746 void
invstep(INVCONTROL * invcntl)747 invstep(INVCONTROL *invcntl)
748 {
749 if (invcntl->keypnt < (*(int *)invcntl->logblk - 1)) {
750 invcntl->keypnt++;
751 return;
752 }
753
754 /* move forward a block else wrap */
755 read_logblock(invcntl, *(int *)(invcntl->logblk + sizeof (long)));
756
757 invcntl->keypnt = 0;
758 }
759
760 /* invforward moves forward one term in the inverted file */
761 int
invforward(INVCONTROL * invcntl)762 invforward(INVCONTROL *invcntl)
763 {
764 invstep(invcntl);
765 /* skip things with 0 postings */
766 while (((ENTRY *)(invcntl->logblk + 12) + invcntl->keypnt)->post == 0) {
767 invstep(invcntl);
768 }
769 /* Check for having wrapped - reached start of inverted file! */
770 if ((invcntl->numblk == 0) && (invcntl->keypnt == 0))
771 return (0);
772 return (1);
773 }
774
775 /* invterm gets the present term from the present logical block */
776 int
invterm(INVCONTROL * invcntl,char * term)777 invterm(INVCONTROL *invcntl, char *term)
778 {
779 ENTRY * entryptr;
780
781 entryptr = (ENTRY *)(invcntl->logblk + 12) + invcntl->keypnt;
782 (void) strncpy(term, entryptr->offset + invcntl->logblk,
783 (int)entryptr->size);
784 *(term + entryptr->size) = '\0';
785 return (entryptr->post);
786 }
787
788 /* invfind searches for an individual item in the inverted file */
789 long
invfind(INVCONTROL * invcntl,char * searchterm)790 invfind(INVCONTROL *invcntl, char *searchterm)
791 {
792 int imid, ilow, ihigh;
793 long num;
794 int i;
795 unsigned long *intptr, *intptr2;
796 ENTRY *entryptr;
797
798 /* make sure it is initialized via invready */
799 if (invcntl->invfile == 0)
800 return (-1L);
801
802 /* now search for the appropriate finger block */
803 intptr = (unsigned long *)invcntl->iindex;
804
805 ilow = 0;
806 ihigh = *intptr++ - 1;
807 while (ilow <= ihigh) {
808 imid = (ilow + ihigh) / 2;
809 intptr2 = intptr + imid;
810 i = strcmp(searchterm, (invcntl->iindex + *intptr2));
811 if (i < 0)
812 ihigh = imid - 1;
813 else if (i > 0)
814 ilow = ++imid;
815 else {
816 ilow = imid + 1;
817 break;
818 }
819 }
820 /* be careful about case where searchterm is after last in this block */
821 imid = (ilow) ? ilow - 1 : 0;
822
823 /* fetch the appropriate logical block if not in core */
824 read_logblock(invcntl, imid);
825
826 srch_ext:
827 /* now find the term in this block. tricky this */
828 intptr = (unsigned long *)invcntl->logblk;
829
830 ilow = 0;
831 ihigh = *intptr - 1;
832 intptr += 3;
833 num = 0;
834 while (ilow <= ihigh) {
835 imid = (ilow + ihigh) / 2;
836 entryptr = (ENTRY *)intptr + imid;
837 i = strncmp(searchterm, (invcntl->logblk + entryptr->offset),
838 (int)entryptr->size);
839 if (i == 0)
840 i = strlen(searchterm) - entryptr->size;
841 if (i < 0)
842 ihigh = imid - 1;
843 else if (i > 0)
844 ilow = ++imid;
845 else {
846 num = entryptr->post;
847 break;
848 }
849 }
850 /* be careful about case where searchterm is after last in this block */
851 if (imid >= *(long *)invcntl->logblk) {
852 invcntl->keypnt = *(long *)invcntl->logblk;
853 invstep(invcntl);
854 /* note if this happens the term could be in extended block */
855 if (invcntl->param.startbyte <
856 invcntl->numblk * invcntl->param.sizeblk)
857 goto srch_ext;
858 } else
859 invcntl->keypnt = imid;
860 return (num);
861 }
862
863 #if DEBUG
864
865 /* invdump dumps the block the term parameter is in */
866 void
invdump(INVCONTROL * invcntl,char * term)867 invdump(INVCONTROL *invcntl, char *term)
868 {
869 long i, j, n, *longptr;
870 ENTRY * entryptr;
871 char temp[512], *ptr;
872
873 /* dump superindex if term is "-" */
874 if (*term == '-') {
875 j = atoi(term + 1);
876 longptr = (long *)invcntl->iindex;
877 n = *longptr++;
878 (void) printf("Superindex dump, num blocks=%ld\n", n);
879 longptr += j;
880 while ((longptr <= ((long *)invcntl->iindex) + n) &&
881 invbreak == 0) {
882 (void) printf("%2ld %6ld %s\n", j++, *longptr,
883 invcntl->iindex + *longptr);
884 longptr++;
885 }
886 return;
887 } else if (*term == '#') {
888 j = atoi(term + 1);
889 /* fetch the appropriate logical block */
890 read_logblock(invcntl, j);
891 } else
892 i = abs((int)invfind(invcntl, term));
893 longptr = (long *)invcntl->logblk;
894 n = *longptr++;
895 (void) printf("Entry term to invdump=%s, postings=%ld, "
896 "forward ptr=%ld, back ptr=%ld\n", term, i, *(longptr),
897 *(longptr + 1));
898 entryptr = (ENTRY *)(invcntl->logblk + 12);
899 (void) printf("%ld terms in this block, block=%ld\n", n,
900 invcntl->numblk);
901 (void) printf("\tterm\t\t\tposts\tsize\toffset\tspace\t1st word\n");
902 for (j = 0; j < n && invbreak == 0; j++) {
903 ptr = invcntl->logblk + entryptr->offset;
904 (void) strncpy(temp, ptr, (int)entryptr->size);
905 temp[entryptr->size] = '\0';
906 ptr += (sizeof (long) *
907 (long)((entryptr->size +
908 (sizeof (long) - 1)) / sizeof (long)));
909 (void) printf("%2ld %-24s\t%5ld\t%3d\t%d\t%d\t%ld\n", j, temp,
910 entryptr->post, entryptr->size, entryptr->offset,
911 entryptr->space, *(long *)ptr);
912 entryptr++;
913 }
914 }
915 #endif
916
917 static int
boolready(void)918 boolready(void)
919 {
920 numitems = 0;
921 if (item1 != NULL)
922 free(item1);
923 setsize1 = SETINC;
924 if ((item1 = (POSTING *)malloc(SETINC * sizeof (POSTING))) == NULL) {
925 invcannotalloc(SETINC);
926 return (-1);
927 }
928 if (item2 != NULL)
929 free(item2);
930 setsize2 = SETINC;
931 if ((item2 = (POSTING *)malloc(SETINC * sizeof (POSTING))) == NULL) {
932 invcannotalloc(SETINC);
933 return (-1);
934 }
935 item = item1;
936 enditem = item;
937 return (0);
938 }
939
940 void
boolclear(void)941 boolclear(void)
942 {
943 numitems = 0;
944 item = item1;
945 enditem = item;
946 }
947
948 POSTING *
boolfile(INVCONTROL * invcntl,long * num,int bool)949 boolfile(INVCONTROL *invcntl, long *num, int bool)
950 {
951 ENTRY *entryptr;
952 FILE *file;
953 char *ptr;
954 unsigned long *ptr2;
955 POSTING *newitem;
956 POSTING posting;
957 unsigned u;
958 POSTING *newsetp, *set1p;
959 long newsetc, set1c, set2c;
960
961 entryptr = (ENTRY *) (invcntl->logblk + 12) + invcntl->keypnt;
962 ptr = invcntl->logblk + entryptr->offset;
963 ptr2 = ((unsigned long *)ptr) +
964 (entryptr->size + (sizeof (long) - 1)) / sizeof (long);
965 *num = entryptr->post;
966 switch (bool) {
967 case OR:
968 case NOT:
969 if (*num == 0) {
970 *num = numitems;
971 return (item);
972 }
973 }
974 /* make room for the new set */
975 u = 0;
976 switch (bool) {
977 case AND:
978 case NOT:
979 newsetp = set1p = item;
980 break;
981
982 case OR:
983 u = enditem - item;
984 /* FALLTHROUGH */
985 case REVERSENOT:
986 u += *num;
987 if (item == item2) {
988 if (u > setsize1) {
989 u += SETINC;
990 if ((item1 = (POSTING *) realloc(item1,
991 u * sizeof (POSTING))) == NULL) {
992 goto cannotalloc;
993 }
994 setsize1 = u;
995 }
996 newitem = item1;
997 } else {
998 if (u > setsize2) {
999 u += SETINC;
1000 if ((item2 = (POSTING *)realloc(item2,
1001 u * sizeof (POSTING))) == NULL) {
1002 cannotalloc:
1003 invcannotalloc(u * sizeof (POSTING));
1004 (void) boolready();
1005 *num = -1;
1006 return (NULL);
1007 }
1008 setsize2 = u;
1009 }
1010 newitem = item2;
1011 }
1012 set1p = item;
1013 newsetp = newitem;
1014 }
1015 file = invcntl->postfile;
1016 (void) fseek(file, (long)*ptr2, SEEK_SET);
1017 read_next_posting(invcntl, &posting);
1018 newsetc = 0;
1019 switch (bool) {
1020 case OR:
1021 /* while something in both sets */
1022 set1p = item;
1023 newsetp = newitem;
1024 for (set1c = 0, set2c = 0;
1025 set1c < numitems && set2c < *num; newsetc++) {
1026 if (set1p->lineoffset < posting.lineoffset) {
1027 *newsetp++ = *set1p++;
1028 set1c++;
1029 } else if (set1p->lineoffset > posting.lineoffset) {
1030 *newsetp++ = posting;
1031 read_next_posting(invcntl, &posting);
1032 set2c++;
1033 } else if (set1p->type < posting.type) {
1034 *newsetp++ = *set1p++;
1035 set1c++;
1036 } else if (set1p->type > posting.type) {
1037 *newsetp++ = posting;
1038 read_next_posting(invcntl, &posting);
1039 set2c++;
1040 } else { /* identical postings */
1041 *newsetp++ = *set1p++;
1042 set1c++;
1043 read_next_posting(invcntl, &posting);
1044 set2c++;
1045 }
1046 }
1047 /* find out what ran out and move the rest in */
1048 if (set1c < numitems) {
1049 newsetc += numitems - set1c;
1050 while (set1c++ < numitems) {
1051 *newsetp++ = *set1p++;
1052 }
1053 } else {
1054 while (set2c++ < *num) {
1055 *newsetp++ = posting;
1056 newsetc++;
1057 read_next_posting(invcntl, &posting);
1058 }
1059 }
1060 item = newitem;
1061 break; /* end of OR */
1062 #if 0
1063 case AND:
1064 set1c = 0;
1065 set2c = 0;
1066 while (set1c < numitems && set2c < *num) {
1067 if (set1p->lineoffset < posting.lineoffset) {
1068 set1p++;
1069 set1c++;
1070 } else if (set1p->lineoffset > posting.lineoffset) {
1071 read_next_posting(invcntl, &posting);
1072 set2c++;
1073 } else if (set1p->type < posting.type) {
1074 *set1p++;
1075 set1c++;
1076 } else if (set1p->type > posting.type) {
1077 read_next_posting(invcntl, &posting);
1078 set2c++;
1079 } else { /* identical postings */
1080 *newsetp++ = *set1p++;
1081 newsetc++;
1082 set1c++;
1083 read_next_posting(invcntl, &posting);
1084 set2c++;
1085 }
1086 }
1087 break; /* end of AND */
1088
1089 case NOT:
1090 set1c = 0;
1091 set2c = 0;
1092 while (set1c < numitems && set2c < *num) {
1093 if (set1p->lineoffset < posting.lineoffset) {
1094 *newsetp++ = *set1p++;
1095 newsetc++;
1096 set1c++;
1097 } else if (set1p->lineoffset > posting.lineoffset) {
1098 read_next_posting(invcntl, &posting);
1099 set2c++;
1100 } else if (set1p->type < posting.type) {
1101 *newsetp++ = *set1p++;
1102 newsetc++;
1103 set1c++;
1104 } else if (set1p->type > posting.type) {
1105 read_next_posting(invcntl, &posting);
1106 set2c++;
1107 } else { /* identical postings */
1108 set1c++;
1109 set1p++;
1110 read_next_posting(invcntl, &posting);
1111 set2c++;
1112 }
1113 }
1114 newsetc += numitems - set1c;
1115 while (set1c++ < numitems) {
1116 *newsetp++ = *set1p++;
1117 }
1118 break; /* end of NOT */
1119
1120 case REVERSENOT: /* core NOT incoming set */
1121 set1c = 0;
1122 set2c = 0;
1123 while (set1c < numitems && set2c < *num) {
1124 if (set1p->lineoffset < posting.lineoffset) {
1125 set1p++;
1126 set1c++;
1127 } else if (set1p->lineoffset > posting.lineoffset) {
1128 *newsetp++ = posting;
1129 read_next_posting(invcntl, &posting);
1130 set2c++;
1131 } else if (set1p->type < posting.type) {
1132 set1p++;
1133 set1c++;
1134 } else if (set1p->type > posting.type) {
1135 *newsetp++ = posting;
1136 read_next_posting(invcntl, &posting);
1137 set2c++;
1138 } else { /* identical postings */
1139 set1c++;
1140 set1p++;
1141 read_next_posting(invcntl, &posting);
1142 set2c++;
1143 }
1144 }
1145 while (set2c++ < *num) {
1146 *newsetp++ = posting;
1147 newsetc++;
1148 read_next_posting(invcntl, &posting);
1149 }
1150 item = newitem;
1151 break; /* end of REVERSENOT */
1152 #endif
1153 }
1154 numitems = newsetc;
1155 *num = newsetc;
1156 enditem = (POSTING *)newsetp;
1157 return ((POSTING *)item);
1158 }
1159
1160 #if 0
1161 POSTING *
1162 boolsave(int clear)
1163 {
1164 int i;
1165 POSTING *ptr;
1166 POSTING *oldstuff, *newstuff;
1167
1168 if (numitems == 0) {
1169 if (clear)
1170 boolclear();
1171 return (NULL);
1172 }
1173 /*
1174 * if clear then give them what we have and use (void)
1175 * boolready to realloc
1176 */
1177 if (clear) {
1178 ptr = item;
1179 /* free up the space we didn't give them */
1180 if (item == item1)
1181 item1 = NULL;
1182 else
1183 item2 = NULL;
1184 (void) boolready();
1185 return (ptr);
1186 }
1187 i = (enditem - item) * sizeof (POSTING) + 100;
1188 if ((ptr = (POSTING *)malloc(i))r == NULL) {
1189 invcannotalloc(i);
1190 return (ptr);
1191 }
1192 /* move present set into place */
1193 oldstuff = item;
1194 newstuff = ptr;
1195 while (oldstuff < enditem)
1196 *newstuff++ = *oldstuff++;
1197 return (ptr);
1198 }
1199 #endif
1200
1201 static void
invcannotalloc(size_t n)1202 invcannotalloc(size_t n)
1203 {
1204 (void) fprintf(stderr, "%s: cannot allocate %u bytes\n", argv0, n);
1205 }
1206
1207 static void
invcannotopen(char * file)1208 invcannotopen(char *file)
1209 {
1210 (void) fprintf(stderr, "%s: cannot open file %s\n", argv0, file);
1211 }
1212
1213 static void
invcannotwrite(char * file)1214 invcannotwrite(char *file)
1215 {
1216 (void) perror(argv0); /* must be first to preserve errno */
1217 (void) fprintf(stderr, "%s: write to file %s failed\n", argv0, file);
1218 }
1219