1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License"). You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22 /* Copyright (c) 1988 AT&T */
23 /* All Rights Reserved */
24
25
26 /*
27 * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
28 * Use is subject to license terms.
29 */
30
31 #pragma ident "%Z%%M% %I% %E% SMI"
32
33 #include <ctype.h>
34 #include <stdio.h>
35 #include <sys/types.h>
36 #include <sys/sysmacros.h>
37 #include <sys/byteorder.h>
38 #if SHARE
39 #include <sys/ipc.h>
40 #include <sys/shm.h>
41 #define ERR -1
42 #endif
43 #include "invlib.h"
44 #include "library.h"
45
46 #define DEBUG 0 /* debugging code and realloc messages */
47 #define BLOCKSIZE 2 * BUFSIZ /* logical block size */
48 #define LINEMAX 1000 /* sorted posting line max size */
49 #define POSTINC 10000 /* posting buffer size increment */
50 #define SEP ' ' /* sorted posting field separator */
51 #define SETINC 100 /* posting set size increment */
52 #define STATS 0 /* print statistics */
53 #define SUPERINC 10000 /* super index size increment */
54 #define TERMMAX 512 /* term max size */
55 #define VERSION 1 /* inverted index format version */
56 #define ZIPFSIZE 200 /* zipf curve size */
57 #define FREAD "r" /* fopen for reading */
58 #define FREADP "r+" /* fopen for update */
59 #define FWRITE "w" /* fopen truncate or create for writing */
60 #define FWRITEP "w+" /* fopen truncate or create for update */
61
62 extern char *argv0; /* command name (must be set in main function) */
63
64 int invbreak;
65
66 #if STATS
67 int showzipf; /* show postings per term distribution */
68 #endif
69
70 static POSTING *item, *enditem, *item1 = NULL, *item2 = NULL;
71 static unsigned setsize1, setsize2;
72 static long numitems, totterm, zerolong;
73 static char *indexfile, *postingfile;
74 static FILE *outfile, *fpost;
75 static unsigned supersize = SUPERINC, supintsize;
76 static int numpost, numlogblk, amtused, nextpost,
77 lastinblk, numinvitems;
78 static POSTING *POST, *postptr;
79 static unsigned long *SUPINT, *supint, nextsupfing;
80 static char *SUPFING, *supfing;
81 static char thisterm[TERMMAX];
82 static union {
83 long invblk[BLOCKSIZE / sizeof (long)];
84 char chrblk[BLOCKSIZE];
85 } logicalblk;
86
87 #if DEBUG || STATS
88 static long totpost;
89 #endif
90
91 #if STATS
92 static int zipf[ZIPFSIZE + 1];
93 #endif
94
95 static void invcannotalloc(size_t n);
96 static void invcannotopen(char *file);
97 static void invcannotwrite(char *file);
98 static int invnewterm(void);
99 static int boolready(void);
100
101 long
invmake(char * invname,char * invpost,FILE * infile)102 invmake(char *invname, char *invpost, FILE *infile)
103 {
104 unsigned char *s;
105 long num;
106 int i;
107 long fileindex;
108 unsigned postsize = POSTINC * sizeof (POSTING);
109 unsigned long *intptr;
110 char line[LINEMAX];
111 long tlong;
112 PARAM param;
113 POSTING posting;
114 #if STATS
115 int j;
116 unsigned maxtermlen = 0;
117 #endif
118 /* output file */
119 if ((outfile = vpfopen(invname, FWRITEP)) == NULL) {
120 invcannotopen(invname);
121 return (0);
122 }
123 indexfile = invname;
124 (void) fseek(outfile, (long)BUFSIZ, 0);
125
126 /* posting file */
127 if ((fpost = vpfopen(invpost, FWRITE)) == NULL) {
128 invcannotopen(invpost);
129 return (0);
130 }
131 postingfile = invpost;
132 nextpost = 0;
133 /* get space for the postings list */
134 if ((POST = (POSTING *)malloc(postsize)) == NULL) {
135 invcannotalloc(postsize);
136 return (0);
137 }
138 postptr = POST;
139 /* get space for the superfinger (superindex) */
140 if ((SUPFING = malloc(supersize)) == NULL) {
141 invcannotalloc(supersize);
142 return (0);
143 }
144 supfing = SUPFING;
145 supintsize = supersize / 40;
146 /* also for the superfinger index */
147 if ((SUPINT = malloc(supintsize * sizeof (long))) == NULL) {
148 invcannotalloc(supintsize * sizeof (long));
149 return (0);
150 }
151 supint = SUPINT;
152 supint++; /* leave first term open for a count */
153 /* initialize using an empty term */
154 (void) strcpy(thisterm, "");
155 *supint++ = 0;
156 *supfing++ = ' ';
157 *supfing++ = '\0';
158 nextsupfing = 2;
159 #if DEBUG || STATS
160 totpost = 0L;
161 #endif
162 totterm = 0L;
163 numpost = 1;
164
165 /*
166 * set up as though a block had come and gone, i.e., set up for
167 * new block
168 */
169 amtused = 16; /* leave no space - init 3 words + one for luck */
170 numinvitems = 0;
171 numlogblk = 0;
172 lastinblk = BLOCKSIZE;
173
174 /* now loop as long as more to read (till eof) */
175 while (fgets(line, LINEMAX, infile) != NULL) {
176 #if DEBUG || STATS
177 ++totpost;
178 #endif
179 s = (unsigned char *) strchr(line, SEP);
180 if (s == NULL) /* where did this line come from ??? */
181 continue; /* workaround: just skip it */
182 *s = '\0';
183 #if STATS
184 if ((i = strlen(line)) > maxtermlen) {
185 maxtermlen = i;
186 }
187 #endif
188 #if DEBUG
189 (void) printf("%ld: %s ", totpost, line);
190 (void) fflush(stdout);
191 #endif
192 if (strcmp(thisterm, line) == 0) {
193 if (postptr + 10 > POST + postsize / sizeof (POSTING)) {
194 i = postptr - POST;
195 postsize += POSTINC * sizeof (POSTING);
196 if ((POST = realloc(POST, postsize)) == NULL) {
197 invcannotalloc(postsize);
198 return (0);
199 }
200 postptr = i + POST;
201 #if DEBUG
202 (void) printf("reallocated post space to %u, "
203 "totpost=%ld\n", postsize, totpost);
204 #endif
205 }
206 numpost++;
207 } else {
208 /* have a new term */
209 if (!invnewterm()) {
210 return (0);
211 }
212 (void) strcpy(thisterm, line);
213 numpost = 1;
214 postptr = POST;
215 fileindex = 0;
216 }
217 /* get the new posting */
218 num = *++s - '!';
219 i = 1;
220 do {
221 num = BASE * num + *++s - '!';
222 } while (++i < PRECISION);
223 posting.lineoffset = num;
224 while (++fileindex < nsrcoffset && num > srcoffset[fileindex]) {
225 ;
226 }
227 posting.fileindex = --fileindex;
228 posting.type = *++s;
229 num = *++s - '!';
230 if (*s != '\n') {
231 num = *++s - '!';
232 while (*++s != '\n') {
233 num = BASE * num + *s - '!';
234 }
235 posting.fcnoffset = num;
236 } else {
237 posting.fcnoffset = 0;
238 }
239 *postptr++ = posting;
240 #if DEBUG
241 (void) printf("%ld %ld %ld %ld\n", posting.fileindex,
242 posting.fcnoffset, posting.lineoffset, posting.type);
243 (void) fflush(stdout);
244 #endif
245 }
246 if (!invnewterm()) {
247 return (0);
248 }
249 /* now clean up final block */
250 logicalblk.invblk[0] = numinvitems;
251 /* loops pointer around to start */
252 logicalblk.invblk[1] = 0;
253 logicalblk.invblk[2] = numlogblk - 1;
254 if (fwrite((char *)&logicalblk, BLOCKSIZE, 1, outfile) == 0) {
255 goto cannotwrite;
256 }
257 numlogblk++;
258 /* write out block to save space. what in it doesn't matter */
259 if (fwrite((char *)&logicalblk, BLOCKSIZE, 1, outfile) == 0) {
260 goto cannotwrite;
261 }
262 /* finish up the super finger */
263 *SUPINT = numlogblk;
264 /* add to the offsets the size of the offset pointers */
265 intptr = (SUPINT + 1);
266 i = (char *)supint - (char *)SUPINT;
267 while (intptr < supint)
268 *intptr++ += i;
269 /* write out the offsets (1 for the N at start) and the super finger */
270 if (fwrite((char *)SUPINT, sizeof (*SUPINT), numlogblk + 1,
271 outfile) == 0 ||
272 fwrite(SUPFING, 1, supfing - SUPFING, outfile) == 0) {
273 goto cannotwrite;
274 }
275 /* save the size for reference later */
276 nextsupfing = sizeof (long) + sizeof (long) * numlogblk +
277 (supfing - SUPFING);
278 /*
279 * make sure the file ends at a logical block boundary. This is
280 * necessary for invinsert to correctly create extended blocks
281 */
282 i = nextsupfing % BLOCKSIZE;
283 /* write out junk to fill log blk */
284 if (fwrite(SUPFING, BLOCKSIZE - i, 1, outfile) == 0 ||
285 fflush(outfile) == EOF) {
286 /* rewind doesn't check for write failure */
287 goto cannotwrite;
288 }
289 /* write the control area */
290 rewind(outfile);
291 param.version = VERSION;
292 param.filestat = 0;
293 param.sizeblk = BLOCKSIZE;
294 param.startbyte = (numlogblk + 1) * BLOCKSIZE + BUFSIZ;
295 param.supsize = nextsupfing;
296 param.cntlsize = BUFSIZ;
297 param.share = 0;
298 if (fwrite((char *)¶m, sizeof (param), 1, outfile) == 0) {
299 goto cannotwrite;
300 }
301 for (i = 0; i < 10; i++) /* for future use */
302 if (fwrite((char *)&zerolong, sizeof (zerolong),
303 1, outfile) == 0) {
304 goto cannotwrite;
305 }
306
307 /* make first block loop backwards to last block */
308 if (fflush(outfile) == EOF) {
309 /* fseek doesn't check for write failure */
310 goto cannotwrite;
311 }
312 /* get to second word first block */
313 (void) fseek(outfile, (long)BUFSIZ + 8, 0);
314 tlong = numlogblk - 1;
315 if (fwrite((char *)&tlong, sizeof (tlong), 1, outfile) == 0 ||
316 fclose(outfile) == EOF) {
317 cannotwrite:
318 invcannotwrite(invname);
319 return (0);
320 }
321 if (fclose(fpost) == EOF) {
322 invcannotwrite(postingfile);
323 return (0);
324 }
325 --totterm; /* don't count null term */
326 #if STATS
327 (void) printf("logical blocks = %d, postings = %ld, terms = %ld, "
328 "max term length = %d\n", numlogblk, totpost, totterm, maxtermlen);
329 if (showzipf) {
330 (void) printf(
331 "\n************* ZIPF curve ****************\n");
332 for (j = ZIPFSIZE; j > 1; j--)
333 if (zipf[j])
334 break;
335 for (i = 1; i < j; ++i) {
336 (void) printf("%3d -%6d ", i, zipf[i]);
337 if (i % 6 == 0) (void) putchar('\n');
338 }
339 (void) printf(">%d-%6d\n", ZIPFSIZE, zipf[0]);
340 }
341 #endif
342 /* free all malloc'd memory */
343 free(POST);
344 free(SUPFING);
345 free(SUPINT);
346 return (totterm);
347 }
348
349 /* add a term to the data base */
350
351 static int
invnewterm(void)352 invnewterm(void)
353 {
354 int backupflag, i, j, maxback, holditems, gooditems, howfar;
355 int len, numwilluse, wdlen;
356 char *tptr, *tptr2, *tptr3;
357 union {
358 unsigned long packword[2];
359 ENTRY e;
360 } iteminfo;
361
362 totterm++;
363 #if STATS
364 /* keep zipfian info on the distribution */
365 if (numpost <= ZIPFSIZE)
366 zipf[numpost]++;
367 else
368 zipf[0]++;
369 #endif
370 len = strlen(thisterm);
371 wdlen = (len + (sizeof (long) - 1)) / sizeof (long);
372 numwilluse = (wdlen + 3) * sizeof (long);
373 /* new block if at least 1 item in block */
374 if (numinvitems && numwilluse + amtused > BLOCKSIZE) {
375 /* set up new block */
376 if (supfing + 500 > SUPFING + supersize) {
377 i = supfing - SUPFING;
378 supersize += 20000;
379 if ((SUPFING = realloc(SUPFING, supersize)) == NULL) {
380 invcannotalloc(supersize);
381 return (0);
382 }
383 supfing = i + SUPFING;
384 #if DEBUG
385 (void) printf("reallocated superfinger space to %d, "
386 "totpost=%ld\n", supersize, totpost);
387 #endif
388 }
389 /* check that room for the offset as well */
390 if ((numlogblk + 10) > supintsize) {
391 i = supint - SUPINT;
392 supintsize += SUPERINC;
393 if ((SUPINT = realloc((char *)SUPINT,
394 supintsize * sizeof (long))) == NULL) {
395 invcannotalloc(supintsize * sizeof (long));
396 return (0);
397 }
398 supint = i + SUPINT;
399 #if DEBUG
400 (void) printf("reallocated superfinger offset to %d, "
401 "totpost = %ld\n", supintsize * sizeof (long),
402 totpost);
403 #endif
404 }
405 /* See if backup is efficatious */
406 backupflag = 0;
407 maxback = strlen(thisterm) / 10;
408 holditems = numinvitems;
409 if (maxback > numinvitems)
410 maxback = numinvitems - 2;
411 howfar = 0;
412 while (--maxback > 0) {
413 howfar++;
414 iteminfo.packword[0] =
415 logicalblk.invblk[--holditems * 2 +
416 (sizeof (long) - 1)];
417 if ((i = iteminfo.e.size / 10) < maxback) {
418 maxback = i;
419 backupflag = howfar;
420 gooditems = holditems;
421 tptr2 = logicalblk.chrblk + iteminfo.e.offset;
422 }
423 }
424 /* see if backup will occur */
425 if (backupflag) {
426 numinvitems = gooditems;
427 }
428 logicalblk.invblk[0] = numinvitems;
429 /* set forward pointer pointing to next */
430 logicalblk.invblk[1] = numlogblk + 1;
431 /* set back pointer to last block */
432 logicalblk.invblk[2] = numlogblk - 1;
433 if (fwrite((char *)logicalblk.chrblk, 1,
434 BLOCKSIZE, outfile) == 0) {
435 invcannotwrite(indexfile);
436 return (0);
437 }
438 amtused = 16;
439 numlogblk++;
440 /* check if had to back up, if so do it */
441 if (backupflag) {
442 /* find out where the end of the new block is */
443 iteminfo.packword[0] =
444 logicalblk.invblk[numinvitems * 2 + 1];
445 tptr3 = logicalblk.chrblk + iteminfo.e.offset;
446 /* move the index for this block */
447 for (i = 3; i <= (backupflag * 2 + 2); i++) {
448 logicalblk.invblk[i] =
449 logicalblk.invblk[numinvitems * 2+i];
450 }
451 /* move the word into the super index */
452 iteminfo.packword[0] = logicalblk.invblk[3];
453 iteminfo.packword[1] = logicalblk.invblk[4];
454 tptr2 = logicalblk.chrblk + iteminfo.e.offset;
455 (void) strncpy(supfing, tptr2, (int)iteminfo.e.size);
456 *(supfing + iteminfo.e.size) = '\0';
457 #if DEBUG
458 (void) printf("backup %d at term=%s to term=%s\n",
459 backupflag, thisterm, supfing);
460 #endif
461 *supint++ = nextsupfing;
462 nextsupfing += strlen(supfing) + 1;
463 supfing += strlen(supfing) + 1;
464 /* now fix up the logical block */
465 tptr = logicalblk.chrblk + lastinblk;
466 lastinblk = BLOCKSIZE;
467 tptr2 = logicalblk.chrblk + lastinblk;
468 j = tptr3 - tptr;
469 while (tptr3 > tptr)
470 *--tptr2 = *--tptr3;
471 lastinblk -= j;
472 amtused += (8 * backupflag + j);
473 for (i = 3; i < (backupflag * 2 + 2); i += 2) {
474 iteminfo.packword[0] = logicalblk.invblk[i];
475 iteminfo.e.offset += (tptr2 - tptr3);
476 logicalblk.invblk[i] = iteminfo.packword[0];
477 }
478 numinvitems = backupflag;
479 } else { /* no backup needed */
480 numinvitems = 0;
481 lastinblk = BLOCKSIZE;
482 /* add new term to superindex */
483 (void) strcpy(supfing, thisterm);
484 supfing += strlen(thisterm) + 1;
485 *supint++ = nextsupfing;
486 nextsupfing += strlen(thisterm) + 1;
487 }
488 }
489 lastinblk -= (numwilluse - 8);
490 iteminfo.e.offset = lastinblk;
491 iteminfo.e.size = (char)len;
492 iteminfo.e.space = 0;
493 iteminfo.e.post = numpost;
494 (void) strncpy(logicalblk.chrblk + lastinblk, thisterm, len);
495 amtused += numwilluse;
496 logicalblk.invblk[(lastinblk/sizeof (long))+wdlen] = nextpost;
497 if ((i = postptr - POST) > 0) {
498 if (fwrite((char *)POST, sizeof (POSTING), i, fpost) == 0) {
499 invcannotwrite(postingfile);
500 return (0);
501 }
502 nextpost += i * sizeof (POSTING);
503 }
504 logicalblk.invblk[3+2*numinvitems++] = iteminfo.packword[0];
505 logicalblk.invblk[2+2*numinvitems] = iteminfo.packword[1];
506 return (1);
507 }
508
509 static void
swap_ints(void * p,size_t sz)510 swap_ints(void *p, size_t sz)
511 {
512 uint32_t *s;
513 uint32_t *e = (uint32_t *)p + (sz / sizeof (uint32_t));
514
515 for (s = p; s < e; s++)
516 *s = BSWAP_32(*s);
517 }
518
519 static void
write_param(INVCONTROL * invcntl)520 write_param(INVCONTROL *invcntl)
521 {
522 if (invcntl->swap)
523 swap_ints(&invcntl->param, sizeof (invcntl->param));
524
525 rewind(invcntl->invfile);
526 (void) fwrite((char *)&invcntl->param, sizeof (invcntl->param), 1,
527 invcntl->invfile);
528
529 if (invcntl->swap)
530 swap_ints(&invcntl->param, sizeof (invcntl->param));
531 }
532
533 static void
read_superfinger(INVCONTROL * invcntl)534 read_superfinger(INVCONTROL *invcntl)
535 {
536 size_t count;
537
538 (void) fseek(invcntl->invfile, invcntl->param.startbyte, SEEK_SET);
539 (void) fread(invcntl->iindex, (int)invcntl->param.supsize,
540 1, invcntl->invfile);
541
542 if (invcntl->swap) {
543 /*
544 * The superfinger consists of a count, followed by
545 * count offsets, followed by a string table (which
546 * the offsets reference).
547 *
548 * We need to swap the count and the offsets.
549 */
550 count = 1 + BSWAP_32(*(uint32_t *)invcntl->iindex);
551 swap_ints(invcntl->iindex, count * sizeof (unsigned long));
552 }
553 }
554
555 static void
read_logblock(INVCONTROL * invcntl,int block)556 read_logblock(INVCONTROL *invcntl, int block)
557 {
558 /* note always fetch it if the file is busy */
559 if ((block != invcntl->numblk) ||
560 (invcntl->param.filestat >= INVBUSY)) {
561 (void) fseek(invcntl->invfile,
562 (block * invcntl->param.sizeblk) + invcntl->param.cntlsize,
563 SEEK_SET);
564 invcntl->numblk = block;
565 (void) fread(invcntl->logblk, (int)invcntl->param.sizeblk,
566 1, invcntl->invfile);
567
568 if (invcntl->swap) {
569 size_t count;
570 ENTRY *ecur, *eend;
571 uint32_t *postptr;
572
573 /*
574 * A logblock consists of a count, a next block id,
575 * and a previous block id, followed by count
576 * ENTRYs, followed by alternating strings and
577 * offsets.
578 */
579 swap_ints(invcntl->logblk, 3 * sizeof (unsigned long));
580
581 count = *(uint32_t *)invcntl->logblk;
582
583 ecur = (ENTRY *)((uint32_t *)invcntl->logblk + 3);
584 eend = ecur + count;
585
586 for (; ecur < eend; ecur++) {
587 ecur->offset = BSWAP_16(ecur->offset);
588 ecur->post = BSWAP_32(ecur->post);
589
590 /*
591 * After the string is the posting offset.
592 */
593 postptr = (uint32_t *)(invcntl->logblk +
594 ecur->offset +
595 P2ROUNDUP(ecur->size, sizeof (long)));
596
597 *postptr = BSWAP_32(*postptr);
598 }
599 }
600 }
601 }
602
603 void
read_next_posting(INVCONTROL * invcntl,POSTING * posting)604 read_next_posting(INVCONTROL *invcntl, POSTING *posting)
605 {
606 (void) fread((char *)posting, sizeof (*posting), 1, invcntl->postfile);
607 if (invcntl->swap) {
608 posting->lineoffset = BSWAP_32(posting->lineoffset);
609 posting->fcnoffset = BSWAP_32(posting->fcnoffset);
610 /*
611 * fileindex is a 24-bit field, so shift it before swapping
612 */
613 posting->fileindex = BSWAP_32(posting->fileindex << 8);
614 }
615 }
616
617 int
invopen(INVCONTROL * invcntl,char * invname,char * invpost,int stat)618 invopen(INVCONTROL *invcntl, char *invname, char *invpost, int stat)
619 {
620 int read_index;
621
622 if ((invcntl->invfile = vpfopen(invname,
623 ((stat == 0) ? FREAD : FREADP))) == NULL) {
624 invcannotopen(invname);
625 return (-1);
626 }
627 if (fread((char *)&invcntl->param, sizeof (invcntl->param), 1,
628 invcntl->invfile) == 0) {
629 (void) fprintf(stderr, "%s: empty inverted file\n", argv0);
630 goto closeinv;
631 }
632 if (invcntl->param.version != VERSION &&
633 BSWAP_32(invcntl->param.version) != VERSION) {
634 (void) fprintf(stderr,
635 "%s: cannot read old index format; use -U option to "
636 "force database to rebuild\n", argv0);
637 goto closeinv;
638 }
639 invcntl->swap = (invcntl->param.version != VERSION);
640 if (invcntl->swap)
641 swap_ints(&invcntl->param, sizeof (invcntl->param));
642
643 if (stat == 0 && invcntl->param.filestat == INVALONE) {
644 (void) fprintf(stderr, "%s: inverted file is locked\n", argv0);
645 goto closeinv;
646 }
647 if ((invcntl->postfile = vpfopen(invpost,
648 ((stat == 0) ? FREAD : FREADP))) == NULL) {
649 invcannotopen(invpost);
650 goto closeinv;
651 }
652 /* allocate core for a logical block */
653 if ((invcntl->logblk = malloc(invcntl->param.sizeblk)) == NULL) {
654 invcannotalloc((unsigned)invcntl->param.sizeblk);
655 goto closeboth;
656 }
657 /* allocate for and read in superfinger */
658 read_index = 1;
659 invcntl->iindex = NULL;
660 #if SHARE
661 if (invcntl->param.share == 1) {
662 key_t ftok(), shm_key;
663 struct shmid_ds shm_buf;
664 char *shmat();
665 int shm_id;
666
667 /* see if the shared segment exists */
668 shm_key = ftok(invname, 2);
669 shm_id = shmget(shm_key, 0, 0);
670 /*
671 * Failure simply means (hopefully) that segment doesn't
672 * exist
673 */
674 if (shm_id == -1) {
675 /*
676 * Have to give general write permission due to AMdahl
677 * not having protected segments
678 */
679 shm_id = shmget(shm_key,
680 invcntl->param.supsize + sizeof (long),
681 IPC_CREAT | 0666);
682 if (shm_id == -1)
683 perror("Could not create shared "
684 "memory segment");
685 } else
686 read_index = 0;
687
688 if (shm_id != -1) {
689 invcntl->iindex = shmat(shm_id, 0,
690 ((read_index) ? 0 : SHM_RDONLY));
691 if (invcntl->iindex == (char *)ERR) {
692 (void) fprintf(stderr,
693 "%s: shared memory link failed\n", argv0);
694 invcntl->iindex = NULL;
695 read_index = 1;
696 }
697 }
698 }
699 #endif
700 if (invcntl->iindex == NULL)
701 invcntl->iindex = malloc(invcntl->param.supsize + 16);
702 if (invcntl->iindex == NULL) {
703 invcannotalloc((unsigned)invcntl->param.supsize);
704 free(invcntl->logblk);
705 goto closeboth;
706 }
707 if (read_index) {
708 read_superfinger(invcntl);
709 }
710 invcntl->numblk = -1;
711 if (boolready() == -1) {
712 closeboth:
713 (void) fclose(invcntl->postfile);
714 closeinv:
715 (void) fclose(invcntl->invfile);
716 return (-1);
717 }
718 /* write back out the control block if anything changed */
719 invcntl->param.filestat = stat;
720 if (stat > invcntl->param.filestat)
721 write_param(invcntl);
722 return (1);
723 }
724
725 /* invclose must be called to wrap things up and deallocate core */
726 void
invclose(INVCONTROL * invcntl)727 invclose(INVCONTROL *invcntl)
728 {
729 /* write out the control block in case anything changed */
730 if (invcntl->param.filestat > 0) {
731 invcntl->param.filestat = 0;
732 write_param(invcntl);
733 }
734 (void) fclose(invcntl->invfile);
735 (void) fclose(invcntl->postfile);
736 #if SHARE
737 if (invcntl->param.share > 0) {
738 shmdt(invcntl->iindex);
739 invcntl->iindex = NULL;
740 }
741 #endif
742 if (invcntl->iindex != NULL)
743 free(invcntl->iindex);
744 free(invcntl->logblk);
745 }
746
747 /* invstep steps the inverted file forward one item */
748 void
invstep(INVCONTROL * invcntl)749 invstep(INVCONTROL *invcntl)
750 {
751 if (invcntl->keypnt < (*(int *)invcntl->logblk - 1)) {
752 invcntl->keypnt++;
753 return;
754 }
755
756 /* move forward a block else wrap */
757 read_logblock(invcntl, *(int *)(invcntl->logblk + sizeof (long)));
758
759 invcntl->keypnt = 0;
760 }
761
762 /* invforward moves forward one term in the inverted file */
763 int
invforward(INVCONTROL * invcntl)764 invforward(INVCONTROL *invcntl)
765 {
766 invstep(invcntl);
767 /* skip things with 0 postings */
768 while (((ENTRY *)(invcntl->logblk + 12) + invcntl->keypnt)->post == 0) {
769 invstep(invcntl);
770 }
771 /* Check for having wrapped - reached start of inverted file! */
772 if ((invcntl->numblk == 0) && (invcntl->keypnt == 0))
773 return (0);
774 return (1);
775 }
776
777 /* invterm gets the present term from the present logical block */
778 int
invterm(INVCONTROL * invcntl,char * term)779 invterm(INVCONTROL *invcntl, char *term)
780 {
781 ENTRY * entryptr;
782
783 entryptr = (ENTRY *)(invcntl->logblk + 12) + invcntl->keypnt;
784 (void) strncpy(term, entryptr->offset + invcntl->logblk,
785 (int)entryptr->size);
786 *(term + entryptr->size) = '\0';
787 return (entryptr->post);
788 }
789
790 /* invfind searches for an individual item in the inverted file */
791 long
invfind(INVCONTROL * invcntl,char * searchterm)792 invfind(INVCONTROL *invcntl, char *searchterm)
793 {
794 int imid, ilow, ihigh;
795 long num;
796 int i;
797 unsigned long *intptr, *intptr2;
798 ENTRY *entryptr;
799
800 /* make sure it is initialized via invready */
801 if (invcntl->invfile == 0)
802 return (-1L);
803
804 /* now search for the appropriate finger block */
805 intptr = (unsigned long *)invcntl->iindex;
806
807 ilow = 0;
808 ihigh = *intptr++ - 1;
809 while (ilow <= ihigh) {
810 imid = (ilow + ihigh) / 2;
811 intptr2 = intptr + imid;
812 i = strcmp(searchterm, (invcntl->iindex + *intptr2));
813 if (i < 0)
814 ihigh = imid - 1;
815 else if (i > 0)
816 ilow = ++imid;
817 else {
818 ilow = imid + 1;
819 break;
820 }
821 }
822 /* be careful about case where searchterm is after last in this block */
823 imid = (ilow) ? ilow - 1 : 0;
824
825 /* fetch the appropriate logical block if not in core */
826 read_logblock(invcntl, imid);
827
828 srch_ext:
829 /* now find the term in this block. tricky this */
830 intptr = (unsigned long *)invcntl->logblk;
831
832 ilow = 0;
833 ihigh = *intptr - 1;
834 intptr += 3;
835 num = 0;
836 while (ilow <= ihigh) {
837 imid = (ilow + ihigh) / 2;
838 entryptr = (ENTRY *)intptr + imid;
839 i = strncmp(searchterm, (invcntl->logblk + entryptr->offset),
840 (int)entryptr->size);
841 if (i == 0)
842 i = strlen(searchterm) - entryptr->size;
843 if (i < 0)
844 ihigh = imid - 1;
845 else if (i > 0)
846 ilow = ++imid;
847 else {
848 num = entryptr->post;
849 break;
850 }
851 }
852 /* be careful about case where searchterm is after last in this block */
853 if (imid >= *(long *)invcntl->logblk) {
854 invcntl->keypnt = *(long *)invcntl->logblk;
855 invstep(invcntl);
856 /* note if this happens the term could be in extended block */
857 if (invcntl->param.startbyte <
858 invcntl->numblk * invcntl->param.sizeblk)
859 goto srch_ext;
860 } else
861 invcntl->keypnt = imid;
862 return (num);
863 }
864
865 #if DEBUG
866
867 /* invdump dumps the block the term parameter is in */
868 void
invdump(INVCONTROL * invcntl,char * term)869 invdump(INVCONTROL *invcntl, char *term)
870 {
871 long i, j, n, *longptr;
872 ENTRY * entryptr;
873 char temp[512], *ptr;
874
875 /* dump superindex if term is "-" */
876 if (*term == '-') {
877 j = atoi(term + 1);
878 longptr = (long *)invcntl->iindex;
879 n = *longptr++;
880 (void) printf("Superindex dump, num blocks=%ld\n", n);
881 longptr += j;
882 while ((longptr <= ((long *)invcntl->iindex) + n) &&
883 invbreak == 0) {
884 (void) printf("%2ld %6ld %s\n", j++, *longptr,
885 invcntl->iindex + *longptr);
886 longptr++;
887 }
888 return;
889 } else if (*term == '#') {
890 j = atoi(term + 1);
891 /* fetch the appropriate logical block */
892 read_logblock(invcntl, j);
893 } else
894 i = abs((int)invfind(invcntl, term));
895 longptr = (long *)invcntl->logblk;
896 n = *longptr++;
897 (void) printf("Entry term to invdump=%s, postings=%ld, "
898 "forward ptr=%ld, back ptr=%ld\n", term, i, *(longptr),
899 *(longptr + 1));
900 entryptr = (ENTRY *)(invcntl->logblk + 12);
901 (void) printf("%ld terms in this block, block=%ld\n", n,
902 invcntl->numblk);
903 (void) printf("\tterm\t\t\tposts\tsize\toffset\tspace\t1st word\n");
904 for (j = 0; j < n && invbreak == 0; j++) {
905 ptr = invcntl->logblk + entryptr->offset;
906 (void) strncpy(temp, ptr, (int)entryptr->size);
907 temp[entryptr->size] = '\0';
908 ptr += (sizeof (long) *
909 (long)((entryptr->size +
910 (sizeof (long) - 1)) / sizeof (long)));
911 (void) printf("%2ld %-24s\t%5ld\t%3d\t%d\t%d\t%ld\n", j, temp,
912 entryptr->post, entryptr->size, entryptr->offset,
913 entryptr->space, *(long *)ptr);
914 entryptr++;
915 }
916 }
917 #endif
918
919 static int
boolready(void)920 boolready(void)
921 {
922 numitems = 0;
923 if (item1 != NULL)
924 free(item1);
925 setsize1 = SETINC;
926 if ((item1 = (POSTING *)malloc(SETINC * sizeof (POSTING))) == NULL) {
927 invcannotalloc(SETINC);
928 return (-1);
929 }
930 if (item2 != NULL)
931 free(item2);
932 setsize2 = SETINC;
933 if ((item2 = (POSTING *)malloc(SETINC * sizeof (POSTING))) == NULL) {
934 invcannotalloc(SETINC);
935 return (-1);
936 }
937 item = item1;
938 enditem = item;
939 return (0);
940 }
941
942 void
boolclear(void)943 boolclear(void)
944 {
945 numitems = 0;
946 item = item1;
947 enditem = item;
948 }
949
950 POSTING *
boolfile(INVCONTROL * invcntl,long * num,int bool)951 boolfile(INVCONTROL *invcntl, long *num, int bool)
952 {
953 ENTRY *entryptr;
954 FILE *file;
955 char *ptr;
956 unsigned long *ptr2;
957 POSTING *newitem;
958 POSTING posting;
959 unsigned u;
960 POSTING *newsetp, *set1p;
961 long newsetc, set1c, set2c;
962
963 entryptr = (ENTRY *) (invcntl->logblk + 12) + invcntl->keypnt;
964 ptr = invcntl->logblk + entryptr->offset;
965 ptr2 = ((unsigned long *)ptr) +
966 (entryptr->size + (sizeof (long) - 1)) / sizeof (long);
967 *num = entryptr->post;
968 switch (bool) {
969 case OR:
970 case NOT:
971 if (*num == 0) {
972 *num = numitems;
973 return (item);
974 }
975 }
976 /* make room for the new set */
977 u = 0;
978 switch (bool) {
979 case AND:
980 case NOT:
981 newsetp = set1p = item;
982 break;
983
984 case OR:
985 u = enditem - item;
986 /* FALLTHROUGH */
987 case REVERSENOT:
988 u += *num;
989 if (item == item2) {
990 if (u > setsize1) {
991 u += SETINC;
992 if ((item1 = (POSTING *) realloc(item1,
993 u * sizeof (POSTING))) == NULL) {
994 goto cannotalloc;
995 }
996 setsize1 = u;
997 }
998 newitem = item1;
999 } else {
1000 if (u > setsize2) {
1001 u += SETINC;
1002 if ((item2 = (POSTING *)realloc(item2,
1003 u * sizeof (POSTING))) == NULL) {
1004 cannotalloc:
1005 invcannotalloc(u * sizeof (POSTING));
1006 (void) boolready();
1007 *num = -1;
1008 return (NULL);
1009 }
1010 setsize2 = u;
1011 }
1012 newitem = item2;
1013 }
1014 set1p = item;
1015 newsetp = newitem;
1016 }
1017 file = invcntl->postfile;
1018 (void) fseek(file, (long)*ptr2, SEEK_SET);
1019 read_next_posting(invcntl, &posting);
1020 newsetc = 0;
1021 switch (bool) {
1022 case OR:
1023 /* while something in both sets */
1024 set1p = item;
1025 newsetp = newitem;
1026 for (set1c = 0, set2c = 0;
1027 set1c < numitems && set2c < *num; newsetc++) {
1028 if (set1p->lineoffset < posting.lineoffset) {
1029 *newsetp++ = *set1p++;
1030 set1c++;
1031 } else if (set1p->lineoffset > posting.lineoffset) {
1032 *newsetp++ = posting;
1033 read_next_posting(invcntl, &posting);
1034 set2c++;
1035 } else if (set1p->type < posting.type) {
1036 *newsetp++ = *set1p++;
1037 set1c++;
1038 } else if (set1p->type > posting.type) {
1039 *newsetp++ = posting;
1040 read_next_posting(invcntl, &posting);
1041 set2c++;
1042 } else { /* identical postings */
1043 *newsetp++ = *set1p++;
1044 set1c++;
1045 read_next_posting(invcntl, &posting);
1046 set2c++;
1047 }
1048 }
1049 /* find out what ran out and move the rest in */
1050 if (set1c < numitems) {
1051 newsetc += numitems - set1c;
1052 while (set1c++ < numitems) {
1053 *newsetp++ = *set1p++;
1054 }
1055 } else {
1056 while (set2c++ < *num) {
1057 *newsetp++ = posting;
1058 newsetc++;
1059 read_next_posting(invcntl, &posting);
1060 }
1061 }
1062 item = newitem;
1063 break; /* end of OR */
1064 #if 0
1065 case AND:
1066 set1c = 0;
1067 set2c = 0;
1068 while (set1c < numitems && set2c < *num) {
1069 if (set1p->lineoffset < posting.lineoffset) {
1070 set1p++;
1071 set1c++;
1072 } else if (set1p->lineoffset > posting.lineoffset) {
1073 read_next_posting(invcntl, &posting);
1074 set2c++;
1075 } else if (set1p->type < posting.type) {
1076 *set1p++;
1077 set1c++;
1078 } else if (set1p->type > posting.type) {
1079 read_next_posting(invcntl, &posting);
1080 set2c++;
1081 } else { /* identical postings */
1082 *newsetp++ = *set1p++;
1083 newsetc++;
1084 set1c++;
1085 read_next_posting(invcntl, &posting);
1086 set2c++;
1087 }
1088 }
1089 break; /* end of AND */
1090
1091 case NOT:
1092 set1c = 0;
1093 set2c = 0;
1094 while (set1c < numitems && set2c < *num) {
1095 if (set1p->lineoffset < posting.lineoffset) {
1096 *newsetp++ = *set1p++;
1097 newsetc++;
1098 set1c++;
1099 } else if (set1p->lineoffset > posting.lineoffset) {
1100 read_next_posting(invcntl, &posting);
1101 set2c++;
1102 } else if (set1p->type < posting.type) {
1103 *newsetp++ = *set1p++;
1104 newsetc++;
1105 set1c++;
1106 } else if (set1p->type > posting.type) {
1107 read_next_posting(invcntl, &posting);
1108 set2c++;
1109 } else { /* identical postings */
1110 set1c++;
1111 set1p++;
1112 read_next_posting(invcntl, &posting);
1113 set2c++;
1114 }
1115 }
1116 newsetc += numitems - set1c;
1117 while (set1c++ < numitems) {
1118 *newsetp++ = *set1p++;
1119 }
1120 break; /* end of NOT */
1121
1122 case REVERSENOT: /* core NOT incoming set */
1123 set1c = 0;
1124 set2c = 0;
1125 while (set1c < numitems && set2c < *num) {
1126 if (set1p->lineoffset < posting.lineoffset) {
1127 set1p++;
1128 set1c++;
1129 } else if (set1p->lineoffset > posting.lineoffset) {
1130 *newsetp++ = posting;
1131 read_next_posting(invcntl, &posting);
1132 set2c++;
1133 } else if (set1p->type < posting.type) {
1134 set1p++;
1135 set1c++;
1136 } else if (set1p->type > posting.type) {
1137 *newsetp++ = posting;
1138 read_next_posting(invcntl, &posting);
1139 set2c++;
1140 } else { /* identical postings */
1141 set1c++;
1142 set1p++;
1143 read_next_posting(invcntl, &posting);
1144 set2c++;
1145 }
1146 }
1147 while (set2c++ < *num) {
1148 *newsetp++ = posting;
1149 newsetc++;
1150 read_next_posting(invcntl, &posting);
1151 }
1152 item = newitem;
1153 break; /* end of REVERSENOT */
1154 #endif
1155 }
1156 numitems = newsetc;
1157 *num = newsetc;
1158 enditem = (POSTING *)newsetp;
1159 return ((POSTING *)item);
1160 }
1161
1162 #if 0
1163 POSTING *
1164 boolsave(int clear)
1165 {
1166 int i;
1167 POSTING *ptr;
1168 POSTING *oldstuff, *newstuff;
1169
1170 if (numitems == 0) {
1171 if (clear)
1172 boolclear();
1173 return (NULL);
1174 }
1175 /*
1176 * if clear then give them what we have and use (void)
1177 * boolready to realloc
1178 */
1179 if (clear) {
1180 ptr = item;
1181 /* free up the space we didn't give them */
1182 if (item == item1)
1183 item1 = NULL;
1184 else
1185 item2 = NULL;
1186 (void) boolready();
1187 return (ptr);
1188 }
1189 i = (enditem - item) * sizeof (POSTING) + 100;
1190 if ((ptr = (POSTING *)malloc(i))r == NULL) {
1191 invcannotalloc(i);
1192 return (ptr);
1193 }
1194 /* move present set into place */
1195 oldstuff = item;
1196 newstuff = ptr;
1197 while (oldstuff < enditem)
1198 *newstuff++ = *oldstuff++;
1199 return (ptr);
1200 }
1201 #endif
1202
1203 static void
invcannotalloc(size_t n)1204 invcannotalloc(size_t n)
1205 {
1206 (void) fprintf(stderr, "%s: cannot allocate %u bytes\n", argv0, n);
1207 }
1208
1209 static void
invcannotopen(char * file)1210 invcannotopen(char *file)
1211 {
1212 (void) fprintf(stderr, "%s: cannot open file %s\n", argv0, file);
1213 }
1214
1215 static void
invcannotwrite(char * file)1216 invcannotwrite(char *file)
1217 {
1218 (void) perror(argv0); /* must be first to preserve errno */
1219 (void) fprintf(stderr, "%s: write to file %s failed\n", argv0, file);
1220 }
1221