xref: /titanic_44/usr/src/tools/cscope-fast/invlib.c (revision 7c478bd95313f5f23a4c958a745db2134aa03244)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*	Copyright (c) 1988 AT&T	*/
23 /*	  All Rights Reserved  	*/
24 
25 
26 /*
27  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
28  * Use is subject to license terms.
29  */
30 
31 #pragma ident	"%Z%%M%	%I%	%E% SMI"
32 
33 #include <ctype.h>
34 #include <stdio.h>
35 #include <sys/types.h>
36 #include <sys/sysmacros.h>
37 #include <sys/byteorder.h>
38 #if SHARE
39 #include <sys/ipc.h>
40 #include <sys/shm.h>
41 #define	ERR  -1
42 #endif
43 #include "invlib.h"
44 #include "library.h"
45 
46 #define	DEBUG		0	/* debugging code and realloc messages */
47 #define	BLOCKSIZE	2 * BUFSIZ	/* logical block size */
48 #define	LINEMAX		1000	/* sorted posting line max size */
49 #define	POSTINC		10000	/* posting buffer size increment */
50 #define	SEP		' '	/* sorted posting field separator */
51 #define	SETINC		100	/* posting set size increment */
52 #define	STATS		0	/* print statistics */
53 #define	SUPERINC	10000	/* super index size increment */
54 #define	TERMMAX		512	/* term max size */
55 #define	VERSION		1	/* inverted index format version */
56 #define	ZIPFSIZE	200	/* zipf curve size */
57 #define	FREAD	"r"		/* fopen for reading */
58 #define	FREADP	"r+"		/* fopen for update */
59 #define	FWRITE	"w"		/* fopen truncate or create for writing */
60 #define	FWRITEP	"w+"		/* fopen truncate or create for update */
61 
62 extern	char	*argv0;	/* command name (must be set in main function) */
63 
64 int	invbreak;
65 
66 #if STATS
67 int	showzipf;	/* show postings per term distribution */
68 #endif
69 
70 static	POSTING	*item, *enditem, *item1 = NULL, *item2 = NULL;
71 static	unsigned setsize1, setsize2;
72 static	long	numitems, totterm, zerolong;
73 static	char	*indexfile, *postingfile;
74 static	FILE	*outfile, *fpost;
75 static	unsigned supersize = SUPERINC, supintsize;
76 static	int	numpost, numlogblk, amtused, nextpost,
77 		lastinblk, numinvitems;
78 static	POSTING	*POST, *postptr;
79 static	unsigned long	*SUPINT, *supint, nextsupfing;
80 static	char	*SUPFING, *supfing;
81 static	char	thisterm[TERMMAX];
82 static	union {
83 	long	invblk[BLOCKSIZE / sizeof (long)];
84 	char	chrblk[BLOCKSIZE];
85 } logicalblk;
86 
87 #if DEBUG || STATS
88 static	long	totpost;
89 #endif
90 
91 #if STATS
92 static	int	zipf[ZIPFSIZE + 1];
93 #endif
94 
95 static void invcannotalloc(size_t n);
96 static void invcannotopen(char *file);
97 static void invcannotwrite(char *file);
98 static int invnewterm(void);
99 static int boolready(void);
100 
101 long
invmake(char * invname,char * invpost,FILE * infile)102 invmake(char *invname, char *invpost, FILE *infile)
103 {
104 	unsigned char	*s;
105 	long	num;
106 	int	i;
107 	long	fileindex;
108 	unsigned postsize = POSTINC * sizeof (POSTING);
109 	unsigned long	*intptr;
110 	char	line[LINEMAX];
111 	long	tlong;
112 	PARAM	param;
113 	POSTING	posting;
114 #if STATS
115 	int	j;
116 	unsigned maxtermlen = 0;
117 #endif
118 	/* output file */
119 	if ((outfile = vpfopen(invname, FWRITEP)) == NULL) {
120 		invcannotopen(invname);
121 		return (0);
122 	}
123 	indexfile = invname;
124 	(void) fseek(outfile, (long)BUFSIZ, 0);
125 
126 	/* posting file  */
127 	if ((fpost = vpfopen(invpost, FWRITE)) == NULL) {
128 		invcannotopen(invpost);
129 		return (0);
130 	}
131 	postingfile = invpost;
132 	nextpost = 0;
133 	/* get space for the postings list */
134 	if ((POST = (POSTING *)malloc(postsize)) == NULL) {
135 		invcannotalloc(postsize);
136 		return (0);
137 	}
138 	postptr = POST;
139 	/* get space for the superfinger (superindex) */
140 	if ((SUPFING = malloc(supersize)) == NULL) {
141 		invcannotalloc(supersize);
142 		return (0);
143 	}
144 	supfing = SUPFING;
145 	supintsize = supersize / 40;
146 	/* also for the superfinger index */
147 	if ((SUPINT = malloc(supintsize * sizeof (long))) == NULL) {
148 		invcannotalloc(supintsize * sizeof (long));
149 		return (0);
150 	}
151 	supint = SUPINT;
152 	supint++; /* leave first term open for a count */
153 	/* initialize using an empty term */
154 	(void) strcpy(thisterm, "");
155 	*supint++ = 0;
156 	*supfing++ = ' ';
157 	*supfing++ = '\0';
158 	nextsupfing = 2;
159 #if DEBUG || STATS
160 	totpost = 0L;
161 #endif
162 	totterm = 0L;
163 	numpost = 1;
164 
165 	/*
166 	 * set up as though a block had come and gone, i.e., set up for
167 	 * new block
168 	 */
169 	amtused = 16; /* leave no space - init 3 words + one for luck */
170 	numinvitems = 0;
171 	numlogblk = 0;
172 	lastinblk = BLOCKSIZE;
173 
174 	/* now loop as long as more to read (till eof)  */
175 	while (fgets(line, LINEMAX, infile) != NULL) {
176 #if DEBUG || STATS
177 		++totpost;
178 #endif
179 		s = (unsigned char *) strchr(line, SEP);
180 		if (s == NULL)		/* where did this line come from ??? */
181 			continue;	/* workaround: just skip it */
182 		*s = '\0';
183 #if STATS
184 		if ((i = strlen(line)) > maxtermlen) {
185 			maxtermlen = i;
186 		}
187 #endif
188 #if DEBUG
189 		(void) printf("%ld: %s ", totpost, line);
190 		(void) fflush(stdout);
191 #endif
192 		if (strcmp(thisterm, line) == 0) {
193 			if (postptr + 10 > POST + postsize / sizeof (POSTING)) {
194 				i = postptr - POST;
195 				postsize += POSTINC * sizeof (POSTING);
196 				if ((POST = realloc(POST, postsize)) == NULL) {
197 					invcannotalloc(postsize);
198 					return (0);
199 				}
200 				postptr = i + POST;
201 #if DEBUG
202 				(void) printf("reallocated post space to %u, "
203 				    "totpost=%ld\n", postsize, totpost);
204 #endif
205 			}
206 			numpost++;
207 		} else {
208 			/* have a new term */
209 			if (!invnewterm()) {
210 				return (0);
211 			}
212 			(void) strcpy(thisterm, line);
213 			numpost = 1;
214 			postptr = POST;
215 			fileindex = 0;
216 		}
217 		/* get the new posting */
218 		num = *++s - '!';
219 		i = 1;
220 		do {
221 			num = BASE * num + *++s - '!';
222 		} while (++i < PRECISION);
223 		posting.lineoffset = num;
224 		while (++fileindex < nsrcoffset && num > srcoffset[fileindex]) {
225 			;
226 		}
227 		posting.fileindex = --fileindex;
228 		posting.type = *++s;
229 		num = *++s - '!';
230 		if (*s != '\n') {
231 			num = *++s - '!';
232 			while (*++s != '\n') {
233 				num = BASE * num + *s - '!';
234 			}
235 			posting.fcnoffset = num;
236 		} else {
237 			posting.fcnoffset = 0;
238 		}
239 		*postptr++ = posting;
240 #if DEBUG
241 		(void) printf("%ld %ld %ld %ld\n", posting.fileindex,
242 		    posting.fcnoffset, posting.lineoffset, posting.type);
243 		(void) fflush(stdout);
244 #endif
245 	}
246 	if (!invnewterm()) {
247 		return (0);
248 	}
249 	/* now clean up final block  */
250 	logicalblk.invblk[0] = numinvitems;
251 	/* loops pointer around to start */
252 	logicalblk.invblk[1] = 0;
253 	logicalblk.invblk[2] = numlogblk - 1;
254 	if (fwrite((char *)&logicalblk, BLOCKSIZE, 1, outfile) == 0) {
255 		goto cannotwrite;
256 	}
257 	numlogblk++;
258 	/* write out block to save space. what in it doesn't matter */
259 	if (fwrite((char *)&logicalblk, BLOCKSIZE, 1, outfile) == 0) {
260 		goto cannotwrite;
261 	}
262 	/* finish up the super finger */
263 	*SUPINT = numlogblk;
264 	/* add to the offsets the size of the offset pointers */
265 	intptr = (SUPINT + 1);
266 	i = (char *)supint - (char *)SUPINT;
267 	while (intptr < supint)
268 		*intptr++ += i;
269 	/* write out the offsets (1 for the N at start) and the super finger */
270 	if (fwrite((char *)SUPINT, sizeof (*SUPINT), numlogblk + 1,
271 	    outfile) == 0 ||
272 	    fwrite(SUPFING, 1, supfing - SUPFING, outfile) == 0) {
273 		goto cannotwrite;
274 	}
275 	/* save the size for reference later */
276 	nextsupfing = sizeof (long) + sizeof (long) * numlogblk +
277 	    (supfing - SUPFING);
278 	/*
279 	 * make sure the file ends at a logical block boundary.  This is
280 	 * necessary for invinsert to correctly create extended blocks
281 	 */
282 	i = nextsupfing % BLOCKSIZE;
283 	/* write out junk to fill log blk */
284 	if (fwrite(SUPFING, BLOCKSIZE - i, 1, outfile) == 0 ||
285 	    fflush(outfile) == EOF) {
286 		/* rewind doesn't check for write failure */
287 		goto cannotwrite;
288 	}
289 	/* write the control area */
290 	rewind(outfile);
291 	param.version = VERSION;
292 	param.filestat = 0;
293 	param.sizeblk = BLOCKSIZE;
294 	param.startbyte = (numlogblk + 1) * BLOCKSIZE + BUFSIZ;
295 	param.supsize = nextsupfing;
296 	param.cntlsize = BUFSIZ;
297 	param.share = 0;
298 	if (fwrite((char *)&param, sizeof (param), 1, outfile) == 0) {
299 		goto cannotwrite;
300 	}
301 	for (i = 0; i < 10; i++)	/* for future use */
302 		if (fwrite((char *)&zerolong, sizeof (zerolong),
303 		    1, outfile) == 0) {
304 			goto cannotwrite;
305 		}
306 
307 	/* make first block loop backwards to last block */
308 	if (fflush(outfile) == EOF) {
309 		/* fseek doesn't check for write failure */
310 		goto cannotwrite;
311 	}
312 	/* get to second word first block */
313 	(void) fseek(outfile, (long)BUFSIZ + 8, 0);
314 	tlong = numlogblk - 1;
315 	if (fwrite((char *)&tlong, sizeof (tlong), 1, outfile) == 0 ||
316 	    fclose(outfile) == EOF) {
317 cannotwrite:
318 		invcannotwrite(invname);
319 		return (0);
320 	}
321 	if (fclose(fpost) == EOF) {
322 		invcannotwrite(postingfile);
323 		return (0);
324 	}
325 	--totterm;	/* don't count null term */
326 #if STATS
327 	(void) printf("logical blocks = %d, postings = %ld, terms = %ld, "
328 	    "max term length = %d\n", numlogblk, totpost, totterm, maxtermlen);
329 	if (showzipf) {
330 		(void) printf(
331 		    "\n*************   ZIPF curve  ****************\n");
332 		for (j = ZIPFSIZE; j > 1; j--)
333 			if (zipf[j])
334 				break;
335 		for (i = 1; i < j; ++i) {
336 			(void) printf("%3d -%6d ", i, zipf[i]);
337 			if (i % 6 == 0) (void) putchar('\n');
338 		}
339 		(void) printf(">%d-%6d\n", ZIPFSIZE, zipf[0]);
340 	}
341 #endif
342 	/* free all malloc'd memory */
343 	free(POST);
344 	free(SUPFING);
345 	free(SUPINT);
346 	return (totterm);
347 }
348 
349 /* add a term to the data base */
350 
351 static int
invnewterm(void)352 invnewterm(void)
353 {
354 	int	backupflag, i, j, maxback, holditems, gooditems, howfar;
355 	int	len, numwilluse, wdlen;
356 	char	*tptr, *tptr2, *tptr3;
357 	union {
358 		unsigned long	packword[2];
359 		ENTRY		e;
360 	} iteminfo;
361 
362 	totterm++;
363 #if STATS
364 	/* keep zipfian info on the distribution */
365 	if (numpost <= ZIPFSIZE)
366 		zipf[numpost]++;
367 	else
368 		zipf[0]++;
369 #endif
370 	len = strlen(thisterm);
371 	wdlen = (len + (sizeof (long) - 1)) / sizeof (long);
372 	numwilluse = (wdlen + 3) * sizeof (long);
373 	/* new block if at least 1 item in block */
374 	if (numinvitems && numwilluse + amtused > BLOCKSIZE) {
375 		/* set up new block */
376 		if (supfing + 500 > SUPFING + supersize) {
377 			i = supfing - SUPFING;
378 			supersize += 20000;
379 			if ((SUPFING = realloc(SUPFING, supersize)) == NULL) {
380 				invcannotalloc(supersize);
381 				return (0);
382 			}
383 			supfing = i + SUPFING;
384 #if DEBUG
385 			(void) printf("reallocated superfinger space to %d, "
386 			    "totpost=%ld\n", supersize, totpost);
387 #endif
388 		}
389 		/* check that room for the offset as well */
390 		if ((numlogblk + 10) > supintsize) {
391 			i = supint - SUPINT;
392 			supintsize += SUPERINC;
393 			if ((SUPINT = realloc((char *)SUPINT,
394 			    supintsize * sizeof (long))) == NULL) {
395 				invcannotalloc(supintsize * sizeof (long));
396 				return (0);
397 			}
398 			supint = i + SUPINT;
399 #if DEBUG
400 			(void) printf("reallocated superfinger offset to %d, "
401 			    "totpost = %ld\n", supintsize * sizeof (long),
402 			    totpost);
403 #endif
404 		}
405 		/* See if backup is efficatious  */
406 		backupflag = 0;
407 		maxback = strlen(thisterm) / 10;
408 		holditems = numinvitems;
409 		if (maxback > numinvitems)
410 			maxback = numinvitems - 2;
411 		howfar = 0;
412 		while (--maxback > 0) {
413 			howfar++;
414 			iteminfo.packword[0] =
415 			    logicalblk.invblk[--holditems * 2 +
416 			    (sizeof (long) - 1)];
417 			if ((i = iteminfo.e.size / 10) < maxback) {
418 				maxback = i;
419 				backupflag = howfar;
420 				gooditems = holditems;
421 				tptr2 = logicalblk.chrblk + iteminfo.e.offset;
422 			}
423 		}
424 		/* see if backup will occur  */
425 		if (backupflag) {
426 			numinvitems = gooditems;
427 		}
428 		logicalblk.invblk[0] = numinvitems;
429 		/* set forward pointer pointing to next */
430 		logicalblk.invblk[1] = numlogblk + 1;
431 		/* set back pointer to last block */
432 		logicalblk.invblk[2] = numlogblk - 1;
433 		if (fwrite((char *)logicalblk.chrblk, 1,
434 		    BLOCKSIZE, outfile) == 0) {
435 			invcannotwrite(indexfile);
436 			return (0);
437 		}
438 		amtused = 16;
439 		numlogblk++;
440 		/* check if had to back up, if so do it */
441 		if (backupflag) {
442 			/* find out where the end of the new block is */
443 			iteminfo.packword[0] =
444 			    logicalblk.invblk[numinvitems * 2 + 1];
445 			tptr3 = logicalblk.chrblk + iteminfo.e.offset;
446 			/* move the index for this block */
447 			for (i = 3; i <= (backupflag * 2 + 2); i++) {
448 				logicalblk.invblk[i] =
449 				    logicalblk.invblk[numinvitems * 2+i];
450 			}
451 			/* move the word into the super index */
452 			iteminfo.packword[0] = logicalblk.invblk[3];
453 			iteminfo.packword[1] = logicalblk.invblk[4];
454 			tptr2 = logicalblk.chrblk + iteminfo.e.offset;
455 			(void) strncpy(supfing, tptr2, (int)iteminfo.e.size);
456 			*(supfing + iteminfo.e.size) = '\0';
457 #if DEBUG
458 			(void) printf("backup %d at term=%s to term=%s\n",
459 			    backupflag, thisterm, supfing);
460 #endif
461 			*supint++ = nextsupfing;
462 			nextsupfing += strlen(supfing) + 1;
463 			supfing += strlen(supfing) + 1;
464 			/* now fix up the logical block */
465 			tptr = logicalblk.chrblk + lastinblk;
466 			lastinblk = BLOCKSIZE;
467 			tptr2 = logicalblk.chrblk + lastinblk;
468 			j = tptr3 - tptr;
469 			while (tptr3 > tptr)
470 				*--tptr2 = *--tptr3;
471 			lastinblk -= j;
472 			amtused += (8 * backupflag + j);
473 			for (i = 3; i < (backupflag * 2 + 2); i += 2) {
474 				iteminfo.packword[0] = logicalblk.invblk[i];
475 				iteminfo.e.offset += (tptr2 - tptr3);
476 				logicalblk.invblk[i] = iteminfo.packword[0];
477 			}
478 			numinvitems = backupflag;
479 		} else { /* no backup needed */
480 			numinvitems = 0;
481 			lastinblk = BLOCKSIZE;
482 			/* add new term to superindex */
483 			(void) strcpy(supfing, thisterm);
484 			supfing += strlen(thisterm) + 1;
485 			*supint++ = nextsupfing;
486 			nextsupfing += strlen(thisterm) + 1;
487 		}
488 	}
489 	lastinblk -= (numwilluse - 8);
490 	iteminfo.e.offset = lastinblk;
491 	iteminfo.e.size = (char)len;
492 	iteminfo.e.space = 0;
493 	iteminfo.e.post = numpost;
494 	(void) strncpy(logicalblk.chrblk + lastinblk, thisterm, len);
495 	amtused += numwilluse;
496 	logicalblk.invblk[(lastinblk/sizeof (long))+wdlen] = nextpost;
497 	if ((i = postptr - POST) > 0) {
498 		if (fwrite((char *)POST, sizeof (POSTING), i, fpost) == 0) {
499 			invcannotwrite(postingfile);
500 			return (0);
501 		}
502 		nextpost += i * sizeof (POSTING);
503 	}
504 	logicalblk.invblk[3+2*numinvitems++] = iteminfo.packword[0];
505 	logicalblk.invblk[2+2*numinvitems] = iteminfo.packword[1];
506 	return (1);
507 }
508 
509 static void
swap_ints(void * p,size_t sz)510 swap_ints(void *p, size_t sz)
511 {
512 	uint32_t *s;
513 	uint32_t *e = (uint32_t *)p + (sz / sizeof (uint32_t));
514 
515 	for (s = p; s < e; s++)
516 		*s = BSWAP_32(*s);
517 }
518 
519 static void
write_param(INVCONTROL * invcntl)520 write_param(INVCONTROL *invcntl)
521 {
522 	if (invcntl->swap)
523 		swap_ints(&invcntl->param, sizeof (invcntl->param));
524 
525 	rewind(invcntl->invfile);
526 	(void) fwrite((char *)&invcntl->param, sizeof (invcntl->param), 1,
527 	    invcntl->invfile);
528 
529 	if (invcntl->swap)
530 		swap_ints(&invcntl->param, sizeof (invcntl->param));
531 }
532 
533 static void
read_superfinger(INVCONTROL * invcntl)534 read_superfinger(INVCONTROL *invcntl)
535 {
536 	size_t count;
537 
538 	(void) fseek(invcntl->invfile, invcntl->param.startbyte, SEEK_SET);
539 	(void) fread(invcntl->iindex, (int)invcntl->param.supsize,
540 	    1, invcntl->invfile);
541 
542 	if (invcntl->swap) {
543 		/*
544 		 * The superfinger consists of a count, followed by
545 		 * count offsets, followed by a string table (which
546 		 * the offsets reference).
547 		 *
548 		 * We need to swap the count and the offsets.
549 		 */
550 		count = 1 + BSWAP_32(*(uint32_t *)invcntl->iindex);
551 		swap_ints(invcntl->iindex, count * sizeof (unsigned long));
552 	}
553 }
554 
555 static void
read_logblock(INVCONTROL * invcntl,int block)556 read_logblock(INVCONTROL *invcntl, int block)
557 {
558 	/* note always fetch it if the file is busy */
559 	if ((block != invcntl->numblk) ||
560 	    (invcntl->param.filestat >= INVBUSY)) {
561 		(void) fseek(invcntl->invfile,
562 		    (block * invcntl->param.sizeblk) + invcntl->param.cntlsize,
563 		    SEEK_SET);
564 		invcntl->numblk = block;
565 		(void) fread(invcntl->logblk, (int)invcntl->param.sizeblk,
566 		    1, invcntl->invfile);
567 
568 		if (invcntl->swap) {
569 			size_t count;
570 			ENTRY *ecur, *eend;
571 			uint32_t *postptr;
572 
573 			/*
574 			 * A logblock consists of a count, a next block id,
575 			 * and a previous block id, followed by count
576 			 * ENTRYs, followed by alternating strings and
577 			 * offsets.
578 			 */
579 			swap_ints(invcntl->logblk, 3 * sizeof (unsigned long));
580 
581 			count = *(uint32_t *)invcntl->logblk;
582 
583 			ecur = (ENTRY *)((uint32_t *)invcntl->logblk + 3);
584 			eend = ecur + count;
585 
586 			for (; ecur < eend; ecur++) {
587 				ecur->offset = BSWAP_16(ecur->offset);
588 				ecur->post = BSWAP_32(ecur->post);
589 
590 				/*
591 				 * After the string is the posting offset.
592 				 */
593 				postptr = (uint32_t *)(invcntl->logblk +
594 				    ecur->offset +
595 				    P2ROUNDUP(ecur->size, sizeof (long)));
596 
597 				*postptr = BSWAP_32(*postptr);
598 			}
599 		}
600 	}
601 }
602 
603 void
read_next_posting(INVCONTROL * invcntl,POSTING * posting)604 read_next_posting(INVCONTROL *invcntl, POSTING *posting)
605 {
606 	(void) fread((char *)posting, sizeof (*posting), 1, invcntl->postfile);
607 	if (invcntl->swap) {
608 		posting->lineoffset = BSWAP_32(posting->lineoffset);
609 		posting->fcnoffset = BSWAP_32(posting->fcnoffset);
610 		/*
611 		 * fileindex is a 24-bit field, so shift it before swapping
612 		 */
613 		posting->fileindex = BSWAP_32(posting->fileindex << 8);
614 	}
615 }
616 
617 int
invopen(INVCONTROL * invcntl,char * invname,char * invpost,int stat)618 invopen(INVCONTROL *invcntl, char *invname, char *invpost, int stat)
619 {
620 	int	read_index;
621 
622 	if ((invcntl->invfile = vpfopen(invname,
623 	    ((stat == 0) ? FREAD : FREADP))) == NULL) {
624 		invcannotopen(invname);
625 		return (-1);
626 	}
627 	if (fread((char *)&invcntl->param, sizeof (invcntl->param), 1,
628 	    invcntl->invfile) == 0) {
629 		(void) fprintf(stderr, "%s: empty inverted file\n", argv0);
630 		goto closeinv;
631 	}
632 	if (invcntl->param.version != VERSION &&
633 	    BSWAP_32(invcntl->param.version) != VERSION) {
634 		(void) fprintf(stderr,
635 		    "%s: cannot read old index format; use -U option to "
636 		    "force database to rebuild\n", argv0);
637 		goto closeinv;
638 	}
639 	invcntl->swap = (invcntl->param.version != VERSION);
640 	if (invcntl->swap)
641 		swap_ints(&invcntl->param, sizeof (invcntl->param));
642 
643 	if (stat == 0 && invcntl->param.filestat == INVALONE) {
644 		(void) fprintf(stderr, "%s: inverted file is locked\n", argv0);
645 		goto closeinv;
646 	}
647 	if ((invcntl->postfile = vpfopen(invpost,
648 	    ((stat == 0) ? FREAD : FREADP))) == NULL) {
649 		invcannotopen(invpost);
650 		goto closeinv;
651 	}
652 	/* allocate core for a logical block  */
653 	if ((invcntl->logblk = malloc(invcntl->param.sizeblk)) == NULL) {
654 		invcannotalloc((unsigned)invcntl->param.sizeblk);
655 		goto closeboth;
656 	}
657 	/* allocate for and read in superfinger  */
658 	read_index = 1;
659 	invcntl->iindex = NULL;
660 #if SHARE
661 	if (invcntl->param.share == 1) {
662 		key_t ftok(), shm_key;
663 		struct shmid_ds shm_buf;
664 		char	*shmat();
665 		int	shm_id;
666 
667 		/* see if the shared segment exists */
668 		shm_key = ftok(invname, 2);
669 		shm_id = shmget(shm_key, 0, 0);
670 		/*
671 		 * Failure simply means (hopefully) that segment doesn't
672 		 * exist
673 		 */
674 		if (shm_id == -1) {
675 			/*
676 			 * Have to give general write permission due to AMdahl
677 			 * not having protected segments
678 			 */
679 			shm_id = shmget(shm_key,
680 			    invcntl->param.supsize + sizeof (long),
681 			    IPC_CREAT | 0666);
682 			if (shm_id == -1)
683 				perror("Could not create shared "
684 				    "memory segment");
685 		} else
686 			read_index = 0;
687 
688 		if (shm_id != -1) {
689 			invcntl->iindex = shmat(shm_id, 0,
690 			    ((read_index) ? 0 : SHM_RDONLY));
691 			if (invcntl->iindex == (char *)ERR) {
692 				(void) fprintf(stderr,
693 				    "%s: shared memory link failed\n", argv0);
694 				invcntl->iindex = NULL;
695 				read_index = 1;
696 			}
697 		}
698 	}
699 #endif
700 	if (invcntl->iindex == NULL)
701 		invcntl->iindex = malloc(invcntl->param.supsize + 16);
702 	if (invcntl->iindex == NULL) {
703 		invcannotalloc((unsigned)invcntl->param.supsize);
704 		free(invcntl->logblk);
705 		goto closeboth;
706 	}
707 	if (read_index) {
708 		read_superfinger(invcntl);
709 	}
710 	invcntl->numblk = -1;
711 	if (boolready() == -1) {
712 	closeboth:
713 		(void) fclose(invcntl->postfile);
714 	closeinv:
715 		(void) fclose(invcntl->invfile);
716 		return (-1);
717 	}
718 	/* write back out the control block if anything changed */
719 	invcntl->param.filestat = stat;
720 	if (stat > invcntl->param.filestat)
721 		write_param(invcntl);
722 	return (1);
723 }
724 
725 /* invclose must be called to wrap things up and deallocate core */
726 void
invclose(INVCONTROL * invcntl)727 invclose(INVCONTROL *invcntl)
728 {
729 	/* write out the control block in case anything changed */
730 	if (invcntl->param.filestat > 0) {
731 		invcntl->param.filestat = 0;
732 		write_param(invcntl);
733 	}
734 	(void) fclose(invcntl->invfile);
735 	(void) fclose(invcntl->postfile);
736 #if SHARE
737 	if (invcntl->param.share > 0) {
738 		shmdt(invcntl->iindex);
739 		invcntl->iindex = NULL;
740 	}
741 #endif
742 	if (invcntl->iindex != NULL)
743 		free(invcntl->iindex);
744 	free(invcntl->logblk);
745 }
746 
747 /* invstep steps the inverted file forward one item */
748 void
invstep(INVCONTROL * invcntl)749 invstep(INVCONTROL *invcntl)
750 {
751 	if (invcntl->keypnt < (*(int *)invcntl->logblk - 1)) {
752 		invcntl->keypnt++;
753 		return;
754 	}
755 
756 	/* move forward a block else wrap */
757 	read_logblock(invcntl, *(int *)(invcntl->logblk + sizeof (long)));
758 
759 	invcntl->keypnt = 0;
760 }
761 
762 /* invforward moves forward one term in the inverted file  */
763 int
invforward(INVCONTROL * invcntl)764 invforward(INVCONTROL *invcntl)
765 {
766 	invstep(invcntl);
767 	/* skip things with 0 postings */
768 	while (((ENTRY *)(invcntl->logblk + 12) + invcntl->keypnt)->post == 0) {
769 		invstep(invcntl);
770 	}
771 	/* Check for having wrapped - reached start of inverted file! */
772 	if ((invcntl->numblk == 0) && (invcntl->keypnt == 0))
773 		return (0);
774 	return (1);
775 }
776 
777 /*  invterm gets the present term from the present logical block  */
778 int
invterm(INVCONTROL * invcntl,char * term)779 invterm(INVCONTROL *invcntl, char *term)
780 {
781 	ENTRY * entryptr;
782 
783 	entryptr = (ENTRY *)(invcntl->logblk + 12) + invcntl->keypnt;
784 	(void) strncpy(term, entryptr->offset + invcntl->logblk,
785 	    (int)entryptr->size);
786 	*(term + entryptr->size) = '\0';
787 	return (entryptr->post);
788 }
789 
790 /* invfind searches for an individual item in the inverted file */
791 long
invfind(INVCONTROL * invcntl,char * searchterm)792 invfind(INVCONTROL *invcntl, char *searchterm)
793 {
794 	int	imid, ilow, ihigh;
795 	long	num;
796 	int	i;
797 	unsigned long	*intptr, *intptr2;
798 	ENTRY *entryptr;
799 
800 	/* make sure it is initialized via invready  */
801 	if (invcntl->invfile == 0)
802 		return (-1L);
803 
804 	/* now search for the appropriate finger block */
805 	intptr = (unsigned long *)invcntl->iindex;
806 
807 	ilow = 0;
808 	ihigh = *intptr++ - 1;
809 	while (ilow <= ihigh) {
810 		imid = (ilow + ihigh) / 2;
811 		intptr2 = intptr + imid;
812 		i = strcmp(searchterm, (invcntl->iindex + *intptr2));
813 		if (i < 0)
814 			ihigh = imid - 1;
815 		else if (i > 0)
816 			ilow = ++imid;
817 		else {
818 			ilow = imid + 1;
819 			break;
820 		}
821 	}
822 	/* be careful about case where searchterm is after last in this block */
823 	imid = (ilow) ? ilow - 1 : 0;
824 
825 	/* fetch the appropriate logical block if not in core  */
826 	read_logblock(invcntl, imid);
827 
828 srch_ext:
829 	/* now find the term in this block. tricky this  */
830 	intptr = (unsigned long *)invcntl->logblk;
831 
832 	ilow = 0;
833 	ihigh = *intptr - 1;
834 	intptr += 3;
835 	num = 0;
836 	while (ilow <= ihigh) {
837 		imid = (ilow + ihigh) / 2;
838 		entryptr = (ENTRY *)intptr + imid;
839 		i = strncmp(searchterm, (invcntl->logblk + entryptr->offset),
840 		    (int)entryptr->size);
841 		if (i == 0)
842 			i = strlen(searchterm) - entryptr->size;
843 		if (i < 0)
844 			ihigh = imid - 1;
845 		else if (i > 0)
846 			ilow = ++imid;
847 		else {
848 			num = entryptr->post;
849 			break;
850 		}
851 	}
852 	/* be careful about case where searchterm is after last in this block */
853 	if (imid >= *(long *)invcntl->logblk) {
854 		invcntl->keypnt = *(long *)invcntl->logblk;
855 		invstep(invcntl);
856 		/* note if this happens the term could be in extended block */
857 		if (invcntl->param.startbyte <
858 		    invcntl->numblk * invcntl->param.sizeblk)
859 			goto srch_ext;
860 	} else
861 		invcntl->keypnt = imid;
862 	return (num);
863 }
864 
865 #if DEBUG
866 
867 /* invdump dumps the block the term parameter is in */
868 void
invdump(INVCONTROL * invcntl,char * term)869 invdump(INVCONTROL *invcntl, char *term)
870 {
871 	long	i, j, n, *longptr;
872 	ENTRY * entryptr;
873 	char	temp[512], *ptr;
874 
875 	/* dump superindex if term is "-"  */
876 	if (*term == '-') {
877 		j = atoi(term + 1);
878 		longptr = (long *)invcntl->iindex;
879 		n = *longptr++;
880 		(void) printf("Superindex dump, num blocks=%ld\n", n);
881 		longptr += j;
882 		while ((longptr <= ((long *)invcntl->iindex) + n) &&
883 		    invbreak == 0) {
884 			(void) printf("%2ld  %6ld %s\n", j++, *longptr,
885 			    invcntl->iindex + *longptr);
886 			longptr++;
887 		}
888 		return;
889 	} else if (*term == '#') {
890 		j = atoi(term + 1);
891 		/* fetch the appropriate logical block */
892 		read_logblock(invcntl, j);
893 	} else
894 		i = abs((int)invfind(invcntl, term));
895 	longptr = (long *)invcntl->logblk;
896 	n = *longptr++;
897 	(void) printf("Entry term to invdump=%s, postings=%ld, "
898 	    "forward ptr=%ld, back ptr=%ld\n", term, i, *(longptr),
899 	    *(longptr + 1));
900 	entryptr = (ENTRY *)(invcntl->logblk + 12);
901 	(void) printf("%ld terms in this block, block=%ld\n", n,
902 	    invcntl->numblk);
903 	(void) printf("\tterm\t\t\tposts\tsize\toffset\tspace\t1st word\n");
904 	for (j = 0; j < n && invbreak == 0; j++) {
905 		ptr = invcntl->logblk + entryptr->offset;
906 		(void) strncpy(temp, ptr, (int)entryptr->size);
907 		temp[entryptr->size] = '\0';
908 		ptr += (sizeof (long) *
909 		    (long)((entryptr->size +
910 		    (sizeof (long) - 1)) / sizeof (long)));
911 		(void) printf("%2ld  %-24s\t%5ld\t%3d\t%d\t%d\t%ld\n", j, temp,
912 		    entryptr->post, entryptr->size, entryptr->offset,
913 		    entryptr->space, *(long *)ptr);
914 		entryptr++;
915 	}
916 }
917 #endif
918 
919 static int
boolready(void)920 boolready(void)
921 {
922 	numitems = 0;
923 	if (item1 != NULL)
924 		free(item1);
925 	setsize1 = SETINC;
926 	if ((item1 = (POSTING *)malloc(SETINC * sizeof (POSTING))) == NULL) {
927 		invcannotalloc(SETINC);
928 		return (-1);
929 	}
930 	if (item2 != NULL)
931 		free(item2);
932 	setsize2 = SETINC;
933 	if ((item2 = (POSTING *)malloc(SETINC * sizeof (POSTING))) == NULL) {
934 		invcannotalloc(SETINC);
935 		return (-1);
936 	}
937 	item = item1;
938 	enditem = item;
939 	return (0);
940 }
941 
942 void
boolclear(void)943 boolclear(void)
944 {
945 	numitems = 0;
946 	item = item1;
947 	enditem = item;
948 }
949 
950 POSTING *
boolfile(INVCONTROL * invcntl,long * num,int bool)951 boolfile(INVCONTROL *invcntl, long *num, int bool)
952 {
953 	ENTRY	*entryptr;
954 	FILE	*file;
955 	char	*ptr;
956 	unsigned long	*ptr2;
957 	POSTING	*newitem;
958 	POSTING	posting;
959 	unsigned u;
960 	POSTING *newsetp, *set1p;
961 	long	newsetc, set1c, set2c;
962 
963 	entryptr = (ENTRY *) (invcntl->logblk + 12) + invcntl->keypnt;
964 	ptr = invcntl->logblk + entryptr->offset;
965 	ptr2 = ((unsigned long *)ptr) +
966 	    (entryptr->size + (sizeof (long) - 1)) / sizeof (long);
967 	*num = entryptr->post;
968 	switch (bool) {
969 	case OR:
970 	case NOT:
971 		if (*num == 0) {
972 			*num = numitems;
973 			return (item);
974 		}
975 	}
976 	/* make room for the new set */
977 	u = 0;
978 	switch (bool) {
979 	case AND:
980 	case NOT:
981 		newsetp = set1p = item;
982 		break;
983 
984 	case OR:
985 		u = enditem - item;
986 		/* FALLTHROUGH */
987 	case REVERSENOT:
988 		u += *num;
989 		if (item == item2) {
990 			if (u > setsize1) {
991 				u += SETINC;
992 				if ((item1 = (POSTING *) realloc(item1,
993 				    u * sizeof (POSTING))) == NULL) {
994 					goto cannotalloc;
995 				}
996 				setsize1 = u;
997 			}
998 			newitem = item1;
999 		} else {
1000 			if (u > setsize2) {
1001 				u += SETINC;
1002 				if ((item2 = (POSTING *)realloc(item2,
1003 				    u * sizeof (POSTING))) == NULL) {
1004 				cannotalloc:
1005 					invcannotalloc(u * sizeof (POSTING));
1006 					(void) boolready();
1007 					*num = -1;
1008 					return (NULL);
1009 				}
1010 				setsize2 = u;
1011 			}
1012 			newitem = item2;
1013 		}
1014 		set1p = item;
1015 		newsetp = newitem;
1016 	}
1017 	file = invcntl->postfile;
1018 	(void) fseek(file, (long)*ptr2, SEEK_SET);
1019 	read_next_posting(invcntl, &posting);
1020 	newsetc = 0;
1021 	switch (bool) {
1022 	case OR:
1023 		/* while something in both sets */
1024 		set1p = item;
1025 		newsetp = newitem;
1026 		for (set1c = 0, set2c = 0;
1027 		    set1c < numitems && set2c < *num; newsetc++) {
1028 			if (set1p->lineoffset < posting.lineoffset) {
1029 				*newsetp++ = *set1p++;
1030 				set1c++;
1031 			} else if (set1p->lineoffset > posting.lineoffset) {
1032 				*newsetp++ = posting;
1033 				read_next_posting(invcntl, &posting);
1034 				set2c++;
1035 			} else if (set1p->type < posting.type) {
1036 				*newsetp++ = *set1p++;
1037 				set1c++;
1038 			} else if (set1p->type > posting.type) {
1039 				*newsetp++ = posting;
1040 				read_next_posting(invcntl, &posting);
1041 				set2c++;
1042 			} else {	/* identical postings */
1043 				*newsetp++ = *set1p++;
1044 				set1c++;
1045 				read_next_posting(invcntl, &posting);
1046 				set2c++;
1047 			}
1048 		}
1049 		/* find out what ran out and move the rest in */
1050 		if (set1c < numitems) {
1051 			newsetc += numitems - set1c;
1052 			while (set1c++ < numitems) {
1053 				*newsetp++ = *set1p++;
1054 			}
1055 		} else {
1056 			while (set2c++ < *num) {
1057 				*newsetp++ = posting;
1058 				newsetc++;
1059 				read_next_posting(invcntl, &posting);
1060 			}
1061 		}
1062 		item = newitem;
1063 		break; /* end of OR */
1064 #if 0
1065 	case AND:
1066 		set1c = 0;
1067 		set2c = 0;
1068 		while (set1c < numitems && set2c < *num) {
1069 			if (set1p->lineoffset < posting.lineoffset) {
1070 				set1p++;
1071 				set1c++;
1072 			} else if (set1p->lineoffset > posting.lineoffset) {
1073 				read_next_posting(invcntl, &posting);
1074 				set2c++;
1075 			} else if (set1p->type < posting.type)  {
1076 				*set1p++;
1077 				set1c++;
1078 			} else if (set1p->type > posting.type) {
1079 				read_next_posting(invcntl, &posting);
1080 				set2c++;
1081 			} else {	/* identical postings */
1082 				*newsetp++ = *set1p++;
1083 				newsetc++;
1084 				set1c++;
1085 				read_next_posting(invcntl, &posting);
1086 				set2c++;
1087 			}
1088 		}
1089 		break; /* end of AND */
1090 
1091 	case NOT:
1092 		set1c = 0;
1093 		set2c = 0;
1094 		while (set1c < numitems && set2c < *num) {
1095 			if (set1p->lineoffset < posting.lineoffset) {
1096 				*newsetp++ = *set1p++;
1097 				newsetc++;
1098 				set1c++;
1099 			} else if (set1p->lineoffset > posting.lineoffset) {
1100 				read_next_posting(invcntl, &posting);
1101 				set2c++;
1102 			} else if (set1p->type < posting.type) {
1103 				*newsetp++ = *set1p++;
1104 				newsetc++;
1105 				set1c++;
1106 			} else if (set1p->type > posting.type) {
1107 				read_next_posting(invcntl, &posting);
1108 				set2c++;
1109 			} else {	/* identical postings */
1110 				set1c++;
1111 				set1p++;
1112 				read_next_posting(invcntl, &posting);
1113 				set2c++;
1114 			}
1115 		}
1116 		newsetc += numitems - set1c;
1117 		while (set1c++ < numitems) {
1118 			*newsetp++ = *set1p++;
1119 		}
1120 		break; /* end of NOT */
1121 
1122 	case REVERSENOT:  /* core NOT incoming set */
1123 		set1c = 0;
1124 		set2c = 0;
1125 		while (set1c < numitems && set2c < *num) {
1126 			if (set1p->lineoffset < posting.lineoffset) {
1127 				set1p++;
1128 				set1c++;
1129 			} else if (set1p->lineoffset > posting.lineoffset) {
1130 				*newsetp++ = posting;
1131 				read_next_posting(invcntl, &posting);
1132 				set2c++;
1133 			} else if (set1p->type < posting.type) {
1134 				set1p++;
1135 				set1c++;
1136 			} else if (set1p->type > posting.type) {
1137 				*newsetp++ = posting;
1138 				read_next_posting(invcntl, &posting);
1139 				set2c++;
1140 			} else {	/* identical postings */
1141 				set1c++;
1142 				set1p++;
1143 				read_next_posting(invcntl, &posting);
1144 				set2c++;
1145 			}
1146 		}
1147 		while (set2c++ < *num) {
1148 			*newsetp++ = posting;
1149 			newsetc++;
1150 			read_next_posting(invcntl, &posting);
1151 		}
1152 		item = newitem;
1153 		break; /* end of REVERSENOT  */
1154 #endif
1155 	}
1156 	numitems = newsetc;
1157 	*num = newsetc;
1158 	enditem = (POSTING *)newsetp;
1159 	return ((POSTING *)item);
1160 }
1161 
1162 #if 0
1163 POSTING *
1164 boolsave(int clear)
1165 {
1166 	int	i;
1167 	POSTING	*ptr;
1168 	POSTING	*oldstuff, *newstuff;
1169 
1170 	if (numitems == 0) {
1171 		if (clear)
1172 			boolclear();
1173 		return (NULL);
1174 	}
1175 	/*
1176 	 * if clear then give them what we have and use (void)
1177 	 * boolready to realloc
1178 	 */
1179 	if (clear) {
1180 		ptr = item;
1181 		/* free up the space we didn't give them */
1182 		if (item == item1)
1183 			item1 = NULL;
1184 		else
1185 			item2 = NULL;
1186 		(void) boolready();
1187 		return (ptr);
1188 	}
1189 	i = (enditem - item) * sizeof (POSTING) + 100;
1190 	if ((ptr = (POSTING *)malloc(i))r == NULL) {
1191 		invcannotalloc(i);
1192 		return (ptr);
1193 	}
1194 	/* move present set into place  */
1195 	oldstuff = item;
1196 	newstuff = ptr;
1197 	while (oldstuff < enditem)
1198 		*newstuff++ = *oldstuff++;
1199 	return (ptr);
1200 }
1201 #endif
1202 
1203 static void
invcannotalloc(size_t n)1204 invcannotalloc(size_t n)
1205 {
1206 	(void) fprintf(stderr, "%s: cannot allocate %u bytes\n", argv0, n);
1207 }
1208 
1209 static void
invcannotopen(char * file)1210 invcannotopen(char *file)
1211 {
1212 	(void) fprintf(stderr, "%s: cannot open file %s\n", argv0, file);
1213 }
1214 
1215 static void
invcannotwrite(char * file)1216 invcannotwrite(char *file)
1217 {
1218 	(void) perror(argv0);	/* must be first to preserve errno */
1219 	(void) fprintf(stderr, "%s: write to file %s failed\n", argv0, file);
1220 }
1221