xref: /illumos-gate/usr/src/cmd/fs.d/nfs/nfslog/readbuf.c (revision 5c43f0bd385a568d23843a2fa79774668657d147)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * nfs log - read buffer file and return structs in usable form
29  */
30 
31 #include <ctype.h>
32 #include <stdio.h>
33 #include <stdlib.h>
34 #include <stddef.h>
35 #include <string.h>
36 #include <fcntl.h>
37 #include <unistd.h>
38 #include <signal.h>
39 #include <sys/types.h>
40 #include <sys/param.h>
41 #include <sys/stat.h>
42 #include <sys/utsname.h>
43 #include <sys/mman.h>
44 #include <strings.h>
45 #include <errno.h>
46 #include <syslog.h>
47 #include <time.h>
48 #include <limits.h>
49 #include <libintl.h>
50 #include <values.h>
51 #include <search.h>
52 #include <pwd.h>
53 #include <netdb.h>
54 #include <rpc/rpc.h>
55 #include <netconfig.h>
56 #include <netdir.h>
57 #include <nfs/nfs_sec.h>
58 #include <nfs/export.h>
59 #include <rpc/auth.h>
60 #include <rpc/svc.h>
61 #include <rpc/xdr.h>
62 #include <rpc/clnt.h>
63 #include <nfs/nfs.h>
64 #include <nfs/nfs_log.h>
65 #include "nfslogd.h"
66 
/*
 * Upper bound on the number of out-of-sequence log records that
 * nfslog_get_logrecord() keeps queued on a buffer before it hands back
 * the head of the queue regardless of its record id.
 */
#define	MAX_LRS_READ_AHEAD 2048
/*
 * Once more than this many processed records are pending on a buffer,
 * nfslog_ins_last_rec_processed() attempts to fold the leading range
 * into the on-disk buffer header offset.
 */
#define	MAX_RECS_TO_DELAY 32768

/* Forward declarations of the file-local helpers defined below. */
static int 		nfslog_init_buf(char *, struct nfslog_buf *, int *);
static void		nfslog_free_buf(struct nfslog_buf *, int);
static struct nfslog_lr *nfslog_read_buffer(struct nfslog_buf *);
static void		free_lrp(struct nfslog_lr *);
static struct nfslog_lr *remove_lrp_from_lb(struct nfslog_buf *,
			struct nfslog_lr *);
static void		insert_lrp_to_lb(struct nfslog_buf *,
			struct nfslog_lr *);
static void		nfslog_rewrite_bufheader(struct nfslog_buf *);
79 
80 /*
81  * Treat the provided path name as an NFS log buffer file.
82  * Allocate a data structure for its handling and initialize it.
83  * *error contains the previous error condition encountered for
84  * this object. This value can be used to avoid printing the last
85  * error endlessly.
86  * It will set *error appropriately after processing.
87  */
88 struct nfslog_buf *
89 nfslog_open_buf(char *bufpath, int *error)
90 {
91 	struct nfslog_buf	*lbp = NULL;
92 
93 	if (bufpath == NULL) {
94 		*error = EINVAL;
95 		return (NULL);
96 	}
97 
98 	if ((lbp = malloc(sizeof (struct nfslog_buf))) == NULL) {
99 		*error = ENOMEM;
100 		return (NULL);
101 	}
102 	bzero(lbp, sizeof (struct nfslog_buf));
103 
104 	if (nfslog_init_buf(bufpath, lbp, error)) {
105 		free(lbp);
106 		return (NULL);
107 	}
108 	return (lbp);
109 }
110 
/*
 * Release everything attached to the log buffer struct and then the
 * struct itself.  'close_quick' is passed through to nfslog_free_buf().
 * A NULL 'lbp' is accepted and ignored, in the manner of free().
 */
void
nfslog_close_buf(struct nfslog_buf *lbp, int close_quick)
{
	if (lbp == NULL)
		return;

	nfslog_free_buf(lbp, close_quick);
	free(lbp);
}
120 
121 /*
122  * Set up the log buffer struct; simple things are opening and locking
123  * the buffer file and then on to mmap()ing it for later use by the
124  * XDR decode path.  Make sure to read the buffer header before
125  * returning so that we will be at the first true log record.
126  *
127  * *error contains the last error encountered on this object. It can
128  * be used to avoid reporting the same error endlessly. It is reset
129  * to the current error code on return.
130  */
131 static int
132 nfslog_init_buf(char *bufpath, struct nfslog_buf *lbp, int *error)
133 {
134 	struct stat sb;
135 	int preverror = *error;
136 
137 	lbp->next = lbp;
138 	lbp->prev = lbp;
139 	/*
140 	 * set these values so that the free routine will know what to do
141 	 */
142 	lbp->mmap_addr = (intptr_t)MAP_FAILED;
143 	lbp->last_rec_id = MAXINT - 1;
144 	lbp->bh.bh_length = 0;
145 	lbp->bh_lrp = NULL;
146 	lbp->num_lrps = 0;
147 	lbp->lrps = NULL;
148 	lbp->last_record_offset = 0;
149 	lbp->prp = NULL;
150 	lbp->num_pr_queued = 0;
151 
152 	lbp->bufpath = strdup(bufpath);
153 	if (lbp->bufpath == NULL) {
154 		*error = ENOMEM;
155 		if (preverror != *error) {
156 			syslog(LOG_ERR, gettext("Cannot strdup '%s': %s"),
157 				bufpath, strerror(*error));
158 		}
159 		nfslog_free_buf(lbp, FALSE);
160 		return (*error);
161 	}
162 
163 	if ((lbp->fd = open(bufpath, O_RDWR)) < 0) {
164 		*error = errno;
165 		if (preverror != *error) {
166 			syslog(LOG_ERR, gettext("Cannot open '%s': %s"),
167 				bufpath, strerror(*error));
168 		}
169 		nfslog_free_buf(lbp, FALSE);
170 		return (*error);
171 	}
172 
173 	/*
174 	 * Lock the entire buffer file to prevent conflicting access.
175 	 * We get a write lock because we want only 1 process to be
176 	 * generating records from it.
177 	 */
178 	lbp->fl.l_type = F_WRLCK;
179 	lbp->fl.l_whence = SEEK_SET;		/* beginning of file */
180 	lbp->fl.l_start = (offset_t)0;
181 	lbp->fl.l_len = 0;			/* entire file */
182 	lbp->fl.l_sysid = 0;
183 	lbp->fl.l_pid = 0;
184 	if (fcntl(lbp->fd, F_SETLKW, &lbp->fl) == -1) {
185 		*error = errno;
186 		if (preverror != *error) {
187 			syslog(LOG_ERR, gettext("Cannot lock (%s): %s"),
188 				bufpath, strerror(*error));
189 		}
190 		nfslog_free_buf(lbp, FALSE);
191 		return (*error);
192 	}
193 
194 	if (fstat(lbp->fd, &sb)) {
195 		*error = errno;
196 		if (preverror != *error) {
197 			syslog(LOG_ERR, gettext("Cannot stat (%s): %s"),
198 				bufpath, strerror(*error));
199 		}
200 		nfslog_free_buf(lbp, FALSE);
201 		return (*error);
202 	}
203 	lbp->filesize = sb.st_size;
204 
205 	lbp->mmap_addr = (intptr_t)mmap(0, lbp->filesize, PROT_READ|PROT_WRITE,
206 		MAP_SHARED|MAP_NORESERVE, lbp->fd, 0);
207 
208 	/* This is part of the duality of the use of either mmap()|read() */
209 	if (lbp->mmap_addr == (intptr_t)MAP_FAILED) {
210 		lbp->next_rec = 0;
211 	} else {
212 		lbp->next_rec = lbp->mmap_addr;
213 	}
214 
215 	/* Read the header */
216 	if ((lbp->bh_lrp = nfslog_read_buffer(lbp)) == NULL) {
217 		*error = EIO;
218 		if (preverror != *error) {
219 			syslog(LOG_ERR, gettext(
220 				"error in reading file '%s': %s"),
221 				bufpath, strerror(EIO));
222 		}
223 		nfslog_free_buf(lbp, FALSE);
224 		return (*error);
225 	}
226 
227 	if (!xdr_nfslog_buffer_header(&lbp->bh_lrp->xdrs, &lbp->bh)) {
228 		*error = EIO;
229 		if (preverror != *error) {
230 			syslog(LOG_ERR, gettext(
231 				"error in reading file '%s': %s"),
232 				bufpath, strerror(*error));
233 		}
234 		nfslog_free_buf(lbp, FALSE);
235 		return (*error);
236 	}
237 
238 	/*
239 	 * Set the pointer to the next record based on the buffer header.
240 	 * 'lbp->bh.bh_offset' contains the offset of where to begin
241 	 * processing relative to the buffer header.
242 	 */
243 	lbp->next_rec += lbp->bh.bh_offset;
244 
245 	/*
246 	 * If we are going to be using read() for file data, then we may
247 	 * have to adjust the current file pointer to take into account
248 	 * a starting point other than the beginning of the file.
249 	 * If mmap is being used, this is taken care of as a side effect of
250 	 * setting up the value of next_rec.
251 	 */
252 	if (lbp->mmap_addr == (intptr_t)MAP_FAILED && lbp->next_rec != 0) {
253 		(void) lseek(lbp->fd, lbp->next_rec, SEEK_SET);
254 		/* This is a special case of setting the last_record_offset */
255 		lbp->last_record_offset = lbp->next_rec;
256 	} else {
257 		lbp->last_record_offset = lbp->next_rec - lbp->mmap_addr;
258 	}
259 
260 	return (*error = 0);
261 }
262 
263 /*
264  * Free the nfslog buffer and its associated allocations
265  */
266 static void
267 nfslog_free_buf(struct nfslog_buf *lbp, int close_quick)
268 {
269 	XDR	xdrs;
270 	int	error;
271 	caddr_t buffer;
272 	struct nfslog_lr *lrp, *lrp_next;
273 	struct processed_records *prp, *tprp;
274 
275 	/* work to free the offset records and rewrite header */
276 	if (lbp->prp) {
277 		if (lbp->last_record_offset == lbp->prp->start_offset) {
278 
279 			/* adjust the offset for the entire buffer */
280 			lbp->last_record_offset =
281 				lbp->prp->start_offset + lbp->prp->len;
282 
283 			nfslog_rewrite_bufheader(lbp);
284 		}
285 		if (close_quick)
286 			return;
287 		prp = lbp->prp;
288 		do {
289 			tprp = prp->next;
290 			free(prp);
291 			prp = tprp;
292 		} while (lbp->prp != prp);
293 	}
294 
295 	if (close_quick)
296 		return;
297 
298 	/* Take care of the queue log records first */
299 	if (lbp->lrps != NULL) {
300 		lrp = lbp->lrps;
301 		do {
302 			lrp_next = lrp->next;
303 			nfslog_free_logrecord(lrp, FALSE);
304 			lrp = lrp_next;
305 		} while (lrp != lbp->lrps);
306 		lbp->lrps = NULL;
307 	}
308 
309 	/* The buffer header was decoded and needs to be freed */
310 	if (lbp->bh.bh_length != 0) {
311 		buffer = (lbp->bh_lrp->buffer != NULL ?
312 			lbp->bh_lrp->buffer : (caddr_t)lbp->mmap_addr);
313 		xdrmem_create(&xdrs, buffer, lbp->bh_lrp->recsize, XDR_FREE);
314 		(void) xdr_nfslog_buffer_header(&xdrs, &lbp->bh);
315 		lbp->bh.bh_length = 0;
316 	}
317 
318 	/* get rid of the bufheader lrp */
319 	if (lbp->bh_lrp != NULL) {
320 		free_lrp(lbp->bh_lrp);
321 		lbp->bh_lrp = NULL;
322 	}
323 
324 	/* Clean up for mmap() usage */
325 	if (lbp->mmap_addr != (intptr_t)MAP_FAILED) {
326 		if (munmap((void *)lbp->mmap_addr, lbp->filesize)) {
327 			error = errno;
328 			syslog(LOG_ERR, gettext("munmap failed: %s: %s"),
329 				(lbp->bufpath != NULL ? lbp->bufpath : ""),
330 				strerror(error));
331 		}
332 		lbp->mmap_addr = (intptr_t)MAP_FAILED;
333 	}
334 
335 	/* Finally close the buffer file */
336 	if (lbp->fd >= 0) {
337 		lbp->fl.l_type = F_UNLCK;
338 		if (fcntl(lbp->fd, F_SETLK, &lbp->fl) == -1) {
339 			error = errno;
340 			syslog(LOG_ERR,
341 				gettext("Cannot unlock file %s: %s"),
342 				(lbp->bufpath != NULL ? lbp->bufpath : ""),
343 				strerror(error));
344 		}
345 		(void) close(lbp->fd);
346 		lbp->fd = -1;
347 	}
348 	if (lbp->bufpath != NULL)
349 		free(lbp->bufpath);
350 }
351 
352 /*
353  * We are reading a record from the log buffer file.  Since we are reading
354  * an XDR stream, we first have to read the first integer to determine
355  * how much to read in whole for this record.  Our preference is to use
356  * mmap() but if failed initially we will be using read().  Need to be
357  * careful about proper initialization of the log record both from a field
358  * perspective and for XDR decoding.
359  */
360 static struct nfslog_lr *
361 nfslog_read_buffer(struct nfslog_buf *lbp)
362 {
363 	XDR xdrs;
364 	unsigned int	record_size;
365 	struct nfslog_lr *lrp;
366 	char		*sizebuf, tbuf[16];
367 	caddr_t		buffer;
368 	offset_t	next_rec;
369 
370 	lrp = (struct nfslog_lr *)malloc(sizeof (*lrp));
371 	bzero(lrp, sizeof (*lrp));
372 
373 	/* Check to see if mmap worked */
374 	if (lbp->mmap_addr == (intptr_t)MAP_FAILED) {
375 		/*
376 		 * EOF or other failure; we don't try to recover, just return
377 		 */
378 		if (read(lbp->fd, tbuf, BYTES_PER_XDR_UNIT) <= 0) {
379 			free_lrp(lrp);
380 			return (NULL);
381 		}
382 		sizebuf = tbuf;
383 	} else {
384 		/* EOF check for the mmap() case */
385 		if (lbp->filesize <= lbp->next_rec - lbp->mmap_addr) {
386 			free_lrp(lrp);
387 			return (NULL);
388 		}
389 		sizebuf = (char *)(uintptr_t)lbp->next_rec;
390 	}
391 
392 	/* We have to XDR the first int so we know how much is in this record */
393 	xdrmem_create(&xdrs, sizebuf, sizeof (unsigned int), XDR_DECODE);
394 
395 	if (!xdr_u_int(&xdrs, &record_size)) {
396 		free_lrp(lrp);
397 		return (NULL);
398 	}
399 
400 	lrp->recsize = record_size;
401 	next_rec = lbp->next_rec + lrp->recsize;
402 
403 	if (lbp->mmap_addr == (intptr_t)MAP_FAILED) {
404 		/*
405 		 * Read() case - shouldn't be used very much.
406 		 * Note: The 'buffer' field is used later on
407 		 * to determine which method is being used mmap()|read()
408 		 */
409 		if (lbp->filesize < next_rec) {
410 			/* partial record from buffer */
411 			syslog(LOG_ERR, gettext(
412 				"Last partial record in work buffer %s "
413 				"discarded\n"), lbp->bufpath);
414 			free_lrp(lrp);
415 			return (NULL);
416 		}
417 
418 		if ((lrp->buffer = malloc(lrp->recsize)) == NULL) {
419 			free_lrp(lrp);
420 			return (NULL);
421 		}
422 		bcopy(sizebuf, lrp->buffer, BYTES_PER_XDR_UNIT);
423 		if (read(lbp->fd, &lrp->buffer[BYTES_PER_XDR_UNIT],
424 			lrp->recsize - BYTES_PER_XDR_UNIT) <= 0) {
425 			free_lrp(lrp);
426 			return (NULL);
427 		}
428 	} else if (lbp->filesize < next_rec - lbp->mmap_addr) {
429 			/* partial record from buffer */
430 			syslog(LOG_ERR, gettext(
431 				"Last partial record in work buffer %s "
432 				"discarded\n"), lbp->bufpath);
433 			free_lrp(lrp);
434 			return (NULL);
435 	}
436 
437 
438 	/* other initializations */
439 	lrp->next = lrp->prev = lrp;
440 	/* Keep track of the offset at which this record was read */
441 	if (lbp->mmap_addr == (intptr_t)MAP_FAILED)
442 		lrp->f_offset = lbp->next_rec;
443 	else
444 		lrp->f_offset = lbp->next_rec - lbp->mmap_addr;
445 	/* This is the true address of the record */
446 	lrp->record = lbp->next_rec;
447 	lrp->xdrargs = lrp->xdrres = NULL;
448 	lrp->lbp = lbp;
449 
450 	/* Here is the logic for mmap() vs. read() */
451 	buffer = (lrp->buffer != NULL ? lrp->buffer : (caddr_t)lrp->record);
452 
453 	/* Setup for the 'real' XDR decode of the entire record */
454 	xdrmem_create(&lrp->xdrs, buffer, lrp->recsize, XDR_DECODE);
455 
456 	/* calculate the offset for the next record */
457 	lbp->next_rec = next_rec;
458 
459 	return (lrp);
460 }
461 
462 /*
463  * Simple removal of the log record from the log buffer queue.
464  * Make sure to manage the count of records queued.
465  */
466 static struct nfslog_lr *
467 remove_lrp_from_lb(struct nfslog_buf *lbp, struct nfslog_lr *lrp)
468 {
469 	if (lbp->lrps == lrp) {
470 		if (lbp->lrps == lbp->lrps->next) {
471 			lbp->lrps = NULL;
472 		} else {
473 			lbp->lrps = lrp->next;
474 			remque(lrp);
475 		}
476 	} else {
477 		remque(lrp);
478 	}
479 	lbp->num_lrps--;
480 	return (lrp);
481 }
482 
483 /*
484  * Insert a log record struct on the log buffer struct.  The log buffer
485  * has a pointer to the head of a queue of log records that have been
486  * read from the buffer file but have not been processed yet because
487  * the record id did not match the sequence desired for processing.
488  * The insertion must be in the 'correct'/sorted order which adds
489  * to the complexity of this function.
490  */
491 static void
492 insert_lrp_to_lb(struct nfslog_buf *lbp, struct nfslog_lr *lrp)
493 {
494 	int ins_rec_id = lrp->log_record.re_header.rh_rec_id;
495 	struct nfslog_lr *curlrp;
496 
497 	if (lbp->lrps == NULL) {
498 		/* that was easy */
499 		lbp->lrps = lrp;
500 	} else {
501 		/*
502 		 * Does this lrp go before the first on the list?
503 		 * If so, do the insertion by hand since insque is not
504 		 * as flexible when queueing an element to the head of
505 		 * a list.
506 		 */
507 		if (ins_rec_id < lbp->lrps->log_record.re_header.rh_rec_id) {
508 			lrp->next = lbp->lrps;
509 			lrp->prev = lbp->lrps->prev;
510 			lbp->lrps->prev->next = lrp;
511 			lbp->lrps->prev = lrp;
512 			lbp->lrps = lrp;
513 		} else {
514 			/*
515 			 * Search the queue for the correct insertion point.
516 			 * Be careful about the insque so that the record
517 			 * ends up in the right place.
518 			 */
519 			curlrp = lbp->lrps;
520 			do {
521 				if (ins_rec_id <
522 				curlrp->next->log_record.re_header.rh_rec_id)
523 					break;
524 				curlrp = curlrp->next;
525 			} while (curlrp != lbp->lrps);
526 			if (curlrp == lbp->lrps)
527 				insque(lrp, lbp->lrps->prev);
528 			else
529 				insque(lrp, curlrp);
530 		}
531 	}
532 	/* always keep track of how many we have */
533 	lbp->num_lrps++;
534 }
535 
536 /*
537  * We are rewriting the buffer header at the start of the log buffer
538  * for the sole purpose of resetting the bh_offset field.  This is
539  * supposed to represent the progress that the nfslogd daemon has made
540  * in its processing of the log buffer file.
541  * 'lbp->last_record_offset' contains the absolute offset of the end
542  * of the last element processed. The on-disk buffer offset is relative
543  * to the buffer header, therefore we subtract the length of the buffer
544  * header from the absolute offset.
545  */
546 static void
547 nfslog_rewrite_bufheader(struct nfslog_buf *lbp)
548 {
549 	XDR xdrs;
550 	nfslog_buffer_header bh;
551 	/* size big enough for buffer header encode */
552 #define	XBUFSIZE 128
553 	char buffer[XBUFSIZE];
554 	unsigned int wsize;
555 
556 	/*
557 	 * if version 1 buffer is large and the current offset cannot be
558 	 * represented, then don't update the offset in the buffer.
559 	 */
560 	if (lbp->bh.bh_flags & NFSLOG_BH_OFFSET_OVERFLOW) {
561 		/* No need to update the header - offset too big */
562 		return;
563 	}
564 	/*
565 	 * build the buffer header from the original that was saved
566 	 * on initialization; note that the offset is taken from the
567 	 * last record processed (the last offset that represents
568 	 * all records processed without any holes in the processing)
569 	 */
570 	bh = lbp->bh;
571 
572 	/*
573 	 * if version 1 buffer is large and the current offset cannot be
574 	 * represented in 32 bits, then save only the last valid offset
575 	 * in the buffer and mark the flags to indicate that.
576 	 */
577 	if ((bh.bh_version > 1) ||
578 		(lbp->last_record_offset - bh.bh_length < UINT32_MAX)) {
579 		bh.bh_offset = lbp->last_record_offset - bh.bh_length;
580 	} else {
581 		/* don't update the offset in the buffer */
582 		bh.bh_flags |= NFSLOG_BH_OFFSET_OVERFLOW;
583 		lbp->bh.bh_flags = bh.bh_flags;
584 		syslog(LOG_ERR, gettext(
585 			"nfslog_rewrite_bufheader: %s: offset does not fit "
586 			"in a 32 bit field\n"), lbp->bufpath);
587 	}
588 
589 	xdrmem_create(&xdrs, buffer, XBUFSIZE, XDR_ENCODE);
590 
591 	if (!xdr_nfslog_buffer_header(&xdrs, &bh)) {
592 		syslog(LOG_ERR, gettext(
593 			"error in re-writing buffer file %s header\n"),
594 			lbp->bufpath);
595 		return;
596 	}
597 
598 	wsize = xdr_getpos(&xdrs);
599 
600 	if (lbp->mmap_addr == (intptr_t)MAP_FAILED) {
601 		/* go to the beginning of the file */
602 		(void) lseek(lbp->fd, 0, SEEK_SET);
603 		(void) write(lbp->fd, buffer, wsize);
604 		(void) lseek(lbp->fd, lbp->next_rec, SEEK_SET);
605 		(void) fsync(lbp->fd);
606 	} else {
607 		bcopy(buffer, (void *)lbp->mmap_addr, wsize);
608 		(void) msync((void *)lbp->mmap_addr, wsize, MS_SYNC);
609 	}
610 }
611 
612 /*
613  * With the provided lrp, we will take and 'insert' the range that the
614  * record covered in the buffer file into a list of processed ranges
615  * for the buffer file.  These ranges represent the records processed
616  * but not 'marked' in the buffer header as being processed.
617  * This insertion process is being done for two reasons.  The first is that
618  * we do not want to pay the performance penalty of re-writing the buffer header
619  * for each record that we process.  The second reason is that the records
620  * may be processed out of order because of the unique ids.  This will occur
621  * if the kernel has written the records to the buffer file out of order.
622  * The read routine will 'sort' them as the records are read.
623  *
624  * We do not want to re-write the buffer header such that a record is
625  * represented and being processed when it has not been.  In the case
626  * that the nfslogd daemon restarts processing and the buffer header
627  * has been re-written improperly, some records could be skipped.
628  * We will be taking the conservative approach and only writing buffer
629  * header offsets when the entire offset range has been processed.
630  */
631 static void
632 nfslog_ins_last_rec_processed(struct nfslog_lr *lrp)
633 {
634 	struct processed_records *prp, *tp;
635 
636 	/* init the data struct as if it were the only one */
637 	prp = malloc(sizeof (*prp));
638 	prp->next = prp->prev = prp;
639 	prp->start_offset = lrp->f_offset;
640 	prp->len = lrp->recsize;
641 	prp->num_recs = 1;
642 
643 	/* always add since we know we are going to insert */
644 	lrp->lbp->num_pr_queued++;
645 
646 	/* Is this the first one?  If so, take the easy way out */
647 	if (lrp->lbp->prp == NULL) {
648 		lrp->lbp->prp = prp;
649 	} else {
650 		/* sort on insertion... */
651 		tp = lrp->lbp->prp;
652 		do {
653 			if (prp->start_offset < tp->start_offset)
654 				break;
655 			tp = tp->next;
656 		} while (tp != lrp->lbp->prp);
657 		/* insert where appropriate (before the one we found */
658 		insque(prp, tp->prev);
659 		/*
660 		 * special case where the insertion was done at the
661 		 * head of the list
662 		 */
663 		if (tp == lrp->lbp->prp && prp->start_offset < tp->start_offset)
664 			lrp->lbp->prp = prp;
665 
666 		/*
667 		 * now that the entry is in place, we need to see if it can
668 		 * be combined with the previous or following entries.
669 		 * combination is done by adding to the length.
670 		 */
671 		if (prp->start_offset ==
672 			(prp->prev->start_offset + prp->prev->len)) {
673 			tp = prp->prev;
674 			remque(prp);
675 			tp->len += prp->len;
676 			tp->num_recs += prp->num_recs;
677 			free(prp);
678 			prp = tp;
679 		}
680 		if (prp->next->start_offset ==
681 			(prp->start_offset + prp->len)) {
682 			prp->len += prp->next->len;
683 			prp->num_recs += prp->next->num_recs;
684 			tp = prp->next;
685 			remque(tp);
686 			free(tp);
687 		}
688 	}
689 
690 	if (lrp->lbp->num_pr_queued > MAX_RECS_TO_DELAY) {
691 		prp = lrp->lbp->prp;
692 		if (lrp->lbp->last_record_offset ==
693 			prp->start_offset) {
694 
695 			/* adjust the offset for the entire buffer */
696 			lrp->lbp->last_record_offset =
697 				prp->start_offset + prp->len;
698 
699 			nfslog_rewrite_bufheader(lrp->lbp);
700 
701 			tp = prp->next;
702 			if (tp != prp)
703 				remque(prp);
704 			else
705 				tp = NULL;
706 			lrp->lbp->prp = tp;
707 			lrp->lbp->num_pr_queued -= prp->num_recs;
708 			free(prp);
709 		}
710 	}
711 }
712 
713 /*
714  * nfslog_get_logrecord is responsible for retrieving the next log record
715  * from the buffer file. This would normally be very straightforward but there
716  * is the added complexity of attempting to order the requests coming out of
717  * the buffer file.  The fundamental problems is that the kernel nfs logging
718  * functionality does not guarantee that the records were written to the file
719  * in the order that the NFS server processed them.  This can cause a problem
720  * in the fh -> pathname mapping in the case were a lookup for a file comes
721  * later in the buffer file than other operations on the lookup's target.
722  * The fh mapping database will not have an entry and will therefore not
723  * be able to map the fh to a name.
724  *
725  * So to solve this problem, the kernel nfs logging code tags each record
726  * with a monotonically increasing id and is guaranteed to be allocated
727  * in the order that the requests were processed.  Realize however that
728  * this processing guarantee is essentially for one thread on one client.
729  * This id mechanism does not order all requests since it is only the
730  * single client/single thread case that is most concerning to us here.
731  *
732  * This function will do the 'sorting' of the requests as they are
733  * read from the buffer file.  The sorting needs to take into account
734  * that some ids may be missing (operations not logged but ids allocated)
735  * and that the id field will eventually wrap over MAXINT.
736  *
737  * Complexity to solve the fh -> pathname mapping issue.
738  */
739 struct nfslog_lr *
740 nfslog_get_logrecord(struct nfslog_buf *lbp)
741 {
742 	/* figure out what the next should be if the world were perfect */
743 	unsigned int next_rec_id = lbp->last_rec_id + 1;
744 	struct nfslog_lr *lrp = NULL;
745 
746 	/*
747 	 * First we check the queued records on the log buffer struct
748 	 * to see if the one we want is there.  The records are sorted
749 	 * on the record id during the insertions to the queue so that
750 	 * this check is easy.
751 	 */
752 	if (lbp->lrps != NULL) {
753 		/* Does the first record match ? */
754 		if (lbp->lrps->log_record.re_header.rh_rec_id == next_rec_id) {
755 			lrp = remove_lrp_from_lb(lbp, lbp->lrps);
756 			lbp->last_rec_id = lrp->log_record.re_header.rh_rec_id;
757 		} else {
758 			/*
759 			 * Here we are checking for wrap of the record id
760 			 * since it is an unsigned in.  The idea is that
761 			 * if there is a huge span between what we expect
762 			 * and what is queued then we need to flush/empty
763 			 * the queued records first.
764 			 */
765 			if (next_rec_id <
766 				lbp->lrps->log_record.re_header.rh_rec_id &&
767 				((lbp->lrps->log_record.re_header.rh_rec_id -
768 					next_rec_id) > (MAXINT / 2))) {
769 
770 				lrp = remove_lrp_from_lb(lbp, lbp->lrps);
771 				lbp->last_rec_id =
772 					lrp->log_record.re_header.rh_rec_id;
773 			}
774 		}
775 	}
776 	/*
777 	 * So the first queued record didn't match (or there were no queued
778 	 * records to look at).  Now we go to the buffer file looking for
779 	 * the expected log record based on its id.  We loop looking for
780 	 * a matching records and save/queue the records that don't match.
781 	 * Note that we will queue a maximum number to handle the case
782 	 * of a missing record id or a queue that is very confused.  We don't
783 	 * want to consume too much memory.
784 	 */
785 	while (lrp == NULL) {
786 		/* Have we queued too many for this buffer? */
787 		if (lbp->num_lrps >= MAX_LRS_READ_AHEAD) {
788 			lrp = remove_lrp_from_lb(lbp, lbp->lrps);
789 			lbp->last_rec_id = lrp->log_record.re_header.rh_rec_id;
790 			break;
791 		}
792 		/*
793 		 * Get a record from the buffer file.  If none are available,
794 		 * this is probably and EOF condition (could be a read error
795 		 * as well but that is masked. :-().  No records in the
796 		 * file means that we need to pull any queued records
797 		 * so that we don't miss any in the processing.
798 		 */
799 		if ((lrp = nfslog_read_buffer(lbp)) == NULL) {
800 			if (lbp->lrps != NULL) {
801 				lrp = remove_lrp_from_lb(lbp, lbp->lrps);
802 				lbp->last_rec_id =
803 					lrp->log_record.re_header.rh_rec_id;
804 			} else {
805 				return (NULL);  /* it was really and EOF */
806 			}
807 		} else {
808 			/*
809 			 * Just read a record from the buffer file and now we
810 			 * need to XDR the record header so that we can take
811 			 * a look at the record id.
812 			 */
813 			if (!xdr_nfslog_request_record(&lrp->xdrs,
814 				&lrp->log_record)) {
815 				/* Free and return EOF/NULL on error */
816 				nfslog_free_logrecord(lrp, FALSE);
817 				return (NULL);
818 			}
819 			/*
820 			 * If the new record is less than or matches the
821 			 * expected record id, then we return this record
822 			 */
823 			if (lrp->log_record.re_header.rh_rec_id <=
824 				next_rec_id) {
825 
826 				lbp->last_rec_id =
827 					lrp->log_record.re_header.rh_rec_id;
828 			} else {
829 				/*
830 				 * This is not the one we were looking
831 				 * for; queue it for later processing
832 				 * (queueing sorts on record id)
833 				 */
834 				insert_lrp_to_lb(lbp, lrp);
835 				lrp = NULL;
836 			}
837 		}
838 	}
839 	return (lrp);
840 }
841 
842 /*
843  * Free the log record provided.
844  * This is complex because the associated XDR streams also need to be freed
845  * since allocation could have occured during the DECODE phase.  The record
846  * header, args and results need to be XDR_FREEd.  The xdr funtions will
847  * be provided if a free needs to be done.
848  *
849  * Note that caller tells us if the record being freed was processed.
850  * If so, then the buffer header should be updated.  Updating the buffer
851  * header keeps track of where the nfslogd daemon left off in its processing
852  * if it is unable to complete the entire file.
853  */
854 void
855 nfslog_free_logrecord(struct nfslog_lr *lrp, bool_t processing_complete)
856 {
857 	caddr_t			buffer;
858 	nfslog_request_record 	*reqrec;
859 
860 	if (processing_complete) {
861 		nfslog_ins_last_rec_processed(lrp);
862 	}
863 
864 	reqrec = &lrp->log_record;
865 
866 	buffer = (lrp->buffer != NULL ? lrp->buffer : (caddr_t)lrp->record);
867 
868 	xdrmem_create(&lrp->xdrs, buffer, lrp->recsize, XDR_FREE);
869 
870 	(void) xdr_nfslog_request_record(&lrp->xdrs, reqrec);
871 
872 	if (lrp->xdrargs != NULL && reqrec->re_rpc_arg)
873 		(*lrp->xdrargs)(&lrp->xdrs, reqrec->re_rpc_arg);
874 
875 	if (reqrec->re_rpc_arg)
876 		free(reqrec->re_rpc_arg);
877 
878 	if (lrp->xdrres != NULL && reqrec->re_rpc_res)
879 		(*lrp->xdrres)(&lrp->xdrs, reqrec->re_rpc_res);
880 
881 	if (reqrec->re_rpc_res)
882 		free(reqrec->re_rpc_res);
883 
884 	free_lrp(lrp);
885 }
886 
887 static void
888 free_lrp(struct nfslog_lr *lrp)
889 {
890 	if (lrp->buffer != NULL)
891 		free(lrp->buffer);
892 	free(lrp);
893 }
894 
/*
 * Utility function used elsewhere.
 *
 * Append a hex dump of the 'len' bytes at 'buf' to 'outbuf', enclosed
 * in double quotes and preceded by a space, starting at offset
 * *outbufoffsetp.  On return *outbufoffsetp is the offset just past the
 * appended text.
 *
 * Buffers of at most sizeof (int) bytes are printed as one run of hex
 * digits.  Longer buffers are printed in groups: the bytes up to the
 * first int-aligned address, then one " %08x" group per whole int (the
 * value as held in memory, so the digit order follows host byte order),
 * then any remaining bytes.  A 'len' of zero or less prints an empty
 * pair of quotes.
 *
 * 'maxoffset' is only tested before each piece is appended: output
 * stops once the offset has reached it, but the piece being written
 * may end a few bytes beyond it, so the caller has to leave slack in
 * 'outbuf'.
 */
void
nfslog_opaque_print_buf(void *buf, int len, char *outbuf, int *outbufoffsetp,
	int maxoffset)
{
	const int	intsz = (int)sizeof (int);
	const unsigned char *bp = buf;
	unsigned int	word;
	int	i = 0, j = 0;
	int	outbufoffset = *outbufoffsetp;

	outbufoffset += sprintf(&outbuf[outbufoffset], " \"");
	if (len <= intsz) {
		/* short buffer: a single run of bytes */
		for (; (j < len) && (outbufoffset < maxoffset); j++, bp++)
			outbufoffset += sprintf(&outbuf[outbufoffset],
						"%02x", *bp);
	} else {
		/* More than 4 bytes, print with spaces in integer offsets */
		j = (int)((uintptr_t)buf % sizeof (int));
		if (j > 0) {
			/* bytes in front of the first aligned int */
			i = intsz - j;
			for (; (j < intsz) && (outbufoffset < maxoffset);
				j++, bp++)
				outbufoffset += sprintf(&outbuf[outbufoffset],
							"%02x", *bp);
		}
		for (; ((i + intsz) <= len) && (outbufoffset < maxoffset);
			i += intsz, bp += intsz) {
			/* copy out rather than read through a cast pointer */
			(void) memcpy(&word, bp, sizeof (word));
			outbufoffset += sprintf(&outbuf[outbufoffset],
						" %08x", word);
		}
		if (i < len) {
			/* Last element not int */
			if (i > j)	/* not first element */
				outbufoffset += sprintf(&outbuf[outbufoffset],
							" ");
			for (; (i < len) && (outbufoffset < maxoffset);
				i++, bp++) {
				outbufoffset += sprintf(&outbuf[outbufoffset],
							"%02x", *bp);
			}
		}
	}
	if (outbufoffset < maxoffset)
		outbufoffset += sprintf(&outbuf[outbufoffset], "\"");
	*outbufoffsetp = outbufoffset;
}
945