xref: /titanic_50/usr/src/cmd/fs.d/nfs/nfslog/readbuf.c (revision 174bc6499d233e329ecd3d98a880a7b07df16bfa)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * nfs log - read buffer file and return structs in usable form
31  */
32 
33 #include <ctype.h>
34 #include <stdio.h>
35 #include <stdlib.h>
36 #include <stddef.h>
37 #include <string.h>
38 #include <fcntl.h>
39 #include <unistd.h>
40 #include <signal.h>
41 #include <sys/types.h>
42 #include <sys/param.h>
43 #include <sys/stat.h>
44 #include <sys/utsname.h>
45 #include <sys/mman.h>
46 #include <strings.h>
47 #include <errno.h>
48 #include <syslog.h>
49 #include <time.h>
50 #include <limits.h>
51 #include <libintl.h>
52 #include <values.h>
53 #include <search.h>
54 #include <pwd.h>
55 #include <netdb.h>
56 #include <rpc/rpc.h>
57 #include <netconfig.h>
58 #include <netdir.h>
59 #include <nfs/nfs_sec.h>
60 #include <nfs/export.h>
61 #include <rpc/auth.h>
62 #include <rpc/svc.h>
63 #include <rpc/xdr.h>
64 #include <rpc/clnt.h>
65 #include <nfs/nfs.h>
66 #include <nfs/nfs_log.h>
67 #include "nfslogd.h"
68 
/*
 * Upper bound on the number of out-of-sequence log records that
 * nfslog_get_logrecord() keeps queued on a buffer; once reached, the
 * head of the queue is handed back instead of reading further ahead.
 */
#define	MAX_LRS_READ_AHEAD 2048
/*
 * Number of processed records that nfslog_ins_last_rec_processed() lets
 * accumulate before it attempts to rewrite the on-disk buffer header.
 */
#define	MAX_RECS_TO_DELAY 32768

/* Forward declarations of the file-local helpers defined below */
static int 		nfslog_init_buf(char *, struct nfslog_buf *, int *);
static void		nfslog_free_buf(struct nfslog_buf *, int);
static struct nfslog_lr *nfslog_read_buffer(struct nfslog_buf *);
static void		free_lrp(struct nfslog_lr *);
static struct nfslog_lr *remove_lrp_from_lb(struct nfslog_buf *,
			struct nfslog_lr *);
static void		insert_lrp_to_lb(struct nfslog_buf *,
			struct nfslog_lr *);
static void		nfslog_rewrite_bufheader(struct nfslog_buf *);
81 
82 /*
83  * Treat the provided path name as an NFS log buffer file.
84  * Allocate a data structure for its handling and initialize it.
85  * *error contains the previous error condition encountered for
86  * this object. This value can be used to avoid printing the last
87  * error endlessly.
88  * It will set *error appropriately after processing.
89  */
90 struct nfslog_buf *
91 nfslog_open_buf(char *bufpath, int *error)
92 {
93 	struct nfslog_buf	*lbp = NULL;
94 
95 	if (bufpath == NULL) {
96 		*error = EINVAL;
97 		return (NULL);
98 	}
99 
100 	if ((lbp = malloc(sizeof (struct nfslog_buf))) == NULL) {
101 		*error = ENOMEM;
102 		return (NULL);
103 	}
104 	bzero(lbp, sizeof (struct nfslog_buf));
105 
106 	if (nfslog_init_buf(bufpath, lbp, error)) {
107 		free(lbp);
108 		return (NULL);
109 	}
110 	return (lbp);
111 }
112 
/*
 * Release everything attached to the log buffer handle via
 * nfslog_free_buf() and then free the handle itself.
 * 'close_quick' is passed through unchanged; when it is set,
 * nfslog_free_buf() returns after its buffer-header update without
 * releasing the queued records, the mapping or the file descriptor.
 */
void
nfslog_close_buf(struct nfslog_buf *lbp, int close_quick)
{
	nfslog_free_buf(lbp, close_quick);
	free(lbp);
}
122 
123 /*
124  * Set up the log buffer struct; simple things are opening and locking
125  * the buffer file and then on to mmap()ing it for later use by the
126  * XDR decode path.  Make sure to read the buffer header before
127  * returning so that we will be at the first true log record.
128  *
129  * *error contains the last error encountered on this object. It can
130  * be used to avoid reporting the same error endlessly. It is reset
131  * to the current error code on return.
132  */
133 static int
134 nfslog_init_buf(char *bufpath, struct nfslog_buf *lbp, int *error)
135 {
136 	struct stat sb;
137 	int preverror = *error;
138 
139 	lbp->next = lbp;
140 	lbp->prev = lbp;
141 	/*
142 	 * set these values so that the free routine will know what to do
143 	 */
144 	lbp->mmap_addr = (intptr_t)MAP_FAILED;
145 	lbp->last_rec_id = MAXINT - 1;
146 	lbp->bh.bh_length = 0;
147 	lbp->bh_lrp = NULL;
148 	lbp->num_lrps = 0;
149 	lbp->lrps = NULL;
150 	lbp->last_record_offset = 0;
151 	lbp->prp = NULL;
152 	lbp->num_pr_queued = 0;
153 
154 	lbp->bufpath = strdup(bufpath);
155 	if (lbp->bufpath == NULL) {
156 		*error = ENOMEM;
157 		if (preverror != *error) {
158 			syslog(LOG_ERR, gettext("Cannot strdup '%s': %s"),
159 				bufpath, strerror(*error));
160 		}
161 		nfslog_free_buf(lbp, FALSE);
162 		return (*error);
163 	}
164 
165 	if ((lbp->fd = open(bufpath, O_RDWR)) < 0) {
166 		*error = errno;
167 		if (preverror != *error) {
168 			syslog(LOG_ERR, gettext("Cannot open '%s': %s"),
169 				bufpath, strerror(*error));
170 		}
171 		nfslog_free_buf(lbp, FALSE);
172 		return (*error);
173 	}
174 
175 	/*
176 	 * Lock the entire buffer file to prevent conflicting access.
177 	 * We get a write lock because we want only 1 process to be
178 	 * generating records from it.
179 	 */
180 	lbp->fl.l_type = F_WRLCK;
181 	lbp->fl.l_whence = SEEK_SET;		/* beginning of file */
182 	lbp->fl.l_start = (offset_t)0;
183 	lbp->fl.l_len = 0;			/* entire file */
184 	lbp->fl.l_sysid = 0;
185 	lbp->fl.l_pid = 0;
186 	if (fcntl(lbp->fd, F_SETLKW, &lbp->fl) == -1) {
187 		*error = errno;
188 		if (preverror != *error) {
189 			syslog(LOG_ERR, gettext("Cannot lock (%s): %s"),
190 				bufpath, strerror(*error));
191 		}
192 		nfslog_free_buf(lbp, FALSE);
193 		return (*error);
194 	}
195 
196 	if (fstat(lbp->fd, &sb)) {
197 		*error = errno;
198 		if (preverror != *error) {
199 			syslog(LOG_ERR, gettext("Cannot stat (%s): %s"),
200 				bufpath, strerror(*error));
201 		}
202 		nfslog_free_buf(lbp, FALSE);
203 		return (*error);
204 	}
205 	lbp->filesize = sb.st_size;
206 
207 	lbp->mmap_addr = (intptr_t)mmap(0, lbp->filesize, PROT_READ|PROT_WRITE,
208 		MAP_SHARED|MAP_NORESERVE, lbp->fd, 0);
209 
210 	/* This is part of the duality of the use of either mmap()|read() */
211 	if (lbp->mmap_addr == (intptr_t)MAP_FAILED) {
212 		lbp->next_rec = 0;
213 	} else {
214 		lbp->next_rec = lbp->mmap_addr;
215 	}
216 
217 	/* Read the header */
218 	if ((lbp->bh_lrp = nfslog_read_buffer(lbp)) == NULL) {
219 		*error = EIO;
220 		if (preverror != *error) {
221 			syslog(LOG_ERR, gettext(
222 				"error in reading file '%s': %s"),
223 				bufpath, strerror(EIO));
224 		}
225 		nfslog_free_buf(lbp, FALSE);
226 		return (*error);
227 	}
228 
229 	if (!xdr_nfslog_buffer_header(&lbp->bh_lrp->xdrs, &lbp->bh)) {
230 		*error = EIO;
231 		if (preverror != *error) {
232 			syslog(LOG_ERR, gettext(
233 				"error in reading file '%s': %s"),
234 				bufpath, strerror(*error));
235 		}
236 		nfslog_free_buf(lbp, FALSE);
237 		return (*error);
238 	}
239 
240 	/*
241 	 * Set the pointer to the next record based on the buffer header.
242 	 * 'lbp->bh.bh_offset' contains the offset of where to begin
243 	 * processing relative to the buffer header.
244 	 */
245 	lbp->next_rec += lbp->bh.bh_offset;
246 
247 	/*
248 	 * If we are going to be using read() for file data, then we may
249 	 * have to adjust the current file pointer to take into account
250 	 * a starting point other than the beginning of the file.
251 	 * If mmap is being used, this is taken care of as a side effect of
252 	 * setting up the value of next_rec.
253 	 */
254 	if (lbp->mmap_addr == (intptr_t)MAP_FAILED && lbp->next_rec != 0) {
255 		(void) lseek(lbp->fd, lbp->next_rec, SEEK_SET);
256 		/* This is a special case of setting the last_record_offset */
257 		lbp->last_record_offset = lbp->next_rec;
258 	} else {
259 		lbp->last_record_offset = lbp->next_rec - lbp->mmap_addr;
260 	}
261 
262 	return (*error = 0);
263 }
264 
265 /*
266  * Free the nfslog buffer and its associated allocations
267  */
268 static void
269 nfslog_free_buf(struct nfslog_buf *lbp, int close_quick)
270 {
271 	XDR	xdrs;
272 	int	error;
273 	caddr_t buffer;
274 	struct nfslog_lr *lrp, *lrp_next;
275 	struct processed_records *prp, *tprp;
276 
277 	/* work to free the offset records and rewrite header */
278 	if (lbp->prp) {
279 		if (lbp->last_record_offset == lbp->prp->start_offset) {
280 
281 			/* adjust the offset for the entire buffer */
282 			lbp->last_record_offset =
283 				lbp->prp->start_offset + lbp->prp->len;
284 
285 			nfslog_rewrite_bufheader(lbp);
286 		}
287 		if (close_quick)
288 			return;
289 		prp = lbp->prp;
290 		do {
291 			tprp = prp->next;
292 			free(prp);
293 			prp = tprp;
294 		} while (lbp->prp != prp);
295 	}
296 
297 	if (close_quick)
298 		return;
299 
300 	/* Take care of the queue log records first */
301 	if (lbp->lrps != NULL) {
302 		lrp = lbp->lrps;
303 		do {
304 			lrp_next = lrp->next;
305 			nfslog_free_logrecord(lrp, FALSE);
306 			lrp = lrp_next;
307 		} while (lrp != lbp->lrps);
308 		lbp->lrps = NULL;
309 	}
310 
311 	/* The buffer header was decoded and needs to be freed */
312 	if (lbp->bh.bh_length != 0) {
313 		buffer = (lbp->bh_lrp->buffer != NULL ?
314 			lbp->bh_lrp->buffer : (caddr_t)lbp->mmap_addr);
315 		xdrmem_create(&xdrs, buffer, lbp->bh_lrp->recsize, XDR_FREE);
316 		(void) xdr_nfslog_buffer_header(&xdrs, &lbp->bh);
317 		lbp->bh.bh_length = 0;
318 	}
319 
320 	/* get rid of the bufheader lrp */
321 	if (lbp->bh_lrp != NULL) {
322 		free_lrp(lbp->bh_lrp);
323 		lbp->bh_lrp = NULL;
324 	}
325 
326 	/* Clean up for mmap() usage */
327 	if (lbp->mmap_addr != (intptr_t)MAP_FAILED) {
328 		if (munmap((void *)lbp->mmap_addr, lbp->filesize)) {
329 			error = errno;
330 			syslog(LOG_ERR, gettext("munmap failed: %s: %s"),
331 				(lbp->bufpath != NULL ? lbp->bufpath : ""),
332 				strerror(error));
333 		}
334 		lbp->mmap_addr = (intptr_t)MAP_FAILED;
335 	}
336 
337 	/* Finally close the buffer file */
338 	if (lbp->fd >= 0) {
339 		lbp->fl.l_type = F_UNLCK;
340 		if (fcntl(lbp->fd, F_SETLK, &lbp->fl) == -1) {
341 			error = errno;
342 			syslog(LOG_ERR,
343 				gettext("Cannot unlock file %s: %s"),
344 				(lbp->bufpath != NULL ? lbp->bufpath : ""),
345 				strerror(error));
346 		}
347 		(void) close(lbp->fd);
348 		lbp->fd = -1;
349 	}
350 	if (lbp->bufpath != NULL)
351 		free(lbp->bufpath);
352 }
353 
354 /*
355  * We are reading a record from the log buffer file.  Since we are reading
356  * an XDR stream, we first have to read the first integer to determine
357  * how much to read in whole for this record.  Our preference is to use
358  * mmap() but if failed initially we will be using read().  Need to be
359  * careful about proper initialization of the log record both from a field
360  * perspective and for XDR decoding.
361  */
362 static struct nfslog_lr *
363 nfslog_read_buffer(struct nfslog_buf *lbp)
364 {
365 	XDR xdrs;
366 	unsigned int	record_size;
367 	struct nfslog_lr *lrp;
368 	char		*sizebuf, tbuf[16];
369 	caddr_t		buffer;
370 	offset_t	next_rec;
371 
372 	lrp = (struct nfslog_lr *)malloc(sizeof (*lrp));
373 	bzero(lrp, sizeof (*lrp));
374 
375 	/* Check to see if mmap worked */
376 	if (lbp->mmap_addr == (intptr_t)MAP_FAILED) {
377 		/*
378 		 * EOF or other failure; we don't try to recover, just return
379 		 */
380 		if (read(lbp->fd, tbuf, BYTES_PER_XDR_UNIT) <= 0) {
381 			free_lrp(lrp);
382 			return (NULL);
383 		}
384 		sizebuf = tbuf;
385 	} else {
386 		/* EOF check for the mmap() case */
387 		if (lbp->filesize <= lbp->next_rec - lbp->mmap_addr) {
388 			free_lrp(lrp);
389 			return (NULL);
390 		}
391 		sizebuf = (char *)(uintptr_t)lbp->next_rec;
392 	}
393 
394 	/* We have to XDR the first int so we know how much is in this record */
395 	xdrmem_create(&xdrs, sizebuf, sizeof (unsigned int), XDR_DECODE);
396 
397 	if (!xdr_u_int(&xdrs, &record_size)) {
398 		free_lrp(lrp);
399 		return (NULL);
400 	}
401 
402 	lrp->recsize = record_size;
403 	next_rec = lbp->next_rec + lrp->recsize;
404 
405 	if (lbp->mmap_addr == (intptr_t)MAP_FAILED) {
406 		/*
407 		 * Read() case - shouldn't be used very much.
408 		 * Note: The 'buffer' field is used later on
409 		 * to determine which method is being used mmap()|read()
410 		 */
411 		if (lbp->filesize < next_rec) {
412 			/* partial record from buffer */
413 			syslog(LOG_ERR, gettext(
414 				"Last partial record in work buffer %s "
415 				"discarded\n"), lbp->bufpath);
416 			free_lrp(lrp);
417 			return (NULL);
418 		}
419 
420 		if ((lrp->buffer = malloc(lrp->recsize)) == NULL) {
421 			free_lrp(lrp);
422 			return (NULL);
423 		}
424 		bcopy(sizebuf, lrp->buffer, BYTES_PER_XDR_UNIT);
425 		if (read(lbp->fd, &lrp->buffer[BYTES_PER_XDR_UNIT],
426 			lrp->recsize - BYTES_PER_XDR_UNIT) <= 0) {
427 			free_lrp(lrp);
428 			return (NULL);
429 		}
430 	} else if (lbp->filesize < next_rec - lbp->mmap_addr) {
431 			/* partial record from buffer */
432 			syslog(LOG_ERR, gettext(
433 				"Last partial record in work buffer %s "
434 				"discarded\n"), lbp->bufpath);
435 			free_lrp(lrp);
436 			return (NULL);
437 	}
438 
439 
440 	/* other initializations */
441 	lrp->next = lrp->prev = lrp;
442 	/* Keep track of the offset at which this record was read */
443 	if (lbp->mmap_addr == (intptr_t)MAP_FAILED)
444 		lrp->f_offset = lbp->next_rec;
445 	else
446 		lrp->f_offset = lbp->next_rec - lbp->mmap_addr;
447 	/* This is the true address of the record */
448 	lrp->record = lbp->next_rec;
449 	lrp->xdrargs = lrp->xdrres = NULL;
450 	lrp->lbp = lbp;
451 
452 	/* Here is the logic for mmap() vs. read() */
453 	buffer = (lrp->buffer != NULL ? lrp->buffer : (caddr_t)lrp->record);
454 
455 	/* Setup for the 'real' XDR decode of the entire record */
456 	xdrmem_create(&lrp->xdrs, buffer, lrp->recsize, XDR_DECODE);
457 
458 	/* calculate the offset for the next record */
459 	lbp->next_rec = next_rec;
460 
461 	return (lrp);
462 }
463 
464 /*
465  * Simple removal of the log record from the log buffer queue.
466  * Make sure to manage the count of records queued.
467  */
468 static struct nfslog_lr *
469 remove_lrp_from_lb(struct nfslog_buf *lbp, struct nfslog_lr *lrp)
470 {
471 	if (lbp->lrps == lrp) {
472 		if (lbp->lrps == lbp->lrps->next) {
473 			lbp->lrps = NULL;
474 		} else {
475 			lbp->lrps = lrp->next;
476 			remque(lrp);
477 		}
478 	} else {
479 		remque(lrp);
480 	}
481 	lbp->num_lrps--;
482 	return (lrp);
483 }
484 
485 /*
486  * Insert a log record struct on the log buffer struct.  The log buffer
487  * has a pointer to the head of a queue of log records that have been
488  * read from the buffer file but have not been processed yet because
489  * the record id did not match the sequence desired for processing.
490  * The insertion must be in the 'correct'/sorted order which adds
491  * to the complexity of this function.
492  */
493 static void
494 insert_lrp_to_lb(struct nfslog_buf *lbp, struct nfslog_lr *lrp)
495 {
496 	int ins_rec_id = lrp->log_record.re_header.rh_rec_id;
497 	struct nfslog_lr *curlrp;
498 
499 	if (lbp->lrps == NULL) {
500 		/* that was easy */
501 		lbp->lrps = lrp;
502 	} else {
503 		/*
504 		 * Does this lrp go before the first on the list?
505 		 * If so, do the insertion by hand since insque is not
506 		 * as flexible when queueing an element to the head of
507 		 * a list.
508 		 */
509 		if (ins_rec_id < lbp->lrps->log_record.re_header.rh_rec_id) {
510 			lrp->next = lbp->lrps;
511 			lrp->prev = lbp->lrps->prev;
512 			lbp->lrps->prev->next = lrp;
513 			lbp->lrps->prev = lrp;
514 			lbp->lrps = lrp;
515 		} else {
516 			/*
517 			 * Search the queue for the correct insertion point.
518 			 * Be careful about the insque so that the record
519 			 * ends up in the right place.
520 			 */
521 			curlrp = lbp->lrps;
522 			do {
523 				if (ins_rec_id <
524 				curlrp->next->log_record.re_header.rh_rec_id)
525 					break;
526 				curlrp = curlrp->next;
527 			} while (curlrp != lbp->lrps);
528 			if (curlrp == lbp->lrps)
529 				insque(lrp, lbp->lrps->prev);
530 			else
531 				insque(lrp, curlrp);
532 		}
533 	}
534 	/* always keep track of how many we have */
535 	lbp->num_lrps++;
536 }
537 
538 /*
539  * We are rewriting the buffer header at the start of the log buffer
540  * for the sole purpose of resetting the bh_offset field.  This is
541  * supposed to represent the progress that the nfslogd daemon has made
542  * in its processing of the log buffer file.
543  * 'lbp->last_record_offset' contains the absolute offset of the end
544  * of the last element processed. The on-disk buffer offset is relative
545  * to the buffer header, therefore we subtract the length of the buffer
546  * header from the absolute offset.
547  */
548 static void
549 nfslog_rewrite_bufheader(struct nfslog_buf *lbp)
550 {
551 	XDR xdrs;
552 	nfslog_buffer_header bh;
553 	/* size big enough for buffer header encode */
554 #define	XBUFSIZE 128
555 	char buffer[XBUFSIZE];
556 	unsigned int wsize;
557 
558 	/*
559 	 * if version 1 buffer is large and the current offset cannot be
560 	 * represented, then don't update the offset in the buffer.
561 	 */
562 	if (lbp->bh.bh_flags & NFSLOG_BH_OFFSET_OVERFLOW) {
563 		/* No need to update the header - offset too big */
564 		return;
565 	}
566 	/*
567 	 * build the buffer header from the original that was saved
568 	 * on initialization; note that the offset is taken from the
569 	 * last record processed (the last offset that represents
570 	 * all records processed without any holes in the processing)
571 	 */
572 	bh = lbp->bh;
573 
574 	/*
575 	 * if version 1 buffer is large and the current offset cannot be
576 	 * represented in 32 bits, then save only the last valid offset
577 	 * in the buffer and mark the flags to indicate that.
578 	 */
579 	if ((bh.bh_version > 1) ||
580 		(lbp->last_record_offset - bh.bh_length < UINT32_MAX)) {
581 		bh.bh_offset = lbp->last_record_offset - bh.bh_length;
582 	} else {
583 		/* don't update the offset in the buffer */
584 		bh.bh_flags |= NFSLOG_BH_OFFSET_OVERFLOW;
585 		lbp->bh.bh_flags = bh.bh_flags;
586 		syslog(LOG_ERR, gettext(
587 			"nfslog_rewrite_bufheader: %s: offset does not fit "
588 			"in a 32 bit field\n"), lbp->bufpath);
589 	}
590 
591 	xdrmem_create(&xdrs, buffer, XBUFSIZE, XDR_ENCODE);
592 
593 	if (!xdr_nfslog_buffer_header(&xdrs, &bh)) {
594 		syslog(LOG_ERR, gettext(
595 			"error in re-writing buffer file %s header\n"),
596 			lbp->bufpath);
597 		return;
598 	}
599 
600 	wsize = xdr_getpos(&xdrs);
601 
602 	if (lbp->mmap_addr == (intptr_t)MAP_FAILED) {
603 		/* go to the beginning of the file */
604 		(void) lseek(lbp->fd, 0, SEEK_SET);
605 		(void) write(lbp->fd, buffer, wsize);
606 		(void) lseek(lbp->fd, lbp->next_rec, SEEK_SET);
607 		(void) fsync(lbp->fd);
608 	} else {
609 		bcopy(buffer, (void *)lbp->mmap_addr, wsize);
610 		(void) msync((void *)lbp->mmap_addr, wsize, MS_SYNC);
611 	}
612 }
613 
614 /*
615  * With the provided lrp, we will take and 'insert' the range that the
616  * record covered in the buffer file into a list of processed ranges
617  * for the buffer file.  These ranges represent the records processed
618  * but not 'marked' in the buffer header as being processed.
619  * This insertion process is being done for two reasons.  The first is that
620  * we do not want to pay the performance penalty of re-writing the buffer header
621  * for each record that we process.  The second reason is that the records
622  * may be processed out of order because of the unique ids.  This will occur
623  * if the kernel has written the records to the buffer file out of order.
624  * The read routine will 'sort' them as the records are read.
625  *
626  * We do not want to re-write the buffer header such that a record is
627  * represented and being processed when it has not been.  In the case
628  * that the nfslogd daemon restarts processing and the buffer header
629  * has been re-written improperly, some records could be skipped.
630  * We will be taking the conservative approach and only writing buffer
631  * header offsets when the entire offset range has been processed.
632  */
633 static void
634 nfslog_ins_last_rec_processed(struct nfslog_lr *lrp)
635 {
636 	struct processed_records *prp, *tp;
637 
638 	/* init the data struct as if it were the only one */
639 	prp = malloc(sizeof (*prp));
640 	prp->next = prp->prev = prp;
641 	prp->start_offset = lrp->f_offset;
642 	prp->len = lrp->recsize;
643 	prp->num_recs = 1;
644 
645 	/* always add since we know we are going to insert */
646 	lrp->lbp->num_pr_queued++;
647 
648 	/* Is this the first one?  If so, take the easy way out */
649 	if (lrp->lbp->prp == NULL) {
650 		lrp->lbp->prp = prp;
651 	} else {
652 		/* sort on insertion... */
653 		tp = lrp->lbp->prp;
654 		do {
655 			if (prp->start_offset < tp->start_offset)
656 				break;
657 			tp = tp->next;
658 		} while (tp != lrp->lbp->prp);
659 		/* insert where appropriate (before the one we found */
660 		insque(prp, tp->prev);
661 		/*
662 		 * special case where the insertion was done at the
663 		 * head of the list
664 		 */
665 		if (tp == lrp->lbp->prp && prp->start_offset < tp->start_offset)
666 			lrp->lbp->prp = prp;
667 
668 		/*
669 		 * now that the entry is in place, we need to see if it can
670 		 * be combined with the previous or following entries.
671 		 * combination is done by adding to the length.
672 		 */
673 		if (prp->start_offset ==
674 			(prp->prev->start_offset + prp->prev->len)) {
675 			tp = prp->prev;
676 			remque(prp);
677 			tp->len += prp->len;
678 			tp->num_recs += prp->num_recs;
679 			free(prp);
680 			prp = tp;
681 		}
682 		if (prp->next->start_offset ==
683 			(prp->start_offset + prp->len)) {
684 			prp->len += prp->next->len;
685 			prp->num_recs += prp->next->num_recs;
686 			tp = prp->next;
687 			remque(tp);
688 			free(tp);
689 		}
690 	}
691 
692 	if (lrp->lbp->num_pr_queued > MAX_RECS_TO_DELAY) {
693 		prp = lrp->lbp->prp;
694 		if (lrp->lbp->last_record_offset ==
695 			prp->start_offset) {
696 
697 			/* adjust the offset for the entire buffer */
698 			lrp->lbp->last_record_offset =
699 				prp->start_offset + prp->len;
700 
701 			nfslog_rewrite_bufheader(lrp->lbp);
702 
703 			tp = prp->next;
704 			if (tp != prp)
705 				remque(prp);
706 			else
707 				tp = NULL;
708 			lrp->lbp->prp = tp;
709 			lrp->lbp->num_pr_queued -= prp->num_recs;
710 			free(prp);
711 		}
712 	}
713 }
714 
715 /*
716  * nfslog_get_logrecord is responsible for retrieving the next log record
717  * from the buffer file. This would normally be very straightforward but there
718  * is the added complexity of attempting to order the requests coming out of
719  * the buffer file.  The fundamental problems is that the kernel nfs logging
720  * functionality does not guarantee that the records were written to the file
721  * in the order that the NFS server processed them.  This can cause a problem
722  * in the fh -> pathname mapping in the case were a lookup for a file comes
723  * later in the buffer file than other operations on the lookup's target.
724  * The fh mapping database will not have an entry and will therefore not
725  * be able to map the fh to a name.
726  *
727  * So to solve this problem, the kernel nfs logging code tags each record
728  * with a monotonically increasing id and is guaranteed to be allocated
729  * in the order that the requests were processed.  Realize however that
730  * this processing guarantee is essentially for one thread on one client.
731  * This id mechanism does not order all requests since it is only the
732  * single client/single thread case that is most concerning to us here.
733  *
734  * This function will do the 'sorting' of the requests as they are
735  * read from the buffer file.  The sorting needs to take into account
736  * that some ids may be missing (operations not logged but ids allocated)
737  * and that the id field will eventually wrap over MAXINT.
738  *
739  * Complexity to solve the fh -> pathname mapping issue.
740  */
741 struct nfslog_lr *
742 nfslog_get_logrecord(struct nfslog_buf *lbp)
743 {
744 	/* figure out what the next should be if the world were perfect */
745 	unsigned int next_rec_id = lbp->last_rec_id + 1;
746 	struct nfslog_lr *lrp = NULL;
747 
748 	/*
749 	 * First we check the queued records on the log buffer struct
750 	 * to see if the one we want is there.  The records are sorted
751 	 * on the record id during the insertions to the queue so that
752 	 * this check is easy.
753 	 */
754 	if (lbp->lrps != NULL) {
755 		/* Does the first record match ? */
756 		if (lbp->lrps->log_record.re_header.rh_rec_id == next_rec_id) {
757 			lrp = remove_lrp_from_lb(lbp, lbp->lrps);
758 			lbp->last_rec_id = lrp->log_record.re_header.rh_rec_id;
759 		} else {
760 			/*
761 			 * Here we are checking for wrap of the record id
762 			 * since it is an unsigned in.  The idea is that
763 			 * if there is a huge span between what we expect
764 			 * and what is queued then we need to flush/empty
765 			 * the queued records first.
766 			 */
767 			if (next_rec_id <
768 				lbp->lrps->log_record.re_header.rh_rec_id &&
769 				((lbp->lrps->log_record.re_header.rh_rec_id -
770 					next_rec_id) > (MAXINT / 2))) {
771 
772 				lrp = remove_lrp_from_lb(lbp, lbp->lrps);
773 				lbp->last_rec_id =
774 					lrp->log_record.re_header.rh_rec_id;
775 			}
776 		}
777 	}
778 	/*
779 	 * So the first queued record didn't match (or there were no queued
780 	 * records to look at).  Now we go to the buffer file looking for
781 	 * the expected log record based on its id.  We loop looking for
782 	 * a matching records and save/queue the records that don't match.
783 	 * Note that we will queue a maximum number to handle the case
784 	 * of a missing record id or a queue that is very confused.  We don't
785 	 * want to consume too much memory.
786 	 */
787 	while (lrp == NULL) {
788 		/* Have we queued too many for this buffer? */
789 		if (lbp->num_lrps >= MAX_LRS_READ_AHEAD) {
790 			lrp = remove_lrp_from_lb(lbp, lbp->lrps);
791 			lbp->last_rec_id = lrp->log_record.re_header.rh_rec_id;
792 			break;
793 		}
794 		/*
795 		 * Get a record from the buffer file.  If none are available,
796 		 * this is probably and EOF condition (could be a read error
797 		 * as well but that is masked. :-().  No records in the
798 		 * file means that we need to pull any queued records
799 		 * so that we don't miss any in the processing.
800 		 */
801 		if ((lrp = nfslog_read_buffer(lbp)) == NULL) {
802 			if (lbp->lrps != NULL) {
803 				lrp = remove_lrp_from_lb(lbp, lbp->lrps);
804 				lbp->last_rec_id =
805 					lrp->log_record.re_header.rh_rec_id;
806 			} else {
807 				return (NULL);  /* it was really and EOF */
808 			}
809 		} else {
810 			/*
811 			 * Just read a record from the buffer file and now we
812 			 * need to XDR the record header so that we can take
813 			 * a look at the record id.
814 			 */
815 			if (!xdr_nfslog_request_record(&lrp->xdrs,
816 				&lrp->log_record)) {
817 				/* Free and return EOF/NULL on error */
818 				nfslog_free_logrecord(lrp, FALSE);
819 				return (NULL);
820 			}
821 			/*
822 			 * If the new record is less than or matches the
823 			 * expected record id, then we return this record
824 			 */
825 			if (lrp->log_record.re_header.rh_rec_id <=
826 				next_rec_id) {
827 
828 				lbp->last_rec_id =
829 					lrp->log_record.re_header.rh_rec_id;
830 			} else {
831 				/*
832 				 * This is not the one we were looking
833 				 * for; queue it for later processing
834 				 * (queueing sorts on record id)
835 				 */
836 				insert_lrp_to_lb(lbp, lrp);
837 				lrp = NULL;
838 			}
839 		}
840 	}
841 	return (lrp);
842 }
843 
844 /*
845  * Free the log record provided.
846  * This is complex because the associated XDR streams also need to be freed
847  * since allocation could have occured during the DECODE phase.  The record
848  * header, args and results need to be XDR_FREEd.  The xdr funtions will
849  * be provided if a free needs to be done.
850  *
851  * Note that caller tells us if the record being freed was processed.
852  * If so, then the buffer header should be updated.  Updating the buffer
853  * header keeps track of where the nfslogd daemon left off in its processing
854  * if it is unable to complete the entire file.
855  */
856 void
857 nfslog_free_logrecord(struct nfslog_lr *lrp, bool_t processing_complete)
858 {
859 	caddr_t			buffer;
860 	nfslog_request_record 	*reqrec;
861 
862 	if (processing_complete) {
863 		nfslog_ins_last_rec_processed(lrp);
864 	}
865 
866 	reqrec = &lrp->log_record;
867 
868 	buffer = (lrp->buffer != NULL ? lrp->buffer : (caddr_t)lrp->record);
869 
870 	xdrmem_create(&lrp->xdrs, buffer, lrp->recsize, XDR_FREE);
871 
872 	(void) xdr_nfslog_request_record(&lrp->xdrs, reqrec);
873 
874 	if (lrp->xdrargs != NULL && reqrec->re_rpc_arg)
875 		(*lrp->xdrargs)(&lrp->xdrs, reqrec->re_rpc_arg);
876 
877 	if (reqrec->re_rpc_arg)
878 		free(reqrec->re_rpc_arg);
879 
880 	if (lrp->xdrres != NULL && reqrec->re_rpc_res)
881 		(*lrp->xdrres)(&lrp->xdrs, reqrec->re_rpc_res);
882 
883 	if (reqrec->re_rpc_res)
884 		free(reqrec->re_rpc_res);
885 
886 	free_lrp(lrp);
887 }
888 
889 static void
890 free_lrp(struct nfslog_lr *lrp)
891 {
892 	if (lrp->buffer != NULL)
893 		free(lrp->buffer);
894 	free(lrp);
895 }
896 
/*
 * Append the NUL-terminated string 'text' to 'outbuf' at *offsetp,
 * provided that it fits, terminator included, below 'maxoffset'.
 * On success *offsetp is advanced past the text and 1 is returned.
 * Otherwise 0 is returned, *offsetp is left alone and the output stays
 * terminated at *offsetp (if that position is usable at all).
 */
static int
opaque_append(char *outbuf, int *offsetp, int maxoffset, const char *text)
{
	int	avail;
	int	n;

	if (*offsetp < 0 || *offsetp >= maxoffset)
		return (0);
	avail = maxoffset - *offsetp;

	n = snprintf(&outbuf[*offsetp], (size_t)avail, "%s", text);
	if (n < 0 || n >= avail) {
		/* did not fit: wipe whatever partial text was written */
		outbuf[*offsetp] = '\0';
		return (0);
	}
	*offsetp += n;
	return (1);
}

/*
 * Utility function used elsewhere: append a quoted hex dump of the 'len'
 * bytes at 'buf' to 'outbuf', starting at offset *outbufoffsetp, and
 * store the offset of the end of the output back into *outbufoffsetp.
 *
 * The output starts with ' "' and ends with '"'.  Up to sizeof (int)
 * bytes are printed as one run of two-digit hex values.  Longer input is
 * printed as: the bytes leading up to the first int-aligned address (as
 * a run), then each whole int as " %08x" (in host byte order), then any
 * remaining bytes as a run, separated by a space if an int came before.
 *
 * Nothing is written at or beyond outbuf[maxoffset]: an element that
 * does not fit completely is dropped along with everything after it
 * (the closing quote is still added if there is room for it), and the
 * output is always left NUL-terminated.  A negative 'len' is treated
 * as 0.
 */
void
nfslog_opaque_print_buf(void *buf, int len, char *outbuf, int *outbufoffsetp,
	int maxoffset)
{
	const unsigned char *bytes = buf;
	const int	wordsz = (int)sizeof (unsigned int);
	char		piece[16];	/* one formatted element */
	unsigned int	word;
	int		offset = *outbufoffsetp;
	int		pos = 0;	/* input bytes consumed so far */
	int		nwords = 0;	/* whole ints printed so far */
	int		lead;
	int		room;		/* cleared once output is full */

	if (len < 0)
		len = 0;

	room = opaque_append(outbuf, &offset, maxoffset, " \"");

	if (len > wordsz) {
		/* More than 4 bytes, print with spaces in integer offsets */
		lead = (int)((uintptr_t)buf % sizeof (unsigned int));
		if (lead > 0)
			lead = wordsz - lead;	/* bytes up to alignment */

		for (; room && pos < lead; pos++) {
			(void) snprintf(piece, sizeof (piece), "%02x",
				bytes[pos]);
			room = opaque_append(outbuf, &offset, maxoffset,
				piece);
		}

		for (; room && pos + wordsz <= len; pos += wordsz, nwords++) {
			/* memcpy keeps the load free of aliasing concerns */
			(void) memcpy(&word, &bytes[pos], sizeof (word));
			(void) snprintf(piece, sizeof (piece), " %08x", word);
			room = opaque_append(outbuf, &offset, maxoffset,
				piece);
		}

		/* Last element not int; set it off from the ints before it */
		if (room && pos < len && nwords > 0)
			room = opaque_append(outbuf, &offset, maxoffset, " ");
	}

	/* the whole of a short input, or the tail of a long one */
	for (; room && pos < len; pos++) {
		(void) snprintf(piece, sizeof (piece), "%02x", bytes[pos]);
		room = opaque_append(outbuf, &offset, maxoffset, piece);
	}

	(void) opaque_append(outbuf, &offset, maxoffset, "\"");
	*outbufoffsetp = offset;
}
947