xref: /titanic_50/usr/src/uts/common/os/devcache.c (revision 381a2a9a387f449fab7d0c7e97c4184c26963abf)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/note.h>
29 #include <sys/t_lock.h>
30 #include <sys/cmn_err.h>
31 #include <sys/instance.h>
32 #include <sys/conf.h>
33 #include <sys/stat.h>
34 #include <sys/ddi.h>
35 #include <sys/hwconf.h>
36 #include <sys/sunddi.h>
37 #include <sys/sunndi.h>
38 #include <sys/ddi_impldefs.h>
39 #include <sys/ndi_impldefs.h>
40 #include <sys/modctl.h>
41 #include <sys/dacf.h>
42 #include <sys/promif.h>
43 #include <sys/cpuvar.h>
44 #include <sys/pathname.h>
45 #include <sys/kobj.h>
46 #include <sys/devcache.h>
47 #include <sys/devcache_impl.h>
48 #include <sys/sysmacros.h>
49 #include <sys/varargs.h>
50 #include <sys/callb.h>
51 
52 /*
53  * This facility provides interfaces to clients to register,
54  * read and update cache data in persisted backing store files,
55  * usually in /etc/devices.  The data persisted through this
56  * mechanism should be stateless data, functioning in the sense
57  * of a cache.  Writes are performed by a background daemon
58  * thread, permitting a client to schedule an update without
59  * blocking, then continue updating the data state in
60  * parallel.  The data is only locked by the daemon thread
61  * to pack the data in preparation for the write.
62  *
63  * Data persisted through this mechanism should be capable
64  * of being regenerated through normal system operation,
65  * for example attaching all disk devices would cause all
66  * devids to be registered for those devices.  By caching
67  * a devid-device tuple, the system can operate in a
68  * more optimal way, directly attaching the device mapped
69  * to a devid, rather than burdensomely driving attach of
70  * the entire device tree to discover a single device.
71  *
72  * Note that a client should only need to include
73  * <sys/devcache.h> for the supported interfaces.
74  *
75  * The data per client is entirely within the control of
76  * the client.  When reading, data unpacked from the backing
77  * store should be inserted in the list.  The pointer to
78  * the list can be retrieved via nvf_list().  When writing,
79  * the data on the list is to be packed and returned to the
80  * nvpdaemon as an nvlist.
81  *
82  * Obvious restrictions are imposed by the limits of the
83  * nvlist format.  The data cannot be read or written
84  * piecemeal, and large amounts of data aren't recommended.
85  * However, nvlists do allow that data be named and typed
86  * and can be size-of-int invariant, and the cached data
87  * can be versioned conveniently.
88  *
89  * The registration involves two steps: a handle is
90  * allocated by calling the registration function.
91  * This sets up the data referenced by the handle and
92  * initializes the lock.  Following registration, the
93  * client must initialize the data list.  The list
94  * interfaces require that the list element with offset
95  * to the node link be provided.  The format of the
96  * list element is under the control of the client.
97  *
98  * Locking: the address of the data list r/w lock provided
99  * can be accessed with nvf_lock().  The lock must be held
100  * as reader when traversing the list or checking state,
101  * such as nvf_is_dirty().  The lock must be held as
102  * writer when updating the list or marking it dirty.
103  * The lock must not be held when waking the daemon.
104  *
105  * The data r/w lock is held as writer when the pack,
106  * unpack and free list handlers are called.  The
107  * lock should not be dropped and must be still held
108  * upon return.  The client should also hold the lock
109  * as reader when checking if the list is dirty, and
110  * as writer when marking the list dirty or initiating
111  * a read.
112  *
113  * The asynchronous nature of updates allows for the
114  * possibility that the data may continue to be updated
115  * once the daemon has been notified that an update is
116  * desired.  The data only needs to be locked against
117  * updates when packing the data into the form to be
118  * written.  When the write of the packed data has
119  * completed, the daemon will automatically reschedule
120  * an update if the data was marked dirty after the
121  * point at which it was packed.  Before beginning an
122  * update, the daemon attempts to lock the data as
123  * writer; if the writer lock is already held, it
124  * backs off and retries later.  The model is to give
125  * priority to the kernel processes generating the
126  * data, and that the nature of the data is that
127  * it does not change often, can be re-generated when
128  * needed, so updates should not happen often and
129  * can be delayed until the data stops changing.
130  * The client may update the list or mark it dirty
131  * any time it is able to acquire the lock as
132  * writer first.
133  *
134  * A failed write will be retried after some delay,
135  * in the hope that the cause of the error will be
136  * transient, for example a filesystem with no space
137  * available.  An update on a read-only filesystem
138  * is failed silently and not retried; this would be
139  * the case when booted off install media.
140  *
141  * There is no unregister mechanism as of yet, as it
142  * hasn't been needed so far.
143  */
144 
/*
 * Global list of files registered and updated by the nvpflush
 * daemon, protected by the nvf_cache_mutex.  While an
 * update is taking place, a file is temporarily moved to
 * the dirty list to avoid locking the primary list for
 * the duration of the update.
 *
 * nvf_cache_files	files added by nvf_register_file(), plus files
 *			the daemon has finished flushing
 * nvf_dirty_files	files the daemon has pulled aside to flush
 * nvf_cache_mutex	held around every insert/remove on
 *			nvf_cache_files in this file
 */
list_t		nvf_cache_files;
list_t		nvf_dirty_files;
kmutex_t	nvf_cache_mutex;
155 
156 
157 /*
158  * Allow some delay from an update of the data before flushing
159  * to permit simultaneous updates of multiple changes.
160  * Changes in the data are expected to be bursty, ie
161  * reconfig or hot-plug of a new adapter.
162  *
163  * kfio_report_error (default 0)
164  *	Set to 1 to enable some error messages related to low-level
165  *	kernel file i/o operations.
166  *
167  * nvpflush_delay (default 10)
168  *	The number of seconds after data is marked dirty before the
169  *	flush daemon is triggered to flush the data.  A longer period
170  *	of time permits more data updates per write.  Note that
171  *	every update resets the timer so no repository write will
172  *	occur while data is being updated continuously.
173  *
174  * nvpdaemon_idle_time (default 60)
175  *	The number of seconds the daemon will sleep idle before exiting.
176  *
177  */
/* Compiled-in defaults, in seconds, for the tunables of the same name */
#define	NVPFLUSH_DELAY		10
#define	NVPDAEMON_IDLE_TIME	60

/* Number of clock ticks in one second (1000000 microseconds) */
#define	TICKS_PER_SECOND	(drv_usectohz(1000000))
182 
/*
 * Tunables
 */
int kfio_report_error = 0;		/* kernel file i/o operations */
int kfio_disable_read = 0;		/* disable all reads */
int kfio_disable_write = 0;		/* disable all writes */

/* Seconds; multiplied by TICKS_PER_SECOND where they are used */
int nvpflush_delay	= NVPFLUSH_DELAY;
int nvpdaemon_idle_time	= NVPDAEMON_IDLE_TIME;

/*
 * Flush timer and daemon state.
 *
 * nvpflush_id		value returned by the last timeout() call that
 *			armed nvpflush_timeout(); zeroed when the timer
 *			signals the daemon
 * nvpflush_timer_busy	non-zero while a flush timeout is armed
 * nvpflush_daemon_active non-zero from the point nvf_wake_daemon()
 *			decides to create the daemon until it exits
 * nvpflush_thr_id	value returned by thread_create() for the
 *			daemon; reset to NULL when the daemon exits
 */
static timeout_id_t	nvpflush_id = 0;
static int		nvpflush_timer_busy = 0;
static int		nvpflush_daemon_active = 0;
static kthread_t	*nvpflush_thr_id = 0;

/*
 * do_nvpflush		set by nvpflush_timeout() to request a flush
 *			pass, cleared by the daemon when it starts one
 * nvpbusy		set by the daemon for the duration of a pass;
 *			only written, never read, in this file
 * nvpflush_lock	mutex the daemon sleeps with; also taken around
 *			the timer and daemon state above
 * nvpflush_cv		condition variable the daemon waits on
 * nvpflush_thread	curthread as recorded by the daemon at startup
 * nvpticks		lbolt value at which the next flush is due
 */
static int		do_nvpflush = 0;
static int		nvpbusy = 0;
static kmutex_t		nvpflush_lock;
static kcondvar_t	nvpflush_cv;
static kthread_id_t	nvpflush_thread;
static clock_t		nvpticks;

static void nvpflush_daemon(void);

#ifdef	DEBUG
/* Debug message switches; NOTE(review): presumably consumed by the */
/* NVPDAEMON_DEBUG/KFDEBUG macros, which are defined outside this file */
int nvpdaemon_debug = 0;
int kfio_debug = 0;
#endif	/* DEBUG */

/* Defined outside this file */
extern int modrootloaded;
extern void mdi_read_devices_files(void);
extern void mdi_clean_vhcache(void);
215 
216 /*
217  * Initialize the overall cache file management
218  */
219 void
220 i_ddi_devices_init(void)
221 {
222 	list_create(&nvf_cache_files, sizeof (nvfd_t),
223 	    offsetof(nvfd_t, nvf_link));
224 	list_create(&nvf_dirty_files, sizeof (nvfd_t),
225 	    offsetof(nvfd_t, nvf_link));
226 	mutex_init(&nvf_cache_mutex, NULL, MUTEX_DEFAULT, NULL);
227 	devid_cache_init();
228 }
229 
230 /*
231  * Read cache files
232  * The files read here should be restricted to those
233  * that may be required to mount root.
234  */
235 void
236 i_ddi_read_devices_files(void)
237 {
238 	if (!kfio_disable_read) {
239 		mdi_read_devices_files();
240 		devid_cache_read();
241 	}
242 }
243 
244 void
245 i_ddi_start_flush_daemon(void)
246 {
247 	nvfd_t	*nvfdp;
248 
249 	ASSERT(i_ddi_io_initialized());
250 
251 	mutex_init(&nvpflush_lock, NULL, MUTEX_DRIVER, NULL);
252 	cv_init(&nvpflush_cv, NULL, CV_DRIVER, NULL);
253 
254 	mutex_enter(&nvf_cache_mutex);
255 	for (nvfdp = list_head(&nvf_cache_files); nvfdp;
256 	    nvfdp = list_next(&nvf_cache_files, nvfdp)) {
257 		if (NVF_IS_DIRTY(nvfdp)) {
258 			nvf_wake_daemon();
259 			break;
260 		}
261 	}
262 	mutex_exit(&nvf_cache_mutex);
263 }
264 
/*
 * Invoke the cleanup entry points of the two cache clients known
 * to this file: first the devid cache, then the mdi vhcache.
 */
void
i_ddi_clean_devices_files(void)
{
	devid_cache_cleanup();
	mdi_clean_vhcache();
}
271 
272 /*
273  * Register a cache file to be managed and updated by the nvpflush daemon.
274  * All operations are performed through the returned handle.
275  * There is no unregister mechanism for now.
276  */
277 nvf_handle_t
278 nvf_register_file(nvf_ops_t *ops)
279 {
280 	nvfd_t *nvfdp;
281 
282 	nvfdp = kmem_zalloc(sizeof (*nvfdp), KM_SLEEP);
283 
284 	nvfdp->nvf_ops = ops;
285 	nvfdp->nvf_flags = 0;
286 	rw_init(&nvfdp->nvf_lock, NULL, RW_DRIVER, NULL);
287 
288 	mutex_enter(&nvf_cache_mutex);
289 	list_insert_tail(&nvf_cache_files, nvfdp);
290 	mutex_exit(&nvf_cache_mutex);
291 
292 	return ((nvf_handle_t)nvfdp);
293 }
294 
295 /*PRINTFLIKE1*/
296 void
297 nvf_error(const char *fmt, ...)
298 {
299 	va_list ap;
300 
301 	if (kfio_report_error) {
302 		va_start(ap, fmt);
303 		vcmn_err(CE_NOTE, fmt, ap);
304 		va_end(ap);
305 	}
306 }
307 
308 /*
309  * Some operations clients may use to manage the data
310  * to be persisted in a cache file.
311  */
312 char *
313 nvf_cache_name(nvf_handle_t handle)
314 {
315 	return (((nvfd_t *)handle)->nvf_cache_path);
316 }
317 
318 krwlock_t *
319 nvf_lock(nvf_handle_t handle)
320 {
321 	return (&(((nvfd_t *)handle)->nvf_lock));
322 }
323 
324 list_t *
325 nvf_list(nvf_handle_t handle)
326 {
327 	return (&(((nvfd_t *)handle)->nvf_data_list));
328 }
329 
330 void
331 nvf_mark_dirty(nvf_handle_t handle)
332 {
333 	ASSERT(RW_WRITE_HELD(&(((nvfd_t *)handle)->nvf_lock)));
334 	NVF_MARK_DIRTY((nvfd_t *)handle);
335 }
336 
337 int
338 nvf_is_dirty(nvf_handle_t handle)
339 {
340 	ASSERT(RW_LOCK_HELD(&(((nvfd_t *)handle)->nvf_lock)));
341 	return (NVF_IS_DIRTY((nvfd_t *)handle));
342 }
343 
/*
 * Fold a buffer into a 16-bit checksum by XORing together its
 * 16-bit words, read in native byte order.  When the length is odd
 * the final byte is XORed in as a value of its own.
 *
 * buf is accessed through a uint16_t pointer, so it must be
 * suitably aligned for that.
 */
static uint16_t
nvp_cksum(uchar_t *buf, int64_t buflen)
{
	uint16_t	*word = (uint16_t *)buf;
	uint16_t	sum = 0;
	int64_t		nwords = buflen / 2;
	int64_t		i;

	/* an odd trailing byte seeds the sum */
	if ((buflen & 0x01) != 0)
		sum = buf[buflen - 1];

	for (i = 0; i < nwords; i++)
		sum ^= word[i];

	return (sum);
}
360 
361 int
362 fread_nvlist(char *filename, nvlist_t **ret_nvlist)
363 {
364 	struct _buf	*file;
365 	nvpf_hdr_t	hdr;
366 	char		*buf;
367 	nvlist_t	*nvl;
368 	int		rval;
369 	uint_t		offset;
370 	int		n;
371 	char		c;
372 	uint16_t	cksum, hdrsum;
373 
374 	*ret_nvlist = NULL;
375 
376 	file = kobj_open_file(filename);
377 	if (file == (struct _buf *)-1) {
378 		KFDEBUG((CE_CONT, "cannot open file: %s\n", filename));
379 		return (ENOENT);
380 	}
381 
382 	offset = 0;
383 	n = kobj_read_file(file, (char *)&hdr, sizeof (hdr), offset);
384 	if (n != sizeof (hdr)) {
385 		kobj_close_file(file);
386 		if (n < 0) {
387 			nvf_error("error reading header: %s\n", filename);
388 			return (EIO);
389 		} else if (n == 0) {
390 			KFDEBUG((CE_CONT, "file empty: %s\n", filename));
391 		} else {
392 			nvf_error("header size incorrect: %s\n", filename);
393 		}
394 		return (EINVAL);
395 	}
396 	offset += n;
397 
398 	KFDEBUG2((CE_CONT, "nvpf_magic: 0x%x\n", hdr.nvpf_magic));
399 	KFDEBUG2((CE_CONT, "nvpf_version: %d\n", hdr.nvpf_version));
400 	KFDEBUG2((CE_CONT, "nvpf_size: %lld\n",
401 		(longlong_t)hdr.nvpf_size));
402 	KFDEBUG2((CE_CONT, "nvpf_hdr_chksum: 0x%x\n",
403 		hdr.nvpf_hdr_chksum));
404 	KFDEBUG2((CE_CONT, "nvpf_chksum: 0x%x\n", hdr.nvpf_chksum));
405 
406 	cksum = hdr.nvpf_hdr_chksum;
407 	hdr.nvpf_hdr_chksum = 0;
408 	hdrsum = nvp_cksum((uchar_t *)&hdr, sizeof (hdr));
409 
410 	if (hdr.nvpf_magic != NVPF_HDR_MAGIC ||
411 	    hdr.nvpf_version != NVPF_HDR_VERSION || hdrsum != cksum) {
412 		kobj_close_file(file);
413 		if (hdrsum != cksum) {
414 			nvf_error("%s: checksum error "
415 			    "(actual 0x%x, expected 0x%x)\n",
416 			    filename, hdrsum, cksum);
417 		}
418 		nvf_error("%s: header information incorrect", filename);
419 		return (EINVAL);
420 	}
421 
422 	ASSERT(hdr.nvpf_size >= 0);
423 
424 	buf = kmem_alloc(hdr.nvpf_size, KM_SLEEP);
425 	n = kobj_read_file(file, buf, hdr.nvpf_size, offset);
426 	if (n != hdr.nvpf_size) {
427 		kmem_free(buf, hdr.nvpf_size);
428 		kobj_close_file(file);
429 		if (n < 0) {
430 			nvf_error("%s: read error %d", filename, n);
431 		} else {
432 			nvf_error("%s: incomplete read %d/%lld",
433 				filename, n, (longlong_t)hdr.nvpf_size);
434 		}
435 		return (EINVAL);
436 	}
437 	offset += n;
438 
439 	rval = kobj_read_file(file, &c, 1, offset);
440 	kobj_close_file(file);
441 	if (rval > 0) {
442 		nvf_error("%s is larger than %lld\n",
443 			filename, (longlong_t)hdr.nvpf_size);
444 		kmem_free(buf, hdr.nvpf_size);
445 		return (EINVAL);
446 	}
447 
448 	cksum = nvp_cksum((uchar_t *)buf, hdr.nvpf_size);
449 	if (hdr.nvpf_chksum != cksum) {
450 		nvf_error("%s: checksum error (actual 0x%x, expected 0x%x)\n",
451 		    filename, hdr.nvpf_chksum, cksum);
452 		kmem_free(buf, hdr.nvpf_size);
453 		return (EINVAL);
454 	}
455 
456 	nvl = NULL;
457 	rval = nvlist_unpack(buf, hdr.nvpf_size, &nvl, 0);
458 	if (rval != 0) {
459 		nvf_error("%s: error %d unpacking nvlist\n",
460 			filename, rval);
461 		kmem_free(buf, hdr.nvpf_size);
462 		return (EINVAL);
463 	}
464 
465 	kmem_free(buf, hdr.nvpf_size);
466 	*ret_nvlist = nvl;
467 	return (0);
468 }
469 
470 static int
471 kfcreate(char *filename, kfile_t **kfilep)
472 {
473 	kfile_t	*fp;
474 	int	rval;
475 
476 	ASSERT(modrootloaded);
477 
478 	fp = kmem_alloc(sizeof (kfile_t), KM_SLEEP);
479 
480 	fp->kf_vnflags = FCREAT | FWRITE | FTRUNC;
481 	fp->kf_fname = filename;
482 	fp->kf_fpos = 0;
483 	fp->kf_state = 0;
484 
485 	KFDEBUG((CE_CONT, "create: %s flags 0x%x\n",
486 		filename, fp->kf_vnflags));
487 	rval = vn_open(filename, UIO_SYSSPACE, fp->kf_vnflags,
488 	    0444, &fp->kf_vp, CRCREAT, 0);
489 	if (rval != 0) {
490 		kmem_free(fp, sizeof (kfile_t));
491 		KFDEBUG((CE_CONT, "%s: create error %d\n",
492 			filename, rval));
493 		return (rval);
494 	}
495 
496 	*kfilep = fp;
497 	return (0);
498 }
499 
500 static int
501 kfremove(char *filename)
502 {
503 	int rval;
504 
505 	KFDEBUG((CE_CONT, "remove: %s\n", filename));
506 	rval = vn_remove(filename, UIO_SYSSPACE, RMFILE);
507 	if (rval != 0) {
508 		KFDEBUG((CE_CONT, "%s: remove error %d\n",
509 			filename, rval));
510 	}
511 	return (rval);
512 }
513 
514 static int
515 kfread(kfile_t *fp, char *buf, ssize_t bufsiz, ssize_t *ret_n)
516 {
517 	ssize_t		resid;
518 	int		err;
519 	ssize_t		n;
520 
521 	ASSERT(modrootloaded);
522 
523 	if (fp->kf_state != 0)
524 		return (fp->kf_state);
525 
526 	err = vn_rdwr(UIO_READ, fp->kf_vp, buf, bufsiz, fp->kf_fpos,
527 		UIO_SYSSPACE, 0, (rlim64_t)0, kcred, &resid);
528 	if (err != 0) {
529 		KFDEBUG((CE_CONT, "%s: read error %d\n",
530 			fp->kf_fname, err));
531 		fp->kf_state = err;
532 		return (err);
533 	}
534 
535 	ASSERT(resid >= 0 && resid <= bufsiz);
536 	n = bufsiz - resid;
537 
538 	KFDEBUG1((CE_CONT, "%s: read %ld bytes ok %ld bufsiz, %ld resid\n",
539 		fp->kf_fname, n, bufsiz, resid));
540 
541 	fp->kf_fpos += n;
542 	*ret_n = n;
543 	return (0);
544 }
545 
546 static int
547 kfwrite(kfile_t *fp, char *buf, ssize_t bufsiz, ssize_t *ret_n)
548 {
549 	rlim64_t	rlimit;
550 	ssize_t		resid;
551 	int		err;
552 	ssize_t		len;
553 	ssize_t		n = 0;
554 
555 	ASSERT(modrootloaded);
556 
557 	if (fp->kf_state != 0)
558 		return (fp->kf_state);
559 
560 	len = bufsiz;
561 	rlimit = bufsiz + 1;
562 	for (;;) {
563 		err = vn_rdwr(UIO_WRITE, fp->kf_vp, buf, len, fp->kf_fpos,
564 			UIO_SYSSPACE, FSYNC, rlimit, kcred, &resid);
565 		if (err) {
566 			KFDEBUG((CE_CONT, "%s: write error %d\n",
567 				fp->kf_fname, err));
568 			fp->kf_state = err;
569 			return (err);
570 		}
571 
572 		KFDEBUG1((CE_CONT, "%s: write %ld bytes ok %ld resid\n",
573 			fp->kf_fname, len-resid, resid));
574 
575 		ASSERT(resid >= 0 && resid <= len);
576 
577 		n += (len - resid);
578 		if (resid == 0)
579 			break;
580 
581 		if (resid == len) {
582 			KFDEBUG((CE_CONT, "%s: filesystem full?\n",
583 				fp->kf_fname));
584 			fp->kf_state = ENOSPC;
585 			return (ENOSPC);
586 		}
587 
588 		len -= resid;
589 		buf += len;
590 		fp->kf_fpos += len;
591 		len = resid;
592 	}
593 
594 	ASSERT(n == bufsiz);
595 	KFDEBUG1((CE_CONT, "%s: wrote %ld bytes ok\n", fp->kf_fname, n));
596 
597 	*ret_n = n;
598 	return (0);
599 }
600 
601 
602 static int
603 kfclose(kfile_t *fp)
604 {
605 	int		rval;
606 
607 	KFDEBUG((CE_CONT, "close: %s\n", fp->kf_fname));
608 
609 	if ((fp->kf_vnflags & FWRITE) && fp->kf_state == 0) {
610 		rval = VOP_FSYNC(fp->kf_vp, FSYNC,  kcred);
611 		if (rval != 0) {
612 			nvf_error("%s: sync error %d\n",
613 				fp->kf_fname, rval);
614 		}
615 		KFDEBUG((CE_CONT, "%s: sync ok\n", fp->kf_fname));
616 	}
617 
618 	rval = VOP_CLOSE(fp->kf_vp, fp->kf_vnflags, 1, (offset_t)0, kcred);
619 	if (rval != 0) {
620 		if (fp->kf_state == 0) {
621 			nvf_error("%s: close error %d\n",
622 				fp->kf_fname, rval);
623 		}
624 	} else {
625 		if (fp->kf_state == 0)
626 			KFDEBUG((CE_CONT, "%s: close ok\n", fp->kf_fname));
627 	}
628 
629 	VN_RELE(fp->kf_vp);
630 	kmem_free(fp, sizeof (kfile_t));
631 	return (rval);
632 }
633 
634 static int
635 kfrename(char *oldname, char *newname)
636 {
637 	int rval;
638 
639 	ASSERT(modrootloaded);
640 
641 	KFDEBUG((CE_CONT, "renaming %s to %s\n", oldname, newname));
642 
643 	if ((rval = vn_rename(oldname, newname, UIO_SYSSPACE)) != 0) {
644 		KFDEBUG((CE_CONT, "rename %s to %s: %d\n",
645 			oldname, newname, rval));
646 	}
647 
648 	return (rval);
649 }
650 
651 int
652 fwrite_nvlist(char *filename, nvlist_t *nvl)
653 {
654 	char	*buf;
655 	char	*nvbuf;
656 	kfile_t	*fp;
657 	char	*newname;
658 	int	len, err, err1;
659 	size_t	buflen;
660 	ssize_t	n;
661 
662 	ASSERT(modrootloaded);
663 
664 	nvbuf = NULL;
665 	err = nvlist_pack(nvl, &nvbuf, &buflen, NV_ENCODE_NATIVE, 0);
666 	if (err != 0) {
667 		nvf_error("%s: error %d packing nvlist\n",
668 			filename, err);
669 		return (err);
670 	}
671 
672 	buf = kmem_alloc(sizeof (nvpf_hdr_t) + buflen, KM_SLEEP);
673 	bzero(buf, sizeof (nvpf_hdr_t));
674 
675 	((nvpf_hdr_t *)buf)->nvpf_magic = NVPF_HDR_MAGIC;
676 	((nvpf_hdr_t *)buf)->nvpf_version = NVPF_HDR_VERSION;
677 	((nvpf_hdr_t *)buf)->nvpf_size = buflen;
678 	((nvpf_hdr_t *)buf)->nvpf_chksum = nvp_cksum((uchar_t *)nvbuf, buflen);
679 	((nvpf_hdr_t *)buf)->nvpf_hdr_chksum =
680 		nvp_cksum((uchar_t *)buf, sizeof (nvpf_hdr_t));
681 
682 	bcopy(nvbuf, buf + sizeof (nvpf_hdr_t), buflen);
683 	kmem_free(nvbuf, buflen);
684 	buflen += sizeof (nvpf_hdr_t);
685 
686 	len = strlen(filename) + MAX_SUFFIX_LEN + 2;
687 	newname = kmem_alloc(len, KM_SLEEP);
688 
689 
690 	(void) sprintf(newname, "%s.%s",
691 		filename, NEW_FILENAME_SUFFIX);
692 
693 	/*
694 	 * To make it unlikely we suffer data loss, write
695 	 * data to the new temporary file.  Once successful
696 	 * complete the transaction by renaming the new file
697 	 * to replace the previous.
698 	 */
699 
700 	if ((err = kfcreate(newname, &fp)) == 0) {
701 		err = kfwrite(fp, buf, buflen, &n);
702 		if (err) {
703 			nvf_error("%s: write error - %d\n",
704 				newname, err);
705 		} else {
706 			if (n != buflen) {
707 				nvf_error(
708 				    "%s: partial write %ld of %ld bytes\n",
709 				    newname, n, buflen);
710 				nvf_error("%s: filesystem may be full?\n",
711 				    newname);
712 				err = EIO;
713 			}
714 		}
715 		if ((err1 = kfclose(fp)) != 0) {
716 			nvf_error("%s: close error\n", newname);
717 			if (err == 0)
718 				err = err1;
719 		}
720 		if (err != 0) {
721 			if (kfremove(newname) != 0) {
722 				nvf_error("%s: remove failed\n",
723 				    newname);
724 			}
725 		}
726 	} else {
727 		nvf_error("%s: create failed - %d\n", filename, err);
728 	}
729 
730 	if (err == 0) {
731 		if ((err = kfrename(newname, filename)) != 0) {
732 			nvf_error("%s: rename from %s failed\n",
733 				newname, filename);
734 		}
735 	}
736 
737 	kmem_free(newname, len);
738 	kmem_free(buf, buflen);
739 
740 	return (err);
741 }
742 
743 static int
744 e_fwrite_nvlist(nvfd_t *nvfd, nvlist_t *nvl)
745 {
746 	int err;
747 
748 	if ((err = fwrite_nvlist(nvfd->nvf_cache_path, nvl)) == 0)
749 		return (DDI_SUCCESS);
750 	else {
751 		if (err == EROFS)
752 			NVF_MARK_READONLY(nvfd);
753 		return (DDI_FAILURE);
754 	}
755 }
756 
757 static void
758 nvp_list_free(nvfd_t *nvf)
759 {
760 	ASSERT(RW_WRITE_HELD(&nvf->nvf_lock));
761 	(nvf->nvf_list_free)((nvf_handle_t)nvf);
762 	ASSERT(RW_WRITE_HELD(&nvf->nvf_lock));
763 }
764 
765 /*
766  * Read a file in the nvlist format
767  *	EIO - i/o error during read
768  *	ENOENT - file not found
769  *	EINVAL - file contents corrupted
770  */
771 static int
772 fread_nvp_list(nvfd_t *nvfd)
773 {
774 	nvlist_t	*nvl;
775 	nvpair_t	*nvp;
776 	char		*name;
777 	nvlist_t	*sublist;
778 	int		rval;
779 	int		rv;
780 
781 	ASSERT(RW_WRITE_HELD(&(nvfd->nvf_lock)));
782 
783 	rval = fread_nvlist(nvfd->nvf_cache_path, &nvl);
784 	if (rval != 0)
785 		return (rval);
786 	ASSERT(nvl != NULL);
787 
788 	nvp = NULL;
789 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
790 		name = nvpair_name(nvp);
791 		ASSERT(strlen(name) > 0);
792 
793 		switch (nvpair_type(nvp)) {
794 		case DATA_TYPE_NVLIST:
795 			rval = nvpair_value_nvlist(nvp, &sublist);
796 			if (rval != 0) {
797 				nvf_error(
798 				    "nvpair_value_nvlist error %s %d\n",
799 				    name, rval);
800 				goto error;
801 			}
802 
803 			/*
804 			 * unpack nvlist for this device and
805 			 * add elements to data list.
806 			 */
807 			ASSERT(RW_WRITE_HELD(&(nvfd->nvf_lock)));
808 			rv = (nvfd->nvf_unpack_nvlist)
809 			    ((nvf_handle_t)nvfd, sublist, name);
810 			ASSERT(RW_WRITE_HELD(&(nvfd->nvf_lock)));
811 			if (rv != 0) {
812 				nvf_error(
813 				    "%s: %s invalid list element\n",
814 				    nvfd->nvf_cache_path, name);
815 				rval = EINVAL;
816 				goto error;
817 			}
818 			break;
819 
820 		default:
821 			nvf_error("%s: %s unsupported data type %d\n",
822 				nvfd->nvf_cache_path, name, nvpair_type(nvp));
823 			rval = EINVAL;
824 			goto error;
825 		}
826 	}
827 
828 	nvlist_free(nvl);
829 
830 	return (0);
831 
832 error:
833 	nvlist_free(nvl);
834 	nvp_list_free(nvfd);
835 	return (rval);
836 }
837 
838 
839 int
840 nvf_read_file(nvf_handle_t nvf_handle)
841 {
842 	nvfd_t *nvfd = (nvfd_t *)nvf_handle;
843 	int rval;
844 
845 	ASSERT(RW_WRITE_HELD(&nvfd->nvf_lock));
846 
847 	if (kfio_disable_read)
848 		return (0);
849 
850 	KFDEBUG((CE_CONT, "reading %s\n", nvfd->nvf_cache_path));
851 
852 	rval = fread_nvp_list(nvfd);
853 	if (rval) {
854 		switch (rval) {
855 		case EIO:
856 			nvfd->nvf_flags |= NVF_F_REBUILD_MSG;
857 			cmn_err(CE_WARN, "%s: I/O error",
858 				nvfd->nvf_cache_path);
859 			break;
860 		case ENOENT:
861 			nvfd->nvf_flags |= NVF_F_CREATE_MSG;
862 			nvf_error("%s: not found\n",
863 				nvfd->nvf_cache_path);
864 			break;
865 		case EINVAL:
866 		default:
867 			nvfd->nvf_flags |= NVF_F_REBUILD_MSG;
868 			cmn_err(CE_WARN, "%s: data file corrupted",
869 				nvfd->nvf_cache_path);
870 			break;
871 		}
872 	}
873 	return (rval);
874 }
875 
876 static void
877 nvf_write_is_complete(nvfd_t *fd)
878 {
879 	if (fd->nvf_write_complete) {
880 		(fd->nvf_write_complete)((nvf_handle_t)fd);
881 	}
882 }
883 
/*
 * Timeout handler armed by nvf_wake_daemon().
 *
 * nvpticks holds the lbolt value at which the flush is due, and
 * nvf_wake_daemon() moves it forward on every call.  If more than 4
 * ticks still remain when this handler runs, the timeout is re-armed
 * for the remainder; otherwise the daemon is told to flush and the
 * timer state is cleared.  The argument is unused.
 */
/*ARGSUSED*/
static void
nvpflush_timeout(void *arg)
{
	clock_t nticks;

	mutex_enter(&nvpflush_lock);
	/* ticks left until the flush deadline */
	nticks = nvpticks - ddi_get_lbolt();
	if (nticks > 4) {
		/* deadline was pushed out; wait for the rest of it */
		nvpflush_timer_busy = 1;
		mutex_exit(&nvpflush_lock);
		/* note: nvpflush_id is stored after the lock is dropped */
		nvpflush_id = timeout(nvpflush_timeout, NULL, nticks);
	} else {
		/* deadline reached: request a flush pass and go idle */
		do_nvpflush = 1;
		NVPDAEMON_DEBUG((CE_CONT, "signal nvpdaemon\n"));
		cv_signal(&nvpflush_cv);
		nvpflush_id = 0;
		nvpflush_timer_busy = 0;
		mutex_exit(&nvpflush_lock);
	}
}
905 
/*
 * After marking a list as dirty, wake the nvpflush daemon
 * to perform the update.
 *
 * The daemon thread is created here if it is not currently active.
 * The flush itself is deferred: the deadline is set nvpflush_delay
 * seconds from now and, unless a flush timeout is already armed,
 * one is scheduled.  When a timeout is already armed only the
 * deadline moves; nvpflush_timeout() re-arms itself for the
 * difference when it fires.
 */
void
nvf_wake_daemon(void)
{
	clock_t nticks;

	/*
	 * If the system isn't up yet
	 * don't even think about starting a flush.
	 */
	if (!i_ddi_io_initialized())
		return;

	mutex_enter(&nvpflush_lock);

	if (nvpflush_daemon_active == 0) {
		/*
		 * Claim the daemon slot while the lock is held, then
		 * drop the lock across thread_create().
		 */
		nvpflush_daemon_active = 1;
		mutex_exit(&nvpflush_lock);
		NVPDAEMON_DEBUG((CE_CONT, "starting nvpdaemon thread\n"));
		nvpflush_thr_id = thread_create(NULL, 0,
		    (void (*)())nvpflush_daemon,
		    NULL, 0, &p0, TS_RUN, minclsyspri);
		mutex_enter(&nvpflush_lock);
	}

	/* (re)start the delay: the flush is now due nticks from now */
	nticks = nvpflush_delay * TICKS_PER_SECOND;
	nvpticks = ddi_get_lbolt() + nticks;
	if (nvpflush_timer_busy == 0) {
		nvpflush_timer_busy = 1;
		mutex_exit(&nvpflush_lock);
		/*
		 * The 4 extra ticks match the slack nvpflush_timeout()
		 * allows before it considers the deadline reached.
		 */
		nvpflush_id = timeout(nvpflush_timeout, NULL, nticks + 4);
	} else
		mutex_exit(&nvpflush_lock);
}
943 
944 static int
945 nvpflush_one(nvfd_t *nvfd)
946 {
947 	int rval = DDI_SUCCESS;
948 	nvlist_t *nvl;
949 
950 	rw_enter(&nvfd->nvf_lock, RW_READER);
951 
952 	ASSERT((nvfd->nvf_flags & NVF_F_FLUSHING) == 0);
953 
954 	if (!NVF_IS_DIRTY(nvfd) ||
955 	    NVF_IS_READONLY(nvfd) || kfio_disable_write) {
956 		NVF_CLEAR_DIRTY(nvfd);
957 		rw_exit(&nvfd->nvf_lock);
958 		return (DDI_SUCCESS);
959 	}
960 
961 	if (rw_tryupgrade(&nvfd->nvf_lock) == 0) {
962 		nvf_error("nvpflush: "
963 		    "%s rw upgrade failed\n", nvfd->nvf_cache_path);
964 		rw_exit(&nvfd->nvf_lock);
965 		return (DDI_FAILURE);
966 	}
967 	if (((nvfd->nvf_pack_list)
968 	    ((nvf_handle_t)nvfd, &nvl)) != DDI_SUCCESS) {
969 		nvf_error("nvpflush: "
970 		    "%s nvlist construction failed\n", nvfd->nvf_cache_path);
971 		ASSERT(RW_WRITE_HELD(&nvfd->nvf_lock));
972 		rw_exit(&nvfd->nvf_lock);
973 		return (DDI_FAILURE);
974 	}
975 	ASSERT(RW_WRITE_HELD(&nvfd->nvf_lock));
976 
977 	NVF_CLEAR_DIRTY(nvfd);
978 	nvfd->nvf_flags |= NVF_F_FLUSHING;
979 	rw_exit(&nvfd->nvf_lock);
980 
981 	rval = e_fwrite_nvlist(nvfd, nvl);
982 	nvlist_free(nvl);
983 
984 	rw_enter(&nvfd->nvf_lock, RW_WRITER);
985 	nvfd->nvf_flags &= ~NVF_F_FLUSHING;
986 	if (rval == DDI_FAILURE) {
987 		if (NVF_IS_READONLY(nvfd)) {
988 			rval = DDI_SUCCESS;
989 			nvfd->nvf_flags &= ~(NVF_F_ERROR | NVF_F_DIRTY);
990 		} else if ((nvfd->nvf_flags & NVF_F_ERROR) == 0) {
991 			cmn_err(CE_CONT,
992 			    "%s: updated failed\n", nvfd->nvf_cache_path);
993 			nvfd->nvf_flags |= NVF_F_ERROR | NVF_F_DIRTY;
994 		}
995 	} else {
996 		if (nvfd->nvf_flags & NVF_F_CREATE_MSG) {
997 			cmn_err(CE_CONT,
998 			    "!Creating %s\n", nvfd->nvf_cache_path);
999 			nvfd->nvf_flags &= ~NVF_F_CREATE_MSG;
1000 		}
1001 		if (nvfd->nvf_flags & NVF_F_REBUILD_MSG) {
1002 			cmn_err(CE_CONT,
1003 			    "!Rebuilding %s\n", nvfd->nvf_cache_path);
1004 			nvfd->nvf_flags &= ~NVF_F_REBUILD_MSG;
1005 		}
1006 		if (nvfd->nvf_flags & NVF_F_ERROR) {
1007 			cmn_err(CE_CONT,
1008 			    "%s: update now ok\n", nvfd->nvf_cache_path);
1009 			nvfd->nvf_flags &= ~NVF_F_ERROR;
1010 		}
1011 		/*
1012 		 * The file may need to be flushed again if the cached
1013 		 * data was touched while writing the earlier contents.
1014 		 */
1015 		if (NVF_IS_DIRTY(nvfd))
1016 			rval = DDI_FAILURE;
1017 	}
1018 
1019 	rw_exit(&nvfd->nvf_lock);
1020 	return (rval);
1021 }
1022 
1023 
/*
 * Body of the nvpflush daemon thread created by nvf_wake_daemon().
 *
 * The thread sleeps on nvpflush_cv until nvpflush_timeout() sets
 * do_nvpflush.  Each time it is woken it moves every dirty file from
 * nvf_cache_files to nvf_dirty_files, flushes the files on the dirty
 * list one at a time, returns the ones that are now clean to
 * nvf_cache_files, and asks for another wakeup if any file failed or
 * was dirtied again.  If a wait of nvpdaemon_idle_time seconds times
 * out with no flush requested and no timer armed, the thread exits.
 */
static void
nvpflush_daemon(void)
{
	callb_cpr_t cprinfo;
	nvfd_t *nvfdp, *nextfdp;
	clock_t clk;
	int rval;
	int want_wakeup;
	int is_now_clean;

	ASSERT(modrootloaded);

	nvpflush_thread = curthread;
	NVPDAEMON_DEBUG((CE_CONT, "nvpdaemon: init\n"));

	CALLB_CPR_INIT(&cprinfo, &nvpflush_lock, callb_generic_cpr, "nvp");
	mutex_enter(&nvpflush_lock);
	for (;;) {

		/* the wait below is bracketed as a CPR-safe region */
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		while (do_nvpflush == 0) {
			clk = cv_timedwait(&nvpflush_cv, &nvpflush_lock,
			    ddi_get_lbolt() +
				(nvpdaemon_idle_time * TICKS_PER_SECOND));
			/* -1 means the wait timed out */
			if (clk == -1 &&
			    do_nvpflush == 0 && nvpflush_timer_busy == 0) {
				/*
				 * Note that CALLB_CPR_EXIT calls mutex_exit()
				 * on the lock passed in to CALLB_CPR_INIT,
				 * so the lock must be held when invoking it.
				 */
				CALLB_CPR_SAFE_END(&cprinfo, &nvpflush_lock);
				NVPDAEMON_DEBUG((CE_CONT, "nvpdaemon: exit\n"));
				ASSERT(mutex_owned(&nvpflush_lock));
				/* allow nvf_wake_daemon() to start a new one */
				nvpflush_thr_id = NULL;
				nvpflush_daemon_active = 0;
				CALLB_CPR_EXIT(&cprinfo);
				thread_exit();
			}
		}
		CALLB_CPR_SAFE_END(&cprinfo, &nvpflush_lock);

		/* consume the request before dropping the lock */
		nvpbusy = 1;
		want_wakeup = 0;
		do_nvpflush = 0;
		mutex_exit(&nvpflush_lock);

		/*
		 * Try flushing what's dirty, reschedule if there's
		 * a failure or data gets marked as dirty again.
		 * First move each file marked dirty to the dirty
		 * list to avoid locking the list across the write.
		 */
		mutex_enter(&nvf_cache_mutex);
		for (nvfdp = list_head(&nvf_cache_files);
		    nvfdp; nvfdp = nextfdp) {
			/* fetch the successor first; nvfdp may be unlinked */
			nextfdp = list_next(&nvf_cache_files, nvfdp);
			rw_enter(&nvfdp->nvf_lock, RW_READER);
			if (NVF_IS_DIRTY(nvfdp)) {
				list_remove(&nvf_cache_files, nvfdp);
				list_insert_tail(&nvf_dirty_files, nvfdp);
				rw_exit(&nvfdp->nvf_lock);
			} else {
				NVPDAEMON_DEBUG((CE_CONT,
				    "nvpdaemon: not dirty %s\n",
				    nvfdp->nvf_cache_path));
				rw_exit(&nvfdp->nvf_lock);
			}
		}
		mutex_exit(&nvf_cache_mutex);

		/*
		 * Now go through the dirty list
		 * (walked without nvf_cache_mutex; within this file
		 * only the daemon touches nvf_dirty_files)
		 */
		for (nvfdp = list_head(&nvf_dirty_files);
		    nvfdp; nvfdp = nextfdp) {
			nextfdp = list_next(&nvf_dirty_files, nvfdp);

			is_now_clean = 0;
			rw_enter(&nvfdp->nvf_lock, RW_READER);
			if (NVF_IS_DIRTY(nvfdp)) {
				NVPDAEMON_DEBUG((CE_CONT,
				    "nvpdaemon: flush %s\n",
				    nvfdp->nvf_cache_path));
				/* nvpflush_one() takes the lock itself */
				rw_exit(&nvfdp->nvf_lock);
				rval = nvpflush_one(nvfdp);
				rw_enter(&nvfdp->nvf_lock, RW_READER);
				if (rval != DDI_SUCCESS ||
				    NVF_IS_DIRTY(nvfdp)) {
					/* stays on the dirty list for retry */
					rw_exit(&nvfdp->nvf_lock);
					NVPDAEMON_DEBUG((CE_CONT,
					    "nvpdaemon: %s dirty again\n",
					    nvfdp->nvf_cache_path));
					want_wakeup = 1;
				} else {
					rw_exit(&nvfdp->nvf_lock);
					nvf_write_is_complete(nvfdp);
					is_now_clean = 1;
				}
			} else {
				NVPDAEMON_DEBUG((CE_CONT,
				    "nvpdaemon: not dirty %s\n",
				    nvfdp->nvf_cache_path));
				rw_exit(&nvfdp->nvf_lock);
				is_now_clean = 1;
			}

			/* clean files go back on the primary list */
			if (is_now_clean) {
				mutex_enter(&nvf_cache_mutex);
				list_remove(&nvf_dirty_files, nvfdp);
				list_insert_tail(&nvf_cache_files,
				    nvfdp);
				mutex_exit(&nvf_cache_mutex);
			}
		}

		/* schedule another pass after the usual delay */
		if (want_wakeup)
			nvf_wake_daemon();

		mutex_enter(&nvpflush_lock);
		nvpbusy = 0;
	}
}
1147